Example no. 1
def request_the_same(url):
    global HTTP_GET_REQUESTS_COUNT

    file1 = TDownloadedFile(url)
    assert HTTP_GET_REQUESTS_COUNT == 1
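    # requesting the same url again must be served from the cache: no additional HTTP GET is issued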
    file2 = TDownloadedFile(url)
    assert HTTP_GET_REQUESTS_COUNT == 1
def copy_files(args, toloka_results):
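    # copy every exported file into the positive or negative folder according to its Toloka verdict ("YES"/"NO")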
    assert args.positive_folder is not None
    assert args.negative_folder is not None
    with TRobotProject(args.project, ROBOT_STEPS) as project:
        project.read_project(fetch_morda_url=False)
        office_info = project.offices[0]
        index = 0
        domain = strip_html_url(office_info.morda_url)
        for export_record in office_info.exported_files:
            index += 1
            cached_file = export_record['cached_file']
            url = export_record['url']
            print()
            extension = TDownloadedFile(url).file_extension
            out_file = "{}_{}_{}{}".format(domain, index, int(time.time()), extension)
            tol_res = toloka_results.get(cached_file)
            if tol_res == "YES":
                folder = args.positive_folder
            elif tol_res == "NO":
                folder = args.negative_folder
            else:
                folder = None
            if folder is not None:
                out_file = os.path.join(folder, out_file)
                print ("{} -> {}".format(url, out_file))
                shutil.copy(cached_file, out_file)
Example no. 3
    def add_link_wrapper(self, link_info: TLinkInfo):
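        # download the link target, register it in the url graph and, on transitive steps, queue html pages for further crawling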
        assert link_info.target_url is not None
        try:
            downloaded_file = TDownloadedFile(link_info.target_url)
        except RobotHttpException as err:
            self.logger.error(err)
            return

        href = link_info.target_url

        self.website.url_nodes[link_info.source_url].add_child_link(href, link_info.to_json())
        link_info.weight = max(link_info.weight, self.step_urls[href])
        self.step_urls[href] = link_info.weight

        if href not in self.website.url_nodes:
            if link_info.target_title is None and downloaded_file.file_extension == DEFAULT_HTML_EXTENSION:
                link_info.target_title = get_html_title(downloaded_file.data)
            self.website.url_nodes[href] = TUrlInfo(title=link_info.target_title, step_name=self.get_step_name())

        self.website.url_nodes[href].parent_nodes.add(link_info.source_url)

        if self.is_last_step():
            self.website.export_env.export_file_if_relevant(downloaded_file, link_info)

        if self.step_passport.get('transitive', False):
            if href not in self.processed_pages:
                if downloaded_file.file_extension == DEFAULT_HTML_EXTENSION:
                    self.pages_to_process[href] = link_info.weight

        self.logger.debug("add link {0}".format(href))
Example no. 4
def request_timeouted(url):
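    # the request is expected to time out, so TDownloadedFile must raise RobotHttpException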
    got_timeout_exception = False
    try:
        TDownloadedFile(url)
    except RobotHttpException as exp:
        got_timeout_exception = True
    assert got_timeout_exception
Example no. 5
def request_too_many_404(url):
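    # after three 404 responses for the same url the request policy is expected to answer the fourth attempt with 429 (too many requests)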
    codes = list()
    for i in range(4):
        try:
            x = TDownloadedFile(url)
        except RobotHttpException as exp:
            codes.append(exp.http_code)
    assert codes == [404, 404, 404, 429]
Example no. 6
    def add_page_links(self, url, use_selenium=True, use_urllib=True):
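        # collect links from the page: parse the downloaded html (urllib path) and, when needed, click elements with selenium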
        try:
            downloaded_file = TDownloadedFile(url)
        except RobotHttpException as err:
            self.logger.error(err)
            return
        if downloaded_file.file_extension != DEFAULT_HTML_EXTENSION:
            return
        html_parser = None
        already_processed_by_urllib = None
        try:
            if use_urllib:
                html_parser = THtmlParser(downloaded_file.data)
                already_processed_by_urllib = self.website.find_a_web_page_with_a_similar_html(self, url, html_parser.html_text)
        except Exception as e:
            self.logger.error('cannot parse html for url {}, exception: {}'.format(url, e))
            return

        try:
            if use_urllib and already_processed_by_urllib is None:
                find_links_in_html_by_text(self, url, html_parser)
            else:
                if use_urllib:
                    self.logger.debug(
                        'skip processing {} in find_links_in_html_by_text, a similar file is already processed on this step: {}'.format(url, already_processed_by_urllib))

                if not use_selenium and html_parser is not None and len(list(html_parser.soup.findAll('a'))) < 10:
                    self.logger.debug('temporal switch on selenium, since this file can be fully javascripted')
                    use_selenium = True

            if use_selenium:  # switching selenium off is almost a panic mode (too many links)
                if downloaded_file.get_file_extension_only_by_headers() != DEFAULT_HTML_EXTENSION:
                    # downloaded_file.file_extension is derived from the file contents and can be DEFAULT_HTML_EXTENSION even when the http headers say otherwise; selenium relies on the headers only
                    self.logger.debug("do not browse {} with selenium, since it has wrong http headers".format(url))
                else:
                    click_all_selenium(self, url, self.website.parent_project.selenium_driver)
        except (RobotHttpException, WebDriverException, InvalidSwitchToTargetException) as e:
            self.logger.error('add_links failed on url={}, exception: {}'.format(url, e))
def create_toloka_pool(project_path, toloka_stream):
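    # build a tab-separated Toloka pool: url, cached file link, file extension and the file converted to html with soffice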
    with TRobotProject(logging, project_path, ROBOT_STEPS, None) as project:
        project.read_project(fetch_morda_url=False)
        office_info = project.offices[0]
        toloka_stream.write("INPUT:url\tINPUT:file_link\tINPUT:file_extension\tINPUT:html\n")
        ec = TExternalConverters()
        cnt = 0
        all_files = 0
        for export_record in office_info.exported_files:
            all_files += 1
            sys.stderr.write("{}/{}\n".format(all_files, len(office_info.exported_files)))
            sys.stderr.flush()
            url = export_record['url']
            cached_file = export_record['cached_file']
            extension = TDownloadedFile(url).file_extension
            temp_file = "dummy" + extension
            shutil.copy(cached_file, temp_file)
            html = ec.convert_to_html_with_soffice(temp_file)
            os.unlink(temp_file)
            if html is not None:
                html = html.replace("\t", " ").replace("\n", " ").replace("\r", " ")
                toloka_stream.write("\t".join((url, cached_file, extension, html)) + "\n\n")
                cnt += 1
        sys.stderr.write("written {} lines of of {}".format(cnt, all_files))
Example no. 8
    fh.setLevel(logging.DEBUG)
    fh.setFormatter(formatter)
    logger.addHandler(fh)
    return logger


if __name__ == '__main__':
    logger = setup_logging()
    TDownloadEnv.clear_cache_folder()
    web_addr = sys.argv[1]
    host, port = web_addr.split(":")
    HTTP_SERVER = http.server.HTTPServer((host, int(port)), THttpServer)
    server_thread = threading.Thread(target=start_server)
    server_thread.start()
    time.sleep(1)
    url = web_addr + "/somepath"
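    # the test server sends a misleading content type (.doc) in the http headers while the body is actually html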
    wrong_extension = get_file_extension_only_by_headers(url)
    assert wrong_extension == ".doc"  # see minvr.ru for this error
    downloaded_file = TDownloadedFile(url)
    right_extension = downloaded_file.file_extension  # read file contents to determine its type
    assert right_extension == DEFAULT_HTML_EXTENSION
    assert HTTP_GET_REQUESTS_COUNT == 1
    assert HTTP_HEAD_REQUESTS_COUNT == 1

    # test redirects
    dummy1, dummy2, data = make_http_request_urllib(logger,
                                                    web_addr + "/redirect1",
                                                    "GET")
    assert data.decode('utf8').startswith("<html>")
    HTTP_SERVER.shutdown()
Example no. 9
    logger.setLevel(logging.DEBUG)

    # create formatter and add it to the handlers
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    if os.path.exists(logfilename):
        os.remove(logfilename)
    # create file handler which logs even debug messages
    fh = logging.FileHandler(logfilename, encoding="utf8")
    fh.setLevel(logging.DEBUG)
    fh.setFormatter(formatter)
    logger.addHandler(fh)
    return logger


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--url", dest='url', required=True)
    return parser.parse_args()


if __name__ == '__main__':
    logger = setup_logging("dlrobot.log")
    args = parse_args()
    TDownloadEnv.clear_cache_folder()
    TRequestPolicy.ENABLE = False

    #see https://stackoverflow.com/questions/38015537/python-requests-exceptions-sslerror-dh-key-too-small
    #for http://primorie.fas.gov.ru
    file = TDownloadedFile(args.url)
    assert file is not None
Example no. 10
    def init_morda_url_if_necessary(self):
        if len(self.url_nodes) == 0:
            title = get_html_title(TDownloadedFile(self.morda_url).data)
            self.url_nodes[self.morda_url] = TUrlInfo(title=title)
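
For reference, a minimal usage sketch assembled from the examples above. The class, attribute and exception names (TDownloadedFile, file_extension, data, RobotHttpException, DEFAULT_HTML_EXTENSION, get_html_title) are taken from the snippets; download_or_none is a hypothetical helper introduced here only for illustration.

def download_or_none(logger, url):
    # try to download a url and return the file, or None on an http error (mirrors Example no. 3)
    try:
        downloaded_file = TDownloadedFile(url)
    except RobotHttpException as err:
        logger.error(err)
        return None
    # file_extension is determined from the file contents, not only from the http headers (see Example no. 8)
    if downloaded_file.file_extension == DEFAULT_HTML_EXTENSION:
        logger.debug("downloaded an html page, title={}".format(get_html_title(downloaded_file.data)))
    return downloaded_file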