def read_robots(link: 'darc_link.Link', text: str, host: 'Optional[str]' = None) -> 'List[darc_link.Link]':
    """Read ``robots.txt`` to fetch links to sitemaps.

    Args:
        link: Original link to ``robots.txt``.
        text: Content of ``robots.txt``.
        host: Hostname of the URL to ``robots.txt``,
            the value may not be the same as in ``link``.

    Returns:
        List of links to sitemaps.

    Note:
        If the link to the sitemap is not specified in
        ``robots.txt`` [*]_, the fallback link
        ``/sitemap.xml`` will be used.

        .. [*] https://www.sitemaps.org/protocol.html#submit_robots

    """
    rp = RobotFileParser()
    with io.StringIO(text) as file:
        rp.parse(file)

    sitemaps = rp.site_maps()
    if sitemaps is None:
        return [parse_link(urljoin(link.url, '/sitemap.xml'), backref=link)]
    return [parse_link(urljoin(link.url, sitemap), host=host, backref=link)
            for sitemap in sitemaps]
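
# Illustrative usage of read_robots() above -- a minimal sketch, not part of the darc
# codebase; the URL and robots.txt content are made up. With a ``Sitemap:`` directive
# the declared locations are returned, otherwise the ``/sitemap.xml`` fallback applies.
def _example_read_robots() -> 'List[darc_link.Link]':
    robots_link = parse_link('https://www.example.com/robots.txt')
    robots_text = (
        'User-agent: *\n'
        'Disallow: /private/\n'
        'Sitemap: https://www.example.com/sitemap_index.xml\n'
    )
    # returns one link, pointing at https://www.example.com/sitemap_index.xml
    return read_robots(robots_link, robots_text, host='www.example.com')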
def extract_links_from_text(link: Link, text: str) -> typing.List[Link]:
    """Extract links from raw text source.

    Args:
        link: Original link of the source document.
        text: Content of source text document.

    Returns:
        List of extracted links.

    Important:
        The extraction is **NOT** fully reliable since we did not
        perform `TLD`_ checks on the extracted links and we cannot
        guarantee all links to be extracted.

        .. _TLD: https://pypi.org/project/tld/

        The URL patterns used to extract links are defined by
        :data:`darc.parse.URL_PAT` and you may register your
        own expressions by :envvar:`DARC_URL_PAT`.

    """
    temp_list = list()

    for part in text.split():
        for pattern in URL_PAT:
            for match in pattern.finditer(part):
                match_url = match.group('url')

                # add scheme if missing
                if not urlsplit(match_url).scheme:
                    match_url = f'{link.url_parse.scheme}:{match_url}'

                temp_link = parse_link(match_url)
                temp_list.append(temp_link)
    return temp_list
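
# Illustrative usage of extract_links_from_text() -- a hedged sketch, not from the darc
# source; the source link and text are invented, and what is actually matched depends on
# the patterns registered in URL_PAT. Scheme-less matches inherit the scheme of the
# originating document (see the urlsplit() branch above).
def _example_extract_links_from_text() -> typing.List[Link]:
    source_link = parse_link('https://www.example.com/index.html')
    raw_text = 'contact us at https://www.example.com/about or //cdn.example.com/app.js'
    # a protocol-relative match such as '//cdn.example.com/app.js' would be completed
    # to 'https://cdn.example.com/app.js' before being parsed into a Link object
    return extract_links_from_text(source_link, raw_text)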
def read_hosts(text: str, check: bool = CHECK) -> typing.List[Link]:
    """Read ``hosts.txt``.

    Args:
        text: Content of ``hosts.txt``.
        check: Whether to perform checks on the extracted links,
            defaults to :data:`~darc.const.CHECK`.

    Returns:
        List of links extracted.

    """
    temp_list = list()
    for line in filter(None, map(lambda s: s.strip(), text.splitlines())):
        if line.startswith('#'):
            continue

        link = line.split('=', maxsplit=1)[0]
        if I2P_REGEX.fullmatch(link) is None:
            continue
        temp_list.append(parse_link(f'http://{link}'))

    if check:
        return _check(temp_list)
    return temp_list
def read_hosts(link: 'darc_link.Link', text: str, check: bool = CHECK) -> 'List[darc_link.Link]':
    """Read ``hosts.txt``.

    Args:
        link: Link object to fetch for its ``hosts.txt``.
        text: Content of ``hosts.txt``.
        check: Whether to perform checks on the extracted links,
            defaults to :data:`~darc.const.CHECK`.

    Returns:
        List of links extracted.

    """
    temp_list = []
    for line in filter(None, map(lambda s: s.strip(), text.splitlines())):
        if line.startswith('#'):
            continue

        host = line.split('=', maxsplit=1)[0]
        if I2P_REGEX.fullmatch(host) is None:
            continue
        temp_list.append(parse_link(f'http://{host}', backref=link))

    if check:
        return _check(temp_list)
    return temp_list
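
# Illustrative usage of read_hosts() above -- a sketch with invented data, not part of
# darc. hosts.txt uses ``name=destination`` lines; comment lines and entries whose
# hostname part does not match I2P_REGEX are skipped (the sample hostnames below are
# assumed to satisfy I2P_REGEX).
def _example_read_hosts() -> 'List[darc_link.Link]':
    subscription_link = parse_link('http://example.i2p/hosts.txt')
    hosts_text = (
        '# comment lines and empty lines are skipped\n'
        'planet.i2p=Base64DestinationPlaceholder\n'
        'forum.i2p=AnotherBase64DestinationPlaceholder\n'
    )
    # with check=False the two hostnames come back as http://planet.i2p and
    # http://forum.i2p links, each back-referencing the subscription link
    return read_hosts(subscription_link, hosts_text, check=False)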
def read_sitemap(link: 'darc_link.Link', text: str, check: bool = CHECK) -> 'List[darc_link.Link]':
    """Read sitemap.

    Args:
        link: Original link to the sitemap.
        text: Content of the sitemap.
        check: Whether to perform checks on the extracted links,
            defaults to :data:`~darc.const.CHECK`.

    Returns:
        List of links extracted.

    See Also:
        * :func:`darc.parse._check`
        * :func:`darc.parse._check_ng`

    """
    soup = bs4.BeautifulSoup(text, 'html5lib')

    # https://www.sitemaps.org/protocol.html
    temp_list = [parse_link(urljoin(link.url, loc.text), host=link.host, backref=link)
                 for loc in soup.select('urlset > url > loc')]

    # check content / proxy type
    if check:
        return _check(temp_list)
    return temp_list
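
# Illustrative usage of read_sitemap() -- a minimal sketch with a made-up sitemap, not
# part of darc. <loc> entries are resolved against the sitemap URL and parsed into Link
# objects; check=False skips the extra checks on the extracted links.
def _example_read_sitemap() -> 'List[darc_link.Link]':
    sitemap_link = parse_link('https://www.example.com/sitemap.xml')
    sitemap_text = (
        '<?xml version="1.0" encoding="UTF-8"?>'
        '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'
        '<url><loc>https://www.example.com/</loc></url>'
        '<url><loc>/docs/index.html</loc></url>'
        '</urlset>'
    )
    # yields links for https://www.example.com/ and https://www.example.com/docs/index.html
    return read_sitemap(sitemap_link, sitemap_text, check=False)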
def extract_links(link: 'darc_link.Link', html: 'Union[str, bytes]', check: bool = CHECK) -> 'List[darc_link.Link]':
    """Extract links from HTML document.

    Args:
        link: Original link of the HTML document.
        html: Content of the HTML document.
        check: Whether to perform checks on the extracted links,
            defaults to :data:`~darc.const.CHECK`.

    Returns:
        List of extracted links.

    See Also:
        * :func:`darc.parse._check`
        * :func:`darc.parse._check_ng`

    """
    soup = bs4.BeautifulSoup(html, 'html5lib')

    temp_list = []
    for child in soup.find_all(lambda tag: tag.has_attr('href') or tag.has_attr('src')):
        if (href := child.get('href', child.get('src'))) is None:
            continue
        temp_link = parse_link(urljoin(link.url, href), backref=link)
        temp_list.append(temp_link)

    # check content / proxy type
    if check:
        return _check(temp_list)
    return temp_list
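
# Illustrative usage of extract_links() -- a sketch with invented HTML, not part of darc.
# Any tag carrying an ``href`` or ``src`` attribute contributes one link, resolved
# against the page URL.
def _example_extract_links() -> 'List[darc_link.Link]':
    page_link = parse_link('https://www.example.com/index.html')
    page_html = (
        '<html><body>'
        '<a href="/about.html">About</a>'
        '<img src="https://cdn.example.com/logo.png">'
        '<script src="app.js"></script>'
        '</body></html>'
    )
    # yields /about.html, the CDN image and app.js, all resolved against the page URL
    return extract_links(page_link, page_html, check=False)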
def get_sitemap(link: 'darc_link.Link', text: str, host: 'Optional[str]' = None) -> 'List[darc_link.Link]':
    """Fetch links to other sitemaps from a sitemap.

    Args:
        link: Original link to the sitemap.
        text: Content of the sitemap.
        host: Hostname of the URL to the sitemap,
            the value may not be the same as in ``link``.

    Returns:
        List of links to sitemaps.

    Note:
        As specified in the sitemap protocol, a sitemap may contain
        links to other sitemaps. [*]_

        .. [*] https://www.sitemaps.org/protocol.html#index

    """
    sitemaps = []
    soup = bs4.BeautifulSoup(text, 'html5lib')

    # https://www.sitemaps.org/protocol.html#index
    for loc in soup.select('sitemapindex > sitemap > loc'):
        sitemaps.append(urljoin(link.url, loc.text))
    return [parse_link(sitemap, host=host, backref=link) for sitemap in sitemaps]
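
# Illustrative usage of get_sitemap() -- a sketch with a made-up sitemap index, not part
# of darc. A <sitemapindex> document only points at further sitemaps, which are returned
# as Link objects so the caller can fetch them in turn.
def _example_get_sitemap() -> 'List[darc_link.Link]':
    index_link = parse_link('https://www.example.com/sitemap_index.xml')
    index_text = (
        '<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'
        '<sitemap><loc>https://www.example.com/sitemap-posts.xml</loc></sitemap>'
        '<sitemap><loc>https://www.example.com/sitemap-pages.xml</loc></sitemap>'
        '</sitemapindex>'
    )
    return get_sitemap(index_link, index_text, host='www.example.com')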
def _check_ng(temp_list: typing.List[Link]) -> typing.List[Link]:
    """Check content type of links through ``HEAD`` requests.

    Args:
        temp_list: List of links to be checked.

    Returns:
        List of links matching the requirements.

    See Also:
        * :func:`darc.parse.match_host`
        * :func:`darc.parse.match_proxy`
        * :func:`darc.parse.match_mime`

    """
    from darc.crawl import request_session  # pylint: disable=import-outside-toplevel

    session_map = dict()
    result_list = list()
    for link in temp_list:
        if match_host(link.host):
            continue
        if match_proxy(link.proxy):
            continue

        # get session
        session = session_map.get(link.proxy)
        if session is None:
            session = request_session(link, futures=True)
            session_map[link.proxy] = session

        result = session.head(link.url, allow_redirects=True)
        result_list.append(result)

        print(f'[HEAD] Checking content type from {link.url}')

    link_list = list()
    for result in concurrent.futures.as_completed(result_list):
        try:
            response: typing.Response = result.result()
        except requests.RequestException as error:
            if error.response is None:
                print(render_error(f'[HEAD] Checking failed <{error}>',
                                   stem.util.term.Color.RED))  # pylint: disable=no-member
                continue
            print(render_error(f'[HEAD] Failed on {error.response.url} <{error}>',
                               stem.util.term.Color.RED))  # pylint: disable=no-member
            link_list.append(error.response.url)
            continue
        ct_type = get_content_type(response)

        print(f'[HEAD] Checked content type from {response.url} ({ct_type})')

        if match_mime(ct_type):
            continue
        temp_link = parse_link(response.request.url)
        link_list.append(temp_link)
    return link_list
def _check_ng(temp_list: 'List[darc_link.Link]') -> 'List[darc_link.Link]':
    """Check content type of links through ``HEAD`` requests.

    Args:
        temp_list: List of links to be checked.

    Returns:
        List of links matching the requirements.

    See Also:
        * :func:`darc.parse.match_host`
        * :func:`darc.parse.match_proxy`
        * :func:`darc.parse.match_mime`

    """
    from darc.crawl import request_session  # pylint: disable=import-outside-toplevel

    session_map = {}  # type: Dict[str, FuturesSession]
    result_list = []
    for link in temp_list:
        if match_host(link.host):
            continue
        if match_proxy(link.proxy):
            continue

        # get session
        session = session_map.get(link.proxy)
        if session is None:
            session = request_session(link, futures=True)
            session_map[link.proxy] = session

        result = session.head(link.url, allow_redirects=True)
        result_list.append(result)

        logger.info('[HEAD] Checking content type from %s', link.url)

    link_list = []
    for result in concurrent.futures.as_completed(result_list):  # type: ignore
        try:
            response = result.result()  # type: Response
        except requests.RequestException as error:
            if error.response is None:
                logger.pexc(message='[HEAD] Checking failed')
                continue
            logger.pexc(message=f'[HEAD] Failed on {error.response.url}')
            link_list.append(error.response.url)
            continue
        ct_type = get_content_type(response)

        logger.info('[HEAD] Checked content type from %s (%s)', response.url, ct_type)

        if match_mime(ct_type):
            continue
        temp_link = parse_link(response.request.url)  # type: ignore
        link_list.append(temp_link)
    return link_list
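
# Illustrative use of _check_ng() -- a hedged sketch, not part of darc. The function
# issues real HEAD requests through per-proxy sessions, so this only shows where the
# filter sits in the pipeline: candidate links go in, and links whose host, proxy and
# MIME type pass the match_* rules come back out.
def _example_check_ng() -> 'List[darc_link.Link]':
    candidates = [
        parse_link('https://www.example.com/feed.xml'),
        parse_link('https://www.example.com/image.png'),
    ]
    # requires network access; results are filtered by match_host / match_proxy / match_mime
    return _check_ng(candidates)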
def fetch_hosts(link: 'darc_link.Link', force: bool = False) -> None:
    """Fetch ``hosts.txt``.

    Args:
        link: Link object to fetch for its ``hosts.txt``.
        force: Force refetch ``hosts.txt``.

    Returns:
        ``None``; the links extracted from ``hosts.txt`` are queued
        via :func:`darc.db.save_requests` instead of being returned.

    """
    if force:
        logger.warning('[HOSTS] Force refetch %s', link.url)

    hosts_path = None if force else have_hosts(link)
    if hosts_path is not None:

        logger.warning('[HOSTS] Cached %s', link.url)  # pylint: disable=no-member
        with open(hosts_path) as hosts_file:
            hosts_text = hosts_file.read()

    else:

        from darc.requests import i2p_session  # pylint: disable=import-outside-toplevel

        hosts_link = parse_link(urljoin(link.url, '/hosts.txt'), backref=link)
        logger.info('[HOSTS] Subscribing %s', hosts_link.url)

        with i2p_session() as session:
            try:
                response = session.get(hosts_link.url)
            except requests.RequestException:
                logger.pexc(message=f'[HOSTS] Failed on {hosts_link.url}')
                return

        if not response.ok:
            logger.error('[HOSTS] Failed on %s [%d]', hosts_link.url, response.status_code)
            return

        ct_type = get_content_type(response)
        if ct_type not in ['text/text', 'text/plain']:
            logger.error('[HOSTS] Unresolved content type on %s (%s)', hosts_link.url, ct_type)
            return

        hosts_text = response.text
        save_hosts(hosts_link, hosts_text)

        logger.info('[HOSTS] Subscribed %s', hosts_link.url)

    from darc.db import save_requests  # pylint: disable=import-outside-toplevel

    # add link to queue
    save_requests(read_hosts(link, hosts_text))
def _extract_links(cls, link: Link, html: typing.Union[str, bytes], check: bool = CHECK) -> typing.List[Link]:
    """Extract links from HTML document.

    Args:
        link: Original link of the HTML document.
        html: Content of the HTML document.
        check: Whether to perform checks on the extracted links,
            defaults to :data:`~darc.const.CHECK`.

    Returns:
        List of extracted links.

    """
    temp_list = cls.extract_links(link, html)
    link_list = [parse_link(link) for link in temp_list]

    # check content / proxy type
    if check:
        return _check(link_list)
    return link_list
def fetch_sitemap(link: 'darc_link.Link', force: bool = False) -> None:
    """Fetch sitemap.

    The function will first fetch the ``robots.txt``, then
    fetch the sitemaps accordingly.

    Args:
        link: Link object to fetch for its sitemaps.
        force: Force refetch its sitemaps.

    Returns:
        ``None``; the links extracted from the fetched sitemaps are
        queued via :func:`darc.db.save_requests` instead of being returned.

    See Also:
        * :func:`darc.proxy.null.read_robots`
        * :func:`darc.proxy.null.read_sitemap`
        * :func:`darc.parse.get_sitemap`

    """
    if force:
        logger.warning('[ROBOTS] Force refetch %s', link.url)

    robots_path = None if force else have_robots(link)
    if robots_path is not None:

        logger.warning('[ROBOTS] Cached %s', link.url)
        with open(robots_path) as file:
            robots_text = file.read()

    else:

        robots_link = parse_link(urljoin(link.url, '/robots.txt'), backref=link)
        logger.info('[ROBOTS] Checking %s', robots_link.url)

        with request_session(robots_link) as session:
            try:
                response = session.get(robots_link.url)
            except requests.RequestException:
                logger.pexc(message=f'[ROBOTS] Failed on {robots_link.url}')
                return

        if response.ok:
            ct_type = get_content_type(response)
            if ct_type not in ['text/text', 'text/plain']:
                logger.error('[ROBOTS] Unresolved content type on %s (%s)', robots_link.url, ct_type)
                robots_text = ''
            else:
                robots_text = response.text
                save_robots(robots_link, robots_text)
                logger.info('[ROBOTS] Checked %s', robots_link.url)
        else:
            logger.error('[ROBOTS] Failed on %s [%d]', robots_link.url, response.status_code)
            robots_text = ''

    if force:
        logger.warning('[SITEMAP] Force refetch %s', link.url)

    sitemaps = read_robots(link, robots_text, host=link.host)
    for sitemap_link in sitemaps:
        sitemap_path = None if force else have_sitemap(sitemap_link)
        if sitemap_path is not None:

            logger.warning('[SITEMAP] Cached %s', sitemap_link.url)
            with open(sitemap_path) as file:
                sitemap_text = file.read()

        else:

            logger.info('[SITEMAP] Fetching %s', sitemap_link.url)

            with request_session(sitemap_link) as session:
                try:
                    response = session.get(sitemap_link.url)
                except requests.RequestException:
                    logger.pexc(message=f'[SITEMAP] Failed on {sitemap_link.url}')
                    continue

            if not response.ok:
                logger.error('[SITEMAP] Failed on %s [%d]', sitemap_link.url, response.status_code)
                continue

            # check content type
            ct_type = get_content_type(response)
            if ct_type == 'application/gzip':
                try:
                    sitemap_text = gzip.decompress(response.content).decode()
                except UnicodeDecodeError:
                    sitemap_text = response.text
            elif ct_type in ['text/xml', 'text/html']:
                sitemap_text = response.text
                save_sitemap(sitemap_link, sitemap_text)
            else:
                logger.error('[SITEMAP] Unresolved content type on %s (%s)', sitemap_link.url, ct_type)
                continue

            logger.info('[SITEMAP] Fetched %s', sitemap_link.url)

        # get more sitemaps
        sitemaps.extend(get_sitemap(sitemap_link, sitemap_text, host=link.host))

        # add link to queue
        save_requests(read_sitemap(link, sitemap_text))
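
# Illustrative invocation of fetch_sitemap() -- a hedged sketch, not part of darc. The
# function performs network I/O and persists its findings (robots.txt, sitemaps, and the
# extracted links via save_requests), so the calls are wrapped in a helper rather than
# executed at import time; the URL is made up.
def _example_fetch_sitemap() -> None:
    site_link = parse_link('https://www.example.com/')
    # first call: robots.txt and the discovered sitemaps are fetched and cached
    fetch_sitemap(site_link)
    # force=True ignores the cached copies and refetches everything
    fetch_sitemap(site_link, force=True)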
def fetch_sitemap(link: Link, force: bool = False) -> None:
    """Fetch sitemap.

    The function will first fetch the ``robots.txt``, then
    fetch the sitemaps accordingly.

    Args:
        link: Link object to fetch for its sitemaps.
        force: Force refetch its sitemaps.

    Returns:
        Contents of ``robots.txt`` and sitemaps.

    See Also:
        * :func:`darc.proxy.null.read_robots`
        * :func:`darc.proxy.null.read_sitemap`
        * :func:`darc.parse.get_sitemap`

    """
    if force:
        print(stem.util.term.format(f'[ROBOTS] Force refetch {link.url}',
                                    stem.util.term.Color.YELLOW))  # pylint: disable=no-member

    robots_path = None if force else have_robots(link)
    if robots_path is not None:

        print(stem.util.term.format(f'[ROBOTS] Cached {link.url}',
                                    stem.util.term.Color.YELLOW))  # pylint: disable=no-member
        with open(robots_path) as file:
            robots_text = file.read()

    else:

        robots_link = parse_link(urljoin(link.url, '/robots.txt'))
        print(f'[ROBOTS] Checking {robots_link.url}')

        with request_session(robots_link) as session:
            try:
                response = session.get(robots_link.url)
            except requests.RequestException as error:
                print(render_error(f'[ROBOTS] Failed on {robots_link.url} <{error}>',
                                   stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member
                return

        if response.ok:
            ct_type = get_content_type(response)
            if ct_type not in ['text/text', 'text/plain']:
                print(render_error(f'[ROBOTS] Unresolved content type on {robots_link.url} ({ct_type})',
                                   stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member
                robots_text = ''
            else:
                robots_text = response.text
                save_robots(robots_link, robots_text)
                print(f'[ROBOTS] Checked {robots_link.url}')
        else:
            print(render_error(f'[ROBOTS] Failed on {robots_link.url} [{response.status_code}]',
                               stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member
            robots_text = ''

    if force:
        print(stem.util.term.format(f'[SITEMAP] Force refetch {link.url}',
                                    stem.util.term.Color.YELLOW))  # pylint: disable=no-member

    sitemaps = read_robots(link, robots_text, host=link.host)
    for sitemap_link in sitemaps:
        sitemap_path = None if force else have_sitemap(sitemap_link)
        if sitemap_path is not None:

            print(stem.util.term.format(f'[SITEMAP] Cached {sitemap_link.url}',
                                        stem.util.term.Color.YELLOW))  # pylint: disable=no-member
            with open(sitemap_path) as file:
                sitemap_text = file.read()

        else:

            print(f'[SITEMAP] Fetching {sitemap_link.url}')

            with request_session(sitemap_link) as session:
                try:
                    response = session.get(sitemap_link.url)
                except requests.RequestException as error:
                    print(render_error(f'[SITEMAP] Failed on {sitemap_link.url} <{error}>',
                                       stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member
                    continue

            if not response.ok:
                print(render_error(f'[SITEMAP] Failed on {sitemap_link.url} [{response.status_code}]',
                                   stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member
                continue

            # check content type
            ct_type = get_content_type(response)
            if ct_type == 'application/gzip':
                try:
                    sitemap_text = gzip.decompress(response.content).decode()
                except UnicodeDecodeError:
                    sitemap_text = response.text
            elif ct_type in ['text/xml', 'text/html']:
                sitemap_text = response.text
                save_sitemap(sitemap_link, sitemap_text)
            else:
                print(render_error(f'[SITEMAP] Unresolved content type on {sitemap_link.url} ({ct_type})',
                                   stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member
                continue

            print(f'[SITEMAP] Fetched {sitemap_link.url}')

        # get more sitemaps
        sitemaps.extend(get_sitemap(sitemap_link, sitemap_text, host=link.host))

        # add link to queue
        save_requests(read_sitemap(link, sitemap_text))
def main(argv: 'Optional[List[str]]' = None) -> int:
    """Entrypoint.

    Args:
        argv: Optional command line arguments.

    Returns:
        Exit code.

    """
    parser = get_parser()
    args = parser.parse_args(argv)

    pid = os.getpid()
    with open(PATH_ID, 'w') as file:
        print(pid, file=file)

    # wait for Redis
    if _WAIT_REDIS:
        if not FLAG_DB:
            _redis_command('set', 'darc', pid)

    if FLAG_DB:
        while True:
            try:
                with DB:
                    _db_operation(DB.create_tables, [
                        HostnameQueueModel,
                        RequestsQueueModel,
                        SeleniumQueueModel,
                    ])
            except Exception:
                logger.pexc(LOG_WARNING, category=DatabaseOperaionFailed,
                            line='DB.create_tables([HostnameQueueModel, ...])')
                continue
            break

    if SAVE_DB:
        while True:
            try:
                with DB_WEB:
                    _db_operation(DB_WEB.create_tables, [
                        HostnameModel,
                        URLModel,
                        URLThroughModel,
                        RobotsModel,
                        SitemapModel,
                        HostsModel,
                        RequestsModel,
                        RequestsHistoryModel,
                        SeleniumModel,
                    ])
            except Exception:
                logger.pexc(LOG_WARNING, category=DatabaseOperaionFailed,
                            line='DB.create_tables([HostnameModel, ...])')
                continue
            break

    logger.debug('-*- Initialisation -*-')

    if DEBUG and not FLAG_DB:
        # nuke the db
        _redis_command('delete', 'queue_hostname')
        _redis_command('delete', 'queue_requests')
        _redis_command('delete', 'queue_selenium')

    link_list = []
    for link in filter(None, map(lambda s: s.strip(), args.link)):  # type: ignore[name-defined,var-annotated]
        logger.pline(LOG_DEBUG, link)
        link_list.append(link)

    if args.file is not None:
        for path in args.file:
            with open(path) as file:
                for line in filter(None, map(lambda s: s.strip(), file)):
                    if line.startswith('#'):
                        continue
                    logger.pline(LOG_DEBUG, line)
                    link_list.append(line)

    # write to database
    link_pool = [parse_link(link, backref=None) for link in link_list]
    save_requests(link_pool, score=0, nx=True)
    logger.pline(LOG_DEBUG, logger.horizon)

    # init link file
    if not os.path.isfile(PATH_LN):
        with open(PATH_LN, 'w') as file:
            print('proxy,scheme,host,hash,link', file=file)

    try:
        process(args.type)
    except BaseException:
        traceback.print_exc()
    _exit()

    return 0
def fetch_hosts(link: Link):
    """Fetch ``hosts.txt``.

    Args:
        link: Link object to fetch for its ``hosts.txt``.

    Returns:
        Content of the ``hosts.txt`` file.

    """
    hosts_path = have_hosts(link)
    if hosts_path is not None:

        print(stem.util.term.format(f'[HOSTS] Cached {link.url}',
                                    stem.util.term.Color.YELLOW))  # pylint: disable=no-member
        with open(hosts_path) as hosts_file:
            hosts_text = hosts_file.read()

    else:

        from darc.requests import i2p_session  # pylint: disable=import-outside-toplevel

        hosts_link = parse_link(urljoin(link.url, '/hosts.txt'))
        print(f'[HOSTS] Subscribing {hosts_link.url}')

        with i2p_session() as session:
            try:
                response = session.get(hosts_link.url)
            except requests.RequestException as error:
                print(render_error(f'[HOSTS] Failed on {hosts_link.url} <{error}>',
                                   stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member
                return

        if not response.ok:
            print(render_error(f'[HOSTS] Failed on {hosts_link.url} [{response.status_code}]',
                               stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member
            return

        ct_type = get_content_type(response)
        if ct_type not in ['text/text', 'text/plain']:
            print(render_error(f'[HOSTS] Unresolved content type on {hosts_link.url} ({ct_type})',
                               stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member
            return

        hosts_text = response.text
        save_hosts(hosts_link, hosts_text)

        print(f'[HOSTS] Subscribed {hosts_link.url}')

    from darc.db import save_requests  # pylint: disable=import-outside-toplevel

    # add link to queue
    save_requests(read_hosts(hosts_text))
def main(argv: typing.Optional[typing.List[str]] = None) -> int:
    """Entrypoint.

    Args:
        argv: Optional command line arguments.

    Returns:
        Exit code.

    """
    parser = get_parser()
    args = parser.parse_args(argv)

    pid = os.getpid()
    with open(PATH_ID, 'w') as file:
        print(pid, file=file)

    # wait for Redis
    if _WAIT_REDIS:
        if not FLAG_DB:
            _redis_command('set', 'darc', pid)

    if FLAG_DB:
        while True:
            try:
                with DB:
                    _db_operation(DB.create_tables, [
                        HostnameQueueModel,
                        RequestsQueueModel,
                        SeleniumQueueModel,
                    ])
            except Exception as error:
                warning = warnings.formatwarning(
                    error, DatabaseOperaionFailed, __file__, 102,  # type: ignore[arg-type]
                    'DB.create_tables([HostnameQueueModel, ...])')
                print(render_error(warning, stem.util.term.Color.YELLOW),
                      end='', file=sys.stderr)  # pylint: disable=no-member
                continue
            break

    if SAVE_DB:
        while True:
            try:
                with DB_WEB:
                    _db_operation(DB_WEB.create_tables, [
                        HostnameModel,
                        URLModel,
                        RobotsModel,
                        SitemapModel,
                        HostsModel,
                        RequestsModel,
                        RequestsHistoryModel,
                        SeleniumModel,
                    ])
            except Exception as error:
                warning = warnings.formatwarning(
                    error, DatabaseOperaionFailed, __file__, 117,  # type: ignore[arg-type]
                    'DB.create_tables([HostnameModel, ...])')
                print(render_error(warning, stem.util.term.Color.YELLOW),
                      end='', file=sys.stderr)  # pylint: disable=no-member
                continue
            break

    if DEBUG:
        print(stem.util.term.format('-*- Initialisation -*-',
                                    stem.util.term.Color.MAGENTA))  # pylint: disable=no-member

        # nuke the db
        if not FLAG_DB:
            _redis_command('delete', 'queue_hostname')
            _redis_command('delete', 'queue_requests')
            _redis_command('delete', 'queue_selenium')

    link_list = list()
    for link in filter(None, map(lambda s: s.strip(), args.link)):  # type: ignore[name-defined,var-annotated]
        if DEBUG:
            print(stem.util.term.format(link, stem.util.term.Color.MAGENTA))  # pylint: disable=no-member
        link_list.append(link)

    if args.file is not None:
        for path in args.file:
            with open(path) as file:
                for line in filter(None, map(lambda s: s.strip(), file)):
                    if line.startswith('#'):
                        continue
                    if DEBUG:
                        print(stem.util.term.format(line, stem.util.term.Color.MAGENTA))  # pylint: disable=no-member
                    link_list.append(line)

    # write to database
    link_pool = [parse_link(link) for link in link_list]
    save_requests(link_pool, score=0, nx=True)

    if DEBUG:
        print(stem.util.term.format('-' * shutil.get_terminal_size().columns,
                                    stem.util.term.Color.MAGENTA))  # pylint: disable=no-member

    # init link file
    if not os.path.isfile(PATH_LN):
        with open(PATH_LN, 'w') as file:
            print('proxy,scheme,host,hash,link', file=file)

    try:
        process(args.type)
    except BaseException:
        traceback.print_exc()
    _exit()

    return 0
def main():
    """Entrypoint."""
    parser = get_parser()
    args = parser.parse_args()

    pid = os.getpid()
    with open(PATH_ID, 'w') as file:
        print(pid, file=file)

    # wait for Redis
    if _WAIT_REDIS:
        if not FLAG_DB:
            _redis_command('set', 'darc', pid)

    if FLAG_DB:
        while True:
            with contextlib.suppress(Exception):
                with DB:
                    DB.create_tables([
                        HostnameQueueModel,
                        RequestsQueueModel,
                        SeleniumQueueModel,
                    ])
                break

    if SAVE_DB:
        while True:
            with contextlib.suppress(Exception):
                with DB_WEB:
                    DB_WEB.create_tables([
                        HostnameModel,
                        URLModel,
                        RobotsModel,
                        SitemapModel,
                        HostsModel,
                        RequestsModel,
                        RequestsHistoryModel,
                        SeleniumModel,
                    ])
                break

    if DEBUG:
        print(stem.util.term.format('-*- Initialisation -*-', stem.util.term.Color.MAGENTA))  # pylint: disable=no-member

        # nuke the db
        if not FLAG_DB:
            _redis_command('delete', 'queue_hostname')
            _redis_command('delete', 'queue_requests')
            _redis_command('delete', 'queue_selenium')

    link_list = list()
    for link in filter(None, map(lambda s: s.strip(), args.link)):
        if DEBUG:
            print(stem.util.term.format(link, stem.util.term.Color.MAGENTA))  # pylint: disable=no-member
        link_list.append(link)

    if args.file is not None:
        for path in args.file:
            with open(path) as file:
                for line in filter(None, map(lambda s: s.strip(), file)):
                    if line.startswith('#'):
                        continue
                    if DEBUG:
                        print(stem.util.term.format(line, stem.util.term.Color.MAGENTA))  # pylint: disable=no-member
                    link_list.append(line)

    # write to database
    link_pool = [parse_link(link) for link in link_list]
    save_requests(link_pool, score=0, nx=True)

    if DEBUG:
        print(stem.util.term.format('-' * shutil.get_terminal_size().columns, stem.util.term.Color.MAGENTA))  # pylint: disable=no-member

    # init link file
    if not os.path.isfile(PATH_LN):
        with open(PATH_LN, 'w') as file:
            print('proxy,scheme,host,hash,link', file=file)

    try:
        process(args.type)
    except BaseException:
        traceback.print_exc()
    _exit()