parser.add_argument("--save_linkgraph_domains", default=False, type=str, help="Save a linkgraph domain file to this path") parser.add_argument("--profile", action='store_true', help="Profile Python usage") return parser.parse_args() # Shared variables while indexing args = get_args() indexer = Indexer() urlclient = indexer.urlclient def list_warc_filenames(): """ Return a list of all indexable WARC files """ if args.warc_files: if args.warc_files.endswith(".txt"): with open(args.warc_files, "rb") as f: warc_files = [x.strip() for x in f.readlines()] else: warc_files = [x.strip() for x in args.warc_files.split(",")] else: warc_files = list_commoncrawl_warc_filenames(limit=args.warc_limit,
def make_client(self):
    """Create a fresh ``Indexer`` and hand it back to the caller.

    ``Indexer`` is called with no arguments on every invocation; nothing
    is read from or stored on ``self``.
    """
    client = Indexer()
    return client