"url": { "rank": DOCUMENTS[res["url"]]["static_rank"] } } yield res if "--warc" in sys.argv: # Generate a WARC file devindex_dir = os.path.join(config["PATH_LOCALDATA"], "devindex") if not os.path.isdir(devindex_dir): os.makedirs(devindex_dir) warc_file = os.path.join(devindex_dir, "crawl.warc") create_warc_from_corpus(generate_corpus(), filename=warc_file) print "Created WARC file:", warc_file elif "--index" in sys.argv: indexer = Indexer() if "--empty" in sys.argv: indexer.empty() docs = indexer.index_corpus(generate_corpus(), flush=True, refresh=True) print "Indexed %s documents." % len(docs) else: print "Usage: python build_devindex.py [--warc | --index]" sys.exit(1)
parser.add_argument("--save_linkgraph_domains", default=False, type=str, help="Save a linkgraph domain file to this path") parser.add_argument("--profile", action='store_true', help="Profile Python usage") return parser.parse_args() # Shared variables while indexing args = get_args() indexer = Indexer() urlclient = indexer.urlclient def list_warc_filenames(): """ Return a list of all indexable WARC files """ if args.warc_files: if args.warc_files.endswith(".txt"): with open(args.warc_files, "rb") as f: warc_files = [x.strip() for x in f.readlines()] else: warc_files = [x.strip() for x in args.warc_files.split(",")] else: warc_files = list_commoncrawl_warc_filenames(limit=args.warc_limit,
from cosrlib.document import load_document_type
from cosrlib.config import config
from cosrlib.searcher import Searcher
from cosrlib.indexer import Indexer

# Directory containing this module; the Flask app below is pointed at its
# "static" and "templates" subdirectories.
CURRENT_DIRECTORY = os.path.dirname(os.path.realpath(__file__))

app = Flask(
    "explainer",
    static_folder=os.path.join(CURRENT_DIRECTORY, "static"),
    template_folder=os.path.join(CURRENT_DIRECTORY, "templates")
)

# Module-level clients: both are created and connected as soon as this
# module is imported.
indexer = Indexer()
indexer.connect()

searcher = Searcher()
searcher.connect()


@app.route('/')
def route_search():
    """ Homepage: renders search.html with an empty config, for debugging searches """
    empty_config = {}
    return render_template("search.html", config=empty_config)


@app.route('/url')
def route_url():
    """ URL page, for debugging parsing """
def make_client(self):
    """ Return a freshly constructed Indexer (a new instance on every call) """
    client = Indexer()
    return client
#!/usr/bin/env python

#
# This script empties and recreates the Elasticsearch indexes.
# Pass --delete on the command line to skip the interactive confirmation.
#

import sys
import os

# NOTE: insert(-1, ...) places the working directory just before the last
# sys.path entry, not at the very end.
sys.path.insert(-1, os.getcwd())

from cosrlib.indexer import Indexer

indexer = Indexer()

# --delete confirms without prompting; otherwise the user is asked and only
# the exact answer "y" counts as a yes.
confirmed = "--delete" in sys.argv
if not confirmed:
    answer = raw_input("Do you want to delete the current indices and all data? [y/N]")
    confirmed = (answer == "y")

if confirmed:
    indexer.empty()
    print("Reset done.")
#!/usr/bin/env python

#
# This script empties and recreates the Elasticsearch indexes.
#
# Usage: run with --delete to reset without being asked, or with no flag to
# get an interactive [y/N] confirmation first.
#

import sys
import os

# NOTE: a -1 index inserts before the last sys.path entry rather than appending.
sys.path.insert(-1, os.getcwd())

from cosrlib.indexer import Indexer

indexer = Indexer()

# The prompt is only shown when --delete is absent (short-circuit "or");
# anything other than a literal "y" leaves the indices untouched.
skip_prompt = "--delete" in sys.argv
if skip_prompt or raw_input(
        "Do you want to delete the current indices and all data? [y/N]") == "y":
    indexer.empty()
    print("Reset done.")
import requests

# Put the current working directory first on sys.path before the cosrlib
# imports below (presumably so cosrlib resolves from the checkout - confirm).
sys.path.insert(0, os.getcwd())

from cosrlib.document import load_document_type
from cosrlib.config import config
from cosrlib.searcher import Searcher
from cosrlib.indexer import Indexer

# Directory containing this module; used to locate the app's "static" and
# "templates" folders.
CURRENT_DIRECTORY = os.path.dirname(os.path.realpath(__file__))

app = Flask(
    "explainer",
    static_folder=os.path.join(CURRENT_DIRECTORY, "static"),
    template_folder=os.path.join(CURRENT_DIRECTORY, "templates"),
)

# Both clients are instantiated and connected at import time.
indexer = Indexer()
indexer.connect()

searcher = Searcher()
searcher.connect()


@app.route('/')
def route_search():
    """ Homepage, for debugging searches: serves search.html with an empty config """
    return render_template("search.html", config=dict())


@app.route('/url')
def route_url():
    """ URL page, for debugging parsing """