import arxiv arxiv = arxiv from arxiv.db_utils import db if __name__ == "__main__": cmd = sys.argv[1] if cmd == u"scrape": print(u"Scraping all the meta-data from the arxiv...") arxiv.get() if cmd == u"parse": print(u"Parsing the XML...") arxiv.parse() if cmd == u"build-vocab": print(u"Building the vocabulary list...") arxiv.build_vocab() if cmd == u"get-vocab": initial, N = 1000, 5000 if len(sys.argv) >= 3: initial = int(sys.argv[2]) elif len(sys.argv) >= 4: N = int(sys.argv[3]) arxiv.get_vocab(initial=initial, N=N) if cmd in [u"run", u"results"]:
def read_file(filename, inc):
    """Parse one file and yield it paired with its index.

    This is a generator: nothing is opened or parsed until it is first
    iterated. It yields exactly one ``(inc, tree)`` tuple.

    Args:
        filename: Path of the file to open (default text mode).
        inc: Value passed through unchanged as the first tuple element.

    Yields:
        ``(inc, tree)`` where ``tree`` is whatever ``arxiv.parse`` returns
        for the opened file object.
    """
    # ``with`` guarantees the handle is closed even if parsing raises; the
    # file is already closed by the time the result is yielded.
    with open(filename) as in_file:
        tree = arxiv.parse(in_file)
    yield inc, tree