def main():
    global start, stop, topN, SENTENCES, WORDS, ENTITIES, ys_list
    if MEM_DEBUG:
        tracemalloc.start(10)
    parser = argparse.ArgumentParser(
        description='Runs basic analytics over pre-sorted n-grams')
    parser.add_argument(dest="N", type=int,
                        help='Number of output phrases per metric')
    parser.add_argument(
        dest="grams", type=str, nargs="+",
        help="Gram types to include. Numerical or any of 'emnsuw'")
    parser.add_argument('-s', dest="sentences", action='store_const',
                        const=True, default=False, help='Examine sentences')
    parser.add_argument('-w', dest="words", action='store_const',
                        const=True, default=False, help='Examine words')
    parser.add_argument('-e', dest="entities", action='store_const',
                        const=True, default=False, help='Examine entities')
    util.add_arguments(parser)
    args = parser.parse_args()

    # Usage: analytics.py <N> <grams...> [-s] [-w] [-e]
    # Reports the top N phrases for each requested gram type; numeric grams
    # select n-gram lengths, letters select named metrics ('emnsuw').
    topN = args.N
    gram_list = []
    for gram in args.grams:
        try:
            # A numeric argument selects an n-gram length
            gram_list.append(int(gram))
        except ValueError:
            # Otherwise each character names a gram type
            for char in gram:
                if char not in "swenum":
                    raise Exception("Illegal gram: %s" % char)
                gram_list.append(char)
    util.process_arguments(args)
    util.CACHE_DB = False
    ys_list = list(util.iter_yearseason())
    do_analytics(gram_list)
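# Example invocation, shown as a hedged sketch (the metric letters follow the
# 'emnsuw' set accepted above; util.add_arguments/process_arguments supply the
# shared corpus options, so any flags they define apply here as well):
#
#   python analytics.py 50 3 4 5 sw
#
# reports the top 50 phrases for 3-, 4-, and 5-grams plus the sentence ('s')
# and word ('w') metrics.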
write_obj.append({"id": i, "domains": l}) i += 1 json.dump(write_obj, f) def make_dirs(intervals): for ys in intervals: Path("../data/text_sim/%s/" % ys).mkdir(parents=True, exist_ok=True) if __name__ == "__main__": parser = argparse.ArgumentParser(description='Find and flag duplicate documents') parser.add_argument('intervals', type=str, nargs='+', help='Which intervals to process over. "all" scans all intervals in sequence') parser.add_argument('--sample', dest="sample_fdd", action='store_const', const=True, default=False, help='Sample') util.add_arguments(parser) args = parser.parse_args() intervals = args.intervals SAMPLE = args.sample_fdd util.process_arguments(args) if intervals[0] == "all": intervals = list(util.iter_yearseason()) make_dirs(intervals) for interval in intervals: make_textsim_graph(interval) util.close_pool()
def main():
    global NO_PUNCTUATION
    if LOG_MEM:
        tracemalloc.start()
    logging.info("Starting at %s" % datetime.now().strftime("%H:%M:%S"))
    parser = argparse.ArgumentParser(
        description='Breaks documents into n-grams under a variety of filters')
    parser.add_argument('--start', dest="MIN", default=3, type=int,
                        help='Analyze n-grams with n>=start')
    parser.add_argument('--stop', dest="MAX", default=9, type=int,
                        help='Analyze n-grams with n<=stop')
    parser.add_argument(dest="intervals", type=str, nargs='+',
                        help='Intervals to collect n-grams over')
    parser.add_argument('-s', dest="sentences", action='store_const',
                        const=True, default=False, help='Examine sentences')
    parser.add_argument('-w', dest="words", action='store_const',
                        const=True, default=False, help='Examine words')
    parser.add_argument('-e', dest="entities", action='store_const',
                        const=True, default=False, help='Examine entities')
    util.add_arguments(parser)
    args = parser.parse_args()

    # Usage: --start <MIN> --stop <MAX> <intervals...> [-s] [-w] [-e]
    # Collects n-grams for each n in [MIN .. MAX]; -s adds sentences,
    # -w words, -e entities.
    start = args.MIN
    stop = args.MAX + 1
    yearseasons = args.intervals
    SENTENCES = args.sentences
    WORDS = args.words
    ENTITIES = args.entities
    util.process_arguments(args)
    NO_PUNCTUATION = util.NO_PUNCTUATION
    MERGE_SIMILAR = util.MERGE_SIMILAR
    clean = "_CL" if util.USE_CLEAN else ""
    np = "_NP" if NO_PUNCTUATION else ""

    global stopwords
    stopwords = set(nltk.corpus.stopwords.words('english'))
    cleaned_words = set(["_organization_", "_number_", "_url_", "_email_"])
    stopwords.update(cleaned_words)

    # Ensure an output directory exists for each requested interval
    for yearseason in yearseasons:
        os.makedirs("../data/%s/" % yearseason, exist_ok=True)

    gram_groups = [[n] for n in range(start, stop)]
    if SENTENCES:
        gram_groups.append(["s"])
    if WORDS:
        gram_groups.append(["w"])
    if ENTITIES:
        gram_groups.append(["e", "m", "u"])

    # Decide how much we're going to iterate
    if yearseasons[0] == "all":
        logging.info("Removing old data at %s" % datetime.now().strftime("%H:%M:%S"))
        ioutils.remove_grams()
        logging.info("Done removing old data at %s" % datetime.now().strftime("%H:%M:%S"))
        intervals = [t for t in util.iter_year_season()]
    else:
        intervals = []
        for yearseason in yearseasons:
            year = int(yearseason[:4])
            if len(yearseason) == 5:
                season = yearseason[4]
                intervals.append((year, season))
            elif len(yearseason) == 4:
                # A bare year expands to both of its seasons
                intervals.append((year, 'A'))
                intervals.append((year, 'B'))
            else:
                logging.error("Error on %s\n" % yearseason)
    generate_gram_list(gram_groups, intervals)

    #logging.info("Closing DB at %s " % datetime.now().strftime("%H:%M:%S"))
    #ioutils.close_db()
    #logging.info("Finished at %s" % datetime.now().strftime("%H:%M:%S"))
    if LOG_MEM:
        # tracemalloc._format_size is a private CPython helper; it renders
        # byte counts in human-readable form.
        print("Max memory usage:")
        print("Current: %s, Peak: %s" % tuple(
            tracemalloc._format_size(m, False)
            for m in tracemalloc.get_traced_memory()))
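# Example invocation, assuming this script is saved as gram_gen.py
# (hypothetical name):
#
#   python gram_gen.py --start 3 --stop 5 2019A 2020 -s -e
#
# collects 3- through 5-grams, sentences, and entity grams over 2019 season A
# and both seasons of 2020; passing "all" as the interval removes old gram
# data and iterates every interval via util.iter_year_season().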