def queue_from_file(fn):
    """Queue every user listed in the file *fn* for download.

    Each non-blank line of *fn* must hold a score, whitespace, and then a
    user name; the score is not used here.  For every user,
    ``twextract/request_queue.py`` is run as a subprocess with the
    ``config/ceres`` configuration, after changing the working directory
    to BASE_DIR.

    Raises subprocess.CalledProcessError if a queueing command exits with
    a non-zero status, and ValueError if a non-blank line has no user
    field.
    """
    from subprocess import check_call
    debug("Queueing users in %r for download" % (fn,))
    os.chdir(BASE_DIR)  # because twextract has git-root relative imports
    # NOTE(review): fn is opened *after* the chdir, so a relative fn is
    # resolved against BASE_DIR, not the caller's original directory.
    with open(fn) as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF)
                # instead of failing on the tuple unpacking below.
                continue
            _score, user = line.split(None, 1)
            # '-c' and its value are passed as two separate arguments; the
            # original adjacent literals were silently concatenated.
            check_call(['python', 'twextract/request_queue.py', user,
                        '-c', 'config/ceres'])
            time.sleep(0.001)
def bisect_the_hose(trigram, infile, goodfile, rejectfile, threshold):
    """Split the scored items of *infile* into a good and a reject file.

    *infile* is opened with open_maybe_gzip() and fed through
    ``trigram.hose_filter()``.  Each item it yields is a mapping with
    'score' and 'text' keys; items scoring at least *threshold* are
    written to *goodfile*, the rest to *rejectfile*, one
    "<score> <text>" line each.

    *goodfile* or *rejectfile* may be None, in which case that half of
    the output is discarded (written to os.devnull).  If *threshold* is a
    str it is first converted with ``trigram.probable_similarity()``.

    All three files are closed even if scoring or writing fails.
    """
    if goodfile is None:
        goodfile = os.devnull
    if rejectfile is None:
        rejectfile = os.devnull

    f = open_maybe_gzip(infile)
    try:
        fgood = open_maybe_gzip(goodfile, 'w')
        try:
            frej = open_maybe_gzip(rejectfile, 'w')
            try:
                if isinstance(threshold, str):
                    threshold = trigram.probable_similarity(threshold)
                    debug("threshold is", threshold)

                for d in trigram.hose_filter(f):
                    # Same line format either way; only the destination
                    # depends on the score.
                    out = fgood if d['score'] >= threshold else frej
                    out.write("%(score)5f %(text)s\n" % d)
            finally:
                frej.close()
        finally:
            fgood.close()
    finally:
        f.close()
def _build_option_parser():
    """Return the OptionParser describing this script's command line."""
    parser = OptionParser()
    parser.add_option("-m", "--trigram-mode",
                      help="how to trigrammise [%s]" % DEFAULT_MODE,
                      default=DEFAULT_MODE)
    parser.add_option("-c", "--recompile",
                      help="Derive trigrams from corpora",
                      action="store_true")
    parser.add_option("-C", "--recompile-all",
                      help="Derive trigrams for all modes and exit",
                      action="store_true")
    parser.add_option("-T", "--trial",
                      help="show scores of tweets, not users",
                      action="store_true")
    parser.add_option("-t", "--threshold",
                      help="use this as threshold",
                      default=str(DEFAULT_THRESHOLD),
                      metavar="(STRING|FLOAT)")
    parser.add_option("-i", "--input",
                      help="input file or directory", metavar="PATH")
    parser.add_option("-b", "--bad-file",
                      help="write rejects here", metavar="FILE")
    parser.add_option("-g", "--good-file",
                      help="write good ones here", metavar="FILE")
    parser.add_option("-d", "--dump-file",
                      help="write them all here, perhaps in order",
                      metavar="FILE")
    parser.add_option("-q", "--queue",
                      help="queue the good users for download",
                      action="store_true")
    parser.add_option("-Q", "--queue-from-file",
                      help="queue from a pre-existing list (no evaluation)",
                      metavar="FILE")
    parser.add_option("-r", "--report",
                      help="get statistical data on stderr",
                      action="store_true")
    parser.add_option("-f", "--offset-factor",
                      help="English unseen trigram probability factor",
                      type="float", default=TRIGRAM_OFFSET_FACTOR,
                      metavar="FLOAT")
    parser.add_option("-a", "--anti-offset-factor",
                      help="non-English unseen trigram probability factor",
                      type="float", default=ANTI_TRIGRAM_OFFSET_FACTOR,
                      metavar="FLOAT")
    return parser


def main():
    """Command-line entry point.

    Parses the options and then does one of the following:

    * with no arguments at all, prints the help and exits;
    * with --recompile-all, rebuilds the trigrams for every mode and exits;
    * with --queue-from-file, queues the users in that file and exits;
    * with --trial, scores individual tweets from the input (TEST_FILE_1
      by default), bisecting them into good/bad files and/or writing an
      ordered dump;
    * otherwise, if any of the good, bad or dump files was requested,
      groups the input (STASH_DIR by default) by user, then optionally
      reports, partitions, dumps and queues the users.

    The --threshold value is used as a float if it parses as one;
    otherwise it is treated as sample text and converted with the
    trigram model's probable_similarity().
    """
    parser = _build_option_parser()
    options, _args = parser.parse_args()

    if len(sys.argv) < 2:
        parser.print_help()
        sys.exit()

    if options.recompile_all:
        pre_cook(modes=MODES, corpi=CORPI + ANTI_CORPI)
        pre_cook_full(modes=MODES, corpi=CORPI + ANTI_CORPI)
        sys.exit()

    if options.queue_from_file:
        queue_from_file(options.queue_from_file)
        sys.exit()

    src = options.input
    good = options.good_file
    bad = options.bad_file
    dump = options.dump_file
    mode = options.trigram_mode

    if options.recompile:
        pre_cook_full(modes=[mode], corpi=CORPI + ANTI_CORPI)

    tg = get_trigram_with_antimodel(mode, tof=options.offset_factor,
                                    atof=options.anti_offset_factor)

    try:
        threshold = float(options.threshold)
    except ValueError:
        # Not a number: treat the option as sample text and derive the
        # threshold from how similar that text looks to the model.
        threshold = tg.probable_similarity(options.threshold)
        debug("Threshold from %r is %s" % (options.threshold, threshold))

    if options.trial:
        if src is None:
            src = TEST_FILE_1
        if good or bad:
            bisect_the_hose(tg, src, good, bad, threshold=threshold)
        if dump:
            order_the_hose(tg, src, dump)

    elif good or bad or dump:
        if src is None:
            src = STASH_DIR
        users = _group_by_user(tg, src)
        if options.report:
            users_report(users)
        if good or bad:
            partition_users(users, good, bad, threshold)
        # Truthiness (not "is not None") so an empty file name is skipped,
        # matching the test used in trial mode above.
        if dump:
            dump_users(users, dump)
        if good and options.queue:
            queue_from_file(good)

    else:
        debug("nothing much to do!")