Exemplo n.º 1
0
def queue_from_file(fn):
    """Queue every user listed in the file *fn* for download.

    Each non-blank line is split on its first run of whitespace into a
    score and a user; the score is ignored and
    ``twextract/request_queue.py`` is run once per user.

    *fn* is resolved against the working directory in effect when this
    function is called. As a side effect the process working directory
    is changed to BASE_DIR and left there.

    Raises:
        ValueError: a non-blank line does not contain two fields.
        subprocess.CalledProcessError: the queueing script exits non-zero.
    """
    from subprocess import check_call
    debug("Queueing users in %r for download" % (fn,))
    # Pin the path down before changing directory, otherwise a relative
    # *fn* would be looked up under BASE_DIR instead of where the caller
    # meant it.
    path = os.path.abspath(fn)
    os.chdir(BASE_DIR)  # because twextract has git-root relative imports
    with open(path) as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline).
                continue
            _score, user = line.split(None, 1)
            # '-c' and its value are passed as two separate arguments; the
            # original relied on accidental string-literal concatenation.
            check_call(['python', 'twextract/request_queue.py', user,
                        '-c', 'config/ceres'])
            time.sleep(0.001)
Exemplo n.º 2
0
def bisect_the_hose(trigram, infile, goodfile, rejectfile, threshold):
    """Split the scored items from *infile* into a good and a reject file.

    Every item yielded by ``trigram.hose_filter`` is written as
    ``"<score> <text>"`` to *goodfile* when its score is at least
    *threshold*, and to *rejectfile* otherwise.

    Args:
        trigram: object providing ``hose_filter(file)`` (yielding mappings
            with ``'score'`` and ``'text'`` keys) and
            ``probable_similarity(str)``.
        infile: path of the input, opened with ``open_maybe_gzip``.
        goodfile: output path for accepted items; None discards them
            (``os.devnull``).
        rejectfile: output path for rejected items; None discards them
            (``os.devnull``).
        threshold: numeric cut-off, or a string that is turned into a
            cut-off with ``trigram.probable_similarity``.
    """
    from contextlib import closing

    if goodfile is None:
        goodfile = os.devnull
    if rejectfile is None:
        rejectfile = os.devnull

    # closing() only needs a .close() method, which the original code
    # already relied on, and guarantees the handles are released even if
    # scoring or writing raises part-way through.
    with closing(open_maybe_gzip(infile)) as f:
        with closing(open_maybe_gzip(goodfile, 'w')) as fgood:
            with closing(open_maybe_gzip(rejectfile, 'w')) as frej:
                if isinstance(threshold, str):
                    threshold = trigram.probable_similarity(threshold)
                    debug("threshold is", threshold)

                for d in trigram.hose_filter(f):
                    out = fgood if d['score'] >= threshold else frej
                    out.write("%(score)5f %(text)s\n" % d)
Exemplo n.º 3
0
def main():
    """Command-line entry point: parse options and run the requested job.

    Jobs are selected in this order of precedence:

    1. No command-line arguments: print the help text and exit.
    2. ``--recompile-all``: pre-cook trigrams for every mode, then exit.
    3. ``--queue-from-file FILE``: queue the users listed in FILE, then exit.
    4. ``--trial``: work on the input (default TEST_FILE_1), writing
       good/bad files and/or a dump file if those were requested.
    5. Any of ``--good-file`` / ``--bad-file`` / ``--dump-file``: group the
       input (default STASH_DIR) by user, then optionally report,
       partition, dump and queue.
    6. Otherwise: report that there is nothing to do.

    ``--threshold`` may be a float, or any other string, which is converted
    to a score with the trigram model's ``probable_similarity``.
    """
    parser = OptionParser()
    parser.add_option("-m", "--trigram-mode", help="how to trigrammise [%s]" % DEFAULT_MODE,
                      default=DEFAULT_MODE)
    parser.add_option("-c", "--recompile", help="Derive trigrams from corpora", action="store_true")
    parser.add_option("-C", "--recompile-all", help="Derive trigrams for all modes and exit",
                      action="store_true")

    parser.add_option("-T", "--trial", help="show scores of tweets, not users", action="store_true")

    parser.add_option("-t", "--threshold", help="use this as threshold",
                      default=str(DEFAULT_THRESHOLD), metavar="(STRING|FLOAT)")

    parser.add_option("-i", "--input", help="input file or directory", metavar="PATH")
    parser.add_option("-b", "--bad-file", help="write rejects here", metavar="FILE")
    parser.add_option("-g", "--good-file", help="write good ones here", metavar="FILE")
    parser.add_option("-d", "--dump-file", help="write them all here, perhaps in order", metavar="FILE")
    parser.add_option("-q", "--queue", help="queue the good users for download", action="store_true")
    parser.add_option("-Q", "--queue-from-file",
                      help="queue from a pre-existing list (no evaluation)", metavar="FILE")
    parser.add_option("-r", "--report", help="get statistical data on stderr", action="store_true")

    parser.add_option("-f", "--offset-factor", help="English unseen trigram probablility factor",
                      type="float", default=TRIGRAM_OFFSET_FACTOR, metavar="FLOAT")
    parser.add_option("-a", "--anti-offset-factor", help="non-English unseen trigram probablility factor",
                      type="float", default=ANTI_TRIGRAM_OFFSET_FACTOR, metavar="FLOAT")

    (options, args) = parser.parse_args()

    # Invoked with no arguments at all: show usage rather than silently
    # falling through to "nothing much to do".
    if len(sys.argv) < 2:
        parser.print_help()
        sys.exit()

    # Stand-alone jobs that exit as soon as they are done.
    if options.recompile_all:
        pre_cook(modes=MODES, corpi=CORPI + ANTI_CORPI)
        pre_cook_full(modes=MODES, corpi=CORPI + ANTI_CORPI)
        sys.exit()

    if options.queue_from_file:
        queue_from_file(options.queue_from_file)
        sys.exit()

    src = options.input
    good = options.good_file
    bad = options.bad_file
    dump = options.dump_file

    mode = options.trigram_mode

    if options.recompile:
        pre_cook_full(modes=[mode], corpi=CORPI + ANTI_CORPI)

    tg = get_trigram_with_antimodel(mode, tof=options.offset_factor,
                                    atof=options.anti_offset_factor)

    # A threshold that does not parse as a float is handed to the model,
    # which turns the string into a score.
    try:
        threshold = float(options.threshold)
    except ValueError:
        threshold = tg.probable_similarity(options.threshold)
        debug("Threshold from %r is %s" %(options.threshold, threshold))

    if options.trial:
        if src is None:
            src = TEST_FILE_1
        if good or bad:
            bisect_the_hose(tg, src, good, bad, threshold=threshold)
        if dump:
            order_the_hose(tg, src, dump)

    elif good or bad or dump:
        if src is None:
            src = STASH_DIR
        users = _group_by_user(tg, src)
        if options.report:
            users_report(users)
        if good or bad:
            partition_users(users, good, bad, threshold)
        # Truthiness test, matching the branch guard and the trial branch,
        # so an empty --dump-file value is never used as a path.
        if dump:
            dump_users(users, dump)
        if options.queue:
            if good:
                queue_from_file(good)
            else:
                # Queueing reads the good file, so without one there is
                # nothing to queue; say so instead of ignoring the flag.
                debug("--queue ignored: no --good-file to queue from")
    else:
        debug("nothing much to do!")