# requires: from scipy.stats import bernoulli; ioutils is the project's word-list / word-pair I/O helper
def get_common_vocab(in_dir, out_dir, out_file_name, in_suffix, years, n_vocab, donor_path, receptor_path):
    # Intersect the vocabulary (first column of each year's word-pair file) across all years.
    common_vocab = None
    for year in years:
        col1, col2 = ioutils.load_word_pairs(in_dir + str(year) + in_suffix)
        file_vocab = set(col1)
        if common_vocab is None:
            common_vocab = file_vocab
        else:
            common_vocab = common_vocab & file_vocab
    # Keep each common word independently with probability n_vocab / |common_vocab|,
    # so roughly n_vocab words survive in expectation.
    data_bern = bernoulli.rvs(size=len(common_vocab), p=float(n_vocab) / len(common_vocab))

    common_vocab_list = list(common_vocab)
    random_common_vocab = set()
    for idx, keep in enumerate(data_bern):
        if keep == 1:
            random_common_vocab.add(common_vocab_list[idx])
    # Always include the donor and receptor word lists in the sampled vocabulary.
    random_common_vocab = random_common_vocab.union(set(ioutils.load_word_list(donor_path)).union(ioutils.load_word_list(receptor_path)))
    ioutils.write_list(out_dir + out_file_name, list(random_common_vocab))
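A hypothetical call, assuming per-year pair files such as data/1900-pairs.txt and plain-text donor/receptor word lists (all paths, the suffix, and the sample size are placeholders):

years = range(1900, 2000, 10)
get_common_vocab(in_dir="data/", out_dir="out/", out_file_name="common_vocab.txt",
                 in_suffix="-pairs.txt", years=years, n_vocab=10000,
                 donor_path="lists/donors.txt", receptor_path="lists/receptors.txt")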
Example #2
# requires: import os; import numpy as np; from queue import Empty;
# Explicit, CachedFreqDist, SAMPLE_MAX and ioutils come from the surrounding project.
def worker(proc_num, queue, out_dir, in_dir, count_dir, vocab_dir, sample=1e-5):
    while True:
        try:
            year = queue.get(block=False)
        except Empty:
            # No more years to process.
            break
        print(proc_num, "Getting counts and matrix year", year)
        embed = Explicit.load(in_dir + str(year) + ".bin", normalize=False)
        freq = CachedFreqDist(ioutils.load_pickle(count_dir + str(year) + "-counts.pkl"))
        use_words = ioutils.load_word_list(vocab_dir + str(year) + ".vocab")
        embed = embed.get_subembed(use_words, restrict_context=True)
        # Rescale counts so the total mass does not exceed SAMPLE_MAX.
        sample_corr = min(SAMPLE_MAX / freq.N(), 1.0)
        print("Sample correction:", sample_corr)
        embed.m = embed.m * sample_corr
        mat = embed.m.tocoo()
        print(proc_num, "Outputting pairs for year", year)
        with open(out_dir + str(year) + ".tmp.txt", "w", encoding="utf-8") as fp:
            for i in range(len(mat.data)):
                if i % 10000 == 0:
                    print("Done", i, "of", len(mat.data))
                word = embed.iw[mat.row[i]]
                context = embed.ic[mat.col[i]]
                if sample != 0:
                    # Word2vec-style subsampling: down-weight very frequent words and contexts.
                    prop_keep = min(np.sqrt(sample / freq.freq(word)), 1.0)
                    prop_keep *= min(np.sqrt(sample / freq.freq(context)), 1.0)
                else:
                    prop_keep = 1.0
                line = word + " " + context + "\n"
                # Emit each (word, context) pair once per (subsampled) co-occurrence count.
                for j in range(int(np.ceil(mat.data[i] * prop_keep))):
                    fp.write(line)
        # Shuffle the pair file so downstream training sees pairs in random order.
        print("shuf " + out_dir + str(year) + ".tmp.txt" + " > " + out_dir + str(year) + ".txt")
        os.system("shuf " + out_dir + str(year) + ".tmp.txt" + " > " + out_dir + str(year) + ".txt")
        os.remove(out_dir + str(year) + ".tmp.txt")
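For context, a minimal launcher for this worker, assuming the standard library's multiprocessing module and a queue pre-filled with years (the project's own driver may differ), could look like:

from multiprocessing import Process, Queue

def run_workers_sketch(num_procs, years, out_dir, in_dir, count_dir, vocab_dir):
    # Hypothetical driver: one shared queue of years, num_procs worker processes.
    q = Queue()
    for year in years:
        q.put(year)
    procs = [Process(target=worker, args=(i, q, out_dir, in_dir, count_dir, vocab_dir))
             for i in range(num_procs)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()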
Example #3
# requires: import numpy as np; import pandas as pd; from collections import Counter;
# from textblob import Blobber. PerceptronTagger and ioutils come from the surrounding
# project (their imports are not shown in this snippet).
def main(args):
    lines = ioutils.load_word_list(args.filename)
    D = {}  # word -> Counter of POS tag counts
    tag_set = set()
    tb = Blobber(pos_tagger=PerceptronTagger())
    for i, line in enumerate(lines):
        b1 = tb(line)
        for w, t in b1.tags:
            tag_set.add(t)
            if w not in D:
                D[w] = Counter()
            # Count how often each POS tag is assigned to this word.
            D[w][t] += 1.0

    sorted_pos_tags = sorted(list(tag_set))
    rows = []
    most_common_rows = []
    for w in D.keys():
        row = [w]
        pos_counts_word = np.array([float(D[w][t]) for t in sorted_pos_tags])
        pos_dist_word = pos_counts_word / float(np.sum(pos_counts_word))
        assert (np.isclose(np.sum(pos_dist_word), 1.0))
        row = row + list(pos_dist_word)
        rows.append(row)
        most_common_rows.append([
            w,
            np.max(pos_counts_word),
            sorted_pos_tags[np.argmax(pos_counts_word)]
        ])

    header = ['word'] + sorted_pos_tags
    print "Set of POS tags in sorted order", header
    df = pd.DataFrame().from_records(rows, columns=header)
    print "Dumping the POS distribution."
    df.to_csv(args.outputfile + ".csv", index=None, encoding='utf-8')
    print "Dumping most common pos tag"
    df2 = pd.DataFrame().from_records(most_common_rows,
                                      columns=['word', 'count', 'POS'])
    df2.to_csv(args.outputfile + "_pos.csv", index=None, encoding='utf-8')
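main expects an args object exposing filename and outputfile; a plausible command-line wrapper (the argument names are inferred from the attributes used above) is:

import argparse

if __name__ == "__main__":
    # Hypothetical CLI wrapper for the snippet above.
    parser = argparse.ArgumentParser(description="Per-word POS tag distributions")
    parser.add_argument("filename", help="input file, one line of text per entry")
    parser.add_argument("outputfile", help="output path prefix for the generated CSV files")
    main(parser.parse_args())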
Example #4
# requires: from nltk.corpus import stopwords; ioutils is the project's word-list I/O helper
def create_comman_vocab(in_dir, ngram_file_suffix, out_dir, out_file_name,
                        years, lang):
    stop_set = set(stopwords.words(lang))
    common_vocab_set = set()

    for year in years:
        file_content_list = ioutils.load_word_list(in_dir + str(year) +
                                                   ngram_file_suffix)
        words_set = set()
        for line in file_content_list:
            for w in line.split():
                w_lower = w.lower()
                # Keep alphabetic, non-stopword tokens longer than two characters.
                if not w_lower.isalpha() or w_lower in stop_set or (
                        w_lower in words_set) or len(w) <= 2:
                    continue
                words_set.add(w_lower)

        if year != years[0]:
            common_vocab_set = common_vocab_set.intersection(words_set)
        else:
            # The first year initializes the running intersection.
            common_vocab_set = words_set
    ioutils.write_list(out_dir + out_file_name, list(common_vocab_set))
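A hypothetical call, assuming per-year n-gram files such as ngrams/1900.txt and that NLTK's stopword corpus has been downloaded (nltk.download('stopwords')); all paths are placeholders:

years = range(1900, 2000, 10)
create_comman_vocab(in_dir="ngrams/", ngram_file_suffix=".txt",
                    out_dir="out/", out_file_name="common_vocab.txt",
                    years=years, lang="english")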
Example #5
def get_precision_at_k(ref_list_path, ranked_list_path, k):
    ref_set = set(ioutils.load_word_list(ref_list_path))
    ranked_set = set(ioutils.load_word_list(ranked_list_path)[:k])
    # Fraction of the reference set recovered in the top k of the ranked list
    # (dividing by len(ref_set) rather than k makes this recall@k in the usual terminology).
    return len(ranked_set.intersection(ref_set)) / float(len(ref_set))
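A hypothetical evaluation loop over a few cutoffs for get_precision_at_k (both file paths are placeholders):

for k in (10, 50, 100):
    score = get_precision_at_k("eval/reference_words.txt", "out/ranked_words.txt", k)
    print("k =", k, "->", score)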
    parser.add_argument("--workers",
                        type=int,
                        help="Number of processes to spawn",
                        default=8)
    parser.add_argument("--start-year",
                        type=int,
                        help="start year (inclusive)",
                        default=1900)
    parser.add_argument("--end-year",
                        type=int,
                        help="end year (inclusive)",
                        default=1990)
    parser.add_argument("--year-inc",
                        type=int,
                        help="year increment",
                        default=10)
    args = parser.parse_args()

    years = range(args.start_year, args.end_year + 1, args.year_inc)
    ioutils.mkdir(args.out_dir)

    corpus = read_corpus_to_list(args.corpus_file_path)
    donor_list = ioutils.load_word_list(args.donor_list_path)
    donor_occuarances_map = get_donor_occurances(corpus, donor_list)

    run_parallel(args.workers, years, corpus, donor_list,
                 donor_occuarances_map,
                 ioutils.load_word_list(args.receptor_list_path),
                 read_word_freq(args.word_freq_path), args.preplacement,
                 args.out_dir + "/", args.out_suffix)