def get_common_vocab(in_dir, out_dir, out_file_name, in_suffix, years, n_vocab,
                     donor_path, receptor_path):
    # Intersect the vocabularies of all years, Bernoulli-sample roughly n_vocab
    # words from that common vocabulary, and always include the donor and
    # receptor words in the output list.
    common_vocab = None
    for year in years:
        col1, col2 = ioutils.load_word_pairs(in_dir + str(year) + in_suffix)
        file_vocab = set(col1)
        if common_vocab is None:
            common_vocab = file_vocab
        else:
            common_vocab = common_vocab & file_vocab

    # Keep each common word independently with probability n_vocab / |vocab|
    # (clamped to 1.0 in case n_vocab exceeds the vocabulary size), so the
    # sample size is only approximately n_vocab.
    p_keep = min(1.0, float(n_vocab) / len(common_vocab))
    data_bern = bernoulli.rvs(size=len(common_vocab), p=p_keep)
    common_vocab_list = list(common_vocab)
    random_common_vocab = set()
    for idx, keep in enumerate(data_bern):
        if keep == 1:
            random_common_vocab.add(common_vocab_list[idx])

    # Donor and receptor words must always be present in the output vocabulary.
    random_common_vocab = random_common_vocab.union(
        set(ioutils.load_word_list(donor_path)).union(
            ioutils.load_word_list(receptor_path)))
    ioutils.write_list(out_dir + out_file_name, list(random_common_vocab))
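# Illustrative sketch (not part of the pipeline; toy vocabulary and helper name are
# hypothetical): the Bernoulli sampling step above keeps each word independently
# with probability n_vocab / |vocab|, so the sampled vocabulary only approximates
# the requested size.
def _demo_bernoulli_vocab_sampling():
    from scipy.stats import bernoulli

    toy_vocab = ["alpha", "beta", "gamma", "delta", "epsilon", "zeta"]
    n_vocab = 3
    # Clamp p to 1.0 in case the requested sample is larger than the vocabulary.
    p = min(1.0, float(n_vocab) / len(toy_vocab))
    keep = bernoulli.rvs(size=len(toy_vocab), p=p)
    sampled = set(w for w, k in zip(toy_vocab, keep) if k == 1)
    print "kept", len(sampled), "of", len(toy_vocab), "words:", sampled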
def worker(proc_num, queue, out_dir, in_dir, count_dir, vocab_dir, sample=1e-5):
    while True:
        # Each worker drains years from the shared queue until it is empty.
        try:
            year = queue.get(block=False)
        except Empty:
            break
        print proc_num, "Getting counts and matrix year", year
        embed = Explicit.load(in_dir + str(year) + ".bin", normalize=False)
        freq = CachedFreqDist(ioutils.load_pickle(count_dir + str(year) + "-counts.pkl"))
        use_words = ioutils.load_word_list(vocab_dir + str(year) + ".vocab")
        embed = embed.get_subembed(use_words, restrict_context=True)

        # Scale the count matrix down by SAMPLE_MAX / N for years whose total
        # token count N exceeds SAMPLE_MAX (a module-level constant).
        sample_corr = min(SAMPLE_MAX / freq.N(), 1.0)
        print "Sample correction..", sample_corr
        embed.m = embed.m * sample_corr
        mat = embed.m.tocoo()

        print proc_num, "Outputting pairs for year", year
        with open(out_dir + str(year) + ".tmp.txt", "w") as fp:
            for i in xrange(len(mat.data)):
                if i % 10000 == 0:
                    print "Done ", i, "of", len(mat.data)
                word = embed.iw[mat.row[i]]
                context = embed.ic[mat.col[i]]
                # Word2vec-style subsampling: pairs involving frequent words and
                # contexts are written proportionally less often.
                if sample != 0:
                    prop_keep = min(np.sqrt(sample / freq.freq(word)), 1.0)
                    prop_keep *= min(np.sqrt(sample / freq.freq(context)), 1.0)
                else:
                    prop_keep = 1.0
                word = word.encode("utf-8")
                context = context.encode("utf-8")
                line = word + " " + context + "\n"
                # Repeat each pair according to its (downsampled) co-occurrence count.
                for j in xrange(int(np.ceil(mat.data[i] * prop_keep))):
                    fp.write(line)

        # Shuffle the pair file and remove the temporary unshuffled version.
        print "shuf " + out_dir + str(year) + ".tmp.txt" + " > " + out_dir + str(year) + ".txt"
        os.system("shuf " + out_dir + str(year) + ".tmp.txt" + " > " + out_dir + str(year) + ".txt")
        os.remove(out_dir + str(year) + ".tmp.txt")
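# Illustrative sketch (hypothetical helper, not used by the worker above): the keep
# probability follows the word2vec-style subsampling rule, keeping a (word, context)
# pair with probability min(sqrt(sample / f(word)), 1) * min(sqrt(sample / f(context)), 1),
# where f(.) is the relative frequency of the word in that year's counts.
def _demo_subsampling_prob(word_freq, context_freq, sample=1e-5):
    import numpy as np

    prop_keep = min(np.sqrt(sample / word_freq), 1.0)
    prop_keep *= min(np.sqrt(sample / context_freq), 1.0)
    return prop_keep

# e.g. a pair of very frequent words is heavily downsampled, a rare pair is always kept:
#   _demo_subsampling_prob(1e-2, 1e-3)  -> about 0.003
#   _demo_subsampling_prob(1e-6, 1e-6)  -> 1.0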
def main(args):
    lines = ioutils.load_word_list(args.filename)
    D = {}
    tag_set = set([])
    tb = Blobber(pos_tagger=PerceptronTagger())

    # Count, for every word, how often each POS tag is assigned to it.
    for i, line in enumerate(lines):
        b1 = tb(line)
        for w, t in b1.tags:
            tag_set.add(t)
            if w not in D:
                D[w] = Counter()
            D[w][t] += 1.0

    sorted_pos_tags = sorted(list(tag_set))
    rows = []
    most_common_rows = []
    for w in D.keys():
        row = [w]
        pos_counts_word = np.array([float(D[w][t]) for t in sorted_pos_tags])
        # Normalise the counts into a distribution over POS tags for this word.
        pos_dist_word = pos_counts_word / float(np.sum(pos_counts_word))
        assert np.isclose(np.sum(pos_dist_word), 1.0)
        row = row + list(pos_dist_word)
        rows.append(row)
        most_common_rows.append([
            w,
            np.max(pos_counts_word),
            sorted_pos_tags[np.argmax(pos_counts_word)]
        ])

    header = ['word'] + sorted_pos_tags
    print "Set of POS tags in sorted order", header
    df = pd.DataFrame.from_records(rows, columns=header)
    print "Dumping the POS distribution."
    df.to_csv(args.outputfile + ".csv", index=None, encoding='utf-8')
    print "Dumping most common pos tag"
    df2 = pd.DataFrame.from_records(most_common_rows, columns=['word', 'count', 'POS'])
    df2.to_csv(args.outputfile + "_pos.csv", index=None, encoding='utf-8')
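# Illustrative sketch (hypothetical helper, toy data in place of the tagger output):
# the per-word POS distribution written above is just a row-normalised count table.
def _demo_pos_distribution():
    import numpy as np
    from collections import Counter

    tagged = [("run", "VB"), ("run", "NN"), ("run", "VB"), ("dog", "NN")]
    counts = {}
    for w, t in tagged:
        counts.setdefault(w, Counter())[t] += 1
    tags = sorted(set(t for _, t in tagged))
    for w, ctr in counts.items():
        row = np.array([float(ctr[t]) for t in tags])
        # e.g. run ['NN', 'VB'] [0.333... 0.666...]
        print w, tags, row / row.sum()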
def create_comman_vocab(in_dir, ngram_file_suffix, out_dir, out_file_name, years, lang):
    # Build the vocabulary of words that occur in every year's n-gram file,
    # keeping only lower-cased alphabetic, non-stopword words longer than two characters.
    stop_set = set(stopwords.words(lang))
    common_vocab_set = set()
    for year in years:
        file_content_list = ioutils.load_word_list(in_dir + str(year) + ngram_file_suffix)
        words_set = set()
        for line in file_content_list:
            for w in line.split():
                w_lower = w.lower()
                if not w_lower.isalpha() or w_lower in stop_set or len(w) <= 2:
                    continue
                words_set.add(w_lower)
        # Intersect with the running common vocabulary; the first year seeds it.
        if year != years[0]:
            common_vocab_set = common_vocab_set.intersection(words_set)
        else:
            common_vocab_set = words_set
    ioutils.write_list(out_dir + out_file_name, list(common_vocab_set))
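# Illustrative sketch (hypothetical helper, toy per-year vocabularies): only words that
# pass the filters in *every* year survive, i.e. the intersection of the per-year sets.
def _demo_cross_year_intersection():
    per_year_vocab = {
        1900: set(["railway", "telegraph", "horse"]),
        1950: set(["railway", "television", "horse"]),
        1990: set(["railway", "internet", "horse"]),
    }
    common = None
    for year in sorted(per_year_vocab):
        vocab = per_year_vocab[year]
        common = vocab if common is None else common & vocab
    print common  # -> set(['horse', 'railway']) (order may vary)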
def get_precision_at_k(ref_list_path, ranked_list_path, k):
    # Fraction of the reference list recovered among the top-k entries of the
    # ranked list (note: the denominator is the size of the reference set, not k).
    ref_set = set(ioutils.load_word_list(ref_list_path))
    ranked_set = set(ioutils.load_word_list(ranked_list_path)[:k])
    return (len(ranked_set.intersection(ref_set)) * 1.0) / len(ref_set)
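# Illustrative sketch (hypothetical helper, toy lists instead of word-list files) of the
# score computed by get_precision_at_k.
def _demo_precision_at_k():
    ref = set(["a", "b", "c", "d"])
    ranked = ["a", "x", "b", "y", "z"]
    k = 3
    hits = set(ranked[:k]) & ref          # {"a", "b"}
    print (len(hits) * 1.0) / len(ref)    # 0.5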
parser.add_argument("--workers", type=int, help="Number of processes to spawn", default=8) parser.add_argument("--start-year", type=int, help="start year (inclusive)", default=1900) parser.add_argument("--end-year", type=int, help="end year (inclusive)", default=1990) parser.add_argument("--year-inc", type=int, help="year increment", default=10) args = parser.parse_args() years = range(args.start_year, args.end_year + 1, args.year_inc) ioutils.mkdir(args.out_dir) corpus = read_corpus_to_list(args.corpus_file_path) donor_list = ioutils.load_word_list(args.donor_list_path) donor_occuarances_map = get_donor_occurances(corpus, donor_list) run_parallel(args.workers, years, corpus, donor_list, donor_occuarances_map, ioutils.load_word_list(args.receptor_list_path), read_word_freq(args.word_freq_path), args.preplacement, args.out_dir + "/", args.out_suffix)