Example #1
def merge(out_pref, years, word_list):
    vol_yearstats = {}
    disp_yearstats = {}
    for word in word_list:
        vol_yearstats[word] = {}
        disp_yearstats[word] = {}
    for year in years:
        vol_yearstat = ioutils.load_pickle(out_pref + str(year) + "-vols.pkl")
        disp_yearstat = ioutils.load_pickle(out_pref + str(year) +
                                            "-disps.pkl")
        for word in word_list:
            if word not in vol_yearstat:
                vol = float('nan')
            else:
                vol = vol_yearstat[word]
            if word not in disp_yearstat:
                disp = float('nan')
            else:
                disp = disp_yearstat[word]
            vol_yearstats[word][year] = vol
            disp_yearstats[word][year] = disp
        os.remove(out_pref + str(year) + "-vols.pkl")
        os.remove(out_pref + str(year) + "-disps.pkl")
    ioutils.write_pickle(vol_yearstats, out_pref + "-vols.pkl")
    ioutils.write_pickle(disp_yearstats, out_pref + "-disps.pkl")
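
All of these examples lean on a small ioutils helper module. A minimal stand-in, assuming it is a thin wrapper over the standard pickle module (hypothetical sketch; the real helpers ship with the source repository):

import pickle

def load_pickle(path):
    # Read one pickled object from disk.
    with open(path, "rb") as f:
        return pickle.load(f)

def write_pickle(obj, path):
    # Serialize obj to disk, overwriting any existing file.
    with open(path, "wb") as f:
        pickle.dump(obj, f)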
Example #2
def merge_year_counts(out_dir, name_list, years):
    for year in years:
        year_counts = {}
        year_doc_counts = {}
        year_pos = {}

        for name in name_list:
            tmp_year_counts = ioutils.load_pickle(out_dir + "/" + name + "/" + str(year) + "-counts.pkl")
            tmp_year_doc_counts = ioutils.load_pickle(out_dir + "/" + name + "/" + str(year) + "-doc_counts.pkl")
            tmp_year_pos = ioutils.load_pickle(out_dir + "/" + name + "/" + str(year) + "-pos.pkl")
            for word, count in tmp_year_counts.iteritems():
                if not word in year_counts:
                    year_counts[word] = 0
                    year_doc_counts[word] = 0
                    year_pos[word] = collections.Counter()
                year_counts[word] += count
                year_doc_counts[word] += tmp_year_doc_counts[word]
                year_pos[word].update(tmp_year_pos[word])  # Counter.update adds counts

        print "Writing merged counts for " + str(year) + " ..."
        ioutils.write_pickle(year_counts, out_dir + str(year) + "-counts.pkl")
        ioutils.write_pickle(year_doc_counts, out_dir + str(year) + "-doc_counts.pkl")
        ioutils.write_pickle(year_pos, out_dir + str(year) + "-pos.pkl")

    print "Deleting temp dirs ..."
    remove_tmp_dirs(out_dir, name_list)
Example #3
def main(proc_num, queue, out_dir, in_dir):
    merged_index = ioutils.load_pickle(out_dir + "merged_index.pkl")
    print proc_num, "Start loop"
    while True:
        try:
            year = queue.get(block=False)
        except Empty:
            print proc_num, "Finished"
            break
        print proc_num, "Fixing counts for year", year
        fixed_counts = {}
        old_mat = sparse_io.retrieve_mat_as_dict(in_dir + str(year) + ".bin")
        old_index = ioutils.load_pickle(in_dir + str(year) + "-list.pkl")
        for pair, count in old_mat.iteritems():
            try:
                i_word = old_index[pair[0]]
            except IndexError:
                print pair
                sys.exit(0)
            c_word = old_index[pair[1]]
            new_pair = (indexing.word_to_static_id(i_word, merged_index),
                        indexing.word_to_static_id(c_word, merged_index))
            fixed_counts[new_pair] = count

        print proc_num, "Writing counts for year", year
        sparse_io.export_mats_from_dicts({str(year): fixed_counts}, out_dir)
Example #4
def make_word_list(type):
    process_word = lambda word: word if type != "lemma_pos" else word.split(
        "_")[0]
    freqs = load_pickle(FREQS.format(type=type))
    word_lists = {}
    nstop_lists = {}
    nproper_lists = {}
    nstop_nproper_lists = {}
    print "Processing type: ", type
    proper_nouns = load_pickle(PROPER_NOUNS)
    word_lists = [
        word for word in sorted(freqs, key=lambda val: -1 * freqs[val])
        if word != "" and word.isalnum()
    ]
    nstop_lists = [
        word for word in sorted(freqs, key=lambda val: -1 * freqs[val])
        if not process_word(word) in STOPWORDS
        if word != "" and word.isalnum()
    ]
    nproper_lists = [
        word for word in sorted(freqs, key=lambda val: -1 * freqs[val])
        if not process_word(word) in proper_nouns
        if word != "" and word.isalnum()
    ]
    nstop_nproper_lists = [
        word for word in sorted(freqs, key=lambda val: -1 * freqs[val])
        if not process_word(word) in proper_nouns
        and not process_word(word) in STOPWORDS
        if word != "" and word.isalnum()
    ]
    write_pickle(word_lists, OUT.format(type=type, cond="all"))
    write_pickle(nstop_lists, OUT.format(type=type, cond="nstop"))
    write_pickle(nproper_lists, OUT.format(type=type, cond="nproper"))
    write_pickle(nstop_nproper_lists,
                 OUT.format(type=type, cond="nstop_nproper"))
Example #5
def main(proc_num, queue, out_dir, in_dir):
    merged_index = ioutils.load_pickle(out_dir + "merged_index.pkl") 
    print proc_num, "Start loop"
    while True:
        try: 
            year = queue.get(block=False)
        except Empty:
            print proc_num, "Finished"
            break
        print proc_num, "Fixing counts for year", year
        fixed_counts = {}
        old_mat = matstore.retrieve_mat_as_dict(in_dir + str(year) + ".bin")
        old_index = ioutils.load_pickle(in_dir + str(year) + "-list.pkl") 
        for pair, count in old_mat.iteritems():
            try:
                i_word = old_index[pair[0]]
            except IndexError:
                print pair
                sys.exit(0)
            c_word = old_index[pair[1]]
            new_pair = (indexing.word_to_static_id(i_word, merged_index), 
                    indexing.word_to_static_id(c_word, merged_index))
            fixed_counts[new_pair] = count
        
        print proc_num, "Writing counts for year", year
        matstore.export_mats_from_dicts({str(year) : fixed_counts}, out_dir)
Example #6
def main(proc_num, queue, out_dir, in_dir):
    merged_index = ioutils.load_pickle(out_dir + "merged_index.pkl")
    print proc_num, "Start loop"
    while True:  # Iterates through the years
        try:
            year = queue.get(block=False)
        except Empty:
            print proc_num, "Finished"
            break
        print proc_num, "Fixing counts for year", year
        fixed_counts = {}  # This is the new co-occurrence matrix
        old_mat = sparse_io.retrieve_mat_as_coo(in_dir + str(year) +
                                                ".bin").todok()
        old_index = ioutils.load_pickle(in_dir + str(year) + "-list.pkl")
        for pair, count in old_mat.iteritems(
        ):  # Iterates through the unmerged co-occurrence matrix ...
            try:
                i_word = old_index[pair[0]]
            except IndexError:
                print pair
                sys.exit(0)
            c_word = old_index[pair[1]]
            try:
                new_pair = (indexing.word_to_static_id(i_word, merged_index),
                            indexing.word_to_static_id(c_word, merged_index))
            except KeyError:  # Skips words dropped from the merged index
                continue
            fixed_counts[
                new_pair] = count  # ... and adds the count to the new matrix

        print proc_num, "Writing counts for year", year  # Saves the new co-occurrence matrices
        sparse_io.export_mat_from_dict(fixed_counts,
                                       out_dir + str(year) + ".bin")
Example #7
def merge(out_pref, tmp_dir, years):
    net_stats = collections.defaultdict(dict)
    rewire_net_stats = collections.defaultdict(dict)
    for year in years:
        year_stats = ioutils.load_pickle(tmp_dir + str(year) + "-tmp.pkl")
        rewire_year_stats = ioutils.load_pickle(tmp_dir + "rewire" + str(year) + "-tmp.pkl")
        for stat, val in year_stats.iteritems():
            net_stats[stat][year] = val
        for stat, val in rewire_year_stats.iteritems():
            rewire_net_stats[stat][year] = val
        os.remove(tmp_dir + str(year) + "-tmp.pkl")
        os.remove(tmp_dir + "rewire" + str(year) + "-tmp.pkl")
    for stat, year_vals in net_stats.iteritems():
        ioutils.write_pickle(year_vals, out_pref + "-" + stat + ".pkl")
    for stat, year_vals in rewire_net_stats.iteritems():
        ioutils.write_pickle(year_vals, out_pref + "-rw-" + stat + ".pkl")
Example #8
def worker(proc_num, queue, out_dir, in_dir):
    while True: # Iterates through the decades
        try:
            decade = queue.get(block=False)
        except Empty:
            break

        print "Processing decade", decade
        for year in range(10): # Iterates through the years of the individual decade
            year_counts = load_pickle(in_dir + str(decade + year) + "-pos.pkl")
            if year == 0:
                merged_pos_counts = year_counts # running totals of word occurrences, broken down by POS tag
            else:
                for word, pos_counts in year_counts.iteritems(): # Iterates through the words and adds their counts to the merged totals
                    for pos, count in pos_counts.iteritems():
                        if not word in merged_pos_counts:
                            merged_pos_counts[word] = collections.Counter()
                        merged_pos_counts[word][pos] += count
        maj_tags = {} # Classifies each word by its majority POS tag
        for word, pos_counts in merged_pos_counts.iteritems(): 
            if len(pos_counts) < 1:
                continue
            max_label = sorted(pos_counts, key= lambda w : pos_counts[w], reverse=True)[0]
            if pos_counts[max_label] > 0.5 * np.sum(pos_counts.values()):
                maj_tags[word] = max_label
            else:
                maj_tags[word] = "AMB"
        write_pickle(merged_pos_counts, out_dir + str(decade) + "-pos_counts.pkl") # Saves the counts
        write_pickle(maj_tags, out_dir + str(decade) + "-pos.pkl") # Saves the maj_tags
Example #9
def worker(proc_num, queue, dir, count_dir, min_count):
    while True:
        if queue.empty():
            break
        year = queue.get()
        print("Loading data..", year)
        #        time.sleep(120 * random.random())
        freqs = load_pickle(count_dir + str(year) + "-counts.pkl")
        iw = []
        with open(dir + str(year) + "-w.txt") as fp:
            info = fp.readline().split()
            vocab_size = int(info[0])
            dim = int(info[1])
            w_mat = np.zeros((vocab_size, dim))
            for i, line in enumerate(fp):
                line = line.strip().split()
                iw.append(line[0])  # lines are already text in Python 3; no decode needed
                if freqs[iw[-1]] >= 500:
                    w_mat[i, :] = np.array(list(map(float, line[1:])))
        c_mat = np.zeros((vocab_size, dim))
        with open(dir + str(year) + "-c.txt") as fp:
            fp.readline()
            for i, line in enumerate(fp):
                line = line.strip().split()
                if freqs[line[0]] >= min_count:
                    c_mat[i, :] = np.array(list(map(float, line[1:])))
        np.save(dir + str(year) + "-w.npy", w_mat)
        np.save(dir + str(year) + "-c.npy", c_mat)
        write_pickle(iw, dir + str(year) + "-vocab.pkl")
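
The -w.txt and -c.txt files parsed here follow the word2vec text layout the readers assume: a header line giving vocabulary size and vector dimensionality, then one word and its vector components per line. A 3-word, 2-dimensional file, for instance:

3 2
the 0.12 -0.05
cat 0.33 0.91
sat -0.47 0.18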
Example #10
def worker(proc_num, queue, out_dir, in_dir):
    while True:
        try:
            decade = queue.get(block=False)
        except Empty:
            break

        print "Processing decade", decade
        counts = collections.defaultdict(int)
        for year in range(10):
            embed = Explicit.load(in_dir + str(decade + year) + ".bin", normalize=False)
            if year == 0:
                merged_index = embed.wi
            year_list = load_pickle(in_dir + str(decade + year) + "-list.pkl")
            mat = embed.m.tocoo()
            for i in xrange(len(mat.data)):
                if mat.data[i] == 0:
                    continue
                new_row = get_index(merged_index, year_list, mat.row[i])
                new_col = get_index(merged_index, year_list, mat.col[i])
                counts[(new_row, new_col)] += mat.data[i]
            print "Done year ", decade + year
        export_mat_from_dict(counts, decade, out_dir)
        write_pickle(merged_index, out_dir + str(decade) + "-index.pkl")
        write_pickle(list(merged_index), out_dir + str(decade) + "-list.pkl")
Example #11
def worker(proc_num, queue, out_dir, in_dir):
    while True:
        try:
            decade = queue.get(block=False)
        except Empty:
            break

        print("Processing decade", decade)
        for year in range(10):
            year_counts = load_pickle(in_dir + str(decade + year) + "-pos.pkl")
            if year == 0:
                merged_pos_counts = year_counts
            else:  # the else prevents year 0 from being added twice
                for word, pos_counts in year_counts.items():
                    for pos, count in pos_counts.items():
                        if word not in merged_pos_counts:
                            merged_pos_counts[word] = collections.Counter()
                        merged_pos_counts[word][pos] += count
        maj_tags = {}
        for word, pos_counts in merged_pos_counts.items():
            if len(pos_counts) < 1:
                continue
            max_label = sorted(pos_counts, key= lambda w : pos_counts[w], reverse=True)[0]
            if pos_counts[max_label] > 0.5 * np.sum(list(pos_counts.values())):
                maj_tags[word] = max_label
            else:
                maj_tags[word] = "AMB"
        write_pickle(merged_pos_counts, out_dir + str(decade) + "-pos_counts.pkl")
        write_pickle(maj_tags, out_dir + str(decade) + "-pos.pkl")
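
The majority-tag rule is easy to check in isolation: a word keeps its most frequent POS tag only if that tag accounts for more than half of all its occurrences; otherwise it is labeled ambiguous. A minimal demonstration with toy counts:

import collections

pos_counts = collections.Counter({"nn": 60, "vb": 30, "jj": 10})
max_label = sorted(pos_counts, key=lambda w: pos_counts[w], reverse=True)[0]
if pos_counts[max_label] > 0.5 * sum(pos_counts.values()):
    tag = max_label  # "nn" wins: 60 of 100 occurrences is a clear majority
else:
    tag = "AMB"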
Example #12
def merge(out_pref, tmp_out_pref, years, word_list):
    vol_yearstats = {}
    disp_yearstats = {}
    for word in word_list:
        vol_yearstats[word] = {}
        disp_yearstats[word] = {}
    for year in years:
        vol_yearstat = ioutils.load_pickle(tmp_out_pref + str(year) + "-jvols.pkl")
        disp_yearstat = ioutils.load_pickle(tmp_out_pref + str(year) + "-jdisps.pkl")
        for word in word_list:
            vol_yearstats[word][year] = vol_yearstat[word]
            disp_yearstats[word][year] = disp_yearstat[word]
        os.remove(tmp_out_pref + str(year) + "-jvols.pkl")
        os.remove(tmp_out_pref + str(year) + "-jdisps.pkl")
    ioutils.write_pickle(vol_yearstats, out_pref + "-jvols.pkl")
    ioutils.write_pickle(disp_yearstats, out_pref + "-jdisps.pkl")
Example #13
def worker(proc_num, queue, out_dir, in_dir, count_dir, vocab_dir, sample=1e-5):
    while True:
        try:
            year = queue.get(block=False)
        except Empty:
            break
        print proc_num, "Getting counts and matrix year", year
        embed = Explicit.load(in_dir + str(year) + ".bin", normalize=False)
        freq = CachedFreqDist(ioutils.load_pickle(count_dir + str(year) + "-counts.pkl"))
        use_words = ioutils.load_word_list(vocab_dir + str(year) + ".vocab")
        embed = embed.get_subembed(use_words, restrict_context=True)
        sample_corr = min(SAMPLE_MAX / freq.N(), 1.0)
        print "Sample correction..", sample_corr
        embed.m = embed.m * sample_corr
        mat = embed.m.tocoo()
        print proc_num, "Outputing pairs for year", year
        with open(out_dir + str(year) + ".tmp.txt", "w") as fp:
            for i in xrange(len(mat.data)):
                if i % 10000 == 0:
                    print "Done ", i, "of", len(mat.data)
                word = embed.iw[mat.row[i]]
                context = embed.ic[mat.col[i]]
                if sample != 0:
                    prop_keep = min(np.sqrt(sample / freq.freq(word)), 1.0)
                    prop_keep *= min(np.sqrt(sample / freq.freq(context)), 1.0)
                else:
                    prop_keep = 1.0
                word = word.encode("utf-8")
                context = context.encode("utf-8")
                line = word + " " + context + "\n"
                for j in xrange(int(np.ceil(mat.data[i] * prop_keep))):
                    fp.write(line)
        print "shuf " + out_dir + str(year) + ".tmp.txt" " > " + out_dir + str(year) + ".txt"
        os.system("shuf " + out_dir + str(year) + ".tmp.txt" + " > " + out_dir + str(year) + ".txt")
        os.remove(out_dir + str(year) + ".tmp.txt")
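
The prop_keep factor implements word2vec-style frequency subsampling: each side of a pair is kept with probability min(sqrt(sample / freq), 1), so very frequent words are aggressively thinned while rare ones are kept in full. Worked through with toy frequencies:

import numpy as np

sample = 1e-5
freq_word, freq_context = 1e-3, 2e-4  # relative corpus frequencies
prop_keep = min(np.sqrt(sample / freq_word), 1.0)      # sqrt(0.01)  = 0.1
prop_keep *= min(np.sqrt(sample / freq_context), 1.0)  # sqrt(0.05) ~= 0.224
# prop_keep ~= 0.022: only about 2% of this frequent pair's occurrences survive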
Example #14
def worker(proc_num, queue, out_dir, in_dir):
    while True:
        try:
            decade = queue.get(block=False)
        except Empty:
            break

        print "Processing decade", decade
        counts = collections.defaultdict(int)       
        for year in range(10):
            embed = Explicit.load(in_dir + str(decade + year) + ".bin", normalize=False)
            if year == 0:
                merged_index = embed.wi
            year_list = load_pickle(in_dir + str(decade + year) + "-list.pkl")
            mat = embed.m.tocoo()
            for i in xrange(len(mat.data)):
                if mat.data[i] == 0:
                    continue
                new_row = get_index(merged_index, year_list, mat.row[i])
                new_col = get_index(merged_index, year_list, mat.col[i])
                counts[(new_row, new_col)] += mat.data[i]
            print "Done year ", decade + year
        export_mat_from_dict(counts, decade, out_dir)
        write_pickle(merged_index, out_dir + str(decade) + "-index.pkl")
        write_pickle(list(merged_index), out_dir + str(decade) + "-list.pkl")
Example #15
def worker(proc_num, queue, dir, count_dir, min_count):
    while True:
        if queue.empty():
            break
        year = queue.get()
        print "Loading data..", year
#        time.sleep(120 * random.random())
        freqs = load_pickle(count_dir + str(year) + "-counts.pkl")
        iw = []
        with open(dir + str(year) + "-w.txt") as fp:
            info = fp.readline().split()
            vocab_size = int(info[0])
            dim = int(info[1])
            w_mat = np.zeros((vocab_size, dim))
            for i, line in enumerate(fp):
                line = line.strip().split()
                iw.append(line[0].decode("utf-8"))
                if freqs[iw[-1]] >= 500:
                    w_mat[i,:] = np.array(map(float, line[1:]))
        c_mat = np.zeros((vocab_size, dim))
        with open(dir + str(year) + "-c.txt") as fp:
            fp.readline()
            for i, line in enumerate(fp):
                line = line.strip().split()
                if freqs[line[0]] >= min_count:
                    c_mat[i,:] = np.array(map(float, line[1:]))
        np.save(dir + str(year) + "-w.npy", w_mat)
        np.save(dir + str(year) + "-c.npy", c_mat)
        write_pickle(iw, dir + str(year) + "-vocab.pkl")
Example #16
def worker(proc_num,
           queue,
           out_dir,
           in_dir,
           use_words,
           count_dir,
           num_sam,
           sample=1e-5):
    while True:
        try:
            year = queue.get(block=False)
        except Empty:
            break
        print proc_num, "Getting counts and matrix year", year
        embed = Explicit.load(
            in_dir + str(year) + ".bin",
            normalize=False)  # Loads embedding and its count data
        embed = embed.get_subembed(
            use_words,
            restrict_context=True)  # Restricts the vocabulary to given words
        counts = ioutils.load_pickle(count_dir + "/" + str(year) +
                                     "-counts.pkl")
        all_count = sum(counts.values())
        mat = embed.m.tocoo()
        print proc_num, "Outputing pairs for year", year
        with open(out_dir + str(year) + ".tmp.txt", "w") as fp:
            for i in xrange(len(mat.data)):  # Iterates through the observed word pairs
                if i % 10000 == 0:
                    print "Done ", i, "of", len(mat.data)
                word = embed.iw[mat.row[i]]
                context = embed.ic[mat.col[i]]
                if sample != 0:
                    prop_keep = min(np.sqrt(sample / counts[word] * all_count),
                                    1.0)
                    prop_keep *= min(
                        np.sqrt(sample / counts[context] * all_count), 1.0)
                else:
                    prop_keep = 1.0
                word = word.encode("utf-8")
                context = context.encode("utf-8")
                line = word + " " + context + "\n"
                for j in xrange(
                        int(mat.data[i] * prop_keep)
                ):  # Writes the word pair as many times as needed
                    fp.write(line)
        mat = mat.tocsr()
        print proc_num, "Outputing vocab for year", year
        with open(out_dir + str(year) + ".vocab", "w") as fp:
            for word in use_words:
                print >> fp, word.encode("utf-8"), int(
                    mat[embed.wi[word], :].sum())
        print "shuf " + out_dir + str(year) + ".tmp.txt" " > " + out_dir + str(
            year) + ".txt"
        os.system(
            "shuf " + out_dir + str(year) + ".tmp.txt -r -n " + str(num_sam) +
            " > " + out_dir + str(year) + ".txt"
        )  # Randomly samples num_sam pairs (with replacement) from the shuffled file
        os.remove(out_dir + str(year) + ".tmp.txt")
Example #17
def make_word_list(type):
    process_word = lambda word : word if type != "lemma_pos" else word.split("_")[0]
    freqs = load_pickle(FREQS.format(type=type))
    word_lists = {}
    nstop_lists = {}
    nproper_lists = {}
    nstop_nproper_lists = {}
    print "Processing type: ", type
    proper_nouns = load_pickle(PROPER_NOUNS)
    word_lists = [word for word in sorted(freqs, key = lambda val : -1*freqs[val]) if word != "" and word.isalnum()]
    nstop_lists = [word for word in sorted(freqs, key = lambda val : -1*freqs[val]) if not process_word(word) in STOPWORDS if word != "" and word.isalnum()]
    nproper_lists = [word for word in sorted(freqs, key = lambda val : -1*freqs[val]) if not process_word(word) in proper_nouns if word != "" and word.isalnum()]
    nstop_nproper_lists = [word for word in sorted(freqs, key = lambda val : -1*freqs[val]) if not process_word(word) in proper_nouns and not process_word(word) in STOPWORDS if word != "" and word.isalnum()]
    write_pickle(word_lists, OUT.format(type=type, cond="all"))
    write_pickle(nstop_lists, OUT.format(type=type, cond="nstop"))
    write_pickle(nproper_lists, OUT.format(type=type, cond="nproper"))
    write_pickle(nstop_nproper_lists, OUT.format(type=type, cond="nstop_nproper"))
Example #18
def merge(out_pref, tmp_out_pref, years, word_list):
    vol_yearstats = {}
    disp_yearstats = {}
    for word in word_list:
        vol_yearstats[word] = {}
        disp_yearstats[word] = {}
    for year in years:
        vol_yearstat = ioutils.load_pickle(tmp_out_pref + str(year) +
                                           "-jvols.pkl")
        disp_yearstat = ioutils.load_pickle(tmp_out_pref + str(year) +
                                            "-jdisps.pkl")
        for word in word_list:
            vol_yearstats[word][year] = vol_yearstat[word]
            disp_yearstats[word][year] = disp_yearstat[word]
        os.remove(tmp_out_pref + str(year) + "-jvols.pkl")
        os.remove(tmp_out_pref + str(year) + "-jdisps.pkl")
    ioutils.write_pickle(vol_yearstats, out_pref + "-jvols.pkl")
    ioutils.write_pickle(disp_yearstats, out_pref + "-jdisps.pkl")
Example #19
def worker(proc_num, queue, dir, count_dir, min_count):
    while True:
        if queue.empty():
            break
        year = queue.get()
        print "Loading data..", year
        #        time.sleep(120 * random.random())
        freqs = load_pickle(count_dir + str(year) + "-counts.pkl")
        text2numpy(dir, freqs, year)
Example #20
def get_extra_vectors(year, train_dir, print_dir, queries_list):
    extra_vectors = []
    vocab = ioutils.load_pickle(train_dir + VOCAB_FILE.format(year=year))
    print_vectors = load_file_lines(print_dir + TRAINED_VEC_FILE.format(year=year))

    for i, w in enumerate(queries_list):
        if w.decode("utf-8") not in vocab:
            extra_vectors.append(print_vectors[i + 1])
    return extra_vectors
Example #21
def merge(years, out_pref, out_dir):
    word_freqs = collections.defaultdict(dict)
    word_lists = {}
    word_set = set([])
    for year in years:
        word_lists[year] = ioutils.load_pickle(out_dir + str(year) + "tmp.pkl")
        word_set = word_set.union(set(word_lists[year]))
        os.remove(out_dir + str(year) + "tmp.pkl")
    for year in years:
        year_freqs = ioutils.load_pickle(out_dir + str(year) + "freqstmp.pkl")
        for word in word_set:
            if word not in year_freqs:
                word_freqs[word][year] = float('nan')
            else:
                word_freqs[word][year] = year_freqs[word]
        os.remove(out_dir + str(year) + "freqstmp.pkl")

    ioutils.write_pickle(word_freqs, out_pref + "-freqs.pkl")
    ioutils.write_pickle(word_lists, out_pref + ".pkl")
Example #22
def merge(word_list, years, in_dir, out_file):
    yearstats = {}
    for word in word_list:
        yearstats[word] = {}
    for year in years:
        yearstat = ioutils.load_pickle(in_dir + str(year) + "-freqstmp.pkl")
        for word in yearstat.keys():
            yearstats[word][year] = yearstat[word]
        os.remove(in_dir + str(year) + "-freqstmp.pkl")
    ioutils.write_pickle(yearstats, out_file)
Example #23
def merge(years, out_pref, out_dir):
    word_freqs = collections.defaultdict(dict)
    word_lists = {}
    word_set = set([])
    for year in years:
        word_lists[year] = ioutils.load_pickle(out_dir + str(year) + "tmp.pkl")
        word_set = word_set.union(set(word_lists[year]))
        os.remove(out_dir + str(year) + "tmp.pkl")
    for year in years:
        year_freqs = ioutils.load_pickle(out_dir + str(year) + "freqstmp.pkl")
        for word in word_set:
            if word not in year_freqs:
                word_freqs[word][year] = float('nan')
            else:
                word_freqs[word][year] = year_freqs[word]
        os.remove(out_dir + str(year) + "freqstmp.pkl")

    ioutils.write_pickle(word_freqs, out_pref + "-freqs.pkl")
    ioutils.write_pickle(word_lists, out_pref + ".pkl")
Example #24
def worker(proc_num, queue, out_dir, input_dir, out_suffix):
    while True:
        if queue.empty():
            break
        year = queue.get()

        print proc_num, "Cleaning vocab of year", year
        vocab_list = ioutils.load_pickle(input_dir + VOCAB_FILE.format(year=year))
        cleaned_vocab_list = remove_non_alph(vocab_list)
        ioutils.write_list(out_dir + str(year) + out_suffix, cleaned_vocab_list)
Example #25
def numpy2text(vec_path, year, extension):
    vocab_list = load_pickle(vec_path + str(year) + "-vocab.pkl")
    w_mat = np.load(vec_path + str(year) + "-w.npy")
    vocab_size = len(vocab_list)
    dim = len(w_mat[0])
    ioutils.write_list(vec_path + str(year) + ".vocab", vocab_list)
    with open(vec_path + str(year) + "-w" + extension, "w") as fp:
        print >> fp, str(vocab_size), str(dim)
        for i, w in enumerate(vocab_list):
            print >> fp, w.encode("utf-8"), " ".join(map(str, w_mat[i, :]))
Example #26
def main(proc_num, lock, out_dir, in_dir, years):
    print proc_num, "Start loop"
    years.reverse()
    while True:
        lock.acquire()
        work_left = False
        # Iterates through the years (in a roundabout way)
        for year in years:
            dirs = set(os.listdir(out_dir))
            if str(year) + ".bin" in dirs:  # Skips years already claimed in the output directory
                continue

            work_left = True
            print proc_num, "year", year
            fname = out_dir + str(year) + ".bin"
            with open(fname, "w") as fp:
                fp.write("")
            fp.close()
            break
        lock.release()
        if not work_left:
            print proc_num, "Finished"
            break

        print proc_num, "Merging counts for year", year  # Merging starts here
        full_counts = collections.defaultdict(float)
        merged_index = collections.OrderedDict()
        for chunk_num in os.listdir(
                in_dir
        ):  # Iterates through the alphabetically separated co-occurrence data
            chunk_name = in_dir + str(chunk_num) + "/" + str(year) + ".bin"
            if not os.path.isfile(chunk_name):
                continue
            chunk_counts = sparse_io.retrieve_mat_as_coo(chunk_name)
            chunk_index = ioutils.load_pickle(in_dir + str(chunk_num) +
                                              "/index.pkl")
            chunk_index = list(chunk_index)
            for pair, count in chunk_counts.todok().iteritems(
            ):  # Iterates through the chunk matrices and adds each word pair's count to the merged matrix
                i_word = chunk_index[pair[0]]
                c_word = chunk_index[pair[1]]
                new_pair = (indexing.word_to_cached_id(i_word, merged_index),
                            indexing.word_to_cached_id(c_word, merged_index))
                full_counts[new_pair] += count

        print proc_num, "Writing counts for year", year
        sparse_io.export_mat_from_dict(
            full_counts, out_dir + str(year) +
            ".bin")  # Saves the yearly merged co-occurrence matrices.
        ioutils.write_pickle(merged_index, out_dir + str(year) +
                             "-index.pkl")  # ... and the merged index
        ioutils.write_pickle(list(merged_index),
                             out_dir + str(year) + "-list.pkl")
Example #27
def load_vocabulary(mat, path):
    if os.path.isfile(path.split(".")[0] + "-index.pkl"):
        path = path.split(".")[0] + "-index.pkl"
    else:
        print "Could not find local index. Attempting to load directory wide index..."
        path = "/".join(path.split("/")[:-1]) + "/index.pkl"
    index = util.load_pickle(path)
    vocab = sorted(index, key=lambda word: index[word])
    iw = vocab[:mat.shape[0]]
    ic = vocab[:mat.shape[1]]
    return iw, ic
Example #28
def make_word_list(type):
    process_word = lambda word : word if type != "lemma_pos" else word.split("_")[0]
    freqs = load_pickle(FREQS.format(type=type))
    word_lists = {}
    nstop_lists = {}
    nproper_lists = {}
    nstop_nproper_lists = {}
    print "Processing type: ", type
    for year, year_freqs in freqs.iteritems():
        proper_nouns = load_pickle(PROPER_NOUNS.format(year=year))
        word_lists[year] = [word for word in sorted(year_freqs, key = lambda val : -1*year_freqs[val]) if word != "" and word.isalnum()]
        nstop_lists[year] = [word for word in sorted(year_freqs, key = lambda val : -1*year_freqs[val]) if not process_word(word) in STOPWORDS and not word == "" and word.isalnum()]
        nproper_lists[year] = [word for word in sorted(year_freqs, key = lambda val : -1*year_freqs[val]) if not process_word(word) in proper_nouns and not word == "" and word.isalnum()]
        nstop_nproper_lists[year] = [word for word in sorted(year_freqs, key = lambda val : -1*year_freqs[val]) if not process_word(word) in proper_nouns 
                and not process_word(word) in STOPWORDS and not word == "" and word.isalnum()]
        print "Finished year: ", year
    write_pickle(word_lists, OUT.format(type=type, cond="all"))
    write_pickle(nstop_lists, OUT.format(type=type, cond="nstop"))
    write_pickle(nproper_lists, OUT.format(type=type, cond="nproper"))
    write_pickle(nstop_nproper_lists, OUT.format(type=type, cond="nstop_nproper"))
Example #29
def load_vocabulary(mat, path):
    if os.path.isfile(path.split(".")[0] + "-index.pkl"):
        path = path.split(".")[0] + "-index.pkl"
    else:
        print "Could not find local index. Attempting to load directory wide index..."
        path = "/".join(path.split("/")[:-1]) + "/index.pkl"
    index = util.load_pickle(path)
    vocab = sorted(index, key = lambda word : index[word])
    iw = vocab[:mat.shape[0]]
    ic = vocab[:mat.shape[1]]
    return iw, ic
Example #30
def main(years, out_dir, in_dir, count_dir, min_count, num_words):
    print "Making common vocab"
    words = ioutils.load_pickle(in_dir + str(years[0]) + "-list.pkl")
    for year in years:
        counts_year = ioutils.load_pickle(count_dir + str(year) +
                                          "-counts.pkl")
        use_words = sorted(counts_year.keys(),
                           key=lambda word: counts_year[word])[:num_words]
        use_words = [
            word for word in use_words if counts_year[word] > min_count
        ]
        i = 0
        while i < len(words):
            if words[i] not in use_words:
                words.pop(i)
                i -= 1
            i += 1
        print year, "vocab, done"

    ioutils.write_pickle(list(words), out_dir + "common_vocab.pkl")
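
The pop-based pruning loop above is quadratic in the vocabulary size, because each membership test scans the use_words list. An equivalent, order-preserving sketch using a set:

use_set = set(use_words)
words = [word for word in words if word in use_set]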
Example #31
def merge(years, out_pref, out_dir):
    word_freqs = collections.defaultdict(
        dict)  # dict mapping year to word-relative_frequency pairs
    word_lists = {}  # dict mapping year to list of used words
    word_set = set([])  # set of words ever used
    for year in years:  # Collects word_lists
        word_lists[year] = ioutils.load_pickle(out_dir + str(year) + "tmp.pkl")
        word_set = word_set.union(set(word_lists[year]))
        os.remove(out_dir + str(year) + "tmp.pkl")
    for year in years:  # Collects relative frequencies
        year_freqs = ioutils.load_pickle(out_dir + str(year) + "freqstmp.pkl")
        for word in word_set:
            if word not in year_freqs:
                word_freqs[word][year] = float('nan')
            else:
                word_freqs[word][year] = year_freqs[word]
        os.remove(out_dir + str(year) + "freqstmp.pkl")

    ioutils.write_pickle(word_freqs,
                         out_pref + "-freqs.pkl")  # Saves relative frequencies
    ioutils.write_pickle(word_lists, out_pref + ".pkl")  # Saves word_lists
Example #32
def load_vocabulary(mat, path):
    """Loads index from path + "-index.pkl" sorts words by their ids and return the first mat.shape[0] elements and the first mat.shape[1] elements in two different lists."""
    if os.path.isfile(path.split(".")[0] + "-index.pkl"):
        path = path.split(".")[0] + "-index.pkl"
    else:
        print "Could not find local index. Attempting to load directory wide index..."
        path = "/".join(path.split("/")[:-1]) + "/merged_index.pkl"
    index = util.load_pickle(path)
    vocab = sorted(index, key=lambda word: index[word])
    iw = vocab[:mat.shape[0]]
    ic = vocab[:mat.shape[1]]
    return iw, ic
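
A toy illustration of what load_vocabulary returns, with an in-memory index standing in for the pickle (names are illustrative):

import numpy as np

index = {"the": 0, "cat": 1, "sat": 2}  # word -> id, as stored in *-index.pkl
mat = np.zeros((2, 3))                  # stand-in co-occurrence matrix
vocab = sorted(index, key=lambda word: index[word])
iw = vocab[:mat.shape[0]]               # row words:    ["the", "cat"]
ic = vocab[:mat.shape[1]]               # column words: ["the", "cat", "sat"]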
Example #33
def run(out_dir, in_dir):
    index = collections.OrderedDict()
    for year in YEARS:
        print "Merging year", year
        year_list = ioutils.load_pickle(in_dir + str(year) + "-list.pkl")
        for word in year_list:
            indexing.word_to_cached_id(word, index)

    ioutils.write_pickle(index, out_dir + "merged_index.pkl") 
    ioutils.write_pickle(list(index), out_dir + "merged_list.pkl") 
Example #34
def run(out_dir, in_dir):
    index = collections.OrderedDict()
    for year in YEARS:
        print "Merging year", year
        year_list = ioutils.load_pickle(in_dir + str(year) + "-list.pkl")
        for word in year_list:
            indexing.word_to_cached_id(word, index)

    ioutils.write_pickle(index, out_dir + "merged_index.pkl")
    ioutils.write_pickle(list(index), out_dir + "merged_list.pkl")
Example #35
def worker(proc_num, queue, dir, count_dir, min_count, checkpoints):
    while True:
        if queue.empty():
            break
        year = queue.get()
        freqs = load_pickle(count_dir + str(year) + "-counts.pkl")
        for n in checkpoints:
            out_dir = dir + '{:03d}'.format(n) + "/"
            mkdir(out_dir)
            subprocess.call(['mv', dir + str(year) + '-w.' + '{:03d}'.format(n), out_dir + str(year) + '-w'])
            print "Loading data..", year, "iterations", n
            text2numpy(out_dir, freqs, year)
Example #36
def worker(proc_num, queue):
    while True:
        try:
            decade = str(queue.get(block=False))
        except Empty:
            break
        print "Proc:", proc_num, "Decade:", decade
        proper_nouns = set([])
        pos_tags = load_pickle(DATA + str(decade) + "-pos-maj.pkl")
        for word, tag in pos_tags.iteritems():
            if tag == "np":
                proper_nouns.add(word)
        write_pickle(proper_nouns, OUT + str(decade) + "-proper_nouns.pkl")
Example #37
def worker(proc_num, queue, out_dir, in_dir):
    while True:  # Iterates through the decades
        try:
            decade = queue.get(block=False)
        except Empty:
            break

        print "Processing decade", decade
        counts = collections.defaultdict(
            int)  # this dict represents the co-occurrence matrix
        for year in range(10):  # Iterates through the years in the decade
            embed = Explicit.load(
                in_dir + str(decade + year) + ".bin", normalize=False
            )  # Loads the embedding for the individual year (each year carries its own index)
            if year == 0:
                merged_index = embed.wi
            if os.path.isfile(in_dir + str(decade + year) + "-list.pkl"):
                year_list = load_pickle(in_dir + str(decade + year) +
                                        "-list.pkl")
            else:
                year_list = load_pickle(in_dir + "merged_list.pkl")
            mat = embed.m.tocoo()
            for i in xrange(
                    len(mat.data)
            ):  # Iterates through the word-context pairs and counts the co-occurrence
                if mat.data[i] == 0:
                    continue
                new_row = word_to_cached_id(year_list[mat.row[i]],
                                            merged_index)
                new_col = word_to_cached_id(year_list[mat.col[i]],
                                            merged_index)
                counts[(new_row, new_col)] += mat.data[
                    i]  # Adds the count to the decade totals
            print "Done year ", decade + year
        export_mat_from_dict(counts, out_dir + str(decade) +
                             ".bin")  # Saves the decade-level co-occurrence matrix
        write_pickle(merged_index, out_dir + str(decade) +
                     "-index.pkl")  # Saves the decade-level index
        write_pickle(list(merged_index), out_dir + str(decade) + "-list.pkl")
Example #38
def worker(proc_num, queue):
    while True:
        try:
            decade = str(queue.get(block=False))
        except Empty:
            break
        print("Proc:", proc_num, "Decade:", decade)
        proper_nouns = set([])
        pos_tags = load_pickle(DATA + str(decade) + "-pos-maj.pkl")
        for word, tag in pos_tags.items():
            if tag == "np":
                proper_nouns.add(word)
        write_pickle(proper_nouns, OUT + str(decade) + "-proper_nouns.pkl")
Example #39
def load_shared_vocabulary(mat, mat_file):
    vocab_file = ""
    i = 0
    path = mat_file.split("/")
    while True:
        if "nppmi" in path[i]:
            break
        vocab_file += "/" + path[i]
        i += 1
    vocab_file += "/5grams/merged_list.pkl"
    shared_vocab = ioutils.load_pickle(vocab_file)
    iw = shared_vocab[:mat.shape[0]]
    ic = shared_vocab[:mat.shape[1]]
    return iw, ic
Example #40
def load_shared_vocabulary(mat, mat_file):
    vocab_file = ""
    i = 0
    path = mat_file.split("/")
    while True:
        if "nppmi" in path[i]:
            break
        vocab_file += "/" + path[i]
        i += 1
    vocab_file += "/5grams/merged_list.pkl"
    shared_vocab = ioutils.load_pickle(vocab_file)
    iw = shared_vocab[:mat.shape[0]]
    ic = shared_vocab[:mat.shape[1]]
    return iw, ic
Example #41
def merge(out_pref, years, word_list):
    vol_yearstats = {}
    disp_yearstats = {}
    for word in word_list:
        vol_yearstats[word] = {}
        disp_yearstats[word] = {}
    for year in years:
        vol_yearstat = ioutils.load_pickle(out_pref + str(year) + "-vols.pkl")
        disp_yearstat = ioutils.load_pickle(out_pref + str(year) + "-disps.pkl")
        for word in word_list:
            if word not in vol_yearstat:
                vol = float('nan')
            else:
                vol = vol_yearstat[word]
            if word not in disp_yearstat:
                disp = float('nan')
            else:
                disp = disp_yearstat[word]
            vol_yearstats[word][year] = vol
            disp_yearstats[word][year] = disp
        os.remove(out_pref + str(year) + "-vols.pkl")
        os.remove(out_pref + str(year) + "-disps.pkl")
    ioutils.write_pickle(vol_yearstats, out_pref + "-vols.pkl")
    ioutils.write_pickle(disp_yearstats, out_pref + "-disps.pkl")
Example #42
def main(proc_num, lock, out_dir, in_dir):
    years = YEARS
    random.shuffle(years)
    print proc_num, "Start loop"
    while True:
        lock.acquire()
        work_left = False
        for year in years:
            dirs = set(os.listdir(out_dir))
            if str(year) + "-a.pkl" in dirs:
                continue
            
            work_left = True
            print proc_num, "year", year
            fname = out_dir + str(year) + "-a.pkl"
            with open(fname, "w") as fp:
                fp.write("")
            fp.close()
            break
        lock.release()
        if not work_left:
            print proc_num, "Finished"
            break

        print proc_num, "Merging grams for year", year
        year_grams = {}
        for letter in string.ascii_lowercase:
            year_grams[letter] = collections.defaultdict(list)

        for chunk_name in os.listdir(in_dir): 
            print "Processing chunk", chunk_name
            chunk_name = in_dir + str(chunk_name) + "/" + str(year) + ".pkl"
            if not os.path.isfile(chunk_name):
                continue
            chunk_counts = ioutils.load_pickle(chunk_name)
            for word, info_list in chunk_counts.iteritems():
                if word[0] not in year_grams:
                    continue
                for info in info_list:
                    gram = info[0].split("\t")[0]
                    count = info[1]
                    year_grams[word[0]][word].append((gram, count))
            
        print proc_num, "Writing counts for year", year
        for letter, letter_grams in year_grams.iteritems():
            for word in letter_grams:
                letter_grams[word] = sorted(letter_grams[word], key = lambda info : info[1], reverse=True)
            ioutils.write_pickle(letter_grams, out_dir + str(year) + "-" + letter + ".pkl")
Example #43
def merge(out_pref, years, full_word_list):
    merged_word_stats = {}
    for stat in STATS:
        merged_word_stats[stat] = {}
        for word in full_word_list:
            merged_word_stats[stat][word] = {}
    for year in years:
        year_stats = ioutils.load_pickle(out_pref + str(year) + "-tmp.pkl")
        for stat, stat_vals in year_stats.iteritems():
            for word in full_word_list:
                if not word in stat_vals:
                    merged_word_stats[stat][word][year] = NAN
                else:
                    merged_word_stats[stat][word][year] = stat_vals[word]
        os.remove(out_pref + str(year) + "-tmp.pkl")
    ioutils.write_pickle(merged_word_stats, out_pref + ".pkl")
Example #44
def merge_bootstrap(out_pref):
    dir = "/".join(out_pref.split("/")[0:-1])
    bootfiles = os.listdir(dir)
    word_stat_lists = {}
    first_file = True
    file_num = 0
    for file in bootfiles:
        bootstats = ioutils.load_pickle(dir + "/" + file)
        print "Processing file", file
        for stat, stat_vals in bootstats.iteritems():
            if first_file:
                word_stat_lists[stat] = {}
            for word, year_vals in stat_vals.iteritems():
                if first_file:
                    word_stat_lists[stat][word] = {}
                for year, val in year_vals.iteritems():
                    if type(val) == float and np.isnan(val):
                        word_stat_lists[stat][word][year] = float('nan')
                    else:
                        if first_file:
                            word_stat_lists[stat][word][year] = np.empty((val.shape[0] * len(bootfiles)))
                        word_stat_lists[stat][word][year][file_num * val.shape[0]:(file_num + 1) * val.shape[0]] = val[:]
        first_file = False
        file_num += 1
    print "Making means and stds"
    word_stat_means = {}
    word_stat_stds = {}
    for stat, stat_vals in word_stat_lists.iteritems():
        word_stat_means[stat] = {}
        word_stat_stds[stat] = {}
        for word, year_vals in stat_vals.iteritems():
            word_stat_means[stat][word] = {}
            word_stat_stds[stat][word] = {}
            for year, val in year_vals.iteritems():
                if type(val) == float and np.isnan(val):
                    word_stat_means[stat][word][year] = float('nan')
                    word_stat_stds[stat][word][year] = float('nan')
                else:
                    word_stat_means[stat][word][year] = val.mean()
                    word_stat_stds[stat][word][year] = val.std()
    print "Writing data"
    for stat, mean_vals in word_stat_means.iteritems():
        ioutils.write_pickle(mean_vals, out_pref + "-" + stat + "-mean.pkl")
    for stat, std_vals in word_stat_stds.iteritems():
        ioutils.write_pickle(std_vals, out_pref + "-" + stat + "-std.pkl")
Example #45
def worker(proc_num, queue, out_dir, in_dir, count_dir, valid_words, num_words, min_count, sample=1e-5):
    while True:
        try:
            year = queue.get(block=False)
        except Empty:
            break
        print proc_num, "Getting counts and matrix year", year
        embed = Explicit.load(in_dir + str(year) + ".bin", normalize=False)
        year_words = valid_words[year][:num_words]
        count_words = set(ioutils.words_above_count(count_dir, year, min_count))
        freq = CachedFreqDist(ioutils.load_pickle(count_dir + str(year) + "-counts.pkl"))
        use_words = list(count_words.intersection(year_words)) 
        embed = embed.get_subembed(use_words, restrict_context=True)
        sample_corr = min(SAMPLE_MAX / freq.N(), 1.0)
        print "Sample correction..", sample_corr
        embed.m = embed.m * sample_corr
        mat = embed.m.tocoo()
        print proc_num, "Outputing pairs for year", year
        with open(out_dir + str(year) + ".tmp.txt", "w") as fp:
            for i in xrange(len(mat.data)): 
                if i % 10000 == 0:
                    print "Done ", i, "of", len(mat.data)
                word = embed.iw[mat.row[i]]
                context = embed.ic[mat.col[i]]
                if sample != 0:
                    prop_keep = min(np.sqrt(sample / freq.freq(word)), 1.0) 
                    prop_keep *= min(np.sqrt(sample / freq.freq(context)), 1.0) 
                else:
                    prop_keep = 1.0
                word = word.encode("utf-8")
                context = context.encode("utf-8")
                line = word + " " + context + "\n"
                for j in xrange(int(mat.data[i] * prop_keep)):
                    fp.write(line)
        mat = mat.tocsr()
        print proc_num, "Outputing vocab for year", year
        with open(out_dir + str(year) + ".vocab", "w") as fp:
            for word in year_words:
                if not word in count_words:
                    print >>fp, word.encode("utf-8"), 1
                else:
                    print >>fp, word.encode("utf-8"), int(mat[embed.wi[word], :].sum())
        print "shuf " + out_dir + str(year) + ".tmp.txt" " > " + out_dir + str(year) + ".txt" 
        os.system("shuf " + out_dir + str(year) + ".tmp.txt" + " > " + out_dir + str(year) + ".txt")
        os.remove(out_dir + str(year) + ".tmp.txt")
Example #46
    def __init__(self, path, normalize=True, eig=0.0, **kwargs):
        ut = np.load(path + '-u.npy')
        s = np.load(path + '-s.npy')
        vocabfile = path + '-vocab.pkl'
        self.iw = load_pickle(vocabfile)
        self.wi = {w:i for i, w in enumerate(self.iw)}
 
        if eig == 0.0:
            self.m = ut
        elif eig == 1.0:
            self.m = s * ut
        else:
            self.m = np.power(s, eig) * ut

        self.dim = self.m.shape[1]

        if normalize:
            self.normalize()
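
The eig parameter controls how much of the singular-value spectrum is folded into the word vectors: eig=0 keeps U alone, eig=1 applies the full S weighting, and intermediate values interpolate. The scaling itself is plain NumPy broadcasting:

import numpy as np

ut = np.array([[0.1, 0.2],
               [0.3, 0.4]])  # rows are words, columns are SVD dimensions
s = np.array([4.0, 1.0])     # singular values
eig = 0.5
m = np.power(s, eig) * ut    # scales column j by s[j] ** eig
# m == [[0.2, 0.2], [0.6, 0.4]]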
Example #47
def get_sorted_words(years, out_dir, in_dir):
    word_freqs = collections.defaultdict(float)
    for year in years:
        print "Processing year", year
        year_freqs = ioutils.load_pickle(in_dir + str(year) + "-freqs.pkl")
        sum = 0.0
        for _, counts in year_freqs.iteritems():
            sum += counts[0]  
        for word, counts in year_freqs.iteritems():
            if not word.isalpha():
                continue
            word_freqs[word] += float(counts[0]) / sum 
    print "Writing data"
    sorted_list = sorted(word_freqs.keys(), key = lambda key : word_freqs[key], reverse=True)
    out_pref = out_dir + "sortedwords-" + str(years[0]) + "-" + str(years[-1]) 
    out_fp = open(out_pref + ".txt", "w")
    for word in sorted_list:
        out_fp.write(word.encode('utf-8') + " " + str(word_freqs[word] / float(len(years))) + "\n")
    ioutils.write_pickle(sorted_list, out_pref + ".pkl")
Example #48
def worker(proc_num, queue, in_dir):
    print proc_num, "Start loop"
    while True:
        try: 
            year = queue.get(block=False)
        except Empty:
            print proc_num, "Finished"
            break

        print proc_num, "Making second orders for year", year
        old_mat = matstore.retrieve_mat_as_coo(in_dir + str(year) + ".bin")
        row_d, col_d, data_d, keep_rows = make_secondorder_mat(old_mat)
        old_index = list(ioutils.load_pickle(in_dir + str(year) + "-index.pkl"))
        new_index = collections.OrderedDict()
        for i in xrange(len(keep_rows)):
            new_index[old_index[keep_rows[i]]] = i
        ioutils.write_pickle(new_index, in_dir + "/second/" + str(year) + "-index.pkl")
        print proc_num, "Writing counts for year", year
        matstore.export_mat_eff(row_d, col_d, data_d, year, in_dir + "/second/")
Example #49
def main(proc_num, lock, in_dir, years, word_list):
    years = range(years[0], years[-1] + 1)
    random.shuffle(years)
    print proc_num, "Start loop"
    while True:
        lock.acquire()
        work_left = False
        for year in years:
            dirs = set(os.listdir(in_dir))
            if str(year) + "-freqstmp.pkl" in dirs:
                continue
            work_left = True
            print proc_num, "year", year
            fname = in_dir + str(year) + "-freqstmp.pkl"
            with open(fname, "w") as fp:
                fp.write("")
            fp.close()
            break
        lock.release()
        if not work_left:
            print proc_num, "Finished"
            break
        
        year_freqs = ioutils.load_pickle(in_dir + "/" + str(year) + "-freqs.pkl")
        word_stats = {}
        print proc_num, "Getting stats for year", year
        sum = 0
        for word in word_list:
            if word in year_freqs:
                word_count = year_freqs[word][1]
                sum += word_count
                word_stats[word] = word_count
        for word in word_stats:
            word_stats[word] /= float(sum)

        print proc_num, "Writing stats for year", year
        ioutils.write_pickle(word_stats, in_dir + str(year) + "-freqstmp.pkl")
Example #50
def get_sorted_words(years, out_pref, in_dir, avg_thresh, min_thresh):
    stop_set = set(stopwords.words('english'))
    word_freqs = collections.defaultdict(float)
    word_mins = collections.defaultdict(lambda : 1.0)
    for year in years:
        print "Processing year", year
        year_freqs = ioutils.load_pickle(in_dir + str(year) + "-freqs.pkl")
        sum = 0.0
        for _, counts in year_freqs.iteritems():
            sum += counts[0]  
        for word, counts in year_freqs.iteritems():
            if not word.isalpha() or word in stop_set or len(word) == 1:
                continue
            year_freq = float(counts[0]) / sum 
            word_freqs[word] += year_freq
            word_mins[word] = min(word_mins[word], year_freq)
    print "Writing data"
    sorted_list = sorted(word_freqs.keys(), key = lambda key : word_freqs[key], reverse=True)
    sorted_list = [word for word in sorted_list 
            if (word_freqs[word] / float(len(years)) > avg_thresh and word_mins[word] > min_thresh)]
    out_fp = open(out_pref + ".txt", "w")
    for word in sorted_list:
        out_fp.write(word.encode('utf-8') + " " + str(word_freqs[word] / float(len(years))) + "\n")
    ioutils.write_pickle(sorted_list, out_pref + ".pkl")
Example #51
 parser.add_argument(
     "--num-words",
     type=int,
     help="Number of words (of decreasing average frequency) to include. Must also specifiy word file and index.",
     default=-1,
 )
 parser.add_argument("--start-year", type=int, help="start year (inclusive)", default=START_YEAR)
 parser.add_argument("--end-year", type=int, help="start year (inclusive)", default=END_YEAR)
 parser.add_argument("--thresh", type=float, help="optional threshold", default=None)
 args = parser.parse_args()
 years = range(args.start_year, args.end_year + 1)
 if args.word_file != None:
     if args.index_dir == None:
         print >> sys.stderr, "Must specify index dir with word file!"
         sys.exit()
     word_pickle = ioutils.load_pickle(args.word_file)
     if not args.start_year in word_pickle:
         word_lists = {}
         for year in years:
             word_lists[year] = word_pickle
     else:
         word_lists = word_pickle
     word_infos = {}
     for year, word_list in word_lists.iteritems():
         year_index = ioutils.load_pickle(args.index_dir + "/" + str(year) + "-index.pkl")
         if args.num_words != -1:
             word_list = word_list[: args.num_words]
         word_list, word_indices = get_word_indices(word_list, year_index)
         word_infos[year] = (word_list, word_indices)
     outpref = "/netstats/" + args.word_file.split("/")[-1].split(".")[0]
     if args.num_words != -1:
Example #52
        ioutils.write_pickle(sorted_list, out_dir + str(year) + "tmp.pkl")
        ioutils.write_pickle(word_freqs, out_dir + str(year) + "freqstmp.pkl")

def run_parallel(num_procs, years, out_pref, out_dir, in_dir, index, freq_thresh, lang):
    queue = Queue()
    for year in years:
        queue.put(year)
    procs = [Process(target=main, args=[i, queue, out_pref, out_dir, in_dir, index, freq_thresh, lang]) for i in range(num_procs)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
    merge(years, out_pref, out_dir)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Get yearly sorted by-frequency list of (non-stop) words and dicts with their frequencies")
    parser.add_argument("out_dir", help="output directory")
    parser.add_argument("in_dir", help="directory with 5 grams and index")
    parser.add_argument("num_procs", type=int, help="num procs")
    parser.add_argument("--start-year", type=int, default=1900, help="start year (inclusive)")
    parser.add_argument("--end-year", type=int, default=2000, help="end year (inclusive)")
    parser.add_argument("--freq-thresh", type=int, default=7, help="frequency threshold (neg. power of 10)")
    parser.add_argument("--lang", type=str, default="english", help="language")
    args = parser.parse_args()

    years = range(args.start_year, args.end_year + 1)
    index = ioutils.load_pickle(args.in_dir + "/merged_list.pkl")
    out_pref = args.out_dir + "/freqnonstop_peryear-" + str(years[0]) + "-" + str(years[-1]) + "-" + str(args.freq_thresh)
    freq_thresh = 10.0 ** (-1.0 * float(args.freq_thresh))
    run_parallel(args.num_procs, years, out_pref, args.out_dir + "/", args.in_dir + "/", index, freq_thresh, args.lang)
Example #53
                word_stats[word] = word_count
        for word in word_stats:
            word_stats[word] /= float(sum)

        print proc_num, "Writing stats for year", year
        ioutils.write_pickle(word_stats, in_dir + str(year) + "-freqstmp.pkl")


def run_parallel(num_procs, in_dir, years, word_list, out_file):
    lock = Lock()
    procs = [Process(target=main, args=[i, lock, in_dir, years, word_list]) for i in range(num_procs)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
    print "Merging"
    merge(word_list, years, in_dir, out_file)

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Merges years of raw 5gram data.")
    parser.add_argument("out_file", help="path to network data (also where output goes)")
    parser.add_argument("in_dir", help="path to network data (also where output goes)")
    parser.add_argument("word_file", help="path to sorted word file")
    parser.add_argument("num_procs", type=int, help="number of processes to spawn")
    parser.add_argument("--start-year", type=int, help="start year (inclusive)", default=START_YEAR)
    parser.add_argument("--end-year", type=int, help="end year (inclusive)", default=END_YEAR)
    args = parser.parse_args()
    years = range(args.start_year, args.end_year + 1)
    word_list = ioutils.load_pickle(args.word_file)
    run_parallel(args.num_procs, args.in_dir + "/", years, word_list, args.out_file)       
Example #54
if __name__ == "__main__":
    queue = Queue()
    for decade in range(1810, 2010, 10):
        queue.put(decade)
    procs = [Process(target=worker, args=[i, queue]) for i in range(25)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
    print "Getting full set..."
    proper_nouns = set([])
    pos_counts = {}
    print "Merging pos counts.."
    for decade in range(1810, 2010, 10):
        decade_pos_counts = load_pickle(DATA + str(decade) + "-pos-counts.pkl")
        for word, counts in decade_pos_counts.iteritems(): 
            if word not in pos_counts:
                pos_counts[word] = collections.Counter()
            for pos, count in counts.iteritems():
                pos_counts[word][pos] += count
    write_pickle(pos_counts, DATA + "all-pos-counts.pkl")
    pos_maj = {}
    proper_nouns = set([])
    for word, p_counts in pos_counts.iteritems():
        pos_maj[word] = sorted(p_counts, key = lambda t : -1*p_counts[t])[0]
        if pos_maj[word] == "np":
            proper_nouns.add(word)
    write_pickle(pos_maj, OUT + "all-pos-maj.pkl")
    write_pickle(proper_nouns, OUT + "proper_nouns.pkl")
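
This queue-draining fan-out is the pattern behind nearly every worker() in this collection: fill a Queue with work units, spawn the processes, and let each pop items until the queue raises Empty. A minimal self-contained sketch of the same pattern (the squaring stands in for real per-decade work):

import multiprocessing
try:
    from queue import Empty   # Python 3
except ImportError:
    from Queue import Empty   # Python 2

def worker(proc_num, queue, results):
    while True:
        try:
            item = queue.get(block=False)
        except Empty:
            break
        results.put((item, item * item))

if __name__ == "__main__":
    queue = multiprocessing.Queue()
    results = multiprocessing.Queue()
    for decade in range(1810, 2010, 10):
        queue.put(decade)
    procs = [multiprocessing.Process(target=worker, args=[i, queue, results])
             for i in range(4)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()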
Example #55
from ioutils import load_pickle, write_pickle

DIR = "/dfs/scratch0/COHA/decade_freqs/"
word = {}
lemma = {}
lemma_pos = {}
for year in range(1810, 2010, 10):
    word[year] = load_pickle(DIR + str(year) + "-word.pkl")
    lemma[year] = load_pickle(DIR + str(year) + "-lemma.pkl")
    lemma_pos[year] = load_pickle(DIR + str(year) + "-lemma_pos.pkl")

write_pickle(word, DIR + "word.pkl")
write_pickle(lemma, DIR + "lemma.pkl")
write_pickle(lemma_pos, DIR + "lemma_pos.pkl")
Example #56
def load_vocabulary(mat, path):
    shared_vocab = list(ioutils.load_pickle(path.split(".")[0] + "-index.pkl"))
    iw = shared_vocab[:mat.shape[0]]
    ic = shared_vocab[:mat.shape[1]]
    return iw, ic
Example #57
                        continue
                    word_freqs[word] += 1
                    lemma_freqs[lemma] += 1
                    lemma_pos_freqs[lemma_pos] += 1
        write_pickle(word_freqs, OUT + "decade_freqs/" + decade + "-word.pkl") 
        write_pickle(lemma_freqs, OUT + "decade_freqs/" + decade + "-lemma.pkl") 
        write_pickle(lemma_pos_freqs, OUT + "decade_freqs/" + decade + "-lemma_pos.pkl") 

if __name__ == "__main__":
    queue = Queue()
    for decade in range(1810, 2010, 10):
        queue.put(decade)
    procs = [Process(target=worker, args=[i, queue]) for i in range(25)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
    print "Getting full freqs..."
    word_freqs = Counter()
    lemma_freqs = Counter()
    lemma_pos_freqs = Counter()
    for decade in range(1810, 2010, 10):
        decade = str(decade)
        print decade
        word_freqs += load_pickle(OUT + "decade_freqs/" + decade + "-word.pkl") 
        lemma_freqs += load_pickle(OUT + "decade_freqs/" + decade + "-lemma.pkl") 
        lemma_pos_freqs += load_pickle(OUT + "decade_freqs/" + decade + "-lemma_pos.pkl") 
    write_pickle(word_freqs, OUT + "full_freqs/word.pkl") 
    write_pickle(lemma_freqs, OUT + "full_freqs/lemma.pkl") 
    write_pickle(lemma_pos_freqs, OUT + "full_freqs/lemma_pos.pkl")