Example #1
def main(proc_num, lock, out_pref, tmp_out_pref, in_dir, years, word_list, word_indices, displacement_base, thresh):
    while True:
        lock.acquire()
        work_left = False
        for year in years:
            dirs = set(os.listdir(in_dir + "/volstats/"))
            if tmp_out_pref.split("/")[-1] + str(year) + "-jvols.pkl" in dirs:
                continue
            work_left = True
            print proc_num, "year", year
            fname = tmp_out_pref + str(year) + "-jvols.pkl"
            with open(fname, "w") as fp:
                fp.write("")  # touch an empty placeholder so other procs skip this year
            break
        lock.release()
        if not work_left:
            print proc_num, "Finished"
            break
        
        print proc_num, "Loading matrices..."
        base = matstore.retrieve_mat_as_binary_coo_thresh(in_dir + "/" + str(year - 1) + ".bin", thresh, min_size=MIN_SIZE)
        base = base.tocsr()
        delta = matstore.retrieve_mat_as_binary_coo_thresh(in_dir + "/" + str(year) + ".bin", thresh, min_size=MIN_SIZE)
        delta = delta.tocsr()
        print proc_num, "Getting deltas..."
        year_vols = get_jaccard_deltas(base, delta, word_list, word_indices)
        year_disp = get_jaccard_deltas(displacement_base, delta, word_list, word_indices)
        print proc_num, "Writing results..."
        ioutils.write_pickle(year_vols, tmp_out_pref + str(year) + "-jvols.pkl")
        ioutils.write_pickle(year_disp, tmp_out_pref + str(year) + "-jdisps.pkl")
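A note on the pattern above: Examples #1, #6, #12, #31, #36, and #42 all coordinate worker processes the same way. Under a shared lock, each process scans the output directory for years that are not done yet, claims one by touching an empty placeholder file, then releases the lock and does the real work. A condensed sketch of just the claim step (the function and argument names here are illustrative, not from the source):

import os

def claim_year(lock, years, out_dir, suffix="-jvols.pkl"):
    # Claim the first unprocessed year by touching its output file.
    # Returns the claimed year, or None when every year is done.
    # The empty placeholder makes other processes skip this year;
    # the real result overwrites it later.
    with lock:
        done = set(os.listdir(out_dir))
        for year in years:
            if str(year) + suffix in done:
                continue
            open(os.path.join(out_dir, str(year) + suffix), "w").close()
            return year
    return None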
Example #2
def main(proc_num, queue, out_pref, out_dir, in_dir, index, freq_thresh, lang):
    print proc_num, "Start loop"
    while True:
        try: 
            year = queue.get(block=False)
        except Empty:
            print proc_num, "Finished"
            break
        stop_set = set(stopwords.words(lang))
        word_freqs = {}
        print "Loading mat for year", year
        year_mat = matstore.retrieve_mat_as_coo(in_dir + str(year) + ".bin")
        year_mat = year_mat.tocsr()
        year_mat = year_mat / year_mat.sum()
        print "Processing data for year", year
        for word_i in xrange(year_mat.shape[0]):
            word = index[word_i]
            if not word.isalpha() or word in stop_set or len(word) == 1:
                continue
            year_freq = year_mat[word_i, :].sum()
            word_freqs[word] = year_freq
        print "Writing data"
        sorted_list = sorted(word_freqs.keys(), key = lambda key : word_freqs[key], reverse=True)
        sorted_list = [word for word in sorted_list 
                    if word_freqs[word] > freq_thresh]
        ioutils.write_pickle(sorted_list, out_dir + str(year) + "tmp.pkl")
        ioutils.write_pickle(word_freqs, out_dir + str(year) + "freqstmp.pkl")
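Most of the queue-based workers in this collection (this one, and Examples #3, #4, #7, #16, #18, and #20, among others) expect a multiprocessing.Queue that was filled with one year or file name per task before the workers start. A minimal driver sketch under that assumption (names are illustrative):

from multiprocessing import Process, Queue

def run_parallel(num_procs, tasks, worker, *worker_args):
    queue = Queue()
    for task in tasks:
        queue.put(task)  # one entry per year / file name
    procs = [Process(target=worker, args=(i, queue) + worker_args)
             for i in range(num_procs)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()  # wait until each worker drains the queue and exits

Filling the queue before starting the workers matters here: queue.empty() and get(block=False) can both report an empty queue transiently, so workers polling an initially empty queue would exit immediately.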
Example #3
def main(proc_num, queue, out_dir, download_dir, context_size):
    print proc_num, "Start loop"
    while True:
        if queue.empty():
            break
        name = queue.get()
        loc_dir = out_dir + "/" + name + "/"
        ioutils.mkdir(loc_dir)

        print proc_num, "Going through", name
        index = collections.OrderedDict()
        year_counters = collections.defaultdict(collections.Counter)
        time.sleep(120 * random.random())
        with open(download_dir + name) as f:
            for i, l in enumerate(f):
                split = l.strip().split('\t')
                if EXCLUDE_PATTERN.match(split[0]):
                    continue
                ngram = [indexing.word_to_id(word.split("_")[0], index) for word in split[0].split()]
                year = split[1]
                count = int(split[2])
                if context_size == 2:
                    year_counters = update_count(ngram, 2, year, count, year_counters)
                elif context_size == 4:
                    year_counters = update_count(ngram, 0, year, count, year_counters)
                    year_counters = update_count(ngram, 4, year, count, year_counters)
                else:
                    raise Exception("Unsupported context size")

        print proc_num, "Writing", name
        time.sleep(120 * random.random())
        sparse_io.export_mats_from_dicts(year_counters, loc_dir)
        ioutils.write_pickle(index, loc_dir + "index.pkl")
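The indexing.word_to_id helper is not shown in these examples. A minimal sketch of what such an id-assigning lookup usually does (an assumption about the helper, not the repository's actual code):

def word_to_id(word, index):
    # Assign ids in order of first occurrence, mutating index
    # (an OrderedDict in the caller above).
    if word not in index:
        index[word] = len(index)
    return index[word]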
Example #4
def worker(proc_num, queue, dir, count_dir, min_count):
    while True:
        if queue.empty():
            break
        year = queue.get()
        print "Loading data..", year
#        time.sleep(120 * random.random())
        freqs = load_pickle(count_dir + str(year) + "-counts.pkl")
        iw = []
        with open(dir + str(year) + "-w.txt") as fp:
            info = fp.readline().split()
            vocab_size = int(info[0])
            dim = int(info[1])
            w_mat = np.zeros((vocab_size, dim))
            for i, line in enumerate(fp):
                line = line.strip().split()
                iw.append(line[0].decode("utf-8"))
                if freqs[iw[-1]] >= min_count:
                    w_mat[i,:] = np.array(map(float, line[1:]))
        c_mat = np.zeros((vocab_size, dim))
        with open(dir + str(year) + "-c.txt") as fp:
            fp.readline()
            for i, line in enumerate(fp):
                line = line.strip().split()
                if freqs[line[0]] >= min_count:
                    c_mat[i,:] = np.array(map(float, line[1:]))
        np.save(dir + str(year) + "-w.npy", w_mat)
        np.save(dir + str(year) + "-c.npy", c_mat)
        write_pickle(iw, dir + str(year) + "-vocab.pkl")
Example #5
def main(out_dir, source):
    page = requests.get("http://storage.googleapis.com/books/ngrams/books/datasetsv2.html")
    pattern = re.compile('href=\'(.*%s-%s-%s-.*\.csv.zip)' % (source, TYPE, VERSION))
    urls = pattern.findall(page.text)
    del page

    year_freqs = {}
    for year in YEARS:
        year_freqs[year] = {}

    print "Start loop"
    for url in urls:
        name = re.search('%s-(.*).csv.zip' % VERSION, url).group(1)

        print  "Downloading", name

        success = False
        while not success:
            with open(out_dir + name + '.csv.zip', 'wb') as f:
                try:
                    f.write(urllib2.urlopen(url, timeout=60).read())
                    success = True
                except:
                    continue

        print  "Unzipping", name
        subprocess.call(['unzip', '-o', out_dir + name + '.csv.zip', '-d', out_dir])
        subprocess.call(['mv', out_dir + 'googlebooks-' + source + '-' +  TYPE + '-' + VERSION + '-' + name + '.csv', out_dir + name])

        print  "Going through", name
        with open(out_dir + name) as f:
            for l in f:
                try:
                    split = l.strip().split('\t')
                    word = split[0].decode('utf-8').lower()
                    word = word.strip("\"")
                    if word.endswith("'s"):
                        word = word[:-2]  # drop the possessive suffix
                    year = int(split[1])
                    count = int(split[2])
                    doc_count = int(split[4])
                    if not year in YEARS:
                        continue
                    if not word in year_freqs[year]:
                        year_freqs[year][word] = (count, doc_count)
                    else:
                        old_counts = year_freqs[year][word]
                        year_freqs[year][word] = (old_counts[0] + count, old_counts[1] + doc_count)
                except UnicodeDecodeError:
                    pass

        print "Deleting", name
        try:
            os.remove(out_dir + name)
            os.remove(out_dir + name + '.csv.zip')
        except:
            pass

    print "Writing..."
    for year in YEARS:
        ioutils.write_pickle(year_freqs[year], out_dir + str(year) + "-freqs.pkl")
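The download loop above retries forever on any exception. A bounded-retry variant of the same idea (a sketch only; urllib2 is the Python 2 module these examples use, urllib.request in Python 3):

import urllib2

def download_with_retries(url, dest, attempts=5, timeout=60):
    # Fetch url into dest, giving up after a fixed number of attempts
    # instead of looping indefinitely.
    for attempt in range(attempts):
        try:
            data = urllib2.urlopen(url, timeout=timeout).read()
        except Exception:
            continue
        with open(dest, "wb") as f:
            f.write(data)
        return True
    return False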
Example #6
def main(proc_num, lock, in_dir, years, word_list, index):
    years = range(years[0], years[-1] + 1)
    random.shuffle(years)
    print proc_num, "Start loop"
    while True:
        lock.acquire()
        work_left = False
        for year in years:
            dirs = set(os.listdir(in_dir))
            if str(year) + "-freqs.pkl" in dirs:
                continue
            work_left = True
            print proc_num, "year", year
            fname = in_dir + str(year) + "-freqs.pkl"
            with open(fname, "w") as fp:
                fp.write("")
            break
        lock.release()
        if not work_left:
            print proc_num, "Finished"
            break

        print proc_num, "Retrieving mat for year", year
        mat = matstore.retrieve_mat_as_coo(in_dir + str(year) + ".bin")
        print proc_num, "Making inverse freq mat", year
        mat = mat.tocsr()
        mat = mat / mat.sum()
        word_stats = {}
        print proc_num, "Getting stats for year", year
        for word in word_list:
            word_stats[word] = compute_word_stats(mat, word, index)

        print proc_num, "Writing stats for year", year
        ioutils.write_pickle(word_stats, in_dir + str(year) + "-freqs.pkl")
Example #7
def worker(proc_num, queue, out_dir, in_dir):
    while True:
        try:
            decade = queue.get(block=False)
        except Empty:
            break

        print "Processing decade", decade
        counts = collections.defaultdict(int)       
        for year in range(10):
            embed = Explicit.load(in_dir + str(decade + year) + ".bin", normalize=False)
            if year == 0:
                merged_index = embed.wi
            year_list = load_pickle(in_dir + str(decade + year) + "-list.pkl")
            mat = embed.m.tocoo()
            for i in xrange(len(mat.data)):
                if mat.data[i] == 0:
                    continue
                new_row = get_index(merged_index, year_list, mat.row[i])
                new_col = get_index(merged_index, year_list, mat.col[i])
                counts[(new_row, new_col)] += mat.data[i]
            print "Done year ", decade + year
        export_mat_from_dict(counts, decade, out_dir)
        write_pickle(merged_index, out_dir + str(decade) + "-index.pkl")
        write_pickle(list(merged_index), out_dir + str(decade) + "-list.pkl")
Example #8
def merge(word_list, years, in_dir, out_file):
    yearstats = {}
    for word in word_list:
        yearstats[word] = {}
    for year in years:
        yearstat = ioutils.load_pickle(in_dir + str(year) + "-freqstmp.pkl")
        for word in yearstat.keys():
            yearstats[word][year] = yearstat[word]
        os.remove(in_dir + str(year) + "-freqstmp.pkl")
    ioutils.write_pickle(yearstats, out_file)
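The merge above pivots per-year dictionaries into per-word time series. A toy illustration of the same reshaping, runnable on its own:

yearstat_by_year = {1900: {"dog": 0.1}, 1910: {"dog": 0.2, "cat": 0.3}}
yearstats = {}
for year, yearstat in yearstat_by_year.items():
    for word, val in yearstat.items():
        yearstats.setdefault(word, {})[year] = val
print(yearstats)  # {'dog': {1900: 0.1, 1910: 0.2}, 'cat': {1910: 0.3}}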
Example #9
def run(out_file, in_dir, years, year_indices):
    samplesizes = {}
    for year in years:
        print "Processing year", year
        indices = year_indices[year]
        mat = matstore.retrieve_mat_as_coo(in_dir + str(year) + ".bin")
        mat = mat.tocsr()
        mat = mat[indices, :]
        mat = mat[:, indices]
        samplesizes[year] = mat.sum()
    ioutils.write_pickle(samplesizes, out_file)
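Slicing a CSR matrix first by rows and then by columns, as run() does, keeps only the block of co-occurrences between the selected words. A toy check with scipy:

import numpy as np
from scipy.sparse import coo_matrix

mat = coo_matrix(np.arange(16).reshape(4, 4)).tocsr()
indices = np.array([0, 2])
block = mat[indices, :][:, indices]  # rows 0 and 2, then columns 0 and 2
print(block.sum())  # 0 + 2 + 8 + 10 = 20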
Example #10
def run(out_dir, in_dir):
    index = collections.OrderedDict()
    for year in YEARS:
        print "Merging year", year
        year_list = ioutils.load_pickle(in_dir + str(year) + "-list.pkl")
        for word in year_list:
            indexing.word_to_cached_id(word, index)

    ioutils.write_pickle(index, out_dir + "merged_index.pkl") 
    ioutils.write_pickle(list(index), out_dir + "merged_list.pkl") 
Example #11
def worker(proc_num, queue):
    while True:
        try:
            decade = str(queue.get(block=False))
        except Empty:
            break
        print "Proc:", proc_num, "Decade:", decade
        proper_nouns = set([])
        pos_tags = load_pickle(DATA + str(decade) + "-pos-maj.pkl")
        for word, tag in pos_tags.iteritems():
            if tag == "np":
                proper_nouns.add(word)
        write_pickle(proper_nouns, OUT + str(decade) + "-proper_nouns.pkl")
Example #12
def main(proc_num, lock, out_dir, in_dir):
    years = YEARS
    random.shuffle(years)
    print proc_num, "Start loop"
    while True:
        lock.acquire()
        work_left = False
        for year in years:
            dirs = set(os.listdir(out_dir))
            if str(year) + "-a.pkl" in dirs:
                continue
            
            work_left = True
            print proc_num, "year", year
            fname = out_dir + str(year) + "-a.pkl"
            with open(fname, "w") as fp:
                fp.write("")
            break
        lock.release()
        if not work_left:
            print proc_num, "Finished"
            break

        print proc_num, "Merging grams for year", year
        year_grams = {}
        for letter in string.ascii_lowercase:
            year_grams[letter] = collections.defaultdict(list)

        for chunk_name in os.listdir(in_dir): 
            print "Processing chunk", chunk_name
            chunk_name = in_dir + str(chunk_name) + "/" + str(year) + ".pkl"
            if not os.path.isfile(chunk_name):
                continue
            chunk_counts = ioutils.load_pickle(chunk_name)
            for word, info_list in chunk_counts.iteritems():
                if word[0] not in year_grams:
                    continue
                for info in info_list:
                    gram = info[0].split("\t")[0]
                    count = info[1]
                    year_grams[word[0]][word].append((gram, count))
            
        print proc_num, "Writing counts for year", year
        for letter, letter_grams in year_grams.iteritems():
            for word in letter_grams:
                letter_grams[word] = sorted(letter_grams[word], key = lambda info : info[1], reverse=True)
            ioutils.write_pickle(letter_grams, out_dir + str(year) + "-" + letter + ".pkl")
Example #13
def merge(out_pref, years, full_word_list):
    merged_word_stats = {}
    for stat in STATS:
        merged_word_stats[stat] = {}
        for word in full_word_list:
            merged_word_stats[stat][word] = {}
    for year in years:
        year_stats = ioutils.load_pickle(out_pref + str(year) + "-tmp.pkl")
        for stat, stat_vals in year_stats.iteritems():
            for word in full_word_list:
                if not word in stat_vals:
                    merged_word_stats[stat][word][year] = NAN
                else:
                    merged_word_stats[stat][word][year] = stat_vals[word]
        os.remove(out_pref + str(year) + "-tmp.pkl")
    ioutils.write_pickle(merged_word_stats, out_pref + ".pkl")
Example #14
def merge(out_pref, tmp_dir, years):
    net_stats = collections.defaultdict(dict)
    rewire_net_stats = collections.defaultdict(dict)
    for year in years:
        year_stats = ioutils.load_pickle(tmp_dir + str(year) + "-tmp.pkl")
        rewire_year_stats = ioutils.load_pickle(tmp_dir + "rewire" + str(year) + "-tmp.pkl")
        for stat, val in year_stats.iteritems():
            net_stats[stat][year] = val
        for stat, val in rewire_year_stats.iteritems():
            rewire_net_stats[stat][year] = val
        os.remove(tmp_dir + str(year) + "-tmp.pkl")
        os.remove(tmp_dir + "rewire" + str(year) + "-tmp.pkl")
    for stat, year_vals in net_stats.iteritems():
        ioutils.write_pickle(year_vals, out_pref + "-" + stat + ".pkl")
    for stat, year_vals in rewire_net_stats.iteritems():
        ioutils.write_pickle(year_vals, out_pref + "-rw-" + stat + ".pkl")
Example #15
def merge(out_pref, tmp_out_pref, years, word_list):
    vol_yearstats = {}
    disp_yearstats = {}
    for word in word_list:
        vol_yearstats[word] = {}
        disp_yearstats[word] = {}
    for year in years:
        vol_yearstat = ioutils.load_pickle(tmp_out_pref + str(year) + "-jvols.pkl")
        disp_yearstat = ioutils.load_pickle(tmp_out_pref + str(year) + "-jdisps.pkl")
        for word in word_list:
            vol_yearstats[word][year] = vol_yearstat[word]
            disp_yearstats[word][year] = disp_yearstat[word]
        os.remove(tmp_out_pref + str(year) + "-jvols.pkl")
        os.remove(tmp_out_pref + str(year) + "-jdisps.pkl")
    ioutils.write_pickle(vol_yearstats, out_pref + "-jvols.pkl")
    ioutils.write_pickle(disp_yearstats, out_pref + "-jdisps.pkl")
Example #16
def worker(proc_num, queue, out_pref, in_dir, target_lists, context_lists, displacement_base, thresh, year_inc, type):
    time.sleep(10*random.random())
    while True:
        if queue.empty():
            print proc_num, "Finished"
            break
        year = queue.get()
        print proc_num, "Loading matrices..."
        base = create_representation(type, in_dir + str(year-year_inc), thresh=thresh, restricted_context=context_lists[year], normalize=True, add_context=False)
        delta = create_representation(type, in_dir + str(year), thresh=thresh, restricted_context=context_lists[year], normalize=True, add_context=False)
        print proc_num, "Getting deltas..."
        year_vols = get_cosine_deltas(base, delta, target_lists[year], type)
        year_disp = get_cosine_deltas(displacement_base, delta, target_lists[year], type)
        print proc_num, "Writing results..."
        ioutils.write_pickle(year_vols, out_pref + str(year) + "-vols.pkl")
        ioutils.write_pickle(year_disp, out_pref + str(year) + "-disps.pkl")
Example #17
def merge_bootstrap(out_pref):
    dir = "/".join(out_pref.split("/")[0:-1])
    bootfiles = os.listdir(dir)
    word_stat_lists = {}
    first_file = True
    file_num = 0
    for file in bootfiles:
        bootstats = ioutils.load_pickle(dir + "/" + file)
        print "Processing file", file
        for stat, stat_vals in bootstats.iteritems():
            if first_file:
                word_stat_lists[stat] = {}
            for word, year_vals in stat_vals.iteritems():
                if first_file:
                    word_stat_lists[stat][word] = {}
                for year, val in year_vals.iteritems():
                    if type(val) == float and np.isnan(val):
                        word_stat_lists[stat][word][year] = float('nan')
                    else:
                        if first_file:
                            word_stat_lists[stat][word][year] = np.empty((val.shape[0] * len(bootfiles)))
                        word_stat_lists[stat][word][year][file_num * val.shape[0]:(file_num + 1) * val.shape[0]] = val[:]
        first_file = False
        file_num += 1
    print "Making means and stds"
    word_stat_means = {}
    word_stat_stds = {}
    for stat, stat_vals in word_stat_lists.iteritems():
        word_stat_means[stat] = {}
        word_stat_stds[stat] = {}
        for word, year_vals in stat_vals.iteritems():
            word_stat_means[stat][word] = {}
            word_stat_stds[stat][word] = {}
            for year, val in year_vals.iteritems():
                if type(val) == float and np.isnan(val):
                    word_stat_means[stat][word][year] = float('nan')
                    word_stat_stds[stat][word][year] = float('nan')
                else:
                    word_stat_means[stat][word][year] = val.mean()
                    word_stat_stds[stat][word][year] = val.std()
    print "Writing data"
    for stat, mean_vals in word_stat_means.iteritems():
        ioutils.write_pickle(mean_vals, out_pref + "-" + stat + "-mean.pkl")
    for stat, std_vals in word_stat_stds.iteritems():
        ioutils.write_pickle(std_vals, out_pref + "-" + stat + "-std.pkl")
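The aggregation above packs the bootstrap samples from several files into one array per (stat, word, year) and then summarizes. The core numpy step, in isolation:

import numpy as np

samples = [np.array([0.10, 0.20]), np.array([0.15, 0.25])]  # one array per bootstrap file
vals = np.concatenate(samples)  # what the preallocated slices build up
print(vals.mean(), vals.std())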
Example #18
def worker(proc_num, queue, out_pref, in_dir, word_list, displacement_base, thresh):
    while True:
        try: 
            year = queue.get(block=False)
        except Empty:
            print proc_num, "Finished"
            break
       
        print proc_num, "Loading matrices..."
        base = simple_create_representation(REP_TYPE, in_dir + str(year-1) + ".bin", restricted_context=word_list[year-1], thresh=thresh)
        delta = simple_create_representation(REP_TYPE, in_dir + str(year) + ".bin", restricted_context=word_list[year], thresh=thresh)
        print proc_num, "Getting deltas..."
        year_vols = get_cosine_deltas(base, delta, word_list[year])
        year_disp = get_cosine_deltas(displacement_base, delta, word_list[year])
        print proc_num, "Writing results..."
        ioutils.write_pickle(year_vols, out_pref + str(year) + "-vols.pkl")
        ioutils.write_pickle(year_disp, out_pref + str(year) + "-disps.pkl")
Example #19
def align_years(years, rep_type, in_dir, out_dir, count_dir, min_count, **rep_args):
    first_iter = True
    base_embed = None
    for year in years:
        print "Loading year:", year
        year_embed = create_representation(rep_type, in_dir + str(year), **rep_args)
        year_words = words_above_count(count_dir, year, min_count)
        year_embed = year_embed.get_subembed(year_words)  # get_subembed returns a new embedding
        print "Aligning year:", year
        if first_iter:
            aligned_embed = year_embed
            first_iter = False
        else:
            aligned_embed = alignment.smart_procrustes_align(base_embed, year_embed)
        base_embed = aligned_embed
        print "Writing year:", year
        foutname = out_dir + str(year)
        np.save(foutname + "-w.npy",aligned_embed.m)
        write_pickle(aligned_embed.iw, foutname + "-vocab.pkl")
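alignment.smart_procrustes_align is not shown here; it presumably also reconciles the two vocabularies and normalizes. The core of any such alignment is the orthogonal Procrustes solution, sketched below with plain numpy (rows of both matrices must already correspond to the same words):

import numpy as np

def procrustes_align(base_mat, other_mat):
    # The orthogonal matrix R minimizing ||other @ R - base||_F is
    # R = U @ Vt, where U, _, Vt = svd(other.T @ base).
    u, _, vt = np.linalg.svd(other_mat.T.dot(base_mat))
    return other_mat.dot(u).dot(vt)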
Example #20
def worker(proc_num, queue, in_dir):
    print proc_num, "Start loop"
    while True:
        try: 
            year = queue.get(block=False)
        except Empty:
            print proc_num, "Finished"
            break

        print proc_num, "Making second orders for year", year
        old_mat = matstore.retrieve_mat_as_coo(in_dir + str(year) + ".bin")
        row_d, col_d, data_d, keep_rows = make_secondorder_mat(old_mat)
        old_index = list(ioutils.load_pickle(in_dir + str(year) + "-index.pkl"))
        new_index = collections.OrderedDict()
        for i in xrange(len(keep_rows)):
            new_index[old_index[keep_rows[i]]] = i
        ioutils.write_pickle(new_index, in_dir + "/second/" + str(year) + "-index.pkl")
        print proc_num, "Writing counts for year", year
        matstore.export_mat_eff(row_d, col_d, data_d, year, in_dir + "/second/")
Example #21
def get_sorted_words(years, out_dir, in_dir):
    word_freqs = collections.defaultdict(float)
    for year in years:
        print "Processing year", year
        year_freqs = ioutils.load_pickle(in_dir + str(year) + "-freqs.pkl")
        total = 0.0
        for _, counts in year_freqs.iteritems():
            total += counts[0]
        for word, counts in year_freqs.iteritems():
            if not word.isalpha():
                continue
            word_freqs[word] += float(counts[0]) / total
    print "Writing data"
    sorted_list = sorted(word_freqs.keys(), key=lambda w: word_freqs[w], reverse=True)
    out_pref = out_dir + "sortedwords-" + str(years[0]) + "-" + str(years[-1])
    with open(out_pref + ".txt", "w") as out_fp:
        for word in sorted_list:
            out_fp.write(word.encode('utf-8') + " " + str(word_freqs[word] / float(len(years))) + "\n")
    ioutils.write_pickle(sorted_list, out_pref + ".pkl")
Example #22
def align_years(years, rep_type, in_dir, out_dir, count_dir, min_count, **rep_args):
    first_iter = True
    base_embed = None
    for year in years:
        print("Loading year:", year)
        year_embed = create_representation(rep_type, in_dir + str(year), **rep_args)
        year_words = words_above_count(count_dir, year, min_count)
        year_embed = year_embed.get_subembed(year_words)  # get_subembed returns a new embedding
        print("Aligning year:", year)
        if first_iter:
            aligned_embed = year_embed
            first_iter = False
        else:
            aligned_embed = alignment.smart_procrustes_align(base_embed, year_embed)
        base_embed = aligned_embed
        print("Writing year:", year)
        foutname = out_dir + str(year)
        np.save(foutname + "-w.npy",aligned_embed.m)
        write_pickle(aligned_embed.iw, foutname + "-vocab.pkl")
Example #23
def merge(years, out_pref, out_dir):
    word_freqs = collections.defaultdict(dict)
    word_lists = {}
    word_set = set([])
    for year in years:
        word_lists[year] = ioutils.load_pickle(out_dir + str(year) + "tmp.pkl")
        word_set = word_set.union(set(word_lists[year]))
        os.remove(out_dir + str(year) + "tmp.pkl")
    for year in years:
        year_freqs = ioutils.load_pickle(out_dir + str(year) + "freqstmp.pkl")
        for word in word_set:
            if word not in year_freqs:
                word_freqs[word][year] = float('nan')
            else:
                word_freqs[word][year] = year_freqs[word]
        os.remove(out_dir + str(year) + "freqstmp.pkl")

    ioutils.write_pickle(word_freqs, out_pref + "-freqs.pkl")
    ioutils.write_pickle(word_lists, out_pref + ".pkl")
Example #24
def merge(years, out_pref, out_dir):
    word_freqs = collections.defaultdict(dict)
    word_lists = {}
    word_set = set([])
    for year in years:
        word_lists[year] = ioutils.load_pickle(out_dir + str(year) + "tmp.pkl")
        word_set = word_set.union(set(word_lists[year]))
        os.remove(out_dir + str(year) + "tmp.pkl")
    for year in years:
        year_freqs = ioutils.load_pickle(out_dir + str(year) + "freqstmp.pkl")
        for word in word_set:
            if word not in year_freqs:
                word_freqs[word][year] = float('nan')
            else:
                word_freqs[word][year] = year_freqs[word]
        os.remove(out_dir + str(year) + "freqstmp.pkl")

    ioutils.write_pickle(word_freqs, out_pref + "-freqs.pkl")
    ioutils.write_pickle(word_lists, out_pref + ".pkl")
Example #25
def main(proc_num, queue, out_dir, download_dir, context_size, is_zipped):
    print proc_num, "Start loop"
    while True:
        if queue.empty():
            break
        name = queue.get()

        if is_zipped:
            if not name.endswith(".gz"):
                continue
            print "Unzipping " + name + " ..."
            subprocess.call(['gunzip', '-f', download_dir + name, '-d'])
            name = name.split(".gz")[0]

        loc_dir = out_dir + "/" + name + "/"
        ioutils.mkdir(loc_dir)

        print proc_num, "Going through", name
        index = collections.OrderedDict()
        year_counters = collections.defaultdict(collections.Counter)
        time.sleep(120 * random.random())
        with open(download_dir + name) as f:
            for i, l in enumerate(f):
                split = l.strip().split('\t')
                if EXCLUDE_PATTERN.match(split[0]):
                    continue
                ngram = [indexing.word_to_id(word.split("_")[0], index) for word in split[0].split()]
                year = split[1]
                count = int(split[2])
                if context_size == 2:
                    year_counters = update_count(ngram, 2, year, count, year_counters)
                elif context_size == 4:
                    year_counters = update_count(ngram, 0, year, count, year_counters)
                    year_counters = update_count(ngram, 4, year, count, year_counters)
                else:
                    raise Exception("Unsupported context size")

        print proc_num, "Writing", name
        time.sleep(120 * random.random())
        sparse_io_ref.export_mats_from_dicts(year_counters, loc_dir)
        ioutils.write_pickle(index, loc_dir + "index.pkl")
        os.remove(download_dir + name)
Example #26
def worker(proc_num, queue, out_dir, in_dir, count_dir, words, dim, num_words, min_count=100):
    while True:
        if queue.empty():
            break
        year = queue.get()
        print "Loading embeddings for year", year
        time.sleep(random.random() * 120)
        valid_words = set(words_above_count(count_dir, year, min_count))
        print len(valid_words)
        year_words = list(valid_words.intersection(words[year][:num_words]))
        print len(year_words)
        base_embed = Explicit.load((in_dir + INPUT_FORMAT).format(year=year), normalize=False)
        base_embed = base_embed.get_subembed(year_words, restrict_context=True)
        print "SVD for year", year
        u, s, v = randomized_svd(base_embed.m, n_components=dim, n_iter=5)
        print "Saving year", year
        np.save((out_dir + OUT_FORMAT).format(year=year, dim=dim) + "-u.npy", u)
        np.save((out_dir + OUT_FORMAT).format(year=year, dim=dim) + "-v.npy", v)
        np.save((out_dir + OUT_FORMAT).format(year=year, dim=dim) + "-s.npy", s)
        write_pickle(base_embed.iw, (out_dir + OUT_FORMAT).format(year=year, dim=dim) + "-vocab.pkl")
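The worker above stores the SVD factors separately. One common way to turn them back into dense word vectors (a convention, not necessarily this repository's; the paths are illustrative):

import numpy as np

u = np.load("vecs/1990-300-u.npy")
s = np.load("vecs/1990-300-s.npy")
word_vecs = u * s  # scale each column by its singular value; rows are word vectors
# A frequently used variant downweights the singular values: u * (s ** 0.5)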
Example #27
def worker(proc_num, queue, out_dir, in_dir):
    while True:
        try:
            decade = queue.get(block=False)
        except Empty:
            break

        print "Processing decade", decade
        for year in range(10):
            year_counts = load_pickle(in_dir + str(decade + year) + "-counts.pkl")
            if year == 0:
                merged_year_counts = year_counts
                continue  # the first year seeds the merged dict; do not add it to itself
            for word, count in year_counts.iteritems():
                if not word in merged_year_counts:
                    merged_year_counts[word] = 0
                merged_year_counts[word] += count

        write_pickle(merged_year_counts, out_dir + str(decade) + "-counts.pkl")
Example #28
def main(years, out_dir, in_dir, count_dir, min_count, num_words):
    print "Making common vocab"
    words = ioutils.load_pickle(in_dir + str(years[0]) + "-list.pkl")
    for year in years:
        counts_year = ioutils.load_pickle(count_dir + str(year) +
                                          "-counts.pkl")
        use_words = sorted(counts_year.keys(),
                           key=lambda word: counts_year[word],
                           reverse=True)[:num_words]  # keep the most frequent words
        use_words = set(
            word for word in use_words if counts_year[word] > min_count
        )
        words = [word for word in words if word in use_words]
        print year, "vocab, done"

    ioutils.write_pickle(list(words), out_dir + "common_vocab.pkl")
Example #29
def main(proc_num, queue, out_dir, in_dir, context_size):
    ioutils.mkdir(out_dir)
    print proc_num, "Start loop"
    while True:  # Iterates through the years
        try:
            year = queue.get(block=False)
        except Empty:
            print proc_num, "Finished"
            break
        print proc_num, "- Loading mat for year", year
        year_mat = load_matrix(in_dir + str(year) + ".bin")
        index = ioutils.load_pickle(in_dir + str(year) + "-index.pkl")
        print proc_num, "- Processing data for year", year
        counts = year_mat.sum(1) / (2 * context_size)  # sums up the occurrence
        counts = {
            word: int(counts[index[word]])
            for word in index if index[word] < len(counts)
        }
        ioutils.write_pickle(counts, out_dir + "/" + str(year) +
                             "-counts.pkl")  # writes it in a file
Example #30
def worker(proc_num, queue, out_pref, in_dir, year_index_infos, thresh):
    print proc_num, "Start loop"
    time.sleep(10 * random.random())
    while True:
        try: 
            year = queue.get(block=False)
        except Empty:
            print proc_num, "Finished"
            break

        print proc_num, "Retrieving mat for year", year
        if thresh != None:
            mat = sparse_io.retrieve_mat_as_coo_thresh(in_dir + str(year) + ".bin", thresh)
        else:
            mat = sparse_io.retrieve_mat_as_coo(in_dir + str(year) + ".bin", min_size=5000000)
        print proc_num, "Getting stats for year", year
        year_stats = get_year_stats(mat, year_index_infos[year]["index"], year_index_infos[year]["list"], index_set=set(year_index_infos[year]["indices"]))

        print proc_num, "Writing stats for year", year
        ioutils.write_pickle(year_stats, out_pref + str(year) + "-tmp.pkl")
Example #31
def main(proc_num, lock, years, out_pref, out_dir, in_dir, index, freq_thresh):
    random.shuffle(years)
    print proc_num, "Start loop"
    while True:
        lock.acquire()
        work_left = False
        for year in years:
            dirs = set(os.listdir(out_dir))
            if str(year) + "tmp.pkl" in dirs:
                continue
            work_left = True
            print proc_num, "year", year
            fname = out_dir + str(year) + "tmp.pkl"
            with open(fname, "w") as fp:
                fp.write("")
            break
        lock.release()
        if not work_left:
            print proc_num, "Finished"
            break

        stop_set = set(stopwords.words('english'))
        word_freqs = {}
        print "Loading mat for year", year
        year_mat = matstore.retrieve_mat_as_coo(in_dir + str(year) + ".bin")
        year_mat = year_mat.tocsr()
        year_mat = year_mat / year_mat.sum()
        print "Processing data for year", year
        for word_i in xrange(year_mat.shape[0]):
            word = index[word_i]
            if not word.isalpha() or word in stop_set or len(word) == 1:
                continue
            year_freq = year_mat[word_i, :].sum()
            word_freqs[word] = year_freq
        print "Writing data"
        sorted_list = sorted(word_freqs.keys(), key = lambda key : word_freqs[key], reverse=True)
        sorted_list = [word for word in sorted_list 
                    if word_freqs[word] > freq_thresh]
        ioutils.write_pickle(sorted_list, out_dir + str(year) + "tmp.pkl")
        ioutils.write_pickle(word_freqs, out_dir + str(year) + "freqstmp.pkl")
Example #32
def main(proc_num, lock, out_pref, tmp_out_pref, in_dir, years, word_list,
         word_indices, displacement_base, thresh):
    while True:
        lock.acquire()
        work_left = False
        for year in years:
            dirs = set(os.listdir(in_dir + "/volstats/"))
            if tmp_out_pref.split("/")[-1] + str(year) + "-jvols.pkl" in dirs:
                continue
            work_left = True
            print proc_num, "year", year
            fname = tmp_out_pref + str(year) + "-jvols.pkl"
            with open(fname, "w") as fp:
                fp.write("")
            break
        lock.release()
        if not work_left:
            print proc_num, "Finished"
            break

        print proc_num, "Loading matrices..."
        base = matstore.retrieve_mat_as_binary_coo_thresh(
            in_dir + "/" + str(year - 1) + ".bin",
            thresh,
            min_size=MIN_SIZE)
        base = base.tocsr()
        delta = matstore.retrieve_mat_as_binary_coo_thresh(in_dir + "/" +
                                                           str(year) + ".bin",
                                                           thresh,
                                                           min_size=MIN_SIZE)
        delta = delta.tocsr()
        print proc_num, "Getting deltas..."
        year_vols = get_jaccard_deltas(base, delta, word_list, word_indices)
        year_disp = get_jaccard_deltas(displacement_base, delta, word_list,
                                       word_indices)
        print proc_num, "Writing results..."
        ioutils.write_pickle(year_vols,
                             tmp_out_pref + str(year) + "-jvols.pkl")
        ioutils.write_pickle(year_disp,
                             tmp_out_pref + str(year) + "-jdisps.pkl")
Example #33
def worker(proc_num, queue, out_pref, in_dir, year_index_infos, knn, thresh):
    print proc_num, "Start loop"
    while True:
        try: 
            year = queue.get(block=False)
        except Empty:
            print proc_num, "Finished"
            break
        print proc_num, "Making second orders for year", year
        old_embed = simple_create_representation(REP_TYPE, in_dir + str(year) + ".bin", thresh=thresh)
        old_embed = old_embed.get_subembed(year_index_infos[year]["list"])
        old_mat = old_embed.m.tocoo()
        row_d, col_d, data_d, keep_rows = make_secondorder_mat(old_mat, thresh=thresh, min_cooccurs=0, shrink_mat=False)
        second_mat = coo_matrix((data_d, (row_d, col_d)))
        if knn != None:
            row_d, col_d, data_d = make_knn_mat(second_mat, knn)
            second_mat = coo_matrix((data_d, (row_d, col_d)))
        year_stats = get_year_stats(second_mat, old_embed.wi, old_embed.iw, stats=STATS)
        print proc_num, "Writing stats for year", year
        ioutils.write_pickle(year_stats, out_pref + str(year) + "-tmp.pkl")
Example #34
def align_years(years, rep_type, in_dir, out_dir, **rep_args):
    first_iter = True
    base_embed = None
    for year in years:  # Iterates through years
        print "Loading year:", year
        year_embed = create_representation(
            rep_type, in_dir + str(year),
            **rep_args)  # Loads the individual embedding
        print "Aligning year:", year
        if first_iter:
            aligned_embed = year_embed
            first_iter = False
        else:
            aligned_embed = alignment.smart_procrustes_align(
                base_embed, year_embed,
                post_normalize=False)  # Rotates to the previous year embedding
        base_embed = aligned_embed
        print "Writing year:", year
        foutname = out_dir + str(year)
        np.save(foutname + "-w.npy", aligned_embed.m)
        write_pickle(aligned_embed.iw, foutname + "-vocab.pkl")
Example #35
def merge(years, out_pref, out_dir):
    word_freqs = collections.defaultdict(
        dict)  # dict mapping year to word-relative_frequency pairs
    word_lists = {}  # dict mapping year to list of used words
    word_set = set([])  # set of words ever used
    for year in years:  # Collects word_lists
        word_lists[year] = ioutils.load_pickle(out_dir + str(year) + "tmp.pkl")
        word_set = word_set.union(set(word_lists[year]))
        os.remove(out_dir + str(year) + "tmp.pkl")
    for year in years:  # Collects relative frequencies
        year_freqs = ioutils.load_pickle(out_dir + str(year) + "freqstmp.pkl")
        for word in word_set:
            if word not in year_freqs:
                word_freqs[word][year] = float('nan')
            else:
                word_freqs[word][year] = year_freqs[word]
        os.remove(out_dir + str(year) + "freqstmp.pkl")

    ioutils.write_pickle(word_freqs,
                         out_pref + "-freqs.pkl")  # Saves relative frequencies
    ioutils.write_pickle(word_lists, out_pref + ".pkl")  # Saves word_lists
Example #36
def main(proc_num, lock, out_pref, tmp_dir, in_dir, years, word_infos, thresh):
    random.shuffle(years)
    print proc_num, "Start loop"
    while True:
        lock.acquire()
        work_left = False
        for year in years:
            existing_files = set(os.listdir(tmp_dir))
            fname = str(year) + "-tmp.pkl"
            if fname in existing_files:
                continue
            work_left = True
            print proc_num, "year", year
            with open(tmp_dir + fname, "w") as fp:
                fp.write("")
            break
        lock.release()
        if not work_left:
            print proc_num, "Finished"
            break

        print proc_num, "Retrieving mat for year", year
        if thresh != None:
            mat = matstore.retrieve_mat_as_coo_thresh(in_dir + str(year) + ".bin", thresh)
        else:
            mat = matstore.retrieve_mat_as_coo(in_dir + str(year) + ".bin")

        mat.setdiag(0)
        if word_infos != None:
            word_indices = word_infos[year][1]
            indices = word_indices[word_indices < min(mat.shape[1], mat.shape[0])]
        else:
            indices = np.arange(mat.shape[0])
        year_graph = make_snap_graph(indices, mat)
        print proc_num, "Getting statistics for year", year
        year_stats = compute_graph_stats(year_graph)
        rewire_year_stats = compute_graph_stats(snap.GenRewire(year_graph, REWIRE_EDGE_SWITCHES))
        ioutils.write_pickle(year_stats, tmp_dir + fname)
        ioutils.write_pickle(rewire_year_stats, tmp_dir + "rewire" + fname)
Example #37
def merge_year_counts(out_dir, name_list, years):
    for year in years:
        year_counts = {}
        year_doc_counts = {}
        year_pos = {}

        for name in name_list:
            tmp_year_counts = ioutils.load_pickle(out_dir + "/" + name + "/" + str(year) + "-counts.pkl")
            tmp_year_doc_counts = ioutils.load_pickle(out_dir + "/" + name + "/" + str(year) + "-doc_counts.pkl")
            tmp_year_pos = ioutils.load_pickle(out_dir + "/" + name + "/" + str(year) + "-pos.pkl")
            for word, count in tmp_year_counts.iteritems():
                if not word in year_counts:
                    year_counts[word] = 0
                    year_doc_counts[word] = 0
                    year_pos[word] = collections.Counter()
                year_counts[word] += tmp_year_counts[word]
                year_doc_counts[word] += tmp_year_doc_counts[word]
                counter_keys = tmp_year_pos[word].keys()
                for pos in counter_keys:
                    year_pos[word][pos] += tmp_year_pos[word][pos]

        print "Writing merged counts for " + str(year) + " ..."
        ioutils.write_pickle(year_counts, out_dir + str(year) + "-counts.pkl")
        ioutils.write_pickle(year_doc_counts, out_dir + str(year) + "-doc_counts.pkl")
        ioutils.write_pickle(year_pos, out_dir + str(year) + "-pos.pkl")

    print "Deleting temp dirs ..."
    remove_tmp_dirs(out_dir, name_list)
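The inner loop above merges the POS counters key by key. collections.Counter can perform the same merge directly, which is worth knowing when extending this code:

from collections import Counter

year_pos_word = Counter({"nn": 2})
year_pos_word.update(Counter({"nn": 1, "vb": 4}))  # adds counts per key
print(year_pos_word)  # Counter({'vb': 4, 'nn': 3})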
Example #38
def worker(proc_num, queue):
    while True:
        try:
            decade = str(queue.get(block=False))
        except Empty:
            break
        print("Proc:", proc_num, "Decade:", decade)
        word_freqs = Counter()
        lemma_freqs = Counter()
        lemma_pos_freqs = Counter()
        for file in os.listdir(DATA + decade):
            with open(DATA + decade + "/" + file) as fp:
                print(proc_num, file)
                fp.readline()
                for line in fp:
                    word, lemma, lemma_pos, _ = process_lemma_line(line)
                    if word == None:
                        continue
                    if lemma_pos == None:
                        continue
                    word_freqs[word] += 1
                    lemma_freqs[lemma] += 1
                    lemma_pos_freqs[lemma_pos] += 1
        write_pickle(word_freqs, OUT + "decade_freqs/" + decade + "-word.pkl")
        write_pickle(lemma_freqs,
                     OUT + "decade_freqs/" + decade + "-lemma.pkl")
        write_pickle(lemma_pos_freqs,
                     OUT + "decade_freqs/" + decade + "-lemma_pos.pkl")
Example #39
def worker(proc_num, queue):
    while True:
        try:
            decade = str(queue.get(block=False))
        except Empty:
            break
        print "Proc:", proc_num, "Decade:", decade
        word_freqs = Counter()
        lemma_freqs = Counter()
        lemma_pos_freqs = Counter()
        for file in os.listdir(DATA + decade):
            with open(DATA + decade + "/" + file) as fp:
                print proc_num, file
                fp.readline()
                for line in fp:
                    word, lemma, lemma_pos, _ = process_lemma_line(line)
                    if word == None:
                        continue
                    if lemma_pos == None:
                        continue
                    word_freqs[word] += 1
                    lemma_freqs[lemma] += 1
                    lemma_pos_freqs[lemma_pos] += 1
        write_pickle(word_freqs, OUT + "decade_freqs/" + decade + "-word.pkl") 
        write_pickle(lemma_freqs, OUT + "decade_freqs/" + decade + "-lemma.pkl") 
        write_pickle(lemma_pos_freqs, OUT + "decade_freqs/" + decade + "-lemma_pos.pkl") 
Example #40
def worker(proc_num, queue, in_dir):
    print proc_num, "Start loop"
    while True:
        try:
            year = queue.get(block=False)
        except Empty:
            print proc_num, "Finished"
            break

        print proc_num, "Making second orders for year", year
        old_mat = matstore.retrieve_mat_as_coo(in_dir + str(year) + ".bin")
        row_d, col_d, data_d, keep_rows = make_secondorder_mat(old_mat)
        old_index = list(ioutils.load_pickle(in_dir + str(year) +
                                             "-index.pkl"))
        new_index = collections.OrderedDict()
        for i in xrange(len(keep_rows)):
            new_index[old_index[keep_rows[i]]] = i
        ioutils.write_pickle(new_index,
                             in_dir + "/second/" + str(year) + "-index.pkl")
        print proc_num, "Writing counts for year", year
        matstore.export_mat_eff(row_d, col_d, data_d, year,
                                in_dir + "/second/")
Example #41
def worker(proc_num, queue, out_dir, in_dir):
    while True:  # Iterates through the decades
        try:
            decade = queue.get(block=False)
        except Empty:
            break

        print "Processing decade", decade
        counts = collections.defaultdict(
            int)  # this dict represents the co-occurrence matrix
        for year in range(10):  # Iterates through the years in the decade
            embed = Explicit.load(
                in_dir + str(decade + year) + ".bin", normalize=False
            )  # Makes an embedding about the individual year (here is needed the own index)
            if year == 0:
                merged_index = embed.wi
            if os.path.isfile(in_dir + str(decade + year) + "-list.pkl"):
                year_list = load_pickle(in_dir + str(decade + year) +
                                        "-list.pkl")
            else:
                year_list = load_pickle(in_dir + "merged_list.pkl")
            mat = embed.m.tocoo()
            for i in xrange(
                    len(mat.data)
            ):  # Iterates through the word-context pairs and counts the co-occurrence
                if mat.data[i] == 0:
                    continue
                new_row = word_to_cached_id(year_list[mat.row[i]],
                                            merged_index)
                new_col = word_to_cached_id(year_list[mat.col[i]],
                                            merged_index)
                counts[(new_row, new_col)] += mat.data[
                    i]  # Adds the co-occurrence to the decade-data
            print "Done year ", decade + year
        export_mat_from_dict(counts, out_dir + str(decade) +
                             ".bin")  # Saves the decadely co-occurrence matrix
        write_pickle(merged_index, out_dir + str(decade) +
                     "-index.pkl")  # Saves the decadely index
        write_pickle(list(merged_index), out_dir + str(decade) + "-list.pkl")
Example #42
def main(proc_num, lock, in_dir, years, word_list):
    years = range(years[0], years[-1] + 1)
    random.shuffle(years)
    print proc_num, "Start loop"
    while True:
        lock.acquire()
        work_left = False
        for year in years:
            dirs = set(os.listdir(in_dir))
            if str(year) + "-freqstmp.pkl" in dirs:
                continue
            work_left = True
            print proc_num, "year", year
            fname = in_dir + str(year) + "-freqstmp.pkl"
            with open(fname, "w") as fp:
                fp.write("")
            break
        lock.release()
        if not work_left:
            print proc_num, "Finished"
            break
        
        year_freqs = ioutils.load_pickle(in_dir + "/" + str(year) + "-freqs.pkl")
        word_stats = {}
        print proc_num, "Getting stats for year", year
        total = 0
        for word in word_list:
            if word in year_freqs:
                word_count = year_freqs[word][1]
                total += word_count
                word_stats[word] = word_count
        for word in word_stats:
            word_stats[word] /= float(total)

        print proc_num, "Writing stats for year", year
        ioutils.write_pickle(word_stats, in_dir + str(year) + "-freqstmp.pkl")
Example #43
def main(proc_num, queue, out_pref, out_dir, in_dir, index, freq_thresh, lang):
    # random.shuffle(years) is unnecessary here: the years are drawn from the queue
    print proc_num, "Start loop"
    while True:  # Iterates through the years
        try:
            year = queue.get(block=False)
        except Empty:
            print proc_num, "Finished"
            break
        #stop_set = set(stopwords.words(lang))
        word_freqs = {}  # dict with word-relative_freq pairs
        print "Loading mat for year", year
        year_mat = sparse_io.retrieve_mat_as_coo(in_dir + str(year) + ".bin")
        year_mat = year_mat.tocsr()
        year_mat = year_mat / year_mat.sum()  # normalizes the co-occurrence matrix
        print "Processing data for year", year
        for word_i in xrange(year_mat.shape[0]):
            word = index[word_i]
            # (could also filter stop words or one-letter words here)
            if not word.isalpha():  # filters out degenerate tokens
                continue
            # thanks to the normalization this is the word's relative frequency
            year_freq = year_mat[word_i, :].sum()
            word_freqs[word] = year_freq
        print "Writing data"
        sorted_list = sorted(word_freqs.keys(),
                             key=lambda key: word_freqs[key],
                             reverse=True)  # sorting and filtering
        sorted_list = [
            word for word in sorted_list if word_freqs[word] > freq_thresh
        ]
        ioutils.write_pickle(sorted_list, out_dir + str(year) +
                             "tmp.pkl")  # Saves the list of words
        ioutils.write_pickle(word_freqs, out_dir + str(year) +
                             "freqstmp.pkl")  # Saves the relative frequencies
Example #44
def main(proc_num, queue, out_pref, out_dir, in_dir, index, freq_thresh, lang):
    random.shuffle(years)
    print proc_num, "Start loop"
    while True:
        try:
            year = queue.get(block=False)
        except Empty:
            print proc_num, "Finished"
            break
        stop_set = set(stopwords.words(lang))
        word_freqs = {}
        print "Loading mat for year", year
        try:
            year_mat = sparse_io_ref.retrieve_mat_as_coo(in_dir + str(year) +
                                                         ".bin")
        except (TypeError, ValueError):
            continue
        year_mat = year_mat.tocsr()
        year_mat = year_mat / year_mat.sum()
        print "Processing data for year", year
        for word_i in xrange(year_mat.shape[0]):
            word = index[word_i]
            if not word.isalpha() or word in stop_set or len(word) == 1:
                continue
            year_freq = year_mat[word_i, :].sum()
            word_freqs[word] = year_freq
        print "Writing data"
        sorted_list = sorted(word_freqs.keys(),
                             key=lambda key: word_freqs[key],
                             reverse=True)
        sorted_list = [
            word for word in sorted_list if word_freqs[word] > freq_thresh
        ]
        ioutils.write_pickle(sorted_list, out_dir + str(year) + "tmp.pkl")
        ioutils.write_pickle(word_freqs, out_dir + str(year) + "freqstmp.pkl")
Example #45
def get_sorted_words(years, out_pref, in_dir, avg_thresh, min_thresh):
    stop_set = set(stopwords.words('english'))
    word_freqs = collections.defaultdict(float)
    word_mins = collections.defaultdict(lambda : 1.0)
    for year in years:
        print "Processing year", year
        year_freqs = ioutils.load_pickle(in_dir + str(year) + "-freqs.pkl")
        total = 0.0
        for _, counts in year_freqs.iteritems():
            total += counts[0]
        for word, counts in year_freqs.iteritems():
            if not word.isalpha() or word in stop_set or len(word) == 1:
                continue
            year_freq = float(counts[0]) / total
            word_freqs[word] += year_freq
            word_mins[word] = min(word_mins[word], year_freq)
    print "Writing data"
    sorted_list = sorted(word_freqs.keys(), key=lambda w: word_freqs[w], reverse=True)
    sorted_list = [word for word in sorted_list
                   if (word_freqs[word] / float(len(years)) > avg_thresh and word_mins[word] > min_thresh)]
    with open(out_pref + ".txt", "w") as out_fp:
        for word in sorted_list:
            out_fp.write(word.encode('utf-8') + " " + str(word_freqs[word] / float(len(years))) + "\n")
    ioutils.write_pickle(sorted_list, out_pref + ".pkl")
Example #46
def merge(out_pref, years, word_list):
    vol_yearstats = {}
    disp_yearstats = {}
    for word in word_list:
        vol_yearstats[word] = {}
        disp_yearstats[word] = {}
    for year in years:
        vol_yearstat = ioutils.load_pickle(out_pref + str(year) + "-vols.pkl")
        disp_yearstat = ioutils.load_pickle(out_pref + str(year) + "-disps.pkl")
        for word in word_list:
            if word not in vol_yearstat:
                vol = float('nan')
            else:
                vol = vol_yearstat[word]
            if word not in disp_yearstat:
                disp = float('nan')
            else:
                disp = disp_yearstat[word]
            vol_yearstats[word][year] = vol
            disp_yearstats[word][year] = disp
        os.remove(out_pref + str(year) + "-vols.pkl")
        os.remove(out_pref + str(year) + "-disps.pkl")
    ioutils.write_pickle(vol_yearstats, out_pref + "-vols.pkl")
    ioutils.write_pickle(disp_yearstats, out_pref + "-disps.pkl")
Example #47
def worker(proc_num, queue):
    while True:
        try:
            decade = str(queue.get(block=False))
        except Empty:
            break
        print("Proc:", proc_num, "Decade:", decade)
        pos_tags = collections.defaultdict(collections.Counter)
        for file in os.listdir(DATA + decade):
            with open(DATA + decade + "/" + file) as fp:
                print(proc_num, file)
                fp.readline()
                for line in fp:
                    word, lemma, lemma_pos, _ = process_lemma_line(line)
                    if word == None:
                        continue
                    if lemma_pos == None:
                        continue
                    pos_tags[word][lemma_pos.split("_")[1]] += 1
        write_pickle(pos_tags, OUT + str(decade) + "-pos-counts.pkl")
        pos_maj = {}
        for word, pos_counts in pos_tags.items():
            pos_maj[word] = pos_counts.most_common(1)[0][0]  # most frequent tag
        write_pickle(pos_maj, OUT + str(decade) + "-pos-maj.pkl")
Example #48
def merge(out_pref, years, word_list):
    vol_yearstats = {}
    disp_yearstats = {}
    for word in word_list:
        vol_yearstats[word] = {}
        disp_yearstats[word] = {}
    for year in years:
        vol_yearstat = ioutils.load_pickle(out_pref + str(year) + "-vols.pkl")
        disp_yearstat = ioutils.load_pickle(out_pref + str(year) + "-disps.pkl")
        for word in word_list:
            if word not in vol_yearstat:
                vol = float('nan')
            else:
                vol = vol_yearstat[word]
            if word not in disp_yearstat:
                disp = float('nan')
            else:
                disp = disp_yearstat[word]
            vol_yearstats[word][year] = vol
            disp_yearstats[word][year] = disp
        os.remove(out_pref + str(year) + "-vols.pkl")
        os.remove(out_pref + str(year) + "-disps.pkl")
    ioutils.write_pickle(vol_yearstats, out_pref + "vols.pkl")
    ioutils.write_pickle(disp_yearstats, out_pref + "disps.pkl")
Example #49
def make_word_list(type):
    process_word = lambda word : word if type != "lemma_pos" else word.split("_")[0]
    freqs = load_pickle(FREQS.format(type=type))
    word_lists = {}
    nstop_lists = {}
    nproper_lists = {}
    nstop_nproper_lists = {}
    print "Processing type: ", type
    for year, year_freqs in freqs.iteritems():
        proper_nouns = load_pickle(PROPER_NOUNS.format(year=year))
        word_lists[year] = [word for word in sorted(year_freqs, key = lambda val : -1*year_freqs[val]) if word != "" and word.isalnum()]
        nstop_lists[year] = [word for word in sorted(year_freqs, key = lambda val : -1*year_freqs[val]) if not process_word(word) in STOPWORDS and not word == "" and word.isalnum()]
        nproper_lists[year] = [word for word in sorted(year_freqs, key = lambda val : -1*year_freqs[val]) if not process_word(word) in proper_nouns and not word == "" and word.isalnum()]
        nstop_nproper_lists[year] = [word for word in sorted(year_freqs, key = lambda val : -1*year_freqs[val]) if not process_word(word) in proper_nouns 
                and not process_word(word) in STOPWORDS and not word == "" and word.isalnum()]
        print "Finished year: ", year
    write_pickle(word_lists, OUT.format(type=type, cond="all"))
    write_pickle(nstop_lists, OUT.format(type=type, cond="nstop"))
    write_pickle(nproper_lists, OUT.format(type=type, cond="nproper"))
    write_pickle(nstop_nproper_lists, OUT.format(type=type, cond="nstop_nproper"))
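
make_word_list is presumably invoked once per vocabulary variant, matching the type placeholder in the FREQS and OUT templates. A sketch of that entry point (the entry point itself is an assumption; the variant names follow the lemma_pos special case above and the word/lemma/lemma_pos files seen later in this listing):

# Assumed entry point: build filtered word lists for each variant.
if __name__ == "__main__":
    for type in ("word", "lemma", "lemma_pos"):
        make_word_list(type)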
Example #50
def main(proc_num, lock, download_dir, source):
    page = requests.get("http://storage.googleapis.com/books/ngrams/books/datasetsv2.html")
    pattern = re.compile('href=\'(.*%s-%s-%s-.*\.gz)' % (source, TYPE, VERSION))
    urls = pattern.findall(page.text)
    del page

    print proc_num, "Start loop"
    while True:
        lock.acquire()
        work_left = False
        for url in urls:
            name = re.search('%s-(.*).gz' % VERSION, url).group(1)
            dirs = set(os.listdir(download_dir))
            if name in dirs:
                continue

            work_left = True
            print proc_num, "Name", name
            loc_dir = download_dir + "/" + name + "/"
            ioutils.mkdir(loc_dir)
            break
        lock.release()
        if not work_left:
            print proc_num, "Finished"
            break

        print proc_num, "Downloading", name

        success = False
        while not success:
            # Binary mode: the payload is raw gzip data.
            with open(loc_dir + name + '.gz', 'wb') as f:
                try:
                    f.write(urllib2.urlopen(url, timeout=60).read())
                    success = True
                except:
                    # Retry on network errors/timeouts.
                    continue

        print proc_num, "Unzipping", name
        subprocess.call(['gunzip', '-f', loc_dir + name + '.gz'])

        print proc_num, "Going through", name
        year_grams = collections.defaultdict(dict)
        with open(loc_dir + name) as f:
            for l in f:
                l = l.decode('utf-8').lower()
                split = l.strip().split('\t')
                if EXCLUDE_PATTERN.match(split[0]):
                    continue
                try:
                    ngram = split[0].split()
                    middle_index = len(ngram) // 2
                    item = ngram[middle_index]
                    if (not item.isalpha()) or item in STOPWORDS:
                        continue
                    year = split[1]
                    count = int(split[2])
                    if item not in year_grams[year]:
                        year_grams[year][item] = [(l, count)]
                    else:
                        year_grams[year][item].append((l, count))
                except:
                    # Skip malformed lines.
                    pass

        print proc_num, "Writing", name
        for year in year_grams:
            ioutils.write_pickle(year_grams[year], loc_dir + str(year) + ".pkl")

        print proc_num, "Deleting", name
        try:
            os.remove(loc_dir + name + '.gz')
        except:
            pass
Example #51
from ioutils import load_pickle, write_pickle

DIR = "/dfs/scratch0/COHA/decade_freqs/"
word = {}
lemma = {}
lemma_pos = {}
for year in range(1810, 2010, 10):
    word[year] = load_pickle(DIR + str(year) + "-word.pkl")
    lemma[year] = load_pickle(DIR + str(year) + "-lemma.pkl")
    lemma_pos[year] = load_pickle(DIR + str(year) + "-lemma_pos.pkl")

write_pickle(word, DIR + "word.pkl")
write_pickle(lemma, DIR + "lemma.pkl")
write_pickle(lemma_pos, DIR + "lemma_pos.pkl")
Example #52
# Tail of worker(proc_num, queue): the per-decade frequency Counters
# built above this point in the source are written out here.
        write_pickle(lemma_freqs,
                     OUT + "decade_freqs/" + decade + "-lemma.pkl")
        write_pickle(lemma_pos_freqs,
                     OUT + "decade_freqs/" + decade + "-lemma_pos.pkl")


if __name__ == "__main__":
    queue = Queue()
    for decade in range(1810, 2010, 10):
        queue.put(decade)
    procs = [Process(target=worker, args=[i, queue]) for i in range(25)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
    print("Getting full freqs...")
    word_freqs = Counter()
    lemma_freqs = Counter()
    lemma_pos_freqs = Counter()
    for decade in range(1810, 2010, 10):
        decade = str(decade)
        print(decade)
        word_freqs += load_pickle(OUT + "decade_freqs/" + decade + "-word.pkl")
        lemma_freqs += load_pickle(OUT + "decade_freqs/" + decade +
                                   "-lemma.pkl")
        lemma_pos_freqs += load_pickle(OUT + "decade_freqs/" + decade +
                                       "-lemma_pos.pkl")
    write_pickle(word_freqs, OUT + "full_freqs/word.pkl")
    write_pickle(lemma_freqs, OUT + "full_freqs/lemma.pkl")
    write_pickle(lemma_pos_freqs, OUT + "full_freqs/lemma_pos.pkl")
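
The merging loop above leans on the fact that collections.Counter supports +=, which sums counts key-wise instead of overwriting values. A standalone illustration:

from collections import Counter

total = Counter({"a": 2})
total += Counter({"a": 3, "b": 1})
# total is now Counter({"a": 5, "b": 1})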
Example #53
def main(proc_num, lock, download_dir, source):
    page = requests.get(
        "http://storage.googleapis.com/books/ngrams/books/datasetsv2.html")
    pattern = re.compile('href=\'(.*%s-%s-%s-.*\.csv.zip)' %
                         (source, TYPE, VERSION))
    urls = pattern.findall(page.text)
    del page

    print proc_num, "Start loop"
    while True:
        lock.acquire()
        work_left = False
        for url in urls:
            name = re.search('%s-(.*).csv.zip' % VERSION, url).group(1)
            dirs = set(os.listdir(download_dir))
            if name in dirs:
                continue

            work_left = True
            print proc_num, "Name", name
            loc_dir = download_dir + "/" + name + "/"
            ioutils.mkdir(loc_dir)
            break
        lock.release()
        if not work_left:
            print proc_num, "Finished"
            break

        print proc_num, "Downloading", name

        success = False
        while not success:
            # Binary mode: the payload is a zip archive.
            with open(loc_dir + name + '.csv.zip', 'wb') as f:
                try:
                    f.write(urllib2.urlopen(url, timeout=60).read())
                    success = True
                except:
                    # Retry on network errors/timeouts.
                    continue

        print proc_num, "Unzipping", name
        subprocess.call(
            ['unzip', '-o', loc_dir + name + '.csv.zip', '-d', loc_dir])
        subprocess.call([
            'mv', loc_dir + 'googlebooks-' + source + '-' + TYPE + '-' +
            VERSION + '-' + name + '.csv', loc_dir + name
        ])

        print proc_num, "Going through", name
        index = collections.OrderedDict()
        year_counters = collections.defaultdict(collections.Counter)
        with open(loc_dir + name) as f:
            for l in f:
                split = l.strip().split('\t')
                try:
                    ngram = split[0].split()
                    middle_index = len(ngram) // 2
                    item = ngram[middle_index]
                    context = ngram[:middle_index] + ngram[middle_index + 1:]
                    item_id = indexing.word_to_id(item, index)
                    year = split[1]
                    count = int(split[2])
                    for context_word in context:
                        pair = (item_id,
                                indexing.word_to_id(context_word, index))
                        year_counters[year][pair] += count
                except:
                    # Skip malformed lines.
                    pass

        print proc_num, "Writing", name, n
        matstore.export_mats_from_dicts(year_counters, loc_dir)
        ioutils.write_pickle(index, loc_dir + "index.pkl")

        print proc_num, "Deleting", name
        try:
            os.remove(loc_dir + name)
            os.remove(loc_dir + name + '.csv.zip')
        except:
            pass
Example #54
def process_file(fp, word_dict, lemma_dict, lemma_pos_dict):
    for line in fp:
        word, lemma, lemma_pos, _ = process_lemma_line(line)
        if word is None or lemma_pos is None:
            continue
        if word not in word_dict:
            id = len(word_dict)
            word_dict[word] = id
        if lemma not in lemma_dict:
            id = len(lemma_dict)
            lemma_dict[lemma] = id
        if lemma_pos not in lemma_pos_dict:
            id = len(lemma_pos_dict)
            lemma_pos_dict[lemma_pos] = id

if __name__ == "__main__":
    word_dict = {}
    lemma_dict = {}
    lemma_pos_dict = {}
    for decade in range(1810, 2010, 10):
        folder = str(decade)
        print("Processing decade...", folder)
        for file in os.listdir(DATA + folder):
            with open(DATA + folder + "/" + file) as fp:
                print("Processing file..", folder + "/" + file)
                process_file(fp, word_dict, lemma_dict, lemma_pos_dict)
    write_pickle(word_dict, OUT + "word-dict.pkl") 
    write_pickle(lemma_dict, OUT + "lemma-dict.pkl") 
    write_pickle(lemma_pos_dict, OUT + "lemma-pos-dict.pkl") 
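
Each of the three assign-the-next-id blocks in process_file can be collapsed into the standard dict.setdefault idiom: len(d) is evaluated before the key is inserted, so it is exactly the next unused id. A behavior-equivalent sketch:

# Equivalent incremental-id assignment via dict.setdefault.
word_dict.setdefault(word, len(word_dict))
lemma_dict.setdefault(lemma, len(lemma_dict))
lemma_pos_dict.setdefault(lemma_pos, len(lemma_pos_dict))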
Example #55
def main(out_dir, source, years):
    page = requests.get(
        "http://storage.googleapis.com/books/ngrams/books/datasetsv2.html"
    )  # gets the urls of the 1gram datafiles
    pattern = re.compile('href=\'(.*%s-%s-%s-.*\.gz)' %
                         (source, TYPE, VERSION))
    urls = pattern.findall(page.text)
    del page

    # Per-year dicts tracking how often each word occurs.
    year_counts = {}      # occurrences per word (POS tags collapsed)
    year_doc_counts = {}  # number of books in which the word occurs
    year_pos = {}         # occurrences per word, split by POS tag
    for year in years:
        year_pos[year] = {}
        year_counts[year] = {}
        year_doc_counts[year] = {}

    print "Start loop"
    for url in urls:  # iterates through the urls
        name = re.search('%s-(.*).gz' % VERSION, url).group(1)
        print "Downloading", name

        success = False
        while not success:  # download the actual datafile (binary mode)
            with open(out_dir + name + '.gz', 'wb') as f:
                try:
                    f.write(urllib2.urlopen(url, timeout=60).read())
                    success = True
                except:
                    continue

        print "Unzipping", name  # unzips the downloaded datafile
        subprocess.call(['gunzip', '-f', out_dir + name + '.gz', '-d'])

        print "Going through", name  # iterates through the lines of the datafile and counts the uccurrence of the words
        with open(out_dir + name) as f:
            for l in f:
                try:
                    split = l.strip().split('\t')
                    if not POS.match(split[0]):
                        continue
                    count = int(split[2])
                    if count < 10:
                        continue
                    word_info = split[0].split("_")
                    pos = word_info[-1]
                    word = word_info[0].decode('utf-8').lower()
                    word = word.strip("\"")
                    word = word.split("\'s")[0]
                    if not word.isalpha():
                        continue
                    word = str(esb.stem(word))
                    year = int(split[1])
                    doc_count = int(split[3])
                    if year not in years:
                        continue
                    if word not in year_counts[year]:
                        year_counts[year][word] = 0
                        year_doc_counts[year][word] = 0
                        year_pos[year][word] = collections.Counter()
                    year_counts[year][word] += count
                    year_doc_counts[year][word] += doc_count
                    year_pos[year][word][pos] += count
                except UnicodeDecodeError:
                    pass

        print "Deleting", name  # deletes the downloaded files
        try:
            os.remove(out_dir + name)
            os.remove(out_dir + name + '.gz')
        except:
            pass

    print "Writing..."  # writes the data into pkl files
    for year in years:
        ioutils.write_pickle(year_counts[year],
                             out_dir + str(year) + "-counts.pkl")
        ioutils.write_pickle(year_doc_counts[year],
                             out_dir + str(year) + "-doc_counts.pkl")
        ioutils.write_pickle(year_pos[year], out_dir + str(year) + "-pos.pkl")
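
Because the stemming step quietly normalizes every counted word, it is worth seeing what snowball.EnglishStemmer actually does (standard NLTK API; a standalone sketch, not from the source):

from nltk.stem import snowball

esb = snowball.EnglishStemmer()
print esb.stem("running")  # run
print esb.stem("books")    # book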
Example #56
if __name__ == "__main__":
    queue = Queue()
    for decade in range(1810, 2010, 10):
        queue.put(decade)
    procs = [Process(target=worker, args=[i, queue]) for i in range(25)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
    print("Getting full set...")
    pos_counts = {}
    print("Merging pos counts..")
    for decade in range(1810, 2010, 10):
        decade_pos_counts = load_pickle(DATA + str(decade) + "-pos-counts.pkl")
        for word, counts in decade_pos_counts.items(): 
            if word not in pos_counts:
                pos_counts[word] = collections.Counter()
            for pos, count in counts.items():
                pos_counts[word][pos] += count
    write_pickle(pos_counts, DATA + "all-pos-counts.pkl")
    pos_maj = {}
    proper_nouns = set([])
    for word, p_counts in pos_counts.items():
        pos_maj[word] = sorted(p_counts, key=lambda t: -p_counts[t])[0]
        if pos_maj[word] == "np":
            proper_nouns.add(word)
    write_pickle(pos_maj, OUT + "all-pos-maj.pkl")
    write_pickle(proper_nouns, OUT + "proper_nouns.pkl")
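
Downstream, the proper_nouns.pkl set is presumably what a word-list builder such as make_word_list earlier in this listing consults to produce its proper-noun-free variants. A minimal, assumed usage sketch (word_list is a placeholder):

# Assumed downstream use: drop majority-proper-noun words from a list.
proper_nouns = load_pickle(OUT + "proper_nouns.pkl")
filtered = [w for w in word_list if w not in proper_nouns]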
Example #57
def main(out_dir, source, years):
    page = requests.get(
        "http://storage.googleapis.com/books/ngrams/books/datasetsv2.html")
    pattern = re.compile('href=\'(.*%s-%s-%s-.*\.gz)' %
                         (source, TYPE, VERSION))
    urls = pattern.findall(page.text)
    del page

    year_counts = {}
    year_doc_counts = {}
    year_pos = {}
    for year in years:
        year_pos[year] = {}
        year_counts[year] = {}
        year_doc_counts[year] = {}

    print "Start loop"
    for url in urls:
        name = re.search('%s-(.*).gz' % VERSION, url).group(1)
        print "Downloading", name

        success = False
        while not success:
            # Binary mode: the payload is raw gzip data.
            with open(out_dir + name + '.gz', 'wb') as f:
                try:
                    f.write(urllib2.urlopen(url, timeout=60).read())
                    success = True
                except:
                    continue

        print "Unzipping", name
        subprocess.call(['gunzip', '-f', out_dir + name + '.gz'])

        print "Going through", name
        with open(out_dir + name) as f:
            for l in f:
                try:
                    split = l.strip().split('\t')
                    if not POS.match(split[0]):
                        continue
                    count = int(split[2])
                    if count < 10:
                        continue
                    word_info = split[0].split("_")
                    pos = word_info[-1]
                    word = word_info[0].decode('utf-8').lower()
                    word = word.strip("\"")
                    word = word.split("\'s")[0]
                    year = int(split[1])
                    doc_count = int(split[3])
                    if year not in years:
                        continue
                    if word not in year_counts[year]:
                        year_counts[year][word] = 0
                        year_doc_counts[year][word] = 0
                        year_pos[year][word] = collections.Counter()
                    year_counts[year][word] += count
                    year_doc_counts[year][word] += doc_count
                    year_pos[year][word][pos] += count
                except UnicodeDecodeError:
                    pass

        print "Deleting", name
        try:
            os.remove(out_dir + name)
            os.remove(out_dir + name + '.gz')
        except:
            pass

    print "Writing..."
    for year in years:
        ioutils.write_pickle(year_counts[year],
                             out_dir + str(year) + "-counts.pkl")
        ioutils.write_pickle(year_doc_counts[year],
                             out_dir + str(year) + "-doc_counts.pkl")
        ioutils.write_pickle(year_pos[year], out_dir + str(year) + "-pos.pkl")