Exemplo n.º 1
0
def load_matrix(f):
    """Load the sparse matrix stored at *f* (a ``.bin`` file) as CSR.

    A ``.bin`` suffix is appended to *f* when missing.  The Cython-backed
    ``sparse_io`` reader is compiled on demand via ``pyximport``.
    """
    path = f if f.endswith(".bin") else f + ".bin"
    import pyximport
    # Compile the .pyx reader on the fly; it needs the NumPy headers.
    pyximport.install(setup_args={"include_dirs": np.get_include()})
    from representations import sparse_io
    coo = sparse_io.retrieve_mat_as_coo(path.encode())
    return coo.tocsr()
Exemplo n.º 2
0
def main(proc_num, queue, out_dir, in_dir):
    """Worker loop: re-index each year's co-occurrence counts onto the merged index.

    Pulls year numbers off *queue* until it is empty.  For every year it maps
    each (row, col) pair of the old per-year matrix through the merged
    word index and writes the translated counts back out as
    ``<out_dir><year>.bin``.  *proc_num* is only used to tag log output.
    """
    merged_index = ioutils.load_pickle(out_dir + "merged_index.pkl")
    print proc_num, "Start loop"
    while True:  # Iterates through the years
        try:
            year = queue.get(block=False)
        except Empty:  # queue drained -> this worker is done
            print proc_num, "Finished"
            break
        print proc_num, "Fixing counts for year", year
        fixed_counts = {}  # This is the new co-occurrence matrix
        old_mat = sparse_io.retrieve_mat_as_coo(in_dir + str(year) +
                                                ".bin").todok()
        old_index = ioutils.load_pickle(in_dir + str(year) + "-list.pkl")
        for pair, count in old_mat.iteritems(
        ):  # Iterates through the unmerged co-occurrence matrix ...
            try:
                i_word = old_index[pair[0]]
            except IndexError:
                # Matrix row is outside the year's index: dump the offending
                # pair and abort the whole worker (data inconsistency).
                print pair
                sys.exit(0)
            c_word = old_index[pair[1]]
            try:
                new_pair = (indexing.word_to_static_id(i_word, merged_index),
                            indexing.word_to_static_id(c_word, merged_index))
            except KeyError:  # Filters words to drop out
                continue
            fixed_counts[
                new_pair] = count  # ... and add the counts to the new one

        print proc_num, "Writing counts for year", year  # Saves the new co-occurrence matrices
        sparse_io.export_mat_from_dict(fixed_counts,
                                       out_dir + str(year) + ".bin")
Exemplo n.º 3
0
def worker(proc_num, queue, out_pref, in_dir, year_index_infos, thresh):
    """Worker loop: compute per-year statistics for every year pulled from *queue*.

    For each year it loads ``<in_dir><year>.bin`` (applying *thresh* as a
    count threshold when given), runs ``get_year_stats`` with that year's
    index information, and pickles the result to ``<out_pref><year>-tmp.pkl``.
    *proc_num* only tags log output.
    """
    print(proc_num, "Start loop")
    # Random stagger so parallel workers don't hit the filesystem at once.
    time.sleep(10 * random.random())
    while True:
        try:
            year = queue.get(block=False)
        except Empty:  # queue drained -> done
            print(proc_num, "Finished")
            break

        print(proc_num, "Retrieving mat for year", year)
        # Fix: compare against None with identity (`is not None`), not
        # equality (`!= None`) — PEP 8; equality can misfire on objects
        # with custom __eq__.
        if thresh is not None:
            mat = sparse_io.retrieve_mat_as_coo_thresh(
                in_dir + str(year) + ".bin", thresh)
        else:
            mat = sparse_io.retrieve_mat_as_coo(in_dir + str(year) + ".bin",
                                                min_size=5000000)
        print(proc_num, "Getting stats for year", year)
        year_stats = get_year_stats(mat,
                                    year_index_infos[year]["index"],
                                    year_index_infos[year]["list"],
                                    index_set=set(
                                        year_index_infos[year]["indices"]))

        print(proc_num, "Writing stats for year", year)
        ioutils.write_pickle(year_stats, out_pref + str(year) + "-tmp.pkl")
def load_matrix(f):
    """Load the sparse matrix stored at *f* (a ``.bin`` file) as CSR.

    Appends a ``.bin`` suffix to *f* when it is missing; the Cython
    ``sparse_io`` reader is compiled on demand through ``pyximport``.
    """
    path = f if f.endswith(".bin") else f + ".bin"
    import pyximport
    # On-the-fly compilation of the .pyx module needs the NumPy headers.
    pyximport.install(setup_args={"include_dirs": np.get_include()})
    from representations import sparse_io
    return sparse_io.retrieve_mat_as_coo(path).tocsr()
Exemplo n.º 5
0
def randomize(infile, mode, outfile):
    """Load the co-occurrence matrix from *infile*, randomize its elements
    according to the callable *mode*, and write the result to *outfile*.

    *mode* receives the matrix in DOK form and must return the randomized
    matrix (presumably also dict-like; verify against export_mat_from_dict).
    """
    dok_matrix = sparse_io.retrieve_mat_as_coo(infile).todok()
    print "matrix loaded and converted to dok."
    dok_matrix = mode(dok_matrix)  # delegate the actual shuffling strategy
    print "randomization done. writing..."
    sparse_io.export_mat_from_dict(dok_matrix, outfile)
    print "success"
Exemplo n.º 6
0
def main(proc_num, lock, out_dir, in_dir, years):
    """Worker loop: merge per-chunk co-occurrence matrices into one matrix per year.

    Workers coordinate through *lock* plus marker files: the first worker to
    claim a year creates an empty ``<out_dir><year>.bin`` while holding the
    lock, so other workers skip that year.  The claimed year's counts are
    then summed across every chunk directory in *in_dir* and written out
    together with the merged index (as an OrderedDict and as a plain list).
    """
    print proc_num, "Start loop"
    years.reverse()
    while True:
        lock.acquire()
        work_left = False
        # Iterates through the years (in a so complicated way)
        for year in years:
            dirs = set(os.listdir(out_dir))
            if str(
                    year
            ) + ".bin" in dirs:  # Checks if the individual year exists in the target directory. If it doesn't, allows the merging
                continue

            work_left = True
            print proc_num, "year", year
            # Touch an empty marker file to claim this year before
            # releasing the lock.
            fname = out_dir + str(year) + ".bin"
            with open(fname, "w") as fp:
                fp.write("")
            fp.close()  # NOTE(review): redundant — the `with` block already closed fp
            break
        lock.release()
        if not work_left:
            print proc_num, "Finished"
            break

        print proc_num, "Merging counts for year", year  # Merging starts here
        full_counts = collections.defaultdict(float)
        merged_index = collections.OrderedDict()
        for chunk_num in os.listdir(
                in_dir
        ):  # Iterates through the alphabetically separated co-occurrence data
            chunk_name = in_dir + str(chunk_num) + "/" + str(year) + ".bin"
            if not os.path.isfile(chunk_name):
                continue
            chunk_counts = sparse_io.retrieve_mat_as_coo(chunk_name)
            chunk_index = ioutils.load_pickle(in_dir + str(chunk_num) +
                                              "/index.pkl")
            chunk_index = list(chunk_index)
            for pair, count in chunk_counts.todok().iteritems(
            ):  # Iterates through the co-occurrence matrices and add the occurrence of the word-pairs to the merged co-occurrence matrix
                i_word = chunk_index[pair[0]]
                c_word = chunk_index[pair[1]]
                # word_to_cached_id grows merged_index as new words appear.
                new_pair = (indexing.word_to_cached_id(i_word, merged_index),
                            indexing.word_to_cached_id(c_word, merged_index))
                full_counts[new_pair] += count

        print proc_num, "Writing counts for year", year
        sparse_io.export_mat_from_dict(
            full_counts, out_dir + str(year) +
            ".bin")  # Saves the yearly merged co-occurrence matrices.
        ioutils.write_pickle(merged_index, out_dir + str(year) +
                             "-index.pkl")  # ... and the merged index
        ioutils.write_pickle(list(merged_index),
                             out_dir + str(year) + "-list.pkl")
Exemplo n.º 7
0
def worker(proc_num, queue, out_pref, in_dir, year_index_infos, thresh):
    print proc_num, "Start loop"
    time.sleep(10 * random.random())
    while True:
        try: 
            year = queue.get(block=False)
        except Empty:
            print proc_num, "Finished"
            break

        print proc_num, "Retrieving mat for year", year
        if thresh != None:
            mat = sparse_io.retrieve_mat_as_coo_thresh(in_dir + str(year) + ".bin", thresh)
        else:
            mat = sparse_io.retrieve_mat_as_coo(in_dir + str(year) + ".bin", min_size=5000000)
        print proc_num, "Getting stats for year", year
        year_stats = get_year_stats(mat, year_index_infos[year]["index"], year_index_infos[year]["list"], index_set = set(year_index_infos[year]["indices"]))

        print proc_num, "Writing stats for year", year
        ioutils.write_pickle(year_stats, out_pref + str(year) + "-tmp.pkl")
Exemplo n.º 8
0
def main(proc_num, queue, out_pref, out_dir, in_dir, index, freq_thresh, lang):
    """Worker loop: extract per-word relative frequencies from yearly matrices.

    For each year taken from *queue*, normalizes the year's co-occurrence
    matrix so row sums become relative word frequencies, filters out
    non-alphabetic words, and writes (a) the frequency-sorted word list
    restricted to words above *freq_thresh* and (b) the full
    word->frequency dict, both pickled under *out_dir*.
    NOTE(review): *out_pref* and *lang* are currently unused (the
    stopword filtering that used *lang* is commented out).
    """
    #random.shuffle(years) # I don't know what it is for
    print proc_num, "Start loop"
    while True:  # Iterates through the years
        try:
            year = queue.get(block=False)
        except Empty:  # queue drained -> done
            print proc_num, "Finished"
            break
        #stop_set = set(stopwords.words(lang))
        word_freqs = {}  # dict with word-relative_freq pairs
        print "Loading mat for year", year
        year_mat = sparse_io.retrieve_mat_as_coo(in_dir + str(year) + ".bin")
        year_mat = year_mat.tocsr()
        year_mat = year_mat / year_mat.sum(
        )  # normalizes the co-occurrence matrix
        print "Processing data for year", year
        for word_i in xrange(year_mat.shape[0]):
            word = index[word_i]
            if not word.isalpha(
            ):  # or word in stop_set or len(word) == 1: # filters out the degenerated words
                continue
            year_freq = year_mat[word_i, :].sum(
            )  # thank to the normalization it's the relative frequency of the word
            word_freqs[word] = year_freq
        print "Writing data"
        sorted_list = sorted(word_freqs.keys(),
                             key=lambda key: word_freqs[key],
                             reverse=True)  # sorting and filtering
        sorted_list = [
            word for word in sorted_list if word_freqs[word] > freq_thresh
        ]
        ioutils.write_pickle(sorted_list, out_dir + str(year) +
                             "tmp.pkl")  # Saves the list of words
        ioutils.write_pickle(word_freqs, out_dir + str(year) +
                             "freqstmp.pkl")  # Saves the relative frequencies
Exemplo n.º 9
0
def load_matrix(f):
    """Return the matrix stored in f as a Compressed Sparse Row (CSR) matrix.

    Note: `.tocsr()` yields CSR, not CSC — the previous docstring's
    "Compressed Sparse Column" was incorrect.  A ``.bin`` suffix is
    appended to *f* when missing.
    """
    if not f.endswith('.bin'):
        f += ".bin"
    return sparse_io.retrieve_mat_as_coo(f).tocsr()