Example #1
def get_raw_tdm(setting):
    if not force_gen and all(os.path.isfile(filename) for filename in [
            "derived_data/" + setting_string(**setting) + "__raw_tdm.npz",
            "derived_data/" + setting_string(**setting) + "__ids"]):

        mat = load_csr_matrix("derived_data/" + setting_string(**setting) + "__raw_tdm.npz")
        ids = []
        with open("derived_data/" + setting_string(**setting) + "__ids") as f:
            count = 0
            for line in f:
                x = line.split(";")
                if setting['granularity'] == 'paragraphs':
                    ids.append((count, (x[0], x[1].strip())))
                elif setting['granularity'] == 'documents':
                    ids.append((count, x[0].strip()))
                else:
                    raise ValueError("granularity must be either 'documents' or 'paragraphs'")

        row2id_map = dict(ids)
        token2index_map = get_token2index_map(setting)
        column2token_map = {index: token for token, index in token2index_map.items()}

        return mat, row2id_map, column2token_map
    else:
        token2index_map = get_token2index_map(setting)
        column2token_map = {index: token for token, index in token2index_map.items()}

        if setting['granularity'] == "paragraphs":
            paragraph_generator = get_all_docs_paragrahps_as_token_list(setting['token_method'], setting['data_basis'])
            mat, id_log = build_raw_csr_matrix(paragraph_generator, token2index_map)
        elif setting['granularity'] == "documents":
            document_generator = get_all_documents_as_feature_map(setting['token_method'], setting['data_basis'])
            mat, id_log = build_raw_csr_matrix(document_generator, token2index_map)
        else:
            raise ValueError("granularity must be either paragraphs or documents")

        save_csr_matrix(mat, "derived_data/" + setting_string(**setting) + "__raw_tdm")

        f = open("derived_data/" + setting_string(**setting) + "__ids", "w")
        if setting['granularity'] == "paragraphs":
            for id in id_log:
                f.write(id[0] + ";" + id[1] + "\n")
        elif setting['granularity'] == "documents":
            for id in id_log:
                f.write(id + "\n")
        f.close()

        row2id_map = dict(zip(range(len(id_log)), id_log))

        return mat, row2id_map, column2token_map
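
get_raw_tdm caches its result under derived_data/ and regenerates it when the module-level force_gen flag is set or a file is missing. A minimal usage sketch; the setting keys are the ones the function reads, but the concrete values (and the behaviour of setting_string) are assumptions, not part of this listing:

# Hypothetical call; the values below are placeholders, not names known to exist in the project.
setting = {
    'granularity': 'documents',      # or 'paragraphs'
    'token_method': 'some_tokenizer',
    'data_basis': 'some_corpus',
}
mat, row2id_map, column2token_map = get_raw_tdm(setting)
print(mat.shape, len(row2id_map), len(column2token_map))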
Example #2
def get_normed_author_theorem_matrix(setting):
    if (not force_gen
            and os.path.isfile("derived_data/" + setting_string(**setting) + "__normed_author_theorem_matrix.npz")
            and os.path.isfile("derived_data/" + setting_string(**setting) + "__normed_theorem_author_matrix.npz")):
        normed_author_theorem_mat = load_csr_matrix("derived_data/" + setting_string(**setting) + "__normed_author_theorem_matrix.npz")
        normed_theorem_author_mat = load_csr_matrix("derived_data/" + setting_string(**setting) + "__normed_theorem_author_matrix.npz")

        return normed_author_theorem_mat, normed_theorem_author_mat
    else:
        mat, r, c = get_author_theorem_matrix(setting)
        normed_author_theorem_mat = normalize(mat)
        normed_theorem_author_mat = normalize(mat.transpose())
        save_csr_matrix(normed_author_theorem_mat, "derived_data/" + setting_string(**setting) + "__normed_author_theorem_matrix")
        save_csr_matrix(normed_theorem_author_mat, "derived_data/" + setting_string(**setting) + "__normed_theorem_author_matrix")
        return normed_author_theorem_mat, normed_theorem_author_mat
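
get_normed_author_theorem_matrix row-normalizes the matrix in both orientations. A small sketch of what that step does, assuming normalize here is sklearn.preprocessing.normalize (which defaults to unit L2 row norms); if the project ships its own normalize, the idea is the same but the details may differ:

import numpy as np
from scipy.sparse import csr_matrix
from sklearn.preprocessing import normalize  # assumption: this is the normalize used above

mat = csr_matrix(np.array([[3.0, 4.0],
                           [0.0, 2.0]]))
rows_unit = normalize(mat)              # each row scaled to unit L2 norm
cols_unit = normalize(mat.transpose())  # rows of the transpose, i.e. original columns
print(rows_unit.toarray())              # [[0.6, 0.8], [0.0, 1.0]]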
Example #3
def get_author_msc_matrix():
    if not force_gen and all(os.path.isfile(filename) for filename in [
            "derived_data/author_msc_map.npz",
            "derived_data/author_msc_map__row2author_name.json",
            "derived_data/author_msc_map__col2msc_code.json"]):

        mat = load_csr_matrix("derived_data/author_msc_map.npz")
        with open("derived_data/author_msc_map__row2author_name.json") as f:
            row2author_map = json.load(f)

        with open("derived_data/author_msc_map__col2msc_code.json") as f:
            col2msc_map = json.load(f)

        return mat, row2author_map, col2msc_map
    else:
        author2msc_map = defaultdict(lambda: defaultdict(int))
        cursor().execute("""SELECT display_name, msc, COUNT(*) FROM authorship
                            JOIN msc_assignment ON authorship.document = msc_assignment.document
                        WHERE authorship.rank <= 2 AND msc_assignment.pos <= 3
                        GROUP BY display_name, msc
                        ORDER BY display_name""")

        for row in cursor():
            author2msc_map[row[0]][row[1][:2]] += row[2]

        author_names, msc_counts = zip(*author2msc_map.items())

        msc_code2index_map = dict(zip(msc_classes, range(len(msc_classes))))
        col2msc_map = {index: msc for msc, index in msc_code2index_map.items()}

        mat = build_csr_matrix(msc_counts, token2index_map=msc_code2index_map)
        save_csr_matrix(mat, "derived_data/author_msc_map")

        row2author_map = dict(zip(range(len(author_names)), author_names))
        with open("derived_data/author_msc_map__row2author_name.json", "w") as f:
            json.dump(row2author_map, f)

        with open("derived_data/author_msc_map__col2msc_code.json", "w") as f:
            json.dump(col2msc_map, f)

        return mat, row2author_map, col2msc_map
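
build_csr_matrix is not shown in this listing, but from the call above it receives one dict per author mapping two-digit MSC prefixes to counts, plus a code-to-column map. A hypothetical sketch of equivalent data and the dense matrix it presumably corresponds to, purely to illustrate the shapes involved:

import numpy as np

# Illustrative data only; the real counts come from the authorship/msc_assignment query.
msc_counts = (
    {"11": 4, "14": 1},   # row 0: first author's two-digit MSC counts
    {"35": 2},            # row 1: second author's counts
)
msc_code2index_map = {"11": 0, "14": 1, "35": 2}

# Dense equivalent of what build_csr_matrix(msc_counts, token2index_map=...) would hold.
dense = np.zeros((len(msc_counts), len(msc_code2index_map)))
for row, counts in enumerate(msc_counts):
    for code, n in counts.items():
        dense[row, msc_code2index_map[code]] = n
print(dense)  # [[4. 1. 0.], [0. 0. 2.]]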
f.close()"""

# build token dict
"""minTokenCount = 10

tokenCounts = json.load(open("derived_data/simple_corpus_" + suffix + "_token_counts.json"))
frequentTokens = map(lambda i: i[0], filter(lambda c : c[1] >= minTokenCount, tokenCounts.items()))

token2IndexMap = dict(zip(sorted(frequentTokens), range(len(frequentTokens))))

f = open("derived_data/simple_corpus_" + suffix + "_token2index_map.json", "w")
f.write(json.dumps(token2IndexMap))
f.close()"""

# create raw csr_matrix
"""t = zbMathTokenizer()
token2IndexMap = json.load(open("derived_data/simple_corpus_" + suffix + "_token2index_map.json"))

l = []
for line in open("raw_data/simple_corpus.json"):
    document = line2Document(line)
    tokens = tokenize(document[1] + " " + document[2], t)
    tokenCounts = groupAndCount(tokens)
    l.append(tokenCounts)

m = build_csr_matrix(listOfMaps=l, token2IndexMap=token2IndexMap)
save_csr_matrix(m, "derived_data/simple_corpus_" + suffix + "_raw_tdm")"""

# gen tf-idf model
"""simple_corpus_tdm = load_csr_matrix("derived_data/simple_corpus_" + suffix + "_raw_tdm.npz")
tfidf_model = TfidfTransformer()
Example #5
def get_author_theorem_matrix(setting):
    if not force_gen and all(os.path.isfile(filename) for filename in [
            "derived_data/" + setting_string(**setting) + "__raw_author_matrix.npz",
            "derived_data/" + setting_string(**setting) + "__raw_author_matrix_row2author.json",
            "derived_data/" + setting_string(**setting) + "__raw_author_matrix_col2item.json"]):

        mat = load_csr_matrix("derived_data/" + setting_string(**setting) + "__raw_author_matrix.npz")
        with open("derived_data/" + setting_string(**setting) + "__raw_author_matrix_row2author.json") as f:
            row2author_map = json.load(f)

        with open("derived_data/" + setting_string(**setting) + "__raw_author_matrix_col2item.json") as f:
            col2item_map = json.load(f)

        return mat, row2author_map, col2item_map
    else:
        author_set = set()
        item_id_set = set()
        with open("derived_data/" + setting_string(**setting) + "__ids_with_authors") as f:
            for line in f:
                x = line.split(";")
                if setting['granularity'] == 'documents':
                    item_id_set.add(x[0])
                    offset = 1
                elif setting['granularity'] == 'paragraphs':
                    item_id_set.add((x[0], x[1]))
                    offset = 2
                else:
                    raise ValueError("granularity must be either 'documents' or 'paragraphs'")

                for i in range(offset, len(x)):
                    author_set.add(x[i].strip())

        count = 0
        item2index_map = {}
        with open("derived_data/" + setting_string(**setting) + "__processed_ids") as f:
            for line in f:
                item2index_map[line.strip()] = count
                count += 1

        # item2index_map = dict(zip(sorted(list(item_id_set)), range(len(item_id_set))))
        author2index_map = dict(zip(sorted(list(author_set)), range(len(author_set))))

        author_item_indexes = [{} for _ in range(len(author2index_map))]  # one dict per author; a list stays indexable under Python 3
        with open("derived_data/" + setting_string(**setting) + "__ids_with_authors") as f:
            for line in f:
                x = line.split(";")
                if setting['granularity'] == 'documents':
                    item_index = item2index_map.get(x[0])
                    offset = 1
                elif setting['granularity'] == 'paragraphs':
                    item_index = item2index_map.get((x[0], x[1]))
                    offset = 2
                else:
                    raise ValueError("granularity must be either 'documents' or 'paragraphs'")

                if item_index is not None:
                    for i in range(offset, len(x)):
                        author_index = author2index_map[x[i].strip()]
                        author_item_indexes[author_index][item_index] = 1.0

        mat = build_csr_matrix(list_of_dicts=author_item_indexes, num_attributes=len(item2index_map))
        save_csr_matrix(mat, "derived_data/" + setting_string(**setting) + "__raw_author_matrix")

        row2author_map = {index: author for author, index in author2index_map.items()}
        with open("derived_data/" + setting_string(**setting) + "__raw_author_matrix_row2author.json", "w") as f:
            json.dump(row2author_map, f)

        col2item_map = {index: item for item, index in item2index_map.items()}
        with open("derived_data/" + setting_string(**setting) + "__raw_author_matrix_col2item.json", "w") as f:
            json.dump(col2item_map, f)

        return mat, row2author_map, col2item_map
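
The format of the __ids_with_authors file follows from the parsing above: one semicolon-separated line per item, with the item id first (document id, plus paragraph id for the 'paragraphs' granularity) and the author display names after it. A small stand-alone sketch of that parsing on invented sample lines (the ids and names are illustrative only):

# Illustrative only: sample lines in the same shape the loop above expects for 'documents'.
sample_lines = [
    "doc-0001;Author, A.;Author, B.",   # id;author;author;...
    "doc-0002;Author, C.",
]
author_set = set()
for line in sample_lines:
    x = line.split(";")
    item_id, authors = x[0], [a.strip() for a in x[1:]]
    author_set.update(authors)
print(sorted(author_set))  # ['Author, A.', 'Author, B.', 'Author, C.']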
Example #6
def get_processed_tdm(setting, intended_amount_of_text_tokens=None, intended_amount_of_formula_tokens=None):
    if not force_gen and all(os.path.isfile(filename) for filename in [
            "derived_data/" + setting_string(**setting) + "__processed_tdm.npz",
            "derived_data/" + setting_string(**setting) + "__processed_ids",
            "derived_data/" + setting_string(**setting) + "__processed_token2index_map.json"]):

        mat = load_csr_matrix("derived_data/" + setting_string(**setting) + "__processed_tdm.npz")
        ids = []
        with open("derived_data/" + setting_string(**setting) + "__processed_ids") as f:
            count = 0
            for line in f:
                x = line.split(";")
                if setting['granularity'] == 'paragraphs':
                    ids.append((count, (x[0], x[1].strip())))
                elif setting['granularity'] == 'documents':
                    ids.append((count, x[0].strip()))
                else:
                    raise ValueError("granularity must be either 'documents' or 'paragraphs'")

        row2id_map = dict(ids)
        with open("derived_data/" + setting_string(**setting) + "__processed_token2index_map.json") as f:
            token2index_map = json.load(f)

        column2token_map = {index: token for token, index in token2index_map.items()}

        return mat, row2id_map, column2token_map

    else:
        # retrieve best tf-idf terms
        raw_tdm, row2id_map, column2token_map = get_raw_tdm(setting)
        nz_row_indexes = non_zero_row_indexes(raw_tdm)
        raw_tdm = raw_tdm[nz_row_indexes, :]

        token2index_map = get_token2index_map(setting)

        text_token_scores, formula_token_scores = tf_idf_scores(raw_tdm, token2index_map)
        best_text_token_indexes, best_formula_token_indexes = select_best_tokens(text_token_scores, formula_token_scores, intended_amount_of_text_tokens, intended_amount_of_formula_tokens)

        text_tdm = raw_tdm[:, best_text_token_indexes]
        formula_tdm = raw_tdm[:, best_formula_token_indexes]
        if text_tdm.shape[1] == 0:
            processed_tdm = formula_tdm
        elif formula_tdm.shape[1] == 0:
            processed_tdm = text_tdm
        else:
            float_text_tdm = element_wise_multiply(text_tdm, 1.0)
            pruned_formula_tdm = element_wise_multiply(formula_tdm, avg_row_norm(text_tdm) / avg_row_norm(formula_tdm))
            processed_tdm = vertically_append_matrix(float_text_tdm, pruned_formula_tdm)

        new_index2old_index_map = {new_index: old_index for new_index, old_index in enumerate(best_text_token_indexes)}
        new_index2old_index_map.update({new_index+len(best_text_token_indexes): old_index for new_index, old_index in enumerate(best_formula_token_indexes)})

        new_token2index_map = {}
        for new_index, old_index in new_index2old_index_map.items():
            new_token2index_map[column2token_map[old_index]] = new_index

        new_column2token_map = {index: token for token, index in new_token2index_map.items()}

        new_row2id_map = {}
        count = 0
        for index, id in row2id_map.items():
            if index in nz_row_indexes:
                new_row2id_map[count] = id
                count += 1

        # save processed tdm
        save_csr_matrix(processed_tdm, "derived_data/" + setting_string(**setting) + "__processed_tdm")

        # save respective ids
        with open("derived_data/" + setting_string(**setting) + "__processed_ids", "w") as f:
            for index, id in sorted(new_row2id_map.items(), key=lambda x: x[0]):
                f.write(id + "\n")

        # save token2index map
        with open("derived_data/" + setting_string(**setting) + "__processed_token2index_map.json", "w") as outfile:
            json.dump(new_token2index_map, outfile)

        return processed_tdm, new_row2id_map, new_column2token_map
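
The mixing step above scales the formula block so that, on average, its rows carry the same weight as the text rows. avg_row_norm and element_wise_multiply are not shown in this listing; a plausible reading of avg_row_norm, stated as an assumption, is the mean Euclidean norm over the rows of a sparse matrix:

import numpy as np

def avg_row_norm_sketch(mat):
    # Assumed semantics of avg_row_norm: mean L2 norm of the rows of a scipy sparse matrix.
    # The project's own helper may differ in detail.
    row_norms = np.sqrt(np.asarray(mat.multiply(mat).sum(axis=1)).ravel())
    return row_norms.mean()

With that reading, multiplying formula_tdm by avg_row_norm(text_tdm) / avg_row_norm(formula_tdm) brings the two blocks to the same average row weight before their columns are joined (text token columns first, formula token columns after, as new_index2old_index_map shows).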
Example #7
f.close()"""

# build token dict
"""minTokenCount = 10

tokenCounts = json.load(open("derived_data/simple_corpus_" + suffix + "_token_counts.json"))
frequentTokens = map(lambda i: i[0], filter(lambda c : c[1] >= minTokenCount, tokenCounts.items()))

token2IndexMap = dict(zip(sorted(frequentTokens), range(len(frequentTokens))))

f = open("derived_data/simple_corpus_" + suffix + "_token2index_map.json", "w")
f.write(json.dumps(token2IndexMap))
f.close()"""

# create raw csr_matrix
"""t = zbMathTokenizer()
token2IndexMap = json.load(open("derived_data/simple_corpus_" + suffix + "_token2index_map.json"))

l = []
for line in open("raw_data/simple_corpus.json"):
    document = line2Document(line)
    tokens = tokenize(document[1] + " " + document[2], t)
    tokenCounts = groupAndCount(tokens)
    l.append(tokenCounts)

m = build_csr_matrix(listOfMaps=l, token2IndexMap=token2IndexMap)
save_csr_matrix(m, "derived_data/simple_corpus_" + suffix + "_raw_tdm")"""

# gen tf-idf model
"""simple_corpus_tdm = load_csr_matrix("derived_data/simple_corpus_" + suffix + "_raw_tdm.npz")
tfidf_model = TfidfTransformer()
    if random.random() < testDocProportion:
        testDocIds.append(parts[0])
    else:
        trainDocIds.append(parts[0])

def serialize(l, path):
    f = open(path, "w")
    for item in l:
        f.write(item + "\n")
    f.close()

serialize(trainDocIds, "raw_data/train_doc_ids")
serialize(testDocIds, "raw_data/test_doc_ids")"""

# group theorem matrix by documents
"""mat = load_csr_matrix("derived_data/tfidf_theorem_tdm.npz")
(num_rows, num_cols) = mat.shape

def aggregate_rows(mat, indexes):
    doc = mat[indexes, :].sum(axis=0).tolist()[0]
    doc_as_map = dict(filter(lambda z: z[1] != 0.0, zip(range(len(doc)), doc)))
    return doc_as_map

theorem_ids = json.load(open("derived_data/raw_theorem_tdm_theorem_ids"))
last_doc_id = None
collected_indexes = []
documents = []
index = 0
document_id_list = []
for theorem_id in theorem_ids:
    curr_doc_id = theorem_id[0]