Example #1
0
def get_normed_author_theorem_matrix(setting):
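    # Cache wrapper: load the normalized author-theorem matrix and its transposed counterpart
    # from derived_data/ when both files exist and force_gen is off; otherwise build, normalize
    # and persist them.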
    if (not force_gen
            and os.path.isfile("derived_data/" + setting_string(**setting) + "__normed_author_theorem_matrix.npz")
            and os.path.isfile("derived_data/" + setting_string(**setting) + "__normed_theorem_author_matrix.npz")):
        normed_author_theorem_mat = load_csr_matrix("derived_data/" + setting_string(**setting) + "__normed_author_theorem_matrix.npz")
        normed_theorem_author_mat = load_csr_matrix("derived_data/" + setting_string(**setting) + "__normed_theorem_author_matrix.npz")

        return normed_author_theorem_mat, normed_theorem_author_mat
    else:
        mat, r, c = get_author_theorem_matrix(setting)
        normed_author_theorem_mat = normalize(mat)
        normed_theorem_author_mat = normalize(mat.transpose())
        save_csr_matrix(normed_author_theorem_mat, "derived_data/" + setting_string(**setting) + "__normed_author_theorem_matrix")
        save_csr_matrix(normed_theorem_author_mat, "derived_data/" + setting_string(**setting) + "__normed_theorem_author_matrix")
        return normed_author_theorem_mat, normed_theorem_author_mat
Example #2
0
def get_raw_tdm(setting):
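    # Build (or load from cache) the raw term-document matrix for this setting, together with a
    # row-index -> document/paragraph id map and a column-index -> token map.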
    if not force_gen and all(os.path.isfile(filename) for filename in [
            "derived_data/" + setting_string(**setting) + "__raw_tdm.npz",
            "derived_data/" + setting_string(**setting) + "__ids"]):

        mat = load_csr_matrix("derived_data/" + setting_string(**setting) + "__raw_tdm.npz")
        ids = []
        with open("derived_data/" + setting_string(**setting) + "__ids") as f:
            count = 0
            for line in f:
                x = line.split(";")
                if setting['granularity'] == 'paragraphs':
                    ids.append((count, (x[0], x[1].strip())))
                elif setting['granularity'] == 'documents':
                    ids.append((count, x[0].strip()))
                else:
                    raise ValueError("granularity must be either 'documents' or 'paragraphs'")
                count += 1

        row2id_map = dict(ids)
        token2index_map = get_token2index_map(setting)
        column2token_map = {index: token for token, index in token2index_map.items()}

        return mat, row2id_map, column2token_map
    else:
        token2index_map = get_token2index_map(setting)
        column2token_map = {index: token for token, index in token2index_map.items()}

        if setting['granularity'] == "paragraphs":
            paragraph_generator = get_all_docs_paragrahps_as_token_list(setting['token_method'], setting['data_basis'])
            mat, id_log = build_raw_csr_matrix(paragraph_generator, token2index_map)
        elif setting['granularity'] == "documents":
            document_generator = get_all_documents_as_feature_map(setting['token_method'], setting['data_basis'])
            mat, id_log = build_raw_csr_matrix(document_generator, token2index_map)
        else:
            raise ValueError("granularity must be either paragraphs or documents")

        save_csr_matrix(mat, "derived_data/" + setting_string(**setting) + "__raw_tdm")

        with open("derived_data/" + setting_string(**setting) + "__ids", "w") as f:
            if setting['granularity'] == "paragraphs":
                for id in id_log:
                    f.write(id[0] + ";" + id[1] + "\n")
            elif setting['granularity'] == "documents":
                for id in id_log:
                    f.write(id + "\n")

        row2id_map = dict(zip(range(len(id_log)), id_log))

        return mat, row2id_map, column2token_map
Example #3
0
def get_author_msc_matrix():
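    # Build (or load from cache) a sparse author x MSC-class count matrix plus the row -> author
    # and column -> MSC-class lookup maps.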
    if not force_gen and all(os.path.isfile(filename) for filename in [
            "derived_data/author_msc_map.npz",
            "derived_data/author_msc_map__row2author_name.json",
            "derived_data/author_msc_map__col2msc_code.json"]):

        mat = load_csr_matrix("derived_data/author_msc_map.npz")
        with open("derived_data/author_msc_map__row2author_name.json") as f:
            row2author_map = json.load(f)

        with open("derived_data/author_msc_map__col2msc_code.json") as f:
            col2msc_map = json.load(f)

        return mat, row2author_map, col2msc_map
    else:
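        # Count MSC assignments per author from the database: only the first two authors of a
        # document and its first three MSC codes are considered, keyed by the two-digit
        # top-level MSC class.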
        author2msc_map = defaultdict(lambda: defaultdict(int))
        cursor().execute("""SELECT display_name, msc, COUNT(*) FROM authorship
                            JOIN msc_assignment ON authorship.document = msc_assignment.document
                            WHERE authorship.rank <= 2 AND msc_assignment.pos <= 3
                            GROUP BY display_name, msc
                            ORDER BY display_name""")

        for row in cursor():
            author2msc_map[row[0]][row[1][:2]] += row[2]

        author_names, msc_counts = zip(*author2msc_map.items())

        msc_code2index_map = dict(zip(msc_classes, range(len(msc_classes))))
        col2msc_map = {index: msc for msc, index in msc_code2index_map.items()}

        mat = build_csr_matrix(msc_counts, token2index_map=msc_code2index_map)
        save_csr_matrix(mat, "derived_data/author_msc_map")

        row2author_map = dict(zip(range(len(author_names)), author_names))
        with open("derived_data/author_msc_map__row2author_name.json", "w") as f:
            json.dump(row2author_map, f)

        with open("derived_data/author_msc_map__col2msc_code.json", "w") as f:
            json.dump(col2msc_map, f)

        return mat, row2author_map, col2msc_map
Example #4
0
def get_author_theorem_matrix(setting):
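    # Build (or load from cache) a binary author x item matrix, where an item is a document or a
    # paragraph depending on setting['granularity'].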
    if not force_gen and all(os.path.isfile(filename) for filename in [
            "derived_data/" + setting_string(**setting) + "__raw_author_matrix.npz",
            "derived_data/" + setting_string(**setting) + "__raw_author_matrix_row2author.json",
            "derived_data/" + setting_string(**setting) + "__raw_author_matrix_col2item.json"]):

        mat = load_csr_matrix("derived_data/" + setting_string(**setting) + "__raw_author_matrix.npz")
        with open("derived_data/" + setting_string(**setting) + "__raw_author_matrix_row2author.json") as f:
            row2author_map = json.load(f)

        with open("derived_data/" + setting_string(**setting) + "__raw_author_matrix_col2item.json") as f:
            col2item_map = json.load(f)

        return mat, row2author_map, col2item_map
    else:
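        # First pass over the id/author log: collect all item ids and author names.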
        author_set = set()
        item_id_set = set()
        with open("derived_data/" + setting_string(**setting) + "__ids_with_authors") as f:
            for line in f:
                x = line.split(";")
                if setting['granularity'] == 'documents':
                    item_id_set.add(x[0])
                    offset = 1
                elif setting['granularity'] == 'paragraphs':
                    item_id_set.add((x[0], x[1]))
                    offset = 2
                else:
                    raise ValueError("granularity must be either 'documents' or 'paragraphs'")

                for i in range(offset, len(x)):
                    author_set.add(x[i].strip())

        count = 0
        item2index_map = {}
        with open("derived_data/" + setting_string(**setting) + "__processed_ids") as f:
            for line in f:
                item2index_map[line.strip()] = count
                count += 1

        # item2index_map = dict(zip(sorted(list(item_id_set)), range(len(item_id_set))))
        author2index_map = dict(zip(sorted(list(author_set)), range(len(author_set))))

        # Second pass: for every author, mark which items they contributed to (binary weights).
        author_item_indexes = [{} for _ in range(len(author2index_map))]
        with open("derived_data/" + setting_string(**setting) + "__ids_with_authors") as f:
            for line in f:
                x = line.split(";")
                if setting['granularity'] == 'documents':
                    item_index = item2index_map.get(x[0])
                    offset = 1
                elif setting['granularity'] == 'paragraphs':
                    item_index = item2index_map.get((x[0], x[1]))
                    offset = 2
                else:
                    raise ValueError("granularity must be either 'documents' or 'paragraphs'")

                if item_index is not None:
                    for i in range(offset, len(x)):
                        author_index = author2index_map[x[i].strip()]
                        author_item_indexes[author_index][item_index] = 1.0

        mat = build_csr_matrix(list_of_dicts=author_item_indexes, num_attributes=len(item2index_map))
        save_csr_matrix(mat, "derived_data/" + setting_string(**setting) + "__raw_author_matrix")

        row2author_map = {index: author for author, index in author2index_map.items()}
        with open("derived_data/" + setting_string(**setting) + "__raw_author_matrix_row2author.json", "w") as f:
            json.dump(row2author_map, f)

        col2item_map = {index: item for item, index in item2index_map.items()}
        with open("derived_data/" + setting_string(**setting) + "__raw_author_matrix_col2item.json", "w") as f:
            json.dump(col2item_map, f)

        return mat, row2author_map, col2item_map
Example #5
0
def get_processed_tdm(setting, intended_amount_of_text_tokens=None, intended_amount_of_formula_tokens=None):
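    # Build (or load from cache) the processed term-document matrix: drop empty rows of the raw
    # TDM and keep only the highest-scoring text and formula tokens according to tf-idf.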
    if not force_gen and all(os.path.isfile(filename) for filename in [
            "derived_data/" + setting_string(**setting) + "__processed_tdm.npz",
            "derived_data/" + setting_string(**setting) + "__processed_ids",
            "derived_data/" + setting_string(**setting) + "__processed_token2index_map.json"]):

        mat = load_csr_matrix("derived_data/" + setting_string(**setting) + "__processed_tdm.npz")
        ids = []
        with open("derived_data/" + setting_string(**setting) + "__processed_ids") as f:
            count = 0
            for line in f:
                x = line.split(";")
                if setting['granularity'] == 'paragraphs':
                    ids.append((count, (x[0], x[1].strip())))
                elif setting['granularity'] == 'documents':
                    ids.append((count, x[0].strip()))
                else:
                    raise ValueError("granularity must be either 'documents' or 'paragraphs'")
                count += 1

        row2id_map = dict(ids)
        with open("derived_data/" + setting_string(**setting) + "__processed_token2index_map.json") as f:
            token2index_map = json.load(f)

        column2token_map = {index: token for token, index in token2index_map.items()}

        return mat, row2id_map, column2token_map

    else:
        # retrieve best tf-idf terms
        raw_tdm, row2id_map, column2token_map = get_raw_tdm(setting)
        nz_row_indexes = non_zero_row_indexes(raw_tdm)
        raw_tdm = raw_tdm[nz_row_indexes, :]

        token2index_map = get_token2index_map(setting)

        text_token_scores, formula_token_scores = tf_idf_scores(raw_tdm, token2index_map)
        best_text_token_indexes, best_formula_token_indexes = select_best_tokens(text_token_scores, formula_token_scores, intended_amount_of_text_tokens, intended_amount_of_formula_tokens)

        text_tdm = raw_tdm[:, best_text_token_indexes]
        formula_tdm = raw_tdm[:, best_formula_token_indexes]
        if text_tdm.shape[1] == 0:
            processed_tdm = formula_tdm
        elif formula_tdm.shape[1] == 0:
            processed_tdm = text_tdm
        else:
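            # Rescale the formula block so its average row norm matches the text block before the
            # two blocks are combined.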
            float_text_tdm = element_wise_multiply(text_tdm, 1.0)
            pruned_formula_tdm = element_wise_multiply(formula_tdm, avg_row_norm(text_tdm) / avg_row_norm(formula_tdm))
            processed_tdm = vertically_append_matrix(float_text_tdm, pruned_formula_tdm)

        new_index2old_index_map = {new_index: old_index for new_index, old_index in enumerate(best_text_token_indexes)}
        new_index2old_index_map.update({new_index+len(best_text_token_indexes): old_index for new_index, old_index in enumerate(best_formula_token_indexes)})

        new_token2index_map = {}
        for new_index, old_index in new_index2old_index_map.items():
            new_token2index_map[column2token_map[old_index]] = new_index

        new_column2token_map = {index: token for token, index in new_token2index_map.items()}

        new_row2id_map = {}
        count = 0
        for index, id in row2id_map.items():
            if index in nz_row_indexes:
                new_row2id_map[count] = id
                count += 1

        # save processed tdm
        save_csr_matrix(processed_tdm, "derived_data/" + setting_string(**setting) + "__processed_tdm")

        # save respective ids
        with open("derived_data/" + setting_string(**setting) + "__processed_ids", "w") as f:
            for index, id in sorted(new_row2id_map.items(), key=lambda x: x[0]):
                if setting['granularity'] == 'paragraphs':
                    f.write(id[0] + ";" + id[1] + "\n")
                else:
                    f.write(id + "\n")

        # save token2index map
        with open("derived_data/" + setting_string(**setting) + "__processed_token2index_map.json", "w") as outfile:
            json.dump(new_token2index_map, outfile)

        return processed_tdm, new_row2id_map, new_column2token_map
import json
from scipy.sparse import csr_matrix
from util import save_csr_matrix, load_csr_matrix, get_dirpath, get_filenames_and_filepaths, DocumentParser, filesInDict
from util import connectToDb, bin2NumpyArr
import numpy as np
from time import time
from main.arffJson.ArffJsonCorpus import ArffJsonCorpus, ArffJsonDocument
from string import digits, ascii_letters
from os.path import isfile, join

dirpath = get_dirpath()
filenames, filepaths = get_filenames_and_filepaths("raw_data/ntcir_filenames")

tdm = load_csr_matrix("derived_data/zb_math_full_text_tdm2.npz")
translateMap = json.load(open("derived_data/zb_math_full_texts_tokens2IndexMap"))
row_number2fulltext_id_map = json.load(open("derived_data/row_number2fulltext_id_map.json"))

phrase = "theorem"
tokenizer = DocumentParser.TextTokenizer()
tokens = tokenizer.tokenize(phrase)
tokenIds = [translateMap[token] for token in tokens]

candidateIds = []
index = 0
m = tdm[:, tokenIds]

candidateInd = []
currInd = 0
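# A row of the column-sliced CSR matrix contains every query token exactly when its number of
# stored entries (indptr difference) equals the number of query tokens.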
for i in range(len(m.indptr) - 1):
    diff = m.indptr[i + 1] - m.indptr[i]
    if diff == len(tokenIds):
        # Save CSR-Matrix
        """c = corpus.toCsrMatrix()
        save_csr_matrix(c, "abschlussbericht-csr")"""

        # Save Labels
        """corpus = ArffJsonCorpus("../zb_math_cluster_experiments/raw_data/abschlussbericht-corpus.json")
        labels_list = (doc.classes for doc in corpus)

        with open("abschlussbericht-labels", "w") as f:
            for labels in labels_list:
                top_class_labels = set(map(lambda x: x[:2], labels))
                f.write(",".join(top_class_labels) + "\n")"""


        tdm = load_csr_matrix("corpus.npz")
        labels = read_labels("abschlussbericht-labels")

        mats = {}
        def get_transformed_mat(mat, transform_id, transformer_list, test_train):
            global mats
            if mats.get((transform_id, test_train)) is None:
                if transformer_list is not None and len(transformer_list) != 0:
                    mat_copy = mat
                    for transformer in transformer_list:
                        mat_copy = transformer.transform(mat_copy)

                    mats[(transform_id, test_train)] = mat_copy
                else:
                    mats[(transform_id, test_train)] = mat
document_ids = json.load(open("derived_data/theorem_tdm_grouped_by_docs_doc_ids"))
doc2msc = {}
with open("raw_data/doc2msc") as f:
    for line in f:
        x = line.split(";")
        doc2msc[str(x[0])] = x[1].strip()

target_class = "81"
# label is 1 if the document's primary MSC class starts with the target class, 0 otherwise,
# and None when no MSC assignment is known
ordered_document_assignments = [doc2msc.get(str(doc_id)) for doc_id in document_ids]
ordered_document_labels = [None if lab is None else (1 if lab.startswith(target_class) else 0)
                           for lab in ordered_document_assignments]

test_doc_ind = indexes_in_list(document_ids, readFileLinewise2Array("raw_data/test_doc_ids"))
train_doc_ind = indexes_in_list(document_ids, readFileLinewise2Array("raw_data/train_doc_ids"))

mat = load_csr_matrix("derived_data/tfidf_theorem_tdm_grouped_by_docs.npz")
train_mat = mat[train_doc_ind, :]
train_labels = itemgetter(*train_doc_ind)(ordered_document_labels)

svd = TruncatedSVD(n_components=1000)
svd.fit(train_mat)

test_mat = mat[test_doc_ind, :]
test_labels = itemgetter(*test_doc_ind)(ordered_document_labels)

clf = svm.LinearSVC()
clf.fit(svd.transform(train_mat), train_labels)

# eval results
predictions = clf.predict(svd.transform(test_mat)).tolist()
        matrixList.append(combinedFeatures)

        count += 1

    theoremTDM = horizontally_combine_matrixes(matrixList)
    save_csr_matrix(theoremTDM, "derived_data/combined_theorem_text_formula_tdm")"""

    # train lsa
    """theoremTDM = load_csr_matrix("derived_data/combined_theorem_text_formula_tdm.npz")

    svd = TruncatedSVD(n_components=250)
    svd.fit(theoremTDM)
    joblib.dump(svd, "models/combined_theorem_text_formula_lsi250_model")
    """

    # perform clustering
    theoremTDM = load_csr_matrix(
        "derived_data/combined_theorem_text_formula_tdm.npz")
    svd = joblib.load("models/combined_theorem_text_formula_lsi250_model")
    LSI_TDM = svd.transform(theoremTDM)
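    # Affinity Propagation determines the number of clusters itself; with preference=None the
    # median of the input similarities is used as the preference for every sample.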

    ap = AffinityPropagation(damping=0.5,
                             max_iter=200,
                             convergence_iter=15,
                             copy=True,
                             preference=None,
                             affinity='euclidean',
                             verbose=False)

    ap.fit(LSI_TDM)
    joblib.dump(ap, "models/combined_theorem_text_formula_ap_model")