def get_raw_tdm(setting):
    if not force_gen and all(os.path.isfile(filename) for filename in [
            "derived_data/" + setting_string(**setting) + "__raw_tdm.npz",
            "derived_data/" + setting_string(**setting) + "__ids"]):
        # cached case: load the matrix and rebuild the row index -> id map
        mat = load_csr_matrix("derived_data/" + setting_string(**setting) + "__raw_tdm.npz")

        ids = []
        with open("derived_data/" + setting_string(**setting) + "__ids") as f:
            for count, line in enumerate(f):
                x = line.split(";")
                if setting['granularity'] == 'paragraphs':
                    ids.append((count, (x[0], x[1].strip())))
                elif setting['granularity'] == 'documents':
                    ids.append((count, x[0].strip()))
                else:
                    raise ValueError("granularity must be either 'documents' or 'paragraphs'")
        row2id_map = dict(ids)

        token2index_map = get_token2index_map(setting)
        column2token_map = {index: token for token, index in token2index_map.items()}
        return mat, row2id_map, column2token_map
    else:
        token2index_map = get_token2index_map(setting)
        column2token_map = {index: token for token, index in token2index_map.items()}

        if setting['granularity'] == "paragraphs":
            paragraph_generator = get_all_docs_paragrahps_as_token_list(setting['token_method'], setting['data_basis'])
            mat, id_log = build_raw_csr_matrix(paragraph_generator, token2index_map)
        elif setting['granularity'] == "documents":
            document_generator = get_all_documents_as_feature_map(setting['token_method'], setting['data_basis'])
            mat, id_log = build_raw_csr_matrix(document_generator, token2index_map)
        else:
            raise ValueError("granularity must be either 'documents' or 'paragraphs'")

        save_csr_matrix(mat, "derived_data/" + setting_string(**setting) + "__raw_tdm")

        # persist the row ids; paragraph ids are (document, paragraph) pairs
        with open("derived_data/" + setting_string(**setting) + "__ids", "w") as f:
            if setting['granularity'] == "paragraphs":
                for id in id_log:
                    f.write(id[0] + ";" + id[1] + "\n")
            elif setting['granularity'] == "documents":
                for id in id_log:
                    f.write(id + "\n")

        row2id_map = dict(zip(range(len(id_log)), id_log))
        return mat, row2id_map, column2token_map
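# Illustrative usage sketch (not part of the pipeline): the concrete values for
# 'token_method' and 'data_basis' below are made-up placeholders -- valid values
# are whatever setting_string and the generator functions above accept.
#
#   setting = {"granularity": "documents",
#              "token_method": "text_and_formulas",   # placeholder
#              "data_basis": "full_corpus"}           # placeholder
#   mat, row2id_map, column2token_map = get_raw_tdm(setting)
#   # one row per document (or paragraph), one column per token:
#   assert mat.shape == (len(row2id_map), len(column2token_map))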
def get_normed_author_theorem_matrix(setting):
    if (not force_gen
            and os.path.isfile("derived_data/" + setting_string(**setting) + "__normed_author_theorem_matrix.npz")
            and os.path.isfile("derived_data/" + setting_string(**setting) + "__normed_theorem_author_matrix.npz")):
        normed_author_theorem_mat = load_csr_matrix("derived_data/" + setting_string(**setting) + "__normed_author_theorem_matrix.npz")
        normed_theorem_author_mat = load_csr_matrix("derived_data/" + setting_string(**setting) + "__normed_theorem_author_matrix.npz")
        return normed_author_theorem_mat, normed_theorem_author_mat
    else:
        mat, _, _ = get_author_theorem_matrix(setting)
        normed_author_theorem_mat = normalize(mat)
        normed_theorem_author_mat = normalize(mat.transpose())
        save_csr_matrix(normed_author_theorem_mat, "derived_data/" + setting_string(**setting) + "__normed_author_theorem_matrix")
        save_csr_matrix(normed_theorem_author_mat, "derived_data/" + setting_string(**setting) + "__normed_theorem_author_matrix")
        return normed_author_theorem_mat, normed_theorem_author_mat
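# Sketch of why both orientations are stored (assuming normalize here is
# sklearn.preprocessing.normalize, which L2-normalizes rows by default):
# row-normalizing mat and mat.transpose() yields different matrices, so cosine
# similarities can be read off directly in either direction, e.g.:
#
#   at_mat, ta_mat = get_normed_author_theorem_matrix(setting)
#   author_sims = at_mat * at_mat.transpose()   # author-author cosine similarity
#   item_sims = ta_mat * ta_mat.transpose()     # theorem-theorem cosine similarity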
def get_author_msc_matrix():
    if not force_gen and all(os.path.isfile(filename) for filename in [
            "derived_data/author_msc_map.npz",
            "derived_data/author_msc_map__row2author_name.json",
            "derived_data/author_msc_map__col2msc_code.json"]):
        mat = load_csr_matrix("derived_data/author_msc_map.npz")
        with open("derived_data/author_msc_map__row2author_name.json") as f:
            row2author_map = json.load(f)
        with open("derived_data/author_msc_map__col2msc_code.json") as f:
            col2msc_map = json.load(f)
        return mat, row2author_map, col2msc_map
    else:
        # count, per author, the documents carrying each top-level MSC class
        # (row[1][:2] truncates an MSC code such as "14J60" to its class "14")
        author2msc_map = defaultdict(lambda: defaultdict(int))
        cursor().execute("""SELECT display_name, msc, COUNT(*)
                            FROM authorship
                            JOIN msc_assignment ON authorship.document = msc_assignment.document
                            WHERE authorship.rank <= 2 AND msc_assignment.pos <= 3
                            GROUP BY display_name, msc
                            ORDER BY display_name""")
        for row in cursor():
            author2msc_map[row[0]][row[1][:2]] += row[2]

        author_names, msc_counts = zip(*author2msc_map.items())

        msc_code2index_map = dict(zip(msc_classes, range(len(msc_classes))))
        col2msc_map = {index: msc for msc, index in msc_code2index_map.items()}

        mat = build_csr_matrix(msc_counts, token2index_map=msc_code2index_map)
        save_csr_matrix(mat, "derived_data/author_msc_map")

        row2author_map = dict(zip(range(len(author_names)), author_names))
        with open("derived_data/author_msc_map__row2author_name.json", "w") as f:
            json.dump(row2author_map, f)
        with open("derived_data/author_msc_map__col2msc_code.json", "w") as f:
            json.dump(col2msc_map, f)
        return mat, row2author_map, col2msc_map
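# Caveat for callers: a fresh build returns row2author_map / col2msc_map with
# integer keys, while the cached branch returns them after a JSON round trip,
# which turns all keys into strings. A defensive caller can normalize them:
#
#   mat, row2author_map, col2msc_map = get_author_msc_matrix()
#   col2msc_map = {int(k): v for k, v in col2msc_map.items()}
#   row2author_map = {int(k): v for k, v in row2author_map.items()}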
f.close()""" # build token dict """minTokenCount = 10 tokenCounts = json.load(open("derived_data/simple_corpus_" + suffix + "_token_counts.json")) frequentTokens = map(lambda i: i[0], filter(lambda c : c[1] >= minTokenCount, tokenCounts.items())) token2IndexMap = dict(zip(sorted(frequentTokens), range(len(frequentTokens)))) f = open("derived_data/simple_corpus_" + suffix + "_token2index_map.json", "w") f.write(json.dumps(token2IndexMap)) f.close()""" # create raw csr_matrix """t = zbMathTokenizer() token2IndexMap = json.load(open("derived_data/simple_corpus_" + suffix + "_token2index_map.json")) l = [] for line in open("raw_data/simple_corpus.json"): document = line2Document(line) tokens = tokenize(document[1] + " " + document[2], t) tokenCounts = groupAndCount(tokens) l.append(tokenCounts) m = build_csr_matrix(listOfMaps=l, token2IndexMap=token2IndexMap) save_csr_matrix(m, "derived_data/simple_corpus_" + suffix + "_raw_tdm")""" # gen tf-idf model """simple_corpus_tdm = load_csr_matrix("derived_data/simple_corpus_" + suffix + "_raw_tdm.npz") tfidf_model = TfidfTransformer()
def get_author_theorem_matrix(setting):
    if not force_gen and all(os.path.isfile(filename) for filename in [
            "derived_data/" + setting_string(**setting) + "__raw_author_matrix.npz",
            "derived_data/" + setting_string(**setting) + "__raw_author_matrix_row2author.json",
            "derived_data/" + setting_string(**setting) + "__raw_author_matrix_col2item.json"]):
        mat = load_csr_matrix("derived_data/" + setting_string(**setting) + "__raw_author_matrix.npz")
        with open("derived_data/" + setting_string(**setting) + "__raw_author_matrix_row2author.json") as f:
            row2author_map = json.load(f)
        with open("derived_data/" + setting_string(**setting) + "__raw_author_matrix_col2item.json") as f:
            col2item_map = json.load(f)
        return mat, row2author_map, col2item_map
    else:
        # first pass: collect all author names and item ids
        author_set = set()
        item_id_set = set()
        with open("derived_data/" + setting_string(**setting) + "__ids_with_authors") as f:
            for line in f:
                x = line.split(";")
                if setting['granularity'] == 'documents':
                    item_id_set.add(x[0])
                    offset = 1
                elif setting['granularity'] == 'paragraphs':
                    item_id_set.add((x[0], x[1]))
                    offset = 2
                else:
                    raise ValueError("granularity must be either 'documents' or 'paragraphs'")
                for i in range(offset, len(x)):
                    author_set.add(x[i].strip())

        # map each processed item id (one stripped line per row) to its row index
        item2index_map = {}
        with open("derived_data/" + setting_string(**setting) + "__processed_ids") as f:
            for count, line in enumerate(f):
                item2index_map[line.strip()] = count
        # item2index_map = dict(zip(sorted(list(item_id_set)), range(len(item_id_set))))
        author2index_map = dict(zip(sorted(author_set), range(len(author_set))))

        # second pass: one dict per author, marking the items they (co-)authored
        author_item_indexes = [{} for _ in range(len(author2index_map))]
        with open("derived_data/" + setting_string(**setting) + "__ids_with_authors") as f:
            for line in f:
                x = line.split(";")
                if setting['granularity'] == 'documents':
                    item_index = item2index_map.get(x[0])
                    offset = 1
                elif setting['granularity'] == 'paragraphs':
                    # join to match the "document;paragraph" lines of __processed_ids
                    item_index = item2index_map.get(x[0] + ";" + x[1])
                    offset = 2
                else:
                    raise ValueError("granularity must be either 'documents' or 'paragraphs'")
                if item_index is not None:
                    for i in range(offset, len(x)):
                        author_index = author2index_map[x[i].strip()]
                        author_item_indexes[author_index][item_index] = 1.0

        mat = build_csr_matrix(list_of_dicts=author_item_indexes, num_attributes=len(item2index_map))
        save_csr_matrix(mat, "derived_data/" + setting_string(**setting) + "__raw_author_matrix")

        row2author_map = {index: author for author, index in author2index_map.items()}
        with open("derived_data/" + setting_string(**setting) + "__raw_author_matrix_row2author.json", "w") as f:
            json.dump(row2author_map, f)
        col2item_map = {index: item for item, index in item2index_map.items()}
        with open("derived_data/" + setting_string(**setting) + "__raw_author_matrix_col2item.json", "w") as f:
            json.dump(col2item_map, f)
        return mat, row2author_map, col2item_map
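# Illustrative usage sketch: the returned matrix is binary (1.0 marks
# authorship), one row per author and one column per processed item, so
# marginal sums give simple statistics:
#
#   mat, row2author_map, col2item_map = get_author_theorem_matrix(setting)
#   items_per_author = mat.sum(axis=1)   # column vector: items per author
#   authors_per_item = mat.sum(axis=0)   # row vector: authors per item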
def get_processed_tdm(setting, intended_amount_of_text_tokens=None, intended_amount_of_formula_tokens=None):
    if not force_gen and all(os.path.isfile(filename) for filename in [
            "derived_data/" + setting_string(**setting) + "__processed_tdm.npz",
            "derived_data/" + setting_string(**setting) + "__processed_ids",
            "derived_data/" + setting_string(**setting) + "__processed_token2index_map.json"]):
        mat = load_csr_matrix("derived_data/" + setting_string(**setting) + "__processed_tdm.npz")

        ids = []
        with open("derived_data/" + setting_string(**setting) + "__processed_ids") as f:
            for count, line in enumerate(f):
                x = line.split(";")
                if setting['granularity'] == 'paragraphs':
                    ids.append((count, (x[0], x[1].strip())))
                elif setting['granularity'] == 'documents':
                    ids.append((count, x[0].strip()))
                else:
                    raise ValueError("granularity must be either 'documents' or 'paragraphs'")
        row2id_map = dict(ids)

        with open("derived_data/" + setting_string(**setting) + "__processed_token2index_map.json") as f:
            token2index_map = json.load(f)
        column2token_map = {index: token for token, index in token2index_map.items()}
        return mat, row2id_map, column2token_map
    else:
        # retrieve best tf-idf terms; drop all-zero rows first
        raw_tdm, row2id_map, column2token_map = get_raw_tdm(setting)
        nz_row_indexes = non_zero_row_indexes(raw_tdm)
        raw_tdm = raw_tdm[nz_row_indexes, :]

        token2index_map = get_token2index_map(setting)
        text_token_scores, formula_token_scores = tf_idf_scores(raw_tdm, token2index_map)
        best_text_token_indexes, best_formula_token_indexes = select_best_tokens(
            text_token_scores, formula_token_scores,
            intended_amount_of_text_tokens, intended_amount_of_formula_tokens)

        text_tdm = raw_tdm[:, best_text_token_indexes]
        formula_tdm = raw_tdm[:, best_formula_token_indexes]

        if text_tdm.shape[1] == 0:
            processed_tdm = formula_tdm
        elif formula_tdm.shape[1] == 0:
            processed_tdm = text_tdm
        else:
            # rescale the formula block so both blocks contribute comparable row norms
            float_text_tdm = element_wise_multiply(text_tdm, 1.0)
            pruned_formula_tdm = element_wise_multiply(formula_tdm, avg_row_norm(text_tdm) / avg_row_norm(formula_tdm))
            processed_tdm = vertically_append_matrix(float_text_tdm, pruned_formula_tdm)

        # re-index columns: selected text tokens first, then formula tokens
        new_index2old_index_map = {new_index: old_index
                                   for new_index, old_index in enumerate(best_text_token_indexes)}
        new_index2old_index_map.update({new_index + len(best_text_token_indexes): old_index
                                        for new_index, old_index in enumerate(best_formula_token_indexes)})
        new_token2index_map = {}
        for new_index, old_index in new_index2old_index_map.items():
            new_token2index_map[column2token_map[old_index]] = new_index
        new_column2token_map = {index: token for token, index in new_token2index_map.items()}

        # re-index rows: surviving (non-zero) rows keep their relative order
        new_row2id_map = {}
        count = 0
        for index, id in sorted(row2id_map.items()):
            if index in nz_row_indexes:
                new_row2id_map[count] = id
                count += 1

        # save processed tdm
        save_csr_matrix(processed_tdm, "derived_data/" + setting_string(**setting) + "__processed_tdm")

        # save respective ids (paragraph ids are (document, paragraph) pairs)
        with open("derived_data/" + setting_string(**setting) + "__processed_ids", "w") as f:
            for index, id in sorted(new_row2id_map.items(), key=lambda x: x[0]):
                if setting['granularity'] == 'paragraphs':
                    f.write(id[0] + ";" + id[1] + "\n")
                else:
                    f.write(id + "\n")

        # save token2index map
        with open("derived_data/" + setting_string(**setting) + "__processed_token2index_map.json", "w") as outfile:
            json.dump(new_token2index_map, outfile)

        return processed_tdm, new_row2id_map, new_column2token_map
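# Illustrative usage sketch: the token budgets are plain integers; the values
# below are made up for illustration:
#
#   tdm, row2id_map, column2token_map = get_processed_tdm(
#       setting,
#       intended_amount_of_text_tokens=10000,
#       intended_amount_of_formula_tokens=2000)
#   # tdm contains only the non-zero rows of the raw tdm; its columns are the
#   # selected text tokens followed by the rescaled formula tokens.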
    if random.random() < testDocProportion:
        testDocIds.append(parts[0])
    else:
        trainDocIds.append(parts[0])

def serialize(l, path):
    f = open(path, "w")
    for item in l:
        f.write(item + "\n")
    f.close()

serialize(trainDocIds, "raw_data/train_doc_ids")
serialize(testDocIds, "raw_data/test_doc_ids")"""

# group theorem matrix by documents
"""mat = load_csr_matrix("derived_data/tfidf_theorem_tdm.npz")
(num_rows, num_cols) = mat.shape

def aggregate_rows(mat, indexes):
    doc = mat[indexes, :].sum(axis=0).tolist()[0]
    doc_as_map = dict(filter(lambda z: z[1] != 0.0, zip(range(len(doc)), doc)))
    return doc_as_map

theorem_ids = json.load(open("derived_data/raw_theorem_tdm_theorem_ids"))
last_doc_id = None
collected_indexes = []
documents = []
index = 0
document_id_list = []
for theorem_id in theorem_ids:
    curr_doc_id = theorem_id[0]