import os
import time

import numpy as np
import pandas as pd

# NOTE: project-local helpers (Document, InvertedIndex, get_ontology,
# get_documents, get_keywords, get_answers_for_doc,
# generate_keyword_candidates, build_feature_matrix,
# add_gt_answers_to_candidates_set, Graph, SAMPLE_LENGTH, EMBEDDING_SIZE)
# are assumed to be imported from the surrounding package.


def calculate_recall_for_kw_candidates(data_dir, recreate_ontology=False, verbose=False):
    """
    Generate keyword candidates for the files in a given directory and compute
    their recall with respect to the ground-truth answers.
    :param data_dir: directory with .txt and .key files
    :param recreate_ontology: boolean flag for recreating the ontology
    :param verbose: whether to print per-document statistics and timings
    :return: average recall over all documents (float)
    """
    average_recall = 0
    total_kw_number = 0

    ontology = get_ontology(recreate=recreate_ontology)
    docs = get_documents(data_dir)
    considered_keywords = set(get_keywords())
    total_docs = 0

    start_time = time.time()
    for doc in docs:
        kw_candidates = {kw.get_canonical_form()
                         for kw in generate_keyword_candidates(doc, ontology)}

        answers = get_answers_for_doc(doc.filename,
                                      data_dir,
                                      filtered_by=considered_keywords)

        # Recall: the fraction of ground-truth answers covered by the
        # candidate set. A document without answers counts as full recall.
        recall = 1 if not answers else len(kw_candidates & answers) / len(answers)

        if verbose:
            print()
            print("Paper: " + doc.filename)
            print("Candidates: " + str(len(kw_candidates)))
            print("Recall: " + str(recall * 100) + "%")

        average_recall += recall
        total_kw_number += len(kw_candidates)
        total_docs += 1

    average_recall /= total_docs

    if verbose:
        print()
        print("Total # of keywords: " + str(total_kw_number))
        print("Time elapsed: " + str(time.time() - start_time))

    return average_recall
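
# A hedged usage sketch for the recall check above; the directory path is a
# hypothetical placeholder for any folder of matching .txt/.key pairs.
def _example_candidate_recall():
    recall = calculate_recall_for_kw_candidates('data/train', verbose=True)
    print("Average candidate recall: {:.2%}".format(recall))
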
def build_train_matrices(docs, model, file_dir, ontology):
    """
    Build the X feature matrix and the y output vector from the input data.
    :param docs: documents to process. Either a list or a generator of Document objects
    :param model: LearningModel object
    :param file_dir: directory where the answer files are located
    :param ontology: Ontology object
    :return: X (pandas DataFrame) and y (numpy array)
    """
    considered_keywords = set(get_keywords())
    feature_matrices = []
    output_vectors = []

    for doc in docs:
        inv_index = InvertedIndex(doc)

        # Generate keyword candidates
        kw_candidates = list(generate_keyword_candidates(doc, ontology))

        # Get ground-truth answers
        doc_answers = get_answers_for_doc(
            doc.filename,
            file_dir,
            filtered_by=considered_keywords,
        )

        # If an answer was not generated as a candidate, add it anyway
        add_gt_answers_to_candidates_set(kw_candidates, doc_answers, ontology)

        # Create the output vector: column 0 holds the binary label,
        # column 1 the id of the document the candidate came from
        output_vector = np.zeros((len(kw_candidates), 2), dtype=np.int16)
        for i, kw in enumerate(kw_candidates):
            if kw.get_canonical_form() in doc_answers:
                output_vector[i][0] = True
            output_vector[i][1] = doc.doc_id

        output_vectors.append(output_vector)

        X = build_feature_matrix(kw_candidates, inv_index, model)
        feature_matrices.append(X)

    # Merge the per-document pandas feature matrices
    X = pd.concat(feature_matrices)

    # Concatenate the output vectors into a single numpy array
    y = np.concatenate(output_vectors)

    return X, y
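
# Hedged usage sketch: how the training matrices could feed a scikit-learn
# classifier. `train_docs`, `learning_model` and `train_dir` are hypothetical
# placeholders built elsewhere, not names defined in this module.
def _example_fit_classifier(train_docs, learning_model, train_dir, ontology):
    from sklearn.linear_model import LogisticRegression

    X, y = build_train_matrices(train_docs, learning_model, train_dir, ontology)
    clf = LogisticRegression()
    # Column 0 of y is the binary label; column 1 only carries doc ids.
    clf.fit(X.values, y[:, 0])
    return clf
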
def build_x_and_y(filenames, file_directory, **kwargs):
    """
    Given file names and their directory, build the (X, y) data matrices.
    :param filenames: iterable of strings with file ids (no extension)
    :param file_directory: path to the directory where those files live
    :param kwargs: additional data needed for matrix building, e.g. the scaler
    :return: a tuple (X, y)
    """
    label_indices = kwargs['label_indices']
    word2vec_model = kwargs['word2vec_model']
    scaler = kwargs['scaler']
    nn_model = kwargs['nn_model']

    x_matrix = np.zeros((len(filenames), SAMPLE_LENGTH, EMBEDDING_SIZE))
    y_matrix = np.zeros((len(filenames), len(label_indices)), dtype=np.bool_)

    for doc_id, fname in enumerate(filenames):
        doc = Document(doc_id, os.path.join(file_directory, fname + '.txt'))
        words = doc.get_all_words()[:SAMPLE_LENGTH]

        # Embed and scale the first SAMPLE_LENGTH words of the document
        for i, w in enumerate(words):
            if w in word2vec_model:
                word_vector = word2vec_model[w].reshape(1, -1)
                x_matrix[doc_id][i] = scaler.transform(word_vector, copy=True)[0]

        # Set the output bits for the document's ground-truth labels
        labels = get_answers_for_doc(
            fname + '.txt',
            file_directory,
            filtered_by=set(label_indices.keys()),
        )

        for lab in labels:
            index = label_indices[lab]
            y_matrix[doc_id][index] = True

    # Models with multiple inputs expect the input repeated for each branch
    if nn_model and isinstance(nn_model.input, list):
        return_data = [x_matrix] * len(nn_model.input), y_matrix
    else:
        return_data = [x_matrix], y_matrix

    # The legacy Keras Graph model expects named inputs and outputs
    if isinstance(nn_model, Graph):
        return {'input': return_data[0], 'output': return_data[1]}
    else:
        return return_data
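
# Hedged usage sketch for build_x_and_y, assuming the legacy Keras API this
# module already targets (hence nb_epoch). `train_files`, `train_dir` and the
# preprocessing objects are hypothetical placeholders assembled elsewhere.
def _example_fit_nn(train_files, train_dir, label_indices,
                    word2vec_model, scaler, nn_model):
    # For a non-Graph model the helper returns ([inputs], y_matrix).
    X, y = build_x_and_y(
        train_files,
        train_dir,
        label_indices=label_indices,
        word2vec_model=word2vec_model,
        scaler=scaler,
        nn_model=nn_model,
    )
    nn_model.fit(X, y, batch_size=64, nb_epoch=1)
    return nn_model
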
def build_test_matrices(docs, model, file_dir, ontology):
    """
    Build the X feature matrix plus the answers and kw_vector variables
    needed for evaluating the predictions.
    :param docs: documents to process. Either a list or a generator of Document objects
    :param model: LearningModel object
    :param file_dir: directory where the answer files are located
    :param ontology: Ontology object
    :return: X (pandas DataFrame), answers dictionary and kw_vector tuple list
    """
    considered_keywords = set(get_keywords())
    feature_matrices = []
    kw_vector = []
    answers = dict()

    for doc in docs:
        inv_index = InvertedIndex(doc)

        # Generate keyword candidates
        kw_candidates = list(generate_keyword_candidates(doc, ontology))

        X = build_feature_matrix(kw_candidates, inv_index, model)
        feature_matrices.append(X)

        # Get ground-truth answers
        answers[doc.doc_id] = get_answers_for_doc(
            doc.filename,
            file_dir,
            filtered_by=considered_keywords,
        )

        kw_vector.extend([(doc.doc_id, kw.get_canonical_form())
                          for kw in kw_candidates])

    # Merge feature matrices from different documents
    X = pd.concat(feature_matrices)

    return X, answers, kw_vector
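
# Hedged evaluation sketch: scores candidates with a trained classifier and
# checks each against the ground-truth answers. `clf` and `test_docs` are
# hypothetical placeholders produced by the training sketch above.
def _example_evaluate(test_docs, learning_model, test_dir, ontology, clf):
    X, answers, kw_vector = build_test_matrices(test_docs, learning_model,
                                                test_dir, ontology)
    # Probability of the positive class for every (doc_id, keyword) pair
    scores = clf.predict_proba(X.values)[:, 1]
    for (doc_id, kw), score in zip(kw_vector, scores):
        hit = kw in answers[doc_id]
        print("{}\t{:.3f}\t{}".format(kw, score, "HIT" if hit else ""))
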