def build_train_matrices(docs, model, file_dir, ontology):
    """
    Assemble the training inputs: a feature matrix X and a label array y.

    For every document the keyword candidates are generated, the ground
    truth answers are loaded (restricted to the keywords returned by
    ``get_keywords()``) and handed to ``add_gt_answers_to_candidates_set``
    together with the candidates, so that answers which were not generated
    are still represented. One row of X and one row of y is produced per
    candidate.

    :param docs: documents to process. Either list or generator of
        Document obj
    :param model: LearningModel object
    :param file_dir: directory where the answer files are located
    :param ontology: Ontology object

    :return: tuple (X, y). X is the result of ``pd.concat`` over the
        per-document feature matrices. y is an int16 numpy array with two
        columns: column 0 is 1 when the candidate's canonical form is among
        the document's answers (0 otherwise), column 1 is the ``doc_id`` of
        the document the candidate belongs to.
    """
    allowed_keywords = set(get_keywords())
    per_doc_features = []
    per_doc_labels = []

    for document in docs:
        index = InvertedIndex(document)
        candidates = list(generate_keyword_candidates(document, ontology))

        # Ground truth for this document, limited to the allowed keywords
        gold_answers = get_answers_for_doc(
            document.filename,
            file_dir,
            filtered_by=allowed_keywords,
        )

        # Make sure answers missing from the generated candidates are present
        add_gt_answers_to_candidates_set(candidates, gold_answers, ontology)

        # NOTE(review): int16 cannot hold doc ids above 32767 - confirm the
        # range of doc_id values used by callers.
        labels = np.zeros((len(candidates), 2), dtype=np.int16)
        for row, candidate in enumerate(candidates):
            if candidate.get_canonical_form() in gold_answers:
                labels[row][0] = True
            labels[row][1] = document.doc_id
        per_doc_labels.append(labels)

        # Features are computed after the ground truth has been merged in,
        # so X and y stay row-aligned.
        per_doc_features.append(
            build_feature_matrix(candidates, index, model)
        )

    # Stack the per-document pieces: pandas for X, numpy for y
    return pd.concat(per_doc_features), np.concatenate(per_doc_labels)
def build_test_matrices(docs, model, file_dir, ontology):
    """
    Assemble everything needed to evaluate predictions: the feature
    matrix X, the ground truth answers and the keyword vector.

    Unlike the training variant visible in this file, the ground truth
    answers are not merged into the candidates here; they are only
    collected per document.

    :param docs: documents to process. Either list or generator of
        Document obj
    :param model: LearningModel object
    :param file_dir: directory where the answer files are located
    :param ontology: Ontology object

    :return: tuple (X, answers, kw_vector). X is the result of
        ``pd.concat`` over the per-document feature matrices, answers is a
        dictionary mapping ``doc_id`` to that document's ground truth
        answers, and kw_vector is a list of (doc_id, canonical keyword
        form) tuples, one per candidate, in the order they were generated.
    """
    allowed_keywords = set(get_keywords())
    per_doc_features = []
    keyword_pairs = []
    ground_truth = {}

    for document in docs:
        index = InvertedIndex(document)
        candidates = list(generate_keyword_candidates(document, ontology))

        per_doc_features.append(
            build_feature_matrix(candidates, index, model)
        )

        # Ground truth for this document, limited to the allowed keywords
        ground_truth[document.doc_id] = get_answers_for_doc(
            document.filename,
            file_dir,
            filtered_by=allowed_keywords,
        )

        # One (doc_id, keyword) entry per candidate, row-aligned with X
        doc_pairs = [
            (document.doc_id, candidate.get_canonical_form())
            for candidate in candidates
        ]
        keyword_pairs += doc_pairs

    # Merge feature matrices from different documents
    features = pd.concat(per_doc_features)

    return features, ground_truth, keyword_pairs