def preprocess_claim(claim_row: pd.Series) -> list:
    """Build one (claim_id, page_id, line_id, input, output) training tuple
    per relevant evidence line of a verifiable claim.

    :param claim_row: row from the claims dataset; indexed with ``[1]``, so it
        appears to be an ``(index, Series)`` tuple from ``DataFrame.iterrows``
        despite the annotation — TODO confirm.
    :return: list of preprocessed tuples; empty list for unverifiable claims.
    """
    claim_id, verifiable, label, claim, evidence = claim_row[1].values
    if verifiable != 'VERIFIABLE':
        return []
    print('Preprocessing docs for claim [{}]'.format(claim_id))
    # output will be the same for all evidence items belonging to this claim
    output = 1 if label == 'SUPPORTS' else 0
    preprocessed_pairs = []
    evidence_map = get_evidence_page_line_map(claim_id, args.dataset)
    # loop-invariant values hoisted out of the evidence loops
    num_evidence_docs_for_claim = len(evidence_map)
    claim_terms = preprocess_claim_text(claim).split()
    for page_id, relevant_line_ids in evidence_map.items():
        wiki_page = retrieve_wiki_page(page_id)
        num_lines_for_evidence = len(relevant_line_ids)
        preprocessed_title = preprocess_doc_title(page_id)
        for line_id in relevant_line_ids:
            line = wiki_page.lines[line_id]
            line_text = line.text
            line_contains_references = 1 if line.anchors else 0
            line_position_absolute = line.id
            line_position_relative = line.id / len(wiki_page.lines)
            num_coordination_terms_claim = get_num_coordination_terms(
                line_text, claim_terms)
            num_coordination_terms_title = get_num_coordination_terms(
                line_text, preprocessed_title)
            # renamed from 'input' to avoid shadowing the builtin
            nn_input = transform_NN_input(
                claim, line_text, line_contains_references,
                line_position_absolute, line_position_relative,
                num_evidence_docs_for_claim, num_lines_for_evidence,
                num_coordination_terms_claim, num_coordination_terms_title)
            preprocessed_pairs.append(
                (claim_id, page_id, line_id, nn_input, output))
    return preprocessed_pairs
def transform_LR_input(claim_text: str, line_text: str, debug: bool = False):
    """Turn a claim/line pair into a logistic-regression feature vector:
    the element-wise difference of the two sentence embeddings."""
    # strip punctuation that would otherwise stay glued to tokens
    cleaned_claim = preprocess_claim_text(claim_text)
    # strip wiki-dump artifacts such as -LRB- / -RRB-
    cleaned_line = preprocess_doc_text(line_text)
    return get_vector_difference(
        transform_sentence_to_vector(cleaned_claim, debug),
        transform_sentence_to_vector(cleaned_line, debug))
def transform_NN_input(claim_text: str, line_text: str):
    """Turn a claim/line pair into a neural-network input vector:
    the element-wise difference of the two sentence embeddings.

    Verbosity of the embedding step is driven by the global ``args.debug``.
    """
    # strip punctuation that would otherwise stay glued to tokens
    cleaned_claim = preprocess_claim_text(claim_text)
    # strip wiki-dump artifacts such as -LRB- / -RRB-
    cleaned_line = preprocess_doc_text(line_text)
    claim_embedding = transform_sentence_to_vector(cleaned_claim, args.debug)
    line_embedding = transform_sentence_to_vector(cleaned_line, args.debug)
    return get_vector_difference(claim_embedding, line_embedding)
def preprocess_claim(claim_row: pd.Series) -> list:
    """Build one (claim_id, input, output) training tuple per verifiable
    claim, concatenating all of its evidence sentences into a single text.

    :param claim_row: row from the claims dataset; indexed with ``[1]``, so it
        appears to be an ``(index, Series)`` tuple from ``DataFrame.iterrows``
        despite the annotation — TODO confirm.
    :return: single-element list with the preprocessed tuple; empty list for
        unverifiable claims.
    """
    claim_id, verifiable, label, claim, evidence = claim_row[1].values
    if verifiable != 'VERIFIABLE':
        return []
    print('Preprocessing docs for claim [{}]'.format(claim_id))
    # output will be the same for all evidence items belonging to this claim
    output = 1 if label == 'SUPPORTS' else 0
    preprocessed_pairs = []
    evidence_map = get_evidence_page_line_map(claim_id, args.dataset)
    evidence_sentences = []
    num_evidence_docs_for_claim = len(evidence_map)
    num_references = 0
    num_evidence_items = 0
    num_evidence_words = 0
    num_coordination_terms_evidence_claim = 0
    num_coordination_terms_titles_claim = 0
    evidence_sentence_positions = []
    # loop-invariant: claim terms are the same for every evidence line
    claim_terms = preprocess_claim_text(claim).split()
    # concat evidence (can be from multiple wiki_pages and/or lines)
    for page_id, relevant_line_ids in evidence_map.items():
        wiki_page = retrieve_wiki_page(page_id)
        evidence_sentences.extend(
            wiki_page.lines[line_id].text for line_id in relevant_line_ids)
        # page title only needs to be preprocessed once per page
        preprocessed_title = preprocess_doc_title(page_id)
        # count metrics and extract features
        for line_id in relevant_line_ids:
            line = wiki_page.lines[line_id]
            line_text = line.text
            num_evidence_words += len(line_text.split())
            num_references += len(line.anchors)
            num_evidence_items += 1
            evidence_sentence_positions.append(line.id)
            num_coordination_terms_evidence_claim += get_num_coordination_terms(
                line_text, claim_terms)
            num_coordination_terms_titles_claim += get_num_coordination_terms(
                line_text, preprocessed_title)
    combined_evidence = ' '.join(evidence_sentences)
    avg_sentence_position = np.mean(evidence_sentence_positions)
    # renamed from 'input' to avoid shadowing the builtin
    nn_input = transform_NN_input(claim, combined_evidence,
                                  num_evidence_docs_for_claim, num_references,
                                  num_evidence_items,
                                  num_coordination_terms_evidence_claim,
                                  num_coordination_terms_titles_claim,
                                  avg_sentence_position, num_evidence_words)
    preprocessed_pairs.append((claim_id, nn_input, output))
    return preprocessed_pairs
def retrieve_documents_for_claim(claim: str, claim_id: int):
    """Retrieve the top documents for a claim with a query-likelihood model
    and display or store the result.

    Smoothing variant and output options are taken from the global ``args``.
    """
    print(
        colored('Retrieving documents for claim [{}]: "{}"'.format(
            claim_id, claim),
                attrs=['bold']))
    preprocessed_claim = preprocess_claim_text(claim)
    claim_terms = process_normalise_tokenise_filter(preprocessed_claim)
    # only docs that appear in index for at least one claim term to be considered
    doc_candidates = get_candidate_documents_for_claim(claim_terms,
                                                       mode='raw_count')
    # dict dispatch replaces the chained if-reassignments; unknown/None
    # smoothing falls back to the unsmoothed scorer as before
    smoothing_scorers = {
        'laplace': get_query_likelihood_score_laplace_smoothing,
        'laplace_lindstone': get_query_likelihood_score_laplace_lindstone_smoothing,
        'jelinek_mercer': get_query_likelihood_score_jelinek_mercer_smoothing,
        'dirichlet': get_query_likelihood_score_dirichlet_smoothing,
    }
    scoring_function = smoothing_scorers.get(
        args.smoothing, get_query_likelihood_score_no_smoothing)
    # query likelihood scores for each claim-doc combination
    docs_with_query_likelihood_scores = [
        scoring_function(claim_terms, doc_with_terms)
        for doc_with_terms in doc_candidates.items()
    ]
    # zero values lead to random retrievals if all documents evaluate to zero,
    # so might rather want to show no results
    if args.remove_zero_likelihood:
        docs_with_query_likelihood_scores = list(
            filter(lambda x: x[1] != 0, docs_with_query_likelihood_scores))
    # sort by query likelihood and limit to top results
    docs_with_query_likelihood_scores.sort(key=itemgetter(1), reverse=True)
    result_docs = docs_with_query_likelihood_scores[:
                                                    DOCS_TO_RETRIEVE_PER_CLAIM]
    result_directory = '{}{}/'.format(RETRIEVED_PROBABILISTIC_DIRECTORY,
                                      args.smoothing or 'no_smoothing')
    display_or_store_result(claim, claim_id, result_docs, result_directory,
                            args.print)
def retrieve_documents_for_claim(claim: str, claim_id: int):
    """Retrieve the top documents for a claim by tf-idf similarity and
    display or store the result.

    NOTE(review): ``scoring_function`` is not assigned anywhere in this
    function — presumably it is defined at module level; verify.
    """
    print(colored('Retrieving documents for claim [{}]: "{}"'.format(claim_id, claim), attrs=['bold']))
    claim_terms = process_normalise_tokenise_filter(preprocess_claim_text(claim))
    claim_vector = get_tfidf_vector_for_claim(claim_terms)
    claim_norm = get_tfidf_vector_norm(claim_terms, args.variant)
    # only docs that appear in index for at least one claim term to be considered
    doc_candidates = get_candidate_documents_for_claim(claim_terms)
    # similarity scores for each claim-doc combination
    docs_with_similarity_scores = [
        scoring_function(claim_terms, claim_vector, claim_norm, candidate)
        for candidate in doc_candidates.items()
    ]
    # sort by similarity and limit to top results
    docs_with_similarity_scores.sort(key=itemgetter(1), reverse=True)
    top_docs = docs_with_similarity_scores[:DOCS_TO_RETRIEVE_PER_CLAIM]
    display_or_store_result(claim, claim_id, top_docs,
                            RETRIEVED_TFIDF_DIRECTORY, args.print)
def transform_NN_input(
        claim_text: str, line_text: str, line_contains_references: int,
        line_position_absolute: int, line_position_relative: float,
        num_evidence_docs_for_claim: int, num_lines_for_evidence: int,
        num_coordination_terms_claim: int, num_coordination_terms_title: int):
    """Build the neural-network input for a claim/line pair: the difference
    of the two sentence embeddings with the scalar features appended.

    Verbosity of the embedding step is driven by the global ``args.debug``.
    """
    # strip punctuation that would otherwise stay glued to tokens
    cleaned_claim = preprocess_claim_text(claim_text)
    # strip wiki-dump artifacts such as -LRB- / -RRB-
    cleaned_line = preprocess_doc_text(line_text)
    difference_vector = get_vector_difference(
        transform_sentence_to_vector(cleaned_claim, args.debug),
        transform_sentence_to_vector(cleaned_line, args.debug))
    scalar_features = np.array(
        (line_contains_references, line_position_absolute,
         line_position_relative, num_evidence_docs_for_claim,
         num_lines_for_evidence, num_coordination_terms_claim,
         num_coordination_terms_title))
    return np.concatenate((difference_vector, scalar_features))