def preprocess_claim_with_doc(claim_with_docs: tuple) -> list:
    claim_id = claim_with_docs[0]
    # remove any NOT VERIFIABLE claims that were processed earlier
    if not claim_is_verifiable(claim_id, dataset=args.dataset):
        return []

    claim = get_claim(claim_id, dataset=args.dataset)
    evidence_map = get_evidence_page_line_map(claim_id, args.dataset)
    print('Preprocessing docs for claim [{}]: {}'.format(claim_id, evidence_map.keys()))

    preprocessed_pairs = []
    for page_id, relevant_line_ids in evidence_map.items():
        wiki_page = retrieve_wiki_page(page_id)
        for line_id in relevant_line_ids:
            # add the relevant claim/sentence pair...
            positive_line = wiki_page.lines[line_id]
            positive_input = transform_LR_input(claim, positive_line.text)
            preprocessed_pairs.append((claim_id, page_id, line_id, positive_input, 1))

            # ...and, to keep it balanced, one irrelevant sample
            negative_line = get_irrelevant_line(wiki_page, relevant_line_ids)
            negative_input = transform_LR_input(claim, negative_line.text)
            preprocessed_pairs.append((claim_id, page_id, negative_line.id, negative_input, 0))

    return preprocessed_pairs
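# get_irrelevant_line() is defined elsewhere in this repo. A minimal sketch of its assumed
# behaviour, based on the call above: sample a random line from the same wiki page that is not
# annotated as evidence. The body below is an illustrative assumption, not the repo's code.
import random

def get_irrelevant_line_sketch(wiki_page, relevant_line_ids: list):
    candidates = [line for line in wiki_page.lines
                  if line.id not in relevant_line_ids and line.text.strip()]
    return random.choice(candidates) if candidates else None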
def preprocess_claim(claim_row: pd.Series) -> list:
    claim_id, verifiable, label, claim, evidence = claim_row[1].values
    if verifiable != 'VERIFIABLE':
        return []
    print('Preprocessing docs for claim [{}]'.format(claim_id))

    # output will be the same for all evidence items belonging to this claim
    # label = 'entails' if label == 'SUPPORTS' else 'neutral'
    label = 1 if label == 'SUPPORTS' else 0

    preprocessed_pairs = []
    evidence_map = get_evidence_page_line_map(claim_id, args.dataset)
    evidence_sentences = []

    # the premise is the concatenation of all annotated evidence sentences for this claim
    for page_id, relevant_line_ids in evidence_map.items():
        wiki_page = retrieve_wiki_page(page_id)
        evidence_sentences.extend(
            [wiki_page.lines[line_id].text for line_id in relevant_line_ids])

    premise = ' '.join(evidence_sentences)
    premise = recreate_punctuation_in_doc_text(premise)
    hypothesis = recreate_punctuation_in_doc_text(claim)

    pair = {
        'label': label,
        'sentence1': premise,
        'sentence2': hypothesis,
        'claim_id': claim_id
    }
    preprocessed_pairs.append(pair)
    return preprocessed_pairs
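# recreate_punctuation_in_doc_text() is implemented elsewhere in the repo. The sketch below only
# illustrates its assumed purpose: the FEVER wiki dump encodes brackets and colons as tokens such
# as -LRB-/-RRB-, so a plausible implementation maps those tokens back to normal punctuation.
def recreate_punctuation_in_doc_text_sketch(text: str) -> str:
    replacements = {'-LRB-': '(', '-RRB-': ')', '-LSB-': '[', '-RSB-': ']', '-COLON-': ':'}
    for token, punctuation in replacements.items():
        text = text.replace(token, punctuation)
    return text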
def preprocess_claim(claim_row: pd.Series) -> list:
    claim_id, verifiable, label, claim, evidence = claim_row[1].values
    if verifiable != 'VERIFIABLE':
        return []
    print('Preprocessing docs for claim [{}]'.format(claim_id))

    # output will be the same for all evidence items belonging to this claim
    output = 1 if label == 'SUPPORTS' else 0

    preprocessed_pairs = []
    evidence_map = get_evidence_page_line_map(claim_id, args.dataset)
    for page_id, relevant_line_ids in evidence_map.items():
        wiki_page = retrieve_wiki_page(page_id)
        for line_id in relevant_line_ids:
            line = wiki_page.lines[line_id]
            line_text = line.text

            # hand-crafted features for the sentence-level model
            line_contains_references = 1 if line.anchors else 0
            line_position_absolute = line.id
            line_position_relative = line.id / len(wiki_page.lines)
            num_evidence_docs_for_claim = len(evidence_map.keys())
            num_lines_for_evidence = len(relevant_line_ids)
            num_coordination_terms_claim = get_num_coordination_terms(
                line_text, preprocess_claim_text(claim).split())
            num_coordination_terms_title = get_num_coordination_terms(
                line_text, preprocess_doc_title(page_id))

            nn_input = transform_NN_input(
                claim, line_text, line_contains_references,
                line_position_absolute, line_position_relative,
                num_evidence_docs_for_claim, num_lines_for_evidence,
                num_coordination_terms_claim, num_coordination_terms_title)
            preprocessed_pairs.append(
                (claim_id, page_id, line_id, nn_input, output))

    return preprocessed_pairs
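# get_num_coordination_terms() is implemented elsewhere in the repo. Judging from the call sites
# above, it counts how many terms an evidence line shares with the claim (or the page title). The
# body below is an illustrative assumption, assuming the second argument is an iterable of tokens.
def get_num_coordination_terms_sketch(line_text: str, terms) -> int:
    line_tokens = set(line_text.lower().split())
    return len(line_tokens.intersection(term.lower() for term in terms))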
def preprocess_claim(claim_row: pd.Series) -> list:
    claim_id, verifiable, label, claim, evidence = claim_row[1].values
    if verifiable != 'VERIFIABLE':
        return []
    print('Preprocessing docs for claim [{}]'.format(claim_id))

    # output will be the same for all evidence items belonging to this claim
    output = 1 if label == 'SUPPORTS' else 0

    preprocessed_pairs = []
    evidence_map = get_evidence_page_line_map(claim_id, args.dataset)
    evidence_sentences = []
    num_evidence_docs_for_claim = len(evidence_map.keys())
    num_references = 0
    num_evidence_items = 0
    num_evidence_words = 0
    num_coordination_terms_evidence_claim = 0
    num_coordination_terms_titles_claim = 0
    evidence_sentence_positions = []

    # concat evidence (can be from multiple wiki_pages and/or lines)
    for page_id, relevant_line_ids in evidence_map.items():
        wiki_page = retrieve_wiki_page(page_id)
        evidence_sentences.extend(
            [wiki_page.lines[line_id].text for line_id in relevant_line_ids])

        # count metrics and extract features
        for line_id in relevant_line_ids:
            line = wiki_page.lines[line_id]
            line_text = line.text
            num_evidence_words += len(line_text.split())
            num_references += len(line.anchors)
            num_evidence_items += 1
            evidence_sentence_positions.append(line.id)
            num_coordination_terms_evidence_claim += get_num_coordination_terms(
                line_text, preprocess_claim_text(claim).split())
            num_coordination_terms_titles_claim += get_num_coordination_terms(
                line_text, preprocess_doc_title(page_id))

    combined_evidence = ' '.join(evidence_sentences)
    avg_sentence_position = np.mean(evidence_sentence_positions)
    nn_input = transform_NN_input(claim, combined_evidence,
                                  num_evidence_docs_for_claim, num_references,
                                  num_evidence_items,
                                  num_coordination_terms_evidence_claim,
                                  num_coordination_terms_titles_claim,
                                  avg_sentence_position, num_evidence_words)
    preprocessed_pairs.append((claim_id, nn_input, output))
    return preprocessed_pairs
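# A minimal, assumed driver for the preprocess_claim() variants above: the claim_row[1].values
# access suggests each claim_row is an (index, Series) tuple as produced by DataFrame.iterrows(),
# fanned out over a worker pool. The pool size and helper name below are illustrative assumptions,
# not taken from the repo.
from multiprocessing import Pool

def preprocess_all_claims_sketch(claims: pd.DataFrame) -> list:
    with Pool(processes=4) as pool:
        partial_results = pool.map(preprocess_claim, claims.iterrows())
    # flatten the per-claim lists into one list of training pairs
    return [pair for partial in partial_results for pair in partial]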
def display_or_store_result(claim: str, claim_id: int, result_docs: list,
                            dir_path: str, display_only: bool = False):
    if display_only:
        print(colored('Results for claim "{}":'.format(claim), 'yellow'))
        for doc in result_docs:
            page_id = doc[0]
            wiki_page = retrieve_wiki_page(page_id)
            print(wiki_page)
    else:
        # result_path = '{}{}.jsonl'.format(path, claim_id)
        # write_list_to_jsonl(result_path, result_docs)
        print(colored('Storing results for claim "{}"\n{}:'.format(claim, result_docs), 'yellow'))
        write_list_to_oneline_csv(dir_path, claim_id, result_docs)
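# Hypothetical call illustrating the two modes of display_or_store_result(). The claim text,
# claim id, document tuples, and output directory are made-up example values; based on the doc[0]
# access above, each result doc is assumed to start with its page id.
display_or_store_result(
    claim='Example claim text.',
    claim_id=137334,
    result_docs=[('Example_Wiki_Page', 0.87), ('Another_Page', 0.42)],
    dir_path='data/retrieved/',
    display_only=True)  # print the retrieved pages instead of writing a CSV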
def preprocess_claim(claim_row: pd.Series) -> list:
    claim_id, verifiable, label, claim, evidence = claim_row[1].values
    if verifiable != 'VERIFIABLE':
        return []
    print('Preprocessing docs for claim [{}]'.format(claim_id))

    # output will be the same for all evidence items belonging to this claim
    output = 1 if label == 'SUPPORTS' else 0

    preprocessed_pairs = []
    evidence_map = get_evidence_page_line_map(claim_id, args.dataset)
    for page_id, relevant_line_ids in evidence_map.items():
        wiki_page = retrieve_wiki_page(page_id)
        for line_id in relevant_line_ids:
            line_text = wiki_page.lines[line_id].text
            nn_input = transform_NN_input(claim, line_text)
            preprocessed_pairs.append((claim_id, page_id, line_id, nn_input, output))

    return preprocessed_pairs
def preprocess_claim_with_doc(claim_with_docs: tuple) -> list:
    partial_result = []
    claim_id = claim_with_docs[0]
    # remove any NOT VERIFIABLE claims that were processed earlier
    if not claim_is_verifiable(claim_id, dataset=args.dataset):
        return []

    claim = get_claim(claim_id, dataset=args.dataset)
    evidence_map = get_evidence_page_line_map(claim_id, args.dataset)

    retrieved_doc_ids = set(claim_with_docs[1])
    evidence_doc_ids = evidence_map.keys()
    docs_for_training = retrieved_doc_ids.union(evidence_doc_ids)
    print('Preprocessing docs for claim [{}]: {}'.format(claim_id, docs_for_training))

    docs = [retrieve_wiki_page(doc_id) for doc_id in docs_for_training]
    for doc in docs:
        partial_result.extend(
            preprocess_doc(claim_id, claim, doc, evidence_map))
    return partial_result
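# preprocess_doc() is defined elsewhere in the repo. Based on the balanced pairing used earlier in
# this module, a sketch of its assumed shape could look like the following; the labelling scheme
# and the doc.id attribute name are assumptions, not the repository's implementation.
def preprocess_doc_sketch(claim_id: int, claim: str, doc, evidence_map: dict) -> list:
    relevant_line_ids = evidence_map.get(doc.id, [])
    pairs = []
    for line in doc.lines:
        # lines annotated as evidence are positives, all other lines of the doc are negatives
        label = 1 if line.id in relevant_line_ids else 0
        pairs.append((claim_id, doc.id, line.id, transform_LR_input(claim, line.text), label))
    return pairs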
# where the actual retrieval takes place
import argparse
import time

from dataaccess.access_wiki_page import retrieve_wiki_page

parser = argparse.ArgumentParser()
parser.add_argument("--id",
                    help="ID of a document to retrieve for test purposes",
                    required=True)
parser.add_argument("--complete",
                    help="print complete doc text instead of ID + text preview",
                    action="store_true")
args = parser.parse_args()

if __name__ == '__main__':
    if args.id:
        start_time = time.time()
        wiki_document = retrieve_wiki_page(args.id)
        print('Retrieved document "{}" after {:.5f} seconds'.format(
            args.id, time.time() - start_time))
        if args.complete:
            print(wiki_document.text)
        else:
            print(wiki_document)
    else:
        print('Please add ID to retrieve')