def append_ner(input_file=None, output_file=None, batch_size=None):
    """Append a named-entity ('ner') column to each row of a parsed CSV.

    Parameters
    ----------
    input_file : str, optional
        Path to the input CSV; defaults to standard input.
    output_file : str, optional
        Output CSV destination; when omitted, output is written to standard output.
    batch_size : int, optional
        When given, the output is split into files of this many rows.

    Returns
    -------
    None
    """
    input = open(input_file, encoding='utf-8') if input_file is not None else sys.stdin
    csv_reader = csv.reader(input)

    column_mapper = CsvColumnMapper(next(csv_reader), ['ner'],
                                    source_required=['sentence', 'words'])

    mnofc = ManageNewOutputFileCreation(output_file, batch_size)

    spacy_pipeline = en_core_web_sm.load()

    for count, entry in enumerate(csv_reader, start=0):
        new_file = mnofc.get_new_file_if_necessary()
        if new_file:
            csv_writer = csv.writer(new_file)
            csv_writer.writerow(column_mapper.get_new_headers())

        # now that we've finished creating a new file as necessary, we can proceed
        # with the business at hand:
        words = column_mapper.get_field_value_from_source(entry, 'words', True)
        if words is None:
            csv_writer.writerow(column_mapper.get_new_row_values(entry, [None]))
            continue

        sentence = column_mapper.get_field_value_from_source(entry, 'sentence')
        spacy_doc = spacy_pipeline(sentence)
        spacy_tokens = [token.text for token in spacy_doc]

        # collect the entity type of every token spaCy recognised as part of an entity,
        # keyed by 1-based token index
        ner_lookup_spacy_tokenization = {}
        for index, token in enumerate(spacy_doc, start=1):
            if token.ent_type != 0:  # in ['PERSON', 'ORG']:
                ner_lookup_spacy_tokenization[index] = token.ent_type_

        # if spaCy's tokenization differs from the one in 'words', re-map the
        # entity indices onto our own tokenization
        ner_lookup = ner_lookup_spacy_tokenization
        tokens = [token for _, token in words]
        if tokens != spacy_tokens and len(ner_lookup_spacy_tokenization) > 0:
            ner_lookup = SyncTags.b_lookup_to_a_lookup(
                tokens, spacy_tokens, ner_lookup_spacy_tokenization)

        csv_writer.writerow(
            column_mapper.get_new_row_values(entry, [ner_lookup]))

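# Usage sketch (hypothetical file names, not part of the pipeline's CLI): tag a
# previously parsed CSV with spaCy NER, splitting the output into batch files of
# 1,000 rows each, every row gaining an 'ner' column:
#
#     append_ner(input_file='parsed_sentences.csv',
#                output_file='parsed_sentences_ner.csv',
#                batch_size=1000)
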
def identify_ucca_paths(input, output):
    csv_reader = csv.reader(input)
    csv_writer = csv.writer(output)

    column_mapper = CsvColumnMapper(
        next(csv_reader),
        target_columns=['path_id', 'path', 'comment'],
        source_required=[
            'id', 'sentence', 'ent1_start', 'ent1_end', 'ent2_start',
            'ent2_end', 'ucca_parse', 'trigger_idx'
        ])

    csv_writer.writerow(column_mapper.get_new_headers())

    for counter, entry in enumerate(csv_reader, start=1):
        print('Processing sentence #',
              column_mapper.get_field_value_from_source(entry, 'id'))

        ucca_parse_serialization = column_mapper.get_field_value_from_source(
            entry, 'ucca_parse')
        if ucca_parse_serialization is None:
            csv_writer.writerow(
                column_mapper.get_new_row_values(
                    entry, [None, None, 'ucca_parse missing']))
            continue

        ucca_parse = UccaParsedPassage.from_serialization(
            ucca_parse_serialization)
        links = ucca_parse.get_links()

        trigger_token_id = column_mapper.get_field_value_from_source(
            entry, 'trigger_idx', as_int=True)
        ent1_start_token_id = column_mapper.get_field_value_from_source(
            entry, 'ent1_start', as_int=True)
        ent2_start_token_id = column_mapper.get_field_value_from_source(
            entry, 'ent2_start', as_int=True)
        if trigger_token_id is None or ent1_start_token_id is None or ent2_start_token_id is None:
            csv_writer.writerow(
                column_mapper.get_new_row_values(
                    entry, [None, None, 'indices missing']))
            continue

        trigger_node_id = ucca_parse.get_node_id_by_token_id(trigger_token_id)
        trigger_parent_node_id = Link.get_parents(links, trigger_node_id)[0]

        ent1_start_node_id = ucca_parse.get_node_id_by_token_id(
            ent1_start_token_id)
        ent1_parent_node_ids = Link.get_parents(links, ent1_start_node_id)
        if len(ent1_parent_node_ids) == 0:
            csv_writer.writerow(
                column_mapper.get_new_row_values(
                    entry, [None, None, 'Could not find parent of ent1']))
            continue
        ent1_parent_node_id = ent1_parent_node_ids[0]

        ent2_start_node_id = ucca_parse.get_node_id_by_token_id(
            ent2_start_token_id)
        ent2_parent_node_ids = Link.get_parents(links, ent2_start_node_id)
        if len(ent2_parent_node_ids) == 0:
            csv_writer.writerow(
                column_mapper.get_new_row_values(
                    entry, [None, None, 'Could not find parent of ent2']))
            continue
        ent2_parent_node_id = ent2_parent_node_ids[0]

        graph = DepGraph(links)

        ent1_to_trigger_steps = graph.get_steps(ent1_parent_node_id,
                                                trigger_parent_node_id)
        ent1_to_trigger_strings = ucca_parse.get_path_representations(
            ent1_to_trigger_steps)

        trigger_to_ent2_steps = graph.get_steps(trigger_parent_node_id,
                                                ent2_parent_node_id)
        trigger_to_ent2_strings = ucca_parse.get_path_representations(
            trigger_to_ent2_steps)

        sentence_id = column_mapper.get_field_value_from_source(entry, 'id',
                                                                as_int=True)
        for count, (segment1, segment2) in enumerate(
                product(ent1_to_trigger_strings, trigger_to_ent2_strings),
                start=1):
            path_id = '{0}_{1}'.format(sentence_id, count)
            path = '{0} >< {1}'.format(segment1, segment2)
            comment = None
            csv_writer.writerow(
                column_mapper.get_new_row_values(entry,
                                                 [path_id, path, comment]))

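# Usage sketch (hypothetical file names): given open file objects over a CSV that
# already carries 'ucca_parse' and 'trigger_idx' columns, emit one row per
# (ent1->trigger, trigger->ent2) path combination:
#
#     with open('ucca_relations.csv', encoding='utf-8') as inp, \
#             open('ucca_paths.csv', 'w', encoding='utf-8', newline='') as out:
#         identify_ucca_paths(inp, out)
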
def parse_pss(port, model_path, input_file=None, output_file=None, batch_size=None):
    """Append a preposition supersense ('pss') column to each row of a parsed CSV.

    Parameters
    ----------
    port : int
        Port on which to start the CoreNLP server used for preprocessing.
    model_path : str
        Path to the LstmMlpSupersensesModel to load.
    input_file : str, optional
        Path to the input CSV; defaults to standard input.
    output_file : str, optional
        Path to the output CSV; defaults to standard output.
    batch_size : int, optional
        When given, the output is split into files of this many rows.

    Returns
    -------
    None
    """
    input = open(input_file, encoding='utf-8') if input_file is not None else sys.stdin
    csv_reader = csv.reader(input)

    column_mapper = CsvColumnMapper(
        next(csv_reader), ['pss'],
        source_required=['sentence', 'ud_parse', 'words'])

    batch = 0
    output = None
    output_file = output_file[:-len('.csv')] if output_file is not None and output_file.endswith('.csv') \
        else output_file

    print('BEGIN-INIT-PSS')
    from models.supersenses.lstm_mlp_supersenses_model import LstmMlpSupersensesModel
    from models.supersenses.preprocessing import preprocess_sentence
    from models.supersenses.preprocessing.corenlp import CoreNLPServer

    corenlp = CoreNLPServer()
    corenlp.start(port)
    model = LstmMlpSupersensesModel.load(model_path)
    print('END-INIT-PSS')

    for count, entry in enumerate(csv_reader, start=0):
        # the next few lines of code deal with opening and closing files
        # (depending on the batching argument, etc.)
        new_file = False

        # first option: standard output ...
        if count == 0 and output_file is None:
            output = sys.stdout
            new_file = True

        # second option: we've just started, we're writing to a real file, but no batching
        if count == 0 and output_file is not None and batch_size is None:
            output_file_actual = '{0}.csv'.format(output_file)
            output = open(output_file_actual, 'w', encoding='utf-8', newline='')
            new_file = True

        # third option: we've finished a batch (and we are batching ...)
        if batch_size is not None and count % batch_size == 0:
            output_file_actual = '{0}-{1}.csv'.format(output_file, batch)
            if output is not None:
                output.close()
            output = open(output_file_actual, 'w', encoding='utf-8', newline='')
            batch += 1
            new_file = True

        # if we did create a new file, let's ensure that the first row consists of column titles
        if new_file:
            csv_writer = csv.writer(output)
            csv_writer.writerow(column_mapper.get_new_headers())

        # now that we've finished creating a new file as necessary, we can proceed
        # with the business at hand:
        words = column_mapper.get_field_value_from_source(entry, 'words', True)
        if words is None:
            csv_writer.writerow(column_mapper.get_new_row_values(entry, [None]))
            continue

        sentence = column_mapper.get_field_value_from_source(entry, 'sentence')
        proper_tokens = word_tokenize(sentence)

        print('BEGIN-PROCESS-PSS')
        preprocessed = preprocess_sentence(' '.join(proper_tokens))
        pss_pred = model.predict(
            preprocessed.xs, [x.identified_for_pss for x in preprocessed.xs])
        print('END-PROCESS-PSS')

        # collect (role, function) supersense pairs, keyed by 1-based token index
        pss_lookup_nltk_tokens = {}
        for index in range(len(preprocessed.xs)):
            if pss_pred[index].supersense_role:
                pss_lookup_nltk_tokens[index + 1] = (pss_pred[index].supersense_role,
                                                     pss_pred[index].supersense_func)

        # if NLTK's tokenization differs from the one in 'words', re-map the
        # supersense indices onto our own tokenization
        ud_tokens = [token for _, token in words]
        pss_lookup = pss_lookup_nltk_tokens
        if ud_tokens != proper_tokens and len(pss_lookup_nltk_tokens) > 0:
            pss_lookup = SyncTags.b_lookup_to_a_lookup(ud_tokens, proper_tokens,
                                                       pss_lookup_nltk_tokens)

        csv_writer.writerow(
            column_mapper.get_new_row_values(entry, [pss_lookup]))

    corenlp.stop()

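# Usage sketch (hypothetical port, model path and file names): start CoreNLP on
# port 9000, load the supersenses model and add a 'pss' column to each row:
#
#     parse_pss(9000, 'models/pss/best_model',
#               input_file='ud_parsed.csv',
#               output_file='ud_parsed_pss.csv')
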
def parse_ucca(model_prefix, input_file=None, output_file=None, batch_size=None):
    """UCCA-parse each entry of a TAC-style CSV with TUPA, one sentence at a time.

    Parameters
    ----------
    model_prefix : str
        Prefix of the TUPA model files to load.
    input_file : str, optional
        Path to the input CSV; defaults to standard input.
    output_file : str, optional
        Output CSV destination; when omitted, output is written to standard output.
    batch_size : int, optional
        When given, the output is split into files of this many rows.

    Returns
    -------
    None
    """
    input = open(input_file, encoding='utf-8') if input_file is not None else sys.stdin
    csv_reader = csv.reader(input)

    column_mapper = CsvColumnMapper(
        source_first_row=next(csv_reader),
        target_columns=[
            'id', 'sentence', 'ent1', 'ent2', 'ent1_start', 'ent1_end',
            'ent2_start', 'ent2_end', 'ucca_parse', 'words', 'lemmas',
            'comment'
        ],
        source_required=[
            'tac_tokens', 'subj_start', 'subj_end', 'obj_start', 'obj_end'
        ],
        filter_source_from_result=[
            'subj_start', 'subj_end', 'obj_start', 'obj_end'
        ])

    detokenizer = Detokenizer()
    mnofc = ManageNewOutputFileCreation(output_file, batch_size)

    print('BEGIN-INIT-TUPA')
    parser = TupaParser(model_prefix)
    print('END-INIT-TUPA')

    for count, entry in enumerate(csv_reader, start=0):
        new_file = mnofc.get_new_file_if_necessary()
        if new_file:
            csv_writer = csv.writer(new_file)
            csv_writer.writerow(column_mapper.get_new_headers())

        tac_tokens = eval(
            column_mapper.get_field_value_from_source(entry, 'tac_tokens'))
        sentence = detokenizer.detokenize(tac_tokens)

        print('BEGIN-PROCESS-TUPA')
        parsed_sentence = parser.parse_sentence(sentence)
        print('END-PROCESS-TUPA')

        tokens = []
        tokens_with_indices = []
        lemmas_with_indices = []
        for ucca_terminal in parsed_sentence.terminals:
            tokens.append(ucca_terminal.text)
            tokens_with_indices.append((ucca_terminal.token_id, ucca_terminal.text))
            lemmas_with_indices.append((ucca_terminal.token_id, ucca_terminal.lemma))

        tac_tokens_lookup = {}
        tac_tokens_lookup['subj_start'] = int(
            column_mapper.get_field_value_from_source(entry, 'subj_start'))
        tac_tokens_lookup['subj_end'] = int(
            column_mapper.get_field_value_from_source(entry, 'subj_end'))
        tac_tokens_lookup['obj_start'] = int(
            column_mapper.get_field_value_from_source(entry, 'obj_start'))
        tac_tokens_lookup['obj_end'] = int(
            column_mapper.get_field_value_from_source(entry, 'obj_end'))

        token_lookup = SyncTacTags.b_lookup_to_a_lookup(tokens, tac_tokens,
                                                        tac_tokens_lookup)

        if len(token_lookup) != len(tac_tokens_lookup):
            csv_writer.writerow(
                column_mapper.get_new_row_values(entry, [
                    count, sentence, None, None, None, None, None, None, None,
                    None, None,
                    'was not able to reconcile TAC and Tupa\'s Spacy based indexing'
                ]))
            continue

        ent1_start = token_lookup['subj_start']
        ent1_end = token_lookup['subj_end']
        ent1 = ' '.join(tokens[ent1_start:ent1_end + 1])

        ent2_start = token_lookup['obj_start']
        ent2_end = token_lookup['obj_end']
        ent2 = ' '.join(tokens[ent2_start:ent2_end + 1])

        csv_writer.writerow(
            column_mapper.get_new_row_values(entry, [
                count, sentence, ent1, ent2, ent1_start + 1, ent1_end + 1,
                ent2_start + 1, ent2_end + 1, parsed_sentence.serialize(),
                tokens_with_indices, lemmas_with_indices, None
            ]))

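# Usage sketch (hypothetical model prefix and file names): UCCA-parse one sentence
# at a time with TUPA, adding the serialized parse, token/lemma lists and entity
# offsets to each row:
#
#     parse_ucca('models/ucca-bilstm',
#                input_file='tac_sample.csv',
#                output_file='ucca_parsed.csv',
#                batch_size=500)
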
def extract_relations(output, ud_input, ud_paths, ucca_input, ucca_paths, triggers):
    def get_output_entry_list(id, sentence,
                              ud_words='', ud_lemmas='', ud_parse='',
                              ucca_words='', ucca_lemmas='', ucca_parse='',
                              ud_trigger='', ud_path='',
                              ucca_trigger='', ucca_path='',
                              extraction_comment=''):
        return [
            id, sentence, ud_words, ud_lemmas, ud_parse, ucca_words,
            ucca_lemmas, ucca_parse, ud_trigger, ud_path, ucca_trigger,
            ucca_path, extraction_comment
        ]

    csv_writer = csv.writer(output)
    csv_writer.writerow([
        'id', 'sentence', 'ud_words', 'ud_lemmas', 'ud_parse', 'ucca_words',
        'ucca_lemmas', 'ucca_parse', 'ud_trigger', 'ud_path', 'ucca_trigger',
        'ucca_path', 'extraction_comment'
    ])

    ud_reader = csv.reader(ud_input)
    ucca_reader = csv.reader(ucca_input)

    ud_column_mapper = CsvColumnMapper(
        source_first_row=next(ud_reader),
        target_columns=['trigger', 'trigger_idx', 'matched-lemma', 'path'],
        source_required=[
            'sentence', 'ud_parse', 'lemmas', 'ent1_start', 'ent1_end',
            'ent2_start', 'ent2_end'
        ])

    ucca_column_mapper = CsvColumnMapper(
        source_first_row=next(ucca_reader),
        target_columns=['trigger', 'trigger_idx', 'matched-lemma', 'path'],
        source_required=[
            'sentence', 'ucca_parse', 'lemmas', 'ent1_start', 'ent1_end',
            'ent2_start', 'ent2_end'
        ])

    ucca_entry_lookup = {}
    for ucca_row in ucca_reader:
        id = ucca_column_mapper.get_field_value_from_source(ucca_row, 'id', as_int=True)
        ucca_entry_lookup[id] = ucca_row

    for ud_row in ud_reader:
        id = ud_column_mapper.get_field_value_from_source(ud_row, 'id', as_int=True)
        sentence = ud_column_mapper.get_field_value_from_source(ud_row, 'sentence')

        ucca_row = ucca_entry_lookup.get(id)
        if ucca_row is None:
            csv_writer.writerow(
                get_output_entry_list(
                    id, sentence, extraction_comment='No matching UCCA row'))
            continue

        ud_words = ud_column_mapper.get_field_value_from_source(ud_row, 'words')
        ud_lemmas = ud_column_mapper.get_field_value_from_source(ud_row, 'lemmas')
        ud_parse = ud_column_mapper.get_field_value_from_source(ud_row, 'ud_parse')

        ud_match = __extract_relation_ud(ud_row, ud_paths, ud_column_mapper, triggers)
        ud_trigger = ud_match.trigger if ud_match is not None else None
        ud_path = ud_match.path if ud_match is not None else None

        ucca_words = ucca_column_mapper.get_field_value_from_source(ucca_row, 'words')
        ucca_lemmas = ucca_column_mapper.get_field_value_from_source(ucca_row, 'lemmas')
        ucca_parse = ucca_column_mapper.get_field_value_from_source(ucca_row, 'ucca_parse')

        ucca_match = __extract_relation_ucca(ucca_row, ucca_paths, ucca_column_mapper, triggers)
        ucca_trigger = ucca_match.trigger if ucca_match is not None else None
        ucca_path = ucca_match.path if ucca_match is not None else None

        csv_writer.writerow(
            get_output_entry_list(id, sentence,
                                  ud_words=ud_words,
                                  ud_lemmas=ud_lemmas,
                                  ud_parse=ud_parse,
                                  ucca_words=ucca_words,
                                  ucca_lemmas=ucca_lemmas,
                                  ucca_parse=ucca_parse,
                                  ud_trigger=ud_trigger,
                                  ud_path=ud_path,
                                  ucca_trigger=ucca_trigger,
                                  ucca_path=ucca_path))

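# Usage sketch (hypothetical file names; ud_paths, ucca_paths and triggers are assumed
# to be collections of path strings and trigger words loaded elsewhere): merge the UD
# and UCCA extraction results by sentence id into a single comparison CSV:
#
#     with open('ud_parsed.csv', encoding='utf-8') as ud_in, \
#             open('ucca_parsed.csv', encoding='utf-8') as ucca_in, \
#             open('relations.csv', 'w', encoding='utf-8', newline='') as out:
#         extract_relations(out, ud_in, ud_paths, ucca_in, ucca_paths, triggers)
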
def filter_relations(input, output, entity_types=None):
    csv_reader = csv.reader(input)
    csv_writer = csv.writer(output)

    required_columns = [
        'id', 'sentence', 'words', 'lemmas', 'ent1_start', 'ent1_end',
        'ent2_start', 'ent2_end', 'path'
    ]
    if entity_types is not None:
        required_columns.append('ner')

    column_mapper = CsvColumnMapper(source_first_row=next(csv_reader),
                                    target_columns=[],
                                    source_required=required_columns)

    csv_writer.writerow(column_mapper.get_new_headers())

    for counter, entry in enumerate(csv_reader, start=1):
        path = column_mapper.get_field_value_from_source(entry, 'path')
        if path is None or path == '':
            continue

        ent1_start = column_mapper.get_field_value_from_source(entry, 'ent1_start', as_int=True)
        ent1_end = column_mapper.get_field_value_from_source(entry, 'ent1_end', as_int=True)
        ent1_indexes = [idx for idx in range(ent1_start, ent1_end + 1)]

        ent2_start = column_mapper.get_field_value_from_source(entry, 'ent2_start', as_int=True)
        ent2_end = column_mapper.get_field_value_from_source(entry, 'ent2_end', as_int=True)
        ent2_indexes = [idx for idx in range(ent2_start, ent2_end + 1)]

        filtered = False
        if entity_types is not None:
            entity1_type = entity_types[0]
            entity2_type = entity_types[1]

            ner_tags = column_mapper.get_field_value_from_source(entry, 'ner', evaluate=True)

            # let's see if any of entity 1's tokens match entity1_type
            entity1_type_match = False
            for ent1_index in ent1_indexes:
                if ent1_index in ner_tags and ner_tags[ent1_index] == entity1_type:
                    entity1_type_match = True
                    break

            entity2_type_match = False
            for ent2_index in ent2_indexes:
                if ent2_index in ner_tags and ner_tags[ent2_index] == entity2_type:
                    entity2_type_match = True
                    break

            filtered = not entity1_type_match or not entity2_type_match

        if not filtered:
            csv_writer.writerow(column_mapper.get_new_row_values(entry, []))

    output.close()

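# Usage sketch (hypothetical file names and entity types): keep only rows that have a
# non-empty extracted path and whose entities carry the requested NER types; pass
# entity_types=None to skip the NER check:
#
#     with open('relations.csv', encoding='utf-8') as inp, \
#             open('relations_filtered.csv', 'w', encoding='utf-8', newline='') as out:
#         filter_relations(inp, out, entity_types=('PERSON', 'ORG'))
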
def parse_ud(input_file=None, output_file=None, batch_size=None):
    """UD-parse each entry of a TAC-style CSV with the python stanfordnlp pipeline.

    Parameters
    ----------
    input_file : str, optional
        Path to the input CSV; defaults to standard input.
    output_file : str, optional
        Path to the output CSV; defaults to standard output.
    batch_size : int, optional
        When given, the output is split into files of this many rows.

    Returns
    -------
    None
    """
    input = open(input_file, encoding='utf-8') if input_file is not None else sys.stdin
    csv_reader = csv.reader(input)

    column_mapper = CsvColumnMapper(
        source_first_row=next(csv_reader),
        target_columns=[
            'id', 'sentence', 'ent1', 'ent2', 'ent1_start', 'ent1_end',
            'ent2_start', 'ent2_end', 'ud_parse', 'words', 'lemmas', 'comment'
        ],
        source_required=[
            'tac_tokens', 'subj_start', 'subj_end', 'obj_start', 'obj_end'
        ],
        filter_source_from_result=[
            'subj_start', 'subj_end', 'obj_start', 'obj_end'
        ])

    detokenizer = Detokenizer()

    ## The prints before and after Pipeline initialization are used by the calling script
    ## as markers to indicate output that should be filtered out - see the bin/parse_ud
    ## implementation
    print('BEGIN-INIT-NLP')
    nlp = stanfordnlp.Pipeline()
    print('END-INIT-NLP')

    batch = 0
    output = None
    output_file = output_file[:-len('.csv')] if output_file is not None and output_file.endswith('.csv') \
        else output_file

    for count, entry in enumerate(csv_reader, start=0):
        # the next few lines of code deal with opening and closing files
        # (depending on the batching argument, etc.)
        new_file = False

        # first option: standard output ...
        if count == 0 and output_file is None:
            output = sys.stdout
            new_file = True

        # second option: we've just started, we're writing to a real file, but no batching
        if count == 0 and output_file is not None and batch_size is None:
            output_file_actual = '{0}.csv'.format(output_file)
            output = open(output_file_actual, 'w', encoding='utf-8', newline='')
            new_file = True

        # third option: we've finished a batch (and we are batching ...)
        if output_file is not None and batch_size is not None and count % batch_size == 0:
            output_file_actual = '{0}-{1}.csv'.format(output_file, batch)
            if output is not None:
                output.close()
            output = open(output_file_actual, 'w', encoding='utf-8', newline='')
            batch += 1
            new_file = True

        # if we did create a new file, let's ensure that the first row consists of column titles
        if new_file:
            csv_writer = csv.writer(output)
            csv_writer.writerow(column_mapper.get_new_headers())

        # now that we've finished creating a new file as necessary, we can proceed
        # with the business at hand:
        tac_tokens = eval(
            column_mapper.get_field_value_from_source(entry, 'tac_tokens'))
        sentence = detokenizer.detokenize(tac_tokens)

        parsed_sentence = nlp(sentence)

        # let's ignore sentences that parse into multiple sentences - so as to avoid confusion
        if len(parsed_sentence.sentences) > 1:
            csv_writer.writerow(
                column_mapper.get_new_row_values(entry, [
                    count, sentence, None, None, None, None, None, None, None,
                    None, None,
                    'python stanfordnlp parse produced more than one sentence'
                ]))
            continue

        ud_parse = []
        for governor, dep, word in parsed_sentence.sentences[0].dependencies:
            ud_parse.append(
                (word.index, word.text, dep, governor.index, governor.text))

        tokens = []
        tokens_with_indices = []
        lemmas_with_indices = []
        for token in parsed_sentence.sentences[0].tokens:
            for word in token.words:
                tokens.append(word.text)
                tokens_with_indices.append((word.index, word.text))
                lemmas_with_indices.append((word.index, word.lemma))

        ud_parse.sort(key=lambda x: int(x[0]))

        tac_tokens_lookup = {}
        tac_tokens_lookup['subj_start'] = int(
            column_mapper.get_field_value_from_source(entry, 'subj_start'))
        tac_tokens_lookup['subj_end'] = int(
            column_mapper.get_field_value_from_source(entry, 'subj_end'))
        tac_tokens_lookup['obj_start'] = int(
            column_mapper.get_field_value_from_source(entry, 'obj_start'))
        tac_tokens_lookup['obj_end'] = int(
            column_mapper.get_field_value_from_source(entry, 'obj_end'))

        token_lookup = SyncTacTags.b_lookup_to_a_lookup(
            tokens, tac_tokens, tac_tokens_lookup)

        if len(token_lookup) != len(tac_tokens_lookup):
            csv_writer.writerow(
                column_mapper.get_new_row_values(entry, [
                    count, sentence, None, None, None, None, None, None, None,
                    None, None,
                    'was not able to reconcile TAC and python stanfordnlp parse indexing'
                ]))
            continue

        ent1_start = token_lookup['subj_start']
        ent1_end = token_lookup['subj_end']
        ent1 = ' '.join(tokens[ent1_start:ent1_end + 1])

        ent2_start = token_lookup['obj_start']
        ent2_end = token_lookup['obj_end']
        ent2 = ' '.join(tokens[ent2_start:ent2_end + 1])

        csv_writer.writerow(
            column_mapper.get_new_row_values(entry, [
                count, sentence, ent1, ent2, ent1_start + 1, ent1_end + 1,
                ent2_start + 1, ent2_end + 1, ud_parse, tokens_with_indices,
                lemmas_with_indices, None
            ]))

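# Usage sketch (hypothetical file names): UD-parse a TAC-style CSV, writing batches of
# 500 rows to ud_parsed-0.csv, ud_parsed-1.csv, ... following the
# '{output_file}-{batch}.csv' pattern implemented above:
#
#     parse_ud(input_file='tac_sample.csv',
#              output_file='ud_parsed.csv',
#              batch_size=500)
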
def extract_relations_ud(input, output, triggers, paths, include_miss=False):
    csv_reader = csv.reader(input)
    csv_writer = csv.writer(output)

    required_columns = [
        'sentence', 'ud_parse', 'lemmas', 'ent1_start', 'ent1_end',
        'ent2_start', 'ent2_end'
    ]
    column_mapper = CsvColumnMapper(
        next(csv_reader),
        ['trigger', 'trigger_idx', 'path', 'extraction_comment'],
        source_required=required_columns)

    csv_writer.writerow(column_mapper.get_new_headers())

    for counter, entry in enumerate(csv_reader, start=1):
        ud_parse = column_mapper.get_field_value_from_source(entry, 'ud_parse', evaluate=True)
        if ud_parse is None:
            if include_miss:
                csv_writer.writerow(
                    column_mapper.get_new_row_values(
                        entry, [None, None, None, 'ud_parse missing']))
            continue

        links = UdRepresentationPlaceholder.get_links_from_ud_dep(ud_parse)

        lemma_indices = column_mapper.get_field_value_from_source(
            entry, 'lemmas', evaluate=True)
        lemmas = [lemma for _, lemma in lemma_indices]

        word_indices = column_mapper.get_field_value_from_source(entry, 'words', evaluate=True)
        words = [word for _, word in word_indices]

        ent1_start = column_mapper.get_field_value_from_source(entry, 'ent1_start', as_int=True)
        ent1_end = column_mapper.get_field_value_from_source(entry, 'ent1_end', as_int=True)
        if ent1_start is None or ent1_end is None:
            if include_miss:
                csv_writer.writerow(
                    column_mapper.get_new_row_values(
                        entry, [None, None, None, 'could not identify ent1']))
            continue
        ent1_indexes = [idx for idx in range(ent1_start, ent1_end + 1)]
        ent1_head = Link.get_head(links, ent1_indexes)

        ent2_start = column_mapper.get_field_value_from_source(entry, 'ent2_start', as_int=True)
        ent2_end = column_mapper.get_field_value_from_source(entry, 'ent2_end', as_int=True)
        if ent2_start is None or ent2_end is None:
            if include_miss:
                csv_writer.writerow(
                    column_mapper.get_new_row_values(
                        entry, [None, None, None, 'could not identify ent2']))
            continue
        ent2_indexes = [idx for idx in range(ent2_start, ent2_end + 1)]
        ent2_head = Link.get_head(links, ent2_indexes)

        graph = DepGraph(links)

        found_relation = False
        trigger_word_matches = []
        for trigger_index, (word, lemma) in enumerate(zip(words, lemmas), start=1):
            if word in triggers or lemma in triggers:
                trigger_word_matches.append(word)

                trigger_to_ent2 = Step.get_default_representation(
                    graph.get_steps(trigger_index, ent2_head))
                ent1_to_trigger = Step.get_default_representation(
                    graph.get_steps(ent1_head, trigger_index))
                ent1_to_ent2_via_trigger = '{0} >< {1}'.format(
                    ent1_to_trigger, trigger_to_ent2)

                if ent1_to_ent2_via_trigger in paths:
                    found_relation = True
                    trigger = word if word in triggers else lemma
                    csv_writer.writerow(
                        column_mapper.get_new_row_values(entry, [
                            trigger, trigger_index, ent1_to_ent2_via_trigger,
                            None
                        ]))
                    break

        if not found_relation:
            if include_miss:
                comment = 'relation not found - considered the following matching triggers: {}' \
                    .format(' '.join(trigger_word_matches))
                csv_writer.writerow(
                    column_mapper.get_new_row_values(
                        entry, [None, None, None, comment]))

    output.close()

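# Usage sketch (hypothetical file names; triggers is assumed to be a set of trigger
# words/lemmas and paths a set of '{ent1->trigger} >< {trigger->ent2}' strings of the
# form built above):
#
#     with open('ud_parsed.csv', encoding='utf-8') as inp, \
#             open('ud_relations.csv', 'w', encoding='utf-8', newline='') as out:
#         extract_relations_ud(inp, out, triggers, paths, include_miss=True)
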
def parse_ucca(tupa_dir, model_prefix, tupa_batch_size, input_file=None,
               output_file=None, batch_size=None):
    """UCCA-parse each entry of a TAC-style CSV, sending sentences to TUPA in batches.

    Parameters
    ----------
    tupa_dir : str
        TUPA directory passed to TupaParser2.
    model_prefix : str
        Prefix of the TUPA model files to load.
    tupa_batch_size : int
        Number of sentences sent to TUPA in each parse_sentences call.
    input_file : str, optional
        Path to the input CSV; defaults to standard input.
    output_file : str, optional
        Output CSV destination; when omitted, output is written to standard output.
    batch_size : int, optional
        When given, the output is split into files of this many rows.

    Returns
    -------
    None
    """
    input = open(input_file, encoding='utf-8') if input_file is not None else sys.stdin
    csv_reader = csv.reader(input)

    column_mapper = CsvColumnMapper(
        source_first_row=next(csv_reader),
        target_columns=[
            'id', 'sentence', 'ent1', 'ent2', 'ent1_start', 'ent1_end',
            'ent2_start', 'ent2_end', 'ucca_parse', 'words', 'lemmas',
            'comment'
        ],
        source_required=[
            'tac_tokens', 'subj_start', 'subj_end', 'obj_start', 'obj_end'
        ],
        filter_source_from_result=[
            'subj_start', 'subj_end', 'obj_start', 'obj_end'
        ])

    detokenizer = Detokenizer()
    mnofc = ManageNewOutputFileCreation(output_file, batch_size)

    parser = TupaParser2(tupa_dir, model_prefix)
    nlp = spacy.load('en_core_web_md')

    count = -1
    for next_batch in zip_longest(*([csv_reader] * tupa_batch_size)):

        entries = []
        sentences = []
        for entry in next_batch:
            if entry is None:
                # we've reached the end of the batch
                break
            entries.append(entry)
            tac_tokens = eval(
                column_mapper.get_field_value_from_source(entry, 'tac_tokens'))
            sentence = detokenizer.detokenize(tac_tokens)
            sentences.append(sentence)

        # send multiple sentences for parsing (as many as 'tupa_batch_size')
        parsed_sentences = parser.parse_sentences(sentences)

        # if the length of parsed_sentences differs from the length of sentences, then
        # all bets are off - there is no point in trying to consolidate.
        # (that's especially true because the situation in which the lengths differ is
        # when the 'python -m tupa' command fails, in which case parsed_sentences will
        # be empty)
        if len(parsed_sentences) != len(sentences):
            parsed_sentences = [None] * len(sentences)

        for sentence, parsed_sentence, entry in zip(sentences, parsed_sentences, entries):
            count += 1

            new_file = mnofc.get_new_file_if_necessary()
            if new_file:
                csv_writer = csv.writer(new_file)
                csv_writer.writerow(column_mapper.get_new_headers())

            if parsed_sentence is None:
                csv_writer.writerow(
                    column_mapper.get_new_row_values(entry, [
                        count, sentence, None, None, None, None, None, None,
                        None, None, None,
                        'TUPA did not produce a UCCA parse'
                    ]))
                continue

            tokens = []
            tokens_with_indices = []
            lemmas_with_indices = []
            for ucca_terminal in parsed_sentence.terminals:
                tokens.append(ucca_terminal.text)
                tokens_with_indices.append(
                    (ucca_terminal.token_id, ucca_terminal.text))

            # use spacy to get lemmas
            spacied = nlp(sentence)
            for token_id, word in enumerate(spacied, start=1):
                lemmas_with_indices.append((token_id, word.lemma_))

            tac_tokens = eval(
                column_mapper.get_field_value_from_source(entry, 'tac_tokens'))

            tac_tokens_lookup = {}
            tac_tokens_lookup['subj_start'] = int(
                column_mapper.get_field_value_from_source(entry, 'subj_start'))
            tac_tokens_lookup['subj_end'] = int(
                column_mapper.get_field_value_from_source(entry, 'subj_end'))
            tac_tokens_lookup['obj_start'] = int(
                column_mapper.get_field_value_from_source(entry, 'obj_start'))
            tac_tokens_lookup['obj_end'] = int(
                column_mapper.get_field_value_from_source(entry, 'obj_end'))

            token_lookup = SyncTacTags.b_lookup_to_a_lookup(
                tokens, tac_tokens, tac_tokens_lookup)

            if len(token_lookup) != len(tac_tokens_lookup):
                csv_writer.writerow(
                    column_mapper.get_new_row_values(entry, [
                        count, sentence, None, None, None, None, None, None,
                        None, None, None,
                        'was not able to reconcile TAC and Tupa\'s Spacy based indexing'
                    ]))
                continue

            ent1_start = token_lookup['subj_start']
            ent1_end = token_lookup['subj_end']
            ent1 = ' '.join(tokens[ent1_start:ent1_end + 1])

            ent2_start = token_lookup['obj_start']
            ent2_end = token_lookup['obj_end']
            ent2 = ' '.join(tokens[ent2_start:ent2_end + 1])

            csv_writer.writerow(
                column_mapper.get_new_row_values(entry, [
                    count, sentence, ent1, ent2, ent1_start + 1, ent1_end + 1,
                    ent2_start + 1, ent2_end + 1, parsed_sentence.serialize(),
                    tokens_with_indices, lemmas_with_indices, None
                ]))

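# Usage sketch (hypothetical directories, model prefix and file names): UCCA-parse with
# TUPA in groups of 10 sentences per invocation, while the output CSV is still split
# into row batches by ManageNewOutputFileCreation:
#
#     parse_ucca('third_party/tupa', 'models/ucca-bilstm', 10,
#                input_file='tac_sample.csv',
#                output_file='ucca_parsed.csv',
#                batch_size=500)
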