def _read(self, file_path_list):
    source_copy = self.source_copy
    for lang, file_path in file_path_list:
        try:
            u_pos = self.convert_postags(lang)
        except Exception:
            u_pos = None
        # If `file_path` is a URL, redirect to the cache.
        file_path = cached_path(file_path)
        logger.info("Reading instances from lines in file at: %s %s %s", lang, file_path, self.split)
        i = 0
        for amr in AMRIO.read(file_path, lang=lang,
                              universal_postags=self.universal_postags, postag_map=u_pos):
            i += 1
            try:
                yield self.text_to_instance(amr, lang, source_copy, self.split)
            except Exception:
                # Skip malformed instances during training/validation; fail loudly on test.
                if self.split != "test":
                    continue
                raise
    self.report_coverage()
def restore_file(self, file_path):
    for amr in AMRIO.read(file_path):
        try:
            self.restore_instance(amr)
            yield amr
        except Exception:
            # If restoration fails, fall back to yielding the AMR unchanged.
            yield amr
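# Usage sketch (not from the original sources): drive `restore_file` from a small
# command-line script, mirroring the `.input_clean` driver at the end of this
# section. `Restorer`, its constructor, and the `.restore` suffix are assumptions
# for illustration only.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser('restore.py')
    parser.add_argument('--amr_files', nargs='+', default=[])
    args = parser.parse_args()

    restorer = Restorer()  # hypothetical name for the class defining `restore_file`
    for file_path in args.amr_files:
        with open(file_path + '.restore', 'w', encoding='utf-8') as f:
            for amr in restorer.restore_file(file_path):
                f.write(str(amr) + '\n\n')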
def recategorize_file(self, file_path):
    for i, amr in enumerate(AMRIO.read(file_path), 1):
        self.recategorize_graph(amr)
        yield amr
        if i % 1000 == 0:
            logger.info('Processed {} examples.'.format(i))
    logger.info('Done.\n')
def annotate_file(self, in_path, out):
    with open(out, 'w', encoding='utf-8') as f:
        for i, amr in enumerate(AMRIO.read(os.path.join(in_path))):
            if i % 1000 == 0:
                logger.info('{} processed.'.format(i))
            sentence = amr.sentence
            if self.lang == "it":
                annotation = self.tint_annotate(sentence.replace("[ ... ]", ""))
            else:
                annotation = self.stanza_annotate(sentence)
            amr.tokens = annotation['tokens']
            amr.lemmas = annotation['lemmas']
            amr.pos_tags = annotation['pos_tags']
            amr.ner_tags = annotation['ner_tags']
            amr.abstract_map = {}
            AMRIO.dump([amr], f)
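# Sketch of what `stanza_annotate` might look like (assumption: the real method is
# defined elsewhere in this class). It maps a raw sentence to the token-level fields
# consumed above, using a Stanza pipeline held in the hypothetical attribute
# `self.nlp` (e.g. stanza.Pipeline(lang, processors='tokenize,pos,lemma,ner')).
def stanza_annotate(self, sentence):
    doc = self.nlp(sentence)
    words = [word for sent in doc.sentences for word in sent.words]
    tokens = [token for sent in doc.sentences for token in sent.tokens]
    # Note: word and token counts can differ when Stanza splits multi-word tokens;
    # a real implementation would need to reconcile that alignment.
    return {
        'tokens': [w.text for w in words],
        'lemmas': [w.lemma for w in words],
        'pos_tags': [w.upos for w in words],
        'ner_tags': [t.ner for t in tokens],
    }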
def _update_counter_from_train_files(self, amr_train_files, base_freq=1):
    logger.info('Updating (lemma, frame) counter from AMR train files.')
    for file_path in amr_train_files:
        for amr in AMRIO.read(file_path):
            for node in amr.graph.get_nodes():
                for _, frame in node.get_frame_attributes():
                    frame_lemma = re.sub(WORDSENSE_RE, '', frame)
                    self._update_counter(self.lemma_frame_counter, frame_lemma, frame, base_freq)
                    self._update_counter(self.frame_lemma_counter, frame, frame_lemma, base_freq)
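# Minimal sketch of what `_update_counter` might look like (assumption: the real
# helper is defined elsewhere in this class). It keeps a nested count of how often
# `value` co-occurs with `key`, weighted by `freq`.
def _update_counter(self, counter, key, value, freq):
    # `counter` maps key -> collections.Counter of co-occurring values.
    if key not in counter:
        counter[key] = Counter()
    counter[key][value] += freq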
def _get_senseless_node_counter(amr_train_files):
    logger.info('Building the senseless node counter.')
    sense_less_nodes = []
    for amr_file in amr_train_files:
        for amr in AMRIO.read(amr_file):
            for node in amr.graph.get_nodes():
                for attr, value in node.get_senseless_attributes():
                    sense_less_nodes.append(value)
    return Counter(sense_less_nodes)
def read_file_gold_amr(self, lang_sentences):
    with open(self.dump_dir + '{}_{}.txt'.format(self.split, self.lang), 'w', encoding='utf-8') as f:
        for i, amr in enumerate(AMRIO.read(os.path.join(self.in_path))):
            if i % 1000 == 0:
                logger.info('{} processed.'.format(i))
            sentence = amr.sentence
            parallel_sentence = lang_sentences[i]
            amr.sentence = parallel_sentence
            amr.tokens = None
            amr.lemmas = None
            amr.pos_tags = None
            amr.ner_tags = None
            amr.misc = ["# ::tok-{}".format("en") + " " + sentence]
            amr.abstract_map = {}
            AMRIO.dump([amr], f)
def dump_spotlight_wiki(self, file_path):
    sent_map = {}
    for i, amr in tqdm(enumerate(AMRIO.read(file_path), 1)):
        if i % 20 == 0:
            print('+', end='')
        sent = amr.sentence
        wiki = self.spotlight_wiki_docker(sent, port=self.spotlight_port)
        sent_map[sent] = wiki
        # sleep(0.1)
    with open(os.path.join(self.util_dir, args.spotlight_wiki), 'w', encoding='utf-8') as f:
        json.dump(sent_map, f)
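# Companion sketch (assumption, not from the original sources): read the dumped
# sentence -> wiki map back in, so wikification can reuse cached DBpedia Spotlight
# results instead of re-querying the service. `load_spotlight_wiki` and
# `self.spotlight_map` are hypothetical names.
def load_spotlight_wiki(self, file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        self.spotlight_map = json.load(f)
    return self.spotlight_map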
def expand_file(self, file_path):
    for i, amr in enumerate(AMRIO.read(file_path, lang=self.lang,
                                       universal_postags=self.u_pos, postag_map=self.postag_map)):
        self.expand_graph(amr)
        yield amr
    self.print_stats()
def wikify_file(self, file_path, lang="en"):
    for i, amr in enumerate(AMRIO.read(file_path, lang=lang)):
        self.wikify_graph(amr)
        yield amr
def read(self, file_path):
    for amr in AMRIO.read(file_path):
        yield self(amr)
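# Usage sketch (illustrative only): `read` pipes every AMR in a file through the
# object's own `__call__`, so a processor can be chained with `AMRIO.dump` like
# this. `processor` and the file names are hypothetical.
with open('dev.txt.processed', 'w', encoding='utf-8') as f:
    for amr in processor.read('dev.txt'):  # `processor` implements __call__(amr)
        AMRIO.dump([amr], f)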
parser.add_argument('files', nargs='+', help='files to annotate.')
parser.add_argument('--compound_file', default='data/misc/joints.txt')
parser.add_argument('--processed_sentences', default='')
args = parser.parse_args()

annotator = FeatureAnnotator('http://localhost:9000', args.compound_file)

processed = set()
if args.processed_sentences != "":
    with open(args.processed_sentences, "r") as infile:
        for line in infile:
            processed.add(line.rstrip())

for file_path in args.files:
    logger.info('Processing {}'.format(file_path))
    with open(file_path + '.features{}'.format('' if len(processed) == 0 else '.partial'),
              'w', encoding='utf-8') as f:
        for i, amr in enumerate(AMRIO.read(file_path), 1):
            if i % 1000 == 0:
                logger.info('{} processed.'.format(i))
            if amr.sentence in processed:
                continue
            annotation = annotator(amr.sentence)
            amr.tokens = annotation['tokens']
            amr.lemmas = annotation['lemmas']
            amr.pos_tags = annotation['pos_tags']
            amr.ner_tags = annotation['ner_tags']
            AMRIO.dump([amr], f)
logger.info('Done!')
def read_translations(self, lang_sentences):
    for i, amr in enumerate(AMRIO.read(lang_sentences)):
        if amr.id not in self.translations:
            self.translations[amr.id] = amr
            if token == '911':
                index = i
                break
        else:
            break
        amr.replace_span([index], ['09', '11'], ['CD', 'CD'], ['DATE', 'DATE'])


def replace_NT_dollar_abbr(amr):
    # Replace 'NT' in front of '$' with 'Taiwan'.
    for i, token in enumerate(amr.tokens):
        if token == 'NT' and len(amr.tokens) > i + 1 and amr.tokens[i + 1] in ('$', 'dollars', 'dollar'):
            amr.replace_span([i], ['Taiwan'], ['NNP'], ['COUNTRY'])


if __name__ == '__main__':
    import argparse
    from xlamr_stog.data.dataset_readers.amr_parsing.io import AMRIO

    parser = argparse.ArgumentParser('input_cleaner.py')
    parser.add_argument('--amr_files', nargs='+', default=[])
    args = parser.parse_args()

    for file_path in args.amr_files:
        with open(file_path + '.input_clean', 'w', encoding='utf-8') as f:
            for amr in AMRIO.read(file_path):
                clean(amr)
                f.write(str(amr) + '\n\n')