Example #1
 def recategorize_file(self, file_path):
     for i, amr in enumerate(AMRIO.read(file_path), 1):
         self.recategorize_graph(amr)
         yield amr
         if i % 1000 == 0:
             logger.info('Processed {} examples.'.format(i))
     logger.info('Done.\n')
Example #2
 def _read(self, file_path):
     # if `file_path` is a URL, redirect to the cache
     file_path = cached_path(file_path)
     logger.info("Reading instances from lines in file at: %s", file_path)
     for amr in AMRIO.read(file_path):
         yield self.text_to_instance(amr)
     self.report_coverage()
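
Most of the snippets on this page follow the same read-transform-dump pattern: iterate AMRs with AMRIO.read, modify each one, and write it back with AMRIO.dump. A minimal sketch of that pattern, assuming a recategorizer object like the one in Example #1; the helper name and the output suffix are illustrative, not part of the project:

from stog.data.dataset_readers.amr_parsing.io import AMRIO

def recategorize_to_file(recategorizer, file_path):
    # Drain the generator from Example #1 and write each AMR back to disk.
    with open(file_path + '.recategorized', 'w', encoding='utf-8') as f:
        for amr in recategorizer.recategorize_file(file_path):
            AMRIO.dump([amr], f)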
Example #3
 def _get_senseless_node_counter(amr_train_files):
     logger.info('Building the senseless node counter.')
     sense_less_nodes = []
     for amr_file in amr_train_files:
         for amr in AMRIO.read(amr_file):
             for node in amr.graph.get_nodes():
                 for attr, value in node.get_senseless_attributes():
                     sense_less_nodes.append(value)
     return Counter(sense_less_nodes)
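
The counter returned above is a plain collections.Counter, so it can be inspected directly; the training-file paths below are hypothetical:

senseless_node_counter = _get_senseless_node_counter(['amr_train_1.txt', 'amr_train_2.txt'])
print(senseless_node_counter.most_common(10))  # the ten most frequent sense-less node values
print(senseless_node_counter['thing'])         # frequency of a single value (0 if unseen)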
Example #4
 def _update_counter_from_train_files(self, amr_train_files, base_freq=1):
     logger.info('Updating (lemma, frame) counter from AMR train files.')
     for file_path in amr_train_files:
         for amr in AMRIO.read(file_path):
             for node in amr.graph.get_nodes():
                 for _, frame in node.get_frame_attributes():
                     frame_lemma = re.sub(WORDSENSE_RE, '', frame)
                     self._update_counter(self.lemma_frame_counter, frame_lemma, frame, base_freq)
                     self._update_counter(self.frame_lemma_counter, frame, frame_lemma, base_freq)
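
This example depends on two helpers that are not shown: WORDSENSE_RE and self._update_counter. A plausible sketch, assuming the regex strips PropBank-style sense suffixes such as -01 and that each counter is a dict of Counters; both are assumptions, not the project's verified definitions:

import re
from collections import Counter

# Assumed pattern: a trailing two-digit sense id, e.g. 'want-01' -> 'want'.
WORDSENSE_RE = re.compile(r'-\d\d$')

def _update_counter(self, counter, key, value, freq=1):
    # counter maps a key (e.g. a lemma) to a Counter of co-occurring values (e.g. frames).
    counter.setdefault(key, Counter())[value] += freq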
Example #5
File: wikification.py Project: cl91/stog
 def dump_spotlight_wiki(self, file_path):
     sent_map = {}
     for i, amr in enumerate(AMRIO.read(file_path), 1):
         if i % 20 == 0:
             print('+', end='')
         sent = amr.sentence
         wiki = self.spotlight_wiki(sent)
         sent_map[sent] = wiki
         sleep(0.1)
     with open(os.path.join(self.util_dir, 'spotlight_wiki.json'),
               'w',
               encoding='utf-8') as f:
         json.dump(sent_map, f)
Example #6
 def dump_spotlight_wiki(self, amr_files):
     #!!! This function has been changed by Deng Cai
     sent_map = {}
     for file_path in amr_files:
         for i, amr in enumerate(AMRIO.read(file_path), 1):
             if i % 20 == 0:
                 print('+', end='')
             sent = amr.sentence
             wiki = self.spotlight_wiki(sent)
             sent_map[sent] = wiki
             sleep(0.1)
     with open(os.path.join(self.util_dir, 'spotlight_wiki.json'),
               'w',
               encoding='utf-8') as f:
         json.dump(sent_map, f)
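
Both wikification examples rely on a spotlight_wiki method that maps a sentence to wiki links, and the sleep(0.1) suggests a rate-limited web service. A hypothetical sketch against a DBpedia Spotlight-style REST endpoint; the URL, parameters, and response keys are assumptions, not the project's actual client:

import requests

def spotlight_wiki(self, sentence, confidence=0.5):
    # Query an assumed DBpedia Spotlight-style annotation endpoint.
    response = requests.get(
        'https://api.dbpedia-spotlight.org/en/annotate',
        params={'text': sentence, 'confidence': confidence},
        headers={'Accept': 'application/json'},
    )
    response.raise_for_status()
    resources = response.json().get('Resources', [])
    # Map each spotted surface form to its wiki resource URI.
    return {r['@surfaceForm']: r['@URI'] for r in resources}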
Example #7
def create_dependency_parser_feature_from_file(annotator, filepath):
    dependency_feature_data = []
    amrs = []
    sentence_ids = []
    
    with open(filepath + '.features', 'w', encoding='utf-8') as f:
        for i, amr in enumerate(AMRIO.read(filepath), 1):
            if i % 100 == 0:
                print('{} processed.'.format(i))

            annotation = annotator.annotate(amr.sentence)
            dump_amr_features(amr, annotation, f)
            sentence_data = create_dependency_parser_feature(annotation, amr.sentence, i)
            dependency_feature_data.append(sentence_data)
            sentence_ids.append(i)
            amrs.append(amr)

    # Flatten the per-sentence feature lists into one column per key.
    feature_keys = [
        'sentence_id', 'sequence', 'parent', 'parent_position', 'child',
        'child_position', 'is_root', 'parent_ner', 'child_ner', 'parent_pos',
        'dependency_role', 'child_pos'
    ]
    dataset_dict = {
        key: sum([sentence_data[key] for sentence_data in dependency_feature_data], [])
        for key in feature_keys
    }

    amr_dict = {
        'sentence_id': sentence_ids,
        'amr': [str(amr.graph) for amr in amrs]
    }

    dependency_feature_df = pd.DataFrame(dataset_dict)
    amr_df = pd.DataFrame(amr_dict)
    
    return dependency_feature_df, amr_df
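
A short usage sketch for the function above; the annotator argument is whatever annotation client the caller already has, the file paths are placeholders, and to_csv is standard pandas:

features_df, amr_df = create_dependency_parser_feature_from_file(annotator, 'data/train.txt')
features_df.to_csv('data/train.dependency_features.csv', index=False)
amr_df.to_csv('data/train.amr.csv', index=False)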
Example #8
    def load_data(self):
        logger.info("Parsing and linearizing the AMR dataset")

        train_amr = AMRIO.read(self.train_file)

        for i, amr in tqdm(enumerate(train_amr), desc='Train AMR'):
            # Raw version
            if self.small and i > 50:
                break

            raw_amr = []
            for amr_line in str(amr.graph).splitlines():
                stripped_amr = amr_line.strip()
                raw_amr.append(stripped_amr)
            self.X_train_raw.append(" ".join(raw_amr))

            linearized_amr = self.get_list(amr)

            self.X_train.append(linearized_amr[1:])
            self.Y_train.append(amr.sentence)
            self.Y_train_tok.append(amr.tokens)

            # Build vocabulary dictionaries and the simplified sequence
            simpl = list()
            simpl_only_nodes = list()
            for step in linearized_amr:
                if step not in self.lin_to_int:
                    self.lin_to_int[step] = len(self.lin_to_int)
                    self.int_to_lin[len(self.int_to_lin)] = step
                # Simplified AMR version
                step, edge = self.simplify(step)
                simpl.append(step)
                if not step.startswith(":"):
                    simpl_only_nodes.append(step)
                # Identify edges and save them
                if edge and step not in self.edges:
                    self.edges.append(step)

            self.X_train_simple.append(simpl)
            self.X_train_simple_only_nodes.append(simpl_only_nodes)

            sent = amr.sentence.split()
            for word in sent:
                if word not in self.word_to_int:
                    self.word_to_int[word] = len(self.word_to_int)
                    self.int_to_word[len(self.int_to_word)] = word

        if self.use_silver_data:
            print("Processing silver data from", self.silver_train_file)
            ii = 0

            silver_train_amr = AMRIO.read(self.silver_train_file)
            for i, amr in enumerate(silver_train_amr):
                if self.small and i > 50:
                    break

                # Raw version
                raw_amr = []
                ii += 1
                linearized_amr = self.get_list(amr)
                if linearized_amr is None:
                    continue

                for amr_line in str(amr.graph).splitlines():
                    stripped_amr = amr_line.strip()
                    raw_amr.append(stripped_amr)
                self.X_silver_train_raw.append(" ".join(raw_amr))

                self.X_silver_train.append(linearized_amr[1:])
                self.Y_silver_train.append(amr.sentence)
                self.Y_silver_train_tok.append(amr.tokens)

                # Build vocabulary dictionaries and the simplified sequence
                simpl = list()
                simpl_only_nodes = list()
                for step in linearized_amr:
                    if step not in self.lin_to_int:
                        self.lin_to_int[step] = len(self.lin_to_int)
                        self.int_to_lin[len(self.int_to_lin)] = step
                    # Simplified AMR version
                    step, edge = self.simplify(step)
                    simpl.append(step)
                    if not step.startswith(":"):
                        simpl_only_nodes.append(step)
                    # Identify edges and save them
                    if edge and step not in self.edges:
                        self.edges.append(step)

                self.X_silver_train_simple.append(simpl)
                self.X_silver_train_simple_only_nodes.append(simpl_only_nodes)

                sent = amr.sentence.split()
                for word in sent:
                    if word not in self.word_to_int:
                        self.word_to_int[word] = len(self.word_to_int)
                        self.int_to_word[len(self.int_to_word)] = word
            print("Silver data with size:", len(self.X_silver_train_raw))
        else:
            print("No silver data performed")

        dev_amr = AMRIO.read(self.dev_file)
        for i, amr in tqdm(enumerate(dev_amr), desc='Dev AMR'):
            if self.small and i > 50:
                break

            # Raw input
            raw_amr = []
            for amr_line in str(amr.graph).splitlines():
                stripped_amr = amr_line.strip()
                raw_amr.append(stripped_amr)
            self.X_dev_raw.append(" ".join(raw_amr))

            linearized_amr = self.get_list(amr)
            self.X_dev.append(linearized_amr[1:])
            self.Y_dev.append(amr.sentence)
            self.Y_dev_tok.append(amr.tokens)

            # Simplified AMR version
            simpl = list()
            simpl_only_nodes = list()
            for step in linearized_amr:
                step, edge = self.simplify(step)
                simpl.append(step)
                if not step.startswith(":"):
                    simpl_only_nodes.append(step)
                if edge and step not in self.edges:
                    self.edges.append(step)
            self.X_dev_simple.append(simpl)
            self.X_dev_simple_only_nodes.append(simpl_only_nodes)

        # Materialize the test set so self.amr_test is not left as an exhausted generator.
        test_amr = list(AMRIO.read(self.test_file))
        self.amr_test = test_amr
        for i, amr in tqdm(enumerate(test_amr), desc='Test AMR'):
            if self.small and i > 50:
                break

            # Raw version
            raw_amr = []
            for amr_line in str(amr.graph).splitlines():
                stripped_amr = amr_line.strip()
                raw_amr.append(stripped_amr)
            self.X_test_raw.append(" ".join(raw_amr))

            linearized_amr = self.get_list(amr)
            self.X_test.append(linearized_amr[1:])
            self.Y_test.append(amr.sentence)
            self.Y_test_tok.append(amr.tokens)

            # Simplified AMR version
            simpl = list()
            simpl_only_nodes = list()
            for step in linearized_amr:
                step, edge = self.simplify(step)
                simpl.append(step)
                if not step.startswith(":"):
                    simpl_only_nodes.append(step)

                if edge and step not in self.edges:
                    self.edges.append(step)
            self.X_test_simple.append(simpl)
            self.X_test_simple_only_nodes.append(simpl_only_nodes)
Example #9
 def expand_file(self, file_path):
     for i, amr in enumerate(AMRIO.read(file_path)):
         self.expand_graph(amr)
         yield amr
     self.print_stats()
Example #10

if __name__ == '__main__':
    import argparse

    from stog.data.dataset_readers.amr_parsing.io import AMRIO

    parser = argparse.ArgumentParser('feature_annotator.py')
    parser.add_argument('files', nargs='+', help='files to annotate.')
    parser.add_argument('--compound_file', default='')

    args = parser.parse_args()

    annotator = FeatureAnnotator('http://localhost:9000', args.compound_file)

    for file_path in args.files:
        logger.info('Processing {}'.format(file_path))
        for i, amr in enumerate(AMRIO.read(file_path), 1):
            if i % 1000 == 0:
                logger.info('{} processed.'.format(i))
            annotation = annotator(amr.sentence)
            amr.tokens = annotation['tokens']
            amr.lemmas = annotation['lemmas']
            amr.pos_tags = annotation['pos_tags']
            amr.ner_tags = annotation['ner_tags']
            amr.original = annotation['original']
            with open(file_path + '.features', 'a', encoding='utf-8') as f:
                AMRIO.dump([amr], f)

    logger.info('Done!')
Example #11
    parser = argparse.ArgumentParser("text_anonymizor.py")

    parser.add_argument('--amr_file', nargs="+", required=True, help="File to anonymize.")
    parser.add_argument('--util_dir')
    parser.add_argument('--lang')
    parser.add_argument('--exclude_ners', action="store_true", help="consider NER tags for entities not found in training.")
    args = parser.parse_args()


    if args.lang=="en":
        text_anonymizor = TextAnonymizor.from_json(os.path.join(args.util_dir,
            "text_anonymization_rules.json"))
        lang_stopwords=None
        lang2en_span=None
        lang2en_bn=None

    else:
        text_anonymizor = TextAnonymizor.from_json(os.path.join(args.util_dir,"text_anonymization_en-{}.json".format(args.lang)))
        lang_stopwords = set([x.rstrip() for x in open("data/cross-lingual-babelnet_mappings/stopwords_{}.txt".format(args.lang))])

        lang2en_span=load_name_span_map("data/cross-lingual-babelnet_mappings/name_span_en_{}_map_amr_bn.json".format(args.lang), args.lang)
        lang2en_bn=load_name_bn_wiki_map("data/cross-lingual-babelnet_mappings/namedEntity_wiki_synsets.{}.tsv".format(args.lang.upper()))

    for amr_file in args.amr_file:
        with open(amr_file + ".recategorize{}".format("_noner" if args.exclude_ners else ""), "w", encoding="utf-8") as f:
            for amr in tqdm(AMRIO.read(amr_file, lang=args.lang)):

                amr.abstract_map = text_anonymizor(amr)
                f.write(str(amr) + "\n\n")
Example #12
File: wikification.py Project: cl91/stog
 def wikify_file(self, file_path):
     for i, amr in enumerate(AMRIO.read(file_path)):
         self.wikify_graph(amr)
         yield amr
Example #13
        return cls(
            text_maps=d["text_maps"],
            priority_lists=d["priority_lists"],
            _VNE=d["VNE"],
            _LOCEN1=d["LOCEN1"],
            _LOCEN2=d["LOCEN2"],
            _N=d["N"],
            _M=d["M"],
            _R=d["R"],
            _INVP=d["INVP"],
            _INVS=d["INVS"],
        )


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser("text_anonymizor.py")

    parser.add_argument('--amr_file', required=True, help="File to anonymize.")
    parser.add_argument('--util_dir')
    args = parser.parse_args()

    text_anonymizor = TextAnonymizor.from_json(
        os.path.join(args.util_dir, "text_anonymization_rules.json"))

    with open(args.amr_file + ".recategorize", "w", encoding="utf-8") as f:
        for amr in AMRIO.read(args.amr_file):
            amr.abstract_map = text_anonymizor(amr)
            f.write(str(amr) + "\n\n")
Example #14
            if token == '911':
                index = i
                break
        else:
            break
        amr.replace_span([index], ['09', '11'], ['CD', 'CD'], ['DATE', 'DATE'])


def replace_NT_dollar_abbr(amr):
    # Replace 'NT' in front of '$' with 'Taiwan'.
    for i, token in enumerate(amr.tokens):
        if token == 'NT' and len(amr.tokens) > i + 1 and amr.tokens[i + 1] in (
                '$', 'dollars', 'dollar'):
            amr.replace_span([i], ['Taiwan'], ['NNP'], ['COUNTRY'])


if __name__ == '__main__':
    import argparse
    from stog.data.dataset_readers.amr_parsing.io import AMRIO

    parser = argparse.ArgumentParser('input_cleaner.py')
    parser.add_argument('--amr_files', nargs='+', default=[])

    args = parser.parse_args()

    for file_path in args.amr_files:
        with open(file_path + '.input_clean', 'w', encoding='utf-8') as f:
            for amr in AMRIO.read(file_path):
                clean(amr)
                f.write(str(amr) + '\n\n')
Example #15
 def read(self, file_path):
     for amr in AMRIO.read(file_path):
         yield self(amr)
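
This is the generic form of the pattern used in Examples #1, #9, and #12: an object transforms one AMR via __call__ and streams a whole file through itself. A minimal sketch of such a transform; the class name and behavior are illustrative only:

from stog.data.dataset_readers.amr_parsing.io import AMRIO

class LowercaseSentenceTransform:
    """Illustrative transform: lowercases each AMR's sentence."""

    def __call__(self, amr):
        amr.sentence = amr.sentence.lower()
        return amr

    def read(self, file_path):
        # Same pattern as Example #15: apply the transform to every AMR in the file.
        for amr in AMRIO.read(file_path):
            yield self(amr)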
Example #16
def dump_amr_features(amr, annotation, f):
    amr.tokens = annotation['tokens']
    amr.lemmas = annotation['lemmas']
    amr.pos_tags = annotation['pos_tags']
    amr.ner_tags = annotation['ner_tags']
    AMRIO.dump([amr], f)