示例#1
0
    def preprocess(self):
        """Parse the raw train/dev/test tree files, convert each tree to a
        DGL graph, and pickle one data list per split into ``output_dir``.

        Side effects:
            - ``self.types_vocab`` is built from the third line of
              ``vocabs.txt`` (tab-separated type names -> position id).
            - ``self.words_vocab`` maps the digit strings '0' and '1' to
              their integer values.
            - per-split stats are accumulated and saved at the end.
        """
        config = self.config
        input_dir = config.input_dir
        output_dir = config.output_dir

        # set file names
        file_names = {
            'train': 'train.txt',
            'validation': 'dev.txt',
            'test': 'test.txt'
        }

        # Only the third line (index 2) of vocabs.txt is needed; stream the
        # file instead of readlines() and stop as soon as it is found.
        with open(os.path.join(input_dir, 'vocabs.txt'), 'r') as f:
            for i, line in enumerate(f):
                if i == 2:
                    self.types_vocab = {
                        k: v
                        for v, k in enumerate(line.strip().split('\t'))
                    }
                    break

        self.words_vocab = {str(x): x for x in range(2)}

        # preprocessing trees
        for tag_name, fname in file_names.items():

            self.__init_stats__(tag_name)
            data_list = []

            with open(os.path.join(input_dir, fname), 'r') as f:
                # readlines() kept here so tqdm knows the total line count
                for l in tqdm(f.readlines(),
                              desc='Preprocessing {}'.format(tag_name)):
                    t = l.strip()
                    nltk_t = string_to_nltk_tree(t)
                    # Internal tags look like "<label>_<type>": y is the
                    # integer label, t the type id. Leaves carry only a
                    # word id x.
                    nx_t = nltk_tree_to_nx(
                        nltk_t,
                        get_internal_node_dict=lambda w: {
                            'x': ConstValues.NO_ELEMENT,
                            'y': int(w.strip().split('_')[0]),
                            't': self.__get_type_id__(w.strip().split('_')[1])
                        },
                        get_leaf_node_dict=lambda w: {
                            'x': self.__get_word_id__(w),
                            'y': ConstValues.NO_ELEMENT,
                            't': ConstValues.NO_ELEMENT
                        })

                    self.__update_stats__(tag_name, nx_t)
                    data_list.append(self.__nx_to_dgl__(nx_t))

            self.__print_stats__(tag_name)
            to_pkl_file(data_list,
                        os.path.join(output_dir, '{}.pkl'.format(tag_name)))

        # save all stats
        self.__save_stats__()
示例#2
0
    def preprocess(self):
        """Merge the per-tree-type pickles of every split, transform each
        aligned tree tuple, and dump (dgl_tree, coarse_label, fine_label)
        triples to ``output_dir``. Stats and word embeddings are saved last.
        """
        config = self.config
        input_dir = config.input_dir
        output_dir = config.output_dir

        tree_type = config.preprocessor_config.tree_type

        # one input pickle per tree type, for every split
        file_names = {
            'train': ['train_{}.pkl'.format(x) for x in tree_type],
            'validation': ['validation_{}.pkl'.format(x) for x in tree_type],
            'test': ['test_{}.pkl'.format(x) for x in tree_type]
        }

        # preprocessing trees
        for tag_name, f_list in file_names.items():
            per_type_lists = [from_pkl_file(os.path.join(input_dir, f))
                              for f in f_list]

            # align the i-th entry of every tree-type list; labels are the
            # same across types, so read them from the first entry
            parsed_trees = []
            for entries in zip(*per_type_lists):
                parsed_trees.append({
                    'tree': tuple(e['tree'] for e in entries),
                    'coarse_label': entries[0]['coarse_label'],
                    'fine_label': entries[0]['fine_label']
                })

            self.__init_stats__(tag_name)

            data_list = []
            for item in tqdm(parsed_trees,
                             desc='Preprocessing {}'.format(tag_name)):
                merged_t = self.tree_transformer.transform(*item['tree'])

                self.__assign_node_features__(merged_t)
                self.__update_stats__(tag_name, merged_t)

                data_list.append((self.__nx_to_dgl__(merged_t),
                                  item['coarse_label'],
                                  item['fine_label']))

            self.__print_stats__(tag_name)
            to_pkl_file(data_list,
                        os.path.join(output_dir, '{}.pkl'.format(tag_name)))

        # save all stats
        self.__save_stats__()
        self.__save_word_embeddings__()
    def preprocess(self):
        """Build sentiment-labelled DGL trees for each split.

        The per-tree-type pickles are zipped together, transformed, and a
        tree is kept only when its root received a sentiment label (a
        missing root label means neutral). Stats and word embeddings are
        written at the end.
        """
        config = self.config
        input_dir = config.input_dir
        output_dir = config.output_dir
        tree_type = config.preprocessor_config.tree_type
        output_type = self.__get_output_type()

        # set file names
        file_names = {
            'train': ['train_{}.pkl'.format(x) for x in tree_type],
            'validation': ['validation_{}.pkl'.format(x) for x in tree_type],
            'test': ['test_{}.pkl'.format(x) for x in tree_type]
        }

        # load sentiment map
        eprint('Loading sentiment map.')
        sentiment_map = from_pkl_file(
            os.path.join(input_dir, 'sentiment_map.pkl'))

        # preprocessing trees
        for tag_name, f_list in file_names.items():
            per_type = [from_pkl_file(os.path.join(input_dir, f))
                        for f in f_list]
            aligned_trees = list(zip(*per_type))

            self.__init_stats__(tag_name)

            tree_list = []
            for trees in tqdm(aligned_trees,
                              desc='Preprocessing {}'.format(tag_name)):
                merged = self.tree_transformer.transform(*trees)
                # keep only when the root got a label (missing label means
                # neutral)
                if self.__assign_node_features__(merged, sentiment_map,
                                                 output_type):
                    self.__update_stats__(tag_name, merged)
                    tree_list.append(self.__nx_to_dgl__(merged))

            self.__print_stats__(tag_name)
            to_pkl_file(tree_list,
                        os.path.join(output_dir, '{}.pkl'.format(tag_name)))

        # save all stats
        self.__save_stats__()

        # compute and save word embeddings
        self.__save_word_embeddings__()
    def __save_word_embeddings__(self):
        """Load pretrained embeddings for the collected vocabularies and
        pickle them into the output directory.

        Word embeddings are always exported; type embeddings only when
        ``type_pretrained_embs_file`` is present in the config.
        """
        out_dir = self.config.output_dir

        eprint('Loading word embeddings.')
        word_embs = load_embeddings(self.config.pretrained_embs_file,
                                    self.words_vocab,
                                    embedding_dim=self.config.embedding_dim)
        to_pkl_file(word_embs, os.path.join(out_dir, 'pretrained_embs.pkl'))

        # type embeddings are optional
        if 'type_pretrained_embs_file' in self.config:
            eprint('Loading type embeddings.')
            type_embs = load_embeddings(
                self.config.type_pretrained_embs_file,
                self.types_vocab,
                embedding_dim=self.config.type_embedding_dim)
            to_pkl_file(type_embs,
                        os.path.join(out_dir, 'type_pretrained_embs.pkl'))
    def preprocess(self):
        """Parse labelled trees from the train/dev/test text files and
        pickle their DGL conversions, one list per split.

        Each input line is ``<label>\\t<tree>``. After intermediate results
        are assigned, the root label of every multi-node tree must equal
        the line label; a mismatch raises ``ValueError``.
        """
        config = self.config
        input_dir = config.input_dir
        output_dir = config.output_dir

        # set file names
        file_names = {'train': 'train.txt',
                      'validation': 'dev.txt',
                      'test': 'test.txt'}

        # words are the digit strings '0'..'9'; id == int value
        self.words_vocab = {str(x): x for x in range(10)}

        # preprocessing trees
        for tag_name, fname in file_names.items():

            self.__init_stats__(tag_name)
            data_list = []
            with open(os.path.join(input_dir, fname), 'r') as f:
                for l in tqdm(f.readlines(), desc='Preprocessing {}'.format(fname)):
                    ris, a = l.strip().split('\t')
                    # a bare token is not a parsable tree: wrap it in parens
                    if a[0] != '(':
                        a = '(' + a + ')'

                    nx_a = nltk_tree_to_nx(
                        string_to_nltk_tree(a),
                        get_internal_node_dict=lambda w: {
                            'x': ConstValues.NO_ELEMENT,
                            'y': ConstValues.NO_ELEMENT,
                            't': self.__get_type_id__(w.strip())},
                        get_leaf_node_dict=lambda w: {
                            'x': self.__get_word_id__(w.strip()),
                            'y': ConstValues.NO_ELEMENT,
                            't': ConstValues.NO_ELEMENT})
                    self.__add_intermediate_results__(nx_a)
                    # Explicit validation instead of `assert`, which is
                    # silently stripped under `python -O`; the original
                    # double-negative condition is also easier to read
                    # this way.
                    if (nx_a.number_of_nodes() > 1
                            and nx_a.nodes[0]['y'] != int(ris)):
                        raise ValueError(
                            'root label {} does not match line label {}'
                            .format(nx_a.nodes[0]['y'], ris))
                    self.__update_stats__(tag_name, nx_a)
                    data_list.append(self.__nx_to_dgl__(nx_a))

            self.__print_stats__(tag_name)
            to_pkl_file(data_list, os.path.join(output_dir, '{}.pkl'.format(tag_name)))

        # save all stats
        self.__save_stats__()
示例#6
0
                        for x in parser.tokenize(txt, tokenizer_prop)
                    ]
                else:
                    tok_list = [txt.lower()]

                # NOTE(review): fragment starts mid-function. Token tuples
                # key the sentiment map (lists are unhashable).
                tok_set = tuple(tok_list)

                if tok_set not in sentiment_map:
                    sentiment_map[tok_set] = id2sentiment[id]
                else:
                    # an existing neutral entry may be overwritten by a
                    # later non-neutral sentiment for the same token tuple
                    if sentiment_map[tok_set] == 2:  # neutral
                        sentiment_map[tok_set] = id2sentiment[id]

        # store sentiment map
        eprint('Saving sentiment map.')
        to_pkl_file(sentiment_map, sentiment_map_out_file)

    for f_name in fnames:
        # one output pickle per tree representation, derived from the
        # input file name
        rf_name = os.path.join(input_dir, f_name)
        wf_dep_name = os.path.join(output_dir,
                                   f_name.replace('.txt', '_dep.pkl'))
        wf_const_name = os.path.join(output_dir,
                                     f_name.replace('.txt', '_const.pkl'))
        wf_bin_const_name = os.path.join(
            output_dir, f_name.replace('.txt', '_bin_const.pkl'))

        # re-parse only when at least one of the three outputs is missing
        if not path_exists_with_message(
                wf_dep_name) or not path_exists_with_message(
                    wf_const_name) or not path_exists_with_message(
                        wf_bin_const_name):
            ris = {'dep': [], 'const': [], 'bin_const': []}
示例#7
0
                              desc='Buildiing trees from {}: '.format(f_name)):
                    # optionally skip a header line
                    if skip_first_line:
                        skip_first_line = False
                        continue

                    # tab-separated row; fields used here:
                    # v[1]=sentence A, v[2]=sentence B,
                    # v[3]=relatedness score, v[4]=entailment judgment
                    v = l.strip().split('\t')
                    sent_a = v[1]
                    sent_b = v[2]
                    rel_score = float(v[3])
                    ent_judgment = entailment_vocab[v[4]]

                    # raw_parse yields exactly one mapping; keys presumably
                    # name the tree representations ('dep', 'const', ...)
                    # -- verify against the parser implementation
                    ris_a, = parser.raw_parse(sent_a)
                    ris_b, = parser.raw_parse(sent_b)
                    for k in ris_a:
                        ris[k].append({
                            'tree_a': ris_a[k],
                            'tree_b': ris_b[k],
                            'relatedness': rel_score,
                            'entailment': ent_judgment
                        })

            eprint('Saving parsed trees.')
            to_pkl_file(ris['dep'], wf_dep_name)
            to_pkl_file(ris['const'], wf_const_name)
            to_pkl_file(ris['bin_const'], wf_bin_const_name)

    words_vocab_file = os.path.join(output_dir, 'words_vocab.pkl')
    if not path_exists_with_message(words_vocab_file):
        eprint('Store word vocabulary.')
        to_pkl_file(parser.words_vocab, words_vocab_file)
示例#8
0
    def preprocess(self):
        """Parse entailment examples (label, tree A, tree B per line) from
        the numbered train/dev/test files and pickle
        (dgl_tree_a, dgl_tree_b, output_id) triples, one list per split.
        """
        config = self.config
        input_dir = config.input_dir
        output_dir = config.output_dir
        train_max_num_ops = config.max_num_ops_in_training

        # set file names
        file_names = {
            'train':
            ['train{}'.format(x) for x in range(train_max_num_ops + 1)],
            'validation': ['dev{}'.format(x) for x in range(13)],
            'test': ['test{}'.format(x) for x in range(13)]
        }

        # The node-attribute builders are identical for both trees of a
        # pair; define them once instead of duplicating the lambdas at
        # every call site.
        def internal_node_dict(w):
            # internal nodes carry only a type id
            return {'x': ConstValues.NO_ELEMENT,
                    'y': ConstValues.NO_ELEMENT,
                    't': self.__get_type_id__(w.strip())}

        def leaf_node_dict(w):
            # leaves carry only a word id
            return {'x': self.__get_word_id__(w.strip()),
                    'y': ConstValues.NO_ELEMENT,
                    't': ConstValues.NO_ELEMENT}

        # preprocessing trees
        for tag_name, fname_list in file_names.items():

            self.__init_stats__(tag_name)
            data_list = []

            for fname in fname_list:
                with open(os.path.join(input_dir, fname), 'r') as f:
                    for l in tqdm(f.readlines(),
                                  desc='Preprocessing {}'.format(fname)):
                        entail, a, b = l.strip().split('\t')

                        # bare tokens are wrapped so the parser sees a tree
                        if a[0] != '(':
                            a = '(' + a + ')'
                        if b[0] != '(':
                            b = '(' + b + ')'

                        ax, _ = parse_string_tree(a, 0)
                        bx, _ = parse_string_tree(b, 0)
                        nx_a = nltk_tree_to_nx(
                            ax,
                            get_internal_node_dict=internal_node_dict,
                            get_leaf_node_dict=leaf_node_dict)
                        nx_b = nltk_tree_to_nx(
                            bx,
                            get_internal_node_dict=internal_node_dict,
                            get_leaf_node_dict=leaf_node_dict)

                        self.__update_stats__(tag_name, nx_a)
                        self.__update_stats__(tag_name, nx_b)
                        data_list.append((self.__nx_to_dgl__(nx_a),
                                          self.__nx_to_dgl__(nx_b),
                                          self.__get_output_id__(entail)))

            self.__print_stats__(tag_name)
            to_pkl_file(data_list,
                        os.path.join(output_dir, '{}.pkl'.format(tag_name)))

        # save all stats
        self.__save_stats__()
                    sent = v[1]
                    # the coarse label is the fine label's prefix before ':'
                    coarse_label = fine_label.split(':')[0]

                    # grow both vocabularies on the fly; id = insertion order
                    coarse_label_id = coarse_label_vocab.setdefault(
                        coarse_label, len(coarse_label_vocab))
                    fine_label_id = fine_label_vocab.setdefault(
                        fine_label, len(fine_label_vocab))

                    # raw_parse yields exactly one mapping; keys presumably
                    # name tree representations -- verify against the parser
                    ris, = parser.raw_parse(sent)
                    for kk in ris:
                        parsed_trees[kk].append({
                            'tree': ris[kk],
                            'coarse_label': coarse_label_id,
                            'fine_label': fine_label_id
                        })
            to_pkl_file(parsed_trees, out_file)

            # save words vocab file
            eprint('Store word vocabulary.')
            words_vocab_file = os.path.join(output_dir, 'words_vocab.pkl')
            to_pkl_file(parser.words_vocab, words_vocab_file)

            # store label vocabs
            eprint('Store label vocabulary.')
            to_json_file(coarse_label_vocab,
                         os.path.join(output_dir, 'coarse_vocab.json'))
            to_json_file(fine_label_vocab,
                         os.path.join(output_dir, 'fine_vocab.json'))
        else:
            # outputs already exist: reuse the cached parse
            parsed_trees = from_pkl_file(out_file)