Example #1
    def build(self,
              training_data_path,
              file_columns,
              input_types,
              file_with_col_header,
              answer_column_name,
              word2vec_path=None,
              word_emb_dim=None,
              format=None,
              file_type=None,
              involve_all_words=None,
              file_format="tsv",
              show_progress=True,
              cpu_num_workers=-1,
              max_vocabulary=800000,
              word_frequency=3):
        """

        Args:
            training_data_path:
            file_columns: {
                  "word1": 0,
                  "word2": 1,
                  "label":   2,
                  "postag_feature1": 3,
                  "postag_feature2": 4
                },
            input_types:
                e.g.
                {
                  "word": {
                    "cols": ["word1", "word2"],
                    "dim": 300
                  },
                  "postag": {
                    "cols": ["postag_feature1", "postag_feature2"],
                    "dim": 20
                  },
                }
                or
                {
                  "bpe": {
                    "cols": ["word1", "word2"],
                    "dim": 100
                    "bpe_path": "xxx.bpe"
                  }
                }

            word2vec_path: path to a pre-trained word embedding file.
            word_emb_dim: dimension of the pre-trained word embeddings.
            involve_all_words: if True, add every word that appears in the pre-trained embedding to the vocabulary.
            file_format: "tsv" or "json". Note that "json" means each sample is represented by a JSON string.

        Returns:
            word_emb_matrix: a numpy array of shape (vocabulary size, word_emb_dim) if word2vec_path is given, otherwise None.
        """
        if 'bpe' in input_types:
            try:
                bpe_encoder = BPEEncoder(input_types['bpe']['bpe_path'])
            except KeyError:
                raise Exception(
                    'Please define a bpe path at the embedding layer.')
        else:
            bpe_encoder = None

        self.file_column_num = len(file_columns)
        with open(training_data_path, "r", encoding='utf-8') as f:
            progress = self.get_data_list_from_file(f, file_with_col_header)
            docs, target_docs, cnt_legal, cnt_illegal = self.build_training_multi_processor(
                progress,
                cpu_num_workers,
                file_columns,
                input_types,
                answer_column_name,
                bpe_encoder=bpe_encoder)

        logging.info("Corpus imported: %d legal lines, %d illegal lines." %
                     (cnt_legal, cnt_illegal))

        if word2vec_path and involve_all_words is True:
            logging.info("Getting pre-trained embeddings...")
            word_emb_dict = load_embedding(word2vec_path,
                                           word_emb_dim,
                                           format,
                                           file_type,
                                           with_head=False,
                                           word_set=None)
            self.input_dicts['word'].build(
                [list(word_emb_dict.keys())],
                max_vocabulary_num=len(word_emb_dict),
                threshold=0)
        # build a vocabulary for every input type
        for input_type in input_types:
            self.input_dicts[input_type].build(
                docs[input_type],
                max_vocabulary_num=max_vocabulary,
                threshold=word_frequency)
            logging.info("%d types in %s" %
                         (self.input_dicts[input_type].cell_num(), input_type))
        # build the target vocabulary; no output dictionary is built for regression or mrc
        if ProblemTypes[self.problem_type] == ProblemTypes.classification or \
                ProblemTypes[self.problem_type] == ProblemTypes.sequence_tagging:
            self.output_dict.build(list(target_docs.values())[0], threshold=0)
        elif ProblemTypes[self.problem_type] == ProblemTypes.regression or \
                ProblemTypes[self.problem_type] == ProblemTypes.mrc:
            pass

        if self.output_dict:
            logging.info("%d types in target" % (self.output_dict.cell_num()))

        logging.debug("Cell dict built")

        if word2vec_path:
            if not involve_all_words:
                logging.info("Getting pre-trained embeddings...")
                word_emb_dict = load_embedding(
                    word2vec_path,
                    word_emb_dim,
                    format,
                    file_type,
                    with_head=False,
                    word_set=self.input_dicts['word'].cell_id_map.keys())

            # peek at one vector to find the dimension of the loaded embeddings
            loaded_emb_dim = len(next(iter(word_emb_dict.values())))

            assert loaded_emb_dim == word_emb_dim, "The dimension of defined word embedding is inconsistent with the pretrained embedding provided!"

            if self.input_dicts['word'].with_unk:
                word_emb_dict['<unk>'] = np.random.random(size=word_emb_dim)
            if self.input_dicts['word'].with_pad:
                word_emb_dict['<pad>'] = np.random.random(size=word_emb_dim)

            word_emb_matrix = []
            unknown_word_count = 0
            for i in range(self.input_dicts['word'].cell_num()):
                if self.input_dicts['word'].id_cell_map[i] in word_emb_dict:
                    word_emb_matrix.append(
                        word_emb_dict[self.input_dicts['word'].id_cell_map[i]])
                else:
                    word_emb_matrix.append(word_emb_dict['<unk>'])
                    unknown_word_count += 1
            word_emb_matrix = np.array(word_emb_matrix)
            logging.info(
                "word embedding matrix shape:(%d, %d); unknown word count: %d;"
                % (len(word_emb_matrix), len(
                    word_emb_matrix[0]), unknown_word_count))
            logging.info("Word embedding loaded")
        else:
            word_emb_matrix = None
        return word_emb_matrix
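
A minimal usage sketch for the build method above. The `problem` object, the "train.tsv" file name, and the exact expected form of answer_column_name are assumptions for illustration, not taken from the source.

# hypothetical configuration matching the docstring examples
file_columns = {"word1": 0, "word2": 1, "label": 2}               # column name -> column index
input_types = {"word": {"cols": ["word1", "word2"], "dim": 300}}  # embed the two word columns

# `problem` stands for a hypothetical instance of the class that defines build()
word_emb_matrix = problem.build(
    training_data_path="train.tsv",   # hypothetical path
    file_columns=file_columns,
    input_types=input_types,
    file_with_col_header=False,
    answer_column_name="label",       # assumed form; the real implementation may expect a list
    word2vec_path=None,               # without pre-trained embeddings, build returns None
    file_format="tsv")
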
Example #2
    def encode(self,
               data_path,
               file_columns,
               input_types,
               file_with_col_header,
               object_inputs,
               answer_column_name,
               min_sentence_len,
               extra_feature,
               max_lengths=None,
               fixed_lengths=None,
               file_format="tsv",
               show_progress=True,
               cpu_num_workers=-1):
        """

        Args:
            data_path:
            file_columns: {
                  "word1": 0,
                  "word2": 1,
                  "label":   2,
                  "postag_feature1": 3,
                  "postag_feature2": 4
                },
            input_types:
                {
                  "word": {
                    "cols": [
                      "word1",
                      "word2"
                    ],
                    "dim": 300
                  },
                  "postag": {
                    "cols": ["postag_feature1", "postag_feature2"],
                    "dim": 20
                  }
                }
                or
                {
                  "bpe": {
                    "cols": ["word1", "word2"],
                    "dim": 100
                    "bpe_path": "xxx.bpe"
                  }
                }
            object_inputs: {
              "string1": [
                "word1",
                "postag_feature1"
              ],
              "string2": [
                "word2",
                "postag_feature2"
              ]
            },
            answer_column_name: 'label' or None. None means there is no target column and the data is used for prediction only.
            max_lengths: if given as a dict, sequences longer than the specified maximum are truncated first, and all sequences are then padded to the length of the longest remaining one (see the padding sketch after this example).
                {
                    "string1": 25,
                    "string2": 100
                }
            fixed_lengths: if given as a dict, sequences are truncated or padded to exactly the specified lengths.
                {
                    "string1": 25,
                    "string2": 100
                }
            file_format: "tsv" or "json". Note that "json" means each sample is represented by a JSON string.

        Returns:
            data: padded index sequences
                {
                  'string1': {
                    'word1': [...],
                    'postag_feature1': [...]
                  },
                  'string2': {
                    'word2': [...],
                    'postag_feature2': [...]
                  }
                }
            lengths: real lengths of the sequences
                {
                  'string1': [...],
                  'string2': [...]
                }
            target: [...]

        """
        if 'bpe' in input_types:
            try:
                bpe_encoder = BPEEncoder(input_types['bpe']['bpe_path'])
            except KeyError:
                raise Exception(
                    'Please define a bpe path at the embedding layer.')
        else:
            bpe_encoder = None

        with open(data_path, 'r', encoding='utf-8') as fin:
            progress = self.get_data_list_from_file(fin, file_with_col_header)
            data, lengths, target, cnt_legal, cnt_illegal = self.encode_data_multi_processor(
                progress,
                cpu_num_workers,
                file_columns,
                input_types,
                object_inputs,
                answer_column_name,
                min_sentence_len,
                extra_feature,
                max_lengths,
                fixed_lengths,
                file_format,
                bpe_encoder=bpe_encoder)
        logging.info("%s: %d legal samples, %d illegal samples" %
                     (data_path, cnt_legal, cnt_illegal))
        return data, lengths, target
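
The max_lengths / fixed_lengths behaviour documented above can be made concrete with a small self-contained sketch of the documented semantics; this is a toy re-implementation for illustration, not the library's own padding code, and the pad index 0 is an assumption.

# Toy illustration of the documented padding semantics (not library code).
def pad_batch(sequences, max_length=None, fixed_length=None, pad_id=0):
    if fixed_length is not None:
        # cut or pad every sequence to exactly fixed_length
        return [seq[:fixed_length] + [pad_id] * (fixed_length - len(seq))
                for seq in sequences]
    if max_length is not None:
        # first truncate sequences that exceed max_length ...
        sequences = [seq[:max_length] for seq in sequences]
    # ... then pad everything to the length of the longest remaining sequence
    longest = max(len(seq) for seq in sequences)
    return [seq + [pad_id] * (longest - len(seq)) for seq in sequences]

batch = [[4, 8, 15, 16, 23, 42], [7, 7]]
print(pad_batch(batch, max_length=4))    # [[4, 8, 15, 16], [7, 7, 0, 0]]
print(pad_batch(batch, fixed_length=3))  # [[4, 8, 15], [7, 7, 0]]
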
Example #3
    def build(self,
              data_path_list,
              file_columns,
              input_types,
              file_with_col_header,
              answer_column_name,
              word2vec_path=None,
              word_emb_dim=None,
              format=None,
              file_type=None,
              involve_all_words=None,
              file_format="tsv",
              show_progress=True,
              cpu_num_workers=-1,
              max_vocabulary=800000,
              word_frequency=3):
        """

        Args:
            training_data_path:
            file_columns: {
                  "word1": 0,
                  "word2": 1,
                  "label":   2,
                  "postag_feature1": 3,
                  "postag_feature2": 4
                },
            input_types:
                e.g.
                {
                  "word": {
                    "cols": ["word1", "word2"],
                    "dim": 300
                  },
                  "postag": {
                    "cols": ["postag_feature1", "postag_feature2"],
                    "dim": 20
                  },
                }
                or
                {
                  "bpe": {
                    "cols": ["word1", "word2"],
                    "dim": 100
                    "bpe_path": "xxx.bpe"
                  }
                }

            word2vec_path: path to a pre-trained word embedding file.
            word_emb_dim: dimension of the pre-trained word embeddings.
            involve_all_words: if True, add every word that appears in the pre-trained embedding to the vocabulary.
            file_format: "tsv" or "json". Note that "json" means each sample is represented by a JSON string.

        Returns:
            word_emb_matrix: a numpy array of shape (vocabulary size, word_emb_dim) if word2vec_path is given, otherwise None (the out-of-vocabulary handling is illustrated after this example).
        """
        # parameter check
        if not word2vec_path:
            word_emb_dim, format, file_type, involve_all_words = None, None, None, None

        if 'bpe' in input_types:
            try:
                bpe_encoder = BPEEncoder(input_types['bpe']['bpe_path'])
            except KeyError:
                raise Exception(
                    'Please define a bpe path at the embedding layer.')
        else:
            bpe_encoder = None

        self.file_column_num = len(file_columns)
        progress = self.get_data_generator_from_file(data_path_list,
                                                     file_with_col_header)
        preprocessed_data_generator = self.build_training_multi_processor(
            progress,
            cpu_num_workers,
            file_columns,
            input_types,
            answer_column_name,
            bpe_encoder=bpe_encoder)

        # update symbol universe
        total_cnt_legal, total_cnt_illegal = 0, 0
        for docs, target_docs, cnt_legal, cnt_illegal in tqdm(
                preprocessed_data_generator):
            total_cnt_legal += cnt_legal
            total_cnt_illegal += cnt_illegal

            # input_type
            for input_type in input_types:
                self.input_dicts[input_type].update(docs[input_type])

            # problem_type
            if ProblemTypes[self.problem_type] == ProblemTypes.classification or \
                ProblemTypes[self.problem_type] == ProblemTypes.sequence_tagging:
                self.output_dict.update(list(target_docs.values())[0])
            elif ProblemTypes[self.problem_type] == ProblemTypes.regression or \
                    ProblemTypes[self.problem_type] == ProblemTypes.mrc:
                pass
        logging.info("Corpus imported: %d legal lines, %d illegal lines." %
                     (total_cnt_legal, total_cnt_illegal))

        # build dictionary
        for input_type in input_types:
            self.input_dicts[input_type].build(
                threshold=word_frequency, max_vocabulary_num=max_vocabulary)
            logging.info("%d types in %s column" %
                         (self.input_dicts[input_type].cell_num(), input_type))
        if self.output_dict:
            self.output_dict.build(threshold=0)
            logging.info("%d types in target column" %
                         (self.output_dict.cell_num()))
        logging.debug("training data dict built")

        # embedding
        word_emb_matrix = None
        if word2vec_path:
            logging.info("Getting pre-trained embeddings...")
            word_emb_dict = None
            if involve_all_words is True:
                word_emb_dict = load_embedding(word2vec_path,
                                               word_emb_dim,
                                               format,
                                               file_type,
                                               with_head=False,
                                               word_set=None)
                self.input_dicts['word'].update([list(word_emb_dict.keys())])
                self.input_dicts['word'].build(
                    threshold=0, max_vocabulary_num=len(word_emb_dict))
            else:
                extend_vocabulary = set()
                for single_word in self.input_dicts['word'].cell_id_map.keys():
                    extend_vocabulary.add(single_word)
                    if single_word.lower() != single_word:
                        extend_vocabulary.add(single_word.lower())
                word_emb_dict = load_embedding(word2vec_path,
                                               word_emb_dim,
                                               format,
                                               file_type,
                                               with_head=False,
                                               word_set=extend_vocabulary)

            # peek at one vector to find the dimension of the loaded embeddings
            loaded_emb_dim = len(next(iter(word_emb_dict.values())))

            assert loaded_emb_dim == word_emb_dim, "The dimension of defined word embedding is inconsistent with the pretrained embedding provided!"

            logging.info("constructing embedding table")
            if self.input_dicts['word'].with_unk:
                word_emb_dict['<unk>'] = np.random.random(size=word_emb_dim)
            if self.input_dicts['word'].with_pad:
                word_emb_dict['<pad>'] = np.random.random(size=word_emb_dim)

            word_emb_matrix = []
            unknown_word_count = 0
            scale = np.sqrt(3.0 / word_emb_dim)
            for i in range(self.input_dicts['word'].cell_num()):
                single_word = self.input_dicts['word'].id_cell_map[i]
                if single_word in word_emb_dict:
                    word_emb_matrix.append(word_emb_dict[single_word])
                elif single_word.lower() in word_emb_dict:
                    word_emb_matrix.append(word_emb_dict[single_word.lower()])
                else:
                    word_emb_matrix.append(
                        np.random.uniform(-scale, scale, word_emb_dim))
                    unknown_word_count += 1
            word_emb_matrix = np.array(word_emb_matrix)
            logging.info(
                "word embedding matrix shape:(%d, %d); unknown word count: %d;"
                % (len(word_emb_matrix), len(
                    word_emb_matrix[0]), unknown_word_count))
            logging.info("Word embedding loaded")

        return word_emb_matrix
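
The out-of-vocabulary fallback in the embedding loop above (exact match, then lowercase match, then random initialization with scale sqrt(3 / word_emb_dim)) can be exercised on its own; the tiny embedding dict and vocabulary below are made-up illustration data.

import numpy as np

# made-up toy data: two known vectors and a three-word vocabulary (id -> word)
word_emb_dim = 4
word_emb_dict = {"apple": np.ones(word_emb_dim), "banana": np.zeros(word_emb_dim)}
vocabulary = ["Apple", "banana", "cherry"]

scale = np.sqrt(3.0 / word_emb_dim)
rows, unknown_word_count = [], 0
for word in vocabulary:
    if word in word_emb_dict:                 # exact match
        rows.append(word_emb_dict[word])
    elif word.lower() in word_emb_dict:       # back off to the lowercase form
        rows.append(word_emb_dict[word.lower()])
    else:                                     # truly unknown: random uniform init
        rows.append(np.random.uniform(-scale, scale, word_emb_dim))
        unknown_word_count += 1

word_emb_matrix = np.array(rows)
print(word_emb_matrix.shape, unknown_word_count)   # (3, 4) 1
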