Example No. 1
    def build(self, **kwargs):
        self.embedding_type = 'word2vec'
        if self.name in self.URL_MAP:
            # a known model name maps to a download URL; the cached
            # archive carries a .bz2 suffix
            url = self.URL_MAP.get(self.name)
            self.name = self.name + '.bz2'
        else:
            url = None

        self.model_path = helper.cached_path(self.name,
                                             url,
                                             sub_folders=['embedding', 'word2vec'])

        self.keyed_vector: KeyedVectors = KeyedVectors.load_word2vec_format(self.model_path, **kwargs)
        self.embedding_size = self.keyed_vector.vector_size

        word2idx = self.base_dict.copy()
        for word in self.keyed_vector.index2entity:
            word2idx[word] = len(word2idx)
        self.token2idx = word2idx

        input_layer = Input(shape=(self.sequence_length,), dtype='int32')
        embedding_matrix = self.get_embedding_matrix()

        current = Embedding(self.token_count,
                            self.embedding_size,
                            input_length=self.sequence_length,
                            weights=[embedding_matrix],
                            trainable=False)(input_layer)
        self._model = Model(input_layer, current)
        logging.debug('------------------------------------------------')
        logging.debug('Loaded gensim word2vec model')
        logging.debug('model        : {}'.format(self.model_path))
        logging.debug('word count   : {}'.format(len(self.keyed_vector.index2entity)))
        logging.debug('Top 50 word  : {}'.format(self.keyed_vector.index2entity[:50]))
        logging.debug('------------------------------------------------')
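For context, here is a minimal, self-contained sketch of the same pattern outside the class: load pre-trained vectors with gensim and freeze them in a Keras Embedding layer. The file path and sequence length are placeholders, and index2entity is the gensim 3.x attribute used above (renamed index_to_key in gensim 4).

    # A hedged sketch, not the library's code; 'vectors.txt' is a placeholder.
    import numpy as np
    from gensim.models import KeyedVectors
    from keras.layers import Input, Embedding
    from keras.models import Model

    SEQ_LEN = 50
    kv = KeyedVectors.load_word2vec_format('vectors.txt')
    matrix = np.zeros((len(kv.index2entity) + 1, kv.vector_size))
    for idx, word in enumerate(kv.index2entity):
        matrix[idx + 1] = kv[word]          # row 0 is reserved for padding

    inputs = Input(shape=(SEQ_LEN,), dtype='int32')
    outputs = Embedding(matrix.shape[0], kv.vector_size,
                        weights=[matrix], trainable=False)(inputs)
    frozen_embedding = Model(inputs, outputs)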
Example No. 2
    def build(self):
        self.embedding_type = 'bert'
        model_name = self.model_key_map.get(self.name, self.name)
        url = self.pre_trained_models.get(model_name)
        self.model_path = helper.cached_path(model_name,
                                             url,
                                             sub_folders=['embedding', 'bert'])

        config_path = os.path.join(self.model_path, 'bert_config.json')
        check_point_path = os.path.join(self.model_path, 'bert_model.ckpt')
        logging.info('loading bert model from {}\n'.format(self.model_path))
        model = keras_bert.load_trained_model_from_checkpoint(config_path,
                                                              check_point_path,
                                                              seq_len=self.sequence_length)
        output_layer = helper.NonMaskingLayer()(model.output)
        self._model = Model(model.inputs, output_layer)

        self.embedding_size = self._model.output_shape[-1]
        dict_path = os.path.join(self.model_path, 'vocab.txt')
        word2idx = {}
        with open(dict_path, 'r', encoding='utf-8') as f:
            words = f.read().splitlines()
        for word in words:
            word2idx[word] = len(word2idx)
        for key, value in self.special_tokens.items():
            # alias framework-level special tokens to existing BERT vocab entries
            word2idx[key] = word2idx[value]

        self.token2idx = word2idx
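A keras_bert model built this way takes two inputs, token ids and segment ids. A hedged usage sketch, assuming sequence_length == 50 and an already-built instance named embedding (both illustrative):

    import numpy as np

    token_ids = np.array([[101, 2023, 2003, 102] + [0] * 46])  # illustrative ids
    segment_ids = np.zeros_like(token_ids)                      # single-segment input
    vectors = embedding._model.predict([token_ids, segment_ids])
    # vectors.shape == (1, sequence_length, embedding_size)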
Example No. 3
    def get_info(cls):
        folder_path = helper.cached_path(
            cls.__corpus_name__,
            cls.__zip_file__name,
        )
        logging.info("""{} info\n    dataset path: {}\n{}""".format(
            cls.__corpus_name__, folder_path, cls.__desc__))
Example No. 4
    def get_sequence_tagging_data(
            cls,
            data_type: str = DATA_TRAIN,
            task_name: str = 'ner',
            shuffle: bool = True,
            max_count: int = 0) -> Tuple[List[List[str]], List[List[str]]]:
        folder_path = helper.cached_path(cls.__corpus_name__,
                                         cls.__zip_file__name)

        if data_type not in [DATA_TRAIN, DATA_VALIDATE, DATA_TEST]:
            raise ValueError(
                'data_type error, please use one of {}'.format(
                    [DATA_TRAIN, DATA_VALIDATE, DATA_TEST]))
        if task_name not in ['ner', 'pos', 'chunking']:
            raise ValueError(
                'task_name error, please use one of {}'.format(
                    ['ner', 'pos', 'chunking']))
        folder_path = os.path.join(folder_path, task_name)
        if data_type == DATA_TRAIN:
            file_path = os.path.join(folder_path, 'train.txt')
        elif data_type == DATA_TEST:
            file_path = os.path.join(folder_path, 'test.txt')
        else:
            file_path = os.path.join(folder_path, 'valid.txt')
        x_list, y_list = _load_data_and_labels(file_path)
        if shuffle:
            x_list, y_list = helper.unison_shuffled_copies(x_list, y_list)
        if max_count:
            x_list = x_list[:max_count]
            y_list = y_list[:max_count]
        return x_list, y_list
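A hedged usage sketch, assuming the method lives on a CoNLL-2003 corpus class (the class name here is hypothetical):

    x, y = CONLL2003Corpus.get_sequence_tagging_data(data_type=DATA_TRAIN,
                                                     task_name='ner',
                                                     max_count=100)
    # x[0] is a token list, e.g. ['EU', 'rejects', 'German', ...]
    # y[0] is the aligned tag list, e.g. ['B-ORG', 'O', 'B-MISC', ...]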
Example No. 5
    def get_sequence_tagging_data(
            cls,
            data_type: str = DATA_TRAIN,
            shuffle: bool = True,
            max_count: int = 0) -> Tuple[List[List[str]], List[List[str]]]:
        folder_path = helper.cached_path(cls.__corpus_name__,
                                         cls.__zip_file__name)

        if data_type == DATA_TRAIN:
            file_path = os.path.join(folder_path, 'example.train')
        elif data_type == DATA_TEST:
            file_path = os.path.join(folder_path, 'example.test')
        else:
            file_path = os.path.join(folder_path, 'example.dev')

        data_x, data_y = [], []

        with open(file_path, 'r', encoding='utf-8') as f:
            lines = f.read().splitlines()
            x, y = [], []
            for line in lines:
                rows = line.split(' ')
                if len(rows) == 1:  # blank line marks a sentence boundary
                    data_x.append(x)
                    data_y.append(y)
                    x = []
                    y = []
                else:
                    x.append(rows[0])
                    y.append(rows[1])
            if x:  # flush the last sentence if the file lacks a trailing blank line
                data_x.append(x)
                data_y.append(y)
        if shuffle:
            data_x, data_y = helper.unison_shuffled_copies(data_x, data_y)
        if max_count:
            data_x = data_x[:max_count]
            data_y = data_y[:max_count]
        return data_x, data_y
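The loop expects one "token tag" pair per space-separated line, with blank lines separating sentences. A self-contained sketch of the same parsing on an inline sample:

    sample = "海 B-LOC\n钓 I-LOC\n比 O\n\n赛 O\n地 O"
    data_x, data_y = [], []
    x, y = [], []
    for line in sample.splitlines():
        rows = line.split(' ')
        if len(rows) == 1:          # blank line: sentence boundary
            data_x.append(x)
            data_y.append(y)
            x, y = [], []
        else:
            x.append(rows[0])
            y.append(rows[1])
    if x:                           # flush the final sentence
        data_x.append(x)
        data_y.append(y)
    print(data_x)                   # [['海', '钓', '比'], ['赛', '地']]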
Example No. 6
    def get_sequence_tagging_data(
            cls,
            is_test: bool = False,
            shuffle: bool = True,
            max_count: int = 0) -> Tuple[List[List[str]], List[List[str]]]:
        folder_path = helper.cached_path(cls.__corpus_name__,
                                         cls.__zip_file__name)

        if is_test:
            file_path = os.path.join(folder_path, 'test.csv')
        else:
            file_path = os.path.join(folder_path, 'train.csv')

        df = pd.read_csv(file_path)
        x_data = []
        y_data = []

        for tagging_text in df['tagging']:
            x_item, y_item = cls.parse_ner_str(tagging_text)
            x_data.append(x_item)
            y_data.append(y_item)
        if shuffle:
            x_data, y_data = helper.unison_shuffled_copies(x_data, y_data)
        if max_count != 0:
            x_data = x_data[:max_count]
            y_data = y_data[:max_count]
        return x_data, y_data
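helper.unison_shuffled_copies is not shown in these snippets; a typical implementation shuffles two equal-length sequences with one shared permutation (a sketch under that assumption, not the library's code):

    import numpy as np

    def unison_shuffled_copies(a, b):
        assert len(a) == len(b)
        order = np.random.permutation(len(a))
        return [a[i] for i in order], [b[i] for i in order]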
Example No. 7
    def get_classification_data(
            cls,
            data_type: str = DATA_TRAIN,
            shuffle: bool = True,
            cutter: str = 'char',
            max_count: int = 0) -> Tuple[List[List[str]], List[str]]:
        """

        :param data_type: {train, validate, test}
        :param shuffle: shuffle or not
        :param cutter:
        :param max_count:
        :return:
        """
        folder_path = helper.cached_path(cls.__corpus_name__,
                                         cls.__zip_file__name)
        if data_type not in [DATA_TRAIN, DATA_VALIDATE, DATA_TEST]:
            raise ValueError(
                'data_type error, please use one of {}'.format(
                    [DATA_TRAIN, DATA_VALIDATE, DATA_TEST]))
        if cutter not in ['char', 'jieba', 'none']:
            raise ValueError(
                'cutter error, please use one of {}'.format(
                    ['char', 'jieba', 'none']))

        file_path = os.path.join(folder_path, '{}.csv'.format(data_type))
        df = pd.read_csv(file_path)
        x_data = df['text'].values
        y_data = df['domain'].values
        if shuffle:
            x_data, y_data = helper.unison_shuffled_copies(x_data, y_data)

        if max_count != 0:
            x_data = x_data[:max_count]
            y_data = y_data[:max_count]

        if cutter == 'jieba':
            try:
                import jieba
            except ModuleNotFoundError:
                raise ModuleNotFoundError(
                    "please install jieba, `$ pip install jieba`")
            x_data = [list(jieba.cut(item)) for item in x_data]
        elif cutter == 'char':
            x_data = [list(item) for item in x_data]
        return x_data, y_data
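The two non-trivial cutter modes behave like this (the sample text is made up, and jieba's exact segmentation depends on its dictionary):

    text = '今天天气不错'
    chars = list(text)              # 'char': one token per character
    # ['今', '天', '天', '气', '不', '错']

    import jieba
    words = list(jieba.cut(text))   # 'jieba': dictionary-based word segmentation
    # e.g. ['今天', '天气', '不错']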