def build(self, **kwargs):
    """Load a pre-trained gensim word2vec model and expose it as a frozen Keras embedding.

    Downloads the vector file when ``self.name`` is a known alias, builds the
    token-to-index mapping, and wraps the pre-trained weights in a
    non-trainable ``Embedding`` layer stored on ``self._model``.
    """
    self.embedding_type = 'word2vec'
    # NOTE(review): membership is tested on WordEmbeddings.URL_MAP but the
    # lookup goes through self.URL_MAP — presumably the same mapping; confirm.
    if self.name in WordEmbeddings.URL_MAP:
        url = self.URL_MAP.get(self.name)
        self.name = self.name + '.bz2'
    else:
        url = None
    self.model_path = helper.cached_path(self.name,
                                         url,
                                         sub_folders=['embedding', 'word2vec'])
    self.keyed_vector: KeyedVectors = KeyedVectors.load_word2vec_format(
        self.model_path, **kwargs)
    self.embedding_size = self.keyed_vector.vector_size

    # Assign every vocabulary entry a fresh index, continuing after the base tokens.
    token_index = self.base_dict.copy()
    for token in self.keyed_vector.index2entity:
        token_index[token] = len(token_index)
    self.token2idx = token_index

    # Frozen embedding layer initialised from the pre-trained weight matrix.
    inputs = Input(shape=(self.sequence_length,), dtype='int32')
    weight_matrix = self.get_embedding_matrix()
    embedded = Embedding(self.token_count,
                         self.embedding_size,
                         input_length=self.sequence_length,
                         weights=[weight_matrix],
                         trainable=False)(inputs)
    self._model = Model(inputs, embedded)

    logging.debug('------------------------------------------------')
    logging.debug('Loaded gensim word2vec model')
    logging.debug('model : {}'.format(self.model_path))
    logging.debug('word count : {}'.format(len(self.keyed_vector.index2entity)))
    logging.debug('Top 50 word : {}'.format(self.keyed_vector.index2entity[:50]))
    logging.debug('------------------------------------------------')
def build(self):
    """Load a pre-trained BERT checkpoint and expose it as a Keras feature extractor.

    Resolves the model name, downloads/caches the checkpoint, wraps it so the
    mask is stripped from the output, and builds ``token2idx`` from the
    checkpoint vocabulary.
    """
    self.embedding_type = 'bert'
    model_key = self.model_key_map.get(self.name, self.name)
    url = self.pre_trained_models.get(model_key)
    self.model_path = helper.cached_path(model_key, url, ['embedding', 'bert'])

    config_path = os.path.join(self.model_path, 'bert_config.json')
    check_point_path = os.path.join(self.model_path, 'bert_model.ckpt')
    logging.info('loading bert model from {}\n'.format(self.model_path))
    model = keras_bert.load_trained_model_from_checkpoint(
        config_path, check_point_path, seq_len=self.sequence_length)
    # Strip masking so downstream layers receive plain tensors.
    output_layer = helper.NonMaskingLayer()(model.output)
    self._model = Model(model.inputs, output_layer)
    # NOTE(review): reads self.model (not self._model) — presumably a property
    # aliasing _model; confirm.
    self.embedding_size = self.model.output_shape[-1]

    # One index per vocabulary line, then alias the special tokens onto
    # their concrete counterparts.
    dict_path = os.path.join(self.model_path, 'vocab.txt')
    token_index = {}
    with open(dict_path, 'r', encoding='utf-8') as f:
        for token in f.read().splitlines():
            token_index[token] = len(token_index)
    for alias, target in self.special_tokens.items():
        token_index[alias] = token_index[target]
    self.token2idx = token_index
def get_info(cls):
    """Ensure the corpus archive is cached locally, then log its path and description."""
    folder_path = helper.cached_path(
        cls.__corpus_name__,
        cls.__zip_file__name,
    )
    message = """{} info\n dataset path: {}\n{}""".format(cls.__corpus_name__,
                                                          folder_path,
                                                          cls.__desc__)
    logging.info(message)
def get_sequence_tagging_data(
        cls,
        data_type: str = DATA_TRAIN,
        task_name: str = 'ner',
        shuffle: bool = True,
        max_count: int = 0) -> Tuple[List[List[str]], List[List[str]]]:
    """Load a CoNLL-style sequence-tagging split for one task.

    :param data_type: one of DATA_TRAIN / DATA_VALIDATE / DATA_TEST
    :param task_name: one of 'ner', 'pos', 'chunking'
    :param shuffle: shuffle x/y in unison before truncation
    :param max_count: keep at most this many samples (0 = keep all)
    :return: (token sequences, label sequences)
    :raises ValueError: on an unknown data_type or task_name
    """
    folder_path = helper.cached_path(cls.__corpus_name__, cls.__zip_file__name)
    if data_type not in [DATA_TRAIN, DATA_VALIDATE, DATA_TEST]:
        # fixed: message typo "one onf the" -> "one of the"
        raise ValueError(
            'data_type error, please use one of the {}'.format(
                [DATA_TRAIN, DATA_VALIDATE, DATA_TEST]))
    if task_name not in ['ner', 'pos', 'chunking']:
        # fixed: previously reported "data_type error" for a bad task_name
        raise ValueError(
            'task_name error, please use one of the {}'.format(
                ['ner', 'pos', 'chunking']))
    folder_path = os.path.join(folder_path, task_name)
    if data_type == DATA_TRAIN:
        file_path = os.path.join(folder_path, 'train.txt')
    elif data_type == DATA_TEST:
        file_path = os.path.join(folder_path, 'test.txt')
    else:
        file_path = os.path.join(folder_path, 'valid.txt')
    x_list, y_list = _load_data_and_labels(file_path)
    if shuffle:
        x_list, y_list = helper.unison_shuffled_copies(x_list, y_list)
    if max_count:
        x_list = x_list[:max_count]
        y_list = y_list[:max_count]
    return x_list, y_list
def get_sequence_tagging_data(
        cls,
        data_type: str = DATA_TRAIN,
        shuffle: bool = True,
        max_count: int = 0) -> Tuple[List[List[str]], List[List[str]]]:
    """Load a blank-line-separated tagging corpus (one "token label" pair per line).

    :param data_type: one of DATA_TRAIN / DATA_VALIDATE / DATA_TEST
    :param shuffle: shuffle x/y in unison before truncation
    :param max_count: keep at most this many samples (0 = keep all)
    :return: (token sequences, label sequences)
    """
    folder_path = helper.cached_path(cls.__corpus_name__, cls.__zip_file__name)
    if data_type == DATA_TRAIN:
        file_path = os.path.join(folder_path, 'example.train')
    elif data_type == DATA_TEST:
        file_path = os.path.join(folder_path, 'example.test')
    else:
        file_path = os.path.join(folder_path, 'example.dev')
    data_x, data_y = [], []
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.read().splitlines()
    x, y = [], []
    for line in lines:
        rows = line.split(' ')
        if len(rows) == 1:
            # Blank line terminates the current sentence.
            data_x.append(x)
            data_y.append(y)
            x = []
            y = []
        else:
            x.append(rows[0])
            y.append(rows[1])
    # fixed: flush the trailing sentence when the file does not end with a blank line
    if x:
        data_x.append(x)
        data_y.append(y)
    # fixed: shuffle and max_count were accepted but silently ignored,
    # unlike the sibling corpus loaders
    if shuffle:
        data_x, data_y = helper.unison_shuffled_copies(data_x, data_y)
    if max_count:
        data_x = data_x[:max_count]
        data_y = data_y[:max_count]
    return data_x, data_y
def get_sequence_tagging_data(
        cls,
        is_test: bool = False,
        shuffle: bool = True,
        max_count: int = 0) -> Tuple[List[str], List[str]]:
    """Read the tagged CSV corpus and return parallel token / label sequences.

    :param is_test: load test.csv instead of train.csv
    :param shuffle: shuffle x/y in unison before truncation
    :param max_count: keep at most this many samples (0 = keep all)
    """
    folder_path = helper.cached_path(cls.__corpus_name__, cls.__zip_file__name)
    file_name = 'test.csv' if is_test else 'train.csv'
    df = pd.read_csv(os.path.join(folder_path, file_name))
    # Each row's 'tagging' column encodes both tokens and labels.
    parsed = [cls.parse_ner_str(tagging_text) for tagging_text in df['tagging']]
    x_data = [pair[0] for pair in parsed]
    y_data = [pair[1] for pair in parsed]
    if shuffle:
        x_data, y_data = helper.unison_shuffled_copies(x_data, y_data)
    if max_count != 0:
        x_data = x_data[:max_count]
        y_data = y_data[:max_count]
    return x_data, y_data
def get_classification_data(
        cls,
        data_type: str = DATA_TRAIN,
        shuffle: bool = True,
        cutter: str = 'char',
        max_count: int = 0) -> Tuple[List[List[str]], List[str]]:
    """Load a text-classification split and tokenize it.

    :param data_type: {train, validate, test}
    :param shuffle: shuffle or not
    :param cutter: tokenization strategy: 'char', 'jieba' or 'none'
    :param max_count: keep at most this many samples (0 = keep all)
    :return: (tokenized texts, domain labels)
    :raises ValueError: on an unknown data_type or cutter
    """
    folder_path = helper.cached_path(cls.__corpus_name__, cls.__zip_file__name)
    if data_type not in [DATA_TRAIN, DATA_VALIDATE, DATA_TEST]:
        # fixed: message typo "one onf the" -> "one of the"
        raise ValueError(
            'data_type error, please use one of the {}'.format(
                [DATA_TRAIN, DATA_VALIDATE, DATA_TEST]))
    if cutter not in ['char', 'jieba', 'none']:
        # fixed: previously said "data_type error" and listed the data_type
        # options instead of the valid cutter values
        raise ValueError(
            'cutter error, please use one of the {}'.format(
                ['char', 'jieba', 'none']))
    file_path = os.path.join(folder_path, '{}.csv'.format(data_type))
    df = pd.read_csv(file_path)
    x_data = df['text'].values
    y_data = df['domain'].values
    if shuffle:
        x_data, y_data = helper.unison_shuffled_copies(x_data, y_data)
    if max_count != 0:
        x_data = x_data[:max_count]
        y_data = y_data[:max_count]
    if cutter == 'jieba':
        try:
            import jieba
        except ModuleNotFoundError:
            raise ModuleNotFoundError(
                "please install jieba, `$ pip install jieba`")
        x_data = [list(jieba.cut(item)) for item in x_data]
    elif cutter == 'char':
        # fixed: was `elif 'char':` (always truthy), which char-split
        # even when cutter == 'none'
        x_data = [list(item) for item in x_data]
    return x_data, y_data