@classmethod
def load_data(cls,
              subset_name: str = 'train',
              task_name: str = 'ner',
              shuffle: bool = True) -> Tuple[List[List[str]], List[List[str]]]:
    """
    Load dataset as sequence labeling format, word level tokenized

    Args:
        subset_name: {train, test, valid}
        task_name: {pos, chunking, ner}
        shuffle: should shuffle or not, default True.

    Returns:
        dataset_features and dataset labels
    """
    corpus_path = get_file(cls.__corpus_name__,
                           cls.__zip_file__name,
                           cache_dir=k.DATA_PATH,
                           untar=True)

    if subset_name not in {'train', 'test', 'valid'}:
        raise ValueError(
            f"subset_name error, must be one of {{train, test, valid}}, got {subset_name}")

    file_path = os.path.join(corpus_path, f'{subset_name}.txt')

    if task_name not in {'pos', 'chunking', 'ner'}:
        raise ValueError(
            f"task_name error, must be one of {{pos, chunking, ner}}, got {task_name}")

    # Column 0 holds the tokens; pos/chunking/ner labels sit in columns 1/2/3.
    data_index = ['pos', 'chunking', 'ner'].index(task_name) + 1
    x_data, y_data = DataReader.read_conll_format_file(file_path,
                                                       label_index=data_index)
    if shuffle:
        x_data, y_data = utils.unison_shuffled_copies(x_data, y_data)
    logging.debug(
        f"loaded {len(x_data)} samples from {file_path}. Sample:\n"
        f"x[0]: {x_data[0]}\n"
        f"y[0]: {y_data[0]}")
    return x_data, y_data

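# A usage sketch for the loader above. The class name `CONLL2003ENCorpus` is an
# assumption (the enclosing class is not shown); `task_name` selects which
# label column of the CoNLL file is returned.
train_x, train_y = CONLL2003ENCorpus.load_data('train', task_name='ner')
pos_x, pos_y = CONLL2003ENCorpus.load_data('valid', task_name='pos')
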
@classmethod
def load_data(cls,
              subset_name: str = 'train',
              shuffle: bool = True) -> Tuple[List[List[str]], List[List[str]]]:
    """
    Load dataset as sequence labeling format, char level tokenized

    Args:
        subset_name: {train, test, valid}
        shuffle: should shuffle or not, default True.

    Returns:
        dataset_features and dataset labels
    """
    corpus_path = get_file(cls.__corpus_name__,
                           cls.__zip_file__name,
                           cache_dir=K.DATA_PATH,
                           untar=True)

    if subset_name == 'train':
        file_path = os.path.join(corpus_path, 'example.train')
    elif subset_name == 'test':
        file_path = os.path.join(corpus_path, 'example.test')
    else:
        file_path = os.path.join(corpus_path, 'example.dev')

    x_data, y_data = DataReader.read_conll_format_file(file_path)
    if shuffle:
        x_data, y_data = utils.unison_shuffled_copies(x_data, y_data)
    logger.debug(
        f"loaded {len(x_data)} samples from {file_path}. Sample:\n"
        f"x[0]: {x_data[0]}\n"
        f"y[0]: {y_data[0]}")
    return x_data, y_data

def load_data(self,
              subset_name: str = 'train',
              shuffle: bool = True) -> Tuple[List[List[str]], List[List[str]]]:
    """
    Load dataset as text classification format

    Args:
        subset_name: {train, test, valid}
        shuffle: should shuffle or not, default True.

    Returns:
        dataset_features and dataset labels
    """
    df = pd.read_csv(self.file_path)
    df = df[:self.sample_count]
    df['y'] = df.apply(self._extract_label, axis=1)
    df['x'] = df['comment_text'].apply(self._text_process)
    df = df[['x', 'y']]

    if subset_name == 'train':
        df = df.loc[self.train_ids]
    elif subset_name == 'valid':
        df = df.loc[self.valid_ids]
    else:
        df = df.loc[self.test_ids]

    xs, ys = list(df['x'].values), list(df['y'].values)
    if shuffle:
        xs, ys = utils.unison_shuffled_copies(xs, ys)
    return xs, ys

def test_unison_shuffled_copies(self):
    x: np.ndarray = np.random.randint(0, 10, size=(100, 5))
    y: np.ndarray = np.random.randint(0, 10, size=(100,))
    new_x, new_y = unison_shuffled_copies(x, y)
    assert new_x.shape == x.shape
    assert new_y.shape == y.shape

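# `unison_shuffled_copies` itself is not shown in this section. A minimal
# sketch of what the helper likely does, judging from its name, the list and
# ndarray call sites above, and the shape-preserving asserts in the test: apply
# one shared random permutation to both sequences so x/y pairs stay aligned.
# The body below is an assumption, not the library's actual implementation.
import numpy as np

def unison_shuffled_copies(a, b):
    assert len(a) == len(b)  # both sequences must pair up one-to-one
    p = np.random.permutation(len(a))
    if isinstance(a, np.ndarray) and isinstance(b, np.ndarray):
        return a[p], b[p]  # fancy-index arrays directly
    # Fall back to index lists so ragged Python lists (variable-length
    # sentences) survive without being coerced into object arrays.
    return [a[i] for i in p], [b[i] for i in p]
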
@classmethod
def load_data(cls,
              subset_name: str = 'train',
              shuffle: bool = True,
              cutter: str = 'char') -> Tuple[List[List[str]], List[str]]:
    """
    Load dataset as sequence classification format, char level tokenized

    features: ``[['听', '新', '闻', '。'], ['电', '视', '台', '在', '播', '什', '么'], ...]``

    labels: ``['news', 'epg', ...]``

    Samples::
        train_x, train_y = SMP2018ECDTCorpus.load_data('train')
        test_x, test_y = SMP2018ECDTCorpus.load_data('test')

    Args:
        subset_name: {train, test, valid}
        shuffle: should shuffle or not, default True.
        cutter: sentence cutter, {char, jieba, none}

    Returns:
        dataset_features and dataset labels
    """
    corpus_path = get_file(cls.__corpus_name__,
                           cls.__zip_file__name,
                           cache_dir=k.DATA_PATH,
                           untar=True)

    if cutter not in ['char', 'jieba', 'none']:
        raise ValueError(
            "cutter error, please use one of {char, jieba, none}")

    df_path = os.path.join(corpus_path, f'{subset_name}.csv')
    df = pd.read_csv(df_path)
    if cutter == 'jieba':
        try:
            import jieba
        except ModuleNotFoundError:
            raise ModuleNotFoundError(
                "please install jieba, `$ pip install jieba`")
        x_data = [list(jieba.cut(item)) for item in df['query'].to_list()]
    elif cutter == 'char':
        x_data = [list(item) for item in df['query'].to_list()]
    else:
        # cutter == 'none': keep each query as an un-tokenized string.
        x_data = df['query'].to_list()
    y_data = df['label'].to_list()

    if shuffle:
        x_data, y_data = utils.unison_shuffled_copies(x_data, y_data)
    logging.debug(f"loaded {len(x_data)} samples from {df_path}. Sample:\n"
                  f"x[0]: {x_data[0]}\n"
                  f"y[0]: {y_data[0]}")
    return x_data, y_data

@classmethod
def load_data(cls,
              subset_name: str = 'train',
              shuffle: bool = True) -> Tuple[List[List[str]], List[List[str]]]:
    """
    Load dataset as sequence labeling format, char level tokenized

    features: ``[['海', '钓', '比', '赛', '地', '点', '在', '厦', '门', ...], ...]``

    labels: ``[['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'I-LOC', ...], ...]``

    Sample::
        train_x, train_y = ChineseDailyNerCorpus.load_data('train')
        test_x, test_y = ChineseDailyNerCorpus.load_data('test')

    Args:
        subset_name: {train, test, valid}
        shuffle: should shuffle or not, default True.

    Returns:
        dataset_features and dataset labels
    """
    corpus_path = get_file(cls.__corpus_name__,
                           cls.__zip_file__name,
                           cache_dir=k.DATA_PATH,
                           untar=True)

    if subset_name == 'train':
        file_path = os.path.join(corpus_path, 'example.train')
    elif subset_name == 'test':
        file_path = os.path.join(corpus_path, 'example.test')
    else:
        file_path = os.path.join(corpus_path, 'example.dev')

    x_data, y_data = DataReader.read_conll_format_file(file_path)
    if shuffle:
        x_data, y_data = utils.unison_shuffled_copies(x_data, y_data)
    logging.debug(
        f"loaded {len(x_data)} samples from {file_path}. Sample:\n"
        f"x[0]: {x_data[0]}\n"
        f"y[0]: {y_data[0]}")
    return x_data, y_data

def load_data(subset_name='train', shuffle=True):
    """
    Load dataset as sequence labeling format, char level tokenized

    Args:
        subset_name: {train, test, valid}
        shuffle: should shuffle or not, default True.

    Returns:
        dataset_features and dataset labels
    """
    if subset_name == 'train':
        file_path = '../../data/ChineseDailyNerCorpus/example.train'
    elif subset_name == 'test':
        file_path = '../../data/ChineseDailyNerCorpus/example.test'
    else:
        file_path = '../../data/ChineseDailyNerCorpus/example.dev'

    x_data, y_data = DataReader.read_conll_format_file(file_path)
    if shuffle:
        x_data, y_data = utils.unison_shuffled_copies(x_data, y_data)
    return x_data, y_data

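# `DataReader.read_conll_format_file` is called by every loader above but never
# shown. A minimal sketch of a reader for the file layout those corpora use (an
# assumption inferred from the call sites, including the `label_index` keyword
# in the first loader): one token per line with whitespace-separated columns,
# and a blank line between sentences.
from typing import List, Tuple

def read_conll_format_file(file_path: str,
                           text_index: int = 0,
                           label_index: int = 1) -> Tuple[List[List[str]], List[List[str]]]:
    x_data: List[List[str]] = []
    y_data: List[List[str]] = []
    tokens: List[str] = []
    labels: List[str] = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                cells = line.split()
                tokens.append(cells[text_index])
                labels.append(cells[label_index])
            elif tokens:
                # A blank line closes the current sentence.
                x_data.append(tokens)
                y_data.append(labels)
                tokens, labels = [], []
    if tokens:  # flush the final sentence if the file has no trailing blank line
        x_data.append(tokens)
        y_data.append(labels)
    return x_data, y_data
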
'''
Train a Chinese NER model covering the ORG, LOC, PER, and TIME entity types.
'''
import kashgari
from kashgari.corpus import DataReader
from kashgari.embeddings import BERTEmbedding
from kashgari.tasks.labeling import BiLSTM_CRF_Model
from kashgari import utils

kashgari.config.use_cudnn_cell = False

train_x, train_y = DataReader.read_conll_format_file('data/data_all/example.train')
valid_x, valid_y = DataReader.read_conll_format_file('data/data_all/example.dev')
test_x, test_y = DataReader.read_conll_format_file('data/data_all/example.test')

train_x, train_y = utils.unison_shuffled_copies(train_x, train_y)
valid_x, valid_y = utils.unison_shuffled_copies(valid_x, valid_y)
test_x, test_y = utils.unison_shuffled_copies(test_x, test_y)

print(f"train data count: {len(train_x)}")
print(f"validate data count: {len(valid_x)}")
print(f"test data count: {len(test_x)}", test_x[0], test_y[0])

bert_embedding = BERTEmbedding('chinese_wwm_ext_L-12_H-768_A-12',
                               task=kashgari.LABELING,
                               sequence_length=100)
model = BiLSTM_CRF_Model(bert_embedding)
model.fit(train_x, train_y,
          valid_x, valid_y,
          batch_size=512,
          epochs=20)
model.save('models/all_ner.h5')
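
# A short follow-up sketch: score the held-out split, then reload the saved
# model for inference. `model.evaluate` and `utils.load_model` follow
# kashgari v1's API as best recalled here; verify against the installed version.
model.evaluate(test_x, test_y)

loaded_model = utils.load_model('models/all_ner.h5')
# Inputs are char-level token lists, matching the CoNLL training data.
sample = list('新华社北京九月一日电')
print(loaded_model.predict([sample]))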