def get_sequence_tagging_data(cls,
                              data_type: str = DATA_TRAIN,
                              task_name: str = 'ner',
                              shuffle: bool = True,
                              max_count: int = 0) -> Tuple[List[List[str]], List[List[str]]]:
    """
    Load one split of the cached corpus for a sequence tagging task.

    The file read is ``<cache folder>/<task_name>/{train,valid,test}.txt``,
    where the cache folder is resolved by ``helper.cached_path``.

    :param data_type: which split to load, one of DATA_TRAIN, DATA_VALIDATE
        or DATA_TEST
    :param task_name: sub-folder of the corpus to read, one of 'ner', 'pos'
        or 'chunking'
    :param shuffle: when true, shuffle samples and labels together via
        ``helper.unison_shuffled_copies``
    :param max_count: when non-zero, keep only the first ``max_count``
        samples (applied after shuffling)
    :return: tuple ``(x_list, y_list)`` as produced by
        ``_load_data_and_labels``
    :raises ValueError: if ``data_type`` or ``task_name`` is not one of the
        supported values
    """
    # Validate the arguments first so that a bad call fails before the
    # corpus cache is touched.
    valid_data_types = [DATA_TRAIN, DATA_VALIDATE, DATA_TEST]
    if data_type not in valid_data_types:
        raise ValueError(
            'data_type error, please use one onf the {}'.format(valid_data_types))

    valid_task_names = ['ner', 'pos', 'chunking']
    if task_name not in valid_task_names:
        # The message must name the parameter that was actually rejected.
        raise ValueError(
            'task_name error, please use one of the {}'.format(valid_task_names))

    folder_path = helper.cached_path(cls.__corpus_name__,
                                     cls.__zip_file__name)
    folder_path = os.path.join(folder_path, task_name)

    if data_type == DATA_TRAIN:
        file_name = 'train.txt'
    elif data_type == DATA_TEST:
        file_name = 'test.txt'
    else:
        # Only DATA_VALIDATE can reach this branch after validation.
        file_name = 'valid.txt'
    file_path = os.path.join(folder_path, file_name)

    x_list, y_list = _load_data_and_labels(file_path)
    if shuffle:
        x_list, y_list = helper.unison_shuffled_copies(x_list, y_list)
    if max_count:
        x_list = x_list[:max_count]
        y_list = y_list[:max_count]
    return x_list, y_list
def get_sequence_tagging_data(cls,
                              is_test: bool = False,
                              shuffle: bool = True,
                              max_count: int = 0) -> Tuple[List[str], List[str]]:
    """
    Load tagged samples from the cached corpus CSV.

    Reads ``test.csv`` when ``is_test`` is true and ``train.csv`` otherwise,
    then converts every entry of the ``tagging`` column into a
    ``(x_item, y_item)`` pair with ``cls.parse_ner_str``.

    :param is_test: read the test file instead of the train file
    :param shuffle: when true, shuffle samples and labels together via
        ``helper.unison_shuffled_copies``
    :param max_count: when not 0, keep only the first ``max_count`` samples
        (applied after shuffling)
    :return: tuple ``(x_data, y_data)``
    """
    corpus_dir = helper.cached_path(cls.__corpus_name__,
                                    cls.__zip_file__name)
    csv_name = 'test.csv' if is_test else 'train.csv'
    frame = pd.read_csv(os.path.join(corpus_dir, csv_name))

    samples = []
    labels = []
    # Each parsed entry is unpacked into exactly one sample and one label.
    for sample, label in map(cls.parse_ner_str, frame['tagging']):
        samples.append(sample)
        labels.append(label)

    if shuffle:
        samples, labels = helper.unison_shuffled_copies(samples, labels)

    if max_count != 0:
        samples, labels = samples[:max_count], labels[:max_count]

    return samples, labels
def get_classification_data(cls,
                            data_type: str = DATA_TRAIN,
                            shuffle: bool = True,
                            cutter: str = 'char',
                            max_count: int = 0) -> Tuple[List[List[str]], List[str]]:
    """
    Load one split of the cached classification corpus.

    Reads ``<cache folder>/<data_type>.csv`` and returns its ``text`` column
    as samples and its ``domain`` column as labels.

    :param data_type: which split to load, one of DATA_TRAIN, DATA_VALIDATE
        or DATA_TEST; also used as the CSV file name
    :param shuffle: when true, shuffle samples and labels together via
        ``helper.unison_shuffled_copies``
    :param cutter: how each text is tokenized: 'char' splits it into
        characters, 'jieba' segments it with ``jieba.cut``, 'none' leaves
        the texts unsplit
    :param max_count: when not 0, keep only the first ``max_count`` samples
        (applied after shuffling, before tokenizing)
    :return: tuple ``(x_data, y_data)``
    :raises ValueError: if ``data_type`` or ``cutter`` is not one of the
        supported values
    :raises ModuleNotFoundError: if ``cutter`` is 'jieba' and jieba is not
        installed
    """
    # Validate the arguments first so that a bad call fails before the
    # corpus cache is touched.
    valid_data_types = [DATA_TRAIN, DATA_VALIDATE, DATA_TEST]
    if data_type not in valid_data_types:
        raise ValueError(
            'data_type error, please use one onf the {}'.format(valid_data_types))

    valid_cutters = ['char', 'jieba', 'none']
    if cutter not in valid_cutters:
        # Report the rejected parameter together with its own allowed values.
        raise ValueError(
            'cutter error, please use one of the {}'.format(valid_cutters))

    folder_path = helper.cached_path(cls.__corpus_name__,
                                     cls.__zip_file__name)
    file_path = os.path.join(folder_path, '{}.csv'.format(data_type))
    df = pd.read_csv(file_path)
    x_data = df['text'].values
    y_data = df['domain'].values

    if shuffle:
        x_data, y_data = helper.unison_shuffled_copies(x_data, y_data)
    if max_count != 0:
        x_data = x_data[:max_count]
        y_data = y_data[:max_count]

    if cutter == 'jieba':
        try:
            import jieba
        except ModuleNotFoundError:
            raise ModuleNotFoundError(
                "please install jieba, `$ pip install jieba`")
        x_data = [list(jieba.cut(item)) for item in x_data]
    elif cutter == 'char':
        # Compare against the argument: a bare `elif 'char':` is always true
        # and would also split the texts when cutter is 'none'.
        x_data = [list(item) for item in x_data]
    return x_data, y_data