예제 #1
0
파일: corpus.py 프로젝트: zxlzr/Kashgari
    def get_sequence_tagging_data(
            cls,
            data_type: str = DATA_TRAIN,
            task_name: str = 'ner',
            shuffle: bool = True,
            max_count: int = 0) -> Tuple[List[List[str]], List[List[str]]]:
        """
        Load one split of a sequence tagging task from the cached corpus.

        :param data_type: one of DATA_TRAIN, DATA_VALIDATE, DATA_TEST
        :param task_name: corpus sub-folder to read, one of
            {ner, pos, chunking}
        :param shuffle: shuffle samples and labels in unison when True
        :param max_count: keep only the first ``max_count`` items;
            0 keeps everything
        :return: ``(x_list, y_list)`` as returned by
            ``_load_data_and_labels`` for the selected file
        :raises ValueError: if ``data_type`` or ``task_name`` is unsupported
        """
        # Validate arguments first so a bad call fails before the corpus
        # cache is looked up.
        if data_type not in [DATA_TRAIN, DATA_VALIDATE, DATA_TEST]:
            raise ValueError(
                'data_type error, please use one onf the {}'.format(
                    [DATA_TRAIN, DATA_VALIDATE, DATA_TEST]))
        if task_name not in ['ner', 'pos', 'chunking']:
            # Report the parameter that is actually wrong (task_name).
            raise ValueError(
                'task_name error, please use one onf the {}'.format(
                    ['ner', 'pos', 'chunking']))

        folder_path = helper.cached_path(cls.__corpus_name__,
                                         cls.__zip_file__name)
        # Each task lives in its own sub-folder of the corpus.
        folder_path = os.path.join(folder_path, task_name)
        if data_type == DATA_TRAIN:
            file_path = os.path.join(folder_path, 'train.txt')
        elif data_type == DATA_TEST:
            file_path = os.path.join(folder_path, 'test.txt')
        else:
            # Only DATA_VALIDATE can reach this branch after validation.
            file_path = os.path.join(folder_path, 'valid.txt')

        x_list, y_list = _load_data_and_labels(file_path)
        if shuffle:
            x_list, y_list = helper.unison_shuffled_copies(x_list, y_list)
        if max_count:
            x_list = x_list[:max_count]
            y_list = y_list[:max_count]
        return x_list, y_list
예제 #2
0
파일: corpus.py 프로젝트: zxlzr/Kashgari
    def get_sequence_tagging_data(
            cls,
            is_test: bool = False,
            shuffle: bool = True,
            max_count: int = 0) -> Tuple[List[str], List[str]]:
        """
        Read the ``tagging`` column of the corpus CSV and parse every entry.

        :param is_test: read ``test.csv`` when True, otherwise ``train.csv``
        :param shuffle: shuffle samples and labels in unison when True
        :param max_count: keep only the first ``max_count`` items;
            0 keeps everything
        :return: ``(x_data, y_data)`` built from the pairs returned by
            ``cls.parse_ner_str``
        """
        corpus_dir = helper.cached_path(cls.__corpus_name__,
                                        cls.__zip_file__name)
        csv_name = 'test.csv' if is_test else 'train.csv'
        frame = pd.read_csv(os.path.join(corpus_dir, csv_name))

        samples, labels = [], []
        for raw_tagging in frame['tagging']:
            # parse_ner_str yields one (x, y) pair per tagging string.
            parsed_x, parsed_y = cls.parse_ner_str(raw_tagging)
            samples.append(parsed_x)
            labels.append(parsed_y)

        if shuffle:
            samples, labels = helper.unison_shuffled_copies(samples, labels)
        if max_count != 0:
            samples, labels = samples[:max_count], labels[:max_count]
        return samples, labels
예제 #3
0
파일: corpus.py 프로젝트: zxlzr/Kashgari
    def get_classification_data(
            cls,
            data_type: str = DATA_TRAIN,
            shuffle: bool = True,
            cutter: str = 'char',
            max_count: int = 0) -> Tuple[List[List[str]], List[str]]:
        """
        Load one split of the classification corpus from its CSV file.

        The file ``<data_type>.csv`` is read from the cached corpus folder;
        the ``text`` column provides the samples and the ``domain`` column
        the labels.

        :param data_type: one of DATA_TRAIN, DATA_VALIDATE, DATA_TEST
        :param shuffle: shuffle samples and labels in unison when True
        :param cutter: how each text is split: ``'char'`` splits into
            characters, ``'jieba'`` segments with ``jieba.cut`` and
            ``'none'`` leaves the text as a single string
        :param max_count: keep only the first ``max_count`` items;
            0 keeps everything
        :return: ``(x_data, y_data)``
        :raises ValueError: if ``data_type`` or ``cutter`` is unsupported
        :raises ModuleNotFoundError: if ``cutter == 'jieba'`` and jieba
            is not installed
        """
        # Validate arguments first so a bad call fails before the corpus
        # cache is looked up.
        if data_type not in [DATA_TRAIN, DATA_VALIDATE, DATA_TEST]:
            raise ValueError(
                'data_type error, please use one onf the {}'.format(
                    [DATA_TRAIN, DATA_VALIDATE, DATA_TEST]))
        if cutter not in ['char', 'jieba', 'none']:
            # Name the offending parameter and list the cutters that are
            # actually accepted.
            raise ValueError(
                'cutter error, please use one onf the {}'.format(
                    ['char', 'jieba', 'none']))

        folder_path = helper.cached_path(cls.__corpus_name__,
                                         cls.__zip_file__name)
        file_path = os.path.join(folder_path, '{}.csv'.format(data_type))
        df = pd.read_csv(file_path)
        x_data = df['text'].values
        y_data = df['domain'].values
        if shuffle:
            x_data, y_data = helper.unison_shuffled_copies(x_data, y_data)

        if max_count != 0:
            x_data = x_data[:max_count]
            y_data = y_data[:max_count]

        if cutter == 'jieba':
            try:
                import jieba
            except ModuleNotFoundError as err:
                raise ModuleNotFoundError(
                    "please install jieba, `$ pip install jieba`") from err
            x_data = [list(jieba.cut(item)) for item in x_data]
        elif cutter == 'char':
            # Compare against cutter explicitly: a bare `elif 'char':` is
            # always true and would also split texts when cutter == 'none'.
            x_data = [list(item) for item in x_data]
        else:
            # cutter == 'none': keep each text uncut, but still hand back a
            # list like the other branches do.
            x_data = list(x_data)
        return x_data, y_data