Example #1
def setup_datasets(dataset_name,
                   root='.data',
                   vocab_size=20000,
                   include_unk=False):
    dataset_tar = download_from_url(URLS[dataset_name], root=root)
    extracted_files = extract_archive(dataset_tar)

    for fname in extracted_files:
        if fname.endswith('train.csv'):
            train_csv_path = fname
        elif fname.endswith('test.csv'):
            test_csv_path = fname

    # Train a SentencePiece tokenizer model on the training CSV if none exists yet
    if not path.exists('m_user.model'):
        logging.info('Generate SentencePiece pretrained tokenizer...')
        generate_sp_model(train_csv_path, vocab_size)

    sp_model = load_sp_model("m_user.model")
    sp_generator = sentencepiece_numericalizer(sp_model)
    train_data, train_labels = _create_data_with_sp_transform(
        sp_generator, train_csv_path)
    test_data, test_labels = _create_data_with_sp_transform(
        sp_generator, test_csv_path)

    if len(train_labels ^ test_labels) > 0:
        raise ValueError("Training and test labels don't match")
    return (text_classification.TextClassificationDataset(
        None, train_data, train_labels),
            text_classification.TextClassificationDataset(
                None, test_data, test_labels))
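
A minimal usage sketch for the helper above; the dataset name 'AG_NEWS' and the module-level URLS registry are assumptions based on the torchtext 0.5-era text_classification API:

train_dataset, test_dataset = setup_datasets('AG_NEWS', vocab_size=20000)
# Each item is a (label, token_ids) pair produced by the SentencePiece pipeline.
label, token_ids = train_dataset[0]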
Example #2
def prepare_data(path, ngrams=NGRAMS, vocab=None):
    if not os.path.isdir(path):
        logging.error('Data path does not exist: %s', path)
        return

    train_csv_path = os.path.join(path, 'train.csv')
    test_csv_path = os.path.join(path, 'test.csv')

    if vocab is None:
        logging.info('Building Vocab based on {}'.format(train_csv_path))
        vocab = torch_text.build_vocab_from_iterator(
            torch_text._csv_iterator(train_csv_path, ngrams))
    else:
        if not isinstance(vocab, Vocab):
            raise TypeError("Passed vocabulary is not of type Vocab")

    logging.info('Creating training data')
    train_data, train_labels = torch_text._create_data_from_iterator(
        vocab,
        torch_text._csv_iterator(train_csv_path, ngrams, yield_cls=True),
        include_unk=False)
    logging.info('Creating testing data')
    test_data, test_labels = torch_text._create_data_from_iterator(
        vocab,
        torch_text._csv_iterator(test_csv_path, ngrams, yield_cls=True),
        include_unk=False)
    if len(train_labels ^ test_labels) > 0:
        raise ValueError("Training and test labels don't match")
    return (torch_text.TextClassificationDataset(vocab, train_data,
                                                 train_labels),
            torch_text.TextClassificationDataset(vocab, test_data,
                                                 test_labels))
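
A hedged usage sketch; the directory layout (train.csv and test.csv in one folder) follows the snippet, while the concrete path is only an illustration:

train_dataset, test_dataset = prepare_data('./.data/ag_news_csv', ngrams=2)
# The vocabulary built from the training split can be reused at inference time.
vocab = train_dataset.get_vocab()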
Example #3
    def __init__(self, device, **kwargs):
        self.device = device
        self.batch_size = kwargs.pop('batch_size')
        self.path_to_data = kwargs.pop('path_to_data')
        self.path_to_vectors = kwargs.pop('path_to_vectors')
        self.emb_dim = kwargs.pop('emb_dim')
        self.voc_size = kwargs.pop('voc_size')
        self.min_freq = kwargs.pop('min_freq', 1)
        self.fix_length = kwargs.pop('fix_len', 203)

        self.path_train_data = self.path_to_data + '/yelp_15/yelp.train.txt'
        self.path_val_data = self.path_to_data + '/yelp_15/yelp.valid.txt'
        self.path_test_data = self.path_to_data + '/yelp_15/yelp.test.txt'

        print("build vocab")
        vocab = self.build_vocab_from_textfile(self.path_train_data)

        print("create train split")
        list_train_data, list_train_labels = self.create_data_from_textfile(
            vocab, self.path_train_data, include_unk=True)
        train = text_classification.TextClassificationDataset(
            vocab, list_train_data, list_train_labels)

        print("create val split")
        list_val_data, list_val_labels = self.create_data_from_textfile(
            vocab, self.path_val_data, include_unk=True)
        valid = text_classification.TextClassificationDataset(
            vocab, list_val_data, list_val_labels)

        print("create test split")
        list_test_data, list_test_labels = self.create_data_from_textfile(
            vocab, self.path_test_data, include_unk=True)
        test = text_classification.TextClassificationDataset(
            vocab, list_test_data, list_test_labels)

        print("create data loaders")
        self._train_iter = DataLoader(
            train,
            batch_size=self.batch_size,
            shuffle=True,
            collate_fn=self.generate_batch,
        )

        # Keep evaluation splits in deterministic order; only training is shuffled.
        self._valid_iter = DataLoader(valid,
                                      batch_size=self.batch_size,
                                      shuffle=False,
                                      collate_fn=self.generate_batch)

        self._test_iter = DataLoader(test,
                                     batch_size=self.batch_size,
                                     shuffle=False,
                                     collate_fn=self.generate_batch)

        self.train_vocab = vocab
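
Only __init__ is shown above; assuming the enclosing class (call it YelpCorpus, a hypothetical name) also defines build_vocab_from_textfile, create_data_from_textfile, and generate_batch, it might be driven like this:

corpus = YelpCorpus('cuda',
                    batch_size=32,
                    path_to_data='./data',
                    path_to_vectors='./vectors',
                    emb_dim=300,
                    voc_size=20000)
for batch in corpus._train_iter:
    # generate_batch decides the batch layout, e.g. (labels, padded_text)
    pass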
Example #4
    def _setup_datasets(self,
                        dataset_name,
                        root='./data',
                        ngrams=1,
                        vocab=None,
                        include_unk=True,
                        download=False):
        if download:
            dataset_tar = download_from_url(URLS[dataset_name], root=root)
            extracted_files = extract_archive(dataset_tar)

            for fname in extracted_files:
                if fname.endswith('train.csv'):
                    train_csv_path = fname
                elif fname.endswith('test.csv'):
                    test_csv_path = fname

        else:
            dir_name = os.path.join(root, dataset_name)
            train_csv_path = os.path.join(dir_name, "train.csv")
            test_csv_path = os.path.join(dir_name, "test.csv")

        if vocab is None:
            print('Building Vocab based on {}'.format(train_csv_path))
            vocab = self.build_vocab_from_iterator(
                text_classification._csv_iterator(train_csv_path, ngrams))
        else:
            if not isinstance(vocab, Vocab):
                raise TypeError("Passed vocabulary is not of type Vocab")
        print('Vocab has {} entries'.format(len(vocab)))
        print('Creating training data')
        train_data, train_labels = text_classification._create_data_from_iterator(
            vocab,
            text_classification._csv_iterator(train_csv_path,
                                              ngrams,
                                              yield_cls=True), include_unk)
        print('Creating testing data')
        test_data, test_labels = text_classification._create_data_from_iterator(
            vocab,
            text_classification._csv_iterator(test_csv_path,
                                              ngrams,
                                              yield_cls=True), include_unk)
        if len(train_labels ^ test_labels) > 0:
            raise ValueError("Training and test labels don't match")
        return (text_classification.TextClassificationDataset(
            vocab, train_data, train_labels),
                text_classification.TextClassificationDataset(
                    vocab, test_data, test_labels))
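
Like Example #3, this is a method of a larger class; assuming an instance called trainer (hypothetical) and the torchtext 0.5 URLS registry, a call might look like:

train_ds, test_ds = trainer._setup_datasets('AG_NEWS', ngrams=2, download=True)
print('vocab size:', len(train_ds.get_vocab()))
print('labels:', train_ds.get_labels())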
Example #5
def setup_datasets(dataset_name,
                   root='.data',
                   vocab_size=20000,
                   include_unk=False):
    dataset_tar = download_from_url(URLS[dataset_name], root=root)
    extracted_files = extract_archive(dataset_tar)

    for fname in extracted_files:
        if fname.endswith('train.csv'):
            train_csv_path = fname
        elif fname.endswith('test.csv'):
            test_csv_path = fname

    # SentencePiece loading/numericalization is assumed to live inside
    # _create_data_with_sp_transform (see Example #1 for an explicit version).
    train_data, train_labels = _create_data_with_sp_transform(train_csv_path)
    test_data, test_labels = _create_data_with_sp_transform(test_csv_path)

    if len(train_labels ^ test_labels) > 0:
        raise ValueError("Training and test labels don't match")
    return (text_classification.TextClassificationDataset(
        None, train_data, train_labels),
            text_classification.TextClassificationDataset(
                None, test_data, test_labels))
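
A common next step with these datasets is batching. A sketch, assuming the (label, token_ids) item layout of torchtext 0.5's TextClassificationDataset and the 'AG_NEWS' dataset name as an example:

import torch
from torch.utils.data import DataLoader

def collate_batch(batch):
    # Flatten variable-length sequences and record offsets for nn.EmbeddingBag.
    labels = torch.tensor([label for label, _ in batch])
    texts = [tokens for _, tokens in batch]
    offsets = torch.tensor([0] + [len(t) for t in texts[:-1]]).cumsum(dim=0)
    return labels, torch.cat(texts), offsets

train_dataset, _ = setup_datasets('AG_NEWS')
train_iter = DataLoader(train_dataset, batch_size=16, shuffle=True,
                        collate_fn=collate_batch)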
Example #6
def setup_datasets(dataset_name,
                   root='.data',
                   vocab_size=60000,
                   include_unk=False):

    # NOTE: dataset_name, root, vocab_size and include_unk are ignored here;
    # the CSV paths are hard-coded.
    train_csv_path = './.data/hackson/train.csv'
    test_csv_path = './.data/hackson/test_withsomelabels.csv'

    from torchtext.vocab import build_vocab_from_iterator
    vocab = build_vocab_from_iterator(_csv_iterator(train_csv_path))

    train_data, train_labels = _create_data_with_sp_transform(
        train_csv_path, vocab)
    test_data, test_labels = _create_data_with_sp_transform(
        test_csv_path, vocab)

    if len(train_labels ^ test_labels) > 0:
        raise ValueError("Training and test labels don't match")
    return (text_classification.TextClassificationDataset(
        None, train_data, train_labels),
            text_classification.TextClassificationDataset(
                None, test_data, test_labels))
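
Since the CSV paths are hard-coded, the arguments are effectively ignored; a quick sanity check (the dataset name below is a placeholder):

train_ds, test_ds = setup_datasets('unused')
print('train size:', len(train_ds))
print('labels:', train_ds.get_labels())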