Example #1
def main(_):
    trn_snt_files = [
        # '../datasets/training/as_simplified_training.utf8',
        '../datasets/training/cityu_simplified_training.utf8',
        '../datasets/training/msr_training.utf8',
        '../datasets/training/pku_training.utf8'
    ]
    trn_lbl_files = [splitext(f)[0] + '.bies' for f in trn_snt_files]

    tf.logging.info('Loading training data...')
    trn_snts = read_sentences(trn_snt_files)
    y_trn = read_labels(trn_lbl_files)

    assert len(trn_snts) == len(y_trn), 'Number of sentences and labels must match'

    train_tok = False  # set to True to re-fit and save the tokenizer instead of loading it
    tokenizer = Tokenizer(trn_snts, verbose=True)
    if train_tok:
        tokenizer.fit()
        tokenizer.save()
    else:
        tokenizer.load()

    x_uni_trn, x_bi_trn = process_sentences(trn_snts, tokenizer)

    tf.logging.info('Creating model...')
    model = create_model(tokenizer.vocab_size(), stacked=False)
    model.summary()

    tf.logging.info('Training model...')
    epochs = 10
    batch_size = 32
    steps = len(x_uni_trn) // batch_size  # batches per epoch (used for the tqdm total)
    for epoch in range(epochs):
        print('Epoch', epoch + 1)
        for uni_b, bi_b, lbl_b in tqdm(train_data_generator(
            [x_uni_trn, x_bi_trn, y_trn], batch_size, shuffle=True),
                                       desc='Training Loop',
                                       total=steps):
            try:
                loss, acc = model.train_on_batch([uni_b, bi_b], lbl_b)
                # print('Loss:', loss, 'Acc:', acc)
            except Exception as e:
                print(e)

    model.save('unstacked_combined_model.h5')
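
The batching helper called in the loop above is not part of this excerpt. A minimal sketch of what train_data_generator might look like, assuming the inputs have already been padded to a fixed length and that it simply yields aligned mini-batches from the parallel arrays it is given:

import numpy as np

def train_data_generator(arrays, batch_size, shuffle=True):
    # Illustrative sketch only: yields one aligned batch per step,
    # with one entry per input array (here: unigrams, bigrams, labels).
    n = len(arrays[0])
    order = np.random.permutation(n) if shuffle else np.arange(n)
    for start in range(0, n - batch_size + 1, batch_size):
        batch = order[start:start + batch_size]
        yield tuple(np.asarray(a)[batch] for a in arrays)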
Example #2
def get_training_data(cfg):
    if cfg.input_data_dir():
        filenames = glob.glob(cfg.input_data_dir() + os.sep + "*.csv")
        for f in filenames:
            cfg.logger.info("reading and combining  files:" + f)
        df = pd.concat([pd.read_csv(f) for f in filenames])
    else:
        cfg.logger.info("No training data dir provided")
        sys.exit()

    cfg.logger.info("input data frame:" + str(df.shape))

    df = df.astype(str)

    df_train, df_test = train_test_split(df,
                                         test_size=cfg.test_train_split(),
                                         random_state=cfg.random_seed())

    # Fit the tokenizers on the full data frame so the vocabularies cover both splits
    x = df[cfg.input_col()].tolist()
    y = df[cfg.output_col()].tolist()

    input_pp = Tokenizer(cfg.num_input_tokens())
    output_pp = Tokenizer(cfg.num_output_tokens())

    input_pp.fit(x)
    output_pp.fit(y)

    x_train = df_train[cfg.input_col()].tolist()
    y_train = df_train[cfg.output_col()].tolist()

    input_vecs = input_pp.transform(x_train, cfg.input_seq_len(),
                                    padding=True, post=False,
                                    append_indicators=False)

    output_vecs = output_pp.transform(y_train, cfg.output_seq_len(),
                                      padding=True, post=True,
                                      append_indicators=True)

    return input_vecs, output_vecs
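
get_training_data expects a cfg object exposing the accessors it calls. A minimal stand-in, with purely illustrative values and hypothetical column names, might look like this:

import logging

class DemoConfig:
    # Illustrative stand-in for the real config; only the accessors used by
    # get_training_data() are provided, and all values are placeholders.
    def __init__(self):
        self.logger = logging.getLogger("training")
    def input_data_dir(self): return "./data"        # directory containing *.csv files
    def test_train_split(self): return 0.2           # held-out fraction
    def random_seed(self): return 42
    def input_col(self): return "input_text"         # hypothetical column name
    def output_col(self): return "output_text"       # hypothetical column name
    def num_input_tokens(self): return 20000
    def num_output_tokens(self): return 20000
    def input_seq_len(self): return 50
    def output_seq_len(self): return 60

input_vecs, output_vecs = get_training_data(DemoConfig())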
Example #3
class AttentionTFIDFClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self,
                 hiddens=300,
                 mindf=2,
                 lan='english',
                 stopwords='nltk',
                 k=512,
                 max_drop=.85,
                 batch_size=64,
                 lr=5e-3,
                 weight_decay=5e-3,
                 nepochs=1000,
                 patience=10,
                 factor=.95,
                 vocab_max_size=300000,
                 n_jobs=cpu_count(),
                 _device=torch.device('cuda:0'),
                 _verbose=False):
        super(AttentionTFIDFClassifier, self).__init__()

        self._model = None
        self._tokenizer = None
        self.nepochs = int(nepochs)
        self.hiddens = int(hiddens)
        self.mindf = int(mindf)
        self.lan = lan
        self.stopwords = stopwords
        self.k = int(k)
        self.max_drop = max_drop
        self.vocab_max_size = vocab_max_size
        self._verbose = _verbose
        self._device = _device

        self.n_jobs = int(n_jobs)

        self.lr = lr
        self.weight_decay = weight_decay
        self.patience = int(patience)
        self.factor = factor
        self.batch_size = int(batch_size)

        def collate_train(param):
            X, y = zip(*param)
            y = self._tokenizer.le.transform(y)
            doc_tids, TFs, DFs = self._tokenizer.transform(X, verbose=False)

            doc_tids = pad_sequence(list(map(torch.LongTensor, doc_tids)),
                                    batch_first=True,
                                    padding_value=0)

            # Bucket raw frequencies on a log2 scale so they can be embedded
            # as discrete ids (maxF in fit() is derived with the same transform).
            TFs = pad_sequence(list(map(torch.tensor, TFs)),
                               batch_first=True,
                               padding_value=0)
            TFs = torch.log2(TFs + 1).round().long()

            DFs = pad_sequence(list(map(torch.tensor, DFs)),
                               batch_first=True,
                               padding_value=0)
            DFs = torch.log2(DFs + 1).round().long()

            return doc_tids, TFs, DFs, torch.LongTensor(y)

        def collate_predict(X):
            doc_tids, TFs, DFs = self._tokenizer.transform(X, verbose=False)

            doc_tids = pad_sequence(list(map(torch.LongTensor, doc_tids)),
                                    batch_first=True,
                                    padding_value=0)

            TFs = pad_sequence(list(map(torch.tensor, TFs)),
                               batch_first=True,
                               padding_value=0)
            TFs = torch.log2(TFs + 1).round().long()

            DFs = pad_sequence(list(map(torch.tensor, DFs)),
                               batch_first=True,
                               padding_value=0)
            DFs = torch.log2(DFs + 1).round().long()

            return doc_tids, TFs, DFs

        self.collate_train = collate_train
        self.collate_predict = collate_predict

    def fit(self, X_train, y_train, X_val=None, y_val=None):
        if X_val is None or y_val is None:
            # Validation data drives early stopping below, so it cannot be omitted.
            raise ValueError("fit() requires X_val and y_val for validation-based early stopping")
        self._tokenizer = Tokenizer(mindf=self.mindf,
                                    lan=self.lan,
                                    stopwordsSet=self.stopwords,
                                    model='sample',
                                    k=self.k,
                                    verbose=self._verbose)
        self._tokenizer.fit(X_train, y_train)

        self.maxF = int(round(np.log2(self._tokenizer.maxF + 1)))

        self._model = AttentionTFIDF(vocab_size=self._tokenizer.vocab_size,
                                     hiddens=self.hiddens,
                                     nclass=self._tokenizer.n_class,
                                     maxF=self.maxF,
                                     drop=self.max_drop).to(self._device)

        optimizer = optim.AdamW(self._model.parameters(),
                                lr=self.lr,
                                weight_decay=self.weight_decay)
        loss_func_cel = nn.CrossEntropyLoss().to(self._device)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer,
            mode='min',
            factor=self.factor,
            patience=3,
            verbose=self._verbose)

        best = 99999.
        best_acc = 0.
        counter = 1
        dl_val = DataLoader(list(zip(X_val, y_val)),
                            batch_size=self.batch_size,
                            shuffle=False,
                            collate_fn=self.collate_train,
                            num_workers=self.n_jobs)

        for e in tqdm(range(self.nepochs),
                      total=self.nepochs,
                      disable=not self._verbose):
            dl_train = DataLoader(list(zip(X_train, y_train)),
                                  batch_size=self.batch_size,
                                  shuffle=True,
                                  collate_fn=self.collate_train,
                                  num_workers=self.n_jobs)
            loss_train = 0.
            with tqdm(total=len(y_train) + len(y_val),
                      smoothing=0.,
                      desc=f"ACC_val: {best_acc:.2} Epoch {e+1}",
                      disable=not self._verbose) as pbar:
                total = 0
                correct = 0
                self._model.train()
                self._tokenizer.model = 'sample'
                for i, (doc_tids, TFs, DFs, y) in enumerate(dl_train):

                    doc_tids = doc_tids.to(self._device)
                    TFs = TFs.to(self._device)
                    DFs = DFs.to(self._device)
                    y = y.to(self._device)

                    pred_docs, _, _ = self._model(doc_tids, TFs, DFs)
                    # CrossEntropyLoss expects raw logits, so no softmax here
                    loss = loss_func_cel(pred_docs, y)

                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

                    loss_train += loss.item()
                    total += len(y)
                    y_pred = pred_docs.argmax(axis=1)
                    correct += (y_pred == y).sum().item()
                    self._model.drop_ = (correct / total) * self.max_drop

                    pbar.update(len(y))
                    del doc_tids, TFs
                    del DFs, y, pred_docs
                    del loss, y_pred
                loss_train = loss_train / (i + 1)
                total = 0
                correct = 0
                self._model.eval()
                self._tokenizer.model = 'topk'
                with torch.no_grad():
                    loss_val = 0.
                    for i, (doc_tids, TFs, DFs, y) in enumerate(dl_val):
                        doc_tids = doc_tids.to(self._device)
                        TFs = TFs.to(self._device)
                        DFs = DFs.to(self._device)
                        y = y.to(self._device)

                        pred_docs, _, _ = self._model(doc_tids, TFs, DFs)
                        loss = loss_func_cel(pred_docs, y)

                        loss_val += loss.item()
                        total += len(y)
                        y_pred = pred_docs.argmax(axis=1)
                        correct += (y_pred == y).sum().item()
                        pbar.update(len(y))
                        del doc_tids, TFs, DFs, y
                        del pred_docs, loss
                    loss_val = (loss_val / (i + 1))
                    scheduler.step(loss_val)

                    if best - loss_val > 0.0001:
                        best = loss_val
                        counter = 1
                        best_acc = correct / total
                        best_model = copy.deepcopy(self._model).to('cpu')
                    elif counter > self.patience:
                        break
                    else:
                        counter += 1

        self._model = best_model.to(self._device)

        self._loss = best
        self._acc = best_acc

        return self

    def predict(self, X):
        if self._model is None or self._tokenizer is None:
            raise RuntimeError("Model has not been fitted yet; call fit() before predict().")
        self._model.eval()
        self._tokenizer.model = 'topk'
        dataloader = DataLoader(X,
                                batch_size=self.batch_size,
                                shuffle=False,
                                collate_fn=self.collate_predict,
                                num_workers=self.n_jobs)
        result = []
        with torch.no_grad():
            for i, (doc_tids, TFs, DFs) in enumerate(dataloader):
                doc_tids = doc_tids.to(self._device)
                TFs = TFs.to(self._device)
                DFs = DFs.to(self._device)

                pred_docs, _, _ = self._model(doc_tids, TFs, DFs)
                # softmax is monotonic, so argmax over the raw scores is equivalent
                pred_docs = pred_docs.argmax(dim=1).cpu().numpy()
                result.extend(list(pred_docs))
        return self._tokenizer.le.inverse_transform(np.array(result))

    def to(self, device):
        self._device = device
        if self._model is not None:
            self._model.to(self._device)
        return self
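
A minimal usage sketch for the classifier above, on toy data and on CPU; the validation split is passed explicitly because fit() relies on it for early stopping, and mindf=1 keeps the tiny vocabulary from being filtered away:

from sklearn.model_selection import train_test_split

texts = ["good movie", "terrible plot", "great acting", "boring and slow"]
labels = ["pos", "neg", "pos", "neg"]
X_tr, X_val, y_tr, y_val = train_test_split(texts, labels, test_size=0.5, random_state=0)

clf = AttentionTFIDFClassifier(hiddens=64, mindf=1, batch_size=2, nepochs=5,
                               _device=torch.device('cpu'), _verbose=True)
clf.fit(X_tr, y_tr, X_val=X_val, y_val=y_val)
print(clf.predict(["awful film", "wonderful story"]))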
Example #4
# Character-level embedding only
# 92%+

# Prepare the data
X, y, classes = load_THUCNews_title_label()
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    train_size=0.8,
                                                    random_state=7322)

num_classes = len(classes)
# Convert text to character ids
print("tokenize...")
tokenizer = Tokenizer(mintf=32, cutword=False)
tokenizer.fit(X_train)

# maxlen = find_best_maxlen(X_train, mode="max")
maxlen = 48


def create_dataset(X, y, maxlen):
    X = tokenizer.transform(X)
    X = sequence.pad_sequences(X,
                               maxlen=maxlen,
                               dtype="int32",
                               padding="post",
                               truncating="post",
                               value=0.0)
    y = tf.keras.utils.to_categorical(y)
    return X, y
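
The helper above can then be applied to both splits (assuming the labels are already integer class indices, which is what to_categorical expects):

X_train_pad, y_train_cat = create_dataset(X_train, y_train, maxlen)
X_test_pad, y_test_cat = create_dataset(X_test, y_test, maxlen)
print(X_train_pad.shape, y_train_cat.shape)  # e.g. (n_train, 48) and (n_train, num_classes)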