Example No. 1
def run_training(model: str) -> None:
    """
    Train the machine learning model and serialize it to disk.
    """
    # read train and test data
    df_train = pd.read_csv(config.ORIGINAL_TRAIN)
    df_test = pd.read_csv(config.TEST_DATA)

    # relabel mislabeled samples
    df_train = data_clean.relabel_target(df_train)

    # shuffle data
    df_train = df_train.sample(frac=1, random_state=42).reset_index(drop=True)

    # clean the text
    df_train[config.CLEANED_TEXT] = df_train[config.TEXT].apply(pp.clean_tweet)
    df_test[config.CLEANED_TEXT] = df_test[config.TEXT].apply(pp.clean_tweet)

    # save the modified train and test data
    df_train.to_csv(config.MODIFIED_TRAIN, index=False)
    df_test.to_csv(config.MODIFIED_TEST, index=False)
    del df_test

    # convert text to numerical representation
    tokenizer = Tokenizer(oov_token="<unk>")
    tokenizer.fit_on_texts(df_train[config.CLEANED_TEXT])

    # path to save model
    model_path = f"{config.MODEL_DIR}/PRETRAIN_WORD2VEC_{model}/"

    # create the model folder if it does not exist
    if not os.path.exists(model_path):
        os.makedirs(model_path)

    # saving tokenizer
    with open(f'{model_path}tokenizer.pkl', 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

    # pad the sequences
    X_padded = pad_sequences(tokenizer.texts_to_sequences(
        df_train[config.CLEANED_TEXT].values),
                             maxlen=config.MAXLEN)

    # get the pretrained word embeddings and prepare embedding layer
    embedding_matrix = f.get_word2vec_enc(tokenizer.word_index.items(),
                                          config.PRETRAINED_WORD2VEC)
    embedding_layer = Embedding(input_dim=config.VOCAB_SIZE,
                                output_dim=config.EMBED_SIZE,
                                weights=[embedding_matrix],
                                input_length=config.MAXLEN,
                                trainable=False)

    # target values
    y = df_train[config.RELABELED_TARGET].values

    # train a single model
    clf = my_LSTM(embedding_layer)
    clf.fit(X_padded, y, epochs=config.N_EPOCHS, verbose=1)

    # persist the model
    clf.save(f"{model_path}/{model}_Word2Vec.h5")
def main(args):
    print(args)
    startime = time.time()
    os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'

    # Set hyper-parameters.
    batch_size = 128
    epochs = 100
    maxlen = 300
    model_path = 'models/model_{}.h5'
    num_words = 40000
    num_label = 2

    # Data loading.
    print(return_time(startime), "1. Loading data ...")
    x, y = load_dataset('data/amazon_reviews_multilingual_JP_v1_00.tsv')

    # pre-processing.
    print(return_time(startime), "2. Preprocessing dataset ...")
    x = preprocess_dataset(x)
    x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                        test_size=0.2,
                                                        random_state=42)
    vocab = build_vocabulary(x_train, num_words)
    x_train = vocab.texts_to_sequences(x_train)
    x_test = vocab.texts_to_sequences(x_test)
    x_train = pad_sequences(x_train, maxlen=maxlen, truncating='post')
    x_test = pad_sequences(x_test, maxlen=maxlen, truncating='post')

    # Preparing word embedding.
    if args.loadwv:
        print(return_time(startime), "3. Loading word embedding ...")
        wv_path = 'data/wv_{0}_{1}.npy'.format(maxlen, num_words)
        if os.path.exists(wv_path):
            wv = np.load(wv_path)
            print(return_time(startime), "Loaded word embedding successfully!")
        else:
            print(return_time(startime), "Word embedding file doesn't exist")
            exit()

    else:
        print(return_time(startime), "3. Preparing word embedding ...")
        wv = load_fasttext('data/cc.ja.300.vec.gz')
        wv = filter_embeddings(wv, vocab.word_index, num_words)
        # Saving word embedding.
        if args.savewv:
            wv_path = 'data/wv_{0}_{1}.npy'.format(maxlen, num_words)
            np.save(wv_path, wv)
            print(return_time(startime), "Saved word embedding successfully!", wv_path)

    # Build models.
    models = [
        RNNModel(num_words, num_label, embeddings=None).build(),
        LSTMModel(num_words, num_label, embeddings=None).build(),
        CNNModel(num_words, num_label, embeddings=None).build(),
        RNNModel(num_words, num_label, embeddings=wv).build(),
        LSTMModel(num_words, num_label, embeddings=wv).build(),
        CNNModel(num_words, num_label, embeddings=wv).build(),
        CNNModel(num_words, num_label, embeddings=wv, trainable=False).build()
    ]

    model_names = [
        "RNN-None",
        "LSTM-None",
        "CNN-None",
        "RNN-wv",
        "LSTM-wv",
        "CNN-wv",
        "CNN-wv-notrain"
    ]

    print(return_time(startime), "4. Start training ...")
    for i, model in enumerate(models):
        print("***********************************")
        print(return_time(startime), "Model:", model_names[i])

        model.compile(optimizer='adam',
                      loss='sparse_categorical_crossentropy',
                      metrics=['acc'])

        # Preparing callbacks.
        callbacks = [
            EarlyStopping(patience=3),
            ModelCheckpoint(model_path.format(model_names[i]), save_best_only=True)
        ]

        # Train the model.
        model.fit(x=x_train,
                  y=y_train,
                  batch_size=batch_size,
                  epochs=epochs,
                  validation_split=0.2,
                  callbacks=callbacks,
                  shuffle=True)

        # Inference.
        model = load_model(model_path.format(model_names[i]))
        api = InferenceAPI(model, vocab, preprocess_dataset)
        y_pred = api.predict_from_sequences(x_test)
        print('precision: {:.4f}'.format(precision_score(y_test, y_pred, average='binary')))
        print('recall   : {:.4f}'.format(recall_score(y_test, y_pred, average='binary')))
        print('f1       : {:.4f}'.format(f1_score(y_test, y_pred, average='binary')))
def main(args):
    training_data = pd.read_csv(
        os.path.join(args.data_dir, "Keras_latest_training_data.csv"))

    # Version control
    model_trained_time = time.strftime("%Y%m%d-%H%M")
    model_version = 'model_' + model_trained_time

    # Intent training begins
    traindata = training_data[['text', 'intent']]
    traindata = traindata[pd.notna(traindata['text'].values)]

    # Randomly sample 5% of the data for validation
    validation_splitrate = 0.05
    traindata_validation = traindata.sample(frac=validation_splitrate)
    traindata_train = traindata.loc[~traindata.index.isin(traindata_validation.
                                                          index)]
    # reorder the training data: first 95% train, last 5% validation
    traindata = pd.concat([traindata_train, traindata_validation])

    # tokenizer
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(traindata['text'].values)
    X = tokenizer.texts_to_sequences(traindata['text'].values)
    X = pad_sequences(X, maxlen=50)

    # Y labels
    Ylabel = np.array(sorted(set(traindata['intent'])))
    Y = pd.get_dummies(traindata['intent'], prefix='', prefix_sep='')
    Y = Y.T.reindex(Ylabel).T.fillna(0).values

    intent_tokens = {"X": X, "Y": Y, "Ylabel": Ylabel, "tokenizer": tokenizer}

    with open(
            '{}/{}_intent_tokens.pickle'.format(args.model_dir, model_version),
            'wb') as handle:
        pickle.dump(intent_tokens, handle, protocol=pickle.HIGHEST_PROTOCOL)

    ## parameters
    max_features = np.max(X)
    Ndense = len(set(traindata['intent']))

    # LSTM train
    model = Sequential()
    model.add(
        Embedding(max_features + 1, args.embed_dim, input_length=X.shape[1]))
    model.add(SpatialDropout1D(args.sdropoutrate))
    model.add(
        LSTM(args.lstm_out,
             dropout=args.dropoutrate,
             recurrent_dropout=args.rdropoutrate))
    model.add(Dense(Ndense, activation='softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    intent_history = model.fit(X,
                               Y,
                               epochs=5,
                               validation_split=validation_splitrate,
                               batch_size=args.batch_size,
                               verbose=2)

    # Saving intent model
    intent_history.model.save('{}/LSTM_history.h5'.format(args.model_dir),
                              save_format='h5')
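
    # --- Hedged inference sketch (added for illustration, not part of the original
    # code): reload the pickled tokens and the saved LSTM and classify a new
    # utterance; the sample text is a placeholder.
    from tensorflow.keras.models import load_model

    with open('{}/{}_intent_tokens.pickle'.format(args.model_dir, model_version),
              'rb') as handle:
        saved_tokens = pickle.load(handle)
    saved_model = load_model('{}/LSTM_history.h5'.format(args.model_dir))
    new_seq = saved_tokens['tokenizer'].texts_to_sequences(['hello, I need some help'])
    new_seq = pad_sequences(new_seq, maxlen=50)
    predicted_intent = saved_tokens['Ylabel'][np.argmax(saved_model.predict(new_seq), axis=1)]
    print(predicted_intent[0])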
Example No. 4
# convert from series to a list
text = df['twitts'].tolist()

y = df['sentiment']

token = Tokenizer()
token.fit_on_texts(text)

vocab_size = len(token.word_index) + 1

encoded_text = token.texts_to_sequences(text)

# Pad the sequences
max_len = max([len(s.split()) for s in text])

X = pad_sequences(encoded_text, maxlen=max_len, padding='post')

# Work with the 200-dimensional GloVe (Twitter) vectors.
# The embedding layer will represent each word as a 200-dimensional vector
# (an embedding-layer sketch follows the loading loop below).

glove_vectors = dict()
file = open('../../../../Data/glove.twitter.27B.200d.txt', encoding='utf-8')

# Parse the GloVe file into a word -> vector dictionary
for line in file:
    value = line.split()
    word = value[0]
    vector = np.asarray(value[1:], dtype='float32')  # parse vector components as floats
    glove_vectors[word] = vector
file.close()
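
# --- Hedged sketch (added for illustration, not part of the original source):
# build the 200-dimensional embedding matrix from glove_vectors and wrap it in
# the (frozen) Embedding layer mentioned above. Embedding comes from Keras;
# vocab_size, max_len and token are defined earlier in this example.
from tensorflow.keras.layers import Embedding

embedding_dim = 200
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, idx in token.word_index.items():
    glove_vector = glove_vectors.get(word)
    if glove_vector is not None:
        embedding_matrix[idx] = glove_vector

embedding_layer = Embedding(input_dim=vocab_size,
                            output_dim=embedding_dim,
                            weights=[embedding_matrix],
                            input_length=max_len,
                            trainable=False)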
Example No. 5
def sent_anly_prediction():
    if request.method == 'POST':
        text = request.form['text']
        model = keras.models.load_model('sarcasm.h5')

        with open("sarcasm.json", 'r') as f:
            datastore = json.load(f)
            sentences = []
            labels = []

        for item in datastore:
            sentences.append(item['headline'])
            labels.append(item['is_sarcastic'])

        vocab_size = 10000
        embedding_dim = 16
        max_length = 100
        trunc_type = 'post'
        padding_type = 'post'
        oov_tok = "<OOV>"
        training_size = 20000

        training_sentences = sentences[0:training_size]
        testing_sentences = sentences[training_size:]
        training_labels = labels[0:training_size]
        testing_labels = labels[training_size:]

        tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
        tokenizer.fit_on_texts(training_sentences)

        word_index = tokenizer.word_index

        training_sequences = tokenizer.texts_to_sequences(training_sentences)
        training_padded = pad_sequences(training_sequences,
                                        maxlen=max_length,
                                        padding=padding_type,
                                        truncating=trunc_type)

        testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
        testing_padded = pad_sequences(testing_sequences,
                                       maxlen=max_length,
                                       padding=padding_type,
                                       truncating=trunc_type)

        sentence = [text]

        sequences = tokenizer.texts_to_sequences(sentence)
        padded = pad_sequences(sequences,
                               maxlen=max_length,
                               padding=padding_type,
                               truncating=trunc_type)
        predict = model.predict(padded)
        classes = model.predict_classes(padded)

        if classes[0] == 0:
            senti = "not sarcastic"

        else:
            senti = "sarcastic"

    return render_template('setiment.html',
                           text=text,
                           sentiment=senti,
                           probability=predict[0])
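
# --- Hedged wiring sketch (added for illustration, not part of the original
# source): the minimal Flask setup this view handler assumes. The route path
# and the module-level app object are assumptions.
from flask import Flask, request, render_template

app = Flask(__name__)
app.add_url_rule('/predict', 'sent_anly_prediction',
                 sent_anly_prediction, methods=['GET', 'POST'])

if __name__ == '__main__':
    app.run(debug=True)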
Example No. 6
testing_labels_final = np.array(testing_labels)

vocab_size = 10000
embedding_dim = 16
max_length = 120
trunc_type = 'post'
oov_tok = "<OOV>"

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(training_sentences)
padded = pad_sequences(sequences, maxlen=max_length, truncating=trunc_type)

testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length)

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size,
                              embedding_dim,
                              input_length=max_length),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(6, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
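
# --- Hedged usage sketch (added for illustration, not part of the original
# source): train and evaluate on the padded sequences built above. It assumes
# training_labels exists alongside testing_labels; the epoch count is arbitrary.
training_labels_final = np.array(training_labels)

model.fit(padded,
          training_labels_final,
          epochs=10,
          validation_data=(testing_padded, testing_labels_final))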
    def attack(self, seq, target, l, max_change=0.5):

        seq = seq.cpu().detach().numpy().squeeze()  # convert the id tensor to a flat numpy array
        seq_orig, seq_orig_string, l_orig = self.orig_sentence(seq)

        # print(seq_orig)
        # seq_adv = seq.copy()
        # seq_len = np.sum(np.sign(seq))
        l = l.cpu()
        # print(self.tokenizer.convert_ids_to_tokens(seq.tolist()))
        # To calculate the sampling probability
        tmp = [
            glove_utils.pick_most_similar_words(self.compute_dist(seq_orig[i]),
                                                50, 0.5) for i in range(l_orig)
        ]

        # tmp = [glove_utils.pick_most_similar_words(self.compute_dist(self.dataset.dict[self.tokenizer.convert_ids_to_tokens([seq[i]])[0]]), ret_count = 50, threshold = 0.5) if self.tokenizer.convert_ids_to_tokens([seq[i]])[0] in self.dataset.dict else ([], []) for i in range(l)]
        neighbour_list = [t[0] for t in tmp]
        neighbour_dist = [t[1] for t in tmp]
        neighbour_len = [len(i) for i in neighbour_list]
        for i in range(l_orig):
            if (seq_orig[i] < 27):
                # To prevent replacement of words like 'the', 'a', 'of', etc.
                neighbour_len[i] = 0
        prob_select = neighbour_len / np.sum(neighbour_len)
        # print(prob_select)
        # tmp = [glove_utils.pick_most_similar_words(
        #     self.compute_dist(self.dataset.dict[self.tokenizer.convert_ids_to_tokens([seq[i]])[0]]), self.top_n1, 0.5
        # ) if self.tokenizer.convert_ids_to_tokens([seq[i]])[0] in self.dataset.dict else ([], []) for i in range(l)]
        tmp = [
            glove_utils.pick_most_similar_words(self.compute_dist(seq_orig[i]),
                                                self.top_n1, 0.5)
            for i in range(l_orig)
        ]

        neighbour_list = [t[0] for t in tmp]
        neighbour_dist = [t[1] for t in tmp]
        # print('synonyms')
        # print(tmp)
        # print([[self.dataset.inv_dict[j] for j in i if j in self.dataset.inv_dict] for i in neighbour_list])
        seq_adv = seq_orig_string.copy()
        # pop = [self.perturb(seq_adv, seq, seq_orig, l_orig, neighbour_list, neighbour_dist, prob_select, seq_len, target, l) for _ in range(self.pop_size)]
        pop = [
            self.perturb(seq_adv, seq_orig_string, l_orig, neighbour_list,
                         neighbour_dist, prob_select, target, l)
            for _ in range(self.pop_size)
        ]

        l_tensor = torch.ones([len(pop)]).type(torch.LongTensor)
        pop_np = [[self.tokenizer.cls_token_id] +
                  self.tokenizer.convert_tokens_to_ids(
                      self.tokenizer.tokenize(' '.join(pop[0]).strip())) +
                  [self.tokenizer.sep_token_id]]
        l_tensor[0] = len(pop_np[0])
        # print(l_tensor)
        for p in range(1, len(pop)):
            token_ids = [
                self.tokenizer.cls_token_id
            ] + self.tokenizer.convert_tokens_to_ids(
                self.tokenizer.tokenize(' '.join(
                    pop[p]).strip())) + [self.tokenizer.sep_token_id]
            pop_np.append(token_ids)
            l_tensor[p] = len(token_ids)
        l_max = torch.max(l_tensor)

        # print(l_max, l_tensor, len(pop_np))
        pop_np = pad_sequences(pop_np, maxlen=l_max.item(), padding='post')
        pop_tensor = torch.tensor(pop_np)

        # print(torch.tensor(pop_np))
        sort = torch.sort(l_tensor, descending=True)[1]
        # print(len(sort), sort)
        pop_tensor = pop_tensor[sort]
        l_tensor = l_tensor[sort]
        pop = np.array(pop)[sort].tolist()
        # print(l_tensor)
        for i in range(self.max_iters):

            pop_tensor = pop_tensor.type(torch.LongTensor).to(self.device)
            l_tensor = l_tensor.to(self.device)
            # print('pop_tensor:',pop_tensor)
            # print(pop_tensor.shape)
            # print(l_tensor)
            self.batch_model.eval()
            with torch.no_grad():
                pop_preds = self.batch_model.pred(
                    pop_tensor, l_tensor, False)[1].cpu().detach().numpy()
            # print(sort)
            # print(pop_preds)
            # print(pop_tensor)
            pop_scores = pop_preds[:, target]
            print('\t\t', i, ' -- ', np.max(pop_scores))
            pop_ranks = np.argsort(pop_scores)[::-1]
            # print(l_tensor)
            # print(pop_ranks)
            top_attack = pop_ranks[0]
            # print(top_attack)
            ampl = pop_scores / self.temp
            # print(ampl)
            covariance = np.cov(ampl)
            # print('pop:', pop)
            print(covariance)
            if covariance > 10e-6:
                mean = np.mean(ampl)
                # print(mean)
                ampl_update = (ampl - mean) / np.sqrt(covariance + 0.001)
                # print(ampl_update)
                logits = np.exp(ampl_update)
            else:

                if np.max(ampl) > 100:
                    ampl = ampl / (np.max(ampl) / 5)
                logits = np.exp(ampl)
            # logits = np.exp(ampl)
            select_probs = logits / np.sum(logits)
            # print('prob:', select_probs)
            # print([self.tokenizer.convert_ids_to_tokens([i]) for i in pop_np[top_attack]])
            if np.argmax(pop_preds[top_attack, :]) == target:
                print('Success and score: {:.4f}'.format(
                    pop_scores[top_attack]))

                print(seq_orig_string)
                print(pop[top_attack])

                return pop[top_attack], seq_orig_string

            # for i in pop:
            #   print(i)
            #   print('\t')

            elite = [pop[top_attack]]  # elite
            # print(elite)

            # print(select_probs.shape)
            parent1_idx = np.random.choice(self.pop_size,
                                           size=self.pop_size - 1,
                                           p=select_probs)
            parent2_idx = np.random.choice(self.pop_size,
                                           size=self.pop_size - 1,
                                           p=select_probs)

            childs = [
                self.crossover(pop[parent1_idx[i]], pop[parent2_idx[i]])
                for i in range(self.pop_size - 1)
            ]
            childs = [
                self.perturb(x, seq_orig_string, l_orig, neighbour_list,
                             neighbour_dist, prob_select, target, l)
                for x in childs
            ]
            # print(childs)
            pop = elite + childs
            # print(len(pop))
            # print('pop:', pop)
            l_tensor = torch.ones([len(pop)]).type(torch.LongTensor)
            pop_np = [[self.tokenizer.cls_token_id] +
                      self.tokenizer.convert_tokens_to_ids(
                          self.tokenizer.tokenize(' '.join(pop[0]).strip())) +
                      [self.tokenizer.sep_token_id]]
            l_tensor[0] = len(pop_np[0])
            # print(pop_np)
            # print(l_tensor)
            # print(pop_np)
            for p in range(1, len(pop)):
                token_ids = [
                    self.tokenizer.cls_token_id
                ] + self.tokenizer.convert_tokens_to_ids(
                    self.tokenizer.tokenize(' '.join(
                        pop[p]).strip())) + [self.tokenizer.sep_token_id]
                pop_np.append(token_ids)
                l_tensor[p] = len(token_ids)

            # print(l_tensor)
            # print(pop_np)
            l_max = torch.max(l_tensor)
            pop_np = pad_sequences(pop_np, maxlen=l_max.item(), padding='post')
            pop_tensor = torch.tensor(pop_np)

            # print(torch.tensor(pop_np))
            sort = torch.sort(l_tensor, descending=True)[1]
            # print(len(sort), sort)
            pop_tensor = pop_tensor[sort]
            l_tensor = l_tensor[sort]
            pop = np.array(pop)[sort].tolist()
            # print(np.array(pop).shape)

            # pop_np = np.expand_dims(pop[0], 0)
            # for p in pop[1:]:
            #   pop_np = np.concatenate((pop_np, np.expand_dims(p, 0)),0)

        return None, seq_orig
Example No. 8
def create_ml_1m_dataset(file,
                         trans_score=2,
                         embed_dim=8,
                         maxlen=40,
                         test_neg_num=100):
    """
    :param file: A string. dataset path.
    :param trans_score: A scalar. Ratings at or above this value are kept as positives; lower ratings are dropped.
    :param embed_dim: A scalar. Latent factor dimension.
    :param maxlen: A scalar. Maximum length of a user's history sequence.
    :param test_neg_num: A scalar. The number of test negative samples
    :return: item_feat_col, train, val, test
    """
    print('==========Data Preprocess Start=============')
    data_df = pd.read_csv(file,
                          sep="::",
                          engine='python',
                          names=['user_id', 'item_id', 'label', 'Timestamp'])
    # filtering
    data_df['item_count'] = data_df.groupby('item_id')['item_id'].transform(
        'count')
    data_df = data_df[data_df.item_count >= 5]
    # trans score
    data_df = data_df[data_df.label >= trans_score]
    # sort
    data_df = data_df.sort_values(by=['user_id', 'Timestamp'])
    # split dataset and negative sampling
    print('============Negative Sampling===============')
    train_data, val_data, test_data = defaultdict(list), defaultdict(
        list), defaultdict(list)
    item_id_max = data_df['item_id'].max()
    for user_id, df in tqdm(data_df[['user_id',
                                     'item_id']].groupby('user_id')):
        pos_list = df['item_id'].tolist()

        def gen_neg():
            neg = pos_list[0]
            while neg in set(pos_list):
                neg = random.randint(1, item_id_max)
            return neg

        neg_list = [gen_neg() for i in range(len(pos_list) + test_neg_num)]
        for i in range(1, len(pos_list)):
            hist_i = pos_list[:i]
            if i == len(pos_list) - 1:
                test_data['hist'].append(hist_i)
                test_data['pos_id'].append(pos_list[i])
                test_data['neg_id'].append(neg_list[i:])
            elif i == len(pos_list) - 2:
                val_data['hist'].append(hist_i)
                val_data['pos_id'].append(pos_list[i])
                val_data['neg_id'].append(neg_list[i])
            else:
                train_data['hist'].append(hist_i)
                train_data['pos_id'].append(pos_list[i])
                train_data['neg_id'].append(neg_list[i])
    # item feature columns
    user_num, item_num = data_df['user_id'].max() + 1, data_df['item_id'].max(
    ) + 1
    item_feat_col = sparseFeature('item_id', item_num, embed_dim)
    # shuffle the three aligned lists with one shared permutation
    for data in (train_data, val_data):
        perm = list(range(len(data['pos_id'])))
        random.shuffle(perm)
        for key in data:
            data[key] = [data[key][i] for i in perm]
    # padding
    print('==================Padding===================')
    train = [
        pad_sequences(train_data['hist'], maxlen=maxlen),
        np.array(train_data['pos_id']),
        np.array(train_data['neg_id'])
    ]
    val = [
        pad_sequences(val_data['hist'], maxlen=maxlen),
        np.array(val_data['pos_id']),
        np.array(val_data['neg_id'])
    ]
    test = [
        pad_sequences(test_data['hist'], maxlen=maxlen),
        np.array(test_data['pos_id']),
        np.array(test_data['neg_id'])
    ]
    print('============Data Preprocess End=============')
    return item_feat_col, train, val, test


# create_ml_1m_dataset('../dataset/ml-1m/ratings.dat')
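
# --- Hedged usage sketch (added for illustration): unpack the return values of
# create_ml_1m_dataset(); the ratings path follows the commented call above.
item_feat_col, train, val, test = create_ml_1m_dataset('../dataset/ml-1m/ratings.dat')
train_hist, train_pos, train_neg = train
print('train histories:', train_hist.shape)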
Example No. 9
    data, label = read_files(args.data_dir)
    data = list(zip(data, label))
    random.shuffle(data)

    train_data, test_data = train_test_split(data)

    data_train = encode_sentences([content[0] for content in train_data],
                                  word_to_id)
    label_train = to_categorical(
        encode_cate([content[1] for content in train_data], cat_to_id))
    data_test = encode_sentences([content[0] for content in test_data],
                                 word_to_id)
    label_test = to_categorical(
        encode_cate([content[1] for content in test_data], cat_to_id))

    data_train = sequence.pad_sequences(data_train, maxlen=args.max_len)
    data_test = sequence.pad_sequences(data_test, maxlen=args.max_len)

    model = TextRNN(args.max_len, args.max_features,
                    args.embedding_size).build_model()
    model.compile('adam', 'categorical_crossentropy', metrics=['accuracy'])

    logger.info('Start training...')
    callbacks = [
        ModelCheckpoint('./model.h5', verbose=1),
        EarlyStopping(monitor='val_accuracy', patience=2, mode='max')
    ]

    history = model.fit(data_train,
                        label_train,
                        batch_size=args.batch_size,
for element in tokens_content_english:
    tmp_counter += 1
    if max_size_english < len(element):
        max_size_english = len(element)

tmp_counter = 0
max_size_german = 0
for element in tokens_content_german:
    tmp_counter += 1
    if max_size_german < len(element):
        max_size_german = len(element)

print(max_size_english)
print(max_size_german)

pad_english_sentence = pad_sequences(tokens_content_english, max_size_english)
pad_german_sentence = pad_sequences(tokens_content_german, max_size_german)
print("Padded EncodedEnglish Sentences: ")
print(pad_english_sentence[:10])
print("Padded Encoded German Sentences")
print(pad_german_sentence[:10])
print("Padded English Sentence Shape: ")
print(pad_english_sentence.shape)
print("Padded German Sentence Shape: ")
print(pad_german_sentence.shape)


# # define NMT model
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    """
    Defines and Creates the model
Example No. 11
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['acc'])
    return model


# 3.2 Split the dataset
# 3.2.1 Split into training and test sets
# X_padded=pad_sequences(X_train, maxlen=300)
# Y=to_categorical(Y_train, len(class_index))
# x_train, x_test, y_train, y_test = train_test_split(X_padded, Y, test_size=0.2)

Y = to_categorical(Y_train, len(class_index))
x_train, x_test, y_train, y_test = train_test_split(X_train, Y, test_size=0.2)

x_train_raw = pad_sequences(x_train, maxlen=model_max_len)
x_test_raw = pad_sequences(x_test, maxlen=model_max_len)


# 3.3 Training
def model_fit(model, x, y):
    return model.fit(x, y, batch_size=10, epochs=5, validation_split=0.1)


model = get_lstm_model()
model_train = model_fit(model, x_train_raw, y_train)

# 3.4 Testing
print(model.evaluate(x_test_raw, y_test))

# Clustering
Example No. 12
args = parser.parse_args()

data_path = config.data_path

training = get_data(data_path + 'snli_1.0_train.jsonl')
validation = get_data(data_path + 'snli_1.0_dev.jsonl')
test = get_data(data_path + 'snli_1.0_test.jsonl')

tokenizer = Tokenizer(lower=False, filters='')
tokenizer.fit_on_texts(training[0] + training[1])
tokenizer.fit_on_texts(validation[0] + validation[1])

VOCAB = len(tokenizer.word_counts) + 1
LABELS = {'contradiction': 0, 'neutral': 1, 'entailment': 2}

to_seq = lambda X: pad_sequences(tokenizer.texts_to_sequences(X),
                                 maxlen=config.max_len)
prepare_data = lambda data: (to_seq(data[0]), to_seq(data[1]), data[2])

training = prepare_data(training)
validation = prepare_data(validation)
test = prepare_data(test)

print('Build model...')
print('Vocab size =', VOCAB)

config.vocab_size = VOCAB
config.out_dim = len(LABELS)
""" Load Glova Embedding """
GLOVE_STORE = data_path + 'precomputed_glove.weights'
if config.use_glove:
    if not os.path.exists(GLOVE_STORE + '.npy'):
Example No. 13
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

sentences = [
    "My name is Shinjini", "My plushie's name is Bruno",
    "My friend loves my plushie", "I love it too!", "My plushie is very cute",
    "Don't you all think my plushie is really cute?"
]

# tokenize sentences
tokenizer = Tokenizer(num_words=100, oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
print('The word index is\n', word_index)

# generate sequences out of tokens
sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences, maxlen=10)
print('Training data sequences are\n', sequences)
print('Training data padded sequences are\n', padded)

# testing
test_data = ["But I really love my plushie", "My friend wants a plushie too!"]

test_seq = tokenizer.texts_to_sequences(test_data)
test_padded = pad_sequences(test_seq, maxlen=10)
print('Testing data sequences are\n', test_seq)
print('Testing data padded sequences are\n', test_padded)
Example No. 14
with open('./clean_test_review_okt.pkl', 'wb') as f:
    pickle.dump(clean_test_review, f)

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer()
tokenizer.fit_on_texts(clean_train_review)
train_sequences = tokenizer.texts_to_sequences(clean_train_review)
test_sequences = tokenizer.texts_to_sequences(clean_test_review)
word_vocab = tokenizer.word_index

MAX_SEQUENCE_LENGTH = 8

train_inputs = pad_sequences(train_sequences,
                             maxlen=MAX_SEQUENCE_LENGTH,
                             padding='post')
train_labels = np.array(df_train['label'])  # labels for the training data

test_inputs = pad_sequences(test_sequences,
                            maxlen=MAX_SEQUENCE_LENGTH,
                            padding='post')
test_labels = np.array(df_test['label'])

x_train, x_val, y_train, y_val = train_test_split(train_inputs,
                                                  train_labels,
                                                  test_size=0.2)
from tensorflow.keras.layers import Input, Embedding, Dense, Dropout, Conv1D, GlobalMaxPooling1D, Flatten, Concatenate
from tensorflow.keras.constraints import MaxNorm
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
Example No. 15
    for word_index in range(len(sentence)):
        word = sentence[word_index]
        word_vector = word2vec_model.wv[word]
        sentence_vector.append(word_vector)

    word2vec_features.append(sentence_vector)

word2vec_features = np.asarray(word2vec_features)
print("the shape of sentence embedding is ", word2vec_features.shape)

# pad the input sentence encoding
from tensorflow.keras.preprocessing.sequence import pad_sequences
MAX_LEN = 10

padded_sentence_encoding = pad_sequences(word2vec_features,
                                         padding="post",
                                         truncating="post",
                                         maxlen=MAX_LEN,
                                         dtype='float32')  # keep the real-valued word2vec features
print("padded sentence shape is ", padded_sentence_encoding.shape)

# prepare DA tagging embedding

da_label_vectors = []

for index in range(len(Tags_List)):

    tag_vector = []

    # for class 'ES'
    if Tags_List[index] == 'ES':
        tag_vector = [1, 0, 0, 0, 0, 0, 0, 0, 0]
    # for class 'EO'
total_words = len(tokenizer.word_index) + 1

print(tokenizer.word_index)
print(total_words)

input_sequences = []
for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i + 1]
        input_sequences.append(n_gram_sequence)

# pad sequences
max_sequence_len = max([len(x) for x in input_sequences])
input_sequences = np.array(
    pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

# create predictors and label
xs, labels = input_sequences[:, :-1], input_sequences[:, -1]

ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)

model = Sequential()
model.add(Embedding(total_words, 240, input_length=max_sequence_len - 1))
model.add((LSTM(150, return_sequences=True)))
model.add((LSTM(75)))
model.add(Dense(total_words, activation='softmax'))
adam = Adam(lr=0.01)
model.compile(loss='categorical_crossentropy',
              optimizer=adam,
              metrics=['accuracy'])
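
# --- Hedged usage sketch (added for illustration, not part of the original
# source): fit the model on the n-gram sequences and predict one next word.
# The epoch count and the seed text are placeholders.
history = model.fit(xs, ys, epochs=50, verbose=1)

seed_text = "some seed text"
seed_tokens = tokenizer.texts_to_sequences([seed_text])[0]
seed_tokens = pad_sequences([seed_tokens], maxlen=max_sequence_len - 1, padding='pre')
predicted_index = int(np.argmax(model.predict(seed_tokens), axis=-1)[0])
for word, index in tokenizer.word_index.items():
    if index == predicted_index:
        print(seed_text, '->', word)
        break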
Example No. 17
y = df['points'].values

# One hot encode categorical columns
encoded_countries = pd.get_dummies(df['country'])
encoded_provinces = pd.get_dummies(df['province'])

# Embedding for titles
titles = []
for index, row in df.iterrows():
	titles.append(row['title'])
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(titles)
print("Fit tokenizer on wine titles")
word_index = tokenizer.word_index
titles_sequences = tokenizer.texts_to_sequences(titles)
titles_padded = pad_sequences(titles_sequences, maxlen=max_length, padding=padding_type,
								truncating=trunc_type)

# Embedding for descriptions
descriptions = []
for index, row in df.iterrows():
	descriptions.append(row['description'])
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(descriptions)
print("Fit tokenizer on wine descriptions")
word_index = tokenizer.word_index
descriptions_sequences = tokenizer.texts_to_sequences(descriptions)
descriptions_padded = pad_sequences(descriptions_sequences, maxlen=max_length, padding=padding_type,
									truncating=trunc_type)

# Drop preprocessed columns and unwanted columns
df = df.drop(['taster_twitter_handle','taster_name','region_2',
Train_corpus = corpus[:spliting]
Train_lable  = lable[:spliting]

Test_corpus  = corpus[spliting:]
Test_lable   = lable[spliting:]


#%% text Tokenizing and padding

tokenizer       = Tokenizer(num_words =10000, oov_token = '<oov>')
tokenizer.fit_on_texts(Train_corpus)


# tokenizing and padding the training set
train_sequences = tokenizer.texts_to_sequences(Train_corpus)
train_padding   = pad_sequences(train_sequences, padding= 'post', maxlen= max_len)

# tokenizing and padding the test set
test_sequences  = tokenizer.texts_to_sequences(Test_corpus)
test_padding    = pad_sequences(test_sequences, padding= 'post', maxlen= max_len)


#%% Classification Model definition
# It is a sequential model:
    # layer 1 :  Embedding layer
    # layer 2 :  Bidirectional LSTM layer
    # layer 3 :  Bidirectional LSTM layer
    # layer 4-6: fully connected layers (Dense layers)
    
model= tf.keras.models.Sequential([tf.keras.layers.Embedding(10000, 32, input_length= max_len),
                                   tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True,activation= 'tanh')),
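
# --- Hedged sketch (added for illustration): the listing above is cut off, so
# this reconstructs the classifier following the layer list in the comments
# (Embedding, two Bidirectional LSTMs, fully connected layers). The second LSTM
# width, the Dense sizes, the binary output and the epoch count are assumptions.
import numpy as np

model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(10000, 32, input_length=max_len),
    tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(128, return_sequences=True, activation='tanh')),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, activation='tanh')),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(train_padding, np.array(Train_lable), epochs=10,
          validation_data=(test_padding, np.array(Test_lable)))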
    def select_best_replacement(self, pos, seq_cur, seq, l_orig, target,
                                replace_list, l):

        infor_list = [self.replace(seq_cur, pos, w) if w != 0 and seq[pos].strip()!=self.dataset.inv_dict[w]  else \
                      (([self.tokenizer.cls_token_id] + self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(' '.join(seq).strip()))\
                        + [self.tokenizer.sep_token_id]), seq_cur, l) for w in replace_list]

        n_seq_list = len(infor_list)
        new_seq_list = []
        cur_seq_list = []
        l_bert_list = []
        for i in range(n_seq_list):
            new_seq_list.append(infor_list[i][0])
            cur_seq_list.append(infor_list[i][1])
            l_bert_list.append(infor_list[i][2])
        # print(cur_seq_list)
        # print(l_bert_list)
        # print([self.tokenizer.convert_ids_to_tokens([i]) for i in new_seq_list[0]])
        l_bert_list = torch.tensor(l_bert_list)
        sort = torch.argsort(l_bert_list, descending=True)

        l_max_bert = torch.max(l_bert_list)
        new_seq_list = pad_sequences(new_seq_list,
                                     maxlen=l_max_bert,
                                     padding='post')
        new_seq_list_tensor = torch.tensor(new_seq_list)[sort].type(
            torch.LongTensor).to(self.device)
        replace_list = replace_list[sort]
        # print('replace_list:', replace_list)

        l_tensor = l_bert_list[sort].type(torch.LongTensor)
        l_tensor = l_tensor.to(self.device)
        # print(new_seq_list_tensor)
        self.neighbour_model.eval()
        with torch.no_grad():
            new_seq_preds = self.neighbour_model.pred(
                new_seq_list_tensor, l_tensor,
                False)[1].cpu().detach().numpy()
        # print(new_seq_preds)
        # print(target)
        new_seq_scores = new_seq_preds[:, target]
        # print(new_seq_scores)
        # print(' '.join([self.dataset.inv_dict[i] if i!=50000 else '[UNK]' for i in seq_cur]).strip())
        seq_np = np.expand_dims(
            [self.tokenizer.cls_token_id] +
            self.tokenizer.convert_tokens_to_ids(
                self.tokenizer.tokenize(' '.join(seq_cur).strip())) +
            [self.tokenizer.sep_token_id],
            axis=0)
        seq_tensor = torch.tensor(seq_np).type(torch.LongTensor).to(
            self.device
        )  #torch.tensor(np.expand_dims(seq_cur, axis = 0)).type(torch.LongTensor).to(self.device)
        # print([self.tokenizer.convert_ids_to_tokens([i]) for i in seq_tensor[0]])
        l_tensor = torch.tensor([seq_tensor.shape[1]]).to(self.device)
        # print(seq_tensor)
        self.model.eval()
        with torch.no_grad():
            orig_score = self.model.pred(
                seq_tensor, l_tensor, False)[1].cpu().detach().numpy()[0,
                                                                       target]
        new_seq_scores -= orig_score
        # print(new_seq_scores)

        new_seq_scores[self.top_n1:] = -10000000
        # print(new_seq_scores)
        if self.use_lm:
            prefix = ['']
            suffix = ['']
            if pos > 0 and pos <= self.n_prefix:
                prefix = [seq_cur[pos - i - 1] for i in range(int(pos))[::-1]]
            elif pos > self.n_prefix:
                prefix = [
                    seq_cur[pos - i - 1] for i in range(self.n_prefix)[::-1]
                ]

#      orig_word = self.i_w_dict[seq[loc]]
            if self.use_suffix and pos < l_orig - self.n_suffix:
                suffix = [
                    seq_cur[pos + i] for i in range(1, self.n_suffix + 1)
                ]
            elif self.use_suffix and pos < l_orig:
                suffix = [seq_cur[pos + i] for i in range(1, l_orig - pos)]


#     if self.use_lm:
#       prefix = ['']
#       suffix = ['']
#       if loc > 0 and loc<=self.n_prefix:
#         prefix = [self.tokenizer.convert_ids_to_tokens([seq_cur[loc-i-1]])[0] for i in range(0, int(loc)-1)[::-1]]
#       elif loc>self.n_prefix:
#         prefix = [self.tokenizer.convert_ids_to_tokens([seq_cur[loc-i-1]])[0] for i in range(self.n_prefix)[::-1]]

# #      orig_word = self.tokenizer.convert_ids_to_tokens([seq[loc]])[0]
#       if self.use_suffix and loc < l-self.n_suffix and seq_cur[loc+self.n_suffix+1]!=0:
#         suffix = [self.tokenizer.convert_ids_to_tokens([seq_cur[loc+i]])[0] for i in range(1,self.n_suffix+1)]
#       elif self.use_suffix and loc < l:
#         suffix = [self.tokenizer.convert_ids_to_tokens([seq_cur[loc+i]])[0] for i in range(1,l-loc-1)]

#     if self.use_lm:
#       prefix = ['']
#       suffix = ['']
#       print(loc)
#       if loc > 0 and loc<=self.n_prefix:
#         prefix = []
#         for i in range(0, int(loc)-1)[::-1]:
#           w = self.tokenizer.convert_ids_to_tokens([seq_cur[loc-i-1]])[0]
#           if len(w)>2:
#             if w[:2] == '##' and i != int(loc)-2:
#               print(w)
#               print(prefix)
#               w = prefix[-1]+w[2:]
#               prefix[-1] = w

#             elif w[:2] == '##' and i == int(loc)-2:
#               print(w)
#               print(prefix)
#               w = self.word_pre(w, i+1, loc, seq_cur)
#               prefix.append(w)
#             else:
#               prefix.append(w)
#           else:
#             prefix.append(w)
#           print(w)
#           print('pre:',prefix)

#         # prefix = [self.tokenizer.convert_ids_to_tokens([seq_cur[loc-i-1]])[0] for i in range(0, int(loc)-1)[::-1]]
#       elif loc>self.n_prefix:
#         prefix = []
#         for i in range(self.n_prefix)[::-1]:
#           print(loc-i-1)
#           w = self.tokenizer.convert_ids_to_tokens([seq_cur[loc-i-1]])[0]
#           if len(w)>2:
#             print(i, int(loc)-2)
#             if w[:2] == '##' and i != self.n_prefix-1:
#               print(w)
#               print(prefix)
#               w = prefix[-1]+w[2:]
#               prefix[-1] = w
#             elif w[:2] == '##' and i == self.n_prefix-1:
#               print(w)
#               print(prefix)
#               w = self.word_pre(w, i+1, loc, seq_cur)
#               prefix.append(w)
#             else:
#               prefix.append(w)
#           else:
#             prefix.append(w)
#           print(w)
#           print(prefix)
#         # prefix = [self.tokenizer.convert_ids_to_tokens([seq_cur[loc-i-1]])[0] for i in range(self.n_prefix)[::-1]]
#       print('prefix:', prefix)
#       print(loc+self.n_suffix)
# #      orig_word = self.tokenizer.convert_ids_to_tokens([seq[loc]])[0]
#       if self.use_suffix and loc < l-self.n_suffix-1 and seq_cur[loc+self.n_suffix]!=0:
#         suffix = []
#         for i in range(1,self.n_suffix+1):
#           print(loc+i)
#           w = self.tokenizer.convert_ids_to_tokens([seq_cur[loc+i]])[0]
#           if len(w)>2:
#             if w[:2] == '##' and i != 1:
#               print(w)
#               print(suffix)
#               w = suffix[-1]+w[2:]
#               suffix[-1] = w
#             elif w[:2] == '##' and i == 1:
#               print(w)
#               print(suffix)
#               w = self.word_pre(w, i+1, loc, seq_cur)
#               suffix.append(w)
#             else:
#               suffix.append(w)
#           else:
#             suffix.append(w)
#           print(suffix)
#         # suffix = [self.tokenizer.convert_ids_to_tokens([seq_cur[loc+i]])[0] for i in range(1,self.n_suffix+1)]
#       elif self.use_suffix and loc < l:
#         suffix = []
#         for i in range(1,l-loc-1):
#           w = self.tokenizer.convert_ids_to_tokens([seq_cur[loc+i]])[0]
#           if len(w)>2:
#             if w[:2] == '##' and i != int(loc)+1:
#               print(w)
#               print(suffix)
#               w = suffix[-1]+w[2:]
#               suffix[-1] = w
#             elif w[:2] == '##' and i == int(loc)+1:
#               print(w)
#               print(suffix)
#               w = self.word_pre(w, i+1, loc, seq_cur)
#               suffix.append(w)
#             else:
#               suffix.append(w)
#           else:
#             suffix.append(w)

#       print('suffix:', suffix)
# suffix = [self.tokenizer.convert_ids_to_tokens([seq_cur[loc+i]])[0] for i in range(1,l-loc-1)]
#      print(orig_word, [self.dataset.inv_dict[w] for w in replace_list[:self.top_n1] if w in self.dataset.inv_dict])
# print(prefix, suffix)
            word_list = [
                prefix + [self.dataset.inv_dict[w]] +
                suffix if w in self.dataset.inv_dict else prefix + ['UNK'] +
                suffix for w in replace_list
            ]
            #[prefix+[self.dataset.inv_dict[w]]+suffix if w in self.dataset.inv_dict else prefix+['UNK']+suffix for w in replace_list[:self.top_n1]]
            #      replace_words_orig = [self.dataset.inv_dict[w] if w in self.dataset.inv_dict else 'UNK' for w in replace_list[:self.top_n1]] + [orig_word]
            # print(word_list)
            # print(word_list)
            # print('replace_list:', [self.dataset.inv_dict[i] if i in self.dataset.inv_dict else i for i in replace_list])

            # seqs = [self.seq_list(seq) for seq in word_list]
            # replace_words_scores = self.scorer.sentence_score(seqs, reduce = 'prod')
            # new_words_scores = np.array(replace_words_scores)
            # rank_replaces_by_lm = np.argsort(new_words_scores)[::-1]
            # # print(new_words_scores[rank_replaces_by_lm])
            # # print(rank_replaces_by_lm)

            replace_words_scores = self.lm.get_probs(word_list)
            new_words_scores = np.array(replace_words_scores)
            rank_replaces_by_lm = np.argsort(new_words_scores)

            filtered_words_idx = rank_replaces_by_lm[self.top_n2:]

            new_seq_scores[filtered_words_idx] = -10000000

        if np.max(new_seq_scores) > 0:
            # print([self.dataset.inv_dict[i] for i in cur_seq_list[np.argsort(new_seq_scores)[-1]]])
            return cur_seq_list[np.argsort(new_seq_scores)[-1]]
        return seq_cur
Example No. 20
        x_left = SimpleRNN(128, return_sequences=True)(embedding_left)
        x_right = SimpleRNN(128, return_sequences=True,
                            go_backwards=True)(embedding_right)
        x_right = Lambda(lambda x: K.reverse(x, axes=1))(x_right)
        x = Concatenate(axis=2)([x_left, embedding_current, x_right])
        x = Conv1D(64, kernel_size=1, activation='tanh')(x)
        x = GlobalMaxPooling1D()(x)
        output = Dense(self.class_num, activation=self.last_activation)(x)
        model = Model(inputs=[input_current, input_left, input_right],
                      outputs=output)
        return model


labelIds, reviewIds, uniqueword_len = gen_data(filePathTrain)
ont_hot_labelIds = keras.utils.to_categorical(labelIds, num_classes)
reviewIds = sequence.pad_sequences(reviewIds, maxlen)
print(reviewIds.shape)
print(ont_hot_labelIds.shape)
print(type(reviewIds))
textrcnn = RCNN(maxlen, uniqueword_len, embedding_dims, num_classes)
model = textrcnn.get_model()
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

left_temp = np.array([reviewIds[:, 0]])
left_temp2 = np.c_[left_temp.T, np.array(reviewIds)]
left = left_temp2[:, 0:-1]
right_temp = np.array([reviewIds[:, -1]])
right_temp2 = np.c_[right_temp.T, np.array(reviewIds)]
right = right_temp2[:, 1:]
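
# --- Hedged usage sketch (added for illustration, not part of the original
# source): the RCNN takes the current/left/right sequences as its three inputs;
# batch size and epoch count are assumptions.
model.fit([reviewIds, left, right],
          ont_hot_labelIds,
          batch_size=64,
          epochs=5,
          validation_split=0.1)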
def train():
    args = parser.parse_args()
    learning_rate = args.learning_rate
    nlayer = args.nlayer
    bidirection = args.bidirection
    save_path = args.save_path
    kept_prob = args.kept_prob

    MAX_VOCAB_SIZE = 50000
    with open(('aux_files/dataset_%d.pkl' % MAX_VOCAB_SIZE), 'rb') as f:
        dataset = pickle.load(f)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    embedding_matrix = np.load('aux_files/embeddings_glove_%d.npy' %
                               (MAX_VOCAB_SIZE))
    embedding_matrix = torch.tensor(embedding_matrix.T).to(device)

    # pytorch
    max_len = 400
    padded_train_raw = pad_sequences(dataset.train_seqs2,
                                     maxlen=max_len,
                                     padding='post')
    padded_test_raw = pad_sequences(dataset.test_seqs2,
                                    maxlen=max_len,
                                    padding='post')

    # TrainSet
    data_set_train = Data_infor(padded_train_raw, dataset.train_y)
    num_train = len(data_set_train)
    indx = list(range(num_train))
    all_train_set = Subset(data_set_train, indx)
    train_indx = random.sample(indx, int(num_train * 0.8))
    vali_indx = [i for i in indx if i not in train_indx]
    train_set = Subset(data_set_train, train_indx)
    vali_set = Subset(data_set_train, vali_indx)

    # TestSet
    data_set_test = Data_infor(padded_test_raw, dataset.test_y)
    num_test = len(data_set_test)
    indx = list(range(num_test))
    # indx = random.sample(indx, SAMPLE_SIZE)
    test_set = Subset(data_set_test, indx)

    batch_size = 64
    hidden_size = 128
    all_train_loader = DataLoader(all_train_set,
                                  batch_size=batch_size,
                                  shuffle=True)
    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
    vali_loader = DataLoader(vali_set, batch_size=len(vali_indx) // batch_size)
    test_loader = DataLoader(test_set,
                             batch_size=int(num_test / 10),
                             shuffle=True)
    best_save_path = os.path.join(
        save_path, 'best_lstm_' + str(kept_prob) + '_' + str(learning_rate) +
        '_' + str(max_len))

    rnn = SentimentAnalysis(batch_size, embedding_matrix, hidden_size,
                            kept_prob, nlayer, bidirection)
    rnn = rnn.to(device)
    # class my_loss(nn.Module):
    #   def __init__(self):
    #     super().__init__()
    #     self.relu = nn.ReLU()

    #   def forward(self, x, y):
    #     loss = torch.mean((1-y)*x+torch.log(1+torch.exp(-abs(x)))+self.relu(-x))
    #     return loss
    criterion = nn.CrossEntropyLoss()
    optimiser = torch.optim.AdamW(rnn.parameters(), lr=learning_rate)
    # optimiser = torch.optim.SGD(rnn.parameters(), lr = learning_rate)

    epoches = 20
    best_epoch = 0
    best_acc = 0
    patience = 15

    for epoch in range(epoches):
        test_pred = torch.tensor([])
        test_targets = torch.tensor([])
        train_pred = torch.tensor([])
        train_targets = torch.tensor([])
        test_loss = []
        train_loss = []

        rnn.train()
        for batch_index, (seqs, length, target) in enumerate(all_train_loader):

            seqs = seqs.type(torch.LongTensor)
            len_order = torch.argsort(length, descending=True)
            length = length[len_order]
            seqs = seqs[len_order]
            target = target[len_order].type(torch.LongTensor)
            optimiser.zero_grad()
            seqs, target, length = seqs.to(device), target.to(
                device), length.to(device)
            output, pred_out = rnn(seqs, length, True)
            loss = criterion(output, target)
            loss.backward()
            optimiser.step()

            train_pred = torch.cat(
                (train_pred, pred_out.type(torch.float).cpu()), dim=0)
            train_targets = torch.cat(
                (train_targets, target.type(torch.float).cpu()))
            train_loss.append(loss)

            if batch_index % 100 == 0:
                print('Train Batch:{}, Train Loss:{:.4f}.'.format(
                    batch_index, loss.item()))
        train_accuracy = rnn.evaluate_accuracy(train_pred.detach().numpy(),
                                               train_targets.detach().numpy())
        print(
            'Epoch:{}, Train Accuracy:{:.4f}, Train Mean loss:{:.4f}.'.format(
                epoch, train_accuracy,
                sum(train_loss) / len(train_loss)))

        rnn.eval()
        with torch.no_grad():
            for batch_index, (seqs, length, target) in enumerate(test_loader):

                seqs = seqs.type(torch.LongTensor)
                len_order = torch.argsort(length, descending=True)
                length = length[len_order]
                seqs = seqs[len_order]
                target = target[len_order].type(torch.LongTensor)
                seqs, target, length = seqs.to(device), target.to(
                    device), length.to(device)
                output, pred_out = rnn(seqs, length, False)
                test_pred = torch.cat(
                    (test_pred, pred_out.type(torch.float).cpu()), dim=0)
                test_targets = torch.cat(
                    (test_targets, target.type(torch.float).cpu()))
                loss = criterion(output, target)
                test_loss.append(loss.item())
                if batch_index % 100 == 0:
                    print('Vali Batch:{}, Validation Loss:{:.4f}.'.format(
                        batch_index, loss.item()))
            accuracy = rnn.evaluate_accuracy(test_pred.numpy(),
                                             test_targets.numpy())
            print('Epoch:{}, Vali Accuracy:{:.4f}, Vali Mean loss:{:.4f}.'.
                  format(epoch, accuracy,
                         sum(test_loss) / len(test_loss)))
            print('\n\n')
            # # best save
            # if accuracy > best_acc:
            #   best_acc = accuracy
            #   best_epoch = epoch
            #   torch.save(rnn.state_dict(), best_save_path)
            # # early stop
            # if epoch-best_epoch >=patience:
            #   print('Early stopping')
            #   print('Best epoch: {}, Best accuracy: {:.4f}.'.format(best_epoch, best_acc))
            #   break
    torch.save(rnn.state_dict(), best_save_path)
    rnn.load_state_dict(torch.load(best_save_path))
    rnn.to(device)
    rnn.eval()
    test_pred = torch.tensor([])
    test_targets = torch.tensor([])
    test_loss = []
    with torch.no_grad():
        for batch_index, (seqs, length, target) in enumerate(test_loader):

            seqs = seqs.type(torch.LongTensor)
            len_order = torch.argsort(length, descending=True)
            length = length[len_order]
            seqs = seqs[len_order]
            target = target[len_order]
            seqs, target, length = seqs.to(device), target.to(
                device), length.to(device)
            output, pred_out = rnn(seqs, length, False)
            test_pred = torch.cat(
                (test_pred, pred_out.type(torch.float).cpu()), dim=0)
            test_targets = torch.cat(
                (test_targets, target.type(torch.float).cpu()))
            loss = criterion(output, target)
            test_loss.append(loss.item())

        accuracy = rnn.evaluate_accuracy(test_pred.numpy(),
                                         test_targets.numpy())
    print('Test Accuracy:{:.4f}, Test Mean loss:{:.4f}.'.format(
        accuracy,
        sum(test_loss) / len(test_loss)))
Example No. 22
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'  # GPU number to use

# set parameters
max_features = 6000  # max_features: maximum vocabulary size
max_length = 400

# Build the training data from the 6,000 most frequent words.
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)

wind = imdb.get_word_index()
revind = dict((v,k) for k,v in wind.items())

# Pad sequences for computational efficiency
x_train = sequence.pad_sequences(x_train, maxlen=max_length)
x_test = sequence.pad_sequences(x_test, maxlen=max_length)

# Deep Learning architecture parameters
batch_size = 32
embedding_dims = 60
num_kernels = 260        # number of convolution filters
kernel_size = 3          # convolution filter size
hidden_dims = 300
epochs = 10
nOutput = 1


xInput = Input(batch_shape=(None, max_length))
xEmbed = Embedding(max_features, embedding_dims)(xInput)
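
# --- Hedged sketch (added for illustration): the listing is cut off above, so
# this continues the functional-API model with the hyper-parameters declared
# earlier (num_kernels, kernel_size, hidden_dims, nOutput); the dropout rate and
# the sigmoid output are assumptions.
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.models import Model

xConv = Conv1D(num_kernels, kernel_size, activation='relu')(xEmbed)
xPool = GlobalMaxPooling1D()(xConv)
xHidden = Dropout(0.5)(Dense(hidden_dims, activation='relu')(xPool))
xOutput = Dense(nOutput, activation='sigmoid')(xHidden)

model = Model(xInput, xOutput)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test))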
Example No. 23
    Y_train = []
    X_train = []
    pattern_words_dict = {}
    # loop through each sentence in our intents patterns
    for intent in data_as_an['intents']:
        for entities in intent['entities']:
            if entities:
                pattern_words_dict[entities] = intent['tag'][0:2]
        for sentences in intent['ask']:
            Y_train.append(intent['tag'])
            X_train.append(word2charArr(sentences, pattern_words_dict))
    return np.array(X_train), np.array(Y_train), pattern_words_dict


X_train, Y_train, pattern_words_dict = sentence2vec()
X_train = pad_sequences(X_train, maxlen=100)
highest_unicode = 8100
X_train = np.where(X_train <= highest_unicode, X_train, 0)
print(X_train, X_train.shape, highest_unicode)

from sklearn import preprocessing

cate_enc = preprocessing.LabelEncoder()
label = Y_train
Y_train = cate_enc.fit_transform(Y_train)
print(Y_train.shape)
print(Y_train)
print(len(np.unique(Y_train)))

model = Sequential()
model.add(Embedding(highest_unicode + 1, 60, input_length=X_train.shape[1]))
Example No. 24
    sentences.append(item['field_0'])##insert json keys here
    labels.append(item['field_1'])
    
train_sentences = sentences[0:TRAIN_SIZE]
test_sentences = sentences[TRAIN_SIZE:]
train_labels = labels[0:TRAIN_SIZE]
test_labels = labels[TRAIN_SIZE:]

##Assign Tokens for the words, and convert the sentences to token sequences.
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token=OOV_TOK)
tokenizer.fit_on_texts(train_sentences)

wordIndex = tokenizer.word_index

train_sequences = tokenizer.texts_to_sequences(train_sentences)
train_padded = pad_sequences(train_sequences, maxlen=MAX_LENGTH, padding=PADDING_TYPE, truncating=TRUNC_TYPE)

test_sequences = tokenizer.texts_to_sequences(test_sentences)
test_padded = pad_sequences(test_sequences, maxlen=MAX_LENGTH, padding=PADDING_TYPE, truncating=TRUNC_TYPE)


#Convert to ndarrays
train_padded = np.array(train_padded)
train_labels = np.array(train_labels)
 
test_padded = np.array(test_padded)
test_labels = np.array(test_labels)

##The Neural Net Architecture
quacker = tf.keras.Sequential([
    tf.keras.layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM, input_length=MAX_LENGTH),
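The listing breaks off inside the Sequential constructor. A standalone sketch of how such a classifier is commonly assembled from the padded data above follows; the pooling layer, dense sizes, sigmoid output (which assumes the labels are numeric and binary) and epoch count are all guesses, not the original continuation.

quacker = tf.keras.Sequential([
    tf.keras.layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM, input_length=MAX_LENGTH),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])
quacker.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
quacker.fit(train_padded, train_labels,
            epochs=30,
            validation_data=(test_padded, test_labels),
            verbose=2)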
Exemplo n.º 25
0
sequences = list()
for line in String.split('.'):  # tokenize the text into sentences (split on '.')
    encoded = t.texts_to_sequences([line])[0]
    for i in range(1, len(encoded)):
        sequence = encoded[:i + 1]
        sequences.append(sequence)

print('Number of samples used for training: %d' % len(sequences))

# inspect the samples
print(sequences)

# pad every sample to the length of the longest one
max_len = max(len(l) for l in sequences)  # length of the longest sample
print('Maximum sample length: {}'.format(max_len))
sequences = pad_sequences(sequences, maxlen=max_len, padding='pre')
print(sequences)

sequences = np.array(sequences)
X = sequences[:, :-1]
y = sequences[:, -1]
# X stores each list without its last value
# y stores only the last value of each list; this is the label

# the separated X values
print(X)

# the separated y values
print(y)

# labels separated
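With X holding the prefixes and y holding the next word, a common follow-up is to one-hot encode y and fit a small next-word model. The sketch below assumes the tokenizer is the variable t used above; the vocabulary derivation, layer sizes and epoch count are assumptions.

from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense

vocab_size = len(t.word_index) + 1          # +1 because Keras word indices start at 1
y = to_categorical(y, num_classes=vocab_size)

model = Sequential([
    Embedding(vocab_size, 10, input_length=max_len - 1),  # inputs lost one position to the label split
    SimpleRNN(32),
    Dense(vocab_size, activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X, y, epochs=200, verbose=0)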
Exemplo n.º 26
0
# Use LabelEncoder
lbl_encoder = LabelEncoder()
lbl_encoder.fit(training_labels)
training_labels = lbl_encoder.transform(training_labels)

#Vectorize the data using Tokenizer
vocab_size = 1000
embedding_dim = 16
max_len = 20
oov_token = "<OOV>"

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(training_sentences)
padded_sequences = pad_sequences(sequences, truncating='post', maxlen=max_len)

#Neural Network Training
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=max_len))
model.add(GlobalAveragePooling1D())
model.add(Dense(16, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam', metrics=['accuracy'])

model.summary()
epochs = 500
history = model.fit(padded_sequences, np.array(training_labels), epochs=epochs)
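A short usage sketch showing how the fitted tokenizer, the padding settings and the label encoder combine at inference time; the helper name and the sample sentence are made up.

def predict_intent(sentence):
    # tokenize and pad exactly as the training data was prepared
    seq = tokenizer.texts_to_sequences([sentence])
    padded = pad_sequences(seq, truncating='post', maxlen=max_len)
    probs = model.predict(padded)
    # map the winning class index back to the original tag string
    return lbl_encoder.inverse_transform([np.argmax(probs, axis=1)[0]])[0]


print(predict_intent("hello there"))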
Exemplo n.º 27
0
def load_lstm_inv_data():
    # ============= training set =================
    train_sequences = list()

    for index, group in train.groupby(by='fragment_id'):
        train_sequences.append(group[use_fea].values)

    # find the maximum sequence length
    len_sequences = []
    for one_seq in train_sequences:
        len_sequences.append(len(one_seq))
    print(pd.Series(len_sequences).describe())  # the longest sequence has 61 steps

    # pad the sequences
    to_pad = 61
    train_new_seq = []
    for one_seq in train_sequences:
        len_one_seq = len(one_seq)
        last_val = one_seq[-1]
        n = to_pad - len_one_seq
        # to_concat = np.repeat(last_val, n).reshape(len(use_fea), n).transpose()
        # new_one_seq = np.concatenate([one_seq, to_concat])
        if n != 0:
            # pad the short sequence by appending its first n rows
            to_concat = one_seq[:n]
            new_one_seq = np.concatenate([one_seq, to_concat])
        else:
            new_one_seq = one_seq
        train_new_seq.append(new_one_seq)

    train_final_seq = np.stack(train_new_seq)
    # final_seq.shape (314, 129, 4)
    print("train_final_seq.shape", train_final_seq.shape)
    # truncate to a fixed length

    seq_len = 60
    train_final_seq = sequence.pad_sequences(train_final_seq,
                                             maxlen=seq_len,
                                             padding='post',
                                             dtype='float',
                                             truncating='post')
    print("train_final_seq.shape", train_final_seq.shape)

    # ============= test set =================
    test_sequences = list()
    for index, group in test.groupby(by='fragment_id'):
        test_sequences.append(group[use_fea].values)

    # pad to the maximum length
    to_pad = 61
    test_new_seq = []
    for one_seq in test_sequences:
        len_one_seq = len(one_seq)
        last_val = one_seq[-1]
        n = to_pad - len_one_seq
        # to_concat = np.repeat(last_val, n).reshape(len(use_fea), n).transpose()
        # new_one_seq = np.concatenate([one_seq, to_concat])
        if n != 0:
            to_concat = one_seq[:n]
            new_one_seq = np.concatenate([one_seq, to_concat])
        else:
            new_one_seq = one_seq
        test_new_seq.append(new_one_seq)

    test_final_seq = np.stack(test_new_seq)
    print("test_final_seq.shape", test_final_seq.shape)

    # truncate to a fixed length

    seq_len = 60
    test_final_seq = sequence.pad_sequences(test_final_seq,
                                            maxlen=seq_len,
                                            padding='post',
                                            dtype='float',
                                            truncating='post')
    print("test_final_seq.shape", test_final_seq.shape)
    return train_final_seq, y_train, test_final_seq, seq_len, len(use_fea)
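A sketch of how the returned arrays could feed a Keras LSTM; the architecture, the class-count derivation and the training settings are assumptions, not part of the original listing.

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

train_seq, y_tr, test_seq, seq_len, n_features = load_lstm_inv_data()

n_classes = len(np.unique(y_tr))            # placeholder: derive the class count from the labels
model = Sequential([
    LSTM(64, input_shape=(seq_len, n_features)),   # one (timesteps, features) matrix per sample
    Dense(32, activation='relu'),
    Dense(n_classes, activation='softmax'),
])
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(train_seq, y_tr, epochs=20, batch_size=64, validation_split=0.1)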
Exemplo n.º 28
0
            token = " " + word + " "
            sentence = sentence.replace(token, " ")
            sentence = sentence.replace("  ", " ")
        sentences.append(sentence)

# print(len(sentences))
# print(sentences[0])
# print(labels[0])

tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
print(len(word_index))

sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences, padding='post')
padded = padded / 2442.  # rescale the integer token ids by a constant so the inputs are small floats
print(padded[0])
print(padded.shape)

label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(labels)
label_word_index = label_tokenizer.word_index
label_seq = label_tokenizer.texts_to_sequences(labels)
# label_seq = [item for sublist in label_seq for item in sublist]
# label_seq = np.array(label_seq)
label_seq = keras.utils.to_categorical(label_seq)
print(label_seq[:10])
print(label_word_index)

model = keras.models.Sequential()
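Since the padded token indices were rescaled to small floats and the labels were one-hot encoded, one simple completion is a plain dense network over the fixed-length vectors; all layer sizes and the epoch count below are assumptions.

model.add(keras.layers.Dense(64, activation='relu', input_shape=(padded.shape[1],)))
model.add(keras.layers.Dense(32, activation='relu'))
model.add(keras.layers.Dense(label_seq.shape[1], activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(padded, label_seq, epochs=30, validation_split=0.2, verbose=2)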
Exemplo n.º 29
0
tokenizer.fit_on_texts(sentences)

word2idx = tokenizer.word_index
idx2word = {v: k for k, v in word2idx.items()}

train_to_idx = tokenizer.texts_to_sequences(sentences)

train_inputs = []
y_label = []

for i in range(len(train_to_idx)):
    for j in range(1, len(train_to_idx[i])):
        train_inputs.append(train_to_idx[i][:j])
        y_label.append(train_to_idx[i][j])

train_inputs_pad = pad_sequences(train_inputs, maxlen=7)

# wrap the padded 2-D array in a list so each row becomes a single DataFrame cell
df_train = pd.DataFrame({'x_emb': list(train_inputs_pad), 'label': y_label})

train = np.array(train_inputs_pad)
label = np.array(y_label).reshape(-1, 1)
vocab_size = len(word2idx) + 1

x_train, x_test, y_train, y_test = train_test_split(train,
                                                    label,
                                                    test_size=0.1)

x_input = Input(shape=(7, ))  # what is the difference between using batch_shape and shape?
x_emb = Embedding(input_dim=vocab_size, output_dim=8, name='emb')(x_input)

# H-network
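The "H-network" the original builds at this point is not shown. Purely as an illustration, a plain recurrent head that predicts the next word from x_emb could look like this (layer sizes and training settings are assumptions):

from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.models import Model

x_hidden = LSTM(64)(x_emb)
y_output = Dense(vocab_size, activation='softmax')(x_hidden)

model = Model(x_input, y_output)
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')
model.fit(x_train, y_train, validation_data=(x_test, y_test),
          epochs=10, batch_size=32)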
Exemplo n.º 30
0
#TEST_DATA_FILE='liwc_test.csv'
train = pd.read_csv(r"liwc_input.csv")
test = pd.read_csv(r"liwc_test.csv")
list_sentences_train = train["text"].fillna("_na_").values
y_train = np.array(train['rating'])
y_test = np.array(test['rating'])

Y_train = np_utils.to_categorical(y_train, nb_classes)
Y_test = np_utils.to_categorical(y_test, nb_classes)

list_sentences_test = test["text"].fillna("_na_").values
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
unigrams = pad_sequences(list_tokenized_train, maxlen=maxlen)
unigrams_t = pad_sequences(list_tokenized_test, maxlen=maxlen)
liwc_scaler = preprocessing.StandardScaler()
liwc = liwc_scaler.fit_transform(train.loc[:, "WC":"OtherP"])  # .loc replaces the removed DataFrame.ix indexer
liwc_t = liwc_scaler.transform(test.loc[:, "WC":"OtherP"])
X_t = np.hstack(unigrams)
X_te = np.hstack(unigrams_t)


def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype='float32')


embeddings_index = dict(
    get_coefs(*o.strip().split()) for o in open(EMBEDDING_FILE))
all_embs = np.stack(embeddings_index.values())
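The excerpt ends after stacking the raw pretrained vectors. A typical next step builds an embedding matrix aligned with the tokenizer's word index, initialising out-of-vocabulary rows from the empirical mean and standard deviation; embed_size and the max_features cap below are assumptions.

emb_mean, emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index) + 1)
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector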