Example #1
def main():
    obj_etl = ETLDataPipeline("data/train.csv", "data/test.csv")
    train, test = obj_etl.read_data()
    train = obj_etl.drop_cols([
        'id', 'bin_0', 'bin_1', 'bin_2', 'bin_3', 'bin_4', 'nom_0', 'nom_1',
        'nom_2', 'nom_3', 'nom_4', 'ord_0', 'ord_1', 'ord_2', 'ord_3', 'ord_4',
        'ord_5', 'day', 'month'
    ])
    train = obj_etl.convert_dtypes(
        ['nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9'])
    train = obj_etl.encoder(['nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9'])
    target = obj_etl.get_target('target')
    X_train, X_valid, y_train, y_valid = obj_etl.get_train_test(train, target)

    # reshape each encoded categorical column to (n, 1) so each one feeds a
    # separate model input
    val1 = np.reshape(X_train['nom_5'].values, (-1, 1))
    val2 = np.reshape(X_train['nom_6'].values, (-1, 1))
    val3 = np.reshape(X_train['nom_7'].values, (-1, 1))
    val4 = np.reshape(X_train['nom_8'].values, (-1, 1))
    val5 = np.reshape(X_train['nom_9'].values, (-1, 1))
    val6 = np.reshape(y_train.values, (-1, 1))

    # the same reshaping for the validation split
    val11 = np.reshape(X_valid['nom_5'].values, (-1, 1))
    val22 = np.reshape(X_valid['nom_6'].values, (-1, 1))
    val33 = np.reshape(X_valid['nom_7'].values, (-1, 1))
    val44 = np.reshape(X_valid['nom_8'].values, (-1, 1))
    val55 = np.reshape(X_valid['nom_9'].values, (-1, 1))
    val66 = np.reshape(y_valid.values, (-1, 1))

    tf.random.set_seed(0)

    # decay horizon: 100 epochs' worth of steps at batch size 32 (fit() below runs 10 epochs)
    s = 100 * len(X_train) // 32
    learning_rate = tf.keras.optimizers.schedules.ExponentialDecay(
        0.01, s, 0.1)
    opt = tf.keras.optimizers.Adam(learning_rate)

    model = EmbeddingModel(
        hidden_units=3,
        output_units=1,
        embeddings_initializer=tf.random.normal,
        kernel_initializer=tf.keras.initializers.he_uniform(seed=0),
        dropout_rate=0.4,
        activation="sigmoid",
        trainable=True)
    model.compile(loss=tf.keras.losses.binary_crossentropy,
                  metrics=['accuracy'],
                  optimizer=opt)
    baseline_history = model.fit(
        (val1, val2, val3, val4, val5),
        val6,
        epochs=10,
        batch_size=32,
        validation_data=((val11, val22, val33, val44, val55), val66),
        class_weight={
            0: 0.5,
            1: 0.5
        })
Example #2
def train():
    global model
    data_reset = request.get_json()
    X = [data_reset['user_history'], data_reset['item_history']]
    y = data_reset['rating_history']
    nb_users, nb_items = data_reset['nb_users'], data_reset['nb_items']

    model = EmbeddingModel(nb_users, nb_items, embedding_size=30)
    model.fit(X, y, verbose=True)

    return jsonify({'info': 'successful'})
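A rough sketch of how a client might call the route above. The endpoint path, host, and the toy payload values are assumptions for illustration (the route decorator is not shown in the snippet); only the JSON keys come from the code itself.

import requests

# toy payload matching the keys the route reads from request.get_json()
payload = {
    'user_history': [0, 0, 1, 2],        # one user id per observed rating
    'item_history': [5, 7, 5, 3],        # one item id per observed rating
    'rating_history': [4.0, 3.5, 5.0, 2.0],
    'nb_users': 3,
    'nb_items': 8,
}
resp = requests.post('http://localhost:5000/train', json=payload)  # assumed URL
print(resp.json())  # expected: {'info': 'successful'}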
Example #3
def extract_embeddings():
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('database', type=str)
    parser.add_argument('collection', type=str)
    parser.add_argument('outfile', type=str)
    # nargs='?' makes these positionals optional, so the defaults can actually apply
    parser.add_argument('dimensions', type=int, nargs='?', default=100)
    parser.add_argument('epochs', type=int, nargs='?', default=10)
    args = parser.parse_args()
    model = EmbeddingModel(args.database, args.collection)
    model.train(args.dimensions, args.epochs)
    model.save(f'data/{args.outfile}')
Example #4
def main():
    # hyperparameter settings
    emb_dim = 50
    epochs = 2
    model_path = 'model.h5'
    negative_samples = 1
    num_words = 10000
    window_size = 1

    # corpus
    text = load_data(filepath='../chap04/data/ja.text8')

    # vocabulary
    vocab = build_vocablary(text, num_words)
    
    # create dataset 
    x, y = create_dataset(text, vocab, num_words, window_size, negative_samples)
    
    # build and compile the model
    model = EmbeddingModel(num_words, emb_dim)
    model = model.build()
    model.compile(optimizer='adam', loss='binary_crossentropy')
    
    # callback 
    callbacks = [
        EarlyStopping(patience=1),
        ModelCheckpoint(model_path, save_best_only=True)
    ]
    
    # train the model
    model.fit(x=x,y=y,
              batch_size=128,
              epochs=epochs,
              validation_split=0.2,
              callbacks=callbacks)
    
    # inference with the best saved model
    model = load_model(model_path)
    api = InferenceAPI(model, vocab)
    pprint(api.most_similar(word='日本'))
Example #5
    # 2-D scatter of the decomposed embeddings, labelling each point with its word
    trace1 = go.Scatter(x=decomp[:, 0],
                        y=decomp[:, 1],
                        mode='markers+text',
                        text=words,
                        marker=dict(size=12,
                                    color=decomp[:, 1],
                                    colorscale='Viridis',
                                    opacity=0.8),
                        textposition='bottom center')
    dataTrace = [trace1]
    layout = go.Layout(margin=dict(l=0, r=0, b=0, t=0), font=dict(size=20))
    fig = go.Figure(data=dataTrace, layout=layout)
    py.plot(fig, filename='bert-embedding-initial')

if args.embedding_type == 'linear':
    model = EmbeddingModel(len(args.classes))
elif args.embedding_type == 'conv':
    model = ConvolutionalEmbeddingModel(len(args.classes))
else:
    print("Model type [{0}] not supported".format(args.embedding_type))
    exit(1)

eval_dataset, eval_labels, _, __, ___ = generateData(
    args.eval_file, eval_list, 1.0, args.load_embedding_dict_from_file,
    args.save_embedding_dict, args.verbose,
    'embedding_dicts/animal_embedding_dict.pkl', False, args.classes)
embedding_dict = {}
if len(args.model_checkpoint) > 0:
    checkpoint = torch.load(args.model_checkpoint, map_location='cpu')
    model.load_state_dict(checkpoint['model_state_dict'])
    embedding = checkpoint['model_state_dict']['l2.weight']
Example #6
File: train.py  Project: galitLukin/zhorai
parser.add_argument('--learning-rate', type=float, default=0.001, metavar='lr', help='Learning rate for training')
parser.add_argument('--train-split-percentage', type=float, default=0.8, metavar='x', help='Percentage of data for training')
parser.add_argument('--save-embedding-dict', action='store_true', help='Save computed embeddings to file')
parser.add_argument('--load-embedding-from-file', action='store_true', help='Load precomputed embeddings from file')
parser.add_argument('--model-checkpoint', type=str, default='', help='Model checkpoint to resume training')
parser.add_argument('--embedding-type', type=str, default='linear', help='Model type: linear or conv')

args = parser.parse_args()

if torch.cuda.is_available():
	args.device = torch.device('cuda')
	torch.cuda.manual_seed(np.random.randint(1, 10000))
	torch.backends.cudnn.enabled = True 
args.classes = ["desert", "rainforest", "grassland", "tundra", "ocean"]
if args.embedding_type == 'linear':
	model = EmbeddingModel(len(args.classes))
elif args.embedding_type == 'conv':
	model = ConvolutionalEmbeddingModel(len(args.classes))
else:
	print("Model type [{0}] not supported".format(args.embedding_type))
	exit(1)
if torch.cuda.is_available():
	model = model.cuda()
criterion = torch.nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)
model = model.train()
starting_epoch = 0
running_loss = 0.0
if len(args.model_checkpoint) > 0:
	checkpoint = torch.load(args.model_checkpoint)
	model.load_state_dict(checkpoint['model_state_dict'])
class PreviousResults(object):
    """class used to store previously post
    tweets for each run and query, as well as
    check novalty of the tweet
    """

    def __init__(self, sim_treshold, model_file, model_type, debug=False):
        self._sim_treshold, self._debug = sim_treshold, debug
        self._previous_results = {}
        self._embedding_model = EmbeddingModel(model_file, model_type)

    def _store_tweet(self, new_tweet_vector, run_name, qid):
        if self._debug:
            print "store new tweet for query %s run %s"\
                %(qid,run_name)
        if run_name not in self._previous_results:
            self._previous_results[run_name] = {}
        
        if qid not in self._previous_results[run_name]:
            self._previous_results[run_name][qid] = []
        
        self._previous_results[run_name][qid].append(new_tweet_vector)

    def _check_tweet_redundant(self, new_tweet_vector, tweet_vector):
        vector_sim = self._embedding_model.similarity(new_tweet_vector, tweet_vector)
        if self._debug:
            print "the similarity is %f" % (vector_sim)
        if vector_sim >= self._sim_treshold:
            return True
        else:
            return False

    def is_redundant(self, tweet_text, run_name, qid):
        sentence_list = re.findall(r"\w+", tweet_text.lower())
        new_tweet_vector = self._embedding_model.get_sentence_vector(sentence_list)

        if run_name not in self._previous_results:
            self._previous_results[run_name] = {}
            self._previous_results[run_name][qid] = []
            
        elif qid not in self._previous_results[run_name]:
            self._previous_results[run_name][qid] = []

        
        else:
            if np.count_nonzero(new_tweet_vector) == 0:
                print "Warning: tweet does not have any matching words:"
                print tweet_text
                return False
            for tweet_vector in self._previous_results[run_name][qid]:
                if self._check_tweet_redundant(new_tweet_vector,tweet_vector):
                    if self._debug:
                        print "%s is redundant" %(tweet_text)
                        print "-"*20
                    return True

        self._store_tweet(new_tweet_vector,run_name,qid)
        return False
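A short usage sketch for the class above. The similarity threshold, model file, and model type are placeholder values; what `EmbeddingModel` actually accepts for them depends on this project.

# placeholder constructor arguments: 'vectors.bin' and 'word2vec' are assumptions
dedup = PreviousResults(sim_treshold=0.8,
                        model_file='vectors.bin',
                        model_type='word2vec',
                        debug=True)

for tweet in ["storm hits the coast", "a storm is hitting the coast"]:
    if not dedup.is_redundant(tweet, "run1", "MB001"):
        print("posting: " + tweet)  # only tweets judged novel get posted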
Example #9
    ax2 = fig.add_subplot(2, 1, 1)  # ax1 is created above this snippet
    df1 = pd.DataFrame(tempa, index=tempb, columns=['Train'])
    df2 = pd.DataFrame(tempc, index=tempb, columns=['Test'])
    df1.plot(ax=ax1, kind='line', rot=360, grid='on')
    ax1.set_xticks(range(len(index_list)))
    ax1.set_xticklabels(range(len(index_list)))
    df2.plot(ax=ax2, kind='line', rot=360, grid='on')
    ax2.set_xticks(range(Epoch))
    ax2.set_xticklabels(range(Epoch))
    plt.show()


"""
USE_Bi=True
"""
w2v = EmbeddingModel(vocab_size, embedding_dim)
checkpoint = torch.load('Model/checkpoint.pth2.tar')
w2v.load_state_dict(checkpoint['state_dict'])  # model parameters

print(w2v.state_dict()["in_embed.weight"])
"""
if USE_Bi:
    print("Using BiLSTM")
    model = BiLSTM_Match(w2v,embedding_dim, hidden_dim, vocab_size, target, Batchsize, stringlen)
    model_path = "./Model/BiLSTMmodel.pth"
else:
    print("Using LSTM")
    model = LSTM_Match(embedding_dim, hidden_dim, vocab_size,target,Batchsize,stringlen)
    model_path = "./Model/LSTMmodel.pth"

print(w2v.in_embed==model.word_embeddings)
Example #10
                      help='size of skip window')
    parser.add_argument('--batch_size', type=int, dest='batch_size',
                      help='size of batch')
    parser.add_argument('--num_steps', type=int, dest='num_steps',
                      help='total iterations')
    parser.add_argument('--display_steps', type=int, dest='display_steps',
                      help='display steps')
    parser.add_argument('--e_steps', type=int, dest='e_steps',
                      help='e steps')
    parser.add_argument('--learning_rate', type=float, dest='learning_rate',
                      help='learning rate')

    c = Config()
    parser.parse_args(namespace=c)
    return c


if __name__ == '__main__':
    conf = parse_args()
    df = DataFactory(conf)
    df.load_data()
    t0 = time.time()
    with tf.Session() as sess:
        m = EmbeddingModel(conf)
        init_op = tf.initialize_all_variables()
        sess.run(init_op)
        m.train(sess, df)
    print 'Done train model, cost time: %0.3fs' % (time.time() - t0)
Example #11
        subsampling.append(word)

vocab_count = dict(Counter(subsampling).most_common(MAX_VOCAB_SIZE - 1))
vocab_count['<UNK>'] = 1

idx2word = [word for word in vocab_count.keys()]
word2idx = {word: i for i, word in enumerate(idx2word)}

# negative-sampling distribution: unigram counts raised to the 3/4 power, then normalised
nc = np.array([count for count in vocab_count.values()],
              dtype=np.float32)**(3. / 4.)
word_freqs = nc / np.sum(nc)

dataset = WordEmbeddingDataset(subsampling, word2idx, word_freqs)
dataloader = tud.DataLoader(dataset, BATCH_SIZE, shuffle=True)

model = EmbeddingModel(len(idx2word), EMBEDDING_SIZE)
model.to(device)
model.train()
optimizer = optim.Adam(model.parameters(), lr=LR)

# training loop: the model's forward pass returns the negative-sampling loss directly
for epoch in range(EPOCHS):
    pbar = tqdm(dataloader)
    pbar.set_description("[Epoch {}]".format(epoch))
    for i, (input_labels, pos_labels, neg_labels) in enumerate(pbar):
        input_labels = input_labels.to(device)
        pos_labels = pos_labels.to(device)
        neg_labels = neg_labels.to(device)
        model.zero_grad()
        loss = model(input_labels, pos_labels, neg_labels).mean()
        loss.backward()
        optimizer.step()
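After the loop finishes, a common follow-up is to pull the trained input-embedding matrix out of the model and query nearest neighbours. The sketch below assumes the model stores its input embeddings in an nn.Embedding attribute named in_embed (the checkpoint in Example #9 uses that name); adjust the attribute to whatever this particular EmbeddingModel defines.

import numpy as np

# assumed attribute: model.in_embed (an nn.Embedding holding the input vectors)
embeddings = model.in_embed.weight.detach().cpu().numpy()
embeddings = embeddings / np.maximum(
    np.linalg.norm(embeddings, axis=1, keepdims=True), 1e-8)

def most_similar(word, topn=5):
    # cosine similarity against every row; rows are already unit length
    sims = embeddings @ embeddings[word2idx[word]]
    best = np.argsort(-sims)[1:topn + 1]  # skip the query word itself
    return [(idx2word[i], float(sims[i])) for i in best]

print(most_similar(idx2word[10]))  # query an arbitrary vocabulary word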