def main():
    obj_etl = ETLDataPipeline("data/train.csv", "data/test.csv")
    train, test = obj_etl.read_data()
    train = obj_etl.drop_cols([
        'id', 'bin_0', 'bin_1', 'bin_2', 'bin_3', 'bin_4', 'nom_0', 'nom_1',
        'nom_2', 'nom_3', 'nom_4', 'ord_0', 'ord_1', 'ord_2', 'ord_3',
        'ord_4', 'ord_5', 'day', 'month'
    ])
    train = obj_etl.convert_dtypes(['nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9'])
    train = obj_etl.encoder(['nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9'])
    target = obj_etl.get_target('target')
    X_train, X_valid, y_train, y_valid = obj_etl.get_train_test(train, target)

    val1 = np.reshape(X_train['nom_5'].values, (-1, 1))
    val2 = np.reshape(X_train['nom_6'].values, (-1, 1))
    val3 = np.reshape(X_train['nom_7'].values, (-1, 1))
    val4 = np.reshape(X_train['nom_8'].values, (-1, 1))
    val5 = np.reshape(X_train['nom_9'].values, (-1, 1))
    val6 = np.reshape(y_train.values, (-1, 1))
    val11 = np.reshape(X_valid['nom_5'].values, (-1, 1))
    val22 = np.reshape(X_valid['nom_6'].values, (-1, 1))
    val33 = np.reshape(X_valid['nom_7'].values, (-1, 1))
    val44 = np.reshape(X_valid['nom_8'].values, (-1, 1))
    val55 = np.reshape(X_valid['nom_9'].values, (-1, 1))
    val66 = np.reshape(y_valid.values, (-1, 1))

    tf.random.set_seed(0)
    # 100 is the number of epochs, 32 is the batch size
    s = 100 * len(X_train) // 32
    learning_rate = tf.keras.optimizers.schedules.ExponentialDecay(0.01, s, 0.1)
    opt = tf.keras.optimizers.Adam(learning_rate)
    model = EmbeddingModel(
        hidden_units=3,
        output_units=1,
        embeddings_initializer=tf.random.normal,
        kernel_initializer=tf.keras.initializers.he_uniform(seed=0),
        dropout_rate=0.4,
        activation="sigmoid",
        trainable=True)
    model.compile(loss=tf.keras.losses.binary_crossentropy,
                  metrics=['accuracy'],
                  optimizer=opt)
    baseline_history = model.fit(
        (val1, val2, val3, val4, val5),
        val6,
        epochs=10,
        batch_size=32,
        validation_data=((val11, val22, val33, val44, val55), val66),
        class_weight={0: 0.5, 1: 0.5})
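# The EmbeddingModel subclass used above is not shown. A minimal sketch of what it
# might look like, assuming five integer-encoded categorical inputs that each get
# their own Embedding layer before a small dense head (the constructor signature,
# vocabulary sizes, and embedding dimension are assumptions, not the original API):
import tensorflow as tf


class EmbeddingModelSketch(tf.keras.Model):
    def __init__(self, vocab_sizes, embedding_dim=8, hidden_units=3,
                 output_units=1, dropout_rate=0.4, activation="sigmoid"):
        super().__init__()
        # one embedding table per categorical column
        self.embeddings = [
            tf.keras.layers.Embedding(v, embedding_dim) for v in vocab_sizes
        ]
        self.flatten = tf.keras.layers.Flatten()
        self.hidden = tf.keras.layers.Dense(hidden_units, activation="relu")
        self.dropout = tf.keras.layers.Dropout(dropout_rate)
        self.out = tf.keras.layers.Dense(output_units, activation=activation)

    def call(self, inputs, training=False):
        # inputs is a tuple of five (batch, 1) integer tensors
        embedded = [self.flatten(emb(x)) for emb, x in zip(self.embeddings, inputs)]
        h = self.hidden(tf.concat(embedded, axis=-1))
        h = self.dropout(h, training=training)
        return self.out(h)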
def train():
    global model
    data_reset = request.get_json()
    X = [data_reset['user_history'], data_reset['item_history']]
    y = data_reset['rating_history']
    nb_users, nb_items = data_reset['nb_users'], data_reset['nb_items']
    model = EmbeddingModel(nb_users, nb_items, embedding_size=30)
    model.fit(X, y, verbose=True)
    return jsonify({'info': 'successful'})
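# A hypothetical client call to the training endpoint above. The payload keys mirror
# the handler; the route path and port are assumptions for illustration only:
import requests

payload = {
    'user_history': [0, 0, 1, 2],
    'item_history': [3, 1, 2, 0],
    'rating_history': [5, 3, 4, 1],
    'nb_users': 3,
    'nb_items': 4,
}
resp = requests.post('http://localhost:5000/train', json=payload)
print(resp.json())  # expected: {'info': 'successful'}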
def extract_embeddings():
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('database', type=str)
    parser.add_argument('collection', type=str)
    parser.add_argument('outfile', type=str)
    # nargs='?' makes the defaults effective when these arguments are omitted
    parser.add_argument('dimensions', type=int, nargs='?', default=100)
    parser.add_argument('epochs', type=int, nargs='?', default=10)
    args = parser.parse_args()

    model = EmbeddingModel(args.database, args.collection)
    model.train(args.dimensions, args.epochs)
    model.save(f'data/{args.outfile}')
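# A hypothetical invocation of the script above (the script name, database,
# collection, and output file names are placeholders):
#
#   python extract_embeddings.py mydb articles article_vectors 100 10
#
# which would train 100-dimensional embeddings for 10 epochs and write the result
# to data/article_vectors.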
def main():
    # hyperparameter settings
    emb_dim = 50
    epochs = 2
    model_path = 'model.h5'
    negative_samples = 1
    num_words = 10000
    window_size = 1

    # corpus
    text = load_data(filepath='../chap04/data/ja.text8')

    # vocabulary
    vocab = build_vocablary(text, num_words)

    # create dataset
    x, y = create_dataset(text, vocab, num_words, window_size, negative_samples)

    # build the model
    model = EmbeddingModel(num_words, emb_dim)
    model = model.build()
    model.compile(optimizer='adam', loss='binary_crossentropy')

    # callbacks
    callbacks = [
        EarlyStopping(patience=1),
        ModelCheckpoint(model_path, save_best_only=True)
    ]

    # train the model
    model.fit(x=x, y=y,
              batch_size=128,
              epochs=epochs,
              validation_split=0.2,
              callbacks=callbacks)

    # prediction
    model = load_model(model_path)
    api = InferenceAPI(model, vocab)
    pprint(api.most_similar(word='日本'))
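# EmbeddingModel.build() is not shown above. A plausible sketch of a skip-gram model
# with negative sampling that would fit the compile/fit calls: two integer inputs
# (target word, context word), an embedding lookup for each, a dot product, and a
# sigmoid that scores whether the pair is a true context or a negative sample. All
# layer names and the function name are assumptions:
import tensorflow as tf


def build_skipgram_model(num_words, emb_dim):
    target = tf.keras.layers.Input(shape=(1,), dtype='int32')
    context = tf.keras.layers.Input(shape=(1,), dtype='int32')
    target_emb = tf.keras.layers.Embedding(num_words, emb_dim)(target)
    context_emb = tf.keras.layers.Embedding(num_words, emb_dim)(context)
    # dot product of the two embedding vectors -> (batch, 1, 1)
    dot = tf.keras.layers.Dot(axes=-1)([target_emb, context_emb])
    dot = tf.keras.layers.Reshape((1,))(dot)
    output = tf.keras.layers.Activation('sigmoid')(dot)
    return tf.keras.Model(inputs=[target, context], outputs=output)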
trace1 = go.Scatter(x=decomp[:, 0], y=decomp[:, 1],
                    mode='markers+text',
                    text=words,
                    marker=dict(size=12,
                                color=decomp[:, 1],
                                colorscale='Viridis',
                                opacity=0.8),
                    textposition='bottom center')
dataTrace = [trace1]
layout = go.Layout(margin=dict(l=0, r=0, b=0, t=0), font=dict(size=20))
fig = go.Figure(data=dataTrace, layout=layout)
py.plot(fig, filename='bert-embedding-initial')

if args.embedding_type == 'linear':
    model = EmbeddingModel(len(args.classes))
elif args.embedding_type == 'conv':
    model = ConvolutionalEmbeddingModel(len(args.classes))
else:
    print("Model type [{0}] not supported".format(args.embedding_type))
    exit(1)

eval_dataset, eval_labels, _, __, ___ = generateData(
    args.eval_file, eval_list, 1.0, args.load_embedding_dict_from_file,
    args.save_embedding_dict, args.verbose,
    'embedding_dicts/animal_embedding_dict.pkl', False, args.classes)

embedding_dict = {}
if len(args.model_checkpoint) > 0:
    checkpoint = torch.load(args.model_checkpoint, map_location='cpu')
    model.load_state_dict(checkpoint['model_state_dict'])
    embedding = checkpoint['model_state_dict']['l2.weight']
parser.add_argument('--learning-rate', type=float, default=0.001, metavar='lr',
                    help='Learning rate for training')
parser.add_argument('--train-split-percentage', type=float, default=0.8, metavar='x',
                    help='Percentage of data for training')
parser.add_argument('--save-embedding-dict', action='store_true',
                    help='Save computed embeddings to file')
parser.add_argument('--load-embedding-from-file', action='store_true',
                    help='Load precomputed embeddings from file')
parser.add_argument('--model-checkpoint', type=str, default='',
                    help='Model checkpoint to resume training')
parser.add_argument('--embedding-type', type=str, default='linear',
                    help='Model type: linear or conv')
args = parser.parse_args()

if torch.cuda.is_available():
    args.device = torch.device('cuda')
    torch.cuda.manual_seed(np.random.randint(1, 10000))
    torch.backends.cudnn.enabled = True

args.classes = ["desert", "rainforest", "grassland", "tundra", "ocean"]

if args.embedding_type == 'linear':
    model = EmbeddingModel(len(args.classes))
elif args.embedding_type == 'conv':
    model = ConvolutionalEmbeddingModel(len(args.classes))
else:
    print("Model type [{0}] not supported".format(args.embedding_type))
    exit(1)

if torch.cuda.is_available():
    model = model.cuda()

criterion = torch.nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)
model = model.train()
starting_epoch = 0
running_loss = 0.0

if len(args.model_checkpoint) > 0:
    checkpoint = torch.load(args.model_checkpoint)
    model.load_state_dict(checkpoint['model_state_dict'])
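# Neither EmbeddingModel nor ConvolutionalEmbeddingModel is defined in these
# snippets. A minimal sketch of the linear variant, assuming it maps a fixed-size
# input embedding to class logits through two linear layers; the input and hidden
# sizes and the l1/l2 attribute names are assumptions (l2 matches the 'l2.weight'
# key read from the checkpoint in the previous snippet):
import torch


class EmbeddingModelSketch(torch.nn.Module):
    def __init__(self, num_classes, input_dim=300, hidden_dim=128):
        super().__init__()
        self.l1 = torch.nn.Linear(input_dim, hidden_dim)
        self.relu = torch.nn.ReLU()
        self.l2 = torch.nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        # x: (batch, input_dim) precomputed embeddings
        return self.l2(self.relu(self.l1(x)))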
class PreviousResults(object):
    """Class used to store previously posted tweets for each run and query,
    as well as check the novelty of a new tweet.
    """

    def __init__(self, sim_treshold, model_file, model_type, debug=False):
        self._sim_treshold, self._debug = sim_treshold, debug
        self._previous_results = {}
        self._embedding_model = EmbeddingModel(model_file, model_type)

    def _store_tweet(self, new_tweet_vector, run_name, qid):
        if self._debug:
            print "store new tweet for query %s run %s" % (qid, run_name)
        if run_name not in self._previous_results:
            self._previous_results[run_name] = {}
        if qid not in self._previous_results[run_name]:
            self._previous_results[run_name][qid] = []
        self._previous_results[run_name][qid].append(new_tweet_vector)

    def _check_tweet_redundant(self, new_tweet_vector, tweet_vector):
        vector_sim = self._embedding_model.similarity(new_tweet_vector, tweet_vector)
        if self._debug:
            print "the similarity is %f" % (vector_sim)
        if vector_sim >= self._sim_treshold:
            return True
        else:
            return False

    def is_redundant(self, tweet_text, run_name, qid):
        sentence_list = re.findall(r"\w+", tweet_text.lower())
        new_tweet_vector = self._embedding_model.get_sentence_vector(sentence_list)
        if run_name not in self._previous_results:
            self._previous_results[run_name] = {}
            self._previous_results[run_name][qid] = []
        elif qid not in self._previous_results[run_name]:
            self._previous_results[run_name][qid] = []
        else:
            if np.count_nonzero(new_tweet_vector) == 0:
                print "Warning: tweet does not have any matching words:"
                print tweet_text
                return False
            for tweet_vector in self._previous_results[run_name][qid]:
                if self._check_tweet_redundant(new_tweet_vector, tweet_vector):
                    if self._debug:
                        print "%s is redundant" % (tweet_text)
                        print "-" * 20
                    return True
        self._store_tweet(new_tweet_vector, run_name, qid)
        return False
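# Hypothetical usage of the class above. The model file, model type, threshold, run
# name, and query id are placeholders; EmbeddingModel is assumed to expose the
# similarity() and get_sentence_vector() methods called by the class:
checker = PreviousResults(sim_treshold=0.8,
                          model_file="vectors.bin",
                          model_type="word2vec",
                          debug=True)
if not checker.is_redundant("storm hits the coast tonight", "run_1", "MB254"):
    pass  # tweet is novel for this run/query, safe to post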
ax2 = fig.add_subplot(2, 1, 1)
df1 = pd.DataFrame(tempa, index=tempb, columns=['Train'])
df2 = pd.DataFrame(tempc, index=tempb, columns=['Test'])
df1.plot(ax=ax1, kind='line', rot=360, grid='on')
ax1.set_xticks(range(len(index_list)))
ax1.set_xticklabels(range(len(index_list)))
df2.plot(ax=ax2, kind='line', rot=360, grid='on')
ax2.set_xticks(range(Epoch))
ax2.set_xticklabels(range(Epoch))
plt.show()

USE_Bi = True

w2v = EmbeddingModel(vocab_size, embedding_dim)
checkpoint = torch.load('Model/checkpoint.pth2.tar')
w2v.load_state_dict(checkpoint['state_dict'])
# model parameters
print(w2v.state_dict()["in_embed.weight"])

if USE_Bi:
    print("Using BiLSTM")
    model = BiLSTM_Match(w2v, embedding_dim, hidden_dim, vocab_size, target,
                         Batchsize, stringlen)
    model_path = "./Model/BiLSTMmodel.pth"
else:
    print("Using LSTM")
    model = LSTM_Match(embedding_dim, hidden_dim, vocab_size, target,
                       Batchsize, stringlen)
    model_path = "./Model/LSTMmodel.pth"
print(w2v.in_embed == model.word_embeddings)
                        help='size of skip window')
    parser.add_argument('--batch_size', type=int, dest='batch_size',
                        help='size of batch')
    parser.add_argument('--num_steps', type=int, dest='num_steps',
                        help='total iterations')
    parser.add_argument('--display_steps', type=int, dest='display_steps',
                        help='display steps')
    parser.add_argument('--e_steps', type=int, dest='e_steps',
                        help='e steps')
    parser.add_argument('--learning_rate', type=float, dest='learning_rate',
                        help='learning rate')
    c = Config()
    parser.parse_args(namespace=c)
    return c


if __name__ == '__main__':
    conf = parse_args()
    df = DataFactory(conf)
    df.load_data()
    t0 = time.time()
    with tf.Session() as sess:
        m = EmbeddingModel(conf)
        init_op = tf.initialize_all_variables()
        sess.run(init_op)
        m.train(sess, df)
    print 'Done training model, cost time: %0.3fs' % (time.time() - t0)
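# The Config class used as the argparse namespace above is not shown. A minimal
# sketch, assuming it only carries default values that parse_args() can overwrite
# when the corresponding flags are supplied (all values are illustrative):
class ConfigSketch(object):
    skip_window = 2
    batch_size = 128
    num_steps = 100000
    display_steps = 1000
    e_steps = 10
    learning_rate = 0.1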
    subsampling.append(word)

vocab_count = dict(Counter(subsampling).most_common(MAX_VOCAB_SIZE - 1))
vocab_count['<UNK>'] = 1
idx2word = [word for word in vocab_count.keys()]
word2idx = {word: i for i, word in enumerate(idx2word)}
nc = np.array([count for count in vocab_count.values()], dtype=np.float32) ** (3. / 4.)
word_freqs = nc / np.sum(nc)

dataset = WordEmbeddingDataset(subsampling, word2idx, word_freqs)
dataloader = tud.DataLoader(dataset, BATCH_SIZE, shuffle=True)

model = EmbeddingModel(len(idx2word), EMBEDDING_SIZE)
model.to(device)
model.train()
optimizer = optim.Adam(model.parameters(), lr=LR)

for epoch in range(EPOCHS):
    pbar = tqdm(dataloader)
    pbar.set_description("[Epoch {}]".format(epoch))
    for i, (input_labels, pos_labels, neg_labels) in enumerate(pbar):
        input_labels = input_labels.to(device)
        pos_labels = pos_labels.to(device)
        neg_labels = neg_labels.to(device)

        model.zero_grad()
        loss = model(input_labels, pos_labels, neg_labels).mean()
        loss.backward()
        optimizer.step()
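# The EmbeddingModel in this last snippet is not shown; a common skip-gram with
# negative sampling implementation matching the call
# model(input_labels, pos_labels, neg_labels) might look like this sketch. The
# class name, tensor shapes, and attribute names other than the constructor
# arguments are assumptions:
import torch
import torch.nn as nn
import torch.nn.functional as F


class SGNSModelSketch(nn.Module):
    def __init__(self, vocab_size, embed_size):
        super().__init__()
        self.in_embed = nn.Embedding(vocab_size, embed_size)   # center words
        self.out_embed = nn.Embedding(vocab_size, embed_size)  # context words

    def forward(self, input_labels, pos_labels, neg_labels):
        # input_labels: (B,)   pos_labels: (B, C)   neg_labels: (B, C * K)
        input_emb = self.in_embed(input_labels)        # (B, D)
        pos_emb = self.out_embed(pos_labels)           # (B, C, D)
        neg_emb = self.out_embed(neg_labels)           # (B, C*K, D)

        pos_score = torch.bmm(pos_emb, input_emb.unsqueeze(2)).squeeze(2)   # (B, C)
        neg_score = torch.bmm(neg_emb, -input_emb.unsqueeze(2)).squeeze(2)  # (B, C*K)

        loss = F.logsigmoid(pos_score).sum(1) + F.logsigmoid(neg_score).sum(1)
        return -loss  # per-example loss; the caller takes .mean()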