def main(args):
    """Train an AWD-LSTM language model and print its test loss/perplexity.

    The three corpora named on ``args`` are loaded, tokenized with a
    vocabulary built from the training split, batchified, moved to the GPU,
    and then handed to ``train`` / ``validate``.
    """
    # Load the raw splits in train / dev / test order.
    splits = [
        get_data(path)
        for path in (args.path_train, args.path_dev, args.path_test)
    ]

    # The vocabulary is built from the training split only.
    word_to_int, int_to_word, sz1, sz2, sz3 = get_vocab(
        splits[0], args.t1, args.t2)

    splits = [tokenize(split, word_to_int) for split in splits]

    # Dev and test share the dev batch size.
    batch_sizes = (args.batch_size, args.dev_batch_size, args.dev_batch_size)
    splits = [
        batchify(split, size) for split, size in zip(splits, batch_sizes)
    ]
    train_data, dev_data, test_data = [split.cuda() for split in splits]

    vocab_sizes = [sz1, sz2, sz3]
    # Halving sequence 2**N, 2**(N-1), ..., 2**1.
    g_list = [int(2 ** (args.N - level)) for level in range(args.N)]

    model = AWD_LSTM(
        args.embed_dim, args.h_size, args.layers, vocab_sizes,
        args.m, args.k, args.N, g_list, args.n_layers,
        args.dropouti, args.dropouth, args.dropout,
        args.dropout_embed, args.dropoutw,
    ).cuda()

    train(dev_data, train_data, model, args)

    test_loss, test_perplexity = validate(test_data, model)
    print('test loss=', test_loss, 'test perplexity=', test_perplexity)
def fit(self, train_data):
    """Learn class priors and per-word likelihoods from the training data.

    Args:
        train_data: Indexable pair; ``train_data[0]`` is passed to
            ``get_feature_vectors(..., binary=True)`` and ``train_data[1]``
            holds the labels (``1`` for positive, ``-1`` for negative).

    Side effects:
        Sets ``self.pos_prob`` / ``self.neg_prob`` and updates
        ``self.pos_word`` / ``self.neg_word`` in place: first with the
        accumulated feature values per class, then with add-one smoothed
        ratios.

    Raises:
        ZeroDivisionError: If no label equals ``1`` or ``-1``.
    """
    vectors = get_feature_vectors(train_data[0], binary=True)
    labels = train_data[1]

    label_counts = Counter(labels)
    pos_count = label_counts[1]
    neg_count = label_counts[-1]
    total = pos_count + neg_count
    self.pos_prob = float(pos_count) / total
    self.neg_prob = float(neg_count) / total

    # A set keeps the membership test in the innermost loop O(1); the
    # original list made every test linear in the number of stop words.
    current_vocab = get_vocab()
    stop_indices = {
        current_vocab[word]
        for word in self.stop_words
        if word in current_vocab
    }

    for i, vector in enumerate(vectors):  # one vector per review
        label = labels[i]
        if label == 1:
            accumulator = self.pos_word
        elif label == -1:
            accumulator = self.neg_word
        else:
            # Any other label contributes to neither class.
            continue
        for j, value in enumerate(vector):  # one entry per vocabulary slot
            if j not in stop_indices:
                accumulator[j] += value

    # Add-one smoothing, written back element by element so the container
    # type of pos_word / neg_word is preserved.
    for i in range(len(self.pos_word)):
        self.pos_word[i] = (float(self.pos_word[i]) + 1) / (pos_count + self.vector_size)
    for i in range(len(self.neg_word)):
        self.neg_word[i] = (float(self.neg_word[i]) + 1) / (neg_count + self.vector_size)
def __init__(self, obs_space, action_space, model_dir, device=None,
             argmax=False, num_envs=1, use_memory=False, use_text=False):
    """Build an evaluation-mode agent around a saved ``ACModel``.

    The model is constructed for the preprocessed observation space, its
    weights are restored from ``model_dir``, it is moved to ``device`` and
    switched to eval mode. If the observation preprocessor carries a
    vocabulary, that vocabulary is restored from ``model_dir`` as well.
    """
    obs_space, self.preprocess_obss = utils.get_obss_preprocessor(obs_space)

    model = ACModel(obs_space, action_space,
                    use_memory=use_memory, use_text=use_text)
    self.acmodel = model
    self.device = device
    self.argmax = argmax
    self.num_envs = num_envs

    # Recurrent models get one zero-initialised memory row per environment.
    if model.recurrent:
        self.memories = torch.zeros(self.num_envs, model.memory_size)

    saved_state = utils.get_model_state(model_dir)
    model.load_state_dict(saved_state)
    model.to(self.device)
    model.eval()

    if hasattr(self.preprocess_obss, "vocab"):
        saved_vocab = utils.get_vocab(model_dir)
        self.preprocess_obss.vocab.load_vocab(saved_vocab)
def NN():
    """Train on ``train.csv`` at several learning rates and plot the losses.

    One model is fitted per learning rate; the loss histories returned by
    ``fit`` are passed to ``plot_lr`` and saved as ``train_loss_nn.pdf``.
    """
    # Read training data
    (train_tweet_id2text, train_tweet_id2issue,
     train_tweet_id2author_label, train_tweet_id2label) = ReadFile('train.csv')

    word2index = get_vocab(train_tweet_id2text)
    data_dict = combine_vec(word2index, train_tweet_id2text,
                            train_tweet_id2author_label,
                            train_tweet_id2issue, bow=False)

    # Features and labels are aligned by iterating the same key order.
    tweet_ids = list(data_dict)
    data = np.array([data_dict[tid] for tid in tweet_ids])
    labels = np.array([int(train_tweet_id2label[tid]) for tid in tweet_ids])

    n_class = len(set(labels))
    n_sample, n_feature = np.shape(data)
    print(np.shape(data))

    lrates = [0.2, 0.4, 0.9]
    all_loss = []
    for rate in lrates:
        # NOTE(review): this builds LogRegression although the function is
        # named NN and writes 'train_loss_nn.pdf' -- confirm this is intended.
        model = LogRegression(n_feature, n_class, lrate=rate, verbose=True)
        train_loss = model.fit(data, labels, max_iter=500)
        print(len(train_loss))
        all_loss.append(train_loss)

    file_name = 'train_loss_nn.pdf'
    plot_lr(lrates, all_loss, file_name)
def grid_search():
    """Train one basic Word2Vec model per hyper-parameter combination.

    Sweeps embedding size (outermost), learning rate and window size
    (innermost). The default TensorFlow graph is reset before every run and
    each model is saved under an ID that encodes its hyper-parameters.
    """
    embedding_sizes = [32, 64]
    learning_rates = [0.15, 0.10]
    window_sizes = [2, 3]

    # The vocabulary is independent of the hyper-parameters: load it once.
    word_to_ix, ix_to_word, subsampled_words = u.get_vocab(
        vocab_path="../resources/vocab.txt",
        antivocab_path="../resources/antivocab.txt")

    combinations = [
        (e_size, lr, w_size)
        for e_size in embedding_sizes
        for lr in learning_rates
        for w_size in window_sizes
    ]

    for e_size, lr, w_size in combinations:
        run_id = "basic_w2v_E%d_LR%.3f_W%d" % (e_size, lr, w_size)
        tf.reset_default_graph()
        train_basic_w2v(
            dataset="../resources/eurosense_sentences.txt",
            word_to_ix=word_to_ix,
            subsampled_words=subsampled_words,
            model_path="../resources/models",
            model_ID=run_id,
            epochs=30,
            batch_size=64,
            embedding_size=e_size,
            lr=lr,
            window_size=w_size,
            neg_samples=16,
            csv_export=False,
        )
def __init__(self, vocab_file):
    """Load the vocabulary mapping and cache the special-token entries.

    ``self.map`` is whatever ``utils.get_vocab(vocab_file)`` returns,
    ``self.inv_map`` is its inverse, and ``bos`` / ``eos`` / ``unk`` / ``pad``
    hold the values stored under ``<s>``, ``</s>``, ``<UNK>`` and ``<PAD>``.

    Raises:
        KeyError: If one of the four special tokens is missing.
    """
    forward = utils.get_vocab(vocab_file)
    self.map = forward
    self.inv_map = dict((value, key) for key, value in forward.items())

    special_tokens = (
        ('bos', '<s>'),
        ('eos', '</s>'),
        ('unk', '<UNK>'),
        ('pad', '<PAD>'),
    )
    for attribute, token in special_tokens:
        setattr(self, attribute, self.map[token])
def __init__(self, trainloader, valloader, config):
    """Initialize configurations."""

    def copy_from_config(*names):
        # Mirror the named config fields onto this instance, in order.
        for name in names:
            setattr(self, name, getattr(config, name))

    # Data loader.
    self.trainloader = trainloader
    self.validloader = valloader

    # Directories and dataset.
    copy_from_config('main_dir', 'model_name', 'data_name')

    # Model configurations.
    self.vocab = get_vocab(self.main_dir, self.data_name)
    copy_from_config('D', 'D_prime', 'd', 'K', 'rnn_num', 'margin', 'pt_path')

    # Training configurations (the original author annotated lr with 0.001
    # and lr_decay with 0.98).
    copy_from_config('mode', 'batch_size', 'img_size', 'crop_size',
                     'lr', 'lr_decay', 'init_ep', 'max_ep')

    # Miscellaneous.
    self.device = torch.device(
        'cuda' if torch.cuda.is_available() else 'cpu')
    copy_from_config('log_step', 'draw_step', 'acc_step', 'save_step',
                     'use_visdom', 'init_from')
    self.best_r = np.array([0, 0, 0])
    self.best_ri = np.array([0, 0, 0])

    # Build model.
    self.build_model()

    # Optional Visdom plots: one loss plot and two accuracy plots.
    if self.use_visdom:
        self.viz = Visdom()
        self.loss_plot = create_vis(
            self.viz, self.model_name, 'loss', self.max_ep, 5)
        self.acc_plot_i2t = create_vis(
            self.viz, self.model_name, 'accuracy', self.max_ep, 100)
        self.acc_plot_t2i = create_vis(
            self.viz, self.model_name, 'accuracy', self.max_ep, 100)
def __init__(self, obs_space, action_space, model_dir, seed, n_columns,
             device=None, argmax=False, num_envs=1, use_memory=False,
             use_text=False):
    """Build an evaluation-mode agent around a multi-column ``PNNModel``.

    ``n_columns`` columns are added to the model base, parameters are loaded
    from the status file for ``model_dir`` / ``seed``, and the model is moved
    to ``device`` and put in eval mode. The preprocessor vocabulary, if any,
    is restored as well.
    """
    obs_space, self.preprocess_obss = utils.get_obss_preprocessor(obs_space)

    model = PNNModel(obs_space, action_space, use_memory=use_memory,
                     use_text=use_text, use_pnn=True, base=None)
    self.acmodel = model
    self.device = device
    self.argmax = argmax
    self.num_envs = num_envs

    if model.recurrent:
        self.memories = torch.zeros(self.num_envs, model.memory_size)

    # Grow the base by one column per task.
    for _ in range(n_columns):
        model.base.new_task()

    # Load column parameters from the status file.
    # NOTE(review): the original comment says "all columns" but the loop
    # covers indices 0 .. n_columns - 2 only -- confirm the last column is
    # meant to be skipped. Earlier experiments with per-column paths,
    # freezing previous columns and resuming the current column were left
    # disabled by the author.
    status_path = utils.get_status_path(model_dir, seed)
    for column in range(n_columns - 1):
        utils.pnn_load_state_dict(model, column, status_path)

    model.to(self.device)
    model.eval()

    if hasattr(self.preprocess_obss, "vocab"):
        saved_vocab = utils.get_vocab(model_dir, seed)
        self.preprocess_obss.vocab.load_vocab(saved_vocab)
def LR():
    """Train logistic regression on ``train.csv`` and label ``test.csv``.

    Bag-of-words features are built with ``combine_vec(..., bow=True)``, a
    ``LogRegression`` model is fitted, every test tweet is labelled with
    ``model.predict`` and the result is written to ``test_lr.csv``.
    """
    # Read training data
    (train_tweet_id2text, train_tweet_id2issue,
     train_tweet_id2author_label, train_tweet_id2label) = ReadFile('train.csv')

    word2index = get_vocab(train_tweet_id2text)
    data_dict = combine_vec(word2index, train_tweet_id2text,
                            train_tweet_id2author_label,
                            train_tweet_id2issue, bow=True)

    # Features and labels are aligned by iterating the same key order.
    data = [data_dict[k] for k in data_dict]
    labels = [int(train_tweet_id2label[k]) for k in data_dict]

    n_class = len(set(labels))
    print('dataset shape:', np.shape(data))
    n_sample, n_feature = np.shape(data)

    model = LogRegression(n_feature, n_class, lrate=0.8, verbose=True)
    model.fit(data, labels, max_iter=500)

    # Read test data
    (test_tweet_id2text, test_tweet_id2issue,
     test_tweet_id2author_label, test_tweet_id2label) = ReadFile('test.csv')

    # The test set reuses the training vocabulary so features line up.
    test_data_dict = combine_vec(word2index, test_tweet_id2text,
                                 test_tweet_id2author_label,
                                 test_tweet_id2issue, bow=True)

    # Predict a label for every test tweet and store it in the dictionary.
    for tweet_id in test_tweet_id2text:
        test_x = test_data_dict[tweet_id]
        test_tweet_id2label[tweet_id] = model.predict(test_x)

    # Save predicted labels in 'test_lr.csv'
    SaveFile(test_tweet_id2text, test_tweet_id2issue,
             test_tweet_id2author_label, test_tweet_id2label, 'test_lr.csv')
def _load_caps(self):
    """Load the pickled captions and derive the vocabulary from them.

    Sets ``self.caption_dict`` from the pickle at ``self.caps_path`` and
    ``self.vocab`` / ``self.mapping`` from ``utils.get_vocab`` (stop words are
    kept). When ``self.process_text`` is truthy the captions are processed
    via ``self.process_captions()`` and "Done" is printed.
    """
    # Pickle streams are binary: text mode can corrupt them on platforms
    # that translate newlines and fails outright on Python 3.
    # NOTE: unpickling executes arbitrary code -- only load trusted files.
    with open(self.caps_path, 'rb') as fd:
        self.caption_dict = pkl.load(fd)

    self.vocab, self.mapping = utils.get_vocab(self.caption_dict,
                                               remove_stop_words=False)

    if self.process_text:
        self.process_captions()
        # Parenthesised so the statement is valid on Python 2 and Python 3.
        print("Done")
def cv_NN(kfold):
    """Run contiguous k-fold cross-validation of ``NeuralNet`` on ``train.csv``.

    The samples are cut into ``kfold`` contiguous folds of
    ``ceil(n_sample / kfold)`` rows. Each fold is held out once while a fresh
    model is trained on the rest. Per-fold and mean accuracy are printed.

    Args:
        kfold: Number of folds requested. Folds that would be empty (because
            the ceil-sized folds already covered all samples) are skipped
            instead of being scored on zero samples.
    """
    # Read training data
    (train_tweet_id2text, train_tweet_id2issue,
     train_tweet_id2author_label, train_tweet_id2label) = ReadFile('train.csv')

    word2index = get_vocab(train_tweet_id2text)
    data_dict = combine_vec(word2index, train_tweet_id2text,
                            train_tweet_id2author_label,
                            train_tweet_id2issue, bow=False)

    # Features and labels are aligned by iterating the same key order.
    data = np.array([data_dict[k] for k in data_dict])
    labels = np.array([int(train_tweet_id2label[k]) for k in data_dict])

    n_class = len(set(labels))
    n_sample, n_feature = np.shape(data)

    print('Cross validation for Neural network')
    # float() keeps the division exact even if "/" is integer division.
    fold_size = int(np.ceil(n_sample / float(kfold)))
    print('Fold size:', fold_size)

    accuracy = []
    for k in range(kfold):
        tstart = k * fold_size
        tend = min(n_sample, tstart + fold_size)
        if tstart >= tend:
            # Every sample has already been held out once; scoring an empty
            # fold would only inject a meaningless value into the mean.
            break

        # Boolean mask selects the held-out rows; its inverse is the
        # training set (row order is preserved in both).
        held_out = np.zeros(n_sample, dtype=bool)
        held_out[tstart:tend] = True
        training_x, test_x = data[~held_out], data[held_out]
        training_y, test_y = labels[~held_out], labels[held_out]

        model = NeuralNet(n_feature, n_class, lrate=0.9, verbose=False)
        model.fit(training_x, training_y, max_iter=500)
        accuracy.append(model.score(test_x, test_y))
        print('Fold', k, 'accuracy', accuracy[-1])

    print('Mean accuracy', np.mean(accuracy))
def evaluate(target_words, top_k=10, synaware_w2v=True):
    """
    Qualitatively inspect learned embeddings by printing, for each test word,
    the words the model considers most similar to it.
    :param target_words: Test words to look up neighbours for, as List; words
        missing from the vocabulary are dropped
    :param top_k: Number of closest words to print per test word
    :param synaware_w2v: True: restore a SynsetAwareWord2Vec model (default);
        False: restore a basic Word2Vec model
    :return: None
    """
    print("Loading vocabularies...")
    word_to_ix, ix_to_word, subsampled_words = u.get_vocab(
        vocab_path="../resources/vocab.txt",
        antivocab_path="../resources/antivocab.txt")

    print("Creating model...")
    # Both model classes are built with the same hyper-parameters.
    model_class = SynsetAwareWord2Vec if synaware_w2v else Word2Vec
    model = model_class(subsampled_words=subsampled_words,
                        vocabulary_size=len(word_to_ix),
                        embedding_size=EMBEDDING_SIZE,
                        learning_rate=LEARNING_RATE,
                        window_size=WINDOW_SIZE,
                        neg_samples=NEG_SAMPLES)

    saver = tf.train.Saver()
    with tf.Session() as sess:
        print("Loading model...")
        checkpoint = MODEL_PATH_SYN_W2V if synaware_w2v else MODEL_PATH_W2V
        saver.restore(sess, checkpoint)

        target_ixs = [word_to_ix[w] for w in target_words if w in word_to_ix]
        sim_val = sess.run(model.similarity,
                           feed_dict={model.data["sim_test"]: target_ixs})

        for row, target_ix in enumerate(target_ixs):
            print("Closest %d words to %s:" % (top_k, ix_to_word[target_ix]))
            # Sort by descending similarity and skip rank 0.
            closest_words = (-sim_val[row, :]).argsort()[1:top_k + 1]
            for rank in range(top_k):
                neighbour = ix_to_word[closest_words[rank]]
                print("\t%d. %s" % (rank + 1, neighbour))
def __init__(self, config, transform, mode):
    """Set up the caption dataset for the given mode.

    Copies the relevant fields from ``config``, resolves the split and data
    list, loads the vocabulary, prints the caption annotation path for
    debugging and finally loads the captions via ``self.get_coco()``.
    """
    self.main_dir = config.main_dir
    self.data_name = config.data_name
    self.mode = mode
    self.max_token_len = config.max_token_len
    self.split = self.get_split_type(config)
    self.data = self.get_data_list()
    self.vocab = get_vocab(self.main_dir, self.data_name)
    self.transform = transform

    # --- debugging: show which caption file this configuration maps to ---
    self.coco_split = config.coco_split
    # Compare by value: "is" tests object identity, which is not guaranteed
    # for equal strings (and is a SyntaxWarning on a literal since 3.8).
    year = '2014' if self.coco_split == 'rval' else '2017'
    tmp = pth(self.main_dir, self.data_name, 'annotations',
              'captions_{}{}.json'.format(self.mode, year))
    print('caption path: {}'.format(tmp))
    # --- end debugging ---

    self.caps = self.get_coco()
def __init__(self, obs_space, action_space, model_dir, device=None,
             argmax=False, num_envs=1):
    """Build an evaluation-mode agent around a saved ``QModel``.

    The model weights are restored from ``model_dir``; the model is moved to
    ``device`` and switched to eval mode. If the observation preprocessor has
    a vocabulary, it is restored from ``model_dir`` too.
    """
    obs_space, self.preprocess_obss = utils.get_obss_preprocessor(obs_space)

    q_network = QModel(obs_space, action_space)
    self.model = q_network
    self.device = device
    self.argmax = argmax
    self.num_envs = num_envs

    saved_state = utils.get_model_state(model_dir)
    q_network.load_state_dict(saved_state)
    q_network.to(self.device)
    q_network.eval()

    if hasattr(self.preprocess_obss, "vocab"):
        saved_vocab = utils.get_vocab(model_dir)
        self.preprocess_obss.vocab.load_vocab(saved_vocab)
def fasttext_train(
        trained_model_dir: OutputDirectory(type='AnyDirectory'),
        training_data_dir: InputDirectory(type='AnyDirectory') = None,
        validation_data_dir: InputDirectory(type='AnyDirectory') = None,
        char2index_dir: InputDirectory(type='AnyDirectory') = None,
        epochs=1,
        batch_size=64,
        learning_rate=0.0005,
        embedding_dim=128
):
    """Train a ``FastText`` classifier and report the wall-clock time.

    Reads ``train.txt`` from ``training_data_dir`` and ``dev.txt`` from
    ``validation_data_dir``, sizes the model from the character vocabulary
    and the class list, and delegates the training loop to ``train`` (which
    receives ``trained_model_dir``).
    """
    banner = '============================================'
    print(banner)
    print('training_data_dir:', training_data_dir)
    print('validation_data_dir:', validation_data_dir)

    c2i = get_vocab(char2index_dir)
    class_ = get_classs()

    # Fixed settings for this entry point.
    max_len_ = 38
    stop_patience = 5
    n_class_ = len(class_)
    vocab_size_ = len(c2i)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    train_path = os.path.join(training_data_dir, 'train.txt')
    train_samples = load_dataset(file_path=train_path, max_len=max_len_,
                                 char2index_dir=char2index_dir)
    dev_path = os.path.join(validation_data_dir, 'dev.txt')
    dev_samples = load_dataset(file_path=dev_path, max_len=max_len_,
                               char2index_dir=char2index_dir)

    train_iter = DataIter(train_samples, batch_size)
    dev_iter = DataIter(dev_samples, batch_size)

    model = FastText(vocab_size=vocab_size_, n_class=n_class_,
                     embed_dim=embedding_dim)

    started = time.time()
    train(model, trained_model_dir, train_iter, dev_iter=dev_iter,
          epochs=epochs, learning_rate=learning_rate,
          stop_patience=stop_patience, device=device)
    finished = time.time()

    print('\nspent time: %.2f sec' % (finished - started))
    print(banner)
def NN():
    """Train ``NeuralNet`` on ``train.csv`` and label the tweets in ``test.csv``.

    Features come from ``combine_vec(..., bow=False)``. Predictions for every
    test tweet are written to ``test_nn.csv``.
    """
    # Read training data
    (train_tweet_id2text, train_tweet_id2issue,
     train_tweet_id2author_label, train_tweet_id2label) = ReadFile('train.csv')

    word2index = get_vocab(train_tweet_id2text)
    data_dict = combine_vec(word2index, train_tweet_id2text,
                            train_tweet_id2author_label,
                            train_tweet_id2issue, bow=False)

    # Features and labels are aligned by iterating the same key order.
    tweet_ids = list(data_dict)
    data = np.array([data_dict[tid] for tid in tweet_ids])
    labels = np.array([int(train_tweet_id2label[tid]) for tid in tweet_ids])

    n_class = len(set(labels))
    n_sample, n_feature = np.shape(data)
    print(np.shape(data))

    model = NeuralNet(n_feature, n_class, lrate=0.9, verbose=True)
    model.fit(data, labels, max_iter=800)

    # Read test data
    (test_tweet_id2text, test_tweet_id2issue,
     test_tweet_id2author_label, test_tweet_id2label) = ReadFile('test.csv')

    # The test set reuses the training vocabulary so features line up.
    test_data_dict = combine_vec(word2index, test_tweet_id2text,
                                 test_tweet_id2author_label,
                                 test_tweet_id2issue, bow=False)

    # Predict a label for every test tweet and store it in the dictionary.
    for tweet_id in test_tweet_id2text:
        features = test_data_dict[tweet_id]
        test_tweet_id2label[tweet_id] = model.predict(features)

    # Save predicted labels in 'test_nn.csv'
    SaveFile(test_tweet_id2text, test_tweet_id2issue,
             test_tweet_id2author_label, test_tweet_id2label, 'test_nn.csv')
def regular_choice(update, context):
    """Show up to eight vocabulary entries as an inline keyboard.

    The callback data of the pressed button selects an entry of
    ``screen_texts``; the suffix after its last underscore names the
    vocabulary to show. The first eight entries of that vocabulary are laid
    out two per row, with callback data ``"150"`` .. ``"157"``.

    Unlike a fixed 4x2 layout, a vocabulary with fewer than eight entries
    yields fewer buttons instead of raising ``IndexError``.

    Returns:
        ``d['typing_reply']``, the next conversation state.
    """
    query = update.callback_query
    ind = int(query.data) % 100
    vocab_name = screen_texts[ind].split('_')[-1]
    data = get_vocab(vocab_name)[:8]

    # Two buttons per row; button i carries callback data 150 + i.
    buttons = [
        [
            InlineKeyboardButton(text=data[i], callback_data=str(150 + i))
            for i in range(row_start, min(row_start + 2, len(data)))
        ]
        for row_start in range(0, len(data), 2)
    ]
    keyboard = InlineKeyboardMarkup(buttons)

    text = 'Here are the most popular {} people look for'.format(vocab_name)
    query.answer()
    query.edit_message_text(text=text, reply_markup=keyboard)
    return d['typing_reply']
def textrank_keywords(processed_sentences, window_size, top_num,
                      damping=0.85, min_diff=1e-5, steps=10):
    """
    Inspired by pagerank, textrank considers each word as a node, weights
    each edge from word window pairs, and ranks words by their score.
    :param processed_sentences: processed sentences, at least remove stopwords.
    :param window_size: the number of words following a word.
    :param top_num: the number of top words; values <= 0 yield an empty list.
    :param damping: damping coefficient, usually .85.
    :param min_diff: convergence threshold on the change of the score sum.
    :param steps: maximum number of update iterations.
    :return: a list of at most top_num (word, score) tuples, best first.
    """
    vocab = get_vocab(processed_sentences)
    token_pairs = get_token_pairs(window_size, processed_sentences)

    # Get normalized matrix
    g = get_matrix(vocab, token_pairs)

    # Initialization of the weights (pagerank values)
    pr = np.array([1] * len(vocab))

    # Iterate until the summed score stops moving or steps run out.
    previous_total = 0
    for _ in range(steps):
        pr = (1 - damping) + damping * np.dot(g, pr)
        total = sum(pr)
        if abs(previous_total - total) < min_diff:
            break
        previous_total = total

    # Rank words by score; the sort is stable, so ties keep vocab order.
    ranked = sorted(
        ((word, pr[index]) for word, index in vocab.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Clamp so that a non-positive top_num returns nothing rather than one
    # keyword (old loop) or a negative slice.
    return ranked[:max(top_num, 0)]
def train(args):
    """Train the bidirectional tweet language model and save the final weights.

    Data and vocabulary are read from ``../tweet_lm_data``; the trained Keras
    model is written to ``bidi_lm_final.h5``.
    """
    base_path = '../tweet_lm_data'
    train_file, valid_file, vocab_file = [
        os.path.join(base_path, name)
        for name in ('train.txt', 'valid.txt', 'tweet_vocab_thresh3.txt')
    ]

    # Grab vocabulary
    assert os.path.exists(vocab_file)
    vocab = utils.get_vocab(vocab_file)
    print('vocab length:', len(vocab))
    print('highest word id:', max(vocab.values()))
    print('lowest word id:', min(vocab.values()))

    # The model is created without a name.
    model_name = ''
    nn_object = BidirectionalLM(args,
                                vocab=vocab,
                                train_file=train_file,
                                valid_file=valid_file,
                                model_name=model_name)
    print('model instantiated')

    nn_object.train()
    nn_object.model.save('bidi_lm_final.h5')
    print('Model Saved')
def predict(epoch_idx, logger=None):
    """Load model in `models` and predict.

    Restores the checkpoint of epoch ``epoch_idx`` from ``MODEL_DIR``, runs
    it over ``PREDICT_FILE`` and writes one class name per line to
    ``predict.txt`` in ``RESULT_DIR``.

    Args:
        epoch_idx: Epoch number used to build the checkpoint file name.
        logger: Optional logger for the timing message. When ``None`` the
            message is printed instead (previously this case raised
            ``AttributeError``).
    """
    device = torch.device(
        "cuda" if torch.cuda.is_available() and USE_CUDA else "cpu")

    checkpoint_path = os.path.join(MODEL_DIR,
                                   "model_epoch_{}.ckpt".format(epoch_idx))
    # Load on CPU first, then move: works whatever device saved the file.
    model = torch.load(checkpoint_path, map_location="cpu")
    model.to(device)
    if torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)
    model.eval()

    vocab2idx = get_vocab(result_dir=RESULT_DIR, min_count=MIN_COUNT)
    X, _ = load_data(PREDICT_FILE,
                     max_len=MAX_LEN,
                     vocab2idx=vocab2idx,
                     do_lower_case=DO_LOWER_CASE,
                     text_col_name=TEXT_COL_NAME)
    X = torch.from_numpy(X)  # (N, L)
    loader = DataLoader(TensorDataset(X), batch_size=BATCH_SIZE)

    y_pred = []
    print("Start predicting...")
    tic = time.time()
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        for (batch_xs, ) in loader:
            batch_xs = batch_xs.to(device)  # (N, L)
            batch_out = model(batch_xs)  # (N, num_classes)
            batch_pred = batch_out.argmax(dim=-1)  # (N, )
            y_pred.extend(batch_pred.cpu().numpy())
    toc = time.time()

    message = "predict use time {}s".format(toc - tic)
    if logger is not None:
        logger.info(message)
    else:
        print(message)

    with open(os.path.join(RESULT_DIR, "predict.txt"), "w",
              encoding="utf-8") as fw:
        for i in y_pred:
            fw.write(str(CLASS_NAMES[i]) + "\n")
def __init__(self, obs_space, action_space, model_dir, model_name='AC',
             device=None, argmax=False, num_envs=1, use_memory=False,
             use_text=False, input_type="image", feature_learn="curiosity"):
    """Build an evaluation-mode agent around a saved AC or SR model.

    Args:
        model_name: ``'ac'`` selects ``ACModel``, ``'sr'`` selects
            ``SRModel``. Matching is case-insensitive, so the default
            ``'AC'`` works (it used to match neither branch and left
            ``self.acmodel`` undefined).

    Raises:
        ValueError: If ``model_name`` is neither ``'ac'`` nor ``'sr'``.
    """
    obs_space, self.preprocess_obss = utils.get_obss_preprocessor(obs_space)

    model_kind = model_name.lower()
    if model_kind == 'ac':
        self.acmodel = ACModel(obs_space, action_space,
                               use_memory=use_memory, use_text=use_text)
    elif model_kind == 'sr':
        self.acmodel = SRModel(obs_space, action_space,
                               input_type=input_type,
                               use_memory=use_memory, use_text=use_text,
                               feature_learn=feature_learn)
    else:
        # Fail here with a clear message rather than later with an
        # AttributeError on the missing self.acmodel.
        raise ValueError(
            "unknown model_name {!r}; expected 'ac' or 'sr'".format(model_name))

    # Stored exactly as passed so existing readers see the same value.
    self.model_name = model_name
    self.device = device
    self.argmax = argmax
    self.num_envs = num_envs

    # Recurrent models get one zero-initialised memory row per environment.
    if self.acmodel.recurrent:
        self.memories = torch.zeros(self.num_envs, self.acmodel.memory_size)

    self.acmodel.load_state_dict(utils.get_model_state(model_dir))
    self.acmodel.to(self.device)
    self.acmodel.eval()

    if hasattr(self.preprocess_obss, "vocab"):
        self.preprocess_obss.vocab.load_vocab(utils.get_vocab(model_dir))
def creat_train_table(seed=531):
    """Build the training metadata tables with stratified fold assignments.

    Every row of the annotation CSV is processed in parallel with
    ``process_train``; the resulting frame is sorted by ``ID`` and each row
    gets a ``valid`` fold number from a 5-fold split stratified on
    ``rarity``. ``meta-train.csv`` keeps all columns; ``train.csv`` drops
    ``height``, ``width``, ``aspect`` and ``rarity``.

    Args:
        seed: Random state for the stratified shuffle.
    """
    # Load the provided annotation CSV.
    train_df = pd.read_csv('../input/dataset/train/annotations.csv')
    vocab = utils.get_vocab()
    with open('../input/vocab/rarity.json', 'r') as f:
        rarity = json.load(f)

    train_list = Parallel(n_jobs=-1)([
        delayed(process_train)(row, vocab, rarity)
        for index, row in tqdm(train_df.iterrows(), total=len(train_df))
    ])

    # sort_values keeps the old index labels, but skf.split yields
    # *positions* in the sorted frame. Resetting the index makes label and
    # position coincide so the .loc assignment hits the intended rows and
    # the stratification holds.
    meta = pd.DataFrame(train_list).sort_values('ID').reset_index(drop=True)

    skf = StratifiedKFold(n_splits=5, random_state=seed, shuffle=True)
    for k, (_, val_index) in enumerate(skf.split(meta.index, meta.rarity)):
        meta.loc[val_index, 'valid'] = k

    meta = meta.set_index('ID')
    meta.to_csv('../input/tables/meta-train.csv')

    drop_columns = ['height', 'width', 'aspect', 'rarity']
    meta.drop(drop_columns, axis=1).to_csv('../input/tables/train.csv')
def LR():
    """Fit logistic regression at several learning rates and plot the losses.

    Bag-of-words features are built from ``train.csv``; one ``LogRegression``
    model is fitted per learning rate and the loss histories returned by
    ``fit`` are plotted to ``train_loss_lr.pdf``.
    """
    # Read training data
    (train_tweet_id2text, train_tweet_id2issue,
     train_tweet_id2author_label, train_tweet_id2label) = ReadFile('train.csv')

    word2index = get_vocab(train_tweet_id2text)
    data_dict = combine_vec(word2index, train_tweet_id2text,
                            train_tweet_id2author_label,
                            train_tweet_id2issue, bow=True)

    # Features and labels are aligned by iterating the same key order.
    data = [data_dict[k] for k in data_dict]
    labels = [int(train_tweet_id2label[k]) for k in data_dict]

    n_class = len(set(labels))
    print(np.shape(data))
    n_sample, n_feature = np.shape(data)

    lrates = [0.2, 0.5, 0.8]
    all_loss = []
    for r in lrates:
        model = LogRegression(n_feature, n_class, lrate=r, verbose=True)
        train_loss = model.fit(data, labels, max_iter=200)
        print(len(train_loss))
        all_loss.append(train_loss)

    file_name = 'train_loss_lr.pdf'
    plot_lr(lrates, all_loss, file_name)
def creat_character_table(seed=531):
    """Build the per-character metadata tables with stratified folds.

    Every image matching ``train_kana/U+*/*.jpg`` is processed in parallel
    with ``process_char``; the resulting frame is sorted by ``target`` and
    each row gets a ``valid`` fold number from a 5-fold split stratified on
    ``rarity``. ``meta-character.csv`` keeps all columns; ``character.csv``
    drops ``height``, ``width``, ``aspect`` and ``rarity``.

    Args:
        seed: Random state for the stratified shuffle.
    """
    vocab = utils.get_vocab()
    with open('../input/vocab/rarity.json', 'r') as f:
        rarity = json.load(f)

    image_paths = glob.glob(
        os.path.join('../input/dataset/train_kana/U+*/*.jpg'))
    char_list = Parallel(n_jobs=-1)([
        delayed(process_char)(path, vocab, rarity)
        for path in tqdm(image_paths, total=len(image_paths))
    ])

    # sort_values keeps the old index labels, but skf.split yields
    # *positions* in the sorted frame. Resetting the index makes label and
    # position coincide so the .loc assignment hits the intended rows and
    # the stratification holds.
    meta = pd.DataFrame(char_list).sort_values('target').reset_index(drop=True)

    # NOTE: changing the split strategy might also be worthwhile.
    skf = StratifiedKFold(n_splits=5, random_state=seed, shuffle=True)
    for k, (_, val_index) in enumerate(skf.split(meta.index, meta.rarity)):
        meta.loc[val_index, 'valid'] = k

    meta = meta.set_index('file')
    meta.to_csv('../input/tables/meta-character.csv')

    drop_columns = ['height', 'width', 'aspect', 'rarity']
    meta.drop(drop_columns, axis=1).to_csv('../input/tables/character.csv')
subsampled_words=subsampled_words, model_path="../resources/models", model_ID="basic_w2v_E%d_LR%.3f_W%d" % (e_size, lr, w_size), epochs=30, batch_size=64, embedding_size=e_size, lr=lr, window_size=w_size, neg_samples=16, csv_export=False) if __name__ == "__main__": word_to_ix, ix_to_word, subsampled_words = u.get_vocab( vocab_path="../resources/vocab.txt", antivocab_path="../resources/antivocab.txt") syn_to_ix = u.get_synset_vocab(word_to_ix) #grid_search() tf.reset_default_graph() #train_basic_w2v(dataset="../resources/eurosense_sentences.txt", # word_to_ix=word_to_ix, # subsampled_words=subsampled_words, # model_path="../resources/models", # model_ID="basic_w2v", # epochs=30, # batch_size=64, # embedding_size=64,
def test_get_vocab(X, n_words, expected):
    """Check that ``utils.get_vocab(X, n_words=n_words)`` equals ``expected``."""
    assert utils.get_vocab(X, n_words=n_words) == expected
neg.append(dd) elif dd['class'] == 1: pos.append(dd) else: raise Exception('no class!') shuffle(pos) shuffle(neg) num_pos_to_sample = int(NUMTRAIN * prop_value) num_neg_to_sample = NUMTRAIN - num_pos_to_sample selected_docs = pos[0:num_pos_to_sample] + neg[0:num_neg_to_sample] assert len(selected_docs) == NUMTRAIN #(2) get the vocab for that training proportions traindicts = [] trainY = [] class2wordcount = defaultdict(list) for dd in selected_docs: traindicts.append(dd['counts'].copy()) cc = dd['class'] trainY.append(cc) class2wordcount[cc].append(sum(dd['counts'].values())) trainY = np.array(trainY) trainX, word2num = utils.get_vocab(traindicts) #save these scipy.sparse.save_npz(PATH + 'trainX', trainX) np.save(PATH + 'trainY', trainY) w1 = open(PATH + 'word2num.json', 'w') json.dump(word2num, w1)
coherence /= len(top_n) print('| NPMI score: {:f}'.format(coherence)) def get_top_words(beta, vocab_bow, top_k): topic_words = [] for k, beta_k in enumerate(beta): words = [vocab_bow[idx] for idx in np.argsort(beta_k)[:-top_k-1:-1]] topic_words.append(words) print('Topic {}: {}'.format(k+1, ' '.join(words))) return topic_words if __name__ == '__main__': vocab = utils.get_vocab('./data/StackOverflow/StackOverflow.vocab') # vocab = utils.get_vocab('./data/Snippets/Snippets.vocab') # with codecs.open('./StackOverflow/nvctm_train_theta', 'rb') as fp: with codecs.open('./Snippets/cr_nvctm_train_theta', 'rb') as fp: theta = pickle.load(fp) fp.close() # with codecs.open('./StackOverflow/nvctm_train_beta', 'rb') as fp: with codecs.open('./Snippets/cr_nvctm_train_beta', 'rb') as fp: beta = pickle.load(fp) fp.close() tw = get_top_words(beta, vocab, 15) test_mat = data_set('./data/StackOverflow/train.feat', 22956)
# # The first delicate issue we need to address is the vocabulary for our model: # # * As indicated in the figure above, the first thing we do when processing an example is look up the words in an embedding (a VSM), which has to have a fixed dimensionality. # # * We can use our training data to specify the vocabulary for this embedding; at prediction time, though, we will inevitably encounter words we haven't seen before. # # * The convention we adopt here is to map them to an `$UNK` token that is in our pre-specified vocabulary. # # * At the same time, we might want to collapse infrequent tokens into `$UNK` to make optimization easier. # # In `utils`, the function `get_vocab` implements these strategies. Now we can extract the training vocab and use it for the model embedding, secure in the knowledge that we will be able to process tokens outside of this set (by mapping them to `$UNK`). # In[20]: sst_full_train_vocab = utils.get_vocab(X_rnn_train) # In[21]: print("sst_full_train_vocab has {:,} items".format(len(sst_full_train_vocab))) # This frankly seems too big relative to our dataset size. Let's restrict to just 10000 words: # In[22]: sst_train_vocab = utils.get_vocab(X_rnn_train, n_words=10000) # ### Pure NumPy RNN implementation # # The first implementation we'll look at is a pure NumPy implementation of exactly the model depicted above. This implementation is a bit slow and might not be all that effective, but it is useful to have available in case one really wants to inspect the details of how these models process examples.
def Predict(self, treebanks, datasplit, options):
    """Parse every sentence of ``treebanks``/``datasplit`` and yield the results.

    This is a generator (Python 2 code: print statements, ``viewkeys``).
    Before parsing, external embeddings are looked up for test-time words
    and characters that are absent from the feature extractor's
    vocabularies. Each sentence is parsed on a deep copy; only
    ``pred_parent_id`` and ``pred_relation`` are copied back onto the
    ``ConllEntry`` tokens of the original sentence, which is then yielded.

    :param treebanks: passed to ``utils.get_vocab`` and ``utils.read_conll_dir``
    :param datasplit: passed to the same two helpers
    :param options: option object; the fields read here are
        ``char_map_file``, ``word_emb_size``, ``ext_word_emb_file``,
        ``char_emb_size`` and ``ext_char_emb_file``
    """
    char_map = {}
    if options.char_map_file:
        # NOTE(review): this file handle is never closed explicitly.
        char_map_fh = codecs.open(options.char_map_file,encoding='utf-8')
        char_map = json.loads(char_map_fh.read())
    # should probably use a namedtuple in get_vocab to make this prettier
    _, test_words, test_chars, _, _, _, test_treebanks, test_langs = utils.get_vocab(treebanks,datasplit,char_map)

    # get external embeddings for the set of words and chars in the
    # test vocab but not in the training vocab
    test_embeddings = defaultdict(lambda: {})
    if options.word_emb_size > 0 and options.ext_word_emb_file:
        new_test_words = \
            set(test_words) - self.feature_extractor.words.viewkeys()

        print "Number of OOV word types at test time: %i (out of %i)" % (
            len(new_test_words), len(test_words))

        if len(new_test_words) > 0:
            # no point loading embeddings if there are no words to look for
            for lang in test_langs:
                embeddings = utils.get_external_embeddings(
                    options,
                    emb_file=options.ext_word_emb_file,
                    lang=lang,
                    words=new_test_words
                )
                test_embeddings["words"].update(embeddings)
            if len(test_langs) > 1 and test_embeddings["words"]:
                print "External embeddings found for %i words "\
                        "(out of %i)" % \
                        (len(test_embeddings["words"]), len(new_test_words))

    # Same lookup for characters; note this branch does not check that
    # options.ext_char_emb_file is set before passing it on.
    if options.char_emb_size > 0:
        new_test_chars = \
            set(test_chars) - self.feature_extractor.chars.viewkeys()
        print "Number of OOV char types at test time: %i (out of %i)" % (
            len(new_test_chars), len(test_chars))

        if len(new_test_chars) > 0:
            for lang in test_langs:
                embeddings = utils.get_external_embeddings(
                    options,
                    emb_file=options.ext_char_emb_file,
                    lang=lang,
                    words=new_test_chars,
                    chars=True
                )
                test_embeddings["chars"].update(embeddings)
            if len(test_langs) > 1 and test_embeddings["chars"]:
                print "External embeddings found for %i chars "\
                        "(out of %i)" % \
                        (len(test_embeddings["chars"]), len(new_test_chars))

    data = utils.read_conll_dir(treebanks,datasplit,char_map=char_map)
    for iSentence, osentence in enumerate(data,1):
        # Work on a copy so the yielded sentence carries predictions only.
        sentence = deepcopy(osentence)
        self.feature_extractor.Init(options)
        # Keep only ConllEntry items of the sentence.
        conll_sentence = [entry for entry in sentence if isinstance(entry, utils.ConllEntry)]
        self.feature_extractor.getWordEmbeddings(conll_sentence, False, options, test_embeddings)

        scores, exprs = self.__evaluate(conll_sentence, True)
        if self.proj:
            heads = decoder.parse_proj(scores)
            #LATTICE solution to multiple roots
            # see https://github.com/jujbob/multilingual-bist-parser/blob/master/bist-parser/bmstparser/src/mstlstm.py
            ## ADD for handling multi-roots problem
            rootHead = [head for head in heads if head==0]
            if len(rootHead) != 1:
                print "it has multi-root, changing it for heading first root for other roots"
                # Positions whose head is 0; every one after the first is
                # re-attached to the first such position.
                rootHead = [seq for seq, head in enumerate(heads) if head == 0]
                for seq in rootHead[1:]:heads[seq] = rootHead[0]
            ## finish to multi-roots

        else:
            heads = chuliu_edmonds_one_root(scores.T)

        # Record the predicted heads; relations default to '_'.
        for entry, head in zip(conll_sentence, heads):
            entry.pred_parent_id = head
            entry.pred_relation = '_'

        if self.labelsFlag:
            # heads[0] is skipped, so modifier+1 is the token position.
            for modifier, head in enumerate(heads[1:]):
                scores, exprs = self.__evaluateLabel(conll_sentence, head, modifier+1)
                # Pick the relation with the highest score.
                conll_sentence[modifier+1].pred_relation = self.feature_extractor.irels[max(enumerate(scores), key=itemgetter(1))[0]]

        dy.renew_cg()

        #keep in memory the information we need, not all the vectors
        oconll_sentence = [entry for entry in osentence if isinstance(entry, utils.ConllEntry)]
        for tok_o, tok in zip(oconll_sentence, conll_sentence):
            tok_o.pred_relation = tok.pred_relation
            tok_o.pred_parent_id = tok.pred_parent_id
        yield osentence