def load_errors():
    len_history = 100
    # np.float was removed from recent numpy releases; use the builtin float
    p_error = np.full((len(seed_list), len(train_mode_list),
                       len(num_particles_list), len_history),
                      np.nan, dtype=float)
    q_error = np.full((len(seed_list), len(train_mode_list),
                       len(num_particles_list), len_history),
                      np.nan, dtype=float)
    grad_std = np.full((len(seed_list), len(train_mode_list),
                        len(num_particles_list), len_history),
                       np.nan, dtype=float)
    for seed_idx, seed in enumerate(seed_list):
        for train_mode_idx, train_mode in enumerate(train_mode_list):
            for num_particles_idx, num_particles in enumerate(
                    num_particles_list):
                print('{} {} {}'.format(seed, train_mode, num_particles))
                model_folder = util.get_most_recent_model_folder_args_match(
                    seed=seed, train_mode=train_mode,
                    num_particles=num_particles, init_near=init_near)
                if model_folder is not None:
                    stats = util.load_object(util.get_stats_path(model_folder))
                    p_error[seed_idx, train_mode_idx, num_particles_idx,
                            :len(stats.p_error_history)] = stats.p_error_history
                    q_error[seed_idx, train_mode_idx, num_particles_idx,
                            :len(stats.q_error_history)] = stats.q_error_history
                    grad_std[seed_idx, train_mode_idx, num_particles_idx,
                             :len(stats.grad_std_history)] = stats.grad_std_history
    return p_error, q_error, grad_std
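# NOTE (added): every snippet in this file leans on a small `util` module for
# object persistence, but the helpers themselves are never shown. The sketch
# below is an assumption based on the call sites: a plain pickle round-trip.
# Some projects call load_object with extra arguments (a format string, a
# default factory), so treat this as a minimal illustration, not the real API.
import os
import pickle


def save_object(obj, path):
    """Pickle `obj` to `path`, creating the parent directory if needed."""
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(obj, f)


def load_object(path):
    """Load a previously pickled object from `path`."""
    with open(path, 'rb') as f:
        return pickle.load(f)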
def train_sklearn_boosting(training_data_dump):
    training_data = util.load_object(training_data_dump)
    model = AdaBoostRegressor(DecisionTreeRegressor(max_depth=1),
                              n_estimators=20)
    # model = AdaBoostRegressor(SVR(kernel='linear'), n_estimators=20)
    # model = RandomForestRegressor(n_estimators=50)
    model = model.fit(training_data[:, :-1], training_data[:, -1])
    return model
def predict_random_forest(model, test_data_dump):
    test_data = util.load_object(test_data_dump)
    predictions = []
    targets = []
    for sample in test_data:
        targets.append(sample[-1])
        predictions.append(model.predict(sample[:-1]))
    return get_average_kappa(targets, predictions)
def predict_sklearn_random_forest(model, test_data_dump):
    test_data = util.load_object(test_data_dump)
    targets = test_data[:, -1]
    predictions = model.predict(test_data[:, :-1])
    return get_average_kappa(targets, predictions)
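# NOTE (added): hedged usage sketch for the regressor helpers above. The dump
# paths are hypothetical, and the assumed array layout (feature columns
# followed by the target score in the last column) is inferred from the
# slicing `[:, :-1]` / `[:, -1]` used in the functions above.
model = train_sklearn_boosting('data/train_features.pkl')      # assumed path
kappa = predict_sklearn_random_forest(model, 'data/test_features.pkl')
print('average kappa:', kappa)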
def load_formatter_fn(formatter):
    '''
    >>> load_formatter_fn('logagg.formatters.basescript') #doctest: +ELLIPSIS
    <function basescript at 0x...>
    '''
    obj = util.load_object(formatter)
    if not hasattr(obj, 'ispartial'):
        obj.ispartial = util.ispartial
    return obj
def msg_store(self):
    targets = []
    for t in self.args.target:
        imp_path, args = self._parse_msg_target_arg(t)
        target_class = util.load_object(imp_path)
        target_obj = target_class(**args)
        targets.append(target_obj)
    return targets
def load_results_from_disk(self):
    results_list = []
    for i in range(len(self.classifier_list)):
        results = util.load_object(
            self.CLASSIFIERS_AND_RESULTS_DIR_PATH
            + util.convert_name_to_filename(self.classifier_name_list[i])
            + '_' + self.classifier_iter + '_results.pkl')
        results_list.append(results)
    self.results = results_list
    return results_list
def plot_variance_analysis():
    num_particles_list = [2, 5, 10, 20, 50, 100]
    [vimco_grad, vimco_one_grad, reinforce_grad, reinforce_one_grad, two_grad,
     log_evidence_stats, log_evidence_grad, wake_phi_loss_grad, log_Q_grad,
     sleep_loss_grad] = util.load_object('./variance_analysis/data.pkl')

    fig, axss = plt.subplots(2, 10, figsize=(20, 4), dpi=100,
                             sharex=True, sharey='row')

    for i, stats in enumerate([
            vimco_grad, vimco_one_grad, reinforce_grad, reinforce_one_grad,
            two_grad, log_evidence_grad, wake_phi_loss_grad, log_Q_grad,
            sleep_loss_grad, log_evidence_stats]):
        for j in range(2):
            axss[j, i].plot(stats[:, j], color='black')

    axss[0, 0].set_ylabel('mean')
    axss[1, 0].set_ylabel('std')
    for ax in axss[0]:
        ax.set_yticks([ax.get_yticks()[0], ax.get_yticks()[-1]])
    for ax in axss[1]:
        ax.set_yscale('log')
        # ax.set_yticks([0, ax.get_yticks()[-1]])
        # ax.set_yticks([ax.get_yticks()[0], ax.get_yticks()[-1]])
        # ax.set_yticks([1e-2, 1e4])
        ax.set_xlabel('K')
    for axs in axss:
        for ax in axs:
            ax.set_xticks(range(len(num_particles_list)))
            ax.set_xticklabels(num_particles_list)
            sns.despine(ax=ax, trim=True)
    for ax, title in zip(axss[0], [
            r'$g_{VIMCO}$', r'$g_{VIMCO}^1$', r'$g_{REINFORCE}$',
            r'$g_{REINFORCE}^1$', r'$g^2$', r'$\nabla_{\theta} \log Z_K$',
            r'$\nabla_{\phi}$ wake-$\phi$ loss', r'$\nabla_{\phi} \log Q$',
            r'$\nabla_{\phi}$ sleep loss', r'$\log \hat Z_K$']):
        ax.set_title(title)

    fig.tight_layout()
    if not os.path.exists('./plots/'):
        os.makedirs('./plots/')
    filename = './plots/variance_analysis.pdf'
    fig.savefig(filename, bbox_inches='tight')
    print('saved to {}'.format(filename))
def main():
    # step 1: train the classification model
    train_data = pd.read_csv(train_data_path, sep='\001', header=None)
    train_data.columns = ['id', 'title', 'doc', 'key_words']
    train_candidates = util.load_object(train_candidate_path)
    features, labels = train_model.build_train_sample(train_data,
                                                      train_candidates)
    print(np.sum(labels))
    print(features.shape)
    dt = train_model.train_class_model(features, labels)

    # test
    test_data = pd.read_csv(test_data_path, sep='\001', header=None)
    stop_words = util.stopwordslist(stop_words_path)
    test_data.columns = ['id', 'title', 'doc']
    ids = test_data['id'].tolist()
    titles = test_data['title'].tolist()
    docs = test_data['doc'].tolist()
    test_candidates = util.load_object(test_candidate_path)
    sample_label_probs = train_model.get_test_sample_prob(
        dt, test_data, test_candidates)
    # util.save_object(sample_label_probs, './data/sample_labels_probs_add_title.pickle')
    # sample_label_probs = util.load_object('./data/sample_labels_probs_add_title.pickle')
    # sample_label_probs = util.load_object('./data/sample_title_doc_labels_probs1.pickle')

    with open('last_summit2.csv', 'w') as file:
        file.write('id,label1,label2\n')
        for (id, title, doc, words_prob) in zip(ids, titles, docs,
                                                sample_label_probs):
            if id == 'D087215':
                print('test......')
            if id == 'D087268':
                print('test......')
            title = str(title).strip()
            last_labels = extract_title_doc(id, title, stop_words, words_prob)
            labels_str = ",".join(last_labels)
            if len(last_labels) <= 1:
                labels_str += ','
            file.write(id + "," + labels_str)
            file.write("\n")
def __init__(self, disease_name, fhir_resource, group_name, description,
             pickle_path):
    self.disease = disease_name
    self.description = description
    self.name = group_name
    self.resource_name = fhir_resource
    self.set_resource(fhir_resource)
    self.lines = {}
    self.pickle_path = '{}{}-{}.p'.format(pickle_path, disease_name,
                                          fhir_resource)
    self.criteria = load_object(self.pickle_path, list)
    self.mappings = {}
def get_features(self):
    print('Get features:', self.feature_extraction_method)
    if self.feature_extraction_method == FeatureExtractionMethod.BOW:
        return get_simple_bag_of_words_features(self.train_corpus,
                                                self.test_corpus)
    elif self.feature_extraction_method == FeatureExtractionMethod.TF_IDF:
        return get_tf_idf_features(self.train_corpus, self.test_corpus)
    elif self.feature_extraction_method == FeatureExtractionMethod.WORD2VEC:
        if self.should_load_embedding_model:
            print('Loading embedding model from disk')
            self.embedding_model = util.load_object(
                self.WORD2VEC_MODEL_SAVE_PATH)
        else:
            print('Calculating embeddings')
            self.embedding_model = get_word2vec_trained_model(
                self.tokenized_test, self.NUM_OF_VEC_FEATURES)
            util.save_object(
                self.embedding_model,
                self.CLASSIFIERS_AND_RESULTS_DIR_PATH + 'w2v_model_'
                + str(self.classifier_iter) + '.pkl')
        return self.get_document_embeddings_from_word2vec()
    elif self.feature_extraction_method == FeatureExtractionMethod.FASTTEXT:
        if self.should_load_embedding_model:
            print('Loading embedding model from disk')
            self.embedding_model = fasttext.load_model(
                self.FAST_TEXT_SAVE_PATH)
        else:
            print('Calculating embeddings')
            if not os.path.exists(self.TRAIN_DATA_FOR_FASTTEXT_PATH):
                self.reformat_and_save_data_for_fasttext()
            self.embedding_model = train_fasttext_model(
                self.TRAIN_DATA_FOR_FASTTEXT_PATH,
                self.NUM_OF_VEC_FEATURES, epoch=100)
            self.embedding_model.save_model(self.FAST_TEXT_SAVE_PATH)
        return self.get_document_embeddings_from_fasttext()
    else:
        print('No such feature extraction method:',
              self.feature_extraction_method)
def train(conf):
    train_dir = conf.get("TRAIN", "train_dir")
    model_path = conf.get("NORMAL", "model_path")
    report_dir = conf.get("TRAIN", "report_dir")
    N = conf.getint("TRAIN", "valdata_num")
    feat = load_object(conf.get("NORMAL", 'feat'))
    # crf = sklearn_crfsuite.CRF(
    crf = CRF(
        algorithm='lbfgs',
        c1=0.1,
        c2=0.1,
        max_iterations=100,
        all_possible_transitions=True,
    )
    X, y = zip(*load_corpus(train_dir))
    X_train = feat(X[:-N])
    y_train = y[:-N]
    X_validate = feat(X[-N:])
    y_validate = y[-N:]
    crf.fit(X_train, y_train)
    numpy_pickle.dump(crf, model_path)

    # performance evaluation on the held-out validation split
    y_pred = crf.predict(X_validate)
    labels = list(crf.classes_)
    labels.remove("O")
    sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))
def train_random_forest(training_data_dump):
    training_data = util.load_object(training_data_dump)
    model = RandomForest(training_data, 10, stats.median, 30)
    return model
tagger = BioTagger()


def read_data(data_dir):
    for fname in os.listdir(data_dir):
        test_path = os.path.join(data_dir, fname)
        with open(test_path) as f:
            text = f.read()
        yield fname, text


if __name__ == "__main__":
    config = get_config()
    feat = load_object(config.get("NORMAL", "feat"))
    test_dir = "data/test"
    result_dir = "data/submit"
    crf = numpy_pickle.load('data/models/crf.m')
    for fname, text in read_data(test_dir):
        print(fname)
        sents = [text]
        y = crf.predict(feat(sents))
        anns = tagger.seq_to_ind(y[0])
        anns = sorted(anns, key=lambda x: (x[1], x[2]))
        ann_fname = fname.replace(".txt", ".ann")
        save_path = os.path.join(result_dir, ann_fname)
        with open(save_path, 'w') as f:
    train_nlp = [tn.spacy_english_model(item) for item in train_corpus]
    util.save_object(
        train_nlp, CLASSIFIERS_AND_RESULTS_DIR_PATH + 'train_nlp_'
        + str(CLASSIFIER_ITERATION) + '.pkl')
    train_glove_features = np.array([item.vector for item in train_nlp])
    print('Test features')
    test_nlp = [tn.spacy_english_model(item) for item in test_corpus]
    # save the test documents (the original saved train_nlp to the test path)
    util.save_object(
        test_nlp, CLASSIFIERS_AND_RESULTS_DIR_PATH + 'test_nlp_'
        + str(CLASSIFIER_ITERATION) + '.pkl')
    test_glove_features = np.array([item.vector for item in test_nlp])
    return train_glove_features, test_glove_features


# train_glove_features, test_glove_features = read_from_spacy_and_save()
train_glove_features = util.load_object(TRAIN_NLP_PATH)
test_glove_features = util.load_object(TEST_NLP_PATH)
print('GloVe model:> Train features shape:', train_glove_features.shape,
      ' Test features shape:', test_glove_features.shape)


###
def train_sgd():
    svm = SGDClassifier(loss='hinge', penalty='l2', random_state=42,
                        max_iter=500)
    svm.fit(train_glove_features, train_label_names)
    svm_glove_cv_scores = cross_val_score(svm, train_glove_features,
def train_svr(training_data_dump):
    training_data = util.load_object(training_data_dump)
    clf = svm.SVR(kernel='rbf')
    clf.fit(training_data[:, :-1], training_data[:, -1])
    return clf
def predict_svr(clf, test_data_dump):
    test_data = util.load_object(test_data_dump)
    targets = test_data[:, -1]
    predictions = clf.predict(test_data[:, :-1])
    return get_average_kappa(targets, predictions)
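# NOTE (added): `get_average_kappa` is referenced throughout but never defined
# in these snippets. In essay-scoring pipelines the usual metric is quadratic
# weighted kappa, so the sketch below is an assumed stand-in built on
# scikit-learn's cohen_kappa_score, not the project's own implementation.
import numpy as np
from sklearn.metrics import cohen_kappa_score


def get_average_kappa(targets, predictions):
    """Quadratic weighted kappa between integer-rounded predictions and
    targets (assumed behaviour; the original helper may differ)."""
    targets = np.rint(np.asarray(targets, dtype=float)).astype(int)
    predictions = np.rint(np.asarray(predictions, dtype=float)).astype(int)
    return cohen_kappa_score(targets, predictions, weights='quadratic')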
    test_size=0.33, random_state=42)

# tokenize corpus
tokenized_train = [tn.tokenizer.tokenize(text) for text in train_corpus]
tokenized_test = [tn.tokenizer.tokenize(text) for text in test_corpus]

# generate word2vec word embeddings
#
# build and save word2vec model
w2v_num_features = 1000
# w2v_model = gensim.models.Word2Vec(sentences=tokenized_train, size=w2v_num_features,
#                                    window=100, min_count=2, sample=1e-3, sg=1,
#                                    iter=5, workers=10)
# util.save_object(w2v_model, CLASSIFIERS_AND_RESULTS_DIR_PATH + 'w2v_model' + str(
#     CLASSIFIER_ITERATION) + '.pkl')
#
# Load word2vec model
w2v_model = util.load_object(WORD2VEC_MODEL_SAVE_PATH)

# generate document level embeddings
# remember we only use train dataset vocabulary embeddings
# so that test dataset truly remains an unseen dataset
# generate averaged word vector features from word2vec model
avg_wv_train_features = document_vectorize(corpus=tokenized_train,
                                           model=w2v_model,
                                           num_features=w2v_num_features)
avg_wv_test_features = document_vectorize(corpus=tokenized_test,
                                          model=w2v_model,
                                          num_features=w2v_num_features)
print('Word2Vec model:> Train features shape:', avg_wv_train_features.shape,
      ' Test features shape:', avg_wv_test_features.shape)

# # pack data in one class
def plot_model_movie():
    num_test_x = 5
    num_particles_list = [2, 5, 10, 20]
    seed = seed_list[0]
    model_folder = util.get_most_recent_model_folder_args_match(
        seed=seed_list[0], train_mode=train_mode_list[0],
        num_particles=num_particles_list[0], init_near=init_near)
    args = util.load_object(util.get_args_path(model_folder))
    _, _, true_generative_model = util.init_models(args)
    test_xs = np.linspace(0, 19, num=num_test_x) * 10

    nrows = len(num_particles_list)
    ncols = num_test_x + 1
    width = 5.5
    ax_width = width / ncols
    height = nrows * ax_width
    fig, axss = plt.subplots(nrows, ncols, sharex=True, sharey=True, dpi=300)
    fig.set_size_inches(width, height)

    for num_particles_idx, num_particles in enumerate(num_particles_list):
        axss[num_particles_idx, 0].set_ylabel(
            '$K = {}$'.format(num_particles), fontsize=SMALL_SIZE)

    handles = [mpatches.Rectangle((0, 0), 1, 1, color='black', label='True')]
    for color, label in zip(colors, labels):
        handles.append(
            mpatches.Rectangle((0, 0), 1, 1, color=color, label=label))
    axss[-1, ncols // 2].legend(bbox_to_anchor=(0, -0.05), loc='upper center',
                                ncol=len(handles), handles=handles)

    axss[0, 0].set_title(r'$p_\theta(z)$')
    for test_x_idx, test_x in enumerate(test_xs):
        axss[0, 1 + test_x_idx].set_title(
            r'$q_\phi(z | x = {0:.0f})$'.format(test_x))
    for ax in axss[-1]:
        ax.set_xlabel(r'$z$', labelpad=0.5)
    for axs in axss:
        for ax in axs:
            ax.set_xticks([])
            ax.set_xticklabels([])
            ax.set_yticks([])
            ax.set_yticklabels([])
            ax.set_ylim(0, 8)
            ax.set_xlim(0, 20)

    # title = fig.suptitle('Iteration 0')
    t = axss[0, ncols // 2].text(0, 1.23, 'Iteration 0',
                                 horizontalalignment='center',
                                 verticalalignment='center',
                                 transform=axss[0, ncols // 2].transAxes,
                                 fontsize=MEDIUM_SIZE)
    fig.tight_layout(pad=0, rect=[0.01, 0.04, 0.99, 0.96])

    def update(frame):
        result = []
        iteration_idx = frame
        iteration = iteration_idx * 1000
        t.set_text('Iteration {}'.format(iteration))
        result.append(t)
        for axs in axss:
            for ax in axs:
                result.append(
                    ax.add_artist(
                        mpatches.Rectangle((0, 0), 20, 8, color='white')))
        for num_particles_idx, num_particles in enumerate(num_particles_list):
            ax = axss[num_particles_idx, 0]

            # true generative model
            i = 0
            plot_hinton(ax,
                        true_generative_model.get_latent_params().data.numpy(),
                        8 - i, 8 - i - 1, 0, 20, color='black')

            # learned generative models
            for train_mode_idx, train_mode in enumerate(train_mode_list):
                label = labels[train_mode_idx]
                color = colors[train_mode_idx]
                model_folder = util.get_most_recent_model_folder_args_match(
                    seed=seed, train_mode=train_mode,
                    num_particles=num_particles, init_near=init_near)
                if model_folder is not None:
                    generative_model, _ = util.load_models(
                        model_folder, iteration=iteration)
                    if generative_model is not None:
                        plot_hinton(
                            ax,
                            generative_model.get_latent_params().data.numpy(),
                            8 - train_mode_idx - 1, 8 - train_mode_idx - 2,
                            0, 20, label=label, color=color)
            result += ax.artists

            # inference network
            for test_x_idx, test_x in enumerate(test_xs):
                ax = axss[num_particles_idx, test_x_idx + 1]
                test_x_tensor = torch.tensor(test_x, dtype=torch.float,
                                             device=args.device).unsqueeze(0)

                # true
                plot_hinton(ax,
                            true_generative_model.get_posterior_probs(
                                test_x_tensor)[0].data.numpy(),
                            8 - i, 8 - i - 1, 0, 20, color='black')

                # learned
                for train_mode_idx, train_mode in enumerate(train_mode_list):
                    label = labels[train_mode_idx]
                    color = colors[train_mode_idx]
                    model_folder = \
                        util.get_most_recent_model_folder_args_match(
                            seed=seed, train_mode=train_mode,
                            num_particles=num_particles, init_near=init_near)
                    if model_folder is not None:
                        _, inference_network = util.load_models(
                            model_folder, iteration=iteration)
                        if inference_network is not None:
                            plot_hinton(ax,
                                        inference_network.get_latent_params(
                                            test_x_tensor)[0].data.numpy(),
                                        8 - train_mode_idx - 1,
                                        8 - train_mode_idx - 2, 0, 20,
                                        label=label, color=color)
                result += ax.artists
        return result

    anim = FuncAnimation(fig, update, frames=np.arange(100), blit=True)
    if not os.path.exists('./plots/'):
        os.makedirs('./plots/')
    filename = './plots/model_movie.mp4'
    anim.save(filename, dpi=300)
    print('Saved to {}'.format(filename))
def plot_models():
    saving_iterations = np.arange(100) * 1000
    num_iterations_to_plot = 3
    iterations_to_plot = saving_iterations[np.floor(
        np.linspace(0, 99, num=num_iterations_to_plot)).astype(int)]
    num_test_x = 3
    num_particles_list = [2, 20]
    seed = seed_list[0]
    model_folder = util.get_most_recent_model_folder_args_match(
        seed=seed_list[0], train_mode=train_mode_list[0],
        num_particles=num_particles_list[0], init_near=init_near)
    args = util.load_object(util.get_args_path(model_folder))
    _, _, true_generative_model = util.init_models(args)
    test_xs = np.linspace(0, 19, num=num_test_x) * 10

    nrows = num_iterations_to_plot
    ncols = len(num_particles_list) * (num_test_x + 1)
    fig, axss = plt.subplots(nrows, ncols, sharex=True, sharey=True)
    width = 5.5
    ax_width = width / ncols
    height = nrows * ax_width
    fig.set_size_inches(width, height)

    for iteration_idx, iteration in enumerate(iterations_to_plot):
        axss[iteration_idx, 0].set_ylabel('Iter. {}'.format(iteration))
        for num_particles_idx, num_particles in enumerate(num_particles_list):
            ax = axss[iteration_idx, num_particles_idx * (num_test_x + 1)]
            ax.set_xticks([])
            ax.set_xticklabels([])
            ax.set_yticks([])
            ax.set_yticklabels([])
            ax.set_ylim(0, 8)
            ax.set_xlim(0, 20)
            if iteration_idx == 0:
                ax.set_title(r'$p_\theta(z)$')

            # true generative model
            i = 0
            plot_hinton(ax,
                        true_generative_model.get_latent_params().data.numpy(),
                        8 - i, 8 - i - 1, 0, 20, color='black')

            # learned generative models
            for train_mode_idx, train_mode in enumerate(train_mode_list):
                label = labels[train_mode_idx]
                color = colors[train_mode_idx]
                model_folder = util.get_most_recent_model_folder_args_match(
                    seed=seed, train_mode=train_mode,
                    num_particles=num_particles, init_near=init_near)
                if model_folder is not None:
                    generative_model, _ = util.load_models(
                        model_folder, iteration=iteration)
                    if generative_model is not None:
                        plot_hinton(
                            ax,
                            generative_model.get_latent_params().data.numpy(),
                            8 - train_mode_idx - 1, 8 - train_mode_idx - 2,
                            0, 20, label=label, color=color)

            # inference network
            for test_x_idx, test_x in enumerate(test_xs):
                ax = axss[iteration_idx,
                          num_particles_idx * (num_test_x + 1) + test_x_idx + 1]
                ax.set_xticks([])
                ax.set_xticklabels([])
                ax.set_yticks([])
                ax.set_yticklabels([])
                ax.set_ylim(0, 8)
                ax.set_xlim(0, 20)
                test_x_tensor = torch.tensor(test_x, dtype=torch.float,
                                             device=args.device).unsqueeze(0)
                if iteration_idx == 0:
                    ax.set_title(r'$q_\phi(z | x = {0:.0f})$'.format(test_x))

                # true
                plot_hinton(ax,
                            true_generative_model.get_posterior_probs(
                                test_x_tensor)[0].data.numpy(),
                            8 - i, 8 - i - 1, 0, 20, color='black')

                # learned
                for train_mode_idx, train_mode in enumerate(train_mode_list):
                    label = labels[train_mode_idx]
                    color = colors[train_mode_idx]
                    model_folder = \
                        util.get_most_recent_model_folder_args_match(
                            seed=seed, train_mode=train_mode,
                            num_particles=num_particles, init_near=init_near)
                    if model_folder is not None:
                        _, inference_network = util.load_models(
                            model_folder, iteration=iteration)
                        if inference_network is not None:
                            plot_hinton(ax,
                                        inference_network.get_latent_params(
                                            test_x_tensor)[0].data.numpy(),
                                        8 - train_mode_idx - 1,
                                        8 - train_mode_idx - 2, 0, 20,
                                        label=label, color=color)

    for num_particles_idx, num_particles in enumerate(num_particles_list):
        ax = axss[0, num_particles_idx * (num_test_x + 1)
                  + (num_test_x + 1) // 2]
        ax.text(0, 1.25, '$K = {}$'.format(num_particles), fontsize=SMALL_SIZE,
                verticalalignment='bottom', horizontalalignment='center',
                transform=ax.transAxes)

    handles = [mpatches.Rectangle((0, 0), 1, 1, color='black', label='True')]
    for color, label in zip(colors, labels):
        handles.append(
            mpatches.Rectangle((0, 0), 1, 1, color=color, label=label))
    axss[-1, ncols // 2].legend(bbox_to_anchor=(0, -0.1), loc='upper center',
                                ncol=len(handles), handles=handles)
    for ax in axss[-1]:
        ax.set_xlabel(r'$z$', labelpad=0.5)

    fig.tight_layout(pad=0)
    if not os.path.exists('./plots/'):
        os.makedirs('./plots/')
    filename = './plots/models.pdf'
    fig.savefig(filename, bbox_inches='tight')
    print('Saved to {}'.format(filename))
def load_fileds(self, path):
    self.CHAR, self.WORD = util.load_object(path, "pkl")
def train_sklearn_random_forest(training_data_dump):
    training_data = util.load_object(training_data_dump)
    model = RandomForestRegressor(n_estimators=10)
    model = model.fit(training_data[:, :-1], training_data[:, -1])
    return model
def bulid_candidate_words(data, stop_nfile, candidate_save_path,
                          candidata_pos={}, first_sentence_count=30,
                          last_sentence_count=20):
    # columns: ID, title, document text
    stop_words = util.stopwordslist(stop_nfile)
    # load corpus and models
    corpus_dict = util.load_object(corpora_dict_path)
    corpus = corpora.MmCorpus(corpus_path)
    tfidf_model = models.TfidfModel.load(tfidf_path)
    lda_model = models.LdaModel.load(lda_path)
    lsi_model = models.LsiModel.load(lsi_path)

    candidate_words = []
    for index, row in data.iterrows():
        title = str(row['title']).strip()
        doc = str(row['doc']).strip()
        # candidates for this row: key is the word, value is its feature
        # vector (the selected features)
        candidate_word = {}
        # document
        words_doc = list(pseg.cut(doc, HMM=True))  # [(word, flag)]
        # title
        words_title = list(pseg.cut(title, HMM=True))
        # remove stop words
        words_doc = [(word, pos) for word, pos in words_doc
                     if word not in stop_words]
        words_title = [(word, pos) for word, pos in words_title
                       if word not in stop_words]
        doc_len = len(words_doc)  # document length after stop-word removal
        title_len = len(words_title)

        for word_index, (word, pos) in enumerate(words_doc):
            if pos in candidata_pos and len(word) > 1:
                # the last three slots are: features[-3] document length,
                # features[-2] first occurrence position, features[-1] last
                # occurrence position
                if word in candidate_word:
                    word_features = candidate_word[word]
                    word_features[-1] = (word_index + 1)
                    candidate_word[word] = word_features
                    continue
                else:
                    features = [0] * 14
                    features[-3] = doc_len
                    # feature 1: part of speech
                    features[0] = candidata_pos[pos]
                    # feature 2: position of the first occurrence
                    if doc_len == 0:
                        firoc = 0.
                    else:
                        firoc = (word_index + 1) / float(doc_len)
                    features[1] = firoc
                    features[-2] = (word_index + 1)  # first occurrence
                    # feature 3: length of the candidate word
                    features[2] = len(word)
                    # feature 4: candidate consists only of digits or letters
                    if util.is_contain_char_num(word):
                        features[3] = 1
                    # feature 5: tf-idf of the candidate word
                    id = corpus_dict.token2id.get(
                        word, len(corpus_dict.token2id) + 1)
                    if id == len(corpus_dict.token2id) + 1:
                        features[4] = 1e-8
                    else:
                        for (w_id, tfidf) in tfidf_model[corpus[index]]:
                            if id == w_id:
                                features[4] = tfidf
                                break
                    # feature 6: occurrences in the first sentence
                    first_sentence = words_doc[:first_sentence_count]
                    features[5] = util.get_count_sentence(word, first_sentence)
                    # feature 7: occurrences in the last sentence [-20:]
                    last_sentence = words_doc[-last_sentence_count:]
                    features[6] = util.get_count_sentence(word, last_sentence)
                    # features 8, 9: LDA / LSI similarity between the
                    # candidate's topic distribution and the document's
                    single_list = [word]
                    word_corpus = tfidf_model[corpus_dict.doc2bow(single_list)]
                    features[7] = get_topic_sim(lda_model, word_corpus,
                                                corpus[index])
                    features[8] = get_topic_sim(lsi_model, word_corpus,
                                                corpus[index])
                    # feature 11: word span, computed later from the first and
                    # last occurrence positions and the document length
                    candidate_word[word] = features

        for word_index, (word, pos) in enumerate(words_title):
            if pos in candidata_pos and len(word) > 1:
                if word in candidate_word:
                    word_features = candidate_word[word]
                    # feature 10: appears in the title
                    word_features[9] = 1
                    candidate_word[word] = word_features
                else:
                    features = [0] * 14
                    features[-3] = title_len
                    # feature 1: part of speech
                    features[0] = candidata_pos[pos]
                    # feature 2: position of the first occurrence
                    if title_len == 0:
                        firoc = 0.
                    else:
                        firoc = (word_index + 1) / float(title_len)
                    features[1] = firoc
                    features[-2] = (word_index + 1)  # first occurrence
                    # feature 3: length of the candidate word
                    features[2] = len(word)
                    # feature 4: candidate consists only of digits or letters
                    if util.is_contain_char_num(word):
                        features[3] = 1
                    # feature 5: tf-idf of the candidate word
                    id = corpus_dict.token2id.get(
                        word, len(corpus_dict.token2id) + 1)
                    if id == len(corpus_dict.token2id) + 1:
                        features[4] = 1e-8
                    else:
                        for (w_id, tfidf) in tfidf_model[corpus[index]]:
                            if id == w_id:
                                features[4] = tfidf
                                break
                    # feature 6: occurrences in the first sentence
                    first_sentence = words_doc[:first_sentence_count]
                    features[5] = util.get_count_sentence(word, first_sentence)
                    # feature 7: occurrences in the last sentence [-20:]
                    last_sentence = words_doc[-last_sentence_count:]
                    features[6] = util.get_count_sentence(word, last_sentence)
                    # features 8, 9: LDA / LSI similarity between the
                    # candidate's topic distribution and the document's
                    single_list = [word]
                    word_corpus = tfidf_model[corpus_dict.doc2bow(single_list)]
                    features[7] = get_topic_sim(lda_model, word_corpus,
                                                corpus[index])
                    features[8] = get_topic_sim(lsi_model, word_corpus,
                                                corpus[index])
                    # feature 10: appears in the title
                    features[9] = 1
                    # feature 11: word span, computed later from the first and
                    # last occurrence positions and the document length
                    candidate_word[word] = features

        candidate_words.append(candidate_word)
        # save
        if index % 2000 == 0:
            print('deal with sentence %d' % index)

    # data['candidate_words'] = candidate_words
    # data.to_csv(data_candidate_path, sep='\001', header=None, index=None)
    util.save_object(candidate_words, candidate_save_path)
from topic_classification.feature_extraction_utils import \
    document_vectorize, document_vectorize_with_fasttext_model
from topic_classification.dataset_utils import load_20newsgroups
import util
import topic_classification.experiment_config as experiment_config
from topic_classification.constants import TOPIC_CLASSIFICATION_DATA_PATH
import tensorflow_hub as hub
import tensorflow as tf

gpus = tf.config.experimental.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(gpus[0], True)

data_df = load_20newsgroups()
data_string = util.load_object(TOPIC_CLASSIFICATION_DATA_PATH
                               + '20_newsgroups_one_string.txt')
data_word_list = data_string.split(' ')
vocabulary = set(data_word_list)

train_corpus, test_corpus, train_label_names, test_label_names = \
    train_test_split(np.array(data_df['Clean Article']),
                     np.array(data_df['Target Name']),
                     test_size=0.33, random_state=42)

# tokenize corpus
tokenized_train = [tn.tokenizer.tokenize(text) for text in train_corpus]
tokenized_test = [tn.tokenizer.tokenize(text) for text in test_corpus]

# # # Feature extraction
# elmo = hub.Module("https://tfhub.dev/google/elmo/3", trainable=True)
elmo = hub.KerasLayer("https://hub.tensorflow.google.cn/google/elmo/3",
                      trainable=True,
def load_from_file(self, filename):
    self.tt = util.load_object(filename)
    assert isinstance(self.tt, Trie)
def train_and_save(classifier_list, classifier_name_list, training_data):
    results = train_multiple_classifiers(classifier_list,
                                         classifier_name_list, training_data)
    util.save_object(results, RESULTS_PATH)
    util.save_classifier_list(classifier_list, classifier_name_list,
                              CLASSIFIERS_AND_RESULTS_DIR_PATH)
    return results


# Train and save on disk
# results = train_and_save(classifier_list, classifier_name_list, training_data)
# # Load from disk
classifier_list = util.load_classifier_list(classifier_name_list,
                                            CLASSIFIERS_AND_RESULTS_DIR_PATH)
results = util.load_object(RESULTS_PATH)
# results[0] = array of crossvalidation, [1] crossvalidation scores,
# [2] test score, [3] times

# # Plotting
cv_mean_scores = [round(result[1], SCORE_DECIMAL_PLACES) for result in results]
test_scores = [round(result[2], SCORE_DECIMAL_PLACES) for result in results]
elapsed_times = [round(result[3], TIME_DECIMAL_PLACES) for result in results]
# create_bar_plot(classifier_name_shortcut_list, 'Classifier scores', 'Accuracy',
#                 cv_mean_scores, y_range_tuple=(0, 1))
create_2_bar_plot(classifier_name_shortcut_list, 'Classifier scores',
                  'Accuracy', cv_mean_scores, test_scores, 'cv means',
def main(args):
    if args.mode == 'efficiency':
        num_runs = 10
        num_particles_list = [2, 5, 10, 50, 100, 500, 1000, 5000]
        num_partitions_list = [2, 5, 10, 50, 100, 500, 1000]
        path = './save/efficiency.pkl'
        (memory_thermo, time_thermo, memory_vimco, time_vimco,
         memory_reinforce, time_reinforce) = util.load_object(path)

        fig, axs = plt.subplots(1, 2, dpi=200, figsize=(6, 4))
        # colors = ['C0', 'C1', 'C2', 'C4', 'C5', 'C6', 'C7', 'C8']
        norm = matplotlib.colors.Normalize(vmin=0,
                                           vmax=len(num_particles_list))
        cmap = matplotlib.cm.ScalarMappable(norm=norm,
                                            cmap=matplotlib.cm.Blues)
        cmap.set_array([])
        colors = [cmap.to_rgba(i + 1) for i in range(len(num_particles_list))]

        for i, num_partitions in enumerate(num_partitions_list):
            axs[0].plot(num_particles_list,
                        np.mean(time_thermo[:, i], axis=-1),
                        label='thermo K={}'.format(num_partitions),
                        color=colors[i], marker='x', linestyle='none')
        axs[0].plot(num_particles_list, np.mean(time_vimco, axis=-1),
                    color='black', label='vimco', marker='o',
                    linestyle='none', fillstyle='none')
        axs[0].plot(num_particles_list, np.mean(time_reinforce, axis=-1),
                    color='black', label='reinforce', marker='v',
                    linestyle='none', fillstyle='none')
        axs[0].set_xscale('log')
        axs[0].set_yscale('log')
        axs[0].set_xlabel('number of particles')
        axs[0].set_ylabel('time (seconds)')
        axs[0].grid(True)
        axs[0].grid(True, which='minor', linewidth=0.2)
        # axs[0].legend(bbox_to_anchor=(1.13, -0.19), loc='upper center', ncol=3)
        sns.despine(ax=axs[0])

        # colors = ['C0', 'C1', 'C2', 'C4', 'C5', 'C6', 'C7', 'C8']
        for i, num_partitions in enumerate(num_partitions_list):
            axs[1].plot(num_particles_list,
                        np.mean(memory_thermo[:, i] / 1e6, axis=-1),
                        label='thermo K={}'.format(num_partitions),
                        color=colors[i], marker='x', linestyle='none')
        axs[1].plot(num_particles_list, np.mean(memory_vimco / 1e6, axis=-1),
                    color='black', label='vimco', marker='o',
                    linestyle='none', fillstyle='none')
        axs[1].plot(num_particles_list,
                    np.mean(memory_reinforce / 1e6, axis=-1),
                    color='black', label='reinforce', marker='v',
                    linestyle='none', fillstyle='none')
        axs[1].set_xscale('log')
        axs[1].set_yscale('log')
        axs[1].set_xlabel('number of particles')
        axs[1].set_ylabel('memory (MB)')
        axs[-1].legend(fontsize=6, ncol=2)
        axs[1].grid(True)
        axs[1].grid(True, which='minor', linewidth=0.2)
        sns.despine(ax=axs[1])

        fig.tight_layout()
        if not os.path.exists('./plots/'):
            os.makedirs('./plots/')
        filename = './plots/efficiency.pdf'
        fig.savefig(filename, bbox_inches='tight')
        print('saved to {}'.format(filename))
    elif args.mode == 'insights':
        markersize = 3
        learning_rate = 3e-4
        architecture = 'linear_3'
        seed = 8
        train_mode = 'thermo'
        num_particles_list = [2, 5, 10, 50]
        num_partitions_list = [2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
        log_beta_mins_1 = [-10, -1, -0.045757490560675115]
        log_beta_mins_2 = [
            -5, -2, -1.6989700043360187, -1.5228787452803376,
            -1.3979400086720375, -1.3010299956639813, -1.2218487496163564,
            -1.1549019599857433, -1.0969100130080565, -1.0457574905606752,
            -1, -0.6989700043360187, -0.5228787452803375,
            -0.3979400086720376, -0.3010299956639812, -0.2218487496163564,
            -0.15490195998574313, -0.09691001300805639, -0.045757490560675115
        ]
        num_iterations = 400
        log_p_thermo_partition_sweep = np.full(
            (len(num_particles_list), len(log_beta_mins_1),
             len(num_partitions_list), num_iterations), np.nan)
        log_p_thermo_beta_sweep = np.full(
            (len(num_particles_list), len(log_beta_mins_2), num_iterations),
            np.nan)
        for num_particles_idx, num_particles in enumerate(num_particles_list):
            for log_beta_min_idx, log_beta_min in enumerate(log_beta_mins_1):
                for num_partitions_idx, num_partitions in enumerate(
                        num_partitions_list):
                    dir_ = util.get_most_recent_dir_args_match(
                        train_mode=train_mode, architecture=architecture,
                        learning_rate=learning_rate,
                        num_particles=num_particles,
                        num_partitions=num_partitions,
                        log_beta_min=log_beta_min, seed=seed)
                    if dir_ is not None:
                        stats = util.load_object(util.get_stats_path(dir_))
                        log_p_thermo_partition_sweep[
                            num_particles_idx, log_beta_min_idx,
                            num_partitions_idx] = \
                            stats.log_p_history[:num_iterations]
                        print('thermo {} ({} partitions) beta_min = 1e{} after'
                              ' {} it: {}'.format(
                                  num_particles, num_partitions, log_beta_min,
                                  len(stats.log_p_history),
                                  stats.log_p_history[-1]))
                    else:
                        print('missing')
            num_partitions = 2
            for log_beta_min_idx, log_beta_min in enumerate(log_beta_mins_2):
                dir_ = util.get_most_recent_dir_args_match(
                    train_mode=train_mode, architecture=architecture,
                    learning_rate=learning_rate, num_particles=num_particles,
                    num_partitions=num_partitions, log_beta_min=log_beta_min,
                    seed=seed)
                if dir_ is not None:
                    stats = util.load_object(util.get_stats_path(dir_))
                    log_p_thermo_beta_sweep[
                        num_particles_idx, log_beta_min_idx] = \
                        stats.log_p_history[:num_iterations]
                    print('thermo {} ({} partitions) beta_min = 1e{} after {}'
                          ' it: {}'.format(num_particles, num_partitions,
                                           log_beta_min,
                                           len(stats.log_p_history),
                                           stats.log_p_history[-1]))
                else:
                    print('missing')

        fig, axs = plt.subplots(2, 2, dpi=200, figsize=(12, 7), sharey=True)
        for log_beta_min_idx, ax in zip(range(len(log_beta_mins_1)),
                                        [axs[0, 0], axs[0, 1], axs[1, 0]]):
            colors = ['C1', 'C2', 'C4', 'C5']
            # ax = axs[log_beta_min_idx]
            for num_particles_idx, num_particles in enumerate(
                    num_particles_list):
                ax.plot(num_partitions_list,
                        log_p_thermo_partition_sweep[num_particles_idx,
                                                     log_beta_min_idx, :, -1],
                        color=colors[num_particles_idx], label=num_particles,
                        marker='o', markersize=markersize, linestyle='solid',
                        linewidth=0.7)
            ax.set_title(r'$\beta_1 = {:.0e}$'.format(
                10**log_beta_mins_1[log_beta_min_idx]))
            # ax.set_xticks(np.arange(len(num_partitions_list)))
            # ax.set_xticklabels(num_partitions_list)
            ax.set_xlabel('number of partitions')
            ax.set_xticks(np.arange(0, max(num_partitions_list) + 1, 10))

        ax = axs[1, 1]
        for num_particles_idx, num_particles in enumerate(num_particles_list):
            ax.plot(10**np.array(log_beta_mins_2),
                    log_p_thermo_beta_sweep[num_particles_idx, :, -1],
                    color=colors[num_particles_idx], label=num_particles,
                    marker='o', markersize=markersize, linestyle='solid',
                    linewidth=0.7)
        ax.set_xticks(np.arange(0, 1.1, 0.2))
        ax.set_title('2 partitions')
        ax.set_xlabel(r'$\beta_1$')
        print(np.max(log_p_thermo_beta_sweep[..., -1], axis=-1))
        print(np.argmax(log_p_thermo_beta_sweep[..., -1], axis=-1))
        print([log_beta_mins_2[i]
               for i in np.argmax(log_p_thermo_beta_sweep[..., -1], axis=-1)])
        print([10**log_beta_mins_2[i]
               for i in np.argmax(log_p_thermo_beta_sweep[..., -1], axis=-1)])
        # print(log_beta_mins_2[np.argmax(log_p_thermo_beta_sweep[..., -1], axis=-1)])

        for axx in axs:
            for ax in axx:
                ax.grid(True, axis='y')
        for ax in axs[:, 0]:
            ax.set_ylim(top=-88)
            ax.set_ylabel(r'$\log p(x)$')
        axs[1, 1].legend(title='number of particles', ncol=2,
                         loc='lower right')
        for axx in axs:
            for ax in axx:
                sns.despine(ax=ax, trim=True)

        fig.tight_layout()
        if not os.path.exists('./plots/'):
            os.makedirs('./plots/')
        filename = './plots/insights.pdf'
        fig.savefig(filename, bbox_inches='tight')
        print('saved to {}'.format(filename))
    elif args.mode == 'baselines':
        learning_rate = 3e-4
        architecture = 'linear_3'
        seed = 8
        non_thermo_train_modes = ['ww', 'vimco']
        num_particles_list = [2, 5, 10, 50]
        num_partitions_list = [2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
        # log_beta_mins_1 = [-10, -1, -0.045757490560675115]
        log_beta_mins_2 = [
            -5, -2, -1.6989700043360187, -1.5228787452803376,
            -1.3979400086720375, -1.3010299956639813, -1.2218487496163564,
            -1.1549019599857433, -1.0969100130080565, -1.0457574905606752,
            -1, -0.6989700043360187, -0.5228787452803375,
            -0.3979400086720376, -0.3010299956639812, -0.2218487496163564,
            -0.15490195998574313, -0.09691001300805639, -0.045757490560675115
        ]
        num_iterations = 400
        log_p_thermo_beta_sweep = np.full(
            (len(num_particles_list), len(log_beta_mins_2), num_iterations),
            np.nan)
        log_p_non_thermo = np.full(
            (len(non_thermo_train_modes), len(num_particles_list),
             num_iterations), np.nan)

        train_mode = 'thermo'
        for num_particles_idx, num_particles in enumerate(num_particles_list):
            num_partitions = 2
            for log_beta_min_idx, log_beta_min in enumerate(log_beta_mins_2):
                dir_ = util.get_most_recent_dir_args_match(
                    train_mode=train_mode, architecture=architecture,
                    learning_rate=learning_rate, num_particles=num_particles,
                    num_partitions=num_partitions, log_beta_min=log_beta_min,
                    seed=seed)
                if dir_ is not None:
                    stats = util.load_object(util.get_stats_path(dir_))
                    log_p_thermo_beta_sweep[
                        num_particles_idx,
                        log_beta_min_idx] = stats.log_p_history
                    print('thermo {} ({} partitions) beta_min = 1e{} after {}'
                          ' it: {}'.format(num_particles, num_partitions,
                                           log_beta_min,
                                           len(stats.log_p_history),
                                           stats.log_p_history[-1]))
                else:
                    print('missing')

        seed = 7
        log_beta_min = -10
        learning_rate = 3e-4
        num_partitions = 1
        for train_mode_idx, train_mode in enumerate(non_thermo_train_modes):
            for num_particles_idx, num_particles in enumerate(
                    num_particles_list):
                dir_ = util.get_most_recent_dir_args_match(
                    train_mode=train_mode, architecture=architecture,
                    learning_rate=learning_rate, num_particles=num_particles,
                    num_partitions=num_partitions, log_beta_min=log_beta_min,
                    seed=seed)
                if dir_ is not None:
                    stats = util.load_object(util.get_stats_path(dir_))
                    log_p_non_thermo[
                        train_mode_idx, num_particles_idx,
                        :len(stats.log_p_history)] = stats.log_p_history
                    print('{} {} after {} it: {}'.format(
                        train_mode, num_particles, len(stats.log_p_history),
                        stats.log_p_history[-1]))
                else:
                    print('missing')

        fig, ax = plt.subplots(1, 1, dpi=200, figsize=(6, 4))
        colors = ['C1', 'C2', 'C4', 'C5']
        linestyles = ['dashed', 'dotted']
        for train_mode_idx, train_mode in enumerate(non_thermo_train_modes):
            for num_particles_idx, num_particles in enumerate(
                    num_particles_list):
                if train_mode == 'ww':
                    label = 'rws'
                else:
                    label = train_mode
                ax.plot(log_p_non_thermo[train_mode_idx, num_particles_idx],
                        linestyle=linestyles[train_mode_idx],
                        color=colors[num_particles_idx],
                        label='{} {} ({:.2f})'.format(
                            label, num_particles,
                            log_p_non_thermo[train_mode_idx,
                                             num_particles_idx, -1]))

        # best_num_particles_idx = 3
        # best_beta_idxs = [4, 5, 11]
        # best_beta_idxs = [0, 4, 7, 11]
        best_beta_idxs = [18, 5, 11, 12]
        for num_particles_idx, num_particles in enumerate(num_particles_list):
            best_beta_idx = best_beta_idxs[num_particles_idx]
            color = colors[num_particles_idx]
            ax.plot(
                log_p_thermo_beta_sweep[num_particles_idx, best_beta_idx],
                linestyle='solid', color=color,
                label='thermo S={}, K={}, $\\beta_1$={:.0e} ({:.2f})'.format(
                    num_particles_list[num_particles_idx], 2,
                    10**(log_beta_mins_2[best_beta_idx]),
                    log_p_thermo_beta_sweep[num_particles_idx,
                                            best_beta_idx, -1]))

        ax.set_ylim(-110)
        ax.grid(True, axis='y', linewidth=0.2)
        ax.legend(fontsize=6, ncol=3, frameon=False)
        ax.set_ylabel(r'$\log p(x)$')
        ax.set_xlabel('iteration')
        ax.xaxis.set_label_coords(0.5, -0.025)
        ax.set_xticks([0, num_iterations])
        ax.set_xticklabels([0, '4e6'])
        sns.despine(ax=ax, trim=True)

        fig.tight_layout()
        if not os.path.exists('./plots/'):
            os.makedirs('./plots/')
        filename = './plots/baselines.pdf'
        fig.savefig(filename, bbox_inches='tight')
        print('saved to {}'.format(filename))
    elif args.mode == 'grad_std':
        learning_rate = 3e-4
        architecture = 'linear_3'
        seed = 8
        non_thermo_train_modes = ['ww', 'vimco']
        num_particles_list = [2, 5, 10, 50]
        num_partitions_list = [2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
        log_beta_mins_1 = [-10, -1, -0.045757490560675115]
        log_beta_mins_2 = [
            -5, -2, -1.6989700043360187, -1.5228787452803376,
            -1.3979400086720375, -1.3010299956639813, -1.2218487496163564,
            -1.1549019599857433, -1.0969100130080565, -1.0457574905606752,
            -1, -0.6989700043360187, -0.5228787452803375,
            -0.3979400086720376, -0.3010299956639812, -0.2218487496163564,
            -0.15490195998574313, -0.09691001300805639, -0.045757490560675115
        ]
        num_iterations = 400
        log_p_thermo_beta_sweep = np.full(
            (len(num_particles_list), len(log_beta_mins_2), num_iterations),
            np.nan)
        log_p_non_thermo = np.full(
            (len(non_thermo_train_modes), len(num_particles_list),
             num_iterations), np.nan)

        train_mode = 'thermo'
        for num_particles_idx, num_particles in enumerate(num_particles_list):
            num_partitions = 2
            for log_beta_min_idx, log_beta_min in enumerate(log_beta_mins_2):
                dir_ = util.get_most_recent_dir_args_match(
                    train_mode=train_mode, architecture=architecture,
                    learning_rate=learning_rate, num_particles=num_particles,
                    num_partitions=num_partitions, log_beta_min=log_beta_min,
                    seed=seed)
                if dir_ is not None:
                    stats = util.load_object(util.get_stats_path(dir_))
                    log_p_thermo_beta_sweep[
                        num_particles_idx,
                        log_beta_min_idx] = stats.grad_std_history
                    print('thermo {} ({} partitions) beta_min = 1e{} after {}'
                          ' it: {}'.format(num_particles, num_partitions,
                                           log_beta_min,
                                           len(stats.log_p_history),
                                           stats.log_p_history[-1]))
                else:
                    print('missing')

        seed = 7
        log_beta_min = -10
        learning_rate = 3e-4
        num_partitions = 1
        for train_mode_idx, train_mode in enumerate(non_thermo_train_modes):
            for num_particles_idx, num_particles in enumerate(
                    num_particles_list):
                dir_ = util.get_most_recent_dir_args_match(
                    train_mode=train_mode, architecture=architecture,
                    learning_rate=learning_rate, num_particles=num_particles,
                    num_partitions=num_partitions, log_beta_min=log_beta_min,
                    seed=seed)
                if dir_ is not None:
                    stats = util.load_object(util.get_stats_path(dir_))
                    log_p_non_thermo[
                        train_mode_idx, num_particles_idx,
                        :len(stats.log_p_history)] = stats.grad_std_history
                    print('{} {} after {} it: {}'.format(
                        train_mode, num_particles, len(stats.log_p_history),
                        stats.log_p_history[-1]))
                else:
                    print('missing')

        fig, ax = plt.subplots(1, 1, dpi=200, figsize=(6, 4))
        colors = ['C1', 'C2', 'C4', 'C5']
        linestyles = ['dashed', 'dotted']
        for train_mode_idx, train_mode in enumerate(non_thermo_train_modes):
            for num_particles_idx, num_particles in enumerate(
                    num_particles_list):
                if train_mode == 'ww':
                    label = 'rws'
                else:
                    label = train_mode
                ax.plot(log_p_non_thermo[train_mode_idx, num_particles_idx],
                        linestyle=linestyles[train_mode_idx],
                        color=colors[num_particles_idx],
                        label='{} {} ({:.2f})'.format(
                            label, num_particles,
                            log_p_non_thermo[train_mode_idx,
                                             num_particles_idx, -1]))

        # best_num_particles_idx = 3
        # best_beta_idxs = [4, 5, 11]
        # best_beta_idxs = [0, 4, 5, 11]
        best_beta_idxs = [18, 5, 11, 12]
        for num_particles_idx, num_particles in enumerate(num_particles_list):
            best_beta_idx = best_beta_idxs[num_particles_idx]
            color = colors[num_particles_idx]
            ax.plot(
                log_p_thermo_beta_sweep[num_particles_idx, best_beta_idx],
                linestyle='solid', color=color,
                label='thermo S={}, K={}, $\\beta_1$={:.0e} ({:.2f})'.format(
                    num_particles_list[num_particles_idx], 2,
                    10**(log_beta_mins_2[best_beta_idx]),
                    log_p_thermo_beta_sweep[num_particles_idx,
                                            best_beta_idx, -1]))

        ax.set_ylim(0, 20)
        ax.grid(True, axis='y', linewidth=0.2)
        ax.legend(fontsize=6, ncol=3, frameon=False)
        ax.set_ylabel(r'grad std')
        ax.set_xlabel('iteration')
        ax.xaxis.set_label_coords(0.5, -0.025)
        ax.set_xticks([0, num_iterations])
        ax.set_xticklabels([0, '4e6'])
        sns.despine(ax=ax, trim=True)

        fig.tight_layout()
        if not os.path.exists('./plots/'):
            os.makedirs('./plots/')
        filename = './plots/grad_std.pdf'
        fig.savefig(filename, bbox_inches='tight')
        print('saved to {}'.format(filename))
    elif args.mode == 'baselines_kl':
        learning_rate = 3e-4
        architecture = 'linear_3'
        seed = 8
        non_thermo_train_modes = ['ww', 'vimco']
        num_particles_list = [2, 5, 10, 50]
        num_partitions_list = [2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
        # log_beta_mins_1 = [-10, -1, -0.045757490560675115]
        log_beta_mins_2 = [
            -5, -2, -1.6989700043360187, -1.5228787452803376,
            -1.3979400086720375, -1.3010299956639813, -1.2218487496163564,
            -1.1549019599857433, -1.0969100130080565, -1.0457574905606752,
            -1, -0.6989700043360187, -0.5228787452803375,
            -0.3979400086720376, -0.3010299956639812, -0.2218487496163564,
            -0.15490195998574313, -0.09691001300805639, -0.045757490560675115
        ]
        num_iterations = 400
        log_p_thermo_beta_sweep = np.full(
            (len(num_particles_list), len(log_beta_mins_2), num_iterations),
            np.nan)
        log_p_non_thermo = np.full(
            (len(non_thermo_train_modes), len(num_particles_list),
             num_iterations), np.nan)

        train_mode = 'thermo'
        for num_particles_idx, num_particles in enumerate(num_particles_list):
            num_partitions = 2
            for log_beta_min_idx, log_beta_min in enumerate(log_beta_mins_2):
                dir_ = util.get_most_recent_dir_args_match(
                    train_mode=train_mode, architecture=architecture,
                    learning_rate=learning_rate, num_particles=num_particles,
                    num_partitions=num_partitions, log_beta_min=log_beta_min,
                    seed=seed)
                if dir_ is not None:
                    stats = util.load_object(util.get_stats_path(dir_))
                    log_p_thermo_beta_sweep[
                        num_particles_idx,
                        log_beta_min_idx] = stats.kl_history
                    print('thermo {} ({} partitions) beta_min = 1e{} after {}'
                          ' it: {}'.format(num_particles, num_partitions,
                                           log_beta_min,
                                           len(stats.log_p_history),
                                           stats.log_p_history[-1]))
                else:
                    print('missing')

        seed = 7
        log_beta_min = -10
        learning_rate = 3e-4
        num_partitions = 1
        for train_mode_idx, train_mode in enumerate(non_thermo_train_modes):
            for num_particles_idx, num_particles in enumerate(
                    num_particles_list):
                dir_ = util.get_most_recent_dir_args_match(
                    train_mode=train_mode, architecture=architecture,
                    learning_rate=learning_rate, num_particles=num_particles,
                    num_partitions=num_partitions, log_beta_min=log_beta_min,
                    seed=seed)
                if dir_ is not None:
                    stats = util.load_object(util.get_stats_path(dir_))
                    log_p_non_thermo[
                        train_mode_idx, num_particles_idx,
                        :len(stats.kl_history)] = stats.kl_history
                    print('{} {} after {} it: {}'.format(
                        train_mode, num_particles, len(stats.log_p_history),
                        stats.log_p_history[-1]))
                else:
                    print('missing')

        fig, ax = plt.subplots(1, 1, dpi=200, figsize=(6, 4))
        colors = ['C1', 'C2', 'C4', 'C5']
        linestyles = ['dashed', 'dotted']
        for train_mode_idx, train_mode in enumerate(non_thermo_train_modes):
            for num_particles_idx, num_particles in enumerate(
                    num_particles_list):
                if train_mode == 'ww':
                    label = 'rws'
                else:
                    label = train_mode
                ax.plot(log_p_non_thermo[train_mode_idx, num_particles_idx],
                        linestyle=linestyles[train_mode_idx],
                        color=colors[num_particles_idx],
                        label='{} {} ({:.2f})'.format(
                            label, num_particles,
                            log_p_non_thermo[train_mode_idx,
                                             num_particles_idx, -1]))

        # best_num_particles_idx = 3
        # best_beta_idxs = [4, 5, 11]
        # best_beta_idxs = [0, 4, 5, 11]
        best_beta_idxs = [18, 5, 11, 12]
        for num_particles_idx, num_particles in enumerate(num_particles_list):
            best_beta_idx = best_beta_idxs[num_particles_idx]
            color = colors[num_particles_idx]
            ax.plot(
                log_p_thermo_beta_sweep[num_particles_idx, best_beta_idx],
                linestyle='solid', color=color,
                label='thermo S={}, K={}, $\\beta_1$={:.0e} ({:.2f})'.format(
                    num_particles_list[num_particles_idx], 2,
                    10**(log_beta_mins_2[best_beta_idx]),
                    log_p_thermo_beta_sweep[num_particles_idx,
                                            best_beta_idx, -1]))

        ax.set_ylim(5, 20)
        ax.grid(True, axis='y', linewidth=0.2)
        ax.legend(fontsize=6, ncol=3, frameon=False)
        ax.set_ylabel(r'KL(q || p)')
        ax.set_xlabel('iteration')
        ax.xaxis.set_label_coords(0.5, -0.025)
        ax.set_xticks([0, num_iterations])
        ax.set_xticklabels([0, '4e6'])
        sns.despine(ax=ax, trim=True)

        fig.tight_layout()
        if not os.path.exists('./plots/'):
            os.makedirs('./plots/')
        filename = './plots/baselines_kl.pdf'
        fig.savefig(filename, bbox_inches='tight')
        print('saved to {}'.format(filename))
def source_event_counter(enrollment_set, base_date):
    """
    Counts the source-event pairs.

    Features
    --------
    """
    X_pkl_path = util.cache_path('source_event_counter_before_%s' %
                                 base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    if os.path.exists(X_pkl_path):
        return util.fetch(X_pkl_path)

    logger = logging.getLogger('source_event_counter')
    logger.debug('preparing datasets')

    Enroll_all = util.load_enrollments()

    pkl_path = util.cache_path('Log_all_before_%s' %
                               base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    if os.path.exists(pkl_path):
        Log = util.fetch(pkl_path)
    else:
        Log = util.load_logs()
        Log = Log[Log['time'] <= base_date]
        Log['source_event'] = Log['source'] + '-' + Log['event']
        Log['day_diff'] = (base_date - Log['time']).dt.days
        Log['week_diff'] = Log['day_diff'] // 7
        Log['event_count'] = 1
        util.dump(Log, pkl_path)

    Log_counted = Log.groupby(['enrollment_id', 'source_event', 'week_diff'])\
        .agg({'event_count': np.sum}).reset_index()

    logger.debug('datasets prepared')

    # DataFrame.ix was removed from pandas; .loc does the same label lookup
    Enroll = Enroll_all.set_index('enrollment_id').loc[enrollment_set]\
        .reset_index()

    n_proc = par.cpu_count()

    pkl_path = util.cache_path('event_count_by_eid_before_%s' %
                               base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    if os.path.exists(pkl_path):
        event_count_by_eid = util.fetch(pkl_path)
    else:
        params = []
        eids = []
        for eid, df in pd.merge(Enroll_all, Log_counted,
                                on=['enrollment_id'])\
                .groupby(['enrollment_id']):
            params.append(df)
            eids.append(eid)
        pool = par.Pool(processes=min(n_proc, len(params)))
        event_count_by_eid = dict(
            zip(eids, pool.map(__get_counting_feature__, params)))
        pool.close()
        pool.join()
        util.dump(event_count_by_eid, pkl_path)

    X0 = np.array([event_count_by_eid[i] for i in Enroll['enrollment_id']])
    logger.debug('source-event pairs counted, has nan: %s, shape: %s',
                 np.any(np.isnan(X0)), repr(X0.shape))

    pkl_path = util.cache_path('D_full_before_%s' %
                               base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    if os.path.exists(pkl_path):
        D_full = util.fetch(pkl_path)
    else:
        D_full = pd.merge(Enroll_all, Log, on=['enrollment_id'])
        util.dump(D_full, pkl_path)

    pkl_path = util.cache_path('user_wn_courses_before_%s' %
                               base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    if os.path.exists(pkl_path):
        user_wn_courses = util.fetch(pkl_path)
    else:
        user_wn_courses = {}
        for u, df in D_full.groupby(['username']):
            x = []
            for wn in __week_span__:
                x.append(len(df[df['week_diff'] == wn]['course_id'].unique()))
            user_wn_courses[u] = x
        util.dump(user_wn_courses, pkl_path)

    X1 = np.array([user_wn_courses[u] for u in Enroll['username']])
    logger.debug('courses by user counted, has nan: %s, shape: %s',
                 np.any(np.isnan(X1)), repr(X1.shape))

    pkl_path = util.cache_path('course_population_before_%s' %
                               base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    if os.path.exists(pkl_path):
        course_population = util.fetch(pkl_path)
    else:
        course_population = {}
        for c, df in D_full.groupby(['course_id']):
            course_population[c] = len(df['username'].unique())
        util.dump(course_population, pkl_path)

    X2 = np.array([course_population.get(c, 0) for c in Enroll['course_id']])
    logger.debug('course population counted, has nan: %s, shape: %s',
                 np.any(np.isnan(X2)), repr(X2.shape))

    pkl_path = util.cache_path('course_dropout_count_before_%s' %
                               base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    if os.path.exists(pkl_path):
        course_dropout_count = util.fetch(pkl_path)
    else:
        course_dropout_count = course_population.copy()
        for c, df in D_full[D_full['day_diff'] < 10].groupby(['course_id']):
            course_dropout_count[c] -= len(df['username'].unique())
        util.dump(course_dropout_count, pkl_path)

    X3 = np.array([course_dropout_count.get(c, 0)
                   for c in Enroll['course_id']])
    logger.debug('course dropout counted, has nan: %s, shape: %s',
                 np.any(np.isnan(X3)), repr(X3.shape))

    pkl_path = util.cache_path('user_ops_count_before_%s' %
                               base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    if os.path.exists(pkl_path):
        user_ops_count = util.fetch(pkl_path)
    else:
        user_ops_on_all_courses = D_full.groupby(
            ['username', 'source_event', 'week_diff'])\
            .agg({'event_count': np.sum}).reset_index()
        params = []
        users = []
        for u, df in user_ops_on_all_courses.groupby(['username']):
            params.append(df)
            users.append(u)
        pool = par.Pool(processes=min(n_proc, len(params)))
        user_ops_count = dict(
            zip(users, pool.map(__get_counting_feature__, params)))
        pool.close()
        pool.join()
        util.dump(user_ops_count, pkl_path)

    X4 = X0 / [user_ops_count[u] for u in Enroll['username']]
    X4[np.isnan(X4)] = 0
    logger.debug('ratio of user ops on all courses, has nan: %s, shape: %s',
                 np.any(np.isnan(X4)), repr(X4.shape))

    pkl_path = util.cache_path('course_ops_count_before_%s' %
                               base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    if os.path.exists(pkl_path):
        course_ops_count = util.fetch(pkl_path)
    else:
        course_ops_of_all_users = D_full.groupby(
            ['course_id', 'source_event', 'week_diff'])\
            .agg({'event_count': np.sum}).reset_index()
        params = []
        courses = []
        for c, df in course_ops_of_all_users.groupby(['course_id']):
            params.append(df)
            courses.append(c)
        pool = par.Pool(processes=min(n_proc, len(params)))
        course_ops_count = dict(
            zip(courses, pool.map(__get_counting_feature__, params)))
        pool.close()
        pool.join()
        util.dump(course_ops_count, pkl_path)

    X5 = X0 / [course_ops_count[c] for c in Enroll['course_id']]
    X5[np.isnan(X5)] = 0
    logger.debug('ratio of courses ops of all users, has nan: %s, shape: %s',
                 np.any(np.isnan(X5)), repr(X5.shape))

    X6 = np.array([course_dropout_count.get(c, 0) / course_population.get(c, 1)
                   for c in Enroll['course_id']])
    logger.debug('dropout ratio of courses, has nan: %s, shape: %s',
                 np.any(np.isnan(X6)), repr(X6.shape))

    Obj = util.load_object()
    Obj = Obj[Obj['start'] <= base_date]
    course_time = {}
    for c, df in Obj.groupby(['course_id']):
        start_time = np.min(df['start'])
        update_time = np.max(df['start'])
        course_time[c] = [(base_date - start_time).days,
                          (base_date - update_time).days]
    avg_start_days = np.average([t[0] for _, t in course_time.items()])
    avg_update_days = np.average([t[1] for _, t in course_time.items()])
    default_case = [avg_start_days, avg_update_days]

    X7 = np.array([course_time.get(c, default_case)[0]
                   for c in Enroll['course_id']])
    logger.debug('days from course first update, has nan: %s, shape: %s',
                 np.any(np.isnan(X7)), repr(X7.shape))

    X8 = np.array([course_time.get(c, default_case)[1]
                   for c in Enroll['course_id']])
    logger.debug('days from course last update, has nan: %s, shape: %s',
                 np.any(np.isnan(X8)), repr(X8.shape))

    user_ops_time = pd.merge(Enroll, Log, how='left', on=['enrollment_id'])\
        .groupby(['enrollment_id']).agg({'day_diff': [np.min, np.max]})\
        .fillna(0)
    X9 = np.array(user_ops_time['day_diff']['amin'])
    logger.debug('days from user last op, has nan: %s, shape: %s',
                 np.any(np.isnan(X9)), repr(X9.shape))

    X10 = np.array(user_ops_time['day_diff']['amax'])
    logger.debug('days from user first op, has nan: %s, shape: %s',
                 np.any(np.isnan(X10)), repr(X10.shape))

    X11 = X7 - X10
    logger.debug('days from course first update to user first op, has nan: %s'
                 ', shape: %s', np.any(np.isnan(X11)), repr(X11.shape))

    X = np.c_[X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11]
    util.dump(X, X_pkl_path)

    return X
        raise SystemExit("%s has no section." % (fname,))

    if not args:
        raise SystemExit("No target was given.")
    target = args[0]

    try:
        ck = parser.get(target, "consumer_key")
        secret = parser.get(target, "consumer_secret")
        if not (ck and secret):
            msg = ("No consumer token was found.",
                   "Check 'consumer_key' and 'consumer_secret' on %s."
                   % (target))
            raise SystemExit("\n".join(msg))
    # Python 3 exception syntax; the original used the Python 2 form
    except ConfigParser.NoOptionError as e:
        raise SystemExit(str(e))
    consumer_token = (ck, secret)

    sample_user = "******"
    API = load_object(SERVICE[target])
    ret = models.find(models.AccessToken, service_provider_name=target,
                      user_name=sample_user)
    if ret:
        access_token = (ret.oauth_token_key, ret.oauth_token_secret)
        client = create_client(consumer_token, access_token)
        api = API(client)
        if target in SERVICE_SAMPLES:
            run = load_object(SERVICE_SAMPLES[target])
            run(api)
        else:
            logging.warn("No sample was found for %s." % (target,))
    else:
        client = create_client(consumer_token)
        api = API(client)
        access_token = api.initialize()
        if access_token: