def __init__(self):
    new_line(50)
    print 'started to build all training and testing data...'
    self.tfidf_transformer = TfidfTransformer()
    self.vectorizer = CountVectorizer(analyzer='word', stop_words=stop_words,
                                      min_df=5, tokenizer=stemTokenizer)
    self.svd = TruncatedSVD(n_components=50, random_state=42)
    self.nmf = NMF(n_components=50, random_state=42)
    self.mm = MinMaxScaler()

    # build training data
    self.train_data = fetch_data(categories, 'train')
    self.train_labels = build_labels(self.train_data)
    self.vectors = self.to_vec(self.train_data.data)
    self.tfidf = self.to_tfidf(self.vectors)
    self.tfidf_SVD = self.to_SVD(self.tfidf)
    self.tfidf_NMF = self.to_NMF(self.tfidf)
    self.tfidf_mm = self.mm.fit_transform(self.tfidf_SVD)

    # build testing data
    self.test_data = fetch_data(categories, 'test')
    self.test_labels = build_labels(self.test_data)
    self.test_vectors = self.vectorizer.transform(self.test_data.data)
    self.test_tfidf = self.tfidf_transformer.transform(self.test_vectors)
    self.test_tfidf_SVD = self.svd.transform(self.test_tfidf)
    self.test_tfidf_NMF = self.nmf.transform(self.test_tfidf)
    self.test_tfidf_mm = self.mm.fit_transform(self.test_tfidf_SVD)

    print 'finished building all training and testing data...'
    new_line(50)
    print ' '
def test(sess, model, test_url, batch_size):
    test_set, test_count, _ = utils.data_set(test_url)
    test_batches = utils.create_batches(len(test_set), batch_size, shuffle=False)
    loss_sum = 0.0
    kld_sum = 0.0
    ppx_sum = 0.0
    word_count = 0
    doc_count = 0
    for idx_batch in test_batches:
        data_batch, count_batch, mask = utils.fetch_data(
            test_set, test_count, idx_batch, FLAGS.vocab_size)
        input_feed = {model.x.name: data_batch, model.mask.name: mask}
        loss, kld = sess.run([model.objective, model.kld], input_feed)
        loss_sum += np.sum(loss)
        kld_sum += np.sum(kld) / np.sum(mask)
        word_count += np.sum(count_batch)
        # to avoid division by zero in the per-document perplexity
        count_batch = np.add(count_batch, 1e-12)
        ppx_sum += np.sum(np.divide(loss, count_batch))
        doc_count += np.sum(mask)
    print_ppx = np.exp(loss_sum / word_count)
    print_ppx_perdoc = np.exp(ppx_sum / doc_count)
    print_kld = kld_sum / len(test_batches)
    print('| Epoch test: {:d} |'.format(1),
          '| Perplexity: {:.9f}'.format(print_ppx),
          '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc),
          '| KLD: {:.5}'.format(print_kld))
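# A minimal, self-contained sketch of the perplexity bookkeeping used above,
# assuming plain NumPy arrays of per-document losses and token counts.
# The helper name and its inputs are illustrative, not part of the original code.
import numpy as np

def corpus_and_perdoc_perplexity(doc_losses, doc_counts):
    """Corpus ppx = exp(sum(loss) / sum(tokens)); per-doc ppx averages loss/tokens per doc."""
    doc_losses = np.asarray(doc_losses, dtype=float)
    doc_counts = np.asarray(doc_counts, dtype=float) + 1e-12  # avoid divide-by-zero
    corpus_ppx = np.exp(doc_losses.sum() / doc_counts.sum())
    perdoc_ppx = np.exp(np.mean(doc_losses / doc_counts))
    return corpus_ppx, perdoc_ppx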
def forecast_plot(tickername, steps):
    data = fetch_data(tickername).reset_index()
    data['Type'] = "HISTORICAL"
    model = build_model(tickername)
    fcast = model.forecast(int(steps))
    new_series = pd.date_range(data['Date'].iloc[-1], periods=int(steps))
    fcast_df = pd.DataFrame({'Date': new_series, 'Close': fcast[0], 'Type': "FORECAST"})
    final_df = pd.concat([data[['Date', 'Close', 'Type']], fcast_df])
    fig = px.line(final_df, x='Date', y='Close', color='Type')
    return fig.to_html()
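# Hedged usage sketch: render the combined history + forecast chart for a
# hypothetical ticker and save it to disk. The symbol, step count, and output
# path are illustrative; fetch_data/build_model are assumed to be the helpers
# defined in this module.
if __name__ == '__main__':
    html = forecast_plot('AAPL', steps=30)  # e.g. a 30-step forecast
    with open('forecast.html', 'w') as f:
        f.write(html)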
def question_i():
    categories = [
        "comp.graphics", "comp.os.ms-windows.misc",
        "comp.sys.ibm.pc.hardware", "comp.sys.mac.hardware",
        "rec.autos", "rec.motorcycles",
        "rec.sport.baseball", "rec.sport.hockey"
    ]
    train, test = utils.fetch_data(categories)
    # collapse the 8 newsgroups into 2 classes: computer (indices 0-3) vs recreation (4-7)
    train.target = list(map(lambda x: int(0 <= x < 4), train.target))
    test.target = list(map(lambda x: int(0 <= x < 4), test.target))
    params = list(range(-3, 4))
    l1_accuracies = []
    l2_accuracies = []
    for param in params:
        l1_classifier = LogisticRegression(penalty='l1', C=10**param, solver='liblinear')
        logging.info("Regularization Parameter set to {0}".format(param))
        l1_accuracies.append(
            utils.classify(l1_classifier, "Logistic Regression l1",
                           train, test, cv=False, mean=True))
        l2_classifier = LogisticRegression(penalty='l2', C=10**param, solver='liblinear')
        l2_accuracies.append(
            utils.classify(l2_classifier, "Logistic Regression l2",
                           train, test, cv=False, mean=True))
    plt.figure(1)
    plt.subplot(211)
    plt.plot(l1_accuracies)
    plt.xticks(range(len(params)), [10**param for param in params])
    plt.title("Accuracy of L1 Logistic Regression vs regularization parameter")
    plt.subplot(212)
    plt.plot(l2_accuracies)
    plt.xticks(range(len(params)), [10**param for param in params])
    plt.title("Accuracy of L2 Logistic Regression vs regularization parameter")
    plt.show()
def j(self):
    print_question('j')
    # build training data
    train = fetch_data(cat_4, 'train')
    vectors = self.to_vec(train.data)
    tfidf = self.to_tfidf(vectors)
    nmf = self.to_NMF(tfidf)
    # build testing data
    test = fetch_data(cat_4, 'test')
    vectors_test = self.vectorizer.transform(test.data)
    tfidf_test = self.tfidf_transformer.transform(vectors_test)
    nmf_test = self.nmf.transform(tfidf_test)
    # build classifiers
    svc = svm.LinearSVC(C=1, random_state=42)
    nb = MultinomialNB()
    ovo = OneVsOneClassifier(svc)
    ovr = OneVsRestClassifier(svc)
    # train and test
    self.multi_classify(nb, nmf, nmf_test, train.target, test.target, 'naive bayes')
    self.multi_classify(ovo, nmf, nmf_test, train.target, test.target, 'one vs one')
    self.multi_classify(ovr, nmf, nmf_test, train.target, test.target, 'one vs rest')
def forecast_plot(tickername, steps):
    data = fetch_data(tickername).reset_index()
    data['Type'] = 'HISTORICAL'
    model = build_model(tickername)
    fcast = model.forecast(int(steps))
    new_series = pd.date_range(data['Date'].iloc[-1], periods=int(steps))
    fcast_df = pd.DataFrame({
        'Date': new_series,
        'Close': fcast[0],
        'Type': 'FORECAST'
    })
    final_df = pd.concat([data[['Date', 'Close', 'Type']], fcast_df])
    fig = px.line(final_df, x='Date', y='Close', color='Type')
    # fig = go.Figure([go.Scatter(x=data['Date'], y=data['Close'])])
    # fig.add_trace(go.Scatter(x=fcast['Date'], y=fcast[0]))
    return fig.to_html()
def __init__(self):
    self.tfidf_transformer = TfidfTransformer()
    self.vectorizer = CountVectorizer(analyzer='word', stop_words=stop_words,
                                      min_df=3, tokenizer=tokenizer)
    self.svd = TruncatedSVD(n_components=1000, random_state=0)

    # build training data
    self.train_data = fetch_data(categories, 'train')
    self.train_labels = build_labels(self.train_data)
    self.vectors = self.to_vec(self.train_data.data)
    self.tfidf = self.to_tfidf(self.vectors)

    # build clustering required data
    self.lsip2 = TruncatedSVD(n_components=2, random_state=0)
    self.lsi_data = self.lsip2.fit_transform(self.tfidf)
    self.nmfp2 = NMF(n_components=2, init='random', random_state=0)
    self.nmf_data = self.nmfp2.fit_transform(self.tfidf)
def train(nvdm, train_url, optimizer, batch_size=64, training_epochs=1000):
    train_set, train_count = utils.data_set(train_url)
    for epoch in range(training_epochs):
        train_batches = utils.create_batches(len(train_set), batch_size)
        loss_sum = 0.0
        for idx_batch in train_batches:
            data_batch, count_batch, mask = utils.fetch_data(
                train_set, train_count, idx_batch, 2000)
            data_batch = torch.FloatTensor(data_batch)
            mask = torch.FloatTensor(mask)
            loss = nvdm(data_batch, mask)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            loss_sum += loss.item()
        print(loss_sum / len(train_batches))
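# Hedged usage sketch: wire the training loop above to a model and optimizer.
# The NVDM constructor signature, file path, and hyperparameters below are
# illustrative assumptions, not taken from the original project.
import torch

model = NVDM(vocab_size=2000, n_hidden=500, n_topic=50)        # assumed signature
optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)      # example learning rate
train(model, 'data/train.feat', optimizer, batch_size=64, training_epochs=1000)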
def question_j():
    logging.info("<Question J> Multiclass Classification")
    category = [
        'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
        'misc.forsale', 'soc.religion.christian'
    ]
    train, test = utils.fetch_data(category)
    train_idf = utils.model_data(train)
    test_idf = utils.model_data(test)
    logging.info("Creating TFxIDF Vector Representations")
    logging.info("Performing LSI on TFxIDF Matrices")

    # apply LSI to TFxIDF matrices
    svd = TruncatedSVD(n_components=50)
    train_lsi = svd.fit_transform(train_idf)
    test_lsi = svd.transform(test_idf)  # reuse the SVD fitted on the training data
    logging.info("TFxIDF Matrices Transformed")
    logging.info("Size of Transformed Training Dataset: {0}".format(train_lsi.shape))
    logging.info("Size of Transformed Testing Dataset: {0}".format(test_lsi.shape))

    clf_list = [
        OneVsOneClassifier(GaussianNB()),
        OneVsOneClassifier(svm.SVC(kernel='linear')),
        OneVsRestClassifier(GaussianNB()),
        OneVsRestClassifier(svm.SVC(kernel='linear'))
    ]
    clf_name = [
        'OneVsOneClassifier Naive Bayes', 'OneVsOneClassifier SVM',
        'OneVsRestClassifier Naive Bayes', 'OneVsRestClassifier SVM'
    ]

    # perform classification
    for clf, clf_n in zip(clf_list, clf_name):
        logging.info("Training {0} Classifier ".format(clf_n))
        clf.fit(train_lsi, train.target)
        logging.info("Testing {0} Classifier".format(clf_n))
        test_predicted = clf.predict(test_lsi)
        utils.calculate_stats(test.target, test_predicted)
def question_c():
    logging.info("<Question C> Getting the significance and TFxICF representation")
    all_categories = train_full_set.target_names
    all_docs_per_category = []
    classes_list = [
        train_full_set.target_names.index("comp.sys.ibm.pc.hardware"),
        train_full_set.target_names.index("comp.sys.mac.hardware"),
        train_full_set.target_names.index("misc.forsale"),
        train_full_set.target_names.index("soc.religion.christian")
    ]
    logging.info("Store data from all docs of a certain category as entries in all_docs_per_category")
    for cat in all_categories:
        train_category = utils.fetch_data([cat])[0]
        data_category = train_category.data
        temp = ''
        for doc in data_category:
            temp += ' ' + doc
        all_docs_per_category.append(temp)

    logging.info("Now build frequency tables for each class")
    vectorized_newsgroups_train = utils.remove_stop_words(all_docs_per_category)
    print(vectorized_newsgroups_train.shape)

    max_term_freq_per_category = [0] * vectorized_newsgroups_train.shape[0]
    category_count_per_term = [0] * vectorized_newsgroups_train.shape[1]
    for i in range(vectorized_newsgroups_train.shape[0]):
        max_term_freq_per_category[i] = max(vectorized_newsgroups_train[i].data)
    category_count_per_term = vectorized_newsgroups_train.sum(axis=0)
    print(max_term_freq_per_category)
    print(category_count_per_term)
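# Hedged sketch of turning the two statistics computed above into a TF-ICF
# score. It uses one common variant, tficf(t, c) = tf(t, c) * log(|C| / cf(t)),
# where cf(t) is the number of categories containing term t; the original
# project may define the weighting differently, so treat this as illustrative.
import numpy as np

def tficf(term_freq_matrix):
    """term_freq_matrix: dense (num_categories x num_terms) array of raw counts."""
    tf = np.asarray(term_freq_matrix, dtype=float)
    num_categories = tf.shape[0]
    category_count_per_term = (tf > 0).sum(axis=0)              # cf(t)
    icf = np.log(num_categories / np.maximum(category_count_per_term, 1))
    return tf * icf                                             # one score per (category, term)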
def c(self):
    print_question('c')
    allDoc = []
    for cat in allCat:
        data = fetch_data([cat], 'train').data
        poke = ""
        for doc in data:
            poke = poke + " " + doc
        allDoc.append(poke)
    vectors_full = self.to_vec(allDoc)
    tficf_train = self.to_tfidf(vectors_full)
    tficf_train_copy = tficf_train.copy()
    features = self.vectorizer.get_feature_names()
    # for each of the 4 categories, report the 10 highest-weighted terms
    for i in range(4):
        words = []
        for j in range(10):
            doc = tficf_train_copy[i]
            max_index = np.argmax(doc)
            words.append(features[max_index])
            tficf_train_copy[i, max_index] = 0
        print allCat[i], words
def getPdfCOVID19():
    # Get data from Google Spreadsheets
    raw_data = fetch_data()[0]

    # Create a JSON from data.
    column_names = raw_data[0]
    final_data = []
    for data in raw_data[2:]:
        single_data = dict()
        counter = 0
        for col in column_names:
            single_data[col] = data[counter]
            counter += 1
        final_data.append(single_data)

    mapping_id = "1_iE1D8Pvsq7SQMMjHWhOhidGvkXENiluq01RXvb3n5g"
    doc_template_id = "1g7EvvBPsMi2kXyg0am-iRZ72DJNZNtyUrRwKieXhWn0"
    application_id = "Covid19"
    username = "******"
    unique_ids = []
    print(db)
    for data in final_data:
        json_data = pdfData(reqd_data=data,
                            var_mapping_id=mapping_id,
                            doc_template_id=doc_template_id,
                            form_id=None,
                            form_name=None,
                            instance_id=uuid.uuid4(),
                            user_name=username,
                            application_id=application_id,
                            form_submission_date=None)
        db.session.add(json_data)  # Adds the new record to the database session
        # Push the object to the database so that it gets assigned a unique id
        db.session.flush()
        unique_ids.append(json_data.unique_id)
    db.session.commit()  # Commits all changes
    status = 'submitted'
    return {"status": status, "uniqueId": unique_ids}
def evaluate(model, training_data, training_count, session, step, train_loss=None, epoch=None, summaries=None, writer=None, saver=None): #Get theta for the H1. data_url = os.path.join(FLAGS.data_dir, 'valid_h1.feat' if step != 'test' else 'test_h1.feat') dataset, dataset_count = utils.data_set(data_url) data_batches = utils.create_batches(len(dataset), FLAGS.batch_size, shuffle=False) theta = [] for idx_batch in data_batches: data_batch, count_batch, mask = utils.fetch_data(dataset, dataset_count, idx_batch, FLAGS.vocab_size) input_feed = {model.x.name: data_batch, model.mask.name: mask} logit_theta = session.run(model.doc_vec, input_feed) theta.append(softmax(logit_theta, axis=1)) theta = np.concatenate(theta, axis=0) weights = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='decoder/projection/Matrix:0')[0].eval(session) bias = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='decoder/projection/Bias:0')[0].eval(session) beta = softmax(weights + bias, axis=1) #H2 to calculate perplexity. data_url = os.path.join(FLAGS.data_dir, 'valid_h2.feat' if step != 'test' else 'test_h2.feat') dataset, dataset_count = utils.data_set(data_url) data_batches = utils.create_batches(len(dataset), FLAGS.batch_size, shuffle=False) test_data = [utils.fetch_data(dataset, dataset_count, idx_batch, FLAGS.vocab_size)[0] for idx_batch in data_batches] test_data = np.concatenate(test_data, axis=0) perplexity = get_perplexity(test_data, theta, beta) coherence = get_topic_coherence(beta, training_data, 'nvdm') if step == 'test' else np.nan diversity = get_topic_diversity(beta, 'nvdm') if step == 'test' else np.nan if step == 'val': #tloss = tf.get_default_graph().get_tensor_by_name('tloss:0') #vppl = tf.get_default_graph().get_tensor_by_name('vppl:0') #weight_summaries = session.run(summaries, feed_dict={tloss: train_loss, vppl: perplexity}) #weight_summaries = summaries.eval(session=session) #writer.add_summary(weight_summaries, epoch) save_path = saver.save(session, os.path.join(ckpt, 'model.ckpt')) print("Model saved in path: %s" % ckpt) print('| Epoch dev: {:d} |'.format(epoch+1)) else: ## get most used topics cnt = 0 thetaWeightedAvg = np.zeros((1, FLAGS.n_topic)) data_batches = utils.create_batches(len(training_data), FLAGS.batch_size, shuffle=False) for idx_batch in data_batches: batch, count_batch, mask = utils.fetch_data(training_data, training_count, idx_batch, FLAGS.vocab_size) sums = batch.sum(axis=1) cnt += sums.sum(axis=0) input_feed = {model.x.name: batch, model.mask.name: mask} logit_theta = session.run(model.doc_vec, input_feed) theta = softmax(logit_theta, axis=1) weighed_theta = (theta.T * sums).T thetaWeightedAvg += weighed_theta.sum(axis=0) thetaWeightedAvg = thetaWeightedAvg.squeeze() / cnt print('\nThe 10 most used topics are {}'.format(thetaWeightedAvg.argsort()[::-1][:10])) with open(FLAGS.data_dir + '/vocab.new', 'rb') as f: vocab = pkl.load(f) topic_indices = list(np.random.choice(FLAGS.n_topic, 10)) # 10 random topics print('\n') with open(ckpt + '/topics.txt', 'w') as f: for k in range(FLAGS.n_topic): gamma = beta[k] top_words = list(gamma.argsort()[-FLAGS.n_words+1:][::-1]) topic_words = [vocab[a] for a in top_words] f.write(str(k) + ' ' + str(topic_words) + '\n') print('Topic {}: {}'.format(k, topic_words)) with open(ckpt + '/' + step + '_scores.csv', 'a') as handle: handle.write(str(perplexity) + ',' + str(coherence) + ',' + str(diversity) + '\n')
def get_weighted_scores(assignment, sess=None, ta_mean=TA_MEAN, ta_stdev=TA_STDEV, lambda_=TA_LAMBDA): @ensure_matrix def normalize_transform(data): mean = data.mean() std = data.std() return ((data - mean) / (std if std else 1)) * ta_stdev + ta_mean @ensure_matrix def bc_transform(data): normal, _ = boxcox(data) transform = inv_boxcox(normal, lambda_) return normalize_transform(transform) def transform_group(score_dict): ''' Flatten dict of scores, transform, then regroup ''' indexed = [[i, uid, score] for uid in score_dict for i, score in enumerate(score_dict[uid])] idxs, scores = [ix[:2] for ix in indexed], [ix[2] for ix in indexed] if not len(idxs): print( 'No student feedback has been given yet - cannot calculate weighted scores.' ) return {} transformed = [i + [k] for i, k in zip(idxs, bc_transform(scores))] transformed = { k: [v[2] for v in sorted(list(grp), key=lambda y: y[0]) ] # Sort by index and keep score for k, grp in groupby(transformed, lambda x: x[1]) } # Group by UID return transformed data = fetch_data(assignment, sess) st_score = [d[k] for d in data for k in d if d[k] and 'score' in k] st_mean = np.mean(st_score) st_stdev = np.std(st_score) # Collate the scores each student gave st_score = dd(list) for d in data: for i in range(1, 5): uid = d['student_display_id_%i' % i] score = d['student_score_%i' % i] if uid and score: st_score[uid].append(score) # Get kurtosis/skewness tranformed scores transformed = transform_group(st_score) # Calculate mean & stdev for each student stats = {} for s in st_score: if st_score[s]: bc = bc_transform(st_score[s]) mean = np.mean(st_score[s]) std = np.std(st_score[s]) else: bc = [None] * len(st_score[s]) mean = ta_mean std = 1 bc_ind = dict(zip(st_score[s], bc)) bc_tog = dict(zip(st_score[s], transformed[s])) stats[s] = (mean, std, bc_ind, bc_tog) # Create new scores based on normalized student scores averaged = {} all_ = [] for d in data: scores = [] name = d['First Name (Student)'].strip( ) + ' ' + d['Last Name (Student)'].strip() for i in range(1, 5): uid = d['student_display_id_%i' % i] score = d['student_score_%i' % i] if uid and score: mean, std, bc_ind, bc_tog = stats[uid] if std == 0: std = 1 ind = ((score - mean) / std) * ta_stdev + ta_mean tog = ((score - st_mean) / st_stdev) * ta_stdev + ta_mean avg = (ind + tog) / 2. bc_ind = bc_ind[score] if bc_ind[score] else avg bc_tog = bc_tog[score] final = np.mean([avg, bc_ind, bc_tog]) scores.append(final) averaged[name.lower().strip()] = np.mean(scores) if scores else ta_mean # Final Box-Cox transform scores = [averaged[k] for k in sorted(averaged.keys())] transform = bc_transform(scores) averaged = { k: round(transform[i], 2) for i, k in enumerate(sorted(averaged.keys())) } return averaged
def analyze_spreadsheet(assignment): ''' Analyze an assignment's scores after TA grading is completed ''' from glob import glob from scipy.stats import ks_2samp import matplotlib.pyplot as plt import matplotlib.mlab as mlab for folder in glob('./assignments/*/'): if assignment.lower() in folder.lower(): folder = folder.replace('\\', '/') assignment = folder.split('/')[1] break data = fetch_data(assignment, overwrite=OVERWRITE) exclude = [] data = [ d for d in data if d['TA Name (First and Last)'] not in exclude and d['TA Score'] ] TAs = sorted(set([d['TA Name (First and Last)'] for d in data])) print('Number of grading TAs: %i' % len(TAs)) st_scores = [d[k] for d in data for k in d if d[k] and 'score' in k] print('\nStudent Statistics:') st_mean, st_stdev, st_skew, st_kurt = distribution_stats(st_scores) ta_scores = [d['TA Score'] for d in data] if not any(ta_scores): print('Remaining analysis can\'t be completed without TA scores.') return print('\nTA Statistics:') ta_mean, ta_stdev, ta_skew, ta_kurt = distribution_stats(ta_scores) indexed_scores = [(i, d[k] if d[k] else ta_mean) for i, d in enumerate(data) for k in ['student_score_%i' % j for j in range(1, 5)] if k in d and d[k] is not None] idxs, scores = [ix[0] for ix in indexed_scores ], [ix[1] for ix in indexed_scores] ks_transform = ks_align(scores, ta_scores) ks_transform = zip(idxs, ks_transform) idx = 0 curr = [] transformed = [] for i, score in ks_transform: if i == idx: curr.append(score) else: transformed.append(curr) curr = [score] idx = i if curr: transformed.append(curr) # Collate the scores each student gave st_score = dd(list) st_weight = dd(list) idx = inc = 0 for d in data: scores = get_student_scores(d) for i in range(1, 5): uid = d['student_display_id_%i' % i] score = d['student_score_%i' % i] avg = np.mean(scores) if scores else ta_mean if score == 0: d['student_score_%i' % i] = score = avg if uid and score: # Add student weights - signed difference from the mean, on average st_score[uid].append(score) st_weight[uid].append((avg - score)) d['ks_Score_%i' % i] = transformed[idx][i - 1] if not i - 1: inc = 1 idx += inc inc = 0 # Make a copy for future comparison raw_data = [dict(d) for d in data] # Calculate mean & stdev for each student stats = {} weights = {} for s in st_score: if st_score[s]: ks = ks_align(st_score[s], ta_scores) mean = np.mean(st_score[s]) std = np.std(st_score[s]) else: ks = [None] * len(st_score[s]) mean = ta_mean std = 1 stats[s] = (mean, std, dict(zip(st_score[s], ks))) weights[s] = np.mean(st_weight[s]) # Create new scores based on normalized student scores individual = [] # Normalize each student by using their three grades together = [] # Normalize scores based on overall student mean & stdev averaged = [] # Normalize based on average of individual and together for d in raw_data: individual.append(dict(d)) together.append(dict(d)) averaged.append(dict(d)) for i in range(1, 5): uid = d['student_display_id_%i' % i] score = d['student_score_%i' % i] if uid and score: d['student_weight_%i' % i] = weights[d['student_display_id_%i' % i]] mean, std, normed = stats[uid] if std == 0: std = 1 ind = ((score - mean) / std) * ta_stdev + ta_mean tog = ((score - st_mean) / st_stdev) * ta_stdev + ta_mean avg = (ind + tog) / 2. 
ks_norm = normed[score] ks_norm2 = d['ks_Score_%i' % i] avg = np.mean([avg, ks_norm2, ks_norm if ks_norm else avg]) individual[-1]['student_score_%i' % i] = ind together[-1]['student_score_%i' % i] = tog averaged[-1]['student_score_%i' % i] = avg D = averaged ta_scores = [ d for d in D if any(k for k in ['student_score_%i' % j for j in range(1, 5)] if k in d and d[k] is not None) ] scores = [(i, np.mean([ d[k] for k in ['student_score_%i' % j for j in range(1, 5)] if d[k] ])) for i, d in enumerate(ta_scores)] ta_scores = [(i, d['TA Score']) for i, d in enumerate(ta_scores)] ta_scores = sorted(ta_scores, key=lambda x: x[1]) idxs = [t for t, _ in scores] # scores = [t for _,t in scores] scores = ks_align([t for _, t in scores], [t for _, t in ta_scores]) scores = np.array(scores) scores = ((scores - scores.mean()) / scores.std()) * ta_stdev + ta_mean i = inc = 0 for a in averaged: for k in ['student_score_%i' % j for j in range(1, 5)]: if k in a and a[k] is not None: a[k] = scores[i] inc = 1 i += inc inc = 0 D = averaged ta_scores = [ d for d in D if any(k for k in ['student_score_%i' % j for j in range(1, 5)] if k in d and d[k] is not None) ] scores = dict([(i, np.mean([ d[k] for k in ['student_score_%i' % j for j in range(1, 5)] if d[k] ])) for i, d in enumerate(ta_scores)]) st_scores = get_weighted_scores(assignment) st_scores = [ st_scores[(d['First Name (Student)'].strip() + ' ' + d['Last Name (Student)'].strip()).lower()] for d in ta_scores ] ta_scores = [(i, d['TA Score']) for i, d in enumerate(ta_scores)] ta_scores = sorted(ta_scores, key=lambda x: x[1]) # idxs = [t for t,_ in scores] # scores = dict(zip(idxs, scores)) st_scores = [scores[j] for j, _ in ta_scores] plt.plot([t for _, t in ta_scores], label='TA Scores') plt.plot(st_scores, alpha=.5, label='Weighted Scores') x = range(len(st_scores)) coef = np.polyfit(x, st_scores, 3) p = np.poly1d(coef) yhat = p(x) ybar = np.sum(st_scores) / float(len(st_scores)) ssreg = np.sum((yhat - ybar)**2) sstot = np.sum((st_scores - ybar)**2) print('R^2:%s' % ssreg / sstot) ks = ks_2samp(st_scores, [t for _, t in ta_scores]) s = ks.statistic pv = ks.pvalue x = np.linspace(0, len(st_scores), 100) plt.plot(x, p(x), label='Cubic fit (R^2=%.2f, KS-statistic=%.2f pval=%.2f)' % (ssreg / sstot, s, pv)) coef = np.polyfit(range(len(st_scores)), st_scores, 1) p = np.poly1d(coef) plt.plot(x, p(x), 'k--', label='Linear fit') plt.legend(loc='lower center', fancybox=True, prop={'size': 9}) plt.title(assignment.title()) plt.show() all_data = [('Raw Score', raw_data), ('Averaged', averaged), ('-Individual', individual), ('-Together', together)] # Sanity check to ensure scores were properly normalized for name, data in all_data[1:]: data = [d[k] for d in data for k in d if d[k] and 'score' in k] mean = np.mean(data) stdev = np.std(data) assert(abs(mean - ta_mean) < 1 and abs(stdev - ta_stdev) < 1), \ 'Scores improperly normalized: %.2f & %.2f for %s' %(mean, stdev, name) safe_diff = lambda score, data: 0 if not data else score - np.mean(data) difference = lambda data: [ safe_diff(d['TA Score'], [ d['student_score_%i' % i] for i in range(1, 5) if d['student_score_%i' % i] ]) for d in data ] print('-----------------------------\n\nCrowd / TA score differences:') # Calculate the difference of crowd-sourced score from individual TA score for ta in TAs: ta_data = [ d['TA Score'] for d in raw_data if d['TA Name (First and Last)'] == ta ] print('\n-----', ta, '-----') print('Overall mean & stdev:\t\t\t %.2f & %.2f' % (np.mean(ta_data), np.std(ta_data))) for label, data 
in all_data: ta_data = [d for d in data if d['TA Name (First and Last)'] == ta] diff = difference(ta_data) mu, sig = np.mean(np.abs(diff)), np.std(np.abs(diff)) print(label, 'difference mean & stdev:\t %.2f & %.2f' % (mu, sig)) # Plot best fit line for averaged if label == 'Averaged': hist, n = np.histogram(diff, 50) plt.plot(n, mlab.normpdf(n, mu, sig), alpha=.7, label=ta) print('\n=========== Overall =============') # Calculate the overall difference of crowd-sourced score from all TA scores for label, data in all_data: diff = difference(data) mu, sig = np.mean(np.abs(diff)), np.std(np.abs(diff)) print(label, 'difference mean & stdev:\t %.2f & %.2f' % (mu, sig)) # Plot best fit line for averaged if label == 'Averaged': hist, n = np.histogram(diff, 50) plt.plot(n, mlab.normpdf(n, mu, sig), '--') plt.legend(loc='best', prop={'size': 7}) plt.show() # Join all data into one cohesive set dataset = [] for i, d in enumerate(raw_data): # All sets need same amount of features if not d['student_score_3']: continue features = [] for j in range(1, 4): features.append(d['student_score_%i' % j]) features.append(len(d['student_comment_%i' % j])) features.append(d['student_weight_%i' % j]) for _, data in all_data[1:]: features.append(data[i]['student_score_%i' % j]) features.append(d['TA Score']) dataset.append(features) with open('assignments/' + assignment.title() + '/dataset.csv', 'w+') as f: for d in dataset: f.write(','.join([str(v) for v in d]) + '\n')
def organizations(request): """ To fetch and update the Organizations entity """ print("\n-------Getting Organizations entities-------\n") END_POINT = 'searches/organizations' YESTURDAY_DATE = utils.get_yesterday_date() TODAY_DATE = utils.get_today_date() COLLECTION_NAME = 'organization_entities' QUERY = { "field_ids": [ "acquirer_identifier", "aliases", "categories", "category_groups", "closed_on", "company_type", "contact_email", "created_at", "delisted_on", "demo_days", "description", "diversity_spotlights", "entity_def_id", "equity_funding_total", "exited_on", "facebook", "facet_ids", "founded_on", "founder_identifiers", "funding_stage", "funding_total", "funds_total", "hub_tags", "identifier", "image_id", "image_url", "investor_identifiers", "investor_stage", "investor_type", "ipo_status", "last_equity_funding_total", "last_equity_funding_type", "last_funding_at", "last_funding_total", "last_funding_type", "layout_id", "legal_name", "linkedin", "listed_stock_symbol", "location_group_identifiers", "location_identifiers", "name", "num_acquisitions", "num_alumni", "num_articles", "num_current_advisor_positions", "num_current_positions", "num_diversity_spotlight_investments", "num_employees_enum", "num_enrollments", "num_event_appearances", "num_exits", "num_exits_ipo", "num_founder_alumni", "num_founders", "num_funding_rounds", "num_funds", "num_investments", "num_investors", "num_lead_investments", "num_lead_investors", "num_past_positions", "num_portfolio_organizations", "num_sub_organizations", "operating_status", "override_layout_id", "owner_identifier", "permalink", "permalink_aliases", "phone_number", "program_application_deadline", "program_duration", "program_type", "rank_delta_d30", "rank_delta_d7", "rank_delta_d90", "rank_org", "rank_principal", "revenue_range", "school_method", "school_program", "school_type", "short_description", "status", "stock_exchange_symbol", "stock_symbol", "twitter", "updated_at", "uuid", "valuation", "valuation_date", "website", "website_url", "went_public_on" ], "order": [{ "field_id": "rank_org", "sort": "asc" }], "query": [ { "type": "predicate", "field_id": "updated_at", "operator_id": "gte", "values": [str(YESTURDAY_DATE)] }, ], "limit": 1000 } total_count, entities = utils.fetch_data(QUERY, END_POINT) if total_count is None: return "Error in parsing the API response. Please check the logs." print("total count: ", total_count) # get the organization collection org_col = utils.get_mongodb_collection(COLLECTION_NAME) fetch_records_count = 0 while fetch_records_count < total_count: if fetch_records_count != 0: _, entities = utils.fetch_data(QUERY, END_POINT) if not entities: print("no entities left i.e., entities = %s. moving on." % len(entities)) break for e in entities: if e: e['insert_date'] = TODAY_DATE else: print("Entity is empty: ", e) inserted = org_col.insert_many(entities) fetch_records_count += len(entities) print("inserted records: ") pprint(inserted.inserted_ids) print("total_count: ", total_count, ", fetched records: ", fetch_records_count) # get the last record print("------------------------") after_id = entities[-1].get('uuid', None) if after_id: print("Get next batch after id: ", after_id) # print("Entities len: ", ) QUERY['after_id'] = after_id entities.clear() msg = { 'entity': 'Organization', 'total_record_updated': fetch_records_count } return jsonify(msg)
def press_references(request): """ To fetch and update the Press References entity. """ print("\n-------Getting Press References entities-------\n") COLLECTION_NAME = 'press_reference_entities' END_POINT = 'searches/press_references' TODAY_DATE = utils.get_today_date() YESTURDAY_DATE = utils.get_yesterday_date() QUERY = { "field_ids": [ "activity_entities", "author", "created_at", "entity_def_id", "identifier", "posted_on", "publisher", "thumbnail_url", "title", "updated_at", "url", "uuid" ], "query": [ { "type": "predicate", "field_id": "updated_at", "operator_id": "gte", "values": [str(YESTURDAY_DATE)] }, ], "order": [{ "field_id": "updated_at", "sort": "asc", "nulls": "last" }], "limit": 1000, } total_count, entities = utils.fetch_data(QUERY, END_POINT) if total_count is None: return "Error in parsing the API response. Please check the logs." print("total count: ", total_count) # get the press_references collection col = utils.get_mongodb_collection(COLLECTION_NAME) fetch_records_count = 0 # storing into the database and pagination while fetch_records_count < total_count: if fetch_records_count != 0: _, entities = utils.fetch_data(QUERY, END_POINT) if not entities: print("no entities left i.e., entities = %s. moving on." % len(entities)) break for e in entities: if e: e['insert_date'] = TODAY_DATE else: print("Entity is empty: ", e) inserted = col.insert_many(entities) fetch_records_count += len(entities) print("inserted records: ") pprint(inserted.inserted_ids) print("total_count: ", total_count, ", fetched records: ", fetch_records_count) print("------------------------") # get the last record after_id = entities[-1].get('uuid', None) if after_id: print("Get next batch after id: ", after_id) # print("Entities len: ", ) QUERY['after_id'] = after_id entities.clear() msg = { 'entity': 'press_references', 'total_record_updated': fetch_records_count } return jsonify(msg)
def acquisitions(request): """ To fetch and update the Acquisitions entity. """ print("\n-------Getting Acquisitions entities-------\n") COLLECTION_NAME = 'acquisitions_entities' END_POINT = 'searches/acquisitions' TODAY_DATE = utils.get_today_date() YESTURDAY_DATE = utils.get_yesterday_date() QUERY = { "field_ids": [ "acquiree_categories", "acquiree_funding_total", "acquiree_identifier", "acquiree_last_funding_type", "acquiree_locations", "acquiree_num_funding_rounds", "acquiree_revenue_range", "acquiree_short_description", "acquirer_categories", "acquirer_funding_stage", "acquirer_funding_total", "acquirer_identifier", "acquirer_locations", "acquirer_num_funding_rounds", "acquirer_revenue_range", "acquirer_short_description", "acquisition_type", "announced_on", "completed_on", "created_at", "disposition_of_acquired", "entity_def_id", "identifier", "permalink", "price", "rank_acquisition", "short_description", "status", "terms", "updated_at", "uuid" ], "query": [ { "type": "predicate", "field_id": "updated_at", "operator_id": "gte", "values": [str(YESTURDAY_DATE)] }, ], "order": [{ "field_id": "updated_at", "sort": "asc", "nulls": "last" }], "limit": 1000, } total_count, entities = utils.fetch_data(QUERY, END_POINT) if total_count is None: return "Error in parsing the API response. Please check the logs." print("total count: ", total_count) # get the acquisitions collection col = utils.get_mongodb_collection(COLLECTION_NAME) fetch_records_count = 0 # storing into the database and pagination while fetch_records_count < total_count: if fetch_records_count != 0: _, entities = utils.fetch_data(QUERY, END_POINT) if not entities: print("no entities left i.e., entities = %s. moving on." % len(entities)) break for e in entities: if e: e['insert_date'] = TODAY_DATE else: print("Entity is empty: ", e) inserted = col.insert_many(entities) fetch_records_count += len(entities) print("inserted records: ") pprint(inserted.inserted_ids) print("total_count: ", total_count, ", fetched records: ", fetch_records_count) print("------------------------") # get the last record after_id = entities[-1].get('uuid', None) if after_id: print("Get next batch after id: ", after_id) # print("Entities len: ", ) QUERY['after_id'] = after_id entities.clear() msg = { 'entity': 'acquisitions', 'total_record_updated': fetch_records_count } return jsonify(msg)
def plot(tickername):
    data = fetch_data(tickername).reset_index()
    fig = px.line(data, x='Date', y='Close')
    return fig.to_html()
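# Hedged usage sketch: expose the helper above from a tiny Flask app. The route
# name and the Flask wiring are illustrative assumptions; the original project
# may serve these HTML fragments differently.
from flask import Flask

app = Flask(__name__)

@app.route('/plot/<tickername>')
def plot_view(tickername):
    return plot(tickername)  # px.line chart of the closing price, rendered as HTML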
def train(sess, model, train_url, test_url, dev_url, model_url, batch_size, saver, training_epochs=400, alternate_epochs=1): """train nvctm model.""" train_set, train_count = utils.data_set(train_url) dev_set, dev_count = utils.data_set(dev_url) test_set, test_count = utils.data_set(test_url) dev_batches = utils.create_batches(len(dev_set), batch_size, shuffle=False) test_batches = utils.create_batches(len(test_set), batch_size, shuffle=False) train_theta = [] train_beta = [] for epoch in range(training_epochs): train_batches = utils.create_batches(len(train_set), batch_size, shuffle=True) # ------------------------------- # train for switch in range(0, 2): if switch == 0: optim = model.optim_dec print_mode = 'updating decoder' else: optim = model.optim_enc print_mode = 'updating encoder' for i in range(alternate_epochs): loss_sum = 0.0 ppx_sum = 0.0 kld_sum = 0.0 word_count = 0 doc_count = 0 res_sum = 0 log_sum = 0 mean_sum = 0 var_sum = 0 m = None Um = None enc = None for idx_batch in train_batches: data_batch, count_batch, mask = utils.fetch_data( train_set, train_count, idx_batch, FLAGS.vocab_size) input_feed = { model.x.name: data_batch, model.mask.name: mask } _, (loss, kld, mean, Umean, enc, rec_loss, log_s, mean_s, vk_show, theta, beta, lp, v) = sess.run((optim, [ model.objective, model.kld, model.mean, model.U, model.vk, model.recons_loss, model.log_squre, model.mean_squre, model.vk_show, model.theta, model.beta, model.log_prob, model.variance ]), input_feed) m = mean Um = Umean # print('*********************vk show', vk_show) # print('Umean', Umean[0]) loss_sum += np.sum(loss) kld_sum += np.sum(kld) / np.sum(mask) word_count += np.sum(count_batch) res_sum += np.sum(rec_loss) log_sum += np.sum(log_s) mean_sum += np.sum(mean_s) var_sum += np.sum(v) / np.sum(mask) # to avoid nan error count_batch = np.add(count_batch, 1e-12) # per document loss ppx_sum += np.sum(np.divide(loss, count_batch)) doc_count += np.sum(mask) if epoch == training_epochs - 1 and switch == 1 and i == alternate_epochs - 1: train_theta.extend(theta) train_beta.extend(beta) print_ppx = np.exp(loss_sum / word_count) # print_ppx_perdoc = np.exp(ppx_sum / doc_count) print_kld = kld_sum / len(train_batches) print_res = res_sum / len(train_batches) print_log = log_sum / len(train_batches) print_mean = mean_sum / len(train_batches) print_var = var_sum / len(train_batches) print( '| Epoch train: {:d} |'.format(epoch + 1), print_mode, '{:d}'.format(i), '| Corpus ppx: {:.5f}'.format( print_ppx), # perplexity per word # '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc), # perplexity for per doc '| KLD: {:.5}'.format(print_kld), '| stddev {:.5}'.format(print_var), '| res_loss: {:5}'.format(print_res), '| log_loss: {:5}'.format(print_log), '| mean_loss: {:5}'.format(print_mean)) with codecs.open('./nvctm_train_theta', 'wb') as fp: pickle.dump(np.array(train_theta), fp) fp.close() if (epoch + 1 ) % 50 == 0 and switch == 1 and i == alternate_epochs - 1: with codecs.open('./nvctm_train_beta', 'wb') as fp: pickle.dump(beta, fp) fp.close() npmi.print_coherence('nvctm', FLAGS.data_dir + '/train.feat', FLAGS.vocab_size) # dev loss_sum = 0.0 kld_sum = 0.0 ppx_sum = 0.0 var_sum = 0 word_count = 0 doc_count = 0 for idx_batch in dev_batches: data_batch, count_batch, mask = utils.fetch_data( dev_set, dev_count, idx_batch, FLAGS.vocab_size) input_feed = {model.x.name: data_batch, model.mask.name: mask} loss, kld, v = sess.run( [model.objective, model.kld, model.variance], input_feed) loss_sum += np.sum(loss) kld_sum += np.sum(kld) / 
np.sum(mask) var_sum += np.sum(v) / np.sum(mask) word_count += np.sum(count_batch) count_batch = np.add(count_batch, 1e-12) ppx_sum += np.sum(np.divide(loss, count_batch)) doc_count += np.sum(mask) print_ppx = np.exp(loss_sum / word_count) print_var = var_sum / len(train_batches) # print_ppx_perdoc = np.exp(ppx_sum / doc_count) print_kld = kld_sum / len(dev_batches) print('\n| Epoch dev: {:d}'.format(epoch + 1), '| Perplexity: {:.9f}'.format(print_ppx), '| stddev {:.5}'.format(print_var), '| KLD: {:.5}'.format(print_kld)) # test if FLAGS.test: loss_sum = 0.0 kld_sum = 0.0 ppx_sum = 0.0 var_sum = 0.0 word_count = 0 doc_count = 0 for idx_batch in test_batches: data_batch, count_batch, mask = utils.fetch_data( test_set, test_count, idx_batch, FLAGS.vocab_size) input_feed = {model.x.name: data_batch, model.mask.name: mask} loss, kld, v = sess.run( [model.objective, model.kld, model.variance], input_feed) loss_sum += np.sum(loss) kld_sum += np.sum(kld) / np.sum(mask) var_sum += np.sum(v) / np.sum(mask) word_count += np.sum(count_batch) count_batch = np.add(count_batch, 1e-12) ppx_sum += np.sum(np.divide(loss, count_batch)) doc_count += np.sum(mask) print_ppx = np.exp(loss_sum / word_count) print_var = var_sum / len(train_batches) # print_ppx_perdoc = np.exp(ppx_sum / doc_count) print_kld = kld_sum / len(test_batches) print('| Epoch test: {:d}'.format(epoch + 1), '| Perplexity: {:.9f}'.format(print_ppx), '| stddev {:.5}'.format(print_var), '| KLD: {:.5}\n'.format(print_kld)) npmi.print_coherence('nvctm', FLAGS.data_dir + '/train.feat', FLAGS.vocab_size) saver.save(sess, model_url)
# )
st.sidebar.title("Simple Pattern Finder")
st.sidebar.image("img/pattern.png")
symbol = st.sidebar.text_input(label='Symbol', value='SPY')

today = date.today()
delta = timedelta(days=50)
start = today - delta
start_date = st.sidebar.date_input(label='From :', value=start)
end_date = st.sidebar.date_input(label='To :')

ticker_info = yf.Ticker(symbol)
df = utils.fetch_data(symbol, start_date=start_date, end_date=end_date)

try:
    st.title(ticker_info.info['shortName'])
except:
    st.error('No data found, symbol may be delisted')
    st.stop()

fig = go.Figure(data=[
    go.Candlestick(x=df.index,
                   open=df['Open'],
                   high=df['High'],
                   low=df['Low'],
                   close=df['Close'])
])
def main(inp, method, training_size, epoch): sample_func, data = sampling_methods[method] if data[-4:] == '.pth': data = torch.load(data) device = torch.device('cpu') batch_size = 128 budget = epoch config = { 'ndim': 250, 'sdim': 56, 'num_gnn_layers': 2, 'g_aggr': 'gsum', 'num_acc_layers': 4, 'lr': 0.00001, } t0 = time() test_dataset = fetch_data('data/test_data_20.pth') logging.info('Loaded test graphs in {} sec.'.format(round(time() - t0, 2))) t0 = time() val_dataset = fetch_data('data/validation_data_10.pth') logging.info('Loaded validation graphs model in {} sec.'.format( round(time() - t0, 2))) test_loader = DataLoader(test_dataset, batch_size=2048, shuffle=False) val_loader = DataLoader(val_dataset, batch_size=2048, shuffle=False) criterion = nn.MSELoss() for num in [training_size]: ratio = np.round(num / 100, 2) run_name = '{}{}'.format(method, num) rmse_list = list() all_loss = list() best_rmse_list = [] for step in range(5): logger.info('sampling') sampled_dataset = sample_func(ratio, data) train_loader = DataLoader(sampled_dataset, batch_size=batch_size, shuffle=True) logger.info('start run {}_{} with {}% ({} graphs) '.format( run_name, step + 1, num, len(sampled_dataset))) model = GNNpred(config['ndim'], config['sdim']).to(device) optimizer = torch.optim.Adam(model.parameters(), lr=config['lr']) best_val = np.inf best_test = np.inf for epoch in range(int(budget)): loss = 0 model.train() running_loss = torch.Tensor().to(device) for i, graph_batch in enumerate(train_loader): graph_batch = graph_batch.to(device) optimizer.zero_grad() output = model(graph_batch.edge_index, graph_batch.node_atts, graph_batch.batch.to(device)) loss = criterion(output.view(-1), graph_batch.acc) running_loss = torch.cat([running_loss, loss.view(-1)]) loss.backward() optimizer.step() loss = torch.sqrt(torch.mean(running_loss)).item() all_loss.append(loss) logger.info('epoch {}:\tloss = {}'.format( epoch, my_round(loss, 4))) val_rmse, _, val_acc = evaluate(model, val_loader, device) logger.info('epoch {}:\tval_rmse = {}'.format( epoch, my_round(val_rmse, 4))) test_rmse, test_mae, _ = evaluate(model, test_loader, device) logger.info('epoch {}:\ttest_rmse = {}'.format( epoch, my_round(test_rmse, 4))) logger.info('epoch {}:\ttest_mae = {}'.format( epoch, my_round(test_mae, 4))) if val_rmse < best_val: best_val = val_rmse best_test = test_rmse # save(model, run_name) rmse_list.append(best_val) best_rmse_list.append(best_val) logger.info('step {}:\tbest_test = {}'.format( step + 1, my_round(best_test, 4))) torch.save(rmse_list, path_results + '/{}_all_rmse.pth'.format(run_name)) logger.info('Saved all validation rmse to {}'.format(path_results)) torch.save(best_rmse_list, path_results + '/{}_best_rmse.pth'.format(run_name)) logger.info('Saved best validation rmse of each run to {}'.format( path_results)) torch.save(all_loss, path_results + '/{}_loss.pth'.format(run_name)) logger.info('Saved trainings loss to {}'.format(path_results)) torch.save(val_acc, path_saved_acc + '/{}_val_acc.pth'.format(run_name)) logger.info( 'Saved true and predicted accuracy of validation set to {}'. format(path_saved_acc)) _, _, train_acc = evaluate(model, train_loader, device) torch.save(train_acc, path_saved_acc + '/{}_train_acc.pth'.format(run_name)) logger.info( 'Saved true and predicted accuracy of training set to {}'. format(path_saved_acc)) logger.info('epoch {}:\ttest_rmse = {}'.format( epoch, my_round(test_rmse, 4))) return loss, val_rmse, test_rmse, test_mae, model.number_of_parameters()
def train(sess, model, train_url, test_url, dev_url, batch_size, training_epochs=1000, alternate_epochs=1): """train gsm model.""" train_set, train_count = utils.data_set(train_url) test_set, test_count = utils.data_set(test_url) dev_set, dev_count = utils.data_set(dev_url) dev_batches = utils.create_batches(len(dev_set), batch_size, shuffle=False) test_batches = utils.create_batches(len(test_set), batch_size, shuffle=False) kld_list = [] var_list = [] train_theta = [] train_beta = [] test_theta = [] test_beta = [] for epoch in range(training_epochs): train_batches = utils.create_batches(len(train_set), batch_size, shuffle=True) # ------------------------------- # train for switch in range(0, 2): if switch == 0: optimize = model.optimize_dec print_mode = 'updating decoder' elif switch == 1: optimize = model.optimize_enc print_mode = 'updating encoder' for i in range(alternate_epochs): loss_sum = 0.0 ppx_sum = 0.0 kld_sum = 0.0 word_count = 0 doc_count = 0 var_sum = 0 for idx_batch in train_batches: data_batch, count_batch, mask = utils.fetch_data( train_set, train_count, idx_batch, FLAGS.vocab_size) input_feed = {model.x.name: data_batch, model.mask.name: mask, model.is_training.name: True, model.gamma.name: epoch/training_epochs} _, (loss, kld, v, theta, beta) =\ sess.run((optimize, [model.reconstruction_loss, model.kld, model.variance, model.topic_dist, model.beta]), input_feed) loss_sum += np.sum(loss) kld_sum += np.sum(kld) / np.sum(mask) var_sum += np.sum(v) / np.sum(mask) # print([np.max(theta[i]) for i in range(batch_size)]) # print([np.argmax(theta[i]) for i in range(batch_size)]) word_count += np.sum(count_batch) # to avoid nan error count_batch = np.add(count_batch, 1e-12) # per document loss ppx_sum += np.sum(np.divide(loss, count_batch)) doc_count += np.sum(mask) if epoch == training_epochs - 1 and switch == 1 and i == alternate_epochs - 1: train_theta.extend(theta) train_beta.extend(beta) print_ppx = np.exp(loss_sum / word_count) print_ppx_perdoc = np.exp(ppx_sum / doc_count) print_kld = kld_sum / len(train_batches) print_var = var_sum / len(train_batches) kld_list.append(print_kld) var_list.append(print_var) print('| Epoch train: {:d}'.format(epoch + 1), print_mode, '{:d}'.format(i + 1), '| Corpus ppx: {:.5f}'.format(print_ppx), # perplexity for all docs '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc), # perplexity for per doc '| KLD: {:.5}'.format(print_kld), '| stddev {:.5}'.format(print_var)) with codecs.open('./gsm_train_theta', 'wb') as fp: pickle.dump(np.array(train_theta), fp) fp.close() if (epoch + 1) % 50 == 0 and switch == 1 and i == alternate_epochs - 1: with codecs.open('./gsm_train_beta', 'wb') as fp: pickle.dump(beta, fp) fp.close() npmi.print_coherence('gsm', FLAGS.data_dir + '/train.feat', FLAGS.vocab_size) # ------------------------------- # dev loss_sum = 0.0 kld_sum = 0.0 ppx_sum = 0.0 word_count = 0 doc_count = 0 var_sum = 0 for idx_batch in dev_batches: data_batch, count_batch, mask = utils.fetch_data(dev_set, dev_count, idx_batch, FLAGS.vocab_size) input_feed = {model.x.name: data_batch, model.mask.name: mask, model.is_training.name: False, model.gamma.name: 0} loss, kld, v = sess.run([model.objective, model.kld, model.variance], input_feed) loss_sum += np.sum(loss) kld_sum += np.sum(kld) / np.sum(mask) word_count += np.sum(count_batch) count_batch = np.add(count_batch, 1e-12) ppx_sum += np.sum(np.divide(loss, count_batch)) var_sum += np.sum(v) / np.sum(mask) doc_count += np.sum(mask) print_ppx = np.exp(loss_sum / word_count) print_ppx_perdoc = 
np.exp(ppx_sum / doc_count) print_kld = kld_sum / len(dev_batches) print_var = var_sum / len(train_batches) print('\n| Epoch dev: {:d}'.format(epoch + 1), '| Perplexity: {:.9f}'.format(print_ppx), '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc), '| KLD: {:.5}'.format(print_kld), '| stddev: {:.5}'.format(print_var)) # test if FLAGS.test: loss_sum = 0.0 kld_sum = 0.0 ppx_sum = 0.0 word_count = 0 doc_count = 0 for idx, idx_batch in enumerate(test_batches): data_batch, count_batch, mask = utils.fetch_data( test_set, test_count, idx_batch, FLAGS.vocab_size) input_feed = {model.x.name: data_batch, model.mask.name: mask, model.is_training.name: False, model.gamma.name: 0} loss, kld, theta, beta, v = sess.run([model.objective, model.kld, model.topic_dist, model.beta, model.variance], input_feed) loss_sum += np.sum(loss) kld_sum += np.sum(kld) / np.sum(mask) word_count += np.sum(count_batch) count_batch = np.add(count_batch, 1e-12) ppx_sum += np.sum(np.divide(loss, count_batch)) doc_count += np.sum(mask) test_theta.extend(theta) if idx == len(test_batches) - 1: test_beta.extend(beta) print_ppx = np.exp(loss_sum / word_count) print_ppx_perdoc = np.exp(ppx_sum / doc_count) print_kld = kld_sum / len(test_batches) print('| Epoch test: {:d}'.format(epoch + 1), '| Perplexity: {:.9f}'.format(print_ppx), '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc), '| KLD: {:.5}'.format(print_kld), '| stddev: {:.5}\n'.format(print_var)) npmi.print_coherence('gsm', FLAGS.data_dir + '/train.feat', FLAGS.vocab_size) with codecs.open('./test_theta', 'wb') as fp: pickle.dump(test_theta, fp) fp.close() with codecs.open('./test_beta', 'wb') as fp: pickle.dump(test_beta, fp) fp.close() with codecs.open('./kld.txt', 'w', 'utf-8') as fp: for idx, kld in enumerate(kld_list): if idx < len(kld_list) - 1: fp.write(str(kld) + ', ') else: fp.write(str(kld)) fp.close() with codecs.open('./var.txt', 'w', 'utf-8') as fp: for idx, var in enumerate(var_list): if idx < len(var_list) - 1: fp.write(str(var) + ', ') else: fp.write(str(var)) fp.close()
    [estimator_model.evaluate],
    input_dependencies=[splitter_cv_external, transform_data_external],
    name='estimator_model_external',
    flatten_inputs=[True, False],
    parallel=parameters['parallel'])

# Creating tree structure (for output/input flow)
splitter_cv_external.set_children_tasks([compressor_external])
compressor_external.set_children_tasks([transform_data_external])
transform_data_external.set_children_tasks([estimator_model_external])
logs.validate()

try:
    logs.info("Fetching and preprocessing input data...")
    stimuli_representations_paths, fMRI_paths = fetch_data(
        parameters['path_to_fmridata'], input_path, subject,
        parameters['language'], parameters['models'])
    stimuli_representations = transformer.process_representations(
        stimuli_representations_paths, parameters['models'])
    fMRI_data = transformer.process_fmri_data(
        fMRI_paths, masker, parameters['add_noise_to_constant'])
    logs.validate()

    logs.info("Executing pipeline...", end='\n')
    pipeline = Pipeline()
    # retrieve the flow from children and input_dependencies
    pipeline.fit(splitter_cv_external, logs)
    maps = pipeline.compute(stimuli_representations, fMRI_data, output_path,
def train(sess, model, train_url, test_url, batch_size, vocab_size, training_epochs=200, alternate_epochs=1,#10 lexicon=[], result_file='test.txt', B=1, warm_up_period=100): """train nvdm model.""" train_set, train_count = utils.data_set(train_url) test_set, test_count = utils.data_set(test_url) # hold-out development dataset train_size=len(train_set) validation_size=int(train_size*0.1) dev_set = train_set[:validation_size] dev_count = train_count[:validation_size] train_set = train_set[validation_size:] train_count = train_count[validation_size:] print('sizes',train_size,validation_size,len(dev_set),len(train_set)) optimize_jointly = True dev_batches = utils.create_batches(len(dev_set), batch_size, shuffle=False) test_batches = utils.create_batches(len(test_set), batch_size, shuffle=False) warm_up = 0 start_min_alpha = 0.00001 min_alpha = start_min_alpha warm_up_alpha=False start_B=4 curr_B=B #for early stopping best_print_ana_ppx=1e10 early_stopping_iters=30 no_improvement_iters=0 stopped=False epoch=-1 #for epoch in range(training_epochs): while not stopped: epoch+=1 train_batches = utils.create_batches(len(train_set), batch_size, shuffle=True) if warm_up<1.: warm_up += 1./warm_up_period else: warm_up=1. # train #for switch in range(0, 2): if optimize_jointly: optim = model.optim_all print_mode = 'updating encoder and decoder' elif switch == 0: optim = model.optim_dec print_mode = 'updating decoder' else: optim = model.optim_enc print_mode = 'updating encoder' for i in range(alternate_epochs): loss_sum = 0.0 ana_loss_sum = 0.0 ppx_sum = 0.0 kld_sum = 0.0 ana_kld_sum = 0.0 word_count = 0 doc_count = 0 recon_sum=0.0 for idx_batch in train_batches: data_batch, count_batch, mask = utils.fetch_data( train_set, train_count, idx_batch, vocab_size) input_feed = {model.x.name: data_batch, model.mask.name: mask,model.keep_prob.name: 0.75,model.warm_up.name: warm_up,model.min_alpha.name:min_alpha,model.B.name: curr_B} _, (loss,recon, kld,ana_loss,ana_kld) = sess.run((optim, [model.true_objective, model.recons_loss, model.kld,model.analytical_objective,model.analytical_kld]), input_feed) loss_sum += np.sum(loss) ana_loss_sum += np.sum(ana_loss) kld_sum += np.sum(kld) / np.sum(mask) ana_kld_sum += np.sum(ana_kld) / np.sum(mask) word_count += np.sum(count_batch) # to avoid nan error count_batch = np.add(count_batch, 1e-12) # per document loss ppx_sum += np.sum(np.divide(loss, count_batch)) doc_count += np.sum(mask) recon_sum+=np.sum(recon) print_loss = recon_sum/len(train_batches) dec_vars = utils.variable_parser(tf.trainable_variables(), 'decoder') phi = dec_vars[0] phi = sess.run(phi) utils.print_top_words(phi, lexicon,result_file=None) print_ppx = np.exp(loss_sum / word_count) print_ana_ppx = np.exp(ana_loss_sum / word_count) print_ppx_perdoc = np.exp(ppx_sum / doc_count) print_kld = kld_sum/len(train_batches) print_ana_kld = ana_kld_sum/len(train_batches) print('| Epoch train: {:d} |'.format(epoch+1), print_mode, '{:d}'.format(i), '| Corpus ppx: {:.5f}'.format(print_ppx), # perplexity for all docs '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc), # perplexity for per doc '| KLD: {:.5}'.format(print_kld), '| Loss: {:.5}'.format(print_loss), '| ppx anal.: {:.5f}'.format(print_ana_ppx), '|KLD anal.: {:.5f}'.format(print_ana_kld)) if warm_up_alpha: if min_alpha>0.0001: min_alpha-=(start_min_alpha-0.0001)/training_epochs #------------------------------- # dev loss_sum = 0.0 kld_sum = 0.0 ppx_sum = 0.0 word_count = 0 doc_count = 0 recon_sum=0.0 print_ana_ppx = 0.0 ana_loss_sum = 0.0 for idx_batch in 
dev_batches: data_batch, count_batch, mask = utils.fetch_data( dev_set, dev_count, idx_batch, vocab_size) input_feed = {model.x.name: data_batch, model.mask.name: mask,model.keep_prob.name: 1.0,model.warm_up.name: 1.0,model.min_alpha.name:min_alpha,model.B.name: B}#,model.B.name: B loss,recon, kld,ana_loss = sess.run([model.objective, model.recons_loss, model.analytical_kld,model.analytical_objective], input_feed) loss_sum += np.sum(loss) ana_loss_sum += np.sum(ana_loss) kld_sum += np.sum(kld) / np.sum(mask) word_count += np.sum(count_batch) count_batch = np.add(count_batch, 1e-12) ppx_sum += np.sum(np.divide(loss, count_batch)) doc_count += np.sum(mask) recon_sum+=np.sum(recon) print_ana_ppx = np.exp(ana_loss_sum / word_count) print_ppx = np.exp(loss_sum / word_count) print_ppx_perdoc = np.exp(ppx_sum / doc_count) print_kld = kld_sum/len(dev_batches) print_loss = recon_sum/len(dev_batches) if print_ana_ppx<best_print_ana_ppx: no_improvement_iters=0 best_print_ana_ppx=print_ana_ppx #check on validation set, if ppx better-> save improved model tf.train.Saver().save(sess, 'models/improved_model_bernoulli') else: no_improvement_iters+=1 print('no_improvement_iters',no_improvement_iters,'best ppx',best_print_ana_ppx) if no_improvement_iters>=early_stopping_iters: #if model has not improved for 30 iterations, stop training ###########STOP TRAINING############ stopped=True print('stop training after',epoch,'iterations,no_improvement_iters',no_improvement_iters) ###########LOAD BEST MODEL########## print('load stored model') tf.train.Saver().restore(sess,'models/improved_model_bernoulli') print('| Epoch dev: {:d} |'.format(epoch+1), '| Perplexity: {:.9f}'.format(print_ppx), '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc), '| KLD: {:.5}'.format(print_kld) , '| Loss: {:.5}'.format(print_loss)) #------------------------------- # test #if epoch%10==0 or epoch==training_epochs-1: if FLAGS.test: #if epoch==training_epochs-1: if stopped: #only do it once in the end coherence=utils.topic_coherence(test_set,phi, lexicon) print('topic coherence',str(coherence)) loss_sum = 0.0 kld_sum = 0.0 ppx_sum = 0.0 word_count = 0 doc_count = 0 recon_sum = 0.0 ana_loss_sum = 0.0 ana_kld_sum = 0.0 for idx_batch in test_batches: data_batch, count_batch, mask = utils.fetch_data( test_set, test_count, idx_batch, vocab_size) input_feed = {model.x.name: data_batch, model.mask.name: mask,model.keep_prob.name: 1.0,model.warm_up.name: 1.0,model.min_alpha.name:min_alpha,model.B.name: B} loss, recon,kld,ana_loss,ana_kld = sess.run([model.objective, model.recons_loss,model.kld,model.analytical_objective,model.analytical_kld], input_feed) loss_sum += np.sum(loss) kld_sum += np.sum(kld)/np.sum(mask) ana_loss_sum += np.sum(ana_loss) ana_kld_sum += np.sum(ana_kld) / np.sum(mask) word_count += np.sum(count_batch) count_batch = np.add(count_batch, 1e-12) ppx_sum += np.sum(np.divide(loss, count_batch)) doc_count += np.sum(mask) recon_sum+=np.sum(recon) print_loss = recon_sum/len(test_batches) print_ppx = np.exp(loss_sum / word_count) print_ppx_perdoc = np.exp(ppx_sum / doc_count) print_kld = kld_sum/len(test_batches) print_ana_ppx = np.exp(ana_loss_sum / word_count) print_ana_kld = ana_kld_sum/len(train_batches) print('| Epoch test: {:d} |'.format(epoch+1), '| Perplexity: {:.9f}'.format(print_ppx), '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc), '| KLD: {:.5}'.format(print_kld), '| Loss: {:.5}'.format(print_loss), '| ppx anal.: {:.5f}'.format(print_ana_ppx), '|KLD anal.: {:.5f}'.format(print_ana_kld))
def data(tickername):
    data = fetch_data(tickername)
    return data.reset_index().to_html()
def aggregation_as_json(request, selected_sources, source_weights, debugging=False,
                        formfields=None, current_item_rank=None, include_all_sources=True,
                        quick=True, compare1=None, compare2=None):
    if debugging:
        print "FOOBAR!!! aggregation_as_json!"
        print "selected_sources", selected_sources
    schema = [('Item', 'string'), ('Rank', 'number'), ('Latitude', 'number'), ('Longitude', 'number')]
    order = ['Item', 'Rank', 'Latitude', 'Longitude']
    if include_all_sources:
        for source in selected_sources:
            name = 'rank' + str(source.id)
            schema.append((name, 'string'))
            order.append(name)
    schema.append(('index', 'number'))
    order.append('index')
    selected_sources_count = len(selected_sources)
    print "selected_sources_count = ", selected_sources_count
    if selected_sources_count == 0:
        # nothing to aggregate: zero data sources
        table = DataTableYUI(schema, None)
        json_table = table.ToJSonYUI(columns_order=order, order_by=(order[1], "desc"), include_index=True)
        return json_table, None
    print "starting utils.fetch_data"
    start = time()
    table, rows, items, display, middle = utils.fetch_data(selected_sources)
    totTime = time() - start
    print "utils.fetch_data TOTAL TIME:", totTime
    using_old_ranking = False
    numitems = len(items)
    if ('old_ranking_values' in request.session
            and len(request.session['old_ranking_values']) == numitems
            and 'old_ranking' in request.session
            and len(request.session['old_ranking']) == numitems):
        schema.insert(2, ('Previous', 'string'))
        order.insert(2, 'Previous')
        using_old_ranking = True
    pairs = numitems * (numitems - 1) / 2
    weights = [source_weights[source] for source in selected_sources]
    start = time()
    print "starting utils.process"
    p0, Ybar, num_comparisons = utils.process(table, rows, selected_sources_count, weights, pairs)
    totTime = time() - start
    print "utils.process TOTAL TIME:", totTime
    final_wt = [scipy.sqrt(n) for n in num_comparisons]
    if debugging:
        print "To be passed into the C++ code"
        print "p = ", p0
        print "Ybar = ", Ybar
        print "wt = ", final_wt
        print "numitems = ", numitems
        print "numpairs = ", len(num_comparisons)
    if using_old_ranking and False:
        p0 = request.session['old_ranking_values']

    def ij_to_index(i, j):
        # flat index of the pair (i, j) with j < i in the lower-triangular ordering
        return i * (i - 1) / 2 + j

    # hard-coded debug override of the compare1/compare2 arguments
    compare1 = 10
    compare2 = 5
    # compare items indexed compare1 and compare2
    if debugging and compare1 and compare2 and compare2 < compare1:
        index = ij_to_index(compare1, compare2)
        #print "Comparing", items[compare1], items[compare2], " - ", Ybar[index]
    before = zip(p0, items)
    before.sort(key=lambda x: x[0], reverse=True)
    start = time()
    print "starting!"
    p1 = [0.0] * numitems
    leastsq.optimize(numitems, pairs, p0, Ybar, final_wt, p1)
    totTime = time() - start
    print "\n\n### TOTAL TIME2:", totTime
    if compare1 and compare2:
        pass
        #print "Re-comparing", items[compare1], items[compare2], " - ", p1[compare1] - p1[compare2]
    pairwise = after = None
    if debugging:
        pairwise = [[None] * numitems for i in range(numitems)]
        after = [[None] * numitems for i in range(numitems)]
        n = 0
        for i in range(numitems):
            after[i][i] = pairwise[i][i] = [0, '']
        for i in range(1, numitems):
            for j in range(i):
                pref1 = pref2 = ''
                if Ybar[n] < 0:
                    # a negative value indicates i is preferred to j; set the class
                    # for the td element so it is colored accordingly
                    pref1 = 'row'
                    pref2 = 'column'
                elif Ybar[n] > 0:
                    pref1 = 'column'
                    pref2 = 'row'
                pairwise[i][j] = [Ybar[n], pref1]
                pairwise[j][i] = [-1 * Ybar[n], pref2]
                if p1[j] - p1[i] < 0:
                    pref1 = 'row'
                    pref2 = 'column'
                else:
                    pref1 = 'column'
                    pref2 = 'row'
                after[i][j] = [p1[j] - p1[i], pref1]
                after[j][i] = [p1[i] - p1[j], pref2]
                n += 1
    assert len(p1) == numitems
    min_value = min(p1)
    values_range = max(p1) - min_value
    scalar = 100.0 / values_range
    # since there's a fair amount of error in our optimization algorithm,
    # round to the nearest decimal so as not to have fake rankings appear
    p1 = [round((x - min_value) * scalar, 1) for x in p1]
    if debugging:
        print "Check against C++: ", p1
    # p1 = [round(x, 1) for x in p1]
    ranked = zip(p1, items)
    ranked.sort(key=lambda x: x[0], reverse=True)
    request.session['current_ranking'] = list(map(None, *ranked)[1])
    i = -1
    if current_item_rank:
        # find the item's index
        i = items.index(current_item_rank[0])
        current_item = items[i]
        # "<span class=\"current_item\">%s</span>" % items[i]
        # items[i] = current_item
    if current_item_rank:
        # find the item's rank
        current_item_rank[1] = ranked.index((p1[i], current_item))
    if include_all_sources:
        for i in range(len(table)):  # INEFFICIENT!!!
            for j in range(len(table[i])):
                if table[i][j]:
                    if True:  # table[i][j].scaled_value > 0:
                        table[i][j] = "<span class=\"table-rank\">%d</span><span class=\"table-suffix\">%s</span> <span class=\"table-score\">%.02f</span>" % (table[i][j].rank, utils.rank_suffix(table[i][j].rank), table[i][j].scaled_value)
                    else:
                        table[i][j] = "<span class=\"table-rank\">%d</span><span class=\"table-suffix\">%s</span> <span class=\"table-score\">%.01f</span>" % (table[i][j].rank, utils.rank_suffix(table[i][j].rank), table[i][j].scaled_value)
                else:
                    table[i][j] = None  # (1000, '')
        latitude = [item.lat for item in items]
        longitude = [item.long for item in items]
        if using_old_ranking:
            ranked3 = zip(request.session['old_ranking'], p1)  # OMG a terrible hack!
            ranked3.sort(key=lambda x: x[1], reverse=False)
            old_ranking = [r[0] for r in ranked3]
            item_names = [item.name for item in items]
            ranked2 = zip(item_names, p1, old_ranking, latitude, longitude)
        else:
            ranked2 = zip(items, p1, latitude, longitude)
        ranked2 = [a + tuple(d) for a, d in zip(ranked2, table)]
    else:
        ranked2 = zip(items, p1)
    ranked2.sort(key=lambda x: x[1], reverse=False)
    request.session['old_ranking_values'] = p1
    request.session['old_ranking'] = [r[0] for r in ranked2]
    table = DataTableYUI(schema, ranked2)
    json_table = table.ToJSonYUI(columns_order=order, order_by=(order[1], "desc"), include_index=True)
    # for r, b in zip(ranked[:15], before[:15]):
    #     print b, r
    # map(None, a, b) is a confusing way of writing zip(a, b), but only when len(a) == len(b);
    # otherwise zip truncates one of the lists (http://docs.python.org/library/functions.html#zip)
    # whereas map pads with None's instead
    return json_table, items, ranked, display, pairwise, after
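# The pairwise arrays above are addressed with ij_to_index(i, j) = i*(i-1)/2 + j,
# which flattens the strictly-lower-triangular pairs (j < i) row by row and matches
# the running counter n in the debugging loop. A small standalone check of that
# mapping (written with explicit integer division, since the original helper relies
# on Python 2's "/"):
def ij_to_index(i, j):
    # flat position of pair (i, j) with j < i, enumerating (1,0), (2,0), (2,1), (3,0), ...
    return i * (i - 1) // 2 + j

numitems = 5
n = 0
for i in range(1, numitems):
    for j in range(i):
        assert ij_to_index(i, j) == n  # agrees with the running counter used above
        n += 1
assert n == numitems * (numitems - 1) // 2  # total number of pairs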
def train(sess, model, train_url, batch_size, training_epochs=1000, alternate_epochs=10):
    train_set, train_count = utils.data_set(train_url)
    summaries = None  # get_summaries(sess)
    writer = None  # tf.summary.FileWriter(ckpt + '/logs/', sess.graph)
    saver = tf.train.Saver()
    sess.graph.finalize()
    total_mem = 0
    mem = 0
    for epoch in range(training_epochs):
        train_batches = utils.create_batches(len(train_set), batch_size, shuffle=True)
        for switch in range(0, 2):
            if switch == 0:
                optim = model.optim_dec
                print_mode = 'updating decoder'
            else:
                optim = model.optim_enc
                print_mode = 'updating encoder'
            for i in range(alternate_epochs):
                loss_sum = 0.0
                ppx_sum = 0.0
                kld_sum = 0.0
                word_count = 0
                doc_count = 0
                for idx_batch in train_batches:
                    data_batch, count_batch, mask = utils.fetch_data(
                        train_set, train_count, idx_batch, FLAGS.vocab_size)
                    input_feed = {model.x.name: data_batch, model.mask.name: mask}
                    _, (loss, kld) = sess.run((optim, [model.objective, model.kld]), input_feed)
                    #loss, kld = tf.cast(loss, tf.float64), tf.cast(kld, tf.float64)
                    loss_sum += np.sum(loss)
                    kld_sum += np.sum(kld) / np.sum(mask)
                    word_count += np.sum(count_batch)
                    # to avoid nan error
                    count_batch = np.add(count_batch, 1e-12)
                    # per document loss
                    ppx_sum += np.sum(np.divide(loss, count_batch))
                    doc_count += np.sum(mask)
                print_ppx = np.exp(loss_sum / word_count)
                print_ppx_perdoc = np.exp(ppx_sum / doc_count)
                print_kld = kld_sum / len(train_batches)
                print('| Epoch train: {:d} |'.format(epoch + 1), print_mode, '{:d}'.format(i),
                      '| Corpus ppx: {:.5f}'.format(print_ppx),          # perplexity for all docs
                      '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc),  # perplexity per doc
                      '| KLD: {:.5}'.format(print_kld))
        evaluate(model, train_set, train_count, sess, 'val', (loss_sum + kld_sum),
                 epoch, summaries, writer, saver)
        current_mem = process.memory_info().rss / (1024 ** 2)
        total_mem += (current_mem - mem)
        print("Memory increase: {}, Cumulative memory: {}, and current {} in MB".format(
            current_mem - mem, total_mem, current_mem))
        mem = current_mem
        gc.collect()
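# The loop above reads process.memory_info().rss, but `process` itself is not
# defined in the snippet. A plausible module-level setup (an assumption, since the
# original file's imports are not shown) uses psutil against the current PID:
import gc
import os

import psutil

# module-level handle used by the training loop's memory report (assumed setup)
process = psutil.Process(os.getpid())

rss_mb = process.memory_info().rss / (1024 ** 2)  # resident set size in MB
print("current memory: {:.1f} MB".format(rss_mb))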
def funding_rounds(request):
    """Fetch and update the Funding Rounds entities."""
    print("\n-------Getting Funding Rounds entities-------\n")
    COLLECTION_NAME = 'funding_rounds_entities'
    END_POINT = 'searches/funding_rounds'
    TODAY_DATE = utils.get_today_date()
    YESTERDAY_DATE = utils.get_yesterday_date()
    QUERY = {
        "field_ids": [
            "announced_on", "closed_on", "created_at", "entity_def_id",
            "funded_organization_categories", "funded_organization_description",
            "funded_organization_diversity_spotlights", "funded_organization_funding_stage",
            "funded_organization_funding_total", "funded_organization_identifier",
            "funded_organization_location", "funded_organization_revenue_range",
            "identifier", "image_id", "investment_stage", "investment_type",
            "investor_identifiers", "is_equity", "lead_investor_identifiers",
            "money_raised", "name", "num_investors", "num_partners", "permalink",
            "post_money_valuation", "pre_money_valuation", "rank_funding_round",
            "short_description", "target_money_raised", "updated_at", "uuid",
        ],
        "query": [
            {
                "type": "predicate",
                "field_id": "updated_at",
                "operator_id": "gte",
                "values": [str(YESTERDAY_DATE)]
            },
        ],
        "order": [{
            "field_id": "updated_at",
            "sort": "asc",
            "nulls": "last"
        }],
        "limit": 1000,
    }
    total_count, entities = utils.fetch_data(QUERY, END_POINT)
    # TODO: add this check to all of the functions
    if total_count is None:
        return "Error in parsing the API response. Please check the logs."
    print("total count: ", total_count)
    # get the funding rounds collection
    col = utils.get_mongodb_collection(COLLECTION_NAME)
    fetch_records_count = 0
    # store into the database, paginating through the results
    while fetch_records_count < total_count:
        if fetch_records_count != 0:
            _, entities = utils.fetch_data(QUERY, END_POINT)
        if not entities:
            print("no entities left i.e., entities = %s. moving on." % len(entities))
            break
        for e in entities:
            if e:
                e['insert_date'] = TODAY_DATE
            else:
                print("Entity is empty: ", e)
        inserted = col.insert_many(entities)
        fetch_records_count += len(entities)
        print("inserted records: ")
        pprint(inserted.inserted_ids)
        print("total_count: ", total_count, ", fetched records: ", fetch_records_count)
        print("------------------------")
        # get the last record and page after its uuid
        after_id = entities[-1].get('uuid', None)
        if after_id:
            print("Get next batch after id: ", after_id)
            QUERY['after_id'] = after_id
        entities.clear()
    msg = {
        'entity': 'funding_rounds',
        'total_record_updated': fetch_records_count
    }
    return jsonify(msg)
def train(sess, model, train_url, test_url, batch_size,
          training_epochs=1000, alternate_epochs=10):
    """train nvdm model."""
    train_set, train_count = utils.data_set(train_url)
    test_set, test_count = utils.data_set(test_url)
    # hold-out development dataset
    dev_set = test_set[:50]
    dev_count = test_count[:50]
    dev_batches = utils.create_batches(len(dev_set), batch_size, shuffle=False)
    test_batches = utils.create_batches(len(test_set), batch_size, shuffle=False)
    for epoch in range(training_epochs):
        train_batches = utils.create_batches(len(train_set), batch_size, shuffle=True)
        #-------------------------------
        # train
        for switch in range(0, 2):
            if switch == 0:
                optim = model.optim_dec
                print_mode = 'updating decoder'
            else:
                optim = model.optim_enc
                print_mode = 'updating encoder'
            for i in range(alternate_epochs):
                loss_sum = 0.0
                ppx_sum = 0.0
                kld_sum = 0.0
                word_count = 0
                doc_count = 0
                for idx_batch in train_batches:
                    data_batch, count_batch, mask = utils.fetch_data(
                        train_set, train_count, idx_batch, FLAGS.vocab_size)
                    input_feed = {model.x.name: data_batch, model.mask.name: mask}
                    _, (loss, kld) = sess.run(
                        (optim, [model.objective, model.kld]), input_feed)
                    loss_sum += np.sum(loss)
                    kld_sum += np.sum(kld) / np.sum(mask)
                    word_count += np.sum(count_batch)
                    # to avoid nan error
                    count_batch = np.add(count_batch, 1e-12)
                    # per document loss
                    ppx_sum += np.sum(np.divide(loss, count_batch))
                    doc_count += np.sum(mask)
                print_ppx = np.exp(loss_sum / word_count)
                print_ppx_perdoc = np.exp(ppx_sum / doc_count)
                print_kld = kld_sum / len(train_batches)
                print('| Epoch train: {:d} |'.format(epoch + 1), print_mode, '{:d}'.format(i),
                      '| Corpus ppx: {:.5f}'.format(print_ppx),          # perplexity for all docs
                      '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc),  # perplexity per doc
                      '| KLD: {:.5}'.format(print_kld))
        #-------------------------------
        # dev
        loss_sum = 0.0
        kld_sum = 0.0
        ppx_sum = 0.0
        word_count = 0
        doc_count = 0
        for idx_batch in dev_batches:
            data_batch, count_batch, mask = utils.fetch_data(
                dev_set, dev_count, idx_batch, FLAGS.vocab_size)
            input_feed = {model.x.name: data_batch, model.mask.name: mask}
            loss, kld = sess.run([model.objective, model.kld], input_feed)
            loss_sum += np.sum(loss)
            kld_sum += np.sum(kld) / np.sum(mask)
            word_count += np.sum(count_batch)
            count_batch = np.add(count_batch, 1e-12)
            ppx_sum += np.sum(np.divide(loss, count_batch))
            doc_count += np.sum(mask)
        print_ppx = np.exp(loss_sum / word_count)
        print_ppx_perdoc = np.exp(ppx_sum / doc_count)
        print_kld = kld_sum / len(dev_batches)
        print('| Epoch dev: {:d} |'.format(epoch + 1),
              '| Perplexity: {:.9f}'.format(print_ppx),
              '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc),
              '| KLD: {:.5}'.format(print_kld))
        #-------------------------------
        # test
        if FLAGS.test:
            loss_sum = 0.0
            kld_sum = 0.0
            ppx_sum = 0.0
            word_count = 0
            doc_count = 0
            for idx_batch in test_batches:
                data_batch, count_batch, mask = utils.fetch_data(
                    test_set, test_count, idx_batch, FLAGS.vocab_size)
                input_feed = {model.x.name: data_batch, model.mask.name: mask}
                loss, kld = sess.run([model.objective, model.kld], input_feed)
                loss_sum += np.sum(loss)
                kld_sum += np.sum(kld) / np.sum(mask)
                word_count += np.sum(count_batch)
                count_batch = np.add(count_batch, 1e-12)
                ppx_sum += np.sum(np.divide(loss, count_batch))
                doc_count += np.sum(mask)
            print_ppx = np.exp(loss_sum / word_count)
            print_ppx_perdoc = np.exp(ppx_sum / doc_count)
            print_kld = kld_sum / len(test_batches)
            print('| Epoch test: {:d} |'.format(epoch + 1),
                  '| Perplexity: {:.9f}'.format(print_ppx),
                  '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc),
                  '| KLD: {:.5}'.format(print_kld))
def people(request):
    """Fetch and update the People entities."""
    print("\n-------Getting people entities-------\n")
    END_POINT = 'searches/people'
    COLLECTION_NAME = 'people_entities'
    YESTERDAY_DATE = utils.get_yesterday_date()
    TODAY_DATE = utils.get_today_date()
    QUERY = {
        "field_ids": [
            "aliases", "born_on", "created_at", "description", "died_on",
            "entity_def_id", "facebook", "facet_ids", "first_name", "gender",
            "identifier", "image_id", "image_url", "investor_stage", "investor_type",
            "last_name", "layout_id", "linkedin", "location_group_identifiers",
            "location_identifiers", "middle_name", "name", "num_articles",
            "num_current_advisor_jobs", "num_current_jobs",
            "num_diversity_spotlight_investments", "num_event_appearances",
            "num_exits", "num_exits_ipo", "num_founded_organizations",
            "num_investments", "num_jobs", "num_lead_investments",
            "num_partner_investments", "num_past_advisor_jobs", "num_past_jobs",
            "num_portfolio_organizations", "override_layout_id", "permalink",
            "permalink_aliases", "primary_job_title", "primary_organization",
            "rank_delta_d30", "rank_delta_d7", "rank_delta_d90", "rank_person",
            "rank_principal", "short_description", "twitter", "updated_at",
            "uuid", "website", "website_url",
        ],
        "query": [
            {
                "type": "predicate",
                "field_id": "updated_at",
                "operator_id": "gte",
                "values": [str(YESTERDAY_DATE)]
            },
        ],
        "order": [{
            "field_id": "rank_person",
            "sort": "asc",
            "nulls": "last"
        }],
        "limit": 1000,
    }
    total_count, entities = utils.fetch_data(QUERY, END_POINT)
    if total_count is None:
        return "Error in parsing the API response. Please check the logs."
    print("total count: ", total_count)
    # get the people collection
    col = utils.get_mongodb_collection(COLLECTION_NAME)
    fetch_records_count = 0
    # store into the database, paginating through the results
    while fetch_records_count < total_count:
        if fetch_records_count != 0:
            _, entities = utils.fetch_data(QUERY, END_POINT)
        if not entities:
            print("no entities left i.e., entities = %s. moving on." % len(entities))
            break
        for e in entities:
            if e:
                e['insert_date'] = TODAY_DATE
            else:
                print("Entity is empty: ", e)
        inserted = col.insert_many(entities)
        fetch_records_count += len(entities)
        print("inserted records: ")
        pprint(inserted.inserted_ids)
        print("total_count: ", total_count, ", fetched records: ", fetch_records_count)
        print("------------------------")
        # get the last record and page after its uuid
        after_id = entities[-1].get('uuid', None)
        if after_id:
            print("Get next batch after id: ", after_id)
            QUERY['after_id'] = after_id
        entities.clear()
    msg = {'entity': 'People', 'total_record_updated': fetch_records_count}
    return jsonify(msg)
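# Both entity updaters above follow the same cursor-style pagination: fetch a page,
# stamp and insert it, then move `after_id` to the last record's `uuid` and repeat
# until `total_count` records have been pulled. A stripped-down sketch of just that
# loop, with a stubbed fetch_page standing in for utils.fetch_data (the stub and its
# fake data are illustrative, not the real API):
def fetch_page(query):
    """Stub paginator: returns records whose uuid sorts after query.get('after_id', '')."""
    all_records = [{'uuid': 'u%03d' % i} for i in range(1, 8)]  # fake data
    after = query.get('after_id', '')
    page = [r for r in all_records if r['uuid'] > after][:query['limit']]
    return len(all_records), page

query = {'limit': 3}
total, fetched = None, 0
while total is None or fetched < total:
    total, batch = fetch_page(query)
    if not batch:
        break                                  # nothing left even if counts disagree
    fetched += len(batch)
    query['after_id'] = batch[-1]['uuid']      # resume after the last uuid seen
print('fetched', fetched, 'of', total)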