import os
import pickle

import numpy as np
import pandas as pd
from sklearn.metrics import (accuracy_score, confusion_matrix,
                             mean_absolute_error, mean_squared_error,
                             precision_recall_fscore_support)
from sklearn.model_selection import train_test_split

# model_generator, plot_curve and plot_conf_matrix are project-local helpers.


def train(data, label, token, bin_size=None):
    if token == 'LinR':
        # Regression: hold out the last 60 samples as a time-ordered test set.
        train_data = data[:-60]
        train_label = label[:-60]
        test_data = data[-60:]
        test_label = label[-60:]
        train_label = np.expand_dims(train_label, axis=1)
        test_label = np.expand_dims(test_label, axis=1)

        model = model_generator(token)
        model.fit(train_data, train_label)

        modeldir = os.path.join('models', 'LinR')
        os.makedirs(modeldir, exist_ok=True)
        with open(os.path.join(modeldir, 'model.pkl'), 'wb') as f:
            pickle.dump(model, f)

        predict = model.predict(test_data)

        figdir = os.path.join('fig', token)
        os.makedirs(figdir, exist_ok=True)
        plot_curve(predict, test_label, token, os.path.join(figdir, 'curve.pdf'))

        MSE = mean_squared_error(test_label, predict)
        MAE = mean_absolute_error(test_label, predict)
        return MSE, MAE
    else:
        # Classification: random 80/20 split, then discretize the continuous
        # labels into bin_size - 1 equal-width classes.
        train_data, test_data, train_label, test_label = train_test_split(
            data, label, test_size=0.2)
        min_val = min(train_label)
        max_val = max(train_label)
        bins = [min_val + idx * (max_val - min_val) / (bin_size - 1)
                for idx in range(bin_size)]
        labels = list(range(bin_size - 1))
        train_label = pd.cut(train_label, bins=bins, labels=labels)
        test_label = pd.cut(test_label, bins=bins, labels=labels)

        model = model_generator(token)

        # pd.cut yields NaN for values outside the bin edges; NaN != NaN,
        # so these loops map out-of-range entries to class 0.
        for i in range(len(train_label)):
            if train_label[i] != train_label[i]:
                train_label[i] = 0
        for i in range(len(test_label)):
            if test_label[i] != test_label[i]:
                test_label[i] = 0

        model.fit(train_data, train_label)

        modeldir = os.path.join('models', token)
        os.makedirs(modeldir, exist_ok=True)
        with open(os.path.join(modeldir, 'bins-{}.pkl'.format(bin_size)), 'wb') as f:
            pickle.dump(model, f)

        predict = model.predict(test_data)
        conf_matrix = confusion_matrix(test_label, predict, labels=labels)

        figdir = os.path.join('fig', token)
        os.makedirs(figdir, exist_ok=True)
        figpath = os.path.join(figdir, 'bins-{}.pdf'.format(bin_size))
        plot_conf_matrix(conf_matrix, labels, True, token, figpath)

        accuracy = accuracy_score(test_label, predict)
        precision, recall, f, _ = precision_recall_fscore_support(
            test_label, predict, average='weighted')
        return accuracy, precision, recall, f
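# A minimal, self-contained sketch of the binning step above (synthetic
# values and bin edges chosen purely for illustration): pd.cut maps
# out-of-range values to NaN, which is then reset to class 0, matching the
# effect of the NaN != NaN loops in train().
import numpy as np
import pandas as pd

values = pd.Series([1.0, 5.0, 9.0, 42.0])   # 42.0 falls outside the bins
bins = [0, 2, 4, 6, 8, 10]                  # 6 edges -> 5 classes
binned = pd.cut(values, bins=bins, labels=range(5)).to_numpy()
binned[pd.isna(binned)] = 0                 # out-of-range -> class 0
print(binned)                               # [0 2 4 0]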
import numpy as np
from sklearn.linear_model import LinearRegression

import plot


def generate_samples(x1, x2, step=1):
    # Build an evenly spaced x1 grid between the 1st percentile and the
    # maximum (the 100th percentile) of the observed values.
    x1_min = int(np.percentile(x1, 1))
    x1_max = int(np.percentile(x1, 100))
    sample_x1 = np.arange(x1_min, x1_max, step).reshape(-1, 1)

    # Fit x2 ~ x1 with a linear model and predict x2 on the grid.
    lm = LinearRegression()
    X = x1.reshape(-1, 1)
    model = lm.fit(X, x2)
    predicted_x2 = model.predict(sample_x1)

    plot.plot_curve(sample_x1, predicted_x2, x1, x2)
    return np.hstack([sample_x1, predicted_x2.reshape(-1, 1)])
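# Hypothetical usage sketch with synthetic data; assumes the project-local
# `plot` module is importable so generate_samples can render the fit.
import numpy as np

rng = np.random.default_rng(0)
x1 = rng.uniform(0, 100, size=200)
x2 = 3.0 * x1 + rng.normal(scale=5.0, size=200)
samples = generate_samples(x1, x2, step=2)
print(samples[:3])  # column 0: x1 grid, column 1: predicted x2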
from google_trends import init_google_trends, searches_for
from plot import plot_curve

if __name__ == "__main__":
    pytrend = init_google_trends()

    # Hygiene products: disinfectant, soap, toilet paper, face masks.
    keywords = ['desinfektionsmittel', 'seife', 'klopapier', 'mundschutz']
    data = searches_for(pytrend, keywords)
    plot_curve(data, keywords)

    # Pandemic news and policy: corona, Bundesliga, curfew, emergency aid, Italy.
    keywords = ['corona', 'bundesliga', 'ausgangssperre', 'soforthilfe', 'italien']
    data = searches_for(pytrend, keywords)
    plot_curve(data, keywords)

    # Childcare: childcare, school, daycare, single parents, emergency childcare.
    keywords = ['kinderbetreuung', 'schule', 'kita', 'alleinerziehend', 'notbetreuung']
    data = searches_for(pytrend, keywords)
    plot_curve(data, keywords)

    # Mental health and domestic violence: depression, "my husband hits me",
    # domestic violence, counseling.
    keywords = ['depression', 'mein mann schlägt mich', 'häusliche gewalt', 'seelsorge']
    data = searches_for(pytrend, keywords)
    plot_curve(data, keywords)
import argparse

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split
from sklearn.preprocessing import StandardScaler

# preprocessing, the train_*_classifier functions, plot_learning_curve,
# plot_feature_importance and plot_curve are project-local helpers.


def main():
    parser = argparse.ArgumentParser(description='Classification task')
    parser.add_argument('--normalize', action='store_true')
    parser.add_argument('--plot-pca', action='store_true')
    parser.add_argument('--plot-corr', action='store_true')
    parser.add_argument('--plot-feat-dist', action='store_true')
    parser.add_argument('--plot-curve', action='store_true')
    parser.add_argument('--classifier')
    parser.add_argument('datafile')
    args = parser.parse_args()

    data, X, y = preprocessing(
        args.datafile,
        # feature_discret=True,
        plot_feat_dist=args.plot_feat_dist,
        plot_corr=args.plot_corr,
        plot_PCA=args.plot_pca)

    # 60/20/20 stratified train/dev/test split.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, test_size=0.4, random_state=0)
    X_dev, X_test, y_dev, y_test = train_test_split(
        X_test, y_test, stratify=y_test, test_size=0.5, random_state=0)

    if args.normalize:
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_dev = scaler.transform(X_dev)
        X_test = scaler.transform(X_test)
    # For a neural network, MinMaxScaler((-1, 1)) could be used instead:
    # scaler = MinMaxScaler((-1, 1))
    # X_train = scaler.fit_transform(X_train)
    # X_dev = scaler.transform(X_dev)
    # X_test = scaler.transform(X_test)

    print("Dataset summary:")
    print("train X:", X_train.shape, "y:", y_train.shape)
    print("dev   X:", X_dev.shape, "y:", y_dev.shape)
    print("test  X:", X_test.shape, "y:", y_test.shape)

    # Weight each class by the frequency of the opposite class.
    positive_weight = 1 - sum(y_train) / y_train.count()
    class_weight = {1: positive_weight, 0: 1 - positive_weight}
    print("Class weights:", class_weight)

    if args.classifier == 'gbdt':
        clf = train_gbdt_classifier(X_train, y_train)
    elif args.classifier == 'rf':
        clf = train_random_forest_classifier(class_weight, X_train, y_train)
    elif args.classifier == 'nn':
        clf = train_MLP_classifier(X_train, y_train)
    else:
        clf = train_logit_classifier(class_weight, X_train, y_train)

    title = "Learning Curve"
    cv = StratifiedShuffleSplit(n_splits=10, test_size=0.33, random_state=0)
    X = pd.concat([X_train, X_dev])
    y = pd.concat([y_train, y_dev])
    plot_learning_curve(clf, title, X, y, cv=cv, n_jobs=2)
    plt.show()

    plot_feature_importance(clf, X_train, y_train, data.columns)

    y_train_pred = clf.predict(X_train)
    y_train_prop = clf.predict_proba(X_train)
    y_dev_pred = clf.predict(X_dev)
    y_dev_prop = clf.predict_proba(X_dev)
    y_test_pred = clf.predict(X_test)
    y_test_prop = clf.predict_proba(X_test)

    # Probability of the positive class for each split.
    y_train_score = y_train_prop[:, 1]
    y_dev_score = y_dev_prop[:, 1]
    y_test_score = y_test_prop[:, 1]

    print('Training data report')
    print(classification_report(y_train, y_train_pred))
    print('Dev data report')
    print(classification_report(y_dev, y_dev_pred))
    # print('Test data report')
    # print(classification_report(y_test, y_test_pred))

    if args.plot_curve:
        # Pick the decision threshold on the dev set, then apply it to test.
        opthd = plot_curve(y_dev, y_dev_score, min_p=None, min_r=None)
        print("Optimal threshold:", opthd)
        y_test_pred = (y_test_score > opthd).astype(int)
        print('Test data report (with optimal threshold)')
        print(classification_report(y_test, y_test_pred))
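# Entry point; a typical invocation might look like this (script and data
# file names are hypothetical, the flags are those defined above):
#   python classify.py --normalize --classifier gbdt --plot-curve data.csv
if __name__ == '__main__':
    main()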
from google_trends import init_google_trends, searches_for
from plot import plot_curve, make_axis_for_pytrends, make_axis_for_wiki
from wikipedia import wiki_visits

if __name__ == "__main__":
    pytrend = init_google_trends()

    # Google Trends: hygiene products (disinfectant, soap, toilet paper,
    # face masks).
    keywords = ['desinfektionsmittel', 'seife', 'klopapier', 'mundschutz']
    data = searches_for(pytrend, keywords)
    data = make_axis_for_pytrends(data, keywords)
    plot_curve(data)

    # Wikipedia page visits for the German articles on suicide by medication
    # poisoning and by jumping from a height, 2019-10-01 through 2020-04-28.
    keywords = [
        "Suizid_durch_Vergiftung_mit_Medikamenten",
        "Suizid_durch_Sprung_aus_der_Höhe",
    ]
    plot_data = {}
    for keyword in keywords:
        data = wiki_visits(keyword, "2019100100", "2020042800")
        if data:
            data = make_axis_for_wiki(data, keyword)
            plot_data.update(data)
    plot_curve(plot_data)
import itertools

import matplotlib
# Save figures without displaying them.
matplotlib.use("Agg")
import matplotlib.pyplot as plt
from pylab import rcParams
from sklearn.metrics import confusion_matrix

import plot

# config, model, test_data, test_label and train_loss_list come from
# earlier cells of this notebook.
epoch_list = [i for i in range(config.epoch)]
class_names = [i for i in range(3)]

pred_label = model.predict(test_data)
# To undo min-max normalization of the predictions:
# for i in pred_label:
#     i = i * (max(new) - min(new)) + min(new)
print(pred_label)

# Flatten the nested prediction and label lists.
pred_label = list(itertools.chain.from_iterable(pred_label))
test_label = list(itertools.chain.from_iterable(test_label))
print(pred_label)
print(test_label)

cnf_matrix = confusion_matrix(test_label, pred_label)
# np.set_printoptions(precision=2)
plot.plot_confusion_matrix(cnf_matrix, class_names, False, "Confusion matrix",
                           "/Users/nicole/Desktop/python/finalproject")
plot.plot_curve(epoch_list, train_loss_list, "Accuracy curve",
                "/Users/wei-jer-chang/Desktop/final project",
                training=True, accuracy=False)
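# Self-contained illustration of the flattening step above: the model
# returns one list per sample, and chain.from_iterable concatenates them
# (values are made up for the example).
import itertools

nested = [[0.1], [0.7], [0.4]]
flat = list(itertools.chain.from_iterable(nested))
print(flat)  # [0.1, 0.7, 0.4]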
import csv

from sklearn.metrics import confusion_matrix

# classification, plot, epoch_list, config, pred_label, test_label,
# train_loss_list and val_loss_list come from earlier cells.
# Bucket predictions and labels into ten percentage ranges.
pred_list = classification.classification(pred_label)
test_list = classification.classification(test_label)

cnf_matrix = confusion_matrix(test_list, pred_list)
class_names = [
    "0-10", "10-20", "20-30", "30-40", "40-50",
    "50-60", "60-70", "70-80", "80-90", "90-100",
]
plot.plot_confusion_matrix(cnf_matrix, class_names, False, "Confusion matrix", "")
plot.plot_curve(epoch_list, train_loss_list, "Clear Accuracy curve", "",
                training=True, accuracy=False)

# Write the per-epoch train/validation losses to CSV, one row per epoch.
with open("clear_data.csv", "w", newline="") as f1:
    writer = csv.writer(f1)
    writer.writerow(['epoch', 'train_loss_list', 'val_loss_list'])
    writer.writerows(zip(range(1, config.epoch + 1),
                         train_loss_list, val_loss_list))
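# Optional sanity check of the file written above (same working directory):
with open("clear_data.csv", newline="") as f:
    for row in csv.reader(f):
        print(row)  # header row, then one ['epoch', train, val] row per epoch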