Example #1
def main():
	# img_width, img_height = 48, 48
	img_width, img_height = 200, 60
	img_channels = 1 
	# batch_size = 1024
	batch_size = 32
	nb_epoch = 1000
	post_correction = False

	save_dir = 'save_model/' + str(datetime.now()).split('.')[0].split()[0] + '/' # model is saved corresponding to the datetime
	train_data_dir = 'train_data/ip_train/'
	# train_data_dir = 'train_data/single_1000000/'
	val_data_dir = 'train_data/ip_val/'
	test_data_dir = 'test_data/'
	weights_file_path = 'save_model/2016-10-27/weights.11-1.58.hdf5'
	char_set, char2idx = get_char_set(train_data_dir)
	nb_classes = len(char_set)
	max_nb_char = get_maxnb_char(train_data_dir)
	label_set = get_label_set(train_data_dir)
	# print 'char_set:', char_set
	print 'nb_classes:', nb_classes
	print 'max_nb_char:', max_nb_char
	print 'size_label_set:', len(label_set)
	model = build_shallow(img_channels, img_width, img_height, max_nb_char, nb_classes) # build CNN architecture
	# model.load_weights(weights_file_path) # load trained model

	val_data = load_data(val_data_dir, max_nb_char, img_width, img_height, img_channels, char_set, char2idx)
	# val_data = None 
	train_data = load_data(train_data_dir, max_nb_char, img_width, img_height, img_channels, char_set, char2idx) 
	train(model, batch_size, nb_epoch, save_dir, train_data, val_data, char_set)
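The helpers get_char_set, get_maxnb_char, and get_label_set come from a project-specific util module that isn't shown. A minimal sketch of get_char_set, assuming each training image is named '<label>_<index>.png' (the file-naming convention is an assumption):

import os

def get_char_set(data_dir):
    # Collect every character that occurs in the labels encoded in file names.
    chars = set()
    for fname in os.listdir(data_dir):
        label = os.path.splitext(fname)[0].split('_')[0]
        chars.update(label)
    char_set = sorted(chars)
    char2idx = {c: i for i, c in enumerate(char_set)}
    return char_set, char2idx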
Example #2
File: svm_5.py  Project: harrylclc/ist557
def main():
    global k_out
    k_out = 0
    x, y = load_data(k=2)
    kf = cross_validation.KFold(len(x), n_fold)
    scaler = preprocessing.StandardScaler()
    acc, prec, recall = [], [], []
    for train, test in kf:
        x_train, x_test, y_train, y_test = x[train], x[test], y[train], y[test]
        c_star, gamma_star = choose_c_gamma(x_train, y_train)
        print '=========c*:{} g*:{}'.format(c_star, gamma_star)
        scaler.fit(x_train)
        clf = svm.SVC(C=c_star, gamma=gamma_star)
        clf.fit(scaler.transform(x_train), y_train)
        y_pred = clf.predict(scaler.transform(x_test))
        acc.append(accuracy_score(y_test, y_pred))
        prec.append(precision_score(y_test, y_pred))
        recall.append(recall_score(y_test, y_pred))
        print acc
        k_out += 1
    a = np.mean(acc)
    p = np.mean(prec)
    r = np.mean(recall)
    f = 2 * p * r / (p + r)
    
    print 'precision: {}'.format(p)
    print "recall: {}".format(r)
    print "f1: {}".format(f)
    print "accuracy: {}".format(a)
Example #3
def main():
	window_size = 100
	threshold = calc_threshold(exp_moving_average, window_size)

	print threshold

	filename = sys.argv[1]
	data_in = load_data(filename)

	# Uncomment for more realistic first values. First window_size/4 values
	# should not be taken into account in the output data and plots.
	# data_in[:0] = [sum(data_in[:(window_size/4)])/(window_size/4)]

	filtered_ma = average_diff(data_in, moving_average, window_size)
	filtered_ema = average_diff(data_in, exp_moving_average, window_size)

	plot([0] * len(data_in),
	     filtered_ma,
	     filtered_ema,
	     [threshold] * len(data_in),
	     [-threshold] * len(data_in),
	     )

	mean_ma  = mean_value_detector(filtered_ma,  threshold)
	mean_ema = mean_value_detector(filtered_ema, threshold)

	plot(mean_ema)
	plot(mean_ma)

	write_data(mean_ema, filename + ".out")
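moving_average and exp_moving_average are assumed helpers here; plausible minimal implementations (the exact smoothing the original uses is an assumption):

def moving_average(data, window_size):
    # Simple moving average; the window grows until it reaches window_size.
    out, acc = [], 0.0
    for i, v in enumerate(data):
        acc += v
        if i >= window_size:
            acc -= data[i - window_size]
        out.append(acc / min(i + 1, window_size))
    return out

def exp_moving_average(data, window_size):
    # Exponential moving average with the usual alpha = 2 / (N + 1).
    alpha = 2.0 / (window_size + 1)
    out, ema = [], data[0] if data else 0.0
    for v in data:
        ema = alpha * v + (1.0 - alpha) * ema
        out.append(ema)
    return out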
Example #4
def bagging():
    from sklearn.feature_selection import SelectPercentile, chi2

    comments, dates, labels = load_data()
    select = SelectPercentile(score_func=chi2, percentile=4)

    clf = LogisticRegression(tol=1e-8, penalty='l2', C=7)
    #clf = BaggingClassifier(logr, n_estimators=50)
    countvect_char = TfidfVectorizer(ngram_range=(1, 5),
            analyzer="char", binary=False)
    countvect_word = TfidfVectorizer(ngram_range=(1, 3),
            analyzer="word", binary=False)
    badwords = BadWordCounter()

    ft = FeatureStacker([("badwords", badwords), ("chars", countvect_char),
        ("words", countvect_word)])
    #ft = TextFeatureTransformer()
    pipeline = Pipeline([('vect', ft), ('select', select), ('logr', clf)])

    cv = ShuffleSplit(len(comments), n_iterations=20, test_size=0.2,
            indices=True)
    scores = []
    for train, test in cv:
        X_train, y_train = comments[train], labels[train]
        X_test, y_test = comments[test], labels[test]
        pipeline.fit(X_train, y_train)
        probs = pipeline.predict_proba(X_test)
        scores.append(auc_score(y_test, probs[:, 1]))
        print("score: %f" % scores[-1])
    print(np.mean(scores), np.std(scores))
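FeatureStacker is not a scikit-learn class; a minimal sketch of such a stacker, which fits each named transformer and concatenates their outputs column-wise much like sklearn's FeatureUnion (the set_params plumbing needed for grid search is elided):

from scipy import sparse

class FeatureStacker(object):
    def __init__(self, transformer_list):
        self.transformer_list = transformer_list

    def fit(self, X, y=None):
        for name, trans in self.transformer_list:
            trans.fit(X, y)
        return self

    def transform(self, X):
        features = [trans.transform(X) for _, trans in self.transformer_list]
        return sparse.hstack(features).tocsr()

    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X)

    def get_params(self, deep=True):
        # Expose nested params like 'words__select__percentile' for grid search.
        if not deep:
            return {'transformer_list': self.transformer_list}
        out = dict(self.transformer_list)
        for name, trans in self.transformer_list:
            for key, value in trans.get_params(deep=True).items():
                out['%s__%s' % (name, key)] = value
        return out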
Example #5
def test_stacker():
    comments, dates, labels = load_data()
    clf = LogisticRegression(tol=1e-8, C=0.01, penalty='l2')
    countvect_char = TfidfVectorizer(ngram_range=(1, 5),
            analyzer="char", binary=False)
    countvect_word = TfidfVectorizer(ngram_range=(1, 3),
            analyzer="word", binary=False)
    badwords = BadWordCounter()
    select = SelectPercentile(score_func=chi2)
    char_select = Pipeline([('char_count', countvect_char),
                            ('select', select)])
    words_select = Pipeline([('word_count', countvect_word),
                             ('select', select)])
    badwords_select = Pipeline([('badwords', badwords), ('select', select)])

    stack = FeatureStacker([("badwords", badwords_select),
                            ("chars", char_select),
                            ("words", words_select)])
    #stack.fit(comments)
    #features = stack.transform(comments)

    #print("training and transforming for linear model")
    print("training grid search")
    pipeline = Pipeline([("features", stack), ("clf", clf)])
    param_grid = dict(clf__C=[0.31, 0.42, 0.54],
                      features__words__select__percentile=[5, 7])
    grid = GridSearchCV(pipeline, cv=5, param_grid=param_grid, verbose=4,
           n_jobs=1, score_func=auc_score)
    grid.fit(comments, labels)
    tracer()
Example #6
def plot_conformity(name, log_dir, ax=None, legend=True):
    if ax is None:
        ax = plt.gca()

    r, actual, pred, a_err, p_err = util.load_data(name, log_dir)
    ax.errorbar(r, actual[0] - a_err[0], actual[0] + a_err[0], color=red_col,
                 label='Red centrals')
    ax.errorbar(r, actual[1] - a_err[1], actual[1] + a_err[1], color=blue_col,
                 label='Blue centrals')
    ax.errorbar(r, actual[2] - a_err[2], actual[2] + a_err[2], color='k',
                 label='All centrals')
    ax.errorbar(r, pred[0] - p_err[0], pred[0] + p_err[0], color=red_col,
                linestyle='--', alpha=0.3)
    ax.errorbar(r, pred[1] - p_err[1], pred[1] + p_err[1], color=blue_col,
                linestyle='--', alpha=0.3)
    ax.errorbar(r, pred[2] - p_err[2], pred[2] + p_err[2], color='k',
                linestyle='--', alpha=0.3)
    ax.set_xscale('log')
    ax.set_xlabel('r [Mpc/h]')
    ax.set_ylabel('Quenched Fraction')
    ax.set_ylim(0.0, 1.1)
    ax.set_xlim(0.1, 20)
    if legend:
        ax.legend(loc='best')
    return style_plots(ax)
Example #7
def main():
    global k_out
    k_out = 0
    x, y = load_data(k=2)
    kf_out = cross_validation.KFold(len(x), n_fold)
    a_score, p_score, r_score = [], [], []
    for train_out, test_out in kf_out:
        x_train_out, x_test_out, y_train_out, y_test_out = x[train_out], x[test_out], y[train_out], y[test_out]
        kf = cross_validation.KFold(len(x_train_out), n_fold)
        m_opt = pruning_cross_validation(x_train_out, y_train_out, kf)
        clf = DecisionTreeClassifier(criterion='entropy', max_leaf_nodes=m_opt + 1)
        print '=========m_opt:{}'.format(m_opt)
        clf.fit(x_train_out, y_train_out)
        y_pred = clf.predict(x_test_out)
        a_score.append(accuracy_score(y_test_out, y_pred))
        p_score.append(precision_score(y_test_out, y_pred))
        r_score.append(recall_score(y_test_out, y_pred))
        k_out += 1
    a = np.mean(a_score)
    p = np.mean(p_score)
    r = np.mean(r_score)
    f = 2 * p * r / (p + r)
    print 'precision: {}'.format(p)
    print "recall: {}".format(r)
    print "f1: {}".format(f)
    print "accuracy: {}".format(a)
Example #8
def main(stat, stat_name):
    cats = util.load_all_cats()
    all_r_values = []
    names = cats.keys()
    names = ['HW', 'Becker', 'Lu', 'Henriques', 'Illustris', 'EAGLE', 'MB-II'][::-1]
    proxies = ['s1','s2','s5','s10','d1','d2','d5','d10', 'rhill', 'rhillmass']
    proxies_formatted = [ '$\Sigma_1$', '$\Sigma_2$', '$\Sigma_5$', '$\Sigma_{10}$', '$D_1$', '$D_2$', '$D_5$', '$D_{10}$', 'R$_\mathrm{hill}$', 'R$_\mathrm{hill-mass}$' ]
    for name in names:
        cat = cats[name]
        stat_dict = util.load_data('statistics.pckl', cat['dir'])
        r_values = []
        for p in proxies:
            try:
                print 'std of ', stat,' for ', p, '=', np.std(stat_dict[stat][p])
                r_values.append(np.mean(stat_dict[stat][p]))
            except:
                print 'no statistics found for', p
                r_values.append(0)
        all_r_values.append(r_values)
    df = pd.DataFrame(columns=proxies_formatted, index=names)
    for name, r_values in zip(names, all_r_values):
        df.loc[name] = pd.Series({p: v for p,v in zip(proxies_formatted, r_values)})
    #plt.imshow(all_r_values)
    #plt.show()
    df = df[df.columns].astype(float)
    #sns.heatmap(df, vmin=0,vmax=0.71, cmap='Blues', annot=True, fmt='.2f')
    #plots.style_plots()
    #plt.show()
    print df.values
    plot_heatmap(df, proxies_formatted, names, stat_name)
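plot_heatmap isn't defined in this snippet; judging from the commented-out seaborn call above, a plausible sketch (the tick-label handling is an assumption):

import matplotlib.pyplot as plt
import seaborn as sns

def plot_heatmap(df, col_labels, row_labels, title):
    # Annotated heatmap of the mean statistic per (model, proxy) pair.
    ax = sns.heatmap(df, vmin=0, vmax=0.71, cmap='Blues', annot=True, fmt='.2f')
    ax.set_xticklabels(col_labels, rotation=45)
    ax.set_yticklabels(row_labels, rotation=0)
    ax.set_title(title)
    plt.tight_layout()
    plt.show()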
Example #9
def main():
    x, y = load_data(k=2)
    kf = cross_validation.KFold(len(x), n_fold)
    a, p, r, f = classify(x, y, kf, n_estimator=50)
    print "precision: {}".format(p)
    print "recall: {}".format(r)
    print "f1: {}".format(f)
    print "accuracy: {}".format(a)
Example #10
    def loadText(self):
        login, password, dbname = load_data()
        self.ui.loginEdit.setText(login)
        self.ui.passwordEdit.setText(password)
        self.ui.dbEdit.setText(dbname)
        self.ui.rememberPassword.setChecked(bool(password))
        if login:
            self.ui.passwordEdit.setFocus()
Example #11
def get_visitorid():
    visitor_id = util.load_data(addon, VISITOR_FILE)
    if visitor_id is False:
        from random import randint
        visitor_id = str(randint(0, 0x7fffffff))
        util.save_data(addon, VISITOR_FILE, visitor_id)

    return visitor_id
Example #12
    def __init__(self, problem_path):
        A, b, N, block_sizes, x_true, nz, f = util.load_data(problem_path)

        self._A = A
        self._b = b
        self._U = util.U(block_sizes)
        self._x_true = x_true
        self._f = f
        self._N = N
        self._x0 = util.block_sizes_to_x0(block_sizes)
Example #13
def main():
    x, y = load_data(k=2)
    kf = cross_validation.KFold(len(x), n_fold)
    max_m = min(2500 - 1, int(len(x) * (n_fold - 1) / n_fold) - 1)
    acc_score = [[] for i in xrange(max_m)]
    p_score = [[] for i in xrange(max_m)]
    r_score = [[] for i in xrange(max_m)]
    for train, test in kf:
        print len(train)
        x_train, x_test, y_train, y_test = x[train], x[test], y[train], y[test]
        m = 1
        
        while 1: 
            print "iter: {}".format(m)
            clf = DecisionTreeClassifier(criterion='entropy', max_leaf_nodes=m + 1)
            clf.fit(x_train, y_train)
            y_pred = clf.predict(x_test)
            acc = accuracy_score(y_test, y_pred)
            acc_score[m - 1].append(acc)
            p_score[m - 1].append(precision_score(y_test, y_pred))
            r_score[m - 1].append(recall_score(y_test, y_pred))
            print 'accuracy: {}'.format(acc)
            m += 1
            if m > max_m:
                break             
#         break
    max_val, max_id = -1, -1
    for i in xrange(len(acc_score)):
        acc = np.mean(acc_score[i])
        if acc > max_val:
            max_val = acc
            max_id = i
        acc_score[i] = acc
        p_score[i] = np.mean(p_score[i])
        r_score[i] = np.mean(r_score[i])
    print acc_score[:10]
    with open('res/effect_of_leaves', 'w') as out:
        out.write(str(acc_score) + '\n')
        out.write(str(p_score) + '\n')
        out.write(str(r_score) + '\n')
    print 'splits:{}'.format(max_id + 1)
    print 'accuracy:{}'.format(max_val)
    print 'p:{}    r:{}'.format(p_score[max_id], r_score[max_id])
    
    plt.clf()
    m_idx = np.arange(2, len(acc_score) + 2)
    max_leaf = max_id + 2 
    plt.plot(m_idx, acc_score, label='cross_validation')
    plt.plot(max_leaf, max_val, linestyle='none', marker='o', markeredgecolor='r', markeredgewidth=1, markersize=12, markerfacecolor='none', label='best choice')
    plt.plot((max_leaf, max_leaf), (0, max_val), 'k--')
    plt.ylim(ymin=0.88, ymax=0.96)
    plt.xlabel("Number of leaf nodes")
    plt.ylabel("Cross validation score")
    plt.legend(numpoints=1, loc=4)
    plt.savefig('figs/effect_of_leaves.png')
Example #14
def ge_cmd_predict():
	args = parse_arg_predict()

	# prepare input to GE_learn
	data = util.load_data(args.data)
	model = util.load_model(args.model)
	pred_path = args.output

	pred = GE_predict(data, model)
	util.write_prediction(pred, pred_path)
	return
Example #15
    def setup_ts(self):
        cube, self.time, flux, radii, unc = load_data(self.setup['data_dir'],
                                                      self.aor)
        pixels = get_pix(cube, geom=self.geom)
        self.t = binned(self.time, binsize=self.bs)
        self.pix = binned(pixels, binsize=self.bs)
        i = self.select_radius(flux)
        print("using radius: {}".format(radii[i]))
        self.radius = radii[i]
        self.f = binned(flux[i], binsize=self.bs)
        self.unc = binned(unc[i], binsize=self.bs) / np.sqrt(self.bs)
        self.pld = [0] * pixels.shape[1] + [0] * 2
Example #16
def main():
    # The original data set.
    data = util.load_data()
    
    # Fill in missing values with the average for that course.
    data.fill_missing_with_feature_means()
    
    # Count successful and probation students as one group (s)
    # Comment this out to try and distinguish all 3 groups (s, p, f)
    data.combine_labels(["s", "p"], "s")

    binning_exploration(data)
    plot_tests(data)
Example #17
def main():
    # The original data set.
    data = util.load_data()
    
    # Fill in missing values with the average for that course.
    data.fill_missing_with_feature_means()
    
    # Count successful and probation students as one group (s)
    # Comment this out to try and distinguish all 3 groups (s, p, f)
    data.combine_labels(["s", "p"], "s")
    
    examine_principal_components(data)
    
    pca_find_important_features(data)
Example #18
def obfuscate_keystrokes(name, strategy, param):
    """

    """
    df = load_data(name)
    df = df.groupby(level=[0, 1]).apply(keystrokes2events).reset_index(level=[2, 3], drop=True)

    if strategy == 'delay':
        df = df.groupby(level=[0, 1]).apply(lambda x: delay_mix(x, param))
    elif strategy == 'interval':
        df = df.groupby(level=[0, 1]).apply(lambda x: interval_mix(x, param))
    else:
        raise Exception('Unknown masking strategy')

    df = df.groupby(level=[0, 1]).apply(events2keystrokes).reset_index(level=[2, 3], drop=True)
    save_data(df, name, masking=(strategy, param))
    return
Example #19
def main():
    window_size = 150
    threshold = 3000

    filename = sys.argv[1]
    data_in = load_data(filename)

    # second arg - maximum size of the window of interest
    # third arg - some threshold
    data_filtered = adaptive_window_avg(data_in, 100, 10)
    abs_data = data_abs(data_filtered)
    out_data = filtered_derivative_detector(abs_data, window_size, 0, 0)
    tline = [threshold] * len(out_data)

    plot(data_in)
    plot(data_filtered)
    plot(out_data, tline)
Example #20
def main():
    # The original data set.
    data = util.load_data()
    
    # Fill in missing values with the average for that course.
    data.fill_missing_with_feature_means()
    
    cluster_3_groups(data.copy())
    cluster_pass_fail(data.copy())
    cluster_success_struggle(data.copy())
    
    util.print_line_break()
    
    print "Now with PCA:"
    cluster_3_groups_with_pca(data.copy())
    cluster_pass_fail_with_pca(data.copy())
    cluster_success_struggle_with_pca(data.copy())
Example #21
def ge_cmd_learn():
	args = parse_arg_learn()
	
	# prepare input to GE_learn
	data = GE_data()
	data.dat = util.load_data(args.data)
	data.labeled_features = util.load_labeled_features(args.labeled_features)
	init_model = GE_model()
	param = GE_param()
	if args.l2:
		param.l2_regularization = args.l2
	final_model_path = args.model

	# print data

	final_model = GE_learn(data, init_model, param)
	util.save_model(final_model, final_model_path)
	return
Example #22
def main():
    # The original data set.
    data = util.load_data()
    
    # Fill in missing values with the average for that course.
    data.fill_missing_with_feature_means()
    
    # Count successful and probation students as one group (s)
    # Comment this out to try and distinguish all 3 groups (s, p, f)
    data.combine_labels(["s", "p"], "s")

    # Take a 50-50 split
    training, testing = data.split(0.5, using_labels=True)
    
    # Run tests for each classifier to determine the accuracy it can achieve.
    knn_accuracy_tests(training, testing)
    naive_bayes_accuracy_tests(training, testing)
    decision_tree_accuracy_tests(training, testing)
Example #23
def main():
    # The original data set.
    data = util.load_data()

    # Fill in missing values with the average for that course.
    data.fill_missing_with_feature_means()

    # Count successful and probation students as one group (s)
    # Comment this out to try and distinguish all 3 groups (s, p, f)
    data.combine_labels(["s", "p"], "s")

    num_components = recommend_num_components(data, min_pct_variance=0.95)
    pca_data = pca(data, num_components)

    training, testing = data.split(0.5, using_labels=True)
    pca_training, pca_testing = pca_data.split(0.5, using_labels=True)

    compare_knn(training, testing, pca_training, pca_testing)
    compare_naive_bayes(training, testing, pca_training, pca_testing)
Example #24
    def __init__(self, *args, **kwargs):
        wx.Frame.__init__(self, *args, **kwargs)
        shared.options.update(load_data())

        #menu setup
        self.CreateStatusBar() # A Statusbar in the bottom of the window

        # Setting up the menu.
        filemenu = wx.Menu()
        menuAbout = filemenu.Append(wx.ID_ABOUT, '&About',' Information about this program')
        menuExit = filemenu.Append(wx.ID_EXIT,'E&xit',' Terminate the program')

        # Creating the menubar.
        menuBar = wx.MenuBar()
        menuBar.Append(filemenu,'&File') # Adding the 'filemenu' to the MenuBar
        self.SetMenuBar(menuBar)  # Adding the MenuBar to the Frame content.

        # Events.
        self.Bind(wx.EVT_MENU, self.OnExit, menuExit)
        self.Bind(wx.EVT_MENU, self.OnAbout, menuAbout)

        # Here we create a panel and a notebook on the panel
        panel = wx.Panel(self)
        notebook = wx.Notebook(panel)

        # create the page windows as children of the notebook
        filepage = FilePanel(notebook)
        formatpage = FormatPanel(notebook)
        modifypage = ModifyPanel(notebook)

        # add the pages to the notebook with the label to show on the tab
        notebook.AddPage(filepage, 'Convert')
        notebook.AddPage(formatpage, 'Format')
        notebook.AddPage(modifypage, 'Modify')

        # finally, put the notebook in a sizer for the panel to manage
        # the layout
        sizer = wx.BoxSizer()
        sizer.Add(notebook, 1, wx.EXPAND)
        panel.SetSizer(sizer)
        self.SetSize(self.GetSize() + (0, 35)) # Expand to fit the PngPanel
        self.Show()
Example #25
def preprocess_villani(in_file, out_file, long_fixed_out_file):
    """
    Preprocess the raw Villani dataset and extend the long fixed dataset
    """
    df = pd.read_csv(in_file, index_col=[0, 1])

    # Make age a binary target, <30 and >=30
    df['age'] = df['agegroup'].map({
        'under20': '<30',
        '20-29': '<30',
        '30-39': '>=30',
        '40-49': '>=30',
        '50-59': '>=30',
        'over60': '>=30'}
    )

    # Ignore missing data
    df = df.dropna()
    df = remove_repeated_keys(df)

    # combine the villani fixed text with citefa dataset fixed text
    long_fixed = load_data('long_fixed')
    slf = long_fixed.groupby(level=[0, 1]).size()

    villani_fixed = df[df['inputtype'] == 'fixed']
    villani_fixed = villani_fixed.groupby(level=[0, 1]).apply(lambda x: make_sessions(x, slf.mean(), slf.std()))
    villani_fixed = villani_fixed.reset_index(level=[0, 1], drop=True)
    villani_fixed = reduce_dataset(villani_fixed, min_samples=10, max_samples=10)

    long_fixed = pd.concat([long_fixed, villani_fixed])
    long_fixed = long_fixed[COLS]
    long_fixed.to_csv(long_fixed_out_file)

    # Free-text input only
    villani_free = df[df['inputtype'] == 'free']
    villani_free = villani_free.groupby(level=[0, 1]).apply(lambda x: make_sessions(x, slf.mean(), slf.std()))
    villani_free = villani_free.reset_index(level=[0, 1], drop=True)

    villani_free = reduce_dataset(villani_free, min_samples=10, max_samples=10)
    villani_free = villani_free[COLS]
    villani_free.to_csv(out_file)
    return
Example #26
def main():
    # The original data set.
    data = util.load_data()
    
    # Fill in missing values with the average for that course.
    data.fill_missing_with_feature_means()
    
    # Count successful and probation students as one group (s)
    # Comment this out to try and distinguish all 3 groups (s, p, f)
    data.combine_labels(["s", "p"], "s")
    
    util.print_line_break()
    print "Without PCA: %.5f" % get_knn_accuracy(data)
    
    util.print_line_break()
    print "With PCA:"
    print "\t".join(["PCs", "Accuracy"])
    for num_components in range(1, data.num_features()):
        accuracy = get_knn_accuracy(pca(data, num_components))
        print "%d\t%.5f" % (num_components, accuracy)
Example #27
def describe(name):
    """
    Describe the dataset
    """
    df = load_data(name)
    s = df.groupby(level=[0, 1]).size()
    print('Dataset               :', name)
    print('Users                 :', len(s.groupby(level=0)))
    # The value expression here was masked ('******') in the source; a
    # plausible reconstruction is the mean number of sessions per user.
    print('Sessions/user         :', s.groupby(level=0).size().mean())
    print('Sample size           :', s.mean(), '+/-', s.std())
    print('Mean pp interval (ms) :',
          df.groupby(level=[0, 1]).apply(lambda x: x['timepress'].diff().dropna().mean()).mean())
    print('Mean duration (ms)    :',
          df.groupby(level=[0, 1]).apply(lambda x: (x['timerelease'] - x['timepress']).mean()).mean())

    for target in TARGETS[1:]:
        s = df.reset_index().groupby([target, 'session']).size().groupby(level=0).size()
        print(target)
        print(s / s.sum())
    return
Example #28
File: rf.py  Project: harrylclc/ist557
def main():
    x, y = load_data(k=2)
    kf = cross_validation.KFold(len(x), n_fold)
    if performance:
        for criterion in criteria:
            print 'criterion: {}'.format(criterion)
            a, p, r, f = classify(x, y, kf, criterion=criterion, n_estimator=500)
            print 'precision: {}'.format(p)
            print "recall: {}".format(r)
            print "f1: {}".format(f)
            print "accuracy: {}".format(a)
    if relation:
        res = []
        for k in xrange(1, 50 + 1):
            print 'num of trees:{}'.format(k * 10)
            a, p, r, f = classify(x, y, kf, criterion='entropy', n_estimator=k * 10)
            print a, p, r, f
            res.append((a, p, r, f))
        with open('res/rf_trees', 'w') as out:
            for v in res:
                out.write('{},{},{},{}\n'.format(v[0], v[1], v[2], v[3]))
Example #29
File: dt.py  Project: harrylclc/ist557
def main():
    x, y = load_data(k=2)
    if evaluation:
        kf = cross_validation.KFold(len(x), n_fold)
        for criterion in criteria:
            print 'criterion: {}'.format(criterion)
            acc, prec, recall, node_cnt = [], [], [], []
            clf = DecisionTreeClassifier(criterion=criterion)
            for train, test in kf:
                x_train, x_test, y_train, y_test = x[train], x[test], y[train], y[test]
                clf.fit(x_train, y_train)
                node_cnt.append(clf.tree_.node_count)
                y_pred = clf.predict(x_test)
                acc.append(accuracy_score(y_test, y_pred))
                prec.append(precision_score(y_test, y_pred))
                recall.append(recall_score(y_test, y_pred))
            a = np.mean(acc)
            p = np.mean(prec)
            r = np.mean(recall)
            f = 2 * p * r / (p + r)
            print 'precision: {}'.format(p)
            print "recall: {}".format(r)
            print "f1: {}".format(f)
            print "accuracy: {}".format(a)
            print "nodes: {}".format(np.mean(node_cnt))
    
    if plot:
        from sklearn.externals.six import StringIO
        from sklearn import tree
        import pydot
        clf = DecisionTreeClassifier(criterion='entropy', max_leaf_nodes=41)
        clf.fit(x, y)
        print clf.tree_.max_depth
        print clf.tree_.node_count
        dot_data = StringIO()
        tree.export_graphviz(clf, out_file=dot_data)
        graph = pydot.graph_from_dot_data(dot_data.getvalue()) 
        graph.write_pdf("figs/test.pdf") 
Example #30
def grid_search():
    comments, labels = load_data()
    param_grid = dict(logr__C=np.arange(1, 20, 5))
    clf = build_nltk_model()

    cv = ShuffleSplit(len(comments), n_iterations=10, test_size=0.2)
    grid = GridSearchCV(clf, cv=cv, param_grid=param_grid, verbose=4,
            n_jobs=12, score_func=auc_score)
    grid.fit(comments, labels)
    print(grid.best_score_)
    print(grid.best_params_)

    tracer()
    cv_scores = grid.scores_
    for param in cv_scores.params:
        means, errors = cv_scores.accumulated(param, 'max')
        plt.errorbar(cv_scores.values[param], means, yerr=errors)
        plt.xlabel(param)
        plt.ylim((0.85, 0.93))
        plt.savefig("grid_plot_%s.png" % param)
        plt.close()
    comments_test, dates_test = load_test()
    prob_pred = grid.best_estimator_.predict_proba(comments_test)
    write_test(prob_pred[:, 1])
Example #31
import numpy as np
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from util import load_data

################################################################################

# Classification data
target_names = ['Female', 'Male']

dir_label = [['badeer-r', 1], ['benson-r', 1], ['blair-l', 0], ['cash-m', 0],
             ['corman-s', 1], ['hain-m', 1]]

dataset = load_data(dir_label)

X = np.array(dataset[0])
y = dataset[1]

# Sci-Kit Learn Naive Bayes Classifiers
# Train/Test split model
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=.30,
                                                    random_state=50)

# Gaussian
gauss = GaussianNB().fit(X_train, y_train)
y_pred_gauss = gauss.predict(X_test)
acc = metrics.accuracy_score(y_test, y_pred_gauss)
Example #32
File: main.py  Project: salhasalman/prnn
def CV(args):
    '''
    k-fold Cross-Validation
    :param args: model arguments
    '''

    # loading model parameters
    MAX_SEQUENCE_LENGTH = args['ms']
    embeddings_index = util.load_embedding('glove.6B.100d.txt')
    EMBEDDING_DIM = 100
    drops = args['do']
    batch = args['bs']
    hidden = args['hs']
    n_folds = args['nf']
    epochNo = args['ep']
    ds_id = args['ds']
    verbose = args['vb']

    # loading data

    data, labels, word_idx, id_all = util.load_data(dataset_id=ds_id,isonefile=False,
                                                     MAX_SEQUENCE_LENGTH=MAX_SEQUENCE_LENGTH)
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=None)

    # saving bins info
    i = 0
    f_bin = open(ds_id+'.interbins', 'w')
    f_bin.close()
    f_train = open(ds_id+'.interout_train', 'wt')
    f_train.close()
    f_test = open(ds_id+'.interout_test', 'wt')
    f_test.close()
    id_all = np.array(id_all)

    # Cross-Validation
    avg_acc_train, avg_acc_test, avg_error_train, avg_error_test = 0, 0, 0, 0
    for train_index, test_index in skf.split(np.zeros(len(labels)), labels):
        f_bin = open(ds_id+'.interbins', 'a')
        np.savetxt(f_bin, [id_all[train_index]], fmt='%s')
        np.savetxt(f_bin, [id_all[test_index]], fmt ='%s')
        f_bin.close()
        print("size of train index ", len(train_index))
        print("size of test index ", len(test_index))
        print ("Running Fold %d/%d " % (i+1, n_folds))
        my_model = None  # Clearing the NN.
        my_model ,inter_model = model.model(drop=drops, hidden_units=hidden, word_index=word_idx,
                                            embedding_index=embeddings_index, EMBEDDING_DIM=EMBEDDING_DIM,
                                            MAX_SEQUENCE_LENGTH=MAX_SEQUENCE_LENGTH)
        [data_l, data_r] = data
        my_data_train = [data_l[train_index], data_r[train_index]]
        my_data_test = [data_l[test_index], data_r[test_index]]
        labels = np.asarray(labels)
        [loss_train, acc_train], [loss_test, acc_test],[inter_out_train,inter_out_test] = train_and_evaluate_model\
            (my_model, my_data_train,  labels[train_index], my_data_test,  labels[test_index], epochNo, inter_model,
             batch, verbose)

        a_train = labels[train_index].reshape(labels[train_index].shape[0],-1)
        a_test = labels[test_index].reshape(labels[test_index].shape[0],-1)
        print (inter_out_train.shape, a_train.shape)
        inter_out_train = np.concatenate((inter_out_train,a_train),axis=1)
        inter_out_test = np.concatenate((inter_out_test,a_test),axis=1)

        # updating bins info
        f_train = open(ds_id+'.interout_train', 'at')
        np.savetxt(f_train, inter_out_train)
        f_train.close()
        f_test = open(ds_id+'.interout_test', 'at')
        np.savetxt(f_test, inter_out_test)
        f_test.close()

        #  results
        avg_acc_train += acc_train
        avg_acc_test += acc_test
        avg_error_train += loss_train
        avg_error_test += loss_test
        i += 1
    print ("avg acc train , test :", avg_acc_train/n_folds, avg_acc_test/n_folds)
    print ("avg error train , test :", avg_error_train/n_folds, avg_error_test/n_folds)
Example #33
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn

from util import load_data
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.decomposition import TruncatedSVD

DATA_PATH = '../data/train.csv'
X, y = load_data(DATA_PATH)
df = pd.read_csv(DATA_PATH)

svd = TruncatedSVD(n_components=50, n_iter=10)
X_selected = svd.fit_transform(X)
var_exp = svd.explained_variance_ratio_
cum_var_exp = np.cumsum(var_exp)

with plt.style.context('seaborn-whitegrid'):
    plt.bar(range(50),
            var_exp,
            alpha=0.5,
            align='center',
            label='individual explained variance')
    plt.step(range(50),
             cum_var_exp,
             where='mid',
             label='cumulative explained variance')
    plt.ylabel('Explained variance ratio')
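    # The source snippet is cut off here; a figure like this would typically
    # close with (assumed finishing touches):
    plt.xlabel('Principal components')
    plt.legend(loc='best')
    plt.show()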
Example #34
import util

depth = 1

data, meta = util.load_data()

print("Training random forest model on %s (%i examples) with depth %i" %
      (meta["type"], len(data), depth))
print("acc: 0.9")
Example #35
'''
(linear) ridge regression algorithm for classification (i.e. use 0/1 error for evaluation)
For Q9, Q10
'''

import numpy as np
import util
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import zero_one_loss
import matplotlib.pylab as plt

# Load data and parsing
data = util.load_data("hw2_lssvm_all.dat.txt")
X, y = util.preprocessing(data)

# add x0 = 1
X = np.insert(X, 0, 1, axis=1)
print(X)

# test parameter λ = {0.05, 0.5, 5, 50, 500}

lambdas = [0.05, 0.5, 5, 50, 500]

# fit linear ridge regression

E_in = np.zeros(5)
E_out = np.zeros(5)
for it, lb in enumerate(lambdas):
    print(">>>>> λ = {} >>>".format(lb))
    clf = RidgeClassifier(alpha=lb)  # alpha is the regularization strength
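    # The source snippet is cut off here; a hedged completion, assuming the
    # conventional split for this dataset (first 400 rows train, rest test):
    clf.fit(X[:400], y[:400])
    E_in[it] = zero_one_loss(y[:400], clf.predict(X[:400]))
    E_out[it] = zero_one_loss(y[400:], clf.predict(X[400:]))
    print("E_in = {:.4f}, E_out = {:.4f}".format(E_in[it], E_out[it]))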
Example #36
def part_two():
    data = load_data('Data/day09.txt', data_type=int)
    xmas = Xmas(preamble_size=25, data=data)
    target = xmas.find_invalid_number()
    result = xmas.find_vulnerability(data=data, target=target)
    print(f"Part two returns: {result}")
Example #37
    # (the snippet begins mid-statement in the source; a save_dir assignment
    # is reconstructed here from how save_dir is used below)
    save_dir = os.path.join(
        statistics_dir, cmd_args.data + "_" + cmd_args.gm + "_" +
        str(cmd_args.learning_rate) + "_" + str(cmd_args.sortpooling_k) + "_" +
        str(cmd_args.out_dim) + "_" + str(cmd_args.hidden))
    if os.path.exists(save_dir):
        shutil.rmtree(save_dir)
    else:
        os.makedirs(save_dir)
    model_dir = os.path.join(save_dir, "models")
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    results_dir = os.path.join(save_dir, "results")
    if not os.path.exists(results_dir):
        os.makedirs(results_dir)
    shuffle_dir = os.path.join(cur_dir, "shuffle_idx")

    graphs = load_data()

    if cmd_args.sortpooling_k <= 1:
        num_nodes_list = sorted([g.num_nodes for g in graphs])
        cmd_args.sortpooling_k = num_nodes_list[
            int(math.ceil(cmd_args.sortpooling_k * len(num_nodes_list))) - 1]
        print('k used in SortPooling is: ' + str(cmd_args.sortpooling_k))

    skf = StratifiedKFold(n_splits=10)

    for shuffle_idx in range(1, 11):
        parameters_save = []
        random_idx = [
            int(idx) for idx in ud.load_list_from_file(shuffle_dir + '/' +
                                                       cmd_args.data + "_" +
                                                       str(shuffle_idx))
Example #38
# coding: utf-8

import torch
import matplotlib.pyplot as plt
import numpy as np

from model import CNN
from util import load_data

classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

if __name__ == "__main__":
    
    train_loader, test_loader = load_data()
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    correct = 0
    total = 0

    net = CNN()
    net.load_state_dict(torch.load('model_data/model1.pth'))

    net.eval()  # disable dropout/batch-norm updates for evaluation
    with torch.no_grad():
        for data in test_loader:
            images, labels = data
            outputs = net(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    print('Accuracy of the network on the 10000 test images: %d %%' % (100 * correct / total))
Example #39
def part_one():
    data = load_data('Data/day09.txt', data_type=int)
    xmas = Xmas(preamble_size=25, data=data)
    result = xmas.find_invalid_number()
    print(f"Part one returns: {result}")
Example #40
                             'consensus_distance': consensus_distance,
                             'Sbins': Sbins,
                             'Sbinc': Sbinc}

            # get the allele frequency histograms for mutations away and towards consensus
            if params.type == 'nuc':
                (data['to_histogram'][subtype],
                 data['away_histogram'][subtype]) = get_toaway_histograms(subtype,
                                                                          Sc=10,
                                                                          refname=params.reference)
            else:
                (data['to_histogram'][subtype],
                 data['away_histogram'][subtype]) = get_toaway_histograms_aminoacids(subtype,
                                                                          Sc=10,
                                                                          refname=params.reference)

            data['time_bins'] = time_bins
            data['af_bins'] = af_bins

        store_data(data, fn_data)
    else:
        print "Loading data from file"
        data = load_data(fn_data)

    fig_filename = foldername+'to_away'
    if params.reference != 'HXB2':
        fig_filename = fig_filename + '_'+params.reference
    if params.type == 'aa':
        fig_filename = fig_filename + '_aa'
    plot_to_away(data, fig_filename=fig_filename, sequence_type=params.type)
Example #41
from util import plt, np, load_data, grad_check_sparse, time_elapse
from softmax import softmax_loss_vectorized
from linear_classifier import Softmax

cifar_dir = '../cifar-10-batches-py'
X_train, y_train, X_val, y_val, X_test, y_test, X_dev, y_dev = load_data(
    cifar_dir, num_test=500)

# initialize W
W = np.random.randn(3073, 10) * 0.0001

# test loss
loss, grad = softmax_loss_vectorized(W, X_dev, y_dev, 0.0)
#print('loss: %f' % loss)
#print('sanity check: %f' % (-np.log(0.1)))

# test gradient without regularization
#def f(w): return softmax_loss_vectorized(W, X_dev, y_dev, 0.0)[0]
#grad_numerical = grad_check_sparse(f, W, grad, 10)

# test gradient with regularization
#def f(w): return softmax_loss_vectorized(W, X_dev, y_dev, 1e2)[0]
#grad_numerical = grad_check_sparse(f, W, grad, 10)

softmax = Softmax()
loss_history = softmax.train(X_train,
                             y_train,
                             learning_rate=1e-7,
                             reg=5e4,
                             num_iters=1500,
                             verbose=True)
Example #42
import numpy as np
import util
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.metrics import accuracy_score
import pickle

DATA_PATH = 'data/data_total.csv'
KNN_MODEL_FILE = 'knn_new.model'
SVC_MODEL_FILE = 'svc_new.model'

if __name__ == '__main__':
    test_data, test_labels, train_data, train_labels, label_to_xyz, selected_features = util.load_data(
        DATA_PATH)
    print(train_data.shape)

    clf = svm.LinearSVC()
    knn = KNeighborsClassifier(n_neighbors=7)
    clf.fit(train_data, train_labels)
    knn.fit(train_data, train_labels)

    pickle.dump([clf, label_to_xyz, selected_features],
                open(SVC_MODEL_FILE, 'wb'))
    [clf, label_to_xyz,
     selected_features] = pickle.load(open(SVC_MODEL_FILE, 'rb'))
    pickle.dump([knn, label_to_xyz, selected_features],
                open(KNN_MODEL_FILE, 'wb'))
    [knn, label_to_xyz,
     selected_features] = pickle.load(open(KNN_MODEL_FILE, 'rb'))

    pred_labels = clf.predict(test_data)
Example #43
    help='initial gru bias for r & z. higher => more like SimpleRnn')
opts = parser.parse_args()
print >> sys.stderr, opts

NUM_LABELS = 3


def log(s):
    print >> sys.stderr, util.dts(), s


# slurp training data, including converting of tokens -> ids
vocab = Vocab()
train_x, train_y, train_stats = util.load_data(opts.train_set,
                                               vocab,
                                               update_vocab=True,
                                               max_egs=int(
                                                   opts.num_from_train))
log("train_stats %s %s" % (len(train_x), train_stats))
dev_x, dev_y, dev_stats = util.load_data(opts.dev_set,
                                         vocab,
                                         update_vocab=False,
                                         max_egs=int(opts.num_from_dev))
log("dev_stats %s %s" % (len(dev_x), dev_stats))

# input/output example vars
s1_idxs = T.ivector('s1')  # sequence for sentence one
s2_idxs = T.ivector('s2')  # sequence for sentence two
actual_y = T.ivector('y')  # single for sentence pair label; 0, 1 or 2

# keep track of different "layers" that handle their own gradients.
Example #44
def run_train(args):
    all_result = {}
    model_result = []
    for filename in args.input_file:
        print("=================================")
        print("== Loading data ... ")
        print("=================================")
        option = {}
        if args.group is not None:
            option["group"] = args.group
        x, y, opt, h, index = load_data(
            filename,
            ans_col=args.answer,
            ignore_col=args.ignore,
            header=args.header,
            cat_col=args.categorical,
            option=option,
        )
        g = None
        if args.group is not None or "group" in opt:
            if "group_type" in opt:
                if opt["group_type"] != "int":
                    print("group remapping")
                    g = []
                    mapping_g = {}
                    for g_name in opt["group"]:
                        if g_name not in mapping_g:
                            mapping_g[g_name] = len(mapping_g)
                        g.append(mapping_g[g_name])
                    g = np.array(g, dtype=np.int32)
                else:
                    g = np.array(opt["group"], dtype=np.int32)
        if args.data_sample is not None:
            x, y, g = resample(x, y, g, n_samples=args.data_sample)
        ## Impute missing values with the column mean
        m = np.nanmean(x, axis=0)
        if h is not None:
            h = np.array(h)[~np.isnan(m)]
        imr = SimpleImputer(missing_values=np.nan, strategy="mean")
        x = imr.fit_transform(x)
        print("x:", x.shape)
        print("y:", y.shape)
        ## Standardize the features
        sc = StandardScaler()
        x = sc.fit_transform(x)

        print("x:", x.shape)
        print("y:", y.shape)
        if g is not None:
            print("g:", g.shape)
            print("grouping enabled:", g.shape)
        ## Decide from the data whether the task is binary or multiclass
        if args.task == "auto":
            if len(np.unique(y)) == 2:
                args.task = "binary"
            else:
                args.task = "multiclass"
        if args.task != "regression":
            y = y.astype(dtype=np.int64)

        ##
        ## Run cross-validation in parallel
        ##
        print("=================================")
        print("== Starting cross-validation ... ")
        print("=================================")
        if g is not None:
            kf = sklearn.model_selection.GroupKFold(n_splits=args.splits)
            pool = Pool(processes=args.splits)
            results = pool.map(train_cv_one_fold, [(x, y, h, s, g, args)
                                                   for s in kf.split(x, y, g)])
        else:
            kf = sklearn.model_selection.KFold(n_splits=args.splits,
                                               shuffle=True)
            pool = Pool(processes=args.splits)
            results = pool.map(train_cv_one_fold,
                               [(x, y, h, s, args) for s in kf.split(x)])

        ##
        ## Aggregate the cross-validation results:
        ## compute the mean and standard deviation of each metric
        ##
        cv_result = {"cv": [r[0] for r in results]}
        model_result.append([r[1] for r in results])
        print("=================================")
        print("== Evaluation ... ")
        print("=================================")
        if args.task == "regression":
            score_names = ["r2", "mse"]
        else:
            score_names = ["accuracy", "f1", "precision", "recall", "auc"]
        for score_name in score_names:
            scores = [r[0][score_name] for r in results]
            test_mean = np.nanmean(np.asarray(scores))
            test_std = np.nanstd(np.asarray(scores))
            print("Mean %10s on test set: %3f (standard deviation: %3s)" %
                  (score_name, test_mean, test_std))
            cv_result[score_name + "_mean"] = test_mean
            cv_result[score_name + "_std"] = test_std
        ##
        ## Overall evaluation
        ##
        test_y = []
        pred_y = []
        for result in cv_result["cv"]:
            test_y.extend(result["test_y"])
            pred_y.extend(result["pred_y"])
        if args.task != "regression":
            conf = sklearn.metrics.confusion_matrix(test_y, pred_y)
            cv_result["confusion"] = conf
        cv_result["task"] = args.task
        cv_result["index"] = index
        ##
        ## Store the results in a dictionary and return it
        ##
        all_result[filename] = cv_result
    return all_result, model_result
Example #45
File: main.py  Project: zwytop/DEMO-Net
                    vlss_early_model = loss_value_val
                vacc_max = np.max((acc_val, vacc_max))
                vloss_min = np.min((loss_value_val, vloss_min))
                curr_step = 0
            else:
                curr_step += 1
                if curr_step == args.patience:
                    print('Early stop! Min loss: ', vloss_min,
                          ', Max accuracy: ', vacc_max)
                    print('Early stop model validation loss: ',
                          vlss_early_model, ', accuracy: ', vacc_early_model)
                    break

        test_feed_dict = {}
        test_feed_dict.update({placeholders['labels']: y_test})
        test_feed_dict.update({placeholders['features']: features})
        test_feed_dict.update({placeholders['dropout']: 0.0})
        test_feed_dict.update({placeholders['masks']: test_mask})
        loss_value_test, acc_test = sess.run([loss, accuracy],
                                             feed_dict=test_feed_dict)
        print('Test loss:', loss_value_test, '; Test accuracy:', acc_test)
        sess.close()


if __name__ == '__main__':
    time_stamp = strftime('%Y_%m_%d_%H_%M_%S', localtime())
    print("The time of running the codes: ", time_stamp)
    args = parse_args()
    data = load_data(args.dataset)
    train(args, data)
Example #46
def tensor_from_sentence(lang, sentence):
    indexes = indexes_from_sentence(lang, sentence)
    indexes.append(Lang.EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)


def tensors_from_pair(input_lang, output_lang, pair):
    input_tensor = tensor_from_sentence(input_lang, pair[0])
    target_tensor = tensor_from_sentence(output_lang, pair[1])
    return (input_tensor, target_tensor)


if __name__ == "__main__":
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    input_lang, output_lang, pairs = load_data()
    hidden_size = 256

    encoder = EncoderRNN(input_lang.n_words, hidden_size).to(device)
    decoder = DecoderRNN(hidden_size, output_lang.n_words).to(device)

    criterion = nn.NLLLoss()
    encoder_optimizer = optim.SGD(encoder.parameters(), lr=0.01)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=0.01)

    training_pairs = [
        tensors_from_pair(input_lang, output_lang, random.choice(pairs))
        for i in range(75000)
    ]

    # epoch 75000
Example #47
		else:
			parsed_args.languages = args['languages']
		args['train'] = False
		args['path'] = parsed_args.path
		args['source'] = parsed_args.source
		args['target'] = parsed_args.target
		args['test'] = parsed_args.test
		args['store_test'] = parsed_args.store_test
		args['t'] = parsed_args.t
	for language in args['languages']:
		wordemb_path = args['wordemb_path']+'%s.pkl' %  language
		wvec, vocab = load_word_vectors(language, wordemb_path)
		if parsed_args.train:
			train_path = args['data_path']+'/train/%s.json' %  language
			dev_path = args['data_path']+'/dev/%s.json' %  language
			x_ids, y_ids, cur_labels = load_data(path=train_path)
			xv_ids, yv_ids, cur_labels = load_data( path=dev_path)
			print "\tX_train (80%)"+": %d" % len(x_ids)
			print "\tX_val (10%)"+": %d" % len(xv_ids)
			X_ids.append(np.array(x_ids));Y_ids.append(np.array(y_ids))
			XV_ids.append(np.array(xv_ids));YV_ids.append(np.array(yv_ids))
		elif parsed_args.test or parsed_args.store_test:
			test_path = args['data_path']+'/test/%s.json' %  language
			xt_ids, yt_ids, cur_labels = load_data(path=test_path)
			print "\tX_test (10%)"+": %d" % len(xt_ids)
			if parsed_args.store_test:
				max_num = parsed_args.max_num
				XT_ids.append(np.array(xt_ids)[:max_num]); YT_ids.append(np.array(yt_ids)[:max_num])
			else:
				XT_ids.append(np.array(xt_ids)); YT_ids.append(np.array(yt_ids))
		print "\t|V|: %d, |Y|: %d" % (len(vocab[language]),len(cur_labels))
Example #48
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-train',
                        action='store_true',
                        default=False,
                        help="train flag")
    parser.add_argument('-eval',
                        action='store_true',
                        default=False,
                        help="evaluate flag")
    parser.add_argument('-pred',
                        action='store_true',
                        default=False,
                        help="predict flag")
    parser.add_argument('-w',
                        action='store_true',
                        default=False,
                        help="load weights flag")
    parser.add_argument('-c', help="training: coarse dir")
    parser.add_argument('-f', help="training: fine scale with track dir")
    parser.add_argument('-tc', help="test dataset: coarse dir")
    parser.add_argument('-tf', help="test dataset: fine scale with track dir")
    parser.add_argument('-x', help="predict input dataset dir")
    parser.add_argument('-o', help="predict output dir")
    parser.add_argument('-l', help="learning rate")
    parser.add_argument('-e', help="epochs")
    parser.add_argument("-resume", help="bool flag, False by default")
    parser.add_argument("-modelh5", help="load exist model")
    parser.add_argument("-modelweighth5", help="load model weights")
    args = parser.parse_args()
    if len(sys.argv) < 4:
        print "Usage: --train=True -l=learning_rate -e=epochs -c=... -f=... --eval=False --pred=True option* --> use --help"
        return 0

    coarseDir = None
    fineDir = None
    test_coarseDir = None
    test_fineDir = None
    pred_dir = None
    out_dir = None

    if args.train:
        learning_rate = float(args.l)
        epochs = int(args.e)
        coarseDir = args.c
        fineDir = args.f
        print "training dataset: "
        print ">>>  " + str(coarseDir) + "  >>>  " + str(fineDir)
    if args.eval:
        test_coarseDir = args.tc
        test_fineDir = args.tf
        print "evaluate dataset: "
        print ">>>  " + str(test_coarseDir) + "  >>>  " + str(test_fineDir)
    if args.pred:
        pred_dir = args.x
        out_dir = args.o
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        print "predict: "
        print ">>>  " + str(pred_dir) + "  >>>  " + str(out_dir)

    sample_data = []
    if coarseDir: sdir = coarseDir
    elif test_coarseDir: sdir = test_coarseDir
    elif pred_dir: sdir = pred_dir
    file_name = sdir + [f for f in os.listdir(sdir) if not f.startswith('.')
                        ][0] + "/00001_00.obj"
    # file_name = pred_dir + "test.obj"
    dim = obj2tri(file_name, sample_data)  # [tri_dim, vert_dim]
    v_dim = dim[1]
    mtx, mtx_1 = face2mtx(file_name, dim)
    # create model
    model = setmodel(dim, mtx, mtx_1)

    ##load predefined weights
    load_weights = args.w
    if load_weights:
        alpha = 1.0
        beta = 0.5
        a1 = [alpha, 0.0, 0.0, beta, 0.0, 0.0, beta, 0.0, 0.0]
        a2 = [0.0, alpha, 0.0, 0.0, beta, 0.0, 0.0, beta, 0.0]
        a3 = [0.0, 0.0, alpha, 0.0, 0.0, beta, 0.0, 0.0, beta]
        a4 = [beta, 0.0, 0.0, alpha, 0.0, 0.0, beta, 0.0, 0.0]
        a5 = [0.0, beta, 0.0, 0.0, alpha, 0.0, 0.0, beta, 0.0]
        a6 = [0.0, 0.0, beta, 0.0, 0.0, alpha, 0.0, 0.0, beta]
        a7 = [beta, 0.0, 0.0, beta, 0.0, 0.0, alpha, 0.0, 0.0]
        a8 = [0.0, beta, 0.0, 0.0, beta, 0.0, 0.0, alpha, 0.0]
        a9 = [0.0, 0.0, beta, 0.0, 0.0, beta, 0.0, 0.0, alpha]

        w = np.array([[a1, a2, a3, a4, a5, a6, a7, a8,
                       a9]])  # has to be 1x(9x9) dim
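        # NOTE: this hand-built symmetric matrix is immediately overwritten by
        # the hard-coded weights below, so it only documents the expected shape.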

        w = np.array([[[
            -0.0358, -0.0896, -0.0222, 0.0345, -0.0198, -0.0242, -0.0577,
            0.0466, -0.044
        ],
                       [
                           0.0369, 0.0963, -0.0193, 0.0888, -0.0208, -0.0687,
                           -0.0288, -0.0076, 0.0463
                       ],
                       [
                           -0.0098, 0.0295, -0.0726, 0.0491, 0.0215, -0.0231,
                           0.0533, 0.0355, 0.0101
                       ],
                       [
                           0.0993, 0.0233, -0.034, -0.0268, 0.014, 0.0581,
                           -0.0794, -0.0376, 0.0361
                       ],
                       [
                           0.047, 0.0036, -0.0083, -0.0519, -0.0065, -0.0106,
                           0.032, -0.013, -0.016
                       ],
                       [
                           -0.0321, -0.0622, 0.0714, -0.0885, -0.0279, -0.0009,
                           0.0293, -0.0219, -0.0361
                       ],
                       [
                           -0.0441, 0.0593, 0.0486, 0.0189, -0.0226, 0.0179,
                           0.0712, 0.0213, -0.0723
                       ],
                       [
                           -0.0729, -0.0937, 0.036, -0.0693, 0.0113, 0.0663,
                           0.0165, 0.0255, -0.012
                       ],
                       [
                           0.0262, -0.0108, -0.0177, -0.0069, 0.0036, 0.0014,
                           -0.0144, 0.0373, -0.0357
                       ]]],
                     dtype=np.float32)

        print ">>> predefined weights: "
        print w
        model.layers[1].set_weights(w)

    if args.train:
        x_train = np.empty(0)
        y_train = np.empty(0)
        x_test = np.empty(0)
        y_test = np.empty(0)

        print ">>>>>>> loading data..."
        for dirName, subdirList, fileList in os.walk(coarseDir):
            total = len(subdirList)
            count = 0
            for subdir in subdirList:
                # print('Found directory: %s' % subdir)
                if count % 5 == 0:
                    print str(float(count) / total * 100) + '%'
                count = count + 1
                x, y = load_data(coarseDir + subdir, fineDir + subdir)
                if x_train.size == 0:
                    x_train = x
                    y_train = y
                else:
                    x_train = np.vstack((x_train, x))
                    y_train = np.vstack((y_train, y))

        if x_train.size == 0:
            print "Error: no input training data."
            return 0

        train(model, x_train, y_train, learning_rate, epochs)

    if args.eval:
        print 'load test data to evaluate...'
        x_test = np.empty(0)  # initialized here as well, since -train may be off
        y_test = np.empty(0)
        for dirName, subdirList, fileList in os.walk(test_coarseDir):
            for subdir in subdirList:
                print('Found directory: %s' % subdir)
                x, y = load_data(test_coarseDir + subdir,
                                 test_fineDir + subdir)
                if x_test.size == 0:
                    x_test = x
                    y_test = y
                else:
                    x_test = np.vstack((x_test, x))
                    y_test = np.vstack((y_test, y))

        if x_test.size == 0:
            print "Error: Need test dataset."
            return 0

        eval(model, x_test, y_test)

    print ">>> weights: >>>> "
    weights = model.layers[1].get_weights()
    w1 = np.array(weights).astype(np.float32)
    np.set_printoptions(suppress=True)
    np.set_printoptions(precision=4)
    print weights

    ## predict and save output to obj
    if args.pred:
        for dirName, subdirList, fileList in os.walk(pred_dir):
            for subdir in subdirList:
                newpath = out_dir + subdir
                print newpath
                if not os.path.exists(newpath):
                    os.makedirs(newpath)
                obj_in = pred_dir + subdir + '/00001_00.obj'
                batch_coarse = []
                for dirpath, dirnames, filenames in os.walk(pred_dir + subdir):
                    for x in xrange(1, 101):
                        file_name = str(x).zfill(5) + '_00.obj'
                        obj2tri(pred_dir + subdir + '/' + file_name,
                                batch_coarse)

                x = np.array(batch_coarse)
                # print "predict input: \n >>>>  "
                # print x.shape
                pred(model, x, v_dim, obj_in, out_dir + subdir + '/')

    # ============= test ==============
    # obj_in = pred_dir + "test.obj"
    # batch_coarse = []
    # obj2tri(obj_in, batch_coarse)

    # x = np.array(batch_coarse)
    # pred(model, x, v_dim, obj_in, out_dir)
    # ============= test ==============

    save(model)
Example #49
import util
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from baseline import BaselinePredictor
from sklearn.svm import SVC

data = util.load_data()

preprocessed_data = util.preprocess_data(data)

X, Y = util.splitFeaturesAndLabel(preprocessed_data, 'Empathy')

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42)

baseline_predictor = BaselinePredictor()

baseline_preds = util.trainAndPredict(X_train, Y_train, baseline_predictor,
                                      X_test)

print("Baseline accuracy and classification report")

util.printAccuracyAndClassficationReport(baseline_preds,
                                         Y_test,
                                         classes=['1', '2', '3', '4', '5'])

X_train, X_test = util.getBestFeatures(X_train, Y_train, X_test)

model = SVC(kernel='rbf')

params = {
    'C': [i for i in range(1, 11)],
示例#50
0
def main():
    from args import args
    # parser = argparse.ArgumentParser()
    # parser.add_argument('--model', required=True)
    # parser.add_argument('--train', required=True)
    # parser.add_argument('--dev', required=True)
    # args.load_model_dir = parser.parse_args().model
    # args.ent_train_dir = parser.parse_args().train
    # args.ent_dev_dir = parser.parse_args().dev
    args.load_model_dir = '/scratch0/shifeng/rawr/drqa/original.pt'
    args.ent_train_dir = 'results/20180217T172242.135276/train.pkl'
    args.ent_dev_dir = 'pkls/original.rawr.dev.pkl'
    args.other_train_dir = 'results/targeted_train_all.pkl'
    out_dir = prepare_output_dir(args, '/scratch0/shifeng/rawr/drqa/')

    log = logging.getLogger(__name__)
    log.setLevel(logging.DEBUG)
    fh = logging.FileHandler(os.path.join(out_dir, 'output.log'))
    fh.setLevel(logging.DEBUG)
    ch = logging.StreamHandler(sys.stdout)
    ch.setLevel(logging.INFO)
    formatter = logging.Formatter(fmt='%(asctime)s %(message)s',
                                  datefmt='%m/%d/%Y %I:%M:%S')
    fh.setFormatter(formatter)
    ch.setFormatter(formatter)
    log.addHandler(fh)
    log.addHandler(ch)
    log.info('===== {} ====='.format(out_dir))

    with open(os.path.join(out_dir, 'args.pkl'), 'wb') as f:
        pickle.dump(args, f)

    random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)

    log.info('loading regular data from {}'.format(args.data_file))
    train_reg, dev_reg, dev_y, embedding, opt = load_data(args)
    log.info('{} regular training examples'.format(len(train_reg)))
    log.info('{} regular dev examples'.format(len(dev_reg)))
    # log.info(opt)
    ''' load data for regularization '''
    log.info('loading entropy training data from {}'.format(
        args.ent_train_dir))
    with open(args.ent_train_dir, 'rb') as f:
        train_ent = pickle.load(f)
        if isinstance(train_ent, dict) and 'reduced' in train_ent:
            train_ent = train_ent['reduced']
        if isinstance(train_ent[0][0], list):
            train_ent = list(itertools.chain(*train_ent))

    # log.info('loading targeted training data from {}'.format(args.other_train_dir))
    # with open(args.other_train_dir, 'rb') as f:
    #     other_train_ent = pickle.load(f)
    #     if isinstance(other_train_ent, dict) and 'reduced' in other_train_ent:
    #         other_train_ent = other_train_ent['reduced']
    #     if isinstance(other_train_ent[0][0], list):
    #         other_train_ent = list(itertools.chain(*other_train_ent))
    # train_ent += other_train_ent

    if args.filter_long > 0:
        train_ent = [x for x in train_ent if len(x[5]) < args.filter_long]

    log.info('loading entropy dev data from {}'.format(args.ent_dev_dir))
    with open(args.ent_dev_dir, 'rb') as f:
        dev_ent = pickle.load(f)['reduced']
        if isinstance(dev_ent[0], list):
            # dev_ent = list(itertools.chain(*dev_ent))
            dev_ent = [x[0] for x in dev_ent]
        # if args.filter_long > 0:
        #     dev_ent = [x for x in dev_ent if len(x[5]) > args.filter_long]
    log.info('{} entropy training examples'.format(len(train_ent)))
    log.info('{} entropy dev examples'.format(len(dev_ent)))

    log.info('loading model from {}'.format(args.load_model_dir))
    checkpoint = torch.load(args.load_model_dir)
    # opt = checkpoint['config']
    state_dict = checkpoint['state_dict']
    model = DocReaderModel(vars(opt), embedding, state_dict)
    model.cuda()
    ''' initial evaluation '''
    dev_reg_batches = BatchGen(dev_reg,
                               batch_size=args.batch_size,
                               pos_size=args.pos_size,
                               ner_size=args.ner_size,
                               evaluation=True,
                               gpu=args.cuda)
    dev_ent_batches = BatchGen(dev_ent,
                               batch_size=args.batch_size,
                               pos_size=args.pos_size,
                               ner_size=args.ner_size,
                               evaluation=True,
                               gpu=args.cuda)
    predictions = []
    for batch in dev_reg_batches:
        predictions.extend(model.predict(batch))
    em, f1 = score(predictions, dev_y)
    ents, predictions_r = [], []
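    # ss/se are the batch's predicted start/end span distributions;
    # scipy.stats.entropy over the transpose yields one entropy per example,
    # so the sums give the total batch entropy of both distributions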
    for batch in dev_ent_batches:
        p, _, ss, se, _, _ = model.predict(batch, get_all=True)
        ss = ss.cpu().numpy()
        se = se.cpu().numpy()
        ents.append(scipy.stats.entropy(ss.T).sum() + \
                    scipy.stats.entropy(se.T).sum())
        predictions_r.extend(p)
    ent = sum(ents) / len(ents)
    em_r, f1_r = score(predictions_r, dev_y)
    log.info("[dev EM: {:.5f} F1: {:.5f} Ent: {:.5f}]".format(em, f1, ent))
    log.info("[dev EMR: {:.5f} F1R: {:.5f}]".format(em_r, f1_r))
    best_f1_score = f1
    ''' interleaved training '''
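    # Interleaving schedule: once n_reg exceeds args.start_ent, every
    # args.n_reg_per_ent regular batches trigger args.n_ent_per_reg entropy
    # updates; the entropy batch iterator is rebuilt when exhausted.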
    train_ent_batches = BatchGen(train_ent,
                                 batch_size=args.batch_size,
                                 pos_size=args.pos_size,
                                 ner_size=args.ner_size,
                                 gpu=args.cuda)
    len_train_ent_batches = len(train_ent_batches)
    train_ent_batches = iter(train_ent_batches)
    n_reg = 0
    n_ent = 0
    for epoch in range(args.epochs):
        log.warning('Epoch {}'.format(epoch))
        train_reg_batches = BatchGen(train_reg,
                                     batch_size=args.batch_size,
                                     pos_size=args.pos_size,
                                     ner_size=args.ner_size,
                                     gpu=args.cuda)
        start = datetime.now()

        for i_reg, reg_batch in enumerate(train_reg_batches):
            model.update(reg_batch)
            n_reg += 1
            if n_reg > args.start_ent:
                if i_reg % args.n_reg_per_ent == 0:
                    for j in range(args.n_ent_per_reg):
                        try:
                            model.update_entropy(next(train_ent_batches),
                                                 gamma=args.gamma)
                            n_ent += 1
                        except StopIteration:
                            n_ent = 0
                            train_ent_batches = iter(
                                BatchGen(train_ent,
                                         batch_size=args.batch_size,
                                         pos_size=args.pos_size,
                                         ner_size=args.ner_size,
                                         gpu=args.cuda))

            if n_reg % args.n_report == 0:
                log.info(
                    'epoch [{:2}] batch [{}, {}] loss[{:.5f}] entropy[{:.5f}]'.
                    format(epoch, i_reg, n_ent, model.train_loss.avg,
                           -model.entropy_loss.avg / args.gamma))

            # if n_reg % args.n_eval == 0:
        dev_reg_batches = BatchGen(dev_reg,
                                   batch_size=args.batch_size,
                                   pos_size=args.pos_size,
                                   ner_size=args.ner_size,
                                   evaluation=True,
                                   gpu=args.cuda)
        dev_ent_batches = BatchGen(dev_ent,
                                   batch_size=args.batch_size,
                                   pos_size=args.pos_size,
                                   ner_size=args.ner_size,
                                   evaluation=True,
                                   gpu=args.cuda)
        ''' regular evaluation '''
        predictions = []
        for batch in dev_reg_batches:
            predictions.extend(model.predict(batch))
        em, f1 = score(predictions, dev_y)
        ''' entropy evaluation '''
        ents, predictions_r = [], []
        for batch in dev_ent_batches:
            p, _, ss, se, _, _ = model.predict(batch, get_all=True)
            ss = ss.cpu().numpy()
            se = se.cpu().numpy()
            ents.append(scipy.stats.entropy(ss.T).sum() + \
                        scipy.stats.entropy(se.T).sum())
            predictions_r.extend(p)
        ent = sum(ents) / len(ents)
        em_r, f1_r = score(predictions_r, dev_y)

        log.info("dev EM: {:.5f} F1: {:.5f} Ent: {:.5f}".format(em, f1, ent))
        log.info("[dev EMR: {:.5f} F1R: {:.5f}]".format(em_r, f1_r))
        ''' save best model '''
        if f1 > best_f1_score:
            best_f1_score = f1
            model_file = os.path.join(out_dir, 'best_model.pt')
            model.save(model_file, epoch)
            log.info('[save best model F1: {:.5f}]'.format(best_f1_score))
        ''' save models '''
        model_file = os.path.join(out_dir,
                                  'checkpoint_epoch_{}.pt'.format(epoch))
        model.save(model_file, epoch)
        log.info("[save model {}]".format(model_file))
示例#51
0
def main():
    # Training settings
    # Note: Hyper-parameters need to be tuned in order to obtain results reported in the paper.
    parser = argparse.ArgumentParser(
        description=
        'PyTorch graph convolutional neural net for whole-graph classification'
    )
    parser.add_argument('--dataset',
                        type=str,
                        default="MUTAG",
                        help='name of dataset (default: MUTAG)')
    parser.add_argument('--device',
                        type=int,
                        default=0,
                        help='which gpu to use if any (default: 0)')
    parser.add_argument('--batch_size',
                        type=int,
                        default=32,
                        help='input batch size for training (default: 32)')
    parser.add_argument(
        '--iters_per_epoch',
        type=int,
        default=50,
        help='number of iterations per each epoch (default: 50)')
    parser.add_argument('--epochs',
                        type=int,
                        default=350,
                        help='number of epochs to train (default: 350)')
    parser.add_argument('--lr',
                        type=float,
                        default=0.01,
                        help='learning rate (default: 0.01)')
    parser.add_argument(
        '--seed',
        type=int,
        default=0,
        help='random seed for splitting the dataset into 10 (default: 0)')
    parser.add_argument(
        '--fold_idx',
        type=int,
        default=0,
        help='the index of fold in 10-fold validation. Should be less than 10.'
    )
    parser.add_argument(
        '--num_layers',
        type=int,
        default=5,
        help='number of layers INCLUDING the input one (default: 5)')
    parser.add_argument(
        '--num_mlp_layers',
        type=int,
        default=2,
        help=
        'number of layers for MLP EXCLUDING the input one (default: 2). 1 means linear model.'
    )
    parser.add_argument('--hidden_dim',
                        type=int,
                        default=64,
                        help='number of hidden units (default: 64)')
    parser.add_argument('--final_dropout',
                        type=float,
                        default=0.5,
                        help='final layer dropout (default: 0.5)')
    parser.add_argument(
        '--graph_pooling_type',
        type=str,
        default="sum",
        choices=["sum", "average"],
        help='Pooling over nodes in a graph: sum or average')
    parser.add_argument(
        '--neighbor_pooling_type',
        type=str,
        default="sum",
        choices=["sum", "average", "max"],
        help='Pooling over neighboring nodes: sum, average or max')
    parser.add_argument(
        '--learn_eps',
        action="store_true",
        help=
        'Whether to learn the epsilon weighting for the center nodes. Does not affect training accuracy though.'
    )
    parser.add_argument(
        '--degree_as_tag',
        action="store_true",
        help=
        'use node degrees as input node features (a heuristic for unlabeled graphs)'
    )
    parser.add_argument('--filename', type=str, default="", help='output file')
    args = parser.parse_args()

    #set up seeds and gpu device
    torch.manual_seed(0)
    np.random.seed(0)
    device = torch.device(
        "cuda:" +
        str(args.device)) if torch.cuda.is_available() else torch.device("cpu")
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(0)

    graphs, num_classes = load_data(args.dataset, args.degree_as_tag)

    ##10-fold cross validation. Conduct an experiment on the fold specified by args.fold_idx.
    train_graphs, test_graphs = separate_data(graphs, args.seed, args.fold_idx)

    model = GraphCNN(args.num_layers, args.num_mlp_layers,
                     train_graphs[0].node_features.shape[1], args.hidden_dim,
                     num_classes, args.final_dropout, args.learn_eps,
                     args.graph_pooling_type, args.neighbor_pooling_type,
                     device).to(device)

    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.5)
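    # StepLR halves the learning rate every 50 epochs; stepping the scheduler
    # at the top of the epoch loop follows the pre-1.1 PyTorch convention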

    for epoch in range(1, args.epochs + 1):
        scheduler.step()

        avg_loss = train(args, model, device, train_graphs, optimizer, epoch)
        acc_train, acc_test = test(args, model, device, train_graphs,
                                   test_graphs, epoch)

        if not args.filename == "":
            with open(args.filename, 'w') as f:
                f.write("%f %f %f" % (avg_loss, acc_train, acc_test))
                f.write("\n")
        print("")

    extract_features(model, graphs)
示例#52
0
            if min_path is None:
                draw_document_graph(document_item, "error.png")
                draw_document_graph(document_item, "error2.png")
                print("Document graph is dumped in error.png")
                print("Entities are: {}".format(document_item["entities"]))
                raise Exception("No shortest path between entity {} and {}".format(i_ent1, i_ent2))
            min_paths[i_ent1, i_ent2] = min_path

    return PatternPairwiseShortestPath(doc_graph, len(document_item["entities"]),
            min_paths)

if __name__=="__main__":
    from util import load_data

    items, indmap, _observed_tuples, arities = load_data("wikismall.data.json", tuple_type="ent_index")

    for key in items["train"]:
        sample_doc = items["train"][key]["docs"][0]

        tokens = []
        for sent in sample_doc["sentences"]:
            for node in sent["nodes"]:
                tokens.append(node["label"])
        raw_sent = " ".join(tokens)

        ents = [ent for ent in sample_doc["entities"]]
        print(ents)

        print(raw_sent)
示例#53
0
        description="Make figure for divergence and diversity")
    parser.add_argument('--redo', action='store_true', help='recalculate data')
    params = parser.parse_args()

    username = os.path.split(os.getenv('HOME'))[-1]
    foldername = get_figure_folder(username, 'first')
    fn_data = foldername + 'data/'
    fn2_data = fn_data + 'divdiv_correlation.pickle'
    fn_data = fn_data + 'syn_nonsyn_divergence.pickle'

    if not os.path.isfile(fn_data) or params.redo:
        patients = ['p1', 'p2', 'p3', 'p5', 'p6', 'p8', 'p9', 'p10', 'p11']
        regions = {
            'structural': ['gag'],  #['p17', 'p24'],
            'enzymes': ['pol'],  #['PR', 'RT', 'p15', 'IN'],
            'accessory': ['vif', 'nef', 'vpr', 'vpu', 'tat', 'rev'],
            'envelope': ['env']  #['gp41', 'gp120'],
        }
        # NOTE: these two give the same result, good
        data = collect_data_fabio(patients, regions)
        #data = collect_data_richard(patients, regions)
        store_data(data, fn_data)
    else:
        print("Loading data from file")
        data = load_data(fn_data)

    # this loads additional data produced by the divergence_diversity_correlation script
    data['divdiv_corr'] = load_data(fn2_data)

    plot_divdiv(data, fig_filename=foldername + 'divdiv')
示例#54
0
# -*- coding:utf-8 -*-
#test
import tensorflow as tf
import numpy as np

import model
import util
import trainer

# define log file descriptor
log_file = open("log/log.txt", 'w')

# load dataset
data, label = util.load_data(model.DATA_PATH, model.LABEL_PATH)
print(data.shape)
print(label.shape)

weight, bias = model.set_weights()
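# bundle the model function, its parameters and the training
# hyper-parameters into one dict for the trainer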
param = {
    "model": model.model,
    "weight": weight,
    "bias": bias,
    "train_epoch": model.TRAIN_EPOCH,
    "learning_rate": model.LEARNING_RATE,
    "decay_rate": model.DECAY_RATE,
    "fold": model.FOLD,
    "train_batch_size": model.TRAIN_BATCH_SIZE,
    "valid_batch_size": model.VALID_BATCH_SIZE,
    "display_step": model.DISPLAY_STEP,
    "log_file": log_file
}
示例#55
0
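# The snippet below references str2bool without defining it (the definition
# lies outside the excerpt); a minimal sketch of such an argparse helper,
# assumed rather than taken from the original:
def str2bool(v):
    # map common truthy strings to True, everything else to False
    return str(v).lower() in ('yes', 'true', 't', '1')
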
def main():
    parser = argparse.ArgumentParser(description='training nn and make predictions')
    parser.add_argument('--dataset', type = str, default = "all_models", help = 'dataset to use for layer2 stacking')
    parser.add_argument('--mode', type = str, default = "OOF", help = 'do cv tuning or oof generation')
    parser.add_argument('--model', type = str, default = "xgb", help = 'what model to use for stacking')
    parser.add_argument('--save_flag', type = str, default = "0", help = 'versioning flag')
    parser.add_argument('--save_prediction', type = str2bool, default = "True", help = 'save prediction or not')
    args = parser.parse_args()
    print(args)

    models = all_models
    if args.dataset == "all_models":
        models = all_models
    elif args.dataset == 'model_bench':
        models = model_bench
    elif args.dataset == 'de_corred_models':
        models = de_corred_models

    train, test, y, y_label_dist = load_data(processed = True)
    sub = pd.read_csv("../input/sample_submission.csv")

    # some ad hoc features
    train['comment_text'].fillna("__UNKNOWN__", inplace = True)
    test['comment_text'].fillna("__UNKNOWN__", inplace = True)
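    # simple surface statistics as extra features; note the patterns below
    # count periods (r'\.'), exclamation marks and double quotes, so
    # 'num_comas' actually counts periods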
    train['num_words'] = train.comment_text.str.count(r'\S+')
    test['num_words'] = test.comment_text.str.count(r'\S+')
    train['num_comas'] = train.comment_text.str.count(r'\.')
    test['num_comas'] = test.comment_text.str.count(r'\.')
    train['num_bangs'] = train.comment_text.str.count(r'\!')
    test['num_bangs'] = test.comment_text.str.count(r'\!')
    train['num_quotas'] = train.comment_text.str.count(r'\"')
    test['num_quotas'] = test.comment_text.str.count(r'\"')
    train['avg_word'] = train.comment_text.str.len() / (1 + train.num_words)
    test['avg_word'] = test.comment_text.str.len() / (1 + test.num_words)
    sent_analyzer = SentimentIntensityAnalyzer()
    train['sentiments'] = train.comment_text.progress_map(lambda text: sent_analyzer.polarity_scores(text)['compound'])
    test['sentiments'] = test.comment_text.progress_map(lambda text: sent_analyzer.polarity_scores(text)['compound'])
    META_FEATURES = [ 'num_words', 'num_comas', 'num_bangs', 'num_quotas', 'avg_word', 'sentiments']

    # read in oof predictions from layer1
    train_features = pd.concat([
        pd.read_csv(inp)[LABELS]
        for inp in ["../models/{}/train_meta_probs_round_0.csv".format(model)
        for model in models]]
    , axis = 1)
    train_features.columns = ['_'.join([label, str(i + 1)]) for i in range(len(models)) for label in LABELS]

    train_features = pd.concat([train_features, train[META_FEATURES]], axis = 1)

    # read in avg test predicitons from layer 1
    test_features = pd.concat([
        pd.read_csv(inp)[LABELS]
        for inp in ["../models/{}/test_probs_5_bag_arith_mean_round_0.csv".format(model)
        for model in models]]
    , axis = 1)
    test_features.columns = ['_'.join([label, str(i + 1)]) for i in range(len(models)) for label in LABELS]
    test_features = pd.concat([test_features, test[META_FEATURES]], axis = 1)

    # === cv splits and place holders
    # I am reusing the same split from layer 1
    splitter = StratifiedKFold(n_splits = 5, shuffle = True, random_state = CV_SPLIT_SEED)
    folds = list(splitter.split(train_features, y_label_dist))

    if args.mode == 'CV':
        if args.model == 'lgb':
            lgb_params = {}
            aucs = []

            # per label cv tuning
            for idx, label in enumerate(LABELS):
                print("idx {} label {} started cv at time {}".format(idx, label, datetime.now()))
                current_param_set = {}
                lgb_model = lgb.LGBMClassifier(objective = 'binary', n_jobs = 8, class_weight = 'balanced')
                for param_pairs in [
                        {'learning_rate': [0.02, 0.03, 0.05, 0.06, 0.07],
                         'n_estimators': [100, 120, 140, 160, 180]},
                        {'num_leaves': [15, 18, 24, 27, 30],
                         'min_child_samples': [30, 40, 60, 80, 90]},
                        {'subsample': [0.5, 0.6, 0.7, 0.8, 0.9],
                         'colsample_bytree': [0.3, 0.4, 0.5, 0.6, 0.7]},
                        {'reg_alpha': [0, 0.1, 0.2, 0.3, 0.5],
                         'reg_lambda': [0.2, 0.3, 0.5, 0.7, 0.9]}
                    ]:

                    grid_search = GridSearchCV(
                            estimator = lgb_model
                            , param_grid = param_pairs
                            , scoring = 'roc_auc'
                            , n_jobs = 1
                            , cv = folds
                            , refit = True
                            , verbose = 1
                            , return_train_score = True
                        )
                    results = grid_search.fit(train_features, y[:, idx])
                    current_param_set.update(results.best_params_)
                    lgb_model = results.best_estimator_
                    print(results.best_score_)
                    print(lgb_model)
                sub[label] = lgb_model.predict_proba(test_features)[:,1]
                print(results.best_score_)
                print(current_param_set)
                aucs.append(results.best_score_)
                lgb_params[label] = current_param_set

            print(np.mean(aucs))
            if args.save_prediction:
                sub.to_csv("lgb_stacker_ver{}.csv".format(args.save_flag), index = False)

        if args.model == 'lr':
            log_reg_params = {}
            aucs = []

            # per label cv tuning
            for idx, label in enumerate(LABELS):
                print("idx {} label {} started cv at time {}".format(idx, label, datetime.now()))
                log_reg = LogisticRegression(fit_intercept = True, penalty = 'l2', class_weight = 'balanced')
                param_grid = {
                        'C':   [0.001, 0.05, 0.1, 1, 2, 10],
                        'tol': [0.01],
                        'solver': ['lbfgs', 'newton-cg']
                    }

                grid_search = GridSearchCV(
                        estimator = log_reg
                        , param_grid = param_grid
                        , scoring = 'roc_auc'
                        , n_jobs = 8
                        , cv = folds
                        , refit = True
                        , verbose = 1
                        , return_train_score = True
                    )
                results = grid_search.fit(train_features, y[:, idx])
                log_reg = results.best_estimator_
                print(results.best_score_)
                print(results.best_params_)

                sub[label] = log_reg.predict_proba(test_features)[:,1]
                aucs.append(results.best_score_)
                log_reg_params[label] = results.best_params_

            print(np.mean(aucs))
            if args.save_prediction:
                sub.to_csv("log_reg_stacker_ver{}.csv".format(args.save_flag), index = False)

        if args.model == 'xgb':
            xgb_params = {}
            aucs = []
            test_probs = []
            # per label cv tuning
            for idx, label in enumerate(LABELS):
                print("idx {} label {} started cv at time {}".format(idx, label, datetime.now()))
                current_param_set = {}
                xgb_model = xgb.XGBClassifier(objective = 'binary:logistic', n_jobs = 8, class_weight = 'balanced')
                for param_pairs in [
                        {'learning_rate': [0.04, 0.05, 0.06]},
                        {'n_estimators': [120, 140, 150, 160]},
                        {'max_depth': [2,3,4]},
                        {'min_child_weight': [1,3,5]},
                        {'subsample': [0.8, 1]},
                        {'colsample_bytree': [0.8, 1]},
                        {'reg_alpha': [0, 0.1]},
                        {'reg_lambda': [0.9, 1]}
                    ]:
                    # print(np.mean(
                    #   cross_val_score(
                    #       xgb_model,
                    #       train_features,
                    #       y[:, idx],
                    #       cv = folds,
                    #       scoring = 'roc_auc',
                    #       verbose = 2
                    #   )))
                    grid_search = GridSearchCV(
                            estimator = xgb_model
                            , param_grid = param_pairs
                            , scoring = 'roc_auc'
                            , n_jobs = 1
                            , cv = folds
                            , refit = True
                            , verbose = 2
                            , return_train_score = True
                        )
                    results = grid_search.fit(train_features, y[:, idx])
                    current_param_set.update(results.best_params_)
                    xgb_model = results.best_estimator_
                    print(results.best_score_)
                    print(xgb_model)
                sub[label] = xgb_model.predict_proba(test_features)[:,1]
                print(results.best_score_)
                print(current_param_set)
                aucs.append(results.best_score_)
                xgb_params[label] = current_param_set

            print(np.mean(aucs))
            if args.save_prediction:
                sub.to_csv("xgb_stacker_ver{}.csv".format(args.save_flag), index = False)


    if args.mode == 'OOF':
        if args.model == 'lgb':
            model_params = lgbm_params[args.dataset]
            train_metas = np.zeros(y.shape)
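            # out-of-fold layer-2 meta features: each row of train_metas is
            # filled exactly once, by the fold that holds it out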
            aucs = []
            losses = []
            test_probs = []
            classifiers = {}
            aucs_per_label = {}

            for fold_num, [train_indices, valid_indices] in enumerate(folds):
                print("=== fitting fold {} datetime {} ===".format(fold_num, datetime.now()))
                x_train, x_valid = train_features.values[train_indices,:], train_features.values[valid_indices,:]
                y_train, y_valid = y[train_indices], y[valid_indices]

                valid_preds = np.zeros(y_valid.shape)
                test_preds = np.zeros((test_features.shape[0], len(LABELS)))

                for idx, label in enumerate(LABELS):
                    print("fitting lightgbm for label {} at time {}".format(label, datetime.now()))
                    classifier = "fold_{}_{}".format(fold_num, label)
                    classifiers[classifier] = lgb.LGBMClassifier(
                        objective = 'binary',
                        n_jobs = 8,
                        class_weight = 'balanced',
                        learning_rate = model_params[label]['learning_rate'],
                        num_leaves = model_params[label]['num_leaves'],
                        n_estimators = model_params[label]['n_estimators'],
                        min_child_samples = model_params[label]['min_child_samples'],
                        subsample = model_params[label]['subsample'],
                        colsample_bytree = model_params[label]['colsample_bytree'],
                        reg_alpha = model_params[label]['reg_alpha'],
                        reg_lambda = model_params[label]['reg_lambda']
                    )
                    classifiers[classifier].fit(x_train, y_train[:, idx])
                    valid_preds[:, idx] = classifiers[classifier].predict_proba(x_valid)[:, 1]
                    test_preds[:, idx] = classifiers[classifier].predict_proba(test_features)[:, 1]
                    auc_score = roc_auc_score(y_valid[:, idx], valid_preds[:, idx])

                    if label not in aucs_per_label:
                        aucs_per_label[label] = [auc_score]
                    else:
                        aucs_per_label[label].append(auc_score)

                train_metas[valid_indices] = valid_preds
                test_probs.append(test_preds)
                auc_score = roc_auc_score(y_valid, valid_preds)
                log_loss_score = log_loss(y_valid, valid_preds)

                print("validation auc {} log loss {}".format(auc_score, log_loss_score))
                aucs.append(auc_score)
                losses.append(log_loss_score)

            label_mean_aucs = []
            for label in aucs_per_label:
                print(np.mean(aucs_per_label[label]))
                label_mean_aucs.append(np.mean(aucs_per_label[label]))
            print(np.mean(label_mean_aucs))

            print("mean auc score: {} - std {} , mean log loss score: {} - std {}".format(
                    np.mean(aucs), np.std(aucs), np.mean(losses), np.std(losses)
                ))

            out_dir = '../models/layer2/{}-{}-{}'.format(args.model, args.dataset, args.save_flag)
            try:
                os.mkdir(out_dir)
            except OSError:
                print("path exists or failed to create")

            pd.DataFrame(train_metas, columns = LABELS).to_csv(out_dir + "/train_meta_probs_round_0.csv", index = False)

            sub[LABELS] = np.zeros(sub[LABELS].shape)
            for i in range(5):
                sub[LABELS] += test_probs[i]
            sub[LABELS] /= 5
            sub.to_csv(out_dir + "/test_probs_5_bag_arith_mean_round_0.csv", index = False)

        if args.model == 'lr':
            train_metas = np.zeros(y.shape)
            aucs = []
            losses = []
            test_probs = []
            classifiers = {}
            aucs_per_label = {}

            for fold_num, [train_indices, valid_indices] in enumerate(folds):
                print("=== fitting fold {} datetime {} ===".format(fold_num, datetime.now()))
                x_train, x_valid = train_features.values[train_indices,:], train_features.values[valid_indices,:]
                y_train, y_valid = y[train_indices], y[valid_indices]

                valid_preds = np.zeros(y_valid.shape)
                test_preds = np.zeros((test_features.shape[0], len(LABELS)))

                for idx, label in enumerate(LABELS):
                    print("fitting logistic regression for label {} at time {}".format(label, datetime.now()))
                    classifier = "fold_{}_{}".format(fold_num, label)
                    classifiers[classifier] = LogisticRegression(
                        fit_intercept = True,
                        penalty = 'l2',
                        class_weight = 'balanced',
                        C = lr_params[label]['C'],
                        tol = lr_params[label]['tol'],
                        solver = lr_params[label]['solver'],
                    )
                    classifiers[classifier].fit(x_train, y_train[:, idx])
                    valid_preds[:, idx] = classifiers[classifier].predict_proba(x_valid)[:, 1]
                    test_preds[:, idx] = classifiers[classifier].predict_proba(test_features)[:, 1]
                    auc_score = roc_auc_score(y_valid[:, idx], valid_preds[:, idx])

                    if label not in aucs_per_label:
                        aucs_per_label[label] = [auc_score]
                    else:
                        aucs_per_label[label].append(auc_score)

                train_metas[valid_indices] = valid_preds
                test_probs.append(test_preds)
                auc_score = roc_auc_score(y_valid, valid_preds)
                log_loss_score = log_loss(y_valid, valid_preds)

                print("validation auc {} log loss {}".format(auc_score, log_loss_score))
                aucs.append(auc_score)
                losses.append(log_loss_score)

            label_mean_aucs = []
            for label in aucs_per_label:
                print(np.mean(aucs_per_label[label]))
                label_mean_aucs.append(np.mean(aucs_per_label[label]))
            print(np.mean(label_mean_aucs))

            print("mean auc score: {} - std {} , mean log loss score: {} - std {}".format(
                    np.mean(aucs), np.std(aucs), np.mean(losses), np.std(losses)
                ))

            out_dir = '../models/layer2/{}-{}-{}'.format(args.model, args.dataset, args.save_flag)
            try:
                os.mkdir(out_dir)
            except OSError:
                print("path exists or failed to create")

            pd.DataFrame(train_metas, columns = LABELS).to_csv(out_dir + "/train_meta_probs_round_0.csv", index = False)

            sub[LABELS] = np.zeros(sub[LABELS].shape)
            for i in range(5):
                sub[LABELS] += test_probs[i]
            sub[LABELS] /= 5
            sub.to_csv(out_dir + "/test_probs_5_bag_arith_mean_round_0.csv", index = False)

        if args.model == 'xgb':
            test_features = test_features.values
            model_params = xgbm_params[args.dataset]
            train_metas = np.zeros(y.shape)
            aucs = []
            losses = []
            test_probs = []
            classifiers = {}
            aucs_per_label = {}

            for fold_num, [train_indices, valid_indices] in enumerate(folds):
                print("=== fitting fold {} datetime {} ===".format(fold_num, datetime.now()))
                x_train, x_valid = train_features.values[train_indices,:], train_features.values[valid_indices,:]
                y_train, y_valid = y[train_indices], y[valid_indices]

                valid_preds = np.zeros(y_valid.shape)
                test_preds = np.zeros((test_features.shape[0], len(LABELS)))

                for idx, label in enumerate(LABELS):
                    print("fitting xgboost for label {} at time {}".format(label, datetime.now()))
                    classifier = "fold_{}_{}".format(fold_num, label)
                    classifiers[classifier] = xgb.XGBClassifier(
                        objective = 'binary:logistic',
                        n_jobs = 8,
                        class_weight = 'balanced',
                        learning_rate = model_params[label]['learning_rate'],
                        n_estimators = model_params[label]['n_estimators'],
                        max_depth = model_params[label]['max_depth'],
                        min_child_weight = model_params[label]['min_child_weight'],
                        subsample = model_params[label]['subsample'],
                        colsample_bytree = model_params[label]['colsample_bytree'],
                        reg_alpha = model_params[label]['reg_alpha'],
                        reg_lambda = model_params[label]['reg_lambda']
                    )
                    classifiers[classifier].fit(x_train, y_train[:, idx])
                    valid_preds[:, idx] = classifiers[classifier].predict_proba(x_valid)[:, 1]
                    test_preds[:, idx] = classifiers[classifier].predict_proba(test_features)[:, 1]
                    auc_score = roc_auc_score(y_valid[:, idx], valid_preds[:, idx])
                    gc.collect()
                    if label not in aucs_per_label:
                        aucs_per_label[label] = [auc_score]
                    else:
                        aucs_per_label[label].append(auc_score)

                train_metas[valid_indices] = valid_preds
                test_probs.append(test_preds)
                auc_score = roc_auc_score(y_valid, valid_preds)
                log_loss_score = log_loss(y_valid, valid_preds)

                print("validation auc {} log loss {}".format(auc_score, log_loss_score))
                aucs.append(auc_score)
                losses.append(log_loss_score)

            label_mean_aucs = []
            for label in aucs_per_label:
                print(np.mean(aucs_per_label[label]))
                label_mean_aucs.append(np.mean(aucs_per_label[label]))
            print(np.mean(label_mean_aucs))

            print("mean auc score: {} - std {} , mean log loss score: {} - std {}".format(
                    np.mean(aucs), np.std(aucs), np.mean(losses), np.std(losses)
                ))

            out_dir = '../models/layer2/{}-{}-{}'.format(args.model, args.dataset, args.save_flag)
            try:
                os.mkdir(out_dir)
            except OSError:
                print("path exists or failed to create")

            pd.DataFrame(train_metas, columns = LABELS).to_csv(out_dir + "/train_meta_probs_round_0.csv", index = False)

            sub[LABELS] = np.zeros(sub[LABELS].shape)
            for i in range(5):
                sub[LABELS] += test_probs[i]
            sub[LABELS] /= 5
            sub.to_csv(out_dir + "/test_probs_5_bag_arith_mean_round_0.csv", index = False)
示例#56
0
    def test_init(self):
        """Test initialization and catch exceptions."""
        print("test_init")
        data_x = load_data(train_file)
        self.assertTrue(len(data_x) > 0)
示例#57
0
def main():
  parser = argparse.ArgumentParser()
  parser.add_argument('-train', action='store_true', default=True, help="train flag")
  # parser.add_argument('-eval', action='store_true', default=False, help="evaluate flag")
  # parser.add_argument('-pred', action='store_true', default=True, help="predict flag")
  # parser.add_argument('-w', action='store_true', default=False, help="load weights flag")
  parser.add_argument('-c', help="training: coarse dir")
  parser.add_argument('-f', help="training: fine scale with track dir")
  parser.add_argument('-logdir', help="logdir")
  # parser.add_argument('-tc', help="test dataset: coarse dir")
  # parser.add_argument('-tf', help="test dataset: fine scale with track dir")
  parser.add_argument('-x', help="predict input dataset dir") 
  parser.add_argument('-o', help="predict output dir") 
  parser.add_argument('-l', help="learning rate") 
  parser.add_argument('-e', help="epochs") 
  # parser.add_argument('-p', help="png file name")
  # parser.add_argument("-resume", help="bool flag, False by default")
  # parser.add_argument("-modelh5", help="load exist model")
  # parser.add_argument("-modelweighth5", help="load model weights")
  # parser.add_argument('-m', help="M")
  # parser.add_argument('-n', help="N")
  parser.add_argument('-restore', action='store_true', default=False, help="restore trained model")
  parser.add_argument('-init_w', action='store_true', default=False, help="init the weight from upsample.txt")
  parser.add_argument('-lr_decay', help="learning rate decay rate")

  # FLAGS = parser.parse_args()
  # args = parser.parse_args()
  args, unknown = parser.parse_known_args()
  if len(sys.argv) < 3:
    print("Usage: python upsample_train.py -c -f -logdir -x -o -l -e -restore -init_w -lr_decay")
    return
  # if args.m and args.n is not None:
  #     m = int(args.m)
  #     n = int(args.n)
  #     print("m and n for prediction: ", m, n)
  # else:
  #     m = 700
  #     n = 700
  #     print("No parameters m and n for prediction, use: ", m, n)

  restore = False
  init_w = False
  lr_decay_rate = 0
  if args.restore:
    restore = True
  if args.init_w:
    init_w = True
  if args.lr_decay:
    lr_decay_rate = float(args.lr_decay)  # argparse returns strings

  if args.train:    
      x_train = np.empty(0)
      y_train = np.empty(0)
      x_test = np.empty(0)
      y_test = np.empty(0)

      learning_rate = float(args.l)
      epochs = int(args.e)
      coarseDir = args.c
      fineDir = args.f
      logdir = args.logdir

      if not os.path.exists(logdir):
        os.makedirs(logdir)

      sdir = coarseDir
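      # use frame 00001 of the first non-hidden subdirectory as the rest pose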
      rest_file = sdir + [f for f in os.listdir(sdir) if not f.startswith('.')][0] + "/00001_00.obj"
      dim, mtx, mtx_1 = preprocess.meshmtx_wnb(rest_file)
      rest_pos = util.load_pos(rest_file)

      print("training dataset: ")
      print(">>>  " + str(coarseDir) + "  >>>  " + str(fineDir))
      t0 = time.perf_counter()
      print(">>>>>>> loading data for training  >>>>>>> ")
      for dirName, subdirList, fileList in os.walk(coarseDir):
          total = len(subdirList)
          count = 0
          for subdir in subdirList:
              # print('Found directory: %s' % subdir)
              if count%40 == 0:
                  print(str(float(count)/total*100) + '%')
              count = count + 1
              x, y = util.load_data(coarseDir + subdir, fineDir + subdir, rest_pos)

              if x_train.size == 0:
                  x_train = x
                  y_train = y
              else: 
                  x_train = np.vstack((x_train, x))
                  y_train = np.vstack((y_train, y))  

      print(time.perf_counter() - t0, "seconds loading training data.")
      if x_train.size == 0:
          print("Error: no input training data.")
          return 0
      
      # load data
      x_pred = np.empty(0)
      x_coarse = np.empty(0)
      outDir = "pred/"
  # if args.pred:
      inDir = args.x
      outDir = args.o
      print(">>>>>>> loading data for prediction >>>>>>>> ")
      t1 = time.perf_counter()
      for dirName, subdirList, fileList in os.walk(inDir):
          total = len(subdirList)
          for subdir in subdirList:
              # print('Found directory: %s' % subdir)
              x_p, x_c = util.load_input_only(inDir + subdir, rest_file)
              if x_pred.size == 0:
                  x_pred = x_p
              else: 
                  x_pred = np.vstack((x_pred, x_p))

              if x_coarse.size == 0:
                  x_coarse = x_c
              else: 
                  x_coarse = np.vstack((x_coarse, x_c))

      print(time.perf_counter() - t1, "seconds loading test data.")

  # batch_size = x_pred.shape[0]

  # for learning_rate in [1E-1, 1E-2]:
  #   print('Starting run for learning_rate %f' % learning_rate)

  # train_model(x_train, y_train, dim, mtx, mtx_1, epochs, learning_rate, logdir)
  train_model(x_train, y_train, x_pred, x_coarse, rest_file, mtx, mtx_1, epochs, learning_rate, logdir, outDir, init_w, lr_decay_rate, restore)
示例#58
0
    print("test SMAPE", SMAPE)

    if plot_flag:
        util.plot(trainPred, trainY, testPred, testY)

    return trainPred, testPred, MAE, MRSE, SMAPE


if __name__ == "__main__":

    lag = 40
    batch_size = 32
    epoch = 20
    hidden_dim = 64
    lr = 1e-4

    # ts, data = util.load_data("./data/NSW2013.csv", columnName="TOTALDEMAND")
    # ts, data = util.load_data("./data/bike_hour.csv", columnName="cnt")
    # ts, data = util.load_data("./data/TAS2016.csv", columnName="TOTALDEMAND")
    # ts, data = util.load_data("./data/traffic_data_in_bits.csv", columnName="value")
    # ts, data = util.load_data("./data/beijing_pm25.csv", columnName="pm2.5")
    ts, data = util.load_data("./data/pollution.csv", columnName="Ozone")
    trainPred, testPred, mae, mrse, smape = MLP_forecasting(
        data,
        inputDim=lag,
        hiddenNum=hidden_dim,
        lr=lr,
        epoch=epoch,
        batchSize=batch_size,
        plot_flag=True)
示例#59
0
    return trainPred, testPred, MAE, MRSE, SMAPE


if __name__ == "__main__":

    lag = 24
    batch_size = 32
    epoch = 20
    hidden_dim = 64
    lr = 1e-4
    freq = 4

    # ts, data = util.load_data("./data/NSW2013.csv", columnName="TOTALDEMAND")
    # ts, data = util.load_data("./data/bike_hour.csv", columnName="cnt")
    # ts, data = util.load_data("./data/TAS2016.csv", columnName="TOTALDEMAND")
    ts, data = util.load_data("./data/traffic_data_in_bits.csv",
                              columnName="value")
    # ts, data = util.load_data("./data/beijing_pm25.csv", columnName="pm2.5")
    # ts, data = util.load_data("./data/pollution.csv", columnName="Ozone")

    trainPred, testPred, mae, mrse, smape = decompose_MLP_forecasting(
        ts,
        data,
        lag=lag,
        freq=freq,
        epoch=epoch,
        hidden_num=hidden_dim,
        lr=lr,
        batch_size=batch_size)
示例#60
0
import util

##mass = "m0.001524"
mass = "m0.0677"
#  filename with the correlators
##fff = "Pseuodoscalar_0.202_chargeAV_outcorr.gpl"
fff = "gpl/" + mass + "_Rhox_0.202_chargeAV_outcorr.gpl"

nt = 48
no_config = 101

##
##  new data
##

corr = util.load_data(fff, nt, no_config, corr_tag)
##corr *= -1
corr /= nrm

print("Normalizatio factor ", nrm, " applied")
print("Computing jackknife correlators")
tt, corr_mean, corr_err = util.calc_corr(corr, nt, no_config, 0.0)

##
##  Dan's data
##
nconfig_dan = 381

corr_dan = util.load_data("../docs/rho_vcphys_bothcharges_m0.001524.gpl", nt,
                          nconfig_dan, "charged-up")
tt_dan, corr_dan_mean, corr_dan_err = util.calc_corr(corr_dan, nt, nconfig_dan,