def bat_processing_csv(self):

        save_dir = self.ui.saveDirLineEdit.text()
        if not (self.ui.isSavematRaido.isChecked()
                or self.ui.isSavenpz.isChecked()):
            QMessageBox.warning(None, 'error', 'please choose a save format',
                                QMessageBox.Yes)
            return
        if not save_dir:
            QMessageBox.warning(None, 'error', 'please enter a save directory',
                                QMessageBox.Yes)
            return
        file_names, _ = QFileDialog.getOpenFileNames(self, 'open files', './',
                                                     'data (*.csv)')

        if file_names:
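            # selected files are processed in groups of four, one per channel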
            group_number = len(file_names) // 4
            if group_number <= 0:
                QMessageBox.warning(
                    None, 'error',
                    'please select at least four channel files',
                    QMessageBox.Yes)
                return

            for ith_group in range(group_number):
                ch1, ch2, ch3, ch4 = file_names[4 * ith_group:4 * (ith_group + 1)]
                if self.ui.isSavematRaido.isChecked():
                    read_data(ch1, ch2, ch3, ch4, None,
                              save_dir + '/' + str(ith_group) + '.mat', False)
                if self.ui.isSavenpz.isChecked():
                    read_data(ch1, ch2, ch3, ch4,
                              save_dir + '/' + str(ith_group) + '.npz', None,
                              True)
Example #2
def main():
    csv.dowload_csv()
    list_of_localities = pd.get_locality_list_from_db()
    list_of_email_recipients = pd.get_recipient_list_from_db()
    pd.create_localities(list_of_localities)
    pd.read_data()
    for locality in pd.tracking_localities:
        html.create_html(locality, 7)
    
    for recipient in list_of_email_recipients:
        locality_obj = pd.return_locality_obj(recipient.locality)
        es.send_mail(locality_obj, recipient.email)
        
    es.send_admin_email("*****@*****.**")
Example #3
def main(_):
    train_file = 'data/data_1_train.csv'
    source_count, target_count = [], []
    data = process_data.read_data(train_file)

    parsed_data = process_data.parse_data(data)

    source_word2idx, target_word2idx = create_vocab(parsed_data)

    #train_data = read_data(FLAGS.train_data, source_count, source_word2idx, target_count, target_word2idx)
    #test_data = read_data(FLAGS.test_data, source_count, source_word2idx, target_count, target_word2idx)

    trainData, testData = process_data.split_data(parsed_data, 80, 20)
    train_data = process_data.read_and_process_data(trainData, source_word2idx,
                                                    target_word2idx)
    test_data = process_data.read_and_process_data(testData, source_word2idx,
                                                   target_word2idx)
    FLAGS.pad_idx = source_word2idx['<pad>']
    FLAGS.nwords = len(source_word2idx)
    FLAGS.mem_size = max(train_data[4], test_data[4])

    pp.pprint(flags.FLAGS.__flags)

    print('loading pre-trained word vectors...')
    FLAGS.pre_trained_context_wt = init_word_embeddings(source_word2idx)
    FLAGS.pre_trained_target_wt = init_word_embeddings(target_word2idx)

    with tf.Session() as sess:
        model = MemN2N(FLAGS, sess)
        model.build_model()
        model.run(train_data, test_data)
Example #4
def add_observables(model):
    data = process_data.read_data()
    ab_map = process_data.get_antibody_map(data)
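    # build one Observable per antibody from the monomers its agents map to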
    for ab_name, agents in ab_map.items():
        patterns = []
        for agent in agents:
            try:
                monomer = model.monomers[agent.name]
            except KeyError:
                continue
            if agent.mods:
                mc = agent.mods[0]
                site_names = ['phospho', mc.residue]
                if mc.position is not None:
                    site_names.append(mc.residue + mc.position)
                for site_name in site_names:
                    try:
                        pattern = monomer(**{site_name: 'p'})
                        patterns.append(ComplexPattern([pattern], None))
                    except Exception:
                        pass
            else:
                patterns.append(ComplexPattern([monomer()], None))
        if patterns:
            if model.monomers.get(ab_name) is not None:
                obs_name = ab_name + '_obs'
            else:
                obs_name = ab_name
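            # make sure the observable name is a valid identifier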
            if not re.match(r'[_a-z][_a-z0-9]*\Z', obs_name, re.IGNORECASE):
                obs_name = obs_name.replace('-', '_')
            if not re.match(r'[_a-z][_a-z0-9]*\Z', obs_name, re.IGNORECASE):
                obs_name = 'p' + obs_name
            o = Observable(obs_name, ReactionPattern(patterns))
            model.add_component(o)
Example #5
def main():
    df = process_data.read_data('numerai_datasets/numerai_training_data.csv')
    df = process_data.scale_data(df)
    df = process_data.get_c1_dummies(df)
    x_train, y_train, x_val, y_val = process_data.split_data(df)

    svc_model = create_test_model(SVC(probability=True, C=100, kernel='rbf'),
                                  x_train, y_train, x_val, y_val)
    process_data.save_model(svc_model, "models/svc/svc_model_c_100.pkl")
Example #6
def main():
    # read data
    print(1)
    movie_data = read_data(file, cols)

    #connect to db
    movie_db = connectdb()

    # delete previous data
    movie_db.delete_all()
    # create db
    create_database(movie_db, movie_data)
Example #7
def main():
    wiki_movie_df, rating_df, genres_df, wiki_genres_df = process_data.read_data()
    seaborn.set()
    #audience_rating, critic_rating = get_rating(rating_df)
    audience_average, critic_average, audience_percent, critic_percent = get_data(
        rating_df)
    check_test(audience_average, critic_average, audience_percent,
               critic_percent)
    pvalue = do_anova(audience_average, critic_average, audience_percent)
    if pvalue < 0.05:
        print(" \n ")
        print("Do post hoc Tukey test")
    do_post_hoc(audience_average, critic_average, audience_percent)
Example #8
def main():
    wiki_movie_df, rating_df, genres_df, wiki_genres_df = process_data.read_data()
    audience_rating, critic_rating = get_rating(rating_df)
    seaborn.set()
    #plt.savefig('rating.png')

    # do T-test for testing if audience rating and critic norm have the same means
    print("\n")
    print("----- T-test -----")
    t_test(audience_rating, critic_rating)
    print("\n")
    print("----- U-test -----")
    u_test(audience_rating, critic_rating)
    print("\n")
    print("----- Regression -----")
    regression(audience_rating, critic_rating)
Example #9
def main():
    data = proc.read_data()
    features, yfill = proc.features_yfill(data)
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        yfill,
                                                        test_size=0.20,
                                                        random_state=42,
                                                        stratify=yfill)
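    # oversample the training data to counter class imbalance before fitting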
    X_train_over, y_train_over = proc.oversample(X_train, y_train, r=0.3)
    #plot_roc(X_train, y_train, 'LogisticRegression', LogisticRegression(C=1e5,penalty='l2'))
    '''
    model_over = runLR(X_train_over, X_test, y_train_over, y_test)
    test_results(model_over, X_test, y_test)
    '''

    model = runLR(X_train.values, X_test, y_train.values, y_test)
    test_results(model, X_test, y_test)
Example #10
def run(dec_thresh=-1, inc_thresh=1):
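    # dec_thresh and inc_thresh are passed to make_stmts as the filtering thresholds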
    data = pd.read_data(pd.data_file)
    ab_agents = pd.get_antibody_map(data)

    # If filtering is to be done based on thresholds only,
    # set this to None
    drug_ab_combs = get_eval_drug_ab_combs(data)
    #drug_ab_combs = None

    stmts, values = make_stmts(data,
                               ab_agents,
                               drug_ab_combs=drug_ab_combs,
                               thresh=[dec_thresh, inc_thresh])

    # Now, preassemble the statements to remove duplicates
    pa_dict = preassemble_stmts(stmts)

    with open('data_stmts.pkl', 'wb') as f:
        pickle.dump((pa_dict, values), f, protocol=2)

    return (stmts, values)
Example #11
    plt.savefig('Accuracy_vs_numtrees_{}.png'.format(graphid))
    plt.close()
    plt.figure()
    plt.plot(num_trees, precision)
    #plt.ylim((0.8, 1))
    plt.savefig('precision_vs_numtrees_{}.png'.format(graphid))
    plt.close()
    plt.figure()
    plt.plot(num_trees, recall)
    #plt.ylim((0.8, 1))
    plt.savefig('recall_vs_numtrees_{}.png'.format(graphid))
    plt.close()


if __name__ == '__main__':
    data = proc.read_data()
    # bits, yfill = bits_yfill(data)
    # X_train, X_test, y_train, y_test = train_test_split(bits, yfill, test_size=0.20, random_state=42, stratify =yfill)
    # for num in range(10):
    #     rffit = RandomForestClass(X_train, X_test, y_train, y_test)
    #     feature_importance(bits, rffit)
    #     plot_features(bits, rffit, 20, 'bits', num)

    features, yfill = proc.features_yfill(data)
    X_train, X_test, y_train, y_test = train_test_split(
        features, yfill, test_size=0.20, random_state=1, stratify=yfill)
    X_train_over, y_train_over = proc.oversample(X_train, y_train, r=0.3)
    rffit, y_predict = randomforest(X_train_over, X_test, y_train_over, y_test,
                                    num_est=50, cls_w='balanced_subsample')

    precision, recall, median_recall_index, medianrecall_threshold = set_threshold(rffit, X_train, X_test, y_train, y_test)
    print_threshold(rffit, X_train, X_test, y_train, y_test, medianrecall_threshold)
    feature_importance(features, rffit)
Example #12
        if json_dict.get(drug) is None:
            json_dict[drug] = {}
        if json_dict[drug].get(ab) is None:
            json_dict[drug][ab] = {}
        for idx, path in enumerate(paths):
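            # collect the UUID of every statement along this path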
            path_stmts = []
            for rule_name, sign in path[:-1]:
                stmt = _stmt_from_rule(model, rule_name, stmts)
                path_stmts.append(stmt.uuid)
            json_dict[drug][ab][idx] = path_stmts
    return json_dict

if __name__ == '__main__':
    print("Processing data")

    data = process_data.read_data(process_data.data_file)
    data_genes = process_data.get_all_gene_names(data)
    ab_map = process_data.get_antibody_map(data)

    print('Loading data statements.')
    data_stmts, data_values = make_stmts.run(dec_thresh=0.5, inc_thresh=1.5)
    all_data_stmts = [values.values() for values in data_stmts.values()]
    all_data_stmts = itertools.chain.from_iterable(all_data_stmts)
    all_data_stmts = list(itertools.chain.from_iterable(all_data_stmts))

    print('We will check the following drug-ab combinations:\n============')
    for drug, stmtd in data_stmts.items():
        print(drug)
        for ab in stmtd.keys():
            print('-' + ab)
Example #13
	score_list = [
        SVC_score_all, LR_score_all, NB_score_all,
        SVC_score_pca, LR_score_pca, NB_score_pca,
        SVC_score_fs, LR_score_fs, NB_score_fs
	]

	method_list = [
					"The accuracy of model with all rating features by SVM Classifier",
					"The accuracy of model with all rating features by Logistic Regression",
					"The accuracy of model with all rating features by Naive bayes Classifier",
					"The accuracy of model with PCA transformed features by SVM Classifier",
					"The accuracy of model with PCA transformed features by Logistic Regression",
					"The accuracy of model with PCA transformed features by Naive Bayes Classifier",
					"The accuracy of model with Top-2 important features by SVM Classifier",
					"The accuracy of model with Top-2 important features by Logistic Regression",
					"The accuracy of model with Top-2 important features by Naive Bayes Classifier",
	]

	for k, v in sorted(zip(map(lambda x: round(x, 4), score_list), method_list), reverse=True):
		print(v + ': ' + str(k)) 	

	sb.set()
	show_distribution(fs_df)


if __name__ == "__main__":
	wiki_movie_df, rating_df, genres_df, wiki_genres_df = process_data.read_data()
	predict_profit(wiki_movie_df, rating_df)	
	# print(wiki_movie_df)
	
Example #14
# -*- coding: utf-8 -*-

import os
import numpy as np
from process_data import read_data
from protein_feature_signal import discretize
from sklearn.ensemble import RandomForestRegressor
from protein_feature_preparation_linear import ProteinFeaturePreparationLinear

## Read Data
project_directory = os.path.dirname(os.getcwd())
file_data = read_data(project_directory)
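# the same 42-residue peptide sequence is paired with every data file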
protein_data = ['DAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA'] * len(file_data)

print("********* Model: Random Forest ")
for window_size in range(2, 6):
    print("#############################")
    print("********* Using Window Size:", window_size)
    params = ProteinFeaturePreparationLinear(window_size)

    # Build training features and discretized signals
    train_signal = []
    train_features = []
    for i in range(0, len(protein_data)):
        prot_seq = protein_data[i]
        signal = file_data[i]
        features = params.get_feature(prot_seq)
        discrete_signal = discretize(signal, len(prot_seq), window_size)
        train_signal.extend(discrete_signal)
        train_features.extend(features)
    train_features = np.array(train_features)
Example #15
    def _preprocess(self):
        """ Read in data and build the vocabulary """
        words = read_data(self.file_path)
        self.dictionary, self.invert_dict = build_vocab(words, self.vocab_size)
        self.index_words = convert_words_to_index(words, self.dictionary)
Example #16
    plt.savefig('Accuracy_vs_numtrees_{}.png'.format(graphid))
    plt.close()
    plt.figure()
    plt.plot(num_trees, precision)
    #plt.ylim((0.8, 1))
    plt.savefig('precision_vs_numtrees_{}.png'.format(graphid))
    plt.close()
    plt.figure()
    plt.plot(num_trees, recall)
    #plt.ylim((0.8, 1))
    plt.savefig('recall_vs_numtrees_{}.png'.format(graphid))
    plt.close()


if __name__ == '__main__':
    df = proc.read_data()
    #df = pd.read_csv(os.path.join(app.config['UPLOAD_FOLDER'], filename))
    # use all features and yfill (no NaNs, filled with 0)
    features, yfill = proc.features_yfill(df)
    #train test split at 20%
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        yfill,
                                                        test_size=0.20,
                                                        random_state=1,
                                                        stratify=yfill)

    #Optional: oversampling of minority class for training purposes
    #X_train_over, y_train_over = proc.oversample(X_train,y_train, r = 0.3)
    #rffit, y_predict = rf.randomforest(X_train_over, X_test, y_train_over, y_test, num_est=50, cls_w = 'balanced_subsample')

    #fit the Random Forest classifier: would like to add in a grid search
Example #17
def read_sources():
    trips_stmts = process_trips.read_stmts(process_trips.base_folder)
    sparser_stmts = process_sparser.read_stmts(process_sparser.base_folder)
    r3_stmts = process_r3.read_stmts(process_r3.active_forms_file)
    stmts = trips_stmts + sparser_stmts + r3_stmts
    return stmts

def get_prior_genes(fname):
    """Get the list of prior genes."""
    with open(fname, 'rt') as fh:
        genes = fh.read().strip().split('\n')
        return genes

if __name__ == '__main__':
    outf = 'output/'
    data = process_data.read_data(process_data.data_file)
    data_genes = process_data.get_all_gene_names(data)
    reassemble = False
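    # when False, load cached preassembled statements instead of rebuilding them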
    if not reassemble:
        stmts = ac.load_statements(pjoin(outf, 'preassembled.pkl'))
        #stmts = ac.load_statements(pjoin(outf, 'prior.pkl'))
    else:
        #prior_stmts = build_prior(data_genes, pjoin(outf, 'prior.pkl'))
        prior_stmts = ac.load_statements(pjoin(outf, 'prior.pkl'))
        prior_stmts = ac.map_grounding(prior_stmts,
                                       save=pjoin(outf, 'gmapped_prior.pkl'))
        reading_stmts = ac.load_statements(pjoin(outf, 'phase3_stmts.pkl'))
        reading_stmts = ac.map_grounding(reading_stmts,
                                         save=pjoin(outf, 'gmapped_reading.pkl'))
        stmts = prior_stmts + reading_stmts