Exemplo n.º 1
0
def get_fact():
    """Run the fact-extraction pipeline for one subject/property pair.

    Reads ``subject_id`` and ``property_id`` from the request's query string,
    writes the constructed query to ``/tmp/data/<property_id>_test.json``,
    makes sure the article and label files exist under ``data/``, and then
    runs the prediction, rankerNet and linker stages one after another.

    Returns:
        The result of ``call_enhanced_linker`` passed through ``jsonify``,
        extended with a ``"runtime"`` dict (wall-clock seconds per stage and
        overall) and a ``"time"`` timestamp string.
    """
    overall_start = time.time()
    qid = request.args.get('subject_id')
    pid = request.args.get('property_id')
    query = [construct_query(qid, pid)]

    # Create the working directories without a check-then-create race: two
    # concurrent requests could both see the directory as missing, and the
    # slower one would then fail inside os.makedirs().
    for directory in ('/tmp/data', '/tmp/features', '/tmp/predictions'):
        try:
            os.makedirs(directory)
        except OSError:
            # Only "already exists as a directory" is tolerated.
            if not os.path.isdir(directory):
                raise

    # NOTE(review): pid comes straight from the query string and is
    # interpolated into a file path here and below - confirm it is validated
    # upstream (or by construct_query) before trusting it.
    with open('/tmp/data/{}_test.json'.format(pid), 'w') as f:
        json.dump(query, f)

    # prepare article and label.
    fname = get_filename_for_article_id(
        query[0]['wikipedia_link'].split('wiki/')[-1])
    if not os.path.exists(os.path.join('data', 'wiki', fname)):
        print('Article not exists start downloading')
        get_article()
    if not os.path.exists(
            os.path.join('data', 'labels', '{}_labels.json'.format(pid))):
        print('Label not exists start downloading')
        get_labels_data(relations=[pid])

    # make prediction
    start_time = time.time()
    print('[INFO] Prediction')
    make_predictions(model_path='/home/guo/model',
                     data_path='/tmp/data',
                     feature_path='/tmp/features')
    time_1 = (time.time() - start_time)

    # pass to rankerNet
    start_time = time.time()
    print('[INFO] rankerNet')
    call_ranker_net()
    time_2 = (time.time() - start_time)

    # pass to enhanced_linker
    start_time = time.time()
    print('[INFO] Linker')
    result = call_enhanced_linker(pid)
    time_3 = time.time() - start_time

    # return fact, annotated with how long each stage took
    result["runtime"] = {
        "time_predict": time_1,
        "time_rankerNet": time_2,
        "time_linker": time_3,
        "overall_": time.time() - overall_start
    }
    result["time"] = str(datetime.now())
    return jsonify(result)
Exemplo n.º 2
0
def random_forest_classification(test_df, forest):
    """Combine the per-tree predictions of a forest by majority vote.

    Each tree in ``forest`` is passed to ``make_predictions`` together with
    ``test_df``; the results become the columns ``tree_0``, ``tree_1``, ...
    of a DataFrame, and the row-wise mode is taken.

    Returns:
        The first column of the row-wise mode of all tree predictions.
    """
    votes_by_tree = {
        "tree_{}".format(tree_idx): make_predictions(test_df, tree=tree)
        for tree_idx, tree in enumerate(forest)
    }
    votes = pd.DataFrame(votes_by_tree)

    # mode(axis=1) may yield several columns on ties; column 0 is used.
    return votes.mode(axis=1)[0]
    def on_step_end(self, episode_step, logs):
        """Record the step's loss and, every ``self.interval`` steps, evaluate.

        On an evaluation step the target model is scored on the validation
        data, every statistic is written through ``self.writer``, and the
        target model is saved when FN/FP are within their bounds and at least
        ``self.save_after`` steps have passed.
        """
        self.step += 1
        self.loss.append(logs.get("metrics")[0])

        # Nothing more to do unless this step lands on an interval boundary.
        if self.step % self.interval:
            return

        y_pred = make_predictions(self.model.target_model, self.X_val)
        stats = calculate_metrics(self.y_val, y_pred)

        # If all collected losses are NaN (this happens during training), report 0.
        stats["loss"] = 0 if np.isnan(self.loss).all() else np.nanmean(self.loss)
        self.loss = []  # Start a fresh loss window for the next interval

        for tag, value in stats.items():
            entry = Summary.Value(tag=tag, simple_value=value)
            self.writer.add_summary(Summary(value=[entry]), global_step=self.step)

        false_neg = stats.get("FN")
        false_pos = stats.get("FP")
        if false_neg <= self.FN_bound and false_pos <= self.FP_bound and self.step >= self.save_after:
            print(f"Model saved! FN: {false_neg}; FP: {false_pos}")
            date_tag = datetime.now().strftime('%Y%m%d')
            self.model.target_model.save(f"./models/{date_tag}_FN{false_neg}_FP{false_pos}.h5")
Exemplo n.º 4
0
def main():
	feature_combination, input_dir, use_rf, num_cores, mode, anno_source, anno_F, target_taxid, refF, output_dir = sys.argv[1:]

	#Create feature combination
	if feature_combination == "00000000": sys.exit()
	scores = [CS.MutualInformation(2), CS.Bayes(3), CS.Euclidiean(), CS.Wcc(), CS.Jaccard(), CS.Poisson(5), CS.Pearson(), CS.Apex()]
	this_scores = []
	for i, feature_selection in enumerate(feature_combination):
		if feature_selection == "1": this_scores.append(scores[i])

	print "\t".join([fs.name for fs in this_scores])

	# Initialize CLF
	use_rf = use_rf == "True"
	num_cores = int(num_cores)
	clf = CS.CLF_Wrapper(num_cores, use_rf)

	# Load elution data
	foundprots, elution_datas = utils.load_data(input_dir, this_scores)

	# Generate reference data set
	if refF == "":
		all_gs = utils.create_goldstandard(target_taxid, foundprots)
	else:
		all_gs = Goldstandard_from_cluster_File(refF, foundprots)
	all_gs = utils.create_goldstandard(target_taxid, foundprots)
	#all_gs = Goldstandard_from_cluster_File(refF, foundprots)
#	sys.exit()


	scoreCalc = CS.CalculateCoElutionScores(this_scores, elution_datas, output_dir + ".scores.txt", num_cores=num_cores, cutoff= 0.5)
#	scoreCalc.calculate_coelutionDatas(all_gs)
	scoreCalc.readTable(output_dir + ".scores.txt", all_gs)

	print "training ppis: %i" % len(set(scoreCalc.ppiToIndex.keys()))

	#n_fold cross validation to test the stability of preicted PPIs
	utils.stability_evaluation(10, all_gs, scoreCalc, clf, output_dir, mode, anno_source, anno_F)
	sys.exit()

	#n_fold cross validation to select the best features.
	n_fold_cross_validation(10, all_gs, scoreCalc, clf, output_dir, mode, anno_source, anno_F)

	sys.exit()

	###### actually predict the network using all data
	train, eval = all_gs.split_into_holdout_training(set(scoreCalc.ppiToIndex.keys()))

	print "All comp:%i" % len(all_gs.complexes.complexes)
	print "Train comp:%i" % len(train.complexes.complexes)
	print "Eval comp:%i" % len(eval.complexes.complexes)

	print "Num valid ppis in training pos: %i" % len(train.positive)
	print "Num valid ppis in training neg: %i" % len(train.negative)
	print "Num valid ppis in eval pos: %i" % len(eval.positive)
	print "Num valid ppis in eval neg: %i" % len(eval.negative)

	# Evaluate classifier
	utils.bench_clf(scoreCalc, train, eval, clf, output_dir, verbose=True)

	functionalData = ""
	if mode != "exp":
		functionalData = utils.get_FA_data(anno_source, anno_F)

	print functionalData.scores.shape

	# Predict protein interaction
	network = utils.make_predictions(scoreCalc, mode, clf, all_gs, functionalData)
	outFH = open("%s.%s.pred.txt" % (output_dir, mode + anno_source), "w")
	print >> outFH, "\n".join(network)
	outFH.close()

	# Predicting clusters
	utils.predict_clusters("%s.%s.pred.txt" % (output_dir, mode + anno_source), "%s.%s.clust.txt" % (output_dir, mode + anno_source))

	# Evaluating predicted clusters
	pred_clusters = GS.Clusters(False)
	pred_clusters.read_file("%s.%s.clust.txt" % (output_dir, mode + anno_source))
#	utils.clustering_evaluation(train.complexes, pred_clusters, "Train", True)
	clusterEvaluationScores = utils.clustering_evaluation(eval.complexes, pred_clusters, "", True)
	outFH = open("%s.%s.evaluation.txt" % (output_dir, mode + anno_source), "w")

	head = clusterEvaluationScores[1]
	cluster_scores = clusterEvaluationScores[0]

	tmp_head = head.split("\t")
	tmp_scores = cluster_scores.split("\t")
	for i in range(len(tmp_head)):
		outFH.write("%s\t%s" % (tmp_head[i], tmp_scores[i]))
		outFH.write("\n")
Exemplo n.º 5
0
                              value_max=EPS_MAX,
                              value_min=EPS_MIN,
                              value_test=0.05,
                              nb_steps=EPS_STEPS)

# Collect the agent's settings in one place, then build and compile it.
agent_kwargs = {
    "model": model,
    "policy": policy,
    "nb_actions": 2,
    "memory": memory,
    "processor": processor,
    "nb_steps_warmup": WARMUP_STEPS,
    "gamma": GAMMA,
    "target_model_update": TARGET_MODEL_UPDATE,
    "train_interval": 1,
    "delta_clip": 1,
    "batch_size": BATCH_SIZE,
    "enable_double_dqn": DOUBLE_DQN,
}
dqn = DQNAgent(**agent_kwargs)
dqn.compile(Adam(lr=LR))

# Train with the Metrics callback attached (interval of 5000), then save the
# target model to FP_MODEL.
metrics = Metrics(X_val, y_val, interval=5_000)
dqn.fit(
    env,
    nb_steps=training_steps,
    log_interval=LOG_INTERVAL,
    callbacks=[metrics],
)
dqn.target_model.save(FP_MODEL)

# Validate on test dataset, using the model as reloaded from disk
trained_model = load_model(FP_MODEL)
y_pred = make_predictions(trained_model, X_test)
plot_conf_matrix(y_test, y_pred)
Exemplo n.º 6
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-s",
        "--feature_selection",
        type=str,
        help=
        "Select which features to use. This is an 8 position long array of 0 and 1, where each position determines which co-elution feature to use. Features sorted by position are: MI, Bayes, Euclidean, WCC, Jaccard, PCCN, PCC, and Apex.  Each default=11101001",
        default="11101001")
    parser.add_argument(
        "input_dir",
        type=str,
        help="Directory containing the elution files for each experiment")

    parser.add_argument(
        "-t",
        "--taxid",
        type=str,
        help="TAXID to automatically download reference from GO,CORUM,INtACT",
        default="")
    parser.add_argument(
        "-c",
        "--cluster",
        type=str,
        help="Path to file containing protein clsuter reference",
        default="")
    parser.add_argument("-p",
                        "--ppi",
                        type=str,
                        help="path to ppi File",
                        default="")

    parser.add_argument("output_dir",
                        type=str,
                        help="Directory containing the output files")
    parser.add_argument("-o",
                        "--output_prefix",
                        type=str,
                        help="Prefix name for all output Files",
                        default="Out")

    parser.add_argument(
        "-M",
        "--classifier",
        type=str,
        help="Select which classifier to use. Values: RF SVM, default RF",
        default="RF")
    parser.add_argument("-n",
                        "--num_cores",
                        type=int,
                        help="Number of cores to be used, default 1",
                        default=1)

    parser.add_argument(
        "-m",
        "--mode",
        type=str,
        help=
        "Run EPIC with experimental, functional, or both evidences. Values: EXP, FA, COMB, default: EXP  ",
        default="EXP")
    parser.add_argument(
        "-f",
        "--fun_anno_source",
        type=str,
        help=
        "Where to get functional annotaiton from. Values: STRING or GM or FILE, default= GM",
        default="GM")
    parser.add_argument(
        "-F",
        "--fun_anno_file",
        type=str,
        help=
        "Path to File containing functional annotation. This flag needs to be set when using FILE as fun_anno_source.",
    )
    parser.add_argument("-r",
                        "--co_elution_cutoff",
                        type=float,
                        help="Co-elution score cutoff. default 0.5",
                        default=0.5)
    parser.add_argument(
        "-R",
        "--classifier_cutoff",
        type=float,
        help="Classifier confidence valye cutoff. default = 0.5",
        default=0.5)
    parser.add_argument(
        "-e",
        "--elution_max_count",
        type=int,
        help=
        "Removies protein that have a maximal peptide count less than the given value. default = 1",
        default=1)
    parser.add_argument(
        "-E",
        "--frac_count",
        type=int,
        help=
        "Number of fracrions a protein needs to be measured in. default = 2",
        default=2)

    parser.add_argument(
        "-P",
        "--precalcualted_score_file",
        type=str,
        help=
        "Path to precalulated scorefile to read scores from for faster rerunning of EPIC. default = None",
        default="NONE")

    args = parser.parse_args()

    args.mode = args.mode.upper()
    args.fun_anno_source = args.fun_anno_source.upper()

    #Create feature combination
    if args.feature_selection == "00000000":
        print "Select at least one feature"
        sys.exit()

    this_scores = utils.get_fs_comb(args.feature_selection)
    print "\t".join([fs.name for fs in this_scores])

    # Initialize CLF
    use_rf = args.classifier == "RF"
    clf = CS.CLF_Wrapper(args.num_cores, use_rf)

    # Load elution data
    foundprots, elution_datas = utils.load_data(args.input_dir,
                                                this_scores,
                                                fc=args.frac_count,
                                                mfc=args.elution_max_count)

    # Generate reference data set
    gs = ""
    if ((args.taxid != "" and args.ppi != "")
            or (args.cluster != "" and args.ppi != "")):
        print "Refernce from cluster and PPI are nor compatiple. Please supply ppi or complex reference, not both!"
        sys.exit()

    if args.taxid == "" and args.ppi == "" and args.cluster == "":
        print "Please supply a reference by setting taxid, cluster, or ppi tag"
        sys.exit()

    gs_clusters = []
    if (args.taxid != "" and args.cluster == "" and args.ppi == ""):
        print "Loading clusters from GO, CORUM, and Intact"
        gs_clusters.extend(utils.get_reference_from_net(args.taxid))

    if args.cluster != "":
        print "Loading complexes from file"
        if args.mode == "FA":
            gs_clusters.append(GS.FileClusters(args.cluster, "all"))
        else:
            gs_clusters.append(GS.FileClusters(args.cluster, foundprots))

    if args.ppi != "":
        print "Reading PPI file from %s" % args.reference
        gs = Goldstandard_from_PPI_File(args.ppi, foundprots)

    print gs_clusters
    if len(gs_clusters) > 0:
        gs = utils.create_goldstandard(gs_clusters, args.taxid, foundprots)

    output_dir = args.output_dir + os.sep + args.output_prefix

    refFH = open(output_dir + ".ref_complexes.txt", "w")
    for comp in gs.complexes.complexes:
        print >> refFH, "%s\t%s" % (",".join(comp), ",".join(
            gs.complexes.complexes[comp]))
    refFH.close()

    scoreCalc = CS.CalculateCoElutionScores(this_scores,
                                            elution_datas,
                                            output_dir + ".scores.txt",
                                            num_cores=args.num_cores,
                                            cutoff=args.co_elution_cutoff)
    if args.precalcualted_score_file == "NONE":
        scoreCalc.calculate_coelutionDatas(gs)
    else:
        scoreCalc.readTable(args.precalcualted_score_file, gs)

    print scoreCalc.scores.shape

    functionalData = ""
    gs.positive = set(gs.positive & set(scoreCalc.ppiToIndex.keys()))
    gs.negative = set(gs.negative & set(scoreCalc.ppiToIndex.keys()))
    gs.rebalance()

    print len(gs.positive)
    print len(gs.negative)

    if args.mode != "EXP":
        print "Loading functional data"
        functionalData = utils.get_FA_data(args.fun_anno_source, args.taxid,
                                           args.fun_anno_file)
        print "Dimension of fun anno " + str(functionalData.scores.shape)

    print "Start benchmarking"

    if args.mode == "EXP":
        utils.cv_bench_clf(scoreCalc,
                           clf,
                           gs,
                           output_dir,
                           format="pdf",
                           verbose=True,
                           folds=5)

    if args.mode == "COMB":
        tmp_sc = copy.deepcopy(scoreCalc)
        tmp_sc.add_fun_anno(functionalData)
        utils.cv_bench_clf(tmp_sc,
                           clf,
                           gs,
                           output_dir,
                           format="pdf",
                           verbose=True,
                           folds=5)

    if args.mode == "FA":
        utils.cv_bench_clf(functionalData,
                           clf,
                           gs,
                           output_dir,
                           format="pdf",
                           verbose=True,
                           folds=5)

    # PPI evaluation
    print utils.cv_bench_clf(scoreCalc,
                             clf,
                             gs,
                             args.output_dir,
                             verbose=False,
                             format="pdf",
                             folds=5)
    #print "I am here"

    network = utils.make_predictions(scoreCalc,
                                     args.mode,
                                     clf,
                                     gs,
                                     fun_anno=functionalData)

    # Predict protein interaction
    outFH = open("%s.pred.txt" % (output_dir), "w")

    final_network = []
    for PPI in network:
        items = PPI.split("\t")
        if float(items[2]) >= args.classifier_cutoff:
            final_network.append(PPI)

    print >> outFH, "\n".join(final_network)
    outFH.close()

    # Predicting clusters
    utils.predict_clusters("%s.pred.txt" % (output_dir),
                           "%s.clust.txt" % (output_dir))

    # Evaluating predicted clusters
    pred_clusters = GS.Clusters(False)
    pred_clusters.read_file("%s.clust.txt" % (output_dir))
    overlapped_complexes_with_reference = gs.get_complexes(
    ).get_overlapped_complexes_set(pred_clusters)
    print "# of complexes in reference dataset: " + str(
        len(overlapped_complexes_with_reference))
    #clust_scores, header = utils.clustering_evaluation(gs.complexes, pred_clusters, "", False)
    clust_scores, header, composite_score = utils.clustering_evaluation(
        gs.complexes, pred_clusters, "", False)
    outFH = open("%s.eval.txt" % (output_dir), "w")
    header = header.split("\t")
    clust_scores = clust_scores.split("\t")
    for i, head in enumerate(header):
        print "%s\t%s" % (head, clust_scores[i])
        print >> outFH, "%s\t%s" % (head, clust_scores[i])
    outFH.close()
                                      nb_steps=EPS_STEPS)

        dqn = DQNAgent(model=model,
                       policy=policy,
                       nb_actions=2,
                       memory=memory,
                       processor=processor,
                       nb_steps_warmup=WARMUP_STEPS,
                       gamma=GAMMA,
                       target_model_update=TARGET_MODEL_UPDATE,
                       train_interval=4,
                       delta_clip=1,
                       batch_size=BATCH_SIZE,
                       enable_double_dqn=DOUBLE_DQN)
        dqn.compile(Adam(lr=LR))

        metrics = Metrics(X_val, y_val)
        dqn.fit(env,
                nb_steps=TRAINING_STEPS,
                log_interval=LOG_INTERVAL,
                callbacks=[metrics],
                verbose=0)
        y_pred = make_predictions(dqn.target_model, X_test)
        stats = calculate_metrics(y_test, y_pred)  # Get stats as dictionairy
        writer.writerow(stats)  # Write dictionairy as row
        f.flush(
        )  # Save results to file inbetween iterations as to not lose results

        if not i % LOG_EVERY:
            print(f"{i}: FN: {stats.get('FN')}, FP: {stats.get('FP')}")
Exemplo n.º 8
0
st.title("UDEA Project")

# Let the user choose which detection task the predictions are made for.
st.markdown('## Select task prediction')
model = st.radio("Pick the model you want to generate the predictions",
                 ('WORD DETECTION', 'SHEET DETECTION'))

with st.beta_expander('Plot image with annotations.', ):
    img_file_buffer2 = st.file_uploader("Choose another image...", type="jpg")

    # Everything below only runs once an image has been uploaded.
    if img_file_buffer2 is not None:
        image2 = Image.open(img_file_buffer2)
        img_array2 = np.array(image2)

        predictions = make_predictions(image2, model)
        print(predictions)

        # Make each label unique by appending its row index.
        df = pd.DataFrame(predictions)
        row_suffix = df.index.values.astype(str)
        df['labels'] = df['labels'] + row_suffix

        # Highest scores first; drop everything scoring 0.1 or below.
        df = df.sort_values(by=['scores'], ascending=False)
        df = df[df['scores'] > 0.1]

        if len(df) > 0:
            # The slider works in whole percent; convert to a 0-1 fraction.
            slider_percent = st.slider('Select confidence',
                                       min_value=0,
                                       max_value=100,
                                       value=50,
                                       step=5)
            add_selectbox = slider_percent / 100.0
Exemplo n.º 9
0
from utils import make_predictions
import pandas as pd
from sklearn.model_selection import train_test_split

if __name__ == "__main__":
    # Locations of the saved model, feature list and scaler (version 1).
    model = "../saved_models/model_v1.sav"
    features = "../saved_models/saved_features_v1.csv"
    scaler = "../saved_models/saved_scaler_v1.sav"

    # Sample input text to run through the saved pipeline.
    data = "Testing data hello there, ability trump. bad ..! adwa"

    prediction = make_predictions(model, features, scaler, data)
    print(prediction)