scores = defaultdict(list)
# loop over repetitions
for q in range(conf.num_reps):
    training_list = cPickle.load(training_input)
    test_list = [i for i in range(num_actives) if i not in training_list[:num_query_mols]]
    test_list += [i for i in range(num_decoys) if i not in training_list[num_query_mols:]]
    # loop over fps
    single_score = defaultdict(list)
    for fp in fp_names:
        query_fps = [actives[i][1][fp] for i in training_list[:num_query_mols]]
        # test_list: first actives then decoys
        test_fps = [[actives[i][0], actives[i][1][fp], 1] for i in test_list[:num_test_actives]]
        test_fps += [[decoys[i][0], decoys[i][1][fp], 0] for i in test_list[num_test_actives:]]
        for tmp_mol in test_fps:
            tmp_score = scor.getBulkSimilarity(tmp_mol[1], query_fps, simil_metric)
            tmp_score.sort(reverse=True)
            # use max fusion
            # store: [similarity, internal ID, active/inactive]
            single_score[fp].append([tmp_score[0], tmp_mol[0], tmp_mol[2]])
        # rank list according to similarity
        scores[fp].append(sorted(single_score[fp], reverse=True))

# write scores to file
if do_append:
    outfile = gzip.open(outpath+'/list_'+dataset+'_'+str(target)+'.pkl.gz', 'ab+')  # binary format
else:
    outfile = gzip.open(outpath+'/list_'+dataset+'_'+str(target)+'.pkl.gz', 'wb+')  # binary format
for fp in fp_names:
    cPickle.dump([fp, scores[fp]], outfile, 2)
outfile.close()
print "scoring done and scored lists written"
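# A minimal sketch (not part of the original script) of how the scored lists
# written above could be read back: each cPickle.dump call appends one
# [fingerprint_name, ranked_lists] record to the gzipped file (and the 'ab+'
# mode may add further records later), so a reader keeps calling cPickle.load
# until EOFError. The function and variable names here are illustrative only.
def read_scored_lists(path):
    import gzip
    import cPickle
    from collections import defaultdict
    results = defaultdict(list)
    infile = gzip.open(path, 'rb')
    try:
        while True:
            fp_name, ranked_lists = cPickle.load(infile)
            # ranked_lists holds one sorted [similarity, internal ID, active/inactive]
            # list per repetition
            results[fp_name] += ranked_lists
    except EOFError:
        pass
    infile.close()
    return dict(results)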
# random forest scoring: rank the test molecules by the predicted probability of
# being active, with the maximal fingerprint similarity to the training molecules
# as secondary sort key
np_train_fps += [np_fps_dcy[i] for i in training_list[num_query_mols:]]
# fit random forest
ml.fit(np_train_fps, ys_fit)
# test fps and molecule info
test_fps = [actives[i][1] for i in test_list[:num_test_actives]]
test_fps += [decoys[i][1] for i in test_list[num_test_actives:]]
np_test_fps = [np_fps_act[i] for i in test_list[:num_test_actives]]
np_test_fps += [np_fps_dcy[i] for i in test_list[num_test_actives:]]
test_mols = [[actives[i][0], 1] for i in test_list[:num_test_actives]]
test_mols += [[decoys[i][0], 0] for i in test_list[num_test_actives:]]
# calculate similarity with standard fp
std_simil = []
for fp in test_fps:
    tmp_simil = scor.getBulkSimilarity(fp, train_fps, simil_metric)
    tmp_simil.sort(reverse=True)
    std_simil.append(tmp_simil[0])
# rank based on probability (and second based on similarity)
single_score = ml.predict_proba(np_test_fps)
# store: [probability, similarity, internal ID, active/inactive]
single_score = [[m[1], s, t[0], t[1]] for m, s, t in zip(single_score, std_simil, test_mols)]
single_score.sort(reverse=True)
scores['rf_'+fp_build].append(single_score)

# write scores to file
if do_append:
    outfile = gzip.open(outpath+'/list_'+dataset+'_'+str(target)+'.pkl.gz', 'ab+')  # binary format
else:
    outfile = gzip.open(outpath+'/list_'+dataset+'_'+str(target)+'.pkl.gz', 'wb+')  # binary format
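# A minimal sketch (not taken from the original script) of the setup assumed by
# the random-forest block above: 'ml' as a scikit-learn RandomForestClassifier
# and np_fps_act/np_fps_dcy as numpy versions of the RDKit fingerprints so that
# ml.fit() and ml.predict_proba() accept them. The helper name
# convert_fps_to_numpy and all parameter values are illustrative, not the
# original ones.
import numpy
from rdkit import DataStructs
from sklearn.ensemble import RandomForestClassifier

def convert_fps_to_numpy(bit_vector_fps):
    # convert RDKit bit-vector fingerprints into numpy arrays for scikit-learn
    np_fps = []
    for fp in bit_vector_fps:
        arr = numpy.zeros((1,))
        DataStructs.ConvertToNumpyArray(fp, arr)
        np_fps.append(arr)
    return np_fps

# e.g., assuming actives/decoys are lists of (internal ID, fingerprint) pairs as above:
# np_fps_act = convert_fps_to_numpy([fp for (mol_id, fp) in actives])
# np_fps_dcy = convert_fps_to_numpy([fp for (mol_id, fp) in decoys])
ml = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=1234, n_jobs=1)
# ys_fit: class labels for the training fingerprints, actives (1) first, then decoys (0), e.g.
# ys_fit = [1]*num_query_mols + [0]*(len(np_train_fps) - num_query_mols)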