示例#1
0
def metrics_for_target(pred, actual, mask):
    """Compute ranking metrics for one target, restricted to masked entries.

    Args:
        pred: Array of predicted scores; it is squeezed before the mask is
            applied.
        actual: Array of known values, indexed with the same boolean mask.
        mask: Array-like selecting the entries to evaluate; coerced to bool.

    Returns:
        The result of ``Scoring.CalcEnrichment`` for the fractions 0.001,
        0.005, 0.01 and 0.05, extended with the AUC and the BEDROC
        (alpha=20), all computed on column 0 of the reordered actual values.
    """
    # ``np.bool`` was only an alias of the builtin ``bool``; it was deprecated
    # in NumPy 1.20 and removed in 1.24, so use the builtin directly.
    mask = np.array(mask, dtype=bool)
    masked_preds = pred.squeeze()[mask]
    # Reversed argsort: indices of the masked predictions, highest first.
    order = np.flipud(np.argsort(masked_preds))
    masked_ordered_actual = actual[mask][order]
    return Scoring.CalcEnrichment(
        masked_ordered_actual, 0, [.001, .005, .01, .05]) + [
            Scoring.CalcAUC(masked_ordered_actual, 0),
            Scoring.CalcBEDROC(masked_ordered_actual, 0, 20)
        ]
示例#2
0
def evaluate(activity_arr):
    """Print AUC, EF (1% and 5%), RIE and BEDROC for ``activity_arr``.

    Every metric is computed by the ``Scoring`` module using column index 0
    of ``activity_arr``; RIE and BEDROC use the parameter value 100.
    """
    column = 0
    auc_value = Scoring.CalcAUC(activity_arr, column)
    print("AUC: ", auc_value)
    # One enrichment call per fraction, printed right after it is computed.
    for label, fraction in (("EF for 1%: ", 0.01), ("EF for 5%: ", 0.05)):
        enrichment = Scoring.CalcEnrichment(activity_arr, column, [fraction])
        print(label, enrichment[0])
    rie_value = Scoring.CalcRIE(activity_arr, column, 100)
    print("RIE for 100: ", rie_value)
    bedroc_value = Scoring.CalcBEDROC(activity_arr, column, 100)
    print("BEDROC for 100: ", bedroc_value)
def evaluation(activity_arr: list, output_file: str):
    """Compute screening metrics and store them as JSON in ``output_file``.

    The parent directory of ``output_file`` is created first through
    ``inputoutput_utils.create_parent_directory``. The written object has
    the keys ``AUC``, ``EF1``, ``EF5``, ``RIE`` and ``BEDROC``; all metrics
    use column index 0 of ``activity_arr``.
    """
    inputoutput_utils.create_parent_directory(output_file)
    column = 0
    auc_value = Scoring.CalcAUC(activity_arr, column)
    enrichment_1 = Scoring.CalcEnrichment(activity_arr, column, [0.01])
    enrichment_5 = Scoring.CalcEnrichment(activity_arr, column, [0.05])
    rie_value = Scoring.CalcRIE(activity_arr, column, 100)
    bedroc_value = Scoring.CalcBEDROC(activity_arr, column, 100)
    summary = dict([
        ("AUC", auc_value),
        ("EF1", enrichment_1[0]),
        ("EF5", enrichment_5[0]),
        ("RIE", rie_value),
        ("BEDROC", bedroc_value),
    ])
    with open(output_file, "w", encoding="utf-8") as handle:
        json.dump(summary, handle)
示例#4
0
 def test3(self):
     """ Check the area under the ROC curve (AUC) on the fixture score lists. """
     # best case
     self.assertAlmostEqual(
         Scoring.CalcAUC(self.scoreBestCase, self.index), 1.0, self.acc)
     # worst case
     self.assertAlmostEqual(
         Scoring.CalcAUC(self.scoreWorstCase, self.index), 0.0, self.acc)
     # empty list
     with self.assertRaises(ValueError):
         Scoring.CalcAUC(self.scoreEmptyList, self.index)
     # all actives
     self.assertAlmostEqual(
         Scoring.CalcAUC(self.scoreAllActives, self.index), 0.0, self.acc)
     # all decoys
     self.assertAlmostEqual(
         Scoring.CalcAUC(self.scoreAllDecoys, self.index), 0.0, self.acc)
示例#5
0
def macro_bedroc(y_true, y_pred, a=20):
    """
    Compute the macro-averaged BEDROC score with rdkit's
    ML.Scoring.Scoring.CalcBEDROC.

    Args:
        y_true: DataFrame of known labels, one row per compound and one
            column per target label
        y_pred: DataFrame of predicted label probabilities whose rows and
            columns match y_true
        a: alpha value passed to the BEDROC calculation. NOTE: scores are
            only comparable when they were computed with the same alpha
    Returns:
        The mean of the per-column BEDROC scores; columns of y_true whose
        values sum to zero are left out of the mean

    """
    per_target = []
    for target in y_true:
        known = y_true[target]
        # A column whose labels sum to zero contributes nothing to the mean.
        if np.sum(known) == 0:
            continue
        ranked = pd.DataFrame({
            'proba': np.array(y_pred[target]),
            'active': np.array(known),
        })
        # Highest predicted probability first, as a ranked list.
        ranked.sort_values(by='proba', ascending=False, inplace=True)
        per_target.append(
            Scoring.CalcBEDROC(np.array(ranked), col=1, alpha=a))

    return np.mean(per_target)
def evaluation(activity_arr: list, output_file: str):
    """Write AUC, EF (1%, 5%), RIE and BEDROC for ``activity_arr`` as text.

    The output file is opened before any metric is computed, and each metric
    is written as soon as it is available, one ``label: value`` entry per
    line. All metrics use column index 0 of ``activity_arr``.
    """
    column = 0
    with open(output_file, "w") as out:
        auc_value = Scoring.CalcAUC(activity_arr, column)
        out.write("AUC: " + str(auc_value))
        enrichment = Scoring.CalcEnrichment(activity_arr, column, [0.01])
        out.write("\nEF for 1%: " + str(enrichment[0]))
        enrichment = Scoring.CalcEnrichment(activity_arr, column, [0.05])
        out.write("\nEF for 5%: " + str(enrichment[0]))
        rie_value = Scoring.CalcRIE(activity_arr, column, 100)
        out.write("\nRIE for 100: " + str(rie_value))
        bedroc_value = Scoring.CalcBEDROC(activity_arr, column, 100)
        out.write("\nBEDROC for 100: " + str(bedroc_value))
示例#7
0
 def test4(self):
     """ Check BEDROC on the fixture score lists and for invalid arguments. """
     # best case
     self.assertAlmostEqual(
         Scoring.CalcBEDROC(self.scoreBestCase, self.index, self.alpha), 1.0, self.acc)
     # worst case
     self.assertAlmostEqual(
         Scoring.CalcBEDROC(self.scoreWorstCase, self.index, self.alpha), 0.0, self.acc)
     # empty list
     with self.assertRaises(ValueError):
         Scoring.CalcBEDROC(self.scoreEmptyList, self.index, self.alpha)
     # alpha == 0.0
     with self.assertRaises(ValueError):
         Scoring.CalcBEDROC(self.scoreBestCase, self.index, 0.0)
     # all actives
     self.assertEqual(
         Scoring.CalcBEDROC(self.scoreAllActives, self.index, self.alpha), 1.0)
     # all decoys
     self.assertEqual(
         Scoring.CalcBEDROC(self.scoreAllDecoys, self.index, self.alpha), 0.0)
示例#8
0
 def test2(self):
     """ Check RIE against its closed-form extremes and for invalid arguments. """
     active_ratio = float(self.numActives) / self.numMol
     # best case: expected value from the closed-form maximum
     expected_max = ((1 - math.exp(-self.alpha*active_ratio)) / (1 - math.exp(-self.alpha))) / active_ratio
     self.assertAlmostEqual(
         Scoring.CalcRIE(self.scoreBestCase, self.index, self.alpha), expected_max, self.acc)
     # worst case: expected value from the closed-form minimum
     expected_min = ((1 - math.exp(self.alpha*active_ratio)) / (1 - math.exp(self.alpha))) / active_ratio
     self.assertAlmostEqual(
         Scoring.CalcRIE(self.scoreWorstCase, self.index, self.alpha), expected_min, self.acc)
     # empty list
     with self.assertRaises(ValueError):
         Scoring.CalcRIE(self.scoreEmptyList, self.index, self.alpha)
     # alpha == 0
     with self.assertRaises(ValueError):
         Scoring.CalcRIE(self.scoreBestCase, self.index, 0.0)
     # all decoys
     self.assertEqual(
         Scoring.CalcRIE(self.scoreAllDecoys, self.index, self.alpha), 0.0)
示例#9
0
def main():
    """Plot one averaged ROC curve per target, coloured by target category.

    Target ids (column 0) and categories (column 1) are read from
    ``args.target_ids``. For every target, each immediate sub-directory of
    ``args.result_dir`` is searched for a ``<sub_dir>/<target id>`` folder;
    every ``.csv`` file in it is turned into a ROC curve with
    ``Scoring.CalcROC`` (column 1 of each row is converted to int and used
    as the label column), interpolated onto a common 100-point FPR grid and
    averaged. The figure is saved as ``avg_roc.png`` in ``args.output_dir``.

    Targets whose category is not one of easy / moderate / hard /
    unfeasible are averaged but not plotted.
    """
    args = parse_arguments()
    # Read the target file once; each row supplies (id, category).
    targets = [(row[0], row[1]) for row in read_csv(args.target_ids)]
    category_colors = {
        'easy': 'blue',
        'moderate': 'orange',
        'hard': 'green',
        'unfeasible': 'magenta',
    }
    # The set of run sub-directories does not depend on the target.
    sub_dir_list = next(os.walk(args.result_dir))[1]
    mean_fpr = np.linspace(0, 1, 100)

    plt.figure(figsize=(5, 5))
    for unp_id, category in targets:
        print(unp_id)
        tprs = []

        for sub_dir in sub_dir_list:
            scores_dir = os.path.join(args.result_dir, sub_dir,
                                      '{}'.format(unp_id))
            print(scores_dir)
            if not os.path.isdir(scores_dir):
                continue

            for filename in os.listdir(scores_dir):
                if not filename.endswith('.csv'):
                    continue
                print(filename)
                # Build the full path instead of changing the working
                # directory: os.chdir() made a relative result_dir or
                # output_dir resolve against the wrong folder afterwards.
                rows = read_csv(os.path.join(scores_dir, filename))
                scores = [[row[0], int(row[1])] for row in rows]
                fpr, tpr = Scoring.CalcROC(scores, 1)
                tpr = np.array(tpr)
                tprs.append(interp(mean_fpr, fpr, tpr))

        if not tprs:
            print("no info for target {}".format(unp_id))
            continue

        mean_tpr = np.mean(tprs, axis=0)
        # Pin the last point of the averaged curve to TPR = 1.
        mean_tpr[-1] = 1.0
        color = category_colors.get(category)
        if color is not None:
            plot_curve(fpr=mean_fpr, tpr=mean_tpr, color=color)

    plt.savefig(os.path.join(args.output_dir, 'avg_roc.png'))
示例#10
0
 def test1(self):
     """ Check the enrichment factor on the fixture score lists and edge-case fractions. """
     calc = Scoring.CalcEnrichment
     # best case
     result = calc(self.scoreBestCase, self.index, self.fractions)
     self.assertAlmostEqual(result[0], float(self.numActives), self.acc)
     # worst case
     result = calc(self.scoreWorstCase, self.index, self.fractions)
     self.assertAlmostEqual(result[0], 0.0, self.acc)
     # empty list
     with self.assertRaises(ValueError):
         calc(self.scoreEmptyList, self.index, self.fractions)
     # all actives
     result = calc(self.scoreAllActives, self.index, self.fractions)
     self.assertAlmostEqual(result[0], 1.0, self.acc)
     # all decoys
     result = calc(self.scoreAllDecoys, self.index, self.fractions)
     self.assertEqual(result[0], 0.0)
     # fraction * numMol is smaller than 1
     result = calc(self.scoreBestCase, self.index, self.fracSmall)
     self.assertAlmostEqual(result[0], float(self.numActives), self.acc)
     # fraction list is empty
     with self.assertRaises(ValueError):
         calc(self.scoreBestCase, self.index, [])
     # fraction == 0.0
     result = calc(self.scoreBestCase, self.index, [0.0])
     self.assertAlmostEqual(result[0], float(self.numActives), self.acc)
     # fraction < 0
     with self.assertRaises(ValueError):
         calc(self.scoreBestCase, self.index, [-0.05])
     # fraction > 1
     with self.assertRaises(ValueError):
         calc(self.scoreBestCase, self.index, [1.5])
def main():
    """Plot the ROC curve of every score file, coloured by target category.

    Target ids (column 0) and categories (column 1) are read from
    ``args.target_ids``. For each target, every ``.csv`` file found in
    ``<args.result_dir>/<target id>`` is converted into a ROC curve with
    ``Scoring.CalcROC`` (column 1 of each row is converted to int and used
    as the label column) and drawn with ``plot_curve``. ``args.label`` is
    written onto the figure, which is saved as ``mlt_roc.png`` in
    ``args.output_dir``.

    Files of targets whose category is not one of easy / moderate / hard /
    unfeasible are read but not plotted.
    """
    args = parse_arguments()
    # Read the target file once; each row supplies (id, category).
    targets = [(row[0], row[1]) for row in read_csv(args.target_ids)]
    category_colors = {
        'easy': 'blue',
        'moderate': 'orange',
        'hard': 'green',
        'unfeasible': 'magenta',
    }

    plt.figure(figsize=(5, 5))
    for unp_id, category in targets:
        scores_dir = os.path.join(args.result_dir, '{}'.format(unp_id))
        if not os.path.isdir(scores_dir):
            continue

        for filename in os.listdir(scores_dir):
            if not filename.endswith('.csv'):
                continue
            print(filename)
            # Build the full path instead of changing the working directory:
            # os.chdir() made a relative result_dir or output_dir resolve
            # against the wrong folder on later iterations.
            rows = read_csv(os.path.join(scores_dir, filename))
            scores = [[row[0], int(row[1])] for row in rows]
            print(scores)
            fpr, tpr = Scoring.CalcROC(scores, 1)
            tpr = np.array(tpr)
            print(unp_id)

            color = category_colors.get(category)
            if color is not None:
                plot_curve(fpr=fpr, tpr=tpr, color=color)

    plt.text(0.57, 0.05, args.label)
    plt.savefig(os.path.join(args.output_dir, 'mlt_roc.png'))
示例#12
0
 def process_results(target):
     """Collect AUC, EF and BEDROC rows for every ROC curve of ``target``.

     ``build_rocs`` is called with plotting enabled only when the
     module-level ``writefile`` flag is false.

     Returns:
         ``None`` when ``build_rocs`` yields ``None`` for both labels and
         rocs; otherwise the tuple ``(header, auc_row, ef1_row, ef1r_row,
         bedroc_row, target)``. ``header`` is the label list from
         ``build_rocs``; each row starts with ``target`` followed by one
         value per ROC curve.
     """
     # Single-argument print() behaves the same on Python 2 and Python 3.
     print("###Checking target %s..." % target)
     labels, rocs, _results = build_rocs(target,
                                         plot=not (writefile))
     # Identity checks: ``==`` against None is unreliable for objects that
     # overload equality.
     if labels is None and rocs is None:
         return None
     print("Data loaded")
     header = labels
     auc_row = [target]
     ef1_row = [target]
     ef1r_row = [target]
     bedroc_row = [target]
     for roc in rocs:
         auc_row.append(roc.auc())
         # NOTE(review): ef1_row is computed with 10 and ef1r_row with 1 as
         # the second get_EF argument - confirm the names match the intent.
         ef1_row.append(get_EF(roc, 10, relative=True))
         ef1r_row.append(get_EF(roc, 1, relative=True))
         bedroc_row.append(Scoring.CalcBEDROC(roc.data, 0, 20))
     return header, auc_row, ef1_row, ef1r_row, bedroc_row, target
示例#13
0
def main():
    """Print rank statistics for a fixed score file and save its ROC plot.

    Rows are read from a hard-coded CSV path; column 0 is kept as is and
    column 1 is converted to int and used as the activity column. EF at
    1/2/5/10 %, AUC, BEDROC (alpha=20) and RIE (alpha=20) are printed,
    each rounded to one decimal, and the ROC curve is written to
    ``test_roc.png``.
    """
    rows = read_csv('/home/amukhopadhyay/ligand_screener_testing/screening_scores.csv')
    scores = [[row[0], int(row[1])] for row in rows]

    # The same metrics are also available from rdkit's Scoring module
    # (CalcAUC, CalcBEDROC, CalcEnrichment, CalcRIE); the project's
    # RankStatistics implementation is used here instead.
    rank_stats = StatisticalDescriptors.RankStatistics(scores, activity_column=operator.itemgetter(1))
    for fraction in (0.01, 0.02, 0.05, 0.1):
        print(round(rank_stats.EF(fraction), 1))
    print(round(rank_stats.AUC(), 1))
    print(round(rank_stats.BEDROC(alpha=20), 1))
    print(round(rank_stats.RIE(alpha=20), 1))

    false_positive_rate, true_positive_rate = Scoring.CalcROC(scores, 1)
    roc_auc = metrics.auc(false_positive_rate, true_positive_rate)

    plt.title('Receiver Operating Characteristic')
    plt.plot(false_positive_rate, true_positive_rate, 'b',
             label='AUC = %0.2f' % roc_auc)
    plt.legend(loc='lower right')
    # Dashed diagonal as visual reference.
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.savefig('test_roc.png')
示例#14
0
def main():
    """Evaluate prediction files against known targets and write CSV summaries.

    For every file given with ``-y`` the dataset name is taken from the part
    of the file name between the first ``_`` and the following ``.``. All
    ``-P`` files whose lower-cased name contains that dataset name are
    loaded; the classifier name is taken from the same position of the
    prediction file name. Predictions whose column count differs from the
    known targets are skipped. A ``ConsensusAverage`` entry (mean of all
    predictions except ``stack``) is added, ranking metrics are computed for
    each entry, printed, and written to ``<output>/<name>_results.csv``.
    """
    # Local import so the block does not depend on the file's import list.
    import os

    parser = argparse.ArgumentParser(description='Evaluate prediction results')
    parser.add_argument('-P',
                        '--pred_data',
                        action='store',
                        nargs='*',
                        dest='P',
                        help='Predicted targets for model (.csv format)')
    parser.add_argument('-y',
                        '--y_data',
                        action='store',
                        nargs='*',
                        dest='y',
                        help='Known target values (.csv format)')
    parser.add_argument('-i',
                        '--input_directory',
                        action='store',
                        nargs=1,
                        dest='input',
                        default=['./'],
                        help='Directory where input files are stored')
    parser.add_argument('-o',
                        '--output_directory',
                        action='store',
                        nargs=1,
                        dest='output',
                        default=['./'],
                        help='Directory where output files should be written')
    args = vars(parser.parse_args())

    input_dir = args['input'][0]
    output_dir = args['output'][0]

    # argparse leaves omitted '*' options as None; treat that as "no files"
    # rather than crashing. Predictions are sorted to keep the result order
    # consistent.
    pred_files = sorted(args['P'] or [])
    y_files = args['y'] or []

    result_columns = [
        'Model', 'micro_AUROC', 'macro_AUROC', 'Frac_1_in_top10',
        'Frac_all_in_top10', 'micro_AP', 'macro_AP', 'micro_BEDROC',
        'macro_BEDROC', 'coverage'
    ]

    # Loop through all datasets to evaluate
    for y_file in y_files:
        name = y_file.split('_')[1]
        name = name.split('.')[0]
        # os.path.join also works when the directory has no trailing slash.
        y = pd.read_csv(os.path.join(input_dir, y_file))

        # Collect predictions for the corresponding dataset - e.g. train, test
        predictions = {}
        for pred_file in pred_files:
            if name not in pred_file.lower():
                continue
            pred = pd.read_csv(os.path.join(input_dir, pred_file))

            # Get classifier name from the file name
            clf_name = pred_file.split('.')[0]
            clf_name = clf_name.split('_')[1]

            # Skip predictions which don't have the correct dimensions.
            # This handles feature files used in stacking whose width
            # differs from the targets - e.g. an MLP hidden layer.
            if len(pred.columns) == len(y.columns):
                predictions[clf_name] = pred

        # Mean of the base classifier predictions (everything but 'stack')
        pred_base = [
            df.values for key, df in predictions.items()
            if key not in ['stack']
        ]
        if pred_base:
            average_values = sum(pred_base) / len(pred_base)
            predictions['ConsensusAverage'] = pd.DataFrame(average_values)

        results = []
        for clf, pred in predictions.items():
            ranking = get_ranking(y, pred)
            # Index 9 is the value reported in the *_in_top10 columns.
            tp_cmpd = true_positive_per_compound(ranking)[9]
            tp_all = true_positives_recovered(ranking)[9]

            micro_ap_score = skm.average_precision_score(y,
                                                         pred,
                                                         average='micro')
            macro_ap_score = macro_ap(y, pred)

            coverage = skm.coverage_error(y, pred)

            micro_auroc_score = skm.roc_auc_score(y, pred, average='micro')
            macro_auroc_score = macro_auroc(y, pred)

            # Micro BEDROC: pool every (probability, label) pair into one
            # list ranked by descending probability.
            scores = pd.DataFrame()
            scores['proba'] = np.array(pred).flatten()
            scores['active'] = np.array(y).flatten()
            scores.sort_values(by='proba', ascending=False, inplace=True)

            micro_bedroc_score = Scoring.CalcBEDROC(np.array(scores),
                                                    col=1,
                                                    alpha=20)
            macro_bedroc_score = macro_bedroc(y, pred)

            results.append([
                clf, micro_auroc_score, macro_auroc_score, tp_cmpd, tp_all,
                micro_ap_score, macro_ap_score, micro_bedroc_score,
                macro_bedroc_score, coverage
            ])

        # Passing the columns to the constructor also works when no
        # prediction matched (assigning .columns on an empty frame raises).
        results = pd.DataFrame(results, columns=result_columns)
        print(results)
        results.to_csv(os.path.join(output_dir, name + '_results.csv'),
                       index=False)
示例#15
0
def run_ted(input_path, input_directory, prop, output_path):
    """ Loads .sdf files, converts the molecules into trees with graph annotations, runs
    the TED, evaluates the results and saves them into a file.

    Each test molecule is scored by its maximum similarity to the training
    ligands, where similarity is ``1 - ted / (sum of both molecules' sizes
    and bond sizes)``. AUC and enrichment factors are computed on the list
    sorted by descending similarity.

    :param input_path: path to the .json file describing the screening
        (keys used: ``files``, ``data``, ``info``)
    :param input_directory: prefix prepended to every ``<file>.sdf`` name
    :param prop: passed through to ``_load_molecules`` and ``_ted``
    :param output_path: result file; nothing is written when it is None or
        empty. NOTE(review): ``_flush_results`` is still called with it
        during screening - confirm it tolerates None.
    :return: None
    """
    with open(input_path) as input_stream:
        input_data = json.load(input_stream)

    # Load molecules and convert them to tree graphs.
    logging.info('Loading molecules ...')
    molecules = {}
    sizes = {}
    bondSizes = {}
    for file_item in input_data['files']:
        path = input_directory + file_item + '.sdf'
        logging.debug(path)
        if not os.path.exists(path):
            logging.error('Missing file: %s', file_item)
            raise Exception('Missing file.')
        molecules.update(_load_molecules(path, sizes, bondSizes, prop))

    # Screening.
    logging.info('Screening ...')
    scores = []
    counter = 0
    counter_max = len(input_data['data']['test'])
    counter_step = math.floor(counter_max / 100.0) + 1
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for measuring elapsed time.
    time_begin = time.perf_counter()
    for item in input_data['data']['test']:
        if item['name'] not in molecules:
            continue
        query = molecules[item['name']]
        query_size = sizes[item['name']]
        query_bonds = bondSizes[item['name']]
        # Count pairwise similarity with all actives and choose the maximum.
        maxsim = 0
        # Reset per query: previously this was unbound (or carried over from
        # the previous query) when no active reached a similarity above 0.
        minted = None
        for active in input_data['data']['train']['ligands']:
            if active['name'] not in molecules:
                continue
            active_graph = molecules[active['name']]
            active_size = sizes[active['name']]
            active_bonds = bondSizes[active['name']]
            ted = _ted(query, active_graph, prop)
            sim = 1.00 - ted / float(query_size + active_size + query_bonds +
                                     active_bonds)
            if sim > maxsim:
                maxsim = sim
                minted = ted
        scores.append({
            'name': item['name'],
            'similarity': maxsim,
            'activity': item['activity'],
            'ted': minted
        })
        # Periodically report progress and flush intermediate results.
        if counter % counter_step == 0:
            logging.debug('%d/%d', counter, counter_max)
            _flush_results(output_path, scores)
        counter += 1
        logging.debug('counter: %s', counter)
    time_end = time.perf_counter()
    logging.debug("Reached the end.")

    # Evaluate screening.
    scores = sorted(scores, key=lambda m: m['similarity'], reverse=True)
    auc = Scoring.CalcAUC(scores, 'activity')
    ef = Scoring.CalcEnrichment(scores, 'activity', [0.005, 0.01, 0.02, 0.05])

    # Print results.
    print('AUC : ', auc)
    print('EF (0.5%, 1.0%, 2.0%, 5.0%) : ', ef)
    print('Execution time : %.2fs' % (time_end - time_begin))
    # Write result to a file.
    if output_path is not None and output_path != '':
        # dirname is '' for a bare file name; os.makedirs('') would raise.
        output_dir = os.path.dirname(output_path)
        if output_dir and not os.path.exists(output_dir):
            os.makedirs(output_dir)
        with open(output_path, 'w') as output_stream:
            json.dump(
                {
                    'data': scores,
                    'metadata': {
                        'auc': auc,
                        'ef': {
                            '0.005': ef[0],
                            '0.01': ef[1],
                            '0.02': ef[2],
                            '0.05': ef[3]
                        },
                        'fileName': os.path.basename(__file__),
                        'executionTime': time_end - time_begin,
                        'definition': {
                            'selection': input_data['info']['selection'],
                            'molecules': input_data['info']['molecules'],
                            'index': input_data['info']['index'],
                            'dataset': input_data['info']['dataset'],
                            'method': input_data['info']['method'],
                            'config': 'config_file'
                        }
                    }
                },
                output_stream,
                indent=2)
 def calculate(self, score, index):
     """Return one ``Scoring.CalcRIE`` value per entry of ``self.params``."""
     return [Scoring.CalcRIE(score, index, param) for param in self.params]
 def calculate(self, score, index):
     """Return ``Scoring.CalcEnrichment`` of ``score`` with ``self.params`` as its third argument."""
     enrichment = Scoring.CalcEnrichment(score, index, self.params)
     return enrichment
 def calculate(self, score, index):
     """Return ``Scoring.CalcAUC`` of ``score`` using column ``index``."""
     auc_value = Scoring.CalcAUC(score, index)
     return auc_value
示例#19
0
    train_fps += [fps_inact[j] for j in train_indices_inact]
    ys_fit = [1] * len(train_indices_act) + [0] * len(train_indices_inact)
    # train the model
    ml = BernoulliNB()
    ml.fit(train_fps, ys_fit)

    # chemical similarity
    simil = cPickle.load(infile)

    # ranking
    test_fps = [fps_act[j] for j in test_indices_act[i]]
    test_fps += [fps_inact[j] for j in test_indices_inact[i]]
    scores = [[pp[1], s[0], s[1]]
              for pp, s in zip(ml.predict_proba(test_fps), simil)]

    # write ranks for actives
    cf.writeActiveRanks(scores, rankfile, num_actives)

    scores.sort(reverse=True)

    # evaluation
    auc = Scoring.CalcAUC(scores, -1)
    ef = Scoring.CalcEnrichment(scores, -1, [0.05])

    # write out
    outfile.write("%i\t%.10f\t%.10f\n" % (i, auc, ef[0]))

infile.close()
rankfile.close()
outfile.close()
示例#20
0
    test_fps = [fps_act_morgan2[j] for j in test_indices_act[i]]
    test_fps += [fps_inact_morgan2[j] for j in test_indices_inact[i]]
    scores_rf_morgan2 = [[
        pp[1], s[0], s[1]
    ] for pp, s in zip(rf_morgan2.predict_proba(test_fps), simil)]

    # assign ranks
    scores_rf_rdk5 = cf.assignRanksWithInfo(scores_rf_rdk5)
    scores_lr_rdk5 = cf.assignRanksWithInfo(scores_lr_rdk5)
    scores_rf_morgan2 = cf.assignRanksWithInfo(scores_rf_morgan2)

    # fusion
    fusion_scores = []
    for m1, m2, m3 in zip(scores_rf_rdk5, scores_lr_rdk5, scores_rf_morgan2):
        rank = max([m1[0], m2[0], m3[0]])  # max. rank
        proba = max([m1[1], m2[1], m3[1]])  # max. rank
        # store: [max rank, max proba, simil, info]
        fusion_scores.append([rank, proba, m1[2], m1[3]])
    fusion_scores.sort(reverse=True)

    # evaluation
    auc = Scoring.CalcAUC(fusion_scores, -1)
    ef = Scoring.CalcEnrichment(fusion_scores, -1, [0.05])

    # write out
    outfile.write("%i\t%.10f\t%.10f\n" % (i, auc, ef[0]))

infile1.close()
infile2.close()
outfile.close()
示例#21
0
def calcularBEDROC(llistaTuplesOrdenada):
    """Compute BEDROC (alpha=20) for an already ordered list of tuples.

    Each input element is reduced to the pair ``(1 - el[1], el[2])``; the
    second entry of that pair is the column handed to
    ``Scoring.CalcBEDROC``.
    """
    score_pairs = []
    for entry in llistaTuplesOrdenada:
        score_pairs.append((1 - entry[1], entry[2]))
    return Scoring.CalcBEDROC(score_pairs, 1, 20)
def screening(input_dir, input_directory, config_file, output_path=None):
    """Perform a virtual screening.

    Each test molecule is scored by its maximum ``mcsutils._similarity`` to
    the training ligands. AUC and enrichment factors are computed on the
    list sorted by descending similarity, printed, and optionally written
    to a JSON file.

    :param input_dir: path to input data (training and test in .json)
    :param input_directory: prefix prepended to every ``<file>.sdf`` name
    :param config_file: configuration file of mcs
    :param output_path: result file; nothing is written when None or empty
    :return: None
    """

    with open(input_dir) as input_stream:
        input_data = json.load(input_stream)
    # Load molecules.
    logging.info('Loading molecules ...')
    molecules = {}
    for file_item in input_data['files']:
        path = input_directory + file_item + '.sdf'
        if not os.path.exists(path):
            logging.error('Missing file: %s', file_item)
            raise Exception('Missing file.')
        molecules.update(_load_molecules(path))
    # Create representation of active molecules.
    actives = [
        molecules[active['name']]
        for active in input_data['data']['train']['ligands']
        if active['name'] in molecules
    ]
    # Screening.
    logging.info('Screening ...')
    scores = []
    counter = 0
    # NOTE(review): ``inexact`` is an int passed by value, so the callee
    # cannot update it; the reported 'inexactMolecules' is always 0.
    inexact = 0
    counter_max = len(input_data['data']['test'])
    counter_step = math.floor(counter_max / 100.0) + 1
    params = mcsutils._parse_config(config_file)
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for measuring elapsed time.
    time_begin = time.perf_counter()
    for item in input_data['data']['test']:
        if item['name'] not in molecules:
            continue
        query = molecules[item['name']]
        # Raises ValueError when there are no actives, as before.
        similarity = max([
            mcsutils._similarity(query, active, inexact, input_data['info'],
                                 params) for active in actives
        ])
        scores.append({
            'name': item['name'],
            'similarity': similarity,
            'activity': item['activity']
        })
        if counter % counter_step == 0:
            logging.debug('%d/%d', counter, counter_max)
        counter += 1
    time_end = time.perf_counter()
    # Evaluate screening.
    scores = sorted(scores,
                    key=lambda m: m['similarity'],
                    reverse=True)
    auc = Scoring.CalcAUC(scores, 'activity')
    ef = Scoring.CalcEnrichment(scores, 'activity', [0.005, 0.01, 0.02, 0.05])
    # Print results.
    print('Input file: ', input_dir)
    print('Difficulty: ', input_directory)
    print('AUC : ', auc)
    print('EF (0.5%, 1.0%, 2.0%, 5.0%) : ', ef)
    print('Execution time : %.2fs' % (time_end - time_begin))
    # Write result to a file.
    if output_path is not None and output_path != '':
        # dirname is '' for a bare file name; os.makedirs('') would raise.
        output_dir = os.path.dirname(output_path)
        if output_dir and not os.path.exists(output_dir):
            os.makedirs(output_dir)
        with open(output_path, 'w') as output_stream:
            json.dump({
                'data': scores,
                'metadata': {
                    'auc': auc,
                    'ef': {
                        '0.005': ef[0],
                        '0.01': ef[1],
                        '0.02': ef[2],
                        '0.05': ef[3]
                    },
                    'fileName': os.path.basename(__file__),
                    'executionTime': time_end - time_begin,
                    'inexactMolecules': inexact,
                    'definition': {
                        'selection': input_data['info']['selection'],
                        'molecules': input_data['info']['molecules'],
                        'index': input_data['info']['index'],
                        'dataset': input_data['info']['dataset'],
                        'method': input_data['info']['method'],
                        # NOTE(review): literal string, not the config_file
                        # argument - confirm this is intended.
                        'config': 'config_file'
                    }
                }
            }, output_stream, indent=2)
示例#23
0
def main():
    """Grid-search KNeighborsClassifier settings and write the metrics to CSV.

    Two feature files (``-X``) and two target files (``-y``) are expected.
    After sorting the names, the second of each pair is used for training
    and the first for testing. For every combination of ``n_neighbors`` and
    ``metric`` a classifier is fitted, evaluated on the test set, and one
    result row is appended. All rows are written to
    ``<output>/KNN_opt_results.csv``.
    """
    # Local import so the block does not depend on the file's import list.
    import os

    parser = argparse.ArgumentParser(description='Tune KNeighborsClassifier')
    parser.add_argument('-X',
                        '--X_data',
                        action='store',
                        nargs=2,
                        dest='X',
                        help='Input features for the model (.csv format)')
    parser.add_argument('-y',
                        '--y_data',
                        action='store',
                        nargs=2,
                        dest='y',
                        help='Target outputs for the model (.csv format)')
    parser.add_argument('-i',
                        '--input_directory',
                        action='store',
                        nargs=1,
                        dest='input',
                        default=['./'],
                        help='Directory where input files are stored')
    parser.add_argument('-o',
                        '--output_directory',
                        action='store',
                        nargs=1,
                        dest='output',
                        default=['./'],
                        help='Directory where output files should be written')
    args = vars(parser.parse_args())

    # Omitted options are None; report that as a usage error instead of
    # failing later with an AttributeError.
    if args['X'] is None or args['y'] is None:
        parser.error('both -X/--X_data and -y/--y_data are required')

    input_dir = args['input'][0]
    output_dir = args['output'][0]

    # Sort so that training and test data are in a predictable order
    x_files = sorted(args['X'])
    y_files = sorted(args['y'])

    # os.path.join also works when the directory has no trailing slash.
    X_train = pd.read_csv(os.path.join(input_dir, x_files[1])) \
        .drop(columns=['smiles'])
    y_train = pd.read_csv(os.path.join(input_dir, y_files[1]))

    X_test = pd.read_csv(os.path.join(input_dir, x_files[0])) \
        .drop(columns=['smiles'])
    y_test = pd.read_csv(os.path.join(input_dir, y_files[0]))

    # use a full grid over all parameters
    param_grid = {
        'n_neighbors': [1, 5, 10],
        'metric': ['minkowski', 'jaccard']
    }

    metric_columns = ['micro_AUROC', 'macro_AUROC', 'Frac_1_in_top10',
                      'Frac_all_in_top10', 'micro_AP', 'macro_AP',
                      'micro_BEDROC', 'macro_BEDROC', 'coverage']
    # Parameter column names, in the key order ParameterGrid yields them.
    param_columns = []

    results = []
    for params in ParameterGrid(param_grid):
        param_columns = list(params)
        clf = KNeighborsClassifier(n_jobs=-1,
                                   n_neighbors=params['n_neighbors'],
                                   metric=params['metric'])

        time_start = time.time()
        clf.fit(X_train, y_train)
        pred = clf.predict_proba(X_test)
        # Keep column 1 of each per-target probability array, giving one
        # column per target.
        pred = pd.DataFrame([proba_pair[:, 1] for proba_pair in pred]).T
        # The former backslash continuation inside the literal embedded the
        # next line's indentation in the message.
        print('Training and prediction done! Time elapsed: '
              '{} seconds'.format(time.time() - time_start))

        ranking = get_ranking(y_test, pred)
        # Index 9 is the value reported in the *_in_top10 columns.
        tp_cmpd = true_positive_per_compound(ranking)[9]
        tp_all = true_positives_recovered(ranking)[9]

        micro_ap_score = skm.average_precision_score(y_test,
                                                     pred,
                                                     average='micro')
        macro_ap_score = macro_ap(y_test, pred)

        coverage = skm.coverage_error(y_test, pred)

        micro_auroc_score = skm.roc_auc_score(y_test, pred, average='micro')
        macro_auroc_score = macro_auroc(y_test, pred)

        # Micro BEDROC: pool every (probability, label) pair into one list
        # ranked by descending probability.
        scores = pd.DataFrame()
        scores['proba'] = np.array(pred).flatten()
        scores['active'] = np.array(y_test).flatten()
        scores.sort_values(by='proba', ascending=False, inplace=True)

        micro_bedroc_score = Scoring.CalcBEDROC(np.array(scores),
                                                col=1,
                                                alpha=20)
        macro_bedroc_score = macro_bedroc(y_test, pred)

        results.append([
            micro_auroc_score, macro_auroc_score, tp_cmpd, tp_all,
            micro_ap_score, macro_ap_score, micro_bedroc_score,
            macro_bedroc_score, coverage
        ] + list(params.values()))

    results = pd.DataFrame(results, columns=metric_columns + param_columns)
    results.to_csv(os.path.join(output_dir, 'KNN_opt_results.csv'),
                   index=False)
def screening(input_path, input_directory, ged_results_file, output_path=None):
    """Perform a virtual screening.

    Each test molecule is scored with the largest ``_ged_similarity`` value
    found against the training ligands. The molecules are then ranked by that
    score, and AUC and enrichment factors are computed on the ranking and
    printed. Optionally the ranking and its metadata are written as JSON.

    :param input_path: input .json file with basic screening params
    :param input_directory: directory with .sdf files; it is concatenated
        directly with the file name, so it must end with a path separator
    :param ged_results_file: .json file with GED results and parameters
    :param output_path: optional path of the .json file to write the scores
        and metadata to; nothing is written when it is None or empty
    :return: None
    :raises FileNotFoundError: if one of the listed .sdf files is missing
    """
    with open(input_path) as input_stream:
        input_data = json.load(input_stream)
    # Load molecules.
    logging.info('Loading molecules ...')
    molecules = {}
    for file_item in input_data['files']:
        path = input_directory + file_item + '.sdf'
        if not os.path.exists(path):
            logging.error('Missing file: %s', path)
            # FileNotFoundError is still an Exception, so callers that
            # catch the previous generic Exception keep working.
            raise FileNotFoundError('Missing file: ' + path)
        molecules.update(_load_molecules(path))
    # Parse ged results file
    with open(ged_results_file) as ged_stream:
        ged_data = json.load(ged_stream)
    # Create representation of active molecules; ligands that were not
    # loaded from any .sdf file are silently skipped.
    actives = [
        molecules[ligand['name']]
        for ligand in input_data['data']['train']['ligands']
        if ligand['name'] in molecules
    ]
    # Screening.
    logging.info('Screening ...')
    test_items = input_data['data']['test']
    scores = []
    counter = 0
    counter_max = len(test_items)
    counter_step = math.floor(counter_max / 100.0) + 1
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for measuring elapsed time.
    time_begin = time.perf_counter()
    for item in test_items:
        if item['name'] not in molecules:
            continue
        query = molecules[item['name']]
        # Search for the most similar active molecule. The best score starts
        # at 0, so when no active scores above 0 the query itself is
        # reported as the most similar molecule.
        similarity = 0
        similar_mol = query
        for active in actives:
            current_similarity = _ged_similarity(query, active, ged_data)
            if current_similarity > similarity:
                similarity = current_similarity
                similar_mol = active
        scores.append({
            'name': item['name'],
            'similarity': similarity,
            'activity': item['activity'],
            'most-similar-active': similar_mol.GetProp("_Name")
        })
        if counter % counter_step == 0:
            logging.debug('%d/%d', counter, counter_max)
        counter += 1
        logging.debug('counter: %s', counter)
    time_end = time.perf_counter()
    # Evaluate screening: rank by similarity, best first.
    scores = sorted(scores, key=lambda m: m['similarity'], reverse=True)
    auc = Scoring.CalcAUC(scores, 'activity')
    ef = Scoring.CalcEnrichment(scores, 'activity', [0.005, 0.01, 0.02, 0.05])
    # Print results.
    print('AUC : ', auc)
    print('EF (0.5%, 1.0%, 2.0%, 5.0%) : ', ef)
    # NOTE(review): the stored GED time is divided by 1000, so it is
    # presumably in milliseconds -- confirm against the GED tool output.
    total_time = float(ged_data["properties"]["time"]) / 1000
    total_time += (time_end - time_begin)
    print('Execution time : %.2fs' % total_time)
    # Write result to a file.
    if output_path:
        output_dir = os.path.dirname(output_path)
        # A bare file name has an empty dirname; os.makedirs('') would raise.
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        with open(output_path, 'w') as output_stream:
            json.dump(
                {
                    'properties': ged_data["properties"],
                    'data': scores,
                    'metadata': {
                        'auc': auc,
                        'ef': {
                            '0.005': ef[0],
                            '0.01': ef[1],
                            '0.02': ef[2],
                            '0.05': ef[3]
                        },
                        'fileName': os.path.basename(__file__),
                        'executionTime': total_time,
                        'definition': {
                            'selection': input_data['info']['selection'],
                            'molecules': input_data['info']['molecules'],
                            'index': input_data['info']['index'],
                            'dataset': input_data['info']['dataset'],
                            'method': input_data['info']['method'],
                            'config': 'config_file'
                        }
                    }
                },
                output_stream,
                indent=2)