# Set up the classifier parameters and build the BernoulliNB model.
# NOTE(review): the next line sits at column 0 while the rest of this fragment
# is indented by 4 -- it looks like its leading whitespace was lost; confirm
# against the original file.
# Default keyword arguments later passed to BernoulliNB.
ml_dict = dict(alpha=1.0, binarize=None, fit_prior=True)
    if options.ml:
        # A parameter file was named on the command line: readMLFile gets the
        # defaults plus read_dict and its return value replaces ml_dict
        # (presumably file values override the defaults -- TODO confirm).
        ml_dict = ml_func.readMLFile(ml_dict, read_dict, path+options.ml)

    # initialize machine-learning method
    # (BernoulliNB is presumably scikit-learn's naive Bayes classifier -- verify
    # against the file's imports)
    ml = BernoulliNB(alpha=ml_dict['alpha'], binarize=ml_dict['binarize'], fit_prior=ml_dict['fit_prior'])

    # loop over targets
    # For every target: load the pickled training actives, read the gzipped
    # list of test actives, and compute fingerprints for both sets.
    for target in conf.set_data:
        print target

        # read in training actives and calculate fps
        # NOTE(review): the file object is never closed explicitly, and
        # unpickling is only safe when the .pkl file comes from a trusted source.
        actives = cPickle.load(open(inpath_cmp+'ChEMBL_II/Target_no_'+str(target)+'.pkl', 'r'))
        for k in actives.keys():
            for i,m in enumerate(actives[k]):
                # m[1] is handed to the fingerprint builder
                # (presumably a SMILES string -- TODO confirm)
                fp_dict = scor.getFP(fp_build, m[1])
                # replace the entry in place with [generated ID, fingerprints];
                # the ID is '<target>_<k>_A_<1-based index>'
                actives[k][i] = [str(target)+'_'+str(k)+'_A_'+str(i+1), fp_dict]

        # read in test actives and calculate fps
        div_actives = []
        # NOTE(review): the gzip handle is not closed explicitly either.
        for line in gzip.open(inpath_cmp+'ChEMBL/cmp_list_ChEMBL_'+str(target)+'_actives.dat.gz', 'r'):
            # skip comment lines; a blank line would pass this test and then
            # fail on line[2] below
            if line[0] != '#': 
                # structure of line: [external ID, internal ID, SMILES]
                line = line.rstrip().split()
                fp_dict = scor.getFP(fp_build, line[2])
                # store: [internal ID, dict with fps]
                div_actives.append([line[1], fp_dict])
        # taken from the configuration rather than from len(div_actives);
        # the reason for the "- 1" is not visible here -- TODO confirm
        num_test_actives = conf.num_div_act - 1
        # convert fps to numpy arrays
        np_fps_div_act = ml_func.getNumpy(div_actives)
# Example #2
0
    # loop over data-set sources
    # conf.set_data is used as a dict here: one entry per data-set name, each
    # holding its target IDs under the 'ids' key.
    for dataset in conf.set_data.keys():
        print dataset
        # loop over targets
        for target in conf.set_data[dataset]['ids']:
            print target

            # read in actives and calculate fps
            actives = []
            # file: <inpath_cmp><dataset>/cmp_list_<dataset>_<target>_actives.dat.gz
            # NOTE(review): the gzip handle is never closed explicitly.
            for line in gzip.open(
                    inpath_cmp + dataset + '/cmp_list_' + dataset + '_' +
                    str(target) + '_actives.dat.gz', 'r'):
                # skip comment lines; a blank line would pass this test and
                # then fail on line[2] below
                if line[0] != '#':
                    # structure of line: [external ID, internal ID, SMILES]
                    line = line.rstrip().split()
                    fp_dict = scor.getFP(fp_build, line[2])
                    # store: [internal ID, dict with fps]
                    actives.append([line[1], fp_dict])
            num_actives = len(actives)
            # actives left over once num_query_mols (defined outside this
            # fragment) have been set aside
            num_test_actives = num_actives - num_query_mols
            # convert fps to numpy arrays
            np_fps_act = ml_func.getNumpy(actives)

            # read in decoys and calculate fps
            if dataset == 'ChEMBL':
                # the ChEMBL decoy file name carries no target ID; firstchembl
                # presumably ensures it is read only once -- TODO confirm
                # where the flag is reset
                if firstchembl:
                    decoys = []
                    for line in gzip.open(
                            inpath_cmp + dataset + '/cmp_list_' + dataset +
                            '_zinc_decoys.dat.gz', 'r'):
                        # skip comment lines
                        if line[0] != '#':