Example #1
File: main.py Project: sanikak96/WebApp1
def upload():

    if request.method == 'POST':
        # This will be executed on POST request.
        upfile = request.files['file']
        if upfile and allowed_file(upfile.filename):

            filename = secure_filename(upfile.filename)
            filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
            upfile.save(filepath)

            with open(filepath) as fasta_file:  # closes the file handle cleanly
                identifiers = []
                sequence = []
                for seq_record in SeqIO.parse(fasta_file,
                                              'fasta'):  # (generator)
                    identifiers.append(seq_record.id)
                    sequence.append(seq_record.seq)

            pepdesc = PeptideDescriptor(filepath, 'eisenberg')  # use Eisenberg consensus scale
            globdesc = GlobalDescriptor(filepath)

            # --------------- Peptide Descriptor (AA scales) Calculations ---------------
            pepdesc.calculate_global()  # calculate global Eisenberg hydrophobicity
            pepdesc.calculate_moment(append=True)  # calculate Eisenberg hydrophobic moment

            # load other AA scales
            pepdesc.load_scale('gravy')  # load GRAVY scale
            pepdesc.calculate_global(append=True)  # calculate global GRAVY hydrophobicity
            pepdesc.calculate_moment(append=True)  # calculate GRAVY hydrophobic moment
            pepdesc.load_scale('z3')  # load old Z scale
            pepdesc.calculate_autocorr(1, append=True)  # global Z scale (= window-1 autocorrelation)

            # --------------- Global Descriptor Calculations ---------------
            globdesc.length()  # sequence length
            globdesc.boman_index(append=True)  # Boman index
            globdesc.aromaticity(append=True)  # global aromaticity
            globdesc.aliphatic_index(append=True)  # aliphatic index
            globdesc.instability_index(append=True)  # instability index
            globdesc.calculate_charge(ph=7.4, amide=False, append=True)  # net charge
            globdesc.calculate_MW(amide=False, append=True)  # molecular weight

            f1 = pepdesc.descriptor
            f2 = globdesc.descriptor
            result = np.concatenate((f2, f1), axis=1)

            clf = joblib.load('ml_model.pkl')  # load the model once, not per sequence
            rs = []
            for i in range(len(result)):
                prt = np.reshape(result[i], (-1, 14))
                pred = clf.predict(prt)  # sparse multilabel indicator matrix
                out = pred.tocoo().col  # column indices of the predicted labels
                res = []
                for j in range(len(out)):
                    if out[j] == 0:
                        res.append("antiviral")
                    elif out[j] == 1:
                        res.append("antibacterial")
                    else:
                        res.append("antifungal")
                rs.append(res)
            a = ['-'.join(r) for r in rs]

            df = pd.DataFrame(data={"id": identifiers,
                                    "sequence": sequence,
                                    "activity": a},
                              columns=['id', 'sequence', 'activity'])
            df.to_csv("result.csv", sep=',', index=False)

            os.remove(filepath)

            return render_template('up.html', mimetype="text/csv")
        else:
            error = "PLEASE CHECK THE FORMAT OF FILE TO UPLOAD"
            return render_template('upload.html', error=error)

    # This will be executed on GET request.
    return render_template('predictor.html')
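
The view above relies on module-level setup that the excerpt omits. A minimal sketch of that scaffolding, assuming FASTA-only uploads (the extension set and upload path are assumptions, not the project's actual values):

import os

import numpy as np
import pandas as pd
from Bio import SeqIO
from flask import Flask, request, render_template
from sklearn.externals import joblib  # on newer scikit-learn: import joblib
from werkzeug.utils import secure_filename
from modlamp.descriptors import PeptideDescriptor, GlobalDescriptor

app = Flask(__name__)
app.config['UPLOAD_FOLDER'] = '/home/sanika/proj/uploads'  # assumed from the project's other hard-coded paths
ALLOWED_EXTENSIONS = {'fasta', 'fa'}  # assumption: FASTA uploads only


def allowed_file(filename):
    # accept only filenames carrying an allowed extension
    return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS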
Example #2
File: main.py Project: sanikak96/WebApp1
def predict():

    if request.method == 'POST':

        seq = request.form['seq']
        with open("random.fasta", "w") as fp:
            fp.write(seq)

        pepdesc = PeptideDescriptor('/home/sanika/proj/random.fasta', 'eisenberg')  # use Eisenberg consensus scale
        globdesc = GlobalDescriptor('/home/sanika/proj/random.fasta')

        # --------------- Peptide Descriptor (AA scales) Calculations ---------------
        pepdesc.calculate_global()  # calculate global Eisenberg hydrophobicity
        pepdesc.calculate_moment(append=True)  # calculate Eisenberg hydrophobic moment

        # load other AA scales
        pepdesc.load_scale('gravy')  # load GRAVY scale
        pepdesc.calculate_global(append=True)  # calculate global GRAVY hydrophobicity
        pepdesc.calculate_moment(append=True)  # calculate GRAVY hydrophobic moment
        pepdesc.load_scale('z3')  # load old Z scale
        pepdesc.calculate_autocorr(1, append=True)  # global Z scale (= window-1 autocorrelation)

        # --------------- Global Descriptor Calculations ---------------
        globdesc.length()  # sequence length
        globdesc.boman_index(append=True)  # Boman index
        globdesc.aromaticity(append=True)  # global aromaticity
        globdesc.aliphatic_index(append=True)  # aliphatic index
        globdesc.instability_index(append=True)  # instability index
        globdesc.calculate_charge(ph=7.4, amide=False, append=True)  # net charge
        globdesc.calculate_MW(amide=False, append=True)  # molecular weight

        f1 = pepdesc.descriptor
        f2 = globdesc.descriptor
        result = np.concatenate((f2, f1), axis=1)

        clf = joblib.load('ml_model.pkl')
        pred = clf.predict(result)  # sparse multilabel indicator matrix
        proba = clf.predict_proba(result).tocoo()
        out = pred.tocoo().col  # column indices of the predicted labels
        res = []
        labels = ['antiviral', 'antibacterial', 'antifungal']
        values = proba.data
        plt.pie(values,
                labels=labels,
                autopct='%.0f%%',
                shadow=True,
                radius=0.5)
        plt.savefig('/home/sanika/proj/pie_chart.jpg')

        figfile = BytesIO()
        plt.savefig(figfile, format='png')
        figfile.seek(0)
        figdata_png = base64.b64encode(figfile.getvalue()).decode('ascii')
        plt.close()

        for i in range(len(out)):
            if out[i] == 0:
                res.append("antiviral")
            elif out[i] == 1:
                res.append("antibacterial")
            else:
                res.append("antifungal")

        return render_template('seq.html', seq=res, result=figdata_png)

    return render_template('predictor.html')
Example #3
def main(infolder, outfolder):

    descriptor = 'PPCALI'
    
    print "RF Peptide Learning Info\n========================\n"
    print datetime.now().strftime("%Y-%m-%d_%H-%M") + "\n"
    print("INPUT:\nInputfolder is\t%s\nOutputfolder is\t%s\nDescriptor is\t%s , auto-correlated (window 7)\n" %
            (infolder, outfolder, descriptor))

    # -------------------------------- TRAINING --------------------------------
    print "LOG:\nLoading data..."
    Pos = PeptideDescriptor(infolder + '/Pos.fasta', descriptor)
    Pos.filter_duplicates()
    Neg = PeptideDescriptor(infolder + '/Neg.fasta', descriptor)
    Neg.filter_duplicates()
    targets = np.array(len(Pos.sequences) * [1] + len(Neg.sequences) * [0])  # target vector

    # Descriptor calculation
    print "Calculating %s descriptor..." % descriptor
    Data = PeptideDescriptor(Pos.sequences + Neg.sequences, descriptor)
    Data.calculate_autocorr(7)
    
    # Standard Scaling
    print "Loading prefitted scaler and standard scaling %s descriptor..." % descriptor
    scaler = pickle.load(open(infolder + '/scaler.p', 'r'))
    Data = scaler.transform(Data.descriptor)

    # Classifier
    print "Loading pretrained classifier..."
    clf = pickle.load(open(infolder + '/classifier.p', 'r'))
    
    # fitting classifier
    print "Fitting Random Forest classifier..."
    clf.fit(Data, targets)
    fit_leafs = clf.apply(Data)
    print "\tRF out-of-bag score: %.2f" % clf.oob_score_

    # -------------------------------- LIBRARY --------------------------------
    # Loading library
    print "Loading sequence library..."
    Lib = PeptideDescriptor(infolder + '/Lib.fasta', descriptor)
    class_labels = [l[:3] for l in Lib.names]  # extract class labels from sequence names
    
    print "\tLibrary size: %i" % len(Lib.sequences)
    print "\tLibrary composition is:\n\t\thel: %i\n\t\tasy: %i\n\t\tnCM: %i" % (class_labels.count('hel'),
                                                                                class_labels.count('asy'),
                                                                                class_labels.count('nCM'))

    # Calculating descriptors for library members
    print "Calculating %s descriptor for library..." % descriptor
    D = PeptideDescriptor(Lib.sequences, descriptor)
    D.calculate_autocorr(7)
   
    # combining both libraries and scaling descriptor
    print "Standard scaling %s descriptor for library..." % descriptor
    X = scaler.transform(D.descriptor)

    # -------------------------------- PREDICTING --------------------------------
    # get single tree predictions and calculate stdev
    print "Predicting single tree results, standard deviation and entropy for library..."
    start = time.time()
    preds = get_tree_pred(clf, X)

    print "Predicting class probabilities for library..."
    probas = clf.predict_proba(X)
    probas = probas[:, 1].tolist()
    variance = np.var(preds, axis=1)
    print("\tPredictions took %.1f s" % (time.time() - start))

    # calculate similarity of library members to training data
    print("Calculating Random Forest similarity (cosine)...")
    start = time.time()
    lib_leafs = clf.apply(X)  # leaf indices where library samples end up in -> RF intrinsic similarity measure
    D_RF = pairwise_distances(lib_leafs, fit_leafs, metric='cosine')
    RF_dist = D_RF.mean(axis=1).tolist()
    print ("\tDistance calculation took %.1f s" % (time.time() - start))

    # scaling all output features
    print "Min-Max scaling outputs..."
    sclr = MinMaxScaler()
    # some transformations from lists to numpy matrices to arrays back to min-max scaled list:
    variance = np.squeeze(sclr.fit_transform(variance.reshape(-1, 1))).tolist()
    RF_dist = np.squeeze(sclr.fit_transform(np.array(RF_dist).reshape(-1, 1))).tolist()

    # construct final list with all values (prediction, RF_dist, var, sum)
    print "Creating result dictionaries..."
    sums = [0.5 * (x * (1 - y) + z) for x, y, z in zip(variance, RF_dist, probas)]  # dens-weight + proba

    # create data frame with all values
    d = pd.DataFrame({'Class': class_labels, 'Prediction': probas, 'RFSimilarity': RF_dist, 'TreeVariance': variance,
                    'WeighedSum': sums}, index=Lib.sequences)
    d.index.name = 'Sequence'
    d = d[['Class', 'Prediction', 'RFSimilarity', 'TreeVariance', 'WeighedSum']].sort_values('WeighedSum',
                                                                                           ascending=False)
    
    # get top 10 predictions according to the weighted sum
    synth_sele = d[:10]

    # writing output
    print "Saving output files to output directory..."
    synth_sele.to_csv(outfolder + '/' + datetime.now().strftime("%Y-%m-%d_%H-%M") + 'synthesis_selection.csv')
    d.to_csv(outfolder + '/library_pred.csv')
    
    # saving scaler and classifier to pickle file for later usage
    pickle.dump(sclr, open(outfolder + '/' + datetime.now().strftime("%Y-%m-%d_%H-%M") + '-scaler.p', 'wb'))
    pickle.dump(clf, open(outfolder + '/' + datetime.now().strftime("%Y-%m-%d_%H-%M") + '-classifier.p', 'wb'))

    print("Total runtime: %.1f s\n" % (time.time() - globstart))
    print "\nALL DONE SUCCESSFULLY"
    print "Look for your results file in %s\nAnd maybe save this terminal output to a logfile ;-)" % outfolder
    return preds
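
The helper get_tree_pred() called in main() is not part of the excerpt. A minimal sketch of what it plausibly does, assuming it gathers the per-tree class votes of the fitted forest (the name and signature come from the call site; the body is an assumption, and numpy is assumed imported as np):

def get_tree_pred(clf, X):
    # one column of 0/1 votes per tree in the fitted RandomForestClassifier,
    # giving shape (n_samples, n_trees) so np.var(preds, axis=1) works above
    return np.column_stack([tree.predict(X) for tree in clf.estimators_])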


Example #4
Pos = PeptideDescriptor(
    '/Users/modlab/y/pycharm/activelearning/retrospective/input/B/Pos.fasta',
    'PPCALI')
Pos.keep_natural_aa()
Neg = PeptideDescriptor(
    '/Users/modlab/y/pycharm/activelearning/retrospective/input/B/Neg.fasta',
    'PPCALI')
Neg.keep_natural_aa()
y = np.array(len(Pos.sequences) * [1] +
             len(Neg.sequences) * [0])  # target vector

Data = PeptideDescriptor(Pos.sequences + Neg.sequences, 'PPCALI')
Data.calculate_autocorr(7)

# Scaler
scaler = StandardScaler()
X = scaler.fit_transform(Data.descriptor)

# Classifier
clf = RandomForestClassifier(bootstrap=True,
                             class_weight=None,
                             criterion='gini',
                             max_depth=None,
                             max_features='auto',
                             max_leaf_nodes=None,
                             min_samples_leaf=1,
                             min_samples_split=2,
                             min_weight_fraction_leaf=0.0)
Example #5
# Load sequence file into descriptor object
pepdesc = PeptideDescriptor('/path/to/sequences.fasta', 'Eisenberg')  # use Eisenberg consensus scale
globdesc = GlobalDescriptor('/path/to/sequences.fasta')

# --------------- Peptide Descriptor (AA scales) Calculations ---------------
pepdesc.calculate_global()  # calculate global Eisenberg hydrophobicity
pepdesc.calculate_moment(append=True)  # calculate Eisenberg hydrophobic moment

# load other AA scales
pepdesc.load_scale('gravy')  # load GRAVY scale
pepdesc.calculate_global(append=True)  # calculate global GRAVY hydrophobicity
pepdesc.calculate_moment(append=True)  # calculate GRAVY hydrophobic moment
pepdesc.load_scale('z3')  # load old Z scale
pepdesc.calculate_autocorr(1, append=True)  # global Z scale (= window-1 autocorrelation)

# save descriptor data to .csv file
col_names1 = 'ID,Sequence,H_Eisenberg,uH_Eisenberg,H_GRAVY,uH_GRAVY,Z3_1,Z3_2,Z3_3'
pepdesc.save_descriptor('/path/to/descriptors1.csv', header=col_names1)

# --------------- Global Descriptor Calculations ---------------
globdesc.length()  # sequence length
globdesc.boman_index(append=True)  # Boman index
globdesc.aromaticity(append=True)  # global aromaticity
globdesc.aliphatic_index(append=True)  # aliphatic index
globdesc.instability_index(append=True)  # instability index
globdesc.calculate_charge(ph=7.4, amide=False, append=True)  # net charge
globdesc.calculate_MW(amide=False, append=True)  # molecular weight

# save descriptor data to .csv file
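# the original excerpt truncates here; by analogy with the peptide-descriptor
# save above, the call would plausibly be (the column names are assumptions):
col_names2 = 'ID,Sequence,Length,BomanIndex,Aromaticity,AliphaticIndex,InstabilityIndex,Charge,MW'
globdesc.save_descriptor('/path/to/descriptors2.csv', header=col_names2)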
Example #6
    def analyze_generated(self, num, fname='analysis.txt', plot=False):
        """ Method to analyze the generated sequences located in `self.generated`.

        :param num: {int} wanted number of sequences to sample
        :param fname: {str} filename to save analysis info to
        :param plot: {bool} whether to plot an overview of descriptors
        :return: file with analysis info (distances)
        """
        with open(fname, 'w') as f:
            print("Analyzing...")
            f.write("ANALYSIS OF SAMPLED SEQUENCES\n==============================\n\n")
            f.write("Nr. of duplicates in generated sequences: %i\n" % (len(self.generated) - len(set(self.generated))))
            count = len(set(self.generated) & set(self.sequences))  # get shared entries in both lists
            f.write("%.1f percent of generated sequences are present in the training data.\n" %
                    ((count / len(self.generated)) * 100))
            d = GlobalDescriptor(self.generated)
            len1 = len(d.sequences)
            d.filter_aa('B')
            len2 = len(d.sequences)
            d.length()
            f.write("\n\nLENGTH DISTRIBUTION OF GENERATED DATA:\n\n")
            f.write("Number of sequences too short:\t%i\n" % (num - len1))
            f.write("Number of invalid (with 'B'):\t%i\n" % (len1 - len2))
            f.write("Number of valid unique seqs:\t%i\n" % len2)
            f.write("Mean sequence length:     \t\t%.1f ± %.1f\n" % (np.mean(d.descriptor), np.std(d.descriptor)))
            f.write("Median sequence length:   \t\t%i\n" % np.median(d.descriptor))
            f.write("Minimal sequence length:  \t\t%i\n" % np.min(d.descriptor))
            f.write("Maximal sequence length:  \t\t%i\n" % np.max(d.descriptor))
            
            descriptor = 'pepcats'
            seq_desc = PeptideDescriptor([s[1:].rstrip() for s in self.sequences], descriptor)
            seq_desc.calculate_autocorr(7)
            gen_desc = PeptideDescriptor(d.sequences, descriptor)
            gen_desc.calculate_autocorr(7)
            
            # random comparison set
            self.ran = Random(len(self.generated), np.min(d.descriptor), np.max(d.descriptor))  # generate rand seqs
            probas = list(count_aas(''.join(seq_desc.sequences)).values())  # aa distribution of training seqs
            self.ran.generate_sequences(proba=probas)
            ran_desc = PeptideDescriptor(self.ran.sequences, descriptor)
            ran_desc.calculate_autocorr(7)
            
            # amphipathic helices comparison set
            self.hel = Helices(len(self.generated), np.min(d.descriptor), np.max(d.descriptor))
            self.hel.generate_sequences()
            hel_desc = PeptideDescriptor(self.hel.sequences, descriptor)
            hel_desc.calculate_autocorr(7)
            
            # distance calculation
            f.write("\n\nDISTANCE CALCULATION IN '%s' DESCRIPTOR SPACE\n\n" % descriptor.upper())
            desc_dist = distance.cdist(gen_desc.descriptor, seq_desc.descriptor, metric='euclidean')
            f.write("Average euclidean distance of sampled to training data:\t%.3f +/- %.3f\n" %
                    (np.mean(desc_dist), np.std(desc_dist)))
            ran_dist = distance.cdist(ran_desc.descriptor, seq_desc.descriptor, metric='euclidean')
            f.write("Average euclidean distance if randomly sampled seqs:\t%.3f +/- %.3f\n" %
                    (np.mean(ran_dist), np.std(ran_dist)))
            hel_dist = distance.cdist(hel_desc.descriptor, seq_desc.descriptor, metric='euclidean')
            f.write("Average euclidean distance if amphipathic helical seqs:\t%.3f +/- %.3f\n" %
                    (np.mean(hel_dist), np.std(hel_dist)))
            
            # more simple descriptors
            g_seq = GlobalDescriptor(seq_desc.sequences)
            g_gen = GlobalDescriptor(gen_desc.sequences)
            g_ran = GlobalDescriptor(ran_desc.sequences)
            g_hel = GlobalDescriptor(hel_desc.sequences)
            g_seq.calculate_all()
            g_gen.calculate_all()
            g_ran.calculate_all()
            g_hel.calculate_all()
            sclr = StandardScaler()
            sclr.fit(g_seq.descriptor)
            f.write("\n\nDISTANCE CALCULATION FOR SCALED GLOBAL DESCRIPTORS\n\n")
            desc_dist = distance.cdist(sclr.transform(g_gen.descriptor), sclr.transform(g_seq.descriptor),
                                       metric='euclidean')
            f.write("Average euclidean distance of sampled to training data:\t%.2f +/- %.2f\n" %
                    (np.mean(desc_dist), np.std(desc_dist)))
            ran_dist = distance.cdist(sclr.transform(g_ran.descriptor), sclr.transform(g_seq.descriptor),
                                      metric='euclidean')
            f.write("Average euclidean distance if randomly sampled seqs:\t%.2f +/- %.2f\n" %
                    (np.mean(ran_dist), np.std(ran_dist)))
            hel_dist = distance.cdist(sclr.transform(g_hel.descriptor), sclr.transform(g_seq.descriptor),
                                      metric='euclidean')
            f.write("Average euclidean distance if amphipathic helical seqs:\t%.2f +/- %.2f\n" %
                    (np.mean(hel_dist), np.std(hel_dist)))
            
            # hydrophobic moments
            uh_seq = PeptideDescriptor(seq_desc.sequences, 'eisenberg')
            uh_seq.calculate_moment()
            uh_gen = PeptideDescriptor(gen_desc.sequences, 'eisenberg')
            uh_gen.calculate_moment()
            uh_ran = PeptideDescriptor(ran_desc.sequences, 'eisenberg')
            uh_ran.calculate_moment()
            uh_hel = PeptideDescriptor(hel_desc.sequences, 'eisenberg')
            uh_hel.calculate_moment()
            f.write("\n\nHYDROPHOBIC MOMENTS\n\n")
            f.write("Hydrophobic moment of training seqs:\t%.3f +/- %.3f\n" %
                    (np.mean(uh_seq.descriptor), np.std(uh_seq.descriptor)))
            f.write("Hydrophobic moment of sampled seqs:\t\t%.3f +/- %.3f\n" %
                    (np.mean(uh_gen.descriptor), np.std(uh_gen.descriptor)))
            f.write("Hydrophobic moment of random seqs:\t\t%.3f +/- %.3f\n" %
                    (np.mean(uh_ran.descriptor), np.std(uh_ran.descriptor)))
            f.write("Hydrophobic moment of amphipathic seqs:\t%.3f +/- %.3f\n" %
                    (np.mean(uh_hel.descriptor), np.std(uh_hel.descriptor)))
        
        if plot:
            if self.refs:
                a = GlobalAnalysis([uh_seq.sequences, uh_gen.sequences, uh_hel.sequences, uh_ran.sequences],
                                   ['training', 'sampled', 'hel', 'ran'])
            else:
                a = GlobalAnalysis([uh_seq.sequences, uh_gen.sequences], ['training', 'sampled'])
            a.plot_summary(filename=fname[:-4] + '.png')
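
The method above leans on several imports from its enclosing module. A sketch of the ones its body needs (these classes and helpers exist under these names in modlamp, NumPy, SciPy, and scikit-learn; only the grouping is assumed):

import numpy as np
from scipy.spatial import distance
from sklearn.preprocessing import StandardScaler
from modlamp.analysis import GlobalAnalysis
from modlamp.core import count_aas
from modlamp.descriptors import GlobalDescriptor, PeptideDescriptor
from modlamp.sequences import Helices, Random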
Example #7
import os
import pickle

import numpy as np
import pandas as pd
from progressbar import ProgressBar
from sklearn.cross_validation import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import matthews_corrcoef, accuracy_score
from modlamp.core import read_fasta
from modlamp.descriptors import PeptideDescriptor

seed = np.random.RandomState(seed=42)

for d in os.listdir('./output'):
    if os.path.isdir('./output/' + d):
        print("\nRunning %s..." % d)
        sclr = pickle.load(open('./output/' + d + '/scaler.p', 'rb'))  # binary mode for unpickling
        pos = read_fasta('./input/' + d + '/Pos.fasta')[0]
        neg = read_fasta('./input/' + d + '/Neg.fasta')[0]
        desc = PeptideDescriptor(pos + neg, 'PPCALI')
        desc.calculate_autocorr(7)
        X = sclr.transform(desc.descriptor)
        y = np.array(len(pos) * [1] + len(neg) * [0])
        skf = StratifiedKFold(y, n_folds=10)

        synth = pd.read_csv('./output/' + d + '/synthesis_selection.csv')

        print("\tPerforming 10-fold cross-validation")
        mcc = list()
        acc = list()
        pbar = ProgressBar()
        for train, test in pbar(skf):
            clf = RandomForestClassifier(bootstrap=True,
                                         class_weight=None,
                                         criterion='gini',
                                         max_depth=None,