def predictFluSeq(seqs):
    """Score a random forest that predicts each sequence from its predecessor.

    Consecutive entries of ``seqs`` are turned into (previous, next) training
    pairs, encoded with ``Encoding_v2.encoding`` and used to evaluate a
    ``RandomForestRegressor`` twice: with 2-fold cross-validation and with a
    50/50 hold-out split.

    Args:
        seqs: Sized, integer-indexable collection of records that expose a
            ``.seq`` attribute.
            NOTE(review): the original comment called this "the file path of
            your FASTA files", but the code indexes it and reads ``.seq``, so
            it has to be already-parsed records -- confirm with the caller.

    Returns:
        A 3-tuple ``(cv_score, avg_cv_score, mse_score)``:

        * ``cv_score`` -- ``("Random Forests cross-validation score", scores)``
          where ``scores`` is the per-fold array from ``cross_val_score``.
        * ``avg_cv_score`` -- a formatted string with the mean and standard
          deviation of those scores, both multiplied by 100.
        * ``mse_score`` -- ``('Random Forests MSE:', mse)`` for the hold-out
          split.

    Raises:
        ValueError: If fewer than three sequences are supplied, because two
            training pairs are the minimum for ``cv=2``.
    """
    # Local imports, as in the original, so the module can be imported even
    # when these packages are unavailable.
    from Encoding_v2 import encoding
    from sklearn import ensemble, metrics
    from sklearn.model_selection import cross_val_score, train_test_split

    count = len(seqs)
    if count < 3:
        raise ValueError(
            "predictFluSeq needs at least 3 sequences to build the 2 "
            "(previous, next) pairs required by cv=2; got %d" % count
        )

    # Inputs are records 0..n-2, targets are their successors 1..n-1.
    # The target list previously indexed with the stale loop variable of the
    # input loop, which made every target the same record.
    X0 = [seqs[i].seq for i in range(count - 1)]
    y0 = [seqs[j].seq for j in range(1, count)]

    # Encoding letters into numbers
    X = [encoding(sequence) for sequence in X0]
    y = [encoding(sequence) for sequence in y0]

    # Cross-Validation
    # No ``scoring`` argument is given, so the regressor's own ``score``
    # method supplies the metric that the label below calls "Accuracy".
    rfr = ensemble.RandomForestRegressor()
    rfrscores = cross_val_score(rfr, X, y, cv=2)
    cv_score = ("Random Forests cross-validation score", rfrscores)
    avg_cv_score = ("Average Cross-Val Accuracy: %0.2f (+/- %0.2f)"
                    % (rfrscores.mean() * 100, rfrscores.std() * 100))

    # Mean Squared Error
    # The split is seeded (random_state=50); the forest itself is not, so the
    # reported MSE can still vary between runs.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.5, random_state=50)
    rfr.fit(X_train, y_train)
    y_predicted = rfr.predict(X_test)
    mse_score = ('Random Forests MSE:',
                 metrics.mean_squared_error(y_test, y_predicted))

    return cv_score, avg_cv_score, mse_score
def predictFluSeq(seqs):
    """Score a random forest that predicts each sequence from its predecessor.

    Consecutive entries of ``seqs`` are turned into (previous, next) training
    pairs, encoded with ``Encoding_v2.encoding`` and used to evaluate a
    ``RandomForestRegressor`` twice: with 2-fold cross-validation and with a
    50/50 hold-out split.

    Args:
        seqs: Sized, integer-indexable collection of records that expose a
            ``.seq`` attribute.
            NOTE(review): the original comment called this "the file path of
            your FASTA files", but the code indexes it and reads ``.seq``, so
            it has to be already-parsed records -- confirm with the caller.

    Returns:
        A 3-tuple ``(cv_score, avg_cv_score, mse_score)``:

        * ``cv_score`` -- ``("Random Forests cross-validation score", scores)``
          where ``scores`` is the per-fold array from ``cross_val_score``.
        * ``avg_cv_score`` -- a formatted string with the mean and standard
          deviation of those scores, both multiplied by 100.
        * ``mse_score`` -- ``('Random Forests MSE:', mse)`` for the hold-out
          split.

    Raises:
        ValueError: If fewer than three sequences are supplied, because two
            training pairs are the minimum for ``cv=2``.
    """
    # Local imports, as in the original, so the module can be imported even
    # when these packages are unavailable.
    from Encoding_v2 import encoding
    from sklearn import ensemble, metrics
    try:
        from sklearn.model_selection import cross_val_score, train_test_split
    except ImportError:
        # Older scikit-learn releases only ship the legacy module, which
        # newer releases have removed.
        from sklearn.cross_validation import cross_val_score, train_test_split

    count = len(seqs)
    if count < 3:
        raise ValueError(
            "predictFluSeq needs at least 3 sequences to build the 2 "
            "(previous, next) pairs required by cv=2; got %d" % count
        )

    # Inputs are records 0..n-2, targets are their successors 1..n-1.
    # The target list previously indexed with the stale loop variable of the
    # input loop, which made every target the same record.
    X0 = [seqs[i].seq for i in range(count - 1)]
    y0 = [seqs[j].seq for j in range(1, count)]

    # Encoding letters into numbers
    X = [encoding(sequence) for sequence in X0]
    y = [encoding(sequence) for sequence in y0]

    # Cross-Validation
    # No ``scoring`` argument is given, so the regressor's own ``score``
    # method supplies the metric that the label below calls "Accuracy".
    rfr = ensemble.RandomForestRegressor()
    rfrscores = cross_val_score(rfr, X, y, cv=2)
    cv_score = ("Random Forests cross-validation score", rfrscores)
    avg_cv_score = ("Average Cross-Val Accuracy: %0.2f (+/- %0.2f)"
                    % (rfrscores.mean() * 100, rfrscores.std() * 100))

    # Mean Squared Error
    # The split is seeded (random_state=50); the forest itself is not, so the
    # reported MSE can still vary between runs.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.5, random_state=50)
    rfr.fit(X_train, y_train)
    y_predicted = rfr.predict(X_test)
    mse_score = ('Random Forests MSE:',
                 metrics.mean_squared_error(y_test, y_predicted))

    return cv_score, avg_cv_score, mse_score
#adding to X and y for i in range(0, len(new) - 1): X0.append(new[i].seq) #print len(X0) y0 = [] for j in range(1, len(new)): y0.append(new[i].seq) from Encoding_v2 import encoding X = [] for k in range(len(X0)): encoded_X = encoding(X0[k]) X.append(encoded_X) y = [] for l in range(len(y0)): encoded_y = encoding(y0[l]) y.append(encoded_y) ''' print len(X[0]) print len(y[298]) a = [1,2,3,4,5] print len(a) from Compare_Strains import test_length
# adding to X and y
# `new` (the parsed records) and `X0` are created earlier in this script,
# outside this fragment. Inputs are records 0..n-2.
X0.extend(new[i].seq for i in range(len(new) - 1))

# Targets are the successors 1..n-1 of those inputs. This list previously
# indexed with the stale loop variable of the input loop, which made every
# target the same record.
y0 = [new[j].seq for j in range(1, len(new))]

from Encoding_v2 import encoding

# Changing A,C,T,G into 1,2,3,4 (per the original note; the mapping itself
# lives in Encoding_v2)
X = [encoding(sequence) for sequence in X0]
y = [encoding(sequence) for sequence in y0]

# Using sklearn models for prediction
from sklearn import tree

dtr = tree.DecisionTreeRegressor()
dtr.fit(X, y)

try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    # Older scikit-learn releases only ship the legacy module, which newer
    # releases have removed.
    from sklearn.cross_validation import cross_val_score

# 2-fold cross-validation; with no `scoring` argument the regressor's own
# `score` method supplies the metric.
dtrscores = cross_val_score(dtr, X, y, cv=2)
# adding to X and y
# `new` (the parsed records) and `X0` are created earlier in this script,
# outside this fragment. Inputs are records 0..n-2.
X0.extend(new[i].seq for i in range(len(new) - 1))

# Targets are the successors 1..n-1 of those inputs. This list previously
# indexed with the stale loop variable of the input loop, which made every
# target the same record.
y0 = [new[j].seq for j in range(1, len(new))]

# `decoding` and `compare_sequences` are not used in this fragment; they are
# kept because later parts of the script may rely on them.
from Encoding_v2 import encoding, decoding, compare_sequences

# Encoding
X = [encoding(sequence) for sequence in X0]
y = [encoding(sequence) for sequence in y0]

# ML and accuracy
from sklearn import tree

dtr = tree.DecisionTreeRegressor()
dtr.fit(X, y)

from sklearn.model_selection import cross_val_score, train_test_split

# 2-fold cross-validation; with no `scoring` argument the regressor's own
# `score` method supplies the metric.
dtrscores = cross_val_score(dtr, X, y, cv=2)
print('Decision Trees', dtrscores)