def pipeline(trainingDir, testingDir, GetTrainingFeatures, GetTestFeatures, classType):
    """Extract features, train a classifier, and predict labels for a test set.

    Parameters:
        trainingDir: directory containing the training sequence sets; features
            are written to/read from <trainingDir>/trainingSetFeatures.csv.
        testingDir: directory containing the test sequences; features and
            predicted labels are written there.
        GetTrainingFeatures: when True, extract training features and train
            the model.
        GetTestFeatures: when True, extract test features and predict labels.
        classType: class-labelling scheme passed through to featExt for the
            training set (the test set always uses 'dir').

    Side effects: writes testingSetFeatures.csv and testingSetResults.csv
    under testingDir, and prints progress messages.
    """
    if GetTrainingFeatures:
        print('Starting to extract features from training set')
        # Temporary measure: if features were already extracted and saved,
        # disable the following call to avoid re-extracting training features.
        featExt(directory=trainingDir, trainingSetFlag=True,
                classType=classType, normFlag=True, normParams='.')
        print('Extracted training data features')

        # NOTE: the original timed these phases with time.clock(), which was
        # removed in Python 3.8; the measurements were never used, so the
        # dead timing code has been dropped.
        model, lb_encoder = trainClassifier(
            os.path.join(trainingDir, 'trainingSetFeatures.csv'),
            False, 'forest', 0, False, False)
        print('Created and trained model')

    if GetTestFeatures:
        print('Starting to extract features from testing set')
        featExt(directory=testingDir, trainingSetFlag=False, classType='dir',
                normFlag=True,
                normParams=os.path.join(trainingDir, 'trainingSetNormParams.csv'))
        print('Extracted testing data features')

        # pd.DataFrame.from_csv was removed from pandas;
        # read_csv(..., index_col=0) is the drop-in replacement.
        df = pd.read_csv(os.path.join(testingDir, 'testingSetFeatures.csv'),
                         index_col=0)
        features = df.values
        print('Predicting labels')
        results = model.predict(features)

        # Decode all predictions in one vectorized call. The original looped
        # row by row with chained assignment (df['classname'][index] = ...),
        # which is both slow and unreliable in modern pandas.
        df['classname'] = lb_encoder.inverse_transform(results)
        df = df['classname']
        out_path = os.path.join(testingDir, 'testingSetResults.csv')
        df.to_csv(out_path)
        print('saved results to ' + out_path)
def train_model(dataset_files, classformat, outputmodel, classifiertype):
    """Given set of files with fasta sequences, class format (e.g., file),
    filename to save model and required model type (e.g., forest)
    Train the model and save it to the given file
    """
    output_features_file = "ProFET_features.csv"
    # Feature extraction writes output_features_file as a side effect; the
    # returned dict is not used further here.
    features_dict = extract_datasets_features(dataset_files, classformat, output_features_file)

    print("Learning %s model" % classifiertype)
    model, label_encoder, scaler, feature_names = trainClassifier(
        output_features_file, classifiertype,
        kbest=0, alpha=False, optimalFlag=False, normFlag=True
    )

    # Save model and additional data to file. Use a context manager so the
    # file handle is always closed — the original passed open() directly to
    # pickle.dump and leaked the handle if pickling raised.
    with open(outputmodel, "wb") as model_file:
        pickle.dump(
            (model, label_encoder, scaler, feature_names),
            model_file,
            protocol=pickle.HIGHEST_PROTOCOL,
        )
    print("Done")
def train_model(dataset_files, classformat, outputmodel, classifiertype):
    '''Given set of files with fasta sequences, class format (e.g., file),
    filename to save model and required model type (e.g., forest)
    Train the model and save it to the given file
    '''
    output_features_file = 'ProFET_features.csv'
    # Feature extraction writes output_features_file as a side effect; the
    # returned dict is not used further here.
    features_dict = extract_datasets_features(dataset_files, classformat, output_features_file)

    print('Learning %s model' % classifiertype)
    model, label_encoder, scaler, feature_names = trainClassifier(
        output_features_file, classifiertype,
        kbest=0, alpha=False, optimalFlag=False, normFlag=True)

    # Save model and additional data to file. Use a context manager so the
    # file handle is always closed — the original passed open() directly to
    # pickle.dump and leaked the handle if pickling raised.
    with open(outputmodel, 'wb') as model_file:
        pickle.dump((model, label_encoder, scaler, feature_names),
                    model_file, protocol=pickle.HIGHEST_PROTOCOL)
    print('Done')
def pipeline(trainingDir, testingDir, resultsDir, GetTrainingFeatures, GetTestFeatures, classType):
    """Extract features, train a 'forest' classifier, predict test-set labels,
    and optionally write classified FASTA files.

    Parameters:
        trainingDir: directory with training sequences; trainingSetFeatures.csv
            and trainingSetNormParams.csv live here.
        testingDir: directory with test sequences; predictions are saved here.
        resultsDir: directory for classified FASTA output (skipped if not
            writable).
        GetTrainingFeatures: when True, (re-)extract training features.
        GetTestFeatures: when True, train the model, extract test features and
            predict.
        classType: class-labelling scheme for the training set.

    Side effects: writes PredictedTestSetResults.csv under testingDir, dumps
    profiler stats to profile.txt (uses the module-level `profiler`).
    """
    print(profiler)
    if GetTrainingFeatures:
        print('Starting to extract features from training set')
        # Temporary measure: if features were already extracted and saved,
        # disable the following call to avoid re-extracting training features.
        featExt(directory=trainingDir, trainingSetFlag=True,
                classType=classType, normParams='.')
        print('Extracted training data features')

    # TODO: Separate model training/prediction from feature extraction!
    if GetTestFeatures:
        print('Training predictive model')
        model, lb_encoder = trainClassifier(
            filename=os.path.join(trainingDir, 'trainingSetFeatures.csv'),
            normFlag=False, classifierType='forest', kbest=0,
            alpha=False, optimalFlag=False)
        print('Model trained')

    # TODO: change to "if GetPredictions == True" after adding such a param.
    if GetTestFeatures:
        # TODO: If more than 4k seqs, predict in chunks - DANs
        print()
        print('Extracting features from test set')
        print("trainingDir: ", trainingDir)
        featExt(directory=testingDir, trainingSetFlag=False, classType='dir',
                normParams=os.path.join(trainingDir, 'trainingSetNormParams.csv'))
        print('Extracted test data features')

        # pd.DataFrame.from_csv was removed from pandas;
        # read_csv(..., index_col=0) is the drop-in replacement.
        dfTesting = pd.read_csv(os.path.join(testingDir, 'testingSetFeatures.csv'),
                                index_col=0)
        # Only the training set's column names are needed, so read just the
        # header plus two rows instead of the whole file.
        dfTraining = pd.read_csv(os.path.join(trainingDir, 'trainingSetFeatures.csv'),
                                 nrows=2)

        # FeatureFilt: keep only the features present in the training set,
        # padding features missing from the test set with zeroes. This is
        # crucial, and must be reapplied if feature filtering/selection is
        # used elsewhere.
        feature_cols = [col for col in dfTraining.columns
                        if col not in ('classname', 'Id', 'proteinname')]
        common_cols = [col for col in feature_cols if col in dfTesting.columns]
        missing_cols = [col for col in feature_cols if col not in dfTesting.columns]
        dfTesting = dfTesting[common_cols]

        print("Orig dfTesting.shape:", dfTesting.shape)
        print("Missing_cols (in dfTesting: \n", missing_cols)
        print("len(dfTesting)", len(dfTesting), "len(dfTesting).columns", len(dfTesting.columns))

        # Pad each missing feature with a scalar 0. The original assigned
        # pd.Series([0] * len(dfTesting)), whose fresh RangeIndex does not
        # align with dfTesting's index and silently fills NaN instead of 0.
        for col in missing_cols:
            dfTesting[col] = 0

        print("dfTraining (shape) was:", dfTraining.shape)
        print("dfTesting shape (after padding features):", dfTesting.shape)
        print("Features matched")  # May be unnecessary?

        dfTesting.fillna(0, inplace=True)
        features = dfTesting.values
        print('Predicting labels')
        results = model.predict(features)
        labels = lb_encoder.inverse_transform(results)
        dfTesting['classname'] = labels

        df2 = dfTesting['classname']
        # os.path.join instead of a hard-coded '\\' separator: the original
        # produced a literal-backslash filename on POSIX systems.
        results_path = os.path.join(testingDir, 'PredictedTestSetResults.csv')
        df2.to_csv(results_path)
        print('Saved results to ' + results_path)

        if os.access(resultsDir, os.F_OK) and os.access(resultsDir, os.W_OK):
            writeClassifiedFastas(classType, testingDir, resultsDir, df2)
        else:
            print("Classified fastas were not written - no access to %s" % resultsDir)
    profiler.dump_stats('profile.txt')
def _validate_dir(dir_path, label):
    """Exit the program unless dir_path exists with read/write/execute access.

    label names the directory in error messages (e.g. 'training').
    """
    if not os.path.exists(dir_path):
        print("%s dir doesn't exist" % label)
        exit()
    if not (os.access(dir_path, os.R_OK) and os.access(dir_path, os.X_OK)
            and os.access(dir_path, os.W_OK)):
        # Fixed typo in the original message ("don\' have permission").
        print("don't have permission to access %s dir" % label)
        exit()


def pipeline():
    """Command-line entry point: parse args, extract features, train, predict.

    Reads options from the module-level argparse `parser`. Writes predicted
    labels to <testingDir>/PredictedTestSetResults.csv, optionally writes
    classified FASTA files to resultsDir, and dumps profiler stats to
    profile.txt (uses the module-level `profiler`).
    """
    results = parser.parse_args()
    trainingDir = results.trainingDir
    testingDir = results.testingDir
    resultsDir = results.resultsDir
    GetTrainingFeatures = results.GetTrainingFeatures
    GetTestFeatures = results.GetTestFeatures
    classType = results.classType
    classifierType = results.classifierType
    outputTrainedModel = results.outputTrainedModel

    # Validate every directory the run will touch before doing any work.
    # (The original repeated this check inline three times.)
    if trainingDir:
        _validate_dir(trainingDir, 'training')
    if testingDir:
        _validate_dir(testingDir, 'testing')
    if resultsDir:
        _validate_dir(resultsDir, 'results')

    print(profiler)

    if GetTrainingFeatures:
        print('Starting to extract features from training set')
        # Temporary measure: if features were already extracted and saved,
        # disable the following call to avoid re-extracting training features.
        featExt(directory=trainingDir, trainingSetFlag=True,
                classType=classType, normParams='.')
        print('Extracted training data features')

    # TODO: Separate model training/prediction from feature extraction!
    if GetTestFeatures or outputTrainedModel:
        print('Training predictive model')
        model, lb_encoder = trainClassifier(
            filename=os.path.join(trainingDir, 'trainingSetFeatures.csv'),
            normFlag=False, classifierType=classifierType, kbest=0,
            alpha=False, optimalFlag=False)
        print('Model trained')

    # TODO: change to "if GetPredictions == True" after adding such a param.
    if GetTestFeatures:
        # TODO: If more than 4k seqs, predict in chunks - DANs
        print()
        print('Extracting features from test set')
        print("trainingDir: ", trainingDir)
        featExt(directory=testingDir, trainingSetFlag=False, classType='dir',
                normParams=os.path.join(trainingDir, 'trainingSetNormParams.csv'))
        print('Extracted test data features')

        # pd.DataFrame.from_csv was removed from pandas;
        # read_csv(..., index_col=0) is the drop-in replacement.
        dfTesting = pd.read_csv(os.path.join(testingDir, 'testingSetFeatures.csv'),
                                index_col=0)
        # We use the training frame only for feature-name consistency, so
        # read just the header plus two rows instead of the whole file.
        dfTraining = pd.read_csv(os.path.join(trainingDir, 'trainingSetFeatures.csv'),
                                 nrows=2)

        # FeatureFilt: keep only the features present in the training set,
        # padding features missing from the test set with zeroes. This is
        # crucial, and must be reapplied if feature filtering/selection is
        # used elsewhere.
        feature_cols = [col for col in dfTraining.columns
                        if col not in ('classname', 'Id', 'proteinname')]
        common_cols = [col for col in feature_cols if col in dfTesting.columns]
        missing_cols = [col for col in feature_cols if col not in dfTesting.columns]
        dfTesting = dfTesting[common_cols]

        print("Orig dfTesting.shape:", dfTesting.shape)
        print("Missing_cols (in dfTesting: \n", missing_cols)
        print("len(dfTesting)", len(dfTesting), "len(dfTesting).columns", len(dfTesting.columns))

        # Pad each missing feature with a scalar 0. The original assigned
        # pd.Series([0] * len(dfTesting)), whose fresh RangeIndex does not
        # align with dfTesting's index and silently fills NaN instead of 0.
        for col in missing_cols:
            dfTesting[col] = 0

        print("dfTraining (shape) was:", dfTraining.shape)
        print("dfTesting shape (after padding features):", dfTesting.shape)
        print("Features matched")  # May be unnecessary?

        dfTesting.fillna(0, inplace=True)
        features = dfTesting.values
        print('Predicting labels')
        results_pred = model.predict(features)
        labels = lb_encoder.inverse_transform(results_pred)
        dfTesting['classname'] = labels

        df2 = dfTesting['classname']
        # os.path.join instead of a hard-coded '\\' separator: the original
        # produced a literal-backslash filename on POSIX systems.
        results_path = os.path.join(testingDir, 'PredictedTestSetResults.csv')
        df2.to_csv(results_path)
        print('Saved results to ' + results_path)

        if os.access(resultsDir, os.F_OK) and os.access(resultsDir, os.W_OK):
            writeClassifiedFastas(classType, testingDir, resultsDir, df2)
        else:
            print("Classified fastas were not written - no access to %s" % resultsDir)
    profiler.dump_stats('profile.txt')