def filter_feature_file(feature_file, lineage_file, filtered_features_file):
    """
    Filter a feature file so that only one member of each lineage (the first
    one encountered) is kept to represent the whole lineage.

    :param feature_file: A file containing a mapping from ID to features
    :param lineage_file: A file containing a mapping from ID to lineage
    :param filtered_features_file: Output path for the filtered feature file
    :return: None. The filtered features are written to filtered_features_file
    """
    ids, features = fileIO.read_feature_file(feature_file)
    lineage_dict = fileIO.read_lineage_file(lineage_file)

    # Use a set for O(1) membership tests; the original list made this loop
    # O(n^2) on large feature files. First-seen order is preserved by `which`.
    seen_lineages = set()
    which = []
    for i, seq_id in enumerate(ids):
        lineage = lineage_dict[seq_id]
        if lineage not in seen_lineages:
            which.append(i)
            seen_lineages.add(lineage)

    # ids/features support fancy indexing (arrays from read_feature_file).
    filtered_ids = ids[which]
    filtered_features = features[which]

    header = "# Filtered k-mer count file"
    header += "\n# Filtered from: %s" % feature_file
    header += "\n# Filtered by: %s" % lineage_file
    fileIO.save_counts(filtered_features, filtered_ids, filtered_features_file, header=header)
def test_cut_response(self):
    """
    Test the effect of sequence cut length on the Phamer learning algorithm.

    For each cut size, cross-validates on the phage/bacteria feature files for
    that size, records the ROC AUC, and saves a plot of AUC vs. cut size.

    :return: A dictionary mapping cut length to ROC AUC values
    """
    self.cut_file_map = self.get_cut_file_map()
    self.cut_sizes = sorted(self.cut_file_map.keys())
    self.aucs = np.zeros(len(self.cut_sizes))

    if self.validator is None:
        self.validator = cross_validate.cross_validator()
    self.validator.scoring_function = phamer.score_points
    self.validator.N = self.N_fold

    # enumerate() replaces the manual `i` counter of the original.
    for i, cut_size in enumerate(self.cut_sizes):
        logger.info("Cross validating with cutsize: %d bp" % cut_size)
        # Select the phage/bacteria feature files for this cut size by
        # filename prefix (avoids shadowing the builtin `file`).
        cut_files = self.cut_file_map[cut_size]
        phage_file = next(f for f in cut_files
                          if os.path.basename(f).startswith("phage"))
        bacteria_file = next(f for f in cut_files
                             if os.path.basename(f).startswith("bacteria"))

        self.validator.positive_data = fileIO.read_feature_file(
            phage_file, normalize=True, old=True)[1]
        self.validator.negative_data = fileIO.read_feature_file(
            bacteria_file, normalize=True, old=True)[1]

        phage_scores, bacteria_scores = self.validator.cross_validate()
        # Index 2 of predictor_performance is the ROC AUC (per original usage).
        self.aucs[i] = learning.predictor_performance(
            phage_scores, bacteria_scores)[2]

    plt.figure(figsize=(9, 6))
    # Prepend a (0, 0) point so the curve starts at the origin; x-axis in kbp.
    plt.plot(np.array([0] + self.cut_sizes) / 1000.0,
             [0] + list(self.aucs), 'b-o')
    plt.grid(True)
    plt.xlabel('Cut Size (kbp)')
    plt.ylabel('ROC AUC')
    plt.savefig(self.get_output_plot_filename())

    return dict(zip(self.cut_sizes, self.aucs))
def load_data(self):
    """
    Load reference (positive/negative) and input data into this scorer.

    Pre-computed feature files are preferred; otherwise k-mers are counted
    from the corresponding fasta files. Input features counted from fasta
    are also saved to disk. Exits the process if no input source exists.

    :return: None. The scorer object is modified in place.
    """
    # --- Reference data ---
    # Fixed: the original tested `self.positive_features` but then read
    # `self.positive_features_file` (the negative branch checks `_file`).
    if self.positive_features_file and os.path.exists(self.positive_features_file):
        logger.info("Reading positive features from: %s" % os.path.basename(self.positive_features_file))
        self.positive_ids, self.positive_data = fileIO.read_feature_file(self.positive_features_file, normalize=True)
    elif self.positive_fasta and os.path.exists(self.positive_fasta):
        logger.info("Counting positive k-mers from: %s" % os.path.basename(self.positive_fasta))
        self.positive_ids, self.positive_data = kmer.count(self.positive_fasta)

    if self.negative_features_file and os.path.exists(self.negative_features_file):
        logger.info("Reading negative features from: %s" % os.path.basename(self.negative_features_file))
        self.negative_ids, self.negative_data = fileIO.read_feature_file(self.negative_features_file, normalize=True)
    elif self.negative_fasta and os.path.exists(self.negative_fasta):
        logger.info("Counting negative k-mers from: %s" % os.path.basename(self.negative_fasta))
        self.negative_ids, self.negative_data = kmer.count(self.negative_fasta)

    self.find_input_files()

    # --- Input data ---
    # Fixed: the original mixed a module-level `scorer` global with `self`
    # (it assigned scorer.data_points but later read self.data_points, and
    # referenced `args.length_requirement`) — unified on `self`, which this
    # method already uses everywhere else.
    if self.features_file is not None and os.path.exists(self.features_file):
        logger.info("Reading features from: %s..." % os.path.basename(self.features_file))
        self.data_ids, self.data_points = fileIO.read_feature_file(self.features_file)
    elif self.fasta_file is not None and os.path.exists(self.fasta_file):
        logger.info("Calculating features of: %s" % os.path.basename(self.fasta_file))
        self.data_ids, self.data_points = kmer.count_file(self.fasta_file, self.kmer_length, normalize=False)
        self.features_file = "{base}_features.csv".format(base=os.path.splitext(self.fasta_file)[0])
        logger.info("Saving features to {file}...".format(file=self.features_file))
        fileIO.save_counts(self.data_points, self.data_ids, self.features_file)
    else:
        logger.error("No input fasta file or features file. Exiting...")
        exit()

    self.data_points = kmer.normalize_counts(self.data_points)
    if self.length_requirement:
        self.screen_by_length()
elif args.verbose: logger.setLevel(logging.INFO) logging.basicConfig( format='[%(asctime)s][%(levelname)s][%(funcName)s] - %(message)s') else: logger.setLevel(logging.WARNING) logging.basicConfig(format='[log][%(levelname)s] - %(message)s') validator = cross_validator() validator.scoring_function = phamer.score_points validator.method = args.method validator.N = args.N_fold validator.method = args.method validator.output_directory = args.output_directory positive_ids, positive_data = fileIO.read_feature_file( args.positive_features_file) negative_ids, negative_data = fileIO.read_feature_file( args.negative_features_file) validator.positive_ids = positive_ids validator.negative_ids = negative_ids validator.positive_data = kmer.normalize_counts(positive_data) validator.negative_data = kmer.normalize_counts(negative_data) validator.equalize_reference = args.equalize_reference validator.cross_validate() logger.info("Plotting score distributions...") validator.plot_score_distributions() logger.info("Plotting ROC curve...") validator.plot_ROC()
def load_data(self):
    """
    Load all the data necessary for plotting into memory, creating the
    output directory if needed.

    Features come from the features file when present, otherwise 4-mers are
    counted from the fasta file and saved. t-SNE coordinates are loaded from
    file when permitted, otherwise t-SNE is computed (with an optional PCA
    pre-reduction) and saved. Finally, lineages are loaded.

    :return: None
    """
    if self.output_directory and not os.path.isdir(self.output_directory):
        try:
            os.mkdir(self.output_directory)
        except OSError:
            # Narrowed from a bare `except:` — only directory-creation
            # failures belong here.
            logger.error("Could not create: %s" % self.output_directory)
            logger.error(
                "Resolve this by creating this directory yourself and re-running"
            )
            exit(1)

    if self.features_file and os.path.exists(self.features_file):
        # Loading features
        logger.info("Loading features from: %s ..." %
                    os.path.basename(self.features_file))
        self.id_list, self.features = fileIO.read_feature_file(
            self.features_file, normalize=True)
        logger.info("Loaded features.")
    elif self.fasta_file and os.path.exists(self.fasta_file):
        # Calculating 4-mer features from scratch
        logger.info("No feature file provided, calculating features...")
        self.id_list, self.features = kmer.count_file(self.fasta_file, 4,
                                                      normalize=True)
        self.features_outfile = self.get_kmers_out_filename()
        logger.info("Calculated features. Saving features to: %s" %
                    os.path.basename(self.features_outfile))
        # NOTE(review): `args` is a module-level global here — confirm it is
        # defined whenever this method runs.
        fileIO.save_counts(self.features, self.id_list,
                           self.features_outfile, args=args)
        logger.info("Saved features.")

    if not self.do_tsne and os.path.isfile(
            self.tsne_file) and os.path.isfile(self.features_file):
        # Loading previously computed t-SNE data
        logger.info("Loading t-SNE data from: %s ..." %
                    os.path.basename(self.tsne_file))
        self.id_list, self.tsne_data, _ = fileIO.read_tsne_file(
            self.tsne_file)
        logger.info("Loaded t-SNE data.")
    else:
        # Doing t-SNE
        logger.info("Performing t-SNE...")
        if self.PCA_preprocess:
            logger.info("Pre-processing with PCA...")
            tsne_input = PCA(
                n_components=self.pca_preprocess_red).fit_transform(
                    self.features)
        else:
            tsne_input = self.features
        # Single TSNE configuration shared by both branches (the original
        # duplicated the identical constructor call in each branch).
        self.tsne_data = TSNE(
            perplexity=self.perplexity,
            verbose=True,
            random_state=self.tsne_seed,
            init=self.tsne_init,
            early_exaggeration=self.early_exaggeration,
            learning_rate=self.tsne_learning_rate).fit_transform(tsne_input)
        logger.info("t-SNE complete.")
        self.tsne_file = self.get_tsne_filename()
        fileIO.save_tsne_data(self.tsne_file, self.tsne_data, self.id_list)
        logger.info("Saved t-SNE to: %s" % os.path.basename(self.tsne_file))

    logger.info("Loading lineages from: %s ..." %
                os.path.basename(self.lineage_file))
    self.lineages = self.get_lineages()
import argparse

if __name__ == '__main__':
    # Command-line entry point: sweep the number of k-means clusters over the
    # feature data. NOTE(review): this chunk is truncated — `means` and
    # `sil_scores` are filled past the visible lines; do not assume the loop
    # body ends here.
    parser = argparse.ArgumentParser()
    parser.add_argument("-in", "--features_file", required=True, help="Features file")
    parser.add_argument("-out", "--output_file", required=False, help="Filename for the output plot")
    args = parser.parse_args()

    # Normalized feature matrix and matching sequence IDs.
    ids, data = fileIO.read_feature_file(args.features_file, normalize=True)

    # Candidate cluster counts 10, 20, ..., 590.
    # The sorted(set(...)) round-trip is currently a no-op on this arange.
    k_clusters = np.arange(10, 600, 10)
    k_clusters = np.array(sorted(list(set(k_clusters))))
    sil_scores = np.zeros(k_clusters.shape)
    sil_score_std = np.zeros(k_clusters.shape)
    num_repeats = 5  # repetitions per k to average out k-means randomness
    # NOTE: xrange implies this script targets Python 2.
    for i in xrange(k_clusters.shape[0]):
        k = k_clusters[i]
        means = np.zeros(num_repeats)
        for j in xrange(num_repeats):
            # Silhouette score of one k-means clustering at this k;
            # accumulation presumably continues beyond this chunk.
            sils = learning.silhouette_score(
                data, learning.kmeans(data, k, verbose=True))