Пример #1
0
def filter_feature_file(feature_file, lineage_file, filtered_features_file):
    """
    Reduce a feature file so that each lineage is represented by exactly one
    member (the first one encountered in file order).
    :param feature_file: A file containing a mapping from ID to features
    :param lineage_file: A file containing a mapping from ID to lineage
    :param filtered_features_file: Path where the filtered counts are written
    :return: None.
    """
    ids, features = fileIO.read_feature_file(feature_file)
    lineage_dict = fileIO.read_lineage_file(lineage_file)
    # Lineages may not be hashable, so membership is tracked with a list.
    seen_lineages = []
    keep_indices = []
    for index, identifier in enumerate(ids):
        lineage = lineage_dict[identifier]
        if lineage not in seen_lineages:
            keep_indices.append(index)
            seen_lineages.append(lineage)
    # ids/features support indexing by a list of positions (numpy-style).
    filtered_ids = ids[keep_indices]
    filtered_features = features[keep_indices]
    header = "# Filtered k-mer count file"
    header += "\n# Filtered from: %s" % feature_file
    header += "\n# Filtered by: %s" % lineage_file
    fileIO.save_counts(filtered_features,
                       filtered_ids,
                       filtered_features_file,
                       header=header)
Пример #2
0
    def test_cut_response(self):
        '''
        This function tests the effect of sequence cut length on Phamer learning algorithm
        by cross validating on the feature files generated for each cut size and
        recording the ROC AUC of each run.
        :return: A dictionary mapping cut length to ROC AUC values
        '''
        self.cut_file_map = self.get_cut_file_map()
        self.cut_sizes = sorted(self.cut_file_map.keys())
        self.aucs = np.zeros(len(self.cut_sizes))
        if self.validator is None:
            self.validator = cross_validate.cross_validator()
            self.validator.scoring_function = phamer.score_points
        self.validator.N = self.N_fold
        # enumerate() replaces the original's manual "i = 0 ... i += 1" counter.
        for i, cut_size in enumerate(self.cut_sizes):
            logger.info("Cross validating with cutsize: %d bp" % cut_size)
            # Each cut size maps to a pair of feature files distinguished by
            # basename prefix ("phage..." vs "bacteria..."). `f` avoids
            # shadowing the `file` builtin.
            phage_file = [
                f for f in self.cut_file_map[cut_size]
                if os.path.basename(f).startswith("phage")
            ][0]
            bacteria_file = [
                f for f in self.cut_file_map[cut_size]
                if os.path.basename(f).startswith("bacteria")
            ][0]
            self.validator.positive_data = fileIO.read_feature_file(
                phage_file, normalize=True, old=True)[1]
            self.validator.negative_data = fileIO.read_feature_file(
                bacteria_file, normalize=True, old=True)[1]
            phage_scores, bacteria_scores = self.validator.cross_validate()
            # Index 2 of predictor_performance's result is used as the ROC AUC.
            self.aucs[i] = learning.predictor_performance(
                phage_scores, bacteria_scores)[2]

        # Plot AUC against cut size (converted to kbp), anchoring the curve
        # at the origin.
        plt.figure(figsize=(9, 6))
        plt.plot(
            np.array([0] + self.cut_sizes) / 1000.0, [0] + list(self.aucs),
            'b-o')
        plt.grid(True)
        plt.xlabel('Cut Size (kbp)')
        plt.ylabel('ROC AUC')
        plt.savefig(self.get_output_plot_filename())
        return dict(zip(self.cut_sizes, self.aucs))
Пример #3
0
    def load_data(self):
        """
        Load reference (positive/negative) and input feature data onto this
        scorer object.

        Features are read from precomputed feature files when available,
        otherwise they are counted from the corresponding fasta files. If no
        input features or fasta file exist, the process exits with an error.
        :return: None. The scorer object is modified in place.
        """
        # Loading reference data.
        # NOTE(review): the original checked `self.positive_features` here but
        # read `self.positive_features_file`; the *_file attribute is used
        # consistently now, matching the negative branch below.
        if self.positive_features_file and os.path.exists(self.positive_features_file):
            logger.info("Reading positive features from: %s" % os.path.basename(self.positive_features_file))
            # NOTE(review): the original assigned to a module-global `scorer`
            # inside this instance method; `self` is used throughout instead.
            self.positive_ids, self.positive_data = fileIO.read_feature_file(self.positive_features_file, normalize=True)
        elif self.positive_fasta and os.path.exists(self.positive_fasta):
            logger.info("Counting positive k-mers from: %s" % os.path.basename(self.positive_fasta))
            self.positive_ids, self.positive_data = kmer.count(self.positive_fasta)

        if self.negative_features_file and os.path.exists(self.negative_features_file):
            logger.info("Reading negative features from: %s" % os.path.basename(self.negative_features_file))
            self.negative_ids, self.negative_data = fileIO.read_feature_file(self.negative_features_file, normalize=True)
        elif self.negative_fasta and os.path.exists(self.negative_fasta):
            logger.info("Counting negative k-mers from: %s" % os.path.basename(self.negative_fasta))
            self.negative_ids, self.negative_data = kmer.count(self.negative_fasta)

        self.find_input_files()
        # Loading input data
        if self.features_file is not None and os.path.exists(self.features_file):
            logger.info("Reading features from: %s..." % os.path.basename(self.features_file))
            self.data_ids, self.data_points = fileIO.read_feature_file(self.features_file)
        elif self.fasta_file is not None and os.path.exists(self.fasta_file):
            logger.info("Calculating features of: %s" % os.path.basename(self.fasta_file))
            self.data_ids, self.data_points = kmer.count_file(self.fasta_file, self.kmer_length, normalize=False)
            # Persist the freshly computed counts next to the input fasta.
            self.features_file = "{base}_features.csv".format(base=os.path.splitext(self.fasta_file)[0])
            logger.info("Saving features to {file}...".format(file=self.features_file))
            fileIO.save_counts(self.data_points, self.data_ids, self.features_file)
        else:
            logger.error("No input fasta file or features file. Exiting...")
            exit()

        # Input counts are loaded/saved raw; normalize them before scoring.
        self.data_points = kmer.normalize_counts(self.data_points)

        # NOTE(review): the original read the module-global `args` here; the
        # instance attribute is used instead — confirm `length_requirement`
        # is set on self wherever this object is constructed.
        if self.length_requirement:
            self.screen_by_length()
Пример #4
0
    elif args.verbose:
        logger.setLevel(logging.INFO)
        logging.basicConfig(
            format='[%(asctime)s][%(levelname)s][%(funcName)s] - %(message)s')
    else:
        logger.setLevel(logging.WARNING)
        logging.basicConfig(format='[log][%(levelname)s] - %(message)s')

    validator = cross_validator()
    validator.scoring_function = phamer.score_points
    validator.method = args.method
    validator.N = args.N_fold
    validator.method = args.method
    validator.output_directory = args.output_directory

    positive_ids, positive_data = fileIO.read_feature_file(
        args.positive_features_file)
    negative_ids, negative_data = fileIO.read_feature_file(
        args.negative_features_file)

    validator.positive_ids = positive_ids
    validator.negative_ids = negative_ids
    validator.positive_data = kmer.normalize_counts(positive_data)
    validator.negative_data = kmer.normalize_counts(negative_data)
    validator.equalize_reference = args.equalize_reference

    validator.cross_validate()

    logger.info("Plotting score distributions...")
    validator.plot_score_distributions()
    logger.info("Plotting ROC curve...")
    validator.plot_ROC()
Пример #5
0
    def load_data(self):
        """
        Load all the data necessary for plotting into memory, creating the
        output directory if needed.

        Features are read from a precomputed feature file when available,
        otherwise counted from the fasta file (k=4) and saved. t-SNE
        coordinates are likewise loaded from a cached file when present (and
        t-SNE was not explicitly requested), otherwise computed and saved.
        :return: None
        """

        # Create the output directory up front so later saves cannot fail on it.
        if self.output_directory and not os.path.isdir(self.output_directory):
            try:
                os.mkdir(self.output_directory)
            # NOTE(review): bare except also swallows KeyboardInterrupt and
            # SystemExit; catching OSError would be safer.
            except:
                logger.error("Could not create: %s" % self.output_directory)
                logger.error(
                    "Resolve this by creating this directory yourself and re-running"
                )
                exit(1)

        if self.features_file and os.path.exists(self.features_file):
            # Loading features from a precomputed file
            logger.info("Loading features from: %s ..." %
                        os.path.basename(self.features_file))
            self.id_list, self.features = fileIO.read_feature_file(
                self.features_file, normalize=True)
            logger.info("Loaded features.")
        elif self.fasta_file and os.path.exists(self.fasta_file):
            # Calculating Features: k-mer counts with hard-coded k=4
            logger.info("No feature file provided, calculating features...")
            self.id_list, self.features = kmer.count_file(self.fasta_file,
                                                          4,
                                                          normalize=True)
            self.features_outfile = self.get_kmers_out_filename()
            logger.info("Calculated features. Saving features to: %s" %
                        os.path.basename(self.features_outfile))
            # NOTE(review): `args` here is a module-level global, not an
            # attribute of self — confirm it is defined when this method runs.
            fileIO.save_counts(self.features,
                               self.id_list,
                               self.features_outfile,
                               args=args)
            logger.info("Saved features.")

        if not self.do_tsne and os.path.isfile(
                self.tsne_file) and os.path.isfile(self.features_file):
            # Loading t-SNE data cached by a previous run
            logger.info("Loading t-SNE data from: %s ... " %
                        os.path.basename(self.tsne_file))
            self.id_list, self.tsne_data, _ = fileIO.read_tsne_file(
                self.tsne_file)
            logger.info("Loaded t-SNE data.")
        else:
            # Doing t-SNE from scratch
            logger.info("Performing t-SNE...")
            if self.PCA_preprocess:
                # Reduce dimensionality with PCA first; presumably this is to
                # speed up t-SNE on high-dimensional features.
                logger.info("Pre-processing with PCA...")
                pca_data = PCA(
                    n_components=self.pca_preprocess_red).fit_transform(
                        self.features)
                self.tsne_data = TSNE(
                    perplexity=self.perplexity,
                    verbose=True,
                    random_state=self.tsne_seed,
                    init=self.tsne_init,
                    early_exaggeration=self.early_exaggeration,
                    learning_rate=self.tsne_learning_rate).fit_transform(
                        pca_data)
            else:
                # Same t-SNE configuration, applied directly to the raw features.
                self.tsne_data = TSNE(
                    perplexity=self.perplexity,
                    verbose=True,
                    random_state=self.tsne_seed,
                    init=self.tsne_init,
                    early_exaggeration=self.early_exaggeration,
                    learning_rate=self.tsne_learning_rate).fit_transform(
                        self.features)
            logger.info("t-SNE complete.")
            # Cache the embedding so subsequent runs can skip the computation.
            self.tsne_file = self.get_tsne_filename()
            fileIO.save_tsne_data(self.tsne_file, self.tsne_data, self.id_list)
            logger.info("Saved t-SNE to: %s" %
                        os.path.basename(self.tsne_file))

        logger.info("Loading lineages from: %s ..." %
                    os.path.basename(self.lineage_file))
        self.lineages = self.get_lineages()
Пример #6
0
import argparse

if __name__ == '__main__':

    # Minimal CLI: a required input features file and an optional output plot.
    parser = argparse.ArgumentParser()
    parser.add_argument("-in",
                        "--features_file",
                        required=True,
                        help="Features file")
    parser.add_argument("-out",
                        "--output_file",
                        required=False,
                        help="Filename for the output plot")
    args = parser.parse_args()

    # fileIO, np, and learning are provided elsewhere in this file;
    # xrange indicates this script targets Python 2.
    ids, data = fileIO.read_feature_file(args.features_file, normalize=True)

    # Candidate cluster counts: 10, 20, ..., 590. The sorted/set round-trip is
    # redundant for an arange but kept byte-identical here.
    k_clusters = np.arange(10, 600, 10)
    k_clusters = np.array(sorted(list(set(k_clusters))))
    sil_scores = np.zeros(k_clusters.shape)
    sil_score_std = np.zeros(k_clusters.shape)

    # Number of k-means restarts per cluster count.
    num_repeats = 5

    # For each k, repeat k-means and collect silhouette scores.
    # NOTE(review): the loop body is truncated at the end of this chunk.
    for i in xrange(k_clusters.shape[0]):
        k = k_clusters[i]

        means = np.zeros(num_repeats)
        for j in xrange(num_repeats):
            sils = learning.silhouette_score(
                data, learning.kmeans(data, k, verbose=True))