示例#1
0
    def __init__(self, args, run=run, progress=progress):
        """Read options from `args` and load gene coverage/detection data.

        Options are looked up in ``args.__dict__``; any option that is not
        present is stored as None. After ``self.sanity_check()`` the data
        source is chosen as follows:

        * no ``profile_db`` option: ``self.get_data_from_txt_file()`` is called;
        * ``profile_db`` together with ``collection_name``: a
          ``summarizer.ProfileSummarizer`` is created and initialised;
        * ``profile_db`` alone: a ``ProfileSuperclass`` is created and its
          gene coverage and detection dicts are copied onto this instance.

        Note that ``args.init_gene_coverages`` is set to True whenever a
        profile database path was given.
        """
        self.run = run
        self.progress = progress

        def option(name):
            # Value of `name` on args, or None when args has no such option.
            return args.__dict__.get(name)

        # Input / output locations.
        self.gene_coverages_data_file_path = option('data_file')
        self.gene_detection_data_file_path = option('gene_detection_data_file')
        self.profile_db_path = option('profile_db')
        self.output_file_prefix = option('output_file_prefix')

        # Numeric parameters.
        self.alpha = option('alpha')
        self.beta = option('beta')
        self.gamma = option('gamma')
        self.eta = option('eta')
        self.zeta = option('zeta')

        # Optional extras and bin / collection selection.
        self.additional_layers_to_append = option('additional_layers_to_append')
        self.samples_information_to_append = option('samples_information_to_append')
        self.number_of_positive_samples = None
        self.collection_name = option('collection_name')
        self.bin_id = option('bin_id')
        self.bin_ids_file_path = option('bin_ids_file')
        self.store_gene_detections_and_gene_coverages_tables = option(
            'store_gene_detections_and_gene_coverages_tables')

        # Containers that start out empty.
        self.gene_coverages = {}
        self.gene_detection = {}
        self.samples = {}
        self.positive_samples = {}
        self.negative_samples = {}
        self.gene_class_information = {}
        self.samples_information = {}
        self.profile_db = {}

        self.sanity_check()

        # Without a profile database the data comes from the text file(s).
        if self.profile_db_path is None:
            self.get_data_from_txt_file()
            return

        # From here on the merged profile database is the data source.
        args.init_gene_coverages = True

        if self.collection_name:
            self.summary = summarizer.ProfileSummarizer(args)
            self.summary.init()
            return

        self.profile_db = ProfileSuperclass(args)
        self.profile_db.init_gene_coverages_and_detection_dicts()
        self.gene_coverages = self.profile_db.gene_coverages_dict
        self.gene_detection = self.profile_db.gene_detection_dict

        # The sample names are the keys of the first gene's coverage entry.
        first_gene_entry = next(iter(self.gene_coverages.values()))
        self.samples = set(first_gene_entry.keys())
示例#2
0
    def __init__(self, args, run=run, progress=progress):
        """Read options from `args`, parse sample filters, and load coverage data.

        Options are looked up in ``args.__dict__``; any option that is not
        present is stored as None.

        If ``exclude_samples`` / ``include_samples`` are given, each is treated
        as the path of a text file whose first TAB-separated field on every
        line is a sample name. The names are stored as sets in
        ``self.samples_to_exclude`` / ``self.samples_to_include`` (empty sets
        when the option is not given).

        After ``self.sanity_check()``, nothing more is loaded when no
        ``profile_db`` option was given. Otherwise
        ``args.init_gene_coverages`` is set to True and either a
        ``summarizer.ProfileSummarizer`` (when ``collection_name`` is set) or a
        ``ProfileSuperclass`` is created and used to initialise the samples
        and, in the latter case, the coverage/detection data and total length.

        Raises:
            ConfigError: if a sample-list file was given but yields no names.
        """
        self.run = run
        self.progress = progress

        def A(key):
            # Value of `key` on args, or None when args has no such option.
            return args.__dict__.get(key)

        def read_sample_names(path):
            # First TAB-separated field of every line, stripped of whitespace.
            # The `with` block closes the handle; default text mode already
            # applies universal newlines, and the old 'rU' mode flag is
            # rejected by Python 3.11+.
            with open(path) as sample_file:
                return {line.split('\t')[0].strip() for line in sample_file}

        self.gene_coverages_data_file_path = A('data_file')
        self.gene_detections_data_file_path = A('gene_detection_data_file')
        self.profile_db_path = A('profile_db')
        self.output_file_prefix = A('output_file_prefix')
        self.alpha = A('alpha')
        self.additional_layers_to_append = A('additional_layers_to_append')
        self.samples_information_to_append = A('samples_information_to_append')
        self.collection_name = A('collection_name')
        self.bin_id = A('bin_id')
        self.bin_ids_file_path = A('bin_ids_file')
        self.exclude_samples = A('exclude_samples')
        self.include_samples = A('include_samples')
        self.profile_db = {}
        self.coverage_values_per_nt = {}
        # NOTE(review): assuming `pd` is pandas, `pd.DataFrame.empty` is the
        # class-level property object, not an empty DataFrame. It is kept as the
        # "not loaded yet" placeholder because code outside this method may
        # depend on it -- confirm before switching to `pd.DataFrame()`.
        self.gene_coverages = pd.DataFrame.empty
        self.gene_detections = pd.DataFrame.empty
        self.samples = {}
        self.sample_detection_information_was_initiated = False
        self.positive_samples = []
        self.number_of_positive_samples = None
        self.negative_samples = pd.DataFrame.empty
        self.number_of_negative_samples = None
        self.gene_class_information = pd.DataFrame.empty
        self.samples_detection_information = pd.DataFrame.empty
        self.gene_presence_absence_in_samples = pd.DataFrame.empty
        self.gene_coverages_filtered = pd.DataFrame.empty
        self.additional_description = ''
        self.total_length = None
        self.samples_coverage_stats_dicts_was_initiated = False
        self.samples_coverage_stats_dicts = pd.DataFrame.empty
        self.non_outlier_indices = {}

        if self.exclude_samples:
            # check that there is a file like this
            filesnpaths.is_file_exists(self.exclude_samples)
            self.samples_to_exclude = read_sample_names(self.exclude_samples)

            if not self.samples_to_exclude:
                raise ConfigError(
                    "You asked to exclude samples, but provided an empty list."
                )

            run.info(
                'Excluding Samples',
                'The following samples will be excluded: %s' %
                self.samples_to_exclude,
            )
        else:
            self.samples_to_exclude = set()

        if self.include_samples:
            # check that there is a file like this
            filesnpaths.is_file_exists(self.include_samples)
            self.samples_to_include = read_sample_names(self.include_samples)

            if not self.samples_to_include:
                raise ConfigError(
                    "You provided an empty list of samples to include.")

            run.info(
                'Including Samples',
                'The following samples will be included: %s' %
                self.samples_to_include,
            )
        else:
            self.samples_to_include = set()

        # run sanity check on all input arguments
        self.sanity_check()

        if self.profile_db_path is None:
            # TODO: this will probably be removed because we don't save the coverage information in nucleotide level.
            pass
        else:
            # load sample list and gene_coverage_dict from the merged profile db
            args.init_gene_coverages = True
            if self.collection_name:
                self.summary = summarizer.ProfileSummarizer(args)
                self.summary.init()
                self.init_samples(self.summary.p_meta['samples'])
            else:
                self.profile_db = ProfileSuperclass(args)
                self.init_samples(self.profile_db.p_meta['samples'])
                self.profile_db.init_split_coverage_values_per_nt_dict()
                self.profile_db.init_gene_level_coverage_stats_dicts()
                self.coverage_values_per_nt = get_coverage_values_per_nucleotide(
                    self.profile_db.split_coverage_values_per_nt_dict,
                    self.samples)

                # comply with the new design and get the gene_coverages and
                # gene_detection dicts from gene_level_coverage_stats_dict.
                gene_coverages, gene_detection = (
                    self.get_gene_coverages_and_gene_detection_dicts())

                self.init_coverage_and_detection_dataframes(
                    gene_coverages, gene_detection)

                # getting the total length of all contigs
                self.total_length = self.profile_db.p_meta['total_length']
示例#3
0
    def __init__(self, args, run=run, progress=progress):
        """Read options from `args`, parse the exclusion list, and load gene data.

        Options are looked up in ``args.__dict__``; any option that is not
        present is stored as None.

        If ``exclude_samples`` is given it is treated as the path of a text
        file whose first TAB-separated field on every line is a sample name;
        the names are stored as a set in ``self.samples_to_exclude`` (an empty
        set when the option is not given).

        After ``self.sanity_check()`` the data source is chosen as follows:

        * no ``profile_db`` option: ``self.get_data_from_txt_file()`` is called;
        * ``profile_db`` together with ``collection_name``: a
          ``summarizer.ProfileSummarizer`` is created and initialised;
        * ``profile_db`` alone: a ``ProfileSuperclass`` is created, its gene
          coverage and detection dicts are turned into float DataFrames
          (one row per dict key), the excluded samples are dropped as columns,
          and ``self.Ng`` / ``self.samples`` are derived from the coverage
          frame.

        ``args.init_gene_coverages`` is set to True whenever a profile
        database path was given.
        """
        self.run = run
        self.progress = progress

        def A(key):
            # Value of `key` on args, or None when args has no such option.
            return args.__dict__.get(key)

        self.gene_coverages_data_file_path = A('data_file')
        self.gene_detections_data_file_path = A('gene_detection_data_file')
        self.profile_db_path = A('profile_db')
        self.output_file_prefix = A('output_file_prefix')
        self.alpha = A('alpha')
        self.beta = A('beta')
        self.gamma = A('gamma')
        self.eta = A('eta')
        self.zeta = A('zeta')
        self.additional_layers_to_append = A('additional_layers_to_append')
        self.samples_information_to_append = A('samples_information_to_append')
        self.collection_name = A('collection_name')
        self.bin_id = A('bin_id')
        self.bin_ids_file_path = A('bin_ids_file')
        self.store_gene_detections_and_gene_coverages_tables = A(
            'store_gene_detections_and_gene_coverages_tables')
        self.exclude_samples = A('exclude_samples')
        # NOTE(review): assuming `pd` is pandas, `pd.DataFrame.empty` is the
        # class-level property object, not an empty DataFrame. It is kept as the
        # "not loaded yet" placeholder because code outside this method may
        # depend on it -- confirm before switching to `pd.DataFrame()`.
        self.gene_coverages = pd.DataFrame.empty
        self.gene_detections = pd.DataFrame.empty
        self.samples = {}
        self.positive_samples = pd.DataFrame.empty
        self.number_of_positive_samples = None
        self.negative_samples = pd.DataFrame.empty
        self.number_of_negative_samples = None
        self.gene_class_information = pd.DataFrame.empty
        self.samples_information = pd.DataFrame.empty
        self.profile_db = {}
        self.gene_presence_absence_in_samples = pd.DataFrame.empty
        self.gene_coverages_filtered = pd.DataFrame.empty

        if self.exclude_samples:
            # check that there is a file like this
            filesnpaths.is_file_exists(self.exclude_samples)
            # Read from the same path that was just validated. The `with` block
            # closes the handle; default text mode already applies universal
            # newlines, and the old 'rU' mode flag is rejected by Python 3.11+.
            with open(self.exclude_samples) as exclude_file:
                self.samples_to_exclude = {
                    line.split('\t')[0].strip() for line in exclude_file
                }
            run.info(
                'Excluding Samples',
                'The following samples will be excluded: %s' %
                self.samples_to_exclude,
            )
        else:
            self.samples_to_exclude = set()

        self.sanity_check()
        if self.profile_db_path is None:
            self.get_data_from_txt_file()
        else:
            # load sample list and gene_coverage_dict from the merged profile db
            args.init_gene_coverages = True
            if self.collection_name:
                self.summary = summarizer.ProfileSummarizer(args)
                self.summary.init()
            else:
                self.profile_db = ProfileSuperclass(args)
                self.profile_db.init_gene_coverages_and_detection_dicts()

                # Rows come from the dict keys (orient='index'); the excluded
                # samples are removed as columns.
                self.gene_coverages = pd.DataFrame.from_dict(
                    self.profile_db.gene_coverages_dict,
                    orient='index',
                    dtype=float)
                self.gene_coverages.drop(self.samples_to_exclude,
                                         axis=1,
                                         inplace=True)
                # Number of rows in the coverage table.
                self.Ng = len(self.gene_coverages.index)

                self.gene_detections = pd.DataFrame.from_dict(
                    self.profile_db.gene_detection_dict,
                    orient='index',
                    dtype=float)
                self.gene_detections.drop(self.samples_to_exclude,
                                          axis=1,
                                          inplace=True)
                # Remaining column labels are the samples that were kept.
                self.samples = set(self.gene_coverages.columns)