def __init__(self, args, run=run, progress=progress):
    """Store the options found in `args`, reset the state, and load the data.

    Options are looked up by name in `args.__dict__`; an option that is not
    present there is stored as None. After `self.sanity_check()` has run, the
    data comes from `self.get_data_from_txt_file()` when no profile database
    path was given, and from the profile database otherwise.

    Args:
        args: object whose `__dict__` holds the options (it is also mutated:
            `init_gene_coverages` is set to True on it when a profile
            database path is given).
        run: stored as `self.run`.
        progress: stored as `self.progress`.
    """
    self.run = run
    self.progress = progress

    def option(name):
        # options that were never set on `args` resolve to None
        return args.__dict__.get(name)

    # input / output locations
    self.gene_coverages_data_file_path = option('data_file')
    self.gene_detection_data_file_path = option('gene_detection_data_file')
    self.profile_db_path = option('profile_db')
    self.output_file_prefix = option('output_file_prefix')

    # numeric parameters
    self.alpha = option('alpha')
    self.beta = option('beta')
    self.gamma = option('gamma')
    self.eta = option('eta')
    self.zeta = option('zeta')

    # remaining options
    self.additional_layers_to_append = option('additional_layers_to_append')
    self.samples_information_to_append = option('samples_information_to_append')
    self.collection_name = option('collection_name')
    self.bin_id = option('bin_id')
    self.bin_ids_file_path = option('bin_ids_file')
    self.store_gene_detections_and_gene_coverages_tables = option(
        'store_gene_detections_and_gene_coverages_tables')

    # state that gets filled in later
    self.number_of_positive_samples = None
    self.gene_coverages = {}
    self.gene_detection = {}
    self.samples = {}
    self.positive_samples = {}
    self.negative_samples = {}
    self.gene_class_information = {}
    self.samples_information = {}
    self.profile_db = {}

    self.sanity_check()

    if self.profile_db_path is None:
        # no profile database was given, so the text file loader takes over
        self.get_data_from_txt_file()
        return

    # load sample list and gene_coverage_dict from the merged profile db
    args.init_gene_coverages = True

    if self.collection_name:
        self.summary = summarizer.ProfileSummarizer(args)
        self.summary.init()
        return

    self.profile_db = ProfileSuperclass(args)
    self.profile_db.init_gene_coverages_and_detection_dicts()
    self.gene_coverages = self.profile_db.gene_coverages_dict
    self.gene_detection = self.profile_db.gene_detection_dict

    # the sample names are the keys of the first entry in gene_coverages
    first_entry = next(iter(self.gene_coverages.values()))
    self.samples = set(first_entry.keys())
def __init__(self, args, run=run, progress=progress):
    """Store the options found in `args`, reset the state, and load the data.

    Options are looked up by name in `args.__dict__`; an option that is not
    present there is stored as None. When `exclude_samples` and/or
    `include_samples` are given, each is treated as the path of a
    TAB-delimited file and the first column of every non-blank line is taken
    as a sample name.

    When a profile database path is given, `init_gene_coverages` is set to
    True on the caller's `args` object before it is handed to
    `summarizer.ProfileSummarizer` (if a collection name was given) or to
    `ProfileSuperclass` (otherwise).

    Args:
        args: object whose `__dict__` holds the options.
        run: stored as `self.run`; its `info()` method is used to report the
            samples that will be excluded / included.
        progress: stored as `self.progress`.

    Raises:
        ConfigError: if a sample file was given but holds no sample names.
    """
    self.run = run
    self.progress = progress

    def A(x):
        # options that were never set on `args` resolve to None
        return args.__dict__.get(x)

    def read_sample_names(file_path):
        # First column of every line, with blank entries skipped so that a
        # file holding nothing but empty lines is recognised as empty.
        # The default text mode already translates newlines; the old 'rU'
        # flag is rejected by Python 3.11+.
        filesnpaths.is_file_exists(file_path)
        with open(file_path) as sample_names_file:
            first_columns = (line.split('\t')[0].strip() for line in sample_names_file)
            return {name for name in first_columns if name}

    self.gene_coverages_data_file_path = A('data_file')
    self.gene_detections_data_file_path = A('gene_detection_data_file')
    self.profile_db_path = A('profile_db')
    self.output_file_prefix = A('output_file_prefix')
    self.alpha = A('alpha')
    self.additional_layers_to_append = A('additional_layers_to_append')
    self.samples_information_to_append = A('samples_information_to_append')
    self.collection_name = A('collection_name')
    self.bin_id = A('bin_id')
    self.bin_ids_file_path = A('bin_ids_file')
    self.exclude_samples = A('exclude_samples')
    self.include_samples = A('include_samples')

    # NOTE(review): `pd.DataFrame.empty` is the attribute looked up on the
    # class, not an empty DataFrame instance; presumably it only serves as a
    # "not initialized yet" placeholder -- confirm before changing it.
    self.profile_db = {}
    self.coverage_values_per_nt = {}
    self.gene_coverages = pd.DataFrame.empty
    self.gene_detections = pd.DataFrame.empty
    self.samples = {}
    self.sample_detection_information_was_initiated = False
    self.positive_samples = []
    self.number_of_positive_samples = None
    self.negative_samples = pd.DataFrame.empty
    self.number_of_negative_samples = None
    self.gene_class_information = pd.DataFrame.empty
    self.samples_detection_information = pd.DataFrame.empty
    self.gene_presence_absence_in_samples = pd.DataFrame.empty
    self.gene_coverages_filtered = pd.DataFrame.empty
    self.additional_description = ''
    self.total_length = None
    self.samples_coverage_stats_dicts_was_initiated = False
    self.samples_coverage_stats_dicts = pd.DataFrame.empty
    self.non_outlier_indices = {}

    if self.exclude_samples:
        self.samples_to_exclude = read_sample_names(self.exclude_samples)

        if not self.samples_to_exclude:
            raise ConfigError(
                "You asked to exclude samples, but provided an empty list.")

        run.info(
            'Excluding Samples',
            'The following samples will be excluded: %s' % self.samples_to_exclude,
        )
    else:
        self.samples_to_exclude = set()

    if self.include_samples:
        self.samples_to_include = read_sample_names(self.include_samples)

        if not self.samples_to_include:
            raise ConfigError(
                "You provided an empty list of samples to include.")

        run.info(
            'Including Samples',
            'The following samples will be included: %s' % self.samples_to_include,
        )
    else:
        self.samples_to_include = set()

    # run sanity check on all input arguments
    self.sanity_check()

    if self.profile_db_path is None:
        # TODO: this will probably be removed because we don't save the coverage information in nucleotide level.
        pass
    else:
        # load sample list and gene_coverage_dict from the merged profile db
        args.init_gene_coverages = True

        if self.collection_name:
            self.summary = summarizer.ProfileSummarizer(args)
            self.summary.init()
            self.init_samples(self.summary.p_meta['samples'])
        else:
            # everything below relies on self.profile_db being a
            # ProfileSuperclass, which is only the case in this branch
            self.profile_db = ProfileSuperclass(args)
            self.init_samples(self.profile_db.p_meta['samples'])

            self.profile_db.init_split_coverage_values_per_nt_dict()
            self.profile_db.init_gene_level_coverage_stats_dicts()
            self.coverage_values_per_nt = get_coverage_values_per_nucleotide(
                self.profile_db.split_coverage_values_per_nt_dict,
                self.samples)

            # comply with the new design and get gene_coverages and gene_detection dicts from
            # gene_level_coverage_stats_dict.
            gene_coverages, gene_detection = self.get_gene_coverages_and_gene_detection_dicts()
            self.init_coverage_and_detection_dataframes(gene_coverages, gene_detection)

            # getting the total length of all contigs
            self.total_length = self.profile_db.p_meta['total_length']
def __init__(self, args, run=run, progress=progress):
    """Store the options found in `args`, reset the state, and load the data.

    Options are looked up by name in `args.__dict__`; an option that is not
    present there is stored as None. When `exclude_samples` is given it is
    treated as the path of a TAB-delimited file, and the first column of
    every non-blank line is taken as the name of a sample to drop from the
    coverage and detection tables.

    After `self.sanity_check()` has run, the data comes from
    `self.get_data_from_txt_file()` when no profile database path was given,
    and from the profile database otherwise. In the latter case
    `init_gene_coverages` is set to True on the caller's `args` object.

    Args:
        args: object whose `__dict__` holds the options.
        run: stored as `self.run`; its `info()` method is used to report the
            samples that will be excluded.
        progress: stored as `self.progress`.
    """
    self.run = run
    self.progress = progress

    def A(x):
        # options that were never set on `args` resolve to None
        return args.__dict__.get(x)

    self.gene_coverages_data_file_path = A('data_file')
    self.gene_detections_data_file_path = A('gene_detection_data_file')
    self.profile_db_path = A('profile_db')
    self.output_file_prefix = A('output_file_prefix')
    self.alpha = A('alpha')
    self.beta = A('beta')
    self.gamma = A('gamma')
    self.eta = A('eta')
    self.zeta = A('zeta')
    self.additional_layers_to_append = A('additional_layers_to_append')
    self.samples_information_to_append = A('samples_information_to_append')
    self.collection_name = A('collection_name')
    self.bin_id = A('bin_id')
    self.bin_ids_file_path = A('bin_ids_file')
    self.store_gene_detections_and_gene_coverages_tables = A(
        'store_gene_detections_and_gene_coverages_tables')
    self.exclude_samples = A('exclude_samples')

    # NOTE(review): `pd.DataFrame.empty` is the attribute looked up on the
    # class, not an empty DataFrame instance; presumably it only serves as a
    # "not initialized yet" placeholder -- confirm before changing it.
    self.gene_coverages = pd.DataFrame.empty
    self.gene_detections = pd.DataFrame.empty
    self.samples = {}
    self.positive_samples = pd.DataFrame.empty
    self.number_of_positive_samples = None
    self.negative_samples = pd.DataFrame.empty
    self.number_of_negative_samples = None
    self.gene_class_information = pd.DataFrame.empty
    self.samples_information = pd.DataFrame.empty
    self.profile_db = {}
    self.gene_presence_absence_in_samples = pd.DataFrame.empty
    self.gene_coverages_filtered = pd.DataFrame.empty

    if self.exclude_samples:
        # check that there is a file like this
        filesnpaths.is_file_exists(self.exclude_samples)

        # Read the same path that was just checked. The default text mode
        # already translates newlines; the old 'rU' flag is rejected by
        # Python 3.11+. Blank entries are skipped because an empty name can
        # never match a column of the tables that are filtered below.
        with open(self.exclude_samples) as exclude_file:
            first_columns = (line.split('\t')[0].strip() for line in exclude_file)
            self.samples_to_exclude = {name for name in first_columns if name}

        run.info(
            'Excluding Samples',
            'The following samples will be excluded: %s' % self.samples_to_exclude,
        )
    else:
        self.samples_to_exclude = set()

    self.sanity_check()

    if self.profile_db_path is None:
        self.get_data_from_txt_file()
    else:
        # load sample list and gene_coverage_dict from the merged profile db
        args.init_gene_coverages = True

        if self.collection_name:
            self.summary = summarizer.ProfileSummarizer(args)
            self.summary.init()
        else:
            # everything below relies on self.profile_db being a
            # ProfileSuperclass, which is only the case in this branch
            self.profile_db = ProfileSuperclass(args)
            self.profile_db.init_gene_coverages_and_detection_dicts()

            # outer dictionary keys become the row index of each table
            self.gene_coverages = pd.DataFrame.from_dict(
                self.profile_db.gene_coverages_dict,
                orient='index',
                dtype=float)
            self.gene_coverages.drop(self.samples_to_exclude, axis=1, inplace=True)
            self.Ng = len(self.gene_coverages.index)

            self.gene_detections = pd.DataFrame.from_dict(
                self.profile_db.gene_detection_dict,
                orient='index',
                dtype=float)
            self.gene_detections.drop(self.samples_to_exclude, axis=1, inplace=True)

            # the remaining columns are the samples that are kept
            self.samples = set(self.gene_coverages.columns)