def initialize_pool_sequence_mappings(self, mapq_cutoff=30):
    """Populate ``self.pool_sequence_mappings``, from cache when allowed.

    If the ``force_recount`` property is unset and
    ``self.lib_settings.sequence_counts_exist()`` is true, the mappings are
    loaded with ``bzUtils.unPickle`` from
    ``self.lib_settings.get_sequence_counts()``.

    Otherwise one ``pool_sequence_mapping`` is created per sequence in the
    trimmed pool FASTA, every read in the mapped-reads BAM file is handed
    to a ``read_assigner``, ``self.compute_lib_fractions()`` is called,
    each mapping's ``is_only_tl`` flag is set, and the result is pickled
    to ``self.lib_settings.get_sequence_counts()``.

    Args:
        mapq_cutoff: forwarded unchanged to ``read_assigner``
            (presumably a minimum mapping quality -- confirm against
            ``read_assigner``).
    """
    if (not self.get_property('force_recount')
            and self.lib_settings.sequence_counts_exist()):
        self.pool_sequence_mappings = bzUtils.unPickle(
            self.lib_settings.get_sequence_counts())
        return

    trimmed_sequences = bzUtils.convertFastaToDict(
        self.experiment_settings.get_trimmed_pool_fasta())
    gene_counts = Counter()
    for sequence_name in trimmed_sequences:
        # TL names are assumed to be of type: YLR350W_-68_651_116
        gene_counts[sequence_name.split('_')[0]] += 1
        self.pool_sequence_mappings[sequence_name] = pool_sequence_mapping(
            sequence_name, trimmed_sequences[sequence_name])

    samfile = pysam.Samfile(self.lib_settings.get_mapped_reads(), "rb")
    try:
        ra = read_assigner(self.pool_sequence_mappings, samfile, mapq_cutoff)
        for aligned_read in samfile.fetch():
            ra.assign_read(aligned_read)
    finally:
        # Release the BAM handle even if read assignment fails part-way.
        samfile.close()
    self.compute_lib_fractions()

    for mapping_name, mapping in self.pool_sequence_mappings.items():
        n_for_gene = gene_counts[mapping_name.split('_')[0]]
        # A zero count means this mapping did not come from the FASTA
        # that was just read (e.g. a stale entry from an earlier run).
        assert n_for_gene != 0, (
            'no pool sequence found for mapping %s' % mapping_name)
        # True when this is the only pool sequence for its gene.
        mapping.is_only_tl = (n_for_gene == 1)

    bzUtils.makePickle(self.pool_sequence_mappings,
                       self.lib_settings.get_sequence_counts())
def get_collapsed_read_fractions(self, lib_settings):
    """Return ``(sample_name, read_fractions)`` for one library.

    ``read_fractions`` is a numpy array holding, for every sequence in the
    library's gzipped collapsed-reads file, its read count divided by the
    total read count. The array is cached as a pickle under
    ``<rdir>/QC/collapsed_fracs/``; the cache is used when it exists,
    unless the experiment's ``force_recollapse`` property is set, in which
    case the fractions are always recomputed and the cache rewritten.

    Args:
        lib_settings: library settings object providing ``sample_name``
            and ``get_collapsed_reads()``.

    Returns:
        Tuple ``(lib_settings.sample_name, read_fractions)``.
    """
    out_name = os.path.join(
        self.experiment_settings.get_rdir(), 'QC', 'collapsed_fracs',
        '%(sample_name)s.collapsed_read_fractions.pkl'
        % {'sample_name': lib_settings.sample_name})

    force = self.experiment_settings.get_property('force_recollapse')
    if not force and tps_utils.file_exists(out_name):
        return (lib_settings.sample_name, bzUtils.unPickle(out_name))

    read_counts = []
    # gzip.open defaults to binary mode, so lines are bytes on Python 3;
    # bytes literals keep the parsing valid on both Python 2 and 3.
    with gzip.open(lib_settings.get_collapsed_reads()) as f:
        for line in f:
            # '>' marks the start of a new sequence; blank lines, '#'
            # comments and sequence lines carry no count and are skipped.
            if line.startswith(b'>'):
                # The read count is the second '-'-separated header field.
                read_counts.append(int(line[1:].strip().split(b'-')[1]))

    read_fractions = np.array(read_counts) / float(sum(read_counts))
    bzUtils.makePickle(read_fractions, out_name)
    return (lib_settings.sample_name, read_fractions)
def get_collapsed_read_fractions(self, lib_settings):
    """Return ``(sample_name, read_fractions)`` for one library.

    ``read_fractions`` is a numpy array holding, for every sequence in the
    library's gzipped collapsed-reads file, its read count divided by the
    total read count. The array is cached as a pickle under
    ``<rdir>/QC/collapsed_fracs/``; the cache is used when it exists,
    unless the experiment's ``force_recollapse`` property is set, in which
    case the fractions are always recomputed and the cache rewritten.

    Args:
        lib_settings: library settings object providing ``sample_name``
            and ``get_collapsed_reads()``.

    Returns:
        Tuple ``(lib_settings.sample_name, read_fractions)``.
    """
    out_name = os.path.join(
        self.experiment_settings.get_rdir(),
        "QC",
        "collapsed_fracs",
        "%(sample_name)s.collapsed_read_fractions.pkl"
        % {"sample_name": lib_settings.sample_name},
    )

    force = self.experiment_settings.get_property("force_recollapse")
    if not force and tps_utils.file_exists(out_name):
        return (lib_settings.sample_name, bzUtils.unPickle(out_name))

    read_counts = []
    # gzip.open defaults to binary mode, so lines are bytes on Python 3;
    # bytes literals keep the parsing valid on both Python 2 and 3.
    with gzip.open(lib_settings.get_collapsed_reads()) as f:
        for line in f:
            # ">" marks the start of a new sequence; blank lines, "#"
            # comments and sequence lines carry no count and are skipped.
            if line.startswith(b">"):
                # The read count is the second "-"-separated header field.
                read_counts.append(int(line[1:].strip().split(b"-")[1]))

    read_fractions = np.array(read_counts) / float(sum(read_counts))
    bzUtils.makePickle(read_fractions, out_name)
    return (lib_settings.sample_name, read_fractions)
def initialize_pool_sequence_mappings(self, mapq_cutoff=30):
    """Populate ``self.pool_sequence_mappings``, from cache when allowed.

    If the ``force_recount`` property is unset and
    ``self.lib_settings.sequence_counts_exist()`` is true, the mappings are
    loaded with ``bzUtils.unPickle`` from
    ``self.lib_settings.get_sequence_counts()``.

    Otherwise one ``pool_sequence_mapping`` is created per sequence in the
    trimmed pool FASTA, every read in the mapped-reads BAM file is handed
    to a ``read_assigner``, ``self.compute_lib_fractions()`` is called,
    each mapping's ``is_only_tl`` flag is set, and the result is pickled
    to ``self.lib_settings.get_sequence_counts()``.

    Args:
        mapq_cutoff: forwarded unchanged to ``read_assigner``
            (presumably a minimum mapping quality -- confirm against
            ``read_assigner``).
    """
    if (not self.get_property('force_recount')
            and self.lib_settings.sequence_counts_exist()):
        self.pool_sequence_mappings = bzUtils.unPickle(
            self.lib_settings.get_sequence_counts())
        return

    trimmed_sequences = bzUtils.convertFastaToDict(
        self.experiment_settings.get_trimmed_pool_fasta())
    gene_counts = Counter()
    for sequence_name in trimmed_sequences:
        # TL names are assumed to be of type: YLR350W_-68_651_116
        gene_counts[sequence_name.split('_')[0]] += 1
        self.pool_sequence_mappings[sequence_name] = pool_sequence_mapping(
            sequence_name, trimmed_sequences[sequence_name])

    samfile = pysam.Samfile(self.lib_settings.get_mapped_reads(), "rb")
    try:
        ra = read_assigner(self.pool_sequence_mappings, samfile, mapq_cutoff)
        for aligned_read in samfile.fetch():
            ra.assign_read(aligned_read)
    finally:
        # Release the BAM handle even if read assignment fails part-way.
        samfile.close()
    self.compute_lib_fractions()

    for mapping_name, mapping in self.pool_sequence_mappings.items():
        n_for_gene = gene_counts[mapping_name.split('_')[0]]
        # A zero count means this mapping did not come from the FASTA
        # that was just read (e.g. a stale entry from an earlier run).
        assert n_for_gene != 0, (
            'no pool sequence found for mapping %s' % mapping_name)
        # True when this is the only pool sequence for its gene.
        mapping.is_only_tl = (n_for_gene == 1)

    bzUtils.makePickle(self.pool_sequence_mappings,
                       self.lib_settings.get_sequence_counts())