# NOTE: these methods assume the module-level aliased imports used throughout
# baga, e.g. os as _os, json as _json, tarfile as _tarfile, random as _random,
# array.array as _array and a StringIO class as _StringIO (the exact form in
# the original module may differ).

def __init__(self, reads=False, genome=False, baga=False):
    '''
    Initialise with:
    a baga.PrepareReads.Reads object and a baga.CollectData.Genome object,
    OR
    a path to a previously saved baga.AlignReads.SAMs object (like this one).
    '''
    if (reads and genome) and not baga:
        try:
            self.read_files = reads.trimmed_read_files
        except AttributeError:
            text = ('WARNING: baga was not used to quality-score trim these '
                    'reads. Read trimming is recommended for most types of '
                    'analysis. This can be achieved with the "trim()" method '
                    'of the Reads class in the PrepareReads module.')
            print(text)
            try:
                self.read_files = reads.adaptorcut_read_files
            except AttributeError:
                text = ('WARNING: baga was not used to remove library '
                        'preparation adaptor sequences from these reads. '
                        'Adaptor removal is highly recommended, so hopefully '
                        'you already removed adaptor sequences by other '
                        'means! This can be achieved with the '
                        '"cutAdaptors()" method of the Reads class in the '
                        'PrepareReads module.')
                self.read_files = reads.read_files
                print(text)
                print('continuing with these reads . . .')
        # currently baga CollectData includes the path to the reads in the
        # pairname keys that map to each read file pair: check for and
        # remove it here. Iterate over a copy so the dict can be mutated
        # safely inside the loop.
        for pairname, files in list(self.read_files.items()):
            if _os.path.sep in pairname:
                self.read_files[pairname.split(_os.path.sep)[-1]] = files
                del self.read_files[pairname]
        self.genome_sequence = genome.sequence
        self.genome_id = genome.id
    elif baga and not (reads and genome):
        # for reloading a previous instantiation
        with _tarfile.open(baga, "r:gz") as tar:
            for member in tar:
                contents = _StringIO(tar.extractfile(member).read())
                try:
                    # either json-serialised conventional objects . . .
                    contents = _json.loads(contents.getvalue())
                except ValueError:
                    # . . . or longer python array.array objects
                    contents = _array('c', contents.getvalue())
                setattr(self, member.name, contents)
    else:
        raise NameError('instantiate baga.AlignReads.SAMs with either '
                        'loaded baga.PrepareReads.Reads-*.baga and '
                        'baga.CollectData.Genome-*.baga objects or a '
                        'previously saved alignment '
                        '(baga.AlignReads.SAMs-*.baga)')
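# A tiny runnable illustration of the pairname clean-up loop above: keys that
# still carry a path are renamed to their basename while the read file pair
# values are kept. The paths are hypothetical examples and a POSIX path
# separator is assumed.
import os as _os

read_files = {'reads/sample1': ('reads/sample1_1.fastq', 'reads/sample1_2.fastq'),
              'sample2': ('sample2_1.fastq', 'sample2_2.fastq')}

for pairname, files in list(read_files.items()):  # copy: safe to mutate
    if _os.path.sep in pairname:
        read_files[pairname.split(_os.path.sep)[-1]] = files
        del read_files[pairname]

print(sorted(read_files))  # ['sample1', 'sample2']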
def loadFrombaga(self, local_path):
    # restore this instance's attributes from a previously saved .baga
    # file: a gzipped tar with one member per serialised attribute
    with _tarfile.open(local_path, "r:gz") as tar:
        for member in tar:
            contents = _StringIO(tar.extractfile(member).read())
            try:
                # either json-serialised conventional objects . . .
                contents = _json.loads(contents.getvalue())
            except ValueError:
                # . . . or longer python array.array objects
                contents = _array('c', contents.getvalue())
            setattr(self, member.name, contents)
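# A minimal, self-contained sketch of the .baga save/load round trip that the
# reloading branches above implement: each attribute becomes one member of a
# gzipped tar, json-serialised where possible, with array.array as the
# fallback for long numeric data. This sketch targets Python 3 (io.BytesIO,
# bytes-safe array typecode 'b') whereas the methods above target Python 2
# (_StringIO, array typecode 'c'); the helper name saveTobaga and the example
# values are illustrative assumptions, not part of the baga API.
import io
import json
import tarfile
from array import array

def saveTobaga(obj_attrs, local_path):
    # write each attribute as one tar member named after the attribute
    with tarfile.open(local_path, "w:gz") as tar:
        for name, value in obj_attrs.items():
            if isinstance(value, array):
                data = value.tobytes()
            else:
                data = json.dumps(value).encode('utf-8')
            info = tarfile.TarInfo(name=name)
            info.size = len(data)
            tar.addfile(info, io.BytesIO(data))

def loadFrombaga(local_path):
    # mirror of the loading loops above: try json first, fall back to array
    attrs = {}
    with tarfile.open(local_path, "r:gz") as tar:
        for member in tar:
            raw = tar.extractfile(member).read()
            try:
                attrs[member.name] = json.loads(raw.decode('utf-8'))
            except (ValueError, UnicodeDecodeError):
                attrs[member.name] = array('b', raw)
    return attrs

saveTobaga({'genome_id': 'NC_011770.1',                # hypothetical values
            'depths': array('b', [30, 31, 29])},
           'example.baga')
print(loadFrombaga('example.baga'))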
def __init__(self, genome=False, baga=False, num_individuals=1,
             large_deletions={}, large_del_padding=1000, random_seed=False):
    '''
    Initialise with:
    a baga.CollectData.Genome object,
    OR
    a path to a previously saved baga.SimulateReads.Reads object (like
    this one).

    Large deletions can be included to simulate e.g. missing genomic
    islands or prophage. large_deletions should be a dict with arbitrary
    names of deletions as keys and (start, end) tuples for python slices
    delineating each deletion as values. If supplied, a set of genomes
    with the large deletions and a set without them are generated, each
    set consisting of num_individuals members.

    No variants will be generated within large_del_padding, a 'safe'
    distance around each large deletion, outside of which variant calling
    should be reliable. Small deletions might run over into this zone.
    '''
    if random_seed:
        if not isinstance(random_seed, int):
            raise ValueError('random_seed must be an integer')
        _random.seed(random_seed)
    if genome and not baga:
        self.genome = genome
    elif baga and not genome:
        # for reloading a previous instantiation
        with _tarfile.open(baga, "r:gz") as tar:
            for member in tar:
                contents = _StringIO(tar.extractfile(member).read())
                try:
                    # either json-serialised conventional objects . . .
                    contents = _json.loads(contents.getvalue())
                except ValueError:
                    # . . . or longer python array.array objects
                    contents = _array('c', contents.getvalue())
                setattr(self, member.name, contents)
    else:
        raise NameError('instantiate baga.SimulateReads.Reads with a '
                        'loaded baga.CollectData.Genome-*.baga object or '
                        'a previously saved baga.SimulateReads.Reads '
                        'object')
    # exclude the large deletions, plus padding either side, from the
    # positions at which variants may be generated
    omit = set()
    for name, (start, end) in large_deletions.items():
        omit.update(range(start - large_del_padding,
                          end + large_del_padding))
    self.samplefrom = sorted(set(range(len(self.genome.sequence))) - omit)
    self.large_deletions = large_deletions
    self.num_individuals = num_individuals
    # to be optionally populated with methods
    self.SNPs_per_genome = []
    self.indel_dict_by_pos_pergenome = []
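# A short, runnable illustration of the large_deletions format described in
# the docstring above, and of how the padding excludes positions from variant
# sampling. The deletion names, coordinates and genome length are made-up
# example values.
large_deletions = {'prophage_1': (5000, 8000),        # python slice: (start, end)
                   'genomic_island_A': (12000, 15000)}
large_del_padding = 1000
genome_length = 20000                                 # hypothetical

omit = set()
for name, (start, end) in large_deletions.items():
    # exclude the deletion plus the padded 'safe' zone either side
    omit.update(range(start - large_del_padding, end + large_del_padding))

samplefrom = sorted(set(range(genome_length)) - omit)
# variants may now only be placed at positions in samplefrom:
# here 0-3999, 9000-10999 and 16000-19999 remain eligible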