def getFromEntrezNucleotide(self, accession, user_email):
    print("WARNING: NCBI's Entrez query system used here can be unreliable "
            "for data download. If the download does not start (if you "
            "don't see '524,288 bytes ...') within a few seconds, press "
            "ctrl-c and issue the same command again (up-arrow, enter, "
            "usually works)\n")
    _Entrez.email = user_email
    handle = _Entrez.efetch(db="nuccore", rettype="gb", retmode="text",
            id=accession)
    try:
        records = list(_SeqIO.parse(handle, 'genbank'))
    except ValueError as error_message:
        print("There was a problem with the genome (accession: {}) downloaded "
                "from NCBI via Entrez: {}. Retry because Entrez can be "
                "unreliable, or try loading from a .gbk file downloaded "
                "manually from e.g., ftp://ftp.ncbi.nih.gov/genomes/Bacteria/"
                "".format(accession, error_message))
        # re-raise: 'records' would be undefined below if parsing failed
        raise
    finally:
        handle.close()
    for seq_record in records:
        if accession in seq_record.id:
            self.ORF_ranges, self.rRNA_ranges, \
                    self.large_mobile_element_ranges = extractLoci(seq_record)
            self.sequence = _array('c', seq_record.seq)
            self.id = seq_record.id
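# Usage sketch for the method above, assuming it is bound to a
# CollectData.Genome-like instance (the class definition is not shown in
# this excerpt, so 'Genome' and the accession below are illustrative):
#
#     genome = Genome()
#     genome.getFromEntrezNucleotide('FM209186', 'you@example.com')
#     print(genome.id, len(genome.sequence))
#
# NCBI requires a real contact email for Entrez queries; as the warning
# says, stalled downloads are best cancelled and reissued.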
def __init__(self, reads=False, genome=False, baga=False):
    '''
    Initialise with:
    a baga.PrepareReads.Reads object and,
    a baga.CollectData.Genome object.
    
    OR
    
    a path to a previously saved baga.AlignReads.SAMs object (like this one).
    '''
    if (reads and genome) and not baga:
        try:
            self.read_files = reads.trimmed_read_files
        except AttributeError:
            text = 'WARNING: baga was not used to quality-score trim these '\
                    'reads. Read trimming is recommended for most types of '\
                    'analysis. This can be achieved with the "trim()" method '\
                    'of the Reads class in the PrepareReads module.'
            print(text)
            try:
                self.read_files = reads.adaptorcut_read_files
            except AttributeError:
                text = 'WARNING: baga was not used to remove library '\
                        'preparation adaptor sequences from these reads. '\
                        'Adaptor removal is highly recommended so hopefully '\
                        'you already removed adaptor sequences! This can be '\
                        'achieved with the "cutAdaptors()" method of the '\
                        'Reads class in the PrepareReads module.'
                self.read_files = reads.read_files
                print(text)
                print('continuing with these reads . . .')
        
        # baga CollectData currently includes the path to the reads in the
        # pairname keys of the read file pair values: check and remove here.
        # iterate over a copy because entries are deleted during the loop
        for pairname, files in list(self.read_files.items()):
            if _os.path.sep in pairname:
                self.read_files[pairname.split(_os.path.sep)[-1]] = files
                del self.read_files[pairname]
        
        self.genome_sequence = genome.sequence
        self.genome_id = genome.id
    elif baga and not (reads and genome):
        # for reloading a previous instantiation
        with _tarfile.open(baga, "r:gz") as tar:
            for member in tar:
                contents = _StringIO(tar.extractfile(member).read())
                try:
                    # either json-serialised conventional objects
                    contents = _json.loads(contents.getvalue())
                except ValueError:
                    # or longer python array.array objects
                    contents = _array('c', contents.getvalue())
                setattr(self, member.name, contents)
    else:
        raise NameError('instantiate baga.AlignReads.SAMs with either loaded '\
                'baga.PrepareReads.Reads-*.baga and baga.CollectData.Genome-*.baga '\
                'objects or previous saved alignments (baga.AlignReads.SAMs-*.baga)')
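# Instantiation sketch for the two supported modes (object and file
# names are illustrative):
#
#     # from prepared reads plus a reference genome:
#     sams = SAMs(reads=my_reads, genome=my_genome)
#
#     # or from a previously saved alignment archive:
#     sams = SAMs(baga='baga.AlignReads.SAMs-LESB58.baga')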
def loadFrombaga(self, local_path):
    with _tarfile.open(local_path, "r:gz") as tar:
        for member in tar:
            contents = _StringIO(tar.extractfile(member).read())
            try:
                # either json-serialised conventional objects
                contents = _json.loads(contents.getvalue())
            except ValueError:
                # or longer python array.array objects
                contents = _array('c', contents.getvalue())
            setattr(self, member.name, contents)
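# The complementary save step is not shown in this excerpt. Below is a
# minimal sketch (a hypothetical helper, not baga's actual API) of a
# writer producing archives that loadFrombaga() can read: each attribute
# becomes a tar member named after the attribute, json-encoded where
# possible, raw array bytes otherwise.
import json
import tarfile
import time
from array import array
from io import BytesIO

def save_attributes_sketch(obj, attribute_names, path):
    '''Write selected attributes of obj to a gzipped tar at path.'''
    with tarfile.open(path, "w:gz") as tar:
        for name in attribute_names:
            value = getattr(obj, name)
            if isinstance(value, array):
                # array.array attributes are stored as raw bytes
                data = value.tostring()
            else:
                # everything else is stored json-serialised
                data = json.dumps(value)
            member = tarfile.TarInfo(name=name)
            member.size = len(data)
            member.mtime = time.time()
            tar.addfile(member, BytesIO(data))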
def loadFromGBK(self, local_path):
    seq_record = list(_SeqIO.parse(local_path, "genbank"))[0]
    self.ORF_ranges, self.rRNA_ranges, \
            self.large_mobile_element_ranges = extractLoci(seq_record)
    self.sequence = _array('c', seq_record.seq)
    self.id = seq_record.id
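# Usage sketch (instance and filename are illustrative):
#
#     genome = Genome()
#     genome.loadFromGBK('LESB58.gbk')
#     print(genome.id, len(genome.ORF_ranges), len(genome.rRNA_ranges))
#
# Only the first record in the file is used, so multi-record GenBank
# files (e.g. a chromosome plus plasmids) would need splitting first.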
def getFromEntrez(self, search_id, user_email):
    '''
    download a genome sequence given a search ID
    
    search_id is recommended to be a RefSeq or GenBank accession number
    or another unambiguous ID that will return a single result
    '''
    from Bio.Entrez.Parser import ValidationError as _ValidationError
    
    if '.' in search_id:
        search_id_unversioned, requested_ver = search_id.split('.')
    else:
        search_id_unversioned, requested_ver = search_id, None
    
    if '_' in search_id_unversioned:
        search_id_is_refseq = True
    else:
        search_id_is_refseq = False
    
    _Entrez.email = user_email
    handle = _Entrez.esearch(db="assembly", term=search_id_unversioned)
    result = _Entrez.read(handle)
    if len(result['IdList']) != 1:
        print('WARNING: Your search ID: "{}" returned {} assembly results '
                'from ncbi.nlm.nih.gov/assembly but a single result is '
                'required.'.format(search_id, len(result['IdList'])))
        raise LookupError
    
    Assembly_ID = result['IdList'][0]
    handle = _Entrez.esummary(db="assembly", id=Assembly_ID)
    # some ways of handling unexpected content from NCBI
    try:
        raw = _Entrez.read(handle, validate=True)
    except _ValidationError as e:
        print('WARNING: The information about this genome returned by NCBI '
                'Entrez failed validation (ValidationError):\n{}'.format(e))
        print('Trying without validation . . .\n')
        handle = _Entrez.esummary(db="assembly", id=Assembly_ID)
        raw = _Entrez.read(handle, validate=False)
    
    if len(raw) == 0:
        print("NCBI's Entrez system returned an empty result for record "
                'id {} in the Assembly database. Will attempt to download '
                'direct from the Nucleotide database.'.format(Assembly_ID))
        raise RuntimeError("Empty record from Entrez")
    else:
        info = raw['DocumentSummarySet']['DocumentSummary'][0]
        print('Found: {} ({})'.format(info['Organism'], info['AssemblyStatus']))
    
    # collect download links
    try:
        genbank_ftp = _re.findall(
                '<FtpPath type="GenBank">([^<]+)</FtpPath>', info['Meta'])[0]
        print('Found GenBank link:\n{}'.format(genbank_ftp))
    except IndexError:
        genbank_ftp = False
        print('GenBank link not found')
    
    try:
        refseq_ftp = _re.findall(
                '<FtpPath type="RefSeq">([^<]+)</FtpPath>', info['Meta'])[0]
        print('Found RefSeq link:\n{}'.format(refseq_ftp))
    except IndexError:
        refseq_ftp = False
        print('RefSeq link not found')
    
    e = 'Failed to retrieve FTP download links from MetaData:\n{}'.format(
            info['Meta'])
    assert genbank_ftp or refseq_ftp, e
    
    # prefer RefSeq over GenBank where available
    if refseq_ftp:
        use_link = refseq_ftp
    elif genbank_ftp:
        use_link = genbank_ftp
    
    # collect accessions and versions
    refseq_ass_acc = info['AssemblyAccession']
    e = 'No RefSeq assembly found for {}. You can double check at '\
            'http://www.ncbi.nlm.nih.gov/assembly'.format(search_id)
    assert refseq_ass_acc[:3] == 'GCF', e
    
    genbank2refseq = {}
    genbank2version = {}
    refseq2genbank = {}
    refseq2version = {}
    data = DL('ftp://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/All/'
            '{}.assembly.txt'.format(refseq_ass_acc), verbose=False)
    ID_info = data.readlines()
    for line in ID_info:
        if line[0] != '#' and len(line) > 0:
            cells = line.split('\t')
            genbank_acc, gb_ver = cells[4].split('.')
            refseq_acc, rs_ver = cells[6].split('.')
            genbank2refseq[genbank_acc] = refseq_acc
            genbank2version[genbank_acc] = gb_ver
            refseq2genbank[refseq_acc] = genbank_acc
            refseq2version[refseq_acc] = rs_ver
    
    if search_id_is_refseq:
        use_name = search_id_unversioned + '.' + \
                refseq2version[search_id_unversioned]
        if requested_ver is None:
            print('Found version {} of RefSeq accession {}'.format(
                    refseq2version[search_id_unversioned],
                    search_id_unversioned))
        elif requested_ver != refseq2version[search_id_unversioned]:
            print('RefSeq accession {} version {} was requested, '
                    'but version {} is the current version and will be '
                    'used instead'.format(search_id_unversioned,
                    requested_ver,
                    refseq2version[search_id_unversioned]))
    else:
        use_refseq = genbank2refseq[search_id_unversioned]
        print('Will use RefSeq accession {} (latest version {}) which '
                'corresponds to provided GenBank accession {}'.format(
                use_refseq, refseq2version[use_refseq],
                search_id_unversioned))
        use_name = use_refseq + '.' + refseq2version[use_refseq]
    
    ### could collect other replicons in this genome . . .
    if len(refseq2version) > 1:
        print('(this is 1 of {} replicons in this genome)'.format(
                len(refseq2version)))
    else:
        print('(this is the only replicon in this genome)')
    
    # download checksums
    data = DL(use_link + '/md5checksums.txt', verbose=False)
    checksum = [l.split(' ./') for l in data.readlines()
            if '_genomic.gbff.gz' in l][0][0]
    
    # download sequences and annotations
    use_link += '/' + use_link.split('/')[-1] + '_genomic.gbff.gz'
    print('Downloading from:\n{}'.format(use_link))
    data = DL(use_link, verbose=True)
    hasher = _md5()
    buff = data.read(65536)
    while len(buff) > 0:
        hasher.update(buff)
        buff = data.read(65536)
    
    e = '. . . checksum fail!'
    assert hasher.hexdigest() == checksum, e
    print('. . . checksum {} passed!'.format(checksum))
    
    data.seek(0)
    archive = _gzip.GzipFile(mode="rb", fileobj=data)
    records = list(_SeqIO.parse(archive, 'genbank'))
    for seq_record in records:
        if use_name == seq_record.id:
            self.ORF_ranges, self.rRNA_ranges, \
                    self.large_mobile_element_ranges = extractLoci(seq_record)
            self.sequence = _array('c', seq_record.seq)
            self.id = seq_record.id
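# The download verification above streams the file through MD5 in 64 KiB
# chunks rather than reading it all into memory. The same pattern as a
# standalone helper (a sketch, not part of baga):
from hashlib import md5

def md5_of_fileobj(fileobj, chunk_size=65536):
    '''Return the hex MD5 digest of an open binary file-like object.'''
    hasher = md5()
    buff = fileobj.read(chunk_size)
    while len(buff) > 0:
        hasher.update(buff)
        buff = fileobj.read(chunk_size)
    return hasher.hexdigest()

# e.g., given 'data' as returned by DL() above:
#     assert md5_of_fileobj(data) == checksum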
def __init__(self, genome=False, baga=False, num_individuals=1,
        large_deletions={}, large_del_padding=1000, random_seed=False):
    '''
    Initialise with:
    a baga.CollectData.Genome object.
    
    OR
    
    a path to a previously saved baga.SimulateReads.Reads object (like
    this one).
    
    Large deletions can be included to simulate e.g. missing genomic
    islands or prophage. If specified, one set of genomes is currently
    generated with the large deletions and one set without, each set
    consisting of num_individuals members. large_deletions should be a
    dict with arbitrary names of deletions as keys and (start, end)
    tuples as values, delineating each deletion as a python slice.
    
    No variants will be generated within large_del_padding, a 'safe'
    distance around large deletions outside of which variant calling
    should be reliable. Small deletions might run over into this zone.
    '''
    if random_seed:
        if not isinstance(random_seed, int):
            raise ValueError('random_seed must be an integer')
        _random.seed(random_seed)  # e.g. 684651
    
    if genome and not baga:
        self.genome = genome
    elif baga and not genome:
        # for reloading a previous instantiation
        with _tarfile.open(baga, "r:gz") as tar:
            for member in tar:
                contents = _StringIO(tar.extractfile(member).read())
                try:
                    # either json-serialised conventional objects
                    contents = _json.loads(contents.getvalue())
                except ValueError:
                    # or longer python array.array objects
                    contents = _array('c', contents.getvalue())
                setattr(self, member.name, contents)
    else:
        raise NameError('instantiate baga.SimulateReads.Reads with a loaded '\
                'baga.CollectData.Genome-*.baga object or a previously saved '\
                'baga.SimulateReads.Reads object')
    
    # exclude positions within large_del_padding of a large deletion
    # from variant generation
    omit = set()
    for name, (start, end) in large_deletions.items():
        omit.update(range(start - large_del_padding,
                end + large_del_padding))
    
    samplefrom = sorted(set(range(len(self.genome.sequence))) - omit)
    self.samplefrom = samplefrom
    self.large_deletions = large_deletions
    self.num_individuals = num_individuals
    # to be optionally populated with methods
    self.SNPs_per_genome = []
    self.indel_dict_by_pos_pergenome = []
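# Instantiation sketch (object names and deletion coordinates are
# illustrative, not from baga): simulate five genomes carrying a large
# deletion and five without, with no variants generated within 1 kb of
# the deletion edges:
#
#     simulator = Reads(genome=my_genome, num_individuals=5,
#             large_deletions={'prophage_1': (1500000, 1550000)},
#             large_del_padding=1000, random_seed=684651)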
def generateSequences(self):
    '''
    Create full length sequences with generated variants applied to the
    reference sequence. Generated variants are saved to a pickle file.
    If large deletions are present, variant positions are corrected
    appropriately when applied.
    '''
    save_these = {}
    save_these['SNPs'] = self.SNPs_per_genome
    save_these['InDels'] = self.indel_dict_by_pos_pergenome
    
    if len(self.large_deletions):
        # generate a version of the reference genome with large deletions
        ranges = sorted(self.large_deletions.values())
        genome_large_deletions = self.genome.sequence[:ranges[0][0]]
        for n, (s, e) in enumerate(ranges[:-1]):
            genome_large_deletions.extend(
                    self.genome.sequence[e:ranges[n + 1][0]])
        genome_large_deletions.extend(self.genome.sequence[ranges[-1][1]:])
        
        # adjust generated variant positions for the genome with deletions
        def adjust(pos0):
            offset = 0
            for s, e in ranges:
                if pos0 > e:
                    offset += (e - s)
            return pos0 - offset
        
        # adjust the second half of the generated variants
        SNPs_per_genome_adjusted = self.SNPs_per_genome[:self.num_individuals]
        for SNPs in self.SNPs_per_genome[self.num_individuals:]:
            adjusted = []
            for pos0, variant in SNPs:
                adjusted += [(adjust(pos0), variant)]
            SNPs_per_genome_adjusted += [adjusted]
        
        indel_dict_by_pos_pergenome_adjusted = \
                self.indel_dict_by_pos_pergenome[:self.num_individuals]
        for indels in self.indel_dict_by_pos_pergenome[self.num_individuals:]:
            adjusted = {}
            for pos0, indel in indels.items():
                adjusted[adjust(pos0)] = indel
            indel_dict_by_pos_pergenome_adjusted += [adjusted]
        
        save_these['SNPs_adjusted'] = SNPs_per_genome_adjusted
        save_these['InDels_adjusted'] = indel_dict_by_pos_pergenome_adjusted
        SNPs_per_genome = SNPs_per_genome_adjusted
        indel_dict_by_pos_pergenome = indel_dict_by_pos_pergenome_adjusted
    else:
        SNPs_per_genome = self.SNPs_per_genome
        indel_dict_by_pos_pergenome = self.indel_dict_by_pos_pergenome
    
    # adjusted are needed to apply the variants
    # unadjusted are needed to check the calling
    with open('baga.GemSIM_known_variants.p', 'w') as f:
        _cPickle.dump(save_these, f)
    # save_these = cPickle.load(open('baga.GemSIM_known_variants.p','r'))
    
    ### generate genotypes (apply variants) ###
    genotypes = []
    for gn, SNPs in enumerate(SNPs_per_genome):
        if len(self.large_deletions) and gn >= self.num_individuals:
            # use the genome with large deletions for the second batch
            orig_genome = genome_large_deletions
            genome = _array('c', genome_large_deletions)
        else:
            # else use the original
            orig_genome = self.genome.sequence
            genome = _array('c', self.genome.sequence)
        
        # first SNPs
        for pos0, SNP in SNPs:
            assert genome[pos0] != SNP
            genome[pos0] = SNP
        
        # check it worked
        changed = [pos0 for pos0, (new, old) in
                enumerate(zip(genome, list(orig_genome))) if new != old]
        assert changed == [pos0 for pos0, var in SNPs], \
                'SNPs were not correctly applied . . .'
        
        # then indels: insertions are strings, deletions are lengths
        newgenome = _array('c')
        last_pos0 = 0
        for pos0, indel in sorted(indel_dict_by_pos_pergenome[gn].items()):
            if isinstance(indel, str):
                # insertion
                newgenome.extend(genome[last_pos0:pos0])
                newgenome.extend(indel)
                last_pos0 = pos0
            else:
                # deletion
                newgenome.extend(genome[last_pos0:pos0])
                last_pos0 = pos0 + indel
        
        newgenome.extend(genome[last_pos0:])
        genome_seqrecord = _SeqRecord(_Seq(newgenome.tostring()),
                id=self.genome.id + '_sim{:02d}'.format(gn + 1),
                name='', description='')
        genotypes += [genome_seqrecord]
        print(len(self.genome.sequence), len(genotypes[-1]),
                genotypes[-1].id)
    
    self.genotypes = genotypes
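# Worked example of the coordinate arithmetic in adjust() above (pure
# illustration; real variant positions are kept out of the padded zone
# around deletions): with a single large deletion (start, end) = (100, 150),
# a variant at reference position 200 lands at 200 - (150 - 100) = 150 in
# the genome carrying the deletion, while positions before the deletion
# are unchanged:
#
#     ranges = [(100, 150)]
#     adjust(99)   # -> 99  (before the deletion: no offset)
#     adjust(200)  # -> 150 (after the deletion: shifted left by 50)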