Example 1
 # note: 'self' below is captured from the enclosing scope (these functions
 # are defined inside methods in the baga source); the array typecode 'c'
 # marks this as Python 2 code
 def getFromEntrezNucleotide(accession, user_email):
     print("WARNING: NCBI's Entrez query system used here can be unreliable "\
             "for data download. If the download does not start (if you "\
             "don't see '524,288 bytes ...') within a few seconds, press "\
             "ctrl-c and issue the same command again (up-arrow, enter, "\
             "usually works)\n")
     
     _Entrez.email = user_email
     handle = _Entrez.efetch(db = "nuccore", rettype = "gb", retmode = "text", 
             id = accession)
     try:
         records = list(_SeqIO.parse(handle, 'genbank'))
         #seq_record = _SeqIO.read(handle, "genbank")
     except ValueError as error_message:
         print("There was a problem with the genome (accession: {}) downloaded "\
                 "from NCBI via Entrez: {}. Retry because Entrez can be "\
                 "unreliable, or try loading from a .gbk file downloaded "\
                 "manually from e.g., ftp://ftp.ncbi.nih.gov/genomes/Bacteria/"\
                 "".format(accession, error_message))
     handle.close()
     # self.ORF_ranges, self.large_mobile_element_ranges = extractLoci(seq_record)
     # self.sequence = _array('c', seq_record.seq)
     # self.id = seq_record.id
     for seq_record in records:
         if accession in seq_record.id:
             self.ORF_ranges, self.large_mobile_element_ranges = extractLoci(seq_record)
             self.sequence = _array('c', seq_record.seq)
             self.id = seq_record.id
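
For reference, a minimal standalone sketch of the same download step using Biopython directly, without the baga-specific attribute handling (the contact address and accession below are illustrative placeholders):

    # hedged sketch: fetch a GenBank record from the Entrez nucleotide database
    from Bio import Entrez, SeqIO

    Entrez.email = "you@example.org"             # NCBI requires a contact address
    handle = Entrez.efetch(db="nuccore", rettype="gb", retmode="text",
                           id="NC_011770.1")     # illustrative accession
    records = list(SeqIO.parse(handle, "genbank"))
    handle.close()
    for record in records:
        print(record.id, len(record.seq))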
Example 2
        def getFromEntrezNucleotide(accession, user_email):
            print("WARNING: NCBI's Entrez query system used here can be unreliable "\
                    "for data download. If the download does not start (if you "\
                    "don't see '524,288 bytes ...') within a few seconds, press "\
                    "ctrl-c and issue the same command again (up-arrow, enter, "\
                    "usually works)\n")

            _Entrez.email = user_email
            handle = _Entrez.efetch(db="nuccore",
                                    rettype="gb",
                                    retmode="text",
                                    id=accession)
            try:
                records = list(_SeqIO.parse(handle, 'genbank'))
                #seq_record = _SeqIO.read(handle, "genbank")
            except ValueError as error_message:
                print("There was a problem with the genome (accession: {}) downloaded "\
                        "from NCBI via Entrez: {}. Retry because Entrez can be "\
                        "unreliable, or try loading from a .gbk file downloaded "\
                        "manually from e.g., ftp://ftp.ncbi.nih.gov/genomes/Bacteria/"\
                        "".format(accession, error_message))
            handle.close()
            # self.ORF_ranges, self.large_mobile_element_ranges = extractLoci(seq_record)
            # self.sequence = _array('c', seq_record.seq)
            # self.id = seq_record.id
            for seq_record in records:
                if accession in seq_record.id:
                    self.ORF_ranges, self.rRNA_ranges, \
                            self.large_mobile_element_ranges = extractLoci(seq_record)
                    self.sequence = _array('c', seq_record.seq)
                    self.id = seq_record.id
Example 3
    def __init__(self, reads=False, genome=False, baga=False):
        '''
        Initialise with:
        a baga.PrepareReads.Reads object and,
        a baga.CollectData.Genome object.
        
        OR
        
        a path to baga.AlignReads.SAMs (like this one) object that 
        was previously saved.
        '''

        if (reads and genome) and not baga:
            try:
                self.read_files = reads.trimmed_read_files
            except AttributeError:
                text = 'WARNING: baga was not used to quality-score trim these reads. Read trimming is recommended for most types of analysis. This can be achieved with the "trim()" method of the Reads class in the PrepareReads module.'
                print(text)
                try:
                    self.read_files = reads.adaptorcut_read_files
                except AttributeError:
                    text = 'WARNING: baga was not used to remove library preparation adaptor sequences from these reads. Adaptor removal is highly recommended so hopefully you already removed adaptor sequences! This can be achieved with the "cutAdaptors()" method of the Reads class in the PrepareReads module.'
                    self.read_files = reads.read_files
                    print(text)
                    print('continuing with these reads . . .')

            # baga CollectData currently includes the path to the reads in the
            # pairname keys that map to read file pairs: strip it here
            # (iterate over a copy because the dict is mutated in the loop)
            for pairname, files in list(self.read_files.items()):
                if _os.path.sep in pairname:
                    self.read_files[pairname.split(_os.path.sep)[-1]] = files
                    del self.read_files[pairname]

            self.genome_sequence = genome.sequence
            self.genome_id = genome.id

        elif baga and not (reads and genome):
            # for reloading a previous instantiation
            with _tarfile.open(baga, "r:gz") as tar:
                for member in tar:
                    contents = _StringIO(tar.extractfile(member).read())
                    try:
                        # either json serialised conventional objects
                        contents = _json.loads(contents.getvalue())
                    except ValueError:
                        #print('json failed: {}'.format(member.name))
                        # or longer python array.array objects
                        contents = _array('c', contents.getvalue())

                    setattr(self, member.name, contents)

        else:
            raise NameError(
                'instantiate baga.AlignReads.SAMs with either loaded baga.PrepareReads.Reads-*.baga and baga.CollectData.Genome-*.baga objects or previous saved alignments (baga.AlignReads.SAMs-*.baga)'
            )
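
A hedged sketch of the two instantiation routes the docstring describes; prepared_reads, collected_genome and the file name are illustrative placeholders, not documented baga values:

    # route 1: build from previously prepared baga objects
    sams = SAMs(reads=prepared_reads, genome=collected_genome)

    # route 2: reload a previously saved instance from its .baga archive
    sams = SAMs(baga='baga.AlignReads.SAMs-mysample.baga')   # hypothetical file name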
Example 4
 def loadFrombaga(local_path):
     with _tarfile.open(local_path, "r:gz") as tar:
         for member in tar:
             contents = _StringIO(tar.extractfile(member).read())
             try:
                 # either json serialised conventional objects
                 contents = _json.loads(contents.getvalue())
             except ValueError:
                 # or longer python array.array objects
                 contents = _array('c', contents.getvalue())
             
             setattr(self, member.name, contents)
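
The loader above implies the on-disk layout: a gzipped tar whose member names become attribute names, each member holding either JSON or raw array('c') characters. A hypothetical writer matching that layout (a sketch under those assumptions, not baga's actual save method):

    import json as _json
    import tarfile as _tarfile
    from array import array as _array
    from io import BytesIO as _BytesIO

    def saveTobaga(obj, attribute_names, local_path):
        # hypothetical counterpart to loadFrombaga: one tar member per attribute
        with _tarfile.open(local_path, "w:gz") as tar:
            for name in attribute_names:
                value = getattr(obj, name)
                if isinstance(value, _array):
                    data = value.tostring()      # array('c') -> raw chars (Python 2)
                else:
                    data = _json.dumps(value)    # conventional objects -> JSON
                info = _tarfile.TarInfo(name=name)
                info.size = len(data)
                tar.addfile(info, _BytesIO(data))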
Example 5
 def loadFromGBK(local_path):
     seq_record = list(_SeqIO.parse(local_path, "genbank"))[0]
     self.ORF_ranges, self.large_mobile_element_ranges = extractLoci(seq_record)
     self.sequence = _array('c', seq_record.seq)
     self.id = seq_record.id
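
Outside the class, the same first-record load is a one-liner with Biopython (the path is an illustrative placeholder):

    from Bio import SeqIO
    seq_record = next(SeqIO.parse("mygenome.gbk", "genbank"))   # first record only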
Example 6
 def getFromEntrez(search_id, user_email):
     '''
     Download a genome sequence given a search ID.
     
     search_id should be a RefSeq or GenBank accession number, or another
     unambiguous ID that will return a single result.
     '''
     
     from Bio.Entrez.Parser import ValidationError as _ValidationError
     
     if '.' in search_id:
         search_id_unversioned,requested_ver = search_id.split('.')
     else:
         search_id_unversioned,requested_ver = search_id,None
     
     if '_' in search_id_unversioned:
         search_id_is_refseq = True
     else:
         search_id_is_refseq = False
     
     _Entrez.email = user_email
     handle = _Entrez.esearch(db = "assembly", term = search_id_unversioned)
     result = _Entrez.read(handle)
     if len(result['IdList']) != 1:
         print('WARNING: Your search ID: "{}" returned {} assembly results '\
         'from ncbi.nlm.nih.gov/assembly but a single result is required.'.format(
                 search_id, len(result['IdList'])))
         raise LookupError
     Assembly_ID = result['IdList'][0]
     handle = _Entrez.esummary(db = "assembly", id = Assembly_ID)
     # some ways of handling unexpected content from NCBI
     try:
         raw = _Entrez.read(handle, validate=True)
     except _ValidationError as e:
         print('WARNING: The information about this genome returned by NCBI Entrez failed validation (ValidationError):\n{}'.format(e))
         print('Trying without validation . . .\n')
         handle = _Entrez.esummary(db = "assembly", id = Assembly_ID)
         raw = _Entrez.read(handle, validate=False)
     
     if len(raw) == 0:
         print("NCBI's Entrez system returned an empty result for record "\
                 'id {} in the Assembly database. Will attempt to '\
                 'download direct from nucleotide database'\
                 ''.format(Assembly_ID))
         raise RuntimeError("Empty record from Entrez")
     else:
         info = raw['DocumentSummarySet']['DocumentSummary'][0]
     
     print('Found: {} ({})'.format(info['Organism'],info['AssemblyStatus']))
     
     # collect download links
     try:
         genbank_ftp = _re.findall(
                 '<FtpPath type="GenBank">([^<]+)</FtpPath>', 
                 info['Meta'])[0]
         print('Found Genbank link:\n{}'.format(genbank_ftp))
     except IndexError:
         genbank_ftp = False
         print('GenBank link not found')
     
     try:
         refseq_ftp = _re.findall(
                 '<FtpPath type="RefSeq">([^<]+)</FtpPath>', 
                 info['Meta'])[0]
         print('Found RefSeq link:\n{}'.format(refseq_ftp))
     except IndexError:
         refseq_ftp = False
         print('RefSeq link not found')
     
     e = 'Failed to retrieve FTP download links from MetaData:\n{}'.format(info['Meta'])
     assert genbank_ftp or refseq_ftp, e
     
     if refseq_ftp:
         use_link = refseq_ftp
     elif genbank_ftp:
         use_link = genbank_ftp
     
     # collect accessions and versions
     refseq_ass_acc = info['AssemblyAccession']
     e = 'No RefSeq assembly found for {}. You can double check at http://www.ncbi.nlm.nih.gov/assembly'.format(search_id)
     assert refseq_ass_acc[:3] == 'GCF', e
     genbank2refseq = {}
     genbank2version = {}
     refseq2genbank = {}
     refseq2version = {}
     data = DL('ftp://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/All/{}.assembly.txt'.format(
             refseq_ass_acc), verbose = False)
     ID_info = data.readlines()
     
     for line in ID_info:
         # test length first (fixed): avoids IndexError on empty lines
         if len(line) > 0 and line[0] != '#':
             cells = line.split('\t')
             genbank_acc, gb_ver = cells[4].split('.')
             refseq_acc, rs_ver = cells[6].split('.')
             genbank2refseq[genbank_acc] = refseq_acc
             genbank2version[genbank_acc] = gb_ver
             refseq2genbank[refseq_acc] = genbank_acc
             refseq2version[refseq_acc] = rs_ver
     
     if search_id_is_refseq:
         use_name = search_id_unversioned + '.' + refseq2version[search_id_unversioned]
         if requested_ver is None:
             print('Found version {} of RefSeq accession {}'.format(
                     refseq2version[search_id_unversioned], search_id_unversioned))
         elif requested_ver != refseq2version[search_id_unversioned]:
             print('RefSeq accession {} version {} was requested, '\
             'but version {} is the current version and will be used instead'.format(
                     search_id_unversioned, requested_ver, 
                     refseq2version[search_id_unversioned]))
     else:
         use_refseq = genbank2refseq[search_id_unversioned]
         print('Will use RefSeq accession {} (latest version {}) which '\
         'corresponds to provided GenBank accession {}'.format(
                 use_refseq, refseq2version[use_refseq], search_id_unversioned))
         
         use_name = use_refseq + '.' + refseq2version[use_refseq]
     
     ### could collect other replicons in this genome . . .
     if len(refseq2version) > 1:
         print('(this is 1 of {} replicons in this genome)'.format(len(refseq2version)))
     else:
         print('(this is the only replicon in this genome)')
     
     # download checksums
     data = DL(use_link + '/md5checksums.txt', verbose = False)
     checksum = [l.split('  ./') for l in data.readlines() if '_genomic.gbff.gz' in l][0][0]
     # download sequences and annotations
     use_link += '/' + use_link.split('/')[-1] + '_genomic.gbff.gz'
     print('Downloading from:\n{}'.format(use_link))
     data = DL(use_link, verbose = True)
     hasher = _md5()
     buff = data.read(65536)
     while len(buff) > 0:
         hasher.update(buff)
         buff = data.read(65536)
     
     e = '. . . checksum fail!'
     assert hasher.hexdigest() == checksum, e
     print('. . . checksum {} passed!'.format(checksum))
     data.seek(0)
     archive = _gzip.GzipFile(mode="rb", fileobj = data)
     records = list(_SeqIO.parse(archive, 'genbank'))
     for seq_record in records:
         if use_name == seq_record.id:
             self.ORF_ranges, self.large_mobile_element_ranges = extractLoci(seq_record)
             self.sequence = _array('c', seq_record.seq)
             self.id = seq_record.id
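
The checksum verification above streams the download through MD5 in fixed-size chunks; as a standalone helper (standard library only, a sketch) it would be:

    import hashlib

    def md5_of_stream(fileobj, chunk_size=65536):
        # feed the file through MD5 one chunk at a time, as getFromEntrez does
        hasher = hashlib.md5()
        buff = fileobj.read(chunk_size)
        while len(buff) > 0:
            hasher.update(buff)
            buff = fileobj.read(chunk_size)
        return hasher.hexdigest()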
Example 7
 def loadFromGBK(local_path):
     seq_record = list(_SeqIO.parse(local_path, "genbank"))[0]
     self.ORF_ranges, self.rRNA_ranges, \
             self.large_mobile_element_ranges = extractLoci(seq_record)
     self.sequence = _array('c', seq_record.seq)
     self.id = seq_record.id
Example 8
        def getFromEntrez(search_id, user_email):
            '''
            Download a genome sequence given a search ID.
            
            search_id should be a RefSeq or GenBank accession number, or another
            unambiguous ID that will return a single result.
            '''

            from Bio.Entrez.Parser import ValidationError as _ValidationError

            if '.' in search_id:
                search_id_unversioned, requested_ver = search_id.split('.')
            else:
                search_id_unversioned, requested_ver = search_id, None

            if '_' in search_id_unversioned:
                search_id_is_refseq = True
            else:
                search_id_is_refseq = False

            _Entrez.email = user_email
            handle = _Entrez.esearch(db="assembly", term=search_id_unversioned)
            result = _Entrez.read(handle)
            if len(result['IdList']) != 1:
                print('WARNING: Your search ID: "{}" returned {} assembly results '\
                'from ncbi.nlm.nih.gov/assembly but a single result is required.'.format(
                        search_id, len(result['IdList'])))
                raise LookupError
            Assembly_ID = result['IdList'][0]
            handle = _Entrez.esummary(db="assembly", id=Assembly_ID)
            # some ways of handling unexpected content from NCBI
            try:
                raw = _Entrez.read(handle, validate=True)
            except _ValidationError as e:
                print('WARNING: The information about this genome returned by NCBI '
                        'Entrez failed validation (ValidationError):\n{}'.format(e))
                print('Trying without validation . . .\n')
                handle = _Entrez.esummary(db="assembly", id=Assembly_ID)
                raw = _Entrez.read(handle, validate=False)

            if len(raw) == 0:
                print("NCBI's Entrez system returned an empty result for record "\
                        'id {} in the Assembly database. Will attempt to '\
                        'download direct from nucleotide database'\
                        ''.format(Assembly_ID))
                raise RuntimeError("Empty record from Entrez")
            else:
                info = raw['DocumentSummarySet']['DocumentSummary'][0]

            print('Found: {} ({})'.format(info['Organism'],
                                          info['AssemblyStatus']))

            # collect download links
            try:
                genbank_ftp = _re.findall(
                    '<FtpPath type="GenBank">([^<]+)</FtpPath>',
                    info['Meta'])[0]
                print('Found Genbank link:\n{}'.format(genbank_ftp))
            except IndexError:
                genbank_ftp = False
                print('GenBank link not found')

            try:
                refseq_ftp = _re.findall(
                    '<FtpPath type="RefSeq">([^<]+)</FtpPath>',
                    info['Meta'])[0]
                print('Found RefSeq link:\n{}'.format(refseq_ftp))
            except IndexError:
                refseq_ftp = False
                print('RefSeq link not found')

            e = 'Failed to retrieve FTP download links from MetaData:\n{}'.format(
                info['Meta'])
            assert genbank_ftp or refseq_ftp, e

            if refseq_ftp:
                use_link = refseq_ftp
            elif genbank_ftp:
                use_link = genbank_ftp

            # collect accessions and versions
            refseq_ass_acc = info['AssemblyAccession']
            e = 'No RefSeq assembly found for {}. You can double check at http://www.ncbi.nlm.nih.gov/assembly'.format(search_id)
            assert refseq_ass_acc[:3] == 'GCF', e
            genbank2refseq = {}
            genbank2version = {}
            refseq2genbank = {}
            refseq2version = {}
            data = DL(
                'ftp://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/All/{}.assembly.txt'
                .format(refseq_ass_acc),
                verbose=False)
            ID_info = data.readlines()

            for line in ID_info:
                # test length first (fixed): avoids IndexError on empty lines
                if len(line) > 0 and line[0] != '#':
                    cells = line.split('\t')
                    genbank_acc, gb_ver = cells[4].split('.')
                    refseq_acc, rs_ver = cells[6].split('.')
                    genbank2refseq[genbank_acc] = refseq_acc
                    genbank2version[genbank_acc] = gb_ver
                    refseq2genbank[refseq_acc] = genbank_acc
                    refseq2version[refseq_acc] = rs_ver

            if search_id_is_refseq:
                use_name = search_id_unversioned + '.' + \
                        refseq2version[search_id_unversioned]
                if requested_ver is None:
                    print('Found version {} of RefSeq accession {}'.format(
                        refseq2version[search_id_unversioned],
                        search_id_unversioned))
                elif requested_ver != refseq2version[search_id_unversioned]:
                    print('RefSeq accession {} version {} was requested, '\
                    'but version {} is the current version and will be used instead'.format(
                            search_id_unversioned, requested_ver,
                            refseq2version[search_id_unversioned]))
            else:
                use_refseq = genbank2refseq[search_id_unversioned]
                print('Will use RefSeq accession {} (latest version {}) which '\
                'corresponds to provided GenBank accession {}'.format(
                        use_refseq, refseq2version[use_refseq], search_id_unversioned))

                use_name = use_refseq + '.' + refseq2version[use_refseq]

            ### could collect other replicons in this genome . . .
            if len(refseq2version) > 1:
                print('(this is 1 of {} replicons in this genome)'.format(
                    len(refseq2version)))
            else:
                print('(this is the only replicon in this genome)')

            # download checksums
            data = DL(use_link + '/md5checksums.txt', verbose=False)
            checksum = [
                l.split('  ./') for l in data.readlines()
                if '_genomic.gbff.gz' in l
            ][0][0]
            # download sequences and annotations
            use_link += '/' + use_link.split('/')[-1] + '_genomic.gbff.gz'
            print('Downloading from:\n{}'.format(use_link))
            data = DL(use_link, verbose=True)
            hasher = _md5()
            buff = data.read(65536)
            while len(buff) > 0:
                hasher.update(buff)
                buff = data.read(65536)

            e = '. . . checksum fail!'
            assert hasher.hexdigest() == checksum, e
            print('. . . checksum {} passed!'.format(checksum))
            data.seek(0)
            archive = _gzip.GzipFile(mode="rb", fileobj=data)
            records = list(_SeqIO.parse(archive, 'genbank'))
            for seq_record in records:
                if use_name == seq_record.id:
                    self.ORF_ranges, self.rRNA_ranges, \
                            self.large_mobile_element_ranges = extractLoci(seq_record)
                    self.sequence = _array('c', seq_record.seq)
                    self.id = seq_record.id
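
The FTP-link extraction step in isolation: the regular expression is taken from the code above, while the meta string is an illustrative stand-in for the XML fragment NCBI returns in the assembly summary's 'Meta' field:

    import re

    meta = ('<FtpPath type="GenBank">ftp://ftp.ncbi.nlm.nih.gov/genomes/all/'
            'GCA_000005845.2_ASM584v2</FtpPath>')   # illustrative fragment
    links = re.findall('<FtpPath type="GenBank">([^<]+)</FtpPath>', meta)
    genbank_ftp = links[0] if links else False      # False mirrors the code above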
Example 9
 def __init__(self, genome = False, 
                    baga = False, 
                    num_individuals = 1,
                    large_deletions = {},
                    large_del_padding = 1000,
                    random_seed = False):
     '''
     Initialise with:
     a baga.CollectData.Genome object.
     
     OR
     
     a path to baga.SimulateReads.Reads (like this one) object that 
     was previously saved.
     
     Large deletions can be included to simulate e.g. missing genomic islands or
     prophage. If large deletions are specified, one set of genomes is generated
     with them and one set without.
     
     large_deletions should be a dict with arbitrary deletion names as keys and
     (start, end) tuples (Python slice coordinates) as values, delineating each
     deletion. Each of the two sets generated consists of num_individuals members.
     
     No variants will be generated within large_del_padding: a 'safe' distance
     around large deletions outside of which variant calling should be reliable.
     Small deletions might run over into this zone.
     '''
     
     if random_seed:
         if not isinstance(random_seed, int):
             raise ValueError('random_seed must be integer')
         _random.seed(random_seed) # 684651
     
     if genome and not baga:
         self.genome = genome
     elif baga and not genome:
         # for reloading a previous instantiation
         with _tarfile.open(baga, "r:gz") as tar:
             for member in tar:
                 contents = _StringIO(tar.extractfile(member).read())
                 try:
                     # either json serialised conventional objects
                     contents = _json.loads(contents.getvalue())
                 except ValueError:
                     #print('json failed: {}'.format(member.name))
                     # or longer python array.array objects
                     contents = _array('c', contents.getvalue())
                 
                 setattr(self, member.name, contents)
         
     else:
         raise NameError('instantiate baga.SimulateReads.Reads with a loaded baga.CollectData.Genome-*.baga object or previously saved baga.SimulateReads.Reads object')
     
     omit = set()
     for name,(start,end) in large_deletions.items():
         omit.update(range(start-large_del_padding, end+large_del_padding))
     
     samplefrom = sorted(set(range(len(self.genome.sequence))) - omit)
     self.samplefrom = samplefrom
     
     self.large_deletions = large_deletions
     self.num_individuals = num_individuals
     
     # to be optionally populated with methods
     self.SNPs_per_genome = []
     self.indel_dict_by_pos_pergenome = []
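
A hedged instantiation sketch following the docstring's large_deletions contract; the deletion names, coordinates and collected_genome are illustrative (the seed value echoes the comment in the code above):

    large_deletions = {
        'prophage_1': (120000, 165000),            # (start, end) slice coordinates
        'genomic_island_A': (2310000, 2355000),
    }
    simulated = Reads(genome=collected_genome,     # a loaded baga.CollectData.Genome
                      num_individuals=5,           # 5 genomes with + 5 without deletions
                      large_deletions=large_deletions,
                      large_del_padding=1000,
                      random_seed=684651)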
Example 10
    def generateSequences(self):
        '''
        Create full length sequences with generated variants applied to the reference sequence.

        Generated variants are saved to disk.

        If large deletions are present, variant positions are adjusted
        appropriately when applied to the deletion-bearing genomes.
        '''

        save_these = {}
        save_these['SNPs'] = self.SNPs_per_genome
        save_these['InDels'] = self.indel_dict_by_pos_pergenome

        if len(self.large_deletions):
            # generate a version of reference genome with large deletions
            ranges = sorted(self.large_deletions.values())
            genome_large_deletions = self.genome.sequence[:ranges[0][0]]
            for n,(s,e) in enumerate(ranges[:-1]):
                genome_large_deletions.extend(self.genome.sequence[e:ranges[n+1][0]])
            
            # use ranges[-1] (fixed): with a single deletion the loop above never
            # runs, so 'n' would be unbound
            genome_large_deletions.extend(self.genome.sequence[ranges[-1][1]:])
            
            # adjust generated variant positions for the genome with deletions
            def adjust(pos0):
                offset = 0
                for s,e in ranges:
                    if pos0 > e:
                        offset += (e-s)
                return(pos0 - offset)
            
            # adjust the second half of the generated variants
            SNPs_per_genome_adjusted = self.SNPs_per_genome[:self.num_individuals]
            for SNPs in self.SNPs_per_genome[self.num_individuals:]:
                adjusted = []
                for pos0,variant in SNPs:
                    adjusted += [(adjust(pos0),variant)]
                SNPs_per_genome_adjusted += [adjusted]
            
            indel_dict_by_pos_pergenome_adjusted = self.indel_dict_by_pos_pergenome[:self.num_individuals]
            for indels in self.indel_dict_by_pos_pergenome[self.num_individuals:]:
                adjusted = {}
                for pos0,indel in indels.items():
                    adjusted[adjust(pos0)] = indel
                
                indel_dict_by_pos_pergenome_adjusted += [adjusted]
            
            save_these['SNPs_adjusted'] = SNPs_per_genome_adjusted
            save_these['InDels_adjusted'] = indel_dict_by_pos_pergenome_adjusted
            SNPs_per_genome = SNPs_per_genome_adjusted
            indel_dict_by_pos_pergenome = indel_dict_by_pos_pergenome_adjusted
        else:
            SNPs_per_genome = self.SNPs_per_genome
            indel_dict_by_pos_pergenome = self.indel_dict_by_pos_pergenome
            

        # adjusted are needed to apply the variants
        # unadjusted are needed to check the calling
        with open('baga.GemSIM_known_variants.p', 'wb') as f:
            _cPickle.dump(save_these, f)
        # save_these = cPickle.load(open('baga.GemSIM_known_variants.p','r'))


        ### generate genotypes (apply variants) ###
        genotypes = []
        for gn,SNPs in enumerate(SNPs_per_genome):
            if len(self.large_deletions) and gn >= self.num_individuals:
                # use genome with large deletions for second batch
                orig_genome = genome_large_deletions
                genome = _array('c',genome_large_deletions)
            else:
                # else use original
                orig_genome = self.genome.sequence
                genome = _array('c',self.genome.sequence)
            
            # first SNPs
            for pos0,SNP in SNPs:
                assert genome[pos0] != SNP
                genome[pos0] = SNP
            
            # check it worked
            changed = [pos0 for pos0,(new,old) in enumerate(zip(genome,list(orig_genome))) if new != old]
            assert changed == [pos0 for pos0,var in SNPs], 'SNPs were not correctly applied . . .'
            # then indels
            newgenome = _array('c')
            last_pos0 = 0
            for pos0,indel in sorted(indel_dict_by_pos_pergenome[gn].items()):
                if isinstance(indel,str):
                    # insertion
                    newgenome.extend(genome[last_pos0:pos0])
                    newgenome.extend(indel)
                    last_pos0 = pos0
                else:
                    # deletion
                    newgenome.extend(genome[last_pos0:pos0])
                    last_pos0 = pos0 + indel
            
            newgenome.extend(genome[last_pos0:])
            genome_seqrecord = _SeqRecord(_Seq(newgenome.tostring()), 
                    id = self.genome.id+'_sim{:02d}'.format(gn+1), name = '', description = '')
            genotypes += [genome_seqrecord]
            print(len(self.genome.sequence),len(genotypes[-1]),genotypes[-1].id)

        self.genotypes = genotypes
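
The position adjustment in generateSequences deserves a worked instance: with deletions (100, 200) and (500, 650), a variant at original position 700 lies past both, so it shifts left by (200-100) + (650-500) = 250 and lands at 450:

    ranges = [(100, 200), (500, 650)]

    def adjust(pos0):
        # shift a 0-based position left by the total length of deletions before it
        offset = 0
        for s, e in ranges:
            if pos0 > e:
                offset += (e - s)
        return pos0 - offset

    assert adjust(700) == 450   # past both deletions: shifted by 100 + 150
    assert adjust(300) == 200   # past the first deletion only: shifted by 100
    assert adjust(50) == 50     # before both deletions: unchanged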