Exemplo n.º 1
0
 def saveLocal(self, name):
     '''
     Save this object to a local gzip-compressed pickle file.

     The file is written relative to the current working directory and is
     always called 'baga.CollectData.Reads-<name>.baga': 'name' should be
     given without that prefix and without the '.baga' extension because
     both are added here.
     '''
     fileout = 'baga.CollectData.Reads-%s.baga' % name
     print('Saving to %s' % fileout)
     # close the gzip stream explicitly so all compressed data is flushed to
     # disk before returning instead of relying on garbage collection
     with _gzip.open(fileout, 'wb') as fout:
         _cPickle.dump(self, fout)
Exemplo n.º 2
0
 def saveLocal(self, name):
     '''
     Pickle this object into a gzip-compressed file on local disk.

     'name' is only the variable part of the file name: the output is always
     'baga.CollectData.Reads-<name>.baga', created relative to the current
     working directory, so neither the prefix nor the '.baga' extension
     should be included in 'name'.
     '''
     fileout = 'baga.CollectData.Reads-%s.baga' % name
     print('Saving to %s' % fileout)
     # the context manager guarantees the gzip stream is closed (and hence
     # fully written) even if pickling raises part way through
     with _gzip.open(fileout, 'wb') as compressed:
         _cPickle.dump(self, compressed)
Exemplo n.º 3
0
    def generateSequences(self):
        '''
        Create full length sequences with generated variants applied to the reference sequence.

        Generated variants are saved as a pickled dictionary to
        'baga.GemSIM_known_variants.p' (relative to the current working
        directory). It holds the variants at their original positions under
        'SNPs' and 'InDels' and, if large deletions are present, the position
        corrected versions under 'SNPs_adjusted' and 'InDels_adjusted'.

        If large deletions are present, the genomes from index
        self.num_individuals onwards are built from a copy of the reference
        with the deleted ranges removed, and their variant positions are
        corrected to match before being applied.

        The resulting sequence records, one per genome, are stored in
        self.genotypes.
        '''

        save_these = {}
        save_these['SNPs'] = self.SNPs_per_genome
        save_these['InDels'] = self.indel_dict_by_pos_pergenome

        if len(self.large_deletions):
            # generate a version of reference genome with large deletions:
            # keep everything before the first deletion, the stretches
            # between consecutive deletions and everything after the last
            ranges = sorted(self.large_deletions.values())
            genome_large_deletions = self.genome.sequence[:ranges[0][0]]
            for (s, e), (next_s, next_e) in zip(ranges[:-1], ranges[1:]):
                genome_large_deletions.extend(self.genome.sequence[e:next_s])

            # index the last range directly: relying on the loop variable
            # fails with a NameError when there is only a single deletion
            genome_large_deletions.extend(self.genome.sequence[ranges[-1][1]:])

            # adjust generated variant positions for genome with deletions
            def adjust(pos0):
                # 'e' is an exclusive end (sequence[e:] is retained above) so
                # a position equal to 'e' also shifts left by the deletion
                # length.
                # NOTE(review): assumes no variant lies within a deleted
                # range -- confirm against the variant generator
                offset = 0
                for s, e in ranges:
                    if pos0 >= e:
                        offset += (e - s)
                return pos0 - offset

            # adjust the second half of the generated variants
            SNPs_per_genome_adjusted = self.SNPs_per_genome[:self.num_individuals]
            for SNPs in self.SNPs_per_genome[self.num_individuals:]:
                SNPs_per_genome_adjusted += [
                    [(adjust(pos0), variant) for pos0, variant in SNPs]]

            indel_dict_by_pos_pergenome_adjusted = \
                    self.indel_dict_by_pos_pergenome[:self.num_individuals]
            for indels in self.indel_dict_by_pos_pergenome[self.num_individuals:]:
                adjusted = {}
                for pos0, indel in indels.items():
                    adjusted[adjust(pos0)] = indel

                indel_dict_by_pos_pergenome_adjusted += [adjusted]

            save_these['SNPs_adjusted'] = SNPs_per_genome_adjusted
            save_these['InDels_adjusted'] = indel_dict_by_pos_pergenome_adjusted
            SNPs_per_genome = SNPs_per_genome_adjusted
            indel_dict_by_pos_pergenome = indel_dict_by_pos_pergenome_adjusted
        else:
            SNPs_per_genome = self.SNPs_per_genome
            indel_dict_by_pos_pergenome = self.indel_dict_by_pos_pergenome

        # adjusted are needed to apply the variants
        # unadjusted are needed to check the calling
        # binary mode keeps the pickle valid for any protocol and platform;
        # the context manager makes sure it is flushed and closed
        with open('baga.GemSIM_known_variants.p', 'wb') as fout:
            _cPickle.dump(save_these, fout)
        # to reload: open the same file in 'rb' mode and pass it to
        # _cPickle.load()

        ### generate genotypes (apply variants) ###
        genotypes = []
        for gn, SNPs in enumerate(SNPs_per_genome):
            if len(self.large_deletions) and gn >= self.num_individuals:
                # use genome with large deletions for second batch
                orig_genome = genome_large_deletions
            else:
                # else use original
                orig_genome = self.genome.sequence

            # work on a copy so the template sequence is left untouched
            genome = _array('c', orig_genome)

            # first SNPs
            for pos0, SNP in SNPs:
                # a SNP must differ from the character it replaces
                assert genome[pos0] != SNP
                genome[pos0] = SNP

            # check it worked: the positions that now differ from the
            # template must be exactly the SNP positions, in the same order
            changed = [pos0 for pos0, (new, old) in enumerate(zip(genome, orig_genome))
                       if new != old]
            assert changed == [pos0 for pos0, var in SNPs], 'SNPs were not correctly applied . . .'

            # then indels, walking along the genome in position order
            newgenome = _array('c')
            last_pos0 = 0
            for pos0, indel in sorted(indel_dict_by_pos_pergenome[gn].items()):
                # everything up to the indel is copied unchanged
                newgenome.extend(genome[last_pos0:pos0])
                if isinstance(indel, str):
                    # insertion: add the new characters, resume at pos0
                    newgenome.extend(indel)
                    last_pos0 = pos0
                else:
                    # deletion: 'indel' is a length, skip that many characters
                    last_pos0 = pos0 + indel

            newgenome.extend(genome[last_pos0:])
            genome_seqrecord = _SeqRecord(_Seq(newgenome.tostring()),
                    id = self.genome.id+'_sim{:02d}'.format(gn+1), name = '', description = '')
            genotypes += [genome_seqrecord]
            print(len(self.genome.sequence),len(genotypes[-1]),genotypes[-1].id)

        self.genotypes = genotypes
Exemplo n.º 4
0
    def generateSequences(self):
        '''
        Create full length sequences with generated variants applied to the reference sequence.

        Generated variants are saved as a pickled dictionary to
        'baga.GemSIM_known_variants.p' (relative to the current working
        directory): keys 'SNPs' and 'InDels' hold the variants at their
        original positions; when large deletions are present
        'SNPs_adjusted' and 'InDels_adjusted' hold the position corrected
        versions as well.

        If large deletions are present, genomes from index
        self.num_individuals onwards use a copy of the reference with the
        deleted ranges removed and have their variant positions corrected
        accordingly when applied.

        One sequence record per genome is collected into self.genotypes.
        '''

        save_these = {}
        save_these['SNPs'] = self.SNPs_per_genome
        save_these['InDels'] = self.indel_dict_by_pos_pergenome

        if len(self.large_deletions):
            # generate a version of reference genome with large deletions
            ranges = sorted(self.large_deletions.values())
            first_start = ranges[0][0]
            last_end = ranges[-1][1]
            genome_large_deletions = self.genome.sequence[:first_start]
            # retain the stretches lying between consecutive deletions
            for n, (s, e) in enumerate(ranges[:-1]):
                genome_large_deletions.extend(
                    self.genome.sequence[e:ranges[n + 1][0]])

            # the tail is taken from the last range itself rather than via
            # the loop index, which is never bound when there is only one
            # deletion (previously a NameError)
            genome_large_deletions.extend(self.genome.sequence[last_end:])

            # adjust generated variant positions for genome with deletions
            def adjust(pos0):
                # every deletion ending at or before pos0 shifts it left by
                # the deletion length; 'e' is exclusive (sequence[e:] is
                # kept above) so pos0 == e has to be shifted too.
                # NOTE(review): assumes variants never fall inside a deleted
                # range -- confirm against the variant generator
                offset = 0
                for s, e in ranges:
                    if pos0 >= e:
                        offset += (e - s)
                return pos0 - offset

            # adjust the second half of the generated variants
            first_adjusted = self.num_individuals
            SNPs_per_genome_adjusted = self.SNPs_per_genome[:first_adjusted]
            for SNPs in self.SNPs_per_genome[first_adjusted:]:
                adjusted = []
                for pos0, variant in SNPs:
                    adjusted += [(adjust(pos0), variant)]
                SNPs_per_genome_adjusted += [adjusted]

            indel_dict_by_pos_pergenome_adjusted = \
                self.indel_dict_by_pos_pergenome[:first_adjusted]
            for indels in self.indel_dict_by_pos_pergenome[first_adjusted:]:
                adjusted = {}
                for pos0, indel in indels.items():
                    adjusted[adjust(pos0)] = indel

                indel_dict_by_pos_pergenome_adjusted += [adjusted]

            save_these['SNPs_adjusted'] = SNPs_per_genome_adjusted
            save_these[
                'InDels_adjusted'] = indel_dict_by_pos_pergenome_adjusted
            SNPs_per_genome = SNPs_per_genome_adjusted
            indel_dict_by_pos_pergenome = indel_dict_by_pos_pergenome_adjusted
        else:
            SNPs_per_genome = self.SNPs_per_genome
            indel_dict_by_pos_pergenome = self.indel_dict_by_pos_pergenome

        # adjusted are needed to apply the variants
        # unadjusted are needed to check the calling
        # written in binary mode through a context manager so the pickle is
        # valid for any protocol and the file is always closed
        with open('baga.GemSIM_known_variants.p', 'wb') as pickle_out:
            _cPickle.dump(save_these, pickle_out)
        # to reload: open the same file in 'rb' mode and pass it to
        # _cPickle.load()

        ### generate genotypes (apply variants) ###
        genotypes = []
        for gn, SNPs in enumerate(SNPs_per_genome):
            if len(self.large_deletions) and gn >= self.num_individuals:
                # use genome with large deletions for second batch
                orig_genome = genome_large_deletions
                genome = _array('c', genome_large_deletions)
            else:
                # else use original
                orig_genome = self.genome.sequence
                genome = _array('c', self.genome.sequence)

            # first SNPs
            for pos0, SNP in SNPs:
                # a SNP must differ from the character it replaces
                assert genome[pos0] != SNP
                genome[pos0] = SNP

            # check it worked: positions differing from the template must be
            # exactly the SNP positions, in the same order
            changed = [
                pos0
                for pos0, (new, old) in enumerate(zip(genome, orig_genome))
                if new != old
            ]
            assert changed == [pos0 for pos0, var in SNPs
                               ], 'SNPs were not correctly applied . . .'
            # then indels, processed in position order
            newgenome = _array('c')
            last_pos0 = 0
            for pos0, indel in sorted(indel_dict_by_pos_pergenome[gn].items()):
                if isinstance(indel, str):
                    # insertion: copy up to pos0, add the new characters and
                    # resume copying from pos0
                    newgenome.extend(genome[last_pos0:pos0])
                    newgenome.extend(indel)
                    last_pos0 = pos0
                else:
                    # deletion: 'indel' is a length; copy up to pos0 then
                    # skip that many characters
                    newgenome.extend(genome[last_pos0:pos0])
                    last_pos0 = pos0 + indel

            newgenome.extend(genome[last_pos0:])
            genome_seqrecord = _SeqRecord(_Seq(newgenome.tostring()),
                                          id=self.genome.id +
                                          '_sim{:02d}'.format(gn + 1),
                                          name='',
                                          description='')
            genotypes += [genome_seqrecord]
            print(len(self.genome.sequence), len(genotypes[-1]),
                  genotypes[-1].id)

        self.genotypes = genotypes