Example #1
    def __init__(self, reads=False, genome=False, baga=False):
        '''
        Initialise with:
        a baga.PrepareReads.Reads object and,
        a baga.CollectData.Genome object.
        
        OR
        
        a path to a previously saved baga.AlignReads.SAMs object
        (i.e. an object like this one).
        '''

        if (reads and genome) and not baga:
            try:
                self.read_files = reads.trimmed_read_files
            except AttributeError:
                text = 'WARNING: baga was not used to quality-score trim these reads. Read trimming is recommended for most types of analysis. This can be achieved with the "trim()" method of the Reads class in the PrepareReads module.'
                print(text)
                try:
                    self.read_files = reads.adaptorcut_read_files
                except AttributeError:
                    text = 'WARNING: baga was not used to remove library preparation adaptor sequences from these reads. Adaptor removal is highly recommended so hopefully you already removed adaptor sequences! This can be achieved with the "cutAdaptors()" method of the Reads class in the PrepareReads module.'
                    self.read_files = reads.read_files
                    print(text)
                    print('continuing with these reads . . .')

            # baga CollectData currently includes the path to the reads in the pairname
            # keys of the read file pair dict: strip it down to a bare name here.
            # Iterate over a copy because keys are added and removed in the loop.
            for pairname, files in list(self.read_files.items()):
                if _os.path.sep in pairname:
                    self.read_files[pairname.split(_os.path.sep)[-1]] = files
                    del self.read_files[pairname]

            self.genome_sequence = genome.sequence
            self.genome_id = genome.id

        elif baga and not (reads and genome):
            # for reloading a previous instantiation
            with _tarfile.open(baga, "r:gz") as tar:
                for member in tar:
                    contents = _StringIO(tar.extractfile(member).read())
                    try:
                        # either json serialised conventional objects
                        contents = _json.loads(contents.getvalue())
                    except ValueError:
                        #print('json failed: {}'.format(member.name))
                        # or longer python array.array objects
                        contents = _array('c', contents.getvalue())

                    setattr(self, member.name, contents)

        else:
            raise NameError(
                'instantiate baga.AlignReads.SAMs with either loaded baga.PrepareReads.Reads-*.baga and baga.CollectData.Genome-*.baga objects or previous saved alignments (baga.AlignReads.SAMs-*.baga)'
            )
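
The leading-underscore names used throughout these examples (_os, _json, _random, _tarfile, _array, _StringIO) are module-level aliases rather than builtins. The code is from the Python 2 era (the 'c' typecode passed to _array and a StringIO wrapped around raw tar bytes were both dropped in Python 3), so a plausible set of aliases, assumed from the naming convention rather than copied from the baga source, is:

# assumed module-level aliases (Python 2); the leading underscore keeps them
# out of the defining module's public namespace
import os as _os
import json as _json
import random as _random
import tarfile as _tarfile
from array import array as _array
from cStringIO import StringIO as _StringIO  # Python 2 only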
Example #2
        # nested helper defined inside baga class methods: `self` below refers to
        # the instance from the enclosing method's scope, not to a parameter
        def loadFrombaga(local_path):
            with _tarfile.open(local_path, "r:gz") as tar:
                for member in tar:
                    contents = _StringIO(tar.extractfile(member).read())
                    try:
                        # either json serialised conventional objects
                        contents = _json.loads(contents.getvalue())
                    except ValueError:
                        # or longer python array.array objects
                        contents = _array('c', contents.getvalue())

                    setattr(self, member.name, contents)
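
This helper and the baga branch of Example #1 read what is effectively the .baga file format: a gzipped tar archive whose members are named after instance attributes and hold either a JSON-serialised value or the raw bytes of an array.array. The writer side is not shown above, so the sketch below infers it from the loader; it is a minimal, self-contained Python 3 rendering of the pattern, and the function names, attribute names and the 'b' typecode are illustrative assumptions, not baga's API.

import io
import json
import tarfile
from array import array

def save_attributes(attributes, path):
    # write each attribute as one member of a gzipped tar archive
    with tarfile.open(path, "w:gz") as tar:
        for name, value in attributes.items():
            if isinstance(value, array):
                data = value.tobytes()                    # long arrays stored raw
            else:
                data = json.dumps(value).encode("utf-8")  # everything else as JSON
            info = tarfile.TarInfo(name=name)
            info.size = len(data)
            tar.addfile(info, io.BytesIO(data))

def load_attributes(path):
    # mirror of loadFrombaga(): try JSON first, fall back to array.array
    attributes = {}
    with tarfile.open(path, "r:gz") as tar:
        for member in tar:
            raw = tar.extractfile(member).read()
            try:
                attributes[member.name] = json.loads(raw.decode("utf-8"))
            except ValueError:                            # not JSON: raw array bytes
                values = array("b")                       # 'c' typecode is Python 2 only
                values.frombytes(raw)
                attributes[member.name] = values
    return attributes

# round trip with made-up attribute names and values
save_attributes({"genome_id": "example_genome",
                 "depths": array("b", [3, 5, 2, 7])}, "example.baga")
print(load_attributes("example.baga"))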
Example #3
    def __init__(self,
                 genome=False,
                 baga=False,
                 num_individuals=1,
                 large_deletions={},
                 large_del_padding=1000,
                 random_seed=False):
        '''
        Initialise with:
        a baga.CollectData.Genome object.
        
        OR
        
        a path to a previously saved baga.SimulateReads.Reads object
        (i.e. an object like this one).
        
        Large deletions can be included to simulate e.g. missing genomic islands or
        prophage. If large deletions are specified, two sets of genomes are currently
        generated: one with and one without the deletions, each set consisting of
        num_individuals members.
        
        large_deletions should be a dict with arbitrary deletion names as keys and
        (start, end) tuples as values, each delineating a deletion as a python slice.
        
        No variants will be generated within large_del_padding, a 'safe' distance around
        each large deletion outside of which variant calling should be reliable. Small
        simulated deletions may still run over into this zone.
        '''

        if random_seed:
            if not isinstance(random_seed, int):
                raise ValueError('random_seed must be integer')
            _random.seed(random_seed)  # 684651

        if genome and not baga:
            self.genome = genome
        elif baga and not genome:
            # for reloading a previous instantiation
            with _tarfile.open(baga, "r:gz") as tar:
                for member in tar:
                    contents = _StringIO(tar.extractfile(member).read())
                    try:
                        # either json serialised conventional objects
                        contents = _json.loads(contents.getvalue())
                    except ValueError:
                        #print('json failed: {}'.format(member.name))
                        # or longer python array.array objects
                        contents = _array('c', contents.getvalue())

                    setattr(self, member.name, contents)

        else:
            raise NameError(
                'instantiate baga.SimulateReads.Reads with a loaded baga.CollectData.Genome-*.baga object or previously saved baga.SimulateReads.Reads object'
            )

        omit = set()
        for name, (start, end) in large_deletions.items():
            omit.update(
                range(start - large_del_padding, end + large_del_padding))

        samplefrom = sorted(set(range(len(self.genome.sequence))) - omit)
        self.samplefrom = samplefrom

        self.large_deletions = large_deletions
        self.num_individuals = num_individuals

        # to be optionally populated with methods
        self.SNPs_per_genome = []
        self.indel_dict_by_pos_pergenome = []
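
The block above that builds omit and samplefrom is what enforces large_del_padding: every position within the padding distance of a large deletion is excluded from the pool of positions that the variant-generating methods can later draw from. A toy run of the same arithmetic, with a hypothetical 50,000 bp genome and invented deletion coordinates, shows the expected format of large_deletions and the effect of the padding:

# toy numbers only: a hypothetical 50,000 bp genome and invented deletions
genome_length = 50000
large_del_padding = 1000
large_deletions = {                 # arbitrary name -> (start, end) python slice
    'prophage_1': (10000, 18000),
    'island_2':   (30000, 31500),
}

omit = set()
for name, (start, end) in large_deletions.items():
    omit.update(range(start - large_del_padding, end + large_del_padding))

samplefrom = sorted(set(range(genome_length)) - omit)
print(len(samplefrom))              # 36500: the padded zones exclude 10,000 + 3,500 positions

Positions from start - large_del_padding that fall below zero, or from end + large_del_padding that fall beyond the genome, are simply absent from set(range(genome_length)), so the subtraction needs no explicit clipping.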