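These examples are drawn from the baga package, which aliases its module-level imports with a leading underscore. The snippets below assume aliases along the following lines (a sketch: the exact StringIO import used by the original module may differ):

    import os as _os
    import json as _json
    import random as _random
    import tarfile as _tarfile
    from array import array as _array
    from StringIO import StringIO as _StringIO  # Python 2: the snippets use str buffers and the 'c' array typecode
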
Example 1
    def saveLocal(self, name):
        '''
        Save processed SAM file info to a local compressed archive (.baga) file.
        'name' can exclude the extension: .baga will be added.
        '''
        fileout = 'baga.AlignReads.SAMs-{}.baga'.format(name)

        with _tarfile.open(fileout, "w:gz") as tar:
            print('Writing to {} . . . '.format(fileout))
            for att_name, att in self.__dict__.items():
                if isinstance(att, _array):
                    io = _StringIO(att.tostring())
                    io.seek(0, _os.SEEK_END)
                    length = io.tell()
                    io.seek(0)
                    thisone = _tarfile.TarInfo(name=att_name)
                    thisone.size = length
                    tar.addfile(tarinfo=thisone, fileobj=io)
                else:
                    # try to save everything else here as JSON
                    try:
                        io = _StringIO()
                        _json.dump(att, io)
                        io.seek(0, _os.SEEK_END)
                        length = io.tell()
                        io.seek(0)
                        thisone = _tarfile.TarInfo(name=att_name)
                        thisone.size = length
                        tar.addfile(tarinfo=thisone, fileobj=io)
                    except TypeError:
                        # skip attributes that cannot be JSON-serialised, e.g. functions;
                        # unicode, str, list and dict attributes are handled above
                        #print('omitting {}'.format(att_name))
                        pass
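
saveLocal writes a gzipped tar archive with one member per instance attribute, so the result can be inspected with the standard tarfile module alone. A minimal read-back sketch, assuming saveLocal was already called with the hypothetical sample name 'mysample':

    import tarfile

    with tarfile.open('baga.AlignReads.SAMs-mysample.baga', 'r:gz') as tar:
        for member in tar:
            # one member per saved attribute: JSON text or raw array bytes
            print('{}: {} bytes'.format(member.name, member.size))
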
Example 2
    def __init__(self, reads=False, genome=False, baga=False):
        '''
        Initialise with:
        a baga.PrepareReads.Reads object and
        a baga.CollectData.Genome object.
        
        OR
        
        a path to a previously saved baga.AlignReads.SAMs object
        (like this one).
        '''

        if (reads and genome) and not baga:
            try:
                self.read_files = reads.trimmed_read_files
            except AttributeError:
                text = 'WARNING: baga was not used to quality-score trim these reads. Read trimming is recommended for most types of analysis. This can be achieved with the "trim()" method of the Reads class in the PrepareReads module.'
                print(text)
                try:
                    self.read_files = reads.adaptorcut_read_files
                except AttributeError:
                    text = 'WARNING: baga was not used to remove library preparation adaptor sequences from these reads. Adaptor removal is highly recommended so hopefully you already removed adaptor sequences! This can be achieved with the "cutAdaptors()" method of the Reads class in the PrepareReads module.'
                    self.read_files = reads.read_files
                    print(text)
                    print('continuing with these reads . . .')

            # baga CollectData currently includes the path to the reads in the
            # pairname keys that map to the read file pair values; check for and
            # strip the path component here (deleting while iterating is safe in
            # Python 2, where .items() returns a list copy)
            for pairname, files in self.read_files.items():
                if _os.path.sep in pairname:
                    self.read_files[pairname.split(_os.path.sep)[-1]] = files
                    del self.read_files[pairname]

            self.genome_sequence = genome.sequence
            self.genome_id = genome.id

        elif baga and not (reads and genome):
            # for reloading a previous instantiation
            with _tarfile.open(baga, "r:gz") as tar:
                for member in tar:
                    contents = _StringIO(tar.extractfile(member).read())
                    try:
                        # either json serialised conventional objects
                        contents = _json.loads(contents.getvalue())
                    except ValueError:
                        #print('json failed: {}'.format(member.name))
                        # or longer python array.array objects
                        contents = _array('c', contents.getvalue())

                    setattr(self, member.name, contents)

        else:
            raise NameError(
                'instantiate baga.AlignReads.SAMs with either loaded baga.PrepareReads.Reads-*.baga and baga.CollectData.Genome-*.baga objects or previous saved alignments (baga.AlignReads.SAMs-*.baga)'
            )
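
The two mutually exclusive ways of calling this constructor look like the sketch below; the reads, genome and file name are hypothetical placeholders:

    # build a new alignment set from prepared reads plus a reference genome
    sams = SAMs(reads=prepared_reads, genome=reference_genome)

    # or reload a previously saved instance from a .baga archive
    sams = SAMs(baga='baga.AlignReads.SAMs-mysample.baga')
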
Example 3
        def loadFrombaga(local_path):
            # note: 'self' below comes from the enclosing scope, so this helper
            # is expected to be defined inside a method of the loading class
            with _tarfile.open(local_path, "r:gz") as tar:
                for member in tar:
                    contents = _StringIO(tar.extractfile(member).read())
                    try:
                        # either json serialised conventional objects
                        contents = _json.loads(contents.getvalue())
                    except ValueError:
                        # or longer python array.array objects
                        contents = _array('c', contents.getvalue())

                    setattr(self, member.name, contents)
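
The try/except above implements an untagged two-format scheme: each archive member is first treated as JSON, and anything that fails to parse is assumed to be raw array bytes. The same fallback can be exercised on in-memory strings (Python 2, to match the 'c' typecode used above):

    import json
    from array import array

    def decode_member(raw):
        try:
            # JSON-serialised attributes parse cleanly
            return json.loads(raw)
        except ValueError:
            # anything else is assumed to be raw array bytes
            return array('c', raw)

    print(decode_member('{"pairs": 2}'))  # {u'pairs': 2}
    print(decode_member('ACGT'))          # array('c', 'ACGT')
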
Example 4
    def saveLocal(self, name=False):
        '''
        Save a reference genome to a local compressed baga file. This saves 
        Internet bandwidth if downloading from NCBI and time if loading a 
        GenBank file.
        'name' can exclude the extension: .baga will be added.
        A .baga file is mostly Python dictionaries as JSON strings and
        array.array objects packed in a tar.gz archive.
        '''

        if name:
            fileout = 'baga.CollectData.Genome-{}.baga'.format(name)
        else:
            fileout = 'baga.CollectData.Genome-{}.baga'.format(self.id)

        with _tarfile.open(fileout, "w:gz") as tar:
            print('Writing to {} . . . '.format(fileout))
            for att_name, att in self.__dict__.items():
                if isinstance(att, _array):
                    io = _StringIO(att.tostring())
                    io.seek(0, _os.SEEK_END)
                    length = io.tell()
                    io.seek(0)
                    thisone = _tarfile.TarInfo(name=att_name)
                    thisone.size = length
                    tar.addfile(tarinfo=thisone, fileobj=io)
                elif isinstance(att, dict) or isinstance(att, str):
                    # only dicts and strings are expected on genome objects; skip anything else
                    io = _StringIO()
                    _json.dump(att, io)
                    io.seek(0, _os.SEEK_END)
                    length = io.tell()
                    io.seek(0)
                    thisone = _tarfile.TarInfo(name=att_name)
                    thisone.size = length
                    tar.addfile(tarinfo=thisone, fileobj=io)
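
Because string attributes such as 'id' fall into the JSON branch above, a single attribute can be read back out of a saved archive with tarfile and json alone. A minimal sketch; the file name is a hypothetical example:

    import json
    import tarfile

    with tarfile.open('baga.CollectData.Genome-NC_011770.1.baga', 'r:gz') as tar:
        member = tar.getmember('id')
        genome_id = json.loads(tar.extractfile(member).read())
        print(genome_id)
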
Example 5
    def __init__(self,
                 genome=False,
                 baga=False,
                 num_individuals=1,
                 large_deletions={},  # note: mutable default; safe here as it is never mutated
                 large_del_padding=1000,
                 random_seed=False):
        '''
        Initialise with:
        a baga.CollectData.Genome object.
        
        OR
        
        a path to a previously saved baga.SimulateReads.Reads object
        (like this one).
        
        Large deletions can be included to simulate e.g. missing genomic islands or prophage.
        If specified, a set of genomes is currently generated with the large deletions and a 
        set without them.
        
        large_deletions should be a dict with arbitrary deletion names as keys and
        (start, end) tuples as Python slices delineating each deletion. If supplied, a
        set of genomes with and a set without the deletions are generated, each 
        consisting of num_individuals members.
        
        No variants will be generated within large_del_padding: a 'safe' distance around large 
        deletions outside of which variant calling should be reliable. Small deletions might run
        over into this zone.
        '''

        if random_seed:
            if not isinstance(random_seed, int):
                raise ValueError('random_seed must be integer')
            _random.seed(random_seed)  # 684651

        if genome and not baga:
            self.genome = genome
        elif baga and not genome:
            # for reloading a previous instantiation
            with _tarfile.open(baga, "r:gz") as tar:
                for member in tar:
                    contents = _StringIO(tar.extractfile(member).read())
                    try:
                        # either json serialised conventional objects
                        contents = _json.loads(contents.getvalue())
                    except ValueError:
                        #print('json failed: {}'.format(member.name))
                        # or longer python array.array objects
                        contents = _array('c', contents.getvalue())

                    setattr(self, member.name, contents)

        else:
            raise NameError(
                'instantiate baga.SimulateReads.Reads with a loaded baga.CollectData.Genome-*.baga object or previously saved baga.SimulateReads.Reads object'
            )

        omit = set()
        for name, (start, end) in large_deletions.items():
            omit.update(
                range(start - large_del_padding, end + large_del_padding))

        samplefrom = sorted(set(range(len(self.genome.sequence))) - omit)
        self.samplefrom = samplefrom

        self.large_deletions = large_deletions
        self.num_individuals = num_individuals

        # to be optionally populated with methods
        self.SNPs_per_genome = []
        self.indel_dict_by_pos_pergenome = []
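
The omit/samplefrom arithmetic at the end removes every position within large_del_padding of a deletion from the pool that variant positions can later be drawn from. A toy run of the same set algebra with made-up numbers:

    # toy genome of length 50, one deletion over slice (20, 25), padding of 5
    large_deletions = {'island_A': (20, 25)}
    large_del_padding = 5

    omit = set()
    for name, (start, end) in large_deletions.items():
        omit.update(range(start - large_del_padding, end + large_del_padding))

    samplefrom = sorted(set(range(50)) - omit)
    print(samplefrom)  # positions 0-14 and 30-49 remain eligible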