def test_read_dna_from_biostring_order_1():
    """Build a first-order Bioseq from already-parsed sequence records.

    The test checks that the 'sparse' storage mode is rejected with a
    ValueError, and that the default storage mode produces the expected
    dataset length, shape and leading one-hot rows for the bundled
    ``sample.fa`` resource.
    """
    order = 1
    resource_dir = pkg_resources.resource_filename('janggu', 'resources/')
    records = sequences_from_fasta(os.path.join(resource_dir, 'sample.fa'))

    # Arguments shared by the failing and the succeeding construction.
    shared_kwargs = dict(fastafile=records, order=order, cache=False)

    with pytest.raises(ValueError):
        Bioseq.create_from_seq('train', storage='sparse', **shared_kwargs)

    data = Bioseq.create_from_seq('train', **shared_kwargs)

    np.testing.assert_equal(len(data), 3897)
    np.testing.assert_equal(data.shape, (3897, 200, 1, 4))

    # Column index of the single 1 in each of the first ten one-hot rows.
    hot_columns = [1, 0, 1, 0, 2, 1, 0, 2, 0, 2]
    expected_rows = np.eye(4, dtype='int8')[hot_columns]
    np.testing.assert_equal(data[0][0, :10, 0, :], expected_rows)
def test_dna_loading_from_seqrecord(tmpdir):
    """Smoke-test ROI-restricted loading from pre-parsed genome records.

    Only construction is exercised; the test passes as long as
    ``Bioseq.create_from_refgenome`` does not raise.
    """
    # NOTE(review): another function with this exact name is defined later
    # in this source. If both live in the same module, the later definition
    # replaces this one and pytest never runs this body -- confirm and
    # consider giving one of them a distinct name.
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    order = 2

    resource_dir = pkg_resources.resource_filename('janggu', 'resources/')
    roi_file = os.path.join(resource_dir, 'sample.gtf')
    genome_file = os.path.join(resource_dir, 'sample_genome.fa')
    genome_records = sequences_from_fasta(genome_file)

    data = Bioseq.create_from_refgenome(
        'train',
        refgenome=genome_records,
        roi=roi_file,
        storage='ndarray',
        order=order)
def test_dna_loading_from_seqrecord(tmpdir):
    """Check index-, interval- and tuple-based access on a whole-genome Bioseq.

    The dataset is built from pre-parsed genome records with
    ``store_whole_genome=True``. Fetching item 0 must give the same result
    as fetching by the first interval of the genomic indexer, by a
    ``(chrom, start, end)`` tuple, and by the bare ``chrom, start, end``
    subscript.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    order = 2

    resource_dir = pkg_resources.resource_filename('janggu', 'resources/')
    roi_file = os.path.join(resource_dir, 'sample.gtf')
    genome_file = os.path.join(resource_dir, 'sample_genome.fa')
    genome_records = sequences_from_fasta(genome_file)

    data = Bioseq.create_from_refgenome(
        'train',
        refgenome=genome_records,
        roi=roi_file,
        storage='ndarray',
        store_whole_genome=True,
        order=order)

    # Access by integer index versus by the interval object itself.
    np.testing.assert_equal(data[0], data[data.gindexer[0]])

    first_interval = data.gindexer[0]
    chrom = first_interval.chrom
    start = first_interval.start
    end = first_interval.end

    # Access by explicit coordinates, written both ways.
    np.testing.assert_equal(data[0], data[(chrom, start, end)])
    np.testing.assert_equal(data[0], data[chrom, start, end])
def load_sequence(self):
    """Load the sequences and store them with their size information.

    Reads ``self.fastafile`` (a fasta path, or an already-loaded list of
    sequence records), ``self.seqtype``, ``self.gindexer`` and
    ``self.store_whole_genome``. The results are written to ``self.seqs_``
    and ``self.gsize_``.

    When only regions of interest are requested (``store_whole_genome`` is
    false and a genomic indexer is present), one sub-sequence per interval
    is cut out. Parts of an interval that fall outside the reference
    sequence are filled with 'N'.
    """
    print('loading from lazy loader')

    whole_genome = self.store_whole_genome
    regions = self.gindexer

    if isinstance(self.fastafile, str):
        records = sequences_from_fasta(self.fastafile, self.seqtype)
    else:
        # Anything that is not a path is taken to be a list of SeqRecords.
        records = self.fastafile

    if regions is not None and not whole_genome:
        # Only the requested intervals are kept, which keeps the memory
        # overhead low compared with holding the entire reference genome.
        by_id = OrderedDict((record.id, record) for record in records)

        def _extract(interval):
            """Cut one interval out of its chromosome and pad it with 'N'."""
            chrom_seq = by_id[interval.chrom]
            lower = max(interval.start, 0)
            upper = min(interval.end, len(chrom_seq))
            piece = chrom_seq[lower:upper]

            # Left padding for intervals starting before position 0.
            if interval.start < 0:
                piece = 'N' * (-interval.start) + piece

            # Right padding for intervals running past the chromosome end.
            if len(piece) < interval.length:
                piece = piece + 'N' * (interval.length - len(piece))

            piece.id = _iv_to_str(interval.chrom, interval.start,
                                  interval.end)
            piece.name = piece.id
            piece.description = piece.id
            return piece

        records = [_extract(interval) for interval in regions]

    if whole_genome:
        chrom_sizes = OrderedDict((record.id, len(record))
                                  for record in records)
        size_info = GenomicIndexer.create_from_genomesize(chrom_sizes)
    else:
        # Without whole-genome storage the indexer doubles as size info.
        size_info = regions

    self.gsize_ = size_info
    self.seqs_ = records
def create_from_seq(cls, name,  # pylint: disable=too-many-locals
                    fastafile,
                    storage='ndarray',
                    seqtype='dna',
                    order=1,
                    fixedlen=None,
                    datatags=None,
                    cache=False,
                    channel_last=True,
                    overwrite=False):
    """Create a Bioseq class from biological sequences.

    This constructor loads a set of nucleotide or amino acid sequences.
    By default, the sequences are assumed to be of equal length.
    Alternatively, sequences can be truncated and padded to a fixed length.

    Parameters
    -----------
    name : str
        Name of the dataset
    fastafile : str or list(str) or list(Bio.SeqRecord)
        Fasta file or list of fasta files from which the sequences
        are loaded or a list of Bio.SeqRecord.SeqRecord.
    storage : str
        Storage mode for storing the sequence may be 'ndarray' or 'hdf5'.
        Default: 'ndarray'.
    seqtype : str
        Indicates whether a nucleotide or peptide sequence is loaded
        using 'dna' or 'protein' respectively. Default: 'dna'.
    order : int
        Order for the one-hot representation. Default: 1.
    fixedlen : int or None
        Forces the sequences to be of equal length by truncation or
        zero-padding. If set to None, it will be assumed that the sequences
        are already of equal length. An exception is raised if this is
        not the case. Default: None.
    datatags : list(str) or None
        List of datatags. Together with the dataset name,
        the datatags are used to construct a cache file.
        If :code:`cache=False`, this option does not have an effect.
        Default: None.
    cache : boolean
        Indicates whether to cache the dataset. Default: False.
    channel_last : boolean
        Forwarded unchanged to the class constructor. Default: True.
    overwrite : boolean
        Overwrite the cachefiles. Default: False.

    Raises
    ------
    ValueError
        If `storage` is not 'ndarray' or 'hdf5', if `fastafile` is empty,
        or if no sequence could be loaded from it.
    AssertionError
        If the sequences differ in length or if sequence IDs are not unique.
    """
    if storage not in ['ndarray', 'hdf5']:
        raise ValueError(
            'Available storage options for Bioseq are: ndarray or hdf5')

    if isinstance(fastafile, str):
        fastafile = [fastafile]

    # Fail with a clear message instead of an IndexError from fastafile[0].
    if len(fastafile) == 0:
        raise ValueError(
            'fastafile must contain at least one fasta file '
            'or sequence record.')

    if isinstance(fastafile[0], Bio.SeqRecord.SeqRecord):
        # This is already a list of SeqRecords
        seqs = fastafile
    else:
        seqs = []
        for fasta in fastafile:
            # += is necessary since sequences_from_fasta
            # returns a list
            seqs += sequences_from_fasta(fasta, seqtype)

    if fixedlen is not None:
        seqs = sequence_padding(seqs, fixedlen)

    # A fasta file without records would otherwise fail later on lens[0].
    if len(seqs) == 0:
        raise ValueError('No sequences were loaded from fastafile.')

    # Check if sequences are equally long.
    # The checks below are raised explicitly (rather than via `assert`) so
    # that they are still enforced when Python runs with -O; the exception
    # type and messages match the former assert statements.
    lens = [len(seq) for seq in seqs]
    if any(length != lens[0] for length in lens):
        raise AssertionError("Input sequences must be of equal length.")

    # Chromnames are required to be Unique
    chroms = [seq.id for seq in seqs]
    if len(set(chroms)) != len(seqs):
        raise AssertionError("Sequence IDs must be unique.")

    # now mimic a dataframe representing a bed file
    reglen = lens[0]
    flank = 0
    stepsize = 1

    gindexer = GenomicIndexer(reglen, stepsize, flank, zero_padding=False)
    for chrom in chroms:
        gindexer.add_interval(chrom, 0, reglen, '.')

    garray = cls._make_genomic_array(name, gindexer, seqs, order, storage,
                                     cache=cache, datatags=datatags,
                                     overwrite=overwrite,
                                     store_whole_genome=False)

    return cls(name, garray, gindexer,
               alphabet=seqs[0].seq.alphabet.letters,
               channel_last=channel_last)
def create_from_refgenome(cls, name, refgenome, roi=None,
                          binsize=None, stepsize=None,
                          flank=0, order=1,
                          storage='ndarray',
                          datatags=None,
                          cache=False,
                          overwrite=False,
                          channel_last=True,
                          store_whole_genome=False):
    """Create a Bioseq class from a reference genome.

    Nucleotide sequences are taken from a reference genome. If regions of
    interest (ROI) are given and `store_whole_genome` is False, only the
    sequences of those regions are kept; otherwise the entire genome is
    used.

    Parameters
    -----------
    name : str
        Name of the dataset
    refgenome : str or list(Bio.SeqRecord)
        Fasta file, or an already-loaded list of sequence records.
    roi : str or None
        Bed-file defining the region of interest.
        If set to None, no genomic indexer is created, which is only
        permitted together with `store_whole_genome=True`.
    binsize : int or None
        Binsize in basepairs. For binsize=None,
        the binsize will be determined from the bed-file directly
        which requires that all intervals in the bed-file are of equal
        length. Otherwise, the intervals in the bed-file will be
        split to subintervals of length binsize in conjunction with
        stepsize. Default: None.
    stepsize : int or None
        stepsize in basepairs for traversing the genome.
        If stepsize is None, it will be set equal to binsize.
        Default: None.
    flank : int
        Flanking region in basepairs to be extended up and downstream of
        each interval. Default: 0.
    order : int
        Order for the one-hot representation. Default: 1.
    storage : str
        Storage mode for the sequence array, e.g. 'ndarray', 'hdf5' or
        'sparse'. The value is not validated here but forwarded
        unchanged. Default: 'ndarray'.
    datatags : list(str) or None
        List of datatags. Together with the dataset name,
        the datatags are used to construct a cache file.
        If :code:`cache=False`, this option does not have an effect.
        Default: None.
    cache : boolean
        Indicates whether to cache the dataset. Default: False.
    overwrite : boolean
        Overwrite the cachefiles. Default: False.
    channel_last : boolean
        Forwarded unchanged to the class constructor. Default: True.
    store_whole_genome : boolean
        Indicates whether the whole genome or only ROI
        should be loaded. If False, a bed-file with regions of
        interest must be specified. Default: False.

    Raises
    ------
    ValueError
        If neither `roi` is supplied nor `store_whole_genome` is True.
    """
    # An indexer only exists when regions of interest were supplied.
    gindexer = (GenomicIndexer.create_from_file(roi, binsize,
                                                stepsize, flank)
                if roi is not None else None)

    roi_only = not store_whole_genome
    if roi_only and gindexer is None:
        raise ValueError('Either roi must be supplied or store_whole_genome must be True')

    if isinstance(refgenome, str):
        seqs = sequences_from_fasta(refgenome, 'dna')
    else:
        # Anything that is not a path is taken to be a list of SeqRecords.
        seqs = refgenome

    # After the guard above, roi_only implies that an indexer is present.
    if roi_only:
        # Keep only the requested intervals to limit the memory overhead;
        # otherwise the entire reference genome stays loaded.
        chrom_lookup = dict((record.id, record) for record in seqs)

        def _region_record(interval):
            """Slice one interval and label it with its coordinates."""
            piece = chrom_lookup[interval.chrom][interval.start:interval.end]
            # The end coordinate in the label is shortened by order - 1.
            piece.id = _iv_to_str(interval.chrom, interval.start,
                                  interval.end - order + 1)
            piece.name = piece.id
            piece.description = piece.id
            return piece

        seqs = [_region_record(interval) for interval in gindexer]

    garray = cls._make_genomic_array(name, seqs, order, storage,
                                     datatags=datatags,
                                     cache=cache,
                                     overwrite=overwrite,
                                     store_whole_genome=store_whole_genome)

    alphabet_size = len(seqs[0].seq.alphabet.letters)
    return cls(name, garray, gindexer,
               alphabetsize=alphabet_size,
               channel_last=channel_last)
def _make_genomic_array(name, fastafile, order, storage, seqtype,
                        cache=True, datatags=None,
                        overwrite=False, store_whole_genome=True):
    """Create a genomic array of sequence indices or reload an existing one.

    Parameters
    ----------
    name : str
        Dataset name; used as the first datatag.
    fastafile : str or list(str) or list(Bio.SeqRecord)
        Fasta file, list of fasta files, or already-loaded sequence records.
    order : int
        Order of the sequence representation. For order > 1, neighbouring
        indices are combined into a single higher-order index.
    storage : str
        Storage mode, forwarded to ``create_genomic_array``.
    seqtype : str
        Sequence type handed to ``sequences_from_fasta``; only used when
        `fastafile` holds file names.
    cache, datatags, overwrite, store_whole_genome
        Forwarded to ``create_genomic_array``. `datatags` is extended with
        the dataset name in front and an ``order<N>`` tag at the end.

    Returns
    -------
    The object returned by ``create_genomic_array``.
    """
    # always use int 16 to store bioseq indices
    # do not use int8 at the moment, because 'N' is encoded
    # as -1024, which causes an underflow with int8.
    dtype = 'int16'

    # Load sequences from refgenome
    if isinstance(fastafile, str):
        fastafile = [fastafile]

    if isinstance(fastafile[0], Bio.SeqRecord.SeqRecord):
        # This is already a list of SeqRecords
        seqs = fastafile
    else:
        seqs = []
        for fasta in fastafile:
            # sequences_from_fasta returns a list, hence +=
            seqs += sequences_from_fasta(fasta, seqtype)

    # Number of index positions per sequence: a window of `order` letters
    # fits len(seq) - order + 1 times.
    chromlens = {seq.id: len(seq) - order + 1 for seq in seqs}

    def _seq_loader(cover, seqs, order):
        """Fill `cover` with the index representation of every sequence."""
        print('Convert sequences to index array')
        for seq in seqs:
            if cover._full_genome_stored:
                interval = GenomicInterval(seq.id, 0,
                                           len(seq) - order + 1, '.')
            else:
                # The record id carries the interval coordinates.
                coords = _str_to_iv(seq.id, template_extension=0)
                interval = GenomicInterval(*coords)

            indarray = np.asarray(seq2ind(seq), dtype=dtype)

            if order > 1:
                # for higher order motifs, this part is used
                alphabet_size = len(seq.seq.alphabet.letters)
                filter_ = np.asarray([alphabet_size ** power
                                      for power in range(order)])
                indarray = np.convolve(indarray, filter_, mode='valid')

            cover[interval, 0] = indarray

    # At the moment, we treat the information contained
    # in each bw-file as unstranded
    if datatags:
        tags = [name] + datatags
    else:
        tags = [name]
    tags.append('order{}'.format(order))

    cover = create_genomic_array(chromlens, stranded=False,
                                 storage=storage,
                                 datatags=tags,
                                 cache=cache,
                                 store_whole_genome=store_whole_genome,
                                 order=order,
                                 conditions=['idx'],
                                 overwrite=overwrite,
                                 typecode=dtype,
                                 loader=_seq_loader,
                                 loader_args=(seqs, order))

    return cover