def load(datadir, create_session=None):
    '''
    nr_loaded = load(datadir, create_session={backend.create_session})

    Load ENSEMBL FASTA file into database

    For every FASTA entry, a gene-id -> peptide-id ``Translation`` and an
    ``EnsemblSequence`` holding the amino-acid sequence are added to the
    session, which is committed after each entry.

    Parameters
    ----------
    datadir : str
        Directory containing the FASTA file. The file is located with the
        pattern ``Mus_musculus.*.pep.all.fa.gz``; if several files match,
        the first one returned by ``glob`` is used.
    create_session : callable, optional
        a callable object that returns an sqlalchemy session

    Returns
    -------
    nr_loaded : integer
        Nr. of entries loaded

    Raises
    ------
    IOError
        If no file in `datadir` matches the expected pattern.
    ValueError
        If the fourth whitespace-separated token of a FASTA header is
        missing or does not start with ``gene:``.
    '''
    # Locate the input before touching the database, so that a missing file
    # fails fast with a message that names what was searched for.
    pattern = path.join(datadir, 'Mus_musculus.*.pep.all.fa.gz')
    matches = glob.glob(pattern)
    if not matches:
        raise IOError(
            'waldo.sequences.load: no file matching %r' % pattern)
    filename = matches[0]

    from waldo.backend import call_create_session
    session = call_create_session(create_session)

    gene_prefix = 'gene:'
    nr_loaded = 0
    for entry in fasta.read(filename):
        htokens = entry.header.split()
        # Explicit check instead of `assert`: assertions are removed under
        # `python -O`, which would let a truncated, bogus gene id be stored.
        if len(htokens) < 4 or not htokens[3].startswith(gene_prefix):
            raise ValueError(
                'waldo.sequences.load: no gene id in header %r'
                % (entry.header,))
        peptide = htokens[0]
        gene = htokens[3][len(gene_prefix):]
        session.add(
            Translation(
                'ensembl:gene_id',
                gene,
                'ensembl:peptide_id',
                peptide))
        session.add(models.EnsemblSequence(peptide, entry.sequence))
        session.commit()
        nr_loaded += 1
    return nr_loaded
def test_read():
    '''Check that fasta.read parses the sample file into clean records.

    The sample file is expected to hold two entries; the header of the
    second one must mention its gene id and must have been stripped of the
    leading '>' marker and the trailing newline.
    '''
    seqs = list(fasta.read(path_to_testfile('test.fasta')))
    assert len(seqs) == 2
    header = seqs[1].header
    # Membership test rather than str.find(): find() returns -1 (truthy) when
    # the text is absent and 0 (falsy) when it sits at the very start, so
    # asserting on its result checks nothing useful.
    assert 'gene:ENSMUSG00000064345' in header
    assert header[-1] != '\n'
    assert header[0] != '>'