def detect_format(config):
    """Detect the sequence format of ``config['filename']`` and record it.

    Does nothing when ``config`` already has a ``'format'`` key. Otherwise
    the file is classified by its extension or by its first five characters:

    * ``.gb`` / ``.gbk`` or a header starting with ``LOCUS`` -> ``'genbank'``
    * ``.fna`` / ``.fasta`` or a header starting with ``>``   -> ``'fasta'``
    * anything else                                           -> ``'raw'``

    On success ``config`` is updated in place with ``'format'``,
    ``'length'`` and ``'ddna'`` (plus ``'id'`` and ``'description'`` for
    genbank and fasta input). The upper-cased sequence is written to the
    ``ddna`` file unless that file already exists.

    On any ordinary error the exception is logged and ``config['format']``
    is set to ``None``; keys written before the failure are left in place.
    ``KeyboardInterrupt`` and ``SystemExit`` are allowed to propagate.

    :param config: mutable mapping; must contain ``'filename'``.
    :returns: ``None`` -- results are stored in ``config``.
    """
    if 'format' in config:
        return

    try:
        # NOTE(review): ``path`` is presumably a path.py-style object
        # (it provides .ext, .open() and .getmtime()) -- confirm.
        filename = path(config['filename'])
        with filename.open('r') as f:
            header = f.read(5)

        if filename.ext in ('.gb', '.gbk') or header.startswith('LOCUS'):
            log.debug("Attempting %s as genbank", filename)
            seqrec = genbank.parse_seq_rec(config['filename'])
            config['format'] = 'genbank'
            config['id'] = seqrec.id
            config['description'] = seqrec.description
            seq = str(seqrec.seq)
        elif filename.ext in ('.fna', '.fasta') or header.startswith('>'):
            seqrec = SeqIO.read(filename, 'fasta')
            config['format'] = 'fasta'
            config['id'] = seqrec.id
            config['description'] = seqrec.description
            seq = str(seqrec.seq)
        else:
            # Anything unrecognised is treated as a bare sequence: read the
            # whole file and strip every whitespace character.
            with filename.open('r') as f:
                seq = f.read()
            seq = re.sub(r'\s', '', seq)
            config['format'] = 'raw'

        config['length'] = len(seq)

        # The output name is derived from the config and the source file's
        # modification time; an existing file is reused rather than rewritten.
        ddna = derive_filename(config, filename.getmtime(), 'ddna')
        if not ddna.exists():
            # presumably writes to a temp file and renames it into place --
            # verify against mkstemp_rename's definition
            with mkstemp_rename(ddna) as f:
                f.write(seq.upper())
        config['ddna'] = ddna
    except Exception:
        # Best-effort detection: record the failure instead of raising, but
        # do not swallow KeyboardInterrupt / SystemExit like a bare except.
        log.exception("Error detecting format")
        config['format'] = None
def test_genbank(gbkfile):
    """Check that ``genbank.parse_seq_rec`` yields a non-empty record.

    :param gbkfile: passed straight through to ``genbank.parse_seq_rec``
        (presumably a GenBank file path or fixture -- confirm with caller).
    """
    record = genbank.parse_seq_rec(gbkfile)

    # The parsed record must be truthy and have a positive length.
    assert record
    assert len(record) > 0