예제 #1
0
파일: parsing.py 프로젝트: victor-lin/npact
def detect_format(config):
    """Detect the format of the sequence file and record it in ``config``.

    Does nothing when ``config`` already contains a ``'format'`` key.
    Otherwise the file named by ``config['filename']`` is classified, by its
    extension or by its first five characters, as one of:

    * ``'genbank'`` -- extension ``.gb``/``.gbk`` or content starting with
      ``LOCUS``; ``'id'`` and ``'description'`` are copied from the record.
    * ``'fasta'`` -- extension ``.fna``/``.fasta`` or content starting with
      ``>``; ``'id'`` and ``'description'`` are copied from the record.
    * ``'raw'`` -- anything else; the whole file with whitespace removed is
      taken as the sequence.

    On success ``config`` also receives ``'length'`` (sequence length) and
    ``'ddna'`` (the value returned by ``derive_filename``; the upper-cased
    sequence is written there if it does not exist yet).

    Any ordinary error is logged and ``config['format']`` is set to ``None``;
    keys assigned before the failure are left in place.  The function
    mutates ``config`` and returns ``None``.
    """
    if 'format' in config:
        return

    try:
        filename = path(config['filename'])
        # Peek at the first few characters so files with unknown extensions
        # can still be recognised by their content.
        with filename.open('r') as f:
            header = f.read(5)

        if filename.ext in ('.gb', '.gbk') or header.startswith('LOCUS'):
            log.debug("Attempting %s as genbank", filename)
            seqrec = genbank.parse_seq_rec(config['filename'])
            config['format'] = 'genbank'
            config['id'] = seqrec.id
            config['description'] = seqrec.description
            seq = str(seqrec.seq)
        elif filename.ext in ('.fna', '.fasta') or header.startswith('>'):
            seqrec = SeqIO.read(filename, 'fasta')
            config['format'] = 'fasta'
            config['id'] = seqrec.id
            config['description'] = seqrec.description
            seq = str(seqrec.seq)
        else:
            with filename.open('r') as f:
                seq = f.read()
            # Raw string: '\s' in a plain literal is an invalid escape.
            seq = re.sub(r'\s', '', seq)
            config['format'] = 'raw'

        config['length'] = len(seq)
        # NOTE(review): derive_filename presumably keys the derived file on
        # the source mtime so it is regenerated when the input changes --
        # confirm against its definition.
        ddna = derive_filename(config, filename.getmtime(), 'ddna')
        if not ddna.exists():
            with mkstemp_rename(ddna) as f:
                f.write(seq.upper())
        config['ddna'] = ddna
    except Exception:
        # Best-effort detection: report the problem and mark the format as
        # unknown.  KeyboardInterrupt/SystemExit are deliberately not caught.
        log.exception("Error detecting format")
        config['format'] = None
def test_genbank(gbkfile):
    """Parsing ``gbkfile`` must yield a truthy record of non-zero length."""
    record = genbank.parse_seq_rec(gbkfile)
    assert record
    assert len(record) > 0