Пример #1
0
def runExonerate(input):
    """Run exonerate (protein2genome) for one protein against one scaffold window.

    ``input`` is a ':::'-delimited string whose first four fields are the
    protein ID, the scaffold ID, and the integer start and end of the hit
    region on that scaffold.

    The function reads the module-level ``tmpdir``, ``protein_dict`` and
    ``args.maxintron``. It writes the protein to a query FASTA and the hit
    region padded by 3000 positions on each side to a scaffold FASTA, both
    in ``tmpdir``, then runs exonerate with its stdout redirected to
    ``tmpdir/exonerate.<ProtID>.<ScaffID>__<start>__.out``.

    If exonerate's stderr contains 'WARNING', the two input files are moved
    to ``tmpdir/failed``; otherwise they are removed. An output file smaller
    than 500 bytes is deleted. Nothing is returned.

    Raises IndexError / ValueError if ``input`` has fewer than four fields or
    the coordinates are not integers.
    """
    s = input.split(':::')
    ProtID = s[0]
    ScaffID = s[1]
    ScaffStart = int(s[2])
    ScaffEnd = int(s[3])
    #get the protein model
    query = os.path.join(tmpdir, ProtID + '.' + str(os.getpid()) + '.fa')
    with open(query, 'w') as output:
        SeqIO.write(protein_dict[ProtID], output, 'fasta')
    #now get the genome region, use different variable names for SeqRecords to avoid collision
    scaffold = os.path.join(
        tmpdir, ScaffID + '.' + ProtID + '.' + str(ScaffStart) + '-' +
        str(ScaffEnd) + '.fa')
    #grab a 3 kb cushion on either side of hit region, careful of scaffold ends
    #the window start does not depend on the record, so compute it once; it is
    #also embedded in the exonerate output file name below
    start = max(ScaffStart - 3000, 1)
    #fallbacks so the names used after the loop exist even when the scaffold
    #file yields no FASTA record (previously an UnboundLocalError)
    header, Sequence, end = ScaffID, '', 0
    with open(scaffold, 'w') as output2:
        #plain 'r': the 'U' mode flag was removed in Python 3.11
        with open(os.path.join(tmpdir, 'scaffolds', ScaffID + '.fa'),
                  'r') as fullscaff:
            for header, Sequence in SimpleFastaParser(fullscaff):
                end = min(ScaffEnd + 3000, len(Sequence))
                output2.write('>%s\n%s\n' % (header, Sequence[start:end]))
    exoname = ProtID + '.' + ScaffID + '__' + str(start) + '__'
    #check that input files are created and valid
    exonerate_out = os.path.join(tmpdir, 'exonerate.' + exoname + '.out')
    ryo = "AveragePercentIdentity: %pi\n"
    cmd = [
        'exonerate', '--model', 'p2g', '--showvulgar', 'no', '--showalignment',
        'no', '--showquerygff', 'no', '--showtargetgff', 'yes', '--maxintron',
        str(args.maxintron), '--percent', '80', '--ryo', ryo, query, scaffold
    ]
    if lib.checkannotations(query) and lib.checkannotations(scaffold):
        #run exonerate, capture errors
        with open(exonerate_out, 'w') as output3:
            proc = subprocess.Popen(cmd,
                                    stdout=output3,
                                    stderr=subprocess.PIPE)
            #wait for exonerate while its output handle is still open
            errors = proc.communicate()[1]
        #communicate() returns bytes on Python 3; decode before the text test
        if isinstance(errors, bytes):
            errors = errors.decode('utf-8', 'replace')
        if 'WARNING' in errors:
            lib.log.debug('Error in input:{:}'.format(input))
            lib.log.debug(
                '%s, Len=%i, %i-%i; %i-%i' %
                (header, len(Sequence), ScaffStart, ScaffEnd, start, end))
            #keep the offending inputs for inspection
            os.rename(query,
                      os.path.join(tmpdir, 'failed', os.path.basename(query)))
            os.rename(
                scaffold,
                os.path.join(tmpdir, 'failed', os.path.basename(scaffold)))
        else:
            for y in [query, scaffold]:
                try:
                    lib.SafeRemove(y)
                except OSError:
                    lib.log.debug("Error removing %s" % (y))
        #check filesize of exonerate output, no hits still have some output data in them, should be safe dropping anything smaller than 500 bytes
        if lib.getSize(exonerate_out) < 500:
            os.remove(exonerate_out)
    else:
        lib.log.debug('Error in query or scaffold:{:}'.format(input))
        lib.SafeRemove(query)
        lib.SafeRemove(scaffold)
Пример #2
0
                        output.write('\t'.join(cols))

#convert to GFF3 using ExoConverter from EVM; the converter's stdout becomes
#the final output file and its stderr is sent to FNULL
with open(args.out, 'w') as output:
    subprocess.call([ExoConverter, exonerate_raw], stdout=output, stderr=FNULL)

#output some quick summary of exonerate alignments that you found
Found = lib.countGFFgenes(exonerate_raw)
lib.log.info('Exonerate finished: found {:,} alignments'.format(Found))

#check for saving output of tblastn
if args.tblastn_out:
    shutil.copyfile(BlastResult, args.tblastn_out)

#finally clean-up your mess if failed is empty
if args.debug:
    #os.rmdir only succeeds on an empty directory, so an OSError here is
    #treated as "failed alignments were kept"
    try:
        os.rmdir(os.path.join(tmpdir, 'failed'))
        empty = True
    except OSError:
        empty = False
    if empty:
        lib.SafeRemove(tmpdir)
    else:
        lib.log.error("Failed exonerate alignments found, see files in %s" %
                      os.path.join(tmpdir, 'failed'))
else:
    if os.path.isdir(tmpdir):
        lib.SafeRemove(tmpdir)
#reaching this point means the run completed, so report success; the previous
#exit status of 1 signalled failure to the calling process
sys.exit(0)
Пример #3
0
#scratch directory that tbl2asn is run from
tmp = outputname + '_tmp'
if not os.path.exists(tmp):
    os.makedirs(tmp)

#both inputs must pass lib.checkannotations before anything is staged;
#the first one that does not is reported and the script exits with status 1
for _path, _complaint in (
    (args.fasta, 'FASTA genome file not found: {:}'),
    (args.tbl, 'TBL annotations file not found: {:}'),
):
    if not lib.checkannotations(_path):
        print(_complaint.format(_path))
        sys.exit(1)

#stage the inputs under the fixed names used inside the scratch directory
for _path, _staged in ((args.fasta, 'genome.fsa'), (args.tbl, 'genome.tbl')):
    shutil.copyfile(_path, os.path.join(tmp, _staged))

#now we can run tbl2asn; fall back to the bundled template when no
#--sbt file was supplied
SBT = args.sbt if args.sbt else os.path.join(parentdir, 'lib', 'test.sbt')
discrep = outputname + '.discrepency.txt'
version = 1
tbl2asn_cmd = runtbl2asn(tmp, SBT, discrep, organism, args.isolate,
                         args.strain, args.tbl2asn, version)

#copy the GenBank flat file out of the scratch directory, then discard it
gbkout = outputname + '.gbk'
shutil.copyfile(os.path.join(tmp, 'genome.gbf'), gbkout)
lib.SafeRemove(tmp)