示例#1
0
def glimmerHMM(tool_dir, fasta_fpath, out_fpath, gene_lengths, err_path,
               tmp_dir, index):
    """Run GlimmerHMM on each contig of a FASTA file and summarize the genes.

    Every record read from ``fasta_fpath`` is written to its own FASTA file
    inside a fresh temporary directory created under ``tmp_dir``, and the
    ``glimmerhmm`` executable is run on it.  The GFF files of the runs that
    returned 0 are merged into ``out_fpath + '_genes.gff'``, which is then
    parsed to extract and count the predicted gene sequences.

    Args:
        tool_dir: Directory holding the ``glimmerhmm`` executable and its
            ``trained/arabidopsis`` model directory.
        fasta_fpath: Path of the input FASTA file.
        out_fpath: Path prefix for the outputs (``_genes.gff`` and, when
            ``OUTPUT_FASTA`` is truthy, ``_genes.fasta`` are appended).
        gene_lengths: Length thresholds; one counter is kept per threshold.
        err_path: File that stdout and stderr of every run are appended to.
        tmp_dir: Parent directory for the temporary working directory.
        index: Passed to ``qutils.index_to_str`` to build the log indent.

    Returns:
        ``(out_gff_path, unique_count, total, cnt)`` where ``unique_count`` is
        the number of distinct gene sequences, ``total`` the number of parsed
        genes and ``cnt[i]`` the number of genes with
        ``end - start > gene_lengths[i]``.  ``(None, None, None, None)`` when
        no run returned 0.
    """
    tool_exec = os.path.join(tool_dir, 'glimmerhmm')

    # Note: why arabidopsis? for no particular reason, really.
    trained_dir = os.path.join(tool_dir, 'trained', 'arabidopsis')

    def run(contig_path, tmp_path):
        # Both output streams of the tool go to the shared error log.
        with open(err_path, 'a') as err_file:
            return qutils.call_subprocess(
                [tool_exec, contig_path, '-d', trained_dir, '-g', '-o',
                 tmp_path],
                stdout=err_file,
                stderr=err_file,
                indent='  ' + qutils.index_to_str(index) + '  ')

    contigs = {}
    gffs = []
    base_dir = tempfile.mkdtemp(dir=tmp_dir)
    try:
        for ind, seq in read_fasta(fasta_fpath):
            # The contig name is used as a file name below, so '/', '.' and
            # ' ' are replaced by '_'.
            ind = re.sub('[/. ]', '_', ind)
            contig_path = os.path.join(base_dir, ind + '.fasta')
            gff_path = os.path.join(base_dir, ind + '.gff')

            write_fasta(contig_path, [(ind, seq)])
            if run(contig_path, gff_path) == 0:
                gffs.append(gff_path)
                contigs[ind] = seq

        if not gffs:
            return None, None, None, None

        out_gff_path = merge_gffs(gffs, out_fpath + '_genes.gff')
        unique, total = set(), 0
        genes = []
        cnt = [0] * len(gene_lengths)
        for contig, gene_id, start, end, strand in parse_gff(out_gff_path):
            total += 1
            # NOTE(review): the slice bounds rely on the coordinate
            # convention of parse_gff -- confirm against its implementation.
            gene_seq = contigs[contig][start:end + 1]
            if strand != '+':
                gene_seq = rev_comp(gene_seq)
            unique.add(gene_seq)
            genes.append((gene_id, gene_seq))
            for idx, gene_length in enumerate(gene_lengths):
                # A bool is added as 0 or 1.
                cnt[idx] += end - start > gene_length

        if OUTPUT_FASTA:
            write_fasta(out_fpath + '_genes.fasta', genes)

        return out_gff_path, len(unique), total, cnt
    finally:
        # mkdtemp() leaves deletion to the caller; clean up on every exit
        # path (including the early return and errors) unless debugging.
        if not qconfig.debug:
            shutil.rmtree(base_dir)
示例#2
0
def glimmerHMM(tool_dir, fasta_fpath, out_fpath, gene_lengths, err_path, tmp_dir, index):
    """Run GlimmerHMM on each contig of a FASTA file and summarize the genes.

    Every record read from ``fasta_fpath`` is written to its own FASTA file
    inside a fresh temporary directory created under ``tmp_dir``, and the
    ``glimmerhmm`` executable is run on it.  The GFF files of the runs that
    returned 0 are merged into ``out_fpath + '_genes.gff'``, which is then
    parsed to extract and count the predicted gene sequences.

    Args:
        tool_dir: Directory holding the ``glimmerhmm`` executable and its
            ``trained/arabidopsis`` model directory.
        fasta_fpath: Path of the input FASTA file.
        out_fpath: Path prefix for the outputs (``_genes.gff`` and, when
            ``OUTPUT_FASTA`` is truthy, ``_genes.fasta`` are appended).
        gene_lengths: Length thresholds; one counter is kept per threshold.
        err_path: File that stdout and stderr of every run are appended to.
        tmp_dir: Parent directory for the temporary working directory.
        index: Passed to ``qutils.index_to_str`` to build the log indent.

    Returns:
        ``(out_gff_path, unique_count, total, cnt)`` where ``unique_count`` is
        the number of distinct gene sequences, ``total`` the number of parsed
        genes and ``cnt[i]`` the number of genes with
        ``end - start > gene_lengths[i]``.  ``(None, None, None, None)``,
        after logging an error, when no run returned 0.
    """
    tool_exec = os.path.join(tool_dir, 'glimmerhmm')

    # Note: why arabidopsis? for no particular reason, really.
    trained_dir = os.path.join(tool_dir, 'trained', 'arabidopsis')

    def run(contig_path, tmp_path):
        # Both output streams of the tool go to the shared error log.
        with open(err_path, 'a') as err_file:
            return qutils.call_subprocess(
                [tool_exec, contig_path, '-d', trained_dir, '-g', '-o', tmp_path],
                stdout=err_file,
                stderr=err_file,
                indent='  ' + qutils.index_to_str(index) + '  ')

    contigs = {}
    gffs = []
    base_dir = tempfile.mkdtemp(dir=tmp_dir)
    try:
        for ind, seq in read_fasta(fasta_fpath):
            # NOTE(review): the record name is used verbatim as a file name;
            # a name containing a path separator would not resolve to a file
            # directly inside base_dir -- confirm names are path-safe.
            contig_path = os.path.join(base_dir, ind + '.fasta')
            gff_path = os.path.join(base_dir, ind + '.gff')

            write_fasta(contig_path, [(ind, seq)])
            if run(contig_path, gff_path) == 0:
                gffs.append(gff_path)
                contigs[ind] = seq

        if not gffs:
            # Format the label into the first part only: '%' binds tighter
            # than '+', so formatting the concatenation without parentheses
            # would apply the label to the (placeholder-free) hint instead.
            message = ('Glimmer failed running Glimmer for %s. '
                       % qutils.label_from_fpath(fasta_fpath))
            if not qconfig.debug:
                message += 'Run with the --debug option to see the command line.'
            logger.error(message)
            return None, None, None, None

        out_gff_path = merge_gffs(gffs, out_fpath + '_genes.gff')
        unique, total = set(), 0
        genes = []
        cnt = [0] * len(gene_lengths)
        for contig, gene_id, start, end, strand in parse_gff(out_gff_path):
            total += 1
            # NOTE(review): the slice bounds rely on the coordinate
            # convention of parse_gff -- confirm against its implementation.
            gene_seq = contigs[contig][start:end + 1]
            if strand != '+':
                gene_seq = rev_comp(gene_seq)
            unique.add(gene_seq)
            genes.append((gene_id, gene_seq))
            for idx, gene_length in enumerate(gene_lengths):
                # A bool is added as 0 or 1.
                cnt[idx] += end - start > gene_length

        if OUTPUT_FASTA:
            write_fasta(out_fpath + '_genes.fasta', genes)

        return out_gff_path, len(unique), total, cnt
    finally:
        # mkdtemp() leaves deletion to the caller; clean up on every exit
        # path (including the early return and errors) unless debugging.
        if not qconfig.debug:
            shutil.rmtree(base_dir)
示例#3
0
import sys
import os
sys.path.append(os.path.join(os.path.abspath(sys.path[0]), '../'))
import libs
from libs import fastaparser

# Command line: <input fasta> <START> <END> [reverse-complement marker].
# Anything other than three or four arguments prints the usage and exits.
argc = len(sys.argv)
if argc < 4 or argc > 5:
    print("Returns [reverse-complement] sequence from START to END position from each entry of input fasta")
    print("Usage: " + sys.argv[0] + " <input fasta> <START> <END, -1 for the end> [any string -- optional parameter for reverse-complement]")
    sys.exit()

inp = sys.argv[1]
start = int(sys.argv[2])
end = int(sys.argv[3])
# The mere presence of a fourth argument turns on reverse-complementing;
# its value is never inspected.
reverse = argc == 5

for entry in fastaparser.read_fasta(inp):
    name, seq = entry[0], entry[1]
    seq_len = len(seq)

    # Neither bound may exceed the sequence length; END == -1 selects the
    # end of the sequence.
    cur_start = min(start, seq_len)
    cur_end = seq_len if end == -1 else min(end, seq_len)

    print(">" + name + "_cropped_" + str(cur_start) + "_" + str(cur_end))

    # START is shifted down by one to form the slice's lower bound.
    fragment = seq[cur_start - 1:cur_end]
    print(fastaparser.rev_comp(fragment) if reverse else fragment)