def glimmerHMM(tool_dir, fasta_fpath, out_fpath, gene_lengths, err_path, tmp_dir, index):
    """Run GlimmerHMM on every sequence of a FASTA file and collect the genes.

    Each sequence returned by ``read_fasta`` is written to its own FASTA file
    in a fresh temporary directory under ``tmp_dir`` and passed to the
    ``glimmerhmm`` executable. The GFF files of the successful runs are merged
    and parsed into ``Gene`` objects.

    Args:
        tool_dir: Directory holding the ``glimmerhmm`` executable and the
            ``trained/arabidopsis`` training directory.
        fasta_fpath: Path of the FASTA file to analyse.
        out_fpath: Prefix for output files; ``_genes.gff`` (plus ``.gz``
            unless ``qconfig.no_gzip`` is set) and, when ``OUTPUT_FASTA`` is
            truthy, ``_genes.fasta`` are appended to it.
        gene_lengths: Iterable of length thresholds used for the counts.
        err_path: File that receives the subprocess stdout/stderr (appended).
        tmp_dir: Parent directory for the temporary working directory.
        index: Passed to ``qutils.index_to_str`` to build the log indent.

    Returns:
        A 6-tuple ``(out_gff_path, genes, unique_count, total, full_cnt,
        partial_cnt)``. ``full_cnt`` / ``partial_cnt`` hold, per threshold in
        ``gene_lengths``, the number of full / partial genes with
        ``end - start >= threshold``. When no sequence produced a GFF file,
        every element of the tuple is ``None``.
    """
    def run(contig_path, tmp_path):
        # Both output streams of the tool go to the shared error log.
        with open(err_path, 'a') as err_file:
            return qutils.call_subprocess(
                [tool_exec, contig_path, '-d', trained_dir, '-g', '-o', tmp_path],
                stdout=err_file,
                stderr=err_file,
                indent=' ' + qutils.index_to_str(index) + ' ')

    def cleanup():
        # The working directory is kept only for debugging.
        if not qconfig.debug:
            shutil.rmtree(base_dir)

    tool_exec = os.path.join(tool_dir, 'glimmerhmm')

    # Note: why arabidopsis? for no particular reason, really.
    trained_dir = os.path.join(tool_dir, 'trained', 'arabidopsis')

    contigs = {}
    gffs = []
    base_dir = tempfile.mkdtemp(dir=tmp_dir)
    for seq_num, (ind, seq) in enumerate(read_fasta(fasta_fpath)):
        seq_num = str(seq_num)
        # Contig names are truncated to the length limit defined in qutils.
        ind = ind[:qutils.MAX_CONTIG_NAME_GLIMMER]
        contig_path = os.path.join(base_dir, seq_num + '.fasta')
        gff_path = os.path.join(base_dir, seq_num + '.gff')

        write_fasta(contig_path, [(ind, seq)])
        if run(contig_path, gff_path) == 0:
            gffs.append(gff_path)
            contigs[ind] = seq

    if not gffs:
        # Same arity as the normal return so callers can always unpack six
        # values, and the temporary directory is not left behind.
        cleanup()
        return None, None, None, None, None, None

    out_gff_fpath = out_fpath + '_genes.gff' + ('.gz' if not qconfig.no_gzip else '')
    out_gff_path = merge_gffs(gffs, out_gff_fpath)

    unique, total = set(), 0
    genes = []
    for contig, _gene_id, start, end, strand in parse_gff(out_gff_path):
        total += 1
        contig_seq = contigs[contig]
        gene_seq = contig_seq[start - 1:end]
        if strand != '+':
            gene_seq = rev_comp(gene_seq)
        unique.add(gene_seq)
        gene = Gene(contig=contig, start=start, end=end, strand=strand, seq=gene_seq)
        # A gene touching either end of its contig is counted as partial.
        gene.is_full = gene.start > 1 and gene.end < len(contig_seq)
        genes.append(gene)

    full_cnt = [sum(gene.end - gene.start >= threshold for gene in genes if gene.is_full)
                for threshold in gene_lengths]
    partial_cnt = [sum(gene.end - gene.start >= threshold for gene in genes if not gene.is_full)
                   for threshold in gene_lengths]

    if OUTPUT_FASTA:
        out_fasta_fpath = out_fpath + '_genes.fasta'
        add_genes_to_fasta(genes, out_fasta_fpath)

    cleanup()

    return out_gff_path, genes, len(unique), total, full_cnt, partial_cnt
def parse_gmhmm_out(out_fpath):
    """Yield a ``Gene`` for each nucleotide record of a GeneMark.hmm output file.

    A record starts with a header such as::

        >gene_2|GeneMark.hmm|57_nt|+|1|57<TAB>>NODE_3_length_713_cov_1.25228

    followed by sequence lines and is closed by a blank line. The third
    ``|``-separated field tells the record type: ``nt`` for a nucleotide
    sequence, ``aa`` for a protein sequence. A gene is yielded whenever a
    nucleotide record with a non-empty sequence is closed; its ``protein`` is
    the most recently closed protein record (it is not cleared between genes).

    The end of the file also closes the record being read, so the last record
    is not lost when the file lacks a trailing blank line.

    Args:
        out_fpath: Path of the file to parse.

    Yields:
        ``Gene`` objects built from the header fields and the sequence.
    """
    from itertools import chain

    reading_gene = False
    reading_protein = False
    protein = ''
    with open(out_fpath) as f:
        # The synthetic blank line terminates a record still open at EOF;
        # it is a no-op when the file already ends with a blank line.
        for line in chain(f, ('\n',)):
            if line.startswith('>gene'):
                seq = []
                seq_id, contig_id = line.strip().split('\t')
                _, _, seq_len, strand, left_index, right_index = seq_id.split('|')
                contig_id = contig_id[1:]  # drop the leading '>'
                if 'nt' in seq_len:
                    reading_gene = True
                elif 'aa' in seq_len:
                    reading_protein = True
            elif reading_gene or reading_protein:
                if line.isspace():
                    left_index = int(left_index)
                    right_index = int(right_index)
                    if reading_gene:
                        seq = ''.join(seq)
                        reading_gene = False
                    elif reading_protein:
                        protein = ''.join(seq)
                        # Emptying seq keeps a protein record from yielding.
                        seq = []
                        reading_protein = False
                    if seq:
                        yield Gene(contig=contig_id, start=left_index, end=right_index,
                                   strand=strand, seq=seq, protein=protein)
                else:
                    seq.append(line.strip())
def parse_gtf_out(out_fpath):
    """Yield a ``Gene`` for every line of a GTF file that contains 'CDS'.

    Each matching line is split on whitespace; fields 0, 6, 3, 4 and 9 supply
    the contig, strand, start, end and ``seq`` values respectively (start and
    end are converted to ``int``).

    Args:
        out_fpath: Path of the GTF file to read.

    Yields:
        ``Gene`` objects, one per matching line, in file order.
    """
    with open(out_fpath) as gtf_file:
        for record in gtf_file:
            if 'CDS' not in record:
                continue
            fields = record.strip().split()
            # Fields are read in the same order as the keyword arguments.
            contig_name = fields[0]
            strand_sign = fields[6]
            first_pos = int(fields[3])
            last_pos = int(fields[4])
            tenth_field = fields[9]
            yield Gene(contig=contig_name, strand=strand_sign, start=first_pos,
                       end=last_pos, seq=tenth_field)
def parse_gmhmm_out(out_fpath):
    """Parse a GeneMark.hmm output file into a list of ``Gene`` objects.

    A record starts with a header such as::

        >gene_2|GeneMark.hmm|57_nt|+|1|57<TAB>>NODE_3_length_713_cov_1.25228

    followed by sequence lines and is closed by a blank line. The third
    ``|``-separated field tells the record type: ``nt`` for a nucleotide
    sequence, ``aa`` for a protein sequence. Records are grouped by the gene
    id from the header (without the leading ``>``), so the nucleotide and the
    protein record of one gene end up on the same ``Gene`` whatever their
    order in the file.

    The end of the file also closes the record being read, so the last record
    is not lost when the file lacks a trailing blank line.

    Args:
        out_fpath: Path of the file to parse.

    Returns:
        List of ``Gene`` objects in order of first appearance of their id.
    """
    from itertools import chain

    reading_gene = False
    reading_protein = False
    protein = ''
    genes_by_id = OrderedDict()
    gene_id = None
    with open(out_fpath) as f:
        # The synthetic blank line terminates a record still open at EOF;
        # it is a no-op when the file already ends with a blank line.
        for line in chain(f, ('\n',)):
            if line.startswith('>gene'):
                seq = []
                seq_id, contig_id = line.strip().split('\t')
                gene_id, _, seq_len, strand, left_index, right_index = seq_id.split('|')
                gene_id = gene_id[1:]      # drop the leading '>'
                contig_id = contig_id[1:]  # drop the leading '>'
                if 'nt' in seq_len:
                    reading_gene = True
                elif 'aa' in seq_len:
                    reading_protein = True
            elif reading_gene or reading_protein:
                if line.isspace():
                    left_index = int(left_index)
                    right_index = int(right_index)
                    if reading_gene:
                        seq = ''.join(seq)
                        reading_gene = False
                    elif reading_protein:
                        protein = ''.join(seq)
                        seq = []
                        reading_protein = False

                    # Reuse the gene created by an earlier record with this id.
                    gene = genes_by_id.get(gene_id)
                    if gene is None:
                        gene = Gene(contig=contig_id, start=left_index,
                                    end=right_index, strand=strand)
                    if seq:
                        gene.seq = seq
                        seq = []
                    if protein:
                        gene.protein = protein
                        # Cleared so it is not attached to another gene.
                        protein = ''
                    genes_by_id[gene_id] = gene
                else:
                    seq.append(line.strip())
    return list(genes_by_id.values())