def fasta_stats(fastafile, display=False):
    # get basic stats on the fasta file:
    genelens = sorted([len(seq[1]) for seq in internal.parsefasta(fastafile)],
                      reverse=True)
    totallen = sum(genelens)

    print "Number of transcripts: %d" % len(genelens)
    print "Total length of transcripts: %d" % totallen
    print "Longest transcript: %d" % genelens[0]

    if display:
        plt.hist(genelens)
        plt.title("Histogram of gene lengths (bp)")
        plt.show()

    # N50: step through the length-sorted transcripts until their cumulative
    # length reaches half the total assembly length.
    prevl = 0
    lensum = 0
    trancount = 0
    for l in genelens:
        if lensum + l >= (totallen / 2):
            print "N50: %d (%d transcripts)" % (prevl, trancount)
            break
        lensum += l
        prevl = l
        trancount += 1
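
# NOTE: fasta_stats() assumes internal.parsefasta() yields (defline, sequence)
# tuples. The sketch below is illustrative only (the real parser lives in the
# project's internal module and may differ); it shows the minimal behaviour
# that the function above relies on.
def _example_parsefasta(fastafile):
    "Yield (defline, sequence) tuples from a fasta file (illustrative only)."
    defline = None
    seqparts = []
    for line in open(fastafile):
        line = line.strip()
        if line.startswith(">"):
            if defline is not None:
                yield defline, "".join(seqparts)
            defline = line[1:]
            seqparts = []
        elif line:
            seqparts.append(line)
    if defline is not None:
        yield defline, "".join(seqparts)

# example usage (hypothetical file name):
# fasta_stats("Trinity.fasta", display=True)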
if args.stats and args.fasta:
    fasta_stats(args.fasta, args.display_on)

#############################################################################
### extract sequence information from each gene in supplied transdecoder file
transcript_dic = {}
gene_families = {}
seq_families = {}
geneid_idx = {}
gf_idx = {}

full_blast = get_full_blast_idx(args.blast)
verbalise("Y", "Created full blast index with %d entries" % len(full_blast))

for defline, seq in internal.parsefasta(args.transdecoder):
    # get trinity and transdecoder gene ids:
    tdid, trinityid = parse_defline(defline)

    # get any blast results:
    if tdid in full_blast:
        blastline = full_blast[tdid]
    else:
        blastline = None

    # create new transcript instance
    newtranscript = Transcript(trinity_id=trinityid,
                               transdecoder_id=tdid,
                               blastline=blastline,
                               seq=seq)
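
    # NOTE: illustrative sketch (not the project's actual helper) of the
    # lookup that get_full_blast_idx() provides for the blastline step above,
    # assuming args.blast is tab-delimited blast output with the query id in
    # the first column; the real index may store parsed fields instead:
    #
    #   def example_blast_idx(blastfile):
    #       idx = {}
    #       for line in open(blastfile):
    #           if line.strip() and not line.startswith('#'):
    #               idx.setdefault(line.split('\t')[0], line.rstrip('\n'))
    #       return idx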
def get_similar_sequences(temp_dir, buildhmmer=False, fastafile=None,
                          specieslist={}, species=None, genes=[], dbpaths={},
                          mincollect=2, globalthresh=0.2, localthresh=0.8,
                          verbalise=lambda *a: None):
    # clean gene list type and content:
    if not isinstance(genes, list):
        genes = [genes]
    genes = [g for g in genes if g != '']

    # count genes provided:
    genelist_num, fasta_num = internal.count_genes(genes, fastafile)
    verbalise("Y", "Genelist size:%d\nFasta size:%d" % (genelist_num, fasta_num))

    # if a fasta file is provided, create a temp fasta file to search against with hmmer:
    if fastafile:
        extra_file = os.path.join(temp_dir, "query_fasta")
        handle = open(extra_file, 'w')
        for defline, seq in internal.parsefasta(fastafile):
            handle.write(">%s\n%s\n" % (defline, seq))
        handle.close()
        extra_file_search = extra_file
    else:
        extra_file_search = None

    # more than one input sequence forces an hmm-based search:
    if genelist_num + fasta_num > 1:
        buildhmmer = True

    if buildhmmer:
        hmminput = os.path.join(temp_dir, "hmminput.fa")
        handle = open(hmminput, 'w')
        seqcount = 0
        verbalise("B", "Extracting sequence data from %d peptides" % len(genes))
        for defline, seq, species in internal.get_gene_fastas(genes=genes,
                                                              species=None,
                                                              fastafile=fastafile,
                                                              specieslist=specieslist,
                                                              dbpaths=dbpaths):
            if seq:
                seqcount += 1
                fasta_seq = "%s\n%s\n" % (defline, seq)
                handle.write(fasta_seq)
        handle.close()

        if seqcount == 0:
            verbalise("R", "No gene sequences were found.")
            return {}

        # create alignment of input sequences:
        mafft_align1 = os.path.join(temp_dir, "mafft_align_input.fa")
        mafft_align(hmminput, mafft_align1)

        verbalise("B", "Creating hidden Markov model from %d sequences" % seqcount)

        # create hmmbuild model of alignment:
        hmmmodel = os.path.join(temp_dir, "hmmmodel.fa")
        open(hmmmodel, 'a').close()
        handle = os.popen(" ".join(['hmmbuild --informat afa', hmmmodel, mafft_align1]))
        handle.close()

        homologlist = hmmer_search(None,
                                   specieslist,
                                   query_species=species,
                                   minthresh=localthresh,
                                   temp_dir=temp_dir,
                                   dbpaths=dbpaths,
                                   mincollect=mincollect,
                                   globalthresh=globalthresh,
                                   hmmfile=hmmmodel,
                                   verbalise=verbalise,
                                   extra_file_search=extra_file_search)

        os.remove(mafft_align1)
        os.remove(hmminput)
    else:
        verbalise("B", "Extracting sequence from %s" % genes)
        if not isinstance(genes, list):
            genes = [genes]

        # run phmmer on a single input gene/sequence:
        seq = ""
        for defline, seq, species in internal.get_gene_fastas(genes=genes,
                                                              species=species,
                                                              fastafile=fastafile,
                                                              specieslist=specieslist,
                                                              dbpaths=dbpaths):
            fasta_seq = "%s\n%s\n" % (defline, seq)
            verbalise("C", fasta_seq)

        if not seq:
            verbalise("R", "No gene sequences were found.")
            return {}

        ## phmmer all lpep files
        homologlist = hmmer_search(fasta_seq,
                                   specieslist,
                                   query_species=species,
                                   minthresh=localthresh,
                                   dbpaths=dbpaths,
                                   temp_dir=temp_dir,
                                   mincollect=mincollect,
                                   globalthresh=globalthresh,
                                   hmmfile=None,
                                   verbalise=verbalise,
                                   extra_file_search=extra_file_search)

    return homologlist
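
# NOTE: the hmmbuild step in get_similar_sequences() runs through os.popen(),
# which discards the exit status. The function below is a sketch (not part of
# the original pipeline) of the same command run via subprocess so that a
# failed build is detectable; it assumes hmmbuild is on the PATH, as the
# original call already does.
def _example_run_hmmbuild(hmmmodel, aligned_fasta):
    import subprocess
    status = subprocess.call(['hmmbuild', '--informat', 'afa',
                              hmmmodel, aligned_fasta])
    return status == 0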
                            verbalise=verbalise)

verbalise("C",
          "Best hmmer score = %d" % max(v[1] for v in homologlist.values()))

######### Extract identified sequences from LNRP fasta files #########
conv_handle = open(logfile[:-3] + 'name_conversion.txt', 'w')
conv_dic = {}
itercount = 0
previousseq = ""
seqdic = {}  # loaded up to remove duplicate sequences
excluded_genes = config.make_a_list(args.exclude_genes)
excluded_species = config.make_a_list(args.exclude_species)

# place any sequences provided in the input into the seqdic
if args.fasta:
    for defline, seq in internal.parsefasta(args.fasta):
        if sequence_filter(seq, args.maxlength, args.minlength):
            continue
        else:
            seqdic[seq] = defline

for homolog in sorted(homologlist):
    # remove excluded genes before bothering to look up their sequence:
    searchname = internal.fix_leaky_pipes(homolog)
    if searchname in excluded_genes:
        continue
    if homologlist[homolog][0] in excluded_species:
        continue

    # extract sequences of remaining genes and add to conversion dictionary
    itercount += 1
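
    # NOTE: illustrative sketch of the behaviour assumed of sequence_filter()
    # in the args.fasta block above -- return True (i.e. skip the record) when
    # the sequence length falls outside [minlength, maxlength]. The real
    # filter is defined elsewhere in this module and may apply further checks:
    #
    #   def example_sequence_filter(seq, maxlength, minlength):
    #       return bool((maxlength and len(seq) > maxlength) or
    #                   (minlength and len(seq) < minlength))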