def compute_fasta_stats(formats, input_file, seqtype, priority):
    """Scan a FASTA file and print count/min/avg/max/total length stats.

    :param formats:    printf-style format for the sequence-count line,
                       taking (priority, count) as strings.
    :param input_file: path to the FASTA file to scan.
    :param seqtype:    sequence type label (currently unused; kept for
                       interface compatibility with callers).
    :param priority:   integer base used to order the printed stat lines.
    """
    MIN_LENGTH = 'MIN_LENGTH'
    MAX_LENGTH = 'MAX_LENGTH'
    NUMSEQ = 'NUMSEQ'
    TOTAL_LENGTH = 'TOTAL_LENGTH'
    AVG_LENGTH = 'AVG_LENGTH'

    # Sentinels so the first sequence always updates min/max.
    _MAX = 1000000000000
    stats = {
        MIN_LENGTH: _MAX,
        MAX_LENGTH: -(_MAX),
        NUMSEQ: 0,
        TOTAL_LENGTH: 0,
        AVG_LENGTH: 0,
    }

    fastareader = FastaReader(input_file)
    # process one fasta sequence at a time
    for record in fastareader:
        length = len(record.sequence)
        stats[NUMSEQ] += 1
        # BUG FIX: TOTAL_LENGTH was declared but never accumulated; the
        # total was previously recomputed as int(avg) * count, which loses
        # the remainder of the integer division.
        stats[TOTAL_LENGTH] += length
        if stats[MIN_LENGTH] > length:
            stats[MIN_LENGTH] = length
        if stats[MAX_LENGTH] < length:
            stats[MAX_LENGTH] = length

    if stats[NUMSEQ] > 0:
        stats[AVG_LENGTH] = stats[TOTAL_LENGTH] / stats[NUMSEQ]
    else:
        # Empty input: report zeros instead of the +/-_MAX sentinels.
        stats[AVG_LENGTH] = 0
        stats[MIN_LENGTH] = 0
        stats[MAX_LENGTH] = 0

    printf(formats % (str(priority + 5), str(stats[NUMSEQ])))
    printf("%s\t-min length\t%s\n" % (str(priority + 6), str(stats[MIN_LENGTH])))
    printf("%s\t-avg length\t%s\n" % (str(priority + 7), str(int(stats[AVG_LENGTH]))))
    printf("%s\t-max length\t%s\n" % (str(priority + 8), str(stats[MAX_LENGTH])))
    printf("%s\t-total base pairs (bp)\t%s\n" % (str(priority + 9), str(int(stats[TOTAL_LENGTH]))))
def create_gff_faa(tempfile, gfffile, faafile):
    """Split combined ORF headers into a GFF annotation file and a FAA file.

    Headers in *tempfile* are expected to look like ``>name_start_end_strand``;
    each matching record yields one CDS line in *gfffile* and one sequence in
    *faafile*.  Records whose header does not match are skipped.
    """
    header_re = re.compile(r'>(.*)_(\d+)_(\d+)_([+-])')
    orfid_re = re.compile(r'.*_(\d+_\d+)')

    with open(gfffile, 'w') as gffout, open(faafile, 'w') as faaout:
        for fasta in FastaReader(tempfile):
            match = header_re.search(fasta.name)
            if not match:
                continue
            orfname, start, end, strand = match.group(1), match.group(2), match.group(3), match.group(4)

            # Derive the short ORF id (the trailing "<n>_<m>" of the name).
            idmatch = orfid_re.search(orfname)
            orf_id = idmatch.group(1) if idmatch else ''

            attr = "ID=" + orf_id + ";partial=00"
            fields = [orfname, 'FGS+', 'CDS', start, end, '0', strand, "0", attr]
            fprintf(faaout, '>' + orfname + "\n" + fasta.sequence + "\n")
            fprintf(gffout, '\t'.join(fields) + '\n')
def countNoOfSequencesInFile(file):
    """Return the number of FASTA records in *file*."""
    return sum(1 for _record in FastaReader(file))
def create_splits(outputdir, listfilename, input_filename, maxMBytes, maxSize, splitPrefix='split', splitSuffix=''):
    """Split a FASTA file into block files bounded by count and byte size.

    Each block is written to ``outputdir/<splitPrefix><n><suffix>`` and its
    file name is appended (one per line) to *listfilename*.  If a list file
    from a previous run exists, the blocks it names are removed first.

    :param maxMBytes: per-block cap on sequence bytes, in megabytes.
    :param maxSize:   per-block cap on number of sequences.
    :returns: True on success (exits the process on I/O failure).
    """
    maxBytes = 1024 * 1024 * maxMBytes
    if splitSuffix:
        suffix = '.' + splitSuffix
    else:
        suffix = ''

    # Clean up stale split files recorded by a previous run.
    try:
        if path.exists(listfilename):
            listfile = open(listfilename, 'r')
            listfilenames = [x.strip() for x in listfile.readlines()]
            remove_files(outputdir, listfilenames)
            listfile.close()
    except IOError:
        # BUG FIX (style): use print() for consistency with the rest of the
        # file; identical output for a single parenthesized argument.
        print("Cannot read file " + listfilename + " !")
        sys.exit(0)

    try:
        listfile = open(listfilename, 'w')
    except IOError:
        # BUG FIX: this is the *write* open; the message previously said
        # "Cannot read".
        print("Cannot write file " + listfilename + " !")
        sys.exit(0)

    fragments = []
    splitno = 0
    currblocksize = 0        # sequences accumulated in the current block
    currblockbyteSize = 0    # sequence bytes accumulated in the current block
    fastareader = FastaReader(input_filename)

    # Read sequences from the input file and write them to block files.
    for name in fastareader:
        fragments.append(fastareader.seqname)
        fragments.append(fastareader.sequence)
        if currblocksize >= maxSize - 1 or currblockbyteSize >= maxBytes:
            splitfile = open(outputdir + PATHDELIM + splitPrefix + str(splitno) + suffix, 'w')
            fprintf(splitfile, "%s", '\n'.join(fragments))
            fragments = []
            splitfile.close()
            # Add this block name to the blocklistfile.
            fprintf(listfile, "%s\n", splitPrefix + str(splitno) + suffix)
            splitno += 1
            currblocksize = 0
            currblockbyteSize = 0
        else:
            currblocksize += 1
            currblockbyteSize += len(fastareader.sequence)

    # Flush the trailing, partially filled block.
    if fragments:
        splitfile = open(outputdir + PATHDELIM + splitPrefix + str(splitno) + suffix, 'w')
        fprintf(splitfile, "%s", '\n'.join(fragments))
        splitfile.close()
        fragments = []
        # Add this block name to the blocklistfile.
        fprintf(listfile, "%s\n", splitPrefix + str(splitno) + suffix)
        splitno += 1
        currblocksize = 0
        currblockbyteSize = 0

    listfile.close()
    return True
def main(argv, errorlogger=None, runstatslogger=None):
    """QC an amino-acid FASTA file and emit dummy contig/ORF companion files.

    Reads ``opts.input_fasta``, writes filtered sequences plus generated
    contig/ORF/GFF files, a length-distribution log, and per-sequence
    lengths; optionally writes a renamed-ID map file.  Summary stats go to
    *runstatslogger* when provided.
    """
    global parser
    (opts, args) = parser.parse_args(argv)
    if not valid_arguments(opts, args):
        print(usage)
        sys.exit(0)

    min_length = 0
    #inputfile = open(opts.input_fasta,'r')
    outfile = open(opts.output_fasta, 'w')
    outfilefna = open(opts.output_fna, 'w')
    outfilefaa = open(opts.output_faa, 'w')
    outfilegff = open(opts.output_gff, 'w')
    logfile = open(opts.log_file, 'w')
    lengthsfile = open(opts.lengths_file, 'w')

    if opts.map_file:
        mapfile = open(opts.map_file, 'w')
    else:
        mapfile = None

    # Strip directory components and any FASTA-style extension to get the
    # sample name.  BUG FIX: re.I was being passed as re.sub's positional
    # *count* argument, not as *flags*; extensions are now stripped
    # case-insensitively as intended.
    sample_name = opts.input_fasta
    sample_name = re.sub(r'^.*/', '', sample_name, flags=re.I)
    sample_name = re.sub(r'^.*\\', '', sample_name, flags=re.I)
    sample_name = re.sub(r'\.fasta$', '', sample_name, flags=re.I)
    sample_name = re.sub(r'\.fna$', '', sample_name, flags=re.I)
    sample_name = re.sub(r'\.faa$', '', sample_name, flags=re.I)
    sample_name = re.sub(r'\.fas$', '', sample_name, flags=re.I)
    sample_name = re.sub(r'\.fa$', '', sample_name, flags=re.I)

    BEFORE = 'BEFORE'
    AFTER = 'AFTER'
    # These constants double as the label text written to the log file.
    NUMSEQ = "#INFO\tNumber of sequences :"
    NUMSEQ_SHORTER = "@INFO\tNumber of sequences shorter than minimum length of sequences"
    AVG_LENGTH = "@INFO\tAverage length of sequences:"
    MIN_LENGTH = "@INFO\tMinimum length of sequences:"
    MAX_LENGTH = "@INFO\tMaximum length of sequences:"

    _MAX = 1000000000000  # sentinel so the first sequence sets the minimum
    stats = {
        MIN_LENGTH: {'BEFORE': _MAX, 'AFTER': _MAX},
        MAX_LENGTH: {'BEFORE': 0, 'AFTER': 0},
        NUMSEQ: {'BEFORE': 0, 'AFTER': 0},
        NUMSEQ_SHORTER: {'BEFORE': 0, 'AFTER': 0},
        AVG_LENGTH: {'BEFORE': 0, 'AFTER': 0},
    }

    # 31 histogram bins of 50 residues each; bin 30 collects everything >= 1500.
    length_distribution = {}
    length_cumulative_distribution = {}
    for i in range(0, 31):
        length_distribution[i] = 0
        length_cumulative_distribution[i] = 0

    seq_count = 0
    allNames = dict()
    outputStr = ""
    outputLines = []
    fastareader = FastaReader(opts.input_fasta)

    # Process one fasta sequence at a time.
    lengths_str = ""
    for record in fastareader:
        seqname = record.name
        seq = record.sequence
        length = len(seq)

        index = int(len(seq) / 50)
        if index >= 30:
            index = 30
        length_distribution[index] += 1

        if length < stats[MIN_LENGTH][BEFORE]:
            stats[MIN_LENGTH][BEFORE] = length
        if length > stats[MAX_LENGTH][BEFORE]:
            stats[MAX_LENGTH][BEFORE] = length
        # BUG FIX: this previously compared against the label string
        # MIN_LENGTH (always true for an int under Python 2 ordering)
        # instead of the numeric threshold min_length.
        if length < min_length:
            stats[NUMSEQ_SHORTER][BEFORE] += 1
        stats[AVG_LENGTH][BEFORE] = stats[AVG_LENGTH][BEFORE] + length

        seqvalue = filter_sequence(seq)
        stats[NUMSEQ][BEFORE] += 1
        seqlen = len(seqvalue)
        if seqlen >= min_length:
            # Buffer lengths and flush in ~100-char chunks.
            if len(lengths_str) > 100:
                fprintf(lengthsfile, "%s\n", lengths_str)
                lengths_str = str(seqlen)
            else:
                lengths_str += '\t' + str(seqlen)

            stats[NUMSEQ][AFTER] += 1
            stats[AVG_LENGTH][AFTER] = stats[AVG_LENGTH][AFTER] + seqlen
            if mapfile == None:
                fprintf(outfile, "%s\n", seqname)
            else:
                # Rename sequences to <sample>_<n> and record the mapping.
                contigID = sample_name + '_' + str(seq_count)
                orfID = sample_name + '_' + str(seq_count) + "_0"
                fprintf(outfile, ">%s\n", contigID)
                fprintf(outfilefna, ">%s\n", orfID)
                fprintf(outfilefaa, ">%s\n", orfID)
                gffString = sample_name + '_' + str(seq_count)
                gffString += "\t" + "AMINO_ACID_SEQ"
                gffString += "\t" + "CDS"
                gffString += "\t" + "0"
                gffString += "\t" + str(3 * seqlen)
                gffString += "\t" + "0"
                gffString += "\t" + "+"
                gffString += "\t" + "0"
                gffString += "\t" + "ID=" + orfID + ";"
                gffString += "locus_tag=" + orfID + ";"
                gffString += "partial=00;"
                gffString += "orf_length=" + str(seqlen) + ";"
                gffString += "contig_length=" + str(3 * seqlen)
                fprintf(outfilegff, "%s\n", gffString)
                key = re.sub(r'^>', '', seqname)
                fprintf(
                    mapfile, "%s\n",
                    sample_name + '_' + str(seq_count) + '\t' + key + '\t' + str(seqlen))
                seq_count += 1
            fprintf(outfile, "%s\n", "DUMMY CONTIGS FOR AMINO ACID SEQUENCES")
            fprintf(outfilefna, "%s\n", "DUMMY ORFS FOR AMINO ACID SEQUENCES")
            fprintf(outfilefaa, "%s\n", seqvalue)
            if seqlen < stats[MIN_LENGTH][AFTER]:
                stats[MIN_LENGTH][AFTER] = seqlen
            if seqlen > stats[MAX_LENGTH][AFTER]:
                stats[MAX_LENGTH][AFTER] = seqlen

    fprintf(lengthsfile, "%s\n", lengths_str)

    # Convert accumulated sums into averages.
    if stats[NUMSEQ][BEFORE] > 0:
        stats[AVG_LENGTH][BEFORE] = stats[AVG_LENGTH][BEFORE] / stats[NUMSEQ][BEFORE]
    else:
        stats[AVG_LENGTH][BEFORE] = 0
    if stats[NUMSEQ][AFTER] > 0:
        stats[AVG_LENGTH][AFTER] = stats[AVG_LENGTH][AFTER] / stats[NUMSEQ][AFTER]
    else:
        stats[AVG_LENGTH][AFTER] = 0

    lengthsfile.close()
    outfile.close()
    outfilefna.close()
    outfilefaa.close()
    outfilegff.close()
    #inputfile.close()
    if mapfile != None:
        mapfile.close()

    # Replace the untouched min-length sentinel with 0 for reporting.
    if stats[MIN_LENGTH][BEFORE] == _MAX:
        stats[MIN_LENGTH][BEFORE] = 0
    if stats[MIN_LENGTH][AFTER] == _MAX:
        stats[MIN_LENGTH][AFTER] = 0

    fprintf(logfile, "@INFO\tBEFORE\tAFTER\n")
    fprintf(
        logfile, "%s\n",
        NUMSEQ + '\t' + str(stats[NUMSEQ][BEFORE]) + '\t' + str(stats[NUMSEQ][AFTER]))
    fprintf(
        logfile, "%s\n",
        NUMSEQ_SHORTER + '\t' + str(stats[NUMSEQ_SHORTER][BEFORE]) + '\t' + str(stats[NUMSEQ_SHORTER][AFTER]))
    fprintf(
        logfile, "%s\n",
        AVG_LENGTH + '\t' + str(stats[AVG_LENGTH][BEFORE]) + '\t' + str(stats[AVG_LENGTH][AFTER]))
    fprintf(
        logfile, "%s\n",
        MIN_LENGTH + '\t' + str(stats[MIN_LENGTH][BEFORE]) + '\t' + str(stats[MIN_LENGTH][AFTER]))
    fprintf(
        logfile, "%s\n",
        MAX_LENGTH + '\t' + str(stats[MAX_LENGTH][BEFORE]) + '\t' + str(stats[MAX_LENGTH][AFTER]))
    fprintf(logfile, "@INFO\tLOW\tHIGH\tFREQUENCY\tCUMULATIVE_FREQUENCY\n")

    # Cumulative distribution from the top bin down.
    # BUG FIX: bin 30 was seeded with its own (zero) cumulative value,
    # dropping its frequency from every cumulative count below it.
    i = 30
    length_cumulative_distribution[i] = length_distribution[i]
    i -= 1
    while i >= 0:
        length_cumulative_distribution[i] = length_cumulative_distribution[i + 1] + length_distribution[i]
        i -= 1
    for i in range(0, 31):
        fprintf(logfile, "  %s\n", str(i*50) + '\t' + str((i+1)*50) + '\t' +\
                str(length_distribution[i]) + '\t' + str(length_cumulative_distribution[i]))
    logfile.close()

    seqtype = 'amino'
    # priority is used to sort the output to print in the right order
    priority = 2000
    if runstatslogger != None:
        runstatslogger.write(
            "%s\tSequences BEFORE Filtering (%s)\t%s\n" %
            (str(priority), seqtype, str(stats[NUMSEQ][BEFORE])))
        runstatslogger.write(
            "%s\tmin length\t%s\n" %
            (str(priority + 1), str(stats[MIN_LENGTH][BEFORE])))
        runstatslogger.write(
            "%s\tavg length\t%s\n" %
            (str(priority + 2), str(int(stats[AVG_LENGTH][BEFORE]))))
        runstatslogger.write(
            "%s\tmax length\t%s\n" %
            (str(priority + 3), str(stats[MAX_LENGTH][BEFORE])))
        runstatslogger.write(
            "%s\ttot length\t%s\n" %
            (str(priority + 4), str(int(stats[AVG_LENGTH][BEFORE] * stats[NUMSEQ][BEFORE]))))
        runstatslogger.write(
            "%s\tSequences AFTER Filtering (%s)\t%s\n" %
            (str(priority + 5), seqtype, str(stats[NUMSEQ][AFTER])))
        runstatslogger.write(
            "%s\tmin length\t%s\n" %
            (str(priority + 6), str(stats[MIN_LENGTH][AFTER])))
        runstatslogger.write(
            "%s\tavg length\t%s\n" %
            (str(priority + 7), str(int(stats[AVG_LENGTH][AFTER]))))
        runstatslogger.write(
            "%s\tmax length\t%s\n" %
            (str(priority + 8), str(stats[MAX_LENGTH][AFTER])))
        runstatslogger.write(
            "%s\ttot length\t%s\n" %
            (str(priority + 9), str(int(stats[AVG_LENGTH][AFTER] * stats[NUMSEQ][AFTER]))))
def main(argv, errorlogger=None, runstatslogger=None):
    """QC a nucleotide or amino-acid FASTA file by minimum length.

    Reads ``opts.input_fasta``, writes upper-cased sequences passing the
    ``opts.min_length`` filter to ``opts.output_fasta`` (via a .tmp file and
    an atomic rename), plus a length log and per-sequence lengths file;
    optionally writes a renamed-ID map file.  Sets the module-global
    ``errorcode`` (1 for nucleotide input, 3 otherwise) and reports summary
    stats through *runstatslogger* when provided.
    """
    global parser
    global errorcode
    (opts, args) = parser.parse_args(argv)
    if not valid_arguments(opts, args):
        print(usage)
        sys.exit(0)

    min_length = opts.min_length
    outfile = open(opts.output_fasta + '.tmp', 'w')
    logfile = open(opts.log_file, 'w')
    lengthsfile = open(opts.lengths_file + '.tmp', 'w')

    if opts.map_file:
        mapfile = open(opts.map_file, 'w')
    else:
        mapfile = None

    if opts.seqtype == 'nucleotide':
        errorcode = 1
    else:
        errorcode = 3

    # Strip directory components and any FASTA-style extension to get the
    # sample name.  BUG FIX: re.I was being passed as re.sub's positional
    # *count* argument, not as *flags*; extensions are now stripped
    # case-insensitively as intended.
    sample_name = opts.input_fasta
    sample_name = re.sub(r'^.*/', '', sample_name, flags=re.I)
    sample_name = re.sub(r'^.*\\', '', sample_name, flags=re.I)
    sample_name = re.sub(r'\.fasta$', '', sample_name, flags=re.I)
    sample_name = re.sub(r'\.fna$', '', sample_name, flags=re.I)
    sample_name = re.sub(r'\.faa$', '', sample_name, flags=re.I)
    sample_name = re.sub(r'\.fas$', '', sample_name, flags=re.I)
    sample_name = re.sub(r'\.fa$', '', sample_name, flags=re.I)

    BEFORE = 'BEFORE'
    AFTER = 'AFTER'
    # These constants double as the label text written to the log file.
    NUMSEQ = "#INFO\tNumber of sequences :"
    NUMSEQ_SHORTER = "@INFO\tNumber of sequences shorter than minimum length of sequences"
    AVG_LENGTH = "@INFO\tAverage length of sequences:"
    MIN_LENGTH = "@INFO\tMinimum length of sequences:"
    MAX_LENGTH = "@INFO\tMaximum length of sequences:"

    _MAX = 1000000000000  # sentinel so the first sequence sets the minimum
    stats = {
        MIN_LENGTH: {'BEFORE': _MAX, 'AFTER': _MAX},
        MAX_LENGTH: {'BEFORE': 0, 'AFTER': 0},
        NUMSEQ: {'BEFORE': 0, 'AFTER': 0},
        NUMSEQ_SHORTER: {'BEFORE': 0, 'AFTER': 0},
        AVG_LENGTH: {'BEFORE': 0, 'AFTER': 0},
    }

    # 31 histogram bins of 50 bases each; bin 30 collects everything >= 1500.
    length_distribution = {}
    length_cumulative_distribution = {}
    for i in range(0, 31):
        length_distribution[i] = 0
        length_cumulative_distribution[i] = 0

    seq_count = 0
    allNames = dict()
    outputStr = ""
    outputLines = []
    print(opts.input_fasta)
    fastareader = FastaReader(opts.input_fasta)

    # Process one fasta sequence at a time.
    lengths_str = ""
    for record in fastareader:
        seqname = record.name
        seq = record.sequence
        length = len(seq)

        index = int(len(seq) / 50)
        if index >= 30:
            index = 30
        length_distribution[index] += 1

        if length < stats[MIN_LENGTH][BEFORE]:
            stats[MIN_LENGTH][BEFORE] = length
        if length > stats[MAX_LENGTH][BEFORE]:
            stats[MAX_LENGTH][BEFORE] = length
        if length < min_length:
            stats[NUMSEQ_SHORTER][BEFORE] += 1
        stats[AVG_LENGTH][BEFORE] = stats[AVG_LENGTH][BEFORE] + length

        # Filtering was deliberately stopped: sequences are only upper-cased.
        # (A dead filter_sequence(seq) call whose result was immediately
        # overwritten has been removed.)
        seqvalue = seq.upper()
        stats[NUMSEQ][BEFORE] += 1
        seqlen = len(seqvalue)
        if seqlen >= min_length:
            # Buffer lengths and flush in ~100-char chunks.
            if len(lengths_str) > 100:
                fprintf(lengthsfile, "%s\n", lengths_str)
                lengths_str = str(seqlen)
            else:
                lengths_str += '\t' + str(seqlen)

            stats[NUMSEQ][AFTER] += 1
            stats[AVG_LENGTH][AFTER] = stats[AVG_LENGTH][AFTER] + seqlen
            if mapfile == None:
                fprintf(outfile, "%s\n", seqname)
            else:
                # Rename sequences to <sample>_<n> and record the mapping.
                fprintf(outfile, ">%s\n", sample_name + '_' + str(seq_count))
                key = re.sub(r'^>', '', seqname)
                fprintf(
                    mapfile, "%s\n",
                    sample_name + '_' + str(seq_count) + '\t' + key + '\t' + str(seqlen))
                seq_count += 1
            fprintf(outfile, "%s\n", seqvalue)
            if seqlen < stats[MIN_LENGTH][AFTER]:
                stats[MIN_LENGTH][AFTER] = seqlen
            if seqlen > stats[MAX_LENGTH][AFTER]:
                stats[MAX_LENGTH][AFTER] = seqlen

    fprintf(lengthsfile, "%s\n", lengths_str)

    # Convert accumulated sums into averages.
    if stats[NUMSEQ][BEFORE] > 0:
        stats[AVG_LENGTH][BEFORE] = stats[AVG_LENGTH][BEFORE] / stats[NUMSEQ][BEFORE]
    else:
        stats[AVG_LENGTH][BEFORE] = 0
    if stats[NUMSEQ][AFTER] > 0:
        stats[AVG_LENGTH][AFTER] = stats[AVG_LENGTH][AFTER] / stats[NUMSEQ][AFTER]
    else:
        stats[AVG_LENGTH][AFTER] = 0

    lengthsfile.close()
    outfile.close()
    rename(opts.output_fasta + ".tmp", opts.output_fasta)
    rename(opts.lengths_file + ".tmp", opts.lengths_file)
    #inputfile.close()
    if mapfile != None:
        mapfile.close()

    # Replace the untouched min-length sentinel with 0 for reporting.
    if stats[MIN_LENGTH][BEFORE] == _MAX:
        stats[MIN_LENGTH][BEFORE] = 0
    if stats[MIN_LENGTH][AFTER] == _MAX:
        stats[MIN_LENGTH][AFTER] = 0

    fprintf(logfile, "@INFO\tBEFORE\tAFTER\n")
    fprintf(
        logfile, "%s\n",
        NUMSEQ + '\t' + str(stats[NUMSEQ][BEFORE]) + '\t' + str(stats[NUMSEQ][AFTER]))
    fprintf(
        logfile, "%s\n",
        NUMSEQ_SHORTER + '\t' + str(stats[NUMSEQ_SHORTER][BEFORE]) + '\t' + str(stats[NUMSEQ_SHORTER][AFTER]))
    fprintf(
        logfile, "%s\n",
        AVG_LENGTH + '\t' + str(stats[AVG_LENGTH][BEFORE]) + '\t' + str(stats[AVG_LENGTH][AFTER]))
    fprintf(
        logfile, "%s\n",
        MIN_LENGTH + '\t' + str(stats[MIN_LENGTH][BEFORE]) + '\t' + str(stats[MIN_LENGTH][AFTER]))
    fprintf(
        logfile, "%s\n",
        MAX_LENGTH + '\t' + str(stats[MAX_LENGTH][BEFORE]) + '\t' + str(stats[MAX_LENGTH][AFTER]))
    fprintf(logfile, "@INFO\tLOW\tHIGH\tFREQUENCY\tCUMULATIVE_FREQUENCY\n")

    # Cumulative distribution from the top bin down.
    # BUG FIX: bin 30 was seeded with its own (zero) cumulative value,
    # dropping its frequency from every cumulative count below it.
    i = 30
    length_cumulative_distribution[i] = length_distribution[i]
    i -= 1
    while i >= 0:
        length_cumulative_distribution[i] = length_cumulative_distribution[i + 1] + length_distribution[i]
        i -= 1
    for i in range(0, 31):
        fprintf(logfile, "  %s\n", str(i*50) + '\t' + str((i+1)*50) + '\t' +\
                str(length_distribution[i]) + '\t' + str(length_cumulative_distribution[i]))
    logfile.close()

    # priority is used to sort the output lines in the right order
    if opts.seqtype == 'nucleotide':
        priority = 1000
    else:
        priority = 2000

    if runstatslogger != None:
        if opts.seqtype == 'nucleotide':
            runstatslogger.write(
                "%s\tNumber of sequences in input file BEFORE QC (%s)\t%s\n" %
                (str(priority), opts.seqtype, str(stats[NUMSEQ][BEFORE])))
            runstatslogger.write(
                "%s\t-min length\t%s\n" %
                (str(priority + 1), str(stats[MIN_LENGTH][BEFORE])))
            runstatslogger.write(
                "%s\t-avg length\t%s\n" %
                (str(priority + 2), str(int(stats[AVG_LENGTH][BEFORE]))))
            runstatslogger.write(
                "%s\t-max length\t%s\n" %
                (str(priority + 3), str(stats[MAX_LENGTH][BEFORE])))
            runstatslogger.write(
                "%s\t-total base pairs (bp)\t%s\n" %
                (str(priority + 4), str(int(stats[AVG_LENGTH][BEFORE] * stats[NUMSEQ][BEFORE]))))
            runstatslogger.write(
                "%s\tNumber of sequences AFTER QC (%s)\t%s\n" %
                (str(priority + 5), opts.seqtype, str(stats[NUMSEQ][AFTER])))
            runstatslogger.write(
                "%s\t-min length\t%s\n" %
                (str(priority + 6), str(stats[MIN_LENGTH][AFTER])))
            runstatslogger.write(
                "%s\t-avg length\t%s\n" %
                (str(priority + 7), str(int(stats[AVG_LENGTH][AFTER]))))
            runstatslogger.write(
                "%s\t-max length\t%s\n" %
                (str(priority + 8), str(stats[MAX_LENGTH][AFTER])))
            runstatslogger.write(
                "%s\t-total base pairs (bp)\t%s\n" %
                (str(priority + 9), str(int(stats[AVG_LENGTH][AFTER] * stats[NUMSEQ][AFTER]))))
        else:
            runstatslogger.write(
                "%s\tNumber of translated ORFs BEFORE QC (%s)\t%s\n" %
                (str(priority), opts.seqtype, str(stats[NUMSEQ][BEFORE])))
            runstatslogger.write(
                "%s\t-min length\t%s\n" %
                (str(priority + 1), str(stats[MIN_LENGTH][BEFORE])))
            runstatslogger.write(
                "%s\t-avg length\t%s\n" %
                (str(priority + 2), str(int(stats[AVG_LENGTH][BEFORE]))))
            runstatslogger.write(
                "%s\t-max length\t%s\n" %
                (str(priority + 3), str(stats[MAX_LENGTH][BEFORE])))
            runstatslogger.write(
                "%s\t-total base pairs (bp)\t%s\n" %
                (str(priority + 4), str(int(stats[AVG_LENGTH][BEFORE] * stats[NUMSEQ][BEFORE]))))
            # NOTE(review): "tranlated" is a typo, kept byte-for-byte in case
            # downstream consumers match this exact label — confirm before fixing.
            runstatslogger.write(
                "%s\tNumber of tranlated ORFs AFTER QC (%s)\t%s\n" %
                (str(priority + 5), opts.seqtype, str(stats[NUMSEQ][AFTER])))
            runstatslogger.write(
                "%s\t-min length\t%s\n" %
                (str(priority + 6), str(stats[MIN_LENGTH][AFTER])))
            runstatslogger.write(
                "%s\t-avg length\t%s\n" %
                (str(priority + 7), str(int(stats[AVG_LENGTH][AFTER]))))
            runstatslogger.write(
                "%s\t-max length\t%s\n" %
                (str(priority + 8), str(stats[MAX_LENGTH][AFTER])))
            runstatslogger.write(
                "%s\t-total base pairs (bp)\t%s\n" %
                (str(priority + 9), str(int(stats[AVG_LENGTH][AFTER] * stats[NUMSEQ][AFTER]))))