def main():
    """Entry point: compute RT stop coverage from <.rtsc> files, optionally
    emitting an overlap file of transcripts above the coverage threshold."""
    parser = argparse.ArgumentParser(
        description='Calculates RT stop coverage from <.rtsc> file(s)')
    in_files = parser.add_argument_group('Input')
    in_files.add_argument('fasta', type=str, metavar='fasta',
                          help='Reference Fasta')
    in_files.add_argument('f', type=str, help='Input <.rtsc> files', nargs='+')
    settings = parser.add_argument_group('Settings')
    settings.add_argument('-bases', type=str, default='AC', metavar='ACGT',
                          help='[default = AC] Coverage Specificity')
    settings.add_argument('-ot', type=float, default=1.0,
                          help='[default = 1.0] Overlap file threshold')
    out_files = parser.add_argument_group('Output')
    out_files.add_argument('-name', type=str, default=None,
                          help='Output file name')
    out_files.add_argument('-ol', action='store_true',
                           help='Create an overlap file')
    out_files.add_argument('-on', type=str, metavar='', default=None,
                           help='Overlap file name')
    args = parser.parse_args()
    # Shared stem list: sorted input names with the <.rtsc> extension removed.
    stems = sorted(item.replace('.rtsc', '') for item in args.f)
    # Resolve the coverage output name; a user-supplied -name wins.
    if args.name:
        out_name = sfio.check_extension(args.name, '.csv')
    else:
        out_name = '_'.join(stems + ['coverage']) + '.csv'
    # Collect coverage values and write them out.
    coverage_data = collect_coverages(args.f, args.fasta, args.bases)
    write_coverage(coverage_data, out_name)
    # Optionally emit the overlap file at the requested threshold.
    if args.ol:
        if args.on is None:
            out_ol = '_'.join(stems + ['overlap', str(args.ot)]) + '.txt'
        else:
            out_ol = sfio.check_extension(args.on, '.txt')
        write_ol(coverage_data, out_ol, args.ot)
def main():
    """Entry point: filter SAM alignments and convert each file to a <.rtsc>
    reverse-transcriptase stop file, logging per-file filter statistics."""
    parser = argparse.ArgumentParser(description='Converts <.sam> into reverse transcriptase stop files <.rtsc>')
    in_files = parser.add_argument_group('Input')
    in_files.add_argument('fasta', default=None, help='Index Fasta File')
    in_files.add_argument('sam', default=None, help='Input SAM file(s)', nargs='+')
    settings = parser.add_argument_group('Settings')
    settings.add_argument('-mismatches', type=int, default=3, metavar='<number>', help='[default = 3] Maximum allowed mismatches/indels')
    settings.add_argument('-firstmm', action='store_true', default=False, help='Accept alignments with first base mismatches')
    settings.add_argument('-reverse', action='store_true', default=False, help='Accept alignments to the reverse strand')
    settings.add_argument('-rm_secondary', action='store_false', default=True, help='Remove secondary alignments', dest='secondary')
    out_files = parser.add_argument_group('Output')
    out_files.add_argument('-logname', type=str, default='filter_log.csv', help='[default = filter_log.csv] Log file name')
    parser.set_defaults(r1_unmap=False)
    args = parser.parse_args()
    # Per-transcript length limits derived from the reference fasta.
    reference = sfio.read_fasta(args.fasta)
    limits = {}
    for transcript, sequence in reference.items():
        limits[transcript] = len(sequence)
    # Build the combined SAM filter flag from the criteria left enabled
    # (a False value means that flag should be filtered against).
    criteria = {'r1_unmap': args.r1_unmap,
                'r1_reverse': args.reverse,
                'secondary': args.secondary}
    keyflag = sum(sf3sam.flag_values[label]
                  for label, allowed in criteria.items() if not allowed)
    # Convert every SAM file, recording its filtering statistics.
    log_data = {}
    for sam_file in args.sam:
        log_data[sam_file] = sam_to_rtsc(sam_file, limits, args.mismatches,
                                         args.firstmm, keyflag)
    # Persist the per-file filtering report.
    write_sam_filter_report(log_data, sfio.check_extension(args.logname, '.csv'))
def main():
    """Entry point: estimate transcript abundance (RPKM or TPM) from <.rtsc> files.

    Bug fixes:
    - the default-name join referenced an undefined ``name_1`` (the variable
      defined above it is ``name``), which raised NameError on every run
      without -name;
    - the blank-value selection referenced an undefined bare ``zero`` instead
      of ``args.zero``.
    """
    parser = argparse.ArgumentParser(description='Determines approximate transcript abundance using <.rtsc> files')
    in_files = parser.add_argument_group('Input')
    in_files.add_argument('rtsc', default=None, nargs='+', help='Input <.rtsc> file(s)')
    settings = parser.add_argument_group('Settings')
    settings.add_argument('-mode', type=str.upper, choices=['RPKM', 'TPM'], help='Abundance Metric to Calculate')
    settings.add_argument('-zero', action='store_true', help='Set missing values to zero')
    out_files = parser.add_argument_group('Output')
    out_files.add_argument('-name', default=None, help='Specify output file name')
    args = parser.parse_args()
    #Files to operate on, dictionary of functions
    abundance_methods = {'RPKM': values_to_RPKM, 'TPM': values_to_TPM}
    #Nomenclature
    name = sorted([x.replace('.rtsc', '') for x in args.rtsc]) + [args.mode]
    default_name = '_'.join(name) + '.csv'  # fixed: was '_'.join(name_1)
    out_name = default_name if not args.name else sfio.check_extension(args.name, '.csv')
    #Build data set
    abundances = populate_dictionary(args.rtsc, abundance_methods[args.mode])
    #Write out
    blank = 0.0 if args.zero else 'NA'  # fixed: was bare 'zero'
    write_data_batch(abundances, out_name, args.mode, blank)
def main():
    """Entry point: generate a statistical summary for <.react> files.

    Bug fixes:
    - the output name stem iterated ``in_files`` (the argparse argument
      group object) instead of the actual file list ``args.react``;
    - the final write referenced an undefined ``data`` instead of the
      loaded/filtered ``rx_data``.
    """
    parser = argparse.ArgumentParser(description='Generates a simple statistical summary for <.react> files.')
    in_files = parser.add_argument_group('Input')
    in_files.add_argument('react', default=None, help='Input <.react> files', nargs='+')
    settings = parser.add_argument_group('Settings')
    settings.add_argument('-restrict', default=None, metavar='<.txt>', help='Limit analysis to these specific transcripts')
    settings.add_argument('-trim', type=int, default=20, metavar='<number>', help='[default = 20] ignore n last bp of reactivity')
    settings.add_argument('-minlen', type=int, default=10, metavar='<number>', help='[default = 10] minimum effective length of transcript')
    out_files = parser.add_argument_group('Output')
    out_files.add_argument('-name', default=None, help='Specify output file name')
    args = parser.parse_args()
    #File Input
    rx_data = sfio.read_rx_files(sorted(args.react), '.react', verbose=False)
    #Filter by coverage
    restrict = sfio.read_restrict(args.restrict) if args.restrict else None
    if restrict:
        rx_data = {n: s for n, s in rx_data.items() if n in restrict}
    #Nomenclature
    name_1 = sorted([x.replace('.react', '') for x in args.react])  # fixed: was in_files
    name_2 = [str(qq) + q for qq, q in zip([args.trim, args.minlen], ['trim', 'minlen'])]
    default_name = '_'.join(name_1 + name_2 + ['statistics']) + '.csv'
    out_name = sfio.check_extension(args.name, '.csv') if args.name else default_name
    #Write Out File
    write_out_stats(rx_data, out_name, args.trim, args.minlen)  # fixed: was 'data'
def main():
    """Entry point: pool several <.rtsc> files (typically replicates)
    into a single combined <.rtsc> file."""
    parser = argparse.ArgumentParser(
        description='Combines <.rtsc> files, typically replicates of the same sample')
    in_files = parser.add_argument_group('Input')
    in_files.add_argument('rtsc', help='Input <.rtsc> files', nargs='+')
    out_files = parser.add_argument_group('Output')
    out_files.add_argument('-sort', action='store_true', default=False,
                           help='Sort output by transcript name')
    out_files.add_argument('-name', default=None,
                           help='Specify output file name')
    args = parser.parse_args()
    # Resolve the output name: user choice, or the joined sorted input stems.
    if args.name is None:
        stems = sorted(entry.replace('.rtsc', '') for entry in args.rtsc)
        out_name = '_'.join(stems) + '.rtsc'
    else:
        out_name = sfio.check_extension(args.name, '.rtsc')
    # Pool stop counts from every input, then write the combined file.
    all_stops = merge_rtsc(args.rtsc)
    sfio.write_rtsc(all_stops, args.sort, out_name)
def main():
    """Entry point: analyze native/reagent nucleotide RT stop specificity
    across one or more <.rtsc> files.

    Fix: the -report help text misspelled 'nucleotides' ("nucelotides").
    """
    parser = argparse.ArgumentParser(
        description='Analyzes native/reagent nucleotide RT stop specificity')
    in_files = parser.add_argument_group('Input')
    in_files.add_argument('index', type=str,
                          help="Fasta used to generate the <.rtsc>")
    in_files.add_argument('rtsc', default=None, help='Input <.rtsc>', nargs='+')
    settings = parser.add_argument_group('Settings')
    settings.add_argument('-report', default='ACGT',
                          help='Include these nucleotides in report')
    settings.add_argument('-round', type=int, default=5,
                          help='[default = 5] Decimal places to report',
                          dest='digits')
    out_files = parser.add_argument_group('Output')
    out_files.add_argument('-name', default=None,
                          help='Specify output file name')
    args = parser.parse_args()
    #Outfile Nomenclature
    base_name = sorted([f.replace('.rtsc', '') for f in args.rtsc])
    suffixes = [args.report, 'specificity']
    default_name = '_'.join(base_name + suffixes) + '.csv'
    out_fyle = sfio.check_extension(args.name, '.csv') if args.name else default_name
    #Calculate Specificity
    specificity_data = collect_specificity(args.rtsc, args.index)
    #Writeout
    write_specificity(specificity_data, out_fyle, args.report, args.digits)
def main():
    """Entry point: split fasta sequences into sliding-window segments."""
    parser = argparse.ArgumentParser(description='Splits sequences into sliding window segments')
    in_files = parser.add_argument_group('Input')
    in_files.add_argument('fasta', type=str, help='Input Fasta File')
    settings = parser.add_argument_group('Settings')
    settings.add_argument('-size', default=10, type=int, help='[default = 10] Size of the Windows')
    settings.add_argument('-step', default=5, type=int, help='[default = 5] Step of the Windows')
    out_files = parser.add_argument_group('Output')
    out_files.add_argument('-name', default=None, help='Specify output file name')
    out_files.add_argument('-sort_file', action='store_true', help='Sort the output by name')
    args = parser.parse_args()
    # Read the sequences, then carve them into stepwise windows.
    sequences = sfio.read_fasta(args.fasta)
    windows = fasta_stepwise(sequences, args.size, args.step)
    # Default output name encodes the window size and step.
    if args.name:
        out_name = sfio.check_extension(args.name, '.fasta')
    else:
        stem = sfio.rm_ext(args.fasta, '.fasta', '.fas', '.fa')
        pieces = [stem, str(args.size) + 'window', str(args.step) + 'step']
        out_name = '_'.join(pieces) + '.fasta'
    sfio.write_fasta(windows, out_name, args.sort_file)
def main():
    """Entry point: batch-fold transcripts with the RNAStructure suite
    (partition-smp or Fold-smp), optionally guided by <.react> data.
    Vienna mode is accepted but not yet implemented.

    Bug fixes relative to the original:
    - the ``-bp`` folder default was 'PS' (copy-paste), colliding with the
      ``-ps`` folder; changed to 'BP';
    - the constrained partition loop unpacked ``seqeunce`` but used ``seq``
      in the body (NameError on the first iteration);
    - the in-silico partition branch used undefined names
      ``out_pfs``/``out_bp``/``out_ct`` instead of
      ``pf_out``/``bp_out``/``ct_out``;
    - RNAStructure subprocess calls now consistently pass ``env=rna_env``
      so OMP_NUM_THREADS (-cores) is honored by every invoked binary.
    """
    parser = argparse.ArgumentParser(description='Batch runs folding programs')
    in_files = parser.add_argument_group('Input')
    in_files.add_argument('fasta', default=None, help='Reference Fasta File')
    settings = parser.add_argument_group('Settings')
    settings.add_argument('-mode', type=str.upper, default='R',
                          choices=['R', 'V'], help='RNAStructure or Vienna')
    settings.add_argument('-react', default=None, metavar='<.react>',
                          help='React file to use as restraints/constraints')
    settings.add_argument('-restrict', default=None, metavar='<.txt>',
                          help='Limit folding to these specific transcripts')
    settings.add_argument('-temp', type=float, default=310.15,
                          help='[default = 310.15] Temperature to use for folding')
    settings.add_argument('-cores', default='4', type=str,
                          help="[default = 4] Number of cores to use")
    settings.add_argument('-distance', default=99999, type=int,
                          help='[default = 99999] Maximum pairing distance')
    settings.add_argument('-minlen', default=10, type=int,
                          help='[default = 10] Minimum length to fold')
    settings.add_argument('-maxlen', default=5000, type=int,
                          help='[default = 5000] Maximum length to fold')
    settings.add_argument('-truncate', default=0, type=int,
                          help='[default = 0] Ignore <.react> for last N nucleotides')
    settings.add_argument('-threshold', default=None, type=float,
                          help='Apply hard constraints using this threshold',
                          dest='th')
    rna = parser.add_argument_group('RNAStructure Settings')
    rna.add_argument('-slope', default=1.8, type=float,
                     help='[default = 1.8] Parameter for RNAstructure')
    rna.add_argument('-intercept', default=-0.6, type=float,
                     help='[default = -0.6] Parameter for RNAstructure')
    rna.add_argument('-partition', action='store_true',
                     help='Use the partition function')
    rna.add_argument('-multiple', action='store_true',
                     help='Output all structures rather than just MFE')
    out_files = parser.add_argument_group('Output')
    out_files.add_argument('-errorname', default='errors.csv', help='Name the error log')
    out_files.add_argument('-paraname', default='parameters.csv', help='Name the parameter log')
    out_files.add_argument('-outdir', default='folded', help='Name the out directory')
    out_files.add_argument('-ct', default='CT', help='Name the ct folder')
    out_files.add_argument('-ps', default='PS', help='Name the ps folder')
    # Fixed: default was 'PS', which collided with the -ps folder.
    out_files.add_argument('-bp', default='BP', help='Name the bp folder (Partition Only)')
    out_files.add_argument('-pfs', default='PFS', help='Name the pfs folder (Partition Only)')
    args = parser.parse_args()
    #Prepare out directory
    if os.path.isdir(args.outdir):
        print('Out directory already exists, quitting...')
        quit()
    else:
        os.mkdir(args.outdir)
    #Input
    sequences = sfio.read_fasta(args.fasta)
    xstraints = sfio.read_react(args.react) if args.react else None
    restrict = sfio.read_restrict(args.restrict) if args.restrict else None
    #Populate and filter errors
    errors = {}
    gen_length_errors(errors, sequences, args.minlen, args.maxlen)
    gen_xstraint_errors(errors, sequences, xstraints)
    if errors:
        sl = len(sequences)
        sequences = {z: y for z, y in sequences.items() if z not in errors}
        el = len(sequences)
        print('After screening errors, {} of {} folds remain'.format(el, sl))
    if restrict:
        sl = len(sequences)
        sequences = {n: s for n, s in sequences.items() if n in restrict}
        el = len(sequences)
        print('After restricting analysis, {} of {} folds remain'.format(el, sl))
    #Housekeeping in base directory
    write_params_log(
        args,
        os.path.join(args.outdir, sfio.check_extension(args.paraname, '.csv')))
    write_error_log(
        errors,
        os.path.join(args.outdir, sfio.check_extension(args.errorname, '.csv')))
    #RNAStructure Suite
    if args.mode == 'R':
        # All RNAStructure -smp binaries read OMP_NUM_THREADS from the env.
        rna_env = {**os.environ, 'OMP_NUM_THREADS': args.cores}
        options = {'-t': args.temp, '-md': args.distance}
        #partition-smp Suite
        if args.partition:
            paths = gen_paths(args.outdir, args.pfs, args.bp, args.ps, args.ct)
            extensions = ['.pfs', '.bp', '.ps', '.ct']
            #Restrained/Constrained
            if xstraints:
                # Slope/intercept apply only to soft restraints, not hard constraints.
                opts = {} if args.th else {'-si': args.intercept, '-sm': args.slope}
                options.update(opts)
                out_type = 'constraint' if args.th else 'restraint'
                suffixes = [[str(args.temp), out_type + X] for X in extensions]
                params = sfio.flatten_list([[k, str(v)] for k, v in options.items()])
                #Iterate through sequences
                for name, seq in sequences.items():  # fixed: loop var was 'seqeunce'
                    temp_seq = gen_temp_sequence(name, seq, args.outdir)
                    gargs = [name, xstraints[name], args.outdir, args.truncate]
                    guide = gen_cons(*gargs, args.th) if args.th else gen_rest(*gargs)
                    #Out Files
                    pf_out, bp_out, ps_out, ct_out = gen_file_paths(name, paths, suffixes)
                    #Run Commands
                    gflag = '-c' if args.th else '-sh'
                    pf_command = ['partition-smp', temp_seq, pf_out, gflag, guide] + params
                    bp_command = ['ProbabilityPlot', pf_out, bp_out, '-t']
                    ps_command = ['ProbabilityPlot', pf_out, ps_out]
                    subprocess.run(pf_command, stdout=subprocess.DEVNULL, env=rna_env)
                    subprocess.run(bp_command, stdout=subprocess.DEVNULL, env=rna_env)
                    subprocess.run(ps_command, stdout=subprocess.DEVNULL, env=rna_env)
                    #Generate CT file (skip if the base-pair table is empty)
                    if file_len(bp_out) > 2:
                        ct_command = ['MaxExpect', pf_out, ct_out]
                        subprocess.run(ct_command, stdout=subprocess.DEVNULL, env=rna_env)
                    remove_temp(temp_seq, guide)
            #in-silico
            else:
                suffixes = [[str(args.temp), 'silico' + X] for X in extensions]
                params = sfio.flatten_list([[k, str(v)] for k, v in options.items()])
                #Iterate through sequences
                for name, seq in sequences.items():
                    temp_seq = gen_temp_sequence(name, seq, args.outdir)
                    #Out Files
                    pf_out, bp_out, ps_out, ct_out = gen_file_paths(name, paths, suffixes)
                    #Run Commands (fixed: out_pfs/out_bp/out_ct were undefined)
                    pf_command = ['partition-smp', temp_seq, pf_out] + params
                    bp_command = ['ProbabilityPlot', pf_out, bp_out, '-t']
                    ps_command = ['ProbabilityPlot', pf_out, ps_out]
                    subprocess.run(pf_command, stdout=subprocess.DEVNULL, env=rna_env)
                    subprocess.run(bp_command, stdout=subprocess.DEVNULL, env=rna_env)
                    subprocess.run(ps_command, stdout=subprocess.DEVNULL, env=rna_env)
                    #Generate CT file (skip if the base-pair table is empty)
                    if file_len(bp_out) > 2:
                        ct_command = ['MaxExpect', pf_out, ct_out]
                        subprocess.run(ct_command, stdout=subprocess.DEVNULL, env=rna_env)
                    remove_temp(temp_seq)
        #Fold-smp Suite
        else:
            paths = gen_paths(args.outdir, args.ct, args.ps)
            extensions = ['.ct', '.ps']
            flags = {'-mfe': not args.multiple}
            applied_flags = [k for k, v in flags.items() if v]
            #Restrained/Constrained
            if xstraints:
                out_type = 'constraint' if args.th else 'restraint'
                suffixes = [[str(args.temp), out_type + X] for X in extensions]
                opts = {} if args.th else {'-si': args.intercept, '-sm': args.slope}
                options.update(opts)
                params = sfio.flatten_list([[k, str(v)] for k, v in options.items()])
                params.extend(applied_flags)
                #Iterate through sequences
                for name, seq in sequences.items():
                    temp_seq = gen_temp_sequence(name, seq, args.outdir)
                    gargs = [name, xstraints[name], args.outdir, args.truncate]
                    guide = gen_cons(*gargs, args.th) if args.th else gen_rest(*gargs)
                    gflag = '-c' if args.th else '-sh'
                    #Out Files
                    ct_out, ps_out = gen_file_paths(name, paths, suffixes)
                    #Run Commands
                    fold_command = ['Fold-smp', temp_seq, ct_out, gflag, guide] + params
                    draw_command = ['draw', ct_out, ps_out]
                    subprocess.run(fold_command, stdout=subprocess.DEVNULL, env=rna_env)
                    subprocess.run(draw_command, stdout=subprocess.DEVNULL, env=rna_env)
                    #Remove Temporary Files
                    remove_temp(temp_seq, guide)
            #in-silico
            else:
                out_type = 'silico'
                suffixes = [[str(args.temp), out_type + X] for X in extensions]
                params = sfio.flatten_list([[k, str(v)] for k, v in options.items()])
                params.extend(applied_flags)
                #Iterate through sequences
                for name, seq in sequences.items():
                    #Generate Temporary Files
                    temp_seq = gen_temp_sequence(name, seq, args.outdir)
                    #Out Files
                    ct_out, ps_out = gen_file_paths(name, paths, suffixes)
                    #Run Commands
                    fold_command = ['Fold-smp', temp_seq, ct_out] + params
                    draw_command = ['draw', ct_out, ps_out]
                    subprocess.run(fold_command, stdout=subprocess.DEVNULL, env=rna_env)
                    subprocess.run(draw_command, stdout=subprocess.DEVNULL, env=rna_env)
                    #Remove Temporary Files
                    remove_temp(temp_seq)
    #Vienna Package Suite
    if args.mode == 'V':
        print('Vienna Package not yet supported')
def main():
    """Entry point: reformat matched <.rtsc> or <.react> files into a CSV
    suitable for correlation analysis; all inputs must share one type."""
    parser = argparse.ArgumentParser(
        description='Reformats <.rtsc>/<.react> for easy correlation analysis')
    in_files = parser.add_argument_group('Input')
    in_files.add_argument('fasta', help='Reference Fasta')
    in_files.add_argument('rx', help='Input <.rx> files', nargs='+')
    settings = parser.add_argument_group('Settings')
    settings.add_argument('-verbose', action='store_true',
                          help='Display metrics')
    settings.add_argument('-bases', default='AGCT', metavar='ACGT',
                          help='[default = ACGT] Nucleotide specifictiy')
    settings.add_argument('-restrict', default=None, metavar='<.txt>',
                          help='Filter to these transcripts via coverage file')
    out_files = parser.add_argument_group('Output')
    out_files.add_argument('-name', default=None,
                           help='Specify output file name')
    args = parser.parse_args()
    # Shared setup: optional coverage filter, reference sequences, name stems.
    covered = sfio.read_restrict(args.restrict) if args.restrict else None
    fasta_dict = sfio.read_fasta(args.fasta)
    desc = [sfio.rm_ext(entry, '.rtsc', '.react') for entry in args.rx]
    # Dispatch by file type; every input must share the same extension.
    writers = {'rtsc': write_rtsc_repeatability,
               'react': write_react_repeatability}
    seen_exts = {entry.split('.')[-1] for entry in args.rx}
    if len(seen_exts) == 1 and seen_exts <= set(writers):
        mode = seen_exts.pop()
        rx_data = sfio.read_rx_files(args.rx, mode=mode, verbose=args.verbose)
        default_name = '_'.join(desc + [mode] + ['correlation.csv'])
        out_name = sfio.check_extension(args.name, '.csv') if args.name else default_name
        if args.restrict:
            rx_data = {n: s for n, s in rx_data.items() if n in covered}
        if args.verbose:
            print('Remaining after filtering', len(rx_data), sep=',')
            print('Writing File:', out_name, sep=',')
        writers[mode](rx_data, fasta_dict, out_name, args.bases)
    #Files do not make sense
    else:
        print('All Files must be the same type, and either .rtsc or .react!')
def main():
    """Entry point: derive a <.react> reactivity file from a control and a
    treatment <.rtsc> file, with optional normalization and filtering.

    Bug fix: the 'nrm' output-name tag tested ``args.ln_off`` (copy-paste
    from the 'ln' tag) instead of ``args.nrm_off``, so running with
    -nrm_off still tagged the file as normalized.
    """
    parser = argparse.ArgumentParser(
        description='Generates a <.react> file from two <.rtsc> files')
    in_files = parser.add_argument_group('Input')
    in_files.add_argument('control', type=str, help='Control <.rtsc> file')
    in_files.add_argument('treatment', type=str, help='Reagent <.rtsc> file')
    in_files.add_argument('fasta', type=str, help='Transcript <.fasta> file')
    settings = parser.add_argument_group('Settings')
    settings.add_argument('-restrict', default=None, metavar='<.txt>',
                          help='Limit analysis to these specific transcripts')
    settings.add_argument('-scale', type=str, default=None, metavar='<.scale>',
                          help='Provide a normalizaiton file for calculation')
    settings.add_argument('-threshold', type=float, default=7.0,
                          help='[default = 7.0] Maximum Reactivity Cap')
    settings.add_argument('-ln_off', action='store_true',
                          help='Do not take the natural log of the stop counts')
    settings.add_argument('-nrm_off', action='store_true',
                          help='Do not perform final 2-8' + u"\uFF05" + ' reactivity normalization')
    settings.add_argument('-bases', type=str, default='AC', metavar='ACGT',
                          help='[default = AC] Reaction Specificity')
    out_files = parser.add_argument_group('Output')
    out_files.add_argument('-save_fails', action='store_true',
                           help='Log transcripts with zero or missing scales')
    out_files.add_argument('-name', type=str, default=None,
                           help='Specify output file name')
    args = parser.parse_args()
    #Create output name
    base_tag = [
        x.split(os.sep)[-1].replace('.rtsc', '')
        for x in [args.control, args.treatment]
    ]
    log_tag = ['ln'] if args.ln_off == False else []
    nrm_tag = ['nrm'] if args.nrm_off == False else []  # fixed: was args.ln_off
    base_name = '_'.join(base_tag + log_tag + nrm_tag)
    out_name = base_name + '.react' if args.name == None else sfio.check_extension(
        args.name, '.react')
    #Read in data
    control_data, treatment_data = map(sfio.read_rtsc,
                                       [args.control, args.treatment])
    #Apply filter if enabled
    if args.restrict:
        covered = sfio.read_restrict(args.restrict)
        control_data = {n: s for n, s in control_data.items() if n in covered}
        treatment_data = {n: s for n, s in treatment_data.items() if n in covered}
    #Calculate Derived Reactivity
    data = calculate_raw_reactivity(control_data, treatment_data, args.ln_off)
    #Read in transcript sequences
    seqs = sfio.read_fasta(args.fasta)
    #Generate and write scale, or read a <.scale> file in
    if args.scale == None:
        normalizaiton_scale = generate_normalization_scale(data, seqs, args.bases)
        write_norm_scale(normalizaiton_scale,
                         out_name.replace('.react', '.scale'))
    else:
        normalizaiton_scale = read_norm_scale(args.scale)
    #Calculate Final Reactivity
    out_reactivity, out_missing = calculate_final_reactivity(
        data, seqs, args.bases, args.threshold, normalizaiton_scale,
        args.nrm_off)
    #Write Out
    sfio.write_react(out_reactivity, out_name, sort_flag=True)
    #Write Out Fails
    if args.save_fails:
        sfio.write_keys(
            out_missing,
            out_name.replace('.react', '_unresolvable_transcripts.txt'))