def main(argv=sys.argv[1:]):
    """Command-line program

    Read a GFF3 file and write a tab-delimited table that gives, for every
    non-excluded feature type, the number of features of that type and a
    tally of the types of their parents.

    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :py:func:`main` is called directly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script is
        invoked from the command line
    """
    bp = BaseParser()
    parser = argparse.ArgumentParser(
        description=format_module_docstring(__doc__),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        parents=[bp.get_parser()],
    )
    parser.add_argument("--exclude", nargs="+", default=[],
                        help="Feature types to exclude from consideration")
    parser.add_argument("infile", metavar="infile.gff", type=str,
                        help="Input GFF3 file")
    parser.add_argument("outfile", metavar="outfile.txt", type=str,
                        help="Name of output file")

    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)

    excluded = set(args.exclude)
    reading_stdin = args.infile == "-"
    fin = sys.stdin if reading_stdin else opener(args.infile)

    feature_counts = Counter()   # feature type -> number of features
    features_with_parents = []   # features whose parents are resolved in pass 2
    feature_types = {}           # feature type -> Counter of parent types
    name_type = {}               # feature name -> feature type

    printer.write("Opening %s..." % args.infile)
    try:
        # Pass 1: count features and remember the type of each named feature
        for c, feature in enumerate(GFF3_Reader(fin, return_stopfeatures=False)):
            if c % 10000 == 0:
                printer.write("Processed %s features..." % c)

            ftype = feature.attr["type"]
            if ftype in excluded:
                continue

            if ftype not in feature_types:
                feature_types[ftype] = Counter()

            feature_counts[ftype] += 1

            fname = feature.get_name()
            if fname is not None:
                name_type[fname] = ftype

            if "Parent" in feature.attr:
                features_with_parents.append(feature)
            else:
                feature_types[ftype]["parent unspecified"] += 1
    finally:
        # Release the input handle even if parsing fails; never close stdin.
        if not reading_stdin:
            fin.close()

    # Pass 2: parents can only be resolved once every feature name is known
    printer.write("Sorting parents...")
    for c, feature in enumerate(features_with_parents):
        if c % 10000 == 0:
            printer.write("Processed %s parents..." % c)

        pnames = feature.attr["Parent"]
        ftype = feature.attr["type"]
        if not pnames:
            # Empty value (empty string or empty collection): no usable parent
            feature_types[ftype]["parent unspecified"] += 1
        elif len(pnames) > 1:
            feature_types[ftype]["multiple parents"] += 1
        else:
            ptype = name_type.get(pnames[0], "parent not in database")
            feature_types[ftype][ptype] += 1

    rows = sorted(feature_types.keys())
    cols = rows + ["parent unspecified", "parent not in database", "multiple parents"]

    with argsopener(args.outfile, args, "w") as fh:
        printer.write("Writing %s..." % args.outfile)
        fh.write("#feature_type\tcount\t" + "\t".join(cols) + "\n")
        for r in rows:
            fields = ["%s" % r, "%s" % feature_counts[r]]
            fields.extend("%s" % feature_types[r].get(i, 0) for i in cols)
            fh.write("%s\n" % "\t".join(fields))

    printer.write("Done.")
def main(argv=sys.argv[1:]):
    """Command-line program

    For every region of interest in an annotation, sum the read counts that
    fall in its unmasked positions and write one tab-delimited row per region
    with counts, counts per nucleotide, RPKM and masked length.

    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :func:`main` is called directly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script is
        invoked from the command line
    """
    ap = AnnotationParser()
    annotation_file_parser = ap.get_parser(conflict_handler="resolve")

    al = AlignmentParser(disabled=_DISABLED)
    alignment_file_parser = al.get_parser(conflict_handler="resolve")

    mp = MaskParser()
    mask_file_parser = mp.get_parser()

    bp = BaseParser()
    base_parser = bp.get_parser()

    parser = argparse.ArgumentParser(
        description=format_module_docstring(__doc__),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        parents=[
            base_parser,
            alignment_file_parser,
            annotation_file_parser,
            mask_file_parser,
        ],
    )
    parser.add_argument("outfile", type=str, help="Output filename")

    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)

    ga = al.get_genome_array_from_args(args, printer=printer)
    transcripts = ap.get_transcripts_from_args(args, printer=printer, return_type=SegmentChain)
    crossmap = mp.get_genome_hash_from_args(args, printer=printer)

    ga_sum = ga.sum()
    # reads per kilobase per million reads: 1000 nt * 1e6 reads / total counts
    normconst = 1000.0 * 1e6 / ga_sum

    # Tracked separately from the loop index so that the final message is
    # correct (a count, not the last 0-based index) and defined even when
    # the annotation yields no regions at all.
    n_regions = 0

    with argsopener(args.outfile, args, "w") as fout:
        fout.write("## total_dataset_counts: %s\n" % ga_sum)
        fout.write("region_name\tregion\tcounts\tcounts_per_nucleotide\trpkm\tlength\n")
        for n, ivc in enumerate(transcripts):
            name = ivc.get_name()
            masks = crossmap.get_overlapping_features(ivc)
            ivc.add_masks(*itertools.chain.from_iterable(masks))
            if n % 1000 == 0:
                printer.write("Processed %s regions..." % n)

            counts = numpy.nansum(ivc.get_masked_counts(ga))
            length = ivc.masked_length
            if length == 0:
                # fully masked region: densities are undefined
                rpnt = numpy.nan
                rpkm = numpy.nan
            else:
                rpnt = float(counts) / length
                rpkm = rpnt * normconst

            ltmp = [
                name,
                str(ivc),
                "%.8e" % counts,
                "%.8e" % rpnt,
                "%.8e" % rpkm,
                "%d" % length,
            ]
            fout.write("%s\n" % "\t".join(ltmp))
            n_regions = n + 1

    printer.write("Processed %s regions total." % n_regions)
    printer.write("Done.")
def main(argv=sys.argv[1:]):
    """Command-line program

    Export read alignments or counts as two browser tracks, one per strand,
    in bedGraph or variable-step wiggle format. Output files are named
    ``<outbase>_fw.wig`` and ``<outbase>_rc.wig``.

    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :py:func:`main` is called directly.

        Default: sys.argv[1:] (actually command-line arguments)
    """
    ap = AlignmentParser()
    bp = BaseParser()
    parser = argparse.ArgumentParser(
        description=format_module_docstring(__doc__),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        parents=[bp.get_parser(), ap.get_parser()],
    )
    parser.add_argument("-o", "--out", dest="outbase", type=str, required=True,
                        metavar="FILENAME",
                        help="Base name for output files")
    parser.add_argument("--window_size", default=100000, metavar="N", type=int,
                        help="Size of nucleotides to fetch at once for export. "
                             "Large values are faster but require more memory "
                             "(Default: 100000)")

    track_opts = parser.add_argument_group(title="Browser track options")
    # The help text was previously split with a backslash *inside* the string
    # literal, which embedded the continuation line's whitespace in the text.
    track_opts.add_argument("--color", type=str, default=None,
                            help="An RGB hex string (`'#NNNNNN'`, `N` in `[0-9,A-F]`) specifying "
                                 "the track color.")
    track_opts.add_argument("-t", "--track_name", dest="track_name", type=str,
                            help="Name to give browser track",
                            default=None)
    track_opts.add_argument("--output_format", choices=("bedgraph", "variable_step"),
                            default="bedgraph",
                            help="Format of output file (Default: bedgraph)")

    args = parser.parse_args(argv)
    # Process base options before loading data, as the other command-line
    # entry points in this package do.
    bp.get_base_ops_from_args(args)
    gnd = ap.get_genome_array_from_args(args, printer=printer)

    name = args.outbase if args.track_name is None else args.track_name

    if args.color is not None:
        fw_color = rc_color = "%s,%s,%s" % tuple(get_rgb255(args.color))
    else:
        fw_color = rc_color = "0,0,0"

    # argparse `choices` guarantees that one of these branches is taken
    if args.output_format == "bedgraph":
        outfn = gnd.to_bedgraph
    elif args.output_format == "variable_step":
        outfn = gnd.to_variable_step

    track_fw = "%s_fw.wig" % args.outbase
    track_rc = "%s_rc.wig" % args.outbase

    with argsopener(track_fw, args, "w") as fw_out:
        printer.write("Writing forward strand track to %s ..." % track_fw)
        outfn(fw_out, "%s_fw" % name, "+",
              window_size=args.window_size, color=fw_color, printer=printer)

    with argsopener(track_rc, args, "w") as rc_out:
        printer.write("Writing reverse strand track to %s ..." % track_rc)
        outfn(rc_out, "%s_rc" % name, "-",
              window_size=args.window_size, color=rc_color, printer=printer)

    printer.write("Done!")
def main(args=sys.argv[1:]):
    """Command-line program

    For every region of interest in an annotation, fetch the vector of masked
    counts at each position and save it as a text file named
    ``<out_prefix><region name>.txt`` inside the output folder.

    Parameters
    ----------
    args : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :func:`main` is called directly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script is
        invoked from the command line
    """
    al = AlignmentParser()
    an = AnnotationParser()
    mp = MaskParser()
    bp = BaseParser()

    alignment_file_parser = al.get_parser(conflict_handler="resolve")
    annotation_file_parser = an.get_parser(conflict_handler="resolve")
    mask_file_parser = mp.get_parser()
    base_parser = bp.get_parser()

    parser = argparse.ArgumentParser(
        description=format_module_docstring(__doc__),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        conflict_handler="resolve",
        parents=[
            base_parser,
            alignment_file_parser,
            annotation_file_parser,
            mask_file_parser,
        ],
    )
    parser.add_argument("out_folder", type=str, help="Folder in which to save output vectors")
    parser.add_argument(
        "--out_prefix",
        default="",
        type=str,
        help="Prefix to prepend to output files (default: no prefix)",
    )
    parser.add_argument(
        "--format",
        default="%.8f",
        type=str,
        help=r"printf-style format string for output (default: '%%.8f')",
    )

    args = parser.parse_args(args)
    bp.get_base_ops_from_args(args)

    # if output folder doesn't exist, create it; makedirs (unlike mkdir) also
    # creates missing intermediate folders, so nested paths work
    if not os.path.isdir(args.out_folder):
        os.makedirs(args.out_folder)

    # parse args
    ga = al.get_genome_array_from_args(args, printer=printer)
    transcripts = an.get_segmentchains_from_args(args, printer=printer)
    mask_hash = mp.get_genome_hash_from_args(args, printer=printer)

    # evaluate
    for n, tx in enumerate(transcripts):
        if n % 1000 == 0:
            printer.write("Processed %s regions of interest" % n)

        filename = "%s%s.txt" % (args.out_prefix, tx.get_name())
        full_filename = os.path.join(args.out_folder, filename)

        # mask out overlapping masked regions
        for feature in mask_hash.get_overlapping_features(tx):
            tx.add_masks(*feature.segments)

        count_vec = tx.get_masked_counts(ga)
        numpy.savetxt(full_filename, count_vec, fmt=args.format)
def main(argv=sys.argv[1:]):
    """Command-line program

    Read transcripts from an annotation and write them out as GTF2 or as
    (optionally extended) BED. For BED output, a message describing how to
    build a bigBed file from the result is printed at the end.

    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :py:func:`main` is called directly.

        Default: sys.argv[1:] (actually command-line arguments)
    """
    ap = AnnotationParser()
    bp = BaseParser()
    annotation_parser = ap.get_parser()
    base_parser = bp.get_parser()

    parser = argparse.ArgumentParser(
        description=format_module_docstring(__doc__),
        parents=[base_parser, annotation_parser],
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    # `store_false`: supplying the flag turns escaping OFF. The help text used
    # to claim the opposite.
    parser.add_argument("--no_escape", default=True, action="store_false",
                        help="If specified and output format is GTF2, special characters in column 9 will NOT be escaped (default: special characters are escaped)")
    parser.add_argument("--output_format", choices=["BED", "GTF2"], default="GTF2",
                        help="Format of output file. (default: GTF2)")
    parser.add_argument("--extra_columns", nargs="+", default=[], type=str,
                        help="Attributes (e.g. 'gene_id' to output as extra columns in extended BED format (BED output only).")
    parser.add_argument("--empty_value", default="na", type=str,
                        help="Value to use of an attribute in `extra_columns` is not defined for a particular record (Default: 'na'")
    parser.add_argument("outfile", metavar="outfile.[ bed | gtf ]", type=str,
                        help="Output file")

    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)

    end_message = ""
    extra_cols = args.extra_columns
    if args.output_format == "BED":
        if extra_cols is not None:
            # avoid name clashes
            names_used = copy.copy(BED12_RESERVED_NAMES)
            asql_names = [fix_name(X, names_used) for X in extra_cols]
            autosql_str = "\n".join(
                AUTOSQL_ROW_FMT_STR % (X, " " * max(15 - len(X), 2)) for X in asql_names
            )

            file_info = {
                "outbase": args.outfile.replace(".bed", "").replace(".gtf", ""),
                "numcols": len(extra_cols),
                "autosql": DEFAULT_AUTOSQL_STR % (os.path.basename(args.outfile[:-4]), autosql_str),
            }
            end_message = MAKE_BIGBED_MESSAGE % file_info
    elif extra_cols:
        # Warn only when the user actually supplied extra columns. The option
        # defaults to an empty list, so an `is not None` test here warned on
        # every GTF2 run.
        warn("`--extra_columns` is ignored for %s-formatted output." % (args.output_format), ArgumentWarning)

    with argsopener(args.outfile, args, "w") as fout:
        c = 0
        transcripts = ap.get_transcripts_from_args(args, printer=printer)

        for transcript in transcripts:
            if args.output_format == "GTF2":
                fout.write(transcript.as_gtf(escape=args.no_escape))
            elif args.output_format == "BED":
                fout.write(transcript.as_bed(extra_columns=extra_cols, empty_value=args.empty_value))
            if c % 1000 == 1:
                printer.write("Processed %s transcripts ..." % c)
            c += 1

    printer.write("Processed %s transcripts total." % c)
    printer.write("Done.")
    print(end_message)
def main(argv=sys.argv[1:]):
    """Command-line program

    Collect the unique splice junctions (pairs of adjacent segments) found in
    the multi-segment chains of an annotation and write them to
    ``<outbase>.bed``, and optionally also to a tophat ``<outbase>.juncs``
    file.

    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :py:func:`main` is called directly.

        Default: sys.argv[1:] (actually command-line arguments)
    """
    ap = AnnotationParser(input_choices=_ANNOTATION_INPUT_CHOICES)
    annotation_file_parser = ap.get_parser()

    bp = BaseParser()
    base_parser = bp.get_parser()

    parser = argparse.ArgumentParser(
        description=format_module_docstring(__doc__),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        parents=[base_parser, annotation_file_parser],
    )
    parser.add_argument("--export_tophat", default=False, action="store_true",
                        help="Export tophat `.juncs` file in addition to BED output")
    parser.add_argument("outbase", type=str, help="Basename for output files")

    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)

    transcripts = ap.get_transcripts_from_args(args, printer=printer, return_type=SegmentChain)

    tophat_out = None
    with argsopener("%s.bed" % args.outbase, args, "w") as bed_out:
        if args.export_tophat:
            tophat_out = open("%s.juncs" % args.outbase, "w")

        try:
            printer.write("params: " + " ".join(argv))
            printer.write("Detecting & comparing junctions...")

            # (chrom, strand) -> set of (end of upstream segment, start of
            # downstream segment). A set keeps the "already seen?" test O(1).
            ex_pairs = {}

            c = 0  # junctions examined
            u = 0  # unique junctions written
            for chain in transcripts:
                if len(chain) <= 1:
                    # single-segment chains have no junctions
                    continue

                chrom = chain.chrom
                strand = chain.strand
                ep = ex_pairs.setdefault((chrom, strand), set())

                for i in range(len(chain) - 1):
                    seg1 = chain[i]
                    seg2 = chain[i + 1]
                    if c % 1000 == 0 and c > 0:
                        printer.write("Processed %s junctions. Found %s unique..." % (c, u))
                    c += 1

                    key = (seg1.end, seg2.start)
                    if key in ep:
                        continue

                    ep.add(key)
                    u += 1
                    bed_out.write(SegmentChain(seg1, seg2).as_bed())
                    if tophat_out is not None:
                        my_junc = (chrom, seg1.end - 1, seg2.start, strand)
                        tophat_out.write("%s\t%s\t%s\t%s\n" % my_junc)

            printer.write("Processed %s total junctions. Found %s unique." % (c, u))
        finally:
            # close the optional tophat file even if processing fails
            if tophat_out is not None:
                tophat_out.close()

    printer.write("Done.")
def main(args=sys.argv[1:]):
    """Command-line program

    Save, for each region of interest in an annotation, its vector of masked
    per-position counts to ``<out_folder>/<out_prefix><region name>.txt``.

    Parameters
    ----------
    args : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :func:`main` is called directly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script is
        invoked from the command line
    """
    al = AlignmentParser()
    an = AnnotationParser()
    mp = MaskParser()
    bp = BaseParser()

    alignment_file_parser = al.get_parser(conflict_handler="resolve")
    annotation_file_parser = an.get_parser(conflict_handler="resolve")
    mask_file_parser = mp.get_parser()
    base_parser = bp.get_parser()

    parent_parsers = [
        base_parser,
        alignment_file_parser,
        annotation_file_parser,
        mask_file_parser,
    ]
    parser = argparse.ArgumentParser(
        description=format_module_docstring(__doc__),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        conflict_handler="resolve",
        parents=parent_parsers)

    parser.add_argument("out_folder",
                        type=str,
                        help="Folder in which to save output vectors")
    parser.add_argument(
        "--out_prefix",
        default="",
        type=str,
        help="Prefix to prepend to output files (default: no prefix)")
    parser.add_argument(
        "--format",
        default="%.8f",
        type=str,
        help=r"printf-style format string for output (default: '%%.8f')")

    args = parser.parse_args(args)
    bp.get_base_ops_from_args(args)

    # Create the output folder if needed. os.makedirs also creates any
    # missing parent folders, where os.mkdir would raise for a nested path.
    if not os.path.isdir(args.out_folder):
        os.makedirs(args.out_folder)

    # parse args
    ga = al.get_genome_array_from_args(args, printer=printer)
    transcripts = an.get_segmentchains_from_args(args, printer=printer)
    mask_hash = mp.get_genome_hash_from_args(args, printer=printer)

    # evaluate
    for n, tx in enumerate(transcripts):
        if n % 1000 == 0:
            printer.write("Processed %s regions of interest" % n)

        filename = "%s%s.txt" % (args.out_prefix, tx.get_name())
        full_filename = os.path.join(args.out_folder, filename)

        # mask out overlapping masked regions
        overlapping = mask_hash.get_overlapping_features(tx)
        for feature in overlapping:
            tx.add_masks(*feature.segments)

        count_vec = tx.get_masked_counts(ga)
        numpy.savetxt(full_filename, count_vec, fmt=args.format)
def main(argv=sys.argv[1:]):
    """Command-line program

    Estimate a P-site offset for each read length from metagene profiles
    around start codons. Writes the metagene profiles, a table of offsets by
    read length, and a plot of the profiles annotated with the chosen offsets.

    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :py:func:`main` is called directly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script is
        invoked from the command line
    """
    ap = AlignmentParser(allow_mapping=False, input_choices=["BAM"],
                         disabled=["normalize", "big_genome", ])
    bp = BaseParser()
    alignment_file_parser = ap.get_parser()
    base_parser = bp.get_parser()

    pp = PlottingParser()
    plotting_parser = pp.get_parser()

    parser = argparse.ArgumentParser(
        description=format_module_docstring(__doc__),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        parents=[base_parser, alignment_file_parser, plotting_parser],
    )

    parser.add_argument("--min_counts", type=int, default=10, metavar="N",
                        help="Minimum counts required in normalization region " +
                             "to be included in metagene average (Default: 10)")
    parser.add_argument("--normalize_over", type=int, nargs=2, metavar="N",
                        default=None,
                        help="Portion of each window against which its individual raw count profile" +
                             " will be normalized. Specify two integers, in nucleotide" +
                             " distance from landmark (negative for upstream, positive for downstream. Surround negative numbers with quotes.). (Default: 20 50)")
    parser.add_argument("--norm_region", type=int, nargs=2, metavar="N",
                        default=None,
                        help="Deprecated. Use ``--normalize_over`` instead. " +
                             "Formerly, Portion of each window against which its individual raw count profile" +
                             " will be normalized. Specify two integers, in nucleotide" +
                             " distance, from 5\' end of window. (Default: 70 100)")
    parser.add_argument("--require_upstream", default=False, action="store_true",
                        help="If supplied, the P-site offset is taken to be the distance " +
                             "between the largest peak upstream of the start codon and " +
                             "the start codon itself. Otherwise, the P-site offset is taken " +
                             "to be the distance between the largest peak in the entire ROI " +
                             "and the start codon. Ignored if ``--constrain`` is used."
                        )
    parser.add_argument("--constrain", type=int, nargs=2, default=None, metavar="X",
                        help="Constrain P-site offset to be between specified distance from " +
                             "start codon. Useful for noisy data. " +
                             "(Reasonable set: 10 15; default: not constrained)")
    parser.add_argument("--aggregate", default=False, action="store_true",
                        help="Estimate P-site from aggregate reads at each position, instead " +
                             "of median normalized read density. Noisier, but helpful for " +
                             "lower-count data or read lengths with few counts. (Default: False)"
                        )
    parser.add_argument("--keep", default=False, action="store_true",
                        help="Save intermediate count files. Useful for additional computations (Default: False)")
    parser.add_argument("--default", type=int, default=13,
                        help="Default 5\' P-site offset for read lengths that are not present or evaluated in the dataset. Unaffected by ``--constrain`` (Default: 13)")

    parser.add_argument("roi_file", type=str,
                        help="ROI file surrounding start codons, from ``metagene generate`` subprogram")

    parser.add_argument("outbase", type=str, help="Basename for output files")

    # set manual options
    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)

    # set defaults: mapping options were disabled on the parser
    # (allow_mapping=False), so they are supplied here
    args.mapping = "fiveprime"
    args.offset = 0
    args.nibble = 0

    # process arguments
    min_len = args.min_length
    max_len = args.max_length
    profiles = max_len + 1 - min_len
    lengths = list(range(min_len, max_len + 1))
    outbase = args.outbase
    title = "Fiveprime read offsets by length" if args.title is None else args.title

    pp.set_style_from_args(args)
    colors = pp.get_colors_from_args(args, profiles)

    printer.write("Opening ROI file %s ..." % args.roi_file)
    with opener(args.roi_file) as roi_fh:
        roi_table = pd.read_table(roi_fh, sep="\t", comment="#", index_col=None, header=0)

    printer.write("Opening count files %s ..." % ",".join(args.count_files))
    ga = ap.get_genome_array_from_args(args, printer=printer)

    # remove default size filters. Iterate over a snapshot of the keys:
    # removing entries while iterating the live key view raises RuntimeError
    # on Python 3 if `remove_filter` deletes from `ga._filters`.
    for f in list(ga._filters.keys()):
        ga.remove_filter(f)

    norm_start, norm_end = _get_norm_region(roi_table, args)

    # count
    count_dict, norm_count_dict, metagene_profile = do_count(roi_table,
                                                             ga,
                                                             norm_start,
                                                             norm_end,
                                                             args.min_counts,
                                                             min_len,
                                                             max_len,
                                                             aggregate=args.aggregate,
                                                             printer=printer)

    # save counts
    profile_fn = "%s_metagene_profiles.txt" % outbase
    with argsopener(profile_fn, args, "w") as metagene_out:
        metagene_profile.to_csv(metagene_out,
                                sep="\t",
                                header=True,
                                index=False,
                                na_rep="nan",
                                columns=["x"] + ["%s-mers" % X for X in lengths])

    if args.keep:
        printer.write("Saving raw and normalized counts ...")
        for k in count_dict:
            count_fn = "%s_%s_rawcounts.txt.gz" % (outbase, k)
            normcount_fn = "%s_%s_normcounts.txt.gz" % (outbase, k)
            mask_fn = "%s_%s_mask.txt.gz" % (outbase, k)
            numpy.savetxt(count_fn, count_dict[k], delimiter="\t")
            numpy.savetxt(normcount_fn, norm_count_dict[k], delimiter="\t")
            numpy.savetxt(mask_fn, norm_count_dict[k].mask, delimiter="\t")

    # plotting & offsets
    printer.write("Plotting and determining offsets ...")
    offset_dict = OrderedDict()

    # Determine scaling factor for plotting metagene profiles
    max_y = numpy.nan
    with warnings.catch_warnings():
        # ignore warnings for slices that contain only NaNs
        warnings.simplefilter("ignore", category=RuntimeWarning)
        for k in lengths:
            max_y = numpy.nanmax([max_y,
                                  numpy.nanmax(metagene_profile["%s-mers" % k].values)])

    # fall back to unit scale when every profile is NaN or zero
    if numpy.isnan(max_y) or max_y == 0:
        max_y = 1.0

    # parse arguments & set styles
    mplrc = matplotlib.rcParams
    plt_incr = 1.2  # vertical spacing between stacked profiles

    # use this figsize if not specified on command line
    figheight = 1.0 + 0.25 * (profiles - 1) + 0.75 * (profiles)
    default_figsize = (7.5, figheight)

    fig = pp.get_figure_from_args(args, figsize=default_figsize)

    ax = plt.gca()
    plt.title(title)
    plt.xlabel("Distance from CDS start, (nt; 5' end mapping)")
    if args.aggregate:
        plt.ylabel("Aggregate read counts (au)")
    else:
        plt.ylabel("Median normalized read density (au)")

    plt.axvline(0.0, color=mplrc["axes.edgecolor"], dashes=[3, 2])

    x = metagene_profile["x"].values
    xmin = x.min()
    xmax = x.max()

    # `mask` is True at positions that may NOT be chosen as the peak
    if args.constrain is not None:
        mask = numpy.tile(True, len(x))

        zp = (x == 0).argmax()  # index of the start codon position
        l, r = args.constrain
        if l == r:
            warnings.warn("Minimum and maximum distance constraints are equal (both '%s'). This is silly." % l, ArgumentWarning)

        mindist = min(l, r)
        maxdist = max(l, r)

        mask[zp - maxdist:zp - mindist + 1] = False
    elif args.require_upstream:
        mask = x >= 0
    else:
        mask = numpy.tile(False, len(x))

    for n, k in enumerate(lengths):
        color = colors[n]
        baseline = plt_incr * n
        y = metagene_profile["%s-mers" % k].values
        ymask = numpy.ma.MaskedArray(y, mask=mask)

        if numpy.isnan(y).all():
            plot_y = numpy.zeros_like(x)
        elif not args.aggregate:
            plot_y = y / max_y
        else:
            plot_y = y.astype(float) / numpy.nanmax(y) * 0.9

        # plot metagene profiles on common scale, offset by baseline from bottom to top
        ax.plot(x, baseline + plot_y, color=color)
        ax.text(xmin, baseline, "%s-mers" % k,
                ha="left",
                va="bottom",
                color=color,
                transform=matplotlib.transforms.offset_copy(ax.transData, fig,
                                                            x=6.0, y=3.0, units="points"))

        ymax = baseline + numpy.nanmax(plot_y)

        # if all valid positions are nan, or if all valid positions are <= 0
        if (~mask).sum() == numpy.isnan(ymask).sum() or numpy.nanmax(ymask) == 0:
            offset = args.default
            usedefault = True
        else:
            offset = -x[numpy.ma.argmax(ymask)]
            usedefault = False

        offset_dict[k] = offset
        if not usedefault:
            # mark the distance between the chosen peak and the start codon
            yadj = ymax - 0.2 * plt_incr

            ax.plot([-offset, 0], [yadj, yadj], color=color, dashes=[3, 2])
            ax.text(-offset / 2.0,
                    yadj,
                    "%s nt" % (offset),
                    color=color,
                    ha="center",
                    va="bottom",
                    transform=matplotlib.transforms.offset_copy(ax.transData, fig,
                                                                x=0.0, y=3.0, units="points")
                    )

    plt.xlim(xmin, xmax)
    plt.ylim(-0.1, plt_incr + baseline)
    ax.yaxis.set_ticks([])

    # save data as p-site offset table; the context manager closes the file
    # even if a write fails
    fn = "%s_p_offsets.txt" % outbase
    with argsopener(fn, args) as fout:
        printer.write("Writing offset table to %s ..." % fn)
        fout.write("length\tp_offset\n")
        for k in offset_dict:
            fout.write("%s\t%s\n" % (k, offset_dict[k]))

        fout.write("default\t%s" % args.default)

    # save plot
    plot_fn = "%s_p_offsets.%s" % (outbase, args.figformat)
    printer.write("Saving plot to %s ..." % plot_fn)
    plt.savefig(plot_fn, dpi=args.dpi, bbox_inches="tight")

    printer.write("Done.")
def main(argv=sys.argv[1:]):
    """Command-line program

    Summarize the parent-child relationships in a GFF3 file: for each
    feature type that is not excluded, report how many features have that
    type and how many of them have a parent of each type, no parent,
    an unknown parent, or multiple parents.

    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :py:func:`main` is called directly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script is
        invoked from the command line
    """
    bp = BaseParser()
    parser = argparse.ArgumentParser(
        description=format_module_docstring(__doc__),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        parents=[bp.get_parser()])
    parser.add_argument("--exclude",
                        nargs="+",
                        default=[],
                        help="Feature types to exclude from consideration")
    parser.add_argument("infile",
                        metavar="infile.gff",
                        type=str,
                        help="Input GFF3 file")
    parser.add_argument("outfile",
                        metavar="outfile.txt",
                        type=str,
                        help="Name of output file")

    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)

    excluded = set(args.exclude)
    from_stdin = args.infile == "-"
    fin = sys.stdin if from_stdin else opener(args.infile)

    feature_counts = Counter()  # feature type -> number of features seen
    features_with_parents = []  # deferred until all feature names are known
    feature_types = {}  # feature type -> Counter keyed by parent type
    name_type = {}  # feature name -> feature type

    printer.write("Opening %s..." % args.infile)
    try:
        reader = GFF3_Reader(fin, return_stopfeatures=False)
        for c, feature in enumerate(reader):
            if c % 10000 == 0:
                printer.write("Processed %s features..." % c)

            ftype = feature.attr["type"]
            fname = feature.get_name()
            if ftype in excluded:
                continue

            if ftype not in feature_types:
                feature_types[ftype] = Counter()

            feature_counts[ftype] += 1
            if fname is not None:
                name_type[fname] = ftype

            if "Parent" in feature.attr:
                features_with_parents.append(feature)
            else:
                feature_types[ftype]["parent unspecified"] += 1
    finally:
        # The file handle was previously never closed; stdin is left open.
        if not from_stdin:
            fin.close()

    printer.write("Sorting parents...")
    for c, feature in enumerate(features_with_parents):
        if c % 10000 == 0:
            printer.write("Processed %s parents..." % c)

        pnames = feature.attr["Parent"]
        ftype = feature.attr["type"]
        if not pnames:
            # covers "" as before, and also an empty collection, which would
            # otherwise raise IndexError at `pnames[0]` below
            feature_types[ftype]["parent unspecified"] += 1
        elif len(pnames) > 1:
            feature_types[ftype]["multiple parents"] += 1
        else:
            ptype = name_type.get(pnames[0], "parent not in database")
            feature_types[ftype][ptype] += 1

    rows = sorted(feature_types.keys())
    cols = rows + [
        "parent unspecified", "parent not in database", "multiple parents"
    ]

    with argsopener(args.outfile, args, "w") as fh:
        printer.write("Writing %s..." % args.outfile)
        header = "#feature_type\tcount\t" + "\t".join(cols) + "\n"
        fh.write(header)
        for r in rows:
            parent_counts = feature_types[r]
            cells = ["%s" % r, "%s" % feature_counts[r]]
            cells += ["%s" % parent_counts.get(i, 0) for i in cols]
            fh.write("%s\n" % "\t".join(cells))

    printer.write("Done.")
def main(argv=sys.argv[1:]):
    """Command-line program

    Count, separately for each read length, how many reads fall in each of
    the three codon positions of coding regions. Writes a table of counts and
    phase fractions by read length, and saves a phasing plot.

    Coding regions come from an ROI file if one is given, otherwise from the
    transcripts of an annotation.

    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :py:func:`main` is called directly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script is
        invoked from the command line
    """
    al = AlignmentParser(
        disabled=["normalize", "big_genome", "spliced_bowtie_files"],
        input_choices=["BAM"])
    an = AnnotationParser()
    pp = PlottingParser()
    bp = BaseParser()

    plotting_parser = pp.get_parser()
    alignment_file_parser = al.get_parser(conflict_handler="resolve")
    annotation_file_parser = an.get_parser(conflict_handler="resolve")
    base_parser = bp.get_parser()

    parser = argparse.ArgumentParser(
        description=format_module_docstring(__doc__),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        conflict_handler="resolve",
        parents=[
            base_parser, annotation_file_parser, alignment_file_parser,
            plotting_parser
        ])

    parser.add_argument("roi_file", type=str, nargs="?", default=None,
                        help="Optional. ROI file of maximal spanning windows surrounding start codons, " +
                             "from ``metagene generate`` subprogram. Using this instead of `--annotation_files` " +
                             "prevents double-counting of codons when multiple transcript isoforms exist " +
                             "for a gene. See the documentation for `metagene` for more info about ROI files." +
                             "If an ROI file is not given, supply an annotation with ``--annotation_files``")
    parser.add_argument("outbase",
                        type=str,
                        help="Required. Basename for output files")
    parser.add_argument(
        "--codon_buffer",
        type=int,
        default=5,
        help="Codons before and after start codon to ignore (Default: 5)")

    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)
    pp.set_style_from_args(args)
    gnd = al.get_genome_array_from_args(args, printer=printer)

    read_lengths = list(range(args.min_length, args.max_length + 1))
    codon_buffer = args.codon_buffer

    dtmp = {
        "read_length": numpy.array(read_lengths),
        "reads_counted": numpy.zeros_like(read_lengths, dtype=int),
    }

    if args.roi_file is not None:
        using_roi = True
        roi_table = read_pl_table(args.roi_file)
        regions = roi_table.iterrows()
        transform_fn = roi_row_to_cds
        back_buffer = -1
        if len(args.annotation_files) > 0:
            warnings.warn(
                "If an ROI file is given, annotation files are ignored. Pulling regions from '%s'. Ignoring '%s'"
                % (args.roi_file, ", ".join(args.annotation_files)),
                ArgumentWarning)
    else:
        using_roi = False
        if len(args.annotation_files) == 0:
            printer.write(
                "Either an ROI file or at least annotation file must be given."
            )
            sys.exit(1)
        else:
            warnings.warn(
                "Using a transcript annotation file instead of an ROI file can lead to double-counting of codons if the annotation contains multiple transcripts per gene.",
                ArgumentWarning)

        regions = an.get_transcripts_from_args(args, printer=printer)
        back_buffer = -codon_buffer
        transform_fn = lambda x: x.get_cds()

    # read length -> summed counts at codon positions 0, 1, 2
    phase_sums = {k: numpy.zeros(3) for k in read_lengths}

    # Kept separately from the loop variable so that the summary message is
    # defined even when `regions` yields nothing.
    n_regions = 0

    for n, roi in enumerate(regions):
        n_regions = n + 1
        if n % 1000 == 1:
            printer.write("Counted %s ROIs ..." % n)

        # transformation needed to extract CDS from transcript or from ROI file window
        cds_part = transform_fn(roi)

        # only calculate for coding genes
        if len(cds_part) == 0:
            continue

        read_dict = {k: [] for k in read_lengths}      # read length -> reads
        count_vectors = {k: [] for k in read_lengths}  # read length -> counts

        # for each seg, fetch reads, sort them, and create individual count vectors
        for seg in cds_part:
            reads = gnd.get_reads(seg)
            for read in reads:
                read_len = len(read.positions)
                if read_len in read_dict:
                    read_dict[read_len].append(read)

            # map and sort by length
            for read_length in read_dict:
                count_vector = list(gnd.map_fn(read_dict[read_length], seg)[1])
                count_vectors[read_length].extend(count_vector)

        # add each count vector for each length to total
        for k, vec in count_vectors.items():
            counts = numpy.array(vec)
            if cds_part.strand == "-":
                # orient counts 5' to 3' relative to the coding region
                counts = counts[::-1]

            if len(counts) % 3 != 0 and not using_roi:
                message = "Length of '%s' coding region (%s nt) is not divisible by 3. Ignoring last partial codon." % (
                    roi.get_name(), len(counts))
                warnings.warn(message, DataWarning)

            # drop any trailing partial codon, then view as (codon, phase)
            newlen = len(counts) // 3
            counts = counts[:3 * newlen].reshape(newlen, 3)

            phase_sums[k] += counts[codon_buffer:back_buffer, :].sum(0)

    printer.write("Counted %s ROIs total." % n_regions)

    # total reads counted for each size
    for k in read_lengths:
        dtmp["reads_counted"][dtmp["read_length"] == k] = phase_sums[k].sum()

    # read length distribution
    dtmp["fraction_reads_counted"] = dtmp["reads_counted"].astype(
        float) / dtmp["reads_counted"].sum()

    # phase vectors
    phase_vectors = {
        K: V.astype(float) / V.astype(float).sum()
        for K, V in phase_sums.items()
    }
    for i in range(3):
        dtmp["phase%s" % i] = numpy.zeros(len(dtmp["read_length"]))

    for k, vec in phase_vectors.items():
        for i in range(3):
            dtmp["phase%s" % i][dtmp["read_length"] == k] = vec[i]

    # phase table
    fn = "%s_phasing.txt" % args.outbase
    printer.write("Saving phasing table to %s ..." % fn)
    dtmp = pd.DataFrame(dtmp)
    with argsopener(fn, args) as fh:
        dtmp.to_csv(fh,
                    columns=[
                        "read_length",
                        "reads_counted",
                        "fraction_reads_counted",
                        "phase0",
                        "phase1",
                        "phase2",
                    ],
                    float_format="%.6f",
                    na_rep="nan",
                    sep="\t",
                    index=False,
                    header=True)

    # keyword arguments for figure creation, passed through to `phase_plot`
    fig_kwargs = {}
    if args.figsize is not None:
        fig_kwargs["figsize"] = tuple(args.figsize)

    colors = pp.get_colors_from_args(args, len(read_lengths))

    fn = "%s_phasing.%s" % (args.outbase, args.figformat)
    printer.write("Plotting to %s ..." % fn)
    plot_counts = numpy.vstack([V for (_, V) in sorted(phase_sums.items())])
    fig, (ax1, _) = phase_plot(plot_counts,
                               labels=read_lengths,
                               lighten_by=0.3,
                               cmap=None,
                               color=colors,
                               fig=fig_kwargs)

    if args.title is not None:
        ax1.set_title(args.title)
    else:
        ax1.set_title("Phasing stats for %s" % args.outbase)

    fig.savefig(fn, dpi=args.dpi, bbox_inches="tight")
def main(argv=sys.argv[1:]):
    """Command-line program

    For every read length between ``args.min_length`` and ``args.max_length``,
    tally how many counts fall at each of the three positions of the codons
    in the coding regions examined. The tallies are written to
    ``<outbase>_phasing.txt`` and plotted to ``<outbase>_phasing.<figformat>``.

    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :py:func:`main` is called directly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script is
        invoked from the command line
    """
    al = AlignmentParser(disabled=["normalize", "big_genome", "spliced_bowtie_files"],
                         input_choices=["BAM"])
    an = AnnotationParser()
    pp = PlottingParser()
    bp = BaseParser()

    plotting_parser = pp.get_parser()
    alignment_file_parser = al.get_parser(conflict_handler="resolve")
    annotation_file_parser = an.get_parser(conflict_handler="resolve")
    base_parser = bp.get_parser()

    parser = argparse.ArgumentParser(description=format_module_docstring(__doc__),
                                     formatter_class=argparse.RawDescriptionHelpFormatter,
                                     conflict_handler="resolve",
                                     parents=[base_parser,
                                              annotation_file_parser,
                                              alignment_file_parser,
                                              plotting_parser])

    parser.add_argument("roi_file", type=str, nargs="?", default=None,
                        help="Optional. ROI file of maximal spanning windows surrounding start codons, "
                             "from ``metagene generate`` subprogram. Using this instead of `--annotation_files` "
                             "prevents double-counting of codons when multiple transcript isoforms exist "
                             "for a gene. See the documentation for `metagene` for more info about ROI files."
                             "If an ROI file is not given, supply an annotation with ``--annotation_files``")
    parser.add_argument("outbase", type=str, help="Required. Basename for output files")
    parser.add_argument("--codon_buffer", type=int, default=5,
                        help="Codons before and after start codon to ignore (Default: 5)")

    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)
    pp.set_style_from_args(args)

    gnd = al.get_genome_array_from_args(args, printer=printer)

    read_lengths = list(range(args.min_length, args.max_length + 1))
    codon_buffer = args.codon_buffer

    dtmp = {
        "read_length": numpy.array(read_lengths),
        "reads_counted": numpy.zeros_like(read_lengths, dtype=int),
    }

    # Choose where regions come from and how a coding region is extracted
    # from each of them.
    if args.roi_file is not None:
        using_roi = True
        roi_table = read_pl_table(args.roi_file)
        regions = roi_table.iterrows()
        transform_fn = roi_row_to_cds
        back_buffer = -1
        if len(args.annotation_files) > 0:
            warnings.warn("If an ROI file is given, annotation files are ignored. Pulling regions from '%s'. Ignoring '%s'" % (args.roi_file,
                                                                                                                                ", ".join(args.annotation_files)),
                          ArgumentWarning)
    else:
        using_roi = False
        if len(args.annotation_files) == 0:
            printer.write("Either an ROI file or at least annotation file must be given.")
            sys.exit(1)
        else:
            warnings.warn("Using a transcript annotation file instead of an ROI file can lead to double-counting of codons if the annotation contains multiple transcripts per gene.",
                          ArgumentWarning)
            regions = an.get_transcripts_from_args(args, printer=printer)
            back_buffer = -codon_buffer
            transform_fn = lambda x: x.get_cds()

    # One 3-vector of per-codon-position totals for each read length
    phase_sums = {}
    for k in read_lengths:
        phase_sums[k] = numpy.zeros(3)

    # Pre-set so the total reported after the loop is 0 (instead of raising
    # NameError) when `regions` yields nothing.
    n = -1
    for n, roi in enumerate(regions):
        if n % 1000 == 1:
            printer.write("Counted %s ROIs ..." % n)

        # transformation needed to extract CDS from transcript or from ROI file window
        cds_part = transform_fn(roi)

        # only calculate for coding genes
        if len(cds_part) > 0:

            read_dict = {}
            count_vectors = {}
            for k in read_lengths:
                read_dict[k] = []
                count_vectors[k] = []

            # for each seg, fetch reads, sort them, and create individual count vectors
            # NOTE(review): read_dict is not reset between segments, so reads
            # fetched for earlier segments are passed to map_fn again together
            # with later segments -- confirm map_fn restricts counts to `seg`.
            for seg in cds_part:
                reads = gnd.get_reads(seg)
                for read in filter(lambda x: len(x.positions) in read_dict, reads):
                    read_dict[len(read.positions)].append(read)

                # map and sort by length
                for read_length in read_dict:
                    count_vector = list(gnd.map_fn(read_dict[read_length], seg)[1])
                    count_vectors[read_length].extend(count_vector)

            # add each count vector for each length to total
            for k, vec in count_vectors.items():
                counts = numpy.array(vec)
                if cds_part.strand == "-":
                    counts = counts[::-1]

                # Integer division: a float dimension is rejected by reshape
                # under Python 3 true division.
                newlen = len(counts) // 3
                if len(counts) % 3 != 0:
                    if not using_roi:
                        message = "Length of '%s' coding region (%s nt) is not divisible by 3. Ignoring last partial codon." % (roi.get_name(), len(counts))
                        warnings.warn(message, DataWarning)
                    counts = counts[:3 * newlen]

                counts = counts.reshape(newlen, 3)
                phase_sums[k] += counts[codon_buffer:back_buffer, :].sum(0)

    printer.write("Counted %s ROIs total." % (n + 1))
    for k in dtmp:
        dtmp[k] = numpy.array(dtmp[k])

    # total reads counted for each size
    for k in read_lengths:
        dtmp["reads_counted"][dtmp["read_length"] == k] = phase_sums[k].sum()

    # read length distribution
    dtmp["fraction_reads_counted"] = dtmp["reads_counted"].astype(float) / dtmp["reads_counted"].sum()

    # phase vectors
    phase_vectors = {K: V.astype(float) / V.astype(float).sum() for K, V in phase_sums.items()}
    for i in range(3):
        dtmp["phase%s" % i] = numpy.zeros(len(dtmp["read_length"]))

    for k, vec in phase_vectors.items():
        for i in range(3):
            dtmp["phase%s" % i][dtmp["read_length"] == k] = vec[i]

    # phase table
    fn = "%s_phasing.txt" % args.outbase
    printer.write("Saving phasing table to %s ..." % fn)
    dtmp = pd.DataFrame(dtmp)
    # the context manager closes the handle; no explicit close() is needed
    with argsopener(fn, args) as fh:
        dtmp.to_csv(fh,
                    columns=["read_length",
                             "reads_counted",
                             "fraction_reads_counted",
                             "phase0",
                             "phase1",
                             "phase2",
                             ],
                    float_format="%.6f",
                    na_rep="nan",
                    sep="\t",
                    index=False,
                    header=True)

    # keyword arguments handed to phase_plot through its `fig` parameter
    fig_kwargs = {}
    if args.figsize is not None:
        fig_kwargs["figsize"] = tuple(args.figsize)

    colors = pp.get_colors_from_args(args, len(read_lengths))

    fn = "%s_phasing.%s" % (args.outbase, args.figformat)
    printer.write("Plotting to %s ..." % fn)
    plot_counts = numpy.vstack([V for (_, V) in sorted(phase_sums.items())])
    fig, (ax1, _) = phase_plot(plot_counts, labels=read_lengths, lighten_by=0.3,
                               cmap=None, color=colors, fig=fig_kwargs)

    if args.title is not None:
        ax1.set_title(args.title)
    else:
        ax1.set_title("Phasing stats for %s" % args.outbase)

    fig.savefig(fn, dpi=args.dpi, bbox_inches="tight")
def main(argv=sys.argv[1:]):
    """Command-line program

    Builds a parser with three subcommands (``generate``, ``count`` and
    ``chart``), parses `argv`, and hands control to :func:`do_generate`,
    :func:`do_count` or :func:`do_chart` according to the subcommand chosen.

    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :func:`main` is called directly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script is
        invoked from the command line
    """
    alignment_helper = AlignmentParser(disabled=["normalize"])
    annotation_helper = AnnotationParser()
    mask_helper = MaskParser()
    plotting_helper = PlottingParser()
    base_helper = BaseParser()

    # parent parsers contributing shared options to the subcommands
    alignment_parent = alignment_helper.get_parser()
    annotation_parent = annotation_helper.get_parser()
    mask_parent = mask_helper.get_parser()
    plotting_parent = plotting_helper.get_parser()
    base_parent = base_helper.get_parser()

    raw_formatter = argparse.RawDescriptionHelpFormatter

    parser = argparse.ArgumentParser(
        description=format_module_docstring(__doc__),
        formatter_class=raw_formatter,
    )
    subparsers = parser.add_subparsers(
        title="subcommands",
        description="choose one of the following",
        dest="program",
    )

    # ``generate`` subcommand
    generate_parser = subparsers.add_parser(
        "generate",
        help="Create unambiguous position file from GFF3 annotation",
        description=format_module_docstring(do_generate.__doc__),
        formatter_class=raw_formatter,
        parents=[base_parent, annotation_parent, mask_parent],
    )
    generate_parser.add_argument(
        "outbase",
        metavar="outbase",
        type=str,
        help="Basename for output files",
    )

    # ``count`` subcommand
    count_parser = subparsers.add_parser(
        "count",
        help="Count reads in unambiguous gene positions",
        description=format_module_docstring(do_count.__doc__),
        parents=[base_parent, alignment_parent],
        formatter_class=raw_formatter,
    )
    count_parser.add_argument(
        "position_file",
        type=str,
        metavar="file.positions",
        help="File assigning positions to genes or transcripts (made using 'generate' subcommand)",
    )
    count_parser.add_argument("outbase", type=str, help="Basename for output files")

    # ``chart`` subcommand
    chart_parser = subparsers.add_parser(
        "chart",
        help="Produce charts comparing reads between samples",
        description=format_module_docstring(do_chart.__doc__),
        parents=[base_parent, plotting_parent],
        formatter_class=raw_formatter,
    )
    chart_parser.add_argument(
        "-i", "--in",
        nargs="+",
        type=str,
        dest="infiles",
        help="input files, made by 'count' subprogram",
    )
    chart_parser.add_argument(
        "--bins",
        nargs="+",
        type=int,
        default=(0, 32, 64, 128, 256, 512, 1024, 2048, 4096),
        help="Bins into which features are partitioned based on counts",
    )
    chart_parser.add_argument(
        "--regions",
        nargs="+",
        type=str,
        default=("exon", "utr5", "cds", "utr3"),
        help="Regions to compare (default: exon, utr5, cds, utr3)",
    )
    chart_parser.add_argument(
        "--metrics",
        nargs="+",
        type=str,
        default=("rpkm", "reads"),
        help="Metrics to compare (default: rpkm, reads)",
    )
    chart_parser.add_argument(
        "list_of_regions",
        type=str,
        metavar='gene_list.txt',
        nargs="?",
        default=None,
        help="Optional. File listing regions (genes or transcripts), one per line, to include in comparisons. If not given, all genes are included.",
    )
    chart_parser.add_argument("outbase", type=str, help="Basename for output files")

    args = parser.parse_args(argv)
    base_helper.get_base_ops_from_args(args)

    # Each subcommand receives the parsed arguments plus the helper objects
    # it needs; an unrecognized or absent subcommand does nothing.
    actions = {
        "generate": lambda: do_generate(args, annotation_helper, mask_helper),
        "count": lambda: do_count(args, alignment_helper),
        "chart": lambda: do_chart(args, plotting_helper),
    }
    action = actions.get(args.program)
    if action is not None:
        action()
def main(argv=sys.argv[1:]):
    """Command-line program

    Runs :func:`chrom_worker` over every sequence (or every previously saved
    k-mer file) in a process pool, then concatenates the per-sequence BED
    files it returns, in sorted filename order, into
    ``<outbase>_<read_length>_<mismatches>_crossmap.bed`` and deletes them.

    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :py:func:`main` is called directly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script is
        invoked from the command line
    """
    sp = SequenceParser()
    bp = BaseParser()
    parser = argparse.ArgumentParser(description=format_module_docstring(__doc__),
                                     formatter_class=argparse.RawDescriptionHelpFormatter,
                                     parents=[bp.get_parser(),sp.get_parser()])
    parser.add_argument("-k",dest="read_length",metavar="READ_LENGTH",
                        type=int,default=29,
                        help="K-mer length to generate from input file. "+
                             "(Default: 29)")
    parser.add_argument("--offset",type=int,default=14,
                        help="Offset from 5' end of plus-strand read at which to attribute score (Default: 14)")
    parser.add_argument("--mismatches",metavar="N",
                        type=int,default=0,
                        help="Number of mismatches tolerated in alignment. "+
                             "(Default: 0)")
    parser.add_argument("--bowtie",dest="bowtie",default="/usr/local/bin/bowtie",
                        type=str,
                        help="Location of bowtie binary (Default: ``/usr/local/bin/bowtie``)")
    parser.add_argument("--have_kmers",default=False,action="store_true",
                        help="If specified, use k-mer files from previous run. "+\
                             " In this case 'sequence_file' should be the value "+\
                             "'outbase' from the k-mer files you want to use.")
    parser.add_argument("--save_kmers",default=False,action="store_true",
                        help="Save k-mer files for reuse in a subsequent run.")
    parser.add_argument("-p","--processes",type=int,default=2,metavar="N",
                        help="Number of processes to use (should be <= number of chromosomes")
    parser.add_argument("ebwt",type=str,
                        help="Bowtie index of genome against which crossmap will be made. In most cases, should be generated from the same sequences that are in `sequence_file`.")
    parser.add_argument("outbase",type=str,
                        help="Basename for output files")

    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)

    #filenames
    base = "%s_%s_%s" % (args.outbase, args.read_length, args.mismatches)
    bed_file = "%s_crossmap.bed" % base

    if args.have_kmers:
        # Reuse k-mer files from an earlier run: `sequence_file` is treated
        # as their common prefix, and the sequence name is taken from the
        # last underscore-delimited field before ``_kmers.fa``.
        import glob
        kmer_files = glob.glob(args.sequence_file+"*kmers.fa")
        seq_pat = re.compile(r".*_([^_]*)_kmers.fa")
        seqs = { seq_pat.search(X).groups()[0] : X for X in kmer_files }
    else:
        seqs = sp.get_seqdict_from_args(args,index=True)

    worker = functools.partial(chrom_worker,args=args)
    chroms = seqs.items()

    # chunksize of 1: each worker task handles a single sequence
    pool = multiprocessing.Pool(processes=args.processes)
    bed_filenames = pool.map(worker,chroms,1)
    pool.close()
    pool.join()

    with open(bed_file,"w") as fout:
        for f in sorted(bed_filenames):
            # close each intermediate file before removing it, rather than
            # leaving the handle for the garbage collector
            with open(f,"r") as fin:
                shutil.copyfileobj(fin,fout)
            os.remove(f)

    printer.write("Done.")
    printer.write(BigBedMessage.replace("OUTFILE",bed_file.replace(".bed","")).replace("BOWTIE_INDEX",args.ebwt))
def main(argv=sys.argv[1:]):
    """Command-line program

    Runs :func:`chrom_worker` over every sequence (or every previously saved
    k-mer file) in a process pool, then concatenates the per-sequence BED
    files it returns, in sorted filename order, into
    ``<outbase>_<read_length>_<mismatches>_crossmap.bed`` and deletes them.

    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :py:func:`main` is called directly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script is
        invoked from the command line
    """
    sp = SequenceParser()
    bp = BaseParser()
    parser = argparse.ArgumentParser(
        description=format_module_docstring(__doc__),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        parents=[bp.get_parser(), sp.get_parser()])
    parser.add_argument("-k",
                        dest="read_length",
                        metavar="READ_LENGTH",
                        type=int,
                        default=29,
                        help="K-mer length to generate from input file. " +
                        "(Default: 29)")
    parser.add_argument(
        "--offset",
        type=int,
        default=14,
        help=
        "Offset from 5' end of plus-strand read at which to attribute score (Default: 14)"
    )
    parser.add_argument("--mismatches",
                        metavar="N",
                        type=int,
                        default=0,
                        help="Number of mismatches tolerated in alignment. " +
                        "(Default: 0)")
    parser.add_argument(
        "--bowtie",
        dest="bowtie",
        default="/usr/local/bin/bowtie",
        type=str,
        help="Location of bowtie binary (Default: ``/usr/local/bin/bowtie``)")
    parser.add_argument("--have_kmers",default=False,action="store_true",
                        help="If specified, use k-mer files from previous run. "+\
                             " In this case 'sequence_file' should be the value "+\
                             "'outbase' from the k-mer files you want to use.")
    parser.add_argument("--save_kmers",
                        default=False,
                        action="store_true",
                        help="Save k-mer files for reuse in a subsequent run.")
    parser.add_argument(
        "-p",
        "--processes",
        type=int,
        default=2,
        metavar="N",
        help="Number of processes to use (should be <= number of chromosomes")
    parser.add_argument(
        "ebwt",
        type=str,
        help=
        "Bowtie index of genome against which crossmap will be made. In most cases, should be generated from the same sequences that are in `sequence_file`."
    )
    parser.add_argument("outbase", type=str, help="Basename for output files")

    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)

    #filenames
    base = "%s_%s_%s" % (args.outbase, args.read_length, args.mismatches)
    bed_file = "%s_crossmap.bed" % base

    if args.have_kmers:
        # Reuse k-mer files from an earlier run: `sequence_file` is treated
        # as their common prefix, and the sequence name is taken from the
        # last underscore-delimited field before ``_kmers.fa``.
        import glob
        kmer_files = glob.glob(args.sequence_file + "*kmers.fa")
        seq_pat = re.compile(r".*_([^_]*)_kmers.fa")
        seqs = {seq_pat.search(X).groups()[0]: X for X in kmer_files}
    else:
        seqs = sp.get_seqdict_from_args(args, index=True)

    worker = functools.partial(chrom_worker, args=args)
    chroms = seqs.items()

    # chunksize of 1: each worker task handles a single sequence
    pool = multiprocessing.Pool(processes=args.processes)
    bed_filenames = pool.map(worker, chroms, 1)
    pool.close()
    pool.join()

    with open(bed_file, "w") as fout:
        for f in sorted(bed_filenames):
            # close each intermediate file before removing it, rather than
            # leaving the handle for the garbage collector
            with open(f, "r") as fin:
                shutil.copyfileobj(fin, fout)
            os.remove(f)

    printer.write("Done.")
    printer.write(
        BigBedMessage.replace("OUTFILE", bed_file.replace(".bed", "")).replace(
            "BOWTIE_INDEX", args.ebwt))
def main(argv=sys.argv[1:]):
    """Command-line program

    Reads splice junctions from a BED file and sorts each into one of four
    output BED files (``<outbase>_repetitive.bed``,
    ``<outbase>_shifted_known.bed``, ``<outbase>_shifted_canonical.bed``,
    ``<outbase>_untouched.bed``). The tests are applied in that order and the
    first one that matches decides the destination.

    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :py:func:`main` is called directly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script is
        invoked from the command line
    """
    sp = SequenceParser()
    mp = MaskParser()
    bp = BaseParser()

    parser = argparse.ArgumentParser(description=format_module_docstring(__doc__),
                                     formatter_class=argparse.RawDescriptionHelpFormatter,
                                     parents=[bp.get_parser(),sp.get_parser(),mp.get_parser()],
                                     )
    parser.add_argument("--maxslide",type=int,default=10,
                        help="Maximum number of nt to search 5\' and 3\' of intron"+
                             " boundaries (Default: 10)")
    parser.add_argument("--ref",type=str,metavar="ref.bed",default=None,
                        help="Reference file describing known splice junctions")
    parser.add_argument("--slide_canonical",action="store_true",default=False,
                        help="Slide junctions to canonical junctions if present within equal support region")
    parser.add_argument("infile",type=str,metavar="input.bed",
                        help="BED file describing discovered junctions")
    parser.add_argument("outbase",type=str,
                        help="Basename for output files")
    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)

    printer.write("Opening genome from %s..." % args.sequence_file)
    genome = sp.get_seqdict_from_args(args)

    # load crossmap
    cross_hash = mp.get_genome_hash_from_args(args)

    # load ref junctions
    if args.ref is not None:
        printer.write("Loading reference junctions from %s" % args.ref)
        # the reader is fully consumed by list() before the file is closed
        with open(args.ref) as ref_fh:
            known_hash = GenomeHash(list(BED_Reader(ref_fh)),do_copy=False)
    else:
        known_hash = GenomeHash()

    # set up variables
    canonicals_plus = [("GT","AG"),
                       ("GC","AG")
                       ]

    canonicals_minus = [("CT","AC"),
                        ("CT","GC")
                        ]

    known_in_range = 0
    canonical_in_range = 0
    repetitive = 0
    untouched = 0
    c = 0

    # junction tuples already written, so a known/canonical junction that
    # several input junctions slide to is only emitted once
    seen_already = []

    outfiles = {
        "repetitive" : "%s_repetitive.bed" % args.outbase,
        "known"      : "%s_shifted_known.bed" % args.outbase,
        "canonical"  : "%s_shifted_canonical.bed" % args.outbase,
        "untouched"  : "%s_untouched.bed" % args.outbase,
        }
    outfiles = { K : argsopener(V,args,"w") for K,V in outfiles.items() }

    # try/finally so the four output handles are closed even when
    # processing fails partway through
    try:
        # process data
        printer.write("Opening junctions from %s..." % args.infile)
        for ivc in BED_Reader(CommentReader(opener(args.infile))):
            processed = False
            tup = None

            if c % 1000 == 0 and c > 0:
                printer.write("Processed: %s\tknown: %s\tshifted to canonical: %s\trepetitive: %s\tuntouched: %s" % \
                        (c, known_in_range, canonical_in_range, repetitive, untouched))

            assert len(ivc) == 2
            strand = ivc.strand

            minus_range, plus_range = find_match_range(ivc,genome,args.maxslide)

            # see if either end of splice junction +- match_range lands in repetitive areas of genome
            if covered_by_repetitive(ivc,minus_range,plus_range,cross_hash):
                repetitive += 1
                outfiles["repetitive"].write(ivc.as_bed())
                processed = True

            # see if one or more known junctions in range
            if not processed and args.ref is not None:
                known_juncs = find_known_in_range(ivc,minus_range,plus_range,known_hash.get_nearby_features(ivc))
                if len(known_juncs) > 0:
                    known_in_range += 1
                    for my_known in known_juncs:
                        tup = get_junction_tuple(my_known)
                        if tup not in seen_already:
                            outfiles["known"].write(my_known.as_bed())
                            seen_already.append(tup)

                    processed = True

            # see if one or more canonical junctions in range
            if not processed and args.slide_canonical:
                canonicals = canonicals_plus if strand == "+" else canonicals_minus
                canonical_juncs = find_canonicals_in_range(ivc,minus_range,plus_range,genome,canonicals)
                if len(canonical_juncs) > 0:
                    canonical_in_range += 1
                    for can in canonical_juncs:
                        tup = get_junction_tuple(can)
                        if tup not in seen_already:
                            outfiles["canonical"].write(can.as_bed())
                            seen_already.append(tup)

                    processed = True

            # nothing matched: pass the junction through unchanged
            if not processed:
                outfiles["untouched"].write(ivc.as_bed())
                untouched += 1

            c += 1

        # save output
        printer.write("Totals: %s\tknown: %s\tshifted to canonical: %s\trepetitive: %s\tuntouched: %s" % \
                (c, known_in_range, canonical_in_range, repetitive, untouched))
    finally:
        for v in outfiles.values():
            v.close()

    printer.write("Done.")
def main(argv=sys.argv[1:]):
    """Command-line program

    For each region in the annotation, apply any overlapping masks, sum the
    masked counts from the alignment data, and write one tab-delimited row
    (name, region, counts, counts per nucleotide, RPKM, masked length) to
    ``args.outfile``.

    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :func:`main` is called directly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script is
        invoked from the command line
    """
    ap = AnnotationParser()
    annotation_file_parser = ap.get_parser(conflict_handler="resolve")

    al = AlignmentParser(disabled=_DISABLED)
    alignment_file_parser = al.get_parser(conflict_handler="resolve")

    mp = MaskParser()
    mask_file_parser = mp.get_parser()

    bp = BaseParser()
    base_parser = bp.get_parser()

    parser = argparse.ArgumentParser(
        description=format_module_docstring(__doc__),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        parents=[
            base_parser, alignment_file_parser, annotation_file_parser,
            mask_file_parser
        ],
    )
    parser.add_argument("outfile", type=str, help="Output filename")

    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)

    ga = al.get_genome_array_from_args(args, printer=printer)
    transcripts = ap.get_transcripts_from_args(args,
                                               printer=printer,
                                               return_type=SegmentChain)
    crossmap = mp.get_genome_hash_from_args(args, printer=printer)

    ga_sum = ga.sum()
    # scales counts-per-nucleotide to reads per kilobase per million counts
    normconst = 1000.0 * 1e6 / ga_sum

    # Running total of regions written. Kept separately from the loop index
    # so the final message is correct (not off by one) and defined even
    # when there are no regions.
    total = 0
    with argsopener(args.outfile, args, "w") as fout:
        fout.write("## total_dataset_counts: %s\n" % ga_sum)
        fout.write(
            "region_name\tregion\tcounts\tcounts_per_nucleotide\trpkm\tlength\n"
        )
        for n, ivc in enumerate(transcripts):
            name = ivc.get_name()
            masks = crossmap.get_overlapping_features(ivc)
            # flatten the overlapping features into their component pieces
            ivc.add_masks(*itertools.chain.from_iterable(masks))
            if n % 1000 == 0:
                printer.write("Processed %s regions..." % n)

            counts = numpy.nansum(ivc.get_masked_counts(ga))
            length = ivc.masked_length
            # a fully masked region has no usable positions: report NaN
            # rather than dividing by zero
            rpnt = numpy.nan if length == 0 else float(counts) / length
            rpkm = numpy.nan if length == 0 else rpnt * normconst
            ltmp = [
                name,
                str(ivc),
                "%.8e" % counts,
                "%.8e" % rpnt,
                "%.8e" % rpkm,
                "%d" % length
            ]
            fout.write("%s\n" % "\t".join(ltmp))
            total = n + 1

    printer.write("Processed %s regions total." % total)

    printer.write("Done.")