예제 #1
0
def main(argv=sys.argv[1:]):
	"""Command-line program
	
	Parameters
	----------
	argv : list, optional
		A list of command-line arguments, which will be processed
		as if the script were called from the command line if
		:py:func:`main` is called directly.

		Default: `sys.argv[1:]`. The command-line arguments, if the script is
		invoked from the command line
	"""
	bp = BaseParser()
	parser = argparse.ArgumentParser(description=format_module_docstring(__doc__),
									 formatter_class=argparse.RawDescriptionHelpFormatter,
									 parents=[bp.get_parser()])
	parser.add_argument("--exclude",nargs="+",default=[],
					    help="Feature types to exclude from consideration")
	parser.add_argument("infile",metavar="infile.gff",type=str,
	                   help="Input GFF3 file")
	parser.add_argument("outfile",metavar="outfile.txt",type=str,
					   help="Name of output file")
	
	args = parser.parse_args(argv)
	bp.get_base_ops_from_args(args)
	excluded = set(args.exclude)

	fin = sys.stdin if args.infile == "-" else opener(args.infile)
	
	feature_counts        = Counter()
	features_with_parents = []
	feature_types         = {}
	name_type             = {}

	
	printer.write("Opening %s..." % args.infile)
	c = 0
	for feature in GFF3_Reader(fin,return_stopfeatures=False):
		if c % 10000 == 0:
			printer.write("Processed %s features..." % c)
		c += 1
		ftype = feature.attr["type"]
		fname = feature.get_name()
		if ftype not in excluded:
			if ftype not in feature_types:
				feature_types[ftype] = Counter()
			feature_counts[ftype] += 1
			if fname is not None:
				name_type[fname] = ftype
			if "Parent" in feature.attr:
				features_with_parents.append(feature)
			else:
				feature_types[ftype]["parent unspecified"] += 1
	
	printer.write("Sorting parents...")
	c = 0
	for feature in features_with_parents:
		if c % 10000 == 0:
			printer.write("Processed %s parents..." % c)
		c += 1
		pnames = feature.attr["Parent"]
		ftype = feature.attr["type"]
		if pnames == "":
			feature_types[ftype]["parent unspecified"] += 1
		else:
			if len(pnames) > 1:
				feature_types[ftype]["multiple parents"] += 1
			else:
				ptype = name_type.get(pnames[0],"parent not in database")
				feature_types[ftype][ptype] += 1

	rows = sorted(feature_types.keys())
	cols = rows + ["parent unspecified","parent not in database","multiple parents"]

	with argsopener(args.outfile,args,"w") as fh:
		printer.write("Writing %s..." % args.outfile)
		header = "#feature_type\tcount\t" + "\t".join(cols) + "\n"
		fh.write(header)
		for r in rows:
			sout = "%s\t%s" % (r, feature_counts[r])
			for i in cols:
				sout += "\t%s" % feature_types[r].get(i,0)
			fh.write("%s\n" % sout)

	printer.write("Done.")
예제 #2
0
def main(argv=sys.argv[1:]):
    """Command-line program
    
    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :func:`main` is called directly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script is
        invoked from the command line
    """
    ap = AnnotationParser()
    annotation_file_parser = ap.get_parser(conflict_handler="resolve")
    
    al = AlignmentParser(disabled=_DISABLED)
    alignment_file_parser  = al.get_parser(conflict_handler="resolve")
    
    mp = MaskParser()
    mask_file_parser = mp.get_parser()
    
    bp = BaseParser()
    base_parser = bp.get_parser()
    
    parser = argparse.ArgumentParser(description=format_module_docstring(__doc__),
                                     formatter_class=argparse.RawDescriptionHelpFormatter,
                                     parents=[base_parser,
                                              alignment_file_parser,
                                              annotation_file_parser,
                                              mask_file_parser],
                                     )
    parser.add_argument("outfile",type=str,help="Output filename")
    
    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)

    ga          = al.get_genome_array_from_args(args,printer=printer)
    transcripts = ap.get_transcripts_from_args(args,printer=printer,return_type=SegmentChain)
    crossmap    = mp.get_genome_hash_from_args(args,printer=printer)
    
    ga_sum = ga.sum()
    normconst = 1000.0*1e6 / ga_sum
    
    with argsopener(args.outfile,args,"w") as fout:
        fout.write("## total_dataset_counts: %s\n" % ga_sum)
        fout.write("region_name\tregion\tcounts\tcounts_per_nucleotide\trpkm\tlength\n")
        for n,ivc in enumerate(transcripts):
            name = ivc.get_name()
            masks = crossmap.get_overlapping_features(ivc)
            ivc.add_masks(*itertools.chain.from_iterable((X for X in masks)))
            if n % 1000 == 0:
                printer.write("Processed %s regions..." % n)
                
            counts = numpy.nansum(ivc.get_masked_counts(ga))
            length = ivc.masked_length
            rpnt = numpy.nan if length == 0 else float(counts)/length
            rpkm = numpy.nan if length == 0 else rpnt * normconst 
            ltmp = [name,
                    str(ivc),
                    "%.8e" % counts,
                    "%.8e" % rpnt,
                    "%.8e" % rpkm,
                    "%d" % length]
            fout.write("%s\n" % "\t".join(ltmp))
    
        fout.close()
        
    printer.write("Processed %s regions total." % n)

    printer.write("Done.")
예제 #3
0
def main(argv=sys.argv[1:]):
    """Command-line program
    
    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :py:func:`main` is called directly.

        Default: sys.argv[1:] (actually command-line arguments)
    """
    ap = AlignmentParser()
    bp = BaseParser()
    parser = argparse.ArgumentParser(description=format_module_docstring(__doc__),
                                     formatter_class=argparse.RawDescriptionHelpFormatter,
                                     parents=[bp.get_parser(),ap.get_parser()])
    parser.add_argument("-o","--out",dest="outbase",type=str,required=True,
                        metavar="FILENAME",
                        help="Base name for output files")
    parser.add_argument("--window_size",default=100000,metavar="N",type=int,
                        help="Size of nucleotides to fetch at once for export. "+\
                             "Large values are faster but require more memory "+\
                             "(Default: 100000)")

    track_opts = parser.add_argument_group(title="Browser track options")
    track_opts.add_argument("--color",type=str,default=None,
                        help="An RGB hex string (`'#NNNNNN'`, `N` in `[0-9,A-F]`) specifying \
                              the track color.")
    track_opts.add_argument("-t","--track_name",dest="track_name",type=str,
                        help="Name to give browser track",
                        default=None)
    track_opts.add_argument("--output_format",choices=("bedgraph","variable_step"),
                        default="bedgraph",
                        help="Format of output file (Default: bedgraph)")

    args = parser.parse_args(argv)
    gnd  = ap.get_genome_array_from_args(args,printer=printer)
    bp.get_base_ops_from_args(args)
    
    if args.track_name is None:
        name = args.outbase
    else:
        name = args.track_name
    
    if args.color is not None:
        fw_color = rc_color = "%s,%s,%s" % tuple(get_rgb255(args.color))
    else:
        fw_color = rc_color = "0,0,0"
    
    if args.output_format == "bedgraph":
        outfn = gnd.to_bedgraph
    elif args.output_format == "variable_step":
        outfn = gnd.to_variable_step

    track_fw = "%s_fw.wig" % args.outbase
    track_rc = "%s_rc.wig" % args.outbase

    with argsopener(track_fw,args,"w") as fw_out:
        printer.write("Writing forward strand track to %s ..." % track_fw)
        outfn(fw_out,"%s_fw" % name,"+",window_size=args.window_size,color=fw_color,
                printer=printer)
        fw_out.close()

    with argsopener(track_rc,args,"w") as rc_out:
        printer.write("Writing reverse strand track to %s ..." % track_rc)
        outfn(rc_out,"%s_rc" % name,"-",window_size=args.window_size,color=rc_color,
                printer=printer)
        rc_out.close()
    
    printer.write("Done!")
예제 #4
0
def main(args=sys.argv[1:]):
    """Command-line program
    
    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :func:`main` is called directly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script is
        invoked from the command line
    """
    al = AlignmentParser()
    an = AnnotationParser()
    mp = MaskParser()
    bp = BaseParser()

    alignment_file_parser = al.get_parser(conflict_handler="resolve")
    annotation_file_parser = an.get_parser(conflict_handler="resolve")
    mask_file_parser = mp.get_parser()
    base_parser = bp.get_parser()

    parser = argparse.ArgumentParser(
        description=format_module_docstring(__doc__),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        conflict_handler="resolve",
        parents=[base_parser, alignment_file_parser, annotation_file_parser, mask_file_parser],
    )

    parser.add_argument("out_folder", type=str, help="Folder in which to save output vectors")
    parser.add_argument(
        "--out_prefix", default="", type=str, help="Prefix to prepend to output files (default: no prefix)"
    )
    parser.add_argument(
        "--format", default="%.8f", type=str, help=r"printf-style format string for output (default: '%%.8f')"
    )
    args = parser.parse_args(args)
    bp.get_base_ops_from_args(args)

    # if output folder doesn't exist, create it
    if not os.path.isdir(args.out_folder):
        os.mkdir(args.out_folder)

    # parse args
    ga = al.get_genome_array_from_args(args, printer=printer)
    transcripts = an.get_segmentchains_from_args(args, printer=printer)
    mask_hash = mp.get_genome_hash_from_args(args, printer=printer)

    # evaluate
    for n, tx in enumerate(transcripts):
        if n % 1000 == 0:
            printer.write("Processed %s regions of interest" % n)
        filename = "%s%s.txt" % (args.out_prefix, tx.get_name())
        full_filename = os.path.join(args.out_folder, filename)

        # mask out overlapping masked regions
        overlapping = mask_hash.get_overlapping_features(tx)
        for feature in overlapping:
            tx.add_masks(*feature.segments)

        count_vec = tx.get_masked_counts(ga)
        numpy.savetxt(full_filename, count_vec, fmt=args.format)
예제 #5
0
def main(argv=sys.argv[1:]): 
    """Command-line program
    
    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :py:func:`main` is called directly.

        Default: sys.argv[1:] (actually command-line arguments)
    """
    ap = AnnotationParser()
    bp = BaseParser()
    annotation_parser = ap.get_parser()
    base_parser = bp.get_parser()

    parser = argparse.ArgumentParser(description=format_module_docstring(__doc__),
                                     parents=[base_parser,annotation_parser],
                                     formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument("--no_escape",default=True,action="store_false",
                        help="If specified and output format is GTF2, special characters in column 9 will be escaped (default: True)")
    parser.add_argument("--output_format",choices=["BED","GTF2"],default="GTF2",
                        help="Format of output file. (default: GTF2)")
    parser.add_argument("--extra_columns",nargs="+",default=[],type=str,
                        help="Attributes (e.g. 'gene_id' to output as extra columns in extended BED format (BED output only).")
    parser.add_argument("--empty_value",default="na",type=str,
                        help="Value to use of an attribute in `extra_columns` is not defined for a particular record (Default: 'na'")
    parser.add_argument("outfile",metavar="outfile.[ bed | gtf ]",type=str,
                        help="Output file")
    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)

    end_message = ""    
    extra_cols = args.extra_columns
    if extra_cols is not None:
        if args.output_format == "BED":
            
            # avoid name clashes
            names_used = copy.copy(BED12_RESERVED_NAMES)
            asql_names = [fix_name(X,names_used) for X in extra_cols]
            autosql_str = "\n".join(AUTOSQL_ROW_FMT_STR % (X," "*max(15-len(X),2)) for X in asql_names)
            
            file_info = {
                "outbase" : args.outfile.replace(".bed","").replace(".gtf",""),
                "numcols" : len(extra_cols),
                "autosql" : DEFAULT_AUTOSQL_STR % (os.path.basename(args.outfile[:-4]),autosql_str),
                         
            }
            end_message = MAKE_BIGBED_MESSAGE % file_info
        else:
            warn("`--extra_columns` is ignored for %s-formatted output." % (args.output_format),ArgumentWarning)
            
            
    with argsopener(args.outfile,args,"w") as fout:
        c = 0
        transcripts = ap.get_transcripts_from_args(args,printer=printer)
        
        for transcript in transcripts:
            if args.output_format == "GTF2":
                fout.write(transcript.as_gtf(escape=args.no_escape))
            elif args.output_format == "BED":
                fout.write(transcript.as_bed(extra_columns=extra_cols,empty_value=args.empty_value))
            if c % 1000 == 1:
                printer.write("Processed %s transcripts ..." % c)
            c += 1
    
    printer.write("Processed %s transcripts total." % c)
    printer.write("Done.")
    print(end_message)
예제 #6
0
def main(argv=sys.argv[1:]):
    """Command-line program
    
    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :py:func:`main` is called directly.

        Default: sys.argv[1:] (actually command-line arguments)
    """
    ap = AnnotationParser(input_choices=_ANNOTATION_INPUT_CHOICES)
    annotation_file_parser = ap.get_parser()
    
    bp = BaseParser()
    base_parser = bp.get_parser()
    
    parser = argparse.ArgumentParser(description=format_module_docstring(__doc__),
                                     formatter_class=argparse.RawDescriptionHelpFormatter,
                                     parents=[base_parser,annotation_file_parser])
    parser.add_argument("--export_tophat",default=False,action="store_true",
                         help="Export tophat `.juncs` file in addition to BED output")
    parser.add_argument("outbase",type=str,help="Basename for output files")

    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)
    
    transcripts = ap.get_transcripts_from_args(args,printer=printer,return_type=SegmentChain)
    
    with argsopener("%s.bed" % args.outbase,args,"w") as bed_out:
        if args.export_tophat == True:
            tophat_out = open("%s.juncs" % args.outbase,"w")
    
        printer.write("params: " +" ".join(argv))
        printer.write("Detecting & comparing junctions...")
        ex_pairs = {}
        
        c = 0
        u = 0
        for chain in transcripts:
            if len(chain) > 1: # if multi-exon
                chrom = chain.chrom
                strand = chain.strand
                try:
                    ep = ex_pairs[(chrom,strand)]
                except KeyError:
                    ex_pairs[(chrom,strand)] = []
                    ep = ex_pairs[(chrom,strand)]

                for i in range(0,len(chain)-1):
                    
                    seg1 = chain[i]
                    seg2 = chain[i+1]
                    if c % 1000 == 0 and c > 0:
                        printer.write("Processed %s junctions. Found %s unique..." % (c,u) )
                    c+=1
                    key = (seg1.end,seg2.start)
                        
                    if key not in ep:
                        ep.append(key)
                        u += 1
                        new_chain = SegmentChain(seg1,seg2)
                        bed_out.write(new_chain.as_bed())
                        if args.export_tophat == True:
                            my_junc = (chrom,seg1.end-1,seg2.start,strand)
                            tophat_out.write("%s\t%s\t%s\t%s\n" % my_junc)
                        
                        del new_chain
                    
                    del seg1
                    del seg2
                    
            del chain
    
        printer.write("Processed %s total junctions. Found %s unique." % (c,u) )
    
        bed_out.close()
        if args.export_tophat == True:
            tophat_out.close()

    printer.write("Done.")
예제 #7
0
def main(args=sys.argv[1:]):
    """Command-line program
    
    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :func:`main` is called directly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script is
        invoked from the command line
    """
    al = AlignmentParser()
    an = AnnotationParser()
    mp = MaskParser()
    bp = BaseParser()

    alignment_file_parser = al.get_parser(conflict_handler="resolve")
    annotation_file_parser = an.get_parser(conflict_handler="resolve")
    mask_file_parser = mp.get_parser()
    base_parser = bp.get_parser()

    parser = argparse.ArgumentParser(
        description=format_module_docstring(__doc__),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        conflict_handler="resolve",
        parents=[
            base_parser, alignment_file_parser, annotation_file_parser,
            mask_file_parser
        ])

    parser.add_argument("out_folder",
                        type=str,
                        help="Folder in which to save output vectors")
    parser.add_argument(
        "--out_prefix",
        default="",
        type=str,
        help="Prefix to prepend to output files (default: no prefix)")
    parser.add_argument(
        "--format",
        default="%.8f",
        type=str,
        help=r"printf-style format string for output (default: '%%.8f')")
    args = parser.parse_args(args)
    bp.get_base_ops_from_args(args)

    # if output folder doesn't exist, create it
    if not os.path.isdir(args.out_folder):
        os.mkdir(args.out_folder)

    # parse args
    ga = al.get_genome_array_from_args(args, printer=printer)
    transcripts = an.get_segmentchains_from_args(args, printer=printer)
    mask_hash = mp.get_genome_hash_from_args(args, printer=printer)

    # evaluate
    for n, tx in enumerate(transcripts):
        if n % 1000 == 0:
            printer.write("Processed %s regions of interest" % n)
        filename = "%s%s.txt" % (args.out_prefix, tx.get_name())
        full_filename = os.path.join(args.out_folder, filename)

        # mask out overlapping masked regions
        overlapping = mask_hash.get_overlapping_features(tx)
        for feature in overlapping:
            tx.add_masks(*feature.segments)

        count_vec = tx.get_masked_counts(ga)
        numpy.savetxt(full_filename, count_vec, fmt=args.format)
예제 #8
0
파일: psite.py 프로젝트: zzygyx9119/plastid
def main(argv=sys.argv[1:]):
    """Command-line program
    
    Parameters
    ----------
	argv : list, optional
		A list of command-line arguments, which will be processed
		as if the script were called from the command line if
		:py:func:`main` is called directrly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script is
        invoked from the command line
    """
    ap = AlignmentParser(allow_mapping=False,input_choices=["BAM"],
                         disabled=["normalize","big_genome",])
    bp = BaseParser()
    alignment_file_parser = ap.get_parser()
    base_parser = bp.get_parser()
    
    pp = PlottingParser()
    plotting_parser = pp.get_parser()

    parser = argparse.ArgumentParser(description=format_module_docstring(__doc__),
                                     formatter_class=argparse.RawDescriptionHelpFormatter,
                                     parents=[base_parser,
                                              alignment_file_parser,
                                              plotting_parser])
    
    parser.add_argument("--min_counts",type=int,default=10,metavar="N",
                         help="Minimum counts required in normalization region "+
                              "to be included in metagene average (Default: 10)")
    parser.add_argument("--normalize_over",type=int,nargs=2,metavar="N",
                         default=None,
                         #default=(20,50),
                         help="Portion of each window against which its individual raw count profile"+
                              " will be normalized. Specify two integers, in nucleotide"+
                              " distance from landmark (negative for upstream, positive for downstream. Surround negative numbers with quotes.). (Default: 20 50)")
    parser.add_argument("--norm_region",type=int,nargs=2,metavar="N",
                         default=None,
                         help="Deprecated. Use ``--normalize_over`` instead. "+
                              "Formerly, Portion of each window against which its individual raw count profile"+
                              " will be normalized. Specify two integers, in nucleotide"+
                              " distance, from 5\' end of window. (Default: 70 100)")
    parser.add_argument("--require_upstream",default=False,action="store_true",
                        help="If supplied, the P-site offset is taken to be the distance "+
                             "between the largest peak upstream of the start codon and "+
                             "the start codon itself. Otherwise, the P-site offset is taken "+
                             "to be the distance between the largest peak in the entire ROI "+
                             "and the start codon. Ignored if ``--constrain`` is used."
                        )
    parser.add_argument("--constrain",type=int,nargs=2,default=None,metavar="X",
                        help="Constrain P-site offset to be between specified distance from "+
                             "start codon. Useful for noisy data. "+
                             "(Reasonable set: 10 15; default: not constrained)")
    parser.add_argument("--aggregate",default=False,action="store_true",
                        help="Estimate P-site from aggregate reads at each position, instead "+
                             "of median normalized read density. Noisier, but helpful for "+
                             "lower-count data or read lengths with few counts. (Default: False)"
                        ),
    parser.add_argument("--keep",default=False,action="store_true",
                        help="Save intermediate count files. Useful for additional computations (Default: False)")
    parser.add_argument("--default",type=int,default=13,
                        help="Default 5\' P-site offset for read lengths that are not present or evaluated in the dataset. Unaffected by ``--constrain`` (Default: 13)")

    parser.add_argument("roi_file",type=str,
                        help="ROI file surrounding start codons, from ``metagene generate`` subprogram")
    
    parser.add_argument("outbase",type=str,help="Basename for output files")
    
    # set manual options
    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)

    # set defaults
    args.mapping = "fiveprime"
    args.offset  = 0
    args.nibble  = 0

    
    # process arguments
    min_len = args.min_length
    max_len = args.max_length
    profiles = max_len + 1 - min_len
    lengths = list(range(min_len,max_len+1))
    outbase = args.outbase
    title  = "Fiveprime read offsets by length" if args.title is None else args.title
    
    pp.set_style_from_args(args)
    colors = pp.get_colors_from_args(args,profiles)
 
    printer.write("Opening ROI file %s ..." % args.roi_file)
    with opener(args.roi_file) as roi_fh:
        roi_table = pd.read_table(roi_fh,sep="\t",comment="#",index_col=None,header=0)
        roi_fh.close()
        
    printer.write("Opening count files %s ..." % ",".join(args.count_files))
    ga = ap.get_genome_array_from_args(args,printer=printer)

    
    # remove default size filters
    my_filters = ga._filters.keys()
    for f in my_filters:
        ga.remove_filter(f)

    norm_start, norm_end = _get_norm_region(roi_table,args)
    
    # count
    count_dict, norm_count_dict, metagene_profile = do_count(roi_table,
                                                             ga,
                                                             norm_start,
                                                             norm_end,
                                                             args.min_counts,
                                                             min_len,
                                                             max_len,
                                                             aggregate=args.aggregate,
                                                             printer=printer)
    
    # save counts
    profile_fn = "%s_metagene_profiles.txt" % outbase
    with argsopener(profile_fn,args,"w") as metagene_out:
        metagene_profile.to_csv(metagene_out,
                                sep="\t",
                                header=True,
                                index=False,
                                na_rep="nan",
                                columns=["x"]+["%s-mers" % X for X in lengths])
        metagene_out.close()

    if args.keep == True:
        printer.write("Saving raw and normalized counts ...")
        for k in count_dict:
            count_fn     = "%s_%s_rawcounts.txt.gz"  % (outbase,k)
            normcount_fn = "%s_%s_normcounts.txt.gz" % (outbase,k)
            mask_fn      = "%s_%s_mask.txt.gz" % (outbase,k)
            numpy.savetxt(count_fn,count_dict[k],delimiter="\t")
            numpy.savetxt(normcount_fn,norm_count_dict[k],delimiter="\t")
            numpy.savetxt(mask_fn,norm_count_dict[k].mask,delimiter="\t")
    
    # plotting & offsets
    printer.write("Plotting and determining offsets ...")
    offset_dict = OrderedDict() 

    # Determine scaling factor for plotting metagene profiles
    max_y = numpy.nan 
    with warnings.catch_warnings():
        # ignore warnings for slices that contain only NaNs
        warnings.simplefilter("ignore",category=RuntimeWarning)
        for k in lengths:
            max_y = numpy.nanmax([max_y,
                                  numpy.nanmax(metagene_profile["%s-mers"% k].values)])

    if numpy.isnan(max_y) or max_y == 0:
        max_y = 1.0


    # parse arguments & set styles
    mplrc = matplotlib.rcParams
    plt_incr  = 1.2

    # use this figsize if not specified on command line
    figheight = 1.0 + 0.25*(profiles-1) + 0.75*(profiles)
    default_figsize = (7.5,figheight)

    fig = pp.get_figure_from_args(args,figsize=default_figsize)

    ax = plt.gca()
    plt.title(title)
    plt.xlabel("Distance from CDS start, (nt; 5' end mapping)")
    if args.aggregate == True:
        plt.ylabel("Aggregate read counts (au)")
    else:
        plt.ylabel("Median normalized read density (au)")
        
    plt.axvline(0.0,color=mplrc["axes.edgecolor"],dashes=[3,2])

    x = metagene_profile["x"].values
    xmin = x.min()
    xmax = x.max()
    
    if args.constrain is not None:
        mask = numpy.tile(True,len(x))
        
        zp = (x==0).argmax()
        l,r = args.constrain
        if l == r:
            warnings.warn("Minimum and maximum distance constraints are equal (both '%s'). This is silly." % l,ArgumentWarning)
            
        mindist = min(l,r)
        maxdist = max(l,r)
        
        mask[zp-maxdist:zp-mindist+1] = False
    elif args.require_upstream == True:
        mask = x >= 0
    else:
        mask = numpy.tile(False,len(x))

    for n,k in enumerate(lengths):
        color = colors[n]
        baseline = plt_incr*n
        y = metagene_profile["%s-mers" % k].values
        #ymask = y[mask]
        ymask = numpy.ma.MaskedArray(y,mask=mask)

        if numpy.isnan(y).all():
            plot_y = numpy.zeros_like(x)
        else:
            if args.aggregate == False:
                plot_y = y / max_y
            else:
                plot_y = y.astype(float) / numpy.nanmax(y) * 0.9
 
        # plot metagene profiles on common scale, offset by baseline from bottom to top
        ax.plot(x,baseline + plot_y,color=color)
        ax.text(xmin,baseline,"%s-mers" % k,
                ha="left",
                va="bottom",
                color=color,
                transform=matplotlib.transforms.offset_copy(ax.transData,fig,
                                                            x=6.0,y=3.0,units="points"))

        ymax = baseline + numpy.nanmax(plot_y)

        # if all valid positions are nan, or if all valid positions are <= 0
        if (~mask).sum() == numpy.isnan(ymask).sum() or numpy.nanmax(ymask) == 0:
            offset = args.default
            usedefault = True
        else:
            offset = -x[numpy.ma.argmax(ymask)]
            usedefault = False

        offset_dict[k] = offset
        if usedefault == False:
            yadj = ymax - 0.2 * plt_incr

            ax.plot([-offset,0],[yadj,yadj],color=color,dashes=[3,2])
            ax.text(-offset / 2.0,
                     yadj,
                     "%s nt" % (offset),
                     color=color,
                     ha="center",
                     va="bottom",
                     transform=matplotlib.transforms.offset_copy(ax.transData,fig,
                                                                 x=0.0,y=3.0,units="points")
                    )   

    plt.xlim(xmin,xmax)
    plt.ylim(-0.1,plt_incr+baseline)
    ax.yaxis.set_ticks([])

    # save data as p-site offset table
    fn = "%s_p_offsets.txt" % outbase
    fout = argsopener(fn,args)
    printer.write("Writing offset table to %s ..." % fn)
    fout.write("length\tp_offset\n")
    for k in offset_dict:
        fout.write("%s\t%s\n" % (k,offset_dict[k]))
    
    fout.write("default\t%s" % args.default)
    
    fout.close()

    # save plot
    plot_fn ="%s_p_offsets.%s" % (outbase,args.figformat) 
    printer.write("Saving plot to %s ..." % plot_fn)
    plt.savefig(plot_fn,dpi=args.dpi,bbox_inches="tight")

    printer.write("Done.")
예제 #9
0
def main(argv=sys.argv[1:]):
    """Command-line program
	
	Parameters
	----------
	argv : list, optional
		A list of command-line arguments, which will be processed
		as if the script were called from the command line if
		:py:func:`main` is called directly.

		Default: `sys.argv[1:]`. The command-line arguments, if the script is
		invoked from the command line
	"""
    bp = BaseParser()
    parser = argparse.ArgumentParser(
        description=format_module_docstring(__doc__),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        parents=[bp.get_parser()])
    parser.add_argument("--exclude",
                        nargs="+",
                        default=[],
                        help="Feature types to exclude from consideration")
    parser.add_argument("infile",
                        metavar="infile.gff",
                        type=str,
                        help="Input GFF3 file")
    parser.add_argument("outfile",
                        metavar="outfile.txt",
                        type=str,
                        help="Name of output file")

    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)
    excluded = set(args.exclude)

    fin = sys.stdin if args.infile == "-" else opener(args.infile)

    feature_counts = Counter()
    features_with_parents = []
    feature_types = {}
    name_type = {}

    printer.write("Opening %s..." % args.infile)
    c = 0
    for feature in GFF3_Reader(fin, return_stopfeatures=False):
        if c % 10000 == 0:
            printer.write("Processed %s features..." % c)
        c += 1
        ftype = feature.attr["type"]
        fname = feature.get_name()
        if ftype not in excluded:
            if ftype not in feature_types:
                feature_types[ftype] = Counter()
            feature_counts[ftype] += 1
            if fname is not None:
                name_type[fname] = ftype
            if "Parent" in feature.attr:
                features_with_parents.append(feature)
            else:
                feature_types[ftype]["parent unspecified"] += 1

    printer.write("Sorting parents...")
    c = 0
    for feature in features_with_parents:
        if c % 10000 == 0:
            printer.write("Processed %s parents..." % c)
        c += 1
        pnames = feature.attr["Parent"]
        ftype = feature.attr["type"]
        if pnames == "":
            feature_types[ftype]["parent unspecified"] += 1
        else:
            if len(pnames) > 1:
                feature_types[ftype]["multiple parents"] += 1
            else:
                ptype = name_type.get(pnames[0], "parent not in database")
                feature_types[ftype][ptype] += 1

    rows = sorted(feature_types.keys())
    cols = rows + [
        "parent unspecified", "parent not in database", "multiple parents"
    ]

    with argsopener(args.outfile, args, "w") as fh:
        printer.write("Writing %s..." % args.outfile)
        header = "#feature_type\tcount\t" + "\t".join(cols) + "\n"
        fh.write(header)
        for r in rows:
            sout = "%s\t%s" % (r, feature_counts[r])
            for i in cols:
                sout += "\t%s" % feature_types[r].get(i, 0)
            fh.write("%s\n" % sout)

    printer.write("Done.")
예제 #10
0
def main(argv=sys.argv[1:]):
    """Command-line program
    
    Parameters
    ----------
	argv : list, optional
		A list of command-line arguments, which will be processed
		as if the script were called from the command line if
		:py:func:`main` is called directly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script is
        invoked from the command line
    """
    al = AlignmentParser(
        disabled=["normalize", "big_genome", "spliced_bowtie_files"],
        input_choices=["BAM"])
    an = AnnotationParser()
    pp = PlottingParser()
    bp = BaseParser()
    plotting_parser = pp.get_parser()

    alignment_file_parser = al.get_parser(conflict_handler="resolve")
    annotation_file_parser = an.get_parser(conflict_handler="resolve")
    base_parser = bp.get_parser()

    parser = argparse.ArgumentParser(
        description=format_module_docstring(__doc__),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        conflict_handler="resolve",
        parents=[
            base_parser, annotation_file_parser, alignment_file_parser,
            plotting_parser
        ])

    parser.add_argument("roi_file",type=str,nargs="?",default=None,
                        help="Optional. ROI file of maximal spanning windows surrounding start codons, "+\
                             "from ``metagene generate`` subprogram. Using this instead of `--annotation_files` "+\
                             "prevents double-counting of codons when multiple transcript isoforms exist "+\
                             "for a gene. See the documentation for `metagene` for more info about ROI files."+\
                             "If an ROI file is not given, supply an annotation with ``--annotation_files``")
    parser.add_argument("outbase",
                        type=str,
                        help="Required. Basename for output files")
    parser.add_argument(
        "--codon_buffer",
        type=int,
        default=5,
        help="Codons before and after start codon to ignore (Default: 5)")

    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)
    pp.set_style_from_args(args)
    gnd = al.get_genome_array_from_args(args, printer=printer)

    read_lengths = list(range(args.min_length, args.max_length + 1))
    codon_buffer = args.codon_buffer

    dtmp = {
        "read_length": numpy.array(read_lengths),
        "reads_counted": numpy.zeros_like(read_lengths, dtype=int),
    }

    if args.roi_file is not None:
        using_roi = True
        roi_table = read_pl_table(args.roi_file)
        regions = roi_table.iterrows()
        transform_fn = roi_row_to_cds
        back_buffer = -1
        if len(args.annotation_files) > 0:
            warnings.warn(
                "If an ROI file is given, annotation files are ignored. Pulling regions from '%s'. Ignoring '%s'"
                % (args.roi_file, ", ".join(args.annotation_files)),
                ArgumentWarning)
    else:
        using_roi = False
        if len(args.annotation_files) == 0:
            printer.write(
                "Either an ROI file or at least annotation file must be given."
            )
            sys.exit(1)
        else:
            warnings.warn(
                "Using a transcript annotation file instead of an ROI file can lead to double-counting of codons if the annotation contains multiple transcripts per gene.",
                ArgumentWarning)
            regions = an.get_transcripts_from_args(args, printer=printer)
            back_buffer = -codon_buffer
            transform_fn = lambda x: x.get_cds()

    phase_sums = {}
    for k in read_lengths:
        phase_sums[k] = numpy.zeros(3)

    for n, roi in enumerate(regions):
        if n % 1000 == 1:
            printer.write("Counted %s ROIs ..." % n)

        # transformation needed to extract CDS from transcript or from ROI file window
        cds_part = transform_fn(roi)

        # only calculate for coding genes
        if len(cds_part) > 0:

            read_dict = {}
            count_vectors = {}
            for k in read_lengths:
                read_dict[k] = []
                count_vectors[k] = []

            # for each seg, fetch reads, sort them, and create individual count vectors
            for seg in cds_part:
                reads = gnd.get_reads(seg)
                for read in filter(lambda x: len(x.positions) in read_dict,
                                   reads):
                    read_dict[len(read.positions)].append(read)

                # map and sort by length
                for read_length in read_dict:
                    count_vector = list(
                        gnd.map_fn(read_dict[read_length], seg)[1])
                    count_vectors[read_length].extend(count_vector)

            # add each count vector for each length to total
            for k, vec in count_vectors.items():
                counts = numpy.array(vec)
                if cds_part.strand == "-":
                    counts = counts[::-1]

                if len(counts) % 3 == 0:
                    counts = counts.reshape((int(len(counts) / 3), 3))
                else:
                    if using_roi == False:
                        message = "Length of '%s' coding region (%s nt) is not divisible by 3. Ignoring last partial codon." % (
                            roi.get_name(), len(counts))
                        warnings.warn(message, DataWarning)
                    newlen = int(len(counts) // 3)
                    counts = counts[:3 * newlen]
                    counts = counts.reshape(newlen, 3)

                phase_sums[k] += counts[codon_buffer:back_buffer, :].sum(0)

    printer.write("Counted %s ROIs total." % (n + 1))
    for k in dtmp:
        dtmp[k] = numpy.array(dtmp[k])

    # total reads counted for each size
    for k in read_lengths:
        dtmp["reads_counted"][dtmp["read_length"] == k] = phase_sums[k].sum()

    # read length distribution
    dtmp["fraction_reads_counted"] = dtmp["reads_counted"].astype(
        float) / dtmp["reads_counted"].sum()

    # phase vectors
    phase_vectors = {
        K: V.astype(float) / V.astype(float).sum()
        for K, V in phase_sums.items()
    }
    for i in range(3):
        dtmp["phase%s" % i] = numpy.zeros(len(dtmp["read_length"]))

    for k, vec in phase_vectors.items():
        for i in range(3):
            dtmp["phase%s" % i][dtmp["read_length"] == k] = vec[i]

    # phase table
    fn = "%s_phasing.txt" % args.outbase
    printer.write("Saving phasing table to %s ..." % fn)
    dtmp = pd.DataFrame(dtmp)
    with argsopener(fn, args) as fh:
        dtmp.to_csv(fh,
                    columns=[
                        "read_length",
                        "reads_counted",
                        "fraction_reads_counted",
                        "phase0",
                        "phase1",
                        "phase2",
                    ],
                    float_format="%.6f",
                    na_rep="nan",
                    sep="\t",
                    index=False,
                    header=True)
        fh.close()

    fig = {}
    if args.figsize is not None:
        fig["figsize"] = tuple(args.figsize)

    colors = pp.get_colors_from_args(args, len(read_lengths))

    fn = "%s_phasing.%s" % (args.outbase, args.figformat)
    printer.write("Plotting to %s ..." % fn)
    plot_counts = numpy.vstack([V for (_, V) in sorted(phase_sums.items())])
    fig, (ax1, _) = phase_plot(plot_counts,
                               labels=read_lengths,
                               lighten_by=0.3,
                               cmap=None,
                               color=colors,
                               fig=fig)

    if args.title is not None:
        ax1.set_title(args.title)
    else:
        ax1.set_title("Phasing stats for %s" % args.outbase)

    fig.savefig(fn, dpi=args.dpi, bbox_inches="tight")
예제 #11
0
def main(argv=sys.argv[1:]):
    """Command-line program
    
    Parameters
    ----------
	argv : list, optional
		A list of command-line arguments, which will be processed
		as if the script were called from the command line if
		:py:func:`main` is called directly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script is
        invoked from the command line
    """
    al = AlignmentParser(disabled=["normalize","big_genome","spliced_bowtie_files"],
                        input_choices=["BAM"])
    an = AnnotationParser()
    pp = PlottingParser()
    bp = BaseParser()
    plotting_parser = pp.get_parser()
    
    alignment_file_parser = al.get_parser(conflict_handler="resolve")
    annotation_file_parser = an.get_parser(conflict_handler="resolve")
    base_parser = bp.get_parser()

    parser = argparse.ArgumentParser(description=format_module_docstring(__doc__),
                                     formatter_class=argparse.RawDescriptionHelpFormatter,
                                     conflict_handler="resolve",
                                     parents=[base_parser,
                                              annotation_file_parser,
                                              alignment_file_parser,
                                              plotting_parser])
    
    parser.add_argument("roi_file",type=str,nargs="?",default=None,
                        help="Optional. ROI file of maximal spanning windows surrounding start codons, "+\
                             "from ``metagene generate`` subprogram. Using this instead of `--annotation_files` "+\
                             "prevents double-counting of codons when multiple transcript isoforms exist "+\
                             "for a gene. See the documentation for `metagene` for more info about ROI files."+\
                             "If an ROI file is not given, supply an annotation with ``--annotation_files``")
    parser.add_argument("outbase",type=str,help="Required. Basename for output files")
    parser.add_argument("--codon_buffer",type=int,default=5,
                        help="Codons before and after start codon to ignore (Default: 5)")


    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)
    pp.set_style_from_args(args)
    gnd = al.get_genome_array_from_args(args,printer=printer)

    read_lengths = list(range(args.min_length,args.max_length+1))
    codon_buffer = args.codon_buffer
    
    dtmp = { "read_length"   : numpy.array(read_lengths),
             "reads_counted" : numpy.zeros_like(read_lengths,dtype=int),
            }

    if args.roi_file is not None:
        using_roi = True
        roi_table = read_pl_table(args.roi_file)
        regions = roi_table.iterrows()
        transform_fn = roi_row_to_cds
        back_buffer = -1
        if len(args.annotation_files) > 0:
            warnings.warn("If an ROI file is given, annotation files are ignored. Pulling regions from '%s'. Ignoring '%s'" % (args.roi_file,
                                                                                                                               ", ".join(args.annotation_files)),
                          ArgumentWarning)
    else:
        using_roi = False
        if len(args.annotation_files) == 0:
            printer.write("Either an ROI file or at least annotation file must be given.")
            sys.exit(1)
        else:
            warnings.warn("Using a transcript annotation file instead of an ROI file can lead to double-counting of codons if the annotation contains multiple transcripts per gene.",
                          ArgumentWarning)        
            regions = an.get_transcripts_from_args(args,printer=printer)
            back_buffer  = -codon_buffer
            transform_fn = lambda x: x.get_cds()
    
    phase_sums = {}
    for k in read_lengths:
        phase_sums[k] = numpy.zeros(3)
    
    for n, roi in enumerate(regions):
        if n % 1000 == 1:
            printer.write("Counted %s ROIs ..." % n)
        
        # transformation needed to extract CDS from transcript or from ROI file window
        cds_part = transform_fn(roi)
        
        # only calculate for coding genes
        if len(cds_part) > 0:

            read_dict     = {}
            count_vectors = {}
            for k in read_lengths:
                read_dict[k]     = []
                count_vectors[k] = []
            
            # for each seg, fetch reads, sort them, and create individual count vectors
            for seg in cds_part:
                reads = gnd.get_reads(seg)
                for read in filter(lambda x: len(x.positions) in read_dict,reads):
                    read_dict[len(read.positions)].append(read)
    
                # map and sort by length
                for read_length in read_dict:
                    count_vector = list(gnd.map_fn(read_dict[read_length],seg)[1])
                    count_vectors[read_length].extend(count_vector)
            
            # add each count vector for each length to total
            for k, vec in count_vectors.items():
                counts = numpy.array(vec)
                if cds_part.strand == "-":
                    counts = counts[::-1]
               
                if len(counts) % 3 == 0:
                    counts = counts.reshape((len(counts)/3,3))
                else:
                    if using_roi == False:
                        message = "Length of '%s' coding region (%s nt) is not divisible by 3. Ignoring last partial codon." % (roi.get_name(),len(counts))
                        warnings.warn(message,DataWarning)
                    newlen = len(counts)//3
                    counts = counts[:3*newlen]
                    counts = counts.reshape(newlen,3)
    
                phase_sums[k] += counts[codon_buffer:back_buffer,:].sum(0)

    printer.write("Counted %s ROIs total." % (n+1))
    for k in dtmp:
        dtmp[k] = numpy.array(dtmp[k])
    
    # total reads counted for each size    
    for k in read_lengths:
        dtmp["reads_counted"][dtmp["read_length"] == k] = phase_sums[k].sum() 
    
    # read length distribution
    dtmp["fraction_reads_counted"] = dtmp["reads_counted"].astype(float) / dtmp["reads_counted"].sum() 
    
    # phase vectors
    phase_vectors = { K : V.astype(float)/V.astype(float).sum() for K,V in phase_sums.items() }
    for i in range(3):
        dtmp["phase%s" % i] = numpy.zeros(len(dtmp["read_length"])) 

    for k, vec in phase_vectors.items():
        for i in range(3):
            dtmp["phase%s" % i][dtmp["read_length"] == k] = vec[i]
    
    # phase table
    fn = "%s_phasing.txt" % args.outbase
    printer.write("Saving phasing table to %s ..." % fn)
    dtmp = pd.DataFrame(dtmp)
    with argsopener(fn,args) as fh:
        dtmp.to_csv(fh,columns=["read_length",
                                "reads_counted",
                                "fraction_reads_counted",
                                "phase0",
                                "phase1",
                                "phase2",
                                ],
                       float_format="%.6f",
                       na_rep="nan",
                       sep="\t",
                       index=False,
                       header=True
                      )
        fh.close()
    
    fig = {}
    if args.figsize is not None:
        fig["figsize"] = tuple(args.figsize)

    colors = pp.get_colors_from_args(args,len(read_lengths))

    fn = "%s_phasing.%s" % (args.outbase,args.figformat)
    printer.write("Plotting to %s ..." % fn)
    plot_counts = numpy.vstack([V for (_,V) in sorted(phase_sums.items())])
    fig, (ax1,_) = phase_plot(plot_counts,labels=read_lengths,lighten_by=0.3,
                                cmap=None,color=colors,fig=fig)

    if args.title is not None:
        ax1.set_title(args.title)
    else:
        ax1.set_title("Phasing stats for %s" % args.outbase)

    fig.savefig(fn,dpi=args.dpi,bbox_inches="tight")
예제 #12
0
파일: cs.py 프로젝트: zzygyx9119/plastid
def main(argv=sys.argv[1:]):
    """Command-line program
    
    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :func:`main` is called directly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script is
        invoked from the command line
    """
    al = AlignmentParser(disabled=["normalize"])
    an = AnnotationParser()
    mp = MaskParser()
    pl = PlottingParser()
    bp = BaseParser()

    alignment_file_parser = al.get_parser()
    annotation_file_parser = an.get_parser()
    mask_file_parser = mp.get_parser()
    plotting_parser = pl.get_parser()
    base_parser = bp.get_parser()

    generator_help = "Create unambiguous position file from GFF3 annotation"
    generator_desc = format_module_docstring(do_generate.__doc__)

    counter_help = "Count reads in unambiguous gene positions"
    counter_desc = format_module_docstring(do_count.__doc__)

    chart_help = "Produce charts comparing reads between samples"
    chart_desc = format_module_docstring(do_chart.__doc__)

    parser = argparse.ArgumentParser(
        description=format_module_docstring(__doc__),
        formatter_class=argparse.RawDescriptionHelpFormatter)
    subparsers = parser.add_subparsers(
        title="subcommands",
        description="choose one of the following",
        dest="program")
    gparser = subparsers.add_parser(
        "generate",
        help=generator_help,
        description=generator_desc,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        parents=[base_parser, annotation_file_parser, mask_file_parser],
    )
    cparser = subparsers.add_parser(
        "count",
        help=counter_help,
        description=counter_desc,
        parents=[base_parser, alignment_file_parser],
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    pparser = subparsers.add_parser(
        "chart",
        help=chart_help,
        description=chart_desc,
        parents=[base_parser, plotting_parser],
        formatter_class=argparse.RawDescriptionHelpFormatter)

    gparser.add_argument("outbase",
                         metavar="outbase",
                         type=str,
                         help="Basename for output files")

    cparser.add_argument(
        "position_file",
        type=str,
        metavar="file.positions",
        help=
        "File assigning positions to genes or transcripts (made using 'generate' subcommand)"
    )
    cparser.add_argument("outbase", type=str, help="Basename for output files")

    pparser.add_argument("-i",
                         "--in",
                         nargs="+",
                         type=str,
                         dest="infiles",
                         help="input files, made by 'count' subprogram")
    pparser.add_argument(
        "--bins",
        nargs="+",
        type=int,
        default=(0, 32, 64, 128, 256, 512, 1024, 2048, 4096),
        help="Bins into which features are partitioned based on counts")
    pparser.add_argument(
        "--regions",
        nargs="+",
        type=str,
        default=("exon", "utr5", "cds", "utr3"),
        help="Regions to compare (default: exon, utr5, cds, utr3)")
    pparser.add_argument("--metrics",
                         nargs="+",
                         type=str,
                         default=("rpkm", "reads"),
                         help="Metrics to compare (default: rpkm, reads)")
    pparser.add_argument(
        "list_of_regions",
        type=str,
        metavar='gene_list.txt',
        nargs="?",
        default=None,
        help=
        "Optional. File listing regions (genes or transcripts), one per line, to include in comparisons. If not given, all genes are included."
    )
    pparser.add_argument("outbase", type=str, help="Basename for output files")

    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)

    if args.program == "generate":
        #generate position file
        do_generate(args, an, mp)

    elif args.program == "count":
        #use position file to count gene expression in infiles
        do_count(args, al)

    elif args.program == "chart":
        #use count files to generate a family of charts and tables
        do_chart(args, pl)
예제 #13
0
def main(argv=sys.argv[1:]):
    """Command-line program
    
    Parameters
    ----------
	argv : list, optional
		A list of command-line arguments, which will be processed
		as if the script were called from the command line if
		:py:func:`main` is called directly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script is
        invoked from the command line
    """
    sp = SequenceParser()
    bp = BaseParser()
    parser = argparse.ArgumentParser(description=format_module_docstring(__doc__),
                                     formatter_class=argparse.RawDescriptionHelpFormatter,
                                     parents=[bp.get_parser(),sp.get_parser()])
    parser.add_argument("-k",dest="read_length",metavar="READ_LENGTH",
                        type=int,default=29,
                        help="K-mer length to generate from input file. "+
                             "(Default: 29)")
    parser.add_argument("--offset",type=int,default=14,
                        help="Offset from 5' end of plus-strand read at which to attribute score (Default: 14)")
    parser.add_argument("--mismatches",metavar="N",
                        type=int,default=0,
                        help="Number of mismatches tolerated in alignment. "+
                           "(Default: 0)")
    parser.add_argument("--bowtie",dest="bowtie",default="/usr/local/bin/bowtie",
                        type=str,
                        help="Location of bowtie binary (Default: ``/usr/local/bin/bowtie``)")
    parser.add_argument("--have_kmers",default=False,action="store_true",
                        help="If specified, use k-mer files from previous run. "+\
                             " In this case 'sequence_file' should be the value "+\
                             "'outbase' from the k-mer files you want to use.")
    parser.add_argument("--save_kmers",default=False,action="store_true",
                        help="Save k-mer files for reuse in a subsequent run.")
    parser.add_argument("-p","--processes",type=int,default=2,metavar="N",
                        help="Number of processes to use (should be <= number of chromosomes")
    parser.add_argument("ebwt",type=str,
                        help="Bowtie index of genome against which crossmap will be made. In most cases, should be generated from the same sequences that are in `sequence_file`.")
    parser.add_argument("outbase",type=str,
                        help="Basename for output files")

    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)


    #filenames
    base         = "%s_%s_%s" % (args.outbase, args.read_length, args.mismatches)
    bed_file     = "%s_crossmap.bed" % base

    #if not os.path.exists(args.sequence_file):
    #    printer.write("Could not find source file: %s" % args.sequence_file)
    #    printer.write("Exiting.")
    #    sys.exit(1)

    if args.have_kmers == True:
        import glob
        kmer_files = glob.glob(args.sequence_file+"*kmers.fa")
        seq_pat = re.compile(r".*_([^_]*)_kmers.fa")
        seqs = { seq_pat.search(X).groups()[0] : X for X in kmer_files }
    else:
        seqs = sp.get_seqdict_from_args(args,index=True) 

    worker = functools.partial(chrom_worker,args=args)
    chroms = seqs.items()

    pool = multiprocessing.Pool(processes=args.processes)
    bed_filenames = pool.map(worker,chroms,1)
    pool.close()
    pool.join()
   
    with open(bed_file,"w") as fout:
        for f in sorted(bed_filenames):
            shutil.copyfileobj(open(f,"r"),fout)
            os.remove(f)

    fout.close()

    printer.write("Done.")
    printer.write(BigBedMessage.replace("OUTFILE",bed_file.replace(".bed","")).replace("BOWTIE_INDEX",args.ebwt))
예제 #14
0
def main(argv=sys.argv[1:]):
    """Command-line program
    
    Parameters
    ----------
	argv : list, optional
		A list of command-line arguments, which will be processed
		as if the script were called from the command line if
		:py:func:`main` is called directly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script is
        invoked from the command line
    """
    sp = SequenceParser()
    bp = BaseParser()
    parser = argparse.ArgumentParser(
        description=format_module_docstring(__doc__),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        parents=[bp.get_parser(), sp.get_parser()])
    parser.add_argument("-k",
                        dest="read_length",
                        metavar="READ_LENGTH",
                        type=int,
                        default=29,
                        help="K-mer length to generate from input file. " +
                        "(Default: 29)")
    parser.add_argument(
        "--offset",
        type=int,
        default=14,
        help=
        "Offset from 5' end of plus-strand read at which to attribute score (Default: 14)"
    )
    parser.add_argument("--mismatches",
                        metavar="N",
                        type=int,
                        default=0,
                        help="Number of mismatches tolerated in alignment. " +
                        "(Default: 0)")
    parser.add_argument(
        "--bowtie",
        dest="bowtie",
        default="/usr/local/bin/bowtie",
        type=str,
        help="Location of bowtie binary (Default: ``/usr/local/bin/bowtie``)")
    parser.add_argument("--have_kmers",default=False,action="store_true",
                        help="If specified, use k-mer files from previous run. "+\
                             " In this case 'sequence_file' should be the value "+\
                             "'outbase' from the k-mer files you want to use.")
    parser.add_argument("--save_kmers",
                        default=False,
                        action="store_true",
                        help="Save k-mer files for reuse in a subsequent run.")
    parser.add_argument(
        "-p",
        "--processes",
        type=int,
        default=2,
        metavar="N",
        help="Number of processes to use (should be <= number of chromosomes")
    parser.add_argument(
        "ebwt",
        type=str,
        help=
        "Bowtie index of genome against which crossmap will be made. In most cases, should be generated from the same sequences that are in `sequence_file`."
    )
    parser.add_argument("outbase", type=str, help="Basename for output files")

    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)

    #filenames
    base = "%s_%s_%s" % (args.outbase, args.read_length, args.mismatches)
    bed_file = "%s_crossmap.bed" % base

    #if not os.path.exists(args.sequence_file):
    #    printer.write("Could not find source file: %s" % args.sequence_file)
    #    printer.write("Exiting.")
    #    sys.exit(1)

    if args.have_kmers == True:
        import glob
        kmer_files = glob.glob(args.sequence_file + "*kmers.fa")
        seq_pat = re.compile(r".*_([^_]*)_kmers.fa")
        seqs = {seq_pat.search(X).groups()[0]: X for X in kmer_files}
    else:
        seqs = sp.get_seqdict_from_args(args, index=True)

    worker = functools.partial(chrom_worker, args=args)
    chroms = seqs.items()

    pool = multiprocessing.Pool(processes=args.processes)
    bed_filenames = pool.map(worker, chroms, 1)
    pool.close()
    pool.join()

    with open(bed_file, "w") as fout:
        for f in sorted(bed_filenames):
            shutil.copyfileobj(open(f, "r"), fout)
            os.remove(f)

    fout.close()

    printer.write("Done.")
    printer.write(
        BigBedMessage.replace("OUTFILE", bed_file.replace(".bed", "")).replace(
            "BOWTIE_INDEX", args.ebwt))
예제 #15
0
def main(argv=sys.argv[1:]):
    """Command-line program
    
    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :py:func:`main` is called directly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script is
        invoked from the command line
    """
    sp = SequenceParser()
    mp = MaskParser()
    bp = BaseParser()
    
    parser = argparse.ArgumentParser(description=format_module_docstring(__doc__),
                                     formatter_class=argparse.RawDescriptionHelpFormatter,
                                     parents=[bp.get_parser(),sp.get_parser(),mp.get_parser()],
                                     )
    parser.add_argument("--maxslide",type=int,default=10,
                        help="Maximum number of nt to search 5\' and 3\' of intron"+
                             " boundaries (Default: 10)")
    parser.add_argument("--ref",type=str,metavar="ref.bed",default=None,
                        help="Reference file describing known splice junctions")
    parser.add_argument("--slide_canonical",action="store_true",default=False,
                        help="Slide junctions to canonical junctions if present within equal support region")
    parser.add_argument("infile",type=str,metavar="input.bed",
                        help="BED file describing discovered junctions")
    parser.add_argument("outbase",type=str,
                        help="Basename for output files")
    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)
    
    printer.write("Opening genome from %s..." % args.sequence_file)
    genome = sp.get_seqdict_from_args(args)
    
    # load crossmap    
    cross_hash = mp.get_genome_hash_from_args(args)

    # load ref junctions
    if args.ref is not None:
        printer.write("Loading reference junctions from %s" % args.ref)
        known_hash = GenomeHash(list(BED_Reader(open(args.ref))),do_copy=False)
    else:
        known_hash = GenomeHash()

    # set up variables    
    canonicals_plus = [("GT","AG"),
                       ("GC","AG")
                      ]
    
    canonicals_minus = [("CT","AC"),
                        ("CT","GC")
                       ]
    
    known_in_range     = 0
    canonical_in_range = 0
    repetitive         = 0
    untouched          = 0
    c = 0
    
    seen_already = []

    outfiles = {
                 "repetitive" : "%s_repetitive.bed" % args.outbase,
                 "known"      : "%s_shifted_known.bed" % args.outbase,
                 "canonical"  : "%s_shifted_canonical.bed" % args.outbase,
                 "untouched"  : "%s_untouched.bed" % args.outbase,
                }
    outfiles = { K : argsopener(V,args,"w") for K,V in outfiles.items() }

    # process data
    printer.write("Opening junctions from %s..." % args.infile)
    for ivc in BED_Reader(CommentReader(opener(args.infile))):
        processed = False
        tup = None

        if c % 1000 == 0 and c > 0:
            printer.write("Processed: %s\tknown: %s\tshifted to canonical: %s\trepetitive: %s\tuntouched: %s" % \
                    (c, known_in_range, canonical_in_range, repetitive, untouched))
                   
        assert len(ivc) == 2
        strand = ivc.strand
        
        minus_range, plus_range = find_match_range(ivc,genome,args.maxslide)
        
        # see if either end of splice junction +- match_range lands in repetitive areas of genome
        if covered_by_repetitive(ivc,minus_range,plus_range,cross_hash):
            repetitive += 1
            outfiles["repetitive"].write(ivc.as_bed())
            processed = True

        # see if one or more known junctions in range
        if processed == False and args.ref is not None:
            # find_known_in_range(query_ivc,minus_range,plus_range,knownjunctions)
            known_juncs = find_known_in_range(ivc,minus_range,plus_range,known_hash.get_nearby_features(ivc))
            if len(known_juncs) > 0:
                known_in_range += 1
                for my_known in known_juncs:
                    tup = get_junction_tuple(my_known)
                    if tup not in seen_already:
                        outfiles["known"].write(my_known.as_bed())
                        seen_already.append(tup)
                    
                processed = True
            
        # see if one or more canonical junctions in range
        if processed == False and args.slide_canonical == True:
            canonicals = canonicals_plus if strand == "+" else canonicals_minus
            #find_canonicals_in_range(query_ivc,minus_range,plus_range,genome,canonicals)
            canonical_juncs = find_canonicals_in_range(ivc,minus_range,plus_range,genome,canonicals)
            if len(canonical_juncs) > 0:
                canonical_in_range += 1
                for can in canonical_juncs:
                    tup = get_junction_tuple(can)
                    if tup not in seen_already:
                        outfiles["canonical"].write(can.as_bed())
                        seen_already.append(tup)

                processed = True
                    
        if processed == False:
            outfiles["untouched"].write(ivc.as_bed())
            untouched += 1
            
        c += 1

    # save output
    printer.write("Totals: %s\tknown: %s\tshifted to canonical: %s\trepetitive: %s\tuntouched: %s" % \
            (c, known_in_range, canonical_in_range, repetitive, untouched))    

    for v in outfiles.values():
        v.close()
    
    printer.write("Done.")
예제 #16
0
def main(argv=sys.argv[1:]):
    """Command-line program
    
    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :func:`main` is called directly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script is
        invoked from the command line
    """
    ap = AnnotationParser()
    annotation_file_parser = ap.get_parser(conflict_handler="resolve")

    al = AlignmentParser(disabled=_DISABLED)
    alignment_file_parser = al.get_parser(conflict_handler="resolve")

    mp = MaskParser()
    mask_file_parser = mp.get_parser()

    bp = BaseParser()
    base_parser = bp.get_parser()

    parser = argparse.ArgumentParser(
        description=format_module_docstring(__doc__),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        parents=[
            base_parser, alignment_file_parser, annotation_file_parser,
            mask_file_parser
        ],
    )
    parser.add_argument("outfile", type=str, help="Output filename")

    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)

    ga = al.get_genome_array_from_args(args, printer=printer)
    transcripts = ap.get_transcripts_from_args(args,
                                               printer=printer,
                                               return_type=SegmentChain)
    crossmap = mp.get_genome_hash_from_args(args, printer=printer)

    ga_sum = ga.sum()
    normconst = 1000.0 * 1e6 / ga_sum

    with argsopener(args.outfile, args, "w") as fout:
        fout.write("## total_dataset_counts: %s\n" % ga_sum)
        fout.write(
            "region_name\tregion\tcounts\tcounts_per_nucleotide\trpkm\tlength\n"
        )
        for n, ivc in enumerate(transcripts):
            name = ivc.get_name()
            masks = crossmap.get_overlapping_features(ivc)
            ivc.add_masks(*itertools.chain.from_iterable((X for X in masks)))
            if n % 1000 == 0:
                printer.write("Processed %s regions..." % n)

            counts = numpy.nansum(ivc.get_masked_counts(ga))
            length = ivc.masked_length
            rpnt = numpy.nan if length == 0 else float(counts) / length
            rpkm = numpy.nan if length == 0 else rpnt * normconst
            ltmp = [
                name,
                str(ivc),
                "%.8e" % counts,
                "%.8e" % rpnt,
                "%.8e" % rpkm,
                "%d" % length
            ]
            fout.write("%s\n" % "\t".join(ltmp))

        fout.close()

    printer.write("Processed %s regions total." % n)

    printer.write("Done.")