def check_get_junction_tuple(ivc,expected_tup): """Check output of :py:func:`splice_to_tuple` Parameters ---------- ivc : |SegmentChain| A two-exon fragment representing a splice junction expected_tup : tuple (chromosome name, half-open end of fiveprime exon, first position of threeprime exon, strand) """ found_tup = get_junction_tuple(ivc) assert_equal(found_tup,expected_tup,"Expected %s, got %s." % (expected_tup,found_tup))
def check_get_junction_tuple(ivc, expected_tup): """Check output of :py:func:`splice_to_tuple` Parameters ---------- ivc : |SegmentChain| A two-exon fragment representing a splice junction expected_tup : tuple (chromosome name, half-open end of fiveprime exon, first position of threeprime exon, strand) """ found_tup = get_junction_tuple(ivc) assert_equal(found_tup, expected_tup, "Expected %s, got %s." % (expected_tup, found_tup))
def main(argv=sys.argv[1:]): """Command-line program Parameters ---------- argv : list, optional A list of command-line arguments, which will be processed as if the script were called from the command line if :py:func:`main` is called directly. Default: `sys.argv[1:]`. The command-line arguments, if the script is invoked from the command line """ sp = SequenceParser() mp = MaskParser() bp = BaseParser() parser = argparse.ArgumentParser(description=format_module_docstring(__doc__), formatter_class=argparse.RawDescriptionHelpFormatter, parents=[bp.get_parser(),sp.get_parser(),mp.get_parser()], ) parser.add_argument("--maxslide",type=int,default=10, help="Maximum number of nt to search 5\' and 3\' of intron"+ " boundaries (Default: 10)") parser.add_argument("--ref",type=str,metavar="ref.bed",default=None, help="Reference file describing known splice junctions") parser.add_argument("--slide_canonical",action="store_true",default=False, help="Slide junctions to canonical junctions if present within equal support region") parser.add_argument("infile",type=str,metavar="input.bed", help="BED file describing discovered junctions") parser.add_argument("outbase",type=str, help="Basename for output files") args = parser.parse_args(argv) bp.get_base_ops_from_args(args) printer.write("Opening genome from %s..." % args.sequence_file) genome = sp.get_seqdict_from_args(args) # load crossmap cross_hash = mp.get_genome_hash_from_args(args) # load ref junctions if args.ref is not None: printer.write("Loading reference junctions from %s" % args.ref) known_hash = GenomeHash(list(BED_Reader(open(args.ref))),do_copy=False) else: known_hash = GenomeHash() # set up variables canonicals_plus = [("GT","AG"), ("GC","AG") ] canonicals_minus = [("CT","AC"), ("CT","GC") ] known_in_range = 0 canonical_in_range = 0 repetitive = 0 untouched = 0 c = 0 seen_already = [] outfiles = { "repetitive" : "%s_repetitive.bed" % args.outbase, "known" : "%s_shifted_known.bed" % args.outbase, "canonical" : "%s_shifted_canonical.bed" % args.outbase, "untouched" : "%s_untouched.bed" % args.outbase, } outfiles = { K : argsopener(V,args,"w") for K,V in outfiles.items() } # process data printer.write("Opening junctions from %s..." % args.infile) for ivc in BED_Reader(CommentReader(opener(args.infile))): processed = False tup = None if c % 1000 == 0 and c > 0: printer.write("Processed: %s\tknown: %s\tshifted to canonical: %s\trepetitive: %s\tuntouched: %s" % \ (c, known_in_range, canonical_in_range, repetitive, untouched)) assert len(ivc) == 2 strand = ivc.strand minus_range, plus_range = find_match_range(ivc,genome,args.maxslide) # see if either end of splice junction +- match_range lands in repetitive areas of genome if covered_by_repetitive(ivc,minus_range,plus_range,cross_hash): repetitive += 1 outfiles["repetitive"].write(ivc.as_bed()) processed = True # see if one or more known junctions in range if processed == False and args.ref is not None: # find_known_in_range(query_ivc,minus_range,plus_range,knownjunctions) known_juncs = find_known_in_range(ivc,minus_range,plus_range,known_hash.get_nearby_features(ivc)) if len(known_juncs) > 0: known_in_range += 1 for my_known in known_juncs: tup = get_junction_tuple(my_known) if tup not in seen_already: outfiles["known"].write(my_known.as_bed()) seen_already.append(tup) processed = True # see if one or more canonical junctions in range if processed == False and args.slide_canonical == True: canonicals = canonicals_plus if strand == "+" else canonicals_minus #find_canonicals_in_range(query_ivc,minus_range,plus_range,genome,canonicals) canonical_juncs = find_canonicals_in_range(ivc,minus_range,plus_range,genome,canonicals) if len(canonical_juncs) > 0: canonical_in_range += 1 for can in canonical_juncs: tup = get_junction_tuple(can) if tup not in seen_already: outfiles["canonical"].write(can.as_bed()) seen_already.append(tup) processed = True if processed == False: outfiles["untouched"].write(ivc.as_bed()) untouched += 1 c += 1 # save output printer.write("Totals: %s\tknown: %s\tshifted to canonical: %s\trepetitive: %s\tuntouched: %s" % \ (c, known_in_range, canonical_in_range, repetitive, untouched)) for v in outfiles.values(): v.close() printer.write("Done.")