示例#1
0
def check_get_junction_tuple(ivc,expected_tup):
    """Check output of :py:func:`splice_to_tuple`
    
    Parameters
    ----------
    ivc : |SegmentChain|
         A two-exon fragment representing a splice junction

    expected_tup : tuple
        (chromosome name, half-open end of fiveprime exon, first position of threeprime exon, strand)
    """
    found_tup = get_junction_tuple(ivc)
    assert_equal(found_tup,expected_tup,"Expected %s, got %s." % (expected_tup,found_tup))
示例#2
0
def check_get_junction_tuple(ivc, expected_tup):
    """Check output of :py:func:`splice_to_tuple`
    
    Parameters
    ----------
    ivc : |SegmentChain|
         A two-exon fragment representing a splice junction

    expected_tup : tuple
        (chromosome name, half-open end of fiveprime exon, first position of threeprime exon, strand)
    """
    found_tup = get_junction_tuple(ivc)
    assert_equal(found_tup, expected_tup,
                 "Expected %s, got %s." % (expected_tup, found_tup))
示例#3
0
def main(argv=sys.argv[1:]):
    """Command-line program
    
    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :py:func:`main` is called directly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script is
        invoked from the command line
    """
    sp = SequenceParser()
    mp = MaskParser()
    bp = BaseParser()
    
    parser = argparse.ArgumentParser(description=format_module_docstring(__doc__),
                                     formatter_class=argparse.RawDescriptionHelpFormatter,
                                     parents=[bp.get_parser(),sp.get_parser(),mp.get_parser()],
                                     )
    parser.add_argument("--maxslide",type=int,default=10,
                        help="Maximum number of nt to search 5\' and 3\' of intron"+
                             " boundaries (Default: 10)")
    parser.add_argument("--ref",type=str,metavar="ref.bed",default=None,
                        help="Reference file describing known splice junctions")
    parser.add_argument("--slide_canonical",action="store_true",default=False,
                        help="Slide junctions to canonical junctions if present within equal support region")
    parser.add_argument("infile",type=str,metavar="input.bed",
                        help="BED file describing discovered junctions")
    parser.add_argument("outbase",type=str,
                        help="Basename for output files")
    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)
    
    printer.write("Opening genome from %s..." % args.sequence_file)
    genome = sp.get_seqdict_from_args(args)
    
    # load crossmap    
    cross_hash = mp.get_genome_hash_from_args(args)

    # load ref junctions
    if args.ref is not None:
        printer.write("Loading reference junctions from %s" % args.ref)
        known_hash = GenomeHash(list(BED_Reader(open(args.ref))),do_copy=False)
    else:
        known_hash = GenomeHash()

    # set up variables    
    canonicals_plus = [("GT","AG"),
                       ("GC","AG")
                      ]
    
    canonicals_minus = [("CT","AC"),
                        ("CT","GC")
                       ]
    
    known_in_range     = 0
    canonical_in_range = 0
    repetitive         = 0
    untouched          = 0
    c = 0
    
    seen_already = []

    outfiles = {
                 "repetitive" : "%s_repetitive.bed" % args.outbase,
                 "known"      : "%s_shifted_known.bed" % args.outbase,
                 "canonical"  : "%s_shifted_canonical.bed" % args.outbase,
                 "untouched"  : "%s_untouched.bed" % args.outbase,
                }
    outfiles = { K : argsopener(V,args,"w") for K,V in outfiles.items() }

    # process data
    printer.write("Opening junctions from %s..." % args.infile)
    for ivc in BED_Reader(CommentReader(opener(args.infile))):
        processed = False
        tup = None

        if c % 1000 == 0 and c > 0:
            printer.write("Processed: %s\tknown: %s\tshifted to canonical: %s\trepetitive: %s\tuntouched: %s" % \
                    (c, known_in_range, canonical_in_range, repetitive, untouched))
                   
        assert len(ivc) == 2
        strand = ivc.strand
        
        minus_range, plus_range = find_match_range(ivc,genome,args.maxslide)
        
        # see if either end of splice junction +- match_range lands in repetitive areas of genome
        if covered_by_repetitive(ivc,minus_range,plus_range,cross_hash):
            repetitive += 1
            outfiles["repetitive"].write(ivc.as_bed())
            processed = True

        # see if one or more known junctions in range
        if processed == False and args.ref is not None:
            # find_known_in_range(query_ivc,minus_range,plus_range,knownjunctions)
            known_juncs = find_known_in_range(ivc,minus_range,plus_range,known_hash.get_nearby_features(ivc))
            if len(known_juncs) > 0:
                known_in_range += 1
                for my_known in known_juncs:
                    tup = get_junction_tuple(my_known)
                    if tup not in seen_already:
                        outfiles["known"].write(my_known.as_bed())
                        seen_already.append(tup)
                    
                processed = True
            
        # see if one or more canonical junctions in range
        if processed == False and args.slide_canonical == True:
            canonicals = canonicals_plus if strand == "+" else canonicals_minus
            #find_canonicals_in_range(query_ivc,minus_range,plus_range,genome,canonicals)
            canonical_juncs = find_canonicals_in_range(ivc,minus_range,plus_range,genome,canonicals)
            if len(canonical_juncs) > 0:
                canonical_in_range += 1
                for can in canonical_juncs:
                    tup = get_junction_tuple(can)
                    if tup not in seen_already:
                        outfiles["canonical"].write(can.as_bed())
                        seen_already.append(tup)

                processed = True
                    
        if processed == False:
            outfiles["untouched"].write(ivc.as_bed())
            untouched += 1
            
        c += 1

    # save output
    printer.write("Totals: %s\tknown: %s\tshifted to canonical: %s\trepetitive: %s\tuntouched: %s" % \
            (c, known_in_range, canonical_in_range, repetitive, untouched))    

    for v in outfiles.values():
        v.close()
    
    printer.write("Done.")