예제 #1
0
파일: idr.py 프로젝트: Simon-Coetzee/idr
def main():
    """Run the IDR pipeline from the command line.

    Parses the arguments, loads and merges the peak files, ranks the merged
    peaks and, unless ``args.only_merge_peaks`` is set, fits the model and
    computes the local and global IDR values (optionally plotting them).
    The results are then written to ``args.output_file``.

    Raises:
        ValueError: If fewer than 20 peaks remain after merging. The merged
            peaks are written to the output file before the error is raised.
    """
    args = parse_args()

    # args.output_file is an open handle: close it on every exit path so the
    # merged peaks promised by the error hint below are flushed and the
    # handle is not leaked when a later step raises.
    try:
        # load and merge peaks
        merged_peaks, signal_type = load_samples(args)
        s1 = numpy.array([pk.signals[0] for pk in merged_peaks])
        s2 = numpy.array([pk.signals[1] for pk in merged_peaks])

        # build the ranks vector
        idr.log("Ranking peaks", 'VERBOSE')
        r1, r2 = build_rank_vectors(merged_peaks)

        if args.only_merge_peaks:
            localIDRs, IDRs = None, None
        else:
            if len(merged_peaks) < 20:
                error_msg = (
                    "Peak files must contain at least 20 peaks post-merge"
                    "\nHint: Merged peaks were written to the output file")
                write_results_to_file(
                    merged_peaks, args.output_file,
                    args.output_file_type, signal_type)
                raise ValueError(error_msg)

            localIDRs = fit_model_and_calc_local_idr(
                r1, r2,
                starting_point=(
                    args.initial_mu, args.initial_sigma,
                    args.initial_rho, args.initial_mix_param),
                max_iter=args.max_iter,
                convergence_eps=args.convergence_eps,
                fix_mu=args.fix_mu, fix_sigma=args.fix_sigma)

            if args.use_best_multisummit_IDR:
                localIDRs = correct_multi_summit_peak_IDR_values(
                    localIDRs, merged_peaks)
            IDRs = calc_global_IDR(localIDRs)

            if args.plot:
                assert len(args.samples) == 2
                plot(args, [s1, s2], [r1, r2], IDRs)

        write_results_to_file(
            merged_peaks,
            args.output_file,
            args.output_file_type,
            signal_type,
            localIDRs=localIDRs,
            IDRs=IDRs,
            max_allowed_idr=args.idr_threshold,
            soft_max_allowed_idr=args.soft_idr_threshold,
            useBackwardsCompatibleOutput=args.use_old_output_format)
    finally:
        args.output_file.close()
예제 #2
0
파일: idr.py 프로젝트: wwang-chcn/idr
def main():
    """Run the IDR pipeline from the command line.

    Parses the arguments, loads and merges the peak files, ranks the merged
    peaks and, unless ``args.only_merge_peaks`` is set, fits the model and
    computes the local and global IDR values (optionally plotting them).
    The results are then written to ``args.output_file`` using the input
    file type as the output format.

    Raises:
        ValueError: If fewer than 20 peaks remain after merging. The merged
            peaks are written to the output file before the error is raised.
    """
    args = parse_args()

    # args.output_file is an open handle: close it on every exit path so the
    # merged peaks promised by the error hint below are flushed and the
    # handle is not leaked when a later step raises.
    try:
        # load and merge peaks
        merged_peaks, signal_type = load_samples(args)
        s1 = numpy.array([pk.signals[0] for pk in merged_peaks])
        s2 = numpy.array([pk.signals[1] for pk in merged_peaks])

        # build the ranks vector
        idr.log("Ranking peaks", 'VERBOSE')
        r1, r2 = build_rank_vectors(merged_peaks)

        if args.only_merge_peaks:
            localIDRs, IDRs = None, None
        else:
            if len(merged_peaks) < 20:
                error_msg = (
                    "Peak files must contain at least 20 peaks post-merge"
                    "\nHint: Merged peaks were written to the output file")
                write_results_to_file(merged_peaks, args.output_file,
                                      args.input_file_type, signal_type)
                raise ValueError(error_msg)

            localIDRs, IDRs = fit_model_and_calc_idr(
                r1,
                r2,
                starting_point=(args.initial_mu, args.initial_sigma,
                                args.initial_rho, args.initial_mix_param),
                max_iter=args.max_iter,
                convergence_eps=args.convergence_eps,
                fix_mu=args.fix_mu,
                fix_sigma=args.fix_sigma)

            if args.plot:
                assert len(args.samples) == 2
                plot(args, [s1, s2], [r1, r2], IDRs)

        write_results_to_file(
            merged_peaks,
            args.output_file,
            args.input_file_type,
            signal_type,
            localIDRs=localIDRs,
            IDRs=IDRs,
            max_allowed_idr=args.idr_threshold,
            soft_max_allowed_idr=args.soft_idr_threshold,
            useBackwardsCompatibleOutput=args.use_old_output_format)
    finally:
        args.output_file.close()
예제 #3
0
파일: idr.py 프로젝트: wwang-chcn/idr
def fit_model_and_calc_idr(r1,
                           r2,
                           starting_point=None,
                           max_iter=idr.MAX_ITER_DEFAULT,
                           convergence_eps=idr.CONVERGENCE_EPS_DEFAULT,
                           fix_mu=False,
                           fix_sigma=False):
    """Fit the model parameters and compute the local and global IDRs.

    Args:
        r1: First sample's ranks; passed to ``estimate_model_params`` and
            ``calc_IDR``.
        r2: Second sample's ranks; passed to ``estimate_model_params`` and
            ``calc_IDR``.
        starting_point: Initial ``(mu, sigma, rho, mix_param)`` values. The
            module-level defaults are used when ``None``.
        max_iter: Forwarded to ``estimate_model_params``.
        convergence_eps: Forwarded to ``estimate_model_params``.
        fix_mu: Forwarded to ``estimate_model_params``.
        fix_sigma: Forwarded to ``estimate_model_params``.

    Returns:
        The ``(localIDRs, IDRs)`` pair computed by ``calc_IDR`` from the
        fitted parameters.

    Note:
        When ``idr.PROFILE`` is set, the fit is run once under ``cProfile``
        and the function then stops on ``assert False`` instead of
        returning.
    """
    # No search for a good starting point is attempted yet, so fall back to
    # the fixed defaults when the caller did not supply one.
    if starting_point is None:
        starting_point = (DEFAULT_MU, DEFAULT_SIGMA, DEFAULT_RHO,
                          DEFAULT_MIX_PARAM)

    initial_values = " ".join("%.2f" % value for value in starting_point)
    idr.log("Initial parameter values: [%s]" % initial_values)

    # fit the model parameters
    idr.log("Fitting the model parameters", 'VERBOSE')
    fit_options = dict(max_iter=max_iter,
                       convergence_eps=convergence_eps,
                       fix_mu=fix_mu,
                       fix_sigma=fix_sigma)
    if idr.PROFILE:
        import cProfile
        profiled_call = """theta, loss = estimate_model_params(
                                    r1,r2,
                                    starting_point, 
                                    max_iter=max_iter, 
                                    convergence_eps=convergence_eps,
                                    fix_mu=fix_mu, fix_sigma=fix_sigma)
                                   """
        profile_globals = {'estimate_model_params': estimate_model_params}
        profile_locals = dict(fit_options,
                              r1=r1,
                              r2=r2,
                              starting_point=starting_point)
        cProfile.runctx(profiled_call, profile_globals, profile_locals)
        # profiling runs are not meant to continue past the fit
        assert False
    theta, _loss = estimate_model_params(r1, r2, starting_point,
                                         **fit_options)

    idr.log("Finished running IDR on the datasets", 'VERBOSE')
    final_values = " ".join("%.2f" % value for value in theta)
    idr.log("Final parameter values: [%s]" % final_values)

    # calculate the local and global IDR values from the fitted parameters
    local_idrs, global_idrs = calc_IDR(numpy.array(theta), r1, r2)
    return local_idrs, global_idrs
예제 #4
0
파일: idr.py 프로젝트: wwang-chcn/idr
def fit_model_and_calc_idr(r1, r2,
                           starting_point=None,
                           max_iter=idr.MAX_ITER_DEFAULT,
                           convergence_eps=idr.CONVERGENCE_EPS_DEFAULT,
                           fix_mu=False, fix_sigma=False):
    """Estimate the model parameters, then derive local and global IDRs.

    ``starting_point`` is the initial ``(mu, sigma, rho, mix_param)`` tuple
    and defaults to the module-level ``DEFAULT_*`` constants. ``max_iter``,
    ``convergence_eps``, ``fix_mu`` and ``fix_sigma`` are handed to
    ``estimate_model_params`` unchanged.

    Returns the ``(localIDRs, IDRs)`` pair computed by ``calc_IDR`` from the
    fitted parameters and the two rank inputs ``r1`` and ``r2``.

    If ``idr.PROFILE`` is set, the fit is executed once under ``cProfile``
    and the function then stops on ``assert False`` rather than returning.
    """
    def _format_params(params):
        # space separated values with two decimals, as shown in the log
        return " ".join("%.2f" % param for param in params)

    # A smarter starting point search could go here; for now the fixed
    # defaults are used whenever the caller gives none.
    if starting_point is None:
        starting_point = (DEFAULT_MU, DEFAULT_SIGMA,
                          DEFAULT_RHO, DEFAULT_MIX_PARAM)

    idr.log("Initial parameter values: [%s]" % _format_params(starting_point))

    # fit the model parameters
    idr.log("Fitting the model parameters", 'VERBOSE')
    estimation_kwargs = {
        'max_iter': max_iter,
        'convergence_eps': convergence_eps,
        'fix_mu': fix_mu,
        'fix_sigma': fix_sigma,
    }
    if idr.PROFILE:
        import cProfile
        namespace = {'r1': r1, 'r2': r2, 'starting_point': starting_point}
        namespace.update(estimation_kwargs)
        cProfile.runctx("""theta, loss = estimate_model_params(
                                    r1,r2,
                                    starting_point, 
                                    max_iter=max_iter, 
                                    convergence_eps=convergence_eps,
                                    fix_mu=fix_mu, fix_sigma=fix_sigma)
                                   """,
                        {'estimate_model_params': estimate_model_params},
                        namespace)
        # a profiling run deliberately ends here
        assert False
    theta, _loss = estimate_model_params(
        r1, r2, starting_point, **estimation_kwargs)

    idr.log("Finished running IDR on the datasets", 'VERBOSE')
    idr.log("Final parameter values: [%s]" % _format_params(theta))

    # calculate the local and global IDR values
    local_values, global_values = calc_IDR(numpy.array(theta), r1, r2)
    return local_values, global_values
예제 #5
0
파일: idr.py 프로젝트: Simon-Coetzee/idr
def write_results_to_file(merged_peaks, output_file,
                          output_file_type, signal_type,
                          max_allowed_idr=1.0,
                          soft_max_allowed_idr=1.0,
                          localIDRs=None, IDRs=None,
                          useBackwardsCompatibleOutput=False):
    """Write one line per reported peak and log summary statistics.

    Args:
        merged_peaks: Sized collection of merged peaks to report.
        output_file: Writable text file object the lines are printed to.
        output_file_type: Passed unchanged to the output line builder.
        signal_type: Passed unchanged to the output line builder.
        max_allowed_idr: Hard threshold. Peaks whose global IDR is greater
            than this value are not written. ``None`` disables the filter.
        soft_max_allowed_idr: Soft threshold. It only affects the logged
            count of peaks passing the cutoff, never which peaks are
            written. ``None`` means every written peak counts as passing.
        localIDRs: Per-peak local IDR values aligned with ``merged_peaks``.
        IDRs: Per-peak global IDR values aligned with ``merged_peaks``.
            ``localIDRs`` and ``IDRs`` must be given together; when both are
            ``None`` every peak is reported with a value of 1.0.
        useBackwardsCompatibleOutput: Select the backwards compatible line
            builder instead of the bed6-based one.

    Returns:
        None.
    """
    if useBackwardsCompatibleOutput:
        build_idr_output_line = build_backwards_compatible_idr_output_line
    else:
        build_idr_output_line = build_idr_output_line_with_bed6

    # write out the result
    idr.log("Writing results to file", "VERBOSE")

    # no IDR values supplied: report every peak with an IDR of 1.0
    if localIDRs is None or IDRs is None:
        assert IDRs is None
        assert localIDRs is None
        localIDRs = numpy.ones(len(merged_peaks))
        IDRs = numpy.ones(len(merged_peaks))

    num_peaks_passing_hard_thresh = 0
    num_peaks_passing_soft_thresh = 0
    for localIDR, IDR, merged_peak in zip(localIDRs, IDRs, merged_peaks):
        # skip peaks whose global IDR is above the hard threshold
        if max_allowed_idr is not None and IDR > max_allowed_idr:
            continue
        num_peaks_passing_hard_thresh += 1
        # like the hard threshold, a soft threshold of None means no cutoff
        if soft_max_allowed_idr is None or IDR <= soft_max_allowed_idr:
            num_peaks_passing_soft_thresh += 1
        opline = build_idr_output_line(
            merged_peak, IDR, localIDR, output_file_type, signal_type)
        print(opline, file=output_file)

    # without peaks there is nothing to summarise and the percentages below
    # would divide by zero
    num_peaks = len(merged_peaks)
    if num_peaks == 0:
        return

    idr.log(
        "Number of reported peaks - {}/{} ({:.1f}%)\n".format(
            num_peaks_passing_hard_thresh, num_peaks,
            100*float(num_peaks_passing_hard_thresh)/num_peaks)
    )

    idr.log(
        "Number of peaks passing IDR cutoff of {} - {}/{} ({:.1f}%)\n".format(
            soft_max_allowed_idr,
            num_peaks_passing_soft_thresh, num_peaks,
            100*float(num_peaks_passing_soft_thresh)/num_peaks)
    )

    return
예제 #6
0
def write_results_to_file(merged_peaks, output_file,
                          output_file_type, signal_type,
                          max_allowed_idr=1.0,
                          soft_max_allowed_idr=1.0,
                          localIDRs=None, IDRs=None,
                          useBackwardsCompatibleOutput=False):
    """Write one line per reported peak and log summary statistics.

    Args:
        merged_peaks: Sized collection of merged peaks to report.
        output_file: Writable text file object the lines are printed to.
        output_file_type: Passed unchanged to the output line builder.
        signal_type: Passed unchanged to the output line builder.
        max_allowed_idr: Hard threshold. Peaks whose global IDR is greater
            than this value are not written. ``None`` disables the filter.
        soft_max_allowed_idr: Soft threshold. It only affects the logged
            count of peaks passing the cutoff, never which peaks are
            written. ``None`` means every written peak counts as passing.
        localIDRs: Per-peak local IDR values aligned with ``merged_peaks``.
        IDRs: Per-peak global IDR values aligned with ``merged_peaks``.
            ``localIDRs`` and ``IDRs`` must be given together; when both are
            ``None`` every peak is reported with a value of 1.0.
        useBackwardsCompatibleOutput: Select the backwards compatible line
            builder instead of the bed6-based one.

    Returns:
        None.
    """
    if useBackwardsCompatibleOutput:
        build_idr_output_line = build_backwards_compatible_idr_output_line
    else:
        build_idr_output_line = build_idr_output_line_with_bed6

    # write out the result
    idr.log("Writing results to file", "VERBOSE")

    # no IDR values supplied: report every peak with an IDR of 1.0
    if localIDRs is None or IDRs is None:
        assert IDRs is None
        assert localIDRs is None
        localIDRs = numpy.ones(len(merged_peaks))
        IDRs = numpy.ones(len(merged_peaks))

    num_peaks_passing_hard_thresh = 0
    num_peaks_passing_soft_thresh = 0
    for localIDR, IDR, merged_peak in zip(localIDRs, IDRs, merged_peaks):
        # skip peaks whose global IDR is above the hard threshold
        if max_allowed_idr is not None and IDR > max_allowed_idr:
            continue
        num_peaks_passing_hard_thresh += 1
        # like the hard threshold, a soft threshold of None means no cutoff
        if soft_max_allowed_idr is None or IDR <= soft_max_allowed_idr:
            num_peaks_passing_soft_thresh += 1
        opline = build_idr_output_line(
            merged_peak, IDR, localIDR, output_file_type, signal_type)
        print(opline, file=output_file)

    # without peaks there is nothing to summarise and the percentages below
    # would divide by zero
    num_peaks = len(merged_peaks)
    if num_peaks == 0:
        return

    idr.log(
        "Number of reported peaks - {}/{} ({:.1f}%)\n".format(
            num_peaks_passing_hard_thresh, num_peaks,
            100*float(num_peaks_passing_hard_thresh)/num_peaks)
    )

    idr.log(
        "Number of peaks passing IDR cutoff of {} - {}/{} ({:.1f}%)\n".format(
            soft_max_allowed_idr,
            num_peaks_passing_soft_thresh, num_peaks,
            100*float(num_peaks_passing_soft_thresh)/num_peaks)
    )

    return
예제 #7
0
파일: idr.py 프로젝트: wwang-chcn/idr
def load_samples(args):
    """Load the two sample peak files and merge them into one peak set.

    Reads ``args.input_file_type``, ``args.rank``, ``args.peak_merge_method``,
    ``args.samples``, ``args.peak_list`` and ``args.use_nonoverlapping_peaks``.

    The ranking column defaults to ``signal.value`` for narrowPeak/broadPeak
    files and to ``score`` for bed files. For bed files ``args.rank`` may
    also be a 1-based column index. Unless ``args.peak_merge_method`` is
    given, overlapping peaks are merged with ``sum``, except for p/q-value
    rankings, which use ``min``.

    Returns:
        A ``(merged_peaks, signal_type)`` tuple: the result of
        ``merge_peaks`` and the name of the signal used for ranking.

    Raises:
        ValueError: If the file type is not recognized or ``args.rank`` is
            not valid for the file type.
    """
    idr.log("Loading the peak files", 'VERBOSE')
    input_type = args.input_file_type

    # decide which column to rank by and which aggregation function to use
    # by default for peaks that need to be merged
    if input_type in ('narrowPeak', 'broadPeak'):
        signal_type = 'signal.value' if args.rank is None else args.rank
        try:
            signal_index = {
                "score": 4,
                "signal.value": 6,
                "p.value": 7,
                "q.value": 8
            }[signal_type]
        except KeyError:
            raise ValueError(
                "Unrecognized signal type for {} filetype: '{}'".format(
                    input_type, signal_type))

        # score and signal value are summed, p/q-values take the minimum
        default_merge_fn = sum if signal_index in (4, 6) else min
        # only narrowPeak files are loaded with a summit column
        summit_index = 9 if input_type == 'narrowPeak' else None
        load_args = (signal_index, summit_index)
    elif input_type == 'bed':
        # An unset rank means "score". Resolve the default before parsing,
        # otherwise int(None) raises an uncaught TypeError.
        rank = 'score' if args.rank is None else args.rank
        if rank == 'score':
            signal_type = 'score'
            signal_index = 4
        else:
            try:
                signal_index = int(rank) - 1
            except (TypeError, ValueError):
                raise ValueError("For bed files --rank must either "
                                 "be set to score or an index specifying "
                                 "the column to use.")
            signal_type = "COL%i" % (signal_index + 1)

        default_merge_fn = sum
        load_args = (signal_index,)
    else:
        raise ValueError("Unrecognized file type: '{}'".format(input_type))

    # an explicitly requested merge method overrides the default
    if args.peak_merge_method is None:
        peak_merge_fn = default_merge_fn
    else:
        peak_merge_fn = {
            "sum": sum,
            "avg": mean,
            "min": min,
            "max": max
        }[args.peak_merge_method]

    f1, f2 = [load_bed(fp, *load_args) for fp in args.samples]
    oracle_pks = (load_bed(args.peak_list, *load_args)
                  if args.peak_list is not None else None)

    # build a unified peak set
    idr.log("Merging peaks", 'VERBOSE')
    merged_peaks = merge_peaks([f1, f2], peak_merge_fn, oracle_pks,
                               args.use_nonoverlapping_peaks)
    return merged_peaks, signal_type
예제 #8
0
파일: idr.py 프로젝트: wwang-chcn/idr
def parse_args():
    """Parse the command line and apply the resulting global settings.

    Besides building the ``argparse`` namespace, this function has side
    effects:

    * the ``--output-file`` name is replaced on the namespace by a file
      handle opened for writing;
    * ``idr.log_ofp``, ``idr.VERBOSE`` and ``idr.QUIET`` are set from the
      logging options;
    * the hard and soft IDR thresholds are resolved to concrete values;
    * numpy's random number generator is seeded with ``--random-seed``;
    * plotting is switched off, with a warning, when matplotlib cannot be
      imported.

    Returns:
        The populated ``argparse.Namespace``.
    """
    import argparse

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        description="""
Program: IDR (Irreproducible Discovery Rate)
Version: {PACKAGE_VERSION}
Contact: Nathan Boley <*****@*****.**>
""".format(PACKAGE_VERSION=idr.__version__))

    def PossiblyGzippedFile(fname):
        # argparse "type" callable: open plain or gzip-compressed text input
        if fname.endswith(".gz"):
            return io.TextIOWrapper(gzip.open(fname, 'rb'))
        else:
            return open(fname, 'r')

    parser.add_argument('--samples',
                        '-s',
                        type=PossiblyGzippedFile,
                        nargs=2,
                        required=True,
                        help='Files containing peaks and scores.')
    parser.add_argument(
        '--peak-list',
        '-p',
        type=PossiblyGzippedFile,
        help='If provided, all peaks will be taken from this file.')
    parser.add_argument('--input-file-type',
                        default='narrowPeak',
                        choices=['narrowPeak', 'broadPeak', 'bed'],
                        help='File type of --samples and --peak-list.')

    parser.add_argument(
        '--rank',
        help="Which column to use to rank peaks."
             "\t\nOptions: signal.value p.value q.value columnIndex"
             "\nDefaults:\n\tnarrowPeak/broadPeak: signal.value\n\tbed: score")

    default_ofname = "idrValues.txt"
    parser.add_argument(
        '--output-file',
        "-o",
        default=default_ofname,
        help='File to write output to.\nDefault: {}'.format(default_ofname))

    parser.add_argument('--log-output-file',
                        "-l",
                        type=argparse.FileType("w"),
                        default=sys.stderr,
                        help='File to write output to. Default: stderr')

    parser.add_argument(
        '--idr-threshold', "-i", type=float, default=None,
        help="Only return peaks with a global idr threshold below this value."
             "\nDefault: report all peaks")
    parser.add_argument(
        '--soft-idr-threshold', type=float, default=None,
        help=("Report statistics for peaks with a global idr below this "
              "value but return all peaks with an idr below --idr."
              "\nDefault: %.2f") % idr.DEFAULT_SOFT_IDR_THRESH)

    parser.add_argument('--use-old-output-format',
                        action='store_true',
                        default=False,
                        help="Use old output format.")

    parser.add_argument('--plot',
                        action='store_true',
                        default=False,
                        help='Plot the results to [OFNAME].png')

    parser.add_argument(
        '--use-nonoverlapping-peaks',
        action="store_true",
        default=False,
        help='Use peaks without an overlapping match and set the value to 0.')

    parser.add_argument(
        '--peak-merge-method',
        choices=["sum", "avg", "min", "max"], default=None,
        help="Which method to use for merging peaks.\n"
             "\tDefault: 'sum' for signal/score/column indexes, "
             "'min' for p/q-value.")

    parser.add_argument('--initial-mu',
                        type=float,
                        default=idr.DEFAULT_MU,
                        help="Initial value of mu. Default: %.2f" %
                        idr.DEFAULT_MU)
    parser.add_argument('--initial-sigma',
                        type=float,
                        default=idr.DEFAULT_SIGMA,
                        help="Initial value of sigma. Default: %.2f" %
                        idr.DEFAULT_SIGMA)
    parser.add_argument('--initial-rho',
                        type=float,
                        default=idr.DEFAULT_RHO,
                        help="Initial value of rho. Default: %.2f" %
                        idr.DEFAULT_RHO)
    parser.add_argument(
        '--initial-mix-param',
        type=float, default=idr.DEFAULT_MIX_PARAM,
        help="Initial value of the mixture params. Default: %.2f"
             % idr.DEFAULT_MIX_PARAM)

    parser.add_argument(
        '--fix-mu',
        action='store_true',
        help="Fix mu to the starting point and do not let it vary.")
    parser.add_argument(
        '--fix-sigma',
        action='store_true',
        help="Fix sigma to the starting point and do not let it vary.")

    parser.add_argument(
        '--random-seed',
        type=int,
        default=0,
        help="The random seed value (for breaking ties). Default: 0")
    parser.add_argument(
        '--max-iter',
        type=int,
        default=idr.MAX_ITER_DEFAULT,
        help="The maximum number of optimization iterations. Default: %i" %
        idr.MAX_ITER_DEFAULT)
    parser.add_argument(
        '--convergence-eps', type=float,
        default=idr.CONVERGENCE_EPS_DEFAULT,
        help=("The maximum change in parameter value changes "
              "for convergence. Default: %.2e") % idr.CONVERGENCE_EPS_DEFAULT)

    parser.add_argument('--only-merge-peaks',
                        action='store_true',
                        help="Only return the merged peak list.")

    parser.add_argument('--verbose',
                        action="store_true",
                        default=False,
                        help="Print out additional debug information")
    parser.add_argument('--quiet',
                        action="store_true",
                        default=False,
                        help="Don't print any status messages")

    parser.add_argument('--version',
                        action='version',
                        version='IDR %s' % idr.__version__)

    args = parser.parse_args()

    # replace the output file name by an open, writable handle
    args.output_file = open(args.output_file, "w")
    idr.log_ofp = args.log_output_file

    if args.verbose:
        idr.VERBOSE = True

    # --quiet takes precedence over --verbose
    if args.quiet:
        idr.QUIET = True
        idr.VERBOSE = False

    # Resolve the thresholds: an unset soft threshold follows the hard one
    # (or its own default when neither is given), and an unset hard
    # threshold falls back to the default of 1.0.
    assert idr.DEFAULT_IDR_THRESH == 1.0
    if args.idr_threshold is None and args.soft_idr_threshold is None:
        args.idr_threshold = idr.DEFAULT_IDR_THRESH
        args.soft_idr_threshold = idr.DEFAULT_SOFT_IDR_THRESH
    elif args.soft_idr_threshold is None:
        args.soft_idr_threshold = args.idr_threshold
    elif args.idr_threshold is None:
        args.idr_threshold = idr.DEFAULT_IDR_THRESH

    numpy.random.seed(args.random_seed)

    if args.plot:
        # the import only checks that matplotlib is available
        try:
            import matplotlib
        except ImportError:
            idr.log("WARNING: matplotlib does not appear to be installed and "
                    "is required for plotting - turning plotting off.",
                    level="WARNING")
            args.plot = False

    return args
예제 #9
0
파일: idr.py 프로젝트: Simon-Coetzee/idr
def load_samples(args):
    """Load the two sample peak files and merge them into one peak set.

    Reads ``args.input_file_type``, ``args.rank``, ``args.peak_merge_method``,
    ``args.samples``, ``args.peak_list`` and ``args.use_nonoverlapping_peaks``.

    The ranking column defaults to ``signal.value`` for narrowPeak/broadPeak
    files and to ``score`` for bed and gff files. For bed files
    ``args.rank`` may also be a 1-based column index; gff files can only be
    ranked by ``score``. Unless ``args.peak_merge_method`` is given,
    overlapping peaks are merged with ``sum``, except for p/q-value
    rankings, which use ``min``.

    Returns:
        A ``(merged_peaks, signal_type)`` tuple: the result of
        ``merge_peaks`` and the name of the signal used for ranking.

    Raises:
        ValueError: If the file type is not recognized or ``args.rank`` is
            not valid for the file type.
    """
    idr.log("Loading the peak files", 'VERBOSE')
    input_type = args.input_file_type

    # decide which column to rank by, how to load the files and which
    # aggregation function to use by default for peaks that need merging
    if input_type in ('narrowPeak', 'broadPeak'):
        signal_type = 'signal.value' if args.rank is None else args.rank
        try:
            signal_index = {"score": 4, "signal.value": 6,
                            "p.value": 7, "q.value": 8}[signal_type]
        except KeyError:
            raise ValueError(
                "Unrecognized signal type for {} filetype: '{}'".format(
                    input_type, signal_type))

        # score and signal value are summed, p/q-values take the minimum
        default_merge_fn = sum if signal_index in (4, 6) else min
        # only narrowPeak files are loaded with a summit column
        summit_index = 9 if input_type == 'narrowPeak' else None
        load_peaks = load_bed
        load_args = (signal_index, summit_index)
    elif input_type == 'bed':
        # An unset rank means "score". Resolve the default before parsing,
        # otherwise int(None) raises an uncaught TypeError.
        rank = 'score' if args.rank is None else args.rank
        if rank == 'score':
            signal_type = 'score'
            signal_index = 4
        else:
            try:
                signal_index = int(rank) - 1
            except (TypeError, ValueError):
                raise ValueError("For bed files --rank must either "
                                 "be set to score or an index specifying "
                                 "the column to use.")
            signal_type = "COL%i" % (signal_index + 1)

        default_merge_fn = sum
        load_peaks = load_bed
        load_args = (signal_index,)
    elif input_type == 'gff':
        # Only the score can be used for gff files. Set signal_type for an
        # explicit "score" as well, so it is always bound at the return.
        if args.rank not in (None, 'score'):
            raise ValueError("For gff files --rank must be set to score.")
        signal_type = 'score'

        default_merge_fn = sum
        load_peaks = load_gff
        load_args = ()
    else:
        raise ValueError("Unrecognized file type: '{}'".format(input_type))

    # an explicitly requested merge method overrides the default
    if args.peak_merge_method is None:
        peak_merge_fn = default_merge_fn
    else:
        peak_merge_fn = {
            "sum": sum, "avg": mean, "min": min, "max": max}[
                args.peak_merge_method]

    f1, f2 = [load_peaks(fp, *load_args) for fp in args.samples]
    oracle_pks = (
        load_peaks(args.peak_list, *load_args)
        if args.peak_list is not None else None)

    # build a unified peak set
    idr.log("Merging peaks", 'VERBOSE')
    merged_peaks = merge_peaks([f1, f2], peak_merge_fn,
                               oracle_pks, args.use_nonoverlapping_peaks)
    return merged_peaks, signal_type
예제 #10
0
파일: idr.py 프로젝트: Simon-Coetzee/idr
def parse_args():
    """Parse the command line and apply the resulting global settings.

    Besides building the ``argparse`` namespace, this function has side
    effects:

    * the ``--output-file`` name is replaced on the namespace by a file
      handle opened for writing;
    * ``args.output_file_type`` is defaulted from the input file type
      (``bed`` for input types that are not valid output types);
    * ``idr.log_ofp``, ``idr.VERBOSE``, ``idr.QUIET``,
      ``idr.FILTER_PEAKS_BELOW_NOISE_MEAN`` and
      ``idr.ONLY_ALLOW_NON_NEGATIVE_VALUES`` are set from the options;
    * the hard and soft IDR thresholds are resolved to concrete values;
    * numpy's random number generator is seeded with ``--random-seed``;
    * plotting is switched off, with a warning, when matplotlib cannot be
      imported.

    Returns:
        The populated ``argparse.Namespace``.
    """
    import argparse

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        description="""
Program: IDR (Irreproducible Discovery Rate)
Version: {PACKAGE_VERSION}
Contact: Nathan Boley <*****@*****.**>
""".format(PACKAGE_VERSION=idr.__version__))

    def PossiblyGzippedFile(fname):
        # argparse "type" callable: open plain or gzip-compressed text input
        if fname.endswith(".gz"):
            return io.TextIOWrapper(gzip.open(fname, 'rb'))
        else:
            return open(fname, 'r')

    parser.add_argument('--samples', '-s', type=PossiblyGzippedFile, nargs=2,
                        required=True,
                        help='Files containing peaks and scores.')
    parser.add_argument(
        '--peak-list', '-p', type=PossiblyGzippedFile,
        help='If provided, all peaks will be taken from this file.')
    parser.add_argument(
        '--input-file-type', default='narrowPeak',
        choices=['narrowPeak', 'broadPeak', 'bed', 'gff'],
        help='File type of --samples and --peak-list.')

    parser.add_argument(
        '--rank',
        help="Which column to use to rank peaks."
             "\t\nOptions: signal.value p.value q.value columnIndex"
             "\nDefaults:\n\tnarrowPeak/broadPeak: signal.value\n\tbed: score")

    default_ofname = "idrValues.txt"
    parser.add_argument(
        '--output-file', "-o",
        default=default_ofname,
        help='File to write output to.\nDefault: {}'.format(default_ofname))
    parser.add_argument(
        '--output-file-type',
        choices=['narrowPeak', 'broadPeak', 'bed'],
        default=None,
        help='Output file type. Defaults to input file type when available, '
             'otherwise bed.')

    parser.add_argument('--log-output-file', "-l",
                        type=argparse.FileType("w"),
                        default=sys.stderr,
                        help='File to write output to. Default: stderr')

    parser.add_argument(
        '--idr-threshold', "-i", type=float, default=None,
        help="Only return peaks with a global idr threshold below this value."
             "\nDefault: report all peaks")
    parser.add_argument(
        '--soft-idr-threshold', type=float, default=None,
        help=("Report statistics for peaks with a global idr below this "
              "value but return all peaks with an idr below --idr."
              "\nDefault: %.2f") % idr.DEFAULT_SOFT_IDR_THRESH)

    parser.add_argument('--use-old-output-format',
                        action='store_true', default=False,
                        help="Use old output format.")

    parser.add_argument('--plot', action='store_true', default=False,
                        help='Plot the results to [OFNAME].png')

    parser.add_argument(
        '--use-nonoverlapping-peaks',
        action="store_true", default=False,
        help='Use peaks without an overlapping match and set the value to 0.')

    parser.add_argument(
        '--peak-merge-method',
        choices=["sum", "avg", "min", "max"], default=None,
        help="Which method to use for merging peaks.\n"
             "\tDefault: 'sum' for signal/score/column indexes, "
             "'min' for p/q-value.")

    parser.add_argument(
        '--initial-mu', type=float, default=idr.DEFAULT_MU,
        help="Initial value of mu. Default: %.2f" % idr.DEFAULT_MU)
    parser.add_argument(
        '--initial-sigma', type=float,
        default=idr.DEFAULT_SIGMA,
        help="Initial value of sigma. Default: %.2f" % idr.DEFAULT_SIGMA)
    parser.add_argument(
        '--initial-rho', type=float, default=idr.DEFAULT_RHO,
        help="Initial value of rho. Default: %.2f" % idr.DEFAULT_RHO)
    parser.add_argument(
        '--initial-mix-param',
        type=float, default=idr.DEFAULT_MIX_PARAM,
        help="Initial value of the mixture params. Default: %.2f"
             % idr.DEFAULT_MIX_PARAM)

    parser.add_argument(
        '--fix-mu', action='store_true',
        help="Fix mu to the starting point and do not let it vary.")
    parser.add_argument(
        '--fix-sigma', action='store_true',
        help="Fix sigma to the starting point and do not let it vary.")

    parser.add_argument(
        '--dont-filter-peaks-below-noise-mean',
        default=False,
        action='store_true',
        help="Allow signal points that are below the noise mean (should "
             "only be used if you know what you are doing).")

    parser.add_argument(
        '--use-best-multisummit-IDR',
        default=False, action='store_true',
        help="Set the IDR value for a group of multi summit peaks (same "
             "chr/start/stop but different summit) to the best value across "
             "all peaks. This is a work around for peak callers that don't "
             "do a good job splitting scores across multi summit peaks.")

    parser.add_argument(
        '--allow-negative-scores',
        default=False,
        action='store_true',
        help="Allow negative values for scores. (should only be used if "
             "you know what you are doing)")

    parser.add_argument(
        '--random-seed', type=int, default=0,
        help="The random seed value (for breaking ties). Default: 0")
    parser.add_argument(
        '--max-iter', type=int, default=idr.MAX_ITER_DEFAULT,
        help="The maximum number of optimization iterations. Default: %i"
             % idr.MAX_ITER_DEFAULT)
    parser.add_argument(
        '--convergence-eps', type=float,
        default=idr.CONVERGENCE_EPS_DEFAULT,
        help=("The maximum change in parameter value changes "
              "for convergence. Default: %.2e") % idr.CONVERGENCE_EPS_DEFAULT)

    parser.add_argument(
        '--only-merge-peaks', action='store_true',
        help="Only return the merged peak list.")

    parser.add_argument('--verbose', action="store_true", default=False,
                        help="Print out additional debug information")
    parser.add_argument('--quiet', action="store_true", default=False,
                        help="Don't print any status messages")

    parser.add_argument('--version', action='version',
                        version='IDR %s' % idr.__version__)

    args = parser.parse_args()

    # replace the output file name by an open, writable handle
    args.output_file = open(args.output_file, "w")
    idr.log_ofp = args.log_output_file

    # gff is accepted as input but is not an output choice, so fall back to
    # bed for it
    if args.output_file_type is None:
        if args.input_file_type in ('narrowPeak', 'broadPeak', 'bed'):
            args.output_file_type = args.input_file_type
        else:
            args.output_file_type = 'bed'

    if args.verbose:
        idr.VERBOSE = True

    # --quiet takes precedence over --verbose
    if args.quiet:
        idr.QUIET = True
        idr.VERBOSE = False

    if args.dont_filter_peaks_below_noise_mean:
        idr.FILTER_PEAKS_BELOW_NOISE_MEAN = False

    if args.allow_negative_scores:
        idr.ONLY_ALLOW_NON_NEGATIVE_VALUES = False

    # Resolve the thresholds: an unset soft threshold follows the hard one
    # (or its own default when neither is given), and an unset hard
    # threshold falls back to the default of 1.0.
    assert idr.DEFAULT_IDR_THRESH == 1.0
    if args.idr_threshold is None and args.soft_idr_threshold is None:
        args.idr_threshold = idr.DEFAULT_IDR_THRESH
        args.soft_idr_threshold = idr.DEFAULT_SOFT_IDR_THRESH
    elif args.soft_idr_threshold is None:
        args.soft_idr_threshold = args.idr_threshold
    elif args.idr_threshold is None:
        args.idr_threshold = idr.DEFAULT_IDR_THRESH

    numpy.random.seed(args.random_seed)

    if args.plot:
        # the import only checks that matplotlib is available
        try:
            import matplotlib
        except ImportError:
            idr.log("WARNING: matplotlib does not appear to be installed and "
                    "is required for plotting - turning plotting off.",
                    level="WARNING")
            args.plot = False

    return args
예제 #11
0
def main():
    """Run the IDR pipeline from the command line.

    Parses the arguments, loads and merges the peak files, ranks the merged
    peaks and, unless ``args.only_merge_peaks`` is set, fits the model and
    computes the local and global IDR values. With
    ``args.use_best_multisummit_IDR`` the local IDRs are corrected first
    and, when plotting, an extra plot restricted to the returned
    ``update_indices`` is produced. The results are then written to
    ``args.output_file``.

    Raises:
        ValueError: If fewer than 20 peaks remain after merging. The merged
            peaks are written to the output file before the error is raised.
    """
    args = parse_args()

    # args.output_file is an open handle: close it on every exit path so the
    # merged peaks promised by the error hint below are flushed and the
    # handle is not leaked when a later step raises.
    try:
        # load and merge peaks
        merged_peaks, signal_type = load_samples(args)
        s1 = numpy.array([pk.signals[0] for pk in merged_peaks])
        s2 = numpy.array([pk.signals[1] for pk in merged_peaks])

        # build the ranks vector
        idr.log("Ranking peaks", 'VERBOSE')
        r1, r2 = build_rank_vectors(merged_peaks)

        if args.only_merge_peaks:
            localIDRs, IDRs = None, None
        else:
            if len(merged_peaks) < 20:
                error_msg = (
                    "Peak files must contain at least 20 peaks post-merge"
                    "\nHint: Merged peaks were written to the output file")
                write_results_to_file(merged_peaks, args.output_file,
                                      args.output_file_type, signal_type)
                raise ValueError(error_msg)

            localIDRs = fit_model_and_calc_local_idr(
                r1,
                r2,
                starting_point=(args.initial_mu, args.initial_sigma,
                                args.initial_rho, args.initial_mix_param),
                max_iter=args.max_iter,
                convergence_eps=args.convergence_eps,
                fix_mu=args.fix_mu,
                fix_sigma=args.fix_sigma)

            # if the user chose to use the best multi summit IDR, then
            # make the correction and plot just the corrected peaks
            if args.use_best_multisummit_IDR:
                update_indices, localIDRs = (
                    correct_multi_summit_peak_IDR_values(
                        localIDRs, merged_peaks))
                IDRs = calc_global_IDR(localIDRs)
                if args.plot:
                    assert len(args.samples) == 2
                    plot(args, [s1[update_indices], s2[update_indices]],
                         [r1[update_indices], r2[update_indices]],
                         IDRs[update_indices],
                         args.output_file.name + ".noalternatesummitpeaks")
            # we wrap this in an else statement to avoid calculating the
            # global IDRs twice
            else:
                IDRs = calc_global_IDR(localIDRs)

            if args.plot:
                assert len(args.samples) == 2
                plot(args, [s1, s2], [r1, r2], IDRs)

        write_results_to_file(
            merged_peaks,
            args.output_file,
            args.output_file_type,
            signal_type,
            localIDRs=localIDRs,
            IDRs=IDRs,
            max_allowed_idr=args.idr_threshold,
            soft_max_allowed_idr=args.soft_idr_threshold,
            useBackwardsCompatibleOutput=args.use_old_output_format)
    finally:
        args.output_file.close()