示例#1
0
def read_states_signals(args):
    """Read the annotated state string and the matching read signals.

    The annotation file ``args.annotate_file`` is read line by line. Lines
    shorter than two characters, or containing ``#`` or ``=``, are skipped.
    Every other line is stripped and split on single spaces; all tokens except
    the first and the last are concatenated, in file order, into one string.

    The signals are obtained from ``GenomicSignal(args.reads_file)`` for the
    interval ``args.chrom:args.start-args.end``, passing the extension and
    shift options from ``args``. A bias table is loaded and forwarded only
    when ``args.bias_table`` is set; it is expected to be two file names
    separated by a comma (forward first, reverse second).

    Returns:
        tuple: ``(states, norm_signal, slope_signal)`` where ``states`` is the
        concatenated state string and the two signals are whatever
        ``GenomicSignal.get_signal`` returns.
    """
    # Collect the state tokens and join once instead of growing a string with
    # ``+=`` inside the loop (which is quadratic in the worst case).
    state_tokens = []
    with open(args.annotate_file) as f:
        for line in f:
            if len(line) < 2 or "#" in line or "=" in line:
                continue
            fields = line.strip().split(" ")
            state_tokens.extend(fields[1:-1])
    states = "".join(state_tokens)

    # Genome data is always needed: it supplies the genome file name below
    genome_data = GenomeData(args.organism)
    table = None

    # If the bias table is provided
    if args.bias_table:
        bias_table = BiasTable()
        bias_table_list = args.bias_table.split(",")
        table = bias_table.load_table(table_file_name_F=bias_table_list[0],
                                      table_file_name_R=bias_table_list[1])

    # Get the normalization and slope signal from the raw bam file
    raw_signal = GenomicSignal(args.reads_file)
    raw_signal.load_sg_coefs(slope_window_size=9)
    norm_signal, slope_signal = \
        raw_signal.get_signal(args.chrom, args.start, args.end,
                              args.downstream_ext, args.upstream_ext,
                              args.forward_shift, args.reverse_shift,
                              bias_table=table, genome_file_name=genome_data.get_genome())
    if args.print_bed_file:
        # NOTE(review): this calls an ``output_bed_file`` attribute on ``args``;
        # confirm the argument object really carries such a callable.
        args.output_bed_file(states)

    return states, norm_signal, slope_signal
示例#2
0
def read_states_signals(args):
    """Read the annotated state string and the matching read signals.

    The annotation file ``args.annotate_file`` is read line by line. Lines
    shorter than two characters, or containing ``#`` or ``=``, are skipped.
    Every other line is stripped and split on single spaces; all tokens except
    the first and the last are concatenated, in file order, into one string.

    The signals are obtained from ``GenomicSignal(args.reads_file)`` for the
    interval ``args.chrom:args.start-args.end``, passing the extension and
    shift options from ``args``. A bias table is loaded and forwarded only
    when ``args.bias_table`` is set; it is expected to be two file names
    separated by a comma (forward first, reverse second).

    Returns:
        tuple: ``(states, norm_signal, slope_signal)`` where ``states`` is the
        concatenated state string and the two signals are whatever
        ``GenomicSignal.get_signal`` returns.
    """
    # Collect the state tokens and join once instead of growing a string with
    # ``+=`` inside the loop (which is quadratic in the worst case).
    state_tokens = []
    with open(args.annotate_file) as f:
        for line in f:
            if len(line) < 2 or "#" in line or "=" in line:
                continue
            fields = line.strip().split(" ")
            state_tokens.extend(fields[1:-1])
    states = "".join(state_tokens)

    # Genome data is always needed: it supplies the genome file name below
    genome_data = GenomeData(args.organism)
    table = None

    # If the bias table is provided
    if args.bias_table:
        bias_table = BiasTable()
        bias_table_list = args.bias_table.split(",")
        table = bias_table.load_table(table_file_name_F=bias_table_list[0],
                                      table_file_name_R=bias_table_list[1])

    # Get the normalization and slope signal from the raw bam file
    raw_signal = GenomicSignal(args.reads_file)
    raw_signal.load_sg_coefs(slope_window_size=9)
    norm_signal, slope_signal = \
        raw_signal.get_signal(args.chrom, args.start, args.end,
                              args.downstream_ext, args.upstream_ext,
                              args.forward_shift, args.reverse_shift,
                              bias_table=table, genome_file_name=genome_data.get_genome())
    if args.print_bed_file:
        # NOTE(review): this calls an ``output_bed_file`` attribute on ``args``;
        # confirm the argument object really carries such a callable.
        args.output_bed_file(states)

    return states, norm_signal, slope_signal
示例#3
0
def _collect_motif_stats(mpbs, mpbs_name_list, fasta, window_size):
    """Return (lengths, site counts, PWMs) per motif, in ``mpbs_name_list`` order."""
    motif_len = list()
    motif_num = list()
    motif_pwm = list()
    for mpbs_name in mpbs_name_list:
        mpbs_regions = mpbs.by_names([mpbs_name])
        # The length is taken from the first site of the motif
        motif_len.append(mpbs_regions[0].final - mpbs_regions[0].initial)
        motif_num.append(len(mpbs_regions))
        motif_pwm.append(get_pwm(fasta, mpbs_regions, window_size))
    return motif_len, motif_num, motif_pwm


def diff_analysis_run(args):
    """Compare motif-centred signal profiles between several conditions.

    Steps performed, all driven by attributes of ``args``:

    1. Create ``<args.output_location>/Lineplots`` if it does not exist.
    2. Split ``args.mpbs_files``, ``args.reads_files`` and ``args.conditions``
       on commas; the three lists must have the same length.
    3. Index every reads file that has no ``<file>.bai`` next to it.
    4. Merge all binding-site files into one sorted, de-duplicated region set
       and derive the list of distinct motif names.
    5. For every (condition, motif) pair compute a signal of length
       ``args.window_size`` with ``get_bc_signal`` (when ``args.bc`` is set,
       using the default ATAC bias tables) or ``get_raw_signal`` otherwise.
       ``args.nc == 1`` runs serially, anything else uses a process pool of
       ``args.nc`` workers per condition.
    6. Normalise the signals by the per-condition factor and the number of
       sites of each motif, then write factors, optional profiles, line
       plots and statistics. A scatter plot is produced only when exactly two
       conditions are given.

    Raises:
        AssertionError: if the numbers of motif files, reads files and
            condition names differ.
    """
    # Initializing Error Handler
    err = ErrorHandler()

    output_location = os.path.join(args.output_location, "Lineplots")
    try:
        if not os.path.isdir(output_location):
            os.makedirs(output_location)
    except Exception:
        err.throw_error("MM_OUT_FOLDER_CREATION")

    # check if they have same length
    mpbs_files = args.mpbs_files.strip().split(",")
    reads_files = args.reads_files.strip().split(",")
    conditions = args.conditions.strip().split(",")

    if args.colors is not None:
        colors = args.colors.strip().split(",")
    else:
        colors = [
            "#e41a1c", "#377eb8", "#4daf4a", "#984ea3", "#ff7f00", "#ffff33",
            "#a65628", "#f781bf", "#66c2a5", "#fc8d62", "#8da0cb", "#e78ac3",
            "#a6d854", "#ffd92f", "#e5c494", "#b3b3b3", "#8dd3c7", "#ffffb3",
            "#bebada", "#fb8072", "#80b1d3", "#fdb462", "#b3de69", "#fccde5",
            "#1b9e77", "#d95f02", "#7570b3", "#e7298a", "#66a61e", "#e6ab02",
            "#a6761d", "#666666", "#7fc97f", "#beaed4", "#fdc086", "#ffff99",
            "#386cb0", "#f0027f", "#bf5b17", "#666666"
        ]

    assert len(mpbs_files) == len(reads_files) == len(conditions), \
        "Number of motif, read and condition names are not same: {}, {}, {}".format(len(mpbs_files), len(reads_files),
                                                                                    len(conditions))

    # Check if the index file exists
    for reads_file in reads_files:
        base_name = "{}.bai".format(reads_file)
        if not os.path.exists(base_name):
            pysam.index(reads_file)

    mpbs = GenomicRegionSet("Motif Predicted Binding Sites of All Conditions")
    for mpbs_file in mpbs_files:
        mpbs.read(mpbs_file)

    mpbs.sort()
    mpbs.remove_duplicates()
    mpbs_name_list = list(set(mpbs.get_names()))

    signals = np.zeros(shape=(len(conditions), len(mpbs_name_list),
                              args.window_size),
                       dtype=np.float32)

    print((" {} cpus are detected and {} of them will be used...\n".format(
        cpu_count(), args.nc)))

    genome_data = GenomeData(args.organism)
    fasta = Fastafile(genome_data.get_genome())

    # Motif length, number and pwm matrix do not depend on the condition, so
    # compute them once instead of once per condition (the previous code
    # appended duplicates for every condition and only ever read the first
    # len(mpbs_name_list) entries).
    motif_len, motif_num, motif_pwm = _collect_motif_stats(
        mpbs, mpbs_name_list, fasta, args.window_size)

    print("generating signal for each motif and condition...\n")
    if args.bc:
        # differential analysis using bias corrected signal
        hmm_data = HmmData()
        table_forward = hmm_data.get_default_bias_table_F_ATAC()
        table_reverse = hmm_data.get_default_bias_table_R_ATAC()
        bias_table = BiasTable().load_table(table_file_name_F=table_forward,
                                            table_file_name_R=table_reverse)
        signal_func = get_bc_signal
        extra_arguments = (bias_table,)
    else:
        # differential analysis using raw signal
        signal_func = get_raw_signal
        extra_arguments = ()

    for i, condition in enumerate(conditions):
        if args.nc == 1:
            # do not use multi-processing
            for j, mpbs_name in enumerate(mpbs_name_list):
                arguments = (mpbs.by_names([mpbs_name]), reads_files[i],
                             args.organism, args.window_size,
                             args.forward_shift,
                             args.reverse_shift) + extra_arguments
                if not args.bc:
                    signals[i, j, :] = signal_func(arguments)
                    continue
                # Only the bias corrected path tolerates per-motif failures:
                # the error is logged and the signal row stays at zero.
                try:
                    signals[i, j, :] = signal_func(arguments)
                except Exception:
                    logging.exception("get bias corrected signal failed")
        else:
            # use multi-processing
            print((
                "generating signal for condition {} \n".format(condition)))
            arguments_list = [
                (mpbs.by_names([mpbs_name]), reads_files[i], args.organism,
                 args.window_size, args.forward_shift,
                 args.reverse_shift) + extra_arguments
                for mpbs_name in mpbs_name_list
            ]
            with Pool(processes=args.nc) as pool:
                res = pool.map(signal_func, arguments_list)
            signals[i] = np.array(res)

    print("signal generation is done!\n")

    # compute normalization factor for each condition
    factors = compute_factors(signals)
    output_factor(args, factors, conditions)

    # normalize signals by factor and number of motifs
    for i in range(len(conditions)):
        for j in range(len(mpbs_name_list)):
            signals[i, j, :] = signals[i, j, :] / (factors[i] * motif_num[j])

    if args.output_profiles:
        output_profiles(mpbs_name_list, signals, conditions,
                        args.output_location)

    print("generating line plot for each motif...\n")
    plot_arguments = [
        (mpbs_name, motif_num[i], signals[:, i, :], conditions,
         motif_pwm[i], output_location, args.window_size, colors)
        for i, mpbs_name in enumerate(mpbs_name_list)
    ]
    if args.nc == 1:
        for arguments in plot_arguments:
            output_line_plot(arguments)
    else:
        with Pool(processes=args.nc) as pool:
            pool.map(output_line_plot, plot_arguments)

    ps_tc_results = list()
    for i, mpbs_name in enumerate(mpbs_name_list):
        ps_tc_results.append(
            get_ps_tc_results(signals[:, i, :], motif_len[i],
                              args.window_size))

    # find the significant motifs and generate a scatter plot if two conditions are given
    if len(conditions) == 2:
        ps_tc_results = scatter_plot(args, ps_tc_results, mpbs_name_list,
                                     conditions)

    output_stat_results(ps_tc_results, conditions, mpbs_name_list, motif_num,
                        args)
def footprint(bam: str,
              bed: str,
              assembly: str = "hg38",
              w: int = 500,
              dnase: bool = False,
              bias_type: str = "SH"):
    """Compute strand-aware, bias-corrected signal profiles for BED regions.

    Args:
        bam: Path of the reads file handed to ``GenomicSignal``.
        bed: Path of a whitespace-separated region file. The first three
            columns are passed to ``expandRegion``; column 4 (if present) is
            used as the name and column 5 (if present) as the strand,
            defaulting to ``None`` and ``'.'``. Blank lines are skipped.
            NOTE(review): column 5 is read as strand, whereas standard BED
            keeps the strand in column 6 -- confirm the expected input layout.
        assembly: Organism name passed to ``GenomeData``.
        w: Value forwarded to ``expandRegion`` for every region.
        dnase: When true use the DNase bias tables and ``get_signal`` without
            read shifts; otherwise use the ATAC tables and ``get_signal_atac``
            with ``FORWARD_SHIFT`` / ``REVERSE_SHIFT``.
        bias_type: ``'SH'`` or ``'DH'``; only consulted when ``dnase`` is true.

    Returns:
        A list with one ``regionDict(region, forward, reverse)`` entry per
        region whose signal could be computed. For regions on the ``-`` strand
        both profiles are reversed and swapped. Regions whose signal
        computation raised are dropped and counted in a warning on stderr.

    Raises:
        ValueError: if ``dnase`` is true and ``bias_type`` is neither
            ``'SH'`` nor ``'DH'``. Previously this left the bias table
            unbound and every region failed silently.
    """
    # load bias parameters
    g = GenomeData(organism=assembly)
    hmm_data = HmmData()
    if not dnase:
        table_F = hmm_data.get_default_bias_table_F_ATAC()
        table_R = hmm_data.get_default_bias_table_R_ATAC()
    elif bias_type == 'SH':
        table_F = hmm_data.get_default_bias_table_F_SH()
        table_R = hmm_data.get_default_bias_table_R_SH()
    elif bias_type == 'DH':
        table_F = hmm_data.get_default_bias_table_F_DH()
        table_R = hmm_data.get_default_bias_table_R_DH()
    else:
        raise ValueError(
            "bias_type must be 'SH' or 'DH' when dnase is set, got %r" %
            (bias_type,))
    bias_table = BiasTable().load_table(table_file_name_F=table_F,
                                        table_file_name_R=table_R)

    # load reads from BAM
    reads_file = GenomicSignal(bam)
    reads_file.load_sg_coefs(SG_WINDOW_SIZE)

    # The alignment and sequence files are opened only so that an unreadable
    # file fails here rather than once per region; the handles themselves are
    # not needed, so close them straight away instead of leaking them.
    genome_file = g.get_genome()
    Samfile(bam, "rb").close()
    Fastafile(genome_file).close()

    # load and expand regions
    regions = []
    with open(bed, 'r') as f:
        for line in f:
            fields = line.split()
            if not fields:
                continue
            name = fields[3] if len(fields) >= 4 else None
            strand = fields[4] if len(fields) >= 5 else '.'
            regions.append(expandRegion(*fields[:3], name, w, strand))

    # load signal
    get_reads = reads_file.get_signal if dnase else reads_file.get_signal_atac
    forward_shift = 0 if dnase else FORWARD_SHIFT
    reverse_shift = 0 if dnase else REVERSE_SHIFT
    profiles = []
    failed = 0
    for i, region in enumerate(regions):
        try:
            chromosome, start, end, _, strand = region
            norm_f, _slope_f, norm_r, _slope_r = get_reads(
                chromosome, start, end, 0, 0, forward_shift, reverse_shift,
                1000 if dnase else 150, 98, 98, bias_table, genome_file)
            norm_f = [float(v) for v in norm_f]
            norm_r = [float(v) for v in norm_r]
        except Exception:
            # Best effort: a region that cannot be processed is skipped and
            # only counted. Unlike a bare ``except`` this lets
            # KeyboardInterrupt / SystemExit through.
            failed += 1
            continue

        if strand == '-':
            # Flip orientation and exchange the strands for minus-strand sites
            norm_f.reverse()
            norm_r.reverse()
            norm_f, norm_r = norm_r, norm_f
        profiles.append((region, norm_f, norm_r))

        if i % 500 == 0:
            print("INFO: aggregating region %d of %d" % (i, len(regions)),
                  file=sys.stderr)

    if failed > 0:
        print(
            "WARNING: failed to generate bias-corrected signal profiles for %d regions"
            % failed,
            file=sys.stderr)

    return [
        regionDict(region, forward, reverse)
        for region, forward, reverse in profiles
    ]
示例#5
0
def diff_analysis_run(args):
    """Compare motif-centred signal profiles between two conditions.

    Reads the binding sites of both conditions (``args.mpbs_file1`` and
    ``args.mpbs_file2``), combines and sorts them, and for every distinct
    motif name computes, in a pool of ``args.nc`` worker processes, the
    signals of both conditions with ``get_bc_signal`` (when ``args.bc`` is
    set, using the default ATAC bias tables) or ``get_raw_signal`` otherwise.
    Each worker result is indexed as
    ``(signal_1, signal_2, motif_len, pwm, motif_num)``.

    If either ``args.factor1`` or ``args.factor2`` is ``None`` both factors
    are computed from the signals, stored back on ``args`` and written with
    ``output_factor``. Afterwards optional profiles, one line plot per motif,
    a scatter plot and the statistics are written.

    Raises:
        Exception: whatever the signal workers raise. The failure is logged
            and re-raised; previously it was swallowed and the function then
            crashed with a ``NameError`` on the missing result list.
    """
    # Initializing Error Handler
    err = ErrorHandler()

    output_location = os.path.join(args.output_location, "Lineplots")
    try:
        if not os.path.isdir(output_location):
            os.makedirs(output_location)
    except Exception:
        err.throw_error("MM_OUT_FOLDER_CREATION")

    mpbs1 = GenomicRegionSet("Motif Predicted Binding Sites of Condition1")
    mpbs1.read(args.mpbs_file1)

    mpbs2 = GenomicRegionSet("Motif Predicted Binding Sites of Condition2")
    mpbs2.read(args.mpbs_file2)

    mpbs = mpbs1.combine(mpbs2, output=True)
    mpbs.sort()
    mpbs_name_list = list(set(mpbs.get_names()))

    signal_dict_by_tf_1 = dict()
    signal_dict_by_tf_2 = dict()
    motif_len_dict = dict()
    motif_num_dict = dict()
    pwm_dict_by_tf = dict()

    if args.bc:
        # differential analysis using bias corrected signal
        hmm_data = HmmData()
        table_F = hmm_data.get_default_bias_table_F_ATAC()
        table_R = hmm_data.get_default_bias_table_R_ATAC()
        bias_table1 = BiasTable().load_table(table_file_name_F=table_F,
                                             table_file_name_R=table_R)
        bias_table2 = BiasTable().load_table(table_file_name_F=table_F,
                                             table_file_name_R=table_R)
        signal_func = get_bc_signal
        failure_message = "get bias corrected signal failed"
        extra_arguments = (bias_table1, bias_table2)
    else:
        # differential analysis using raw signal
        signal_func = get_raw_signal
        failure_message = "get raw signal failed"
        extra_arguments = ()

    mpbs_list = [
        (mpbs_name, args.mpbs_file1, args.mpbs_file2, args.reads_file1,
         args.reads_file2, args.organism, args.window_size,
         args.forward_shift, args.reverse_shift) + extra_arguments
        for mpbs_name in mpbs_name_list
    ]

    # The context manager makes sure the worker processes are released
    with Pool(processes=args.nc) as pool:
        try:
            signal_results = pool.map(signal_func, mpbs_list)
        except Exception:
            logging.exception(failure_message)
            # Nothing below can work without the signals
            raise

    for mpbs_name, motif_result in zip(mpbs_name_list, signal_results):
        signal_dict_by_tf_1[mpbs_name] = motif_result[0]
        signal_dict_by_tf_2[mpbs_name] = motif_result[1]
        motif_len_dict[mpbs_name] = motif_result[2]
        pwm_dict_by_tf[mpbs_name] = motif_result[3]
        motif_num_dict[mpbs_name] = motif_result[4]

    if args.factor1 is None or args.factor2 is None:
        args.factor1, args.factor2 = compute_factors(signal_dict_by_tf_1,
                                                     signal_dict_by_tf_2)
        output_factor(args, args.factor1, args.factor2)

    if args.output_profiles:
        output_profiles(mpbs_name_list, signal_dict_by_tf_1, output_location,
                        args.condition1)
        output_profiles(mpbs_name_list, signal_dict_by_tf_2, output_location,
                        args.condition2)

    plots_list = [
        (mpbs_name, motif_num_dict[mpbs_name],
         signal_dict_by_tf_1[mpbs_name], signal_dict_by_tf_2[mpbs_name],
         args.factor1, args.factor2, args.condition1, args.condition2,
         pwm_dict_by_tf[mpbs_name], output_location, args.window_size,
         args.standardize)
        for mpbs_name in mpbs_name_list
    ]

    with Pool(processes=args.nc) as pool:
        pool.map(line_plot, plots_list)

    ps_tc_results_by_tf = dict()
    for mpbs_name in mpbs_name_list:
        ps_tc_results_by_tf[mpbs_name] = get_ps_tc_results(
            signal_dict_by_tf_1[mpbs_name], signal_dict_by_tf_2[mpbs_name],
            args.factor1, args.factor2, motif_num_dict[mpbs_name],
            motif_len_dict[mpbs_name])

    stat_results_by_tf = get_stat_results(ps_tc_results_by_tf)
    scatter_plot(args, stat_results_by_tf)
    output_stat_results(args, stat_results_by_tf, motif_num_dict)
示例#6
0
def _normalize_bc_signal(reads_file, signal):
    """Apply boyle_norm followed by hon_norm_atac (98th percentile, std)."""
    signal = reads_file.boyle_norm(signal)
    perc = scoreatpercentile(signal, 98)
    std = np.std(signal)
    return reads_file.hon_norm_atac(signal, perc, std)


def _wig_fixed_step_block(region, values):
    """Format one fixedStep wiggle block (1-based start, step 1) for a region."""
    return ("fixedStep chrom=" + region.chrom + " start=" +
            str(region.initial + 1) + " step=1\n" +
            "\n".join(str(value) for value in values) + "\n")


def _wig_to_bigwig(wig_filename, chrom_sizes_file, bw_filename):
    """Convert a wig file with wigToBigWig; delete the wig only on success."""
    import logging
    import subprocess

    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters intact.
    command = [
        "wigToBigWig", wig_filename, chrom_sizes_file, bw_filename,
        "-verbose=0"
    ]
    try:
        status = subprocess.call(command)
    except OSError:
        # e.g. the wigToBigWig executable is not installed
        status = None

    if status == 0:
        os.remove(wig_filename)
    else:
        # Keep the wig file: removing it here would discard the only output
        logging.getLogger(__name__).warning(
            "wigToBigWig failed for %s; keeping the wig file", wig_filename)


def get_bc_tracks(args):
    """Write bias-corrected signal tracks for a set of regions.

    ``args.input_files`` must hold exactly two entries: the reads file and
    the regions file (otherwise ``ME_FEW_ARG`` is reported through the error
    handler). The regions are merged, and for each one the bias-corrected
    signal is computed with ``get_bc_signal_by_fragment_length`` using either
    the comma-separated forward/reverse tables in ``args.bias_table`` or the
    default ATAC tables.

    Output goes to ``args.output_location`` with prefix ``args.output_prefix``:

    * ``args.strand_specific``: ``<prefix>_forward.wig`` and
      ``<prefix>_reverse.wig`` (reverse values are written negated);
    * otherwise a single ``<prefix>.wig``.

    ``args.norm`` enables normalisation of every signal before writing, and
    ``args.bigWig`` converts each wig file to ``.bw`` with the external
    ``wigToBigWig`` tool, removing the wig file once the conversion succeeded.

    NOTE(review): the wig files are opened in append mode, so output from an
    earlier run with the same prefix is extended rather than replaced --
    confirm this is intended.
    """
    # Initializing Error Handler
    err = ErrorHandler()

    if len(args.input_files) != 2:
        err.throw_error("ME_FEW_ARG",
                        add_msg="You must specify reads and regions file.")

    regions = GenomicRegionSet("Interested regions")
    regions.read(args.input_files[1])
    regions.merge()

    reads_file = GenomicSignal()

    bam = Samfile(args.input_files[0], "rb")
    genome_data = GenomeData(args.organism)
    fasta = Fastafile(genome_data.get_genome())

    hmm_data = HmmData()
    if args.bias_table:
        bias_table_list = args.bias_table.split(",")
        bias_table = BiasTable().load_table(
            table_file_name_F=bias_table_list[0],
            table_file_name_R=bias_table_list[1])
    else:
        table_F = hmm_data.get_default_bias_table_F_ATAC()
        table_R = hmm_data.get_default_bias_table_R_ATAC()
        bias_table = BiasTable().load_table(table_file_name_F=table_F,
                                            table_file_name_R=table_R)

    # (wig file, bigWig file name template) pairs produced below
    tracks = []

    if args.strand_specific:
        fname_forward = os.path.join(
            args.output_location, "{}_forward.wig".format(args.output_prefix))
        fname_reverse = os.path.join(
            args.output_location, "{}_reverse.wig".format(args.output_prefix))

        with open(fname_forward, "a") as f_forward, \
                open(fname_reverse, "a") as f_reverse:
            for region in regions:
                signal_f, signal_r = reads_file.get_bc_signal_by_fragment_length(
                    ref=region.chrom,
                    start=region.initial,
                    end=region.final,
                    bam=bam,
                    fasta=fasta,
                    bias_table=bias_table,
                    forward_shift=args.forward_shift,
                    reverse_shift=args.reverse_shift,
                    min_length=None,
                    max_length=None,
                    strand=True)

                if args.norm:
                    signal_f = _normalize_bc_signal(reads_file, signal_f)
                    signal_r = _normalize_bc_signal(reads_file, signal_r)

                f_forward.write(
                    _wig_fixed_step_block(region, np.nan_to_num(signal_f)))
                # Reverse strand values are stored negated
                f_reverse.write(
                    _wig_fixed_step_block(
                        region, (-e for e in np.nan_to_num(signal_r))))

        tracks.append((fname_forward, "{}_forward.bw"))
        tracks.append((fname_reverse, "{}_reverse.bw"))

    else:
        output_fname = os.path.join(args.output_location,
                                    "{}.wig".format(args.output_prefix))
        with open(output_fname, "a") as output_f:
            for region in regions:
                signal = reads_file.get_bc_signal_by_fragment_length(
                    ref=region.chrom,
                    start=region.initial,
                    end=region.final,
                    bam=bam,
                    fasta=fasta,
                    bias_table=bias_table,
                    forward_shift=args.forward_shift,
                    reverse_shift=args.reverse_shift,
                    min_length=None,
                    max_length=None,
                    strand=False)

                if args.norm:
                    signal = _normalize_bc_signal(reads_file, signal)

                output_f.write(
                    _wig_fixed_step_block(region, np.nan_to_num(signal)))

        tracks.append((output_fname, "{}.bw"))

    if args.bigWig:
        chrom_sizes_file = genome_data.get_chromosome_sizes()
        for wig_filename, bw_template in tracks:
            bw_filename = os.path.join(args.output_location,
                                       bw_template.format(args.output_prefix))
            _wig_to_bigwig(wig_filename, chrom_sizes_file, bw_filename)