Python GenomicRegionSet.combine示例

def merge_DBD_regions(path):
    """Pool the per-RNA DBD BED files found below every sub-directory of *path*.

    For each directory ``<path>/<t>`` the files ``<path>/<t>/<rna>/DBD_<rna>.bed``
    are read, every region name is prefixed with ``<rna>_``, and the pooled
    regions are written to ``<path>/<t>/DBD_<t>.bed``.
    """
    for target in os.listdir(path):
        target_dir = os.path.join(path, target)
        if not os.path.isdir(target_dir):
            continue

        pooled = GenomicRegionSet(target)
        for rna in os.listdir(target_dir):
            bed_file = os.path.join(target_dir, rna, "DBD_" + rna + ".bed")
            if not os.path.exists(bed_file):
                continue
            regions = GenomicRegionSet(rna)
            regions.read_bed(bed_file)
            # Tag each region with the RNA it came from before pooling
            for region in regions:
                region.name = rna + "_" + region.name
            pooled.combine(regions)

        # The pooled file is written even when no DBD file was found
        pooled.write_bed(os.path.join(target_dir, "DBD_" + target + ".bed"))

示例#2

显示文件

文件： DifferentialAnalysis.py 项目： CostaLab/reg-gen

def get_bc_signal(arguments):
    """Accumulate the bias-corrected signal around one motif for two conditions.

    The single tuple argument (convenient for ``Pool.map``) is unpacked into
    ``(mpbs_name, mpbs_file1, mpbs_file2, reads_file1, reads_file2, organism,
    window_size, forward_shift, reverse_shift, bias_table1, bias_table2)``.

    Returns ``(signal_1, signal_2, motif_len, pwm, num_motif)`` where

    * ``signal_1`` / ``signal_2`` are arrays of length ``window_size`` holding
      the summed corrected signal of condition 1 / 2,
    * ``motif_len`` is ``final - initial`` of the first region with the
      requested name (``None`` when there is none),
    * ``pwm`` is the per-base count dictionary filled by ``update_pwm``,
    * ``num_motif`` is the number of regions with the requested name,
      including the ones skipped below.
    """
    (mpbs_name, mpbs_file1, mpbs_file2, reads_file1, reads_file2, organism,
     window_size, forward_shift, reverse_shift, bias_table1, bias_table2) = arguments

    mpbs1 = GenomicRegionSet("Motif Predicted Binding Sites of Condition1")
    mpbs1.read(mpbs_file1)

    mpbs2 = GenomicRegionSet("Motif Predicted Binding Sites of Condition2")
    mpbs2.read(mpbs_file2)

    mpbs = mpbs1.combine(mpbs2, output=True)
    mpbs.sort()

    signal_1 = np.zeros(window_size)
    signal_2 = np.zeros(window_size)
    motif_len = None
    pwm = {base: [0.0] * window_size for base in ("A", "C", "G", "T", "N")}

    # Every handle opened so far is closed in the ``finally`` clause, even
    # when opening a later one (or processing a region) fails.
    handles = []
    try:
        bam1 = Samfile(reads_file1, "rb")
        handles.append(bam1)
        bam2 = Samfile(reads_file2, "rb")
        handles.append(bam2)

        genome_data = GenomeData(organism)
        genome_file_name = genome_data.get_genome()
        fasta = Fastafile(genome_file_name)
        handles.append(fasta)

        mpbs_regions = mpbs.by_names([mpbs_name])
        num_motif = len(mpbs_regions)

        # Floor division keeps the coordinates integral on Python 3 as well
        half_window = window_size // 2

        for region in mpbs_regions:
            if motif_len is None:
                motif_len = region.final - region.initial

            mid = (region.final + region.initial) // 2
            p1 = mid - half_window
            p2 = mid + half_window

            # Window would start at or before the beginning of the chromosome
            if p1 <= 0:
                continue

            # Fetch the bias-corrected signal of both conditions
            signal1 = bias_correction(chrom=region.chrom, start=p1, end=p2, bam=bam1,
                                      bias_table=bias_table1, genome_file_name=genome_file_name,
                                      forward_shift=forward_shift, reverse_shift=reverse_shift)

            signal2 = bias_correction(chrom=region.chrom, start=p1, end=p2, bam=bam2,
                                      bias_table=bias_table2, genome_file_name=genome_file_name,
                                      forward_shift=forward_shift, reverse_shift=reverse_shift)

            # Only windows of exactly window_size positions can be summed
            if len(signal1) != len(signal_1) or len(signal2) != len(signal_2):
                continue

            # Accumulate the signal over all sites of this motif
            signal_1 = np.add(signal_1, np.array(signal1))
            signal_2 = np.add(signal_2, np.array(signal2))

            update_pwm(pwm, fasta, region, p1, p2)
    finally:
        for handle in handles:
            handle.close()

    return signal_1, signal_2, motif_len, pwm, num_motif

示例#3

显示文件

文件： DifferentialAnalysis.py 项目： CostaLab/reg-gen

def _add_cut_sites(bam, chrom, p1, p2, forward_shift, reverse_shift, signal):
    """Add 1.0 to ``signal`` for every shifted cut site of ``bam`` inside [p1, p2)."""
    for read in bam.fetch(chrom, p1, p2):
        # check if the read is unmapped, according to issue #112
        if read.is_unmapped:
            continue

        if read.is_reverse:
            cut_site = read.aend + reverse_shift - 1
        else:
            cut_site = read.pos + forward_shift

        if p1 <= cut_site < p2:
            signal[cut_site - p1] += 1.0


def get_raw_signal(arguments):
    """Count the raw cut sites around one motif for two conditions.

    The single tuple argument (convenient for ``Pool.map``) is unpacked into
    ``(mpbs_name, mpbs_file1, mpbs_file2, reads_file1, reads_file2, organism,
    window_size, forward_shift, reverse_shift)``.

    Returns ``(signal_1, signal_2, motif_len, pwm, num_motif)`` where

    * ``signal_1`` / ``signal_2`` are arrays of length ``window_size`` holding
      the cut-site counts of condition 1 / 2,
    * ``motif_len`` is ``final - initial`` of the first region with the
      requested name (``None`` when there is none),
    * ``pwm`` is the per-base count dictionary filled by ``update_pwm``,
    * ``num_motif`` is the number of regions with the requested name,
      including the ones skipped because their window starts at or before 0.
    """
    (mpbs_name, mpbs_file1, mpbs_file2, reads_file1, reads_file2, organism,
     window_size, forward_shift, reverse_shift) = arguments

    mpbs1 = GenomicRegionSet("Motif Predicted Binding Sites of Condition1")
    mpbs1.read(mpbs_file1)

    mpbs2 = GenomicRegionSet("Motif Predicted Binding Sites of Condition2")
    mpbs2.read(mpbs_file2)

    mpbs = mpbs1.combine(mpbs2, output=True)
    mpbs.sort()

    signal_1 = np.zeros(window_size)
    signal_2 = np.zeros(window_size)
    motif_len = None
    pwm = {base: [0.0] * window_size for base in ("A", "C", "G", "T", "N")}

    # Every handle opened so far is closed in the ``finally`` clause, even
    # when opening a later one (or processing a region) fails.
    handles = []
    try:
        bam1 = Samfile(reads_file1, "rb")
        handles.append(bam1)
        bam2 = Samfile(reads_file2, "rb")
        handles.append(bam2)

        genome_data = GenomeData(organism)
        fasta = Fastafile(genome_data.get_genome())
        handles.append(fasta)

        mpbs_regions = mpbs.by_names([mpbs_name])
        num_motif = len(mpbs_regions)

        # Floor division keeps the coordinates usable as array indices on
        # Python 3, where "/" would produce floats.
        half_window = window_size // 2

        for region in mpbs_regions:
            if motif_len is None:
                motif_len = region.final - region.initial

            mid = (region.final + region.initial) // 2
            p1 = mid - half_window
            p2 = mid + half_window

            if p1 <= 0:
                continue

            # Fetch raw signal
            _add_cut_sites(bam1, region.chrom, p1, p2, forward_shift, reverse_shift, signal_1)
            _add_cut_sites(bam2, region.chrom, p1, p2, forward_shift, reverse_shift, signal_2)

            update_pwm(pwm, fasta, region, p1, p2)
    finally:
        for handle in handles:
            handle.close()

    return signal_1, signal_2, motif_len, pwm, num_motif

示例#4

显示文件

文件： DifferentialAnalysis.py 项目： CostaLab/reg-gen

def diff_analysis_run(args):
    """Run the differential footprint analysis between two conditions.

    Steps, all driven by the attributes of ``args``:

    1. create ``<output_location>/Lineplots`` and index the BAM files when
       their ``.bai`` file is missing;
    2. pool the binding sites of both conditions and collect their names;
    3. compute, in a process pool, the bias-corrected (``args.bc``) or raw
       signal of every name;
    4. derive the normalisation factors when they were not supplied, write
       the optional profiles and the line plots;
    5. compute the protection-score / tag-count results, then the scatter
       plot and the statistics table.

    An exception raised while computing the signals is logged and re-raised;
    the pool is always closed and joined.
    """
    # Initializing Error Handler
    err = ErrorHandler()

    output_location = os.path.join(args.output_location, "Lineplots")
    try:
        if not os.path.isdir(output_location):
            os.makedirs(output_location)
    except Exception:
        err.throw_error("MM_OUT_FOLDER_CREATION")

    # Build the BAM index of each reads file when it does not exist yet
    for reads_file in (args.reads_file1, args.reads_file2):
        if not os.path.exists("{}.bai".format(reads_file)):
            pysam.index(reads_file)

    mpbs1 = GenomicRegionSet("Motif Predicted Binding Sites of Condition1")
    mpbs1.read(args.mpbs_file1)

    mpbs2 = GenomicRegionSet("Motif Predicted Binding Sites of Condition2")
    mpbs2.read(args.mpbs_file2)

    mpbs = mpbs1.combine(mpbs2, output=True)
    mpbs.sort()
    mpbs.remove_duplicates()
    mpbs_name_list = list(set(mpbs.get_names()))

    signal_dict_by_tf_1 = {}
    signal_dict_by_tf_2 = {}
    motif_len_dict = {}
    motif_num_dict = {}
    pwm_dict_by_tf = {}

    # Arguments shared by every worker call, after the motif name
    common_args = (args.mpbs_file1, args.mpbs_file2, args.reads_file1, args.reads_file2,
                   args.organism, args.window_size, args.forward_shift, args.reverse_shift)

    pool = Pool(processes=args.nc)
    try:
        if args.bc:
            # differential analysis using bias corrected signal
            hmm_data = HmmData()
            table_F = hmm_data.get_default_bias_table_F_ATAC()
            table_R = hmm_data.get_default_bias_table_R_ATAC()
            bias_table1 = BiasTable().load_table(table_file_name_F=table_F, table_file_name_R=table_R)
            bias_table2 = BiasTable().load_table(table_file_name_F=table_F, table_file_name_R=table_R)

            worker = get_bc_signal
            extra_args = (bias_table1, bias_table2)
            failure_message = "get bias corrected signal failed"
        else:
            # differential analysis using raw signal
            worker = get_raw_signal
            extra_args = ()
            failure_message = "get raw signal failed"

        mpbs_list = [(mpbs_name,) + common_args + extra_args for mpbs_name in mpbs_name_list]

        try:
            res = pool.map(worker, mpbs_list)
        except Exception:
            # Nothing below can work without the signals: log, then propagate
            # the real error instead of failing later on an unbound ``res``.
            logging.exception(failure_message)
            raise

        for mpbs_name, (signal_1, signal_2, motif_len, pwm, num_motif) in zip(mpbs_name_list, res):
            signal_dict_by_tf_1[mpbs_name] = signal_1
            signal_dict_by_tf_2[mpbs_name] = signal_2
            motif_len_dict[mpbs_name] = motif_len
            pwm_dict_by_tf[mpbs_name] = pwm
            motif_num_dict[mpbs_name] = num_motif

        if args.factor1 is None or args.factor2 is None:
            args.factor1, args.factor2 = compute_factors(signal_dict_by_tf_1, signal_dict_by_tf_2)
            output_factor(args, args.factor1, args.factor2)

        if args.output_profiles:
            output_profiles(mpbs_name_list, signal_dict_by_tf_1, output_location, args.condition1)
            output_profiles(mpbs_name_list, signal_dict_by_tf_2, output_location, args.condition2)

        plots_list = [(mpbs_name, motif_num_dict[mpbs_name], signal_dict_by_tf_1[mpbs_name],
                       signal_dict_by_tf_2[mpbs_name], args.factor1, args.factor2, args.condition1,
                       args.condition2, pwm_dict_by_tf[mpbs_name], output_location, args.window_size,
                       args.standardize)
                      for mpbs_name in mpbs_name_list]

        pool.map(line_plot, plots_list)
    finally:
        # Release the worker processes whatever happened above
        pool.close()
        pool.join()

    ps_tc_results_by_tf = {}
    for mpbs_name in mpbs_name_list:
        ps_tc_results_by_tf[mpbs_name] = get_ps_tc_results(
            signal_dict_by_tf_1[mpbs_name], signal_dict_by_tf_2[mpbs_name],
            args.factor1, args.factor2, motif_num_dict[mpbs_name], motif_len_dict[mpbs_name])

    ps_tc_results_by_tf = scatter_plot(args, ps_tc_results_by_tf)
    output_stat_results(args, ps_tc_results_by_tf, motif_num_dict)

示例#5

显示文件

 def posi2region(self, regions, p):
     """Return a new, unnamed GenomicRegionSet pooling ``regions[r]`` for every ``r`` in ``p``.

     ``regions`` is indexed with the items of ``p``; each selected entry is
     merged into the result with ``combine``.
     """
     new_r = GenomicRegionSet(name="")
     for r in p:
         new_r.combine(regions[r])
     return new_r

示例#6

显示文件

文件： DifferentialAnalysis.py 项目： alexyfyf/reg-gen

def diff_analysis_run(args):
    """Run the differential footprint analysis between two conditions.

    Steps, all driven by the attributes of ``args``:

    1. create ``<output_location>/Lineplots``;
    2. pool the binding sites of both conditions and collect their names;
    3. compute, in a process pool, the bias-corrected (``args.bc``) or raw
       signal of every name;
    4. derive the normalisation factors when they were not supplied, write
       the optional profiles and the line plots;
    5. compute the protection-score / tag-count results and their
       statistics, then the scatter plot and the statistics table.

    An exception raised while computing the signals is logged and re-raised;
    the pool is always closed and joined.
    """
    # Initializing Error Handler
    err = ErrorHandler()

    output_location = os.path.join(args.output_location, "Lineplots")
    try:
        if not os.path.isdir(output_location):
            os.makedirs(output_location)
    except Exception:
        err.throw_error("MM_OUT_FOLDER_CREATION")

    mpbs1 = GenomicRegionSet("Motif Predicted Binding Sites of Condition1")
    mpbs1.read(args.mpbs_file1)

    mpbs2 = GenomicRegionSet("Motif Predicted Binding Sites of Condition2")
    mpbs2.read(args.mpbs_file2)

    mpbs = mpbs1.combine(mpbs2, output=True)
    mpbs.sort()
    mpbs_name_list = list(set(mpbs.get_names()))

    signal_dict_by_tf_1 = {}
    signal_dict_by_tf_2 = {}
    motif_len_dict = {}
    motif_num_dict = {}
    pwm_dict_by_tf = {}

    # Arguments shared by every worker call, after the motif name
    common_args = (args.mpbs_file1, args.mpbs_file2, args.reads_file1,
                   args.reads_file2, args.organism, args.window_size,
                   args.forward_shift, args.reverse_shift)

    pool = Pool(processes=args.nc)
    try:
        if args.bc:
            # differential analysis using bias corrected signal
            hmm_data = HmmData()
            table_F = hmm_data.get_default_bias_table_F_ATAC()
            table_R = hmm_data.get_default_bias_table_R_ATAC()
            bias_table1 = BiasTable().load_table(table_file_name_F=table_F,
                                                 table_file_name_R=table_R)
            bias_table2 = BiasTable().load_table(table_file_name_F=table_F,
                                                 table_file_name_R=table_R)

            worker = get_bc_signal
            extra_args = (bias_table1, bias_table2)
            failure_message = "get bias corrected signal failed"
        else:
            # differential analysis using raw signal
            worker = get_raw_signal
            extra_args = ()
            failure_message = "get raw signal failed"

        mpbs_list = [(mpbs_name,) + common_args + extra_args
                     for mpbs_name in mpbs_name_list]

        try:
            res = pool.map(worker, mpbs_list)
        except Exception:
            # Nothing below can work without the signals: log, then propagate
            # the real error instead of failing later on an unbound ``res``.
            logging.exception(failure_message)
            raise

        for mpbs_name, (signal_1, signal_2, motif_len, pwm, num_motif) in zip(
                mpbs_name_list, res):
            signal_dict_by_tf_1[mpbs_name] = signal_1
            signal_dict_by_tf_2[mpbs_name] = signal_2
            motif_len_dict[mpbs_name] = motif_len
            pwm_dict_by_tf[mpbs_name] = pwm
            motif_num_dict[mpbs_name] = num_motif

        if args.factor1 is None or args.factor2 is None:
            args.factor1, args.factor2 = compute_factors(signal_dict_by_tf_1,
                                                         signal_dict_by_tf_2)
            output_factor(args, args.factor1, args.factor2)

        if args.output_profiles:
            output_profiles(mpbs_name_list, signal_dict_by_tf_1, output_location,
                            args.condition1)
            output_profiles(mpbs_name_list, signal_dict_by_tf_2, output_location,
                            args.condition2)

        plots_list = [
            (mpbs_name, motif_num_dict[mpbs_name],
             signal_dict_by_tf_1[mpbs_name], signal_dict_by_tf_2[mpbs_name],
             args.factor1, args.factor2, args.condition1, args.condition2,
             pwm_dict_by_tf[mpbs_name], output_location, args.window_size,
             args.standardize)
            for mpbs_name in mpbs_name_list]

        pool.map(line_plot, plots_list)
    finally:
        # Release the worker processes whatever happened above
        pool.close()
        pool.join()

    ps_tc_results_by_tf = {}
    for mpbs_name in mpbs_name_list:
        ps_tc_results_by_tf[mpbs_name] = get_ps_tc_results(
            signal_dict_by_tf_1[mpbs_name], signal_dict_by_tf_2[mpbs_name],
            args.factor1, args.factor2, motif_num_dict[mpbs_name],
            motif_len_dict[mpbs_name])

    stat_results_by_tf = get_stat_results(ps_tc_results_by_tf)
    scatter_plot(args, stat_results_by_tf)
    output_stat_results(args, stat_results_by_tf, motif_num_dict)

示例#7

显示文件

文件： DifferentialAnalysis.py 项目： alexyfyf/reg-gen

def get_bc_signal(arguments):
    """Accumulate the bias-corrected signal around one motif for two conditions.

    The single tuple argument (convenient for ``Pool.map``) is unpacked into
    ``(mpbs_name, mpbs_file1, mpbs_file2, reads_file1, reads_file2, organism,
    window_size, forward_shift, reverse_shift, bias_table1, bias_table2)``.

    Returns ``(signal_1, signal_2, motif_len, pwm, num_motif)`` where

    * ``signal_1`` / ``signal_2`` are arrays of length ``window_size`` holding
      the summed corrected signal of condition 1 / 2,
    * ``motif_len`` is ``final - initial`` of the first region with the
      requested name (``None`` when there is none),
    * ``pwm`` is the per-base count dictionary filled by ``update_pwm``,
    * ``num_motif`` is the number of regions with the requested name,
      including the ones skipped below.
    """
    (mpbs_name, mpbs_file1, mpbs_file2, reads_file1, reads_file2, organism,
     window_size, forward_shift, reverse_shift, bias_table1,
     bias_table2) = arguments

    mpbs1 = GenomicRegionSet("Motif Predicted Binding Sites of Condition1")
    mpbs1.read(mpbs_file1)

    mpbs2 = GenomicRegionSet("Motif Predicted Binding Sites of Condition2")
    mpbs2.read(mpbs_file2)

    mpbs = mpbs1.combine(mpbs2, output=True)
    mpbs.sort()

    signal_1 = np.zeros(window_size)
    signal_2 = np.zeros(window_size)
    motif_len = None
    pwm = {base: [0.0] * window_size for base in ("A", "C", "G", "T", "N")}

    # Every handle opened so far is closed in the ``finally`` clause, even
    # when opening a later one (or processing a region) fails.
    handles = []
    try:
        bam1 = Samfile(reads_file1, "rb")
        handles.append(bam1)
        bam2 = Samfile(reads_file2, "rb")
        handles.append(bam2)

        genome_data = GenomeData(organism)
        genome_file_name = genome_data.get_genome()
        fasta = Fastafile(genome_file_name)
        handles.append(fasta)

        mpbs_regions = mpbs.by_names([mpbs_name])
        num_motif = len(mpbs_regions)

        # Floor division keeps the coordinates integral on Python 3 as well
        half_window = window_size // 2

        for region in mpbs_regions:
            if motif_len is None:
                motif_len = region.final - region.initial

            mid = (region.final + region.initial) // 2
            p1 = mid - half_window
            p2 = mid + half_window

            # Window would start at or before the beginning of the chromosome
            if p1 <= 0:
                continue

            # Fetch the bias-corrected signal of both conditions
            signal1 = bias_correction(chrom=region.chrom,
                                      start=p1,
                                      end=p2,
                                      bam=bam1,
                                      bias_table=bias_table1,
                                      genome_file_name=genome_file_name,
                                      forward_shift=forward_shift,
                                      reverse_shift=reverse_shift)

            signal2 = bias_correction(chrom=region.chrom,
                                      start=p1,
                                      end=p2,
                                      bam=bam2,
                                      bias_table=bias_table2,
                                      genome_file_name=genome_file_name,
                                      forward_shift=forward_shift,
                                      reverse_shift=reverse_shift)

            # Only windows of exactly window_size positions can be summed
            if len(signal1) != len(signal_1) or len(signal2) != len(signal_2):
                continue

            # Accumulate the signal over all sites of this motif
            signal_1 = np.add(signal_1, np.array(signal1))
            signal_2 = np.add(signal_2, np.array(signal2))

            update_pwm(pwm, fasta, region, p1, p2)
    finally:
        for handle in handles:
            handle.close()

    return signal_1, signal_2, motif_len, pwm, num_motif

示例#8

显示文件

文件： DifferentialAnalysis.py 项目： alexyfyf/reg-gen

def _add_cut_sites(bam, chrom, p1, p2, forward_shift, reverse_shift, signal):
    """Add 1.0 to ``signal`` for every shifted cut site of ``bam`` inside [p1, p2)."""
    for read in bam.fetch(chrom, p1, p2):
        # Unmapped reads are skipped: the reverse-strand branch needs
        # ``read.aend``, which is not available for them.
        if read.is_unmapped:
            continue

        if read.is_reverse:
            cut_site = read.aend + reverse_shift - 1
        else:
            cut_site = read.pos + forward_shift

        if p1 <= cut_site < p2:
            signal[cut_site - p1] += 1.0


def get_raw_signal(arguments):
    """Count the raw cut sites around one motif for two conditions.

    The single tuple argument (convenient for ``Pool.map``) is unpacked into
    ``(mpbs_name, mpbs_file1, mpbs_file2, reads_file1, reads_file2, organism,
    window_size, forward_shift, reverse_shift)``.

    Returns ``(signal_1, signal_2, motif_len, pwm, num_motif)`` where

    * ``signal_1`` / ``signal_2`` are arrays of length ``window_size`` holding
      the cut-site counts of condition 1 / 2 (unmapped reads are ignored),
    * ``motif_len`` is ``final - initial`` of the first region with the
      requested name (``None`` when there is none),
    * ``pwm`` is the per-base count dictionary filled by ``update_pwm``,
    * ``num_motif`` is the number of regions with the requested name,
      including the ones skipped because their window starts at or before 0.
    """
    (mpbs_name, mpbs_file1, mpbs_file2, reads_file1, reads_file2, organism,
     window_size, forward_shift, reverse_shift) = arguments

    mpbs1 = GenomicRegionSet("Motif Predicted Binding Sites of Condition1")
    mpbs1.read(mpbs_file1)

    mpbs2 = GenomicRegionSet("Motif Predicted Binding Sites of Condition2")
    mpbs2.read(mpbs_file2)

    mpbs = mpbs1.combine(mpbs2, output=True)
    mpbs.sort()

    signal_1 = np.zeros(window_size)
    signal_2 = np.zeros(window_size)
    motif_len = None
    pwm = {base: [0.0] * window_size for base in ("A", "C", "G", "T", "N")}

    # Every handle opened so far is closed in the ``finally`` clause, even
    # when opening a later one (or processing a region) fails.
    handles = []
    try:
        bam1 = Samfile(reads_file1, "rb")
        handles.append(bam1)
        bam2 = Samfile(reads_file2, "rb")
        handles.append(bam2)

        genome_data = GenomeData(organism)
        fasta = Fastafile(genome_data.get_genome())
        handles.append(fasta)

        mpbs_regions = mpbs.by_names([mpbs_name])
        num_motif = len(mpbs_regions)

        # Floor division keeps the coordinates usable as array indices on
        # Python 3, where "/" would produce floats.
        half_window = window_size // 2

        for region in mpbs_regions:
            if motif_len is None:
                motif_len = region.final - region.initial

            mid = (region.final + region.initial) // 2
            p1 = mid - half_window
            p2 = mid + half_window

            if p1 <= 0:
                continue

            # Fetch raw signal
            _add_cut_sites(bam1, region.chrom, p1, p2, forward_shift, reverse_shift, signal_1)
            _add_cut_sites(bam2, region.chrom, p1, p2, forward_shift, reverse_shift, signal_2)

            update_pwm(pwm, fasta, region, p1, p2)
    finally:
        for handle in handles:
            handle.close()

    return signal_1, signal_2, motif_len, pwm, num_motif

示例#9

显示文件

文件： boxplot.py 项目： eggduzao/reg-gen

class Boxplot:
    """
    input:
        exps: input experimental matrix
        title: Default = boxplot
        groupby: Group the data by the given factor in the header of experimental matrix

    output:
        parameters: list of records
        figs: a list of figure(s)
    """

    def __init__(self, EMpath, fields, title="boxplot", df=False):
        """Load the experimental matrix at *EMpath* and cache its contents.

        Every entry of *fields* other than "None", "reads", "regions" and
        "factor" is passed to ``match_ms_tags`` of the matrix. The region
        sets, read files and their names are then stored on the instance.
        """
        self.title = title

        # Read the Experimental Matrix
        matrix = ExperimentalMatrix()
        self.exps = matrix
        matrix.read(EMpath)

        skipped_fields = ("None", "reads", "regions", "factor")
        for field in fields:
            if field in skipped_fields:
                continue
            matrix.match_ms_tags(field)
        matrix.remove_name()

        self.beds = matrix.get_regionsets()  # A list of GenomicRegionSets
        self.bednames = matrix.get_regionsnames()
        self.reads = matrix.get_readsfiles()
        self.readsnames = matrix.get_readsnames()
        self.fieldsDict = matrix.fieldsDict
        self.parameter = []
        self.df = df

    def combine_allregions(self):
        """Pool every region set of ``self.beds`` into ``self.all_bed`` and drop duplicates."""
        pooled = GenomicRegionSet("All regions")
        self.all_bed = pooled
        for region_set in self.beds:
            pooled.combine(region_set)
        # The original author notes that all_bed is sorted after this call
        pooled.remove_duplicates()

    def bedCoverage(self):
        """ Return coverage matrix of multiple reads on one bed.
        bed --> GenomicRegionSet
        """
        c = []
        for rp in self.reads:
            print("    processing: ..." + rp[-45:])
            r = os.path.abspath(rp)  # Here change the relative path into absolute path
            cov = CoverageSet(r, self.all_bed)
            cov.coverage_from_genomicset(r)
            cov.normRPM()
            c.append(cov.coverage)
        self.all_table = numpy.transpose(c)

    def quantile_normalization(self):
        """ Return the np.array which contains the normalized values
        """
        rank_matrix = []
        for c in range(self.all_table.shape[1]):
            col = self.all_table[:, c]
            rank_col = mstats.rankdata(col)
            rank_matrix.append(rank_col)

        ranks = numpy.array(rank_matrix)
        trans_rank = numpy.transpose(ranks)

        # Calculate for means of ranks
        print("    Calculating for the mean of ranked data...")
        sort_matrix = numpy.sort(self.all_table, axis=0)
        means = []
        for r in range(self.all_table.shape[0]):
            row = [x for x in sort_matrix[r, :]]
            means.append(numpy.mean(row))

        # Replace the value by new means
        print("    Replacing the data value by normalized mean...")
        normalized_table = numpy.around(trans_rank)
        for i, v in enumerate(means):
            normalized_table[normalized_table == i + 1] = v
        # print(rounded_rank)
        self.norm_table = normalized_table

    def tables_for_plot(self):
        """ Return a Dict which stores all tables for each bed with file name as its key. """
        self.tableDict = OrderedDict()  # Storage all tables for each bed with bedname as the key
        conList = []  # Store containers of beds
        iterList = []

        for i, bed in enumerate(self.beds):
            self.tableDict[bed.name] = []
            bed.sort()
            conList.append(bed.__iter__())
            iterList.append(conList[-1].next())

        for i, r in enumerate(self.all_bed.sequences):
            for j in range(len(self.beds)):
                while r > iterList[j]:
                    try:
                        iterList[j] = conList[j].next()
                    except:
                        break
                if r == iterList[j]:
                    self.tableDict[self.beds[j].name].append(self.norm_table[i])
                elif r < iterList[j]:
                    continue

    def print_plot_table(self, directory, folder):
        for i, bed in enumerate(self.tableDict.keys()):
            # table = []
            # header = ["chrom", "initial", "final"]
            # for rp in self.reads:
            #     header.append(os.path.basename(rp))
            # table.append(header)
            # for j, re in enumerate(self.beds[i]):
            #     table.append([re.chrom, re.initial, re.final] + self.tableDict[bed][j].tolist())
            # output_array(table, directory, folder, filename="table_" + bed + ".txt")
            output_array(self.tableDict[bed], directory, folder, filename="table_" + bed + ".txt")

    def group_tags(self, groupby, sortby, colorby):
        """Generate the tags for the grouping of plot
        Parameters:
            groupby = 'reads','regions','cell',or 'factor'
            colorby = 'reads','regions','cell',or 'factor'
            sortby = 'reads','regions','cell',or 'factor'
        """
        self.tag_type = [groupby, sortby, colorby]

        if groupby == "None":
            self.group_tags = [""]
        else:
            self.group_tags = gen_tags(self.exps, groupby)
        if sortby == "None":
            self.sort_tags = [""]
        else:
            self.sort_tags = gen_tags(self.exps, sortby)
        if colorby == "None":
            self.color_tags = [""]
        else:
            self.color_tags = gen_tags(self.exps, colorby)

    def group_data(self, directory, folder, log=False):
        """Arrange the per-bed, per-reads signal columns by group/sort/color tag.

        Builds ``self.sortDict[group_tag][sort_tag][color_tag]`` from
        ``self.tableDict``. A bed (or reads file) contributes to a tag
        combination when all of its own tags, as returned by ``tag_from_r``,
        are contained in ``{group_tag, sort_tag, color_tag}``.

        * When ``self.df`` is false the entry is the signal column of the
          matching bed/reads pair; a later match overwrites an earlier one.
        * When ``self.df`` is true the matching columns are collected and, as
          soon as two are present, replaced by the list ``first - second``
          (after ``log(x + 1)`` of both when ``log`` is true).

        Tag combinations that no bed matches get no entry at all.
        ``directory`` and ``folder`` are not used by this method.
        """
        plotDict = OrderedDict()  # Signal columns indexed by bed name, then reads name
        cuesbed = OrderedDict()  # Tag set of every bed, used for matching below
        cuesbam = OrderedDict()  # Tag set of every reads file
        for bedname in self.tableDict.keys():
            plotDict[bedname] = OrderedDict()
            mt = numpy.array(self.tableDict[bedname])

            cuesbed[bedname] = set(tag_from_r(self.exps, self.tag_type, bedname))

            for i, readname in enumerate(self.readsnames):
                # Column i of the bed's table is paired with the i-th reads name
                plotDict[bedname][readname] = mt[:, i]
                cuesbam[readname] = set(tag_from_r(self.exps, self.tag_type, readname))

        sortDict = OrderedDict()  # Storing the data by sorting tags
        for g in self.group_tags:
            sortDict[g] = OrderedDict()
            for a in self.sort_tags:
                sortDict[g][a] = OrderedDict()
                for c in self.color_tags:
                    for i, bed in enumerate(cuesbed.keys()):
                        # The bed matches when all of its tags are among g, a, c
                        if set([g, a, c]) >= cuesbed[bed]:
                            # Every matching bed starts again from an empty entry
                            sortDict[g][a][c] = []
                            for bam in cuesbam.keys():
                                if set([g, a, c]) >= cuesbam[bam]:
                                    if self.df:
                                        sortDict[g][a][c].append(plotDict[bed][bam])
                                        # With two columns collected, keep their difference
                                        if len(sortDict[g][a][c]) == 2:
                                            if log:
                                                sortDict[g][a][c][0] = numpy.log(sortDict[g][a][c][0]+1)
                                                sortDict[g][a][c][1] = numpy.log(sortDict[g][a][c][1]+1)
                                                sortDict[g][a][c] = numpy.subtract(sortDict[g][a][c][0],
                                                                                   sortDict[g][a][c][1]).tolist()
                                            else:
                                                sortDict[g][a][c] = numpy.subtract(sortDict[g][a][c][0],
                                                                                   sortDict[g][a][c][1]).tolist()
                                    else:
                                        sortDict[g][a][c] = plotDict[bed][bam]
        self.sortDict = sortDict

    def color_map(self, colorby, definedinEM):
        """Store in ``self.colors`` the colours returned by ``colormap`` for *colorby*."""
        palette = colormap(self.exps, colorby, definedinEM)
        self.colors = palette

    def print_table(self, directory, folder):
        """Write every group/sort/color combination with its signals to ``output_table.txt``.

        Each row holds the three tags followed by the string form of every
        value stored in ``self.sortDict`` for that combination.
        """
        self.printtable = OrderedDict()
        rows = [["#group_tag", "sort_tag", "color_tag", "Signals"]]
        for group_tag in self.group_tags:
            by_sort = self.sortDict[group_tag]
            for sort_tag in self.sort_tags:
                by_color = by_sort[sort_tag]
                for color_tag in self.color_tags:
                    signals = [str(value) for value in by_color[color_tag]]
                    rows.append([group_tag, sort_tag, color_tag] + signals)
        output_array(rows, directory, folder, filename="output_table.txt")

    def plot(self, title, scol, logT=False, ylim=False, pw=3, ph=4):
        """Draw one boxplot panel per group tag and store the figure in ``self.fig``.

        Parameters:
            title: window title given to the figure canvas
            scol: passed to ``plt.subplots`` as ``sharey``; when true the
                minor ticks of every panel are switched off
            logT: use a logarithmic y axis (only when ``self.df`` is false)
                and mention "(log)" in the y label
            ylim: when set, the y axis spans ``[-ylim, ylim]``
            pw, ph: width and height of one panel; ``pw`` also scales the
                font sizes

        Reads ``self.sortDict``, ``self.group_tags``, ``self.color_tags``,
        ``self.colors`` and ``self.df``. When ``self.df`` is false, 1 is
        added to every value before plotting.
        """
        self.xtickrotation, self.xtickalign = 0, "center"
        if len(self.group_tags) < 2:
            ticklabelsize = pw * 1.5
        else:
            ticklabelsize = pw * 6
        tw = len(self.group_tags) * pw
        th = ph

        f, axarr = plt.subplots(1, len(self.group_tags), dpi=300, sharey=scol,
                                figsize=(tw, th))

        canvas = FigureCanvas(f)
        canvas.set_window_title(title)
        try:
            axarr = axarr.reshape(-1)
        except AttributeError:
            # Not an array of axes (single panel): wrap it so indexing works
            axarr = [axarr]

        ylabel = "Read number difference" if self.df else "Read number"
        if logT:
            ylabel += " (log)"
        axarr[0].set_ylabel(ylabel, fontsize=ticklabelsize + 1)

        # Stays empty when there is no group, so the legend call below
        # still has something to work with.
        legends = []

        for i, g in enumerate(self.sortDict.keys()):
            axarr[i].set_title(g, y=1.02, fontsize=ticklabelsize + 2)

            if logT and not self.df:
                axarr[i].set_yscale('log')
            else:
                axarr[i].locator_params(axis='y', nbins=4)

            axarr[i].tick_params(axis='y', direction='out')
            axarr[i].yaxis.tick_left()
            axarr[i].yaxis.grid(True, linestyle='-', which='major', color='lightgrey', alpha=0.7, zorder=1)
            if ylim:
                axarr[i].set_ylim([-ylim, ylim])
            d = []  # Store data within group
            color_t = []  # Store tag for coloring boxes
            x_ticklabels = []  # Store ticklabels
            for a in self.sortDict[g].keys():
                # Any sort tag switches the x labels to rotated, right-aligned
                self.xtickrotation = 70
                self.xtickalign = "right"
                for k, c in enumerate(self.sortDict[g][a].keys()):
                    values = self.sortDict[g][a][c]
                    # Identity test: "== None" on a numpy array compares
                    # element-wise and cannot be used as a condition.
                    if values is None:  # When there is no matching data, skip it
                        continue
                    if self.df:
                        d.append(values)
                    else:
                        d.append([x + 1 for x in values])
                    color_t.append(self.colors[k])
                    x_ticklabels.append(a)

            # Fine tuning boxplot
            bp = axarr[i].boxplot(d, notch=False, sym='o', vert=True, whis=1.5, positions=None,
                                  widths=None, patch_artist=True, bootstrap=None)
            z = 10  # zorder for boxplot
            plt.setp(bp['whiskers'], color='black', linestyle='-', linewidth=0.8, zorder=z)
            plt.setp(bp['fliers'], markerfacecolor='gray', color='white', alpha=0.3, markersize=1.8, zorder=z)
            plt.setp(bp['caps'], color='white', zorder=z)
            plt.setp(bp['medians'], color='black', linewidth=1.5, zorder=z + 1)
            legends = []
            for patch, color in zip(bp['boxes'], color_t):
                patch.set_facecolor(color)  # With missing data the boxes and colors no longer line up
                patch.set_edgecolor("none")
                patch.set_zorder(z)
                legends.append(patch)

            # Fine tuning subplot: one tick per sort tag, centred on its boxes
            axarr[i].set_xticks([len(self.color_tags) * n + 1 + (len(self.color_tags) - 1) / 2
                                 for n in range(len(self.sortDict[g]))])
            axarr[i].set_xticklabels(list(self.sortDict[g].keys()), rotation=self.xtickrotation,
                                     ha=self.xtickalign)

            for spine in ['top', 'right', 'left', 'bottom']:
                axarr[i].spines[spine].set_visible(False)
            # Booleans instead of the 'on'/'off' strings, which newer
            # matplotlib no longer interprets as switches.
            axarr[i].tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=True)
            axarr[i].tick_params(labelsize=ticklabelsize + 1)
            if scol:
                axarr[i].minorticks_off()
            else:
                plt.setp(axarr[i].get_yticklabels(), visible=True)
                axarr[i].tick_params(axis='y', which='both', left=True, right=False, labelbottom=True)

        # The legend uses the boxes of the last panel
        axarr[-1].legend(legends[0:len(self.color_tags)], self.color_tags, loc='center left', handlelength=1,
                         handletextpad=1, columnspacing=2, borderaxespad=0., prop={'size': ticklabelsize + 1},
                         bbox_to_anchor=(1.05, 0.5))
        self.fig = f

    def gen_html(self, directory, title, align=50):
        """Write the HTML report pages of the boxplot.

        Two pages are written into ``directory/title``:

        * ``index.html`` -- the figure ``boxplot.png`` followed, for every
          group in ``self.sortDict``, by a table of pairwise p-values
          (``mannwhitneyu``) between all sort/color combinations.
        * ``parameters.html`` -- fixed notes plus links to ``parameters.txt``
          and ``experimental_matrix.txt``.

        Parameters:
            directory: parent output directory
            title: page heading and name of the sub-folder the pages go to
            align: alignment value forwarded to ``Html.add_zebra_table``
                   (the p-value tables use ``align + 50``)
        """
        html_header = title
        link_d = OrderedDict()
        link_d["Boxplot"] = "index.html"
        link_d["Parameters"] = "parameters.html"

        html = Html(name=html_header, links_dict=link_d,
                    fig_rpath="../style", RGT_header=False, other_logo="viz", homepage="../index.html")

        html.add_figure("boxplot.png", align="center")

        type_list = 'ssssssssssssssssssssssssssssssssssssssssssssss'

        #### Calculate p value ####
        # plist[group][sort1 + color1][sort2 + color2] -> p-value
        plist = {}
        for g in self.sortDict.keys():
            plist[g] = {}
            for s1 in self.sortDict[g].keys():
                for c1 in self.sortDict[g][s1].keys():
                    data1 = self.sortDict[g][s1][c1]
                    plist[g][s1 + c1] = {}
                    for s2 in self.sortDict[g].keys():
                        for c2 in self.sortDict[g][s2].keys():
                            if s2 == s1 and c2 == c1:
                                continue  # never test a data set against itself
                            data2 = self.sortDict[g][s2][c2]
                            _, p_value = mannwhitneyu(data1, data2)
                            plist[g][s1 + c1][s2 + c2] = p_value

        print("Multiple test correction.")
        # The return value is not used; presumably plist is corrected in place
        # -- verify against multiple_correction.
        multiple_correction(plist)

        for g in self.sortDict.keys():
            html.add_heading(g, size=4, bold=False)
            data_table = []
            col_size_list = [15]
            header_list = ["p-value"]
            for s in self.sortDict[g].keys():
                # Index with the current sort tag `s`; the previous code used
                # `s1`, a stale variable left over from the loop above.
                for c in self.sortDict[g][s].keys():
                    header_list.append(s + "\n" + c)
                    col_size_list.append(15)

            for s1 in self.sortDict[g].keys():
                for c1 in self.sortDict[g][s1].keys():
                    row = [s1 + "\n" + c1]
                    for s2 in self.sortDict[g].keys():
                        for c2 in self.sortDict[g][s2].keys():
                            if s2 == s1 and c2 == c1:
                                row.append("-")
                            else:
                                p = plist[g][s1 + c1][s2 + c2]
                                if p > 0.05:
                                    row.append(value2str(p))
                                else:
                                    # p-values not above 0.05 are highlighted in red
                                    row.append("<font color=\"red\">" + value2str(p) + "</font>")
                    data_table.append(row)

            html.add_zebra_table(header_list, col_size_list, type_list, data_table, align=align + 50)

        html.write(os.path.join(directory, title, "index.html"))

        ## Parameters
        html = Html(name=html_header, links_dict=link_d,
                    fig_rpath="../style", RGT_header=False, other_logo="viz", homepage="../index.html")

        header_list = ["Assumptions and hypothesis"]
        col_size_list = [50]
        data_table = [['All the regions among different BED files are normalized by quantile normalization.'],
                      [
                          'If there is any grouping problem, please check all the optional columns in input experimental matrix.']]
        html.add_zebra_table(header_list, col_size_list, type_list, data_table, align=align, cell_align="left")

        html.add_free_content(['<a href="parameters.txt" style="margin-left:100">See parameters</a>'])
        html.add_free_content(['<a href="experimental_matrix.txt" style="margin-left:100">See experimental matrix</a>'])
        html.write(os.path.join(directory, title, "parameters.html"))

示例#10

显示文件

文件： boxplot.py 项目： rafalcode/reg-gen

class Boxplot:
    """
    input:
        exps: input experimental matrix
        title: Default = boxplot
        groupby: Group the data by the given factor in the header of experimental matrix

    output:
        parameters: list of records
        figs: a list of figure(s)
    """
    def __init__(self, EMpath, fields, title="boxplot", df=False):
        """Read the experimental matrix and cache what it describes.

        Parameters:
            EMpath: path handed to ``ExperimentalMatrix.read``
            fields: field names; each one other than "None", "reads",
                    "regions" and "factor" is passed to ``match_ms_tags``
            title: stored as ``self.title``
            df: stored as ``self.df``
        """
        self.title = title

        # Load the experimental matrix
        matrix = ExperimentalMatrix()
        self.exps = matrix
        matrix.read(EMpath)
        reserved = ("None", "reads", "regions", "factor")
        for field in fields:
            if field in reserved:
                continue
            matrix.match_ms_tags(field)
        matrix.remove_name()

        # Cache the contents of the matrix
        self.beds = matrix.get_regionsets()  # A list of GenomicRegionSets
        self.bednames = matrix.get_regionsnames()
        self.reads = matrix.get_readsfiles()
        self.readsnames = matrix.get_readsnames()
        self.fieldsDict = matrix.fieldsDict

        self.parameter = []
        self.df = df

    def combine_allregions(self):
        """Pool every BED set of ``self.beds`` into ``self.all_bed`` and drop duplicates."""
        pooled = GenomicRegionSet("All regions")
        self.all_bed = pooled
        for region_set in self.beds:
            pooled.combine(region_set)
        # Original author's note: all_bed is sorted after this call.
        pooled.remove_duplicates()

    def bedCoverage(self):
        """Compute the coverage of every reads file over ``self.all_bed``.

        One coverage vector is collected per reads file (RPM-normalized via
        ``normRPM``); the transposed collection is stored in
        ``self.all_table``. Nothing is returned.
        """
        per_reads = []
        for reads_path in self.reads:
            print("    processing: ..." + reads_path[-45:])
            # CoverageSet is given the absolute path rather than the relative one
            absolute = os.path.abspath(reads_path)
            coverage_set = CoverageSet(absolute, self.all_bed)
            coverage_set.coverage_from_genomicset(absolute)
            coverage_set.normRPM()
            per_reads.append(coverage_set.coverage)
        self.all_table = numpy.transpose(per_reads)

    def quantile_normalization(self):
        """ Return the np.array which contains the normalized values
        """
        rank_matrix = []
        for c in range(self.all_table.shape[1]):
            col = self.all_table[:, c]
            rank_col = mstats.rankdata(col)
            rank_matrix.append(rank_col)

        ranks = numpy.array(rank_matrix)
        trans_rank = numpy.transpose(ranks)

        # Calculate for means of ranks
        print("    Calculating for the mean of ranked data...")
        sort_matrix = numpy.sort(self.all_table, axis=0)
        means = []
        for r in range(self.all_table.shape[0]):
            row = [x for x in sort_matrix[r, :]]
            means.append(numpy.mean(row))

        # Replace the value by new means
        print("    Replacing the data value by normalized mean...")
        normalized_table = numpy.around(trans_rank)
        for i, v in enumerate(means):
            normalized_table[normalized_table == i + 1] = v
        # print(rounded_rank)
        self.norm_table = normalized_table

    def tables_for_plot(self):
        """ Return a Dict which stores all tables for each bed with file name as its key. """
        self.tableDict = OrderedDict(
        )  # Storage all tables for each bed with bedname as the key
        conList = []  # Store containers of beds
        iterList = []

        for i, bed in enumerate(self.beds):
            self.tableDict[bed.name] = []
            bed.sort()
            conList.append(bed.__iter__())
            iterList.append(conList[-1].next())

        for i, r in enumerate(self.all_bed.sequences):
            for j in range(len(self.beds)):
                while r > iterList[j]:
                    try:
                        iterList[j] = conList[j].next()
                    except:
                        break
                if r == iterList[j]:
                    self.tableDict[self.beds[j].name].append(
                        self.norm_table[i])
                elif r < iterList[j]:
                    continue

    def print_plot_table(self, directory, folder):
        """Hand every table of ``self.tableDict`` to ``output_array``.

        The file name used for a BED set is ``table_<bedname>.txt``.
        """
        for bed_name, rows in self.tableDict.items():
            target = "table_" + bed_name + ".txt"
            output_array(rows, directory, folder, filename=target)

    def group_tags(self, groupby, sortby, colorby):
        """Generate the tags for the grouping of plot
        Parameters:
            groupby = 'reads','regions','cell',or 'factor'
            colorby = 'reads','regions','cell',or 'factor'
            sortby = 'reads','regions','cell',or 'factor'
        """
        self.tag_type = [groupby, sortby, colorby]

        if groupby == "None":
            self.group_tags = [""]
        else:
            self.group_tags = gen_tags(self.exps, groupby)
        if sortby == "None":
            self.sort_tags = [""]
        else:
            self.sort_tags = gen_tags(self.exps, sortby)
        if colorby == "None":
            self.color_tags = [""]
        else:
            self.color_tags = gen_tags(self.exps, colorby)

    def group_data(self, directory, folder, log=False):
        plotDict = OrderedDict(
        )  # Extracting the data from different bed_bams file
        cuesbed = OrderedDict()  # Storing the cues for back tracking
        cuesbam = OrderedDict()
        for bedname in self.tableDict.keys():
            plotDict[bedname] = OrderedDict()
            mt = numpy.array(self.tableDict[bedname])

            cuesbed[bedname] = set(
                tag_from_r(self.exps, self.tag_type, bedname))
            # cuesbed[bedname] = [tag for tag in self.exps.get_types(bedname) if tag in self.group_tags + self.sort_tags + self.color_tags]

            for i, readname in enumerate(self.readsnames):
                plotDict[bedname][readname] = mt[:, i]
                # print(plotDict[bedname][readname])
                cuesbam[readname] = set(
                    tag_from_r(self.exps, self.tag_type, readname))
                # cuesbam[readname] = [tag for tag in self.exps.get_types(readname) if tag in self.group_tags + self.sort_tags + self.color_tags]

        sortDict = OrderedDict()  # Storing the data by sorting tags
        for g in self.group_tags:
            # print("    "+g)
            sortDict[g] = OrderedDict()
            for a in self.sort_tags:
                # print("        "+a)
                sortDict[g][a] = OrderedDict()
                for c in self.color_tags:
                    # sortDict[g][a][c] = None
                    # print("            "+c)
                    for i, bed in enumerate(cuesbed.keys()):
                        if set([g, a, c]) >= cuesbed[bed]:
                            sortDict[g][a][c] = []
                            for bam in cuesbam.keys():
                                if set([g, a, c]) >= cuesbam[bam]:
                                    if self.df:
                                        sortDict[g][a][c].append(
                                            plotDict[bed][bam])
                                        if len(sortDict[g][a][c]) == 2:
                                            if log:
                                                sortDict[g][a][c][
                                                    0] = numpy.log(
                                                        sortDict[g][a][c][0] +
                                                        1)
                                                sortDict[g][a][c][
                                                    1] = numpy.log(
                                                        sortDict[g][a][c][1] +
                                                        1)
                                                sortDict[g][a][
                                                    c] = numpy.subtract(
                                                        sortDict[g][a][c][0],
                                                        sortDict[g][a][c]
                                                        [1]).tolist()
                                            else:
                                                sortDict[g][a][
                                                    c] = numpy.subtract(
                                                        sortDict[g][a][c][0],
                                                        sortDict[g][a][c]
                                                        [1]).tolist()
                                    else:
                                        sortDict[g][a][c] = plotDict[bed][bam]
        self.sortDict = sortDict

    def color_map(self, colorby, definedinEM):
        """Store the result of ``colormap`` for this experiment in ``self.colors``."""
        palette = colormap(self.exps, colorby, definedinEM)
        self.colors = palette

    def print_table(self, directory, folder):
        """Write the grouped signals to ``output_table.txt`` via ``output_array``.

        One row is produced per group/sort/color combination: the three tags
        followed by the stringified values of ``self.sortDict[g][a][c]``.
        ``self.printtable`` is reset to an empty OrderedDict.
        """
        self.printtable = OrderedDict()
        header = ["#group_tag", "sort_tag", "color_tag", "Signals"]
        rows = [
            [g, a, c] + [str(x) for x in self.sortDict[g][a][c]]
            for g in self.group_tags
            for a in self.sort_tags
            for c in self.color_tags
        ]
        output_array([header] + rows,
                     directory,
                     folder,
                     filename="output_table.txt")

    def plot(self, title, scol, logT=False, ylim=False, pw=3, ph=4):
        """Draw one boxplot panel per group and keep the figure in ``self.fig``.

        Parameters:
            title: window title given to the figure canvas
            scol: forwarded to ``plt.subplots`` as ``sharey``; when truthy the
                  minor ticks are switched off, otherwise the y tick labels
                  are forced visible
            logT: label the y axis as log and, unless ``self.df`` is set,
                  use a logarithmic y scale
            ylim: when truthy, the y limits are set to ``[-ylim, ylim]``
            pw: width per panel; also scales the label font size
            ph: height of the figure

        Nothing is returned.
        """
        self.xtickrotation, self.xtickalign = 0, "center"
        if len(self.group_tags) < 2:
            ticklabelsize = pw * 1.5
        else:
            ticklabelsize = pw * 6
        tw = len(self.group_tags) * pw
        th = ph

        f, axarr = plt.subplots(1,
                                len(self.group_tags),
                                dpi=300,
                                sharey=scol,
                                figsize=(tw, th))

        canvas = FigureCanvas(f)
        # NOTE(review): newer matplotlib releases reportedly no longer offer
        # set_window_title on the canvas; guard so plotting does not fail.
        if hasattr(canvas, "set_window_title"):
            canvas.set_window_title(title)
        try:
            axarr = axarr.reshape(-1)
        except AttributeError:
            # A single panel comes back as one Axes object, not an array
            axarr = [axarr]

        ylabel = "Read number difference" if self.df else "Read number"
        if logT:
            ylabel += " (log)"
        axarr[0].set_ylabel(ylabel, fontsize=ticklabelsize + 1)

        n_colors = len(self.color_tags)
        legends = []
        for i, g in enumerate(self.sortDict.keys()):
            ax = axarr[i]
            ax.set_title(g, y=1.02, fontsize=ticklabelsize + 2)

            if logT and not self.df:
                ax.set_yscale('log')
            else:
                ax.locator_params(axis='y', nbins=4)

            ax.tick_params(axis='y', direction='out')
            ax.yaxis.tick_left()
            ax.yaxis.grid(True,
                          linestyle='-',
                          which='major',
                          color='lightgrey',
                          alpha=0.7,
                          zorder=1)
            if ylim:
                ax.set_ylim([-ylim, ylim])
            d = []  # Store data within group
            color_t = []  # Store tag for coloring boxes
            for a in self.sortDict[g].keys():
                self.xtickrotation = 70
                self.xtickalign = "right"
                for k, c in enumerate(self.sortDict[g][a].keys()):
                    values = self.sortDict[g][a][c]
                    # `is None`, not `== None`: the entry may be a numpy
                    # array, for which `==` compares element-wise.
                    if values is None:  # When there is no matching data, skip it
                        continue
                    if self.df:
                        d.append(values)
                    else:
                        d.append([x + 1 for x in values])
                    color_t.append(self.colors[k])

            # Fine tuning boxplot
            bp = ax.boxplot(d,
                            notch=False,
                            sym='o',
                            vert=True,
                            whis=1.5,
                            positions=None,
                            widths=None,
                            patch_artist=True,
                            bootstrap=None)
            z = 10  # zorder for boxplot
            plt.setp(bp['whiskers'],
                     color='black',
                     linestyle='-',
                     linewidth=0.8,
                     zorder=z)
            plt.setp(bp['fliers'],
                     markerfacecolor='gray',
                     color='white',
                     alpha=0.3,
                     markersize=1.8,
                     zorder=z)
            plt.setp(bp['caps'], color='white', zorder=z)
            plt.setp(bp['medians'], color='black', linewidth=1.5, zorder=z + 1)
            legends = []
            # zip() stops at the shorter sequence, so boxes without a
            # recorded color keep their default face.
            for patch, color in zip(bp['boxes'], color_t):
                patch.set_facecolor(color)
                patch.set_edgecolor("none")
                patch.set_zorder(z)
                legends.append(patch)

            # Fine tuning subplot: one tick per sort tag, placed in the
            # middle of that tag's run of colored boxes
            ax.set_xticks([
                n_colors * n + 1 + (n_colors - 1) / 2
                for n in range(len(self.sortDict[g]))
            ])
            # Materialize the key view; a list is accepted on Python 2 and 3
            ax.set_xticklabels(list(self.sortDict[g].keys()),
                               rotation=self.xtickrotation,
                               ha=self.xtickalign)

            for spine in ['top', 'right', 'left', 'bottom']:
                ax.spines[spine].set_visible(False)
            # Booleans instead of 'on'/'off': the strings are both truthy and
            # are no longer translated by current matplotlib.
            ax.tick_params(axis='x',
                           which='both',
                           bottom=False,
                           top=False,
                           labelbottom=True)
            ax.tick_params(labelsize=ticklabelsize + 1)
            if scol:
                ax.minorticks_off()
            else:
                plt.setp(ax.get_yticklabels(), visible=True)
                ax.tick_params(axis='y',
                               which='both',
                               left=True,
                               right=False,
                               labelbottom=True)

        # The legend is attached to the last panel, using the boxes of the
        # last group drawn
        axarr[-1].legend(legends[0:n_colors],
                         self.color_tags,
                         loc='center left',
                         handlelength=1,
                         handletextpad=1,
                         columnspacing=2,
                         borderaxespad=0.,
                         prop={'size': ticklabelsize + 1},
                         bbox_to_anchor=(1.05, 0.5))
        self.fig = f

    def gen_html(self, directory, title, align=50):
        """Write the HTML report pages of the boxplot.

        Two pages are written into ``directory/title``:

        * ``index.html`` -- the figure ``boxplot.png`` followed, for every
          group in ``self.sortDict``, by a table of pairwise p-values
          (``mannwhitneyu``) between all sort/color combinations.
        * ``parameters.html`` -- fixed notes plus links to ``parameters.txt``
          and ``experimental_matrix.txt``.

        Parameters:
            directory: parent output directory
            title: page heading and name of the sub-folder the pages go to
            align: alignment value forwarded to ``Html.add_zebra_table``
                   (the p-value tables use ``align + 50``)
        """
        html_header = title
        link_d = OrderedDict()
        link_d["Boxplot"] = "index.html"
        link_d["Parameters"] = "parameters.html"

        html = Html(name=html_header,
                    links_dict=link_d,
                    fig_rpath="../style",
                    RGT_header=False,
                    other_logo="viz",
                    homepage="../index.html")

        html.add_figure("boxplot.png", align="center")

        type_list = 'ssssssssssssssssssssssssssssssssssssssssssssss'

        #### Calculate p value ####
        # plist[group][sort1 + color1][sort2 + color2] -> p-value
        plist = {}
        for g in self.sortDict.keys():
            plist[g] = {}
            for s1 in self.sortDict[g].keys():
                for c1 in self.sortDict[g][s1].keys():
                    data1 = self.sortDict[g][s1][c1]
                    plist[g][s1 + c1] = {}
                    for s2 in self.sortDict[g].keys():
                        for c2 in self.sortDict[g][s2].keys():
                            if s2 == s1 and c2 == c1:
                                continue  # never test a data set against itself
                            data2 = self.sortDict[g][s2][c2]
                            _, p_value = mannwhitneyu(data1, data2)
                            plist[g][s1 + c1][s2 + c2] = p_value

        print("Multiple test correction.")
        # The return value is not used; presumably plist is corrected in place
        # -- verify against multiple_correction.
        multiple_correction(plist)

        for g in self.sortDict.keys():
            html.add_heading(g, size=4, bold=False)
            data_table = []
            col_size_list = [15]
            header_list = ["p-value"]
            for s in self.sortDict[g].keys():
                # Index with the current sort tag `s`; the previous code used
                # `s1`, a stale variable left over from the loop above.
                for c in self.sortDict[g][s].keys():
                    header_list.append(s + "\n" + c)
                    col_size_list.append(15)

            for s1 in self.sortDict[g].keys():
                for c1 in self.sortDict[g][s1].keys():
                    row = [s1 + "\n" + c1]
                    for s2 in self.sortDict[g].keys():
                        for c2 in self.sortDict[g][s2].keys():
                            if s2 == s1 and c2 == c1:
                                row.append("-")
                            else:
                                p = plist[g][s1 + c1][s2 + c2]
                                if p > 0.05:
                                    row.append(value2str(p))
                                else:
                                    # p-values not above 0.05 are highlighted in red
                                    row.append("<font color=\"red\">" +
                                               value2str(p) + "</font>")
                    data_table.append(row)

            html.add_zebra_table(header_list,
                                 col_size_list,
                                 type_list,
                                 data_table,
                                 align=align + 50)

        html.write(os.path.join(directory, title, "index.html"))

        ## Parameters
        html = Html(name=html_header,
                    links_dict=link_d,
                    fig_rpath="../style",
                    RGT_header=False,
                    other_logo="viz",
                    homepage="../index.html")

        header_list = ["Assumptions and hypothesis"]
        col_size_list = [50]
        data_table = [
            [
                'All the regions among different BED files are normalized by quantile normalization.'
            ],
            [
                'If there is any grouping problem, please check all the optional columns in input experimental matrix.'
            ]
        ]
        html.add_zebra_table(header_list,
                             col_size_list,
                             type_list,
                             data_table,
                             align=align,
                             cell_align="left")

        html.add_free_content([
            '<a href="parameters.txt" style="margin-left:100">See parameters</a>'
        ])
        html.add_free_content([
            '<a href="experimental_matrix.txt" style="margin-left:100">See experimental matrix</a>'
        ])
        html.write(os.path.join(directory, title, "parameters.html"))