Пример #1
0
def initialize_genome(genome_name):
    """Ensure the UCSC ``<genome_name>.2bit`` file is present locally.

    The file is looked up inside ``determine_path('genomes')``. An existing
    file is kept only if it can be opened with ``Genome_2bit`` and
    ``check_md5sum`` accepts it; in every other case the file is downloaded
    (again) from the UCSC goldenPath server.

    Args:
        genome_name: UCSC assembly name (for example ``hg19``); used to
            build both the local file names and the download URL.

    Raises:
        SystemExit: exit status 1 when the download fails with an
            ``IOError``.
    """
    from bioutilities import Genome_2bit

    # ``urlretrieve`` moved to ``urllib.request`` in Python 3.
    try:
        from urllib import urlretrieve
    except ImportError:
        from urllib.request import urlretrieve

    info('Initializing Genome:%s' % genome_name)
    genome_directory = determine_path('genomes')
    info('genome_directory: %s' % genome_directory)
    genome_filename = os.path.join(genome_directory, "%s.2bit" % genome_name)
    # NOTE(review): the two paths below are not used in the visible part of
    # this function; they are kept in case later code relies on them.
    chr_len_filename = os.path.join(genome_directory,
                                    "%s_chr_lengths.txt" % genome_name)
    meme_bg_filename = os.path.join(genome_directory,
                                    "%s_meme_bg" % genome_name)

    download_genome = True
    if os.path.exists(genome_filename):
        try:
            # Presumably raises when the file is truncated/corrupt, which
            # sends us to the download path below -- TODO confirm.
            Genome_2bit(genome_filename, verbose=True)

            if check_md5sum(genome_filename, genome_name):
                download_genome = False
                info('File %s exists. Skipping genome download' %
                     genome_filename)
        except Exception:
            # ``Exception`` (not a bare ``except``) so that Ctrl-C and
            # ``sys.exit`` are not silently turned into a download.
            error("Unable to check MD5 sum. Downloading genome.")

    if download_genome:
        info(
            'Sorry I need the genome file to perform the analysis. Downloading...'
        )
        urlpath = "http://hgdownload.cse.ucsc.edu/goldenPath/%s/bigZips/%s.2bit" % (
            genome_name, genome_name)
        info('Downloading %s in %s...' % (urlpath, genome_filename))
        try:
            with TqdmUpTo(unit='B',
                          unit_scale=True,
                          mininterval=30,
                          miniters=1,
                          desc=urlpath.split('/')[-1]) as t:
                urlretrieve(urlpath,
                            filename=genome_filename,
                            reporthook=t.update_to,
                            data=None)

            info('Downloaded %s in %s:' % (urlpath, genome_filename))
        except IOError as e:
            error("Can't retrieve %r to %r: %s" %
                  (urlpath, genome_filename, e))
            info(
                'Sorry I need the genome file to perform the analysis. Exiting...'
            )
            sys.exit(1)
Пример #2
0
                urllib.urlretrieve(urlpath,
                                   filename=genome_filename,
                                   reporthook=t.update_to,
                                   data=None)

            info('Downloaded %s in %s:' % (urlpath, genome_filename))
        except IOError, e:
            error("Can't retrieve %r to %r: %s" %
                  (urlpath, genome_filename, e))
            info(
                'Sorry I need the genome file to perform the analysis. Exiting...'
            )
            sys.exit(1)

    check_file(genome_filename)
    genome = Genome_2bit(genome_filename, verbose=True)

    if not os.path.exists(chr_len_filename):
        info('Extracting chromosome lengths')
        genome.write_chr_len(chr_len_filename)
        info('Done!')
    else:
        info('File %s exists, skipping generation' % chr_len_filename)

    if not os.path.exists(meme_bg_filename):
        info('Calculating nucleotide frequencies....')
        genome.write_meme_background(meme_bg_filename)
        info('Done!')
    else:
        info('File %s exists, skipping generation' % meme_bg_filename)
Пример #3
0
def main():
    """Command-line entry point of the HAYSTACK motif enrichment analysis.

    Steps, in order:

    1. Parse the command line (target bed file, genome name and options).
    2. Load the genome 2bit file, offering to download it when missing.
    3. Load the target coordinates, re-centre them on a window of
       ``internal_window_length`` bp and optionally subsample/bootstrap.
    4. Build the background set: random genomic regions or a user supplied
       bed file, optionally matched on C+G content.
    5. Scan target and background with ``parallel_fimo_scanning``.
    6. Compute per-motif support, Fisher p-value, target/background ratio
       and central enrichment, then write the HTML report, logos, profiles,
       region files and (optionally) the intermediate data dump.

    The function always terminates the interpreter through ``sys.exit``:
    status 0 on success, 1 when a required input is missing.
    """
    print('\n[H A Y S T A C K   M O T I F S]')
    print(
        '\n-MOTIF ENRICHMENT ANALYSIS- [Luca Pinello - [email protected]]\n'
    )
    print('Version %s\n' % HAYSTACK_VERSION)

    ngram_correction = 'g'

    #mandatory
    parser = argparse.ArgumentParser(description='HAYSTACK Parameters')
    parser.add_argument(
        'bed_target_filename',
        type=str,
        help=
        'A bed file containing the target coordinates on the genome of reference'
    )
    parser.add_argument(
        'genome_name',
        type=str,
        help='Genome assembly to use from UCSC (for example hg19, mm9, etc.)')

    #optional
    parser.add_argument(
        '--bed_bg_filename',
        type=str,
        help=
        "A bed file containing the backround coordinates on the genome of reference (default random sampled regions from the genome)",
        default='random_background')
    parser.add_argument(
        '--meme_motifs_filename',
        type=str,
        help='Motifs database in MEME format (default JASPAR CORE 2016)')
    parser.add_argument(
        '--nucleotide_bg_filename',
        type=str,
        help=
        'Nucleotide probability for the background in MEME format (default precomupted on the Genome)'
    )
    parser.add_argument(
        '--p_value',
        type=float,
        help='FIMO p-value for calling a motif hit significant (deafult: 1e-4)',
        default=1e-4)
    parser.add_argument(
        '--no_c_g_correction',
        help='Disable the matching of the C+G density of the background',
        action='store_true')
    parser.add_argument(
        '--c_g_bins',
        type=int,
        help='Number of bins for the C+G density correction (default: 8)',
        default=8)
    parser.add_argument('--mask_repetitive',
                        help='Mask repetitive sequences',
                        action='store_true')
    parser.add_argument(
        '--n_target_coordinates',
        type=int,
        help='Number of target coordinates to use (default: all)',
        default=np.inf)
    parser.add_argument(
        '--use_entire_bg',
        help=
        'Use the entire background file (use only when the cg correction is disabled)',
        action='store_true')
    parser.add_argument(
        '--bed_score_column',
        type=int,
        help='Column in the bedfile that represents the score (default: 5)',
        default=5)
    # The help text used to advertise 1.0 while the real default is 2.
    parser.add_argument(
        '--bg_target_ratio',
        type=int,
        help='Background size/Target size ratio (default: 2)',
        default=2)
    parser.add_argument(
        '--bootstrap',
        help=
        'Enable the bootstrap if the target set or the background set are too small, choices: True, False (default: False)',
        action='store_true')
    parser.add_argument(
        '--temp_directory',
        help='Directory to store temporary files  (default: /tmp)',
        default='/tmp')
    parser.add_argument(
        '--no_random_sampling_target',
        help=
        'Select the best --n_target_coordinates using the score column from the target file instead of randomly select them',
        action='store_true')
    parser.add_argument('--name',
                        help='Define a custom output filename for the report',
                        default='')
    parser.add_argument(
        '--internal_window_length',
        type=int,
        help=
        'Window length in bp for the enrichment (default: average lenght of the target sequences)'
    )
    parser.add_argument(
        '--window_length',
        type=int,
        help=
        'Window length in bp for the profiler (default:internal_window_length*5)'
    )
    parser.add_argument(
        '--min_central_enrichment',
        type=float,
        help='Minimum central enrichment to report a motif (default:>1.0)',
        default=1.0)
    parser.add_argument('--disable_ratio',
                        help='Disable target/bg ratio filter',
                        action='store_true')
    parser.add_argument(
        '--dump',
        help=
        'Dump all the intermediate data, choices: True, False (default: False)',
        action='store_true')
    parser.add_argument('--output_directory',
                        type=str,
                        help='Output directory (default: current directory)',
                        default='')
    # The help text used to advertise /4 while the code divides by 5.
    parser.add_argument(
        '--smooth_size',
        type=int,
        help=
        'Size in bp for the smoothing window (default: internal_window_length/5)'
    )
    parser.add_argument(
        '--gene_annotations_filename',
        type=str,
        help=
        'Optional gene annotations file from the UCSC Genome Browser in bed format to map each region to its closes gene'
    )
    parser.add_argument(
        '--gene_ids_to_names_filename',
        type=str,
        help=
        'Optional mapping file between gene ids to gene names (relevant only if --gene_annotation_filename is used)'
    )
    parser.add_argument(
        '--n_processes',
        type=int,
        help=
        'Specify the number of processes to use. The default is #cores available.',
        default=mp.cpu_count())
    parser.add_argument('--version',
                        help='Print version and exit.',
                        action='version',
                        version='Version %s' % HAYSTACK_VERSION)

    args = parser.parse_args()

    # Bind every option to an explicit local instead of exec()-ing
    # "name=repr(value)" strings: exec cannot create function locals when it
    # is a function call, and repr() does not round-trip values like np.inf.
    bed_target_filename = args.bed_target_filename
    genome_name = args.genome_name
    bed_bg_filename = args.bed_bg_filename
    meme_motifs_filename = args.meme_motifs_filename
    nucleotide_bg_filename = args.nucleotide_bg_filename
    p_value = args.p_value
    c_g_bins = args.c_g_bins
    mask_repetitive = args.mask_repetitive
    n_target_coordinates = args.n_target_coordinates
    use_entire_bg = args.use_entire_bg
    bg_target_ratio = args.bg_target_ratio
    bootstrap = args.bootstrap
    temp_directory = args.temp_directory
    name = args.name
    internal_window_length = args.internal_window_length
    window_length = args.window_length
    min_central_enrichment = args.min_central_enrichment
    disable_ratio = args.disable_ratio
    dump = args.dump
    output_directory = args.output_directory
    smooth_size = args.smooth_size
    gene_annotations_filename = args.gene_annotations_filename
    gene_ids_to_names_filename = args.gene_ids_to_names_filename
    n_processes = args.n_processes

    # The command line uses a 1-based column number.
    bed_score_column = args.bed_score_column - 1
    c_g_correction = not args.no_c_g_correction
    random_sampling_target = not args.no_random_sampling_target

    check_file(bed_target_filename)

    if not bed_bg_filename == 'random_background':
        check_file(bed_bg_filename)

    if meme_motifs_filename:
        check_file(meme_motifs_filename)
    else:
        meme_motifs_filename = os.path.join(
            determine_path('motif_databases'),
            'JASPAR_CORE_2016_vertebrates.meme')

    annotation_directory = determine_path('gene_annotations')
    if gene_annotations_filename:

        if which('java') is None:
            error(
                'The mapping to the closest gene requires Java free available from: http://java.com/en/download/'
            )
            use_gene_annotations = False
        else:
            check_file(gene_annotations_filename)
            info('Using %s as gene annotations file' %
                 gene_annotations_filename)
            use_gene_annotations = True
    else:
        # Fall back to the annotation files bundled for this genome, if any.
        gene_annotations_filename = os.path.join(annotation_directory,
                                                 '%s_genes.bed' % genome_name)
        gene_ids_to_names_filename = os.path.join(
            annotation_directory, '%s_genes_id_to_names' % genome_name)

        if os.path.exists(gene_annotations_filename) and os.path.exists(
                gene_ids_to_names_filename):
            use_gene_annotations = True
        else:
            use_gene_annotations = False
            info('No gene annotations file specified')

    target_name = ntpath.basename(bed_target_filename.replace('.bed', ''))
    bg_name = ntpath.basename(bed_bg_filename.replace('.bed', ''))

    if name:
        directory_name = 'HAYSTACK_MOTIFS_on_' + name
    else:
        directory_name = 'HAYSTACK_on_' + target_name + '_VS_' + bg_name

    if output_directory:
        output_directory = os.path.join(output_directory, directory_name)
    else:
        output_directory = directory_name

    info('###PARAMETERS USED###\n\t\t -TARGET: %s \n\t\t -BACKGROUND: %s '
         '\n\t\t -BG_TARGET_RATIO: %s\n\t\t -C+G CORRECTION: %s'
         '\n\t\t -MASKING REPETITIVE: %s\n\t\t -COORDINATES TO ANALYZE: %s'
         '\n\t\t -OUTPUT DIRECTORY: %s\n' %
         (bed_target_filename, bed_bg_filename, str(bg_target_ratio),
          str(c_g_correction), str(mask_repetitive),
          'ALL' if np.isinf(n_target_coordinates) else
          str(n_target_coordinates), output_directory))

    info('Initializing Genome:%s' % genome_name)

    genome_directory = determine_path('genomes')
    genome_2bit = os.path.join(genome_directory, genome_name + '.2bit')

    if os.path.exists(genome_2bit):
        genome = Genome_2bit(genome_2bit)
    else:
        info("\nIt seems you don't have the required genome file.")
        if query_yes_no('Should I download it for you?'):
            # NOTE(review): genome_name is interpolated into a shell command
            # line; it comes straight from the user's own command line.
            sb.call('haystack_download_genome %s' % genome_name,
                    shell=True,
                    env=system_env)
            if os.path.exists(genome_2bit):
                info('Genome correctly downloaded!')
                genome = Genome_2bit(genome_2bit)
            else:
                error(
                    'Sorry I cannot download the required file for you. Check your Internet connection.'
                )
                sys.exit(1)
        else:
            error(
                'Sorry I need the genome file to perform the analysis. Exiting...'
            )
            sys.exit(1)

    if not nucleotide_bg_filename:
        nucleotide_bg_filename = os.path.join(genome_directory,
                                              genome_name + '_meme_bg')

    check_file(nucleotide_bg_filename)

    COMMAND_USED = ' '.join(sys.argv)

    info('Loading Target coordinates from bed:%s' % bed_target_filename)
    target_coords = Coordinate.bed_to_coordinates(bed_target_filename,
                                                  cl_score=bed_score_column)

    if len(target_coords) == 0:
        info('No coordinates to analyze in your input file. Exiting.')
        sys.exit(1)

    # Choose the internal window length (always rounded up to an even
    # number): user supplied, or the average length of the target regions.
    if internal_window_length:
        info('Using the user defined internal window length:%d' %
             internal_window_length)
        if internal_window_length % 2:
            internal_window_length += 1
    else:
        internal_window_length = int(
            np.mean([len(c) for c in target_coords]))
        if internal_window_length % 2:
            internal_window_length += 1
        info(
            'Using the average length of target coordinates as internal window length:%d'
            % internal_window_length)

    # Default the profiler window in BOTH cases; previously this was only
    # done when internal_window_length was not supplied, so passing
    # --internal_window_length alone crashed on '%d' % None just below.
    if not window_length:
        window_length = internal_window_length * 5

    info('Total window length:%d' % window_length)

    if not smooth_size:
        # Floor division keeps the historical integer result.
        smooth_size = internal_window_length // 5

    target_coords = Coordinate.coordinates_of_intervals_around_center(
        target_coords, internal_window_length)

    if len(target_coords) > n_target_coordinates:
        if random_sampling_target:
            info('Sampling %d coordinates among the %d total' %
                 (n_target_coordinates, len(target_coords)))
            target_coords = random.sample(target_coords, n_target_coordinates)
        else:
            info('Selecting the best %d coordinates among the %d total' %
                 (n_target_coordinates, len(target_coords)))
            sorted_idxs_by_score = np.argsort([c.score
                                               for c in target_coords])[::-1]
            target_coords = [
                target_coords[idx]
                for idx in sorted_idxs_by_score[:n_target_coordinates]
            ]
    else:

        if random_sampling_target and bootstrap and not np.isinf(
                n_target_coordinates):
            warn('Number of target regions < %d' % n_target_coordinates)
            info('bootstrapping to obtain enough target regions')
            target_coords = sample_wr(target_coords, n_target_coordinates)
        else:
            info('Using all the %d target coordinates' % len(target_coords))

    info('Extracting Motifs in target coordinates')
    (positive_matrix, motifs_profiles_in_sequences, idxs_seqs_with_motif,
     motif_coords_in_seqs_with_motif, motif_names,
     motif_ids) = parallel_fimo_scanning(
         target_coords,
         meme_motifs_filename,
         genome,
         nucleotide_bg_filename,
         temp_directory=temp_directory,
         p_value=p_value,
         mask_repetitive=mask_repetitive,
         window_length=window_length,
         internal_window_length=internal_window_length,
         num_consumers=n_processes)
    n_target_coordinates = len(target_coords)  #fix for the bootstrap!

    # Only populated when a C+G histogram is actually computed; the
    # bootstrap debug output further down checks for None.
    bins = None
    target_hist = None

    if bed_bg_filename == 'random_background':
        info('Extracting Random Coordinates from the genome...')

        if c_g_correction:
            info('Calculating the C+G content of the target coordinates')
            bg_coords = []
            c_g_content_target = calculate_average_ngram_presence(
                target_coords, genome, ngram_correction)

            info('Extract a Matching C+G Background')
            bins = np.hstack((np.linspace(0, 1, c_g_bins), np.inf))

            for _ in range(bg_target_ratio):
                for idx_c, c in enumerate(target_coords):
                    c_bin = np.nonzero(
                        np.histogram(c_g_content_target[idx_c], bins)[0])[0][0]
                    c_random_bin = -1

                    # Rejection sampling: keep drawing same-length regions on
                    # the same chromosome until one falls in the target's
                    # C+G bin.
                    while c_random_bin != c_bin:
                        random_bpstart = np.random.randint(
                            1, genome.chr_len[c.chr_id] - len(c) + 1)
                        c_random = Coordinate(c.chr_id, random_bpstart,
                                              random_bpstart + len(c) - 1)
                        seq = genome.extract_sequence(c_random)
                        c_g_content_c_random = (seq.count('c') +
                                                seq.count('g')) / float(len(c))
                        c_random_bin = np.nonzero(
                            np.histogram(c_g_content_c_random, bins)[0])[0][0]

                    bg_coords.append(c_random)

            c_g_content_bg = calculate_average_ngram_presence(
                bg_coords, genome, ngram_correction)
            target_hist = np.histogram(c_g_content_target, bins)[0]
            bg_hist = np.histogram(c_g_content_bg, bins)[0]
            debug('original: ' + str(target_hist))
            debug('obtained:' + str(bg_hist))

        else:
            bg_coords = get_random_coordinates(target_coords, genome)

        info('Done!')

    else:
        info('Loading Background Coordinates from:%s' % bed_bg_filename)
        bg_coords = Coordinate.bed_to_coordinates(bed_bg_filename)
        bg_coords = Coordinate.coordinates_of_intervals_around_center(
            bg_coords, internal_window_length)

        if use_entire_bg:
            bg_target_ratio = float(len(bg_coords)) / n_target_coordinates
            info('Using all the coordinates in the BG, BG/TG:%f' %
                 bg_target_ratio)

        if c_g_correction:
            info('Calculating the C+G content')
            c_g_content_target = calculate_average_ngram_presence(
                target_coords, genome, ngram_correction)
            c_g_content_bg = calculate_average_ngram_presence(
                bg_coords, genome, ngram_correction)

            info('Extract a Matching C+G Background')
            bins = np.hstack((np.linspace(0, 1, c_g_bins), np.inf))
            target_hist = np.histogram(c_g_content_target, bins)[0]
            bg_hist = np.histogram(c_g_content_bg, bins)[0]
            ratios = bg_hist / (target_hist * 1.0)
            debug('original:%s' % target_hist)
            debug('bg:%s' % bg_hist)
            debug('ratios:%s' % ratios)
            # Largest usable BG/target multiplier: bounded by the requested
            # ratio and by the scarcest bin that holds more than 5% of the
            # targets.
            K_MATCH = min(
                bg_target_ratio,
                ratios[~np.isnan(ratios) & ~np.isinf(ratios) & (ratios > 0) &
                       (target_hist / float(target_hist.sum()) > 0.05)].min())

            debug('K_MATCH:%d' % K_MATCH)

            to_match = np.int32(np.floor(K_MATCH * target_hist))

            debug('to_match:%s' % to_match)

            idxs_corrected_bg = np.array([], dtype=int)

            for idx_bin in range(len(bins) - 1):
                idxs_matching_regions = np.nonzero(
                    (c_g_content_bg >= bins[idx_bin])
                    & (c_g_content_bg < bins[idx_bin + 1]))[0]
                # Random subset of this bin, capped at the quota to_match.
                to_take = np.random.permutation(len(idxs_matching_regions))
                to_take = to_take[:min(len(idxs_matching_regions),
                                       to_match[idx_bin])]
                idxs_corrected_bg = np.hstack(
                    (idxs_corrected_bg, idxs_matching_regions[to_take]))

            debug('original:%s' % target_hist)
            debug('K:%d' % K_MATCH)
            debug('to sample:%s' % to_match)
            debug('obtained:%s' %
                  np.histogram(c_g_content_bg[idxs_corrected_bg], bins)[0])
            bg_coords = [bg_coords[idx] for idx in idxs_corrected_bg]
            c_g_content_bg = calculate_average_ngram_presence(
                bg_coords, genome, ngram_correction)
            matched_hist = np.histogram(c_g_content_bg, bins)[0]
            debug(matched_hist)
            if np.array_equal(K_MATCH * target_hist, matched_hist):
                info('C+G content perfectly matched!\n\ttarget:%s\n\tbg    :%s'
                     % (target_hist, matched_hist))
            else:
                warn(
                    'C+G content not perfectly matched\n\ttarget:%s\n\tbg    :%s'
                    % (target_hist, matched_hist))

            debug(target_hist / matched_hist)

    if len(bg_coords) >= bg_target_ratio * n_target_coordinates:
        bg_coords = random.sample(bg_coords,
                                  int(bg_target_ratio * n_target_coordinates))
    else:
        if bootstrap and len(bg_coords) < (bg_target_ratio *
                                           n_target_coordinates *
                                           0.95):  #allow a small tollerance!
            info('bootstrapping to obtain enough background regions')
            bg_coords = sample_wr(bg_coords,
                                  int(bg_target_ratio * n_target_coordinates))
            # The histogram comparison only makes sense (and the names only
            # exist) when a C+G histogram was computed above.
            if bins is not None and target_hist is not None:
                c_g_content_bg = calculate_average_ngram_presence(
                    bg_coords, genome, ngram_correction)
                debug('After bootstrap:\n\ttarget:%s\n\tbg    :%s' %
                      (target_hist, np.histogram(c_g_content_bg, bins)[0]))

    info('Extracting Motifs in background coordinates')
    negative_matrix, motifs_profiles_in_bg, idxs_seqs_with_motif_bg = parallel_fimo_scanning(
        bg_coords,
        meme_motifs_filename,
        genome,
        nucleotide_bg_filename,
        temp_directory=temp_directory,
        p_value=p_value,
        mask_repetitive=mask_repetitive,
        window_length=window_length,
        internal_window_length=internal_window_length,
        num_consumers=n_processes)[0:3]

    #allocate data for reports
    N_MOTIFS = len(motif_ids)
    fisher_p_values = np.zeros(N_MOTIFS)
    central_enrichment = np.zeros(N_MOTIFS)

    N_seq_p = positive_matrix.shape[0]
    N_seq_n = negative_matrix.shape[0]

    profile_presence_p = (positive_matrix > 0).sum(0)
    profile_presence_n = (negative_matrix > 0).sum(0)

    support_p = profile_presence_p / float(N_seq_p)
    support_n = profile_presence_n / float(N_seq_n)

    # Slice bounds of the internal window inside the full profile window;
    # floor division because they are used as indices.
    internal_bpstart = window_length // 2 - internal_window_length // 2
    internal_bpend = window_length // 2 + internal_window_length // 2

    for idx, motif_id in enumerate(motif_ids):
        fisher_p_values[idx] = stats.fisher_exact(
            [[profile_presence_p[idx], N_seq_p - profile_presence_p[idx]],
             [profile_presence_n[idx], N_seq_n - profile_presence_n[idx]]])[1]
        profile = motifs_profiles_in_sequences[motif_id]
        # Mean signal inside the internal window over mean signal in flanks.
        central_enrichment[idx] = profile[
            internal_bpstart:internal_bpend].mean() / np.hstack(
                [profile[:internal_bpstart], profile[internal_bpend:]]).mean()

    # Pseudo-count of 0.01 avoids division by zero for absent motifs.
    motif_ratios = (support_p + 0.01) / (support_n + 0.01)

    #Fundamental!
    if not disable_ratio:
        motif_ratios[support_p < 0.03] = 1

    rankings = stats.rankdata(-motif_ratios)

    #filter here positive or positive and negative
    if not disable_ratio:
        idxs_to_keep = np.nonzero(motif_ratios > 1)[0]
    else:
        idxs_to_keep = list(range(len(motif_ratios)))

    rankings = rankings[idxs_to_keep]
    motif_ratios = motif_ratios[idxs_to_keep]
    support_p = support_p[idxs_to_keep]
    support_n = support_n[idxs_to_keep]
    fisher_p_values = fisher_p_values[idxs_to_keep]
    central_enrichment = central_enrichment[idxs_to_keep]

    motif_ids = [motif_ids[_] for _ in idxs_to_keep]
    motif_names = [motif_names[_] for _ in idxs_to_keep]
    motif_idxs = [_ for _ in idxs_to_keep]

    try:
        # we test the ones only with ratio >1
        qvalues = estimate_qvalues(fisher_p_values)
    except Exception:
        # Previously the p-values were printed and `qvalues` stayed
        # undefined, which crashed later with a NameError. Keep going with
        # NaN q-values so the report can still be produced.
        error('Unable to estimate the q-values for the p-values: %s' %
              fisher_p_values)
        qvalues = np.repeat(np.nan, len(fisher_p_values))

    #generate reports in html
    info('Generating HTML report...')
    imgs_directory = os.path.join(output_directory, 'images')
    genes_list_directory = os.path.join(output_directory, 'genes_lists')
    motif_regions_directory = os.path.join(output_directory, 'motifs_regions')

    #create folders
    if not os.path.exists(imgs_directory):
        os.makedirs(imgs_directory)
    if use_gene_annotations and not os.path.exists(genes_list_directory):
        os.makedirs(genes_list_directory)
    if not os.path.exists(motif_regions_directory):
        os.makedirs(motif_regions_directory)

    extra_directory = determine_path('extra')
    templates_directory = extra_directory + '/templates/'
    seqlogo_path = extra_directory + '/seqlogo'

    j2_env = Environment(loader=FileSystemLoader(templates_directory),
                         trim_blocks=True)

    info('DIRECTORY:%s' % templates_directory)
    template = j2_env.get_template('report_template.html')

    #copy haystack logo and bg
    shutil.copyfile(templates_directory + 'haystack_logo.png',
                    os.path.join(imgs_directory, 'haystack_logo.png'))
    shutil.copyfile(templates_directory + 'noise.png',
                    os.path.join(imgs_directory, 'noise.png'))

    motifs_dump = []
    for i in np.argsort(rankings):
        if (support_p[i] >= 0.03
                or disable_ratio) and fisher_p_values[i] < 0.01 and (
                    motif_ratios[i] > 1 or disable_ratio
                ) and central_enrichment[i] > min_central_enrichment:

            motif_id = motif_ids[i]
            info('Generating logo and profile for:' + motif_id)

            #create motif logo (default format first, then pdf)
            img_logo = os.path.join(imgs_directory, 'logo_' + motif_id)
            generate_weblogo(motif_id,
                             meme_motifs_filename,
                             img_logo,
                             title=motif_id,
                             SEQLOGO=seqlogo_path)
            generate_weblogo(motif_id,
                             meme_motifs_filename,
                             img_logo,
                             title=motif_id,
                             SEQLOGO=seqlogo_path,
                             file_format='pdf')
            #fix the weblogo prefix problem
            img_logo_url = os.path.join('images', 'logo_' + motif_id + '.png')

            #create motif enrichment profile
            img_profile = os.path.join(imgs_directory,
                                       'profile_' + motif_id + '.png')
            motif_profile_target = motifs_profiles_in_sequences[
                motif_id] / N_seq_p
            motif_profile_bg = motifs_profiles_in_bg[motif_id] / N_seq_n

            generate_motif_profile(motif_profile_target,
                                   motif_profile_bg,
                                   motif_id,
                                   img_profile,
                                   smooth_size=smooth_size,
                                   window_size=window_length)
            img_profile_url = os.path.join('images',
                                           'profile_' + motif_id + '.png')

            #create regions
            info('Extracting regions with:' + motif_id)
            regions = os.path.join(motif_regions_directory,
                                   motif_id + '_motif_region_in_target.bed')
            with open(regions, 'w+') as regions_file:
                regions_file.write(
                    'Chromosome\tStart\tEnd\tMotif hits inside region\tNumber of hits\n'
                )
                for c, locations in motif_coords_in_seqs_with_motif[
                        motif_id].items():
                    regions_file.write('\t'.join([
                        c.chr_id,
                        str(c.bpstart),
                        str(c.bpend), ';'.join([
                            '-'.join(map(str, map(int, l))) for l in locations
                        ]),
                        str(len(locations))
                    ]) + '\n')
            regions_url = os.path.join(
                'motifs_regions', motif_id + '_motif_region_in_target.bed')

            #map closest downstream genes
            genes_url = None
            if use_gene_annotations:
                info('Mapping regions with:%s to the clostest genes' %
                     motif_id)

                peak_annotator_path = os.path.join(determine_path('extra/'),
                                                   'PeakAnnotator.jar')

                # NOTE(review): file names are interpolated into a shell
                # command line (the redirection needs the shell); paths with
                # spaces or shell metacharacters will break this call.
                if gene_ids_to_names_filename:
                    sb.call(
                        'java -jar ' + peak_annotator_path +
                        ' -u TSS -p %s -a %s -s %s -o %s >/dev/null 2>&1' %
                        (regions, gene_annotations_filename,
                         gene_ids_to_names_filename, genes_list_directory),
                        shell=True,
                        env=system_env)
                else:
                    sb.call(
                        'java -jar ' + peak_annotator_path +
                        ' -u TSS -p %s -a %s  -o %s >/dev/null 2>&1' %
                        (regions, gene_annotations_filename,
                         genes_list_directory),
                        shell=True,
                        env=system_env)

                genes_url = os.path.join(
                    'genes_lists', motif_id + '_motif_region_in_target.tss.bed')

            motifs_dump.append({
                'id': motif_id,
                'name': motif_names[i],
                'support_p': support_p[i] * 100,
                'support_n': support_n[i] * 100,
                'ratio': motif_ratios[i],
                'rank': float(rankings[i]),
                'pvalue': fisher_p_values[i],
                'qvalue': qvalues[i],
                'central_enrichment': central_enrichment[i],
                'img_logo': img_logo_url,
                'img_profile': img_profile_url,
                'regions': regions_url,
                'genes': genes_url,
                'idx_motif': motif_idxs[i]
            })

    report_path = os.path.join(output_directory, "Haystack_report.html")
    with codecs.open(report_path, "w", "utf-8") as report_file:
        report_file.write(
            template.render(motifs_dump=motifs_dump,
                            bed_target_filename=bed_target_filename,
                            bed_bg_filename=bed_bg_filename,
                            N_TARGET=N_seq_p,
                            N_BG=N_seq_n,
                            meme_motifs_filename=meme_motifs_filename,
                            COMMAND_USED=COMMAND_USED,
                            use_gene_annotations=use_gene_annotations))

    if dump:
        info('Saving all the intermediate data on: %s ...' % output_directory)
        dump_directory = os.path.join(output_directory, 'dump')

        if not os.path.exists(dump_directory):
            os.makedirs(dump_directory)

        np.save(os.path.join(dump_directory, 'matrix_' + target_name),
                positive_matrix)
        np.save(os.path.join(dump_directory, 'matrix_BG_' + target_name),
                negative_matrix)

        # Pickles are written in binary mode and every handle is closed.
        pickles_to_write = [
            (motifs_dump, target_name + '_motif_dumps.pickle'),
            (idxs_seqs_with_motif, target_name + '_motif_seqs_idxs.pickle'),
            (idxs_seqs_with_motif_bg, bg_name + '_motif_seqs_idxs.pickle'),
            (motif_coords_in_seqs_with_motif,
             target_name + '_motif_coords_in_seqs_with_motif.pickle'),
        ]
        for obj, pickle_name in pickles_to_write:
            with open(os.path.join(dump_directory, pickle_name),
                      'wb') as pickle_file:
                cp.dump(obj, pickle_file)

        Coordinate.coordinates_to_bed(
            target_coords,
            os.path.join(
                dump_directory,
                'Target_coordinates_selected_on_' + target_name + '.bed'),
            minimal_format=False)
        Coordinate.coordinates_to_bed(
            bg_coords,
            os.path.join(dump_directory,
                         'BG_coordinates_selected_on_' + bg_name + '.bed'),
            minimal_format=True)

    info('All done! Ciao!')
    sys.exit(0)
Пример #4
0
# Attach the shared formatter to the `fh` handler and register it on the
# logger (`fh`, `formatter` and `logger` are created above this excerpt).
fh.setFormatter(formatter)
logger.addHandler(fh)

# Second handler: a stream handler that emits records from DEBUG level up,
# using the same formatter.
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
ch.setFormatter(formatter)
logger.addHandler(ch)

# Validate every PAM: only the characters A, T, C, G and the brackets [ ]
# are accepted; anything else is logged as an error and aborts with status 1.
for pam in pams:
	if not all(x in ['A', 'T', 'C', 'G', '[', ']'] for x in set(list(pam))):
		logger.error('The PAM %s includes an unidentifiable character. Please only use A, T, C, G bases.\nBrackets [] can be used to specify multiple bases at a single position.\nFor example: NGG = [ATCG]GG ...' % pam)
		sys.exit(1)

##### Input genome
# `genome` is logged first and then rebound to the Genome_2bit object built
# from it.
logger.info('Importing %s genome ...' % genome)
genome = Genome_2bit(genome)

##### Input regions file
logger.info('Reading in %s file ...' % input_f)

# total_regions counts every line read from the input file.
total_regions = 0
bed_dict = {}
with open(input_f, 'r') as f:
	for line in f:

		total_regions += 1

		# Comma-separated when the file name contains 'csv', otherwise
		# split on any whitespace.
		# NOTE(review): this is a substring test on the whole path, so a
		# directory name containing 'csv' also selects comma splitting.
		if 'csv' in input_f:
			line = line.strip('\n').split(',')
		else:
			line = line.strip('\n').split()
Пример #5
0
def _remove_if_present(path):
    """Best-effort deletion of the temporary file *path*.

    A file that is missing or cannot be removed is silently ignored so that
    cleanup of one intermediate file never prevents cleanup of the others.
    """
    try:
        os.remove(path)
    except OSError:
        pass


def main():
    """Command-line entry point of the HAYSTACK hotspot selection step.

    The function performs, in order:

    1. checks that ``samtools`` and ``bedtools`` are on the PATH (exits with
       status 1 otherwise) and parses the command line;
    2. collects the sample names / input files either from a two-column
       samples file or from a folder of ``.bam`` / ``.bw`` files;
    3. makes sure the genome ``.2bit`` file is available (offering to
       download it) and that the chromosome-lengths file exists;
    4. bins the genome, computes one binned signal file per sample and,
       when ``bedGraphToBigWig`` is available, the matching bigwig tracks;
    5. loads the binned signals, optionally quantile-normalizes them,
       applies the chosen transformation and computes the variance/mean
       ratio of every bin;
    6. selects the most variable bins, writes the hotspot, specific and
       background region files, a PDF plot of the selection and an IGV
       session file.

    The process is always terminated through ``sys.exit``: status 0 on
    success, status 1 on any of the explicit error conditions.
    """
    print('\n[H A Y S T A C K   H O T S P O T]')
    print(
        '\n-SELECTION OF VARIABLE REGIONS- [Luca Pinello - [email protected]]\n'
    )
    print('Version %s\n' % HAYSTACK_VERSION)

    if which('samtools') is None:
        error(
            'Haystack requires samtools free available at: http://sourceforge.net/projects/samtools/files/samtools/0.1.19/'
        )
        sys.exit(1)

    if which('bedtools') is None:
        error(
            'Haystack requires bedtools free available at: https://github.com/arq5x/bedtools2/releases/tag/v2.20.1'
        )
        sys.exit(1)

    # bedGraphToBigWig is optional: without it only the bigwig tracks are
    # skipped, so this is a notice and not a fatal error.
    if which('bedGraphToBigWig') is None:
        info(
            'To generate the bigwig files Haystack requires bedGraphToBigWig please download from here: http://hgdownload.cse.ucsc.edu/admin/exe/ and add to your PATH'
        )

    #mandatory
    parser = argparse.ArgumentParser(description='HAYSTACK Parameters')
    parser.add_argument(
        'samples_filename_or_bam_folder',
        type=str,
        help=
        'A tab delimeted file with in each row (1) a sample name, (2) the path to the corresponding bam filename. Alternatively it is possible to specify a folder containing some .bam files to analyze.'
    )
    parser.add_argument(
        'genome_name',
        type=str,
        help='Genome assembly to use from UCSC (for example hg19, mm9, etc.)')

    #optional
    parser.add_argument('--bin_size',
                        type=int,
                        help='bin size to use(default: 500bp)',
                        default=500)
    parser.add_argument('--disable_quantile_normalization',
                        help='Disable quantile normalization (default: False)',
                        action='store_true')
    parser.add_argument(
        '--th_rpm',
        type=float,
        help=
        'Percentile on the signal intensity to consider for the hotspots (default: 99)',
        default=99)
    parser.add_argument(
        '--transformation',
        type=str,
        help=
        'Variance stabilizing transformation among: none, log2, angle (default: angle)',
        default='angle',
        choices=['angle', 'log2', 'none'])
    parser.add_argument('--recompute_all',
                        help='Ignore any file previously precalculated',
                        action='store_true')
    parser.add_argument(
        '--z_score_high',
        type=float,
        help='z-score value to select the specific regions(default: 1.5)',
        default=1.5)
    parser.add_argument(
        '--z_score_low',
        type=float,
        help='z-score value to select the not specific regions(default: 0.25)',
        default=0.25)
    parser.add_argument('--name',
                        help='Define a custom output filename for the report',
                        default='')
    parser.add_argument('--output_directory',
                        type=str,
                        help='Output directory (default: current directory)',
                        default='')
    parser.add_argument(
        '--use_X_Y',
        help=
        'Force to process the X and Y chromosomes (default: not processed)',
        action='store_true')
    parser.add_argument(
        '--max_regions_percentage',
        type=float,
        help=
        'Upper bound on the %% of the regions selected  (deafult: 0.1, 0.0=0%% 1.0=100%%)',
        default=0.1)
    parser.add_argument(
        '--depleted',
        help=
        'Look for cell type specific regions with depletion of signal instead of enrichment',
        action='store_true')
    parser.add_argument(
        '--input_is_bigwig',
        help=
        'Use the bigwig format instead of the bam format for the input. Note: The files must have extension .bw',
        action='store_true')
    parser.add_argument('--version',
                        help='Print version and exit.',
                        action='version',
                        version='Version %s' % HAYSTACK_VERSION)
    args = parser.parse_args()

    # Bind every option to a local name explicitly. This replaces the former
    # exec()-based injection of locals, which only worked with the Python 2
    # exec statement and hid the option names from readers and linters.
    samples_filename_or_bam_folder = args.samples_filename_or_bam_folder
    genome_name = args.genome_name
    bin_size = args.bin_size
    disable_quantile_normalization = args.disable_quantile_normalization
    th_rpm = args.th_rpm
    transformation = args.transformation
    recompute_all = args.recompute_all
    z_score_high = args.z_score_high
    z_score_low = args.z_score_low
    name = args.name
    output_directory = args.output_directory
    use_X_Y = args.use_X_Y
    max_regions_percentage = args.max_regions_percentage
    depleted = args.depleted
    input_is_bigwig = args.input_is_bigwig

    if input_is_bigwig:
        extension_to_check = '.bw'
        info('Input is set BigWig (.bw)')
        # Bigwig input can only be binned with bigWigAverageOverBed. Fail
        # early instead of falling back to the samtools pipeline, which
        # cannot read bigwig files.
        if not which('bigWigAverageOverBed'):
            error(
                'To process bigwig files Haystack requires bigWigAverageOverBed please download from here: http://hgdownload.cse.ucsc.edu/admin/exe/ and add to your PATH'
            )
            sys.exit(1)
    else:
        extension_to_check = '.bam'
        info('Input is set compressed SAM (.bam)')

    #check folder or sample filename
    if os.path.isfile(samples_filename_or_bam_folder):
        # Samples file: one "<sample name> <path>" pair per line.
        bam_filenames = []
        sample_names = []
        with open(samples_filename_or_bam_folder) as infile:
            for line in infile:

                if not line.strip():
                    continue

                #skip optional header line or empty lines
                if line.startswith('#'):
                    info('Skipping header/comment line:%s' % line)
                    continue

                fields = line.strip().split()

                if len(fields) != 2:
                    error('The samples file format is wrong!')
                    sys.exit(1)

                sample_names.append(fields[0])
                bam_filenames.append(fields[1])

    elif os.path.exists(samples_filename_or_bam_folder):
        # Folder: every file with the expected extension is a sample and
        # the sample name is the file name without that extension.
        bam_filenames = glob.glob(
            os.path.join(samples_filename_or_bam_folder,
                         '*' + extension_to_check))

        if not bam_filenames:
            error('No bam/bigwig  files to analyze in %s. Exiting.' %
                  samples_filename_or_bam_folder)
            sys.exit(1)

        sample_names = [
            os.path.basename(bam_filename).replace(extension_to_check, '')
            for bam_filename in bam_filenames
        ]
    else:
        error("The file or folder %s doesn't exist. Exiting." %
              samples_filename_or_bam_folder)
        sys.exit(1)

    #check all the files before starting
    info('Checking samples files location...')
    for bam_filename in bam_filenames:
        check_file(bam_filename)

    info('Initializing Genome:%s' % genome_name)

    genome_directory = determine_path('genomes')
    genome_2bit = os.path.join(genome_directory, genome_name + '.2bit')

    if os.path.exists(genome_2bit):
        genome = Genome_2bit(genome_2bit)
    else:
        info("\nIt seems you don't have the required genome file.")
        if query_yes_no('Should I download it for you?'):
            sb.call('haystack_download_genome %s' % genome_name,
                    shell=True,
                    env=system_env)
            if os.path.exists(genome_2bit):
                info('Genome correctly downloaded!')
                genome = Genome_2bit(genome_2bit)
            else:
                error(
                    'Sorry I cannot download the required file for you. Check your Internet connection.'
                )
                sys.exit(1)
        else:
            error(
                'Sorry I need the genome file to perform the analysis. Exiting...'
            )
            sys.exit(1)

    chr_len_filename = os.path.join(genome_directory,
                                    "%s_chr_lengths.txt" % genome_name)
    check_file(chr_len_filename)

    if name:
        directory_name = 'HAYSTACK_HOTSPOTS_on_%s' % name
    else:
        directory_name = 'HAYSTACK_HOTSPOTS'

    if output_directory:
        output_directory = os.path.join(output_directory, directory_name)
    else:
        output_directory = directory_name

    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    genome_sorted_bins_file = os.path.join(
        output_directory,
        '%s.%dbp.bins.sorted.bed' % (os.path.basename(genome_name), bin_size))

    tracks_directory = os.path.join(output_directory, 'TRACKS')
    if not os.path.exists(tracks_directory):
        os.makedirs(tracks_directory)

    intermediate_directory = os.path.join(output_directory, 'INTERMEDIATE')
    if not os.path.exists(intermediate_directory):
        os.makedirs(intermediate_directory)

    if not os.path.exists(genome_sorted_bins_file) or recompute_all:
        info('Creating bins of %dbp for %s in %s' %
             (bin_size, chr_len_filename, genome_sorted_bins_file))
        # The perl step appends the input line number ($.) as an extra
        # tab-separated column to every sorted window.
        make_bins_cmd = (
            'bedtools makewindows -g %s -w %s |  bedtools sort -i stdin |' %
            (chr_len_filename, bin_size) + "perl -nle 'print " +
            '"$_\t$.";' + "' /dev/stdin> %s" % genome_sorted_bins_file)
        sb.call(make_bins_cmd, shell=True, env=system_env)

    #convert bam files to genome-wide rpm tracks
    for base_name, bam_filename in zip(sample_names, bam_filenames):

        info('Processing:%s' % bam_filename)

        rpm_filename = os.path.join(tracks_directory,
                                    '%s.bedgraph' % base_name)
        sorted_rpm_filename = os.path.join(tracks_directory,
                                           '%s_sorted.bedgraph' % base_name)
        mapped_sorted_rpm_filename = os.path.join(
            tracks_directory, '%s_mapped_sorted.bedgraph' % base_name)
        binned_rpm_filename = os.path.join(
            intermediate_directory, '%s.%dbp.rpm' % (base_name, bin_size))
        bigwig_filename = os.path.join(tracks_directory, '%s.bw' % base_name)

        if input_is_bigwig:
            # bigWigAverageOverBed availability was verified right after
            # argument parsing.
            if not os.path.exists(binned_rpm_filename) or recompute_all:
                cmd = 'bigWigAverageOverBed %s %s  /dev/stdout | sort -s -n -k 1,1 | cut -f5 > %s' % (
                    bam_filename, genome_sorted_bins_file, binned_rpm_filename)
                sb.call(cmd, shell=True, env=system_env)
                shutil.copy2(bam_filename, bigwig_filename)

        else:
            if not os.path.exists(binned_rpm_filename) or recompute_all:
                info('Computing Scaling Factor...')
                # "samtools view -c" prints the number of reads kept by the
                # filter; the factor rescales coverage to a million reads.
                cmd = 'samtools view -c -F 512 %s' % bam_filename
                proc = sb.Popen(cmd,
                                stdout=sb.PIPE,
                                shell=True,
                                env=system_env)
                (stdout, stderr) = proc.communicate()
                scaling_factor = (1.0 / float(stdout.strip())) * 1000000

                info('Scaling Factor: %e' % scaling_factor)

                info('Building BedGraph RPM track...')
                cmd = 'samtools view -b -F 512 %s | bamToBed | slopBed  -r %s -l 0 -s -i stdin -g %s | genomeCoverageBed -g  %s -i stdin -bg -scale %.32f > %s' % (
                    bam_filename, bin_size, chr_len_filename, chr_len_filename,
                    scaling_factor, rpm_filename)
                sb.call(cmd, shell=True, env=system_env)

            if which('bedGraphToBigWig'):
                if not os.path.exists(bigwig_filename) or recompute_all:
                    info('Converting BedGraph to BigWig')
                    cmd = 'bedGraphToBigWig %s %s %s' % (
                        rpm_filename, chr_len_filename, bigwig_filename)
                    sb.call(cmd, shell=True, env=system_env)

            else:
                info(
                    'Sorry I cannot create the bigwig file.\nPlease download and install bedGraphToBigWig from here: http://hgdownload.cse.ucsc.edu/admin/exe/ and add to your PATH'
                )

            if not os.path.exists(binned_rpm_filename) or recompute_all:
                info('Make constant binned (%dbp) rpm values file' % bin_size)

                cmd = 'sort -k1,1 -k2,2n  %s  > %s' % (rpm_filename,
                                                       sorted_rpm_filename)
                sb.call(cmd, shell=True, env=system_env)

                cmd = 'bedtools map -a %s -b %s -c 4 -o mean -null 0.0  > %s' % (
                    genome_sorted_bins_file, sorted_rpm_filename,
                    mapped_sorted_rpm_filename)
                sb.call(cmd, shell=True, env=system_env)

                cmd = 'cut -f5 %s  > %s' % (mapped_sorted_rpm_filename,
                                            binned_rpm_filename)
                sb.call(cmd, shell=True, env=system_env)

            # Remove every intermediate file independently, so that a
            # missing file does not leave the remaining ones behind.
            for temporary_filename in (rpm_filename, sorted_rpm_filename,
                                       mapped_sorted_rpm_filename):
                _remove_if_present(temporary_filename)

    #load coordinates of bins
    coordinates_bin = pd.read_csv(genome_sorted_bins_file,
                                  names=['chr_id', 'bpstart', 'bpend'],
                                  sep='\t',
                                  header=None,
                                  usecols=[0, 1, 2])
    if not use_X_Y:
        coordinates_bin = coordinates_bin.ix[
            (coordinates_bin['chr_id'] != 'chrX')
            & (coordinates_bin['chr_id'] != 'chrY')]

    #load all the tracks
    info('Loading the processed tracks')
    df_chip = {}
    for state_file in glob.glob(os.path.join(intermediate_directory, '*.rpm')):
        col_name = os.path.basename(state_file).replace('.rpm', '')
        df_chip[col_name] = pd.read_csv(state_file, squeeze=True, header=None)
        info('Loading:%s' % col_name)

    df_chip = pd.DataFrame(df_chip)

    if disable_quantile_normalization:
        info('Skipping quantile normalization...')
    else:
        info('Normalizing the data...')
        df_chip = pd.DataFrame(quantile_normalization(df_chip.values),
                               columns=df_chip.columns,
                               index=df_chip.index)

    if which('bedGraphToBigWig'):
        #write quantile normalized tracks
        coord_quantile = coordinates_bin.copy()
        for col in df_chip:

            if disable_quantile_normalization:
                normalized_output_filename = os.path.join(
                    tracks_directory, '%s.bedgraph' % os.path.basename(col))
            else:
                normalized_output_filename = os.path.join(
                    tracks_directory,
                    '%s_quantile_normalized.bedgraph' % os.path.basename(col))

            normalized_output_filename_bigwig = normalized_output_filename.replace(
                '.bedgraph', '.bw')

            if not os.path.exists(
                    normalized_output_filename_bigwig) or recompute_all:
                info('Writing binned track: %s' %
                     normalized_output_filename_bigwig)
                coord_quantile['rpm_normalized'] = df_chip.ix[:, col]
                coord_quantile.dropna().to_csv(normalized_output_filename,
                                               sep='\t',
                                               header=False,
                                               index=False)

                cmd = 'bedGraphToBigWig %s %s %s' % (
                    normalized_output_filename, chr_len_filename,
                    normalized_output_filename_bigwig)
                sb.call(cmd, shell=True, env=system_env)
                _remove_if_present(normalized_output_filename)
    else:
        info(
            'Sorry I cannot creat the bigwig file.\nPlease download and install bedGraphToBigWig from here: http://hgdownload.cse.ucsc.edu/admin/exe/ and add to your PATH'
        )

    # The command-line value is replaced by the threshold estimated from it.
    th_rpm = find_th_rpm(df_chip, th_rpm)
    info('Estimated th_rpm:%s' % th_rpm)

    # Keep only the bins exceeding the threshold in at least one sample.
    df_chip_not_empty = df_chip.ix[(df_chip > th_rpm).any(1), :]

    if transformation == 'log2':
        df_chip_not_empty = df_chip_not_empty.applymap(log2_transform)
        info('Using log2 transformation')

    elif transformation == 'angle':
        df_chip_not_empty = df_chip_not_empty.applymap(angle_transform)
        info('Using angle transformation')

    else:
        info('Using no transformation')

    # Per-bin variance-to-mean ratio across samples.
    iod_values = df_chip_not_empty.var(1) / df_chip_not_empty.mean(1)

    ####calculate the inflation point a la superenhancers
    scores = iod_values
    min_s = np.min(scores)
    max_s = np.max(scores)

    N_POINTS = len(scores)
    x = np.linspace(0, 1, N_POINTS)
    y = sorted((scores - min_s) / (max_s - min_s))
    # m is the smoothed slope of the sorted, rescaled scores minus one. The
    # threshold index is where that value is smallest while still positive,
    # searched only in the top max_regions_percentage of the curve.
    m = smooth((np.diff(y) / np.diff(x)), 50)
    m = m - 1
    m[m <= 0] = np.inf
    m[:int(len(m) * (1 - max_regions_percentage))] = np.inf
    idx_th = np.argmin(m) + 1

    th_iod = sorted(iod_values)[idx_th]

    hpr_idxs = iod_values > th_iod

    n_selected = sum(hpr_idxs)
    info('Selected %f%% regions (%d)' %
         (n_selected / float(len(hpr_idxs)) * 100, n_selected))
    coordinates_bin['iod'] = iod_values

    #we remove the regions "without" signal in any of the cell types
    coordinates_bin.dropna(inplace=True)

    #create a track for IGV
    bedgraph_iod_track_filename = os.path.join(tracks_directory,
                                               'VARIABILITY.bedgraph')
    bw_iod_track_filename = os.path.join(tracks_directory, 'VARIABILITY.bw')

    if not os.path.exists(bw_iod_track_filename) or recompute_all:

        info('Generating variability track in bigwig format in:%s' %
             bw_iod_track_filename)

        coordinates_bin.to_csv(bedgraph_iod_track_filename,
                               sep='\t',
                               header=False,
                               index=False)
        sb.call('bedGraphToBigWig %s %s %s' %
                (bedgraph_iod_track_filename, chr_len_filename,
                 bw_iod_track_filename),
                shell=True,
                env=system_env)
        _remove_if_present(bedgraph_iod_track_filename)

    #Write the HPRs
    bedgraph_hpr_filename = os.path.join(
        tracks_directory, 'SELECTED_VARIABILITY_HOTSPOT.bedgraph')

    # Selected labels that are absent from coordinates_bin come back as
    # all-NaN rows from .ix and are discarded by the dropna() below.
    to_write = coordinates_bin.ix[hpr_idxs[hpr_idxs].index]
    to_write.dropna(inplace=True)
    to_write['bpstart'] = to_write['bpstart'].astype(int)
    to_write['bpend'] = to_write['bpend'].astype(int)

    to_write.to_csv(bedgraph_hpr_filename, sep='\t', header=False, index=False)

    bed_hpr_fileaname = os.path.join(output_directory,
                                     'SELECTED_VARIABILITY_HOTSPOT.bed')

    if not os.path.exists(bed_hpr_fileaname) or recompute_all:
        info('Writing the HPRs in: %s' % bed_hpr_fileaname)
        sb.call('sort -k1,1 -k2,2n %s | bedtools merge -i stdin >  %s' %
                (bedgraph_hpr_filename, bed_hpr_fileaname),
                shell=True,
                env=system_env)

    df_chip_hpr = df_chip_not_empty.ix[hpr_idxs, :]
    df_chip_hpr_zscore = df_chip_hpr.apply(zscore, axis=1)

    specific_regions_directory = os.path.join(output_directory,
                                              'SPECIFIC_REGIONS')
    if not os.path.exists(specific_regions_directory):
        os.makedirs(specific_regions_directory)

    # In depleted mode both thresholds are mirrored and the comparisons
    # below are reversed accordingly.
    if depleted:
        z_score_high = -z_score_high
        z_score_low = -z_score_low

    #write target
    info('Writing Specific Regions for each cell line...')
    coord_zscore = coordinates_bin.copy()
    for col in df_chip_hpr_zscore:

        regions_specific_filename = 'Regions_specific_for_%s_z_%.2f.bedgraph' % (
            os.path.basename(col).replace('.rpm', ''), z_score_high)
        specific_output_filename = os.path.join(specific_regions_directory,
                                                regions_specific_filename)
        specific_output_bed_filename = specific_output_filename.replace(
            '.bedgraph', '.bed')

        if not os.path.exists(specific_output_bed_filename) or recompute_all:
            if depleted:
                coord_zscore['z-score'] = df_chip_hpr_zscore.ix[
                    df_chip_hpr_zscore.ix[:, col] < z_score_high, col]
            else:
                coord_zscore['z-score'] = df_chip_hpr_zscore.ix[
                    df_chip_hpr_zscore.ix[:, col] > z_score_high, col]
            coord_zscore.dropna().to_csv(specific_output_filename,
                                         sep='\t',
                                         header=False,
                                         index=False)

            info('Writing:%s' % specific_output_bed_filename)
            sb.call('sort -k1,1 -k2,2n %s | bedtools merge -i stdin >  %s' %
                    (specific_output_filename, specific_output_bed_filename),
                    shell=True,
                    env=system_env)

    #write background
    info('Writing Background Regions for each cell line...')
    coord_zscore = coordinates_bin.copy()
    for col in df_chip_hpr_zscore:

        regions_bg_filename = 'Background_for_%s_z_%.2f.bedgraph' % (
            os.path.basename(col).replace('.rpm', ''), z_score_low)
        bg_output_filename = os.path.join(specific_regions_directory,
                                          regions_bg_filename)
        bg_output_bed_filename = bg_output_filename.replace(
            '.bedgraph', '.bed')

        if not os.path.exists(bg_output_bed_filename) or recompute_all:

            if depleted:
                coord_zscore['z-score'] = df_chip_hpr_zscore.ix[
                    df_chip_hpr_zscore.ix[:, col] > z_score_low, col]
            else:
                coord_zscore['z-score'] = df_chip_hpr_zscore.ix[
                    df_chip_hpr_zscore.ix[:, col] < z_score_low, col]
            coord_zscore.dropna().to_csv(bg_output_filename,
                                         sep='\t',
                                         header=False,
                                         index=False)

            info('Writing:%s' % bg_output_bed_filename)
            # Same sort invocation as for the hotspot and specific regions;
            # the stray "-i" flag (a bedtools option, not intended for sort)
            # has been dropped.
            sb.call('sort -k1,1 -k2,2n %s | bedtools merge -i stdin >  %s' %
                    (bg_output_filename, bg_output_bed_filename),
                    shell=True,
                    env=system_env)

    ###plot selection
    pl.figure()
    pl.title('Selection of the HPRs')
    pl.plot(x, y, 'r', lw=3)
    pl.plot(x[idx_th], y[idx_th], '*', markersize=20)
    pl.hold(True)
    # Dashed line through the selected point with slope m[idx_th] + 1.
    x_ext = np.linspace(-0.1, 1.2, N_POINTS)
    y_line = (m[idx_th] + 1.0) * (x_ext - x[idx_th]) + y[idx_th]
    pl.plot(x_ext, y_line, '--k', lw=3)
    pl.xlim(0, 1.1)
    pl.ylim(0, 1)
    pl.xlabel('Fraction of bins')
    pl.ylabel('Score normalized')
    pl.savefig(
        os.path.join(output_directory, 'SELECTION_OF_VARIABILITY_HOTSPOT.pdf'))
    pl.close()

    igv_session_filename = os.path.join(output_directory,
                                        'OPEN_ME_WITH_IGV.xml')
    info('Creating an IGV session file (.xml) in: %s' % igv_session_filename)

    session = ET.Element("Session")
    session.set("genome", genome_name)
    session.set("hasGeneTrack", "true")
    session.set("version", "7")
    resources = ET.SubElement(session, "Resources")
    panel = ET.SubElement(session, "Panel")

    resource_items = []
    track_items = []

    # Heatmap colour scale: mean +/- two standard deviations of the scores
    # of the selected bins.
    hpr_iod_scores = scores[scores > th_iod]
    mid_h = np.mean(hpr_iod_scores)
    min_h = mid_h - 2 * np.std(hpr_iod_scores)
    max_h = mid_h + 2 * np.std(hpr_iod_scores)
    heatmap_color_scale = (
        "ContinuousColorScale;%e;%e;%e;%e;0,153,255;255,255,51;204,0,0" %
        (mid_h, min_h, mid_h, max_h))

    #write the tracks
    for sample_name in sample_names:
        if disable_quantile_normalization:
            track_full_path = os.path.join(
                output_directory, 'TRACKS',
                '%s.%dbp.bw' % (sample_name, bin_size))
        else:
            track_full_path = os.path.join(
                output_directory, 'TRACKS',
                '%s.%dbp_quantile_normalized.bw' % (sample_name, bin_size))

        track_filename = rem_base_path(track_full_path, output_directory)

        if os.path.exists(track_full_path):
            resource_items.append(ET.SubElement(resources, "Resource"))
            resource_items[-1].set("path", track_filename)
            track_items.append(ET.SubElement(panel, "Track"))
            track_items[-1].set('color', "0,0,178")
            track_items[-1].set('id', track_filename)
            track_items[-1].set("name", sample_name)

    resource_items.append(ET.SubElement(resources, "Resource"))
    resource_items[-1].set(
        "path", rem_base_path(bw_iod_track_filename, output_directory))

    track_items.append(ET.SubElement(panel, "Track"))
    track_items[-1].set('color', "178,0,0")
    track_items[-1].set('id',
                        rem_base_path(bw_iod_track_filename, output_directory))
    track_items[-1].set('renderer', "HEATMAP")
    track_items[-1].set("colorScale", heatmap_color_scale)
    track_items[-1].set("name", 'VARIABILITY')

    resource_items.append(ET.SubElement(resources, "Resource"))
    resource_items[-1].set("path",
                           rem_base_path(bed_hpr_fileaname, output_directory))
    track_items.append(ET.SubElement(panel, "Track"))
    track_items[-1].set('color', "178,0,0")
    track_items[-1].set('id', rem_base_path(bed_hpr_fileaname,
                                            output_directory))
    track_items[-1].set('renderer', "HEATMAP")
    track_items[-1].set("colorScale", heatmap_color_scale)
    track_items[-1].set("name", 'HOTSPOTS')

    for sample_name in sample_names:
        matching_tracks = glob.glob(
            os.path.join(output_directory, 'SPECIFIC_REGIONS',
                         'Regions_specific_for_%s*.bedgraph' % sample_name))
        # A sample without a specific-regions bedgraph is left out of the
        # session instead of aborting with an IndexError.
        if not matching_tracks:
            continue

        track_full_path = matching_tracks[0]
        specific_track_filename = rem_base_path(track_full_path,
                                                output_directory)
        if os.path.exists(track_full_path):
            resource_items.append(ET.SubElement(resources, "Resource"))
            resource_items[-1].set("path", specific_track_filename)

            track_items.append(ET.SubElement(panel, "Track"))
            track_items[-1].set('color', "178,0,0")
            track_items[-1].set('id', specific_track_filename)
            track_items[-1].set('renderer', "HEATMAP")
            track_items[-1].set("colorScale", heatmap_color_scale)
            track_items[-1].set("name", 'REGION SPECIFIC FOR %s' % sample_name)

    tree = ET.ElementTree(session)
    tree.write(igv_session_filename, xml_declaration=True)

    info('All done! Ciao!')
    sys.exit(0)