Example #1
def get_data_filepaths(samples_filename_or_bam_folder, input_is_bigwig):
    # check folder or sample filename
    if not os.path.exists(samples_filename_or_bam_folder):
        error("The file or folder %s doesn't exist. Exiting." %
              samples_filename_or_bam_folder)
        sys.exit(1)
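    # two input modes: a samples file with "sample_name<TAB>data_file" rows, or a folder containing .bam/.bw files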
    if os.path.isfile(samples_filename_or_bam_folder):
        data_filenames = []
        sample_names = []
        with open(samples_filename_or_bam_folder) as infile:
            for line in infile:
                if not line.strip():
                    continue
                if line.startswith('#'):  # skip optional header line
                    info('Skipping header/comment line:%s' % line)
                    continue
                fields = line.strip().split()
                n_fields = len(fields)
                if n_fields == 2:
                    sample_names.append(fields[0])
                    data_filenames.append(fields[1])
                else:
                    error('The samples file format is wrong!')
                    sys.exit(1)
        # dir_path = os.path.dirname(os.path.realpath(samples_filename_or_bam_folder))
        # data_filenames = [os.path.join(dir_path, filename)
        #                     for filename in data_filenames]
    else:
        if input_is_bigwig:
            extension_to_check = '.bw'
            info('Input is set to BigWig (.bw)')
        else:
            extension_to_check = '.bam'
            info('Input is set to compressed SAM (.bam)')

        data_filenames = glob.glob(
            os.path.join(samples_filename_or_bam_folder,
                         '*' + extension_to_check))
        if not data_filenames:
            error('No bam/bigwig files to analyze in %s. Exiting.' %
                  samples_filename_or_bam_folder)
            sys.exit(1)
        sample_names = [
            os.path.basename(data_filename).replace(extension_to_check, '')
            for data_filename in data_filenames
        ]
    # check all the files before starting
    info('Checking samples files location...')
    for data_filename in data_filenames:
        check_file(data_filename)
    return sample_names, data_filenames
Example #2
def get_target_motifs_filepaths(target_motifs_filepaths_file):
    # check folder or sample filename
    if not os.path.exists(target_motifs_filepaths_file):
        error("The file or folder %s doesn't exist. Exiting." %
              target_motifs_filepaths_file)
        sys.exit(1)
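    # each row of the file: sample name, target regions bed, and an optional background bed (otherwise a random background is used)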
    if os.path.isfile(target_motifs_filepaths_file):
        specific_regions_filenames = []
        bg_regions_filenames = []
        sample_names = []
        with open(target_motifs_filepaths_file) as infile:
            for line in infile:
                if not line.strip():
                    continue

                if line.startswith(
                        '#'):  # skip optional header line or empty lines
                    info('Skipping header/comment line:%s' % line)
                    continue

                fields = line.strip().split()
                n_fields = len(fields)

                if n_fields == 2:

                    sample_names.append(fields[0])
                    specific_regions_filenames.append(fields[1])
                    bg_regions_filenames.append("random_background")

                elif n_fields == 3:
                    sample_names.append(fields[0])
                    specific_regions_filenames.append(fields[1])
                    bg_regions_filenames.append(fields[2])

                else:
                    error('The samples file format is wrong!')
                    sys.exit(1)

    else:
        error("%s must be a file, not a folder. Exiting." %
              target_motifs_filepaths_file)
        sys.exit(1)

    # check all the files before starting
    info('Checking files location...')
    for specific_regions_filename in specific_regions_filenames:
        check_file(specific_regions_filename)

    for bg_regions_filename in bg_regions_filenames:
        if bg_regions_filename != "random_background":
            check_file(bg_regions_filename)

    return sample_names, specific_regions_filenames, bg_regions_filenames
Example #3
def main():

    print '\n[H A Y S T A C K   P I P E L I N E]'
    print('\n-SELECTION OF HOTSPOTS OF VARIABILITY AND ENRICHED MOTIFS- [Luca Pinello - [email protected]]\n')
    print 'Version %s\n' % HAYSTACK_VERSION
    
    #mandatory
    parser = argparse.ArgumentParser(description='HAYSTACK Parameters')
    parser.add_argument('samples_filename_or_bam_folder', type=str,  help='A tab delimited file with in each row (1) a sample name, (2) the path to the corresponding bam filename, (3, optional) the path to the corresponding gene expression filename. Alternatively it is possible to specify a folder containing some .bam files to analyze.')
    parser.add_argument('genome_name', type=str,  help='Genome assembly to use from UCSC (for example hg19, mm9, etc.)')
    
    #optional
    parser.add_argument('--name',  help='Define a custom output filename for the report', default='')
    parser.add_argument('--output_directory',type=str, help='Output directory (default: current directory)',default='')
    parser.add_argument('--bin_size', type=int,help='Bin size to use (default: 500bp)',default=500)
    parser.add_argument('--recompute_all',help='Ignore any file previously precalculated for the command haystack_hotspots',action='store_true')
    parser.add_argument('--depleted', help='Look for cell type specific regions with depletion of signal instead of enrichment',action='store_true')
    parser.add_argument('--input_is_bigwig', help='Use the bigwig format instead of the bam format for the input. Note: The files must have extension .bw',action='store_true')
    parser.add_argument('--disable_quantile_normalization',help='Disable quantile normalization (default: False)',action='store_true')
    parser.add_argument('--transformation',type=str,help='Variance stabilizing transformation among: none, log2, angle (default: angle)',default='angle',choices=['angle', 'log2', 'none'])
    parser.add_argument('--z_score_high', type=float,help='z-score value to select the specific regions (default: 1.5)',default=1.5)
    parser.add_argument('--z_score_low', type=float,help='z-score value to select the non-specific regions (default: 0.25)',default=0.25)
    parser.add_argument('--th_rpm',type=float,help='Percentile on the signal intensity to consider for the hotspots (default: 99)', default=99)
    parser.add_argument('--meme_motifs_filename', type=str, help='Motifs database in MEME format (default JASPAR CORE 2016)')
    parser.add_argument('--motif_mapping_filename', type=str, help='Custom motif to gene mapping file (the default is for JASPAR CORE 2016 database)')
    parser.add_argument('--plot_all',  help='Disable the filter on the TF activity and correlation (default z-score TF>0 and rho>0.3)',action='store_true')
    parser.add_argument('--n_processes',type=int, help='Specify the number of processes to use. The default is #cores available.',default=multiprocessing.cpu_count())
    parser.add_argument('--temp_directory',  help='Directory to store temporary files  (default: /tmp)', default='/tmp')
    parser.add_argument('--version',help='Print version and exit.',action='version', version='Version %s' % HAYSTACK_VERSION)
    
    args = parser.parse_args()
    args_dict=vars(args)
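    # promote every parsed argument to a local variable via exec (relies on Python 2 function-scope exec)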
    for key,value in args_dict.items():
            exec('%s=%s' %(key,repr(value)))
            
            
            
    if meme_motifs_filename:
        check_file(meme_motifs_filename)
    
    if motif_mapping_filename:
        check_file(motif_mapping_filename)
        
    if not os.path.exists(temp_directory):
        error('The folder specified with --temp_directory: %s does not exist!' % temp_directory)
        sys.exit(1)
    
    if input_is_bigwig:
            extension_to_check='.bw'
            info('Input is set to BigWig (.bw)')
    else:
            extension_to_check='.bam'
            info('Input is set to compressed SAM (.bam)')
    
    if name:
            directory_name='HAYSTACK_PIPELINE_RESULTS_on_%s' % name
    
    else:
            directory_name='HAYSTACK_PIPELINE_RESULTS'
    
    if output_directory:
            output_directory=os.path.join(output_directory,directory_name)
    else:
            output_directory=directory_name
    
    #check folder or sample filename
    
    USE_GENE_EXPRESSION=True
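    # gene expression is used only if every row of the samples file provides a third (gene expression) column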
    
    if os.path.isfile(samples_filename_or_bam_folder):
            BAM_FOLDER=False
            bam_filenames=[]
            gene_expression_filenames=[]
            sample_names=[]
    
            with open(samples_filename_or_bam_folder) as infile:
                for line in infile:
    
                    if not line.strip():
                            continue
                    
                    if line.startswith('#'): #skip optional header line or empty lines
                            info('Skipping header/comment line:%s' % line)
                            continue
    
                    fields=line.strip().split()
                    n_fields=len(fields)
    
                    if n_fields==2:
    
                        USE_GENE_EXPRESSION=False
                        
                        sample_names.append(fields[0])
                        bam_filenames.append(fields[1])
    
                    elif n_fields==3:
    
                        USE_GENE_EXPRESSION=USE_GENE_EXPRESSION and True
    
                        sample_names.append(fields[0])
                        bam_filenames.append(fields[1])
                        gene_expression_filenames.append(fields[2])
                    else:
                        error('The samples file format is wrong!')
                        sys.exit(1)
            
    else:
            if os.path.exists(samples_filename_or_bam_folder):
                    BAM_FOLDER=True
                    USE_GENE_EXPRESSION=False
                    bam_filenames=glob.glob(os.path.join(samples_filename_or_bam_folder,'*'+extension_to_check))
    
                    if not bam_filenames:
                        error('No bam/bigwig files to analyze in %s. Exiting.' % samples_filename_or_bam_folder)
                        sys.exit(1)
                    
                    sample_names=[os.path.basename(bam_filename).replace(extension_to_check,'') for bam_filename in bam_filenames]
            else:
                    error("The file or folder %s doesn't exist. Exiting." % samples_filename_or_bam_folder)
                    sys.exit(1)
    
    
    #check all the files before starting
    info('Checking samples files location...')
    for bam_filename in bam_filenames:
            check_file(bam_filename)
    
    if USE_GENE_EXPRESSION:
        for gene_expression_filename in gene_expression_filenames:
                check_file(gene_expression_filename)
    
    if not os.path.exists(output_directory):
            os.makedirs(output_directory)
    
    #copy back the file used
    if not BAM_FOLDER:
            shutil.copy2(samples_filename_or_bam_folder,output_directory)
    
    #write hotspots conf files
    sample_names_hotspots_filename=os.path.join(output_directory,'sample_names_hotspots.txt')
    
    with open(sample_names_hotspots_filename,'w+') as outfile:
        for sample_name,bam_filename in zip(sample_names,bam_filenames):
            outfile.write('%s\t%s\n' % (sample_name, bam_filename))
    
    #write tf activity  conf files
    if USE_GENE_EXPRESSION:
            sample_names_tf_activity_filename=os.path.join(output_directory,'sample_names_tf_activity.txt')
    
            with open(sample_names_tf_activity_filename,'w+') as outfile:
                    for sample_name,gene_expression_filename in zip(sample_names,gene_expression_filenames):
                            outfile.write('%s\t%s\n' % (sample_name, gene_expression_filename))
    
            tf_activity_directory=os.path.join(output_directory,'HAYSTACK_TFs_ACTIVITY_PLANES')
    
    
    #CALL HAYSTACK HOTSPOTS
    cmd_to_run='haystack_hotspots %s %s --output_directory %s --bin_size %d %s %s %s %s %s %s %s %s' % \
                (sample_names_hotspots_filename, genome_name,output_directory,bin_size,
                 ('--recompute_all' if recompute_all else ''),
                 ('--depleted' if depleted else ''),
                 ('--input_is_bigwig' if input_is_bigwig else ''),
                 ('--disable_quantile_normalization' if disable_quantile_normalization else ''),
                 '--transformation %s' % transformation,
                 '--z_score_high %f' % z_score_high,
                 '--z_score_low %f' % z_score_low,
                 '--th_rpm %f' % th_rpm)
    print cmd_to_run
    sb.call(cmd_to_run ,shell=True,env=system_env)        
    
    #CALL HAYSTACK MOTIFS
    motif_directory=os.path.join(output_directory,'HAYSTACK_MOTIFS')
    for sample_name in sample_names:
        specific_regions_filename=os.path.join(output_directory,'HAYSTACK_HOTSPOTS','SPECIFIC_REGIONS','Regions_specific_for_%s*.bed' %sample_name)
        bg_regions_filename=glob.glob(os.path.join(output_directory,'HAYSTACK_HOTSPOTS','SPECIFIC_REGIONS','Background_for_%s*.bed' %sample_name))[0]
        #bg_regions_filename=glob.glob(specific_regions_filename.replace('Regions_specific','Background')[:-11]+'*.bed')[0] # the z-score is different...
        #print specific_regions_filename,bg_regions_filename
        cmd_to_run='haystack_motifs %s %s --bed_bg_filename %s --output_directory %s --name %s' % (specific_regions_filename,genome_name, bg_regions_filename,motif_directory, sample_name)
        
        if meme_motifs_filename:
             cmd_to_run+=' --meme_motifs_filename %s' % meme_motifs_filename
             
             
        if n_processes:
            cmd_to_run+=' --n_processes %d' % n_processes
            
        if temp_directory:
            cmd_to_run+=' --temp_directory %s' % temp_directory
            
            
        
        print cmd_to_run
        sb.call(cmd_to_run,shell=True,env=system_env)
    
        if USE_GENE_EXPRESSION:
                #CALL HAYSTACK TF ACTIVITY 
                motifs_output_folder=os.path.join(motif_directory,'HAYSTACK_MOTIFS_on_%s' % sample_name) 
                if os.path.exists(motifs_output_folder):
                    cmd_to_run='haystack_tf_activity_plane %s %s %s --output_directory %s'  %(motifs_output_folder,sample_names_tf_activity_filename,sample_name,tf_activity_directory)
                    
                    if motif_mapping_filename:
                        cmd_to_run+=' --motif_mapping_filename %s' %  motif_mapping_filename       
                    
                    if plot_all:
                        cmd_to_run+=' --plot_all'
                        
                    
                    print cmd_to_run
                    sb.call(cmd_to_run,shell=True,env=system_env) 
Example #4
def main(input_args=None):
    print('\n[H A Y S T A C K   M O T I F S]')
    print(
        '\n-MOTIF ENRICHMENT ANALYSIS- [Luca Pinello - [email protected]]\n'
    )
    print('Version %s\n' % HAYSTACK_VERSION)

    bootstrap = False
    ngram_correction = 'g'

    parser = get_args_motif()
    args = parser.parse_args(input_args)

    args.n_processes = max(1, args.n_processes - 1)

    args_dict = vars(args)
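    # promote each parsed argument to a local variable via exec; n_target_coordinates is
    # assigned directly because its value may be np.inf, whose repr cannot be exec'd back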
    for key, value in args_dict.items():
        if key == 'n_target_coordinates':
            n_target_coordinates = value
        else:
            exec('%s=%s' % (key, repr(value)))

    bed_score_column -= 1

    if no_c_g_correction:
        c_g_correction = False
    else:
        c_g_correction = True

    if no_random_sampling_target:
        random_sampling_target = False
    else:
        random_sampling_target = True

    check_file(bed_target_filename)

    if not bed_bg_filename == 'random_background':
        check_file(bed_bg_filename)

    if meme_motifs_filename:
        check_file(meme_motifs_filename)
    else:
        meme_motifs_filename = os.path.join(
            determine_path('motif_databases'),
            'JASPAR_CORE_2016_vertebrates.meme')

    annotation_directory = determine_path('gene_annotations')
    if gene_annotations_filename:

        if which('java') is None:
            error(
                'The mapping to the closest gene requires Java, freely available from: http://java.com/en/download/'
            )
            use_gene_annotations = False
        else:
            check_file(gene_annotations_filename)
            info('Using %s as gene annotations file' %
                 gene_annotations_filename)
            use_gene_annotations = True
    else:
        gene_annotations_filename = os.path.join(annotation_directory,
                                                 '%s_genes.bed' % genome_name)
        gene_ids_to_names_filename = os.path.join(
            annotation_directory, '%s_genes_id_to_names' % genome_name)

        if os.path.exists(gene_annotations_filename) and os.path.exists(
                gene_ids_to_names_filename):
            use_gene_annotations = True
        else:
            use_gene_annotations = False
            info('No gene annotations file specified')

    genome, _, nucleotide_bg_filename = initialize_genome(genome_name)

    target_name = ntpath.basename(bed_target_filename.replace('.bed', ''))

    bg_name = ntpath.basename(bed_bg_filename.replace('.bed', ''))
    # timestamp=(datetime.datetime.now().isoformat()[:-3].replace('T','(')+str(np.random.randint(10000))+')').replace(':','.')

    if name:
        directory_name = 'HAYSTACK_MOTIFS_on_' + name
    else:
        directory_name = 'HAYSTACK_on_' + target_name + '_VS_' + bg_name

    if output_directory:
        output_directory = os.path.join(output_directory, directory_name)
    else:
        output_directory = directory_name

    info(
        '###PARAMETERS USED###\n\t\t -TARGET: %s \n\t\t -BACKGROUND: %s \n\t\t -BG_TARGET_RATIO: %s\n\t\t -C+G CORRECTION: %s\n\t\t -MASKING REPETITIVE: %s\n\t\t -COORDINATES TO ANALYZE: %s\n\t\t -OUTPUT DIRECTORY: %s\n' \
        % (bed_target_filename, bed_bg_filename, str(bg_target_ratio), str(c_g_correction), str(mask_repetitive),
           'ALL' if np.isinf(n_target_coordinates) else str(n_target_coordinates), output_directory))

    N_TARGET = None
    N_BG = None
    COMMAND_USED = ' '.join(sys.argv)

    _n_target_coordinates = n_target_coordinates

    info('Loading Target coordinates from bed:%s' % bed_target_filename)
    target_coords = Coordinate.bed_to_coordinates(bed_target_filename,
                                                  cl_score=bed_score_column)

    if len(target_coords) == 0:
        info('No coordinates to analyze in your input file. Exiting.')
        sys.exit(1)

    # calculate automatically the average length of the target regions

    if internal_window_length:
        info('Using the user defined internal window length:%d' %
             internal_window_length)
        if internal_window_length % 2:
            internal_window_length += 1

    else:

        internal_window_length = int(np.mean(map(len, target_coords)))
        if internal_window_length % 2:
            internal_window_length += 1
        info(
            'Using the average length of target coordinates as internal window length:%d'
            % internal_window_length)

        if not window_length:
            window_length = internal_window_length * 5

    info('Total window length:%d' % window_length)

    if not smooth_size:
        smooth_size = internal_window_length / 5

    target_coords = Coordinate.coordinates_of_intervals_around_center(
        target_coords, internal_window_length)

    if len(target_coords) > n_target_coordinates:
        if random_sampling_target:
            info('Sampling %d coordinates among the %d total' %
                 (n_target_coordinates, len(target_coords)))
            target_coords = random.sample(target_coords, n_target_coordinates)
        else:
            info('Selecting the best %d coordinates among the %d total' %
                 (n_target_coordinates, len(target_coords)))
            sorted_idxs_by_score = np.argsort([c.score
                                               for c in target_coords])[::-1]
            target_coords = [
                target_coords[idx]
                for idx in sorted_idxs_by_score[:n_target_coordinates]
            ]
    else:

        if random_sampling_target and bootstrap and not np.isinf(
                n_target_coordinates):
            warn('Number of target regions < %d' % n_target_coordinates)
            info('bootstrapping to obtain enough target regions')
            target_coords = sample_wr(target_coords, n_target_coordinates)
        else:
            info('Using all the %d target coordinates' % len(target_coords))

    info('Extracting Motifs in target coordinates')
    positive_matrix, motifs_profiles_in_sequences, idxs_seqs_with_motif, motif_coords_in_seqs_with_motif, motif_names, motif_ids = parallel_fimo_scanning(
        target_coords,
        meme_motifs_filename,
        genome,
        nucleotide_bg_filename,
        temp_directory=temp_directory,
        p_value=p_value,
        mask_repetitive=mask_repetitive,
        window_length=window_length,
        internal_window_length=internal_window_length,
        num_consumers=n_processes)
    n_target_coordinates = len(target_coords)  # fix for the bootstrap!

    if bed_bg_filename == 'random_background':
        info('Extracting Random Coordinates from the genome...')

        if c_g_correction:
            info('Calculating the C+G content of the target coordinates')
            bg_coords = []
            c_g_content_target = calculate_average_ngram_presence(
                target_coords, genome, ngram_correction)

            info('Extract a Matching C+G Background')
            bins = np.hstack((np.linspace(0, 1, c_g_bins), np.inf))
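            # rejection sampling: for each target region, draw random genomic regions until one falls in the same C+G bin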

            for _ in range(bg_target_ratio):
                for idx_c, c in enumerate(target_coords):
                    c_bin = np.nonzero(
                        np.histogram(c_g_content_target[idx_c], bins)[0])[0][0]
                    c_random_bin = -1

                    while c_random_bin != c_bin:
                        random_bpstart = np.random.randint(
                            1, genome.chr_len[c.chr_id] - len(c) + 1)
                        c_random = Coordinate(c.chr_id, random_bpstart,
                                              random_bpstart + len(c) - 1)
                        seq = genome.extract_sequence(c_random)
                        c_g_content_c_random = (seq.count('c') +
                                                seq.count('g')) / float(len(c))
                        c_random_bin = np.nonzero(
                            np.histogram(c_g_content_c_random, bins)[0])[0][0]

                    # print bg_target_ratio,c_bin,c_random_bin, ' still to match:',len(target_coords)-idx_c
                    bg_coords.append(c_random)

            c_g_content_bg = calculate_average_ngram_presence(
                bg_coords, genome, ngram_correction)
            bg_hist = np.histogram(c_g_content_bg, bins)[0]
            debug('original: ' +
                  str(np.histogram(c_g_content_target, bins)[0]))
            debug('obtained:' + str(np.histogram(c_g_content_bg, bins)[0]))

        else:
            bg_coords = get_random_coordinates(target_coords, genome)

        info('Done!')

    else:
        info('Loading Background Coordinates from:%s' % bed_bg_filename)
        bg_coords = Coordinate.bed_to_coordinates(bed_bg_filename)
        bg_coords = Coordinate.coordinates_of_intervals_around_center(
            bg_coords, internal_window_length)

        if use_entire_bg:
            bg_target_ratio = float(len(bg_coords)) / n_target_coordinates
            info('Using all the coordinates in the BG, BG/TG:%f',
                 bg_target_ratio)

        if c_g_correction:
            info('Calculating the C+G content')
            c_g_content_target = calculate_average_ngram_presence(
                target_coords, genome, ngram_correction)
            c_g_content_bg = calculate_average_ngram_presence(
                bg_coords, genome, ngram_correction)

            info('Extract a Matching C+G Background')
            bins = np.hstack((np.linspace(0, 1, c_g_bins), np.inf))
            target_hist = np.histogram(c_g_content_target, bins)[0]
            bg_hist = np.histogram(c_g_content_bg, bins)[0]
            ratios = bg_hist / (target_hist * 1.0)
            debug('original:%s' % target_hist)
            debug('bg:%s' % bg_hist)
            debug('ratios:%s' % ratios)
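            # K_MATCH: largest multiple of the target C+G histogram that the background can cover in every well-populated bin (capped at bg_target_ratio)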
            K_MATCH = min(
                bg_target_ratio,
                ratios[~np.isnan(ratios) & ~np.isinf(ratios) & (ratios > 0) &
                       (target_hist / float(target_hist.sum()) > 0.05)].min())

            debug('K_MATCH:%d' % K_MATCH)

            to_match = np.int32(np.floor(K_MATCH * target_hist))

            debug('to_match:%s' % to_match)

            idxs_corrected_bg = np.array([], dtype=int)

            for idx_bin in range(len(bins) - 1):
                idxs_matching_regions = \
                    np.nonzero((c_g_content_bg >= bins[idx_bin]) & (c_g_content_bg < bins[idx_bin + 1]))[0]
                to_take = np.random.permutation(len(idxs_matching_regions))
                to_take = to_take[range(
                    min(len(idxs_matching_regions), to_match[idx_bin]))]
                idxs_corrected_bg = np.hstack(
                    (idxs_corrected_bg, idxs_matching_regions[to_take]))

            debug('original:%s' % target_hist)
            debug('K:%d' % K_MATCH)
            debug('to sample:%s' % to_match)
            debug('obtained:%s' %
                  np.histogram(c_g_content_bg[idxs_corrected_bg], bins)[0])
            bg_coords = [bg_coords[idx] for idx in idxs_corrected_bg]
            c_g_content_bg = calculate_average_ngram_presence(
                bg_coords, genome, ngram_correction)
            debug(np.histogram(c_g_content_bg, bins)[0])
            if np.array_equal(K_MATCH * target_hist,
                              np.histogram(c_g_content_bg, bins)[0]):
                info('C+G content perfectly matched!\n\ttarget:%s\n\tbg    :%s'
                     % (target_hist, np.histogram(c_g_content_bg, bins)[0]))
            else:
                warn(
                    'C+G content not perfectly matched\n\ttarget:%s\n\tbg    :%s'
                    % (target_hist, np.histogram(c_g_content_bg, bins)[0]))

            debug(target_hist / np.histogram(c_g_content_bg, bins)[0])

    if len(bg_coords) >= bg_target_ratio * n_target_coordinates:
        bg_coords = random.sample(bg_coords,
                                  int(bg_target_ratio * n_target_coordinates))
    else:
        if bootstrap and len(bg_coords) < (bg_target_ratio *
                                           n_target_coordinates *
                                           0.95):  # allow a small tolerance!
            info('bootstrapping to obtain enough background regions')
            bg_coords = sample_wr(bg_coords,
                                  int(bg_target_ratio * n_target_coordinates))
            c_g_content_bg = calculate_average_ngram_presence(
                bg_coords, genome, ngram_correction)
            debug('After bootstrap:\n\ttarget:%s\n\tbg    :%s' %
                  (target_hist, np.histogram(c_g_content_bg, bins)[0]))

    info('Extracting Motifs in background coordinates')
    negative_matrix, motifs_profiles_in_bg, idxs_seqs_with_motif_bg = parallel_fimo_scanning(
        bg_coords,
        meme_motifs_filename,
        genome,
        nucleotide_bg_filename,
        temp_directory=temp_directory,
        p_value=p_value,
        mask_repetitive=mask_repetitive,
        window_length=window_length,
        internal_window_length=internal_window_length,
        num_consumers=n_processes)[0:3]

    # allocate data for the reports
    N_MOTIFS = len(motif_ids)
    rankings = np.zeros(N_MOTIFS, dtype=np.int16)
    motif_ratios = np.zeros(N_MOTIFS)
    support_p = np.zeros(N_MOTIFS)
    support_n = np.zeros(N_MOTIFS)
    fisher_p_values = np.zeros(N_MOTIFS)
    central_enrichment = np.zeros(N_MOTIFS)

    N_seq_p = positive_matrix.shape[0]
    N_seq_n = negative_matrix.shape[0]

    profile_presence_p = (positive_matrix > 0).sum(0)
    profile_presence_n = (negative_matrix > 0).sum(0)

    support_p = profile_presence_p / float(N_seq_p)
    support_n = profile_presence_n / float(N_seq_n)

    internal_bpstart = window_length / 2 - internal_window_length / 2
    internal_bpend = window_length / 2 + internal_window_length / 2

    for idx, motif_id in enumerate(motif_ids):
        fisher_p_values[idx] = stats.fisher_exact(
            [[profile_presence_p[idx], N_seq_p - profile_presence_p[idx]],
             [profile_presence_n[idx], N_seq_n - profile_presence_n[idx]]])[1]
        central_enrichment[idx] = motifs_profiles_in_sequences[motif_id][
            internal_bpstart:internal_bpend].mean() / np.hstack([
                motifs_profiles_in_sequences[motif_id][:internal_bpstart],
                motifs_profiles_in_sequences[motif_id][internal_bpend:]
            ]).mean()

    motif_ratios = (support_p + 0.01) / (support_n + 0.01)

    # Fundamental!
    if not disable_ratio:
        motif_ratios[support_p < 0.03] = 1

    rankings = stats.rankdata(-motif_ratios)

    # filter here positive or positive and negative#################################
    if not disable_ratio:
        idxs_to_keep = np.nonzero(motif_ratios > 1)[0]
    else:
        idxs_to_keep = range(len(motif_ratios))

    rankings = rankings[idxs_to_keep]
    motif_ratios = motif_ratios[idxs_to_keep]
    support_p = support_p[idxs_to_keep]
    support_n = support_n[idxs_to_keep]
    fisher_p_values = fisher_p_values[idxs_to_keep]
    central_enrichment = central_enrichment[idxs_to_keep]

    motif_ids = [motif_ids[_] for _ in idxs_to_keep]
    motif_names = [motif_names[_] for _ in idxs_to_keep]
    motif_idxs = [_ for _ in idxs_to_keep]

    try:
        qvalues = estimate_qvalues(fisher_p_values)
        # we test the ones only with ratio >1
    except:
        print(fisher_p_values)

    # qvalues=estimate_qvalues(fisher_p_values,m=len(motif_ids))
    ################################################################################

    # generate reports in html
    info('Generating HTML report...')
    imgs_directory = os.path.join(output_directory, 'images')
    genes_list_directory = os.path.join(output_directory, 'genes_lists')
    motif_regions_directory = os.path.join(output_directory, 'motifs_regions')

    # create folders
    if not os.path.exists(imgs_directory):
        os.makedirs(imgs_directory)
    if use_gene_annotations and not os.path.exists(genes_list_directory):
        os.makedirs(genes_list_directory)
    if not os.path.exists(motif_regions_directory):
        os.makedirs(motif_regions_directory)

    j2_env = Environment(
        loader=FileSystemLoader(determine_path('extra') + '/templates/'),
        trim_blocks=True)

    info('DIRECTORY:%s' % determine_path('extra') + '/templates/')
    template = j2_env.get_template('report_template.html')

    # copy haystack logo and bg
    shutil.copyfile(
        determine_path('extra') + '/templates/haystack_logo.png',
        os.path.join(imgs_directory, 'haystack_logo.png'))
    shutil.copyfile(
        determine_path('extra') + '/templates/noise.png',
        os.path.join(imgs_directory, 'noise.png'))

    motifs_dump = []
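    # report a motif only if it has enough target support, a significant Fisher p-value, ratio > 1 (unless disabled) and sufficient central enrichment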
    for i in np.argsort(rankings):
        if (support_p[i] >= 0.03
                or disable_ratio) and fisher_p_values[i] < 0.01 and (
                    motif_ratios[i] > 1 or disable_ratio
                ) and central_enrichment[i] > min_central_enrichment:
            # if (support_p[i]>=0.01 or  support_n[i]>=0.01) and fisher_p_values[i]<0.1 and  (central_enrichment[i]>1.1 or central_enrichment[i]<0.9) and  ( motif_ratios[i]>1.1 or motif_ratios[i]<0.9):

            info('Generating logo and profile for:' + motif_ids[i])

            # create motif logo
            img_logo = os.path.join(imgs_directory, 'logo_' + motif_ids[i])
            generate_weblogo(motif_ids[i],
                             meme_motifs_filename,
                             img_logo,
                             title=motif_ids[i])
            generate_weblogo(motif_ids[i],
                             meme_motifs_filename,
                             img_logo,
                             title=motif_ids[i],
                             file_format='pdf')
            # fix the weblogo prefix problem
            img_logo_url = os.path.join('images',
                                        'logo_' + motif_ids[i] + '.png')

            # create motif enrichment profile
            img_profile = os.path.join(imgs_directory,
                                       'profile_' + motif_ids[i] + '.png')
            motif_profile_target = motifs_profiles_in_sequences[
                motif_ids[i]] / N_seq_p
            motif_profile_bg = motifs_profiles_in_bg[motif_ids[i]] / N_seq_n

            # print motif_profile_target.shape, motif_profile_bg.shape
            generate_motif_profile(motif_profile_target,
                                   motif_profile_bg,
                                   motif_ids[i],
                                   img_profile,
                                   smooth_size=smooth_size,
                                   window_size=window_length)
            img_profile_url = os.path.join('images',
                                           'profile_' + motif_ids[i] + '.png')

            # create regions
            info('Extracting regions with:' + motif_ids[i])
            regions = os.path.join(
                motif_regions_directory,
                motif_ids[i] + '_motif_region_in_target.bed')
            with open(regions, 'w+') as outfile:
                outfile.write(
                    'Chromosome\tStart\tEnd\tMotif hits inside region\tNumber of hits\n'
                )
                for c, locations in motif_coords_in_seqs_with_motif[
                        motif_ids[i]].items():
                    outfile.write('\t'.join([
                        c.chr_id,
                        str(c.bpstart),
                        str(c.bpend), ';'.join([
                            '-'.join(map(str, map(int, l))) for l in locations
                        ]),
                        str(len(locations))
                    ]) + '\n')
            regions_url = os.path.join(
                'motifs_regions', motif_ids[i] + '_motif_region_in_target.bed')

            # map closest downstream genes
            genes_url = None
            if use_gene_annotations:
                info('Mapping regions with:%s to the closest genes' %
                     motif_ids[i])

                peak_annotator_path = os.path.join(determine_path('extra/'),
                                                   'PeakAnnotator.jar')

                if gene_ids_to_names_filename:
                    sb.call('java -jar ' + peak_annotator_path + ' -u TSS -p %s -a %s -s %s -o %s >/dev/null 2>&1' \
                            % (regions, gene_annotations_filename, gene_ids_to_names_filename, genes_list_directory),
                            shell=True)
                else:
                    sb.call('java -jar ' + peak_annotator_path + ' -u TSS -p %s -a %s  -o %s >/dev/null 2>&1' \
                            % (regions, gene_annotations_filename, genes_list_directory), shell=True)

                genes_url = os.path.join(
                    'genes_lists',
                    motif_ids[i] + '_motif_region_in_target.tss.bed')

            motifs_dump.append({
                'id': motif_ids[i],
                'name': motif_names[i],
                'support_p': support_p[i] * 100,
                'support_n': support_n[i] * 100,
                'ratio': motif_ratios[i],
                'rank': float(rankings[i]),
                'pvalue': fisher_p_values[i],
                'qvalue': qvalues[i],
                'central_enrichment': central_enrichment[i],
                'img_logo': img_logo_url,
                'img_profile': img_profile_url,
                'regions': regions_url,
                'genes': genes_url,
                'idx_motif': motif_idxs[i]
            })

    outfile = codecs.open(
        os.path.join(output_directory, "Haystack_report.html"), "w", "utf-8")
    outfile.write(template.render(motifs_dump=motifs_dump, bed_target_filename=bed_target_filename,
                                  bed_bg_filename=bed_bg_filename, N_TARGET=N_seq_p, N_BG=N_seq_n, \
                                  meme_motifs_filename=meme_motifs_filename, COMMAND_USED=COMMAND_USED,
                                  use_gene_annotations=use_gene_annotations))
    outfile.close()

    if dump:
        info('Saving all the intermediate data on: %s ...' % output_directory)
        dump_directory = os.path.join(output_directory, 'dump')

        if not os.path.exists(dump_directory):
            os.makedirs(dump_directory)

        np.save(os.path.join(dump_directory, 'matrix_' + target_name),
                positive_matrix)
        np.save(os.path.join(dump_directory, 'matrix_BG_' + target_name),
                negative_matrix)

        cp.dump(
            motifs_dump,
            open(
                os.path.join(dump_directory,
                             target_name + '_motif_dumps.pickle'), 'w'))

        # cp.dump( motifs_profiles_in_sequences,open( os.path.join(dump_directory,target_name+'_profiles.pickle'),'w'))
        # cp.dump( motifs_profiles_in_bg,open( os.path.join(dump_directory,bg_name+'_profiles.pickle'),'w'))

        cp.dump(
            idxs_seqs_with_motif,
            open(
                os.path.join(dump_directory,
                             target_name + '_motif_seqs_idxs.pickle'), 'w'))
        cp.dump(
            idxs_seqs_with_motif_bg,
            open(
                os.path.join(dump_directory,
                             bg_name + '_motif_seqs_idxs.pickle'), 'w'))

        cp.dump(
            motif_coords_in_seqs_with_motif,
            open(
                os.path.join(
                    dump_directory,
                    target_name + '_motif_coords_in_seqs_with_motif.pickle'),
                'w'))

        Coordinate.coordinates_to_bed(
            target_coords,
            os.path.join(
                dump_directory,
                'Target_coordinates_selected_on_' + target_name + '.bed'),
            minimal_format=False)
        Coordinate.coordinates_to_bed(
            bg_coords,
            os.path.join(dump_directory,
                         'BG_coordinates_selected_on_' + bg_name + '.bed'),
            minimal_format=True)
    #info('Motif analysis for Sample %s completed' %name)
    info('Motif analysis completed! Ciao!')
def main():

  
    print '\n[H A Y S T A C K   M O T I F S]'
    print('\n-MOTIF ENRICHMENT ANALYSIS- [Luca Pinello - [email protected]]\n')
    print 'Version %s\n' % HAYSTACK_VERSION
    
    bootstrap=False
    ngram_correction='g'

    #mandatory
    parser = argparse.ArgumentParser(description='HAYSTACK Parameters')
    parser.add_argument('bed_target_filename', type=str,  help='A bed file containing the target coordinates on the genome of reference')
    parser.add_argument('genome_name', type=str,  help='Genome assembly to use from UCSC (for example hg19, mm9, etc.)')

    #optional
    parser.add_argument('--bed_bg_filename', type=str,  help="A bed file containing the background coordinates on the genome of reference (default: random sampled regions from the genome)", default='random_background')
    parser.add_argument('--meme_motifs_filename', type=str, help='Motifs database in MEME format (default JASPAR CORE 2016)')
    parser.add_argument('--nucleotide_bg_filename',type=str, help='Nucleotide probability for the background in MEME format (default precomputed on the Genome)')
    parser.add_argument('--p_value', type=float, help='FIMO p-value for calling a motif hit significant (default: 1e-4)',default=1e-4)
    parser.add_argument('--no_c_g_correction',  help='Disable the matching of the C+G density of the background',action='store_true')
    parser.add_argument('--c_g_bins', type=int,help='Number of bins for the C+G density correction (default: 8)',default=8)
    parser.add_argument('--mask_repetitive', help='Mask repetitive sequences',action='store_true')
    parser.add_argument('--n_target_coordinates', type=int, help='Number of target coordinates to use (default: all)',default=np.inf)
    parser.add_argument('--use_entire_bg', help='Use the entire background file (use only when the cg correction is disabled)',action='store_true')   
    parser.add_argument('--bed_score_column', type=int, help='Column in the bedfile that represents the score (default: 5)',default=5)
    parser.add_argument('--bg_target_ratio', type=int, help='Background size/Target size ratio (default: 2)',default=2)
    parser.add_argument('--bootstrap',  help='Enable the bootstrap if the target set or the background set are too small, choices: True, False (default: False)',action='store_true')
    parser.add_argument('--temp_directory',  help='Directory to store temporary files  (default: /tmp)', default='/tmp')
    parser.add_argument('--no_random_sampling_target',  help='Select the best --n_target_coordinates using the score column from the target file instead of randomly select them',action='store_true')
    parser.add_argument('--name',  help='Define a custom output filename for the report', default='')
    parser.add_argument('--internal_window_length', type=int, help='Window length in bp for the enrichment (default: average length of the target sequences)')
    parser.add_argument('--window_length', type=int, help='Window length in bp for the profiler (default:internal_window_length*5)')
    parser.add_argument('--min_central_enrichment', type=float, help='Minimum central enrichment to report a motif (default:>1.0)',default=1.0)
    parser.add_argument('--disable_ratio',  help='Disable target/bg ratio filter',action='store_true')
    parser.add_argument('--dump', help='Dump all the intermediate data, choices: True, False (default: False)',action='store_true')
    parser.add_argument('--output_directory',type=str, help='Output directory (default: current directory)',default='')
    parser.add_argument('--smooth_size',type=int, help='Size in bp for the smoothing window (default: internal_window_length/5)')
    parser.add_argument('--gene_annotations_filename',type=str, help='Optional gene annotations file from the UCSC Genome Browser in bed format to map each region to its closest gene')
    parser.add_argument('--gene_ids_to_names_filename',type=str, help='Optional mapping file from gene ids to gene names (relevant only if --gene_annotations_filename is used)')
    parser.add_argument('--n_processes',type=int, help='Specify the number of processes to use. The default is #cores available.',default=mp.cpu_count())
    parser.add_argument('--version',help='Print version and exit.',action='version', version='Version %s' % HAYSTACK_VERSION)

    args = parser.parse_args()

    args_dict=vars(args)
    for key,value in args_dict.items():
        if key=='n_target_coordinates':
            n_target_coordinates=value
        else:
            exec('%s=%s' %(key,repr(value)))

    
    bed_score_column-=1

    if no_c_g_correction:
        c_g_correction=False
    else:
        c_g_correction=True

    if no_random_sampling_target:
        random_sampling_target=False
    else:
        random_sampling_target=True
        

    check_file(bed_target_filename)

    if not  bed_bg_filename == 'random_background':
        check_file(bed_bg_filename)


    if meme_motifs_filename:
        check_file(meme_motifs_filename)
    else:
        meme_motifs_filename=os.path.join(determine_path('motif_databases'),'JASPAR_CORE_2016_vertebrates.meme')
        
    annotation_directory=determine_path('gene_annotations')
    if gene_annotations_filename:

        if which('java') is None:
            error('The mapping to the closest gene requires Java, freely available from: http://java.com/en/download/')
            use_gene_annotations=False
        else:
            check_file(gene_annotations_filename) 
            info('Using %s as gene annotations file' % gene_annotations_filename)
            use_gene_annotations=True
    else:
            gene_annotations_filename=os.path.join(annotation_directory,'%s_genes.bed' % genome_name)
            gene_ids_to_names_filename=os.path.join(annotation_directory,'%s_genes_id_to_names' % genome_name)
            
            if os.path.exists(gene_annotations_filename) and os.path.exists(gene_ids_to_names_filename):
                use_gene_annotations=True
            else:
                use_gene_annotations=False
                info('No gene annotations file specified')


    target_name=ntpath.basename(bed_target_filename.replace('.bed',''))
    bg_name=ntpath.basename(bed_bg_filename.replace('.bed',''))
    #timestamp=(datetime.datetime.now().isoformat()[:-3].replace('T','(')+str(np.random.randint(10000))+')').replace(':','.')

    if name:
        directory_name='HAYSTACK_MOTIFS_on_'+name
    else:
        directory_name='HAYSTACK_on_'+target_name+'_VS_'+bg_name

    if output_directory:
        output_directory=os.path.join(output_directory, directory_name)
    else:
        output_directory=directory_name


    info('###PARAMETERS USED###\n\t\t -TARGET: %s \n\t\t -BACKGROUND: %s \n\t\t -BG_TARGET_RATIO: %s\n\t\t -C+G CORRECTION: %s\n\t\t -MASKING REPETITIVE: %s\n\t\t -COORDINATES TO ANALYZE: %s\n\t\t -OUTPUT DIRECTORY: %s\n'\
         %(bed_target_filename,bed_bg_filename,str(bg_target_ratio),str(c_g_correction),str(mask_repetitive),'ALL' if np.isinf(n_target_coordinates) else str(n_target_coordinates),output_directory))

    info('Initializing Genome:%s' %genome_name)

    genome_directory=determine_path('genomes')
    genome_2bit=os.path.join(genome_directory,genome_name+'.2bit')
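    # use the local .2bit genome if present, otherwise offer to download it with haystack_download_genome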

    if os.path.exists(genome_2bit):
        genome=Genome_2bit(genome_2bit)
    else:
        info("\nIt seems you don't have the required genome file.")
        if query_yes_no('Should I download it for you?'):
            sb.call('haystack_download_genome %s' %genome_name,shell=True,env=system_env)
            if os.path.exists(genome_2bit):
                info('Genome correctly downloaded!')
                genome=Genome_2bit(genome_2bit)
            else:
                error('Sorry I cannot download the required file for you. Check your Internet connection.')
                sys.exit(1)
        else:
            error('Sorry I need the genome file to perform the analysis. Exiting...')
            sys.exit(1)

    if not nucleotide_bg_filename:
        nucleotide_bg_filename=os.path.join(genome_directory,genome_name+'_meme_bg')

    check_file(nucleotide_bg_filename)
        


    N_TARGET=None
    N_BG=None
    COMMAND_USED=' '.join(sys.argv)

    _n_target_coordinates=n_target_coordinates


    info('Loading Target coordinates from bed:%s' % bed_target_filename)
    target_coords=Coordinate.bed_to_coordinates(bed_target_filename,cl_score=bed_score_column)

    if len(target_coords) == 0:
        info('No coordinates to analyze in your input file. Exiting.')
        sys.exit(1)

    #calculate automatically the average length of the target regions


    if internal_window_length:
        info('Using the user defined internal window length:%d' % internal_window_length )
        if internal_window_length % 2:
            internal_window_length+=1
            
    else:
                        
        internal_window_length=int(np.mean(map(len,target_coords)))
        if internal_window_length % 2:
            internal_window_length+=1
        info('Using the average length of target coordinates as internal window length:%d' % internal_window_length )

        if not window_length:
            window_length=internal_window_length*5

    info('Total window length:%d' % window_length ) 

        

    if not smooth_size:
        smooth_size=internal_window_length/5

    target_coords=Coordinate.coordinates_of_intervals_around_center(target_coords,internal_window_length)
        

    if len(target_coords)>n_target_coordinates:
        if random_sampling_target:
            info('Sampling %d coordinates among the %d total' %( n_target_coordinates,len(target_coords)))
            target_coords=random.sample(target_coords,n_target_coordinates)
        else:
            info('Selecting the best %d coordinates among the %d total' %( n_target_coordinates,len(target_coords)))
            sorted_idxs_by_score=np.argsort([c.score for c in target_coords])[::-1]
            target_coords=[target_coords[idx] for idx in sorted_idxs_by_score[:n_target_coordinates]]
    else:
        
        if random_sampling_target and bootstrap and not np.isinf(n_target_coordinates):
            warn('Number of target regions < %d' % n_target_coordinates)
            info('bootstrapping to obtain enough target regions')
            target_coords=sample_wr(target_coords,n_target_coordinates)
        else:
            info('Using all the %d target coordinates' % len(target_coords))
            

    info('Extracting Motifs in target coordinates')
    positive_matrix,motifs_profiles_in_sequences, idxs_seqs_with_motif,motif_coords_in_seqs_with_motif,motif_names,motif_ids=parallel_fimo_scanning(target_coords,
                                                                                                                                                  meme_motifs_filename,
                                                                                                                                                  genome,nucleotide_bg_filename,
                                                                                                                                                  temp_directory=temp_directory,
                                                                                                                                                  p_value=p_value,
                                                                                                                                                  mask_repetitive=mask_repetitive,
                                                                                                                                                  window_length=window_length,
                                                                                                                                                  internal_window_length=internal_window_length,
                                                                                                                                                  num_consumers=n_processes)
    n_target_coordinates=len(target_coords) #fix for the bootstrap!




    if bed_bg_filename == 'random_background':
        info('Extracting Random Coordinates from the genome...')

        if c_g_correction:
            info('Calculating the C+G content of the target coordinates')
            bg_coords=[]
            c_g_content_target=calculate_average_ngram_presence(target_coords,genome,ngram_correction)

            info('Extract a Matching C+G Background')
            bins=np.hstack((np.linspace(0,1,c_g_bins),np.inf))

            for _ in range(bg_target_ratio):
                for idx_c,c in enumerate(target_coords):
                    c_bin=np.nonzero(np.histogram(c_g_content_target[idx_c],bins)[0])[0][0]
                    c_random_bin=-1
                    
                    while c_random_bin != c_bin:
                        random_bpstart=np.random.randint(1,genome.chr_len[c.chr_id]-len(c)+1)
                        c_random=Coordinate(c.chr_id,random_bpstart,random_bpstart+len(c)-1)
                        seq=genome.extract_sequence(c_random)
                        c_g_content_c_random=(seq.count('c')+seq.count('g'))/float(len(c))
                        c_random_bin=np.nonzero(np.histogram(c_g_content_c_random,bins)[0])[0][0]

                    #print bg_target_ratio,c_bin,c_random_bin, ' still to match:',len(target_coords)-idx_c
                    bg_coords.append(c_random)

            c_g_content_bg=calculate_average_ngram_presence(bg_coords,genome,ngram_correction)
            bg_hist=np.histogram(c_g_content_bg,bins)[0]
            debug('original: '+str(np.histogram(c_g_content_target,bins)[0]))
            debug('obtained:'+str(np.histogram(c_g_content_bg,bins)[0]))

        else:
            bg_coords=get_random_coordinates(target_coords,genome)
        
        info('Done!')
       
    else:
        info('Loading Background Coordinates from:%s' % bed_bg_filename)
        bg_coords=Coordinate.bed_to_coordinates(bed_bg_filename)
        bg_coords=Coordinate.coordinates_of_intervals_around_center(bg_coords,internal_window_length)

        if use_entire_bg:
            bg_target_ratio=float(len(bg_coords))/n_target_coordinates
            info('Using all the coordinates in the BG, BG/TG:%f', bg_target_ratio)

        
        if c_g_correction:
            info('Calculating the C+G content')
            c_g_content_target=calculate_average_ngram_presence(target_coords,genome,ngram_correction)   
            c_g_content_bg=calculate_average_ngram_presence(bg_coords,genome,ngram_correction)

            info('Extract a Matching C+G Background')
            bins=np.hstack((np.linspace(0,1,c_g_bins),np.inf))
            target_hist=np.histogram(c_g_content_target,bins)[0]
            bg_hist=np.histogram(c_g_content_bg,bins)[0]
            ratios=bg_hist/(target_hist*1.0)
            debug('original:%s' %target_hist)
            debug('bg:%s' %bg_hist)
            debug('ratios:%s' %ratios)
            K_MATCH=min(bg_target_ratio,ratios[~np.isnan(ratios) & ~np.isinf(ratios) & (ratios>0) &(target_hist/float(target_hist.sum())>0.05)].min())

            debug('K_MATCH:%d' %K_MATCH)

            to_match=np.int32(np.floor(K_MATCH*target_hist))

            debug('to_match:%s' %to_match)
            
            idxs_corrected_bg=np.array([],dtype=int)

            for idx_bin in range(len(bins)-1):
                idxs_matching_regions=np.nonzero((c_g_content_bg>=bins[idx_bin]) & (c_g_content_bg<bins[idx_bin+1]))[0]
                to_take=np.random.permutation(len(idxs_matching_regions))
                to_take=to_take[range(min(len(idxs_matching_regions),to_match[idx_bin]))]
                idxs_corrected_bg= np.hstack((idxs_corrected_bg,idxs_matching_regions[to_take]))  

            debug('original:%s' %target_hist)
            debug('K:%d' %K_MATCH)
            debug('to sample:%s' %to_match) 
            debug( 'obtained:%s' % np.histogram(c_g_content_bg[idxs_corrected_bg],bins)[0] )
            bg_coords=[bg_coords[idx] for idx in idxs_corrected_bg]
            c_g_content_bg=calculate_average_ngram_presence(bg_coords,genome,ngram_correction)
            debug(np.histogram(c_g_content_bg,bins)[0])
            if np.array_equal(K_MATCH*target_hist,np.histogram(c_g_content_bg,bins)[0]):
                info('C+G content perfectly matched!\n\ttarget:%s\n\tbg    :%s' % (target_hist,np.histogram(c_g_content_bg,bins)[0]))
            else:
                warn('C+G content not perfectly matched\n\ttarget:%s\n\tbg    :%s'%(target_hist,np.histogram(c_g_content_bg,bins)[0]))

            debug(target_hist/np.histogram(c_g_content_bg,bins)[0])


    if len(bg_coords)>=bg_target_ratio*n_target_coordinates:
        bg_coords=random.sample(bg_coords,int(bg_target_ratio*n_target_coordinates))
    else:
        if bootstrap and len(bg_coords)<(bg_target_ratio*n_target_coordinates*0.95): #allow a small tolerance!
            info('bootstrapping to obtain enough background regions')
            bg_coords=sample_wr(bg_coords,int(bg_target_ratio*n_target_coordinates))
            c_g_content_bg=calculate_average_ngram_presence(bg_coords,genome,ngram_correction)
            debug('After bootstrap:\n\ttarget:%s\n\tbg    :%s' % (target_hist,np.histogram(c_g_content_bg,bins)[0]))

    info('Extracting Motifs in background coordinates')
    negative_matrix,motifs_profiles_in_bg,idxs_seqs_with_motif_bg=parallel_fimo_scanning(bg_coords,
                                                                                       meme_motifs_filename,
                                                                                       genome,nucleotide_bg_filename,
                                                                                       temp_directory=temp_directory,
                                                                                       p_value=p_value,
                                                                                       mask_repetitive=mask_repetitive,
                                                                                       window_length=window_length,
                                                                                       internal_window_length=internal_window_length,
                                                                                       num_consumers=n_processes)[0:3]

    #allocate data for the reports
    N_MOTIFS=len(motif_ids)
    rankings=np.zeros(N_MOTIFS,dtype=np.int16)
    motif_ratios=np.zeros(N_MOTIFS)
    support_p=np.zeros(N_MOTIFS)
    support_n=np.zeros(N_MOTIFS)
    fisher_p_values=np.zeros(N_MOTIFS)
    central_enrichment=np.zeros(N_MOTIFS)

    N_seq_p=positive_matrix.shape[0]
    N_seq_n=negative_matrix.shape[0]

    profile_presence_p=(positive_matrix>0).sum(0)
    profile_presence_n=(negative_matrix>0).sum(0)

    support_p=profile_presence_p/float(N_seq_p)
    support_n=profile_presence_n/float(N_seq_n)

    internal_bpstart=window_length/2-internal_window_length/2
    internal_bpend=window_length/2+internal_window_length/2

    for idx,motif_id in enumerate(motif_ids):
        fisher_p_values[idx]= stats.fisher_exact([[ profile_presence_p[idx], N_seq_p-profile_presence_p[idx]], [ profile_presence_n[idx], N_seq_n-profile_presence_n[idx]]])[1]
        central_enrichment[idx]=motifs_profiles_in_sequences[motif_id][internal_bpstart:internal_bpend].mean()/ np.hstack([motifs_profiles_in_sequences[motif_id][:internal_bpstart],motifs_profiles_in_sequences[motif_id][internal_bpend:]]).mean()
        
    motif_ratios=(support_p+0.01)/(support_n+0.01)

    #Fundamental: motifs with very low support (<3%) in the target are neutralized (ratio set to 1)
    if not disable_ratio:
        motif_ratios[support_p<0.03]=1
    
    rankings=stats.rankdata(-motif_ratios)


    #filter here positive or positive and negative#################################
    if not disable_ratio:
        idxs_to_keep=np.nonzero(motif_ratios>1)[0]
    else:
        idxs_to_keep=range(len(motif_ratios))
        

    rankings=rankings[idxs_to_keep]
    motif_ratios=motif_ratios[idxs_to_keep]
    support_p=support_p[idxs_to_keep]
    support_n=support_n[idxs_to_keep]
    fisher_p_values=fisher_p_values[idxs_to_keep]
    central_enrichment=central_enrichment[idxs_to_keep]

    motif_ids=[motif_ids[_] for _ in idxs_to_keep]
    motif_names=[motif_names[_] for _ in idxs_to_keep]
    motif_idxs=[_ for _ in idxs_to_keep]

    try:
        qvalues=estimate_qvalues(fisher_p_values) # we test only the motifs with ratio > 1
    except:
        print fisher_p_values

    #qvalues=estimate_qvalues(fisher_p_values,m=len(motif_ids))
    ################################################################################


    #generate reports in html
    info('Generating HTML report...')
    imgs_directory=os.path.join(output_directory,'images')
    genes_list_directory=os.path.join(output_directory,'genes_lists')
    motif_regions_directory=os.path.join(output_directory,'motifs_regions')

    #create folders
    if not os.path.exists(imgs_directory):
        os.makedirs(imgs_directory)
    if use_gene_annotations and not os.path.exists(genes_list_directory):
        os.makedirs(genes_list_directory)
    if not os.path.exists(motif_regions_directory):
        os.makedirs(motif_regions_directory)


    j2_env = Environment(loader=FileSystemLoader(determine_path('extra')+'/templates/'),trim_blocks=True)

    info('DIRECTORY:%s' % determine_path('extra')+'/templates/')
    template= j2_env.get_template('report_template.html')

    #copy haystack logo and bg
    shutil.copyfile(determine_path('extra')+'/templates/haystack_logo.png', os.path.join(imgs_directory,'haystack_logo.png'))
    shutil.copyfile(determine_path('extra')+'/templates/noise.png', os.path.join(imgs_directory,'noise.png'))

    motifs_dump=[]
    for i in np.argsort(rankings):
        if (support_p[i]>=0.03 or disable_ratio)  and fisher_p_values[i]<0.01  and  (motif_ratios[i]>1 or disable_ratio) and central_enrichment[i]>min_central_enrichment:
        #if (support_p[i]>=0.01 or  support_n[i]>=0.01) and fisher_p_values[i]<0.1 and  (central_enrichment[i]>1.1 or central_enrichment[i]<0.9) and  ( motif_ratios[i]>1.1 or motif_ratios[i]<0.9):
       
            info('Generating logo and profile for:'+motif_ids[i])
            
            #create motif logo
            img_logo=os.path.join(imgs_directory,'logo_'+motif_ids[i])
            generate_weblogo(motif_ids[i],meme_motifs_filename,img_logo,title=motif_ids[i],SEQLOGO=determine_path('extra')+'/seqlogo')
            generate_weblogo(motif_ids[i],meme_motifs_filename,img_logo,title=motif_ids[i],SEQLOGO=determine_path('extra')+'/seqlogo',file_format='pdf')
            #fix the weblogo prefix problem
            img_logo_url=os.path.join('images','logo_'+motif_ids[i]+'.png')
            
            #create motif enrichment profile
            img_profile=os.path.join(imgs_directory,'profile_'+motif_ids[i]+'.png')
            motif_profile_target=motifs_profiles_in_sequences[motif_ids[i]]/N_seq_p
            motif_profile_bg=motifs_profiles_in_bg[motif_ids[i]]/N_seq_n

            #print motif_profile_target.shape, motif_profile_bg.shape
            generate_motif_profile(motif_profile_target,motif_profile_bg,motif_ids[i],img_profile,smooth_size=smooth_size,window_size=window_length)
            img_profile_url=os.path.join('images','profile_'+motif_ids[i]+'.png')
            
            #create regions
            info('Extracting regions with:'+motif_ids[i])
            regions=os.path.join(motif_regions_directory,motif_ids[i]+'_motif_region_in_target.bed')
            with open(regions,'w+') as outfile:
                outfile.write('Chromosome\tStart\tEnd\tMotif hits inside region\tNumber of hits\n')
                for c,locations in motif_coords_in_seqs_with_motif[motif_ids[i]].items():
                    outfile.write('\t'.join([c.chr_id,str(c.bpstart),str(c.bpend),';'.join(['-'.join(map(str,map(int,l))) for l in locations]),str(len(locations))])+'\n')
            regions_url=os.path.join('motifs_regions',motif_ids[i]+'_motif_region_in_target.bed')
            
            #map closest downstream genes
            genes_url=None
            if use_gene_annotations:
                info('Mapping regions with:%s to the closest genes' % motif_ids[i])

                peak_annotator_path=os.path.join(determine_path('extra/'),'PeakAnnotator.jar')
                    
                if gene_ids_to_names_filename:
                    sb.call('java -jar '+peak_annotator_path+' -u TSS -p %s -a %s -s %s -o %s >/dev/null 2>&1' \
                            %(regions,gene_annotations_filename,gene_ids_to_names_filename,genes_list_directory),  shell=True,env=system_env)
                else:
                    sb.call('java -jar '+peak_annotator_path+' -u TSS -p %s -a %s  -o %s >/dev/null 2>&1' \
                            %(regions,gene_annotations_filename,genes_list_directory),  shell=True,env=system_env)

                
                genes_url=os.path.join('genes_lists',motif_ids[i]+'_motif_region_in_target.tss.bed')
                                
                
            motifs_dump.append({'id':motif_ids[i],'name':motif_names[i],'support_p':support_p[i]*100,
                                 'support_n':support_n[i]*100, 'ratio':motif_ratios[i],'rank':float(rankings[i]),
                                 'pvalue':fisher_p_values[i],'qvalue':qvalues[i],'central_enrichment':central_enrichment[i],
                                 'img_logo':img_logo_url,'img_profile':img_profile_url,'regions':regions_url,'genes':genes_url,'idx_motif':motif_idxs[i]})


    outfile= codecs.open(os.path.join(output_directory,"Haystack_report.html"), "w", "utf-8")
    outfile.write(template.render(motifs_dump=motifs_dump,bed_target_filename=bed_target_filename,bed_bg_filename=bed_bg_filename, N_TARGET=N_seq_p, N_BG=N_seq_n,\
                                  meme_motifs_filename=meme_motifs_filename, COMMAND_USED=COMMAND_USED,use_gene_annotations=use_gene_annotations))
    outfile.close()    

    if dump:
        info('Saving all the intermediate data on: %s ...' % output_directory)
        dump_directory=os.path.join(output_directory,'dump')
        
        if not os.path.exists(dump_directory):
            os.makedirs(dump_directory)
            
        np.save(os.path.join(dump_directory,'matrix_'+target_name),positive_matrix)
        np.save(os.path.join(dump_directory,'matrix_BG_'+target_name),negative_matrix)
        
        cp.dump(motifs_dump,open(os.path.join(dump_directory,target_name+'_motif_dumps.pickle'),'w'))

        #cp.dump( motifs_profiles_in_sequences,open( os.path.join(dump_directory,target_name+'_profiles.pickle'),'w'))
        #cp.dump( motifs_profiles_in_bg,open( os.path.join(dump_directory,bg_name+'_profiles.pickle'),'w'))

        cp.dump(idxs_seqs_with_motif,open(os.path.join(dump_directory,target_name+'_motif_seqs_idxs.pickle'),'w'))
        cp.dump(idxs_seqs_with_motif_bg,open(os.path.join(dump_directory,bg_name+'_motif_seqs_idxs.pickle'),'w'))

        cp.dump(motif_coords_in_seqs_with_motif,open(os.path.join(dump_directory,target_name+'_motif_coords_in_seqs_with_motif.pickle'),'w'))

        Coordinate.coordinates_to_bed(target_coords,os.path.join(dump_directory,'Target_coordinates_selected_on_'+target_name+'.bed'),minimal_format=False)
        Coordinate.coordinates_to_bed(bg_coords,os.path.join(dump_directory,'BG_coordinates_selected_on_'+ bg_name+'.bed'),minimal_format=True)

    info('All done! Ciao!')
    sys.exit(0)
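A note on the bootstrap step above: it calls a sample_wr helper that is not defined in this excerpt. A minimal sketch, assuming the helper simply draws n elements uniformly at random with replacement (the name comes from the call site; the real implementation may differ):

import random

def sample_wr(population, n):
    # draw n items uniformly at random, with replacement (a hypothetical stand-in
    # for the sample_wr helper used in the bootstrap branch above)
    return [random.choice(population) for _ in range(n)]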
Example #6
def main():
    print '\n[H A Y S T A C K   T F  A C T I V I T Y  P L A N E]'
    print(
        '\n-TFs Activity on Gene Expression- [Luca Pinello - [email protected]]\n'
    )
    print 'Version %s\n' % HAYSTACK_VERSION

    #mandatory
    parser = argparse.ArgumentParser(description='HAYSTACK Parameters',
                                     prog='haystack_tf_activity_plane')
    parser.add_argument(
        'haystack_motifs_output_folder',
        type=str,
        help='A path to a folder created by the haystack_motifs utility')
    parser.add_argument(
        'gene_expression_samples_filename',
        type=str,
        help='A file containing the list of sample names and locations')
    parser.add_argument(
        'target_cell_type',
        type=str,
        help='The sample name to use as a target for the analysis')

    #optional
    parser.add_argument(
        '--motif_mapping_filename',
        type=str,
        help=
        'Custom motif to gene mapping file (the default is for JASPAR CORE 2016 database)'
    )
    parser.add_argument('--output_directory',
                        type=str,
                        help='Output directory (default: current directory)')
    parser.add_argument('--name',
                        help='Define a custom output filename for the report')
    parser.add_argument(
        '--plot_all',
        help=
        'Disable the filter on the TF activity and correlation (default z-score TF>0 and rho>0.3)',
        action='store_true')
    parser.add_argument('--version',
                        help='Print version and exit.',
                        action='version',
                        version='Version %s' % HAYSTACK_VERSION)

    args = parser.parse_args()

    args_dict = vars(args)
    for key, value in args_dict.items():
        exec('%s=%s' % (key, repr(value)))

    if not os.path.exists(haystack_motifs_output_folder):
        error("The haystack_motifs_output_folder specified: %s doesn't exist!" % haystack_motifs_output_folder)
        sys.exit(1)

    check_file(gene_expression_samples_filename)

    if motif_mapping_filename:
        check_file(motif_mapping_filename)
    else:
        motif_mapping_filename = os.path.join(
            determine_path('motif_databases'),
            'JASPAR_CORE_2016_vertebrates_mapped_to_gene_human_mouse.txt')

    if name:
        directory_name = 'HAYSTACK_TFs_ACTIVITY_PLANES_on_' + name
    else:
        directory_name = 'HAYSTACK_TFs_ACTIVITY_PLANES_on_' + target_cell_type

    if output_directory:
        output_directory = os.path.join(output_directory, directory_name)
    else:
        output_directory = directory_name

    motif_mapping = pd.read_table(motif_mapping_filename,
                                  header=None,
                                  names=['MOTIF_ID', 'MOTIF_NAME', 'GENES'],
                                  index_col=0)
    motif_mapping = motif_mapping.reset_index().groupby('MOTIF_ID').apply(
        group_motif_mapping)
    motif_mapping = motif_mapping.set_index('MOTIF_ID')

    #load mapping filename
    df_gene_mapping = pd.read_table(FileWrapper(
        "#", gene_expression_samples_filename, "r"),
                                    header=None,
                                    index_col=0,
                                    names=['Sample_name', 'Sample_file'])

    if target_cell_type not in df_gene_mapping.index:
        error(
            '\nThe target_cell_type must be among these sample names:\n\n%s' %
            '\t'.join(df_gene_mapping.index.values))
        sys.exit(1)

    N_SAMPLES = df_gene_mapping.shape[0]

    if N_SAMPLES == 1:
        error(
            '\nYou need gene expression data for at least two cell-types. Exiting...'
        )
        sys.exit(1)
    elif N_SAMPLES == 2:
        USE_ZSCORE = False
        bg_target_cell_type = list(
            set(df_gene_mapping.index) - {target_cell_type})[0]
        info(
            'Only 2 samples provided, using the expression ratio plane instead of z-scores. Target:%s, Bg: %s'
            % (target_cell_type, bg_target_cell_type))
    else:
        USE_ZSCORE = True

    #load gene expression and calculate ranking
    gene_values = []
    for sample_name in df_gene_mapping.index:
        info('Loading gene expression file for: %s' % sample_name)
        check_file(df_gene_mapping.ix[sample_name, 'Sample_file'])
        gene_values.append(
            pd.read_table(df_gene_mapping.ix[sample_name, 'Sample_file'],
                          index_col=0,
                          names=['Gene_Symbol', sample_name]))

    gene_values = pd.concat(gene_values, axis=1)
    #TODO: convert gene names to uppercase
    gene_ranking = gene_values.rank(ascending=True)

    #create output folder
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    #For each motif make the plots
    for motif_gene_filename in glob.glob(
            os.path.join(haystack_motifs_output_folder, 'genes_lists') +
            '/*.bed'):

        current_motif_id = os.path.basename(motif_gene_filename).split('_')[0]
        info('Analyzing %s from:%s' % (current_motif_id, motif_gene_filename))

        #genes close to the motif sites
        mapped_genes = map(
            str.upper,
            list(pd.read_table(motif_gene_filename)['Symbol'].values))

        #target genes average activity
        if USE_ZSCORE:
            ds_values = zscore_series(gene_ranking.ix[mapped_genes, :].mean())
        else:
            ds_values = (
                gene_ranking.ix[mapped_genes, target_cell_type] /
                gene_ranking.ix[mapped_genes, bg_target_cell_type]).mean()

        if current_motif_id in motif_mapping.index:
            current_motif_name = motif_mapping.ix[current_motif_id].MOTIF_NAME

            for gene_name in set(
                    map(str.upper,
                        motif_mapping.ix[current_motif_id].GENES.split(','))):

                #specificity of the TF
                try:
                    if USE_ZSCORE:
                        tf_values = zscore_series(
                            gene_ranking.ix[gene_name.upper()])
                    else:
                        tf_values = (gene_ranking.ix[gene_name.upper(),
                                                     target_cell_type] /
                                     gene_ranking.ix[gene_name.upper(),
                                                     bg_target_cell_type])

                except:
                    warn(
                        'The expression values of the gene %s are not present. Skipping it.'
                        % gene_name.upper())
                    continue

                if USE_ZSCORE:
                    #correlation
                    ro = np.corrcoef(tf_values, ds_values)[0, 1]

                    tf_value = tf_values[target_cell_type]
                    ds_value = ds_values[target_cell_type]

                    info(
                        'Gene:%s TF z-score:%.2f Targets z-score:%.2f  Correlation:%.2f'
                        % (gene_name, tf_value, ds_value, ro))

                    #make plots
                    if (tf_value > 0 and np.abs(ro) > 0.3) or plot_all:

                        x_min = min(-4, tf_values.min() * 1.1)
                        x_max = max(4, tf_values.max() * 1.1)
                        y_min = min(-4, ds_values.min() * 1.1)
                        y_max = max(4, ds_values.max() * 1.1)

                        fig = plt.figure(figsize=(10, 10),
                                         dpi=80,
                                         facecolor='w',
                                         edgecolor='w')
                        ax = fig.add_subplot(111)
                        plt.grid()
                        plt.plot([x_min, x_max], [0, 0], 'k')
                        plt.plot([0, 0], [y_min, y_max], 'k')
                        ax.scatter(tf_values,
                                   ds_values,
                                   s=100,
                                   facecolors='none',
                                   edgecolors='k',
                                   label='rest of cell-types')
                        ax.hold(True)
                        ax.plot(tf_values[target_cell_type],
                                ds_values[target_cell_type],
                                '*r',
                                markersize=30,
                                linestyle='None',
                                label=target_cell_type)
                        ax.legend(loc='center',
                                  bbox_to_anchor=(0.5, -0.1),
                                  ncol=3,
                                  fancybox=True,
                                  shadow=True,
                                  numpoints=1)

                        ax.set_aspect('equal')
                        plt.text(x_min * 0.98,
                                 y_max * 0.85,
                                 r'$\rho$=%.2f' % ro,
                                 fontsize=14)
                        plt.xlim(x_min, x_max)
                        plt.ylim(y_min, y_max)

                        plt.xlabel('TF z-score', fontsize=16)
                        plt.ylabel('Targets z-score', fontsize=16)
                        plt.title(
                            'Motif: %s (%s) Gene: %s' %
                            (current_motif_name, current_motif_id, gene_name),
                            fontsize=17)
                        plt.savefig(
                            os.path.join(
                                output_directory,
                                '%s_motif_%s(%s)_gene_%s.pdf' %
                                (target_cell_type,
                                 current_motif_name.replace(
                                     '::', '_'), current_motif_id, gene_name)))
                        plt.close()

                else:
                    info(
                        'Gene:%s TF expression ratio:%.2f Targets expression ratio:%.2f'
                        % (
                            gene_name,
                            tf_values,
                            ds_values,
                        ))
                    x_min = min(0, tf_values * 1.1)
                    x_max = max(2, tf_values * 1.1)
                    y_min = min(0, ds_values * 1.1)
                    y_max = max(2, ds_values * 1.1)

                    if (tf_values > 1.2) & ((ds_values > 1.2) |
                                            (ds_values < 0.8)) or plot_all:
                        fig = plt.figure(figsize=(10, 10),
                                         dpi=80,
                                         facecolor='w',
                                         edgecolor='w')
                        ax = fig.add_subplot(111)
                        plt.grid()
                        plt.plot([x_min, x_max], [1, 1], 'k')
                        plt.plot([1, 1], [y_min, y_max], 'k')
                        ax.plot(tf_values,
                                ds_values,
                                '*r',
                                markersize=30,
                                linestyle='None',
                                label=target_cell_type)

                        ax.set_aspect('equal')
                        plt.xlim(x_min, x_max)
                        plt.ylim(y_min, y_max)

                        plt.xlabel('TF expression ratio (%s/%s)' %
                                   (target_cell_type, bg_target_cell_type),
                                   fontsize=16)
                        plt.ylabel('Average Targets Expression Ratio (%s/%s)' %
                                   (target_cell_type, bg_target_cell_type),
                                   fontsize=16)
                        plt.title(
                            'Motif: %s (%s) Gene: %s' %
                            (current_motif_name, current_motif_id, gene_name),
                            fontsize=17)
                        plt.savefig(
                            os.path.join(
                                output_directory,
                                '%s_motif_%s(%s)_gene_%s.pdf' %
                                (target_cell_type,
                                 current_motif_name.replace(
                                     '::', '_'), current_motif_id, gene_name)))
                        plt.close()

        else:
            warn('Sorry, the motif %s cannot be mapped to a gene' %
                 current_motif_id)
    info('All done! Ciao!')
    sys.exit(0)
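The plotting code above relies on a zscore_series helper that is not defined in this excerpt. A minimal sketch, assuming it standardizes a pandas Series to zero mean and unit standard deviation (an assumption based on how its output is treated as z-scores):

import pandas as pd

def zscore_series(s):
    # hypothetical stand-in: standardize a pandas Series as (x - mean) / std
    return (s - s.mean()) / s.std()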
Example #8
def create_tiled_genome(genome_name, output_directory, chr_len_filename,
                        bin_size, chrom_exclude, blacklist):

    from re import search
    genome_directory = determine_path('genomes')
    annotations_directory = determine_path('gene_annotations')

    genome_sorted_bins_file = os.path.join(
        output_directory,
        '%s.%dbp.bins.sorted.bed' % (os.path.basename(genome_name), bin_size))

    chr_len_sorted_filtered_filename = os.path.join(
        output_directory, "%s_chr_lengths_sorted_filtered.txt" % genome_name)

    if not (os.path.exists(genome_sorted_bins_file) and do_not_recompute):

        info('Sorting the chromosome lengths file once again to double-check...')

        cmd = ' sort -k1,1 -k2,2n  "%s" -o  "%s" ' % (chr_len_filename,
                                                      chr_len_filename)

        sb.call(cmd, shell=True)

        info('Creating bins of %dbp in %s' %
             (bin_size, genome_sorted_bins_file))

        if chrom_exclude:

            with open(chr_len_sorted_filtered_filename, 'wb') as f:
                f.writelines(line for line in open(chr_len_filename)
                             if not search(chrom_exclude,
                                           line.split()[0]))
        else:
            chr_len_sorted_filtered_filename = chr_len_filename

        cmd = 'bedtools makewindows -g "%s" -w %s   > "%s" ' % (
            chr_len_sorted_filtered_filename, bin_size,
            genome_sorted_bins_file)

        sb.call(cmd, shell=True)

        if blacklist == 'none':
            info('The tiled genome file will not be blacklist filtered')

        else:
            info('The tiled genome file will be blacklist filtered')

            if blacklist == 'hg19':
                info('Using hg19 blacklist file %s to filter out the regions' %
                     blacklist)
                blacklist_filepath = os.path.join(
                    annotations_directory, 'hg19_blacklisted_regions.bed')
                check_file(blacklist_filepath)
            elif os.path.isfile(blacklist):

                info('Using blacklist file %s to filter out the regions' %
                     blacklist)
                blacklist_filepath = blacklist
                check_file(blacklist_filepath)

            else:
                error('Incorrect blacklist option provided. '
                      'It is neither an existing file nor a supported genome name')
                sys.exit(1)

            info('Sort blacklist file')

            cmd = ' sort -k1,1 -k2,2n  "%s" -o  "%s" ' % (blacklist_filepath,
                                                          blacklist_filepath)

            sb.call(cmd, shell=True)

            genome_sorted_bins_filtered_file = genome_sorted_bins_file.replace(
                '.bed', '.filtered.bed')

            info('Filtering out blacklisted regions')

            cmd = 'bedtools intersect -sorted -a "%s" -b "%s"  -v  > %s ' % (
                genome_sorted_bins_file, blacklist_filepath,
                genome_sorted_bins_filtered_file)

            sb.call(cmd, shell=True)

            if not keep_intermediate_files:
                info('Deleting %s' % genome_sorted_bins_file)
                try:
                    os.remove(genome_sorted_bins_file)
                except:
                    pass

            genome_sorted_bins_file = genome_sorted_bins_filtered_file

    return genome_sorted_bins_file
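A hypothetical usage sketch for create_tiled_genome; every path and parameter value below is a placeholder, not taken from the source. Note that the function also reads do_not_recompute and keep_intermediate_files from module-level globals (they are injected elsewhere via exec over the parsed arguments), so a standalone call has to define them first:

do_not_recompute = False          # placeholder for the global flag the function expects
keep_intermediate_files = False   # placeholder for the global flag the function expects

bins_file = create_tiled_genome(
    genome_name='hg19',                          # assumed genome assembly
    output_directory='HAYSTACK_HOTSPOTS',        # assumed output folder
    chr_len_filename='hg19_chr_lengths.txt',     # assumed chromosome-lengths file
    bin_size=500,
    chrom_exclude='chrX|chrY|chrM|chr.*random',  # regex of chromosomes to skip
    blacklist='hg19')
print(bins_file)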
Example #9
def main(input_args=None):

    print '\n[H A Y S T A C K   P I P E L I N E]'
    print('\n-SELECTION OF HOTSPOTS OF VARIABILITY AND ENRICHED MOTIFS-\n')
    print 'Version %s\n' % HAYSTACK_VERSION
    parser = get_args_pipeline()
    args = parser.parse_args(input_args)

    args_dict = vars(args)
    for key, value in args_dict.items():
        exec('%s=%s' % (key, repr(value)))

    if meme_motifs_filename:
        check_file(meme_motifs_filename)

    if motif_mapping_filename:
        check_file(motif_mapping_filename)

    if not os.path.exists(temp_directory):
        error(
            'The folder specified with --temp_directory: %s does not exist!' %
            temp_directory)
        sys.exit(1)

    if input_is_bigwig:
        extension_to_check = '.bw'
        info('Input is set to BigWig (.bw)')
    else:
        extension_to_check = '.bam'
        info('Input is set to compressed SAM (.bam)')

    if name:
        directory_name = 'HAYSTACK_PIPELINE_RESULTS_on_%s' % name

    else:
        directory_name = 'HAYSTACK_PIPELINE_RESULTS'

    if output_directory:
        output_directory = os.path.join(output_directory, directory_name)
    else:
        output_directory = directory_name

    # check folder or sample filename

    USE_GENE_EXPRESSION = True
    if not os.path.exists(samples_filename_or_bam_folder):
        error("The file or folder %s doesn't exist. Exiting." %
              samples_filename_or_bam_folder)
        sys.exit(1)

    if os.path.isfile(samples_filename_or_bam_folder):
        BAM_FOLDER = False
        data_filenames = []
        gene_expression_filenames = []
        sample_names = []

        with open(samples_filename_or_bam_folder) as infile:
            for line in infile:

                if not line.strip():
                    continue

                if line.startswith(
                        '#'):  # skip optional header line or empty lines
                    info('Skipping header/comment line:%s' % line)
                    continue

                fields = line.strip().split()
                n_fields = len(fields)

                if n_fields == 2:

                    USE_GENE_EXPRESSION = False

                    sample_names.append(fields[0])
                    data_filenames.append(fields[1])

                elif n_fields == 3:

                    USE_GENE_EXPRESSION = USE_GENE_EXPRESSION and True
                    sample_names.append(fields[0])
                    data_filenames.append(fields[1])
                    gene_expression_filenames.append(fields[2])
                else:
                    error('The samples file format is wrong!')
                    sys.exit(1)
    else:
        if os.path.exists(samples_filename_or_bam_folder):
            BAM_FOLDER = True
            USE_GENE_EXPRESSION = False
            data_filenames = glob.glob(
                os.path.join(samples_filename_or_bam_folder,
                             '*' + extension_to_check))

            if not data_filenames:
                error('No bam/bigwig files to analyze in %s. Exiting.' %
                      samples_filename_or_bam_folder)
                sys.exit(1)

            sample_names = [
                os.path.basename(data_filename).replace(
                    extension_to_check, '') for data_filename in data_filenames
            ]
        else:
            error("The file or folder %s doesn't exist. Exiting." %
                  samples_filename_or_bam_folder)
            sys.exit(1)

    # check all the files before starting
    info('Checking samples files location...')
    for data_filename in data_filenames:
        check_file(data_filename)

    if USE_GENE_EXPRESSION:
        for gene_expression_filename in gene_expression_filenames:
            check_file(gene_expression_filename)

    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    # copy back the file used
    if not BAM_FOLDER:
        shutil.copy2(samples_filename_or_bam_folder, output_directory)

    # write hotspots conf files
    sample_names_hotspots_filename = os.path.join(output_directory,
                                                  'sample_names_hotspots.txt')

    with open(sample_names_hotspots_filename, 'w+') as outfile:
        for sample_name, data_filename in zip(sample_names, data_filenames):
            outfile.write('%s\t%s\n' % (sample_name, data_filename))

    #CALL HAYSTACK HOTSPOTS
    cmd_to_run='haystack_hotspots %s %s --output_directory %s --bin_size %d %s %s %s %s %s %s %s %s %s %s %s %s %s %s' % \
                (sample_names_hotspots_filename, genome_name,output_directory,bin_size,
                 ('--do_not_filter_bams' if do_not_filter_bams else ''),
                 ('--depleted' if depleted else ''),
                 ('--do_not_recompute' if do_not_recompute else ''),
                 ('--keep_intermediate_files' if keep_intermediate_files else ''),
                 ('--input_is_bigwig' if input_is_bigwig else ''),
                 ('--disable_quantile_normalization' if disable_quantile_normalization else ''),
                 '--transformation %s' % transformation,
                 '--chrom_exclude "%s"' % chrom_exclude,
                 '--z_score_high %f' % z_score_high,
                 '--z_score_low %f' % z_score_low,
                 '--th_rpm %f' % th_rpm,
                 '--blacklist %s' % blacklist,
                 '--read_ext %d' % read_ext,
                 '--n_processes %d' % n_processes)
    print(cmd_to_run)
    sb.call(cmd_to_run, shell=True)

    # CALL HAYSTACK MOTIFS
    motif_directory = os.path.join(output_directory, 'HAYSTACK_MOTIFS')

    for sample_name in sample_names:
        specific_regions_filename = os.path.join(
            output_directory, 'HAYSTACK_HOTSPOTS', 'SPECIFIC_REGIONS',
            'Regions_specific_for_%s*.bed' % sample_name)
        bg_regions_filename = glob.glob(
            os.path.join(output_directory, 'HAYSTACK_HOTSPOTS',
                         'SPECIFIC_REGIONS',
                         'Background_for_%s*.bed' % sample_name))[0]
        cmd_to_run = 'haystack_motifs %s %s --bed_bg_filename %s --output_directory %s --name %s' % (
            specific_regions_filename, genome_name, bg_regions_filename,
            motif_directory, sample_name)

        if meme_motifs_filename:
            cmd_to_run += ' --meme_motifs_filename %s' % meme_motifs_filename

        if n_processes:
            cmd_to_run += ' --n_processes %d' % n_processes

        if temp_directory:
            cmd_to_run += ' --temp_directory %s' % temp_directory

        print(cmd_to_run)
        sb.call(cmd_to_run, shell=True)

    if USE_GENE_EXPRESSION:

        sample_names_tf_activity_filename = os.path.join(
            output_directory, 'sample_names_tf_activity.txt')

        with open(sample_names_tf_activity_filename, 'w+') as outfile:
            for sample_name, gene_expression_filename in zip(
                    sample_names, gene_expression_filenames):
                outfile.write('%s\t%s\n' %
                              (sample_name, gene_expression_filename))

        tf_activity_directory = os.path.join(output_directory,
                                             'HAYSTACK_TFs_ACTIVITY_PLANES')

        for sample_name in sample_names:

            # write tf activity  conf files

            # CALL HAYSTACK TF ACTIVITY
            motifs_output_folder = os.path.join(
                motif_directory, 'HAYSTACK_MOTIFS_on_%s' % sample_name)

            # run the TF activity step only if haystack_motifs produced output for this sample
            if os.path.exists(motifs_output_folder):
                cmd_to_run = 'haystack_tf_activity_plane %s %s %s --output_directory %s' % (
                    motifs_output_folder, sample_names_tf_activity_filename,
                    sample_name, tf_activity_directory)

                if motif_mapping_filename:
                    cmd_to_run += ' --motif_mapping_filename %s' % motif_mapping_filename

                if plot_all:
                    cmd_to_run += ' --plot_all'

                if rho_cutoff:
                    cmd_to_run += ' --rho_cutoff %f' % rho_cutoff

                if tf_value_cuttoff:
                    cmd_to_run += ' --tf_value_cuttoff %f' % tf_value_cuttoff

                print(cmd_to_run)
                sb.call(cmd_to_run, shell=True)
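The pipeline above assembles each sub-command as a single shell string with conditionally appended flags and runs it through sb.call(..., shell=True). As an alternative sketch only (not the author's method), the same kind of command can be built as an argument list, which sidesteps shell quoting issues when paths contain spaces; the helper name and the example flag dictionary are illustrative:

import subprocess as sb

def run_tool(executable, positional, optional):
    # optional: dict mapping flag name -> value; True adds a bare switch,
    # None/False/'' skips the flag entirely
    cmd = [executable] + [str(p) for p in positional]
    for flag, value in optional.items():
        if value is True:
            cmd.append(flag)
        elif value not in (None, False, ''):
            cmd.extend([flag, str(value)])
    print(' '.join(cmd))
    return sb.call(cmd)

# e.g. run_tool('haystack_motifs',
#               [specific_regions_filename, genome_name],
#               {'--bed_bg_filename': bg_regions_filename,
#                '--output_directory': motif_directory,
#                '--name': sample_name})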
Example #10
def main():

    print '\n[H A Y S T A C K   H O T S P O T]'
    print(
        '\n-SELECTION OF VARIABLE REGIONS- [Luca Pinello - [email protected]]\n'
    )
    print 'Version %s\n' % HAYSTACK_VERSION

    if which('samtools') is None:
        error(
            'Haystack requires samtools, freely available at: http://sourceforge.net/projects/samtools/files/samtools/0.1.19/'
        )
        sys.exit(1)

    if which('bedtools') is None:
        error(
            'Haystack requires bedtools, freely available at: https://github.com/arq5x/bedtools2/releases/tag/v2.20.1'
        )
        sys.exit(1)

    if which('bedGraphToBigWig') is None:
        info(
            'To generate the bigwig files Haystack requires bedGraphToBigWig; please download it from http://hgdownload.cse.ucsc.edu/admin/exe/ and add it to your PATH'
        )

    #mandatory
    parser = argparse.ArgumentParser(description='HAYSTACK Parameters')
    parser.add_argument(
        'samples_filename_or_bam_folder',
        type=str,
        help=
        'A tab-delimited file with, in each row, (1) a sample name and (2) the path to the corresponding bam filename. Alternatively it is possible to specify a folder containing some .bam files to analyze.'
    )
    parser.add_argument(
        'genome_name',
        type=str,
        help='Genome assembly to use from UCSC (for example hg19, mm9, etc.)')

    #optional
    parser.add_argument('--bin_size',
                        type=int,
                        help='bin size to use (default: 500bp)',
                        default=500)
    parser.add_argument('--disable_quantile_normalization',
                        help='Disable quantile normalization (default: False)',
                        action='store_true')
    parser.add_argument(
        '--th_rpm',
        type=float,
        help=
        'Percentile on the signal intensity to consider for the hotspots (default: 99)',
        default=99)
    parser.add_argument(
        '--transformation',
        type=str,
        help=
        'Variance stabilizing transformation among: none, log2, angle (default: angle)',
        default='angle',
        choices=['angle', 'log2', 'none'])
    parser.add_argument('--recompute_all',
                        help='Ignore any file previously precalculated',
                        action='store_true')
    parser.add_argument(
        '--z_score_high',
        type=float,
        help='z-score value to select the specific regions (default: 1.5)',
        default=1.5)
    parser.add_argument(
        '--z_score_low',
        type=float,
        help='z-score value to select the non-specific regions (default: 0.25)',
        default=0.25)
    parser.add_argument('--name',
                        help='Define a custom output filename for the report',
                        default='')
    parser.add_argument('--output_directory',
                        type=str,
                        help='Output directory (default: current directory)',
                        default='')
    parser.add_argument(
        '--use_X_Y',
        help=
        'Force to process the X and Y chromosomes (default: not processed)',
        action='store_true')
    parser.add_argument(
        '--max_regions_percentage',
        type=float,
        help=
        'Upper bound on the %% of the regions selected (default: 0.1, 0.0=0%% 1.0=100%%)',
        default=0.1)
    parser.add_argument(
        '--depleted',
        help=
        'Look for cell type specific regions with depletion of signal instead of enrichment',
        action='store_true')
    parser.add_argument(
        '--input_is_bigwig',
        help=
        'Use the bigwig format instead of the bam format for the input. Note: The files must have extension .bw',
        action='store_true')
    parser.add_argument('--version',
                        help='Print version and exit.',
                        action='version',
                        version='Version %s' % HAYSTACK_VERSION)
    args = parser.parse_args()

    args_dict = vars(args)
    for key, value in args_dict.items():
        exec('%s=%s' % (key, repr(value)))

    if input_is_bigwig:
        extension_to_check = '.bw'
        info('Input is set to BigWig (.bw)')
    else:
        extension_to_check = '.bam'
        info('Input is set to compressed SAM (.bam)')

    #check folder or sample filename
    if os.path.isfile(samples_filename_or_bam_folder):
        BAM_FOLDER = False
        bam_filenames = []
        sample_names = []
        with open(samples_filename_or_bam_folder) as infile:
            for line in infile:

                if not line.strip():
                    continue

                if line.startswith(
                        '#'):  #skip optional header line or empty lines
                    info('Skipping header/comment line:%s' % line)
                    continue

                fields = line.strip().split()
                n_fields = len(fields)

                if n_fields == 2:
                    sample_names.append(fields[0])
                    bam_filenames.append(fields[1])
                else:
                    error('The samples file format is wrong!')
                    sys.exit(1)

    else:
        if os.path.exists(samples_filename_or_bam_folder):
            BAM_FOLDER = True
            bam_filenames = glob.glob(
                os.path.join(samples_filename_or_bam_folder,
                             '*' + extension_to_check))

            if not bam_filenames:
                error('No bam/bigwig files to analyze in %s. Exiting.' %
                      samples_filename_or_bam_folder)
                sys.exit(1)

            sample_names = [
                os.path.basename(bam_filename).replace(extension_to_check, '')
                for bam_filename in bam_filenames
            ]
        else:
            error("The file or folder %s doesn't exist. Exiting." %
                  samples_filename_or_bam_folder)
            sys.exit(1)

    #check all the files before starting
    info('Checking samples files location...')
    for bam_filename in bam_filenames:
        check_file(bam_filename)

    info('Initializing Genome:%s' % genome_name)

    genome_directory = determine_path('genomes')
    genome_2bit = os.path.join(genome_directory, genome_name + '.2bit')

    if os.path.exists(genome_2bit):
        genome = Genome_2bit(genome_2bit)
    else:
        info("\nIt seems you don't have the required genome file.")
        if query_yes_no('Should I download it for you?'):
            sb.call('haystack_download_genome %s' % genome_name,
                    shell=True,
                    env=system_env)
            if os.path.exists(genome_2bit):
                info('Genome correctly downloaded!')
                genome = Genome_2bit(genome_2bit)
            else:
                error(
                    'Sorry I cannot download the required file for you. Check your Internet connection.'
                )
                sys.exit(1)
        else:
            error(
                'Sorry I need the genome file to perform the analysis. Exiting...'
            )
            sys.exit(1)

    chr_len_filename = os.path.join(genome_directory,
                                    "%s_chr_lengths.txt" % genome_name)
    check_file(chr_len_filename)

    if name:
        directory_name = 'HAYSTACK_HOTSPOTS_on_%s' % name

    else:
        directory_name = 'HAYSTACK_HOTSPOTS'

    if output_directory:
        output_directory = os.path.join(output_directory, directory_name)
    else:
        output_directory = directory_name

    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    genome_sorted_bins_file = os.path.join(
        output_directory,
        '%s.%dbp.bins.sorted.bed' % (os.path.basename(genome_name), bin_size))

    tracks_directory = os.path.join(output_directory, 'TRACKS')
    if not os.path.exists(tracks_directory):
        os.makedirs(tracks_directory)

    intermediate_directory = os.path.join(output_directory, 'INTERMEDIATE')
    if not os.path.exists(intermediate_directory):
        os.makedirs(intermediate_directory)

    if not os.path.exists(genome_sorted_bins_file) or recompute_all:
        info('Creating bins of %dbp for %s in %s' %
             (bin_size, chr_len_filename, genome_sorted_bins_file))
        sb.call(
            'bedtools makewindows -g %s -w %s |  bedtools sort -i stdin |' %
            (chr_len_filename, bin_size) + "perl -nle 'print " + '"$_\t$.";' +
            "' /dev/stdin> %s" % genome_sorted_bins_file,
            shell=True,
            env=system_env)

    #convert bam files to genome-wide rpm tracks
    for base_name, bam_filename in zip(sample_names, bam_filenames):

        info('Processing:%s' % bam_filename)

        rpm_filename = os.path.join(tracks_directory,
                                    '%s.bedgraph' % base_name)
        sorted_rpm_filename = os.path.join(tracks_directory,
                                           '%s_sorted.bedgraph' % base_name)
        mapped_sorted_rpm_filename = os.path.join(
            tracks_directory, '%s_mapped_sorted.bedgraph' % base_name)
        binned_rpm_filename = os.path.join(
            intermediate_directory, '%s.%dbp.rpm' % (base_name, bin_size))
        bigwig_filename = os.path.join(tracks_directory, '%s.bw' % base_name)

        if input_is_bigwig and which('bigWigAverageOverBed'):
            if not os.path.exists(binned_rpm_filename) or recompute_all:
                cmd = 'bigWigAverageOverBed %s %s  /dev/stdout | sort -s -n -k 1,1 | cut -f5 > %s' % (
                    bam_filename, genome_sorted_bins_file, binned_rpm_filename)
                sb.call(cmd, shell=True, env=system_env)
                shutil.copy2(bam_filename, bigwig_filename)

        else:
            if not os.path.exists(binned_rpm_filename) or recompute_all:
                info('Computing Scaling Factor...')
                cmd = 'samtools view -c -F 512 %s' % bam_filename
                #print cmd
                proc = sb.Popen(cmd,
                                stdout=sb.PIPE,
                                shell=True,
                                env=system_env)
                (stdout, stderr) = proc.communicate()
                #print stdout,stderr
                scaling_factor = (1.0 / float(stdout.strip())) * 1000000

                info('Scaling Factor: %e' % scaling_factor)

                info('Building BedGraph RPM track...')
                cmd = 'samtools view -b -F 512 %s | bamToBed | slopBed  -r %s -l 0 -s -i stdin -g %s | genomeCoverageBed -g  %s -i stdin -bg -scale %.32f > %s' % (
                    bam_filename, bin_size, chr_len_filename, chr_len_filename,
                    scaling_factor, rpm_filename)
                #print cmd

                proc = sb.call(cmd, shell=True, env=system_env)

            if which('bedGraphToBigWig'):
                if not os.path.exists(bigwig_filename) or recompute_all:
                    info('Converting BedGraph to BigWig')
                    cmd = 'bedGraphToBigWig %s %s %s' % (
                        rpm_filename, chr_len_filename, bigwig_filename)
                    proc = sb.call(cmd, shell=True, env=system_env)

            else:
                info(
                    'Sorry I cannot create the bigwig file.\nPlease download and install bedGraphToBigWig from here: http://hgdownload.cse.ucsc.edu/admin/exe/ and add to your PATH'
                )

            if not os.path.exists(binned_rpm_filename) or recompute_all:
                info('Making the binned (%dbp) rpm values file' % bin_size)
                #cmd='bedtools sort -i %s |  bedtools map -a %s -b stdin -c 4 -o mean -null 0.0 | cut -f5 > %s'   %(rpm_filename,genome_sorted_bins_file,binned_rpm_filename)
                #proc=sb.call(cmd,shell=True,env=system_env)

                cmd = 'sort -k1,1 -k2,2n  %s  > %s' % (rpm_filename,
                                                       sorted_rpm_filename)
                proc = sb.call(cmd, shell=True, env=system_env)

                cmd = 'bedtools map -a %s -b %s -c 4 -o mean -null 0.0  > %s' % (
                    genome_sorted_bins_file, sorted_rpm_filename,
                    mapped_sorted_rpm_filename)
                proc = sb.call(cmd, shell=True, env=system_env)

                cmd = 'cut -f5 %s  > %s' % (mapped_sorted_rpm_filename,
                                            binned_rpm_filename)
                proc = sb.call(cmd, shell=True, env=system_env)

            try:
                os.remove(rpm_filename)
                os.remove(sorted_rpm_filename)
                os.remove(mapped_sorted_rpm_filename)
            except:
                pass

    #load coordinates of bins
    coordinates_bin = pd.read_csv(genome_sorted_bins_file,
                                  names=['chr_id', 'bpstart', 'bpend'],
                                  sep='\t',
                                  header=None,
                                  usecols=[0, 1, 2])
    N_BINS = coordinates_bin.shape[0]
    if not use_X_Y:
        coordinates_bin = coordinates_bin.loc[
            (coordinates_bin['chr_id'] != 'chrX')
            & (coordinates_bin['chr_id'] != 'chrY')]

    #load all the tracks
    info('Loading the processed tracks')
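    # each *.rpm file in the intermediate folder holds a single column of binned
    # RPM values (one per bin of genome_sorted_bins_file); they become the columns
    # of df_chip, one column per sample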
    df_chip = {}
    for state_file in glob.glob(os.path.join(intermediate_directory, '*.rpm')):
        col_name = os.path.basename(state_file).replace('.rpm', '')
        df_chip[col_name] = pd.read_csv(state_file,
                                        header=None).squeeze('columns')
        info('Loading:%s' % col_name)

    df_chip = pd.DataFrame(df_chip)

    if disable_quantile_normalization:
        info('Skipping quantile normalization...')
    else:
        info('Normalizing the data...')
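        # quantile normalization forces every track to share the same empirical
        # distribution of binned values, making the samples directly comparable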
        df_chip = pd.DataFrame(quantile_normalization(df_chip.values),
                               columns=df_chip.columns,
                               index=df_chip.index)

    if which('bedGraphToBigWig'):
        #write quantile normalized tracks
        coord_quantile = coordinates_bin.copy()
        for col in df_chip:

            if disable_quantile_normalization:
                normalized_output_filename = os.path.join(
                    tracks_directory, '%s.bedgraph' % os.path.basename(col))
            else:
                normalized_output_filename = os.path.join(
                    tracks_directory,
                    '%s_quantile_normalized.bedgraph' % os.path.basename(col))

            normalized_output_filename_bigwig = normalized_output_filename.replace(
                '.bedgraph', '.bw')

            if not os.path.exists(
                    normalized_output_filename_bigwig) or recompute_all:
                info('Writing binned track: %s' %
                     normalized_output_filename_bigwig)
                coord_quantile['rpm_normalized'] = df_chip.loc[:, col]
                coord_quantile.dropna().to_csv(normalized_output_filename,
                                               sep='\t',
                                               header=False,
                                               index=False)

                cmd = 'bedGraphToBigWig %s %s %s' % (
                    normalized_output_filename, chr_len_filename,
                    normalized_output_filename_bigwig)
                proc = sb.call(cmd, shell=True, env=system_env)
                try:
                    os.remove(normalized_output_filename)
                except OSError:
                    pass
    else:
        info(
            'Sorry, I cannot create the BigWig file.\nPlease download bedGraphToBigWig from http://hgdownload.cse.ucsc.edu/admin/exe/ and add it to your PATH.'
        )

    #th_rpm=np.min(df_chip.apply(lambda x: np.percentile(x,th_rpm)))
    th_rpm = find_th_rpm(df_chip, th_rpm)
    info('Estimated th_rpm:%s' % th_rpm)
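    # keep only the bins in which at least one sample exceeds th_rpm; the rest are
    # treated as background with no usable signal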

    df_chip_not_empty = df_chip.loc[(df_chip > th_rpm).any(axis=1), :]

    if transformation == 'log2':
        df_chip_not_empty = df_chip_not_empty.applymap(log2_transform)
        info('Using log2 transformation')

    elif transformation == 'angle':
        df_chip_not_empty = df_chip_not_empty.applymap(angle_transform)
        info('Using angle transformation')

    else:
        info('Using no transformation')

    iod_values = df_chip_not_empty.var(axis=1) / df_chip_not_empty.mean(axis=1)
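    # iod_values is the index of dispersion (variance / mean) of each bin across the
    # samples: high values mark bins whose signal differs strongly between cell types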

    ####calculate the inflection point a la superenhancers
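    # ROSE/superenhancer-style cutoff: the scores are sorted and rescaled to [0,1],
    # the smoothed slope of that curve is computed, and the threshold is taken where
    # the slope is closest to 1 from above (restricted to the top
    # max_regions_percentage fraction of bins), i.e. the tangent-of-slope-1 criterion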
    scores = iod_values
    min_s = np.min(scores)
    max_s = np.max(scores)

    N_POINTS = len(scores)
    x = np.linspace(0, 1, N_POINTS)
    y = sorted((scores - min_s) / (max_s - min_s))
    m = smooth((np.diff(y) / np.diff(x)), 50)
    m = m - 1
    m[m <= 0] = np.inf
    m[:int(len(m) * (1 - max_regions_percentage))] = np.inf
    idx_th = np.argmin(m) + 1

    #print idx_th,
    th_iod = sorted(iod_values)[idx_th]
    #print th_iod

    hpr_idxs = iod_values > th_iod
    #print len(iod_values),len(hpr_idxs),sum(hpr_idxs), sum(hpr_idxs)/float(len(hpr_idxs)),

    info('Selected %.2f%% of the regions (%d)' %
         (sum(hpr_idxs) / float(len(hpr_idxs)) * 100, sum(hpr_idxs)))
    coordinates_bin['iod'] = iod_values

    #we remove the regions "without" signal in any of the cell types
    coordinates_bin.dropna(inplace=True)

    #create a track for IGV
    bedgraph_iod_track_filename = os.path.join(tracks_directory,
                                               'VARIABILITY.bedgraph')
    bw_iod_track_filename = os.path.join(tracks_directory, 'VARIABILITY.bw')

    if not os.path.exists(bw_iod_track_filename) or recompute_all:

        info('Generating the variability track in BigWig format in: %s' %
             bw_iod_track_filename)

        coordinates_bin.to_csv(bedgraph_iod_track_filename,
                               sep='\t',
                               header=False,
                               index=False)
        sb.call('bedGraphToBigWig %s %s %s' %
                (bedgraph_iod_track_filename, chr_len_filename,
                 bw_iod_track_filename),
                shell=True,
                env=system_env)
        try:
            os.remove(bedgraph_iod_track_filename)
        except OSError:
            pass

    #Write the HPRs
    bedgraph_hpr_filename = os.path.join(
        tracks_directory, 'SELECTED_VARIABILITY_HOTSPOT.bedgraph')

    to_write = coordinates_bin.reindex(hpr_idxs[hpr_idxs].index)
    to_write.dropna(inplace=True)
    to_write['bpstart'] = to_write['bpstart'].astype(int)
    to_write['bpend'] = to_write['bpend'].astype(int)

    to_write.to_csv(bedgraph_hpr_filename, sep='\t', header=False, index=False)

    bed_hpr_filename = os.path.join(output_directory,
                                    'SELECTED_VARIABILITY_HOTSPOT.bed')

    if not os.path.exists(bed_hpr_filename) or recompute_all:
        info('Writing the HPRs in: %s' % bed_hpr_filename)
        sb.call('sort -k1,1 -k2,2n %s | bedtools merge -i stdin >  %s' %
                (bedgraph_hpr_filename, bed_hpr_filename),
                shell=True,
                env=system_env)

    #os.remove(bedgraph_hpr_filename)

    df_chip_hpr = df_chip_not_empty.loc[hpr_idxs, :]
    df_chip_hpr_zscore = df_chip_hpr.apply(zscore, axis=1)
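    # z-score each HPR row across the cell types: positive values mean a sample is
    # enriched relative to the others, negative values mean it is depleted; the
    # z_score_high / z_score_low cutoffs below act on these values (and are mirrored
    # when depleted is set)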

    specific_regions_directory = os.path.join(output_directory,
                                              'SPECIFIC_REGIONS')
    if not os.path.exists(specific_regions_directory):
        os.makedirs(specific_regions_directory)

    if depleted:
        z_score_high = -z_score_high
        z_score_low = -z_score_low

    #write target
    info('Writing Specific Regions for each cell line...')
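    # specific regions: for every sample, the HPR bins whose z-score passes
    # z_score_high are written as a bedgraph and then merged into a .bed file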
    coord_zscore = coordinates_bin.copy()
    for col in df_chip_hpr_zscore:

        regions_specific_filename = 'Regions_specific_for_%s_z_%.2f.bedgraph' % (
            os.path.basename(col).replace('.rpm', ''), z_score_high)
        specific_output_filename = os.path.join(specific_regions_directory,
                                                regions_specific_filename)
        specific_output_bed_filename = specific_output_filename.replace(
            '.bedgraph', '.bed')

        if not os.path.exists(specific_output_bed_filename) or recompute_all:
            if depleted:
                coord_zscore['z-score'] = df_chip_hpr_zscore.loc[
                    df_chip_hpr_zscore[col] < z_score_high, col]
            else:
                coord_zscore['z-score'] = df_chip_hpr_zscore.loc[
                    df_chip_hpr_zscore[col] > z_score_high, col]
            coord_zscore.dropna().to_csv(specific_output_filename,
                                         sep='\t',
                                         header=False,
                                         index=False)

            info('Writing:%s' % specific_output_bed_filename)
            sb.call('sort -k1,1 -k2,2n %s | bedtools merge -i stdin >  %s' %
                    (specific_output_filename, specific_output_bed_filename),
                    shell=True,
                    env=system_env)

    #write background
    info('Writing Background Regions for each cell line...')
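    # background regions: for every sample, the HPR bins whose z-score is below
    # z_score_low (the comparison is mirrored when depleted is set)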
    coord_zscore = coordinates_bin.copy()
    for col in df_chip_hpr_zscore:

        regions_bg_filename = 'Background_for_%s_z_%.2f.bedgraph' % (
            os.path.basename(col).replace('.rpm', ''), z_score_low)
        bg_output_filename = os.path.join(specific_regions_directory,
                                          regions_bg_filename)
        bg_output_bed_filename = bg_output_filename.replace(
            '.bedgraph', '.bed')

        if not os.path.exists(bg_output_bed_filename) or recompute_all:

            if depleted:
                coord_zscore['z-score'] = df_chip_hpr_zscore.loc[
                    df_chip_hpr_zscore[col] > z_score_low, col]
            else:
                coord_zscore['z-score'] = df_chip_hpr_zscore.loc[
                    df_chip_hpr_zscore[col] < z_score_low, col]
            coord_zscore.dropna().to_csv(bg_output_filename,
                                         sep='\t',
                                         header=False,
                                         index=False)

            info('Writing:%s' % bg_output_bed_filename)
            sb.call('sort -k1,1 -k2,2n %s | bedtools merge -i stdin >  %s' %
                    (bg_output_filename, bg_output_bed_filename),
                    shell=True,
                    env=system_env)

    ###plot selection
    pl.figure()
    pl.title('Selection of the HPRs')
    pl.plot(x, y, 'r', lw=3)
    pl.plot(x[idx_th], y[idx_th], '*', markersize=20)
    x_ext = np.linspace(-0.1, 1.2, N_POINTS)
    y_line = (m[idx_th] + 1.0) * (x_ext - x[idx_th]) + y[idx_th]
    pl.plot(x_ext, y_line, '--k', lw=3)
    pl.xlim(0, 1.1)
    pl.ylim(0, 1)
    pl.xlabel('Fraction of bins')
    pl.ylabel('Score normalized')
    pl.savefig(
        os.path.join(output_directory, 'SELECTION_OF_VARIABILITY_HOTSPOT.pdf'))
    pl.close()

    igv_session_filename = os.path.join(output_directory,
                                        'OPEN_ME_WITH_IGV.xml')
    info('Creating an IGV session file (.xml) in: %s' % igv_session_filename)
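    # the session file is plain XML: a <Session> root holding a <Resources> list of
    # track paths (relative to the output folder) and a <Panel> with one <Track>
    # element per resource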

    session = ET.Element("Session")
    session.set("genome", genome_name)
    session.set("hasGeneTrack", "true")
    session.set("version", "7")
    resources = ET.SubElement(session, "Resources")
    panel = ET.SubElement(session, "Panel")

    resource_items = []
    track_items = []

    hpr_iod_scores = scores[scores > th_iod]
    min_h = np.mean(hpr_iod_scores) - 2 * np.std(hpr_iod_scores)
    max_h = np.mean(hpr_iod_scores) + 2 * np.std(hpr_iod_scores)
    mid_h = np.mean(hpr_iod_scores)
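    # IGV heatmap colour scale for the variability tracks: centred on the mean IOD of
    # the selected HPRs and clipped at two standard deviations on either side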
    #write the tracks
    for sample_name in sample_names:
        if disable_quantile_normalization:
            track_full_path = os.path.join(
                output_directory, 'TRACKS',
                '%s.%dbp.bw' % (sample_name, bin_size))
        else:
            track_full_path = os.path.join(
                output_directory, 'TRACKS',
                '%s.%dbp_quantile_normalized.bw' % (sample_name, bin_size))

        track_filename = rem_base_path(track_full_path, output_directory)

        if os.path.exists(track_full_path):
            resource_items.append(ET.SubElement(resources, "Resource"))
            resource_items[-1].set("path", track_filename)
            track_items.append(ET.SubElement(panel, "Track"))
            track_items[-1].set('color', "0,0,178")
            track_items[-1].set('id', track_filename)
            track_items[-1].set("name", sample_name)

    resource_items.append(ET.SubElement(resources, "Resource"))
    resource_items[-1].set(
        "path", rem_base_path(bw_iod_track_filename, output_directory))

    track_items.append(ET.SubElement(panel, "Track"))
    track_items[-1].set('color', "178,0,0")
    track_items[-1].set('id',
                        rem_base_path(bw_iod_track_filename, output_directory))
    track_items[-1].set('renderer', "HEATMAP")
    track_items[-1].set(
        "colorScale",
        "ContinuousColorScale;%e;%e;%e;%e;0,153,255;255,255,51;204,0,0" %
        (mid_h, min_h, mid_h, max_h))
    track_items[-1].set("name", 'VARIABILITY')

    resource_items.append(ET.SubElement(resources, "Resource"))
    resource_items[-1].set("path",
                           rem_base_path(bed_hpr_filename, output_directory))
    track_items.append(ET.SubElement(panel, "Track"))
    track_items[-1].set('color', "178,0,0")
    track_items[-1].set('id', rem_base_path(bed_hpr_filename,
                                            output_directory))
    track_items[-1].set('renderer', "HEATMAP")
    track_items[-1].set(
        "colorScale",
        "ContinuousColorScale;%e;%e;%e;%e;0,153,255;255,255,51;204,0,0" %
        (mid_h, min_h, mid_h, max_h))
    track_items[-1].set("name", 'HOTSPOTS')

    for sample_name in sample_names:
        specific_tracks = glob.glob(
            os.path.join(output_directory, 'SPECIFIC_REGIONS',
                         'Regions_specific_for_%s*.bedgraph' % sample_name))
        if specific_tracks:
            track_full_path = specific_tracks[0]
            specific_track_filename = rem_base_path(track_full_path,
                                                    output_directory)
            resource_items.append(ET.SubElement(resources, "Resource"))
            resource_items[-1].set("path", specific_track_filename)

            track_items.append(ET.SubElement(panel, "Track"))
            track_items[-1].set('color', "178,0,0")
            track_items[-1].set('id', specific_track_filename)
            track_items[-1].set('renderer', "HEATMAP")
            track_items[-1].set(
                "colorScale",
                "ContinuousColorScale;%e;%e;%e;%e;0,153,255;255,255,51;204,0,0"
                % (mid_h, min_h, mid_h, max_h))
            track_items[-1].set("name", 'REGION SPECIFIC FOR %s' % sample_name)

    tree = ET.ElementTree(session)
    tree.write(igv_session_filename, xml_declaration=True)

    info('All done! Ciao!')
    sys.exit(0)
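
# ---------------------------------------------------------------------------
# The helpers used above (find_th_rpm, quantile_normalization, smooth, zscore,
# rem_base_path, ...) are defined elsewhere in the package.  The sketch below is
# illustrative only and not the package's actual implementation: it assumes that
# find_th_rpm behaves like the commented-out one-liner shown above its call site
# (the th_rpm_percentile parameter name is ours) and that quantile_normalization
# is the standard rank-based procedure applied to a bins-by-samples array.
import numpy as np


def find_th_rpm(df_chip, th_rpm_percentile):
    # smallest per-sample percentile: a bin must exceed this RPM value in at
    # least one sample to be kept as "non-empty"
    return np.min(df_chip.apply(lambda x: np.percentile(x, th_rpm_percentile)))


def quantile_normalization(A):
    # classic quantile normalization on a bins-by-samples matrix: each column's
    # sorted values are replaced by the across-sample mean of the sorted values,
    # so every sample ends up with the same empirical distribution
    A = np.asarray(A, dtype=float)
    AA = np.zeros_like(A)
    mean_sorted = np.mean(np.sort(A, axis=0), axis=1)
    for j in range(A.shape[1]):
        AA[np.argsort(A[:, j]), j] = mean_sorted
    return AA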