def get_data_filepaths(samples_filename_or_bam_folder, input_is_bigwig):
    """Collect sample names and data file paths from a samples file or a folder.

    If the argument is a regular file, every non-blank line that does not
    start with ``#`` must contain exactly two whitespace-separated fields:
    the sample name and the path to its data file. Paths are used exactly as
    written (they are not resolved relative to the samples file).

    If the argument is a folder, every ``*.bw`` file (when
    ``input_is_bigwig`` is true) or ``*.bam`` file (otherwise) found directly
    in it is used, and each sample name is the file's base name with the
    extension text removed.

    Args:
        samples_filename_or_bam_folder: path to a samples file or to a folder
            containing the data files.
        input_is_bigwig: truthy to look for ``.bw`` files in folder mode,
            falsy to look for ``.bam`` files. Ignored in samples-file mode.

    Returns:
        A ``(sample_names, data_filenames)`` tuple of two parallel lists.

    The process exits with status 1 (after logging through ``error``) when
    the path does not exist, a samples-file line does not have two fields, or
    the folder contains no matching file. Every collected path is handed to
    ``check_file`` before returning.
    """
    # check folder or sample filename
    if not os.path.exists(samples_filename_or_bam_folder):
        error("The file or folder %s doesn't exist. Exiting." %
              samples_filename_or_bam_folder)
        sys.exit(1)

    if os.path.isfile(samples_filename_or_bam_folder):
        data_filenames = []
        sample_names = []
        with open(samples_filename_or_bam_folder) as infile:
            for line in infile:
                if not line.strip():
                    continue
                if line.startswith('#'):  # skip optional header line
                    info('Skipping header/comment line:%s' % line)
                    continue
                fields = line.strip().split()
                if len(fields) == 2:
                    sample_names.append(fields[0])
                    data_filenames.append(fields[1])
                else:
                    error('The samples file format is wrong!')
                    sys.exit(1)
    else:
        if input_is_bigwig:
            extension_to_check = '.bw'
            info('Input is set BigWig (.bw)')
        else:
            extension_to_check = '.bam'
            info('Input is set compressed SAM (.bam)')

        data_filenames = glob.glob(
            os.path.join(samples_filename_or_bam_folder,
                         '*' + extension_to_check))
        if not data_filenames:
            # The original formatted an undefined variable here, so this
            # path raised NameError instead of reporting the problem.
            error('No bam/bigwig files to analyze in %s. Exiting.' %
                  samples_filename_or_bam_folder)
            sys.exit(1)
        sample_names = [
            os.path.basename(data_filename).replace(extension_to_check, '')
            for data_filename in data_filenames
        ]

    # check all the files before starting
    info('Checking samples files location...')
    for data_filename in data_filenames:
        check_file(data_filename)

    return sample_names, data_filenames
def get_target_motifs_filepaths(target_motifs_filepaths_file):
    """Parse a file listing, per sample, the target and background region files.

    Every non-blank line that does not start with ``#`` must contain two or
    three whitespace-separated fields: the sample name, the path to the
    sample-specific regions file and, optionally, the path to a background
    regions file. When the third field is missing the placeholder string
    ``"random_background"`` is stored as that sample's background.

    Args:
        target_motifs_filepaths_file: path to the listing file.

    Returns:
        A ``(sample_names, specific_regions_filenames, bg_regions_filenames)``
        tuple of three parallel lists.

    The process exits with status 1 (after logging through ``error``) when
    the path does not exist, is not a regular file, or a line has a field
    count other than 2 or 3. Every real file path collected is handed to
    ``check_file`` before returning.
    """
    # check folder or sample filename
    if not os.path.exists(target_motifs_filepaths_file):
        error("The file or folder %s doesn't exist. Exiting." %
              target_motifs_filepaths_file)
        sys.exit(1)

    # A directory (or any non-regular file) cannot be parsed. Previously this
    # case fell through and crashed on unbound local variables.
    if not os.path.isfile(target_motifs_filepaths_file):
        error('%s is not a file. Exiting.' % target_motifs_filepaths_file)
        sys.exit(1)

    specific_regions_filenames = []
    bg_regions_filenames = []
    sample_names = []
    with open(target_motifs_filepaths_file) as infile:
        for line in infile:
            if not line.strip():
                continue
            if line.startswith('#'):  # skip optional header line
                info('Skipping header/comment line:%s' % line)
                continue
            fields = line.strip().split()
            n_fields = len(fields)
            if n_fields == 2:
                sample_names.append(fields[0])
                specific_regions_filenames.append(fields[1])
                bg_regions_filenames.append("random_background")
            elif n_fields == 3:
                sample_names.append(fields[0])
                specific_regions_filenames.append(fields[1])
                bg_regions_filenames.append(fields[2])
            else:
                error('The samples file format is wrong!')
                sys.exit(1)

    # check all the files before starting
    info('Checking files location...')
    for specific_regions_filename in specific_regions_filenames:
        check_file(specific_regions_filename)
    # Decide per entry rather than from the field count of the last line
    # read: explicit backgrounds are always verified and the
    # "random_background" placeholder, which is not a path, never is.
    for bg_regions_filename in bg_regions_filenames:
        if bg_regions_filename != "random_background":
            check_file(bg_regions_filename)

    return sample_names, specific_regions_filenames, bg_regions_filenames
def main():
    """Command-line entry point of the Haystack pipeline.

    Steps, in order:

    1. Parse the command line and validate the optional input files and the
       temporary directory.
    2. Read the samples file (2 or 3 columns per row) or scan the given
       folder for ``.bam``/``.bw`` files. Gene expression is used only when
       every row of the samples file has the third column.
    3. Write the per-step configuration files into the output directory.
    4. Run ``haystack_hotspots`` once, then ``haystack_motifs`` for every
       sample and, when gene expression is available and the motif output
       folder exists, ``haystack_tf_activity_plane``.

    The external commands are launched through the shell with
    ``sb.call(..., shell=True, env=system_env)`` and their exit status is
    not inspected.

    NOTE(review): command lines are assembled by plain string interpolation,
    so paths containing spaces or shell metacharacters are not quoted. The
    specific-regions argument intentionally contains a ``*`` wildcard that
    is left for the shell to expand.

    The process exits with status 1 on invalid input (missing path, wrong
    samples file format, no data files, missing background regions file).
    """
    print('\n[H A Y S T A C K P I P E L I N E]')
    print('\n-SELECTION OF HOTSPOTS OF VARIABILITY AND ENRICHED MOTIFS- [Luca Pinello - [email protected]]\n')
    print('Version %s\n' % HAYSTACK_VERSION)

    # mandatory
    parser = argparse.ArgumentParser(description='HAYSTACK Parameters')
    parser.add_argument('samples_filename_or_bam_folder', type=str, help='A tab delimeted file with in each row (1) a sample name, (2) the path to the corresponding bam filename, (3 optional) the path to the corresponding gene expression filaneme. Alternatively it is possible to specify a folder containing some .bam files to analyze.')
    parser.add_argument('genome_name', type=str, help='Genome assembly to use from UCSC (for example hg19, mm9, etc.)')

    # optional
    parser.add_argument('--name', help='Define a custom output filename for the report', default='')
    parser.add_argument('--output_directory', type=str, help='Output directory (default: current directory)', default='')
    parser.add_argument('--bin_size', type=int, help='bin size to use(default: 500bp)', default=500)
    parser.add_argument('--recompute_all', help='Ignore any file previously precalculated fot the command haystack_hotstpot', action='store_true')
    parser.add_argument('--depleted', help='Look for cell type specific regions with depletion of signal instead of enrichment', action='store_true')
    parser.add_argument('--input_is_bigwig', help='Use the bigwig format instead of the bam format for the input. Note: The files must have extension .bw', action='store_true')
    parser.add_argument('--disable_quantile_normalization', help='Disable quantile normalization (default: False)', action='store_true')
    parser.add_argument('--transformation', type=str, help='Variance stabilizing transformation among: none, log2, angle (default: angle)', default='angle', choices=['angle', 'log2', 'none'])
    parser.add_argument('--z_score_high', type=float, help='z-score value to select the specific regions(default: 1.5)', default=1.5)
    parser.add_argument('--z_score_low', type=float, help='z-score value to select the not specific regions(default: 0.25)', default=0.25)
    parser.add_argument('--th_rpm', type=float, help='Percentile on the signal intensity to consider for the hotspots (default: 99)', default=99)
    parser.add_argument('--meme_motifs_filename', type=str, help='Motifs database in MEME format (default JASPAR CORE 2016)')
    parser.add_argument('--motif_mapping_filename', type=str, help='Custom motif to gene mapping file (the default is for JASPAR CORE 2016 database)')
    parser.add_argument('--plot_all', help='Disable the filter on the TF activity and correlation (default z-score TF>0 and rho>0.3)', action='store_true')
    parser.add_argument('--n_processes', type=int, help='Specify the number of processes to use. The default is #cores available.', default=multiprocessing.cpu_count())
    parser.add_argument('--temp_directory', help='Directory to store temporary files (default: /tmp)', default='/tmp')
    parser.add_argument('--version', help='Print version and exit.', action='version', version='Version %s' % HAYSTACK_VERSION)

    args = parser.parse_args()

    # Bind the options to local names explicitly. The previous
    # exec('%s=%s') loop only creates function locals on Python 2 and hid
    # which options this function actually reads.
    samples_filename_or_bam_folder = args.samples_filename_or_bam_folder
    genome_name = args.genome_name
    name = args.name
    output_directory = args.output_directory
    bin_size = args.bin_size
    recompute_all = args.recompute_all
    depleted = args.depleted
    input_is_bigwig = args.input_is_bigwig
    disable_quantile_normalization = args.disable_quantile_normalization
    transformation = args.transformation
    z_score_high = args.z_score_high
    z_score_low = args.z_score_low
    th_rpm = args.th_rpm
    meme_motifs_filename = args.meme_motifs_filename
    motif_mapping_filename = args.motif_mapping_filename
    plot_all = args.plot_all
    n_processes = args.n_processes
    temp_directory = args.temp_directory

    if meme_motifs_filename:
        check_file(meme_motifs_filename)

    if motif_mapping_filename:
        check_file(motif_mapping_filename)

    if not os.path.exists(temp_directory):
        error('The folder specified with --temp_directory: %s does not exist!' % temp_directory)
        sys.exit(1)

    if input_is_bigwig:
        extension_to_check = '.bw'
        info('Input is set BigWig (.bw)')
    else:
        extension_to_check = '.bam'
        info('Input is set compressed SAM (.bam)')

    if name:
        directory_name = 'HAYSTACK_PIPELINE_RESULTS_on_%s' % name
    else:
        directory_name = 'HAYSTACK_PIPELINE_RESULTS'

    if output_directory:
        output_directory = os.path.join(output_directory, directory_name)
    else:
        output_directory = directory_name

    # check folder or sample filename
    USE_GENE_EXPRESSION = True

    if os.path.isfile(samples_filename_or_bam_folder):
        BAM_FOLDER = False
        bam_filenames = []
        gene_expression_filenames = []
        sample_names = []

        with open(samples_filename_or_bam_folder) as infile:
            for line in infile:
                if not line.strip():
                    continue
                if line.startswith('#'):  # skip optional header line
                    info('Skipping header/comment line:%s' % line)
                    continue

                fields = line.strip().split()
                n_fields = len(fields)

                if n_fields == 2:
                    # A single row without expression data disables the TF
                    # activity step for the whole run.
                    USE_GENE_EXPRESSION = False
                    sample_names.append(fields[0])
                    bam_filenames.append(fields[1])
                elif n_fields == 3:
                    # Leaves USE_GENE_EXPRESSION untouched: it stays true
                    # only while every row seen so far had three fields.
                    sample_names.append(fields[0])
                    bam_filenames.append(fields[1])
                    gene_expression_filenames.append(fields[2])
                else:
                    error('The samples file format is wrong!')
                    sys.exit(1)
    else:
        if os.path.exists(samples_filename_or_bam_folder):
            BAM_FOLDER = True
            USE_GENE_EXPRESSION = False
            bam_filenames = glob.glob(os.path.join(samples_filename_or_bam_folder, '*' + extension_to_check))

            if not bam_filenames:
                error('No bam/bigwig files to analyze in %s. Exiting.' % samples_filename_or_bam_folder)
                sys.exit(1)

            sample_names = [os.path.basename(bam_filename).replace(extension_to_check, '') for bam_filename in bam_filenames]
        else:
            error("The file or folder %s doesn't exist. Exiting." % samples_filename_or_bam_folder)
            sys.exit(1)

    # check all the files before starting
    info('Checking samples files location...')
    for bam_filename in bam_filenames:
        check_file(bam_filename)

    if USE_GENE_EXPRESSION:
        for gene_expression_filename in gene_expression_filenames:
            check_file(gene_expression_filename)

    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    # copy back the file used
    if not BAM_FOLDER:
        shutil.copy2(samples_filename_or_bam_folder, output_directory)

    # write hotspots conf files
    sample_names_hotspots_filename = os.path.join(output_directory, 'sample_names_hotspots.txt')

    with open(sample_names_hotspots_filename, 'w+') as outfile:
        for sample_name, bam_filename in zip(sample_names, bam_filenames):
            outfile.write('%s\t%s\n' % (sample_name, bam_filename))

    # write tf activity conf files
    if USE_GENE_EXPRESSION:
        sample_names_tf_activity_filename = os.path.join(output_directory, 'sample_names_tf_activity.txt')

        with open(sample_names_tf_activity_filename, 'w+') as outfile:
            for sample_name, gene_expression_filename in zip(sample_names, gene_expression_filenames):
                outfile.write('%s\t%s\n' % (sample_name, gene_expression_filename))

        tf_activity_directory = os.path.join(output_directory, 'HAYSTACK_TFs_ACTIVITY_PLANES')

    # CALL HAYSTACK HOTSPOTS
    cmd_to_run = 'haystack_hotspots %s %s --output_directory %s --bin_size %d %s %s %s %s %s %s %s %s' % (
        sample_names_hotspots_filename,
        genome_name,
        output_directory,
        bin_size,
        ('--recompute_all' if recompute_all else ''),
        ('--depleted' if depleted else ''),
        ('--input_is_bigwig' if input_is_bigwig else ''),
        ('--disable_quantile_normalization' if disable_quantile_normalization else ''),
        '--transformation %s' % transformation,
        '--z_score_high %f' % z_score_high,
        '--z_score_low %f' % z_score_low,
        '--th_rpm %f' % th_rpm)
    print(cmd_to_run)
    sb.call(cmd_to_run, shell=True, env=system_env)

    # CALL HAYSTACK MOTIFS
    motif_directory = os.path.join(output_directory, 'HAYSTACK_MOTIFS')
    specific_regions_directory = os.path.join(output_directory, 'HAYSTACK_HOTSPOTS', 'SPECIFIC_REGIONS')

    for sample_name in sample_names:
        # The wildcard is expanded by the shell when the command runs.
        specific_regions_filename = os.path.join(specific_regions_directory, 'Regions_specific_for_%s*.bed' % sample_name)

        # The background file name also carries a z-score dependent suffix,
        # so it is located by pattern and the first match is used.
        bg_candidates = glob.glob(os.path.join(specific_regions_directory, 'Background_for_%s*.bed' % sample_name))
        if not bg_candidates:
            # Previously this surfaced as a bare IndexError from "[0]".
            error('No background regions file found for sample %s in %s. Exiting.' % (sample_name, specific_regions_directory))
            sys.exit(1)
        bg_regions_filename = bg_candidates[0]

        cmd_to_run = 'haystack_motifs %s %s --bed_bg_filename %s --output_directory %s --name %s' % (
            specific_regions_filename, genome_name, bg_regions_filename, motif_directory, sample_name)

        if meme_motifs_filename:
            cmd_to_run += ' --meme_motifs_filename %s' % meme_motifs_filename

        if n_processes:
            cmd_to_run += ' --n_processes %d' % n_processes

        if temp_directory:
            cmd_to_run += ' --temp_directory %s' % temp_directory

        print(cmd_to_run)
        sb.call(cmd_to_run, shell=True, env=system_env)

        if USE_GENE_EXPRESSION:
            # CALL HAYSTACK TF ACTIVITY
            motifs_output_folder = os.path.join(motif_directory, 'HAYSTACK_MOTIFS_on_%s' % sample_name)

            # Skipped silently when the motif step produced no output
            # folder for this sample.
            if os.path.exists(motifs_output_folder):
                cmd_to_run = 'haystack_tf_activity_plane %s %s %s --output_directory %s' % (
                    motifs_output_folder, sample_names_tf_activity_filename, sample_name, tf_activity_directory)

                if motif_mapping_filename:
                    cmd_to_run += ' --motif_mapping_filename %s' % motif_mapping_filename

                if plot_all:
                    cmd_to_run += ' --plot_all'

                print(cmd_to_run)
                sb.call(cmd_to_run, shell=True, env=system_env)
def main(input_args=None):
    """Command-line entry point of the Haystack motif enrichment analysis.

    Args:
        input_args: optional list of command-line tokens handed to
            ``parser.parse_args``; ``None`` lets argparse read ``sys.argv``.

    Steps, in order:

    1. Parse the options and validate the input files.
    2. Load the target coordinates, re-centre them on a window of
       ``internal_window_length`` and optionally subsample them.
    3. Scan the target windows for motifs.
    4. Build the background set: random genomic regions (optionally matched
       bin by bin on C+G content) or regions from a user supplied bed file
       (optionally C+G matched and resized to ``bg_target_ratio``).
    5. Scan the background windows and compute, per motif, support in
       target/background, their ratio, Fisher exact p-value, q-value and
       central enrichment.
    6. Write logos, profiles, per-motif region files, optional closest-gene
       lists and the HTML report; optionally dump intermediate data.

    The process exits with status 1 when the target bed file yields no
    coordinates.

    NOTE(review): the nesting of a few statements had to be reconstructed
    from flattened text; the places where that matters are marked below.
    """
    print('\n[H A Y S T A C K M O T I F S]')
    print('\n-MOTIF ENRICHMENT ANALYSIS- [Luca Pinello - [email protected]]\n')
    print('Version %s\n' % HAYSTACK_VERSION)

    bootstrap = False
    ngram_correction = 'g'

    parser = get_args_motif()
    args = parser.parse_args(input_args)
    # Use one process fewer than requested, but always at least one.
    args.n_processes = max(1, args.n_processes - 1)

    # Bind the options to local names explicitly. The previous
    # exec('%s=%s') loop only creates function locals on Python 2 and hid
    # which options this function actually reads.
    bed_target_filename = args.bed_target_filename
    genome_name = args.genome_name
    bed_bg_filename = args.bed_bg_filename
    meme_motifs_filename = args.meme_motifs_filename
    gene_annotations_filename = args.gene_annotations_filename
    # Only reassigned below when no annotation file is given, so the parser
    # is assumed to define it; fall back to None if it does not.
    gene_ids_to_names_filename = getattr(args, 'gene_ids_to_names_filename', None)
    name = args.name
    output_directory = args.output_directory
    bg_target_ratio = args.bg_target_ratio
    mask_repetitive = args.mask_repetitive
    n_target_coordinates = args.n_target_coordinates
    internal_window_length = args.internal_window_length
    window_length = args.window_length
    smooth_size = args.smooth_size
    temp_directory = args.temp_directory
    p_value = args.p_value
    n_processes = args.n_processes
    c_g_bins = args.c_g_bins
    use_entire_bg = args.use_entire_bg
    disable_ratio = args.disable_ratio
    min_central_enrichment = args.min_central_enrichment
    dump = args.dump

    # The option is 1-based on the command line; the bed loader is given a
    # 0-based column index.
    bed_score_column = args.bed_score_column - 1
    c_g_correction = not args.no_c_g_correction
    random_sampling_target = not args.no_random_sampling_target

    check_file(bed_target_filename)

    if not bed_bg_filename == 'random_background':
        check_file(bed_bg_filename)

    if meme_motifs_filename:
        check_file(meme_motifs_filename)
    else:
        meme_motifs_filename = os.path.join(
            determine_path('motif_databases'),
            'JASPAR_CORE_2016_vertebrates.meme')

    annotation_directory = determine_path('gene_annotations')
    if gene_annotations_filename:
        if which('java') is None:
            error('The mapping to the closest gene requires Java free available from: http://java.com/en/download/')
            use_gene_annotations = False
        else:
            check_file(gene_annotations_filename)
            info('Using %s as gene annotations file' % gene_annotations_filename)
            use_gene_annotations = True
    else:
        # Fall back to the annotation files bundled for this genome, if any.
        gene_annotations_filename = os.path.join(annotation_directory, '%s_genes.bed' % genome_name)
        gene_ids_to_names_filename = os.path.join(annotation_directory, '%s_genes_id_to_names' % genome_name)

        if os.path.exists(gene_annotations_filename) and os.path.exists(gene_ids_to_names_filename):
            use_gene_annotations = True
        else:
            use_gene_annotations = False
            info('No gene annotations file specified')

    # Any user supplied nucleotide background is superseded by the one
    # returned here, exactly as before.
    genome, _, nucleotide_bg_filename = initialize_genome(genome_name)

    target_name = ntpath.basename(bed_target_filename.replace('.bed', ''))
    bg_name = ntpath.basename(bed_bg_filename.replace('.bed', ''))

    if name:
        directory_name = 'HAYSTACK_MOTIFS_on_' + name
    else:
        directory_name = 'HAYSTACK_on_' + target_name + '_VS_' + bg_name

    if output_directory:
        output_directory = os.path.join(output_directory, directory_name)
    else:
        output_directory = directory_name

    info('###PARAMETERS USED###\n\t\t -TARGET: %s \n\t\t -BACKGROUND: %s \n\t\t -BG_TARGET_RATIO: %s\n\t\t -C+G CORRECTION: %s\n\t\t -MASKING REPETITIVE: %s\n\t\t -COORDINATES TO ANALYZE: %s\n\t\t -OUTPUT DIRECTORY: %s\n'
         % (bed_target_filename, bed_bg_filename, str(bg_target_ratio),
            str(c_g_correction), str(mask_repetitive),
            'ALL' if np.isinf(n_target_coordinates) else str(n_target_coordinates),
            output_directory))

    COMMAND_USED = ' '.join(sys.argv)

    info('Loading Target coordinates from bed:%s' % bed_target_filename)
    target_coords = Coordinate.bed_to_coordinates(bed_target_filename, cl_score=bed_score_column)

    if len(target_coords) == 0:
        info('No coordinates to analyze in your input file. Exiting.')
        sys.exit(1)

    # calculate automatically the average length of the target regions;
    # either way the internal window length is rounded up to an even number
    if internal_window_length:
        info('Using the user defined internal window length:%d' % internal_window_length)
        if internal_window_length % 2:
            internal_window_length += 1
    else:
        # A list (not map()) so np.mean gets a real sequence on Python 3.
        internal_window_length = int(np.mean([len(c) for c in target_coords]))
        if internal_window_length % 2:
            internal_window_length += 1
        info('Using the average length of target coordinates as internal window length:%d' % internal_window_length)

    # NOTE(review): applied whichever way internal_window_length was
    # obtained, so a user supplied internal length without --window_length
    # still gets a default total window.
    if not window_length:
        window_length = internal_window_length * 5

    info('Total window length:%d' % window_length)

    if not smooth_size:
        # Floor division keeps the integer result of the original Python 2
        # "/" on Python 3 as well.
        smooth_size = internal_window_length // 5

    target_coords = Coordinate.coordinates_of_intervals_around_center(target_coords, internal_window_length)

    if len(target_coords) > n_target_coordinates:
        if random_sampling_target:
            info('Sampling %d coordinates among the %d total' % (n_target_coordinates, len(target_coords)))
            target_coords = random.sample(target_coords, n_target_coordinates)
        else:
            info('Selecting the best %d coordinates among the %d total' % (n_target_coordinates, len(target_coords)))
            sorted_idxs_by_score = np.argsort([c.score for c in target_coords])[::-1]
            target_coords = [target_coords[idx] for idx in sorted_idxs_by_score[:n_target_coordinates]]
    else:
        # bootstrap is hard-coded to False above, so this branch currently
        # always reports that all coordinates are used.
        if random_sampling_target and bootstrap and not np.isinf(n_target_coordinates):
            warn('Number of target regions < %d' % n_target_coordinates)
            info('bootstrapping to obtain enough target regions')
            target_coords = sample_wr(target_coords, n_target_coordinates)
        else:
            info('Using all the %d target coordinates' % len(target_coords))

    info('Extracting Motifs in target coordinates')
    positive_matrix, motifs_profiles_in_sequences, idxs_seqs_with_motif, motif_coords_in_seqs_with_motif, motif_names, motif_ids = parallel_fimo_scanning(
        target_coords,
        meme_motifs_filename,
        genome,
        nucleotide_bg_filename,
        temp_directory=temp_directory,
        p_value=p_value,
        mask_repetitive=mask_repetitive,
        window_length=window_length,
        internal_window_length=internal_window_length,
        num_consumers=n_processes)

    n_target_coordinates = len(target_coords)  # fix for the bootstrap!

    if bed_bg_filename == 'random_background':
        info('Extracting Random Coordinates from the genome...')

        if c_g_correction:
            info('Calculating the C+G content of the target coordinates')
            bg_coords = []
            c_g_content_target = calculate_average_ngram_presence(target_coords, genome, ngram_correction)

            info('Extract a Matching C+G Background')
            bins = np.hstack((np.linspace(0, 1, c_g_bins), np.inf))

            for _ in range(bg_target_ratio):
                for idx_c, c in enumerate(target_coords):
                    c_bin = np.nonzero(np.histogram(c_g_content_target[idx_c], bins)[0])[0][0]
                    c_random_bin = -1

                    # Rejection sampling: draw same-length regions on the
                    # same chromosome until one falls in the target's C+G
                    # bin.
                    while c_random_bin != c_bin:
                        random_bpstart = np.random.randint(1, genome.chr_len[c.chr_id] - len(c) + 1)
                        c_random = Coordinate(c.chr_id, random_bpstart, random_bpstart + len(c) - 1)
                        seq = genome.extract_sequence(c_random)
                        c_g_content_c_random = (seq.count('c') + seq.count('g')) / float(len(c))
                        c_random_bin = np.nonzero(np.histogram(c_g_content_c_random, bins)[0])[0][0]

                    bg_coords.append(c_random)

            c_g_content_bg = calculate_average_ngram_presence(bg_coords, genome, ngram_correction)
            bg_hist = np.histogram(c_g_content_bg, bins)[0]
            debug('original: ' + str(np.histogram(c_g_content_target, bins)[0]))
            debug('obtained:' + str(np.histogram(c_g_content_bg, bins)[0]))
        else:
            bg_coords = get_random_coordinates(target_coords, genome)

        info('Done!')
    else:
        info('Loading Background Coordinates from:%s' % bed_bg_filename)
        bg_coords = Coordinate.bed_to_coordinates(bed_bg_filename)
        bg_coords = Coordinate.coordinates_of_intervals_around_center(bg_coords, internal_window_length)

        if use_entire_bg:
            bg_target_ratio = float(len(bg_coords)) / n_target_coordinates
            info('Using all the coordinates in the BG, BG/TG:%f', bg_target_ratio)

        if c_g_correction:
            info('Calculating the C+G content')
            c_g_content_target = calculate_average_ngram_presence(target_coords, genome, ngram_correction)
            c_g_content_bg = calculate_average_ngram_presence(bg_coords, genome, ngram_correction)

            info('Extract a Matching C+G Background')
            bins = np.hstack((np.linspace(0, 1, c_g_bins), np.inf))
            target_hist = np.histogram(c_g_content_target, bins)[0]
            bg_hist = np.histogram(c_g_content_bg, bins)[0]
            ratios = bg_hist / (target_hist * 1.0)
            debug('original:%s' % target_hist)
            debug('bg:%s' % bg_hist)
            debug('ratios:%s' % ratios)

            # Largest usable background/target multiplier: the smallest
            # finite, positive per-bin ratio among bins that hold more than
            # 5% of the target regions, capped by the requested ratio.
            K_MATCH = min(
                bg_target_ratio,
                ratios[~np.isnan(ratios) & ~np.isinf(ratios) & (ratios > 0) &
                       (target_hist / float(target_hist.sum()) > 0.05)].min())
            debug('K_MATCH:%d' % K_MATCH)

            to_match = np.int32(np.floor(K_MATCH * target_hist))
            debug('to_match:%s' % to_match)

            idxs_corrected_bg = np.array([], dtype=int)
            for idx_bin in range(len(bins) - 1):
                idxs_matching_regions = np.nonzero(
                    (c_g_content_bg >= bins[idx_bin]) &
                    (c_g_content_bg < bins[idx_bin + 1]))[0]
                # Random subset of this bin, limited to the number wanted.
                to_take = np.random.permutation(len(idxs_matching_regions))
                to_take = to_take[np.arange(min(len(idxs_matching_regions), to_match[idx_bin]))]
                idxs_corrected_bg = np.hstack((idxs_corrected_bg, idxs_matching_regions[to_take]))

            debug('original:%s' % target_hist)
            debug('K:%d' % K_MATCH)
            debug('to sample:%s' % to_match)
            debug('obtained:%s' % np.histogram(c_g_content_bg[idxs_corrected_bg], bins)[0])

            bg_coords = [bg_coords[idx] for idx in idxs_corrected_bg]
            c_g_content_bg = calculate_average_ngram_presence(bg_coords, genome, ngram_correction)
            debug(np.histogram(c_g_content_bg, bins)[0])

            if np.array_equal(K_MATCH * target_hist, np.histogram(c_g_content_bg, bins)[0]):
                info('C+G content perfectly matched!\n\ttarget:%s\n\tbg :%s' %
                     (target_hist, np.histogram(c_g_content_bg, bins)[0]))
            else:
                warn('C+G content not perfectly matched\n\ttarget:%s\n\tbg :%s' %
                     (target_hist, np.histogram(c_g_content_bg, bins)[0]))
                debug(target_hist / np.histogram(c_g_content_bg, bins)[0])

        # NOTE(review): reconstructed as part of the user-supplied
        # background branch (the random branch builds its set to size).
        if len(bg_coords) >= bg_target_ratio * n_target_coordinates:
            bg_coords = random.sample(bg_coords, int(bg_target_ratio * n_target_coordinates))
        else:
            # allow a small tolerance!
            if bootstrap and len(bg_coords) < (bg_target_ratio * n_target_coordinates * 0.95):
                info('bootstrapping to obtain enough background regions')
                bg_coords = sample_wr(bg_coords, int(bg_target_ratio * n_target_coordinates))
                c_g_content_bg = calculate_average_ngram_presence(bg_coords, genome, ngram_correction)
                debug('After bootstrap:\n\ttarget:%s\n\tbg :%s' %
                      (target_hist, np.histogram(c_g_content_bg, bins)[0]))

    info('Extracting Motifs in background coordinates')
    negative_matrix, motifs_profiles_in_bg, idxs_seqs_with_motif_bg = parallel_fimo_scanning(
        bg_coords,
        meme_motifs_filename,
        genome,
        nucleotide_bg_filename,
        temp_directory=temp_directory,
        p_value=p_value,
        mask_repetitive=mask_repetitive,
        window_length=window_length,
        internal_window_length=internal_window_length,
        num_consumers=n_processes)[0:3]

    # allocate data for reports
    N_MOTIFS = len(motif_ids)
    fisher_p_values = np.zeros(N_MOTIFS)
    central_enrichment = np.zeros(N_MOTIFS)

    N_seq_p = positive_matrix.shape[0]
    N_seq_n = negative_matrix.shape[0]

    profile_presence_p = (positive_matrix > 0).sum(0)
    profile_presence_n = (negative_matrix > 0).sum(0)

    support_p = profile_presence_p / float(N_seq_p)
    support_n = profile_presence_n / float(N_seq_n)

    # Bounds of the central (internal) window inside the scanned window.
    # Floor division so they stay valid slice indices on Python 3.
    internal_bpstart = window_length // 2 - internal_window_length // 2
    internal_bpend = window_length // 2 + internal_window_length // 2

    for idx, motif_id in enumerate(motif_ids):
        fisher_p_values[idx] = stats.fisher_exact(
            [[profile_presence_p[idx], N_seq_p - profile_presence_p[idx]],
             [profile_presence_n[idx], N_seq_n - profile_presence_n[idx]]])[1]

        # Mean profile inside the central window over mean profile in the
        # two flanks.
        central_enrichment[idx] = motifs_profiles_in_sequences[motif_id][
            internal_bpstart:internal_bpend].mean() / np.hstack([
                motifs_profiles_in_sequences[motif_id][:internal_bpstart],
                motifs_profiles_in_sequences[motif_id][internal_bpend:]
            ]).mean()

    motif_ratios = (support_p + 0.01) / (support_n + 0.01)

    # Fundamental: motifs present in less than 3% of the targets are made
    # neutral so they cannot rank high on ratio alone.
    if not disable_ratio:
        motif_ratios[support_p < 0.03] = 1

    rankings = stats.rankdata(-motif_ratios)

    # keep only the motifs enriched in the target unless the ratio filter
    # is disabled
    if not disable_ratio:
        idxs_to_keep = np.nonzero(motif_ratios > 1)[0]
    else:
        idxs_to_keep = np.arange(len(motif_ratios))

    rankings = rankings[idxs_to_keep]
    motif_ratios = motif_ratios[idxs_to_keep]
    support_p = support_p[idxs_to_keep]
    support_n = support_n[idxs_to_keep]
    fisher_p_values = fisher_p_values[idxs_to_keep]
    central_enrichment = central_enrichment[idxs_to_keep]

    motif_ids = [motif_ids[_] for _ in idxs_to_keep]
    motif_names = [motif_names[_] for _ in idxs_to_keep]
    motif_idxs = [_ for _ in idxs_to_keep]

    try:
        # we test the ones only with ratio >1
        qvalues = estimate_qvalues(fisher_p_values)
    except Exception:
        # The old bare "except" printed the p-values and left qvalues
        # unbound, so a NameError surfaced later. Keep the best-effort
        # behaviour but make it explicit: report, and carry NaN q-values.
        print(fisher_p_values)
        warn('Could not estimate the q-values; they are reported as NaN')
        qvalues = np.full(len(fisher_p_values), np.nan)

    # generate reports in html
    info('Generating HTML report...')

    imgs_directory = os.path.join(output_directory, 'images')
    genes_list_directory = os.path.join(output_directory, 'genes_lists')
    motif_regions_directory = os.path.join(output_directory, 'motifs_regions')

    # create folders
    if not os.path.exists(imgs_directory):
        os.makedirs(imgs_directory)
    if use_gene_annotations and not os.path.exists(genes_list_directory):
        os.makedirs(genes_list_directory)
    if not os.path.exists(motif_regions_directory):
        os.makedirs(motif_regions_directory)

    j2_env = Environment(
        loader=FileSystemLoader(determine_path('extra') + '/templates/'),
        trim_blocks=True)
    info('DIRECTORY:%s' % determine_path('extra') + '/templates/')
    template = j2_env.get_template('report_template.html')

    # copy haystack logo and bg
    shutil.copyfile(
        determine_path('extra') + '/templates/haystack_logo.png',
        os.path.join(imgs_directory, 'haystack_logo.png'))
    shutil.copyfile(
        determine_path('extra') + '/templates/noise.png',
        os.path.join(imgs_directory, 'noise.png'))

    motifs_dump = []
    for i in np.argsort(rankings):
        if (support_p[i] >= 0.03 or disable_ratio) and fisher_p_values[i] < 0.01 and (
                motif_ratios[i] > 1 or disable_ratio
        ) and central_enrichment[i] > min_central_enrichment:

            info('Generating logo and profile for:' + motif_ids[i])

            # create motif logo (once in the default format, once as pdf)
            img_logo = os.path.join(imgs_directory, 'logo_' + motif_ids[i])
            generate_weblogo(motif_ids[i], meme_motifs_filename, img_logo, title=motif_ids[i])
            generate_weblogo(motif_ids[i], meme_motifs_filename, img_logo, title=motif_ids[i], file_format='pdf')
            # fix the weblogo prefix problem
            img_logo_url = os.path.join('images', 'logo_' + motif_ids[i] + '.png')

            # create motif enrichment profile
            img_profile = os.path.join(imgs_directory, 'profile_' + motif_ids[i] + '.png')
            motif_profile_target = motifs_profiles_in_sequences[motif_ids[i]] / N_seq_p
            motif_profile_bg = motifs_profiles_in_bg[motif_ids[i]] / N_seq_n

            generate_motif_profile(motif_profile_target,
                                   motif_profile_bg,
                                   motif_ids[i],
                                   img_profile,
                                   smooth_size=smooth_size,
                                   window_size=window_length)
            img_profile_url = os.path.join('images', 'profile_' + motif_ids[i] + '.png')

            # create regions
            info('Extracting regions with:' + motif_ids[i])
            regions = os.path.join(motif_regions_directory, motif_ids[i] + '_motif_region_in_target.bed')
            with open(regions, 'w+') as outfile:
                outfile.write('Chromosome\tStart\tEnd\tMotif hits inside region\tNumber of hits\n')
                for c, locations in motif_coords_in_seqs_with_motif[motif_ids[i]].items():
                    outfile.write('\t'.join([
                        c.chr_id,
                        str(c.bpstart),
                        str(c.bpend),
                        ';'.join(['-'.join(map(str, map(int, l))) for l in locations]),
                        str(len(locations))
                    ]) + '\n')
            regions_url = os.path.join('motifs_regions', motif_ids[i] + '_motif_region_in_target.bed')

            # map closest downstream genes
            genes_url = None
            if use_gene_annotations:
                info('Mapping regions with:%s to the clostest genes' % motif_ids[i])
                peak_annotator_path = os.path.join(determine_path('extra/'), 'PeakAnnotator.jar')

                # NOTE(review): paths are interpolated unquoted into a
                # shell command line; the exit status is not checked.
                if gene_ids_to_names_filename:
                    sb.call('java -jar ' + peak_annotator_path + ' -u TSS -p %s -a %s -s %s -o %s >/dev/null 2>&1'
                            % (regions, gene_annotations_filename, gene_ids_to_names_filename, genes_list_directory),
                            shell=True)
                else:
                    sb.call('java -jar ' + peak_annotator_path + ' -u TSS -p %s -a %s -o %s >/dev/null 2>&1'
                            % (regions, gene_annotations_filename, genes_list_directory),
                            shell=True)

                genes_url = os.path.join('genes_lists', motif_ids[i] + '_motif_region_in_target.tss.bed')

            motifs_dump.append({
                'id': motif_ids[i],
                'name': motif_names[i],
                'support_p': support_p[i] * 100,
                'support_n': support_n[i] * 100,
                'ratio': motif_ratios[i],
                'rank': float(rankings[i]),
                'pvalue': fisher_p_values[i],
                'qvalue': qvalues[i],
                'central_enrichment': central_enrichment[i],
                'img_logo': img_logo_url,
                'img_profile': img_profile_url,
                'regions': regions_url,
                'genes': genes_url,
                'idx_motif': motif_idxs[i]
            })

    # The context manager closes the report even if rendering fails.
    with codecs.open(os.path.join(output_directory, "Haystack_report.html"), "w", "utf-8") as outfile:
        outfile.write(
            template.render(motifs_dump=motifs_dump,
                            bed_target_filename=bed_target_filename,
                            bed_bg_filename=bed_bg_filename,
                            N_TARGET=N_seq_p,
                            N_BG=N_seq_n,
                            meme_motifs_filename=meme_motifs_filename,
                            COMMAND_USED=COMMAND_USED,
                            use_gene_annotations=use_gene_annotations))

    if dump:
        info('Saving all the intermediate data on: %s ...' % output_directory)

        dump_directory = os.path.join(output_directory, 'dump')
        if not os.path.exists(dump_directory):
            os.makedirs(dump_directory)

        np.save(os.path.join(dump_directory, 'matrix_' + target_name), positive_matrix)
        np.save(os.path.join(dump_directory, 'matrix_BG_' + target_name), negative_matrix)

        # Same files, same order as before, but opened in binary mode and
        # closed deterministically.
        pickled_objects = [
            (motifs_dump, target_name + '_motif_dumps.pickle'),
            (idxs_seqs_with_motif, target_name + '_motif_seqs_idxs.pickle'),
            (idxs_seqs_with_motif_bg, bg_name + '_motif_seqs_idxs.pickle'),
            (motif_coords_in_seqs_with_motif, target_name + '_motif_coords_in_seqs_with_motif.pickle'),
        ]
        for pickled_object, pickle_filename in pickled_objects:
            with open(os.path.join(dump_directory, pickle_filename), 'wb') as pickle_file:
                cp.dump(pickled_object, pickle_file)

        Coordinate.coordinates_to_bed(
            target_coords,
            os.path.join(dump_directory, 'Target_coordinates_selected_on_' + target_name + '.bed'),
            minimal_format=False)
        Coordinate.coordinates_to_bed(
            bg_coords,
            os.path.join(dump_directory, 'BG_coordinates_selected_on_' + bg_name + '.bed'),
            minimal_format=True)

    info('Motif analysis completed! Ciao!')
def main():
    """Command-line entry point of the HAYSTACK motif enrichment analysis.

    Parses the command line, loads the target coordinates, builds (or loads)
    a background set, scans both sets through ``parallel_fimo_scanning``,
    computes per-motif support, ratio, Fisher p-value and central enrichment,
    and writes ``Haystack_report.html`` plus the per-motif images and region
    files in the output directory. With ``--dump`` the intermediate data are
    saved in a ``dump`` sub-directory as well.

    The function never returns: it terminates the process with ``sys.exit(1)``
    on an unrecoverable input problem and with ``sys.exit(0)`` on completion.
    """
    print('\n[H A Y S T A C K M O T I F S]')
    print('\n-MOTIF ENRICHMENT ANALYSIS- [Luca Pinello - [email protected]]\n')
    print('Version %s\n' % HAYSTACK_VERSION)

    ngram_correction = 'g'

    # mandatory
    parser = argparse.ArgumentParser(description='HAYSTACK Parameters')
    parser.add_argument('bed_target_filename', type=str, help='A bed file containing the target coordinates on the genome of reference')
    parser.add_argument('genome_name', type=str, help='Genome assembly to use from UCSC (for example hg19, mm9, etc.)')

    # optional
    parser.add_argument('--bed_bg_filename', type=str, help="A bed file containing the backround coordinates on the genome of reference (default random sampled regions from the genome)", default='random_background')
    parser.add_argument('--meme_motifs_filename', type=str, help='Motifs database in MEME format (default JASPAR CORE 2016)')
    parser.add_argument('--nucleotide_bg_filename', type=str, help='Nucleotide probability for the background in MEME format (default precomupted on the Genome)')
    parser.add_argument('--p_value', type=float, help='FIMO p-value for calling a motif hit significant (deafult: 1e-4)', default=1e-4)
    parser.add_argument('--no_c_g_correction', help='Disable the matching of the C+G density of the background', action='store_true')
    parser.add_argument('--c_g_bins', type=int, help='Number of bins for the C+G density correction (default: 8)', default=8)
    parser.add_argument('--mask_repetitive', help='Mask repetitive sequences', action='store_true')
    parser.add_argument('--n_target_coordinates', type=int, help='Number of target coordinates to use (default: all)', default=np.inf)
    parser.add_argument('--use_entire_bg', help='Use the entire background file (use only when the cg correction is disabled)', action='store_true')
    parser.add_argument('--bed_score_column', type=int, help='Column in the bedfile that represents the score (default: 5)', default=5)
    # The help text now states the default that is actually applied (2).
    parser.add_argument('--bg_target_ratio', type=int, help='Background size/Target size ratio (default: 2)', default=2)
    parser.add_argument('--bootstrap', help='Enable the bootstrap if the target set or the background set are too small, choices: True, False (default: False)', action='store_true')
    parser.add_argument('--temp_directory', help='Directory to store temporary files (default: /tmp)', default='/tmp')
    parser.add_argument('--no_random_sampling_target', help='Select the best --n_target_coordinates using the score column from the target file instead of randomly select them', action='store_true')
    parser.add_argument('--name', help='Define a custom output filename for the report', default='')
    parser.add_argument('--internal_window_length', type=int, help='Window length in bp for the enrichment (default: average lenght of the target sequences)')
    parser.add_argument('--window_length', type=int, help='Window length in bp for the profiler (default:internal_window_length*5)')
    parser.add_argument('--min_central_enrichment', type=float, help='Minimum central enrichment to report a motif (default:>1.0)', default=1.0)
    parser.add_argument('--disable_ratio', help='Disable target/bg ratio filter', action='store_true')
    parser.add_argument('--dump', help='Dump all the intermediate data, choices: True, False (default: False)', action='store_true')
    parser.add_argument('--output_directory', type=str, help='Output directory (default: current directory)', default='')
    # The help text now states the divisor that is actually applied (5).
    parser.add_argument('--smooth_size', type=int, help='Size in bp for the smoothing window (default: internal_window_length/5)')
    parser.add_argument('--gene_annotations_filename', type=str, help='Optional gene annotations file from the UCSC Genome Browser in bed format to map each region to its closes gene')
    parser.add_argument('--gene_ids_to_names_filename', type=str, help='Optional mapping file between gene ids to gene names (relevant only if --gene_annotation_filename is used)')
    parser.add_argument('--n_processes', type=int, help='Specify the number of processes to use. The default is #cores available.', default=mp.cpu_count())
    parser.add_argument('--version', help='Print version and exit.', action='version', version='Version %s' % HAYSTACK_VERSION)

    args = parser.parse_args()

    # Bind every option explicitly instead of injecting locals through exec():
    # the names used below are visible to the reader and to static tools, and
    # no value (np.inf included) has to survive a repr()/exec round trip.
    bed_target_filename = args.bed_target_filename
    genome_name = args.genome_name
    bed_bg_filename = args.bed_bg_filename
    meme_motifs_filename = args.meme_motifs_filename
    nucleotide_bg_filename = args.nucleotide_bg_filename
    p_value = args.p_value
    c_g_bins = args.c_g_bins
    mask_repetitive = args.mask_repetitive
    n_target_coordinates = args.n_target_coordinates
    use_entire_bg = args.use_entire_bg
    bg_target_ratio = args.bg_target_ratio
    bootstrap = args.bootstrap
    temp_directory = args.temp_directory
    name = args.name
    internal_window_length = args.internal_window_length
    window_length = args.window_length
    min_central_enrichment = args.min_central_enrichment
    disable_ratio = args.disable_ratio
    dump = args.dump
    output_directory = args.output_directory
    smooth_size = args.smooth_size
    gene_annotations_filename = args.gene_annotations_filename
    gene_ids_to_names_filename = args.gene_ids_to_names_filename
    n_processes = args.n_processes

    # The command line column is 1-based, the value handed on is 0-based.
    bed_score_column = args.bed_score_column - 1
    c_g_correction = not args.no_c_g_correction
    random_sampling_target = not args.no_random_sampling_target

    check_file(bed_target_filename)

    if not bed_bg_filename == 'random_background':
        check_file(bed_bg_filename)

    if meme_motifs_filename:
        check_file(meme_motifs_filename)
    else:
        meme_motifs_filename = os.path.join(determine_path('motif_databases'), 'JASPAR_CORE_2016_vertebrates.meme')

    annotation_directory = determine_path('gene_annotations')
    if gene_annotations_filename:
        if which('java') is None:
            error('The mapping to the closest gene requires Java free available from: http://java.com/en/download/')
            use_gene_annotations = False
        else:
            check_file(gene_annotations_filename)
            info('Using %s as gene annotations file' % gene_annotations_filename)
            use_gene_annotations = True
    else:
        # Fall back to the annotation files named after the genome, if both exist.
        gene_annotations_filename = os.path.join(annotation_directory, '%s_genes.bed' % genome_name)
        gene_ids_to_names_filename = os.path.join(annotation_directory, '%s_genes_id_to_names' % genome_name)
        if os.path.exists(gene_annotations_filename) and os.path.exists(gene_ids_to_names_filename):
            use_gene_annotations = True
        else:
            use_gene_annotations = False
            info('No gene annotations file specified')

    target_name = ntpath.basename(bed_target_filename.replace('.bed', ''))
    bg_name = ntpath.basename(bed_bg_filename.replace('.bed', ''))

    if name:
        directory_name = 'HAYSTACK_MOTIFS_on_' + name
    else:
        directory_name = 'HAYSTACK_on_' + target_name + '_VS_' + bg_name

    if output_directory:
        output_directory = os.path.join(output_directory, directory_name)
    else:
        output_directory = directory_name

    info('###PARAMETERS USED###\n\t\t -TARGET: %s \n\t\t -BACKGROUND: %s \n\t\t -BG_TARGET_RATIO: %s\n\t\t -C+G CORRECTION: %s\n\t\t -MASKING REPETITIVE: %s\n\t\t -COORDINATES TO ANALYZE: %s\n\t\t -OUTPUT DIRECTORY: %s\n'
         % (bed_target_filename, bed_bg_filename, str(bg_target_ratio), str(c_g_correction), str(mask_repetitive),
            'ALL' if np.isinf(n_target_coordinates) else str(n_target_coordinates), output_directory))

    info('Initializing Genome:%s' % genome_name)

    genome_directory = determine_path('genomes')
    genome_2bit = os.path.join(genome_directory, genome_name + '.2bit')

    if os.path.exists(genome_2bit):
        genome = Genome_2bit(genome_2bit)
    else:
        info("\nIt seems you don't have the required genome file.")
        if query_yes_no('Should I download it for you?'):
            sb.call('haystack_download_genome %s' % genome_name, shell=True, env=system_env)
            if os.path.exists(genome_2bit):
                info('Genome correctly downloaded!')
                genome = Genome_2bit(genome_2bit)
            else:
                error('Sorry I cannot download the required file for you. Check your Internet connection.')
                sys.exit(1)
        else:
            error('Sorry I need the genome file to perform the analysis. Exiting...')
            sys.exit(1)

    if not nucleotide_bg_filename:
        nucleotide_bg_filename = os.path.join(genome_directory, genome_name + '_meme_bg')

    check_file(nucleotide_bg_filename)

    COMMAND_USED = ' '.join(sys.argv)

    info('Loading Target coordinates from bed:%s' % bed_target_filename)
    target_coords = Coordinate.bed_to_coordinates(bed_target_filename, cl_score=bed_score_column)

    if len(target_coords) == 0:
        info('No coordinates to analyze in your input file. Exiting.')
        sys.exit(1)

    # Use the requested internal window, or the average length of the target
    # regions; in both cases an odd length is bumped to the next even one.
    if internal_window_length:
        info('Using the user defined internal window length:%d' % internal_window_length)
        if internal_window_length % 2:
            internal_window_length += 1
    else:
        internal_window_length = int(np.mean([len(c) for c in target_coords]))
        if internal_window_length % 2:
            internal_window_length += 1
        info('Using the average length of target coordinates as internal window length:%d' % internal_window_length)

    if not window_length:
        window_length = internal_window_length * 5

    info('Total window length:%d' % window_length)

    if not smooth_size:
        # Integer division, spelled so that it stays one on Python 3 too.
        smooth_size = internal_window_length // 5

    target_coords = Coordinate.coordinates_of_intervals_around_center(target_coords, internal_window_length)

    if len(target_coords) > n_target_coordinates:
        if random_sampling_target:
            info('Sampling %d coordinates among the %d total' % (n_target_coordinates, len(target_coords)))
            target_coords = random.sample(target_coords, n_target_coordinates)
        else:
            info('Selecting the best %d coordinates among the %d total' % (n_target_coordinates, len(target_coords)))
            sorted_idxs_by_score = np.argsort([c.score for c in target_coords])[::-1]
            target_coords = [target_coords[idx] for idx in sorted_idxs_by_score[:n_target_coordinates]]
    else:
        if random_sampling_target and bootstrap and not np.isinf(n_target_coordinates):
            warn('Number of target regions < %d' % n_target_coordinates)
            info('bootstrapping to obtain enough target regions')
            target_coords = sample_wr(target_coords, n_target_coordinates)
        else:
            info('Using all the %d target coordinates' % len(target_coords))

    info('Extracting Motifs in target coordinates')
    (positive_matrix, motifs_profiles_in_sequences, idxs_seqs_with_motif,
     motif_coords_in_seqs_with_motif, motif_names, motif_ids) = parallel_fimo_scanning(
        target_coords,
        meme_motifs_filename,
        genome, nucleotide_bg_filename,
        temp_directory=temp_directory,
        p_value=p_value,
        mask_repetitive=mask_repetitive,
        window_length=window_length,
        internal_window_length=internal_window_length,
        num_consumers=n_processes)

    n_target_coordinates = len(target_coords)  # fix for the bootstrap!

    # Stays None unless the C+G matching of a user supplied background runs;
    # the bootstrap step below uses it to decide whether its debug histogram
    # can be produced at all.
    target_hist = None

    if bed_bg_filename == 'random_background':
        info('Extracting Random Coordinates from the genome...')

        if c_g_correction:
            info('Calculating the C+G content of the target coordinates')
            bg_coords = []
            c_g_content_target = calculate_average_ngram_presence(target_coords, genome, ngram_correction)

            info('Extract a Matching C+G Background')
            bins = np.hstack((np.linspace(0, 1, c_g_bins), np.inf))

            for _ in range(bg_target_ratio):
                for idx_c, c in enumerate(target_coords):
                    c_bin = np.nonzero(np.histogram(c_g_content_target[idx_c], bins)[0])[0][0]
                    c_random_bin = -1
                    # Redraw a same-length region on the same chromosome until
                    # its C+G bin equals the bin of the target region.
                    while c_random_bin != c_bin:
                        random_bpstart = np.random.randint(1, genome.chr_len[c.chr_id] - len(c) + 1)
                        c_random = Coordinate(c.chr_id, random_bpstart, random_bpstart + len(c) - 1)
                        seq = genome.extract_sequence(c_random)
                        c_g_content_c_random = (seq.count('c') + seq.count('g')) / float(len(c))
                        c_random_bin = np.nonzero(np.histogram(c_g_content_c_random, bins)[0])[0][0]

                    bg_coords.append(c_random)

            c_g_content_bg = calculate_average_ngram_presence(bg_coords, genome, ngram_correction)
            debug('original: ' + str(np.histogram(c_g_content_target, bins)[0]))
            debug('obtained:' + str(np.histogram(c_g_content_bg, bins)[0]))
        else:
            bg_coords = get_random_coordinates(target_coords, genome)

        info('Done!')

    else:
        info('Loading Background Coordinates from:%s' % bed_bg_filename)
        bg_coords = Coordinate.bed_to_coordinates(bed_bg_filename)
        bg_coords = Coordinate.coordinates_of_intervals_around_center(bg_coords, internal_window_length)

        if use_entire_bg:
            bg_target_ratio = float(len(bg_coords)) / n_target_coordinates
            info('Using all the coordinates in the BG, BG/TG:%f' % bg_target_ratio)

        if c_g_correction:
            info('Calculating the C+G content')
            c_g_content_target = calculate_average_ngram_presence(target_coords, genome, ngram_correction)
            c_g_content_bg = calculate_average_ngram_presence(bg_coords, genome, ngram_correction)

            info('Extract a Matching C+G Background')
            bins = np.hstack((np.linspace(0, 1, c_g_bins), np.inf))
            target_hist = np.histogram(c_g_content_target, bins)[0]
            bg_hist = np.histogram(c_g_content_bg, bins)[0]
            ratios = bg_hist / (target_hist * 1.0)

            debug('original:%s' % target_hist)
            debug('bg:%s' % bg_hist)
            debug('ratios:%s' % ratios)

            # Largest usable background/target multiple: the requested ratio,
            # capped by the scarcest bin among those holding more than 5% of
            # the target regions.
            K_MATCH = min(bg_target_ratio,
                          ratios[~np.isnan(ratios) & ~np.isinf(ratios) & (ratios > 0) &
                                 (target_hist / float(target_hist.sum()) > 0.05)].min())

            debug('K_MATCH:%d' % K_MATCH)

            to_match = np.int32(np.floor(K_MATCH * target_hist))

            debug('to_match:%s' % to_match)

            idxs_corrected_bg = np.array([], dtype=int)

            for idx_bin in range(len(bins) - 1):
                idxs_matching_regions = np.nonzero((c_g_content_bg >= bins[idx_bin]) & (c_g_content_bg < bins[idx_bin + 1]))[0]
                # Random subset of this bin, at most to_match[idx_bin] regions.
                to_take = np.random.permutation(len(idxs_matching_regions))
                to_take = to_take[:min(len(idxs_matching_regions), to_match[idx_bin])]
                idxs_corrected_bg = np.hstack((idxs_corrected_bg, idxs_matching_regions[to_take]))

            debug('original:%s' % target_hist)
            debug('K:%d' % K_MATCH)
            debug('to sample:%s' % to_match)
            debug('obtained:%s' % np.histogram(c_g_content_bg[idxs_corrected_bg], bins)[0])

            bg_coords = [bg_coords[idx] for idx in idxs_corrected_bg]
            c_g_content_bg = calculate_average_ngram_presence(bg_coords, genome, ngram_correction)

            debug(np.histogram(c_g_content_bg, bins)[0])

            if np.array_equal(K_MATCH * target_hist, np.histogram(c_g_content_bg, bins)[0]):
                info('C+G content perfectly matched!\n\ttarget:%s\n\tbg :%s' % (target_hist, np.histogram(c_g_content_bg, bins)[0]))
            else:
                warn('C+G content not perfectly matched\n\ttarget:%s\n\tbg :%s' % (target_hist, np.histogram(c_g_content_bg, bins)[0]))

            debug(target_hist / np.histogram(c_g_content_bg, bins)[0])

        # NOTE(review): the indentation of this step was reconstructed from the
        # control flow; it is kept inside the user-supplied-background branch
        # because its debug output relies on the histogram computed above.
        if len(bg_coords) >= bg_target_ratio * n_target_coordinates:
            bg_coords = random.sample(bg_coords, int(bg_target_ratio * n_target_coordinates))
        else:
            if bootstrap and len(bg_coords) < (bg_target_ratio * n_target_coordinates * 0.95):  # allow a small tolerance!
                info('bootstrapping to obtain enough background regions')
                bg_coords = sample_wr(bg_coords, int(bg_target_ratio * n_target_coordinates))
                # The histogram comparison only exists when the C+G matching
                # ran; without this guard --no_c_g_correction together with
                # --bootstrap failed on the unbound target_hist/bins.
                if target_hist is not None:
                    c_g_content_bg = calculate_average_ngram_presence(bg_coords, genome, ngram_correction)
                    debug('After bootstrap:\n\ttarget:%s\n\tbg :%s' % (target_hist, np.histogram(c_g_content_bg, bins)[0]))

    info('Extracting Motifs in background coordinates')
    negative_matrix, motifs_profiles_in_bg, idxs_seqs_with_motif_bg = parallel_fimo_scanning(
        bg_coords,
        meme_motifs_filename,
        genome, nucleotide_bg_filename,
        temp_directory=temp_directory,
        p_value=p_value,
        mask_repetitive=mask_repetitive,
        window_length=window_length,
        internal_window_length=internal_window_length,
        num_consumers=n_processes)[0:3]

    # allocate data for reports
    N_MOTIFS = len(motif_ids)
    fisher_p_values = np.zeros(N_MOTIFS)
    central_enrichment = np.zeros(N_MOTIFS)

    N_seq_p = positive_matrix.shape[0]
    N_seq_n = negative_matrix.shape[0]

    # Per motif: number and fraction of sequences with at least one hit.
    profile_presence_p = (positive_matrix > 0).sum(0)
    profile_presence_n = (negative_matrix > 0).sum(0)

    support_p = profile_presence_p / float(N_seq_p)
    support_n = profile_presence_n / float(N_seq_n)

    # Bounds of the internal window inside the profile; they are slice
    # indexes, hence the explicit integer division.
    internal_bpstart = window_length // 2 - internal_window_length // 2
    internal_bpend = window_length // 2 + internal_window_length // 2

    for idx, motif_id in enumerate(motif_ids):
        fisher_p_values[idx] = stats.fisher_exact([[profile_presence_p[idx], N_seq_p - profile_presence_p[idx]],
                                                   [profile_presence_n[idx], N_seq_n - profile_presence_n[idx]]])[1]

        # Mean profile inside the internal window over the mean outside it.
        central_enrichment[idx] = motifs_profiles_in_sequences[motif_id][internal_bpstart:internal_bpend].mean() / \
            np.hstack([motifs_profiles_in_sequences[motif_id][:internal_bpstart],
                       motifs_profiles_in_sequences[motif_id][internal_bpend:]]).mean()

    motif_ratios = (support_p + 0.01) / (support_n + 0.01)

    # Fundamental: motifs present in less than 3% of the targets get a neutral ratio.
    if not disable_ratio:
        motif_ratios[support_p < 0.03] = 1

    rankings = stats.rankdata(-motif_ratios)

    # filter here positive or positive and negative
    if not disable_ratio:
        idxs_to_keep = np.nonzero(motif_ratios > 1)[0]
    else:
        idxs_to_keep = list(range(len(motif_ratios)))

    rankings = rankings[idxs_to_keep]
    motif_ratios = motif_ratios[idxs_to_keep]
    support_p = support_p[idxs_to_keep]
    support_n = support_n[idxs_to_keep]
    fisher_p_values = fisher_p_values[idxs_to_keep]
    central_enrichment = central_enrichment[idxs_to_keep]

    motif_ids = [motif_ids[_] for _ in idxs_to_keep]
    motif_names = [motif_names[_] for _ in idxs_to_keep]
    motif_idxs = [_ for _ in idxs_to_keep]

    try:
        # we test the ones only with ratio >1
        qvalues = estimate_qvalues(fisher_p_values)
    except Exception:
        # Keep the report generation going (best effort, as before), but leave
        # qvalues defined: previously a failure here surfaced later as a
        # NameError as soon as one motif passed the filters.
        warn('Cannot estimate the q-values for these p-values: %s' % fisher_p_values)
        qvalues = np.full(len(fisher_p_values), np.nan)

    # generate reports in html
    info('Generating HTML report...')

    imgs_directory = os.path.join(output_directory, 'images')
    genes_list_directory = os.path.join(output_directory, 'genes_lists')
    motif_regions_directory = os.path.join(output_directory, 'motifs_regions')

    # create folders
    if not os.path.exists(imgs_directory):
        os.makedirs(imgs_directory)
    if use_gene_annotations and not os.path.exists(genes_list_directory):
        os.makedirs(genes_list_directory)
    if not os.path.exists(motif_regions_directory):
        os.makedirs(motif_regions_directory)

    j2_env = Environment(loader=FileSystemLoader(determine_path('extra') + '/templates/'), trim_blocks=True)

    info('DIRECTORY:%s' % determine_path('extra') + '/templates/')
    template = j2_env.get_template('report_template.html')

    # copy haystack logo and bg
    shutil.copyfile(determine_path('extra') + '/templates/haystack_logo.png', os.path.join(imgs_directory, 'haystack_logo.png'))
    shutil.copyfile(determine_path('extra') + '/templates/noise.png', os.path.join(imgs_directory, 'noise.png'))

    motifs_dump = []
    for i in np.argsort(rankings):
        if (support_p[i] >= 0.03 or disable_ratio) and fisher_p_values[i] < 0.01 and (motif_ratios[i] > 1 or disable_ratio) and central_enrichment[i] > min_central_enrichment:

            info('Generating logo and profile for:' + motif_ids[i])

            # create motif logo (png by default, then pdf)
            img_logo = os.path.join(imgs_directory, 'logo_' + motif_ids[i])
            generate_weblogo(motif_ids[i], meme_motifs_filename, img_logo, title=motif_ids[i], SEQLOGO=determine_path('extra') + '/seqlogo')
            generate_weblogo(motif_ids[i], meme_motifs_filename, img_logo, title=motif_ids[i], SEQLOGO=determine_path('extra') + '/seqlogo', file_format='pdf')
            # fix the weblogo prefix problem
            img_logo_url = os.path.join('images', 'logo_' + motif_ids[i] + '.png')

            # create motif enrichment profile
            img_profile = os.path.join(imgs_directory, 'profile_' + motif_ids[i] + '.png')
            motif_profile_target = motifs_profiles_in_sequences[motif_ids[i]] / N_seq_p
            motif_profile_bg = motifs_profiles_in_bg[motif_ids[i]] / N_seq_n

            generate_motif_profile(motif_profile_target, motif_profile_bg, motif_ids[i], img_profile, smooth_size=smooth_size, window_size=window_length)
            img_profile_url = os.path.join('images', 'profile_' + motif_ids[i] + '.png')

            # create regions
            info('Extracting regions with:' + motif_ids[i])
            regions = os.path.join(motif_regions_directory, motif_ids[i] + '_motif_region_in_target.bed')
            with open(regions, 'w+') as outfile:
                outfile.write('Chromosome\tStart\tEnd\tMotif hits inside region\tNumber of hits\n')
                for c, locations in motif_coords_in_seqs_with_motif[motif_ids[i]].items():
                    outfile.write('\t'.join([c.chr_id, str(c.bpstart), str(c.bpend),
                                             ';'.join(['-'.join(map(str, map(int, l))) for l in locations]),
                                             str(len(locations))]) + '\n')
            regions_url = os.path.join('motifs_regions', motif_ids[i] + '_motif_region_in_target.bed')

            # map closest downstream genes
            genes_url = None
            if use_gene_annotations:
                info('Mapping regions with:%s to the clostest genes' % motif_ids[i])

                peak_annotator_path = os.path.join(determine_path('extra/'), 'PeakAnnotator.jar')

                if gene_ids_to_names_filename:
                    sb.call('java -jar ' + peak_annotator_path + ' -u TSS -p %s -a %s -s %s -o %s >/dev/null 2>&1'
                            % (regions, gene_annotations_filename, gene_ids_to_names_filename, genes_list_directory),
                            shell=True, env=system_env)
                else:
                    sb.call('java -jar ' + peak_annotator_path + ' -u TSS -p %s -a %s -o %s >/dev/null 2>&1'
                            % (regions, gene_annotations_filename, genes_list_directory),
                            shell=True, env=system_env)

                genes_url = os.path.join('genes_lists', motif_ids[i] + '_motif_region_in_target.tss.bed')

            motifs_dump.append({'id': motif_ids[i], 'name': motif_names[i], 'support_p': support_p[i] * 100,
                                'support_n': support_n[i] * 100, 'ratio': motif_ratios[i], 'rank': float(rankings[i]),
                                'pvalue': fisher_p_values[i], 'qvalue': qvalues[i], 'central_enrichment': central_enrichment[i],
                                'img_logo': img_logo_url, 'img_profile': img_profile_url, 'regions': regions_url,
                                'genes': genes_url, 'idx_motif': motif_idxs[i]})

    # Render first, then write inside a context manager so the report file is
    # closed even when the write fails.
    report_html = template.render(motifs_dump=motifs_dump, bed_target_filename=bed_target_filename,
                                  bed_bg_filename=bed_bg_filename, N_TARGET=N_seq_p, N_BG=N_seq_n,
                                  meme_motifs_filename=meme_motifs_filename, COMMAND_USED=COMMAND_USED,
                                  use_gene_annotations=use_gene_annotations)
    with codecs.open(os.path.join(output_directory, "Haystack_report.html"), "w", "utf-8") as outfile:
        outfile.write(report_html)

    if dump:
        info('Saving all the intermediate data on: %s ...' % output_directory)

        dump_directory = os.path.join(output_directory, 'dump')

        if not os.path.exists(dump_directory):
            os.makedirs(dump_directory)

        np.save(os.path.join(dump_directory, 'matrix_' + target_name), positive_matrix)
        np.save(os.path.join(dump_directory, 'matrix_BG_' + target_name), negative_matrix)

        # Same files and same open mode as before; each handle is now closed
        # (and therefore flushed) as soon as its pickle has been written.
        pickles_to_write = [
            (motifs_dump, target_name + '_motif_dumps.pickle'),
            (idxs_seqs_with_motif, target_name + '_motif_seqs_idxs.pickle'),
            (idxs_seqs_with_motif_bg, bg_name + '_motif_seqs_idxs.pickle'),
            (motif_coords_in_seqs_with_motif, target_name + '_motif_coords_in_seqs_with_motif.pickle'),
        ]
        for data_to_dump, pickle_name in pickles_to_write:
            with open(os.path.join(dump_directory, pickle_name), 'w') as pickle_file:
                cp.dump(data_to_dump, pickle_file)

        Coordinate.coordinates_to_bed(target_coords, os.path.join(dump_directory, 'Target_coordinates_selected_on_' + target_name + '.bed'), minimal_format=False)
        Coordinate.coordinates_to_bed(bg_coords, os.path.join(dump_directory, 'BG_coordinates_selected_on_' + bg_name + '.bed'), minimal_format=True)

    info('All done! Ciao!')
    sys.exit(0)
def main():
    """Command-line entry point of the HAYSTACK TF activity plane utility.

    Parses the command line, loads the motif-to-gene mapping and the gene
    expression files listed in the samples file, ranks the expression values
    per sample, and then, for every ``*.bed`` gene list found in the
    ``genes_lists`` folder of a haystack_motifs output, saves one PDF plot per
    mapped TF gene that passes the filters (or for all of them with
    ``--plot_all``).

    With more than two samples the plane is built through ``zscore_series``;
    with exactly two samples the target/background ratio of the ranks is used
    instead.

    The function never returns: it terminates the process with ``sys.exit(1)``
    on an input problem and with ``sys.exit(0)`` on completion.
    """
    print('\n[H A Y S T A C K T F A C T I V I T Y P L A N E]')
    print('\n-TFs Activity on Gene Expression- [Luca Pinello - [email protected]]\n')
    print('Version %s\n' % HAYSTACK_VERSION)

    # mandatory
    parser = argparse.ArgumentParser(description='HAYSTACK Parameters',
                                     prog='haystack_tf_activity_plane')
    parser.add_argument(
        'haystack_motifs_output_folder',
        type=str,
        help='A path to a folder created by the haystack_motifs utility')
    parser.add_argument(
        'gene_expression_samples_filename',
        type=str,
        help='A file containing the list of sample names and locations')
    parser.add_argument(
        'target_cell_type',
        type=str,
        help='The sample name to use as a target for the analysis')

    # optional
    parser.add_argument(
        '--motif_mapping_filename',
        type=str,
        help='Custom motif to gene mapping file (the default is for JASPAR CORE 2016 database)')
    parser.add_argument('--output_directory',
                        type=str,
                        help='Output directory (default: current directory)')
    parser.add_argument('--name',
                        help='Define a custom output filename for the report')
    parser.add_argument(
        '--plot_all',
        help='Disable the filter on the TF activity and correlation (default z-score TF>0 and rho>0.3)',
        action='store_true')
    parser.add_argument('--version',
                        help='Print version and exit.',
                        action='version',
                        version='Version %s' % HAYSTACK_VERSION)

    args = parser.parse_args()

    # Bind every option explicitly instead of injecting locals through exec().
    haystack_motifs_output_folder = args.haystack_motifs_output_folder
    gene_expression_samples_filename = args.gene_expression_samples_filename
    target_cell_type = args.target_cell_type
    motif_mapping_filename = args.motif_mapping_filename
    output_directory = args.output_directory
    name = args.name
    plot_all = args.plot_all

    if not os.path.exists(haystack_motifs_output_folder):
        # The folder name is now interpolated; the message used to be logged
        # with a literal, unfilled "%s".
        error("The haystack_motifs_output_folder specified: %s doesn't exist!"
              % haystack_motifs_output_folder)
        sys.exit(1)

    check_file(gene_expression_samples_filename)

    if motif_mapping_filename:
        check_file(motif_mapping_filename)
    else:
        motif_mapping_filename = os.path.join(
            determine_path('motif_databases'),
            'JASPAR_CORE_2016_vertebrates_mapped_to_gene_human_mouse.txt')

    if name:
        directory_name = 'HAYSTACK_TFs_ACTIVITY_PLANES_on_' + name
    else:
        directory_name = 'HAYSTACK_TFs_ACTIVITY_PLANES_on_' + target_cell_type

    if output_directory:
        output_directory = os.path.join(output_directory, directory_name)
    else:
        output_directory = directory_name

    motif_mapping = pd.read_table(motif_mapping_filename,
                                  header=None,
                                  names=['MOTIF_ID', 'MOTIF_NAME', 'GENES'],
                                  index_col=0)
    # NOTE(review): presumably merges the rows sharing a MOTIF_ID; confirm
    # against group_motif_mapping.
    motif_mapping = motif_mapping.reset_index().groupby('MOTIF_ID').apply(
        group_motif_mapping)
    motif_mapping = motif_mapping.set_index('MOTIF_ID')

    # load mapping filename
    df_gene_mapping = pd.read_table(FileWrapper(
        "#", gene_expression_samples_filename, "r"),
                                    header=None,
                                    index_col=0,
                                    names=['Sample_name', 'Sample_file'])

    if target_cell_type not in df_gene_mapping.index:
        error('\nThe target_cell_type must be among these sample names:\n\n%s'
              % '\t'.join(df_gene_mapping.index.values))
        sys.exit(1)

    N_SAMPLES = df_gene_mapping.shape[0]

    if N_SAMPLES == 1:
        error('\nYou need at least gene expression for two cell-types. Exiting...')
        sys.exit(1)
    elif N_SAMPLES == 2:
        USE_ZSCORE = False
        # With two samples the one that is not the target is the background.
        bg_target_cell_type = list(
            set(df_gene_mapping.index) - {target_cell_type})[0]
        info('Only 2 samples provided, use expression ratio plane instead of z-score. Target:%s, Bg: %s'
             % (target_cell_type, bg_target_cell_type))
    else:
        USE_ZSCORE = True

    # load gene expression and calculate ranking
    gene_values = []
    for sample_name in df_gene_mapping.index:
        info('Load gene expression file for :%s' % sample_name)
        check_file(df_gene_mapping.ix[sample_name, 'Sample_file'])
        gene_values.append(
            pd.read_table(df_gene_mapping.ix[sample_name, 'Sample_file'],
                          index_col=0,
                          names=['Gene_Symbol', sample_name]))

    gene_values = pd.concat(gene_values, axis=1)
    # make names to uppercase! TODO
    gene_ranking = gene_values.rank(ascending=True)

    # create output folder
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    # For each motif make the plots
    for motif_gene_filename in glob.glob(
            os.path.join(haystack_motifs_output_folder, 'genes_lists') +
            '/*.bed'):

        # The motif id is the part of the file name before the first underscore.
        current_motif_id = os.path.basename(motif_gene_filename).split('_')[0]
        info('Analyzing %s from:%s' % (current_motif_id, motif_gene_filename))

        # genes closeby the motif sites (materialised as a list so the label
        # indexing below also gets a list on Python 3)
        mapped_genes = list(map(
            str.upper,
            list(pd.read_table(motif_gene_filename)['Symbol'].values)))

        # target genes average activity
        if USE_ZSCORE:
            ds_values = zscore_series(gene_ranking.ix[mapped_genes, :].mean())
        else:
            ds_values = (
                gene_ranking.ix[mapped_genes, target_cell_type] /
                gene_ranking.ix[mapped_genes, bg_target_cell_type]).mean()

        if current_motif_id in motif_mapping.index:
            current_motif_name = motif_mapping.ix[current_motif_id].MOTIF_NAME

            for gene_name in set(
                    map(str.upper,
                        motif_mapping.ix[current_motif_id].GENES.split(','))):

                # specificity of the TF
                try:
                    if USE_ZSCORE:
                        tf_values = zscore_series(
                            gene_ranking.ix[gene_name.upper()])
                    else:
                        tf_values = (gene_ranking.ix[gene_name.upper(),
                                                     target_cell_type] /
                                     gene_ranking.ix[gene_name.upper(),
                                                     bg_target_cell_type])
                except Exception:
                    # Still best effort (the gene is skipped), but Ctrl-C and
                    # SystemExit are no longer swallowed as a bare except did.
                    warn('The expression values of the gene %s are not present. Skipping it.'
                         % gene_name.upper())
                    continue

                if USE_ZSCORE:
                    # correlation
                    ro = np.corrcoef(tf_values, ds_values)[0, 1]

                    tf_value = tf_values[target_cell_type]
                    ds_value = ds_values[target_cell_type]
                    info('Gene:%s TF z-score:%.2f Targets z-score:%.2f Correlation:%.2f'
                         % (gene_name, tf_value, ds_value, ro))

                    # make plots
                    if (tf_value > 0 and np.abs(ro) > 0.3) or plot_all:
                        # Axis limits: at least [-4, 4], widened by 10% when
                        # the values go beyond that.
                        x_min = min(-4, tf_values.min() * 1.1)
                        x_max = max(4, tf_values.max() * 1.1)
                        y_min = min(-4, ds_values.min() * 1.1)
                        y_max = max(4, ds_values.max() * 1.1)

                        fig = plt.figure(figsize=(10, 10),
                                         dpi=80,
                                         facecolor='w',
                                         edgecolor='w')
                        ax = fig.add_subplot(111)
                        plt.grid()
                        plt.plot([x_min, x_max], [0, 0], 'k')
                        plt.plot([0, 0], [y_min, y_max], 'k')
                        ax.scatter(tf_values,
                                   ds_values,
                                   s=100,
                                   facecolors='none',
                                   edgecolors='k',
                                   label='rest of cell-types')
                        ax.hold(True)
                        ax.plot(tf_values[target_cell_type],
                                ds_values[target_cell_type],
                                '*r',
                                markersize=30,
                                linestyle='None',
                                label=target_cell_type)
                        ax.legend(loc='center',
                                  bbox_to_anchor=(0.5, -0.1),
                                  ncol=3,
                                  fancybox=True,
                                  shadow=True,
                                  numpoints=1)
                        ax.set_aspect('equal')
                        plt.text(x_min * 0.98,
                                 y_max * 0.85,
                                 r'$\rho$=%.2f' % ro,
                                 fontsize=14)
                        plt.xlim(x_min, x_max)
                        plt.ylim(y_min, y_max)
                        plt.xlabel('TF z-score', fontsize=16)
                        plt.ylabel('Targets z-score', fontsize=16)
                        plt.title(
                            'Motif: %s (%s) Gene: %s' %
                            (current_motif_name, current_motif_id, gene_name),
                            fontsize=17)
                        plt.savefig(
                            os.path.join(
                                output_directory,
                                '%s_motif_%s(%s)_gene_%s.pdf' %
                                (target_cell_type,
                                 current_motif_name.replace('::', '_'),
                                 current_motif_id, gene_name)))
                        plt.close()

                else:
                    info('Gene:%s TF expression ratio:%.2f Targets expression ratio:%.2f'
                         % (gene_name, tf_values, ds_values))

                    # Axis limits: at least [0, 2], widened by 10% when the
                    # ratios go beyond that.
                    x_min = min(0, tf_values * 1.1)
                    x_max = max(2, tf_values * 1.1)
                    y_min = min(0, ds_values * 1.1)
                    y_max = max(2, ds_values * 1.1)

                    if (tf_values > 1.2) & ((ds_values > 1.2) |
                                            (ds_values < 0.8)) or plot_all:
                        fig = plt.figure(figsize=(10, 10),
                                         dpi=80,
                                         facecolor='w',
                                         edgecolor='w')
                        ax = fig.add_subplot(111)
                        plt.grid()
                        plt.plot([x_min, x_max], [1, 1], 'k')
                        plt.plot([1, 1], [y_min, y_max], 'k')
                        ax.plot(tf_values,
                                ds_values,
                                '*r',
                                markersize=30,
                                linestyle='None',
                                label=target_cell_type)
                        ax.set_aspect('equal')
                        plt.xlim(x_min, x_max)
                        plt.ylim(y_min, y_max)
                        plt.xlabel('TF expression ratio (%s/%s)' %
                                   (target_cell_type, bg_target_cell_type),
                                   fontsize=16)
                        plt.ylabel('Average Targets Expression Ratio (%s/%s)' %
                                   (target_cell_type, bg_target_cell_type),
                                   fontsize=16)
                        plt.title(
                            'Motif: %s (%s) Gene: %s' %
                            (current_motif_name, current_motif_id, gene_name),
                            fontsize=17)
                        plt.savefig(
                            os.path.join(
                                output_directory,
                                '%s_motif_%s(%s)_gene_%s.pdf' %
                                (target_cell_type,
                                 current_motif_name.replace('::', '_'),
                                 current_motif_id, gene_name)))
                        plt.close()

        else:
            warn('Sorry the motif %s is not mappable to gene' %
                 current_motif_id)

    info('All done! Ciao!')
    sys.exit(0)
def main():
    """Command-line entry point of ``haystack_tf_activity_plane``.

    Parses the command line, loads the motif-to-gene mapping and one gene
    expression table per sample, ranks the expression values per sample and
    then, for every ``*.bed`` gene list found in
    ``<haystack_motifs_output_folder>/genes_lists``, compares the activity of
    each TF gene mapped to that motif with the average activity of the
    motif's target genes.

    * With more than two samples both quantities go through
      ``zscore_series`` and a plot is saved when the TF value of the target
      cell type is positive and ``|rho| > 0.3`` (or when ``--plot_all`` is
      given).
    * With exactly two samples the target/background rank ratio is used
      instead, and a plot is saved when the TF ratio is above 1.2 and the
      targets ratio is above 1.2 or below 0.8 (or when ``--plot_all`` is
      given).

    One PDF per plotted (motif, gene) pair is written to the output
    directory.  The function always terminates through ``sys.exit``: status 1
    on invalid input, status 0 on completion.
    """
    print('\n[H A Y S T A C K T F A C T I V I T Y P L A N E]')
    print('\n-TFs Activity on Gene Expression- [Luca Pinello - [email protected]]\n')
    print('Version %s\n' % HAYSTACK_VERSION)

    # mandatory
    parser = argparse.ArgumentParser(description='HAYSTACK Parameters',
                                     prog='haystack_tf_activity_plane')
    parser.add_argument('haystack_motifs_output_folder', type=str,
                        help='A path to a folder created by the haystack_motifs utility')
    parser.add_argument('gene_expression_samples_filename', type=str,
                        help='A file containing the list of sample names and locations')
    parser.add_argument('target_cell_type', type=str,
                        help='The sample name to use as a target for the analysis')

    # optional
    parser.add_argument('--motif_mapping_filename', type=str,
                        help='Custom motif to gene mapping file (the default is for JASPAR CORE 2016 database)')
    parser.add_argument('--output_directory', type=str,
                        help='Output directory (default: current directory)')
    parser.add_argument('--name', help='Define a custom output filename for the report')
    parser.add_argument('--plot_all',
                        help='Disable the filter on the TF activity and correlation (default z-score TF>0 and rho>0.3)',
                        action='store_true')
    parser.add_argument('--version', help='Print version and exit.', action='version',
                        version='Version %s' % HAYSTACK_VERSION)

    args = parser.parse_args()

    # Read the options straight from the namespace.  The previous
    # exec('name=repr(value)') trick only works on Python 2 and breaks on any
    # value whose repr() is not a valid literal.
    haystack_motifs_output_folder = args.haystack_motifs_output_folder
    gene_expression_samples_filename = args.gene_expression_samples_filename
    target_cell_type = args.target_cell_type
    motif_mapping_filename = args.motif_mapping_filename
    output_directory = args.output_directory
    name = args.name
    plot_all = args.plot_all

    if not os.path.exists(haystack_motifs_output_folder):
        # The folder name used to be missing from the message (the '%s' had
        # no argument and was printed literally).
        error("The haystack_motifs_output_folder specified: %s doesn't exist!"
              % haystack_motifs_output_folder)
        sys.exit(1)

    check_file(gene_expression_samples_filename)

    if motif_mapping_filename:
        check_file(motif_mapping_filename)
    else:
        motif_mapping_filename = os.path.join(
            determine_path('motif_databases'),
            'JASPAR_CORE_2016_vertebrates_mapped_to_gene_human_mouse.txt')

    if name:
        directory_name = 'HAYSTACK_TFs_ACTIVITY_PLANES_on_' + name
    else:
        directory_name = 'HAYSTACK_TFs_ACTIVITY_PLANES_on_' + target_cell_type

    if output_directory:
        output_directory = os.path.join(output_directory, directory_name)
    else:
        output_directory = directory_name

    # motif id -> (motif name, comma separated gene symbols)
    motif_mapping = pd.read_table(motif_mapping_filename, header=None,
                                  names=['MOTIF_ID', 'MOTIF_NAME', 'GENES'],
                                  index_col=0)
    motif_mapping = motif_mapping.reset_index().groupby('MOTIF_ID').apply(group_motif_mapping)
    motif_mapping = motif_mapping.set_index('MOTIF_ID')

    # load mapping filename (sample name -> gene expression file)
    # NOTE(review): FileWrapper("#", ...) presumably hides '#' comment lines
    # from pandas -- confirm against its definition.
    df_gene_mapping = pd.read_table(
        FileWrapper("#", gene_expression_samples_filename, "r"),
        header=None, index_col=0, names=['Sample_name', 'Sample_file'])

    if target_cell_type not in df_gene_mapping.index:
        error('\nThe target_cell_type must be among these sample names:\n\n%s'
              % '\t'.join(df_gene_mapping.index.values))
        sys.exit(1)

    n_samples = df_gene_mapping.shape[0]
    bg_target_cell_type = None  # only meaningful in the two-sample case
    if n_samples == 1:
        error('\nYou need at least gene expression for two cell-types. Exiting...')
        sys.exit(1)
    elif n_samples == 2:
        use_zscore = False
        bg_target_cell_type = list(set(df_gene_mapping.index) - {target_cell_type})[0]
        info('Only 2 samples provided, use expression ratio plane instead of z-score. Target:%s, Bg: %s'
             % (target_cell_type, bg_target_cell_type))
    else:
        use_zscore = True

    # load gene expression and calculate ranking
    gene_values = []
    for sample_name in df_gene_mapping.index:
        info('Load gene expression file for :%s' % sample_name)
        sample_file = df_gene_mapping.ix[sample_name, 'Sample_file']
        check_file(sample_file)
        gene_values.append(pd.read_table(sample_file, index_col=0,
                                         names=['Gene_Symbol', sample_name]))

    gene_values = pd.concat(gene_values, axis=1)
    # make names to uppercase! TODO
    gene_ranking = gene_values.rank(ascending=True)

    # create output folder
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    # For each motif make the plots
    genes_lists_pattern = os.path.join(haystack_motifs_output_folder, 'genes_lists') + '/*.bed'
    for motif_gene_filename in glob.glob(genes_lists_pattern):
        current_motif_id = os.path.basename(motif_gene_filename).split('_')[0]
        info('Analyzing %s from:%s' % (current_motif_id, motif_gene_filename))

        # genes closeby the motif sites
        mapped_genes = list(map(str.upper,
                                list(pd.read_table(motif_gene_filename)['Symbol'].values)))

        # target genes average activity
        if use_zscore:
            ds_values = zscore_series(gene_ranking.ix[mapped_genes, :].mean())
        else:
            ds_values = (gene_ranking.ix[mapped_genes, target_cell_type]
                         / gene_ranking.ix[mapped_genes, bg_target_cell_type]).mean()

        if current_motif_id not in motif_mapping.index:
            warn('Sorry the motif %s is not mappable to gene' % current_motif_id)
            continue

        current_motif_name = motif_mapping.ix[current_motif_id].MOTIF_NAME
        motif_genes = set(map(str.upper,
                              motif_mapping.ix[current_motif_id].GENES.split(',')))

        # gene_name is already upper-cased by the set construction above.
        for gene_name in motif_genes:
            # specificity of the TF
            try:
                if use_zscore:
                    tf_values = zscore_series(gene_ranking.ix[gene_name])
                else:
                    tf_values = (gene_ranking.ix[gene_name, target_cell_type]
                                 / gene_ranking.ix[gene_name, bg_target_cell_type])
            except Exception:
                # Best effort, as before: any failure to fetch the gene is
                # reported as "not present".  Unlike a bare ``except`` this
                # no longer swallows KeyboardInterrupt / SystemExit.
                warn('The expression values of the gene %s are not present. Skipping it.'
                     % gene_name)
                continue

            output_pdf = os.path.join(
                output_directory,
                '%s_motif_%s(%s)_gene_%s.pdf' % (target_cell_type,
                                                 current_motif_name.replace('::', '_'),
                                                 current_motif_id,
                                                 gene_name))
            plot_title = 'Motif: %s (%s) Gene: %s' % (current_motif_name,
                                                      current_motif_id,
                                                      gene_name)

            if use_zscore:
                # correlation
                ro = np.corrcoef(tf_values, ds_values)[0, 1]
                tf_value = tf_values[target_cell_type]
                ds_value = ds_values[target_cell_type]
                info('Gene:%s TF z-score:%.2f Targets z-score:%.2f Correlation:%.2f'
                     % (gene_name, tf_value, ds_value, ro))

                # make plots
                if (tf_value > 0 and np.abs(ro) > 0.3) or plot_all:
                    x_min = min(-4, tf_values.min() * 1.1)
                    x_max = max(4, tf_values.max() * 1.1)
                    y_min = min(-4, ds_values.min() * 1.1)
                    y_max = max(4, ds_values.max() * 1.1)

                    fig = plt.figure(figsize=(10, 10), dpi=80,
                                     facecolor='w', edgecolor='w')
                    ax = fig.add_subplot(111)
                    plt.grid()
                    plt.plot([x_min, x_max], [0, 0], 'k')
                    plt.plot([0, 0], [y_min, y_max], 'k')
                    ax.scatter(tf_values, ds_values, s=100, facecolors='none',
                               edgecolors='k', label='rest of cell-types')
                    # NOTE(review): Axes.hold only exists in old matplotlib
                    # releases (removed in 3.0); kept for the version this
                    # script targets.
                    ax.hold(True)
                    ax.plot(tf_values[target_cell_type], ds_values[target_cell_type],
                            '*r', markersize=30, linestyle='None',
                            label=target_cell_type)
                    ax.legend(loc='center', bbox_to_anchor=(0.5, -0.1), ncol=3,
                              fancybox=True, shadow=True, numpoints=1)
                    ax.set_aspect('equal')
                    plt.text(x_min * 0.98, y_max * 0.85, r'$\rho$=%.2f' % ro,
                             fontsize=14)
                    plt.xlim(x_min, x_max)
                    plt.ylim(y_min, y_max)
                    plt.xlabel('TF z-score', fontsize=16)
                    plt.ylabel('Targets z-score', fontsize=16)
                    plt.title(plot_title, fontsize=17)
                    plt.savefig(output_pdf)
                    plt.close()
            else:
                info('Gene:%s TF expression ratio:%.2f Targets expression ratio:%.2f'
                     % (gene_name, tf_values, ds_values))

                x_min = min(0, tf_values * 1.1)
                x_max = max(2, tf_values * 1.1)
                y_min = min(0, ds_values * 1.1)
                y_max = max(2, ds_values * 1.1)

                if (tf_values > 1.2) & ((ds_values > 1.2) | (ds_values < 0.8)) or plot_all:
                    fig = plt.figure(figsize=(10, 10), dpi=80,
                                     facecolor='w', edgecolor='w')
                    ax = fig.add_subplot(111)
                    plt.grid()
                    plt.plot([x_min, x_max], [1, 1], 'k')
                    plt.plot([1, 1], [y_min, y_max], 'k')
                    ax.plot(tf_values, ds_values, '*r', markersize=30,
                            linestyle='None', label=target_cell_type)
                    ax.set_aspect('equal')
                    plt.xlim(x_min, x_max)
                    plt.ylim(y_min, y_max)
                    plt.xlabel('TF expression ratio (%s/%s)'
                               % (target_cell_type, bg_target_cell_type), fontsize=16)
                    plt.ylabel('Average Targets Expression Ratio (%s/%s)'
                               % (target_cell_type, bg_target_cell_type), fontsize=16)
                    plt.title(plot_title, fontsize=17)
                    plt.savefig(output_pdf)
                    plt.close()

    info('All done! Ciao!')
    sys.exit(0)
def create_tiled_genome(genome_name, output_directory, chr_len_filename,
                        bin_size, chrom_exclude, blacklist):
    """Create the BED file of fixed-size genome bins, optionally blacklist-filtered.

    The chromosome-length file is sorted in place, optionally stripped of the
    chromosomes whose name matches ``chrom_exclude`` and tiled with
    ``bedtools makewindows``.  Unless ``blacklist`` is ``'none'``, the bins
    overlapping a blacklisted region are then removed with
    ``bedtools intersect -v``.

    Two names that are not defined in this function are read from the
    enclosing (module) scope: ``do_not_recompute`` (skip all the work when the
    bins file already exists) and ``keep_intermediate_files`` (keep the
    unfiltered bins file after blacklist filtering).

    Args:
        genome_name: Genome identifier used to build the output file names.
        output_directory: Folder that receives the generated files.
        chr_len_filename: Chromosome-length file; it is re-sorted in place.
        bin_size: Bin width in bp passed to ``bedtools makewindows -w``.
        chrom_exclude: Regular expression; chromosomes whose name matches it
            (``re.search``) are left out.  A falsy value disables the filter.
        blacklist: ``'none'``, ``'hg19'`` (bundled annotation file) or the
            path of a BED file.  The blacklist file is re-sorted in place.

    Returns:
        The path of the bins file to use: the blacklist-filtered file when a
        blacklist was applied, the plain sorted bins file otherwise.

    The process exits with status 1 when ``blacklist`` is neither ``'none'``,
    ``'hg19'`` nor an existing file.  Exit codes of the external commands are
    not checked.
    """
    from re import search

    # NOTE(review): the 'genomes' path is not used below; the call is kept in
    # case determine_path has side effects -- confirm and drop if it has none.
    genome_directory = determine_path('genomes')
    annotations_directory = determine_path('gene_annotations')

    genome_sorted_bins_file = os.path.join(
        output_directory,
        '%s.%dbp.bins.sorted.bed' % (os.path.basename(genome_name), bin_size))
    chr_len_sorted_filtered_filename = os.path.join(
        output_directory, "%s_chr_lengths_sorted_filtered.txt" % genome_name)

    # Nothing to do when a previous run already produced the bins file.
    if os.path.exists(genome_sorted_bins_file) and do_not_recompute:
        return genome_sorted_bins_file

    info('Sorting chromosome lengths file once again to double check....')
    cmd = ' sort -k1,1 -k2,2n "%s" -o "%s" ' % (chr_len_filename,
                                                  chr_len_filename)
    sb.call(cmd, shell=True)

    info('Creating bins of %dbp in %s' % (bin_size, genome_sorted_bins_file))

    if chrom_exclude:
        # Both files are context-managed (the source file used to be left
        # open) and the output is opened in text mode, matching the text
        # lines written to it.
        with open(chr_len_filename) as chr_len_file, \
                open(chr_len_sorted_filtered_filename, 'w') as filtered_file:
            filtered_file.writelines(
                line for line in chr_len_file
                if not search(chrom_exclude, line.split()[0]))
    else:
        chr_len_sorted_filtered_filename = chr_len_filename

    cmd = 'bedtools makewindows -g "%s" -w %s > "%s" ' % (
        chr_len_sorted_filtered_filename, bin_size, genome_sorted_bins_file)
    sb.call(cmd, shell=True)

    if blacklist == 'none':
        info('Tiled genome file created will not be blacklist filtered')
        return genome_sorted_bins_file

    info('Tiled genome file created will be blacklist filtered')

    if blacklist == 'hg19':
        info('Using hg19 blacklist file %s to filter out the regions' % blacklist)
        blacklist_filepath = os.path.join(annotations_directory,
                                          'hg19_blacklisted_regions.bed')
        check_file(blacklist_filepath)
    elif os.path.isfile(blacklist):
        info('Using blacklist file %s to filter out the regions' % blacklist)
        blacklist_filepath = blacklist
        check_file(blacklist_filepath)
    else:
        error('Incorrect blacklist option provided. '
              'It is neither a file nor a genome')
        sys.exit(1)

    info('Sort blacklist file')
    cmd = ' sort -k1,1 -k2,2n "%s" -o "%s" ' % (blacklist_filepath,
                                                  blacklist_filepath)
    sb.call(cmd, shell=True)

    genome_sorted_bins_filtered_file = genome_sorted_bins_file.replace(
        '.bed', '.filtered.bed')

    info(' filter out blacklist regions')
    # The redirect target is quoted like every other path in this function,
    # so an output directory containing spaces no longer breaks the command.
    cmd = 'bedtools intersect -sorted -a "%s" -b "%s" -v > "%s" ' % (
        genome_sorted_bins_file, blacklist_filepath,
        genome_sorted_bins_filtered_file)
    sb.call(cmd, shell=True)

    if not keep_intermediate_files:
        info('Deleting %s' % genome_sorted_bins_file)
        try:
            os.remove(genome_sorted_bins_file)
        except OSError:
            # Best-effort clean-up: a missing or undeletable intermediate
            # file must not abort the analysis.
            pass

    return genome_sorted_bins_filtered_file
def main(input_args=None):
    """Run the Haystack pipeline: hotspots, motifs and, optionally, TF activity.

    The samples are read either from a samples file (2 columns: sample name
    and data file; 3 columns: the same plus a gene expression file) or from a
    folder that is scanned for ``.bam`` / ``.bw`` files.  The function then

    1. writes ``sample_names_hotspots.txt`` and runs ``haystack_hotspots``;
    2. runs ``haystack_motifs`` once per sample on the specific/background
       regions found under ``HAYSTACK_HOTSPOTS/SPECIFIC_REGIONS``;
    3. when every line of the samples file has 3 columns, writes
       ``sample_names_tf_activity.txt`` and runs
       ``haystack_tf_activity_plane`` for each sample whose motif output
       folder exists.

    Args:
        input_args: Argument list handed to ``parse_args``; ``None`` makes
            argparse read ``sys.argv``.

    The process exits with status 1 on invalid input.  Exit codes of the
    external tools are not checked.
    """
    print('\n[H A Y S T A C K P I P E L I N E]')
    print('\n-SELECTION OF HOTSPOTS OF VARIABILITY AND ENRICHED MOTIFS-\n')
    print('Version %s\n' % HAYSTACK_VERSION)

    # Options are read from the namespace directly.  The previous
    # exec('name=repr(value)') trick only works on Python 2 and breaks on any
    # value whose repr() is not a valid literal.
    parser = get_args_pipeline()
    args = parser.parse_args(input_args)

    def run_command(command):
        # The command is an argument list (no shell), so paths and sample
        # names containing spaces or shell metacharacters are passed intact.
        print(' '.join(command))
        sb.call(command)

    def shortest_match(pattern):
        # Return the shortest path matching ``pattern`` (ties broken
        # alphabetically) or None.  The shortest match is preferred so that a
        # sample whose name is a prefix of another sample's name resolves to
        # its own file.
        matches = glob.glob(pattern)
        if not matches:
            return None
        return min(matches, key=lambda path: (len(path), path))

    if args.meme_motifs_filename:
        check_file(args.meme_motifs_filename)

    if args.motif_mapping_filename:
        check_file(args.motif_mapping_filename)

    if not os.path.exists(args.temp_directory):
        error('The folder specified with --temp_directory: %s does not exist!'
              % args.temp_directory)
        sys.exit(1)

    if args.input_is_bigwig:
        extension_to_check = '.bw'
        info('Input is set BigWig (.bw)')
    else:
        extension_to_check = '.bam'
        info('Input is set compressed SAM (.bam)')

    if args.name:
        directory_name = 'HAYSTACK_PIPELINE_RESULTS_on_%s' % args.name
    else:
        directory_name = 'HAYSTACK_PIPELINE_RESULTS'

    if args.output_directory:
        output_directory = os.path.join(args.output_directory, directory_name)
    else:
        output_directory = directory_name

    # check folder or sample filename
    samples_source = args.samples_filename_or_bam_folder
    if not os.path.exists(samples_source):
        error("The file or folder %s doesn't exist. Exiting." % samples_source)
        sys.exit(1)

    sample_names = []
    data_filenames = []
    gene_expression_filenames = []

    if os.path.isfile(samples_source):
        bam_folder = False
        # Stays True only if every sample line carries a third column.
        use_gene_expression = True
        with open(samples_source) as infile:
            for line in infile:
                if not line.strip():
                    continue
                if line.startswith('#'):  # skip optional header line
                    info('Skipping header/comment line:%s' % line)
                    continue
                fields = line.strip().split()
                if len(fields) == 2:
                    use_gene_expression = False
                    sample_names.append(fields[0])
                    data_filenames.append(fields[1])
                elif len(fields) == 3:
                    sample_names.append(fields[0])
                    data_filenames.append(fields[1])
                    gene_expression_filenames.append(fields[2])
                else:
                    error('The samples file format is wrong!')
                    sys.exit(1)
        if not data_filenames:
            # A file holding only blank/comment lines used to be accepted and
            # the tools were then launched without any sample.
            error('No samples listed in %s. Exiting.' % samples_source)
            sys.exit(1)
    else:
        # The path exists (checked above) and is not a file: scan the folder.
        bam_folder = True
        use_gene_expression = False
        data_filenames = glob.glob(
            os.path.join(samples_source, '*' + extension_to_check))
        if not data_filenames:
            error('No bam/bigwig files to analyze in %s. Exiting.' % samples_source)
            sys.exit(1)
        sample_names = [
            os.path.basename(data_filename).replace(extension_to_check, '')
            for data_filename in data_filenames
        ]

    # check all the files before starting
    info('Checking samples files location...')
    for data_filename in data_filenames:
        check_file(data_filename)

    if use_gene_expression:
        for gene_expression_filename in gene_expression_filenames:
            check_file(gene_expression_filename)

    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    # copy back the file used
    if not bam_folder:
        shutil.copy2(samples_source, output_directory)

    # write hotspots conf files
    sample_names_hotspots_filename = os.path.join(output_directory,
                                                  'sample_names_hotspots.txt')
    with open(sample_names_hotspots_filename, 'w') as outfile:
        for sample_name, data_filename in zip(sample_names, data_filenames):
            outfile.write('%s\t%s\n' % (sample_name, data_filename))

    # CALL HAYSTACK HOTSPOTS
    hotspots_cmd = ['haystack_hotspots',
                    sample_names_hotspots_filename,
                    args.genome_name,
                    '--output_directory', output_directory,
                    '--bin_size', '%d' % args.bin_size]
    for flag in ('do_not_filter_bams', 'depleted', 'do_not_recompute',
                 'keep_intermediate_files', 'input_is_bigwig',
                 'disable_quantile_normalization'):
        if getattr(args, flag):
            hotspots_cmd.append('--' + flag)
    hotspots_cmd += ['--transformation', '%s' % args.transformation,
                     '--chrom_exclude', '%s' % args.chrom_exclude,
                     '--z_score_high', '%f' % args.z_score_high,
                     '--z_score_low', '%f' % args.z_score_low,
                     '--th_rpm', '%f' % args.th_rpm,
                     '--blacklist', '%s' % args.blacklist,
                     '--read_ext', '%d' % args.read_ext,
                     '--n_processes', '%d' % args.n_processes]
    run_command(hotspots_cmd)

    # CALL HAYSTACK MOTIFS
    motif_directory = os.path.join(output_directory, 'HAYSTACK_MOTIFS')
    specific_regions_directory = os.path.join(output_directory,
                                              'HAYSTACK_HOTSPOTS',
                                              'SPECIFIC_REGIONS')
    for sample_name in sample_names:
        # Both region files are resolved here.  Previously the specific
        # regions pattern was handed to the shell unexpanded, while a missing
        # background file raised an IndexError.
        specific_regions_filename = shortest_match(
            os.path.join(specific_regions_directory,
                         'Regions_specific_for_%s*.bed' % sample_name))
        bg_regions_filename = shortest_match(
            os.path.join(specific_regions_directory,
                         'Background_for_%s*.bed' % sample_name))

        if specific_regions_filename is None or bg_regions_filename is None:
            error('Specific/background regions for %s not found in %s. '
                  'Skipping the motif analysis for this sample.'
                  % (sample_name, specific_regions_directory))
            continue

        motifs_cmd = ['haystack_motifs',
                      specific_regions_filename,
                      args.genome_name,
                      '--bed_bg_filename', bg_regions_filename,
                      '--output_directory', motif_directory,
                      '--name', sample_name]
        if args.meme_motifs_filename:
            motifs_cmd += ['--meme_motifs_filename', args.meme_motifs_filename]
        if args.n_processes:
            motifs_cmd += ['--n_processes', '%d' % args.n_processes]
        if args.temp_directory:
            motifs_cmd += ['--temp_directory', args.temp_directory]
        run_command(motifs_cmd)

    if use_gene_expression:
        # write tf activity conf files
        sample_names_tf_activity_filename = os.path.join(
            output_directory, 'sample_names_tf_activity.txt')
        with open(sample_names_tf_activity_filename, 'w') as outfile:
            for sample_name, gene_expression_filename in zip(
                    sample_names, gene_expression_filenames):
                outfile.write('%s\t%s\n' % (sample_name, gene_expression_filename))

        tf_activity_directory = os.path.join(output_directory,
                                             'HAYSTACK_TFs_ACTIVITY_PLANES')

        # CALL HAYSTACK TF ACTIVITY
        for sample_name in sample_names:
            motifs_output_folder = os.path.join(
                motif_directory, 'HAYSTACK_MOTIFS_on_%s' % sample_name)
            if not os.path.exists(motifs_output_folder):
                continue

            tf_activity_cmd = ['haystack_tf_activity_plane',
                               motifs_output_folder,
                               sample_names_tf_activity_filename,
                               sample_name,
                               '--output_directory', tf_activity_directory]
            if args.motif_mapping_filename:
                tf_activity_cmd += ['--motif_mapping_filename',
                                    args.motif_mapping_filename]
            if args.plot_all:
                tf_activity_cmd.append('--plot_all')
            # "is not None" rather than truthiness: an explicit cutoff of 0
            # is a valid request and must be forwarded.
            # NOTE(review): assumes the parser stores these two options as
            # numbers or None -- confirm in get_args_pipeline.
            if args.rho_cutoff is not None:
                tf_activity_cmd += ['--rho_cutoff', '%f' % args.rho_cutoff]
            if args.tf_value_cuttoff is not None:
                tf_activity_cmd += ['--tf_value_cuttoff',
                                    '%f' % args.tf_value_cuttoff]
            run_command(tf_activity_cmd)
def main(): print '\n[H A Y S T A C K H O T S P O T]' print( '\n-SELECTION OF VARIABLE REGIONS- [Luca Pinello - [email protected]]\n' ) print 'Version %s\n' % HAYSTACK_VERSION if which('samtools') is None: error( 'Haystack requires samtools free available at: http://sourceforge.net/projects/samtools/files/samtools/0.1.19/' ) sys.exit(1) if which('bedtools') is None: error( 'Haystack requires bedtools free available at: https://github.com/arq5x/bedtools2/releases/tag/v2.20.1' ) sys.exit(1) if which('bedGraphToBigWig') is None: info( 'To generate the bigwig files Haystack requires bedGraphToBigWig please download from here: http://hgdownload.cse.ucsc.edu/admin/exe/ and add to your PATH' ) #mandatory parser = argparse.ArgumentParser(description='HAYSTACK Parameters') parser.add_argument( 'samples_filename_or_bam_folder', type=str, help= 'A tab delimeted file with in each row (1) a sample name, (2) the path to the corresponding bam filename. Alternatively it is possible to specify a folder containing some .bam files to analyze.' 
) parser.add_argument( 'genome_name', type=str, help='Genome assembly to use from UCSC (for example hg19, mm9, etc.)') #optional parser.add_argument('--bin_size', type=int, help='bin size to use(default: 500bp)', default=500) parser.add_argument('--disable_quantile_normalization', help='Disable quantile normalization (default: False)', action='store_true') parser.add_argument( '--th_rpm', type=float, help= 'Percentile on the signal intensity to consider for the hotspots (default: 99)', default=99) parser.add_argument( '--transformation', type=str, help= 'Variance stabilizing transformation among: none, log2, angle (default: angle)', default='angle', choices=['angle', 'log2', 'none']) parser.add_argument('--recompute_all', help='Ignore any file previously precalculated', action='store_true') parser.add_argument( '--z_score_high', type=float, help='z-score value to select the specific regions(default: 1.5)', default=1.5) parser.add_argument( '--z_score_low', type=float, help='z-score value to select the not specific regions(default: 0.25)', default=0.25) parser.add_argument('--name', help='Define a custom output filename for the report', default='') parser.add_argument('--output_directory', type=str, help='Output directory (default: current directory)', default='') parser.add_argument( '--use_X_Y', help= 'Force to process the X and Y chromosomes (default: not processed)', action='store_true') parser.add_argument( '--max_regions_percentage', type=float, help= 'Upper bound on the %% of the regions selected (deafult: 0.1, 0.0=0%% 1.0=100%%)', default=0.1) parser.add_argument( '--depleted', help= 'Look for cell type specific regions with depletion of signal instead of enrichment', action='store_true') parser.add_argument( '--input_is_bigwig', help= 'Use the bigwig format instead of the bam format for the input. 
Note: The files must have extension .bw', action='store_true') parser.add_argument('--version', help='Print version and exit.', action='version', version='Version %s' % HAYSTACK_VERSION) args = parser.parse_args() args_dict = vars(args) for key, value in args_dict.items(): exec('%s=%s' % (key, repr(value))) if input_is_bigwig: extension_to_check = '.bw' info('Input is set BigWig (.bw)') else: extension_to_check = '.bam' info('Input is set compressed SAM (.bam)') #check folder or sample filename if os.path.isfile(samples_filename_or_bam_folder): BAM_FOLDER = False bam_filenames = [] sample_names = [] with open(samples_filename_or_bam_folder) as infile: for line in infile: if not line.strip(): continue if line.startswith( '#'): #skip optional header line or empty lines info('Skipping header/comment line:%s' % line) continue fields = line.strip().split() n_fields = len(fields) if n_fields == 2: sample_names.append(fields[0]) bam_filenames.append(fields[1]) else: error('The samples file format is wrong!') sys.exit(1) else: if os.path.exists(samples_filename_or_bam_folder): BAM_FOLDER = True bam_filenames = glob.glob( os.path.join(samples_filename_or_bam_folder, '*' + extension_to_check)) if not bam_filenames: error('No bam/bigwig files to analyze in %s. Exiting.' % samples_filename_or_bam_folder) sys.exit(1) sample_names = [ os.path.basename(bam_filename).replace(extension_to_check, '') for bam_filename in bam_filenames ] else: error("The file or folder %s doesn't exist. Exiting." 
% samples_filename_or_bam_folder) sys.exit(1) #check all the files before starting info('Checking samples files location...') for bam_filename in bam_filenames: check_file(bam_filename) info('Initializing Genome:%s' % genome_name) genome_directory = determine_path('genomes') genome_2bit = os.path.join(genome_directory, genome_name + '.2bit') if os.path.exists(genome_2bit): genome = Genome_2bit(genome_2bit) else: info("\nIt seems you don't have the required genome file.") if query_yes_no('Should I download it for you?'): sb.call('haystack_download_genome %s' % genome_name, shell=True, env=system_env) if os.path.exists(genome_2bit): info('Genome correctly downloaded!') genome = Genome_2bit(genome_2bit) else: error( 'Sorry I cannot download the required file for you. Check your Internet connection.' ) sys.exit(1) else: error( 'Sorry I need the genome file to perform the analysis. Exiting...' ) sys.exit(1) chr_len_filename = os.path.join(genome_directory, "%s_chr_lengths.txt" % genome_name) check_file(chr_len_filename) if name: directory_name = 'HAYSTACK_HOTSPOTS_on_%s' % name else: directory_name = 'HAYSTACK_HOTSPOTS' if output_directory: output_directory = os.path.join(output_directory, directory_name) else: output_directory = directory_name if not os.path.exists(output_directory): os.makedirs(output_directory) genome_sorted_bins_file = os.path.join( output_directory, '%s.%dbp.bins.sorted.bed' % (os.path.basename(genome_name), bin_size)) tracks_directory = os.path.join(output_directory, 'TRACKS') if not os.path.exists(tracks_directory): os.makedirs(tracks_directory) intermediate_directory = os.path.join(output_directory, 'INTERMEDIATE') if not os.path.exists(intermediate_directory): os.makedirs(intermediate_directory) if not os.path.exists(genome_sorted_bins_file) or recompute_all: info('Creating bins of %dbp for %s in %s' % (bin_size, chr_len_filename, genome_sorted_bins_file)) sb.call( 'bedtools makewindows -g %s -w %s | bedtools sort -i stdin |' % 
(chr_len_filename, bin_size) + "perl -nle 'print " + '"$_\t$.";' + "' /dev/stdin> %s" % genome_sorted_bins_file, shell=True, env=system_env) #convert bam files to genome-wide rpm tracks for base_name, bam_filename in zip(sample_names, bam_filenames): info('Processing:%s' % bam_filename) rpm_filename = os.path.join(tracks_directory, '%s.bedgraph' % base_name) sorted_rpm_filename = os.path.join(tracks_directory, '%s_sorted.bedgraph' % base_name) mapped_sorted_rpm_filename = os.path.join( tracks_directory, '%s_mapped_sorted.bedgraph' % base_name) binned_rpm_filename = os.path.join( intermediate_directory, '%s.%dbp.rpm' % (base_name, bin_size)) bigwig_filename = os.path.join(tracks_directory, '%s.bw' % base_name) if input_is_bigwig and which('bigWigAverageOverBed'): if not os.path.exists(binned_rpm_filename) or recompute_all: cmd = 'bigWigAverageOverBed %s %s /dev/stdout | sort -s -n -k 1,1 | cut -f5 > %s' % ( bam_filename, genome_sorted_bins_file, binned_rpm_filename) sb.call(cmd, shell=True, env=system_env) shutil.copy2(bam_filename, bigwig_filename) else: if not os.path.exists(binned_rpm_filename) or recompute_all: info('Computing Scaling Factor...') cmd = 'samtools view -c -F 512 %s' % bam_filename #print cmd proc = sb.Popen(cmd, stdout=sb.PIPE, shell=True, env=system_env) (stdout, stderr) = proc.communicate() #print stdout,stderr scaling_factor = (1.0 / float(stdout.strip())) * 1000000 info('Scaling Factor: %e' % scaling_factor) info('Building BedGraph RPM track...') cmd = 'samtools view -b -F 512 %s | bamToBed | slopBed -r %s -l 0 -s -i stdin -g %s | genomeCoverageBed -g %s -i stdin -bg -scale %.32f > %s' % ( bam_filename, bin_size, chr_len_filename, chr_len_filename, scaling_factor, rpm_filename) #print cmd proc = sb.call(cmd, shell=True, env=system_env) if which('bedGraphToBigWig'): if not os.path.exists(bigwig_filename) or recompute_all: info('Converting BedGraph to BigWig') cmd = 'bedGraphToBigWig %s %s %s' % ( rpm_filename, chr_len_filename, bigwig_filename) 
proc = sb.call(cmd, shell=True, env=system_env) else: info( 'Sorry I cannot create the bigwig file.\nPlease download and install bedGraphToBigWig from here: http://hgdownload.cse.ucsc.edu/admin/exe/ and add to your PATH' ) if not os.path.exists(binned_rpm_filename) or recompute_all: info('Make constant binned (%dbp) rpm values file' % bin_size) #cmd='bedtools sort -i %s | bedtools map -a %s -b stdin -c 4 -o mean -null 0.0 | cut -f5 > %s' %(rpm_filename,genome_sorted_bins_file,binned_rpm_filename) #proc=sb.call(cmd,shell=True,env=system_env) cmd = 'sort -k1,1 -k2,2n %s > %s' % (rpm_filename, sorted_rpm_filename) proc = sb.call(cmd, shell=True, env=system_env) cmd = 'bedtools map -a %s -b %s -c 4 -o mean -null 0.0 > %s' % ( genome_sorted_bins_file, sorted_rpm_filename, mapped_sorted_rpm_filename) proc = sb.call(cmd, shell=True, env=system_env) cmd = 'cut -f5 %s > %s' % (mapped_sorted_rpm_filename, binned_rpm_filename) proc = sb.call(cmd, shell=True, env=system_env) try: os.remove(rpm_filename) os.remove(sorted_rpm_filename) os.remove(mapped_sorted_rpm_filename) except: pass #load coordinates of bins coordinates_bin = pd.read_csv(genome_sorted_bins_file, names=['chr_id', 'bpstart', 'bpend'], sep='\t', header=None, usecols=[0, 1, 2]) N_BINS = coordinates_bin.shape[0] if not use_X_Y: coordinates_bin = coordinates_bin.ix[ (coordinates_bin['chr_id'] != 'chrX') & (coordinates_bin['chr_id'] != 'chrY')] #load all the tracks info('Loading the processed tracks') df_chip = {} for state_file in glob.glob(os.path.join(intermediate_directory, '*.rpm')): col_name = os.path.basename(state_file).replace('.rpm', '') df_chip[col_name] = pd.read_csv(state_file, squeeze=True, header=None) info('Loading:%s' % col_name) df_chip = pd.DataFrame(df_chip) if disable_quantile_normalization: info('Skipping quantile normalization...') else: info('Normalizing the data...') df_chip = pd.DataFrame(quantile_normalization(df_chip.values), columns=df_chip.columns, index=df_chip.index) if 
which('bedGraphToBigWig'): #write quantile normalized tracks coord_quantile = coordinates_bin.copy() for col in df_chip: if disable_quantile_normalization: normalized_output_filename = os.path.join( tracks_directory, '%s.bedgraph' % os.path.basename(col)) else: normalized_output_filename = os.path.join( tracks_directory, '%s_quantile_normalized.bedgraph' % os.path.basename(col)) normalized_output_filename_bigwig = normalized_output_filename.replace( '.bedgraph', '.bw') if not os.path.exists( normalized_output_filename_bigwig) or recompute_all: info('Writing binned track: %s' % normalized_output_filename_bigwig) coord_quantile['rpm_normalized'] = df_chip.ix[:, col] coord_quantile.dropna().to_csv(normalized_output_filename, sep='\t', header=False, index=False) cmd = 'bedGraphToBigWig %s %s %s' % ( normalized_output_filename, chr_len_filename, normalized_output_filename_bigwig) proc = sb.call(cmd, shell=True, env=system_env) try: os.remove(normalized_output_filename) except: pass else: info( 'Sorry I cannot creat the bigwig file.\nPlease download and install bedGraphToBigWig from here: http://hgdownload.cse.ucsc.edu/admin/exe/ and add to your PATH' ) #th_rpm=np.min(df_chip.apply(lambda x: np.percentile(x,th_rpm))) th_rpm = find_th_rpm(df_chip, th_rpm) info('Estimated th_rpm:%s' % th_rpm) df_chip_not_empty = df_chip.ix[(df_chip > th_rpm).any(1), :] if transformation == 'log2': df_chip_not_empty = df_chip_not_empty.applymap(log2_transform) info('Using log2 transformation') elif transformation == 'angle': df_chip_not_empty = df_chip_not_empty.applymap(angle_transform) info('Using angle transformation') else: info('Using no transformation') iod_values = df_chip_not_empty.var(1) / df_chip_not_empty.mean(1) ####calculate the inflation point a la superenhancers scores = iod_values min_s = np.min(scores) max_s = np.max(scores) N_POINTS = len(scores) x = np.linspace(0, 1, N_POINTS) y = sorted((scores - min_s) / (max_s - min_s)) m = smooth((np.diff(y) / np.diff(x)), 50) m = m - 
1 m[m <= 0] = np.inf m[:int(len(m) * (1 - max_regions_percentage))] = np.inf idx_th = np.argmin(m) + 1 #print idx_th, th_iod = sorted(iod_values)[idx_th] #print th_iod hpr_idxs = iod_values > th_iod #print len(iod_values),len(hpr_idxs),sum(hpr_idxs), sum(hpr_idxs)/float(len(hpr_idxs)), info('Selected %f%% regions (%d)' % (sum(hpr_idxs) / float(len(hpr_idxs)) * 100, sum(hpr_idxs))) coordinates_bin['iod'] = iod_values #we remove the regions "without" signal in any of the cell types coordinates_bin.dropna(inplace=True) #create a track for IGV bedgraph_iod_track_filename = os.path.join(tracks_directory, 'VARIABILITY.bedgraph') bw_iod_track_filename = os.path.join(tracks_directory, 'VARIABILITY.bw') if not os.path.exists(bw_iod_track_filename) or recompute_all: info('Generating variability track in bigwig format in:%s' % bw_iod_track_filename) coordinates_bin.to_csv(bedgraph_iod_track_filename, sep='\t', header=False, index=False) sb.call('bedGraphToBigWig %s %s %s' % (bedgraph_iod_track_filename, chr_len_filename, bw_iod_track_filename), shell=True, env=system_env) try: os.remove(bedgraph_iod_track_filename) except: pass #Write the HPRs bedgraph_hpr_filename = os.path.join( tracks_directory, 'SELECTED_VARIABILITY_HOTSPOT.bedgraph') to_write = coordinates_bin.ix[hpr_idxs[hpr_idxs].index] to_write.dropna(inplace=True) to_write['bpstart'] = to_write['bpstart'].astype(int) to_write['bpend'] = to_write['bpend'].astype(int) to_write.to_csv(bedgraph_hpr_filename, sep='\t', header=False, index=False) bed_hpr_fileaname = os.path.join(output_directory, 'SELECTED_VARIABILITY_HOTSPOT.bed') if not os.path.exists(bed_hpr_fileaname) or recompute_all: info('Writing the HPRs in: %s' % bed_hpr_fileaname) sb.call('sort -k1,1 -k2,2n %s | bedtools merge -i stdin > %s' % (bedgraph_hpr_filename, bed_hpr_fileaname), shell=True, env=system_env) #os.remove(bedgraph_hpr_filename) df_chip_hpr = df_chip_not_empty.ix[hpr_idxs, :] df_chip_hpr_zscore = df_chip_hpr.apply(zscore, axis=1) 
specific_regions_directory = os.path.join(output_directory, 'SPECIFIC_REGIONS') if not os.path.exists(specific_regions_directory): os.makedirs(specific_regions_directory) if depleted: z_score_high = -z_score_high z_score_low = -z_score_low #write target info('Writing Specific Regions for each cell line...') coord_zscore = coordinates_bin.copy() for col in df_chip_hpr_zscore: regions_specific_filename = 'Regions_specific_for_%s_z_%.2f.bedgraph' % ( os.path.basename(col).replace('.rpm', ''), z_score_high) specific_output_filename = os.path.join(specific_regions_directory, regions_specific_filename) specific_output_bed_filename = specific_output_filename.replace( '.bedgraph', '.bed') if not os.path.exists(specific_output_bed_filename) or recompute_all: if depleted: coord_zscore['z-score'] = df_chip_hpr_zscore.ix[ df_chip_hpr_zscore.ix[:, col] < z_score_high, col] else: coord_zscore['z-score'] = df_chip_hpr_zscore.ix[ df_chip_hpr_zscore.ix[:, col] > z_score_high, col] coord_zscore.dropna().to_csv(specific_output_filename, sep='\t', header=False, index=False) info('Writing:%s' % specific_output_bed_filename) sb.call('sort -k1,1 -k2,2n %s | bedtools merge -i stdin > %s' % (specific_output_filename, specific_output_bed_filename), shell=True, env=system_env) #write background info('Writing Background Regions for each cell line...') coord_zscore = coordinates_bin.copy() for col in df_chip_hpr_zscore: regions_bg_filename = 'Background_for_%s_z_%.2f.bedgraph' % ( os.path.basename(col).replace('.rpm', ''), z_score_low) bg_output_filename = os.path.join( specific_regions_directory, 'Background_for_%s_z_%.2f.bedgraph' % (os.path.basename(col).replace('.rpm', ''), z_score_low)) bg_output_bed_filename = bg_output_filename.replace( '.bedgraph', '.bed') if not os.path.exists(bg_output_bed_filename) or recompute_all: if depleted: coord_zscore['z-score'] = df_chip_hpr_zscore.ix[ df_chip_hpr_zscore.ix[:, col] > z_score_low, col] else: coord_zscore['z-score'] = df_chip_hpr_zscore.ix[ 
df_chip_hpr_zscore.ix[:, col] < z_score_low, col] coord_zscore.dropna().to_csv(bg_output_filename, sep='\t', header=False, index=False) info('Writing:%s' % bg_output_bed_filename) sb.call('sort -k1,1 -k2,2n -i %s | bedtools merge -i stdin > %s' % (bg_output_filename, bg_output_bed_filename), shell=True, env=system_env) ###plot selection pl.figure() pl.title('Selection of the HPRs') pl.plot(x, y, 'r', lw=3) pl.plot(x[idx_th], y[idx_th], '*', markersize=20) pl.hold(True) x_ext = np.linspace(-0.1, 1.2, N_POINTS) y_line = (m[idx_th] + 1.0) * (x_ext - x[idx_th]) + y[idx_th] pl.plot(x_ext, y_line, '--k', lw=3) pl.xlim(0, 1.1) pl.ylim(0, 1) pl.xlabel('Fraction of bins') pl.ylabel('Score normalized') pl.savefig( os.path.join(output_directory, 'SELECTION_OF_VARIABILITY_HOTSPOT.pdf')) pl.close() igv_session_filename = os.path.join(output_directory, 'OPEN_ME_WITH_IGV.xml') info('Creating an IGV session file (.xml) in: %s' % igv_session_filename) session = ET.Element("Session") session.set("genome", genome_name) session.set("hasGeneTrack", "true") session.set("version", "7") resources = ET.SubElement(session, "Resources") panel = ET.SubElement(session, "Panel") resource_items = [] track_items = [] hpr_iod_scores = scores[scores > th_iod] min_h = np.mean(hpr_iod_scores) - 2 * np.std(hpr_iod_scores) max_h = np.mean(hpr_iod_scores) + 2 * np.std(hpr_iod_scores) mid_h = np.mean(hpr_iod_scores) #write the tracks for sample_name in sample_names: if disable_quantile_normalization: track_full_path = os.path.join( output_directory, 'TRACKS', '%s.%dbp.bw' % (sample_name, bin_size)) else: track_full_path = os.path.join( output_directory, 'TRACKS', '%s.%dbp_quantile_normalized.bw' % (sample_name, bin_size)) track_filename = rem_base_path(track_full_path, output_directory) if os.path.exists(track_full_path): resource_items.append(ET.SubElement(resources, "Resource")) resource_items[-1].set("path", track_filename) track_items.append(ET.SubElement(panel, "Track")) track_items[-1].set('color', 
"0,0,178") track_items[-1].set('id', track_filename) track_items[-1].set("name", sample_name) resource_items.append(ET.SubElement(resources, "Resource")) resource_items[-1].set( "path", rem_base_path(bw_iod_track_filename, output_directory)) track_items.append(ET.SubElement(panel, "Track")) track_items[-1].set('color', "178,0,0") track_items[-1].set('id', rem_base_path(bw_iod_track_filename, output_directory)) track_items[-1].set('renderer', "HEATMAP") track_items[-1].set( "colorScale", "ContinuousColorScale;%e;%e;%e;%e;0,153,255;255,255,51;204,0,0" % (mid_h, min_h, mid_h, max_h)) track_items[-1].set("name", 'VARIABILITY') resource_items.append(ET.SubElement(resources, "Resource")) resource_items[-1].set("path", rem_base_path(bed_hpr_fileaname, output_directory)) track_items.append(ET.SubElement(panel, "Track")) track_items[-1].set('color', "178,0,0") track_items[-1].set('id', rem_base_path(bed_hpr_fileaname, output_directory)) track_items[-1].set('renderer', "HEATMAP") track_items[-1].set( "colorScale", "ContinuousColorScale;%e;%e;%e;%e;0,153,255;255,255,51;204,0,0" % (mid_h, min_h, mid_h, max_h)) track_items[-1].set("name", 'HOTSPOTS') for sample_name in sample_names: track_full_path = glob.glob( os.path.join(output_directory, 'SPECIFIC_REGIONS', 'Regions_specific_for_%s*.bedgraph' % sample_name))[0] specific_track_filename = rem_base_path(track_full_path, output_directory) if os.path.exists(track_full_path): resource_items.append(ET.SubElement(resources, "Resource")) resource_items[-1].set("path", specific_track_filename) track_items.append(ET.SubElement(panel, "Track")) track_items[-1].set('color', "178,0,0") track_items[-1].set('id', specific_track_filename) track_items[-1].set('renderer', "HEATMAP") track_items[-1].set( "colorScale", "ContinuousColorScale;%e;%e;%e;%e;0,153,255;255,255,51;204,0,0" % (mid_h, min_h, mid_h, max_h)) track_items[-1].set("name", 'REGION SPECIFIC FOR %s' % sample_name) tree = ET.ElementTree(session) tree.write(igv_session_filename, 
xml_declaration=True) info('All done! Ciao!') sys.exit(0)