Пример #1
0
def get_random_coordinates(coords, genome):
    """Return one randomly placed region for every coordinate in ``coords``.

    Each generated region stays on the chromosome of its source coordinate
    and keeps the source length; only the start position is redrawn with
    ``np.random.randint``.

    Args:
        coords: iterable of coordinate objects exposing ``chr_id`` and
            supporting ``len()``.
        genome: object whose ``chr_len`` maps a chromosome id to its length.

    Returns:
        A list of new ``Coordinate`` objects, in the same order as ``coords``.
    """
    def _relocate(region):
        # randint excludes its upper bound, so the last start that can be
        # drawn is chr_len - len(region).
        upper_bound = genome.chr_len[region.chr_id] - len(region) + 1
        new_start = np.random.randint(1, upper_bound)
        new_end = new_start + len(region) - 1
        return Coordinate(region.chr_id, new_start, new_end)

    return [_relocate(region) for region in coords]
def intersections_lengths(n_input_coordinates,interval_tree,coord_to_row_index,target_coordinates):
    """Accumulate, per input coordinate, how many bases the targets overlap.

    Args:
        n_input_coordinates: length of the returned array (one slot per
            input coordinate).
        interval_tree: mapping from chromosome id to an object whose
            ``find(start, end)`` returns the stored intervals hit by that range.
        coord_to_row_index: mapping from a ``Coordinate`` (built from a hit)
            to its slot in the returned array.
        target_coordinates: iterable of coordinates exposing ``chr_id``,
            ``bpstart`` and ``bpend``.

    Returns:
        A float numpy array of length ``n_input_coordinates``. Each entry is
        the summed length of ``target & hit`` over all targets, capped at the
        length of the hit coordinate.
    """
    inters_lengths = np.zeros(n_input_coordinates)

    for c in target_coordinates:
        # Membership test instead of dict.has_key(), which no longer exists
        # in Python 3 and is deprecated in Python 2.
        if c.chr_id not in interval_tree:
            continue

        for coord_hit in interval_tree[c.chr_id].find(c.bpstart, c.bpend):
            c_to_add = Coordinate.coordinates_from_interval(c.chr_id, coord_hit)
            row_index = coord_to_row_index[c_to_add]
            accumulated = inters_lengths[row_index] + len(c & c_to_add)
            # Several targets may hit the same coordinate (shared regions
            # across genes), so never report more than the coordinate's length.
            inters_lengths[row_index] = min(accumulated, len(c_to_add))

    return inters_lengths
Пример #3
0
# Excerpt of a Python 2 command-line script: the names genome_directory,
# memory_mapped_genome, bed_file and shuffle are defined outside this excerpt.
print 'Please send any bugs to: [email protected]'
print '----------------------------------------------\n\n'
# N-gram helper configured for a 4-symbol alphabet and n-grams of length 4.
ng=Ngram(alphabet_size=4,ngram_length=4)    

print '>Loading genome from:',genome_directory

# Pick the memory-mapped genome implementation when requested.
if memory_mapped_genome:
    g=Genome_mm(genome_directory)
else:
    g=Genome(genome_directory)

print 'Genome Loaded.'
    
mim_values=dict()
print '\n>Loading coordinates from:',bed_file
coordinates=Coordinate.bed_to_coordinates(bed_file)
print '%d coordinates loaded' % len(coordinates)

# S collects the extracted sequences; R is not filled in the visible lines
# (presumably it receives the shuffled sequences -- TODO confirm).
S=[]
R=[]

print '\n>Extracting sequences:'
pb = ProgressBar(widgets=['Sequences processed: ', Percentage()], maxval=len(coordinates)).start()
for idx,c in enumerate(coordinates):
    seq=g.extract_sequence(c)
    
    # Sequences containing the character 'n' are skipped.
    if 'n' not in seq:
        S.append(seq)
        
        if shuffle:
            # Build a shuffled copy of the sequence just stored by reading its
            # characters in the order given by permutation(len(sequence)).
            seq_random=''.join( [S[-1][i] for i in permutation(len(S[-1] ))]   )
Пример #4
0
# Excerpt: scan every region of bed_dict for PAM patterns. counter,
# total_regions, pam_list, pams_found, total_sgRNAs_dict, genome and logger
# are defined outside this excerpt.
for chrom in bed_dict:
	for site in bed_dict[chrom]:

		counter += 1

		# Progress message every 1000 regions.
		if counter%1000 == 0:
			logger.info('Completed PAM search for %s out of %s sites ...' % (counter, total_regions))

		pams_found_list = []

		# Each site is a (start, stop, sample) triple.
		start, stop, sample = site
		total_sgRNAs_dict[sample] = []

		# One counter per entry of pam_list for this sample.
		pams_found[sample] = [0]*len(pam_list)

		# Upper-cased sequence of the region.
		genomic_region = genome.extract_sequence(Coordinate(chrom, start, stop)).upper()

		for i in range(len(pam_list)):

			for pam_entry in pam_list[i]:

				# The zero-width lookahead (?=...) reports overlapping matches
				# too; each match is stored as a one-element list [start offset].
				# pam_entry[0] / pam_entry[1] are presumably the patterns for
				# the two strands -- TODO confirm.
				find_guides_top = [[m.start()] for m in re.finditer('(?=%s)' % pam_entry[0], genomic_region, re.IGNORECASE)]
				find_guides_bottom = [[m.start()] for m in re.finditer('(?=%s)' % pam_entry[1], genomic_region, re.IGNORECASE)]

				if find_guides_top:

					for match in find_guides_top:

						# pam_entry[2] is added to the match offset; presumably
						# a length in bp -- TODO confirm.
						start_g = match[0]
						stop_g = start_g + pam_entry[2]
Пример #5
0
def run_haystack():
    """Run the Haystack motif enrichment analysis from the command line.

    Steps, in order:

    1. parse ``sys.argv`` and resolve the default file locations;
    2. load the genome (offering to download it when it is missing);
    3. load, re-center and optionally subsample/bootstrap the target regions;
    4. build the background: random genomic regions or a user bed file,
       optionally matched to the target on C+G content;
    5. scan target and background with FIMO (``ParallelFimoScanning``);
    6. compute per-motif support, Fisher p-values, central enrichment and
       target/background ratio, then filter and rank the motifs;
    7. write the HTML report, the per-motif region files and, when gene
       annotations are available, the closest-gene lists;
    8. optionally dump the intermediate data.

    The function does not return: it ends the process with ``sys.exit(0)`` on
    success and ``sys.exit(1)`` when FIMO, the genome file or the target
    coordinates are missing.
    """
    print('\n[H A Y S T A C K   M O T I F S]')
    print('\n-MOTIF ENRICHMENT ANALYSIS- [Luca Pinello - [email protected]]\n')

    if which('fimo') is None:
        error('Haystack requires Fimo from the MEME suite free available at: http://meme.nbcr.net/meme/')
        sys.exit(1)

    ngram_correction='g'

    #mandatory
    parser = argparse.ArgumentParser(description='HAYSTACK Parameters')
    parser.add_argument('bed_target_filename', type=str,  help='A bed file containing the target coordinates on the genome of reference')
    parser.add_argument('genome_name', type=str,  help='Genome assembly to use from UCSC (for example hg19, mm9, etc.)')

    #optional
    parser.add_argument('--bed_bg_filename', type=str,  help="A bed file containing the backround coordinates on the genome of reference (default random sampled regions from the genome)", default='random_background')
    parser.add_argument('--meme_motifs_filename', type=str, help='Motifs database in MEME format (default JASPAR CORE 2014)')
    parser.add_argument('--nucleotide_bg_filename',type=str, help='Nucleotide probability for the background in MEME format (default precomupted on the Genome)')
    parser.add_argument('--p_value', type=float, help='FIMO p-value for calling a motif hit significant (deafult: 1e-4)',default=1e-4)
    parser.add_argument('--no_c_g_correction',  help='Disable the matching of the C+G density of the background',action='store_true')
    parser.add_argument('--c_g_bins', type=int,help='Number of bins for the C+G density correction (default: 8)',default=8)
    parser.add_argument('--mask_repetitive', help='Mask repetitive sequences',action='store_true')
    parser.add_argument('--n_target_coordinates', type=int, help='Number of target coordinates to use (default: all)',default=np.inf)
    parser.add_argument('--use_entire_bg', help='Use the entire background file (use only when the cg correction is disabled)',action='store_true')
    parser.add_argument('--bed_score_column', type=int, help='Column in the bedfile that represents the score (default: 5)',default=5)
    # The help text used to advertise 1.0 while the effective default is 2.
    parser.add_argument('--bg_target_ratio', type=int, help='Background size/Target size ratio (default: 2)',default=2)
    parser.add_argument('--bootstrap',  help='Enable the bootstrap if the target set or the background set are too small, choices: True, False (default: False)',action='store_true')
    parser.add_argument('--temp_directory',  help='Directory to store temporary files  (default: /tmp)', default='/tmp')
    parser.add_argument('--no_random_sampling_target',  help='Select the best --n_target_coordinates using the score column from the target file instead of randomly select them',action='store_true')
    parser.add_argument('--name',  help='Define a custom output filename for the report', default='')
    parser.add_argument('--internal_window_length', type=int, help='Window length in bp for the enrichment (default: average lenght of the target sequences)')
    parser.add_argument('--window_length', type=int, help='Window length in bp for the profiler (default:internal_window_length*5)')
    parser.add_argument('--min_central_enrichment', type=float, help='Minimum central enrichment to report a motif (default:>1.0)',default=1.0)
    parser.add_argument('--disable_ratio',  help='Disable target/bg ratio filter',action='store_true')
    parser.add_argument('--dump', help='Dump all the intermediate data, choices: True, False (default: False)',action='store_true')
    parser.add_argument('--output_directory',type=str, help='Output directory (default: current directory)',default='')
    # The help text used to advertise /4 while the code divides by 5.
    parser.add_argument('--smooth_size',type=int, help='Size in bp for the smoothing window (default: internal_window_length/5)')
    parser.add_argument('--gene_annotations_filename',type=str, help='Optional gene annotations file from the UCSC Genome Browser in bed format to map each region to its closes gene')
    parser.add_argument('--gene_ids_to_names_filename',type=str, help='Optional mapping file between gene ids to gene names (relevant only if --gene_annotation_filename is used)')
    parser.add_argument('--n_processes',type=int, help='Specify the number of processes to use. The default is #cores available.',default=multiprocessing.cpu_count())
    parser.add_argument('--version',help='Print version and exit.',action='version', version='Version %.1f' % HAYSTACK_VERSION)

    args = parser.parse_args()

    # Explicit unpacking replaces the former exec('%s=%s') injection of the
    # parsed arguments into the local namespace: it needs no special case for
    # values whose repr() is not valid source (np.inf) and keeps every local
    # visible to readers and tools.
    bed_target_filename=args.bed_target_filename
    genome_name=args.genome_name
    bed_bg_filename=args.bed_bg_filename
    meme_motifs_filename=args.meme_motifs_filename
    nucleotide_bg_filename=args.nucleotide_bg_filename
    p_value=args.p_value
    no_c_g_correction=args.no_c_g_correction
    c_g_bins=args.c_g_bins
    mask_repetitive=args.mask_repetitive
    n_target_coordinates=args.n_target_coordinates
    use_entire_bg=args.use_entire_bg
    bed_score_column=args.bed_score_column
    bg_target_ratio=args.bg_target_ratio
    bootstrap=args.bootstrap
    temp_directory=args.temp_directory
    no_random_sampling_target=args.no_random_sampling_target
    name=args.name
    internal_window_length=args.internal_window_length
    window_length=args.window_length
    min_central_enrichment=args.min_central_enrichment
    disable_ratio=args.disable_ratio
    dump=args.dump
    output_directory=args.output_directory
    smooth_size=args.smooth_size
    gene_annotations_filename=args.gene_annotations_filename
    gene_ids_to_names_filename=args.gene_ids_to_names_filename
    n_processes=args.n_processes

    # The column given on the command line is decremented before being passed
    # to the bed parser (presumably 1-based -> 0-based; confirm against
    # Coordinate.bed_to_coordinates).
    bed_score_column-=1

    c_g_correction=not no_c_g_correction
    random_sampling_target=not no_random_sampling_target

    check_file(bed_target_filename)

    if not  bed_bg_filename == 'random_background':
        check_file(bed_bg_filename)

    if meme_motifs_filename:
        check_file(meme_motifs_filename)
    else:
        meme_motifs_filename=os.path.join(determine_path('motif_databases'),'JASPAR_CORE_2014_vertebrates.meme')

    annotation_directory=determine_path('gene_annotations')
    if gene_annotations_filename:

        if which('java') is None:
            error('The mapping to the closest gene requires Java free available from: http://java.com/en/download/')
            use_gene_annotations=False
        else:
            check_file(gene_annotations_filename)
            info('Using %s as gene annotations file' % gene_annotations_filename)
            use_gene_annotations=True
    else:
        # Fall back to the annotation files bundled for this genome, if any.
        gene_annotations_filename=os.path.join(annotation_directory,'%s_genes.bed' % genome_name)
        gene_ids_to_names_filename=os.path.join(annotation_directory,'%s_genes_id_to_names' % genome_name)

        if os.path.exists(gene_annotations_filename) and os.path.exists(gene_ids_to_names_filename):
            use_gene_annotations=True
        else:
            use_gene_annotations=False
            info('No gene annotations file specified')

    target_name=ntpath.basename(bed_target_filename.replace('.bed',''))
    bg_name=ntpath.basename(bed_bg_filename.replace('.bed',''))

    if name:
        directory_name='HAYSTACK_MOTIFS_on_'+name
    else:
        directory_name='HAYSTACK_on_'+target_name+'_VS_'+bg_name

    if output_directory:
        output_directory=os.path.join(output_directory, directory_name)
    else:
        output_directory=directory_name

    info('###PARAMETERS USED###\n\t\t -TARGET: %s \n\t\t -BACKGROUND: %s \n\t\t -BG_TARGET_RATIO: %s\n\t\t -C+G CORRECTION: %s\n\t\t -MASKING REPETITIVE: %s\n\t\t -COORDINATES TO ANALYZE: %s\n\t\t -OUTPUT DIRECTORY: %s\n'\
         %(bed_target_filename,bed_bg_filename,str(bg_target_ratio),str(c_g_correction),str(mask_repetitive),'ALL' if np.isinf(n_target_coordinates) else str(n_target_coordinates),output_directory))

    info('Initializing Genome:%s' %genome_name)

    genome_directory=determine_path('genomes')
    genome_2bit=os.path.join(genome_directory,genome_name+'.2bit')

    if os.path.exists(genome_2bit):
        genome=Genome_2bit(genome_2bit)
    else:
        info("\nIt seems you don't have the required genome file.")
        if query_yes_no('Should I download it for you?'):
            sb.call('download_genome %s' %genome_name,shell=True,env=system_env)
            if os.path.exists(genome_2bit):
                info('Genome correctly downloaded!')
                genome=Genome_2bit(genome_2bit)
            else:
                error('Sorry I cannot download the required file for you. Check your Internet connection.')
                sys.exit(1)
        else:
            error('Sorry I need the genome file to perform the analysis. Exiting...')
            sys.exit(1)

    if not nucleotide_bg_filename:
        nucleotide_bg_filename=os.path.join(genome_directory,genome_name+'_meme_bg')

    check_file(nucleotide_bg_filename)

    COMMAND_USED=' '.join(sys.argv)

    info('Loading Target coordinates from bed:%s' % bed_target_filename)
    target_coords=Coordinate.bed_to_coordinates(bed_target_filename,cl_score=bed_score_column)

    if len(target_coords) == 0:
        info('No coordinates to analyze in your input file. Exiting.')
        sys.exit(1)

    # Internal window: user supplied, otherwise the average target length.
    # Either way it is rounded up to an even number of bp.
    if internal_window_length:
        info('Using the user defined internal window length:%d' % internal_window_length )
        if internal_window_length % 2:
            internal_window_length+=1
    else:
        internal_window_length=int(np.mean([len(c) for c in target_coords]))
        if internal_window_length % 2:
            internal_window_length+=1
        info('Using the average length of target coordinates as internal window length:%d' % internal_window_length )

    # The default must be applied on both paths: it used to live inside the
    # else branch above, so --internal_window_length without --window_length
    # left window_length as None and crashed on the next line.
    if not window_length:
        window_length=internal_window_length*5

    info('Total window length:%d' % window_length )

    if not smooth_size:
        smooth_size=internal_window_length//5

    target_coords=Coordinate.coordinates_of_intervals_around_center(target_coords,internal_window_length)

    if len(target_coords)>n_target_coordinates:
        if random_sampling_target:
            info('Sampling %d coordinates among the %d total' %( n_target_coordinates,len(target_coords)))
            target_coords=random.sample(target_coords,n_target_coordinates)
        else:
            info('Selecting the best %d coordinates among the %d total' %( n_target_coordinates,len(target_coords)))
            sorted_idxs_by_score=np.argsort([c.score for c in target_coords])[::-1]
            target_coords=[target_coords[idx] for idx in sorted_idxs_by_score[:n_target_coordinates]]
    else:

        if random_sampling_target and bootstrap and not np.isinf(n_target_coordinates):
            warn('Number of target regions < %d' % n_target_coordinates)
            info('bootstrapping to obtain enough target regions')
            target_coords=sample_wr(target_coords,n_target_coordinates)
        else:
            info('Using all the %d target coordinates' % len(target_coords))

    info('Extracting Motifs in target coordinates')
    positive_matrix,motifs_profiles_in_sequences, idxs_seqs_with_motif,motif_coords_in_seqs_with_motif,motif_names,motif_ids=ParallelFimoScanning(
        target_coords,
        meme_motifs_filename,
        genome,nucleotide_bg_filename,
        temp_directory=temp_directory,
        p_value=p_value,
        mask_repetitive=mask_repetitive,
        window_length=window_length,
        internal_window_length=internal_window_length,
        num_consumers=n_processes)
    n_target_coordinates=len(target_coords) #fix for the bootstrap!

    if bed_bg_filename == 'random_background':
        info('Extracting Random Coordinates from the genome...')

        if c_g_correction:
            info('Calculating the C+G content of the target coordinates')
            bg_coords=[]
            c_g_content_target=calculate_average_ngram_presence(target_coords,genome,ngram_correction)

            info('Extract a Matching C+G Background')
            bins=np.hstack((np.linspace(0,1,c_g_bins),np.inf))

            for _ in range(bg_target_ratio):
                for idx_c,c in enumerate(target_coords):
                    c_bin=np.nonzero(np.histogram(c_g_content_target[idx_c],bins)[0])[0][0]
                    c_random_bin=-1

                    # Rejection sampling: keep drawing same-length regions on
                    # the same chromosome until one lands in the target's
                    # C+G bin.
                    while c_random_bin != c_bin:
                        random_bpstart=np.random.randint(1,genome.chr_len[c.chr_id]-len(c)+1)
                        c_random=Coordinate(c.chr_id,random_bpstart,random_bpstart+len(c)-1)
                        seq=genome.extract_sequence(c_random)
                        c_g_content_c_random=(seq.count('c')+seq.count('g'))/float(len(c))
                        c_random_bin=np.nonzero(np.histogram(c_g_content_c_random,bins)[0])[0][0]

                    bg_coords.append(c_random)

            c_g_content_bg=calculate_average_ngram_presence(bg_coords,genome,ngram_correction)
            debug('original: '+str(np.histogram(c_g_content_target,bins)[0]))
            debug('obtained:'+str(np.histogram(c_g_content_bg,bins)[0]))

        else:
            bg_coords=get_random_coordinates(target_coords,genome)

        info('Done!')

    else:
        info('Loading Background Coordinates from:%s' % bed_bg_filename)
        bg_coords=Coordinate.bed_to_coordinates(bed_bg_filename)
        bg_coords=Coordinate.coordinates_of_intervals_around_center(bg_coords,internal_window_length)

        if use_entire_bg:
            bg_target_ratio=float(len(bg_coords))/n_target_coordinates
            info('Using all the coordinates in the BG, BG/TG:%f', bg_target_ratio)

        if c_g_correction:
            info('Calculating the C+G content')
            c_g_content_target=calculate_average_ngram_presence(target_coords,genome,ngram_correction)
            c_g_content_bg=calculate_average_ngram_presence(bg_coords,genome,ngram_correction)

            info('Extract a Matching C+G Background')
            bins=np.hstack((np.linspace(0,1,c_g_bins),np.inf))
            target_hist=np.histogram(c_g_content_target,bins)[0]
            bg_hist=np.histogram(c_g_content_bg,bins)[0]
            ratios=bg_hist/(target_hist*1.0)
            debug('original:%s' %target_hist)
            debug('bg:%s' %bg_hist)
            debug('ratios:%s' %ratios)
            # Largest background/target multiple that every well-populated
            # bin (more than 5% of the targets) can still provide.
            K_MATCH=min(bg_target_ratio,ratios[~np.isnan(ratios) & ~np.isinf(ratios) & (ratios>0) &(target_hist/float(target_hist.sum())>0.05)].min())

            debug('K_MATCH:%d' %K_MATCH)

            to_match=np.int32(np.floor(K_MATCH*target_hist))

            debug('to_match:%s' %to_match)

            idxs_corrected_bg=np.array([],dtype=int)

            for idx_bin in range(len(bins)-1):
                idxs_matching_regions=np.nonzero((c_g_content_bg>=bins[idx_bin]) & (c_g_content_bg<bins[idx_bin+1]))[0]
                to_take=np.random.permutation(len(idxs_matching_regions))
                to_take=to_take[range(min(len(idxs_matching_regions),to_match[idx_bin]))]
                idxs_corrected_bg= np.hstack((idxs_corrected_bg,idxs_matching_regions[to_take]))

            debug('original:%s' %target_hist)
            debug('K:%d' %K_MATCH)
            debug('to sample:%s' %to_match)
            debug( 'obtained:%s' % np.histogram(c_g_content_bg[idxs_corrected_bg],bins)[0] )
            bg_coords=[bg_coords[idx] for idx in idxs_corrected_bg]
            c_g_content_bg=calculate_average_ngram_presence(bg_coords,genome,ngram_correction)
            debug(np.histogram(c_g_content_bg,bins)[0])
            if np.array_equal(K_MATCH*target_hist,np.histogram(c_g_content_bg,bins)[0]):
                info('C+G content perfectly matched!\n\ttarget:%s\n\tbg    :%s' % (target_hist,np.histogram(c_g_content_bg,bins)[0]))
            else:
                warn('C+G content not perfectly matched\n\ttarget:%s\n\tbg    :%s'%(target_hist,np.histogram(c_g_content_bg,bins)[0]))

            debug(target_hist/np.histogram(c_g_content_bg,bins)[0])

    if len(bg_coords)>=bg_target_ratio*n_target_coordinates:
        bg_coords=random.sample(bg_coords,int(bg_target_ratio*n_target_coordinates))
    else:
        if bootstrap and len(bg_coords)<(bg_target_ratio*n_target_coordinates*0.95): #allow a small tollerance!
            info('bootstrapping to obtain enough background regions')
            bg_coords=sample_wr(bg_coords,int(bg_target_ratio*n_target_coordinates))
            c_g_content_bg=calculate_average_ngram_presence(bg_coords,genome,ngram_correction)
            # bins / c_g_content_target only exist when the C+G correction
            # ran; the unguarded message used to raise NameError otherwise
            # (and target_hist was never defined for a random background).
            if c_g_correction:
                debug('After bootstrap:\n\ttarget:%s\n\tbg    :%s' % (np.histogram(c_g_content_target,bins)[0],np.histogram(c_g_content_bg,bins)[0]))

    info('Extracting Motifs in background coordinates')
    negative_matrix,motifs_profiles_in_bg,idxs_seqs_with_motif_bg=ParallelFimoScanning(
        bg_coords,
        meme_motifs_filename,
        genome,nucleotide_bg_filename,
        temp_directory=temp_directory,
        p_value=p_value,
        mask_repetitive=mask_repetitive,
        window_length=window_length,
        internal_window_length=internal_window_length,
        num_consumers=n_processes)[0:3]

    #allocate data for reports
    N_MOTIFS=len(motif_ids)
    fisher_p_values=np.zeros(N_MOTIFS)
    central_enrichment=np.zeros(N_MOTIFS)

    N_seq_p=positive_matrix.shape[0]
    N_seq_n=negative_matrix.shape[0]

    # Number of sequences with at least one hit, per motif.
    profile_presence_p=(positive_matrix>0).sum(0)
    profile_presence_n=(negative_matrix>0).sum(0)

    support_p=profile_presence_p/float(N_seq_p)
    support_n=profile_presence_n/float(N_seq_n)

    # Bounds of the internal window inside the profiled window
    # (// equals the former / for these ints under Python 2).
    internal_bpstart=window_length//2-internal_window_length//2
    internal_bpend=window_length//2+internal_window_length//2

    for idx,motif_id in enumerate(motif_ids):
        fisher_p_values[idx]= stats.fisher_exact([[ profile_presence_p[idx], N_seq_p-profile_presence_p[idx]], [ profile_presence_n[idx], N_seq_n-profile_presence_n[idx]]])[1]
        # Mean profile inside the internal window over mean profile in the flanks.
        central_enrichment[idx]=motifs_profiles_in_sequences[motif_id][internal_bpstart:internal_bpend].mean()/ np.hstack([motifs_profiles_in_sequences[motif_id][:internal_bpstart],motifs_profiles_in_sequences[motif_id][internal_bpend:]]).mean()

    # The 0.01 pseudo-count keeps the ratio finite when the background support is 0.
    motif_ratios=(support_p+0.01)/(support_n+0.01)

    #Fundamental! Motifs supported by less than 3% of the targets get a
    #neutral ratio so that they are dropped by the ratio>1 filter below.
    if not disable_ratio:
        motif_ratios[support_p<0.03]=1

    rankings=stats.rankdata(-motif_ratios)

    #filter here positive or positive and negative#################################
    if not disable_ratio:
        idxs_to_keep=np.nonzero(motif_ratios>1)[0]
    else:
        idxs_to_keep=range(len(motif_ratios))

    rankings=rankings[idxs_to_keep]
    motif_ratios=motif_ratios[idxs_to_keep]
    support_p=support_p[idxs_to_keep]
    support_n=support_n[idxs_to_keep]
    fisher_p_values=fisher_p_values[idxs_to_keep]
    central_enrichment=central_enrichment[idxs_to_keep]

    motif_ids=[motif_ids[_] for _ in idxs_to_keep]
    motif_names=[motif_names[_] for _ in idxs_to_keep]
    motif_idxs=[_ for _ in idxs_to_keep]

    # q-values are estimated only on the motifs kept above. The former bare
    # except printed the p-values and left qvalues undefined, so the report
    # loop below died with NameError; fall back to NaN q-values instead.
    try:
        qvalues=estimate_qvalues(fisher_p_values)
    except Exception as e:
        warn('Cannot estimate the q-values (%s) for the p-values: %s' % (e,fisher_p_values))
        qvalues=np.array([np.nan]*len(fisher_p_values))

    ################################################################################

    #generate reports in html
    info('Generating HTML report...')
    imgs_directory=os.path.join(output_directory,'images')
    genes_list_directory=os.path.join(output_directory,'genes_lists')
    motif_regions_directory=os.path.join(output_directory,'motifs_regions')

    #create folders
    if not os.path.exists(imgs_directory):
        os.makedirs(imgs_directory)
    if use_gene_annotations and not os.path.exists(genes_list_directory):
        os.makedirs(genes_list_directory)
    if not os.path.exists(motif_regions_directory):
        os.makedirs(motif_regions_directory)

    j2_env = Environment(loader=FileSystemLoader(determine_path('extra')+'/templates/'),trim_blocks=True)

    info('DIRECTORY:%s' % determine_path('extra')+'/templates/')
    template= j2_env.get_template('report_template.html')

    #copy haystack logo and bg
    shutil.copyfile(determine_path('extra')+'/templates/haystack_logo.png', os.path.join(imgs_directory,'haystack_logo.png'))
    shutil.copyfile(determine_path('extra')+'/templates/noise.png', os.path.join(imgs_directory,'noise.png'))

    motifs_dump=[]
    for i in np.argsort(rankings):
        if (support_p[i]>=0.03 or disable_ratio)  and fisher_p_values[i]<0.01  and  (motif_ratios[i]>1 or disable_ratio) and central_enrichment[i]>min_central_enrichment:

            info('Generating logo and profile for:'+motif_ids[i])

            #create motif logo (png and pdf)
            img_logo=os.path.join(imgs_directory,'logo_'+motif_ids[i])
            generate_weblogo(motif_ids[i],meme_motifs_filename,img_logo,title=motif_ids[i],SEQLOGO=determine_path('extra')+'/seqlogo')
            generate_weblogo(motif_ids[i],meme_motifs_filename,img_logo,title=motif_ids[i],SEQLOGO=determine_path('extra')+'/seqlogo',file_format='pdf')
            #fix the weblogo prefix problem
            img_logo_url=os.path.join('images','logo_'+motif_ids[i]+'.png')

            #create motif enrichment profile
            img_profile=os.path.join(imgs_directory,'profile_'+motif_ids[i]+'.png')
            motif_profile_target=motifs_profiles_in_sequences[motif_ids[i]]/N_seq_p
            motif_profile_bg=motifs_profiles_in_bg[motif_ids[i]]/N_seq_n

            generate_motif_profile(motif_profile_target,motif_profile_bg,motif_ids[i],img_profile,smooth_size=smooth_size,window_size=window_length)
            img_profile_url=os.path.join('images','profile_'+motif_ids[i]+'.png')

            #create regions
            info('Extracting regions with:'+motif_ids[i])
            regions=os.path.join(motif_regions_directory,motif_ids[i]+'_motif_region_in_target.bed')
            with open(regions,'w+') as outfile:
                outfile.write('Chromosome\tStart\tEnd\tMotif hits inside region\tNumber of hits\n')
                for c,locations in motif_coords_in_seqs_with_motif[motif_ids[i]].items():
                    outfile.write('\t'.join([c.chr_id,str(c.bpstart),str(c.bpend),';'.join(['-'.join(map(str,map(int,l))) for l in locations]),str(len(locations))])+'\n')
            regions_url=os.path.join('motifs_regions',motif_ids[i]+'_motif_region_in_target.bed')

            #map closest downstream genes
            genes_url=None
            if use_gene_annotations:
                info('Mapping regions with:%s to the clostest genes' % motif_ids[i])

                peak_annotator_path=os.path.join(determine_path('extra/'),'PeakAnnotator.jar')

                # NOTE(review): on Cygwin the two annotation paths are
                # converted again on every reported motif -- confirm that
                # cygwin_path tolerates already converted (or None) input.
                if CURRENT_PLATFORM=='CYGWIN':
                    peak_annotator_path=cygwin_path(peak_annotator_path)
                    gene_ids_to_names_filename=cygwin_path(gene_ids_to_names_filename)
                    regions=cygwin_path(regions)
                    gene_annotations_filename=cygwin_path(gene_annotations_filename)

                peak_annotator_log=os.path.join(genes_list_directory,'log_peakannotator.txt')

                # '> file 2>&1' instead of the bash-only '&>': a POSIX sh
                # reads 'cmd &> file' as 'run cmd in the background, then
                # truncate file', so the call returned before java finished.
                if gene_ids_to_names_filename:
                    sb.call('java -jar '+peak_annotator_path+' -u TSS -p %s -a %s -s %s -o %s > %s 2>&1' \
                            %(regions,gene_annotations_filename,gene_ids_to_names_filename,genes_list_directory,peak_annotator_log),  shell=True,env=system_env)
                else:
                    sb.call('java -jar '+peak_annotator_path+' -u TSS -p %s -a %s  -o %s > %s 2>&1' \
                            %(regions,gene_annotations_filename,genes_list_directory,peak_annotator_log),  shell=True,env=system_env)

                genes_url=os.path.join('genes_lists',motif_ids[i]+'_motif_region_in_target.tss.bed')

            motifs_dump.append({'id':motif_ids[i],'name':motif_names[i],'support_p':support_p[i]*100,
                                 'support_n':support_n[i]*100, 'ratio':motif_ratios[i],'rank':float(rankings[i]),
                                 'pvalue':fisher_p_values[i],'qvalue':qvalues[i],'central_enrichment':central_enrichment[i],
                                 'img_logo':img_logo_url,'img_profile':img_profile_url,'regions':regions_url,'genes':genes_url,'idx_motif':motif_idxs[i]})

    # Context manager so the report is flushed and closed even if rendering fails.
    with codecs.open(os.path.join(output_directory,"Haystack_report.html"), "w", "utf-8") as outfile:
        outfile.write(template.render(motifs_dump=motifs_dump,bed_target_filename=bed_target_filename,bed_bg_filename=bed_bg_filename, N_TARGET=N_seq_p, N_BG=N_seq_n,\
                                      meme_motifs_filename=meme_motifs_filename, COMMAND_USED=COMMAND_USED,use_gene_annotations=use_gene_annotations))

    if dump:
        info('Saving all the intermediate data on: %s ...' % output_directory)
        dump_directory=os.path.join(output_directory,'dump')

        if not os.path.exists(dump_directory):
            os.makedirs(dump_directory)

        np.save(os.path.join(dump_directory,'matrix_'+target_name),positive_matrix)
        np.save(os.path.join(dump_directory,'matrix_BG_'+target_name),negative_matrix)

        # Each pickle is written through a context manager so that the file
        # handle is closed (and the data flushed) right away.
        pickles_to_write=[
            (motifs_dump,target_name+'_motif_dumps.pickle'),
            (idxs_seqs_with_motif,target_name+'_motif_seqs_idxs.pickle'),
            (idxs_seqs_with_motif_bg,bg_name+'_motif_seqs_idxs.pickle'),
            (motif_coords_in_seqs_with_motif,target_name+'_motif_coords_in_seqs_with_motif.pickle'),
        ]
        for payload,pickle_name in pickles_to_write:
            with open(os.path.join(dump_directory,pickle_name),'w') as pickle_file:
                cp.dump(payload,pickle_file)

        Coordinate.coordinates_to_bed(target_coords,os.path.join(dump_directory,'Target_coordinates_selected_on_'+target_name+'.bed'),minimal_format=False)
        Coordinate.coordinates_to_bed(bg_coords,os.path.join(dump_directory,'BG_coordinates_selected_on_'+ bg_name+'.bed'),minimal_format=True)

    info('All done! Ciao!')
    sys.exit(0)
Пример #6
0
def ParallelFimoScanning(target_coords,
                         meme_motifs_filename,
                         genome,
                         nucleotide_bg_filename,
                         temp_directory='/tmp',
                         p_value=1e-4,
                         num_consumers=multiprocessing.cpu_count(),
                         mask_repetitive=False,
                         window_length=None,
                         internal_window_length=None):
    """Scan the sequences of ``target_coords`` for motifs with worker processes.

    One ``FimoOnSingleSequence`` task is queued per coordinate and consumed by
    ``num_consumers`` ``FimoSequencesConsumer`` processes; the hits they return
    are folded into per-motif summaries.

    Parameters:
        target_coords: sequence of coordinates to scan.
        meme_motifs_filename, nucleotide_bg_filename, temp_directory, p_value:
            forwarded to the ``Fimo`` constructor.
        genome: object whose ``extract_sequence(coordinate, mask_repetitive=...)``
            provides the sequence to scan.
        num_consumers: number of consumer processes to start.
        mask_repetitive: forwarded to ``genome.extract_sequence``.
        window_length: when truthy, every coordinate is replaced by an interval
            of this length around its center before scanning.
        internal_window_length: length of the central part of the window in
            which a hit must fall to be counted; defaults to the whole window.

    Returns:
        A tuple ``(motifs_in_sequences_matrix, motifs_profiles_in_sequences,
        idxs_seqs_with_motif, motif_coords_in_seqs_with_motif,
        fimo.motif_names, fimo.motif_ids)``.
    """
    # Establish communication queues
    tasks = multiprocessing.Queue()
    results = multiprocessing.Queue()

    # Start consumers
    debug('Creating %d fimo consumers' % num_consumers)
    consumers = [FimoSequencesConsumer(tasks, results)
                 for _ in range(num_consumers)]
    for consumer in consumers:
        consumer.start()

    # Initialize Fimo
    info('Initiliaze Fimo and load motifs')
    fimo = Fimo(meme_motifs_filename,
                nucleotide_bg_filename,
                temp_directory=temp_directory,
                p_value=p_value)

    # Hits are reported against the coordinates the caller passed in, even
    # when the scanned intervals are widened below.
    original_target_coords = target_coords

    if window_length:
        if internal_window_length is None:
            # No inner window requested: the whole window counts.
            internal_window_length = window_length
        internal_bpstart = window_length // 2 - internal_window_length // 2
        internal_bpend = window_length // 2 + internal_window_length // 2
        target_coords = Coordinate.coordinates_of_intervals_around_center(
            target_coords, window_length)
    else:
        # Without a window the bounds used to be undefined (NameError on the
        # first hit); treat the whole scanned sequence as the internal window.
        internal_bpstart = 0
        internal_bpend = float('inf')

    # Enqueue jobs
    for idx, c in enumerate(target_coords):
        seq = genome.extract_sequence(c, mask_repetitive=mask_repetitive)
        tasks.put(FimoOnSingleSequence(seq, fimo, idx))

    # Add a poison pill for each consumer
    for _ in range(num_consumers):
        tasks.put(None)

    # The profile length follows the last scanned coordinate (as before), but
    # no longer depends on a leaked loop variable, so empty input is handled.
    profile_length = len(target_coords[-1]) if len(target_coords) > 0 else 0

    motifs_profiles_in_sequences = dict()
    idxs_seqs_with_motif = dict()
    motif_coords_in_seqs_with_motif = dict()

    for motif_id in fimo.motif_ids:
        motifs_profiles_in_sequences[motif_id] = np.zeros(profile_length)
        idxs_seqs_with_motif[motif_id] = set()
        motif_coords_in_seqs_with_motif[motif_id] = pickable_defaultdict()

    motifs_in_sequences_matrix = np.zeros(
        (len(target_coords), len(fimo.motif_ids)))

    # Build the final matrix: exactly one result is read per queued job.
    for _ in target_coords:
        idx_seq, row = results.get()
        for motif in row:
            motifs_profiles_in_sequences[motif['id']][
                motif['start']:motif['end']] += 1.0

            # keep track only if is in the internal window!
            if motif['start'] >= internal_bpstart and motif['end'] <= internal_bpend:
                idxs_seqs_with_motif[motif['id']].add(idx_seq)
                # Presence flag (the original spelled this "=+1").
                motifs_in_sequences_matrix[
                    idx_seq, fimo.motif_id_to_index[motif['id']]] = 1

                # Shift the hit by the start of the scanned interval.
                scanned_bpstart = target_coords[idx_seq].bpstart
                motif_coords_in_seqs_with_motif[motif['id']][
                    original_target_coords[idx_seq]].append(
                        (int(motif['start'] + scanned_bpstart - 1),
                         int(motif['end'] + scanned_bpstart - 1)))

    return motifs_in_sequences_matrix, motifs_profiles_in_sequences, idxs_seqs_with_motif, motif_coords_in_seqs_with_motif, fimo.motif_names, fimo.motif_ids
'''
To obtain the intergenic regions, start from hg19.genome (the chromosome lengths) and then run bedtools:
subtractBed -a hg19.bed -b coordinates_gene_hg19.bed > intergenic_regions_hg19.bed


'''

from bioutilities import Coordinate, Gene
# Annotation table that every step below reads.
gene_annotation_file = 'RefSeqhg19.txt'

# Load the genes (with exon/intron information), skipping one header line.
gl = Gene.load_from_annotation(
    gene_annotation_file,
    load_exons_introns_info=True,
    header_lines=1)

# Write exons, introns and whole-gene coordinates to BED files, one after the
# other; each coordinate list is passed straight to the writer so that no
# module-level name keeps it alive afterwards.
Coordinate.coordinates_to_bed(
    Gene.exons_from_annotations(gene_annotation_file),
    'hg_19_exons.bed',
    minimal_format=True)

Coordinate.coordinates_to_bed(
    Gene.introns_from_annotations(gene_annotation_file),
    'hg_19_introns.bed',
    minimal_format=True)

Coordinate.coordinates_to_bed(
    Gene.genes_coordinates_from_annotations(gene_annotation_file),
    'genes_coordinates_hg_19.bed',
    minimal_format=True)
Пример #8
0
def main(input_args=None):
    """Run the motif enrichment analysis and write the HTML report.

    Steps, in order: parse the command line, load and resize the target
    coordinates, scan them for motifs, build (or load and C+G-match) the
    background coordinates, scan those, compute per-motif statistics, and
    write images, region files, optional gene lists, the HTML report and,
    when requested, a dump of the intermediate data.

    Parameters:
        input_args: argument list handed to ``parser.parse_args``; ``None``
            lets argparse read ``sys.argv``.

    Exits with status 1 (``sys.exit``) when the target BED file yields no
    coordinates.
    """
    print('\n[H A Y S T A C K   M O T I F S]')
    print(
        '\n-MOTIF ENRICHMENT ANALYSIS- [Luca Pinello - [email protected]]\n'
    )
    print('Version %s\n' % HAYSTACK_VERSION)

    bootstrap = False
    ngram_correction = 'g'

    parser = get_args_motif()
    args = parser.parse_args(input_args)

    args.n_processes = max(1, args.n_processes - 1)

    # NOTE(review): every parsed option becomes a local variable through
    # ``exec`` on its ``repr``. This only works on Python 2, only for values
    # whose repr is valid source (hence the special case for
    # n_target_coordinates, which the code below tests with np.isinf), and it
    # forbids nested functions, lambdas and generator expressions anywhere in
    # this function. Kept as is because the full option list is defined in
    # get_args_motif(), outside this function.
    args_dict = vars(args)
    for key, value in args_dict.items():
        if key == 'n_target_coordinates':
            n_target_coordinates = value
        else:
            exec('%s=%s' % (key, repr(value)))

    bed_score_column -= 1

    c_g_correction = not no_c_g_correction
    random_sampling_target = not no_random_sampling_target

    check_file(bed_target_filename)

    if bed_bg_filename != 'random_background':
        check_file(bed_bg_filename)

    if meme_motifs_filename:
        check_file(meme_motifs_filename)
    else:
        meme_motifs_filename = os.path.join(
            determine_path('motif_databases'),
            'JASPAR_CORE_2016_vertebrates.meme')

    annotation_directory = determine_path('gene_annotations')
    if gene_annotations_filename:

        if which('java') is None:
            error(
                'The mapping to the closest gene requires Java free available from: http://java.com/en/download/'
            )
            use_gene_annotations = False
        else:
            check_file(gene_annotations_filename)
            info('Using %s as gene annotations file' %
                 gene_annotations_filename)
            use_gene_annotations = True
    else:
        gene_annotations_filename = os.path.join(annotation_directory,
                                                 '%s_genes.bed' % genome_name)
        gene_ids_to_names_filename = os.path.join(
            annotation_directory, '%s_genes_id_to_names' % genome_name)

        if os.path.exists(gene_annotations_filename) and os.path.exists(
                gene_ids_to_names_filename):
            use_gene_annotations = True
        else:
            use_gene_annotations = False
            info('No gene annotations file specified')

    genome, _, nucleotide_bg_filename = initialize_genome(genome_name)

    target_name = ntpath.basename(bed_target_filename.replace('.bed', ''))

    bg_name = ntpath.basename(bed_bg_filename.replace('.bed', ''))

    if name:
        directory_name = 'HAYSTACK_MOTIFS_on_' + name
    else:
        directory_name = 'HAYSTACK_on_' + target_name + '_VS_' + bg_name

    if output_directory:
        output_directory = os.path.join(output_directory, directory_name)
    else:
        output_directory = directory_name

    info(
        '###PARAMETERS USED###\n\t\t -TARGET: %s \n\t\t -BACKGROUND: %s \n\t\t -BG_TARGET_RATIO: %s\n\t\t -C+G CORRECTION: %s\n\t\t -MASKING REPETITIVE: %s\n\t\t -COORDINATES TO ANALYZE: %s\n\t\t -OUTPUT DIRECTORY: %s\n' \
        % (bed_target_filename, bed_bg_filename, str(bg_target_ratio), str(c_g_correction), str(mask_repetitive),
           'ALL' if np.isinf(n_target_coordinates) else str(n_target_coordinates), output_directory))

    COMMAND_USED = ' '.join(sys.argv)

    info('Loading Target coordinates from bed:%s' % bed_target_filename)
    target_coords = Coordinate.bed_to_coordinates(bed_target_filename,
                                                  cl_score=bed_score_column)

    if len(target_coords) == 0:
        info('No coordinates to analyze in your input file. Exiting.')
        sys.exit(1)

    # The internal window is either user defined or the average length of the
    # target regions; in both cases it is rounded up to an even number.
    if internal_window_length:
        info('Using the user defined internal window length:%d' %
             internal_window_length)
        if internal_window_length % 2:
            internal_window_length += 1

    else:

        internal_window_length = int(np.mean(map(len, target_coords)))
        if internal_window_length % 2:
            internal_window_length += 1
        info(
            'Using the average length of target coordinates as internal window length:%d'
            % internal_window_length)

    # This fallback used to sit inside the ``else`` branch above, so a user
    # defined internal window without an explicit window length left
    # window_length unset and crashed on the message below.
    if not window_length:
        window_length = internal_window_length * 5

    info('Total window length:%d' % window_length)

    if not smooth_size:
        smooth_size = internal_window_length / 5

    target_coords = Coordinate.coordinates_of_intervals_around_center(
        target_coords, internal_window_length)

    if len(target_coords) > n_target_coordinates:
        if random_sampling_target:
            info('Sampling %d coordinates among the %d total' %
                 (n_target_coordinates, len(target_coords)))
            target_coords = random.sample(target_coords, n_target_coordinates)
        else:
            info('Selecting the best %d coordinates among the %d total' %
                 (n_target_coordinates, len(target_coords)))
            sorted_idxs_by_score = np.argsort([c.score
                                               for c in target_coords])[::-1]
            target_coords = [
                target_coords[idx]
                for idx in sorted_idxs_by_score[:n_target_coordinates]
            ]
    else:

        if random_sampling_target and bootstrap and not np.isinf(
                n_target_coordinates):
            warn('Number of target regions < %d' % n_target_coordinates)
            info('bootstrapping to obtain enough target regions')
            target_coords = sample_wr(target_coords, n_target_coordinates)
        else:
            info('Using all the %d target coordinates' % len(target_coords))

    info('Extracting Motifs in target coordinates')
    positive_matrix, motifs_profiles_in_sequences, idxs_seqs_with_motif, motif_coords_in_seqs_with_motif, motif_names, motif_ids = parallel_fimo_scanning(
        target_coords,
        meme_motifs_filename,
        genome,
        nucleotide_bg_filename,
        temp_directory=temp_directory,
        p_value=p_value,
        mask_repetitive=mask_repetitive,
        window_length=window_length,
        internal_window_length=internal_window_length,
        num_consumers=n_processes)
    n_target_coordinates = len(target_coords)  # fix for the bootstrap!

    if bed_bg_filename == 'random_background':
        info('Extracting Random Coordinates from the genome...')

        if c_g_correction:
            info('Calculating the C+G content of the target coordinates')
            bg_coords = []
            c_g_content_target = calculate_average_ngram_presence(
                target_coords, genome, ngram_correction)

            info('Extract a Matching C+G Background')
            bins = np.hstack((np.linspace(0, 1, c_g_bins), np.inf))

            for _ in range(bg_target_ratio):
                for idx_c, c in enumerate(target_coords):
                    c_bin = np.nonzero(
                        np.histogram(c_g_content_target[idx_c], bins)[0])[0][0]
                    c_random_bin = -1

                    # Redraw a same-length region on the same chromosome
                    # until it falls in the C+G bin of the target region.
                    while c_random_bin != c_bin:
                        random_bpstart = np.random.randint(
                            1, genome.chr_len[c.chr_id] - len(c) + 1)
                        c_random = Coordinate(c.chr_id, random_bpstart,
                                              random_bpstart + len(c) - 1)
                        seq = genome.extract_sequence(c_random)
                        c_g_content_c_random = (seq.count('c') +
                                                seq.count('g')) / float(len(c))
                        c_random_bin = np.nonzero(
                            np.histogram(c_g_content_c_random, bins)[0])[0][0]

                    bg_coords.append(c_random)

            c_g_content_bg = calculate_average_ngram_presence(
                bg_coords, genome, ngram_correction)
            debug('original: ' +
                  str(np.histogram(c_g_content_target, bins)[0]))
            debug('obtained:' + str(np.histogram(c_g_content_bg, bins)[0]))

        else:
            bg_coords = get_random_coordinates(target_coords, genome)

        info('Done!')

    else:
        info('Loading Background Coordinates from:%s' % bed_bg_filename)
        bg_coords = Coordinate.bed_to_coordinates(bed_bg_filename)
        bg_coords = Coordinate.coordinates_of_intervals_around_center(
            bg_coords, internal_window_length)

        if use_entire_bg:
            bg_target_ratio = float(len(bg_coords)) / n_target_coordinates
            # Formatted eagerly, like every other message in this function.
            info('Using all the coordinates in the BG, BG/TG:%f' %
                 bg_target_ratio)

        if c_g_correction:
            info('Calculating the C+G content')
            c_g_content_target = calculate_average_ngram_presence(
                target_coords, genome, ngram_correction)
            c_g_content_bg = calculate_average_ngram_presence(
                bg_coords, genome, ngram_correction)

            info('Extract a Matching C+G Background')
            bins = np.hstack((np.linspace(0, 1, c_g_bins), np.inf))
            target_hist = np.histogram(c_g_content_target, bins)[0]
            bg_hist = np.histogram(c_g_content_bg, bins)[0]
            ratios = bg_hist / (target_hist * 1.0)
            debug('original:%s' % target_hist)
            debug('bg:%s' % bg_hist)
            debug('ratios:%s' % ratios)
            # Largest background/target multiple that every well populated
            # bin (more than 5% of the targets) can still provide.
            K_MATCH = min(
                bg_target_ratio,
                ratios[~np.isnan(ratios) & ~np.isinf(ratios) & (ratios > 0) &
                       (target_hist / float(target_hist.sum()) > 0.05)].min())

            debug('K_MATCH:%d' % K_MATCH)

            to_match = np.int32(np.floor(K_MATCH * target_hist))

            debug('to_match:%s' % to_match)

            idxs_corrected_bg = np.array([], dtype=int)

            for idx_bin in range(len(bins) - 1):
                idxs_matching_regions = \
                    np.nonzero((c_g_content_bg >= bins[idx_bin]) & (c_g_content_bg < bins[idx_bin + 1]))[0]
                to_take = np.random.permutation(len(idxs_matching_regions))
                to_take = to_take[range(
                    min(len(idxs_matching_regions), to_match[idx_bin]))]
                idxs_corrected_bg = np.hstack(
                    (idxs_corrected_bg, idxs_matching_regions[to_take]))

            debug('original:%s' % target_hist)
            debug('K:%d' % K_MATCH)
            debug('to sample:%s' % to_match)
            debug('obtained:%s' %
                  np.histogram(c_g_content_bg[idxs_corrected_bg], bins)[0])
            bg_coords = [bg_coords[idx] for idx in idxs_corrected_bg]
            c_g_content_bg = calculate_average_ngram_presence(
                bg_coords, genome, ngram_correction)
            debug(np.histogram(c_g_content_bg, bins)[0])
            if np.array_equal(K_MATCH * target_hist,
                              np.histogram(c_g_content_bg, bins)[0]):
                info('C+G content perfectly matched!\n\ttarget:%s\n\tbg    :%s'
                     % (target_hist, np.histogram(c_g_content_bg, bins)[0]))
            else:
                warn(
                    'C+G content not perfectly matched\n\ttarget:%s\n\tbg    :%s'
                    % (target_hist, np.histogram(c_g_content_bg, bins)[0]))

            debug(target_hist / np.histogram(c_g_content_bg, bins)[0])

    if len(bg_coords) >= bg_target_ratio * n_target_coordinates:
        bg_coords = random.sample(bg_coords,
                                  int(bg_target_ratio * n_target_coordinates))
    else:
        # NOTE(review): unreachable while ``bootstrap`` is hard-coded to
        # False; the debug message needs target_hist/bins, which only exist
        # after a C+G correction.
        if bootstrap and len(bg_coords) < (bg_target_ratio *
                                           n_target_coordinates *
                                           0.95):  # allow a small tollerance!
            info('bootstrapping to obtain enough background regions')
            bg_coords = sample_wr(bg_coords,
                                  int(bg_target_ratio * n_target_coordinates))
            c_g_content_bg = calculate_average_ngram_presence(
                bg_coords, genome, ngram_correction)
            debug('After bootstrap:\n\ttarget:%s\n\tbg    :%s' %
                  (target_hist, np.histogram(c_g_content_bg, bins)[0]))

    info('Extracting Motifs in background coordinates')
    negative_matrix, motifs_profiles_in_bg, idxs_seqs_with_motif_bg = parallel_fimo_scanning(
        bg_coords,
        meme_motifs_filename,
        genome,
        nucleotide_bg_filename,
        temp_directory=temp_directory,
        p_value=p_value,
        mask_repetitive=mask_repetitive,
        window_length=window_length,
        internal_window_length=internal_window_length,
        num_consumers=n_processes)[0:3]

    # allocate data for reports
    N_MOTIFS = len(motif_ids)
    fisher_p_values = np.zeros(N_MOTIFS)
    central_enrichment = np.zeros(N_MOTIFS)

    N_seq_p = positive_matrix.shape[0]
    N_seq_n = negative_matrix.shape[0]

    profile_presence_p = (positive_matrix > 0).sum(0)
    profile_presence_n = (negative_matrix > 0).sum(0)

    support_p = profile_presence_p / float(N_seq_p)
    support_n = profile_presence_n / float(N_seq_n)

    internal_bpstart = window_length / 2 - internal_window_length / 2
    internal_bpend = window_length / 2 + internal_window_length / 2

    for idx, motif_id in enumerate(motif_ids):
        fisher_p_values[idx] = stats.fisher_exact(
            [[profile_presence_p[idx], N_seq_p - profile_presence_p[idx]],
             [profile_presence_n[idx], N_seq_n - profile_presence_n[idx]]])[1]
        # Mean hit density inside the internal window over the mean density
        # in the two flanks.
        central_enrichment[idx] = motifs_profiles_in_sequences[motif_id][
            internal_bpstart:internal_bpend].mean() / np.hstack([
                motifs_profiles_in_sequences[motif_id][:internal_bpstart],
                motifs_profiles_in_sequences[motif_id][internal_bpend:]
            ]).mean()

    motif_ratios = (support_p + 0.01) / (support_n + 0.01)

    # Foundamental!
    if not disable_ratio:
        motif_ratios[support_p < 0.03] = 1

    rankings = stats.rankdata(-motif_ratios)

    # filter here positive or positive and negative
    if not disable_ratio:
        idxs_to_keep = np.nonzero(motif_ratios > 1)[0]
    else:
        idxs_to_keep = range(len(motif_ratios))

    rankings = rankings[idxs_to_keep]
    motif_ratios = motif_ratios[idxs_to_keep]
    support_p = support_p[idxs_to_keep]
    support_n = support_n[idxs_to_keep]
    fisher_p_values = fisher_p_values[idxs_to_keep]
    central_enrichment = central_enrichment[idxs_to_keep]

    motif_ids = [motif_ids[_] for _ in idxs_to_keep]
    motif_names = [motif_names[_] for _ in idxs_to_keep]
    motif_idxs = [_ for _ in idxs_to_keep]

    try:
        # we test the ones only with ratio >1
        qvalues = estimate_qvalues(fisher_p_values)
    except Exception:
        # A failure used to leave ``qvalues`` unbound, which surfaced later
        # as a NameError while building the report. Keep the best-effort
        # behaviour but make the missing values explicit.
        print(fisher_p_values)
        warn('Unable to estimate the q-values; reporting them as NaN')
        qvalues = np.repeat(np.nan, len(fisher_p_values))

    # generate reports in html
    info('Generating HTML report...')
    imgs_directory = os.path.join(output_directory, 'images')
    genes_list_directory = os.path.join(output_directory, 'genes_lists')
    motif_regions_directory = os.path.join(output_directory, 'motifs_regions')

    # create folders
    if not os.path.exists(imgs_directory):
        os.makedirs(imgs_directory)
    if use_gene_annotations and not os.path.exists(genes_list_directory):
        os.makedirs(genes_list_directory)
    if not os.path.exists(motif_regions_directory):
        os.makedirs(motif_regions_directory)

    j2_env = Environment(
        loader=FileSystemLoader(determine_path('extra') + '/templates/'),
        trim_blocks=True)

    info('DIRECTORY:%s' % determine_path('extra') + '/templates/')
    template = j2_env.get_template('report_template.html')

    # copy haystack logo and bg
    shutil.copyfile(
        determine_path('extra') + '/templates/haystack_logo.png',
        os.path.join(imgs_directory, 'haystack_logo.png'))
    shutil.copyfile(
        determine_path('extra') + '/templates/noise.png',
        os.path.join(imgs_directory, 'noise.png'))

    motifs_dump = []
    for i in np.argsort(rankings):
        if (support_p[i] >= 0.03
                or disable_ratio) and fisher_p_values[i] < 0.01 and (
                    motif_ratios[i] > 1 or disable_ratio
                ) and central_enrichment[i] > min_central_enrichment:

            info('Generating logo and profile for:' + motif_ids[i])

            # create motif logo (default format and pdf)
            img_logo = os.path.join(imgs_directory, 'logo_' + motif_ids[i])
            generate_weblogo(motif_ids[i],
                             meme_motifs_filename,
                             img_logo,
                             title=motif_ids[i])
            generate_weblogo(motif_ids[i],
                             meme_motifs_filename,
                             img_logo,
                             title=motif_ids[i],
                             file_format='pdf')
            # fix the weblogo prefix problem
            img_logo_url = os.path.join('images',
                                        'logo_' + motif_ids[i] + '.png')

            # create motif enrichment profile
            img_profile = os.path.join(imgs_directory,
                                       'profile_' + motif_ids[i] + '.png')
            motif_profile_target = motifs_profiles_in_sequences[
                motif_ids[i]] / N_seq_p
            motif_profile_bg = motifs_profiles_in_bg[motif_ids[i]] / N_seq_n

            generate_motif_profile(motif_profile_target,
                                   motif_profile_bg,
                                   motif_ids[i],
                                   img_profile,
                                   smooth_size=smooth_size,
                                   window_size=window_length)
            img_profile_url = os.path.join('images',
                                           'profile_' + motif_ids[i] + '.png')

            # create regions
            info('Extracting regions with:' + motif_ids[i])
            regions = os.path.join(
                motif_regions_directory,
                motif_ids[i] + '_motif_region_in_target.bed')
            with open(regions, 'w+') as outfile:
                outfile.write(
                    'Chromosome\tStart\tEnd\tMotif hits inside region\tNumber of hits\n'
                )
                for c, locations in motif_coords_in_seqs_with_motif[
                        motif_ids[i]].items():
                    outfile.write('\t'.join([
                        c.chr_id,
                        str(c.bpstart),
                        str(c.bpend), ';'.join([
                            '-'.join(map(str, map(int, l))) for l in locations
                        ]),
                        str(len(locations))
                    ]) + '\n')
            regions_url = os.path.join(
                'motifs_regions', motif_ids[i] + '_motif_region_in_target.bed')

            # map closest downstream genes
            genes_url = None
            if use_gene_annotations:
                info('Mapping regions with:%s to the clostest genes' %
                     motif_ids[i])

                peak_annotator_path = os.path.join(determine_path('extra/'),
                                                   'PeakAnnotator.jar')

                # NOTE(review): the paths are interpolated unquoted into a
                # shell command line; paths containing spaces or shell
                # metacharacters will break these calls.
                # NOTE(review): gene_ids_to_names_filename is only assigned
                # in this function when no annotation file is given; confirm
                # that it is also a command line option.
                if gene_ids_to_names_filename:
                    sb.call('java -jar ' + peak_annotator_path + ' -u TSS -p %s -a %s -s %s -o %s >/dev/null 2>&1' \
                            % (regions, gene_annotations_filename, gene_ids_to_names_filename, genes_list_directory),
                            shell=True)
                else:
                    sb.call('java -jar ' + peak_annotator_path + ' -u TSS -p %s -a %s  -o %s >/dev/null 2>&1' \
                            % (regions, gene_annotations_filename, genes_list_directory), shell=True)

                genes_url = os.path.join(
                    'genes_lists',
                    motif_ids[i] + '_motif_region_in_target.tss.bed')

            motifs_dump.append({
                'id': motif_ids[i],
                'name': motif_names[i],
                'support_p': support_p[i] * 100,
                'support_n': support_n[i] * 100,
                'ratio': motif_ratios[i],
                'rank': float(rankings[i]),
                'pvalue': fisher_p_values[i],
                'qvalue': qvalues[i],
                'central_enrichment': central_enrichment[i],
                'img_logo': img_logo_url,
                'img_profile': img_profile_url,
                'regions': regions_url,
                'genes': genes_url,
                'idx_motif': motif_idxs[i]
            })

    # The report handle is closed even if rendering fails.
    with codecs.open(
            os.path.join(output_directory, "Haystack_report.html"), "w",
            "utf-8") as outfile:
        outfile.write(template.render(motifs_dump=motifs_dump, bed_target_filename=bed_target_filename,
                                      bed_bg_filename=bed_bg_filename, N_TARGET=N_seq_p, N_BG=N_seq_n,
                                      meme_motifs_filename=meme_motifs_filename, COMMAND_USED=COMMAND_USED,
                                      use_gene_annotations=use_gene_annotations))

    if dump:
        info('Saving all the intermediate data on: %s ...' % output_directory)
        dump_directory = os.path.join(output_directory, 'dump')

        if not os.path.exists(dump_directory):
            os.makedirs(dump_directory)

        np.save(os.path.join(dump_directory, 'matrix_' + target_name),
                positive_matrix)
        np.save(os.path.join(dump_directory, 'matrix_BG_' + target_name),
                negative_matrix)

        # Each pickle is written through a ``with`` block so the file is
        # flushed and closed right away instead of whenever it gets collected.
        with open(
                os.path.join(dump_directory,
                             target_name + '_motif_dumps.pickle'),
                'w') as dump_file:
            cp.dump(motifs_dump, dump_file)

        with open(
                os.path.join(dump_directory,
                             target_name + '_motif_seqs_idxs.pickle'),
                'w') as dump_file:
            cp.dump(idxs_seqs_with_motif, dump_file)

        with open(
                os.path.join(dump_directory,
                             bg_name + '_motif_seqs_idxs.pickle'),
                'w') as dump_file:
            cp.dump(idxs_seqs_with_motif_bg, dump_file)

        with open(
                os.path.join(
                    dump_directory,
                    target_name + '_motif_coords_in_seqs_with_motif.pickle'),
                'w') as dump_file:
            cp.dump(motif_coords_in_seqs_with_motif, dump_file)

        Coordinate.coordinates_to_bed(
            target_coords,
            os.path.join(
                dump_directory,
                'Target_coordinates_selected_on_' + target_name + '.bed'),
            minimal_format=False)
        Coordinate.coordinates_to_bed(
            bg_coords,
            os.path.join(dump_directory,
                         'BG_coordinates_selected_on_' + bg_name + '.bed'),
            minimal_format=True)

    info('Motif analysis completed! Ciao!')
Пример #9
0
def parallel_fimo_scanning(target_coords, meme_motifs_filename, genome,
                           nucleotide_bg_filename, temp_directory, p_value,
                           mask_repetitive, window_length,
                           internal_window_length, num_consumers):
    """Scan ``target_coords`` with FIMO and summarise the motif hits.

    The (optionally widened) coordinates are written to a FASTA file in
    ``temp_directory``, ``call_fimo`` is run on it (once per motif in a
    process pool when ``num_consumers > 1``, otherwise once for all motifs)
    and the resulting ``*.motifs`` files are parsed. Every temporary file
    created under this run's unique prefix is removed before returning, also
    when an error occurs.

    Parameters:
        target_coords: sequence of coordinates to scan.
        meme_motifs_filename, nucleotide_bg_filename, temp_directory, p_value:
            forwarded to ``Fimo`` and ``call_fimo``.
        genome: forwarded to ``Coordinate.coordinates_to_fasta``.
        mask_repetitive: accepted for interface compatibility; not used by
            this function.
        window_length: when truthy, every coordinate is replaced by an
            interval of this length around its center before scanning.
        internal_window_length: length of the central part of the window in
            which a hit must fall to be counted.
        num_consumers: number of worker processes; values of 1 or less run a
            single in-process ``call_fimo``.

    Returns:
        A tuple ``(motifs_in_sequences_matrix, motifs_profiles_in_sequences,
        idxs_seqs_with_motif, motif_coords_in_seqs_with_motif,
        fimo.motif_names, fimo.motif_ids)``.
    """
    import glob  # local import: only needed to collect/clean the temp files

    fimo = Fimo(meme_motifs_filename,
                nucleotide_bg_filename,
                temp_directory=temp_directory,
                p_value=p_value)

    # Unique prefix shared by every temporary file of this run.
    prefix = 'haystack_motifs_' + str(uuid.uuid4())
    temp_prefix = os.path.join(temp_directory, prefix)

    motifs_profiles_in_sequences = dict()
    idxs_seqs_with_motif = dict()
    motif_coords_in_seqs_with_motif = dict()

    # Hits are reported against the coordinates the caller passed in, even
    # when the scanned intervals are widened below.
    original_target_coords = target_coords

    if window_length:
        internal_bpstart = window_length // 2 - internal_window_length // 2
        internal_bpend = window_length // 2 + internal_window_length // 2
        target_coords = Coordinate.coordinates_of_intervals_around_center(
            target_coords, window_length)
    else:
        # The bounds used to be undefined here; the resulting NameError was
        # swallowed by a bare ``except`` and every hit was silently dropped.
        # Without a window the whole scanned sequence counts.
        internal_bpstart = 0
        internal_bpend = float('inf')

    # Map the first whitespace-separated token of each coordinate's string
    # form (the key used to look up FIMO output lines) to its row index.
    coord_to_idx = dict()
    for idx, c in enumerate(target_coords):
        coord_to_idx[str(c).split()[0]] = idx

    # The profile length follows the last scanned coordinate (as before), but
    # no longer depends on a leaked loop variable, so empty input is handled.
    profile_length = len(target_coords[-1]) if len(target_coords) > 0 else 0

    for motif_id in fimo.motif_ids:
        motifs_profiles_in_sequences[motif_id] = np.zeros(profile_length)
        idxs_seqs_with_motif[motif_id] = set()
        motif_coords_in_seqs_with_motif[motif_id] = pickable_defaultdict()

    motifs_in_sequences_matrix = np.zeros(
        (len(target_coords), len(fimo.motif_ids)))

    try:
        # write fasta
        target_coords_fasta_filename = temp_prefix + '.fa'
        Coordinate.coordinates_to_fasta(target_coords,
                                        target_coords_fasta_filename, genome)

        # compute motifs with fimo
        if num_consumers > 1:
            # partial function for multiprocessing: one call per motif id
            compute_single_motif = partial(call_fimo,
                                           target_coords_fasta_filename,
                                           prefix, meme_motifs_filename,
                                           nucleotide_bg_filename,
                                           temp_directory, p_value)

            pool = mp.Pool(processes=num_consumers)
            try:
                pool.map(compute_single_motif, fimo.motif_ids)
            finally:
                pool.close()
                pool.join()

            # Read the per-motif outputs directly instead of concatenating
            # them with ``cat prefix*.motifs > prefix_fimo_output.motifs``,
            # whose output file matched its own input glob.
            fimo_output_filenames = sorted(glob.glob(temp_prefix + '*.motifs'))
        else:
            call_fimo(target_coords_fasta_filename, prefix,
                      meme_motifs_filename, nucleotide_bg_filename,
                      temp_directory, p_value, 'ALL_MOTIFS')
            fimo_output_filenames = [
                os.path.join(temp_directory,
                             '%s_%s.motifs' % (prefix, 'ALL_MOTIFS'))
            ]

        for fimo_output_filename in fimo_output_filenames:
            with open(fimo_output_filename) as infile:
                for line in infile:
                    try:
                        motif_id, motif_coord, motif_start, motif_end = line.split()
                        motif_start = int(motif_start)
                        motif_end = int(motif_end)
                        idx_seq = coord_to_idx[motif_coord]
                        profile = motifs_profiles_in_sequences[motif_id]
                    except (ValueError, KeyError):
                        # Malformed line or unknown motif/sequence: report
                        # it (as before) and move on.
                        print(line)
                        continue

                    profile[motif_start:motif_end] += 1.0

                    # keep track only if is in the internal window!
                    if motif_start >= internal_bpstart and motif_end <= internal_bpend:
                        idxs_seqs_with_motif[motif_id].add(idx_seq)
                        # Presence flag (the original spelled this "= +1").
                        motifs_in_sequences_matrix[
                            idx_seq, fimo.motif_id_to_index[motif_id]] = 1
                        # Shift the hit by the start of the scanned interval.
                        scanned_bpstart = target_coords[idx_seq].bpstart
                        motif_coords_in_seqs_with_motif[motif_id][
                            original_target_coords[idx_seq]].append(
                                (motif_start + scanned_bpstart - 1,
                                 motif_end + scanned_bpstart - 1))
    finally:
        # Best-effort cleanup, mirroring the former ``rm prefix*`` whose
        # failures were ignored as well.
        for temp_path in glob.glob(temp_prefix + '*'):
            try:
                os.remove(temp_path)
            except OSError:
                pass

    return motifs_in_sequences_matrix, motifs_profiles_in_sequences, idxs_seqs_with_motif, motif_coords_in_seqs_with_motif, fimo.motif_names, fimo.motif_ids
Пример #10
0
def parallel_fimo_scanning(target_coords,
                              meme_motifs_filename,
                              genome,nucleotide_bg_filename,
                              temp_directory,
                              p_value,
                              mask_repetitive,
                              window_length,
                              internal_window_length,
                              num_consumers):
    """Run FIMO over the target regions and collect per-motif hit statistics.

    The target sequences are written to a temporary FASTA file, FIMO is run
    through ``call_fimo`` (once per motif in a process pool when
    ``num_consumers > 1``, otherwise once with ``'ALL_MOTIFS'``), and the
    resulting four-column hit file (motif id, sequence name, start, end) is
    parsed. All temporary files sharing the run prefix are removed at the end.

    Parameters:
        target_coords: indexable sequence of coordinates to scan; each item
            must support ``len()``, ``str()`` and expose ``bpstart``.
        meme_motifs_filename: motif file handed to ``Fimo`` / ``call_fimo``.
        genome: genome object handed to ``Coordinate.coordinates_to_fasta``.
        nucleotide_bg_filename: background file handed to ``Fimo`` / ``call_fimo``.
        temp_directory: directory where temporary files are created.
        p_value: threshold handed to ``Fimo`` / ``call_fimo``.
        mask_repetitive: accepted for interface compatibility; not used here.
        window_length: if truthy, every target is replaced by a window of this
            length around its center before scanning.
        internal_window_length: width of the central sub-window in which a hit
            must fall to be recorded (only used when ``window_length`` is set).
        num_consumers: number of worker processes; values > 1 enable the pool.

    Returns:
        A 6-tuple:
        ``motifs_in_sequences_matrix`` (array of shape
        ``(len(target_coords), len(fimo.motif_ids))`` with 1 where a motif hit
        the internal window of a sequence),
        ``motifs_profiles_in_sequences`` (motif id -> positional hit counts),
        ``idxs_seqs_with_motif`` (motif id -> set of sequence indices),
        ``motif_coords_in_seqs_with_motif`` (motif id -> mapping from the
        original coordinate to a list of ``(start, end)`` tuples shifted by
        the scanned coordinate's ``bpstart - 1``),
        ``fimo.motif_names`` and ``fimo.motif_ids``.
    """
    fimo = Fimo(meme_motifs_filename, nucleotide_bg_filename,
                temp_directory=temp_directory, p_value=p_value)

    # Unique prefix so concurrent runs in the same temp directory do not clash.
    prefix = 'haystack_motifs_' + str(uuid.uuid4())
    tmp_prefix_path = os.path.join(temp_directory, prefix)

    # Keep the caller's coordinates: results are keyed by them even when the
    # scan itself runs on re-centered windows.
    original_target_coords = target_coords

    if window_length:
        # Floor division keeps the integer semantics the Python 2 '/' had here.
        half_window = window_length // 2
        half_internal = internal_window_length // 2
        internal_bpstart = half_window - half_internal
        internal_bpend = half_window + half_internal
        target_coords = Coordinate.coordinates_of_intervals_around_center(
            target_coords, window_length)
    else:
        # No windowing requested: treat the whole sequence as the internal
        # window. Previously these names were left undefined, so every hit
        # raised NameError and was silently dropped by the except clause.
        internal_bpstart = 0
        internal_bpend = float('inf')

    # Write the sequences FIMO will scan.
    target_coords_fasta_filename = os.path.join(temp_directory, prefix + '.fa')
    Coordinate.coordinates_to_fasta(target_coords, target_coords_fasta_filename, genome)

    # Map the first whitespace-separated token of each coordinate's string
    # form (the name FIMO reports) back to its row index.
    coord_to_idx = dict()
    for idx, coord in enumerate(target_coords):
        coord_to_idx[str(coord).split()[0]] = idx

    # The profile length follows the last target, as before, but no longer
    # depends on a leaked loop variable (which failed on empty input).
    profile_length = len(target_coords[-1]) if len(target_coords) > 0 else 0

    motifs_profiles_in_sequences = dict()
    idxs_seqs_with_motif = dict()
    motif_coords_in_seqs_with_motif = dict()
    for motif_id in fimo.motif_ids:
        motifs_profiles_in_sequences[motif_id] = np.zeros(profile_length)
        idxs_seqs_with_motif[motif_id] = set()
        motif_coords_in_seqs_with_motif[motif_id] = pickable_defaultdict()

    motifs_in_sequences_matrix = np.zeros((len(target_coords), len(fimo.motif_ids)))

    # Compute motif hits with FIMO.
    if num_consumers > 1:
        # Bind everything except the motif id so the pool can map over ids.
        compute_single_motif = partial(call_fimo, target_coords_fasta_filename, prefix,
                                       meme_motifs_filename, nucleotide_bg_filename,
                                       temp_directory, p_value)

        pool = mp.Pool(processes=num_consumers)
        try:
            pool.map(compute_single_motif, fimo.motif_ids)
        finally:
            # Always release the workers, even if one of the calls failed.
            pool.close()
            pool.join()

        fimo_output_filename = os.path.join(temp_directory, prefix + '_fimo_output.motifs')
        # Paths are quoted so directories containing spaces do not break the
        # command; the glob stays outside the quotes so the shell expands it.
        sb.call('cat "%s"*.motifs > "%s"' % (tmp_prefix_path, fimo_output_filename),
                shell=True)
    else:
        call_fimo(target_coords_fasta_filename, prefix, meme_motifs_filename,
                  nucleotide_bg_filename, temp_directory, p_value, 'ALL_MOTIFS')
        fimo_output_filename = os.path.join(temp_directory,
                                            '%s_%s.motifs' % (prefix, 'ALL_MOTIFS'))

    with open(fimo_output_filename) as infile:
        for line in infile:
            try:
                motif_id, motif_coord, motif_start, motif_end = line.split()
                motif_start = int(motif_start)
                motif_end = int(motif_end)
                idx_seq = coord_to_idx[motif_coord]

                # The positional profile counts every hit, inside the
                # internal window or not.
                motifs_profiles_in_sequences[motif_id][motif_start:motif_end] += 1.0

                # Keep track of the hit only if it lies in the internal window.
                if motif_start >= internal_bpstart and motif_end <= internal_bpend:
                    idxs_seqs_with_motif[motif_id].add(idx_seq)
                    motifs_in_sequences_matrix[idx_seq, fimo.motif_id_to_index[motif_id]] = 1
                    offset = target_coords[idx_seq].bpstart - 1
                    motif_coords_in_seqs_with_motif[motif_id][
                        original_target_coords[idx_seq]].append(
                            (motif_start + offset, motif_end + offset))
            except (ValueError, KeyError, IndexError):
                # Malformed line or unknown motif/sequence name: report the
                # line and keep going (best effort), without hiding
                # unrelated errors or interrupts.
                print(line)

    # Remove every temporary file created under this run's prefix.
    sb.call('rm "%s"* ' % tmp_prefix_path, shell=True)

    return motifs_in_sequences_matrix,motifs_profiles_in_sequences,idxs_seqs_with_motif,motif_coords_in_seqs_with_motif,fimo.motif_names, fimo.motif_ids
                #print 'problema coordinate random'
                region_nok=True

                not_founded=True
                while not_founded:
                    c_random=target_coordinates[np.random.randint(len(target_coordinates))]
                    not_founded= len(c_random) < len(c)
                

    #print random_sampling_score
    return random_sampling_score


# Load the input regions and the three annotation sets from their BED files.
# (The printed messages are in Italian: "load the data".)
print 'carica i dati'

input_coordinates=Coordinate.bed_to_coordinates(input_bed_file)
exons_coordinates=Coordinate.bed_to_coordinates(exon_bed_file)
introns_coordinates=Coordinate.bed_to_coordinates(intron_bed_file)
intergenic_coordinates=Coordinate.bed_to_coordinates(intergenic_bed_file)

# One row per input coordinate and three columns, all initialised to zero.
# NOTE(review): judging by the variable name the columns are presumably
# exon / intron / intergenic intersection lengths -- confirm where it is filled.
print 'alloca memoria per lunghezza intersezioni'
inters_length_exon_intron_intergenic=np.zeros((len(input_coordinates),3))

# Containers filled by the loop that follows: one entry per chromosome id in
# interval_tree, plus a coordinate -> row index mapping and its running counter.
print 'costruisci interval tree degli enanchers'
interval_tree=dict()
coord_to_row_index=dict()
row_index=0
for c in input_coordinates:
    if c.chr_id not in interval_tree:
        interval_tree[c.chr_id]=Intersecter()