Python Coordinate примеры использования

Язык программирования: Python

Пространство имен/Пакет: bioutilities

Класс/Тип: Coordinate

Примеров на hotexamples.com: 11

Python Coordinate - 11 примеров найдено. Это лучшие примеры Python кода для bioutilities.Coordinate, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

Coordinate(3)

coordinates_of_intervals_around_center(2)

bed_to_coordinates(1)

coordinates_from_interval(1)

coordinates_to_bed(1)

coordinates_to_fasta(1)

Пример #1

Показать файл

def get_random_coordinates(coords, genome):
    """Return one randomly placed coordinate per input coordinate.

    Each result keeps the chromosome id and the length of the matching input
    coordinate; only the start position is redrawn with ``np.random.randint``.

    Args:
        coords: iterable of coordinates exposing ``chr_id`` and ``len()``.
        genome: object whose ``chr_len`` maps a chromosome id to its length.

    Returns:
        A list of new ``Coordinate`` objects, in the same order as ``coords``.
    """
    def _relocate(region):
        # The upper bound keeps the relocated region inside the chromosome.
        new_start = np.random.randint(
            1, genome.chr_len[region.chr_id] - len(region) + 1)
        return Coordinate(
            region.chr_id, new_start, new_start + len(region) - 1)

    return [_relocate(region) for region in coords]

Пример #2

Показать файл

Файл: calculate_conservation.py Проект: lucapinello/bioutilities

def intersections_lengths(n_input_coordinates,interval_tree,coord_to_row_index,target_coordinates):
    """Accumulate, per input coordinate, how many bp are covered by the targets.

    Args:
        n_input_coordinates: number of rows in the returned array.
        interval_tree: mapping from chromosome id to an object exposing
            ``find(bpstart, bpend)`` that returns the stored intervals
            overlapping that range.
        coord_to_row_index: mapping from a ``Coordinate`` to its row index
            in the returned array.
        target_coordinates: iterable of coordinates (``chr_id``, ``bpstart``,
            ``bpend``) to intersect with the stored intervals.

    Returns:
        A float numpy array of length ``n_input_coordinates``; each entry is
        the accumulated overlap length for that row, capped at the length of
        the corresponding stored coordinate.
    """
    inters_lenghts = np.zeros(n_input_coordinates)

    for c in target_coordinates:
        # ``in`` replaces the deprecated dict.has_key(), which no longer
        # exists on Python 3 dictionaries.
        if c.chr_id not in interval_tree:
            continue

        for coord_hit in interval_tree[c.chr_id].find(c.bpstart, c.bpend):
            c_to_add = Coordinate.coordinates_from_interval(c.chr_id, coord_hit)
            row_index = coord_to_row_index[c_to_add]
            # Several targets can hit the same stored region (shared regions
            # across genes), so the running total is capped at the region
            # length. ``c & c_to_add`` is presumably the overlapping
            # sub-interval -- TODO confirm against Coordinate.__and__.
            inters_lenghts[row_index] = min(
                inters_lenghts[row_index] + len(c & c_to_add), len(c_to_add))

    return inters_lenghts

Пример #3

Показать файл

Файл: mim.py Проект: BioinformaticsArchive/mim

# Excerpt from the top-level flow of a script: load a genome, read the
# coordinates from a BED file and extract the sequence of every coordinate.
# NOTE(review): genome_directory, memory_mapped_genome, bed_file and shuffle
# are defined outside this excerpt.
print 'Please send any bugs to: [email protected]'
print '----------------------------------------------\n\n'
# N-gram helper built with alphabet size 4 and n-gram length 4
# (presumably the four nucleotides -- TODO confirm).
ng=Ngram(alphabet_size=4,ngram_length=4)    

print '>Loading genome from:',genome_directory

# Select the genome backend according to the memory_mapped_genome flag.
if memory_mapped_genome:
    g=Genome_mm(genome_directory)
else:
    g=Genome(genome_directory)

print 'Genome Loaded.'
    
mim_values=dict()
print '\n>Loading coordinates from:',bed_file
coordinates=Coordinate.bed_to_coordinates(bed_file)
print '%d coordinates loaded' % len(coordinates)

# S collects the accepted sequences; R is not filled in the visible part.
S=[]
R=[]

print '\n>Extracting sequences:'
pb = ProgressBar(widgets=['Sequences processed: ', Percentage()], maxval=len(coordinates)).start()
for idx,c in enumerate(coordinates):
    seq=g.extract_sequence(c)
    
    # Sequences containing 'n' are skipped (presumably unknown bases).
    if 'n' not in seq:
        S.append(seq)
        
        if shuffle:
            # Random permutation of the characters of the sequence just added.
            seq_random=''.join( [S[-1][i] for i in permutation(len(S[-1] ))]   )

Пример #4

Показать файл

# Excerpt: scan every site of every chromosome in bed_dict for PAM matches.
# NOTE(review): counter, total_regions, logger, genome, pam_list, pams_found
# and total_sgRNAs_dict are defined outside this excerpt.
for chrom in bed_dict:
	for site in bed_dict[chrom]:

		counter += 1

		# Progress message every 1000 processed sites.
		if counter%1000 == 0:
			logger.info('Completed PAM search for %s out of %s sites ...' % (counter, total_regions))

		pams_found_list = []

		# Each site unpacks into (start, stop, sample).
		start, stop, sample = site
		total_sgRNAs_dict[sample] = []

		# One counter per entry of pam_list, reset for this sample.
		pams_found[sample] = [0]*len(pam_list)

		# Upper-cased sequence of the site, extracted from the genome.
		genomic_region = genome.extract_sequence(Coordinate(chrom, start, stop)).upper()

		for i in range(len(pam_list)):

			for pam_entry in pam_list[i]:

				# The zero-width lookahead (?=...) also reports overlapping matches.
				# pam_entry[0] / pam_entry[1] are the patterns searched for the
				# "top" and "bottom" hits (presumably the two strands -- TODO confirm).
				find_guides_top = [[m.start()] for m in re.finditer('(?=%s)' % pam_entry[0], genomic_region, re.IGNORECASE)]
				find_guides_bottom = [[m.start()] for m in re.finditer('(?=%s)' % pam_entry[1], genomic_region, re.IGNORECASE)]

				if find_guides_top:

					for match in find_guides_top:

						# pam_entry[2] is added to the match start to obtain the stop offset.
						start_g = match[0]
						stop_g = start_g + pam_entry[2]

Пример #5

Показать файл

Файл: haystack_motifs.py Проект: a1aks/Haystack

def run_haystack():
    """Command-line entry point of the Haystack motif enrichment analysis.

    Steps performed:
      1. Parse the command line and resolve input files, genome and
         annotation paths.
      2. Load the target coordinates, centre them on a window of
         ``internal_window_length`` bp and optionally subsample or
         bootstrap them.
      3. Build the background set: random genomic regions or regions from a
         BED file, optionally matched to the C+G content of the targets.
      4. Scan target and background with ``ParallelFimoScanning``.
      5. Compute support, target/background ratio, Fisher p-value, q-value
         and central enrichment for each motif.
      6. Write the HTML report, the per-motif region files, the optional
         gene lists and, with ``--dump``, the intermediate data.

    The function never returns: it terminates the process through
    ``sys.exit`` (status 0 on success, 1 on a fatal problem).
    """
    print('\n[H A Y S T A C K   M O T I F S]')
    print('\n-MOTIF ENRICHMENT ANALYSIS- [Luca Pinello - [email protected]]\n')

    if which('fimo') is None:
        error('Haystack requires Fimo from the MEME suite free available at: http://meme.nbcr.net/meme/')
        sys.exit(1)

    ngram_correction='g'

    #mandatory
    parser = argparse.ArgumentParser(description='HAYSTACK Parameters')
    parser.add_argument('bed_target_filename', type=str,  help='A bed file containing the target coordinates on the genome of reference')
    parser.add_argument('genome_name', type=str,  help='Genome assembly to use from UCSC (for example hg19, mm9, etc.)')

    #optional
    parser.add_argument('--bed_bg_filename', type=str,  help="A bed file containing the backround coordinates on the genome of reference (default random sampled regions from the genome)", default='random_background')
    parser.add_argument('--meme_motifs_filename', type=str, help='Motifs database in MEME format (default JASPAR CORE 2014)')
    parser.add_argument('--nucleotide_bg_filename',type=str, help='Nucleotide probability for the background in MEME format (default precomupted on the Genome)')
    parser.add_argument('--p_value', type=float, help='FIMO p-value for calling a motif hit significant (deafult: 1e-4)',default=1e-4)
    parser.add_argument('--no_c_g_correction',  help='Disable the matching of the C+G density of the background',action='store_true')
    parser.add_argument('--c_g_bins', type=int,help='Number of bins for the C+G density correction (default: 8)',default=8)
    parser.add_argument('--mask_repetitive', help='Mask repetitive sequences',action='store_true')
    parser.add_argument('--n_target_coordinates', type=int, help='Number of target coordinates to use (default: all)',default=np.inf)
    parser.add_argument('--use_entire_bg', help='Use the entire background file (use only when the cg correction is disabled)',action='store_true')
    parser.add_argument('--bed_score_column', type=int, help='Column in the bedfile that represents the score (default: 5)',default=5)
    # The help text states the default that is actually applied (2).
    parser.add_argument('--bg_target_ratio', type=int, help='Background size/Target size ratio (default: 2)',default=2)
    parser.add_argument('--bootstrap',  help='Enable the bootstrap if the target set or the background set are too small, choices: True, False (default: False)',action='store_true')
    parser.add_argument('--temp_directory',  help='Directory to store temporary files  (default: /tmp)', default='/tmp')
    parser.add_argument('--no_random_sampling_target',  help='Select the best --n_target_coordinates using the score column from the target file instead of randomly select them',action='store_true')
    parser.add_argument('--name',  help='Define a custom output filename for the report', default='')
    parser.add_argument('--internal_window_length', type=int, help='Window length in bp for the enrichment (default: average lenght of the target sequences)')
    parser.add_argument('--window_length', type=int, help='Window length in bp for the profiler (default:internal_window_length*5)')
    parser.add_argument('--min_central_enrichment', type=float, help='Minimum central enrichment to report a motif (default:>1.0)',default=1.0)
    parser.add_argument('--disable_ratio',  help='Disable target/bg ratio filter',action='store_true')
    parser.add_argument('--dump', help='Dump all the intermediate data, choices: True, False (default: False)',action='store_true')
    parser.add_argument('--output_directory',type=str, help='Output directory (default: current directory)',default='')
    # The help text states the default that is actually applied (/5).
    parser.add_argument('--smooth_size',type=int, help='Size in bp for the smoothing window (default: internal_window_length/5)')
    parser.add_argument('--gene_annotations_filename',type=str, help='Optional gene annotations file from the UCSC Genome Browser in bed format to map each region to its closes gene')
    parser.add_argument('--gene_ids_to_names_filename',type=str, help='Optional mapping file between gene ids to gene names (relevant only if --gene_annotations_filename is used)')
    parser.add_argument('--n_processes',type=int, help='Specify the number of processes to use. The default is #cores available.',default=multiprocessing.cpu_count())
    parser.add_argument('--version',help='Print version and exit.',action='version', version='Version %.1f' % HAYSTACK_VERSION)

    args = parser.parse_args()

    # Unpack the parsed options explicitly: no exec() on repr() of the values,
    # and every local name used below is visible here.
    bed_target_filename=args.bed_target_filename
    genome_name=args.genome_name
    bed_bg_filename=args.bed_bg_filename
    meme_motifs_filename=args.meme_motifs_filename
    nucleotide_bg_filename=args.nucleotide_bg_filename
    p_value=args.p_value
    c_g_bins=args.c_g_bins
    mask_repetitive=args.mask_repetitive
    n_target_coordinates=args.n_target_coordinates
    use_entire_bg=args.use_entire_bg
    bg_target_ratio=args.bg_target_ratio
    bootstrap=args.bootstrap
    temp_directory=args.temp_directory
    name=args.name
    internal_window_length=args.internal_window_length
    window_length=args.window_length
    min_central_enrichment=args.min_central_enrichment
    disable_ratio=args.disable_ratio
    dump=args.dump
    output_directory=args.output_directory
    smooth_size=args.smooth_size
    gene_annotations_filename=args.gene_annotations_filename
    gene_ids_to_names_filename=args.gene_ids_to_names_filename
    n_processes=args.n_processes

    # The option is 1-based, the value handed to the BED loader is 0-based.
    bed_score_column=args.bed_score_column-1

    # Both switches are expressed negatively on the command line.
    c_g_correction=not args.no_c_g_correction
    random_sampling_target=not args.no_random_sampling_target

    check_file(bed_target_filename)

    if bed_bg_filename != 'random_background':
        check_file(bed_bg_filename)

    if meme_motifs_filename:
        check_file(meme_motifs_filename)
    else:
        meme_motifs_filename=os.path.join(determine_path('motif_databases'),'JASPAR_CORE_2014_vertebrates.meme')

    annotation_directory=determine_path('gene_annotations')
    if gene_annotations_filename:

        if which('java') is None:
            error('The mapping to the closest gene requires Java free available from: http://java.com/en/download/')
            use_gene_annotations=False
        else:
            check_file(gene_annotations_filename)
            info('Using %s as gene annotations file' % gene_annotations_filename)
            use_gene_annotations=True
    else:
        # Fall back to the annotation files stored for this genome, if both exist.
        gene_annotations_filename=os.path.join(annotation_directory,'%s_genes.bed' % genome_name)
        gene_ids_to_names_filename=os.path.join(annotation_directory,'%s_genes_id_to_names' % genome_name)

        if os.path.exists(gene_annotations_filename) and os.path.exists(gene_ids_to_names_filename):
            use_gene_annotations=True
        else:
            use_gene_annotations=False
            info('No gene annotations file specified')

    target_name=ntpath.basename(bed_target_filename.replace('.bed',''))
    bg_name=ntpath.basename(bed_bg_filename.replace('.bed',''))

    if name:
        directory_name='HAYSTACK_MOTIFS_on_'+name
    else:
        directory_name='HAYSTACK_on_'+target_name+'_VS_'+bg_name

    if output_directory:
        output_directory=os.path.join(output_directory, directory_name)
    else:
        output_directory=directory_name

    info('###PARAMETERS USED###\n\t\t -TARGET: %s \n\t\t -BACKGROUND: %s \n\t\t -BG_TARGET_RATIO: %s\n\t\t -C+G CORRECTION: %s\n\t\t -MASKING REPETITIVE: %s\n\t\t -COORDINATES TO ANALYZE: %s\n\t\t -OUTPUT DIRECTORY: %s\n'\
         %(bed_target_filename,bed_bg_filename,str(bg_target_ratio),str(c_g_correction),str(mask_repetitive),'ALL' if np.isinf(n_target_coordinates) else str(n_target_coordinates),output_directory))

    info('Initializing Genome:%s' %genome_name)

    genome_directory=determine_path('genomes')
    genome_2bit=os.path.join(genome_directory,genome_name+'.2bit')

    if os.path.exists(genome_2bit):
        genome=Genome_2bit(genome_2bit)
    else:
        info("\nIt seems you don't have the required genome file.")
        if query_yes_no('Should I download it for you?'):
            sb.call('download_genome %s' %genome_name,shell=True,env=system_env)
            if os.path.exists(genome_2bit):
                info('Genome correctly downloaded!')
                genome=Genome_2bit(genome_2bit)
            else:
                error('Sorry I cannot download the required file for you. Check your Internet connection.')
                sys.exit(1)
        else:
            error('Sorry I need the genome file to perform the analysis. Exiting...')
            sys.exit(1)

    if not nucleotide_bg_filename:
        nucleotide_bg_filename=os.path.join(genome_directory,genome_name+'_meme_bg')

    check_file(nucleotide_bg_filename)

    COMMAND_USED=' '.join(sys.argv)

    info('Loading Target coordinates from bed:%s' % bed_target_filename)
    target_coords=Coordinate.bed_to_coordinates(bed_target_filename,cl_score=bed_score_column)

    if len(target_coords) == 0:
        info('No coordinates to analyze in your input file. Exiting.')
        sys.exit(1)

    # The internal window is either user defined or the average length of the
    # target regions; in both cases it is forced to an even number of bp.
    if internal_window_length:
        info('Using the user defined internal window length:%d' % internal_window_length )
        if internal_window_length % 2:
            internal_window_length+=1
    else:
        internal_window_length=int(np.mean([len(c) for c in target_coords]))
        if internal_window_length % 2:
            internal_window_length+=1
        info('Using the average length of target coordinates as internal window length:%d' % internal_window_length )

    # Derive the profiler window whenever it was not given, including the case
    # of a user defined internal window (otherwise '%d' % None fails below).
    if not window_length:
        window_length=internal_window_length*5

    info('Total window length:%d' % window_length )

    if not smooth_size:
        smooth_size=internal_window_length//5

    target_coords=Coordinate.coordinates_of_intervals_around_center(target_coords,internal_window_length)

    if len(target_coords)>n_target_coordinates:
        if random_sampling_target:
            info('Sampling %d coordinates among the %d total' %( n_target_coordinates,len(target_coords)))
            target_coords=random.sample(target_coords,n_target_coordinates)
        else:
            info('Selecting the best %d coordinates among the %d total' %( n_target_coordinates,len(target_coords)))
            sorted_idxs_by_score=np.argsort([c.score for c in target_coords])[::-1]
            target_coords=[target_coords[idx] for idx in sorted_idxs_by_score[:n_target_coordinates]]
    else:
        if random_sampling_target and bootstrap and not np.isinf(n_target_coordinates):
            warn('Number of target regions < %d' % n_target_coordinates)
            info('bootstrapping to obtain enough target regions')
            target_coords=sample_wr(target_coords,n_target_coordinates)
        else:
            info('Using all the %d target coordinates' % len(target_coords))

    info('Extracting Motifs in target coordinates')
    (positive_matrix,
     motifs_profiles_in_sequences,
     idxs_seqs_with_motif,
     motif_coords_in_seqs_with_motif,
     motif_names,
     motif_ids)=ParallelFimoScanning(target_coords,
                                     meme_motifs_filename,
                                     genome,nucleotide_bg_filename,
                                     temp_directory=temp_directory,
                                     p_value=p_value,
                                     mask_repetitive=mask_repetitive,
                                     window_length=window_length,
                                     internal_window_length=internal_window_length,
                                     num_consumers=n_processes)
    n_target_coordinates=len(target_coords) #fix for the bootstrap!

    if bed_bg_filename == 'random_background':
        info('Extracting Random Coordinates from the genome...')

        if c_g_correction:
            info('Calculating the C+G content of the target coordinates')
            bg_coords=[]
            c_g_content_target=calculate_average_ngram_presence(target_coords,genome,ngram_correction)

            info('Extract a Matching C+G Background')
            bins=np.hstack((np.linspace(0,1,c_g_bins),np.inf))

            for _ in range(bg_target_ratio):
                for idx_c,c in enumerate(target_coords):
                    c_bin=np.nonzero(np.histogram(c_g_content_target[idx_c],bins)[0])[0][0]
                    c_random_bin=-1

                    # Redraw a random region of the same length on the same
                    # chromosome until it lands in the C+G bin of the target.
                    while c_random_bin != c_bin:
                        random_bpstart=np.random.randint(1,genome.chr_len[c.chr_id]-len(c)+1)
                        c_random=Coordinate(c.chr_id,random_bpstart,random_bpstart+len(c)-1)
                        seq=genome.extract_sequence(c_random)
                        c_g_content_c_random=(seq.count('c')+seq.count('g'))/float(len(c))
                        c_random_bin=np.nonzero(np.histogram(c_g_content_c_random,bins)[0])[0][0]

                    bg_coords.append(c_random)

            c_g_content_bg=calculate_average_ngram_presence(bg_coords,genome,ngram_correction)
            debug('original: '+str(np.histogram(c_g_content_target,bins)[0]))
            debug('obtained:'+str(np.histogram(c_g_content_bg,bins)[0]))

        else:
            bg_coords=get_random_coordinates(target_coords,genome)

        info('Done!')

    else:
        info('Loading Background Coordinates from:%s' % bed_bg_filename)
        bg_coords=Coordinate.bed_to_coordinates(bed_bg_filename)
        bg_coords=Coordinate.coordinates_of_intervals_around_center(bg_coords,internal_window_length)

        if use_entire_bg:
            bg_target_ratio=float(len(bg_coords))/n_target_coordinates
            info('Using all the coordinates in the BG, BG/TG:%f', bg_target_ratio)

        if c_g_correction:
            info('Calculating the C+G content')
            c_g_content_target=calculate_average_ngram_presence(target_coords,genome,ngram_correction)
            c_g_content_bg=calculate_average_ngram_presence(bg_coords,genome,ngram_correction)

            info('Extract a Matching C+G Background')
            bins=np.hstack((np.linspace(0,1,c_g_bins),np.inf))
            target_hist=np.histogram(c_g_content_target,bins)[0]
            bg_hist=np.histogram(c_g_content_bg,bins)[0]
            ratios=bg_hist/(target_hist*1.0)
            debug('original:%s' %target_hist)
            debug('bg:%s' %bg_hist)
            debug('ratios:%s' %ratios)
            # Largest multiple of the target histogram the background can
            # provide, judged on bins holding more than 5% of the targets and
            # capped at the requested ratio.
            K_MATCH=min(bg_target_ratio,ratios[~np.isnan(ratios) & ~np.isinf(ratios) & (ratios>0) &(target_hist/float(target_hist.sum())>0.05)].min())

            debug('K_MATCH:%d' %K_MATCH)

            to_match=np.int32(np.floor(K_MATCH*target_hist))

            debug('to_match:%s' %to_match)

            idxs_corrected_bg=np.array([],dtype=int)

            for idx_bin in range(len(bins)-1):
                idxs_matching_regions=np.nonzero((c_g_content_bg>=bins[idx_bin]) & (c_g_content_bg<bins[idx_bin+1]))[0]
                # Random subset of this bin, at most to_match[idx_bin] regions.
                to_take=np.random.permutation(len(idxs_matching_regions))
                to_take=to_take[:min(len(idxs_matching_regions),to_match[idx_bin])]
                idxs_corrected_bg= np.hstack((idxs_corrected_bg,idxs_matching_regions[to_take]))

            debug('original:%s' %target_hist)
            debug('K:%d' %K_MATCH)
            debug('to sample:%s' %to_match)
            debug( 'obtained:%s' % np.histogram(c_g_content_bg[idxs_corrected_bg],bins)[0] )
            bg_coords=[bg_coords[idx] for idx in idxs_corrected_bg]
            c_g_content_bg=calculate_average_ngram_presence(bg_coords,genome,ngram_correction)
            debug(np.histogram(c_g_content_bg,bins)[0])
            if np.array_equal(K_MATCH*target_hist,np.histogram(c_g_content_bg,bins)[0]):
                info('C+G content perfectly matched!\n\ttarget:%s\n\tbg    :%s' % (target_hist,np.histogram(c_g_content_bg,bins)[0]))
            else:
                warn('C+G content not perfectly matched\n\ttarget:%s\n\tbg    :%s'%(target_hist,np.histogram(c_g_content_bg,bins)[0]))

            debug(target_hist/np.histogram(c_g_content_bg,bins)[0])

    if len(bg_coords)>=bg_target_ratio*n_target_coordinates:
        bg_coords=random.sample(bg_coords,int(bg_target_ratio*n_target_coordinates))
    else:
        if bootstrap and len(bg_coords)<(bg_target_ratio*n_target_coordinates*0.95): #allow a small tollerance!
            info('bootstrapping to obtain enough background regions')
            bg_coords=sample_wr(bg_coords,int(bg_target_ratio*n_target_coordinates))
            # The histogram comparison needs the bins and the target C+G
            # content, which only exist when the C+G correction ran.
            if c_g_correction:
                c_g_content_bg=calculate_average_ngram_presence(bg_coords,genome,ngram_correction)
                debug('After bootstrap:\n\ttarget:%s\n\tbg    :%s' % (np.histogram(c_g_content_target,bins)[0],np.histogram(c_g_content_bg,bins)[0]))

    info('Extracting Motifs in background coordinates')
    negative_matrix,motifs_profiles_in_bg,idxs_seqs_with_motif_bg=ParallelFimoScanning(bg_coords,
                                                                                       meme_motifs_filename,
                                                                                       genome,nucleotide_bg_filename,
                                                                                       temp_directory=temp_directory,
                                                                                       p_value=p_value,
                                                                                       mask_repetitive=mask_repetitive,
                                                                                       window_length=window_length,
                                                                                       internal_window_length=internal_window_length,
                                                                                       num_consumers=n_processes)[0:3]

    #allocate data for reports
    N_MOTIFS=len(motif_ids)
    fisher_p_values=np.zeros(N_MOTIFS)
    central_enrichment=np.zeros(N_MOTIFS)

    N_seq_p=positive_matrix.shape[0]
    N_seq_n=negative_matrix.shape[0]

    # Number of sequences with at least one hit, per motif.
    profile_presence_p=(positive_matrix>0).sum(0)
    profile_presence_n=(negative_matrix>0).sum(0)

    support_p=profile_presence_p/float(N_seq_p)
    support_n=profile_presence_n/float(N_seq_n)

    # Boundaries of the internal window inside the profiler window.
    internal_bpstart=window_length//2-internal_window_length//2
    internal_bpend=window_length//2+internal_window_length//2

    for idx,motif_id in enumerate(motif_ids):
        fisher_p_values[idx]= stats.fisher_exact([[ profile_presence_p[idx], N_seq_p-profile_presence_p[idx]], [ profile_presence_n[idx], N_seq_n-profile_presence_n[idx]]])[1]
        # Mean profile inside the internal window over the mean of the flanks.
        central_enrichment[idx]=motifs_profiles_in_sequences[motif_id][internal_bpstart:internal_bpend].mean()/ np.hstack([motifs_profiles_in_sequences[motif_id][:internal_bpstart],motifs_profiles_in_sequences[motif_id][internal_bpend:]]).mean()

    # The 0.01 pseudo-count avoids a division by zero for motifs absent
    # from the background.
    motif_ratios=(support_p+0.01)/(support_n+0.01)

    #Foundamental!
    # Motifs found in fewer than 3% of the targets get a neutral ratio.
    if not disable_ratio:
        motif_ratios[support_p<0.03]=1

    rankings=stats.rankdata(-motif_ratios)

    #filter here positive or positive and negative#################################
    if not disable_ratio:
        idxs_to_keep=np.nonzero(motif_ratios>1)[0]
    else:
        idxs_to_keep=range(len(motif_ratios))

    rankings=rankings[idxs_to_keep]
    motif_ratios=motif_ratios[idxs_to_keep]
    support_p=support_p[idxs_to_keep]
    support_n=support_n[idxs_to_keep]
    fisher_p_values=fisher_p_values[idxs_to_keep]
    central_enrichment=central_enrichment[idxs_to_keep]

    motif_ids=[motif_ids[_] for _ in idxs_to_keep]
    motif_names=[motif_names[_] for _ in idxs_to_keep]
    motif_idxs=list(idxs_to_keep)

    try:
        qvalues=estimate_qvalues(fisher_p_values) # we test the ones only with ratio >1
    except Exception:
        # Keep the run going, but make the failure explicit and leave qvalues
        # defined so the report loop below cannot hit an unbound name.
        error('Unable to estimate the q-values for the p-values: %s' % fisher_p_values)
        qvalues=np.ones(len(fisher_p_values))*np.nan
    ################################################################################

    #generate reports in html
    info('Generating HTML report...')
    imgs_directory=os.path.join(output_directory,'images')
    genes_list_directory=os.path.join(output_directory,'genes_lists')
    motif_regions_directory=os.path.join(output_directory,'motifs_regions')

    #create folders
    if not os.path.exists(imgs_directory):
        os.makedirs(imgs_directory)
    if use_gene_annotations and not os.path.exists(genes_list_directory):
        os.makedirs(genes_list_directory)
    if not os.path.exists(motif_regions_directory):
        os.makedirs(motif_regions_directory)

    templates_directory=determine_path('extra')+'/templates/'
    j2_env = Environment(loader=FileSystemLoader(templates_directory),trim_blocks=True)

    info('DIRECTORY:%s' % templates_directory)
    template= j2_env.get_template('report_template.html')

    #copy haystack logo and bg
    shutil.copyfile(templates_directory+'haystack_logo.png', os.path.join(imgs_directory,'haystack_logo.png'))
    shutil.copyfile(templates_directory+'noise.png', os.path.join(imgs_directory,'noise.png'))

    motifs_dump=[]
    for i in np.argsort(rankings):
        if (support_p[i]>=0.03 or disable_ratio)  and fisher_p_values[i]<0.01  and  (motif_ratios[i]>1 or disable_ratio) and central_enrichment[i]>min_central_enrichment:

            current_motif_id=motif_ids[i]
            info('Generating logo and profile for:'+current_motif_id)

            #create motif logo (default format, then pdf)
            img_logo=os.path.join(imgs_directory,'logo_'+current_motif_id)
            generate_weblogo(current_motif_id,meme_motifs_filename,img_logo,title=current_motif_id,SEQLOGO=determine_path('extra')+'/seqlogo')
            generate_weblogo(current_motif_id,meme_motifs_filename,img_logo,title=current_motif_id,SEQLOGO=determine_path('extra')+'/seqlogo',file_format='pdf')
            #fix the weblogo prefix problem
            img_logo_url=os.path.join('images','logo_'+current_motif_id+'.png')

            #create motif enrichment profile
            img_profile=os.path.join(imgs_directory,'profile_'+current_motif_id+'.png')
            motif_profile_target=motifs_profiles_in_sequences[current_motif_id]/N_seq_p
            motif_profile_bg=motifs_profiles_in_bg[current_motif_id]/N_seq_n

            generate_motif_profile(motif_profile_target,motif_profile_bg,current_motif_id,img_profile,smooth_size=smooth_size,window_size=window_length)
            img_profile_url=os.path.join('images','profile_'+current_motif_id+'.png')

            #create regions
            info('Extracting regions with:'+current_motif_id)
            regions=os.path.join(motif_regions_directory,current_motif_id+'_motif_region_in_target.bed')
            with open(regions,'w+') as outfile:
                outfile.write('Chromosome\tStart\tEnd\tMotif hits inside region\tNumber of hits\n')
                for c,locations in motif_coords_in_seqs_with_motif[current_motif_id].items():
                    outfile.write('\t'.join([c.chr_id,str(c.bpstart),str(c.bpend),';'.join(['-'.join(map(str,map(int,l))) for l in locations]),str(len(locations))])+'\n')
            regions_url=os.path.join('motifs_regions',current_motif_id+'_motif_region_in_target.bed')

            #map closest downstream genes
            genes_url=None
            if use_gene_annotations:
                info('Mapping regions with:%s to the clostest genes' % current_motif_id)

                peak_annotator_path=os.path.join(determine_path('extra/'),'PeakAnnotator.jar')

                # NOTE(review): the two gene_* filenames are converted again
                # on every iteration; confirm cygwin_path is idempotent.
                if CURRENT_PLATFORM=='CYGWIN':
                    peak_annotator_path=cygwin_path(peak_annotator_path)
                    gene_ids_to_names_filename=cygwin_path(gene_ids_to_names_filename)
                    regions=cygwin_path(regions)
                    gene_annotations_filename=cygwin_path(gene_annotations_filename)

                peak_annotator_log=os.path.join(genes_list_directory,'log_peakannotator.txt')

                # '> file 2>&1' is the portable spelling of bash's '&>': with
                # shell=True the command runs under /bin/sh, where '&>' may be
                # parsed as "run in background, then truncate file".
                if gene_ids_to_names_filename:
                    sb.call('java -jar '+peak_annotator_path+' -u TSS -p %s -a %s -s %s -o %s > %s 2>&1' \
                            %(regions,gene_annotations_filename,gene_ids_to_names_filename,genes_list_directory,peak_annotator_log),  shell=True,env=system_env)
                else:
                    sb.call('java -jar '+peak_annotator_path+' -u TSS -p %s -a %s  -o %s > %s 2>&1' \
                            %(regions,gene_annotations_filename,genes_list_directory,peak_annotator_log),  shell=True,env=system_env)

                genes_url=os.path.join('genes_lists',current_motif_id+'_motif_region_in_target.tss.bed')

            motifs_dump.append({'id':current_motif_id,'name':motif_names[i],'support_p':support_p[i]*100,
                                 'support_n':support_n[i]*100, 'ratio':motif_ratios[i],'rank':float(rankings[i]),
                                 'pvalue':fisher_p_values[i],'qvalue':qvalues[i],'central_enrichment':central_enrichment[i],
                                 'img_logo':img_logo_url,'img_profile':img_profile_url,'regions':regions_url,'genes':genes_url,'idx_motif':motif_idxs[i]})

    # The context manager closes the report even if rendering fails.
    with codecs.open(os.path.join(output_directory,"Haystack_report.html"), "w", "utf-8") as report_file:
        report_file.write(template.render(motifs_dump=motifs_dump,bed_target_filename=bed_target_filename,bed_bg_filename=bed_bg_filename, N_TARGET=N_seq_p, N_BG=N_seq_n,\
                                          meme_motifs_filename=meme_motifs_filename, COMMAND_USED=COMMAND_USED,use_gene_annotations=use_gene_annotations))

    if dump:
        info('Saving all the intermediate data on: %s ...' % output_directory)
        dump_directory=os.path.join(output_directory,'dump')

        if not os.path.exists(dump_directory):
            os.makedirs(dump_directory)

        np.save(os.path.join(dump_directory,'matrix_'+target_name),positive_matrix)
        np.save(os.path.join(dump_directory,'matrix_BG_'+target_name),negative_matrix)

        # Each pickle is written through a context manager so the file handle
        # is flushed and closed deterministically.
        pickles_to_write=[(motifs_dump,target_name+'_motif_dumps.pickle'),
                          (idxs_seqs_with_motif,target_name+'_motif_seqs_idxs.pickle'),
                          (idxs_seqs_with_motif_bg,bg_name+'_motif_seqs_idxs.pickle'),
                          (motif_coords_in_seqs_with_motif,target_name+'_motif_coords_in_seqs_with_motif.pickle')]
        for payload,pickle_name in pickles_to_write:
            with open(os.path.join(dump_directory,pickle_name),'w') as pickle_file:
                cp.dump(payload,pickle_file)

        Coordinate.coordinates_to_bed(target_coords,os.path.join(dump_directory,'Target_coordinates_selected_on_'+target_name+'.bed'),minimal_format=False)
        Coordinate.coordinates_to_bed(bg_coords,os.path.join(dump_directory,'BG_coordinates_selected_on_'+ bg_name+'.bed'),minimal_format=True)

    info('All done! Ciao!')
    sys.exit(0)

Пример #6

Показать файл

Файл: haystack_motifs.py Проект: a1aks/Haystack

def ParallelFimoScanning(target_coords,meme_motifs_filename,genome,nucleotide_bg_filename,temp_directory='/tmp',p_value=1e-4,num_consumers=multiprocessing.cpu_count(),mask_repetitive=False,window_length=None,internal_window_length=None):
    """Scan the sequences of ``target_coords`` with FIMO using worker processes.

    One ``FimoOnSingleSequence`` task is queued per coordinate and consumed by
    ``num_consumers`` ``FimoSequencesConsumer`` processes; the hits read back
    from the result queue are folded into per-motif summaries.

    Parameters:
        target_coords: sequence of coordinates to scan.
        meme_motifs_filename, nucleotide_bg_filename, temp_directory, p_value:
            forwarded unchanged to ``Fimo``.
        genome: object whose ``extract_sequence(coordinate, mask_repetitive=...)``
            provides the sequence handed to FIMO.
        num_consumers: number of consumer processes to start.
        mask_repetitive: forwarded to ``genome.extract_sequence``.
        window_length: when truthy, the coordinates are replaced by the result
            of ``Coordinate.coordinates_of_intervals_around_center(target_coords,
            window_length)`` before scanning.
        internal_window_length: width of the central part of the window in
            which a hit must fall to be counted; must be given whenever
            ``window_length`` is given.

    Returns:
        A 6-tuple ``(matrix, profiles, seq_idxs, hit_coords, motif_names,
        motif_ids)`` where ``matrix`` is a (n_coordinates x n_motifs) array
        holding 1 where a motif hit the internal window, ``profiles`` maps
        motif id -> per-position hit counts, ``seq_idxs`` maps motif id -> set
        of sequence indexes with an internal hit, ``hit_coords`` maps motif id
        -> {original coordinate: [(start, end), ...]}, and the last two items
        are ``fimo.motif_names`` and ``fimo.motif_ids``.
    """
    # Establish communication queues
    tasks = multiprocessing.Queue()
    results = multiprocessing.Queue()

    # Start consumers
    debug('Creating %d fimo consumers' % num_consumers)
    consumers = [FimoSequencesConsumer(tasks, results)
                 for _ in range(num_consumers)]
    for consumer in consumers:
        consumer.start()

    # Initialize Fimo
    info('Initiliaze Fimo and load motifs')
    fimo = Fimo(meme_motifs_filename, nucleotide_bg_filename,
                temp_directory=temp_directory, p_value=p_value)

    # The coordinates as supplied by the caller are used as keys of the
    # reported hits, even when the scan itself runs on extended windows.
    original_target_coords = target_coords

    if window_length:
        # Floor division keeps the boundaries integral regardless of whether
        # true division is in effect.
        internal_bpstart = window_length // 2 - internal_window_length // 2
        internal_bpend = window_length // 2 + internal_window_length // 2
        target_coords = Coordinate.coordinates_of_intervals_around_center(target_coords, window_length)
    else:
        # No flanking window requested: the whole scanned sequence counts as
        # the internal region. (Previously these names were left unbound and
        # the first hit raised a NameError.)
        internal_bpstart = 0
        internal_bpend = float('inf')

    # Enqueue jobs
    num_jobs = len(target_coords)
    for idx, c in enumerate(target_coords):
        seq = genome.extract_sequence(c, mask_repetitive=mask_repetitive)
        tasks.put(FimoOnSingleSequence(seq, fimo, idx))

    # Add a poison pill for each consumer
    for _ in range(num_consumers):
        tasks.put(None)

    # Profiles are as long as the last scanned coordinate (same value the
    # original obtained through the leaked loop variable); an empty input
    # yields empty profiles instead of a NameError.
    profile_length = len(target_coords[-1]) if num_jobs else 0

    motifs_profiles_in_sequences = dict()
    idxs_seqs_with_motif = dict()
    motif_coords_in_seqs_with_motif = dict()

    for motif_id in fimo.motif_ids:
        motifs_profiles_in_sequences[motif_id] = np.zeros(profile_length)
        idxs_seqs_with_motif[motif_id] = set()
        motif_coords_in_seqs_with_motif[motif_id] = pickable_defaultdict()

    motifs_in_sequences_matrix = np.zeros((num_jobs, len(fimo.motif_ids)))

    # Build the final matrix: exactly one result is expected per queued job.
    for _ in range(num_jobs):
        idx_seq, row = results.get()
        for motif in row:
            motif_id = motif['id']
            motifs_profiles_in_sequences[motif_id][motif['start']:motif['end']] += 1.0

            # keep track only if is in the internal window!
            if motif['start'] >= internal_bpstart and motif['end'] <= internal_bpend:
                idxs_seqs_with_motif[motif_id].add(idx_seq)
                # Presence flag: the cell is set to 1, it is not a hit counter.
                motifs_in_sequences_matrix[idx_seq, fimo.motif_id_to_index[motif_id]] = 1

                # Translate window-relative hit positions using the start of
                # the scanned coordinate.
                offset = target_coords[idx_seq].bpstart - 1
                motif_coords_in_seqs_with_motif[motif_id][original_target_coords[idx_seq]].append(
                    (int(motif['start'] + offset), int(motif['end'] + offset)))

    return motifs_in_sequences_matrix,motifs_profiles_in_sequences,idxs_seqs_with_motif,motif_coords_in_seqs_with_motif,fimo.motif_names, fimo.motif_ids

Пример #7

Показать файл

Файл: annotation_to_genomic_regions.py Проект: lucapinello/bioutilities

'''
To obtain the intergenic regions use hg19.genome (chr lengths) and then with bedtools
subtractBed -a hg19.bed -b coordinates_gene_hg19.bed > intergenic_regions_hg19.bed


'''

from bioutilities import Coordinate, Gene
# Annotation table from which every BED file below is derived.
gene_annotation_file = 'RefSeqhg19.txt'

# Gene list loaded with exon/intron information (header_lines=1 is passed
# through to the loader).
gl = Gene.load_from_annotation(gene_annotation_file,
                               header_lines=1,
                               load_exons_introns_info=True)

# Each feature set is handed straight to the BED writer, so no intermediate
# list stays bound to a module-level name after its export.
Coordinate.coordinates_to_bed(
    Gene.exons_from_annotations(gene_annotation_file),
    'hg_19_exons.bed',
    minimal_format=True)

Coordinate.coordinates_to_bed(
    Gene.introns_from_annotations(gene_annotation_file),
    'hg_19_introns.bed',
    minimal_format=True)

Coordinate.coordinates_to_bed(
    Gene.genes_coordinates_from_annotations(gene_annotation_file),
    'genes_coordinates_hg_19.bed',
    minimal_format=True)

Пример #8

Показать файл

def main(input_args=None):
    # Command-line entry point of the motif enrichment analysis.
    #
    # Steps, in order:
    #   1. parse the arguments and turn them into local variables;
    #   2. load the target coordinates and resize them around their centers;
    #   3. build (random) or load (BED file) the background coordinates,
    #      optionally matching the C+G content of the target;
    #   4. scan target and background with parallel_fimo_scanning;
    #   5. compute per-motif support, Fisher p-values, q-values and central
    #      enrichment;
    #   6. write the HTML report, per-motif region files and, when requested,
    #      the gene lists and a dump of the intermediate data.
    #
    # input_args: argument list handed to the parser's parse_args().
    print '\n[H A Y S T A C K   M O T I F S]'
    print(
        '\n-MOTIF ENRICHMENT ANALYSIS- [Luca Pinello - [email protected]]\n'
    )
    print 'Version %s\n' % HAYSTACK_VERSION

    # Hard-coded switches: bootstrap is never enabled in this function, and
    # ngram_correction is forwarded to calculate_average_ngram_presence.
    bootstrap = False
    ngram_correction = 'g'

    parser = get_args_motif()
    args = parser.parse_args(input_args)

    # One process less than requested is used, but never fewer than one.
    args.n_processes = max(1, args.n_processes - 1)

    # Every parsed option except n_target_coordinates is turned into a local
    # variable through exec; names such as bed_target_filename, window_length,
    # smooth_size, dump, ... used below come from here.
    # NOTE(review): creating function locals via exec only works with the
    # Python 2 exec statement semantics.
    args_dict = vars(args)
    for key, value in args_dict.items():
        if key == 'n_target_coordinates':
            n_target_coordinates = value
        else:
            exec('%s=%s' % (key, repr(value)))

    # presumably converts a 1-based column number into a 0-based index - TODO confirm
    bed_score_column -= 1

    # The command line exposes negative flags; derive the positive ones.
    if no_c_g_correction:
        c_g_correction = False
    else:
        c_g_correction = True

    if no_random_sampling_target:
        random_sampling_target = False
    else:
        random_sampling_target = True

    check_file(bed_target_filename)

    # 'random_background' is a sentinel value, not a file name.
    if not bed_bg_filename == 'random_background':
        check_file(bed_bg_filename)

    # Fall back to the bundled JASPAR database when no motif file is given.
    if meme_motifs_filename:
        check_file(meme_motifs_filename)
    else:
        meme_motifs_filename = os.path.join(
            determine_path('motif_databases'),
            'JASPAR_CORE_2016_vertebrates.meme')

    annotation_directory = determine_path('gene_annotations')
    if gene_annotations_filename:

        # The gene mapping step below shells out to a Java program.
        if which('java') is None:
            error(
                'The mapping to the closest gene requires Java free available from: http://java.com/en/download/'
            )
            use_gene_annotations = False
        else:
            check_file(gene_annotations_filename)
            info('Using %s as gene annotations file' %
                 gene_annotations_filename)
            use_gene_annotations = True
    else:
        # No annotation file given: look for the default files of this genome.
        gene_annotations_filename = os.path.join(annotation_directory,
                                                 '%s_genes.bed' % genome_name)
        gene_ids_to_names_filename = os.path.join(
            annotation_directory, '%s_genes_id_to_names' % genome_name)

        if os.path.exists(gene_annotations_filename) and os.path.exists(
                gene_ids_to_names_filename):
            use_gene_annotations = True
        else:
            use_gene_annotations = False
            info('No gene annotations file specified')

    genome, _, nucleotide_bg_filename = initialize_genome(genome_name)

    # Names derived from the input files; used for the output folder and the
    # dump file names.
    target_name = ntpath.basename(bed_target_filename.replace('.bed', ''))

    bg_name = ntpath.basename(bed_bg_filename.replace('.bed', ''))
    # timestamp=(datetime.datetime.now().isoformat()[:-3].replace('T','(')+str(np.random.randint(10000))+')').replace(':','.')

    if name:
        directory_name = 'HAYSTACK_MOTIFS_on_' + name
    else:
        directory_name = 'HAYSTACK_on_' + target_name + '_VS_' + bg_name

    if output_directory:
        output_directory = os.path.join(output_directory, directory_name)
    else:
        output_directory = directory_name

    info(
        '###PARAMETERS USED###\n\t\t -TARGET: %s \n\t\t -BACKGROUND: %s \n\t\t -BG_TARGET_RATIO: %s\n\t\t -C+G CORRECTION: %s\n\t\t -MASKING REPETITIVE: %s\n\t\t -COORDINATES TO ANALYZE: %s\n\t\t -OUTPUT DIRECTORY: %s\n' \
        % (bed_target_filename, bed_bg_filename, str(bg_target_ratio), str(c_g_correction), str(mask_repetitive),
           'ALL' if np.isinf(n_target_coordinates) else str(n_target_coordinates), output_directory))

    # NOTE(review): N_TARGET, N_BG and _n_target_coordinates are assigned but
    # not read again in this function.
    N_TARGET = None
    N_BG = None
    COMMAND_USED = ' '.join(sys.argv)

    _n_target_coordinates = n_target_coordinates

    info('Loading Target coordinates from bed:%s' % bed_target_filename)
    target_coords = Coordinate.bed_to_coordinates(bed_target_filename,
                                                  cl_score=bed_score_column)

    if len(target_coords) == 0:
        info('No coordinates to analyze in your input file. Exiting.')
        sys.exit(1)

    # Calculate automatically the average length of the target regions when
    # the user did not provide an internal window length. In both cases the
    # length is bumped to the next even number.

    if internal_window_length:
        info('Using the user defined internal window length:%d' %
             internal_window_length)
        if internal_window_length % 2:
            internal_window_length += 1

    else:

        internal_window_length = int(np.mean(map(len, target_coords)))
        if internal_window_length % 2:
            internal_window_length += 1
        info(
            'Using the average length of target coordinates as internal window length:%d'
            % internal_window_length)

        # NOTE(review): the window_length default is only applied in this
        # branch; if internal_window_length is user-defined and window_length
        # is empty, the '%d' formatting below would fail - confirm whether
        # this block was meant to be one level up.
        if not window_length:
            window_length = internal_window_length * 5

    info('Total window length:%d' % window_length)

    if not smooth_size:
        smooth_size = internal_window_length / 5

    # From here on every target coordinate is replaced by the interval
    # returned for internal_window_length.
    target_coords = Coordinate.coordinates_of_intervals_around_center(
        target_coords, internal_window_length)

    # Reduce the target set when it has more entries than requested: either a
    # random sample or the entries with the highest score.
    if len(target_coords) > n_target_coordinates:
        if random_sampling_target:
            info('Sampling %d coordinates among the %d total' %
                 (n_target_coordinates, len(target_coords)))
            target_coords = random.sample(target_coords, n_target_coordinates)
        else:
            info('Selecting the best %d coordinates among the %d total' %
                 (n_target_coordinates, len(target_coords)))
            sorted_idxs_by_score = np.argsort([c.score
                                               for c in target_coords])[::-1]
            target_coords = [
                target_coords[idx]
                for idx in sorted_idxs_by_score[:n_target_coordinates]
            ]
    else:

        # bootstrap is hard-coded to False above, so the sampling with
        # replacement is never executed here.
        if random_sampling_target and bootstrap and not np.isinf(
                n_target_coordinates):
            warn('Number of target regions < %d' % n_target_coordinates)
            info('bootstrapping to obtain enough target regions')
            target_coords = sample_wr(target_coords, n_target_coordinates)
        else:
            info('Using all the %d target coordinates' % len(target_coords))

    info('Extracting Motifs in target coordinates')
    positive_matrix, motifs_profiles_in_sequences, idxs_seqs_with_motif, motif_coords_in_seqs_with_motif, motif_names, motif_ids = parallel_fimo_scanning(
        target_coords,
        meme_motifs_filename,
        genome,
        nucleotide_bg_filename,
        temp_directory=temp_directory,
        p_value=p_value,
        mask_repetitive=mask_repetitive,
        window_length=window_length,
        internal_window_length=internal_window_length,
        num_consumers=n_processes)
    n_target_coordinates = len(target_coords)  # fix for the bootstrap!

    if bed_bg_filename == 'random_background':
        info('Extracting Random Coordinates from the genome...')

        if c_g_correction:
            info('Calculating the C+G content of the target coordinates')
            bg_coords = []
            c_g_content_target = calculate_average_ngram_presence(
                target_coords, genome, ngram_correction)

            info('Extract a Matching C+G Background')
            # c_g_bins edges evenly spaced on [0, 1] plus a final +inf edge.
            bins = np.hstack((np.linspace(0, 1, c_g_bins), np.inf))

            # For each target region draw random regions of the same length on
            # the same chromosome until one falls in the same C+G bin; repeat
            # the whole pass bg_target_ratio times.
            for _ in range(bg_target_ratio):
                for idx_c, c in enumerate(target_coords):
                    c_bin = np.nonzero(
                        np.histogram(c_g_content_target[idx_c], bins)[0])[0][0]
                    c_random_bin = -1

                    while c_random_bin != c_bin:
                        random_bpstart = np.random.randint(
                            1, genome.chr_len[c.chr_id] - len(c) + 1)
                        c_random = Coordinate(c.chr_id, random_bpstart,
                                              random_bpstart + len(c) - 1)
                        seq = genome.extract_sequence(c_random)
                        # Only lowercase 'c'/'g' are counted - assumes the
                        # extracted sequence is lowercase, TODO confirm.
                        c_g_content_c_random = (seq.count('c') +
                                                seq.count('g')) / float(len(c))
                        c_random_bin = np.nonzero(
                            np.histogram(c_g_content_c_random, bins)[0])[0][0]

                    # print bg_target_ratio,c_bin,c_random_bin, ' still to match:',len(target_coords)-idx_c
                    bg_coords.append(c_random)

            c_g_content_bg = calculate_average_ngram_presence(
                bg_coords, genome, ngram_correction)
            bg_hist = np.histogram(c_g_content_bg, bins)[0]
            debug('original: ' +
                  str(np.histogram(c_g_content_target, bins)[0]))
            debug('obtained:' + str(np.histogram(c_g_content_bg, bins)[0]))

        else:
            bg_coords = get_random_coordinates(target_coords, genome)

        info('Done!')

    else:
        info('Loading Background Coordinates from:%s' % bed_bg_filename)
        bg_coords = Coordinate.bed_to_coordinates(bed_bg_filename)
        bg_coords = Coordinate.coordinates_of_intervals_around_center(
            bg_coords, internal_window_length)

        if use_entire_bg:
            bg_target_ratio = float(len(bg_coords)) / n_target_coordinates
            # NOTE(review): unlike the other calls, the value is passed as a
            # second argument instead of being %-formatted; this only works if
            # info() formats its arguments logging-style - verify.
            info('Using all the coordinates in the BG, BG/TG:%f',
                 bg_target_ratio)

        if c_g_correction:
            info('Calculating the C+G content')
            c_g_content_target = calculate_average_ngram_presence(
                target_coords, genome, ngram_correction)
            c_g_content_bg = calculate_average_ngram_presence(
                bg_coords, genome, ngram_correction)

            info('Extract a Matching C+G Background')
            bins = np.hstack((np.linspace(0, 1, c_g_bins), np.inf))
            target_hist = np.histogram(c_g_content_target, bins)[0]
            bg_hist = np.histogram(c_g_content_bg, bins)[0]
            # Per-bin background/target ratio; empty target bins give inf/nan,
            # which are filtered out just below.
            ratios = bg_hist / (target_hist * 1.0)
            debug('original:%s' % target_hist)
            debug('bg:%s' % bg_hist)
            debug('ratios:%s' % ratios)
            # K_MATCH: the requested ratio, capped by the smallest finite
            # positive ratio among the bins that hold more than 5% of the
            # target regions.
            K_MATCH = min(
                bg_target_ratio,
                ratios[~np.isnan(ratios) & ~np.isinf(ratios) & (ratios > 0) &
                       (target_hist / float(target_hist.sum()) > 0.05)].min())

            debug('K_MATCH:%d' % K_MATCH)

            # Number of background regions to draw from each bin.
            to_match = np.int32(np.floor(K_MATCH * target_hist))

            debug('to_match:%s' % to_match)

            idxs_corrected_bg = np.array([], dtype=int)

            # From every bin take a random subset of the background regions,
            # at most to_match[idx_bin] of them.
            for idx_bin in range(len(bins) - 1):
                idxs_matching_regions = \
                    np.nonzero((c_g_content_bg >= bins[idx_bin]) & (c_g_content_bg < bins[idx_bin + 1]))[0]
                to_take = np.random.permutation(len(idxs_matching_regions))
                to_take = to_take[range(
                    min(len(idxs_matching_regions), to_match[idx_bin]))]
                idxs_corrected_bg = np.hstack(
                    (idxs_corrected_bg, idxs_matching_regions[to_take]))

            debug('original:%s' % target_hist)
            debug('K:%d' % K_MATCH)
            debug('to sample:%s' % to_match)
            debug('obtained:%s' %
                  np.histogram(c_g_content_bg[idxs_corrected_bg], bins)[0])
            bg_coords = [bg_coords[idx] for idx in idxs_corrected_bg]
            c_g_content_bg = calculate_average_ngram_presence(
                bg_coords, genome, ngram_correction)
            debug(np.histogram(c_g_content_bg, bins)[0])
            if np.array_equal(K_MATCH * target_hist,
                              np.histogram(c_g_content_bg, bins)[0]):
                info('C+G content perfectly matched!\n\ttarget:%s\n\tbg    :%s'
                     % (target_hist, np.histogram(c_g_content_bg, bins)[0]))
            else:
                warn(
                    'C+G content not perfectly matched\n\ttarget:%s\n\tbg    :%s'
                    % (target_hist, np.histogram(c_g_content_bg, bins)[0]))

            debug(target_hist / np.histogram(c_g_content_bg, bins)[0])

    # Trim the background to bg_target_ratio times the number of targets when
    # there are enough regions.
    if len(bg_coords) >= bg_target_ratio * n_target_coordinates:
        bg_coords = random.sample(bg_coords,
                                  int(bg_target_ratio * n_target_coordinates))
    else:
        # Never executed while bootstrap is hard-coded to False.
        # NOTE(review): target_hist and bins are only bound when a C+G
        # correction branch ran; enabling bootstrap without it would fail here.
        if bootstrap and len(bg_coords) < (bg_target_ratio *
                                           n_target_coordinates *
                                           0.95):  # allow a small tolerance!
            info('bootstrapping to obtain enough background regions')
            bg_coords = sample_wr(bg_coords,
                                  int(bg_target_ratio * n_target_coordinates))
            c_g_content_bg = calculate_average_ngram_presence(
                bg_coords, genome, ngram_correction)
            debug('After bootstrap:\n\ttarget:%s\n\tbg    :%s' %
                  (target_hist, np.histogram(c_g_content_bg, bins)[0]))

    info('Extracting Motifs in background coordinates')
    # Only the first three outputs (matrix, profiles, sequence indexes) are
    # kept for the background.
    negative_matrix, motifs_profiles_in_bg, idxs_seqs_with_motif_bg = parallel_fimo_scanning(
        bg_coords,
        meme_motifs_filename,
        genome,
        nucleotide_bg_filename,
        temp_directory=temp_directory,
        p_value=p_value,
        mask_repetitive=mask_repetitive,
        window_length=window_length,
        internal_window_length=internal_window_length,
        num_consumers=n_processes)[0:3]

    # allocate data for reports
    N_MOTIFS = len(motif_ids)
    rankings = np.zeros(N_MOTIFS, dtype=np.int16)
    motif_ratios = np.zeros(N_MOTIFS)
    support_p = np.zeros(N_MOTIFS)
    support_n = np.zeros(N_MOTIFS)
    fisher_p_values = np.zeros(N_MOTIFS)
    central_enrichment = np.zeros(N_MOTIFS)

    N_seq_p = positive_matrix.shape[0]
    N_seq_n = negative_matrix.shape[0]

    # Number of sequences in which each motif is present (column-wise count).
    profile_presence_p = (positive_matrix > 0).sum(0)
    profile_presence_n = (negative_matrix > 0).sum(0)

    # Fraction of target / background sequences containing each motif.
    support_p = profile_presence_p / float(N_seq_p)
    support_n = profile_presence_n / float(N_seq_n)

    # Boundaries of the internal window inside the scanned window (same
    # formula as in parallel_fimo_scanning).
    internal_bpstart = window_length / 2 - internal_window_length / 2
    internal_bpend = window_length / 2 + internal_window_length / 2

    for idx, motif_id in enumerate(motif_ids):
        # 2x2 table: present/absent in target vs present/absent in background;
        # element [1] of the result is kept as the p-value.
        fisher_p_values[idx] = stats.fisher_exact(
            [[profile_presence_p[idx], N_seq_p - profile_presence_p[idx]],
             [profile_presence_n[idx], N_seq_n - profile_presence_n[idx]]])[1]
        # Mean profile inside the internal window divided by the mean profile
        # of the two flanks.
        central_enrichment[idx] = motifs_profiles_in_sequences[motif_id][
            internal_bpstart:internal_bpend].mean() / np.hstack([
                motifs_profiles_in_sequences[motif_id][:internal_bpstart],
                motifs_profiles_in_sequences[motif_id][internal_bpend:]
            ]).mean()

    # 0.01 is added to both supports, which keeps the ratio finite when the
    # background support is zero.
    motif_ratios = (support_p + 0.01) / (support_n + 0.01)

    # Fundamental! Motifs found in less than 3% of the target sequences get a
    # ratio of 1 and are therefore dropped by the "> 1" filter below.
    if not disable_ratio:
        motif_ratios[support_p < 0.03] = 1

    # Rank 1 goes to the highest ratio.
    rankings = stats.rankdata(-motif_ratios)

    # filter here positive or positive and negative#################################
    if not disable_ratio:
        idxs_to_keep = np.nonzero(motif_ratios > 1)[0]
    else:
        idxs_to_keep = range(len(motif_ratios))

    rankings = rankings[idxs_to_keep]
    motif_ratios = motif_ratios[idxs_to_keep]
    support_p = support_p[idxs_to_keep]
    support_n = support_n[idxs_to_keep]
    fisher_p_values = fisher_p_values[idxs_to_keep]
    central_enrichment = central_enrichment[idxs_to_keep]

    # motif_idxs keeps, for each retained motif, its index before filtering.
    motif_ids = [motif_ids[_] for _ in idxs_to_keep]
    motif_names = [motif_names[_] for _ in idxs_to_keep]
    motif_idxs = [_ for _ in idxs_to_keep]

    # NOTE(review): bare except - if estimate_qvalues raises, the p-values are
    # printed and qvalues stays unbound, so the report loop below would fail
    # with a NameError as soon as a motif passes the filters.
    try:
        qvalues = estimate_qvalues(fisher_p_values)
        # we test the ones only with ratio >1
    except:
        print fisher_p_values

    # qvalues=estimate_qvalues(fisher_p_values,m=len(motif_ids))
    ################################################################################

    # generate reports in html
    info('Generating HTML report...')
    imgs_directory = os.path.join(output_directory, 'images')
    genes_list_directory = os.path.join(output_directory, 'genes_lists')
    motif_regions_directory = os.path.join(output_directory, 'motifs_regions')

    # create folders
    if not os.path.exists(imgs_directory):
        os.makedirs(imgs_directory)
    if use_gene_annotations and not os.path.exists(genes_list_directory):
        os.makedirs(genes_list_directory)
    if not os.path.exists(motif_regions_directory):
        os.makedirs(motif_regions_directory)

    j2_env = Environment(
        loader=FileSystemLoader(determine_path('extra') + '/templates/'),
        trim_blocks=True)

    info('DIRECTORY:%s' % determine_path('extra') + '/templates/')
    template = j2_env.get_template('report_template.html')

    # copy haystack logo and bg
    shutil.copyfile(
        determine_path('extra') + '/templates/haystack_logo.png',
        os.path.join(imgs_directory, 'haystack_logo.png'))
    shutil.copyfile(
        determine_path('extra') + '/templates/noise.png',
        os.path.join(imgs_directory, 'noise.png'))

    # One report entry per motif that passes all the thresholds, visited in
    # ranking order.
    motifs_dump = []
    for i in np.argsort(rankings):
        if (support_p[i] >= 0.03
                or disable_ratio) and fisher_p_values[i] < 0.01 and (
                    motif_ratios[i] > 1 or disable_ratio
                ) and central_enrichment[i] > min_central_enrichment:
            # if (support_p[i]>=0.01 or  support_n[i]>=0.01) and fisher_p_values[i]<0.1 and  (central_enrichment[i]>1.1 or central_enrichment[i]<0.9) and  ( motif_ratios[i]>1.1 or motif_ratios[i]<0.9):

            info('Generating logo and profile for:' + motif_ids[i])

            # create motif logo (once with the default format, once as pdf)
            img_logo = os.path.join(imgs_directory, 'logo_' + motif_ids[i])
            generate_weblogo(motif_ids[i],
                             meme_motifs_filename,
                             img_logo,
                             title=motif_ids[i])
            generate_weblogo(motif_ids[i],
                             meme_motifs_filename,
                             img_logo,
                             title=motif_ids[i],
                             file_format='pdf')
            # fix the weblogo prefix problem: the report links the .png file
            # relative to the output directory
            img_logo_url = os.path.join('images',
                                        'logo_' + motif_ids[i] + '.png')

            # create motif enrichment profile (profiles are divided by the
            # number of scanned sequences)
            img_profile = os.path.join(imgs_directory,
                                       'profile_' + motif_ids[i] + '.png')
            motif_profile_target = motifs_profiles_in_sequences[
                motif_ids[i]] / N_seq_p
            motif_profile_bg = motifs_profiles_in_bg[motif_ids[i]] / N_seq_n

            # print motif_profile_target.shape, motif_profile_bg.shape
            generate_motif_profile(motif_profile_target,
                                   motif_profile_bg,
                                   motif_ids[i],
                                   img_profile,
                                   smooth_size=smooth_size,
                                   window_size=window_length)
            img_profile_url = os.path.join('images',
                                           'profile_' + motif_ids[i] + '.png')

            # create regions: one tab-separated row per target coordinate with
            # at least one hit, listing the hits as start-end pairs joined by ';'
            info('Extracting regions with:' + motif_ids[i])
            regions = os.path.join(
                motif_regions_directory,
                motif_ids[i] + '_motif_region_in_target.bed')
            with open(regions, 'w+') as outfile:
                outfile.write(
                    'Chromosome\tStart\tEnd\tMotif hits inside region\tNumber of hits\n'
                )
                for c, locations in motif_coords_in_seqs_with_motif[
                        motif_ids[i]].items():
                    outfile.write('\t'.join([
                        c.chr_id,
                        str(c.bpstart),
                        str(c.bpend), ';'.join([
                            '-'.join(map(str, map(int, l))) for l in locations
                        ]),
                        str(len(locations))
                    ]) + '\n')
            regions_url = os.path.join(
                'motifs_regions', motif_ids[i] + '_motif_region_in_target.bed')

            # map closest downstream genes
            genes_url = None
            if use_gene_annotations:
                info('Mapping regions with:%s to the clostest genes' %
                     motif_ids[i])

                peak_annotator_path = os.path.join(determine_path('extra/'),
                                                   'PeakAnnotator.jar')

                # NOTE(review): gene_ids_to_names_filename is only assigned
                # explicitly in the "no annotation file given" branch above;
                # presumably it is otherwise injected from the parsed
                # arguments - verify. The commands are built by string
                # interpolation and run through the shell.
                if gene_ids_to_names_filename:
                    sb.call('java -jar ' + peak_annotator_path + ' -u TSS -p %s -a %s -s %s -o %s >/dev/null 2>&1' \
                            % (regions, gene_annotations_filename, gene_ids_to_names_filename, genes_list_directory),
                            shell=True)
                else:
                    sb.call('java -jar ' + peak_annotator_path + ' -u TSS -p %s -a %s  -o %s >/dev/null 2>&1' \
                            % (regions, gene_annotations_filename, genes_list_directory), shell=True)

                genes_url = os.path.join(
                    'genes_lists',
                    motif_ids[i] + '_motif_region_in_target.tss.bed')

            # Supports are stored as percentages; idx_motif is the index of
            # the motif before the ratio filter.
            motifs_dump.append({
                'id': motif_ids[i],
                'name': motif_names[i],
                'support_p': support_p[i] * 100,
                'support_n': support_n[i] * 100,
                'ratio': motif_ratios[i],
                'rank': float(rankings[i]),
                'pvalue': fisher_p_values[i],
                'qvalue': qvalues[i],
                'central_enrichment': central_enrichment[i],
                'img_logo': img_logo_url,
                'img_profile': img_profile_url,
                'regions': regions_url,
                'genes': genes_url,
                'idx_motif': motif_idxs[i]
            })

    # Render the report template into <output_directory>/Haystack_report.html.
    outfile = codecs.open(
        os.path.join(output_directory, "Haystack_report.html"), "w", "utf-8")
    outfile.write(template.render(motifs_dump=motifs_dump, bed_target_filename=bed_target_filename,
                                  bed_bg_filename=bed_bg_filename, N_TARGET=N_seq_p, N_BG=N_seq_n, \
                                  meme_motifs_filename=meme_motifs_filename, COMMAND_USED=COMMAND_USED,
                                  use_gene_annotations=use_gene_annotations))
    outfile.close()

    if dump:
        info('Saving all the intermediate data on: %s ...' % output_directory)
        dump_directory = os.path.join(output_directory, 'dump')

        if not os.path.exists(dump_directory):
            os.makedirs(dump_directory)

        # NOTE(review): the background matrix file name is also built from
        # target_name, not bg_name - confirm this is intended.
        np.save(os.path.join(dump_directory, 'matrix_' + target_name),
                positive_matrix)
        np.save(os.path.join(dump_directory, 'matrix_BG_' + target_name),
                negative_matrix)

        # NOTE(review): the files passed to cp.dump below are opened in text
        # mode ('w') and never closed explicitly.
        cp.dump(
            motifs_dump,
            open(
                os.path.join(dump_directory,
                             target_name + '_motif_dumps.pickle'), 'w'))

        # cp.dump( motifs_profiles_in_sequences,open( os.path.join(dump_directory,target_name+'_profiles.pickle'),'w'))
        # cp.dump( motifs_profiles_in_bg,open( os.path.join(dump_directory,bg_name+'_profiles.pickle'),'w'))

        cp.dump(
            idxs_seqs_with_motif,
            open(
                os.path.join(dump_directory,
                             target_name + '_motif_seqs_idxs.pickle'), 'w'))
        cp.dump(
            idxs_seqs_with_motif_bg,
            open(
                os.path.join(dump_directory,
                             bg_name + '_motif_seqs_idxs.pickle'), 'w'))

        cp.dump(
            motif_coords_in_seqs_with_motif,
            open(
                os.path.join(
                    dump_directory,
                    target_name + '_motif_coords_in_seqs_with_motif.pickle'),
                'w'))

        # The coordinates actually used for the analysis are saved as BED
        # files (the target one with minimal_format=False).
        Coordinate.coordinates_to_bed(
            target_coords,
            os.path.join(
                dump_directory,
                'Target_coordinates_selected_on_' + target_name + '.bed'),
            minimal_format=False)
        Coordinate.coordinates_to_bed(
            bg_coords,
            os.path.join(dump_directory,
                         'BG_coordinates_selected_on_' + bg_name + '.bed'),
            minimal_format=True)
    #info('Motif analysis for Sample %s completed' %name)
    info('Motif analysis completed! Ciao!')

Пример #9

Показать файл

def parallel_fimo_scanning(target_coords, meme_motifs_filename, genome,
                           nucleotide_bg_filename, temp_directory, p_value,
                           mask_repetitive, window_length,
                           internal_window_length, num_consumers):
    """Run FIMO on the sequences of ``target_coords`` and summarise the hits.

    The coordinates are written to a FASTA file in ``temp_directory``, FIMO is
    run through ``call_fimo`` (one call per motif in a process pool when
    ``num_consumers`` > 1, a single 'ALL_MOTIFS' call otherwise) and the
    resulting hit file is parsed line by line.

    Parameters:
        target_coords: sequence of coordinates to scan.
        meme_motifs_filename, nucleotide_bg_filename, temp_directory, p_value:
            forwarded to ``Fimo`` and ``call_fimo``; ``temp_directory`` also
            holds every temporary file created here.
        genome: forwarded to ``Coordinate.coordinates_to_fasta``.
        mask_repetitive: accepted for interface compatibility; not used here.
        window_length: when truthy, the coordinates are replaced by the result
            of ``Coordinate.coordinates_of_intervals_around_center(target_coords,
            window_length)`` before scanning.
        internal_window_length: width of the central part of the window in
            which a hit must fall to be counted; must be given whenever
            ``window_length`` is given.
        num_consumers: number of worker processes for the pool.

    Returns:
        A 6-tuple ``(matrix, profiles, seq_idxs, hit_coords, motif_names,
        motif_ids)`` where ``matrix`` is a (n_coordinates x n_motifs) array
        holding 1 where a motif hit the internal window, ``profiles`` maps
        motif id -> per-position hit counts, ``seq_idxs`` maps motif id -> set
        of sequence indexes with an internal hit, ``hit_coords`` maps motif id
        -> {original coordinate: [(start, end), ...]}, and the last two items
        are ``fimo.motif_names`` and ``fimo.motif_ids``.
    """
    fimo = Fimo(meme_motifs_filename,
                nucleotide_bg_filename,
                temp_directory=temp_directory,
                p_value=p_value)

    # Unique prefix shared by every temporary file of this run; the final
    # cleanup removes everything starting with it.
    prefix = 'haystack_motifs_' + str(uuid.uuid4())

    motifs_profiles_in_sequences = dict()
    idxs_seqs_with_motif = dict()
    motif_coords_in_seqs_with_motif = dict()

    # The coordinates as supplied by the caller are used as keys of the
    # reported hits, even when the scan itself runs on extended windows.
    original_target_coords = target_coords

    if window_length:
        # Floor division keeps the boundaries integral regardless of whether
        # true division is in effect.
        internal_bpstart = window_length // 2 - internal_window_length // 2
        internal_bpend = window_length // 2 + internal_window_length // 2
        target_coords = Coordinate.coordinates_of_intervals_around_center(
            target_coords, window_length)
    else:
        # No flanking window requested: the whole scanned sequence counts as
        # the internal region. (Previously these names were left unbound; the
        # resulting NameError was swallowed by the parsing loop, so no hit was
        # ever recorded.)
        internal_bpstart = 0
        internal_bpend = float('inf')

    # write fasta
    target_coords_fasta_filename = os.path.join(temp_directory, prefix + '.fa')
    Coordinate.coordinates_to_fasta(target_coords,
                                    target_coords_fasta_filename, genome)

    # mapping: first whitespace-separated token of str(coordinate) -> row
    # index; the FIMO output refers to sequences by that token.
    coord_to_idx = dict()
    for idx, c in enumerate(target_coords):
        coord_to_idx[str(c).split()[0]] = idx

    # Profiles are as long as the last scanned coordinate (same value the
    # original obtained through the leaked loop variable); an empty input
    # yields empty profiles instead of a NameError.
    n_coords = len(target_coords)
    profile_length = len(target_coords[-1]) if n_coords else 0

    for motif_id in fimo.motif_ids:
        motifs_profiles_in_sequences[motif_id] = np.zeros(profile_length)
        idxs_seqs_with_motif[motif_id] = set()
        motif_coords_in_seqs_with_motif[motif_id] = pickable_defaultdict()

    motifs_in_sequences_matrix = np.zeros((n_coords, len(fimo.motif_ids)))

    # compute motifs with fimo
    if num_consumers > 1:

        # partial function for multiprocessing: only the motif id varies
        compute_single_motif = partial(call_fimo, target_coords_fasta_filename,
                                       prefix, meme_motifs_filename,
                                       nucleotide_bg_filename, temp_directory,
                                       p_value)

        pool = mp.Pool(processes=num_consumers)
        pool.map(compute_single_motif, fimo.motif_ids)
        pool.close()
        pool.join()
        # Merge the per-motif outputs into a single file.
        fimo_output_filename = os.path.join(temp_directory,
                                            prefix + '_fimo_output.motifs')
        sb.call('cat %s*.motifs > "%s"' %
                (os.path.join(temp_directory, prefix), fimo_output_filename),
                shell=True)
    else:
        call_fimo(target_coords_fasta_filename, prefix, meme_motifs_filename,
                  nucleotide_bg_filename, temp_directory, p_value,
                  'ALL_MOTIFS')
        fimo_output_filename = os.path.join(
            temp_directory, '%s_%s.motifs' % (prefix, 'ALL_MOTIFS'))

    with open(fimo_output_filename) as infile:

        for line in infile:
            try:
                motif_id, motif_coord, motif_start, motif_end = line.split()
                motif_start = int(motif_start)
                motif_end = int(motif_end)
                idx_seq = coord_to_idx[motif_coord]

                motifs_profiles_in_sequences[motif_id][
                    motif_start:motif_end] += 1.0

                # keep track only if is in the internal window!
                if motif_start >= internal_bpstart and motif_end <= internal_bpend:
                    idxs_seqs_with_motif[motif_id].add(idx_seq)
                    # Presence flag: the cell is set to 1, it is not a hit
                    # counter.
                    motifs_in_sequences_matrix[
                        idx_seq, fimo.motif_id_to_index[motif_id]] = 1
                    # Translate window-relative hit positions using the start
                    # of the scanned coordinate.
                    offset = target_coords[idx_seq].bpstart - 1
                    motif_coords_in_seqs_with_motif[motif_id][
                        original_target_coords[idx_seq]].append(
                            (motif_start + offset, motif_end + offset))
            except Exception:
                # Best effort: an unparsable or unknown line is reported and
                # skipped. Unlike a bare except, KeyboardInterrupt and
                # SystemExit are no longer swallowed.
                print(line)

    # Remove every temporary file created for this run.
    sb.call('rm %s* ' % os.path.join(temp_directory, prefix), shell=True)

    return motifs_in_sequences_matrix, motifs_profiles_in_sequences, idxs_seqs_with_motif, motif_coords_in_seqs_with_motif, fimo.motif_names, fimo.motif_ids

Пример #10

Показать файл

Файл: haystack_motifs_CORE.py Проект: lucapinello/Haystack

def parallel_fimo_scanning(target_coords,
                           meme_motifs_filename,
                           genome, nucleotide_bg_filename,
                           temp_directory,
                           p_value,
                           mask_repetitive,
                           window_length,
                           internal_window_length,
                           num_consumers):
    """Scan the target regions with FIMO and tabulate the motif hits.

    The target coordinates are optionally replaced by windows of
    ``window_length`` around their centers, written to a FASTA file inside
    ``temp_directory`` and scanned through ``call_fimo``.  The resulting hit
    file is parsed line by line; each line is expected to hold four
    whitespace-separated fields: motif id, sequence name, start and end.
    Lines that cannot be parsed or matched to a known motif/sequence are
    printed and skipped.  All temporary files sharing the per-call prefix
    are removed before returning.

    Args:
        target_coords: indexable sequence of coordinate objects exposing
            ``bpstart`` and supporting ``len()`` and ``str()``.
        meme_motifs_filename: motif file handed to ``Fimo`` and ``call_fimo``.
        genome: genome object handed to ``Coordinate.coordinates_to_fasta``.
        nucleotide_bg_filename: background file handed to ``Fimo`` and
            ``call_fimo``.
        temp_directory: directory receiving every temporary file.
        p_value: threshold handed to ``Fimo`` and ``call_fimo``.
        mask_repetitive: accepted for interface compatibility; not used by
            this function.
        window_length: when truthy, the scan runs on windows of this length
            around each target center; when falsy the targets are scanned
            as given and every hit is treated as internal.
        internal_window_length: length of the central part of the window in
            which a hit must fall to be recorded per sequence (only read
            when ``window_length`` is truthy).
        num_consumers: when greater than 1, one ``call_fimo`` per motif id
            is dispatched to a process pool of this size; otherwise a single
            'ALL_MOTIFS' call is made.

    Returns:
        A 6-tuple:
        ``motifs_in_sequences_matrix`` (array, sequences x motifs, set to 1
        where the motif hits inside the internal window),
        ``motifs_profiles_in_sequences`` (motif id -> array incremented over
        ``[start:end]`` for every parsed hit, internal or not),
        ``idxs_seqs_with_motif`` (motif id -> set of sequence indexes),
        ``motif_coords_in_seqs_with_motif`` (motif id -> mapping from the
        original coordinate to ``(start, end)`` tuples shifted by the scanned
        coordinate's ``bpstart - 1``),
        ``fimo.motif_names`` and ``fimo.motif_ids``.
    """
    fimo = Fimo(meme_motifs_filename, nucleotide_bg_filename,
                temp_directory=temp_directory, p_value=p_value)

    # Random per-call prefix shared by every temporary file created below.
    prefix = 'haystack_motifs_' + str(uuid.uuid4())

    motifs_profiles_in_sequences = dict()
    idxs_seqs_with_motif = dict()
    motif_coords_in_seqs_with_motif = dict()

    # Hits are reported against the caller's coordinates even when the scan
    # itself runs on the re-centered windows.
    original_target_coords = target_coords

    if window_length:
        internal_bpstart = window_length / 2 - internal_window_length / 2
        internal_bpend = window_length / 2 + internal_window_length / 2
        target_coords = Coordinate.coordinates_of_intervals_around_center(
            target_coords, window_length)
    else:
        # Without windowing there is no internal window to restrict to.
        # These bounds used to be left undefined here, which made every hit
        # fail with a NameError hidden by a bare except.
        internal_bpstart = 0
        internal_bpend = float('inf')

    # Write the sequences to scan.
    target_coords_fasta_filename = os.path.join(temp_directory,
                                                prefix + '.fa')
    Coordinate.coordinates_to_fasta(target_coords,
                                    target_coords_fasta_filename, genome)

    # Map the first token of each coordinate's string form (matched against
    # the sequence name column of the hit file) to its row index.
    coord_to_idx = dict()
    profile_length = 0  # stays 0 when there is nothing to scan
    for idx, c in enumerate(target_coords):
        coord_to_idx[str(c).split()[0]] = idx
        # The profiles are sized after the last coordinate, as before.
        profile_length = len(c)

    for motif_id in fimo.motif_ids:
        motifs_profiles_in_sequences[motif_id] = np.zeros(profile_length)
        idxs_seqs_with_motif[motif_id] = set()
        motif_coords_in_seqs_with_motif[motif_id] = pickable_defaultdict()

    motifs_in_sequences_matrix = np.zeros(
        (len(target_coords), len(fimo.motif_ids)))

    # Run FIMO.
    if num_consumers > 1:
        # Bind everything except the motif id so the pool can map over ids.
        compute_single_motif = partial(call_fimo,
                                       target_coords_fasta_filename,
                                       prefix,
                                       meme_motifs_filename,
                                       nucleotide_bg_filename,
                                       temp_directory,
                                       p_value)

        pool = mp.Pool(processes=num_consumers)
        try:
            pool.map(compute_single_motif, fimo.motif_ids)
        finally:
            # Release the workers even if one of the scans failed.
            pool.close()
            pool.join()

        # Merge the per-motif outputs into a single file.  Paths are quoted
        # so that directories containing spaces keep working.
        fimo_output_filename = os.path.join(temp_directory,
                                            prefix + '_fimo_output.motifs')
        sb.call('cat "%s"*.motifs > "%s"' %
                (os.path.join(temp_directory, prefix), fimo_output_filename),
                shell=True)
    else:
        call_fimo(target_coords_fasta_filename, prefix, meme_motifs_filename,
                  nucleotide_bg_filename, temp_directory, p_value,
                  'ALL_MOTIFS')
        fimo_output_filename = os.path.join(
            temp_directory, '%s_%s.motifs' % (prefix, 'ALL_MOTIFS'))

    with open(fimo_output_filename) as infile:
        for line in infile:
            try:
                motif_id, motif_coord, motif_start, motif_end = line.split()
                motif_start = int(motif_start)
                motif_end = int(motif_end)
                idx_seq = coord_to_idx[motif_coord]

                # Every parsed hit contributes to the positional profile.
                motifs_profiles_in_sequences[motif_id][
                    motif_start:motif_end] += 1.0

                # Per-sequence bookkeeping only for hits that fall entirely
                # inside the internal window.
                if (motif_start >= internal_bpstart
                        and motif_end <= internal_bpend):
                    idxs_seqs_with_motif[motif_id].add(idx_seq)
                    motifs_in_sequences_matrix[
                        idx_seq, fimo.motif_id_to_index[motif_id]] = 1
                    scanned_bpstart = target_coords[idx_seq].bpstart
                    motif_coords_in_seqs_with_motif[motif_id][
                        original_target_coords[idx_seq]].append(
                            (motif_start + scanned_bpstart - 1,
                             motif_end + scanned_bpstart - 1))
            except (ValueError, KeyError, IndexError):
                # Malformed line or unknown motif/sequence: report and skip.
                print(line)

    # Remove every temporary file created under this call's prefix.
    sb.call('rm "%s"* ' % os.path.join(temp_directory, prefix), shell=True)

    return (motifs_in_sequences_matrix, motifs_profiles_in_sequences,
            idxs_seqs_with_motif, motif_coords_in_seqs_with_motif,
            fimo.motif_names, fimo.motif_ids)

Пример #11

Показать файл

Файл: calculate_conservation.py Проект: lucapinello/bioutilities

                #print 'problema coordinate random'
                region_nok=True

                not_founded=True
                while not_founded:
                    c_random=target_coordinates[np.random.randint(len(target_coordinates))]
                    not_founded= len(c_random) < len(c)
                

    #print random_sampling_score
    return random_sampling_score


# Progress message (Italian): "load the data".
print('carica i dati')

# Parse the four BED files, in the same order as before.
(input_coordinates,
 exons_coordinates,
 introns_coordinates,
 intergenic_coordinates) = map(
    Coordinate.bed_to_coordinates,
    (input_bed_file, exon_bed_file, intron_bed_file, intergenic_bed_file))

# Progress message (Italian): "allocate memory for intersection lengths".
print('alloca memoria per lunghezza intersezioni')
# One row per input coordinate and three columns; judging by the name the
# columns are exon / intron / intergenic -- confirm against the code that
# fills this array.
inters_length_exon_intron_intergenic = np.zeros((len(input_coordinates), 3))

# Progress message (Italian): "build the interval tree of the enhancers".
print('costruisci interval tree degli enanchers')
# Containers populated by the loop that follows.
interval_tree = {}
coord_to_row_index = {}
row_index = 0
for c in input_coordinates:
    if c.chr_id not in interval_tree:
        interval_tree[c.chr_id]=Intersecter()