Пример #1
0
def main(input_args=None):
    print '\n[H A Y S T A C K   M O T I F S]'
    print(
        '\n-MOTIF ENRICHMENT ANALYSIS- [Luca Pinello - [email protected]]\n'
    )
    print 'Version %s\n' % HAYSTACK_VERSION

    bootstrap = False
    ngram_correction = 'g'

    parser = get_args_motif()
    args = parser.parse_args(input_args)

    args.n_processes = max(1, args.n_processes - 1)

    args_dict = vars(args)
    for key, value in args_dict.items():
        if key == 'n_target_coordinates':
            n_target_coordinates = value
        else:
            exec('%s=%s' % (key, repr(value)))

    bed_score_column -= 1

    if no_c_g_correction:
        c_g_correction = False
    else:
        c_g_correction = True

    if no_random_sampling_target:
        random_sampling_target = False
    else:
        random_sampling_target = True

    check_file(bed_target_filename)

    if not bed_bg_filename == 'random_background':
        check_file(bed_bg_filename)

    if meme_motifs_filename:
        check_file(meme_motifs_filename)
    else:
        meme_motifs_filename = os.path.join(
            determine_path('motif_databases'),
            'JASPAR_CORE_2016_vertebrates.meme')

    annotation_directory = determine_path('gene_annotations')
    if gene_annotations_filename:

        if which('java') is None:
            error(
                'The mapping to the closest gene requires Java free available from: http://java.com/en/download/'
            )
            use_gene_annotations = False
        else:
            check_file(gene_annotations_filename)
            info('Using %s as gene annotations file' %
                 gene_annotations_filename)
            use_gene_annotations = True
    else:
        gene_annotations_filename = os.path.join(annotation_directory,
                                                 '%s_genes.bed' % genome_name)
        gene_ids_to_names_filename = os.path.join(
            annotation_directory, '%s_genes_id_to_names' % genome_name)

        if os.path.exists(gene_annotations_filename) and os.path.exists(
                gene_ids_to_names_filename):
            use_gene_annotations = True
        else:
            use_gene_annotations = False
            info('No gene annotations file specified')

    genome, _, nucleotide_bg_filename = initialize_genome(genome_name)

    target_name = ntpath.basename(bed_target_filename.replace('.bed', ''))

    bg_name = ntpath.basename(bed_bg_filename.replace('.bed', ''))
    # timestamp=(datetime.datetime.now().isoformat()[:-3].replace('T','(')+str(np.random.randint(10000))+')').replace(':','.')

    if name:
        directory_name = 'HAYSTACK_MOTIFS_on_' + name
    else:
        directory_name = 'HAYSTACK_on_' + target_name + '_VS_' + bg_name

    if output_directory:
        output_directory = os.path.join(output_directory, directory_name)
    else:
        output_directory = directory_name

    info(
        '###PARAMETERS USED###\n\t\t -TARGET: %s \n\t\t -BACKGROUND: %s \n\t\t -BG_TARGET_RATIO: %s\n\t\t -C+G CORRECTION: %s\n\t\t -MASKING REPETITIVE: %s\n\t\t -COORDINATES TO ANALYZE: %s\n\t\t -OUTPUT DIRECTORY: %s\n' \
        % (bed_target_filename, bed_bg_filename, str(bg_target_ratio), str(c_g_correction), str(mask_repetitive),
           'ALL' if np.isinf(n_target_coordinates) else str(n_target_coordinates), output_directory))

    N_TARGET = None
    N_BG = None
    COMMAND_USED = ' '.join(sys.argv)

    _n_target_coordinates = n_target_coordinates

    info('Loading Target coordinates from bed:%s' % bed_target_filename)
    target_coords = Coordinate.bed_to_coordinates(bed_target_filename,
                                                  cl_score=bed_score_column)

    if len(target_coords) == 0:
        info('No coordinates to analyze in your input file. Exiting.')
        sys.exit(1)

    # calculate automatically the average lenght of the target regions

    if internal_window_length:
        info('Using the user defined internal window length:%d' %
             internal_window_length)
        if internal_window_length % 2:
            internal_window_length += 1

    else:

        internal_window_length = int(np.mean(map(len, target_coords)))
        if internal_window_length % 2:
            internal_window_length += 1
        info(
            'Using the average length of target coordinates as internal window length:%d'
            % internal_window_length)

        if not window_length:
            window_length = internal_window_length * 5

    info('Total window length:%d' % window_length)

    if not smooth_size:
        smooth_size = internal_window_length / 5

    target_coords = Coordinate.coordinates_of_intervals_around_center(
        target_coords, internal_window_length)

    if len(target_coords) > n_target_coordinates:
        if random_sampling_target:
            info('Sampling %d coordinates among the %d total' %
                 (n_target_coordinates, len(target_coords)))
            target_coords = random.sample(target_coords, n_target_coordinates)
        else:
            info('Selecting the best %d coordinates among the %d total' %
                 (n_target_coordinates, len(target_coords)))
            sorted_idxs_by_score = np.argsort([c.score
                                               for c in target_coords])[::-1]
            target_coords = [
                target_coords[idx]
                for idx in sorted_idxs_by_score[:n_target_coordinates]
            ]
    else:

        if random_sampling_target and bootstrap and not np.isinf(
                n_target_coordinates):
            warn('Number of target regions < %d' % n_target_coordinates)
            info('bootstrapping to obtain enough target regions')
            target_coords = sample_wr(target_coords, n_target_coordinates)
        else:
            info('Using all the %d target coordinates' % len(target_coords))

    info('Extracting Motifs in target coordinates')
    positive_matrix, motifs_profiles_in_sequences, idxs_seqs_with_motif, motif_coords_in_seqs_with_motif, motif_names, motif_ids = parallel_fimo_scanning(
        target_coords,
        meme_motifs_filename,
        genome,
        nucleotide_bg_filename,
        temp_directory=temp_directory,
        p_value=p_value,
        mask_repetitive=mask_repetitive,
        window_length=window_length,
        internal_window_length=internal_window_length,
        num_consumers=n_processes)
    n_target_coordinates = len(target_coords)  # fix for the bootstrap!

    if bed_bg_filename == 'random_background':
        info('Extracting Random Coordinates from the genome...')

        if c_g_correction:
            info('Calculating the C+G content of the target coordinates')
            bg_coords = []
            c_g_content_target = calculate_average_ngram_presence(
                target_coords, genome, ngram_correction)

            info('Extract a Matching C+G Background')
            bins = np.hstack((np.linspace(0, 1, c_g_bins), np.inf))

            for _ in range(bg_target_ratio):
                for idx_c, c in enumerate(target_coords):
                    c_bin = np.nonzero(
                        np.histogram(c_g_content_target[idx_c], bins)[0])[0][0]
                    c_random_bin = -1

                    while c_random_bin != c_bin:
                        random_bpstart = np.random.randint(
                            1, genome.chr_len[c.chr_id] - len(c) + 1)
                        c_random = Coordinate(c.chr_id, random_bpstart,
                                              random_bpstart + len(c) - 1)
                        seq = genome.extract_sequence(c_random)
                        c_g_content_c_random = (seq.count('c') +
                                                seq.count('g')) / float(len(c))
                        c_random_bin = np.nonzero(
                            np.histogram(c_g_content_c_random, bins)[0])[0][0]

                    # print bg_target_ratio,c_bin,c_random_bin, ' still to match:',len(target_coords)-idx_c
                    bg_coords.append(c_random)

            c_g_content_bg = calculate_average_ngram_presence(
                bg_coords, genome, ngram_correction)
            bg_hist = np.histogram(c_g_content_bg, bins)[0]
            debug('original: ' +
                  str(np.histogram(c_g_content_target, bins)[0]))
            debug('obtained:' + str(np.histogram(c_g_content_bg, bins)[0]))

        else:
            bg_coords = get_random_coordinates(target_coords, genome)

        info('Done!')

    else:
        info('Loading Background Coordinates from:%s' % bed_bg_filename)
        bg_coords = Coordinate.bed_to_coordinates(bed_bg_filename)
        bg_coords = Coordinate.coordinates_of_intervals_around_center(
            bg_coords, internal_window_length)

        if use_entire_bg:
            bg_target_ratio = float(len(bg_coords)) / n_target_coordinates
            info('Using all the coordinates in the BG, BG/TG:%f',
                 bg_target_ratio)

        if c_g_correction:
            info('Calculating the C+G content')
            c_g_content_target = calculate_average_ngram_presence(
                target_coords, genome, ngram_correction)
            c_g_content_bg = calculate_average_ngram_presence(
                bg_coords, genome, ngram_correction)

            info('Extract a Matching C+G Background')
            bins = np.hstack((np.linspace(0, 1, c_g_bins), np.inf))
            target_hist = np.histogram(c_g_content_target, bins)[0]
            bg_hist = np.histogram(c_g_content_bg, bins)[0]
            ratios = bg_hist / (target_hist * 1.0)
            debug('original:%s' % target_hist)
            debug('bg:%s' % bg_hist)
            debug('ratios:%s' % ratios)
            K_MATCH = min(
                bg_target_ratio,
                ratios[~np.isnan(ratios) & ~np.isinf(ratios) & (ratios > 0) &
                       (target_hist / float(target_hist.sum()) > 0.05)].min())

            debug('K_MATCH:%d' % K_MATCH)

            to_match = np.int32(np.floor(K_MATCH * target_hist))

            debug('to_match:%s' % to_match)

            idxs_corrected_bg = np.array([], dtype=int)

            for idx_bin in range(len(bins) - 1):
                idxs_matching_regions = \
                    np.nonzero((c_g_content_bg >= bins[idx_bin]) & (c_g_content_bg < bins[idx_bin + 1]))[0]
                to_take = np.random.permutation(len(idxs_matching_regions))
                to_take = to_take[range(
                    min(len(idxs_matching_regions), to_match[idx_bin]))]
                idxs_corrected_bg = np.hstack(
                    (idxs_corrected_bg, idxs_matching_regions[to_take]))

            debug('original:%s' % target_hist)
            debug('K:%d' % K_MATCH)
            debug('to sample:%s' % to_match)
            debug('obtained:%s' %
                  np.histogram(c_g_content_bg[idxs_corrected_bg], bins)[0])
            bg_coords = [bg_coords[idx] for idx in idxs_corrected_bg]
            c_g_content_bg = calculate_average_ngram_presence(
                bg_coords, genome, ngram_correction)
            debug(np.histogram(c_g_content_bg, bins)[0])
            if np.array_equal(K_MATCH * target_hist,
                              np.histogram(c_g_content_bg, bins)[0]):
                info('C+G content perfectly matched!\n\ttarget:%s\n\tbg    :%s'
                     % (target_hist, np.histogram(c_g_content_bg, bins)[0]))
            else:
                warn(
                    'C+G content not perfectly matched\n\ttarget:%s\n\tbg    :%s'
                    % (target_hist, np.histogram(c_g_content_bg, bins)[0]))

            debug(target_hist / np.histogram(c_g_content_bg, bins)[0])

    if len(bg_coords) >= bg_target_ratio * n_target_coordinates:
        bg_coords = random.sample(bg_coords,
                                  int(bg_target_ratio * n_target_coordinates))
    else:
        if bootstrap and len(bg_coords) < (bg_target_ratio *
                                           n_target_coordinates *
                                           0.95):  # allow a small tollerance!
            info('bootstrapping to obtain enough background regions')
            bg_coords = sample_wr(bg_coords,
                                  int(bg_target_ratio * n_target_coordinates))
            c_g_content_bg = calculate_average_ngram_presence(
                bg_coords, genome, ngram_correction)
            debug('After bootstrap:\n\ttarget:%s\n\tbg    :%s' %
                  (target_hist, np.histogram(c_g_content_bg, bins)[0]))

    info('Extracting Motifs in background coordinates')
    negative_matrix, motifs_profiles_in_bg, idxs_seqs_with_motif_bg = parallel_fimo_scanning(
        bg_coords,
        meme_motifs_filename,
        genome,
        nucleotide_bg_filename,
        temp_directory=temp_directory,
        p_value=p_value,
        mask_repetitive=mask_repetitive,
        window_length=window_length,
        internal_window_length=internal_window_length,
        num_consumers=n_processes)[0:3]

    # allocate date for reports
    N_MOTIFS = len(motif_ids)
    rankings = np.zeros(N_MOTIFS, dtype=np.int16)
    motif_ratios = np.zeros(N_MOTIFS)
    support_p = np.zeros(N_MOTIFS)
    support_n = np.zeros(N_MOTIFS)
    fisher_p_values = np.zeros(N_MOTIFS)
    central_enrichment = np.zeros(N_MOTIFS)

    N_seq_p = positive_matrix.shape[0]
    N_seq_n = negative_matrix.shape[0]

    profile_presence_p = (positive_matrix > 0).sum(0)
    profile_presence_n = (negative_matrix > 0).sum(0)

    support_p = profile_presence_p / float(N_seq_p)
    support_n = profile_presence_n / float(N_seq_n)

    internal_bpstart = window_length / 2 - internal_window_length / 2
    internal_bpend = window_length / 2 + internal_window_length / 2

    for idx, motif_id in enumerate(motif_ids):
        fisher_p_values[idx] = stats.fisher_exact(
            [[profile_presence_p[idx], N_seq_p - profile_presence_p[idx]],
             [profile_presence_n[idx], N_seq_n - profile_presence_n[idx]]])[1]
        central_enrichment[idx] = motifs_profiles_in_sequences[motif_id][
            internal_bpstart:internal_bpend].mean() / np.hstack([
                motifs_profiles_in_sequences[motif_id][:internal_bpstart],
                motifs_profiles_in_sequences[motif_id][internal_bpend:]
            ]).mean()

    motif_ratios = (support_p + 0.01) / (support_n + 0.01)

    # Foundamental!
    if not disable_ratio:
        motif_ratios[support_p < 0.03] = 1

    rankings = stats.rankdata(-motif_ratios)

    # filter here positive or positive and negative#################################
    if not disable_ratio:
        idxs_to_keep = np.nonzero(motif_ratios > 1)[0]
    else:
        idxs_to_keep = range(len(motif_ratios))

    rankings = rankings[idxs_to_keep]
    motif_ratios = motif_ratios[idxs_to_keep]
    support_p = support_p[idxs_to_keep]
    support_n = support_n[idxs_to_keep]
    fisher_p_values = fisher_p_values[idxs_to_keep]
    central_enrichment = central_enrichment[idxs_to_keep]

    motif_ids = [motif_ids[_] for _ in idxs_to_keep]
    motif_names = [motif_names[_] for _ in idxs_to_keep]
    motif_idxs = [_ for _ in idxs_to_keep]

    try:
        qvalues = estimate_qvalues(fisher_p_values)
        # we test the ones only with ratio >1
    except:
        print fisher_p_values

    # qvalues=estimate_qvalues(fisher_p_values,m=len(motif_ids))
    ################################################################################

    # generate reports in html
    info('Generating HTML report...')
    imgs_directory = os.path.join(output_directory, 'images')
    genes_list_directory = os.path.join(output_directory, 'genes_lists')
    motif_regions_directory = os.path.join(output_directory, 'motifs_regions')

    # create folders
    if not os.path.exists(imgs_directory):
        os.makedirs(imgs_directory)
    if use_gene_annotations and not os.path.exists(genes_list_directory):
        os.makedirs(genes_list_directory)
    if not os.path.exists(motif_regions_directory):
        os.makedirs(motif_regions_directory)

    j2_env = Environment(
        loader=FileSystemLoader(determine_path('extra') + '/templates/'),
        trim_blocks=True)

    info('DIRECTORY:%s' % determine_path('extra') + '/templates/')
    template = j2_env.get_template('report_template.html')

    # copy haystack logo and bg
    shutil.copyfile(
        determine_path('extra') + '/templates/haystack_logo.png',
        os.path.join(imgs_directory, 'haystack_logo.png'))
    shutil.copyfile(
        determine_path('extra') + '/templates/noise.png',
        os.path.join(imgs_directory, 'noise.png'))

    motifs_dump = []
    for i in np.argsort(rankings):
        if (support_p[i] >= 0.03
                or disable_ratio) and fisher_p_values[i] < 0.01 and (
                    motif_ratios[i] > 1 or disable_ratio
                ) and central_enrichment[i] > min_central_enrichment:
            # if (support_p[i]>=0.01 or  support_n[i]>=0.01) and fisher_p_values[i]<0.1 and  (central_enrichment[i]>1.1 or central_enrichment[i]<0.9) and  ( motif_ratios[i]>1.1 or motif_ratios[i]<0.9):

            info('Generating logo and profile for:' + motif_ids[i])

            # create motif logo
            img_logo = os.path.join(imgs_directory, 'logo_' + motif_ids[i])
            generate_weblogo(motif_ids[i],
                             meme_motifs_filename,
                             img_logo,
                             title=motif_ids[i])
            generate_weblogo(motif_ids[i],
                             meme_motifs_filename,
                             img_logo,
                             title=motif_ids[i],
                             file_format='pdf')
            # fix the weblogo prefix problem
            img_logo_url = os.path.join('images',
                                        'logo_' + motif_ids[i] + '.png')

            # create motif enrichment profile
            img_profile = os.path.join(imgs_directory,
                                       'profile_' + motif_ids[i] + '.png')
            motif_profile_target = motifs_profiles_in_sequences[
                motif_ids[i]] / N_seq_p
            motif_profile_bg = motifs_profiles_in_bg[motif_ids[i]] / N_seq_n

            # print motif_profile_target.shape, motif_profile_bg.shape
            generate_motif_profile(motif_profile_target,
                                   motif_profile_bg,
                                   motif_ids[i],
                                   img_profile,
                                   smooth_size=smooth_size,
                                   window_size=window_length)
            img_profile_url = os.path.join('images',
                                           'profile_' + motif_ids[i] + '.png')

            # create regions
            info('Extracting regions with:' + motif_ids[i])
            regions = os.path.join(
                motif_regions_directory,
                motif_ids[i] + '_motif_region_in_target.bed')
            with open(regions, 'w+') as outfile:
                outfile.write(
                    'Chromosome\tStart\tEnd\tMotif hits inside region\tNumber of hits\n'
                )
                for c, locations in motif_coords_in_seqs_with_motif[
                        motif_ids[i]].items():
                    outfile.write('\t'.join([
                        c.chr_id,
                        str(c.bpstart),
                        str(c.bpend), ';'.join([
                            '-'.join(map(str, map(int, l))) for l in locations
                        ]),
                        str(len(locations))
                    ]) + '\n')
            regions_url = os.path.join(
                'motifs_regions', motif_ids[i] + '_motif_region_in_target.bed')

            # map closest downstream genes
            genes_url = None
            if use_gene_annotations:
                info('Mapping regions with:%s to the clostest genes' %
                     motif_ids[i])

                peak_annotator_path = os.path.join(determine_path('extra/'),
                                                   'PeakAnnotator.jar')

                if gene_ids_to_names_filename:
                    sb.call('java -jar ' + peak_annotator_path + ' -u TSS -p %s -a %s -s %s -o %s >/dev/null 2>&1' \
                            % (regions, gene_annotations_filename, gene_ids_to_names_filename, genes_list_directory),
                            shell=True)
                else:
                    sb.call('java -jar ' + peak_annotator_path + ' -u TSS -p %s -a %s  -o %s >/dev/null 2>&1' \
                            % (regions, gene_annotations_filename, genes_list_directory), shell=True)

                genes_url = os.path.join(
                    'genes_lists',
                    motif_ids[i] + '_motif_region_in_target.tss.bed')

            motifs_dump.append({
                'id': motif_ids[i],
                'name': motif_names[i],
                'support_p': support_p[i] * 100,
                'support_n': support_n[i] * 100,
                'ratio': motif_ratios[i],
                'rank': float(rankings[i]),
                'pvalue': fisher_p_values[i],
                'qvalue': qvalues[i],
                'central_enrichment': central_enrichment[i],
                'img_logo': img_logo_url,
                'img_profile': img_profile_url,
                'regions': regions_url,
                'genes': genes_url,
                'idx_motif': motif_idxs[i]
            })

    outfile = codecs.open(
        os.path.join(output_directory, "Haystack_report.html"), "w", "utf-8")
    outfile.write(template.render(motifs_dump=motifs_dump, bed_target_filename=bed_target_filename,
                                  bed_bg_filename=bed_bg_filename, N_TARGET=N_seq_p, N_BG=N_seq_n, \
                                  meme_motifs_filename=meme_motifs_filename, COMMAND_USED=COMMAND_USED,
                                  use_gene_annotations=use_gene_annotations))
    outfile.close()

    if dump:
        info('Saving all the intermediate data on: %s ...' % output_directory)
        dump_directory = os.path.join(output_directory, 'dump')

        if not os.path.exists(dump_directory):
            os.makedirs(dump_directory)

        np.save(os.path.join(dump_directory, 'matrix_' + target_name),
                positive_matrix)
        np.save(os.path.join(dump_directory, 'matrix_BG_' + target_name),
                negative_matrix)

        cp.dump(
            motifs_dump,
            open(
                os.path.join(dump_directory,
                             target_name + '_motif_dumps.pickle'), 'w'))

        # cp.dump( motifs_profiles_in_sequences,open( os.path.join(dump_directory,target_name+'_profiles.pickle'),'w'))
        # cp.dump( motifs_profiles_in_bg,open( os.path.join(dump_directory,bg_name+'_profiles.pickle'),'w'))

        cp.dump(
            idxs_seqs_with_motif,
            open(
                os.path.join(dump_directory,
                             target_name + '_motif_seqs_idxs.pickle'), 'w'))
        cp.dump(
            idxs_seqs_with_motif_bg,
            open(
                os.path.join(dump_directory,
                             bg_name + '_motif_seqs_idxs.pickle'), 'w'))

        cp.dump(
            motif_coords_in_seqs_with_motif,
            open(
                os.path.join(
                    dump_directory,
                    target_name + '_motif_coords_in_seqs_with_motif.pickle'),
                'w'))

        Coordinate.coordinates_to_bed(
            target_coords,
            os.path.join(
                dump_directory,
                'Target_coordinates_selected_on_' + target_name + '.bed'),
            minimal_format=False)
        Coordinate.coordinates_to_bed(
            bg_coords,
            os.path.join(dump_directory,
                         'BG_coordinates_selected_on_' + bg_name + '.bed'),
            minimal_format=True)
    #info('Motif analysis for Sample %s completed' %name)
    info('Motif analysis completed! Ciao!')
Пример #2
0
def parallel_fimo_scanning(target_coords, meme_motifs_filename, genome,
                           nucleotide_bg_filename, temp_directory, p_value,
                           mask_repetitive, window_length,
                           internal_window_length, num_consumers):
    fimo = Fimo(meme_motifs_filename,
                nucleotide_bg_filename,
                temp_directory=temp_directory,
                p_value=p_value)

    # init variables
    prefix = 'haystack_motifs_' + str(uuid.uuid4())

    motifs_profiles_in_sequences = dict()
    idxs_seqs_with_motif = dict()
    motif_coords_in_seqs_with_motif = dict()

    # extend with flanking
    original_target_coords = target_coords

    if window_length:
        internal_bpstart = window_length / 2 - internal_window_length / 2
        internal_bpend = window_length / 2 + internal_window_length / 2
        target_coords = Coordinate.coordinates_of_intervals_around_center(
            target_coords, window_length)

    # write fasta
    target_coords_fasta_filename = os.path.join(temp_directory, prefix + '.fa')
    Coordinate.coordinates_to_fasta(target_coords,
                                    target_coords_fasta_filename, genome)

    # mapping
    coord_to_idx = dict()
    for idx, c in enumerate(target_coords):
        coord_to_idx[str(c).split()[0]] = idx

    for motif_id in fimo.motif_ids:
        motifs_profiles_in_sequences[motif_id] = np.zeros(len(c))
        idxs_seqs_with_motif[motif_id] = set()
        motif_coords_in_seqs_with_motif[motif_id] = pickable_defaultdict()

    motifs_in_sequences_matrix = np.zeros(
        (len(target_coords), len(fimo.motif_ids)))

    # num_consumers= num_consumers -2

    # compute motifs with fimo
    if num_consumers > 1:

        # partial function for multiprocessing
        compute_single_motif = partial(call_fimo, target_coords_fasta_filename,
                                       prefix, meme_motifs_filename,
                                       nucleotide_bg_filename, temp_directory,
                                       p_value)

        pool = mp.Pool(processes=num_consumers)
        pool.map(compute_single_motif, fimo.motif_ids)
        pool.close()
        pool.join()
        fimo_output_filename = os.path.join(temp_directory,
                                            prefix + '_fimo_output.motifs')
        sb.call('cat %s*.motifs > "%s"' %
                (os.path.join(temp_directory, prefix), fimo_output_filename),
                shell=True)
    else:
        call_fimo(target_coords_fasta_filename, prefix, meme_motifs_filename,
                  nucleotide_bg_filename, temp_directory, p_value,
                  'ALL_MOTIFS')
        fimo_output_filename = os.path.join(
            temp_directory, '%s_%s.motifs' % (prefix, 'ALL_MOTIFS'))

    with open(fimo_output_filename) as infile:

        for line in infile:
            try:

                motif_id, motif_coord, motif_start, motif_end = line.split()
                motif_start = int(motif_start)
                motif_end = int(motif_end)
                idx_seq = coord_to_idx[motif_coord]

                motifs_profiles_in_sequences[motif_id][
                    motif_start:motif_end] += 1.0

                if motif_start >= internal_bpstart and motif_end <= internal_bpend:  # keep track only if is in the internal window!
                    idxs_seqs_with_motif[motif_id].add(idx_seq)
                    motifs_in_sequences_matrix[
                        idx_seq, fimo.motif_id_to_index[motif_id]] = +1
                    motif_coords_in_seqs_with_motif[motif_id][
                        original_target_coords[idx_seq]].append(
                            (motif_start + target_coords[idx_seq].bpstart - 1,
                             motif_end + target_coords[idx_seq].bpstart - 1))
            except:
                print line

    sb.call('rm %s* ' % os.path.join(temp_directory, prefix), shell=True)

    return motifs_in_sequences_matrix, motifs_profiles_in_sequences, idxs_seqs_with_motif, motif_coords_in_seqs_with_motif, fimo.motif_names, fimo.motif_ids