Example #1
def annotate_peaks(peaks, ref_path):
    """
    peak to gene annotation strategy:
        1. if a peak overlaps with promoter region (-1kb, + 100) of any TSS, call it a promoter peak
        2. if a peak is within 200kb of the closest TSS, AND if it is not a promoter peak, call it a distal peak
        3. if a peak overlaps of a transcript, AND it is not a promoter nor a distal peak of the gene, call it a distal peak
            This step is optional
        4. call it an intergenic peak
    """

    ref_mgr = ReferenceManager(ref_path)
    tss = BedTool(ref_mgr.tss_track)

    # If tss.bed contains a 7th column (gene type), apply the filter; otherwise use all TSS sites
    if tss.field_count() == 7:
        tss_filtered = tss.filter(lambda x: x[6] in TRANSCRIPT_ANNOTATION_GENE_TYPES).saveas()
    else:
        df_tss = tss.to_dataframe()
        df_tss['gene_type'] = '.'
        tss_filtered = BedTool.from_dataframe(df_tss).saveas()

    # including transcripts.bed is optional
    if ref_mgr.transcripts_track is None:
        transcripts_filtered = BedTool([])
    else:
        transcripts = BedTool(ref_mgr.transcripts_track)
        if transcripts.field_count() == 7:
            transcripts_filtered = transcripts.filter(lambda x: x[6] in TRANSCRIPT_ANNOTATION_GENE_TYPES).saveas()
        else:
            df_tx = transcripts.to_dataframe()
            df_tx['gene_type'] = '.'
            transcripts_filtered = BedTool.from_dataframe(df_tx).saveas()

    # run bedtools closest for peaks against filtered tss, group by peaks and summarize annotations from select columns
    peaks_nearby_tss = peaks.closest(tss_filtered, D='b', g=ref_mgr.fasta_index).groupby(g=[1, 2, 3], c=[7, 11], o=['collapse']).saveas()

    results = []
    peaks_nearby_tss_butno_tx = peaks_nearby_tss.intersect(transcripts_filtered, v=True).saveas()

    # avoid an error when no peaks overlap any transcripts
    if len(peaks_nearby_tss_butno_tx) < len(peaks_nearby_tss):
        peaks_nearby_tss_and_tx = peaks_nearby_tss \
            .intersect(transcripts_filtered, wa=True, wb=True) \
            .groupby(g=[1, 2, 3, 4, 5], c=[9], o=['distinct'])

        for peak in peaks_nearby_tss_and_tx:
            results.append(get_peak_nearby_genes(peak))

    for peak in peaks_nearby_tss_butno_tx:
        results.append(get_peak_nearby_genes(peak))

    return results
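A minimal, self-contained sketch of the field_count() branching used above, with an in-memory TSS track (toy data; ALLOWED is a stand-in for TRANSCRIPT_ANNOTATION_GENE_TYPES, which is defined elsewhere in the pipeline):

from pybedtools import BedTool

# Toy 7-column TSS track: BED6 plus a gene_type column.
tss = BedTool(
    "chr1\t100\t101\tTSS1\t0\t+\tprotein_coding\n"
    "chr1\t500\t501\tTSS2\t0\t-\tlincRNA\n",
    from_string=True,
)
ALLOWED = {"protein_coding"}  # stand-in for TRANSCRIPT_ANNOTATION_GENE_TYPES

if tss.field_count() == 7:  # gene_type column present
    tss_filtered = tss.filter(lambda x: x[6] in ALLOWED).saveas()
else:
    tss_filtered = tss
print(len(tss_filtered))  # -> 1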
Example #2
def xstream(a, b, distance, updown, out):
    """
    find all things in b that are within
    distance of a in the given direction
    (up or down-stream)
    """
    direction = dict(u="l", d="r")[updown[0]]
    kwargs = {'sw':True, direction: distance}

    if "l" in kwargs: kwargs["r"] = 0
    else: kwargs["l"] = 0
    a = BedTool(a).saveas()

    kwargs['stream'] = True
    c = a.window(b, **kwargs)
    afields = a.field_count()

    seen = collections.defaultdict(set)
    for feat in c:
        key = "\t".join(feat[:afields])
        # keep track of all the feature names that overlap this one
        seen[key].update((feat[afields + 3],))

    # the entries that did appear in the window
    for row in seen:
        out.write(row + "\t" + ",".join(sorted(seen[row])) + "\n")

    # write the entries that did not appear in the windowed BED
    for row in a:
        key = "\t".join(row[:afields])
        if key in seen: continue
        out.write(str(row) + "\t.\n")
    out.flush()
    assert len(BedTool(out.name)) == len(a)
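A hedged sketch of how xstream's kwargs drive BedTool.window(): searching upstream only means l=distance, r=0, with sw=True so left/right are interpreted relative to each a-feature's strand. Toy data; requires the bedtools binary:

from pybedtools import BedTool

a = BedTool("chr1\t1000\t1100\tgeneA\t0\t+\n", from_string=True).saveas()
b = BedTool("chr1\t800\t900\tpeak1\t0\t+\n", from_string=True)

# peak1 lies within 500 bp upstream of geneA (+ strand), so it is reported.
for feat in a.window(b, sw=True, l=500, r=0):
    print(feat)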
Example #3
def _iter_pairwise_connections(
    clusterable_bedtool: pybedtools.BedTool,
    min_reciprocal_overlap: float,
    min_sample_overlap: float = 0,
    is_carrier: Mapping[Text, numpy.ndarray] = MappingProxyType({})
) -> Iterator[Tuple[Text, Text]]:
    """
    Iterate over pairs of variant intervals that meet minimum requirement for reciprocal overlap. Exclude self-overlaps.
    Optionally impose requirement of minimum Jaccard index for carrier samples.
    Parameters
    ----------
    clusterable_bedtool: BedTool
        bed object with intervals that may overlap each other
    min_reciprocal_overlap: float
        minimum reciprocal overlap for two intervals to be connected
    min_sample_overlap: float (default=0)
        minimum Jaccard index of carrier samples for two intervals to be connected
    is_carrier: Mapping[Text, numpy.ndarray]
        map from variant ID to carrier status (array boolean True/False for each sample)
    Yields
    ------
    variant_id_1, variant_id_2: Tuple[Text, Text]
        successive pairs of variant IDs that meet the overlap requirements
    """
    # Cluster intervals based on reciprocal overlap
    if len(clusterable_bedtool) == 0:
        return
    overlap_bedtool = clusterable_bedtool.intersect(clusterable_bedtool,
                                                    f=min_reciprocal_overlap,
                                                    r=True,
                                                    wa=True,
                                                    wb=True,
                                                    sorted=True,
                                                    nonamecheck=True)
    # name_field and sv_type_field are module-level column indices into the BED records
    num_1_fields = clusterable_bedtool.field_count()
    name_1_field = name_field
    sv_type_1_field = sv_type_field
    name_2_field = num_1_fields + name_field
    sv_type_2_field = num_1_fields + sv_type_field

    if min_sample_overlap > 0:
        for overlap in overlap_bedtool:
            fields = overlap.fields
            if fields[sv_type_1_field] != fields[sv_type_2_field]:
                continue  # only cluster same sv_type
            name_1 = fields[name_1_field]
            name_2 = fields[name_2_field]
            if name_1 != name_2 and jaccard_index(
                    is_carrier[name_1],
                    is_carrier[name_2]) >= min_sample_overlap:
                yield name_1, name_2
    else:
        for overlap in overlap_bedtool:
            fields = overlap.fields
            if fields[sv_type_1_field] != fields[sv_type_2_field]:
                continue  # only cluster same sv_type
            name_1 = fields[name_1_field]
            name_2 = fields[name_2_field]
            if name_1 != name_2:
                yield name_1, name_2
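The jaccard_index helper is not shown above; a plausible minimal implementation over boolean carrier arrays (an assumption, not the original code):

import numpy

def jaccard_index(a: numpy.ndarray, b: numpy.ndarray) -> float:
    # Jaccard index of two boolean arrays: |a & b| / |a | b|; 0.0 if both are empty.
    union = numpy.logical_or(a, b).sum()
    return float(numpy.logical_and(a, b).sum() / union) if union else 0.0

print(jaccard_index(numpy.array([True, True, False]),
                    numpy.array([True, False, False])))  # -> 0.5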
Example #4
def get_annotation_gene_types(args):
    """
    Return the gene types to use to filter genes/transcript
    annotations by.
    """
    ref_mgr = ReferenceManager(args.reference_path)
    tss = BedTool(ref_mgr.tss_track)
    if tss.field_count() == 7:
        return TRANSCRIPT_ANNOTATION_GENE_TYPES
    else:
        return None
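field_count() samples the first lines of the track to infer the column count, which is why a 7th column signals the presence of gene-type annotations. A quick self-contained illustration with toy data:

from pybedtools import BedTool

bed6 = BedTool("chr1\t0\t10\tx\t0\t+\n", from_string=True)
bed7 = BedTool("chr1\t0\t10\tx\t0\t+\tprotein_coding\n", from_string=True)
print(bed6.field_count(), bed7.field_count())  # -> 6 7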
Example #5
def extract_ge_folchange_per_peak(peaks, tables, closestMapping, features,
                                  IdColumn, hm):
    """
    Updates the values on the input matrix by appending the requested values
    from the given tables when closest genes have been found.
    """
    ## keyMap_closest: peak_key:gene_id
    ## peak_keys: cols 1-7 from bed format (1-based index)
    Peaks = BedTool(peaks).sort()
    field_count = Peaks.field_count()
    keyMap_closest = keymap_from_closest_genes(closestMapping, peaks, field_count)
    __update_matrix_values(peaks, keyMap_closest, tables, features, IdColumn, hm)
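keymap_from_closest_genes is not shown; a hedged sketch of the idea it implements: key each peak by its first field_count columns and map it to the name of the closest gene from a BedTool.closest() result (toy data; requires the bedtools binary):

import collections
from pybedtools import BedTool

peaks = BedTool("chr1\t100\t200\tpeak1\n", from_string=True).sort()
genes = BedTool("chr1\t300\t400\tgeneA\t0\t+\n", from_string=True).sort()

field_count = peaks.field_count()
keymap = collections.defaultdict(list)
for feat in peaks.closest(genes):
    key = "\t".join(feat[:field_count])
    keymap[key].append(feat[field_count + 3])  # name column of the b side
print(dict(keymap))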
Example #6
def add_closest(aname, bname):
    a, b = BedTool(aname), BedTool(bname)

    afields = a.field_count()
    c = a.closest(b, d=True)
    get_name = gen_get_name(b, afields)

    dbed = open(BedTool._tmp(), "w")
    # keep the name and distance
    seen_by_line = collections.defaultdict(list)
    for feat in c:
        key = "\t".join(feat[:afields])
        seen_by_line[key].append([feat[-1], get_name(feat)])

    for key, dist_names in seen_by_line.items():
        if len(dist_names) > 0:
            assert len(set([d[0] for d in dist_names])) == 1
        names = ",".join(sorted(set(d[1] for d in dist_names)))
        new_line = "\t".join([key] + [names] + [dist_names[0][0]])
        dbed.write(new_line + "\n")
    dbed.close()
    d = BedTool(dbed.name)
    assert len(d) == len(a)
    return d
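gen_get_name is not shown; a plausible stand-in (an assumption, not the original helper): build a function that extracts the name of the b-side feature from a closest() result, falling back to the b-side start when b has fewer than four columns:

def gen_get_name(b, afields):
    # b's name column sits 3 fields after its chrom, which starts at afields.
    name_idx = afields + 3 if b.field_count() >= 4 else afields + 1
    return lambda feat: feat[name_idx]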
Example #7
def mk_matrix(inputfile=None,
              outputfile=None,
              bigwiglist=None,
              ft_type=None,
              pseudo_count=0,
              upstream=1000,
              downstream=1000,
              bin_around_frac=0.1,
              chrom_info=None,
              bin_nb=100,
              nb_proc=None,
              labels=None,
              no_stranded=False,
              zero_to_na=False):
    """
 Description: Create a matrix to be used by 'profile' and 'heatmap' commands.
    """

    # -------------------------------------------------------------------------
    # Check argument consistency
    #
    # -------------------------------------------------------------------------

    if ft_type in ['single_nuc', 'promoter', 'tts']:
        region_size = upstream + downstream + 1
        if region_size < bin_nb:
            message(
                "The region (-u/-d) needs to be extended given the number "
                "of bins (--bin-nb)",
                type="ERROR")

    # -------------------------------------------------------------------------
    # Check output file name does not end with .zip
    #
    # -------------------------------------------------------------------------

    if outputfile.name.endswith(".zip"):
        outfn = outputfile.name.replace(".zip", "")
        outputfile = open(outfn, "w")

    # -------------------------------------------------------------------------
    # Check input file is in bed or GTF format
    #
    # -------------------------------------------------------------------------

    message("Loading input file...")
    if inputfile.name == '<stdin>':
        gtf = GTF(inputfile.name)
        is_gtf = True
        if ft_type == 'user_regions':
            message(
                "--ft-type can not be set to user_regions"
                " when a gtf is provided.",
                type="ERROR")
    else:
        try:

            region_bo = BedTool(inputfile.name)
            len(region_bo)
        except IndexError:
            message("Unable to read the input file. Check format",
                    type="ERROR")
        if len(region_bo) == 0:
            message("Unable to find requested regions", type="ERROR")

        if region_bo.file_type == 'gff':
            message('Loading the GTF file.')
            gtf = GTF(inputfile.name)
            is_gtf = True
        else:
            is_gtf = False

            if ft_type != 'user_regions' and ft_type != 'single_nuc':
                message(
                    "Set --ft-type to 'user_regions' or 'single_nuc'"
                    " when using input bed file.",
                    type="ERROR")
            # Check that the strand is provided and
            # check it is located in the right column
            # (not checked by BedTool...).
            if region_bo.field_count() < 6:
                if not no_stranded:
                    message("Strand is undefined. Use -nst.", type="ERROR")
            else:
                region_name = dict()
                for i in region_bo:
                    if region_name.get(i.name, None) is None:
                        region_name[i.name] = 1
                    else:
                        message(
                            "Regions in bed file should have "
                            "unique identifier (col 4).",
                            type="ERROR")
                    if i.strand[0] not in ['.', '+', '-']:
                        message("Strand should be one of '+','-' or '.'.",
                                type="ERROR")
                    if ft_type == 'single_nuc':
                        if i.end - i.start != 1:
                            message(
                                "Region length should be 1 nucleotide "
                                "long when 'single_nuc' is set. Use 'user_regions'.",
                                type="ERROR")
                    elif ft_type == 'user_regions':
                        if i.end - i.start == 1:
                            message(
                                "Region length should not be 1 nucleotide "
                                "long when 'user_regions' is set. Use 'single_nuc'.",
                                type="ERROR")

    # -------------------------------------------------------------------------
    # Create a list of labels for the diagrams.
    # Take user input in account
    # -------------------------------------------------------------------------
    message('Checking labels.')

    if labels is not None:
        labels = labels.split(",")
        # Ensure the number of labels is the same as the number of bw files.
        if len(labels) != len(bigwiglist):
            message(
                "The number of labels should be the same as the number of"
                " bigwig files.",
                type="ERROR")
        # Ensure labels are non-redundant
        if len(labels) > len(set(labels)):
            message("Labels must be unique.", type="ERROR")
    else:
        labels = []
        for i in range(len(bigwiglist)):
            labels += [
                os.path.splitext(os.path.basename(bigwiglist[i].name))[0]
            ]

    # -------------------------------------------------------------------------
    #
    # Get the requested transcript lines in bed format.
    # Transcripts are restricted to those found on chromosomes
    # declared in the bigwig files.
    # -------------------------------------------------------------------------
    message('Getting the list of chromosomes declared in bigwig files.')
    bw_chrom = list()
    for i in bigwiglist:
        bw_chrom += list(pyBigWig.open(i.name).chroms().keys())

    bed_col = [0, 1, 2, 3, 4, 5]

    if is_gtf:

        message('Selecting chromosomes declared in bigwig from gtf.')
        tmp = gtf.select_by_key("feature", "transcript").select_by_key(
            "seqid", ",".join(bw_chrom))

        tmp_tx_name = tmp.extract_data("transcript_id", as_list=True)

        # If several transcript records are associated with
        # the same transcript_id, raise an error.
        if len(tmp_tx_name) > len(set(tmp_tx_name)):
            message('Transcripts should have a unique identifier.',
                    type="ERROR")

        message('Selecting requested regions.')

        # ----------------------------------------------------------------------
        #
        # Slop tss and promoters.
        # No need if transcript was requested (it will be flanked by upstream
        # and downstream regions later on).
        # ----------------------------------------------------------------------

        if ft_type == 'transcript':
            message("Getting transcript boundaries (input gtf).")

            main_region_bo = tmp.to_bed(name=["transcript_id"])

        elif ft_type == 'promoter':

            message("Getting promoter regions [-%d,+%d]." %
                    (upstream, downstream))

            main_region_bo = tmp.get_tss(name=["transcript_id"]).slop(
                s=True, l=upstream, r=downstream, g=chrom_info.name)

        elif ft_type == 'tts':

            main_region_bo = tmp.get_tts(name=["transcript_id"]).slop(
                s=True, l=upstream, r=downstream, g=chrom_info.name)

    else:
        message("Loading regions")

        if ft_type == 'user_regions':
            main_region_bo = BedTool(inputfile.name).cut(bed_col)
        elif ft_type == 'single_nuc':
            main_region_bo = BedTool(inputfile.name).cut(bed_col).slop(
                s=True, l=upstream, r=downstream, g=chrom_info.name)
        else:
            message("Unknown method.")

    # Save for traceability
    main_region_bed = make_tmp_file(prefix="region" + ft_type, suffix=".bed")
    main_region_bo.saveas(main_region_bed.name)

    # -------------------------------------------------------------------------
    #
    # Print a header in the output file
    #
    # -------------------------------------------------------------------------
    message("Preparing comments")

    comments = "#"
    comments += "ft_type:" + ft_type + ";"
    comments += "from:" + str(upstream) + ";"
    comments += "to:" + str(downstream) + ";"
    comments += "labels:" + ",".join(labels) + ";"

    # -------------------------------------------------------------------------
    # Compute coverage of requested region
    # Each worker will send a file
    # -------------------------------------------------------------------------

    outputfile_list = {}
    message("Using %d bins for main region." % bin_nb)

    tmp_file = bw_profile_mp(in_bed_file=main_region_bed.name,
                             nb_proc=nb_proc,
                             big_wig=[x.name for x in bigwiglist],
                             bin_nb=bin_nb,
                             pseudo_count=pseudo_count,
                             stranded=not no_stranded,
                             type="main",
                             labels=labels,
                             outputfile=outputfile.name,
                             zero_to_na=zero_to_na,
                             verbose=pygtftk.utils.VERBOSITY)

    outputfile_list["main"] = tmp_file

    # -------------------------------------------------------------------------
    # If transcript was requested
    # we must process flanking regions
    # We need to retrieve coverage of promoter [-upstream, 0]
    # as transcript coverage window size will depend on transcript length.
    # For promoter the length of windows will be fixed.
    # -------------------------------------------------------------------------

    if ft_type in ['transcript', 'user_regions']:

        # Number of bins for TTS and TSS
        around_bin_nb = int(round(bin_nb * bin_around_frac))
        if around_bin_nb < 1:
            around_bin_nb = 1

        if upstream > 0:

            if ft_type == 'transcript':
                message("Getting promoter (using %d bins)." % around_bin_nb)
                ups_region_bo = tmp.get_tss(name=["transcript_id"]).slop(
                    s=True, l=upstream, r=-1, g=chrom_info.name).cut(bed_col)

            else:
                message("Getting upstream regions (%d bins)." % around_bin_nb)
                ups_region_bo = main_region_bo.flank(s=True,
                                                     l=upstream,
                                                     r=0,
                                                     g=chrom_info.name)

            upstream_bed_file = make_tmp_file(prefix="upstream_region" +
                                              ft_type,
                                              suffix=".bed")

            ups_region_bo.saveas(upstream_bed_file.name)

            tmp_file = bw_profile_mp(in_bed_file=upstream_bed_file.name,
                                     nb_proc=nb_proc,
                                     big_wig=[x.name for x in bigwiglist],
                                     bin_nb=around_bin_nb,
                                     pseudo_count=pseudo_count,
                                     stranded=not no_stranded,
                                     type="upstream",
                                     labels=labels,
                                     outputfile=outputfile.name,
                                     zero_to_na=zero_to_na,
                                     verbose=pygtftk.utils.VERBOSITY)

            outputfile_list["upstream"] = tmp_file

        if downstream > 0:

            if ft_type == 'transcript':
                message("Getting TTS (using %d bins)." % around_bin_nb)
                dws_region_bo = tmp.get_tts(name=["transcript_id"]).slop(
                    s=True, l=-1, r=downstream, g=chrom_info.name).cut(bed_col)
            else:
                message("Getting downstream regions (%d bins)." %
                        around_bin_nb)

                dws_region_bo = main_region_bo.flank(s=True,
                                                     l=0,
                                                     r=downstream,
                                                     g=chrom_info.name)
            dws_bed_file = make_tmp_file(prefix="downstream_region" + ft_type,
                                         suffix=".bed")

            dws_region_bo.saveas(dws_bed_file.name)

            tmp_file = bw_profile_mp(in_bed_file=dws_bed_file.name,
                                     nb_proc=nb_proc,
                                     big_wig=[x.name for x in bigwiglist],
                                     bin_nb=around_bin_nb,
                                     pseudo_count=pseudo_count,
                                     stranded=not no_stranded,
                                     type="downstream",
                                     labels=labels,
                                     outputfile=outputfile.name,
                                     zero_to_na=zero_to_na,
                                     verbose=pygtftk.utils.VERBOSITY)

            outputfile_list["downstream"] = tmp_file

    # -------------------------------------------------------------------------
    #
    # Merge file using pandas
    #
    # -------------------------------------------------------------------------

    message("Reading (pandas): " + outputfile_list["main"].name, type="DEBUG")
    df_main = pd.read_csv(outputfile_list["main"].name, sep="\t")
    # save strand, start and end; they will be re-joined later
    df_copy = df_main[['bwig', 'chrom', 'gene', 'strand', 'start', 'end']]

    df_start = df_main.pop('start')
    df_end = df_main.pop('end')

    if "upstream" in outputfile_list:
        message("Merging upstream file")
        message("Reading (pandas): " + outputfile_list["upstream"].name,
                type="DEBUG")
        df_up = pd.read_csv(outputfile_list["upstream"].name, sep="\t")
        df_up = df_up.drop(columns=['start', 'end'])
        df_main = df_up.merge(df_main.loc[:, df_main.columns],
                              on=['bwig', 'chrom', 'gene', 'strand'])

    if "downstream" in outputfile_list:
        message("Merging downstream file")
        message("Reading (pandas): " + outputfile_list["downstream"].name,
                type="DEBUG")
        df_dws = pd.read_csv(outputfile_list["downstream"].name, sep="\t")
        df_dws = df_dws.drop(columns=['start', 'end'])
        df_main = df_main.merge(df_dws.loc[:, df_dws.columns],
                                on=['bwig', 'chrom', 'gene', 'strand'])

    # join start and end.
    df_main = df_main.merge(df_copy.loc[:, df_copy.columns],
                            on=['bwig', 'chrom', 'gene', 'strand'])
    df_start = df_main.pop('start')
    df_end = df_main.pop('end')
    df_main.insert(2, 'start', df_start)
    df_main.insert(3, 'end', df_end)

    message("Writing to file")
    outputfile.close()

    with open(outputfile.name, 'a') as f:
        f.write(comments + "\n")
        df_main.to_csv(f,
                       sep="\t",
                       index=False,
                       mode='a',
                       columns=df_main.columns,
                       na_rep='NA')

    # -------------------------------------------------------------------------
    #
    # Compress
    #
    # -------------------------------------------------------------------------

    message("Compressing")
    path = os.path.abspath(outputfile.name)
    filename = os.path.basename(path)
    message("filename: " + filename, type="DEBUG")
    zip_filename = filename + '.zip'
    message("zip_filename: " + zip_filename, type="DEBUG")
    zip_path = os.path.join(os.path.dirname(path), zip_filename)
    message("zip_path: " + zip_path, type="DEBUG")

    with zipfile.ZipFile(zip_path, 'w', allowZip64=True) as zf:
        zf.write(filename=path, arcname=filename)

    for i in outputfile_list:
        message("deleting " + outputfile_list[i].name)
        os.remove(outputfile_list[i].name)
    os.remove(outputfile.name)

    gc.disable()
    close_properly(inputfile, outputfile)
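A self-contained sketch of the default label derivation used above when labels are not given: the basename of each bigwig file without its extension (paths are hypothetical):

import os

bigwig_paths = ["/data/sample1.bw", "/data/sample2.bigWig"]
labels = [os.path.splitext(os.path.basename(p))[0] for p in bigwig_paths]
print(labels)  # -> ['sample1', 'sample2']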
Example #8
    def __call__(self, string):

        # ---------------------------------------------------------------
        # Check file extension
        # ---------------------------------------------------------------

        fasta_format_1 = r'(\.[Ff][Aa][Ss][Tt][Aa]$)|(\.[Ff][Nn][Aa]$)'
        fasta_format_2 = r'|(\.[Ff][Aa]$)|(\.[Ff][Aa][Ss]$)|(\.[Ff][Ff][Nn]$)|(\.[Ff][Rr][Nn]$)'
        fasta_regexp = fasta_format_1 + fasta_format_2
        fasta_regexp_gz = re.sub(r"\$", r"\.[Gg][Zz]$", fasta_regexp)
        bed_regexp = r'\.[Bb][Ee][Dd][3456]{0,1}$'
        bed_regexp_gz = re.sub(r"\$", r"\.[Gg][Zz]$", bed_regexp)
        gtf_regexp = r'\.[Gg][Tt][Ff]$'
        gtf_regexp_gz = re.sub(r"\$", r"\.[Gg][Zz]$", gtf_regexp)
        txt_regexp = r'(\.[Tt][Xx][Tt]$)|(\.[Cc][Ss][Vv]$)|(\.[Dd][Ss][Vv]$)|(\.[Tt][Aa][Bb]$)|(\.[Tt][Ss][Vv]$)'
        txt_regexp_gz = re.sub(r"\$", r"\.[Gg][Zz]$", txt_regexp)
        bigwig_regexp = r'(\.[Bb][Ww]$)|(\.[Bb][Ii][Gg][Ww][Ii][Gg]$)'
        zip_regexp = r'\.[Zz][Ii][Pp]$'
        pdf_regexp = r'\.[Pp][Dd][Ff]$'

        ext2regexp = {'bed': bed_regexp,
                      'bed.gz': bed_regexp_gz,
                      'gtf': gtf_regexp,
                      'gtf.gz': gtf_regexp_gz,
                      'fasta': fasta_regexp,
                      'fasta.gz': fasta_regexp_gz,
                      'txt': txt_regexp,
                      'txt.gz': txt_regexp_gz,
                      'bigwig': bigwig_regexp,
                      'zip': zip_regexp,
                      'pdf': pdf_regexp}

        # Set verbosity system wide as depending on
        # command line argument order, VERBOSITY (-V) can
        # be evaluated later...
        if '-V' in sys.argv:
            sys_args = ' '.join(sys.argv)
            verbosity_val = re.search('-V ?([01234])?', sys_args)
            if verbosity_val and verbosity_val.group(1) is not None:
                pygtftk.utils.VERBOSITY = int(verbosity_val.group(1))
            else:
                pygtftk.utils.VERBOSITY = 0

        match = False

        if isinstance(self.file_ext, str):
            extension_list = [self.file_ext]
        else:
            extension_list = list(self.file_ext)

        for this_ext in extension_list:
            if re.search(ext2regexp[this_ext], string):
                match = True
                break

        if not match:
            message('Not a valid filename extension: ' + string, type="WARNING")
            message('Extension expected: ' + ext2regexp[this_ext], type="ERROR")
            sys.exit()

        # ---------------------------------------------------------------
        # Check directory
        # ---------------------------------------------------------------

        outputdir = os.path.dirname(os.path.abspath(string))

        if not os.path.exists(outputdir):
            if 'w' in self._mode:
                message("Directory not found. Creating.", type="WARNING")
                os.makedirs(outputdir)

        # ---------------------------------------------------------------
        # Check format
        # ---------------------------------------------------------------

        # if bed3, bed4 or bed5, convert to bed6

        if self._mode == 'r':
            if self.file_ext == 'bed':

                message("Checking BED file format (" + string + ").",
                        type="INFO")

                try:
                    file_bo = BedTool(string)
                    nb_line = len(file_bo)
                except Exception:
                    msg = "Unable to load file: " + string + "."
                    message(msg, type="ERROR")
                    sys.exit()

                if nb_line == 0:
                    msg = "It seems that file " + string + " is empty."
                    message(msg, type="ERROR")
                    sys.exit()

                if file_bo.file_type != 'bed':
                    msg = "File {f} is not a valid bed file."
                    msg = msg.format(f=string)
                    message(msg, type="ERROR")
                    sys.exit()

                region_nb = 0
                field_count = file_bo.field_count()

                if field_count != 6:
                    message("Converting to bed6 format (" + string + ").", type="WARNING")
                    tmp_file = make_tmp_file(prefix="bed6_",
                                             suffix=".bed")
                    for record in file_bo:
                        region_nb += 1

                        if field_count < 4:
                            name = 'region_' + str(region_nb)
                        else:
                            name = record.name

                        fields = record.fields[0:3]
                        fields += [name, '0', '.']

                        tmp_file.write("\t".join(fields) + "\n")

                    close_properly(tmp_file)
                    string = tmp_file.name

        # we will work with string
        if 'w' in self._mode:
            self._mode = 'w'

        return super(FormattedFile, self).__call__(string)
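A quick sketch of the extension matching, using the same style of case-insensitive regexes (toy subset):

import re

bed_regexp = r'\.[Bb][Ee][Dd][3456]{0,1}$'
for fname in ["peaks.bed", "peaks.BED3", "peaks.txt"]:
    print(fname, bool(re.search(bed_regexp, fname)))  # True, True, False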
Example #9
def annotate(input_bed_fpath, output_fpath, work_dir, genome=None,
             reannotate=True, high_confidence=False, only_canonical=False,
             coding_only=False, short=False, extended=False, is_debug=False, **kwargs):

    debug('Getting features from storage')
    features_bed = ba.get_all_features(genome)
    if features_bed is None:
        critical('Genome ' + genome + ' is not supported. Supported: ' + ', '.join(ba.SUPPORTED_GENOMES))

    if genome:
        fai_fpath = reference_data.get_fai(genome)
        chr_order = reference_data.get_chrom_order(genome)
    else:
        fai_fpath = None
        chr_order = bed_chrom_order(input_bed_fpath)

    input_bed_fpath = sort_bed(input_bed_fpath, work_dir=work_dir, chr_order=chr_order, genome=genome)

    ori_bed = BedTool(input_bed_fpath)
    ori_col_num = ori_bed.field_count()
    reannotate = reannotate or ori_col_num == 3
    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'bedtools')))

    if high_confidence:
        features_bed = features_bed.filter(ba.high_confidence_filter)
    if only_canonical:
        features_bed = features_bed.filter(ba.get_only_canonical_filter(genome))
    if coding_only:
        features_bed = features_bed.filter(ba.protein_coding_filter)
    # unique_tx_by_gene = find_best_tx_by_gene(features_bed)

    info('Extracting features from Ensembl GTF')
    features_bed = features_bed.filter(lambda x:
                                       x[ba.BedCols.FEATURE] in ['exon', 'CDS', 'stop_codon', 'transcript'])
        # x[ebl.BedCols.ENSEMBL_ID] == unique_tx_by_gene[x[ebl.BedCols.GENE]])

    info('Overlapping regions with Ensembl data')
    if is_debug:
        ori_bed = ori_bed.saveas(join(work_dir, 'bed.bed'))
        debug(f'Saved regions to {ori_bed.fn}')
        features_bed = features_bed.saveas(join(work_dir, 'features.bed'))
        debug(f'Saved features to {features_bed.fn}')
    annotated = _annotate(ori_bed, features_bed, chr_order, fai_fpath, work_dir, ori_col_num,
                          high_confidence=False, reannotate=reannotate, is_debug=is_debug, **kwargs)

    full_header = [ba.BedCols.names[i] for i in ba.BedCols.cols]
    add_ori_extra_fields = ori_col_num > 3
    if not reannotate and ori_col_num == 4:
        add_ori_extra_fields = False  # no need to report the original gene field if we are not re-annotating

    info('Saving annotated regions...')
    total = 0
    with file_transaction(work_dir, output_fpath) as tx:
        with open(tx, 'w') as out:
            header = full_header[:6]
            if short:
                header = full_header[:4]
            if extended:
                header = full_header[:-1]
            if add_ori_extra_fields:
                header.append(full_header[-1])

            if extended:
                out.write('## ' + ba.BedCols.names[ba.BedCols.TX_OVERLAP_PERCENTAGE] +
                          ': part of region overlapping with transcripts\n')
                out.write('## ' + ba.BedCols.names[ba.BedCols.EXON_OVERLAPS_PERCENTAGE] +
                          ': part of region overlapping with exons\n')
                out.write('## ' + ba.BedCols.names[ba.BedCols.CDS_OVERLAPS_PERCENTAGE] +
                          ': part of region overlapping with protein coding regions\n')
                out.write('\t'.join(header) + '\n')
            for full_fields in annotated:
                fields = full_fields[:6]
                if short:
                    fields = full_fields[:4]
                if extended:
                    fields = full_fields[:-1]
                if add_ori_extra_fields:
                    fields.append(full_fields[-1])

                out.write('\t'.join(map(_format_field, fields)) + '\n')
                total += 1
    
    debug('Saved ' + str(total) + ' total annotated regions')
    return output_fpath
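_format_field is not shown; a plausible stand-in (an assumption): render None or empty values as '.' and join list-valued fields with commas:

def _format_field(value):
    if value is None or value == '':
        return '.'
    if isinstance(value, (list, tuple, set)):
        return ','.join(map(str, value))
    return str(value)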
Example #10
			for k2, i in d.items():
				l.append("%s:\t%s:\t%s" % (k1, k2, str(i).strip()))

		return "\n".join(l)




chimeras = BedTool(args.path)
if len(chimeras) == 0:
	sys.stderr.write("input file '%s' is empty\n" % args.path)
	sys.exit()

exons = BedTool(args.exons)
offset = chimeras.field_count()
chimeras_vs_exons = chimeras.intersect(exons, s=args.stranded, wao=True)


first = chimeras_vs_exons[0]

curname, cur_pair_number = first.name.split("|")
intersections = defaultdict(dict)
intervals = OrderedDict()
intervals[int(cur_pair_number)] = first
intersections[first[offset+3]][int(cur_pair_number)] = list2interval(first[offset:])

for i in chimeras_vs_exons[1:]:
	name, pair_number = i.name.split("|")
	if name == curname:
		if pair_number == cur_pair_number:
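The snippet above is truncated in the source, and list2interval is not shown; a plausible stand-in (an assumption) that turns the appended exon fields of a -wao intersection into a pybedtools Interval:

from pybedtools import Interval

def list2interval(fields):
    # fields: [chrom, start, end, name, score, strand, ...] from the b side.
    return Interval(fields[0], int(fields[1]), int(fields[2]),
                    name=fields[3], score=fields[4], strand=fields[5])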
Example #11
    def _make_target_bed(self, bed_fpath, work_dir, output_dir, is_debug,
                         padding=None, fai_fpath=None, genome=None, reannotate=False):
        clean_target_bed_fpath = intermediate_fname(work_dir, bed_fpath, 'clean')
        if not can_reuse(clean_target_bed_fpath, bed_fpath):
            debug()
            debug('Cleaning target BED file...')
            bed = BedTool(bed_fpath)
            if bed.field_count() > 4:
                bed = bed.cut(range(4))
            bed = bed\
                .filter(lambda x: x.chrom and not any(x.chrom.startswith(e) for e in ['#', ' ', 'track', 'browser']))\
                .remove_invalid()
            with file_transaction(work_dir, clean_target_bed_fpath) as tx:
                bed.saveas(tx)
            debug('Saved to ' + clean_target_bed_fpath)
            verify_file(clean_target_bed_fpath, is_critical=True)

        sort_target_bed_fpath = intermediate_fname(work_dir, clean_target_bed_fpath, 'sorted')
        if not can_reuse(sort_target_bed_fpath, clean_target_bed_fpath):
            debug()
            debug('Sorting target BED file...')
            sort_target_bed_fpath = sort_bed(clean_target_bed_fpath, output_bed_fpath=sort_target_bed_fpath, fai_fpath=fai_fpath)
            debug('Saved to ' + sort_target_bed_fpath)
            verify_file(sort_target_bed_fpath, is_critical=True)

        if genome in ebl.SUPPORTED_GENOMES:
            ann_target_bed_fpath = intermediate_fname(work_dir, sort_target_bed_fpath, 'ann_plus_features')
            if not can_reuse(ann_target_bed_fpath, sort_target_bed_fpath):
                debug()
                if BedTool(sort_target_bed_fpath).field_count() == 3 or reannotate:
                    debug('Annotating target BED file and collecting overlapping genome features')
                    overlap_with_features(sort_target_bed_fpath, ann_target_bed_fpath, work_dir=work_dir,
                         genome=genome, extended=True, reannotate=reannotate, only_canonical=True)
                else:
                    debug('Overlapping with genomic features:')
                    overlap_with_features(sort_target_bed_fpath, ann_target_bed_fpath, work_dir=work_dir,
                         genome=genome, extended=True, only_canonical=True)
                debug('Saved to ' + ann_target_bed_fpath)
                verify_file(ann_target_bed_fpath, is_critical=True)
        else:
            ann_target_bed_fpath = sort_target_bed_fpath

        final_clean_target_bed_fpath = intermediate_fname(work_dir, ann_target_bed_fpath, 'clean')
        if not can_reuse(final_clean_target_bed_fpath, ann_target_bed_fpath):
            bed = BedTool(ann_target_bed_fpath).remove_invalid()
            with file_transaction(work_dir, final_clean_target_bed_fpath) as tx:
                bed.saveas(tx)
            verify_file(final_clean_target_bed_fpath, is_critical=True)

        self.bed_fpath = final_clean_target_bed_fpath
        self.bed = BedTool(self.bed_fpath)
        
        self.capture_bed_fpath = add_suffix(join(output_dir, basename(bed_fpath)), 'clean_sorted_ann')
        if not can_reuse(self.capture_bed_fpath, self.bed_fpath):
            with file_transaction(work_dir, self.capture_bed_fpath) as tx:
                self.get_capture_bed().saveas(tx)

        gene_key_set, gene_key_list = get_genes_from_bed(bed_fpath)
        self.gene_keys_set = gene_key_set
        self.gene_keys_list = gene_key_list
        self.regions_num = self.get_capture_bed().count()

        self._make_qualimap_bed(work_dir)
        if padding:
            self._make_padded_bed(work_dir, fai_fpath, padding)
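intermediate_fname and can_reuse are project helpers, not shown; plausible stand-ins (assumptions): derive a suffixed path inside work_dir, and reuse a file only if it exists and is at least as new as its source:

import os

def intermediate_fname(work_dir, fpath, suffix):
    base, ext = os.path.splitext(os.path.basename(fpath))
    return os.path.join(work_dir, base + '.' + suffix + ext)

def can_reuse(fpath, source_fpath):
    return (os.path.exists(fpath) and os.path.exists(source_fpath)
            and os.path.getmtime(fpath) >= os.path.getmtime(source_fpath))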
Example #12
def bw_coverage(inputfile=None,
                out_file=None,
                bw_list=None,
                pseudo_count=1,
                score=None,
                bin_nb=1,
                n_highest=None,
                nb_proc=1,
                verbose=True):
    """
    Compute transcript coverage with one or several bigWig.
    -------------------------------------------------------
    Uses bx-python as interface to kent utilities.
    """

    # Check that the score expression is well formed

    if not re.search(r"^[b\d\/\*\+\-\(\)\.]+$", score):
        sys.stderr.write("Score should contain only the following characters: "
                         "b0, b1 (...) and operators +, ., -, *, /, **, (, ).")
        sys.exit(1)

    # Check if the score to compute fits with
    # The number of input bigWigs
    bw_list = bw_list.split(",")
    bwig_in_score = re.finditer(r"b\d+", score)
    bwig_expected_in_score = ["b" + str(x) for x in range(len(bw_list))]

    for i in bwig_in_score:

        if i.group(0) not in bwig_expected_in_score:
            sys.stderr.write("The indicated column (" + i.group(0) +
                             ") was not found.")
            sys.exit(1)

    # Check the number of windows
    if n_highest is None:
        n_highest = bin_nb

    if verbose:
        sys.stderr.write("Number of bins: " + str(bin_nb) + "\n")
        sys.stderr.write("N highest values: " + str(n_highest) + "\n")

    if n_highest > bin_nb:
        sys.stderr.write("The number of window used for computing the score"
                         " (-n) can not be greater than the number of"
                         " windows (-w)")
        sys.exit()

    # Check input file is in bed6 format

    region_bo = BedTool(inputfile.name)

    if region_bo.field_count() != 6:
        sys.stderr.write(
            "Bed file should be in BED6 format. Use '.' if strand is undefined.\n"
        )
        sys.exit()

    tokens = intervals(range(len(BedTool(inputfile.name))), nb_proc)

    pool = multiprocessing.Pool(nb_proc)
    coverage_list = pool.map_async(
        big_wig_summary_worker,
        zip(tokens, repeat(bw_list), repeat(inputfile.name), repeat(bin_nb),
            repeat(pseudo_count), repeat(n_highest),
            repeat(verbose))).get(9999999)

    if False in coverage_list:
        sys.stderr.write("Aborting...")
        sys.exit()

    # Unlist the list of list
    coverage_list = [item for sublist in coverage_list for item in sublist]

    # Prepare a DataFrame to collect the results
    dataframe = pd.DataFrame(columns=None)

    if verbose:
        sys.stderr.write("Retrieving results.\n")

    for i in coverage_list:
        dataframe.loc[i[0], i[1] + str(i[2])] = float(i[3])

    if verbose:
        sys.stderr.write("Computing score.\n")

    dataframe = dataframe.eval(score)

    dataframe.to_csv(out_file, sep="\t", header=False)

    close_properly(inputfile, out_file)
Example #13
        i2 = Interval(interval.chrom, start, stop, interval.name,
                      interval.score, interval.strand)

    if interval.strand == '-':
        return i2, i1
    else:
        return i1, i2


coverage = coverage2dict(args.coverage)
genome = SeqIO.to_dict(SeqIO.parse(args.genome, "fasta"))
transcripts = BedTool(args.transcripts)
phages = BedTool(args.phages)
regions = BedTool(args.path)
regions = BedTool([x for x in regions if float(x.score) > args.zscore])
OFFSET = regions.field_count()
up_downs = [
    get_upstream_downstream(x, genome, args.length) for x in transcripts
]
upstreams = BedTool([x[0] for x in up_downs if x[0]])
downstreams = BedTool([x[1] for x in up_downs if x[1]])
phaged_regions = [
    regions.intersect(b=phages, u=True, f=0.5),
    regions.intersect(b=phages, v=True, f=0.5)
]

print([len(x) for x in phaged_regions])

already_discovered = set()
transcriptome_region_dict = defaultdict(list)
ph_names = ['phage', 'non-phage']
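coverage2dict is not shown; a plausible stand-in (an assumption about both its signature and file format): read per-position coverage into one numpy array per chromosome:

import numpy
from collections import defaultdict

def coverage2dict(path):
    cov = defaultdict(list)
    with open(path) as fh:  # assumed format: chrom<TAB>position<TAB>coverage
        for line in fh:
            chrom, _pos, value = line.strip().split("\t")[:3]
            cov[chrom].append(float(value))
    return {chrom: numpy.array(vals) for chrom, vals in cov.items()}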
Example #14
exons = []
for interval in BedTool(args.gff3):
	if 'ID' in interval.attrs and interval.attrs['ID'].split(':')[0] == 'gene':
		curname = interval.attrs['gene_id']
		enames = set()
	if interval[2] == 'exon':
		if interval.name not in enames:
			enames.add(interval.name)
			interval.name = curname
			exons.append(gff2bed(interval))


# Get the intersection between circles and exons
bed = BedTool(args.path)
offset = bed.field_count()
intersection = bed.intersect(b=BedTool(exons), s=True, wao=True)

curname = ''
cexons = []
for interval in intersection:
	if curname == interval.name:
		cexons.append(tuple(interval[offset:offset+6]))
	else:
		if curname:
			get_exons(cinterval, cexons)
		cinterval = interval
		cexons = [tuple(interval[offset:offset+6])]
		curname = interval.name
else:
	# for-else: flush the last group once the loop completes
	get_exons(cinterval, cexons)
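gff2bed is not shown; a plausible stand-in (an assumption): rebuild the GFF feature as a BED6-style Interval (pybedtools already exposes GFF coordinates zero-based via .start):

from pybedtools import Interval

def gff2bed(gff_interval):
    return Interval(gff_interval.chrom, gff_interval.start, gff_interval.end,
                    name=gff_interval.name, score='.', strand=gff_interval.strand)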