Example #1
File: methods.py Project: xie186/pyranges
def get_ftp_file(ftp, binary_loc, path=None, test=False):

    url = "http://" + ftp + "/" + binary_loc
    if test:
        # if test only download small part
        from io import BytesIO
        import gzip
        headers = {"Range": "bytes=0-1000000"}
        c = requests.get(url, headers=headers).content
        s = gzip.GzipFile(fileobj=BytesIO(c)).read(1000000)
        f = s.decode().rsplit("\n", 1)[0]
        with tempfile.NamedTemporaryFile(suffix=".gtf", mode="w+") as t:
            t.write(f)
            gr = pr.read_gtf(t.name)
    else:
        if path and os.path.dirname(path):
            os.makedirs(os.path.dirname(path), exist_ok=True)

        c = requests.get(url).content

        if not path:
            with tempfile.NamedTemporaryFile(suffix=".gtf.gz",
                                             mode="wb+") as t:
                t.write(c)
                gr = pr.read_gtf(t.name)
        else:
            if os.path.dirname(path):
                os.makedirs(os.path.dirname(path), exist_ok=True)
            fh = open(path, "wb+")
            fh.write(c)
            fh.close()

            gr = pr.read_gtf(path)

    return gr
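A minimal usage sketch with hypothetical host and file values; the function assumes requests, tempfile, os and pyranges as pr are imported at module level in the source project:

# Hypothetical Ensembl mirror and GTF location; substitute real values.
host = "ftp.ensembl.org"
gtf_location = "pub/release-110/gtf/homo_sapiens/Homo_sapiens.GRCh38.110.gtf.gz"

# test=True downloads only the first ~1 MB and parses the truncated GTF.
gr = get_ftp_file(host, gtf_location, test=True)
print(gr)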
Example #2
def process_polya_bed(polyA_bed_path=None, atlas_version=2, outfile=None):
    '''
    Annotate a poly(A)-site BED file with last-exon information from the
    transcript GTF (module-level tr_gtf_path) and write a PAQR-formatted BED
    to outfile. atlas_version (1 or 2) selects the input BED format.
    '''
    if atlas_version == 1:
        # No need to add 'chr' prefix
        polya_bed = pyr.readers.read_bed(f=polya_bed_path)

        # read in GTF with multi-overlap transcripts
        tr_gtf = pyr.read_gtf(f=tr_gtf_path)
        tr_gtf = get_last_exons(ranges_obj=tr_gtf)

        polya_bed = join_by_intersect(pyranges1=polya_bed, pyranges2=tr_gtf)

        # short name column already in correct format - skip to adding long name
        polya_bed = add_paqr_long_name(pyranges=polya_bed)

        # add columns with order along exon & number of exons on transcript
        polya_bed = add_n_along_exon(pyranges=polya_bed)
        polya_bed = get_total_n_on_exon(pyranges=polya_bed)

        # first 6 already provided in format of version 1 BED file
        column_order = [
            'Chromosome', 'Start', 'End', 'Name', 'Score', 'Strand',
            'n_along_exon', 'total_n_on_exon', 'paqr_long_name', 'gene_id'
        ]

        write_to_paqr_bed(pyranges=polya_bed,
                          outfile=outfile,
                          col_order=column_order)

    elif atlas_version == 2:
        # need to add chr prefix before joining
        polya_bed = tidy_chromosome_column(polya_bed_path=polya_bed_path)

        # read in GTF with multi-overlap transcripts
        tr_gtf = pyr.read_gtf(f=tr_gtf_path)
        tr_gtf = get_last_exons(ranges_obj=tr_gtf)

        polya_bed = join_by_intersect(pyranges1=polya_bed, pyranges2=tr_gtf)

        # version 2 doesn't have name column in formatting required for PAQR - need to add both short & long name
        polya_bed = add_paqr_name(pyranges=polya_bed)
        polya_bed = add_paqr_long_name(pyranges=polya_bed)

        # add columns with order along exon & number of exons on transcript
        polya_bed = add_n_along_exon(pyranges=polya_bed)
        polya_bed = get_total_n_on_exon(pyranges=polya_bed)

        # v2.0 has custom format
        column_order = [
            'Chromosome', 'Start', 'End', 'paqr_name', 'ThickEnd', 'Strand',
            'n_along_exon', 'total_n_on_exon', 'paqr_long_name', 'gene_id'
        ]

        write_to_paqr_bed(pyranges=polya_bed,
                          outfile=outfile,
                          col_order=column_order)
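The helpers above (tidy_chromosome_column, add_paqr_name, join_by_intersect and friends) are project-specific. Purely as an illustration of the core overlap-and-join step, a minimal pyranges sketch with hypothetical input files:

import pyranges as pr

polya = pr.read_bed("polya_sites.bed")   # hypothetical poly(A)-site BED
gtf = pr.read_gtf("annotation.gtf")      # hypothetical annotation GTF
exons = gtf[gtf.Feature == "exon"]

# join() keeps poly(A) sites that overlap an exon and appends the exon columns
joined = polya.join(exons)
print(joined)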
Example #3
File: GtfToBed.py Project: baigal628/CHIPS
def gtfToBed(gtf, output):
    gr = pr.read_gtf(gtf)
    df = gr.df
    geneAndTrans = df[(df["Feature"] == "gene") |
                      (df["Feature"] == "transcript")]
    AnnoBed = geneAndTrans.loc[:, [
        "Chromosome", "Start", "End", "gene_name", "Strand", "gene_id"
    ]]
    AnnoBed = AnnoBed.drop_duplicates()
    AnnoBed = AnnoBed.rename(
        columns={
            'Chromosome': 'chromosome',
            'Start': 'start',
            'End': 'end',
            'gene_name': 'symbol',
            'Strand': 'strand',
            'gene_id': 'product_accession'
        })
    AnnoBed.loc[:, "start"] = AnnoBed.loc[:, "start"] - 1
    AnnoBed.loc[:, "end"] = AnnoBed.loc[:, "end"] - 1
    AnnoBed.loc[AnnoBed.strand == '+', 'TSS'] = AnnoBed.start
    AnnoBed.loc[AnnoBed.strand == '-', 'TSS'] = AnnoBed.end
    AnnoBed.TSS = AnnoBed.TSS.astype(int)
    AnnoBed["coordinate"] = [
        x[0] + ':' + str(x[1]) + '-' + str(x[2])
        for x in AnnoBed.values.tolist()
    ]
    AnnoBed = AnnoBed[[
        'chromosome', 'start', 'end', 'coordinate', 'product_accession',
        'strand', 'symbol', 'TSS'
    ]]
    AnnoBed.to_csv(output, index=None, sep='\t')
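A hedged usage sketch with hypothetical paths; the function writes a tab-separated table that can be read back with pandas:

import pandas as pd

gtfToBed("annotation.gtf", "genes_and_transcripts.tsv")   # hypothetical paths
anno = pd.read_csv("genes_and_transcripts.tsv", sep="\t")
print(anno.head())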
Example #4
    def _read_utr(
        gtf_file,
        feature_type="5UTR",
        infer_from_cds=False,
        on_error_warn=True,
    ) -> pd.DataFrame:
        """
        Read, extract and filter valid UTRs from the given gtf_file
        :param gtf_file: path to the GTF file
        :param feature_type: type of the feature that will be filtered for. In general '5UTR' or '3UTR'.
        :param infer_from_cds: Subtract the CDS from the exon regions to infer the UTR regions.
            Will use 'feature_type' to decide whether '5UTR' or '3UTR' should be returned.
        :param on_error_warn: Do not break on error; instead throw warning.
        """
        import pyranges

        df = pyranges.read_gtf(gtf_file, as_df=True)

        utr_df = UTRFetcher.get_utr_from_gtf(df,
                                             feature_type=feature_type,
                                             infer_from_cds=infer_from_cds,
                                             on_error_warn=on_error_warn)

        utr_df = utr_df.set_index("transcript_id")
        return utr_df
Example #5
    def _read_cds(
        gtf_file,
        filter_valid_transcripts=False,
        filter_biotype=False,
        filter_tag=False,
        duplicate_attr=None,
        on_error_warn=True,
    ):
        """
        Read, extract and filter valid cds from the given gtf_file
        :param gtf_file: path to the GTF file
        """
        import pyranges

        if duplicate_attr is None:
            # One row in the GTF file can have multiple tags;
            # therefore, to filter them we have to allow duplicate attrs.
            duplicate_attr = filter_tag

        df = pyranges.read_gtf(gtf_file,
                               as_df=True,
                               duplicate_attr=duplicate_attr)

        cds = CDSFetcher.get_cds_from_gtf(
            df,
            filter_valid_transcripts=filter_valid_transcripts,
            filter_biotype=filter_biotype,
            filter_tag=filter_tag,
            on_error_warn=on_error_warn)

        cds = cds.set_index("transcript_id")
        return cds
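CDSFetcher belongs to the surrounding project. Purely as an illustration (not the project's own logic), CDS rows can be pulled straight out of a GTF with pyranges, assuming a hypothetical annotation.gtf:

import pyranges

df = pyranges.read_gtf("annotation.gtf", as_df=True)   # hypothetical file
cds = df[df["Feature"] == "CDS"].set_index("transcript_id")
print(cds[["Chromosome", "Start", "End", "Strand"]].head())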
Example #6
    def _read_intervals(gtf_path=None,
                        bed_path=None,
                        pranges=None,
                        intervals=None,
                        interval_attrs=None,
                        duplicate_attr=False):
        alternatives = [bed_path, pranges, intervals, gtf_path]
        if sum(i is not None for i in alternatives) != 1:
            raise ValueError('only one of `gtf_path`, `bed_path`, `pranges`,'
                             ' or `intervals` should be given as input.')
        if gtf_path:
            import pyranges
            pranges = pyranges.read_gtf(gtf_path,
                                        duplicate_attr=duplicate_attr)

        elif bed_path:
            import pyranges
            pranges = pyranges.read_bed(bed_path)

        elif intervals:
            if interval_attrs is not None:
                raise ValueError(
                    '`interval_attrs` is not valid with `intervals`')

            pranges = intervals_to_pyranges(intervals)

        return pranges
Example #7
def ensembl_gtf():

    """
    >>> # +--------------+------------+--------------+-----------+-----------+------------+--------------+------------+------------------------------------+-------+
    >>> # | Chromosome   | Source     | Feature      | Start     | End       | Score      | Strand       | Frame      | gene_biotype                       | +19   |
    >>> # | (category)   | (object)   | (category)   | (int32)   | (int32)   | (object)   | (category)   | (object)   | (object)                           | ...   |
    >>> # |--------------+------------+--------------+-----------+-----------+------------+--------------+------------+------------------------------------+-------|
    >>> # | 1            | havana     | gene         | 11868     | 14409     | .          | +            | .          | transcribed_unprocessed_pseudogene | ...   |
    >>> # | 1            | havana     | transcript   | 11868     | 14409     | .          | +            | .          | transcribed_unprocessed_pseudogene | ...   |
    >>> # | 1            | havana     | exon         | 11868     | 12227     | .          | +            | .          | transcribed_unprocessed_pseudogene | ...   |
    >>> # | 1            | havana     | exon         | 12612     | 12721     | .          | +            | .          | transcribed_unprocessed_pseudogene | ...   |
    >>> # | ...          | ...        | ...          | ...       | ...       | ...        | ...          | ...        | ...                                | ...   |
    >>> # | 1            | havana     | gene         | 1173055   | 1179555   | .          | -            | .          | lncRNA                             | ...   |
    >>> # | 1            | havana     | transcript   | 1173055   | 1179555   | .          | -            | .          | lncRNA                             | ...   |
    >>> # | 1            | havana     | exon         | 1179364   | 1179555   | .          | -            | .          | lncRNA                             | ...   |
    >>> # | 1            | havana     | exon         | 1173055   | 1176396   | .          | -            | .          | lncRNA                             | ...   |
    >>> # +--------------+------------+--------------+-----------+-----------+------------+--------------+------------+------------------------------------+-------+
    >>> # Stranded PyRanges object has 2,446 rows and 28 columns from 1 chromosomes.
    >>> # For printing, the PyRanges was sorted on Chromosome and Strand.
    >>> # 19 hidden columns: gene_id, gene_name, gene_source, gene_version, tag, transcript_biotype, transcript_id, transcript_name, transcript_source, transcript_support_level, ... (+ 9 more.)
    """

    full_path = get_example_path("ensembl_human.gtf.gz")

    return pr.read_gtf(full_path)
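This appears to be pyranges' bundled example-data helper; assuming the pr.data accessor, a usage sketch:

import pyranges as pr

gr = pr.data.ensembl_gtf()            # small bundled Ensembl annotation
genes = gr[gr.Feature == "gene"]
print(genes)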
Example #8
def read_exon_pyranges(gtf_file, overhang=(100, 100), first_last=True):
    '''
    Read exon as pyranges from gtf_file

    Args:
      gtf_file: gtf file from ensembl/gencode.
      overhang: padding of exon to match variants.
      first_last: set the overhang of the first and last exon of the gene to zero
        so that intergenic sequence is not included.
    '''
    df_gtf = pyranges.read_gtf(gtf_file).df
    df_exons = df_gtf[df_gtf['Feature'] == 'exon']
    df_exons = df_exons[[
        'Chromosome', 'Start', 'End', 'Strand', 'exon_id', 'gene_id',
        'gene_name', 'transcript_id'
    ]]

    if first_last:
        df_genes = df_gtf[df_gtf['Feature'] == 'transcript']
        df_genes.set_index('transcript_id', inplace=True)
        df_genes = df_genes.loc[df_exons['transcript_id']]
        df_genes.set_index(df_exons.index, inplace=True)

        starting = df_exons['Start'] == df_genes['Start']
        ending = df_exons['End'] == df_genes['End']

        df_exons.loc[:, 'left_overhang'] = ~starting * overhang[0]
        df_exons.loc[:, 'right_overhang'] = ~ending * overhang[1]

        df_exons.loc[:, 'Start'] -= df_exons['left_overhang']
        df_exons.loc[:, 'End'] += df_exons['right_overhang']

    return pyranges.PyRanges(df_exons)
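A hedged usage sketch with a hypothetical GENCODE GTF: internal exon boundaries get 100 bp of padding on each side, while boundaries shared with the transcript ends are left untouched:

exons = read_exon_pyranges("gencode.v38.annotation.gtf.gz", overhang=(100, 100))
print(exons)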
Example #9
def test_BaseVariantMatcher__read_intervals():
    pranges = pyranges.read_gtf(gtf_file)

    with pytest.raises(ValueError):
        pr = BaseVariantMatcher._read_intervals(pranges=pranges,
                                                gtf_path=gtf_file)

    with pytest.raises(ValueError):
        pr = BaseVariantMatcher._read_intervals(intervals=intervals,
                                                interval_attrs=['gene_id'])

    pr = BaseVariantMatcher._read_intervals(gtf_path=gtf_file)
    assert pr.Chromosome.tolist() == ['chr1'] * 5
    assert pr.Start.tolist() == [200, 200, 200, 1049, 3029]
    assert pr.End.tolist() == [4230, 4230, 402, 1340, 4230]
    # assert len(pr.intervals.tolist()) == 5

    pr = BaseVariantMatcher._read_intervals(bed_path=example_intervals_bed)
    assert pr.Chromosome.tolist() == ['chr1'] * 4
    assert pr.Start.tolist() == [2, 2, 2, 602]
    assert pr.End.tolist() == [1000, 5000, 1002, 604]
    # assert len(pr.intervals.tolist()) == 4

    pr = BaseVariantMatcher._read_intervals(pranges=pranges)
    assert pr.Chromosome.tolist() == ['chr1'] * 5
    assert pr.Start.tolist() == [200, 200, 200, 1049, 3029]
    assert pr.End.tolist() == [4230, 4230, 402, 1340, 4230]
    # assert len(pr.intervals.tolist()) == 5

    pr = BaseVariantMatcher._read_intervals(intervals=intervals)
    assert pr.df.Chromosome.tolist() == ['chr1', 'chr1']
    assert pr.df.Start.tolist() == [1, 23]
    assert pr.df.End.tolist() == [10, 30]
    assert pr.df.Strand.tolist() == ['+', '-']
    assert len(pr.intervals.tolist()) == 2
Example #10
def gencode_gtf():

    """
    >>> # +--------------+------------+--------------+-----------+-----------+------------+--------------+------------+-------------------+-------+
    >>> # | Chromosome   | Source     | Feature      | Start     | End       | Score      | Strand       | Frame      | gene_id           | +15   |
    >>> # | (category)   | (object)   | (category)   | (int32)   | (int32)   | (object)   | (category)   | (object)   | (object)          | ...   |
    >>> # |--------------+------------+--------------+-----------+-----------+------------+--------------+------------+-------------------+-------|
    >>> # | chr1         | HAVANA     | gene         | 11868     | 14409     | .          | +            | .          | ENSG00000223972.5 | ...   |
    >>> # | chr1         | HAVANA     | transcript   | 11868     | 14409     | .          | +            | .          | ENSG00000223972.5 | ...   |
    >>> # | chr1         | HAVANA     | exon         | 11868     | 12227     | .          | +            | .          | ENSG00000223972.5 | ...   |
    >>> # | chr1         | HAVANA     | exon         | 12612     | 12721     | .          | +            | .          | ENSG00000223972.5 | ...   |
    >>> # | ...          | ...        | ...          | ...       | ...       | ...        | ...          | ...        | ...               | ...   |
    >>> # | chr1         | HAVANA     | exon         | 1430549   | 1430662   | .          | -            | .          | ENSG00000225285.1 | ...   |
    >>> # | chr1         | HAVANA     | transcript   | 1430663   | 1434520   | .          | -            | .          | ENSG00000225285.1 | ...   |
    >>> # | chr1         | HAVANA     | exon         | 1434177   | 1434520   | .          | -            | .          | ENSG00000225285.1 | ...   |
    >>> # | chr1         | HAVANA     | exon         | 1430663   | 1430954   | .          | -            | .          | ENSG00000225285.1 | ...   |
    >>> # +--------------+------------+--------------+-----------+-----------+------------+--------------+------------+-------------------+-------+
    >>> # Stranded PyRanges object has 4,995 rows and 24 columns from 1 chromosomes.
    >>> # For printing, the PyRanges was sorted on Chromosome and Strand.
    >>> # 15 hidden columns: gene_type, gene_name, level, havana_gene, transcript_id, transcript_type, transcript_name, transcript_support_level, tag, ... (+ 6 more.)
    """

    full_path = get_example_path("gencode_human.gtf.gz")

    return pr.read_gtf(full_path)
Example #11
def check(path_gtf):

    gtf = pr.read_gtf(path_gtf)

    biotypes = get_biotypes(gtf)

    print(biotypes.value_counts().to_markdown(tablefmt="psql",
                                              floatfmt=",.0f"))

    del gtf
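get_biotypes is defined elsewhere in the project. As an illustration only, the biotype counts can be tallied directly from the gene rows, assuming an Ensembl-style GTF (hypothetical path) whose attribute is named gene_biotype:

import pyranges as pr

gtf = pr.read_gtf("annotation.gtf")        # hypothetical file
genes = gtf[gtf.Feature == "gene"]
print(genes.df["gene_biotype"].value_counts())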
Example #12
    def __init__(
        self,
        fasta_file,
        gtf_file,
    ):
        genome_annotation = pr.read_gtf(gtf_file, as_df=True)
        roi = get_roi_from_genome_annotation(genome_annotation)
        roi = pr.PyRanges(roi)

        super().__init__(
            regions_of_interest=roi,
            reference_sequence=FastaStringExtractor(fasta_file),
        )
Example #13
def test_read_gtf():

    gr = pr.read_gtf("tests/test_data/ensembl.gtf", full=True)
    assert len(gr.columns) == 28

    df = gr.df
    transcript = df.iloc[1]
    assert transcript['tag'] == 'basic'

    exon = df[df['exon_id'] == 'ENSE00003812156'].iloc[0]
    assert exon['tag'] == 'basic'

    gr = pr.read_gtf("tests/test_data/ensembl.gtf",
                     full=True,
                     duplicate_attr=True)
    assert len(gr.columns) == 28

    df = gr.df
    transcript = df.iloc[1]
    assert transcript['tag'] == 'basic'

    exon = df[df['exon_id'] == 'ENSE00003812156'].iloc[0]
    assert exon['tag'] == 'CCDS,basic'
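A hedged sketch of the duplicate_attr behaviour exercised above, with a hypothetical file: by default a repeated attribute such as tag keeps a single value, while duplicate_attr=True comma-joins all values:

import pyranges as pr

single = pr.read_gtf("annotation.gtf", full=True)                        # one value kept per repeated attribute
joined = pr.read_gtf("annotation.gtf", full=True, duplicate_attr=True)   # repeated attributes comma-joined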
Example #14
def edit_gtf(input_gtf, chr_no):
    df = pr.read_gtf(input_gtf).df
    df = df[['Chromosome', 'Source', 'Feature', 'Start', 'End', 'Score', 'Strand', 'gene_id', 'gene_biotype']]
    df = df[df['Feature'] == 'gene']
    df = df[df['gene_biotype'] == 'protein_coding']
    df.drop(df[df['Chromosome'] == 'Mt'].index, inplace=True)
    df.drop(df[df['Chromosome'] == 'Pt'].index, inplace=True)
    df = df.astype({'Chromosome': 'int32'})
    dfs = [df[df['Chromosome'] == x] for x in range(1, chr_no + 1)]
    gtf = []
    for df in dfs:
        df.reset_index(drop=True, inplace=True)
        gtf.append(df)

    return gtf
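A hedged usage sketch, assuming a hypothetical plant-style GTF with numeric chromosomes 1..chr_no plus the Mt/Pt organelles that the function drops:

per_chromosome = edit_gtf("annotation.gtf", chr_no=5)   # hypothetical file
print(len(per_chromosome))          # one DataFrame per chromosome
print(per_chromosome[0].head())     # protein-coding genes on chromosome 1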
Example #15
def file_to_grange(f, dtype=np.int32, filetype="reads"):

    from pyranges import PyRanges, read_gtf

    if dtype == np.int64:
        extended = True
    else:
        extended = False

    if filetype == "reads":
        df = read_file(f, dtype)
        gr = PyRanges(df, extended=extended)
    elif filetype == "annotation":
        gr = read_gtf(f, annotation="ensembl")

    return gr
Example #16
    def __init__(
        self,
        fasta_file,
        gtf_file,
        vcf_file,
        vcf_file_tbi=None,
        vcf_lazy=True,
    ):
        genome_annotation = pr.read_gtf(gtf_file, as_df=True)
        roi = get_roi_from_genome_annotation(genome_annotation)
        roi = pr.PyRanges(roi)

        from kipoiseq.extractors import MultiSampleVCF
        super().__init__(regions_of_interest=roi,
                         reference_sequence=FastaStringExtractor(fasta_file),
                         variants=MultiSampleVCF(vcf_file, lazy=vcf_lazy))
Example #17
def main(fastafile: str = typer.Option(..., help="fasta file"),
         gtffile: str = typer.Option(..., help="gtf file"),
         outfile: str = typer.Option(..., help="output mRNA fasta file")):
    """根据gtf提取基因的mRNA序列,同一基因的不同转录本会merge起来,每个基因只输出一个合并后的mRNA序列"""
    gr = pr.read_gtf(gtffile)
    df = gr.merge(by=["Feature", "gene_id"], strand=False).as_df()
    seq = Fasta(fastafile)
    with open(outfile, 'w') as f:
        for gene, gdf in df.loc[df['Feature'] == 'exon', :].groupby('gene_id'):
            f.write(f'>{gene}\n')
            content = []
            for chrom, start, end in gdf.sort_values('Start')[[
                    'Chromosome', 'Start', 'End'
            ]].values:
                content.append(seq[chrom][start:end].seq)
            f.write(''.join(content) + '\n')
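A hedged usage sketch; since the typer options are ordinary keyword arguments, the function can also be called directly from Python with hypothetical paths:

main(fastafile="genome.fa", gtffile="annotation.gtf", outfile="merged_mrna.fa")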
Example #18
def read_ensemble_genes_gtf(gtf_filename) -> PyRanges:
    """ Read an ensembl gtf and extract gene start end

    Parameters
    ----------
    gtf_filename : str
        GTF filename

    Returns
    -------
    PyRanges
        Genes bounds
    """
    genes = pr.read_gtf(gtf_filename, as_df=True)
    genes = genes.groupby(['Chromosome', 'gene_id', 'gene_name'],
                          observed=True).agg({
                              'Start': min,
                              'End': max
                          }).reset_index()
    genes = pr.PyRanges(genes)
    return genes
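A hedged usage sketch with a hypothetical Ensembl GTF; the result holds one interval per (Chromosome, gene_id, gene_name) group, spanning the outermost Start and End:

genes = read_ensemble_genes_gtf("Homo_sapiens.GRCh38.110.gtf.gz")   # hypothetical file
print(genes)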
Example #19
    def __init__(self,
                 gtf_file,
                 fasta_file,
                 num_upstream,
                 num_downstream,
                 gtf_filter='gene_type == "protein_coding"',
                 anchor='tss',
                 transform=one_hot_dna,
                 interval_attrs=["gene_id", "Strand"],
                 use_strand=True):

        # Read and filter gtf
        gtf = pr.read_gtf(gtf_file).df
        if gtf_filter:
            if isinstance(gtf_filter, str):
                gtf = gtf.query(gtf_filter)
            else:
                gtf = gtf_filter(gtf)
        # Extract anchor
        if isinstance(anchor, str):
            anchor = anchor.lower()
            if anchor in self._function_mapping:
                anchor = self._function_mapping[anchor]
            else:
                raise Exception("No valid anchorpoint was chosen")
        self._gtf_anchor = anchor(gtf)

        # Other parameters
        self._use_strand = use_strand
        self._fa = FastaStringExtractor(fasta_file,
                                        use_strand=self._use_strand)
        self._transform = transform
        if self._transform is None:
            self._transform = lambda x: x
        self._num_upstream = num_upstream
        self._num_downstream = num_downstream
        self._interval_attrs = interval_attrs
Example #20
def test_pyranges_to_intervals():
    pranges = pyranges.read_gtf(gtf_file)
    intervals = list(
        pyranges_to_intervals(pranges,
                              interval_attrs=[
                                  'gene_id', 'gene_name', 'transcript_id',
                                  'exon_id'
                              ]))

    assert len(intervals) == 5
    assert intervals[4].attrs['gene_id'] == 'ENSG00000012048'
    assert intervals[4].attrs['gene_name'] == 'BRCA1'
    assert intervals[4].attrs['transcript_id'] == 'ENST00000357654'
    assert intervals[4].attrs['exon_id'] == 'ENSE00003510592'

    pranges = pyranges.read_bed(example_intervals_bed)
    intervals = list(pyranges_to_intervals(pranges))

    assert len(intervals) == 4
    assert intervals[0].start == 2

    assert pranges.Chromosome.tolist() == ['chr1'] * 4
    assert pranges.Start.tolist() == [2, 2, 2, 602]
    assert pranges.End.tolist() == [1000, 5000, 1002, 604]
Example #21
def adjust_gtf(file, vcf_file, new_file):
    vcf = vcf_file
    df = pr.read_gtf(file).df
    df = df[df['Feature'] == 'gene']
    df = df[df['gene_biotype'] == 'protein_coding']
    df = df[['Chromosome', 'Source', 'Feature', 'Start', 'End', 'Score', 'Strand', 'gene_id']]
    # To ensure we use just chromosome 1-5, excluding Mt and Pt
    df = df[df['Chromosome'].isin(['1', '2', '3', '4', '5'])]
    df.reset_index(inplace=True, drop=True)
    print('processing gtf')
    for idx, shift in enumerate(vcf['SHIFT']):
        if shift != 0:
            position = int(vcf['POS'][idx])
            chrom = vcf['#CHROM'][idx]
            for chr, start, end in zip(enumerate(df['Chromosome']), df['Start'], df['End']):
                start = int(start)
                end = int(end)
                if chr[1] == chrom:
                    if position < start and position < end:
                        df.loc[chr[0], 'Start'] = df.loc[chr[0], 'Start'] + shift
                        df.loc[chr[0], 'End'] = df.loc[chr[0], 'End'] + shift
                    elif start < position < end:
                        df.loc[chr[0], 'End'] = df.loc[chr[0], 'End'] + shift
    df.to_csv(new_file, header=False, index=False, sep='\t')
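A hedged usage sketch: vcf_file is consumed as a table with '#CHROM', 'POS' and 'SHIFT' columns, so a pandas DataFrame with those columns (hypothetical values) works:

import pandas as pd

shifts = pd.DataFrame({"#CHROM": ["1"], "POS": [1000], "SHIFT": [3]})
adjust_gtf("annotation.gtf", shifts, "annotation_shifted.tsv")   # hypothetical paths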
Example #22
def gencode_gtf():

    full_path = get_example_path("gencode_human.gtf.gz")

    return pr.read_gtf(full_path)
Example #23
def ensembl_gtf():

    full_path = get_example_path("ensembl_human.gtf.gz")

    return pr.read_gtf(full_path)
Example #24
def test_read_gtf():

    gr = pr.read_gtf("tests/test_data/ensembl.gtf", full=False)

    assert list(gr.df.columns[:4]) == "Chromosome Start End Strand".split()
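A hedged sketch contrasting the two modes with a hypothetical file; full=False skips full attribute parsing and returns a reduced column set, which is why the test only checks the first four columns:

import pyranges as pr

slim = pr.read_gtf("annotation.gtf", full=False)   # reduced column set
full = pr.read_gtf("annotation.gtf")               # attributes expanded into extra columns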
Example #25
def test_read_gff3():

    gr = pr.read_gtf("tests/test_data/gencode.gff3", full=False)

    assert list(gr.df.columns[:4]) == "Chromosome Start End Strand".split()
Example #26
def gtf():

    return pr.read_gtf("tests/test_data/ensembl.gtf")
Example #27
#!/usr/bin/env python
# coding: utf-8

import pyranges
import os
import numpy as np
import logging

# gtf file with gene annotations
gtf_path = snakemake.input.gtf
target_regions_path = snakemake.input.target_regions

logging.basicConfig(filename=snakemake.log[0])

anno = pyranges.read_gtf(gtf_path)

# filter
protein_coding_genes = anno[(anno.Feature == 'gene')
                            & (anno.gene_biotype == 'protein_coding')]

target_regions = pyranges.read_bed(target_regions_path)
protein_coding_genes = protein_coding_genes.overlap(target_regions)

logging.info('found {} protein coding genes.'.format(
    len(protein_coding_genes)))

id_name = np.array([
    '_'.join([i, n]) for i, n in zip(protein_coding_genes.gene_id,
                                     protein_coding_genes.gene_name)
])
Example #28
def test_read_gtf():

    gr = pr.read_gtf("tests/test_data/ensembl.gtf", full=True)
    assert len(gr.columns) == 28
Example #29
def generateIlluminaWindowFromKb(t2gPath, ecPath, splicePath, unsplicePath,
                                 gtfPath, illuminaWindowDir, windowSize):
    """
    generate illumina windows from kb_python results(workflow: nuclei)
    t2gPath: 
        index file
    ecPath: 
        matrix ec
    splicePath:
        filtered spliced bus
    unsplicePath:
        filtered unspliced bus
    gtfPath:
        gtf anno file, used to create kb ref
    illuminaWindowDir:
        directory storing illumina reads, must end with '/'
    windowSize:
        windowSize
    """
    kbParseTools.mkdir(illuminaWindowDir)
    logger.info("start parse gff file")
    gtfDf = pr.read_gtf(gtfPath, as_df=True)
    gtfDf = gtfDf.query("Feature == 'exon'").reindex(
        ['Chromosome', 'Start', 'End', 'gene_id'], axis=1)
    gtfDf = gtfDf.assign(Gene=lambda x: x["gene_id"]).groupby("Gene").agg({
        "Chromosome":
        lambda x: x.iloc[0],
        "Start":
        'min',
        "End":
        'max'
    })
    gtfDf = gtfDf.assign(
        StartWin=lambda x: x["Start"] // windowSize - 1,
        EndWin=lambda x: x["End"] // windowSize + 1,
    )
    gtfDf = gtfDf.to_dict('index')

    logger.info("start parse bus file")
    kbUmiSpliceMappingInfoDf = kbParseTools.getBustoolsMappingResult(
        t2gPath, ecPath, splicePath)
    kbUmiUnspliceMappingInfoDf = kbParseTools.getBustoolsMappingResult(
        t2gPath, ecPath, unsplicePath)
    kbUmiMappingInfoDf = pd.concat(
        [kbUmiUnspliceMappingInfoDf, kbUmiSpliceMappingInfoDf])

    kbUmiMappingInfoDf = kbUmiMappingInfoDf.groupby('barcodeUmi').agg(
        {'geneLs': lambda x: __getSetOutersect(*x)})
    kbUmiMappingInfoDf = kbUmiMappingInfoDf.assign(
        geneCounts=lambda df: df['geneLs'].map(len)).query("geneCounts >= 1")
    kbUmiMappingInfoDf = kbUmiMappingInfoDf.reset_index().assign(
        barcode=lambda df: df["barcodeUmi"].str.split("_").str[0],
        umi=lambda df: df["barcodeUmi"].str.split("_").str[1],
    ).assign(seq=lambda df: df["barcode"] + df["umi"])

    illuminaWindowContentDt = defaultdict(lambda: defaultdict(lambda: []))
    for oneUmiNt in kbUmiMappingInfoDf.itertuples():
        for gene in oneUmiNt.geneLs:
            geneGtfDt = gtfDf[gene]
            geneChr = geneGtfDt['Chromosome']
            geneStartWin = geneGtfDt['StartWin']
            geneEndWin = geneGtfDt['EndWin']
            for singleWin in range(geneStartWin, geneEndWin + 1):
                illuminaWindowContentDt[geneChr][singleWin].append(
                    f'>{oneUmiNt.barcodeUmi}\n{oneUmiNt.seq}')

    i = 0
    totalCounts = sum([len(x) for x in illuminaWindowContentDt.values()])

    with ThreadPoolExecutor(24) as mtT:
        for chromNum, chromDt in illuminaWindowContentDt.items():
            chromFastaDir = f'{illuminaWindowDir}{chromNum}/'
            kbParseTools.mkdir(chromFastaDir)
            for windowNum, windowLs in chromDt.items():
                i += 1
                mtT.submit(writeWindowFasta, chromFastaDir, windowNum,
                           windowLs, i, totalCounts)
Example #30
File: gff.py Project: nvk747/VaLiAnT
def load_gff_cds(fp: str) -> Tuple[PyRanges, PyRanges]:

    # Load necessary fields from GTF/GFF2 file
    ranges: pd.DataFrame = read_gtf(
        fp, as_df=True)[GFF_FIELDS].rename(columns={'Frame': 'frame'})

    logging.debug("GTF/GFF2 file: %d features found." % ranges.shape[0])

    # Drop unnecessary features
    ranges = ranges[ranges.Feature.isin(GFF_FEATURES)]

    logging.debug("GTF/GFF2 file: %d CDS features found." % ranges.shape[0])

    # Compress identifiers
    ranges.transcript_id = ranges.transcript_id.astype('category')
    ranges.gene_id = ranges.gene_id.astype('category')

    # Extract UTR features
    utr_mask: pd.Series = ranges.Feature == 'UTR'
    utr_ranges: PyRanges = PyRanges(
        df=ranges[utr_mask].drop('Feature', axis=1))
    ranges = ranges[~utr_mask]
    del utr_mask

    # Transform frames
    ranges.frame = _get_frames(ranges.frame)
    assert ranges.frame.dtype == 'int8'

    # Extract stop codon features
    stop_mask: pd.Series = ranges.Feature == 'stop_codon'
    stop_codons: Dict[str, int] = {
        r.transcript_id: r.End if r.Strand == '+' else r.Start
        for r in ranges[stop_mask].itertuples()
    }
    ranges = ranges[~stop_mask]
    del stop_mask

    # Sort CDS features by genomic coordinates
    ranges = ranges.drop('Feature', axis=1).sort_values(by=[
        'gene_id', 'transcript_id', 'Chromosome', 'Strand', 'Start', 'End'
    ],
                                                        ignore_index=True)

    # Validate number of gene and transcript identifiers
    gene_n: int = ranges.gene_id.cat.categories.size
    transcript_n: int = ranges.transcript_id.cat.categories.size

    logging.debug("GTF/GFF2 file: %d genes found." % gene_n)
    logging.debug("GTF/GFF2 file: %d transcripts found." % transcript_n)

    if gene_n == 0 or transcript_n == 0:
        raise ValueError("No gene or transcript ID found in GTF/GFF file!")

    if transcript_n > gene_n:
        raise ValueError(
            "Multiple transcripts per gene in GTF/GFF file are not supported!")

    # Check for missing identifiers
    if ranges.gene_id.isnull().values.any():
        raise ValueError("Missing gene ID in GTF/GFF2 file!")
    if ranges.transcript_id.isnull().values.any():
        raise ValueError("Missing transcript ID in GTF/GFF2 file!")

    # Check only one transcript per gene is present
    gene_transcript_counts: pd.DataFrame = ranges.groupby(
        ['gene_id', 'transcript_id'],
        sort=False).size().reset_index(name='counts')
    gene_transcript_counts = gene_transcript_counts[
        gene_transcript_counts.counts > 0]
    if gene_transcript_counts.shape[0] > gene_n:
        raise ValueError(
            "Multiple transcripts per gene in GTF/GFF file are not supported!")

    # Assign a sequential index to each CDS feature (5' to 3')
    ranges['exon_index'] = ranges.groupby(['transcript_id'],
                                          sort=False).pipe(get_exon_indices)

    # Append the stop codons to the last CDS features
    strands: Set[str] = set(ranges.Strand.cat.categories.values)

    if '+' in strands:
        last_cds_plus: np.ndarray = _get_last_cds_indices(ranges, '+')
        ranges.loc[last_cds_plus, 'End'] += 3
        del last_cds_plus

    if '-' in strands:
        last_cds_minus: np.ndarray = _get_last_cds_indices(ranges, '-')
        ranges.loc[last_cds_minus, 'Start'] -= 3
        del last_cds_minus

    # TODO: validate with stop codon features
    del stop_codons

    # Convert to PyRanges
    cds_ranges: PyRanges = PyRanges(df=ranges)

    return cds_ranges, utr_ranges
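A hedged usage sketch with a hypothetical GTF path; the function returns the CDS ranges (with stop codons folded into the last CDS of each strand) and the UTR ranges:

cds_ranges, utr_ranges = load_gff_cds("annotation.gtf")   # hypothetical file
print(cds_ranges)
print(utr_ranges)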