def checkParentsOverlapTransloInv(filtered_sample_frame, sample_start,
                                  parent_start, sample_end, parent_end, args,
                                  inheritance):
    """Flag translocation/inversion calls that are also present in a parent.

    A call is reported as found in the parent only when both its start
    breakend and its end breakend overlap a parent breakend.

    Args:
        filtered_sample_frame: DataFrame of sample calls; the ``inheritance``
            column is written onto it.
        sample_start, sample_end: start/end breakend tables of the sample,
            convertible to PyRanges and carrying ``SmapEntryID``.
        parent_start, parent_end: start/end breakend tables of the parent.
        args: options object; ``type``, ``mother_duo`` and ``father_duo``
            are read.
        inheritance: name of the output column (``'Found_in_Father'`` or
            ``'Found_in_Mother'``).

    Returns:
        The sample frame with ``inheritance`` holding the strings ``'None'``
        (that parent is not part of the analysis), ``'True'`` or ``'False'``.
    """
    parent_missing = args.type == 'singleton' or (
        args.type == 'duo' and (
            (inheritance == 'Found_in_Father' and args.mother_duo) or
            (inheritance == 'Found_in_Mother' and args.father_duo)))
    if parent_missing:
        # No data for this parent: mark the column with the string 'None'.
        filtered_sample_frame[inheritance] = 'None'
        return filtered_sample_frame

    start_hits = PyRanges(sample_start).overlap(PyRanges(parent_start)).df
    end_hits = PyRanges(sample_end).overlap(PyRanges(parent_end)).df

    # An empty overlap comes back as a frame without columns, so selecting or
    # merging on 'SmapEntryID' would raise KeyError. With either breakend
    # unmatched, no call can be present in the parent.
    if start_hits.empty or end_hits.empty:
        filtered_sample_frame[inheritance] = "False"
        return filtered_sample_frame

    # Keep only calls whose start AND end breakends both hit the parent.
    denovo_parent_frame = pd.merge(start_hits,
                                   end_hits['SmapEntryID'],
                                   on=['SmapEntryID']).drop_duplicates()
    if denovo_parent_frame.empty:
        filtered_sample_frame[inheritance] = "False"
        return filtered_sample_frame

    # Left-merge on all shared columns; the indicator says which sample rows
    # found a matching parent-overlapping row.
    merged = pd.merge(filtered_sample_frame,
                      denovo_parent_frame,
                      on=None,
                      how='left',
                      indicator=inheritance)
    merged[inheritance] = np.where(merged[inheritance] == 'both', 'True',
                                   'False')
    return merged.drop_duplicates().reset_index(drop=True)
def geneOverlapTransloInv(args, sample_start, sample_end, sample_frame):
    """Annotate translocation/inversion calls with genes at each breakend.

    Genes are read from the BED file ``args.genes``. Genes joined to the
    start breakend are reported in ``Name``/``Score`` and genes joined to the
    end breakend in ``Name2``/``Score2``. When one side has no gene hit at
    all, its two columns are filled with the string ``'None'``.

    Args:
        args: options object; ``genes`` (BED path) is read.
        sample_start: start-breakend table, convertible to PyRanges.
        sample_end: end-breakend table, convertible to PyRanges.
        sample_frame: DataFrame of calls keyed by ``SmapEntryID``.

    Returns:
        ``sample_frame`` with the gene annotation columns added.
    """
    gene_frame = pr.read_bed(args.genes)
    start_genes = PyRanges(sample_start).join(
        gene_frame[["Name", "Score"]]).drop(like="_b").df
    end_genes = PyRanges(sample_end).join(
        gene_frame[["Name", "Score"]]).drop(like="_b").df
    end_renames = {'Name': 'Name2', 'Score': 'Score2'}

    if start_genes.empty and end_genes.empty:
        sample_frame['Name'] = sample_frame['Name2'] = sample_frame[
            'Score'] = sample_frame['Score2'] = 'None'
    elif start_genes.empty:
        # Filter on the *renamed* columns: selecting 'Name' after the rename
        # matched nothing and silently dropped Name2/Score2.
        sample_frame = end_genes.rename(columns=end_renames).filter(
            items=['SmapEntryID', 'Name2', 'Score2']).drop_duplicates().merge(
                sample_frame, on=['SmapEntryID'], how='right')
        sample_frame['Name'] = sample_frame['Score'] = 'None'
    elif end_genes.empty:
        sample_frame = start_genes.filter(
            items=['SmapEntryID', 'Name', 'Score']).drop_duplicates().merge(
                sample_frame, on=['SmapEntryID'], how='right')
        sample_frame['Name2'] = sample_frame['Score2'] = 'None'
    else:
        sample_frame = start_genes.filter(
            items=['SmapEntryID', 'Name', 'Score']).drop_duplicates().merge(
                sample_frame, on=['SmapEntryID'], how='right')
        sample_frame = sample_frame.merge(
            end_genes.rename(columns=end_renames).filter(
                items=['SmapEntryID', 'Name2', 'Score2']),
            on=['SmapEntryID'],
            how='left')

    return sample_frame
Exemplo n.º 3
0
def chip_10_plus_one(names):
    """Load ``tests/chip_10_plus_one.bed`` as a stranded PyRanges.

    ``names`` supplies the column names for the header-less file. The rows of
    the backing frame are put in a random order before returning.
    """
    bed = pd.read_table("tests/chip_10_plus_one.bed", header=None, names=names)
    ranges = PyRanges(bed)
    assert ranges.stranded

    # Randomly permute the row order of the underlying frame.
    shuffled_index = np.random.permutation(ranges.df.index)
    ranges.df = ranges.df.reindex(shuffled_index)
    return ranges
def checkRefOverlap(sample_copy, ref_copy, sample_frame):
    """Drop sample calls that reciprocally overlap a reference call.

    Args:
        sample_copy: sample intervals (DataFrame with ``SmapEntryID``).
        ref_copy: reference intervals, convertible to PyRanges.
        sample_frame: full sample table to filter.

    Returns:
        ``sample_frame`` unchanged when nothing overlaps, otherwise the rows
        whose ``SmapEntryID`` is not among the overlapping calls.
    """
    hits = reciprocal_overlap(PyRanges(sample_copy), PyRanges(ref_copy))
    if hits.empty:
        return sample_frame

    matched = sample_copy.merge(hits, on=['SmapEntryID'])
    is_known = sample_frame.SmapEntryID.isin(matched.SmapEntryID)
    return sample_frame[~is_known]
Exemplo n.º 5
0
def _getitem(self, val):
    """Subset ``self`` with a string, tuple or slice and wrap it in a PyRanges.

    Raises:
        Exception: if ``val`` is none of the supported subsetter types.
    """
    handlers = ((str, get_string), (tuple, get_tuple), (slice, get_slice))
    for kind, getter in handlers:
        if isinstance(val, kind):
            frame = getter(self, val)
            break
    else:
        raise Exception("Not valid subsetter: {}".format(str(val)))

    # A getter returning None means nothing matched: hand back an empty object.
    return PyRanges({}) if frame is None else PyRanges(frame)
Exemplo n.º 6
0
def checkRefOverlapINVBND(sample_start, sample_end, ref_start, ref_end, sample_frame):
    """Drop inversion/breakend calls whose two breakends both hit the reference.

    Args:
        sample_start, sample_end: start/end breakend tables of the sample,
            convertible to PyRanges and carrying ``ID``.
        ref_start, ref_end: start/end breakend tables of the reference set.
        sample_frame: full sample table (with an ``ID`` column) to filter.

    Returns:
        ``sample_frame`` without the rows whose ``ID`` overlaps the reference
        at both the start and the end breakend.
    """
    overlap_start = PyRanges(sample_start).overlap(PyRanges(ref_start)).df
    overlap_end = PyRanges(sample_end).overlap(PyRanges(ref_end)).df

    # If either side has no hit, no call can match on both breakends. This
    # must be `or`: an empty overlap has no 'ID' column, so merging a
    # one-sided empty result would raise KeyError.
    if overlap_start.empty or overlap_end.empty:
        return sample_frame

    overlap_frame = overlap_start.merge(overlap_end, on=['ID'])
    if overlap_frame.empty:
        return sample_frame

    return sample_frame[~sample_frame.ID.isin(overlap_frame.ID)]
Exemplo n.º 7
0
def dfs_min(draw):
    """Draw a small interval frame and return it as a row-shuffled PyRanges.

    ``End`` is turned into ``Start + End`` and constant ``Name``/``Score``
    columns are inserted at positions 3 and 4.
    """
    frame = draw(better_dfs_min)
    frame.loc[:, "End"] += frame.Start
    for position, (column, fill) in enumerate((("Name", "a"), ("Score", 0)),
                                              start=3):
        frame.insert(position, column, fill)

    ranges = PyRanges(frame)

    # Shuffle the rows with a drawn seed: users may order the underlying
    # frame however they like and PyRanges must still work.
    seed = draw(st.integers(min_value=0, max_value=int(1e6)))
    np.random.seed(seed)
    shuffled_index = np.random.permutation(frame.index.values)
    ranges.df = frame.reindex(shuffled_index)

    return ranges
Exemplo n.º 8
0
def to_ranges(grles, nb_cpu=1):
    """Build a PyRanges by converting every item of ``grles`` to a DataFrame.

    Args:
        grles: object exposing ``stranded`` and ``items()``; each
            ``(key, value)`` pair is converted with the stranded or
            unstranded converter.
        nb_cpu: when greater than 1, conversions are dispatched through ray
            with that many CPUs.

    Returns:
        A PyRanges built from the ``{key: DataFrame}`` mapping.
    """
    # Imported here, as in the original, rather than at module level.
    from pyranges import PyRanges

    func = to_ranges_df_strand if grles.stranded else to_ranges_df_no_strand
    parallel = nb_cpu > 1

    if parallel:
        import ray
        ray.init(num_cpus=nb_cpu)

    try:
        if parallel:
            submit = ray.remote(func).remote
            get = ray.get
        else:
            # Serial path: call the converter directly instead of attaching
            # a `.remote` attribute to the module-level function.
            submit = func
            get = lambda x: x

        keys, pending = [], []
        for k, v in grles.items():
            pending.append(submit(v, k))
            keys.append(k)

        dfs = dict(zip(keys, get(pending)))
    finally:
        # Always release ray, even if a conversion failed.
        if parallel:
            ray.shutdown()

    return PyRanges(dfs)
Exemplo n.º 9
0
def _getitem(self, val):
    """Subset ``self`` and return the selection as a new PyRanges.

    Supported subsetters: list, str, tuple, slice, dict, and a boolean
    ``pd.Series``/``np.ndarray`` with one entry per row.

    Raises:
        Exception: if ``val`` is not a supported subsetter.
    """
    if isinstance(val, list):
        selected = _keep(self, keep=val).dfs
    elif isinstance(val, str):
        selected = get_string(self, val)
    elif isinstance(val, tuple):
        selected = get_tuple(self, val)
    elif isinstance(val, slice):
        selected = get_slice(self, val)
    elif isinstance(val, dict):
        selected = get_booldict(self, val)
    elif isinstance(val, (pd.Series, np.ndarray)) and val.dtype == "bool":
        assert len(val) == len(
            self), "Boolean indexer must be same length as pyrange!"
        mask = val.values if isinstance(val, pd.Series) else val

        # Walk the per-key frames in iteration order, giving each one the
        # matching consecutive slice of the flat boolean mask.
        selected = {}
        offset = 0
        for key, frame in self:
            stop = offset + len(frame)
            selected[key] = frame[mask[offset:stop]]
            offset = stop
    else:
        raise Exception("Not valid subsetter: {}".format(str(val)))

    return PyRanges(selected)
Exemplo n.º 10
0
    def load(self, fp: str) -> None:
        """Load PAM protection variants from the VCF at ``fp``.

        Records whose ``SGRNA`` INFO value is a key of ``self._variants`` are
        added to that sgRNA's collection. A table with one row per stored
        variant is then built and kept as ``self._ranges``.
        """

        # Collect the variants of known sgRNAs; skip everything else.
        with get_vcf(fp) as variant_file:
            for record in variant_file.fetch():
                sgrna_id: str = record.info['SGRNA'].strip()
                if sgrna_id not in self._variants:
                    continue
                self._variants[sgrna_id].add(
                    PamVariant.from_variant_record(record))

        # Report what was loaded, including sgRNAs left without variants.
        logging.debug("Collected %d PAM protection variants." % self.count)
        for sgrna_id in self.sgrna_ids:
            if not self._variants[sgrna_id]:
                logging.info("No PAM protection variants for sgRNA %s." %
                             sgrna_id)

        # One record per variant: its range fields followed by the sgRNA ID.
        records = []
        for owner_id, variants in self._variants.items():
            for variant in variants:
                records.append((*variant.get_pyrange_record(), owner_id))

        table: pd.DataFrame = pd.DataFrame.from_records(
            records, columns=['Chromosome', 'Start', 'End', 'sgrna_id'])
        table.sgrna_id = table.sgrna_id.astype('category')
        table['variant_id'] = get_id_column(table.shape[0])
        self._ranges = PyRanges(table)
Exemplo n.º 11
0
def dfs_min(draw):  # nosec
    """Draw a small interval frame and return it as an int64 PyRanges.

    ``End`` is turned into ``Start + End`` and constant ``Name``/``Score``
    columns are inserted at positions 3 and 4.
    """
    frame = draw(better_dfs_min)
    frame.loc[:, "End"] += frame.Start

    for position, (column, fill) in enumerate((("Name", "a"), ("Score", 0)),
                                              start=3):
        frame.insert(position, column, fill)

    # Rows are deliberately not shuffled here; use the PyRanges sort instead.
    return PyRanges(frame, int64=True)
Exemplo n.º 12
0
def chip_chr1():
    """Return ``coverage`` of a two-interval, stranded chr1 ChIP fixture."""

    c = """Chromosome Start End Strand
chr1 5 7 +
chr1 3 10 -"""

    # Raw string: "\s" in a normal literal is an invalid escape sequence
    # (a warning today, slated to become an error).
    return coverage(PyRanges(pd.read_table(StringIO(c), sep=r"\s+")))
Exemplo n.º 13
0
def background_chr1():
    """Return ``coverage`` of a two-interval, stranded chr1 background fixture."""

    c = """Chromosome Start End Strand
chr1 1 4 +
chr1 2 5 -"""

    # Raw string: "\s" in a normal literal is an invalid escape sequence
    # (a warning today, slated to become an error).
    return coverage(PyRanges(pd.read_table(StringIO(c), sep=r"\s+")))
Exemplo n.º 14
0
def test_subtraction(gr, gr2, strandedness):
    """Compare ``gr.subtraction(gr2)`` against the bedtools result.

    Both inputs are written to temporary BED files, ``subtraction_command``
    is run on them through bash, and its output is parsed into a frame that
    the PyRanges result must equal (or both must be empty).
    """

    print("gr\n", gr)
    print("gr2\n", gr2)
    bedtools_strand = {False: "", "same": "-s", "opposite": "-S"}[strandedness]

    with tempfile.TemporaryDirectory() as temp_dir:
        f1 = "{}/f1.bed".format(temp_dir)
        f2 = "{}/f2.bed".format(temp_dir)
        gr.df.to_csv(f1, sep="\t", header=False, index=False)
        gr2.df.to_csv(f2, sep="\t", header=False, index=False)

        cmd = subtraction_command.format(bedtools_strand, f1, f2)

        # The command template is a shell string, hence shell=True; its
        # arguments are only the flag above and our own temp-file paths.
        bedtools_output = subprocess.check_output(
            cmd, shell=True, executable="/bin/bash").decode()

        # `squeeze=True` was dropped: pandas 2.0 removed it, and it never
        # applied to a six-column table.
        bedtools_df = pd.read_table(
            StringIO(bedtools_output),
            header=None,
            names="Chromosome Start End Name Score Strand".split())

    result = gr.subtraction(gr2, strandedness=strandedness)
    print("result\n", result)
    print("bedtools_df\n", PyRanges(bedtools_df))

    if not bedtools_df.empty:
        assert_df_equal(result.df, bedtools_df)
    else:
        assert bedtools_df.empty == result.df.empty
Exemplo n.º 15
0
def methylation_pyranges_from_csv(inputfile):
    """Read a tab-separated methylation table into a PyRanges.

    File columns 0, 1, 2, 4 and 5 are kept and named ``Chromosome``,
    ``Start``, ``End``, ``calls`` and ``methylated``; the file's own header
    row is replaced by these names.
    """
    table = pd.read_csv(
        inputfile,
        sep="\t",
        header=0,
        usecols=[0, 1, 2, 4, 5],
        names=["Chromosome", "Start", "End", "calls", "methylated"],
    )
    return PyRanges(table)
def exonOverlap(args, df):
    """List the exon/gene hits of the intervals in ``df``, ranked by score.

    Args:
        args: options object; ``exons`` (BED path) and ``genelist`` (frame
            merged on ``gene``) are read.
        df: interval table convertible to PyRanges.

    Returns:
        An empty DataFrame when no exon overlaps; otherwise the de-duplicated
        hits joined with ``args.genelist`` and sorted by ``score``, highest
        first.
    """
    exon_frame = pr.read_bed(args.exons)
    hits = PyRanges(df).join(exon_frame).drop(like="_b").df

    if hits.empty:
        return pd.DataFrame()

    # Drop the coordinates and relabel the BED name/score columns.
    exon_calls = hits.drop(columns=['Chromosome', 'Start', 'End'])
    exon_calls = exon_calls.rename(columns={
        'Name': 'gene',
        'Score': 'OMIM_syndrome'
    })
    exon_calls = exon_calls.drop_duplicates()

    # Attach gene-list scores; genes absent from the list score 0.
    exon_calls = exon_calls.merge(args.genelist, on=['gene'], how='left')
    exon_calls = exon_calls.fillna(value={'score': 0, 'normalized_score': 0})
    return exon_calls.sort_values(by='score', ascending=False)
Exemplo n.º 17
0
def f1(names):
    """Load ``tests/f1.bed`` as a PyRanges with the six standard BED columns.

    ``names`` is accepted but not used; the column names are fixed here.
    """
    columns = "Chromosome  Start  End  Name Score Strand".split()
    bed = pd.read_csv("tests/f1.bed", sep="\t", header=None, names=columns)
    return PyRanges(bed)
Exemplo n.º 18
0
def test_cds_context_repository_get_cds_genomic_ranges(strand, len5p, len3p, exp_cds_pre, exp_cds_suf):
    """Check the CDS prefix/suffix ranges returned for exon 1 of ``TID``."""
    repository = CDSContextRepository(PyRanges(df=CDS_RANGES_DF))
    observed = repository.get_cds_genomic_ranges(
        TID, strand, 1, GR, 0, 0, len5p, len3p)
    cds_pre, cds_suf = observed

    assert cds_pre == exp_cds_pre
    assert cds_suf == exp_cds_suf
Exemplo n.º 19
0
def genomic_ranges_to_unstranded_pyranges(
        genomic_ranges: Iterable[GenomicRange]) -> PyRanges:
    """Build a de-duplicated PyRanges from the unstranded form of each range."""
    records = [gr.as_unstranded_pyrange() for gr in genomic_ranges]
    frame = pd.DataFrame.from_records(
        chain(records), columns=['Chromosome', 'Start', 'End'])
    return PyRanges(df=frame.drop_duplicates())
Exemplo n.º 20
0
def chip_10(names):
    """Load ``tests/chip_10.bed`` (tab-separated, no header) as a stranded PyRanges."""
    bed = pd.read_csv("tests/chip_10.bed", header=None, names=names, sep="\t")
    ranges = PyRanges(bed)

    # The fixture is only valid if the file carried strand information.
    assert ranges.stranded
    return ranges
Exemplo n.º 21
0
def input_10(names):
    """Load ``tests/input_10.bed`` (no header) as a stranded PyRanges."""
    bed = pd.read_table("tests/input_10.bed", header=None, names=names)
    ranges = PyRanges(bed)

    # The fixture is only valid if the file carried strand information.
    assert ranges.stranded
    return ranges
Exemplo n.º 22
0
def introns():
    """Load ``tests/intron.txt`` as a PyRanges, printing some diagnostics.

    The first three columns are named ``Chromosome``, ``Start`` and ``End``;
    the remaining columns keep their positional labels.
    """
    frame = pd.read_table("tests/intron.txt", sep="\t", header=None)

    print(frame.head())
    print(frame.shape)

    frame.columns = [*"Chromosome Start End".split(), *frame.columns[3:]]
    print(frame.columns)

    return PyRanges(frame)
Exemplo n.º 23
0
def _getitem(self, val):
    """Subset ``self`` with a list, str, tuple, slice or dict.

    Returns:
        A PyRanges of the selection, or an empty PyRanges when the selection
        is ``None``.

    Raises:
        Exception: if ``val`` is not a supported subsetter.
    """
    if isinstance(val, list):
        selected = _drop(self, keep=val).dfs
    else:
        handlers = ((str, get_string), (tuple, get_tuple),
                    (slice, get_slice), (dict, get_booldict))
        for kind, getter in handlers:
            if isinstance(val, kind):
                selected = getter(self, val)
                break
        else:
            raise Exception("Not valid subsetter: {}".format(str(val)))

    return PyRanges({}) if selected is None else PyRanges(selected)
Exemplo n.º 24
0
def test_cds_context_repository_compute_cds_contexts(start, end, exp_ext_5, exp_ext_3):
    """Check the CDS context computed for one '+' strand target on chromosome X."""
    chromosome, strand = 'X', '+'
    cds_ranges = PyRanges(df=CDS_RANGES_DF)

    # Single-row target table built from the range under test.
    gr = GenomicRange(chromosome, start, end, strand)
    target_frame = pd.DataFrame.from_records(
        [gr.as_pyrange()], columns=PYRANGES_FIELDS)
    target_ranges = PyRanges(df=target_frame)
    target_ranges.is_const = False

    # Set up the repository and compute the contexts.
    ccr = CDSContextRepository(cds_ranges)
    ccr.register_target_ranges(target_ranges)
    ccr.compute_cds_contexts()

    # Exactly one context is expected, keyed by the target range.
    assert len(ccr._target_cds_contexts) == 1
    exon_info, (ext_5, ext_3) = ccr._target_cds_contexts[gr]

    # Exon information
    assert isinstance(exon_info, ExonInfo)
    assert exon_info.gene_id == GID
    assert exon_info.transcript_id == TID

    # CDS extensions: 5' first, then 3'
    for observed, expected in ((ext_5, exp_ext_5), (ext_3, exp_ext_3)):
        if expected is None:
            assert observed is None
        else:
            assert observed == GenomicRange(chromosome, *expected, strand)

    # Accessors must agree with the stored context
    assert ccr.get_cds_extensions(gr) == (ext_5, ext_3)
    assert ccr.get_exon_info(gr) == exon_info
    assert ccr.get_transcript_info(gr) == exon_info.transcript_info
Exemplo n.º 25
0
def expected_result_previous_bed_unstranded():
    """Return the expected unstranded 'previous' result table as a PyRanges."""

    # Space-separated table; the first line is the header row.
    c = """Chromosome Start End Name Score Strand Start_b End_b Name_b Score_b Strand_b Distance
0 chr1 3 6 h 0 + 1 2 f 0 + 2
1 chr1 5 7 h 0 - 6 7 f 0 - 0
2 chr1 8 9 h 0 + 6 7 f 0 - 2"""

    expected = pd.read_table(StringIO(c), sep=" ", header=0)
    print(expected)

    return PyRanges(expected)
Exemplo n.º 26
0
def test_cds_context_repository_get_cds_by_index():
    """Look up the first CDS record by transcript/exon index; bad index must raise."""
    (chromosome, strand, start, end, _, transcript_id, _,
     exon_index) = CDS_RANGES[0]
    repository = CDSContextRepository(PyRanges(df=CDS_RANGES_DF))
    found = repository.get_cds_by_index(transcript_id, exon_index)

    assert found.chromosome == chromosome
    assert found.strand == strand
    assert found.start == start
    assert found.end == end

    # An exon index that does not exist must be rejected.
    with pytest.raises(Exception):
        repository.get_cds_by_index(transcript_id, 999)
Exemplo n.º 27
0
def read_bam_bin_counts(bins: PyRanges, bams: Dict[str, str], excluded: PyRanges = None, **kwargs) -> AnnData:
    """Count reads per bin for each BAM file.

    Parameters
    ----------
    bins : pyranges.PyRanges
        bins in which to count reads
    bams : Dict[str, str]
        BAM filenames keyed by cell id
    excluded : pyranges.PyRanges, optional
        regions whose reads are removed before counting
    **kwargs
        forwarded to ``pr.read_bam``

    Returns
    -------
    ad.AnnData
        read counts with cells as observations and bins as variables
    """

    bin_data = _add_bin_index(_convert_pyranges(bins))

    per_cell_counts = {}
    for cell_id, cell_bam in bams.items():
        logging.info(f"reading {cell_bam}")
        reads = pr.read_bam(cell_bam, **kwargs)

        if excluded is not None:
            logging.info("excluding reads")
            reads = reads.intersect(excluded, invert=True)

        logging.info("count overlaps")
        # Restrict to reads selected by a containment intersect with the
        # bins, then count them per bin.
        reads = reads.intersect(bins, how='containment')
        counted = bins.count_overlaps(reads, overlap_col='reads')

        counted = _add_bin_index(_convert_pyranges(counted))
        per_cell_counts[cell_id] = counted['reads']

    cn_matrix = pd.DataFrame(per_cell_counts)

    cell_ids = cn_matrix.columns.values
    cell_data = pd.DataFrame({'cell_id': cell_ids}).set_index('cell_id')

    return ad.AnnData(
        cn_matrix.T,
        obs=cell_data,
        var=bin_data,
    )
Exemplo n.º 28
0
    def calc_windowed_seg_sites(self, chrom=0, L=1e3, filt_rec=True, mask=None):
        """Calculate windowed estimates of segregating sites.

        Arguments:
            * chrom: identifier for the chromosome
            * L: length of independent locus
            * filt_rec: filter recombination
            * mask: bed file for the underlying mask

        The result is stored in ``self.chrom_total_dict[chrom]`` as a 4-row
        array: weighted site counts per window, right window edges,
        recombination positions interpolated at the window midpoints, and the
        per-window mask weights (all ones without a mask).
        """
        assert self.chrom_pos_dict is not None
        phys_pos = self.chrom_physpos_dict[chrom]
        rec_pos = self.chrom_pos_dict[chrom]
        weights = self.chrom_weight_dict[chrom]
        if filt_rec:
            # Keep only sites whose recombination position differs from the
            # next site's.
            diff = np.abs(rec_pos[:-1] - rec_pos[1:])
            idx = np.where(diff != 0)[0]
            phys_pos = phys_pos[idx]
            rec_pos = rec_pos[idx]
            weights = weights[idx]
        df_mask = None
        if mask is not None:
            # astype() copies, so the stored positions are not modified.
            phys_pos = phys_pos.astype(np.float64)
            # Read the mask once; it is reused for the window weights below.
            df_mask = pyranges.read_bed(mask)
            df_pos = PyRanges(chromosomes=chrom, starts=phys_pos, ends=(phys_pos + 1))
            cov_sites = df_pos.coverage(df_mask)
            sites_idx = np.array(cov_sites.FractionOverlaps.astype(np.float32))
            idx = np.where(sites_idx > 0.0)[0]
            # Sites covered by the mask are flagged as NaN.
            phys_pos[idx] = np.nan
        unmasked = ~np.isnan(phys_pos)
        # 1. Setup the bins for the analysis
        bins = np.arange(np.nanmin(phys_pos), np.nanmax(phys_pos), L)
        windowed_vars, bin_edges = np.histogram(
            phys_pos[unmasked],
            bins=bins,
            weights=weights[unmasked],
        )
        bin_edges = bin_edges.astype(np.uint32)
        # Interpolate the midpoints of the recombination bins. Only unmasked
        # sites are used: NaN x-values make interp1d's result undefined.
        f = interpolate.interp1d(phys_pos[unmasked], rec_pos[unmasked])
        midpts = bin_edges[:-1] + (bin_edges[1:] - bin_edges[:-1]) / 2
        rec_midpts = f(midpts)
        # Calculate the weightings from the mask as needed ...
        mask_weights = np.ones(rec_midpts.size)
        if mask is not None:
            # Mask must be a bedfile
            df_windows = PyRanges(
                chromosomes=chrom, starts=bin_edges[:-1], ends=bin_edges[1:]
            )
            cov = df_windows.coverage(df_mask)
            mask_weights = np.array(cov.FractionOverlaps.astype(np.float32))
            # Set the mask weights to scale up the fraction that may be missing!
            mask_weights = 1.0 / (1.0 - mask_weights)
            # Fully masked windows divide by zero; mark them as NaN.
            mask_weights[np.isinf(mask_weights)] = np.nan

        # Stacking all of the data to make sure that we can use it later on
        tot_data = np.vstack([windowed_vars, bin_edges[1:], rec_midpts, mask_weights])
        self.chrom_total_dict[chrom] = tot_data
Exemplo n.º 29
0
def expected_result_previous_bed_opposite_stranded(names):
    """Return the expected opposite-strand 'previous' result table as a PyRanges.

    ``names`` is accepted but not used; the column names are fixed here.
    """

    c = """chr1 8 9 h 0 + 6 7 f 0 - 2
chr1 5 7 h 0 - 1 2 f 0 + 4"""

    columns = "Chromosome  Start  End  Name Score Strand Start_b  End_b  Name_b Score_b Strand_b Distance".split()
    expected = pd.read_table(StringIO(c), sep=" ", header=None, names=columns)
    print(expected)

    return PyRanges(expected)
def checkParentsOverlap(sample_copy, parent_copy, filtered_sample_frame, args,
                        inheritance):
    """Flag sample calls that reciprocally overlap a call in one parent.

    Args:
        sample_copy: sample intervals, convertible to PyRanges.
        parent_copy: parent intervals, convertible to PyRanges.
        filtered_sample_frame: DataFrame of sample calls; the ``inheritance``
            column is written onto it.
        args: options object; ``type``, ``mother_duo`` and ``father_duo``
            are read.
        inheritance: name of the output column (``'Found_in_Father'`` or
            ``'Found_in_Mother'``).

    Returns:
        The sample frame with ``inheritance`` set to the string ``'None'``
        when that parent is not part of the analysis, the string ``"False"``
        when nothing overlaps at all, and booleans otherwise.
    """
    if args.type == 'singleton':
        parent_unavailable = True
    elif args.type == 'duo':
        # In a duo, the column for the parent that was not sequenced is
        # not evaluated.
        parent_unavailable = bool(
            (inheritance == 'Found_in_Father' and args.mother_duo)
            or (inheritance == 'Found_in_Mother' and args.father_duo))
    else:
        parent_unavailable = False

    if parent_unavailable:
        filtered_sample_frame[inheritance] = 'None'
        return filtered_sample_frame

    wanted = [
        'SmapEntryID', 'RefcontigID1', 'RefcontigID2', 'RefStartPos',
        'RefEndPos', 'QryStartPos', 'QryEndPos', 'Confidence', 'Type',
        'Zygosity', 'Genotype'
    ]
    parent_hits = reciprocal_overlap(PyRanges(sample_copy),
                                     PyRanges(parent_copy))[wanted]

    if parent_hits.empty:
        filtered_sample_frame[inheritance] = "False"
        return filtered_sample_frame

    # Left-merge on all shared columns; the indicator says which sample rows
    # found a matching parent-overlapping row.
    merged = pd.merge(filtered_sample_frame,
                      parent_hits,
                      on=None,
                      how='left',
                      indicator=inheritance)
    merged[inheritance] = np.where(merged[inheritance] == 'both', True, False)
    return merged.drop_duplicates().reset_index(drop=True)