Exemplo n.º 1
0
def expected_result_same_strand_intersection_simple_granges():
    """Return the expected PyRanges: one chr1 interval, 6-7, strand "-", Score 7."""
    table = """Chromosome Start End Strand Score
chr1    6   7   - 7"""

    # Raw string: "\s" is not a valid escape sequence in a plain literal.
    frame = pd.read_table(StringIO(table), sep=r"\s+", header=0)
    return PyRanges(frame)
Exemplo n.º 2
0
def read_gtf(f):
    """Read a GTF file into a PyRanges.

    Only seven of the nine tab-separated GTF columns are read (the source
    and frame columns are skipped) and they are renamed as follows:

    - seqname   -> Chromosome (category)
    - feature   -> Feature (category)
    - start     -> Start; sequence numbering starts at 1 in GTF and the
      value is kept as read
    - end       -> End
    - score     -> Score; dropped from the result when every row holds "."
    - strand    -> Strand (category), "+" or "-"
    - attribute -> parsed by ``_fetch_gene_transcript_exon_id``; the four
      columns it returns are named GeneID, TranscriptID, ExonNumber and
      ExonID

    Lines starting with "#" are treated as comments.

    Parameters
    ----------
    f : path or file-like object accepted by ``pandas.read_table``

    Returns
    -------
    PyRanges
    """
    dtypes = {"Chromosome": "category", "Feature": "category", "Strand": "category"}
    names = "Chromosome Feature Start End Score Strand Attribute".split()

    df = pd.read_table(f, sep="\t", comment="#", usecols=[0, 2, 3, 4, 5, 6, 8],
                       header=None, names=names, dtype=dtypes)

    cols_to_concat = "Chromosome Start End Strand Feature".split()
    # Keep Score only when at least one row carries something other than ".".
    if not (df.Score == ".").all():
        cols_to_concat.append("Score")

    extract = _fetch_gene_transcript_exon_id(df.Attribute)
    extract.columns = "GeneID TranscriptID ExonNumber ExonID".split()

    # Cast to float, presumably so rows without an exon number can be NaN.
    extract.ExonNumber = extract.ExonNumber.astype(float)

    df = pd.concat([df[cols_to_concat], extract], axis=1)

    return PyRanges(df)
Exemplo n.º 3
0
def expected_result_unstranded():
    """Return the expected PyRanges for the unstranded case.

    The table is space separated. Each data row starts with a row label
    (0, 1) that has no name in the header line.
    """
    text = """Chromosome Start End Name Score Strand Start_b End_b Name_b Score_b Strand_b Distance
0 chr1 3 6 h 0 + 6 7 f 0 - 1
1 chr1 5 7 h 0 - 6 7 f 0 - 0"""

    frame = pd.read_table(StringIO(text), sep=" ")
    return PyRanges(frame)
Exemplo n.º 4
0
def hyp4():
    """Return a PyRanges of three plus-strand chr1 intervals: 0-3, 3-4 and 1-2."""
    table = """chr1      0    3      +
chr1      3    4      +
chr1      1    2      +"""

    column_names = "Chromosome Start End Strand".split()
    # Raw string: "\s" is not a valid escape sequence in a plain literal.
    frame = pd.read_table(StringIO(table), sep=r"\s+", header=None, names=column_names)
    return PyRanges(frame)
Exemplo n.º 5
0
def expected_result_subtract_simple_granges():
    """Return the expected PyRanges: two plus-strand chr1 intervals, 3-6 and 8-9."""
    table = """Chromosome Start End Strand Score
chr1	3	6	+ 5
chr1	8	9	+ 1"""
    # Rows mix tabs and spaces, so split on any whitespace run.
    # Raw string: "\s" is not a valid escape sequence in a plain literal.
    frame = pd.read_table(StringIO(table), sep=r"\s+", header=0)
    return PyRanges(frame)
Exemplo n.º 6
0
def expected_result_counterexample5():
    """Return the expected PyRanges for counterexample 5.

    Two chr2 rows, read without a header line and given the columns
    Chromosome, Start, End, Strand, Start_b, End_b, Strand_b, Distance.
    """
    table = """chr2  0  1  +  1   2   +         1
chr2  2  3  +  1   3   +         0"""

    column_names = "Chromosome Start End Strand Start_b End_b Strand_b Distance".split()
    # Raw string: "\s" is not a valid escape sequence in a plain literal.
    frame = pd.read_table(StringIO(table), sep=r"\s+", header=None, names=column_names)
    return PyRanges(frame)
Exemplo n.º 7
0
def expected_result_regular_intersection():
    """Return the expected PyRanges: three headerless, tab-separated intervals.

    Columns are named Chromosome, Start, End, Name, Score, Strand.
    """
    text = """chr1	226987603	226987617	U0	0	+
chr8	38747236	38747251	U0	0	-
chr15	26105515	26105518	U0	0	+"""

    column_names = "Chromosome Start End Name Score Strand".split()
    frame = pd.read_table(StringIO(text), header=None, names=column_names)
    return PyRanges(frame)
def expected_result_same_stranded():
    """Return the expected PyRanges for the same-strand case.

    The table is space separated. Each data row starts with a row label
    (0, 1) that has no name in the header line.
    """
    text = """Chromosome Start End Name Score Strand Start_b End_b Name_b Score_b Strand_b Distance
0 chr1 10241 10440 HWI-ST216_313:3:1302:4516:156396 1 - 9988 10187 HWI-ST216:427:D29R1ACXX:2:1205:6095:16532 1 - 55
1 chr1 110246 110445 HWI-ST216_313:3:1207:4315:142177 1 + 16109 16308 HWI-ST216:427:D29R1ACXX:2:2110:12286:25379 1 + 93939"""

    frame = pd.read_table(StringIO(text), sep=" ")
    return PyRanges(frame)
def expected_result_opposite_stranded():
    """Return the expected PyRanges for the opposite-strand case.

    The table is space separated. Each data row starts with a row label
    (0, 1) that has no name in the header line.
    """
    text = """Chromosome Start End Name Score Strand Start_b End_b Name_b Score_b Strand_b Distance
0 chr1 10246 10445 HWI-ST216_313:3:1207:4315:142177 1 + 9988 10187 HWI-ST216:427:D29R1ACXX:2:1205:6095:16532 1 - 60
1 chr1 110246 110445 HWI-ST216_313:3:1207:4315:142177 1 + 19958 20157 HWI-ST216:427:D29R1ACXX:2:1313:6283:67310 1 - 90090"""

    frame = pd.read_table(StringIO(text), sep=" ")
    return PyRanges(frame)
Exemplo n.º 10
0
def read_bed(f, output_df=False):
    """Read a tab-separated BED file.

    The second whitespace-separated field of the first line decides whether
    the file has a header: if it does not parse as an int, the first line is
    treated as a header row. Whatever the file's own header says, the columns
    are renamed to the leading BED names (Chromosome, Start, End, Name, ...),
    taking as many names as the file has columns.

    Parameters
    ----------
    f : str
        Path to the BED file. It is opened once to inspect the first line and
        then handed to ``pandas.read_csv``.
    output_df : bool, default False
        Return the pandas DataFrame instead of wrapping it in a PyRanges.

    Returns
    -------
    PyRanges or pandas.DataFrame

    Raises
    ------
    IndexError
        If the first line has fewer than two fields (e.g. an empty file).
    """
    columns = "Chromosome Start End Name Score Strand ThickStart ThickEnd ItemRGB BlockCount BlockSizes BlockStarts".split()

    # Context manager so the handle is closed deterministically.
    with open(f) as handle:
        first_start = handle.readline().split()[1]

    try:
        int(first_start)
    except ValueError:
        # Non-numeric "Start" in the first line: treat it as a header row.
        header = 0
    else:
        header = None

    # NOTE(review): with header=None the parsed columns carry integer labels,
    # so these name-keyed dtypes presumably do not apply -- confirm.
    df = pd.read_csv(f,
                     dtype={
                         "Chromosome": "category",
                         "Strand": "category"
                     },
                     header=header,
                     sep="\t")

    df.columns = columns[:df.shape[1]]

    if output_df:
        return df
    return PyRanges(df)
Exemplo n.º 11
0
def expected_result_overlap_same_strand():
    """Return the expected PyRanges: one chr15 row whose Strand and Strand_b are both "+"."""
    table = """Chromosome Start End Name Score Strand Start_b End_b Name_b Score_b Strand_b
chr15	26105515	26105540	U0	0	+	26105493	26105518	U0	0	+"""

    # Header is space separated and the row tab separated, so split on any
    # whitespace run. Raw string: "\s" is not a valid escape sequence.
    frame = pd.read_table(StringIO(table), header=0, sep=r"\s+")
    return PyRanges(frame)
Exemplo n.º 12
0
def read_bam(f,
             sparse=True,
             output_df=False,
             mapq=0,
             required_flag=0,
             filter_flag=1540):
    """Read a BAM file through the optional ``bamread`` package.

    Parameters
    ----------
    f : str
        Path to the BAM file, forwarded to ``bamread``.
    sparse : bool, default True
        If True call ``bamread.read_bam``, otherwise ``bamread.read_bam_full``.
    output_df : bool, default False
        Return whatever ``bamread`` produced instead of wrapping it in a
        PyRanges.
    mapq, required_flag, filter_flag : int
        Forwarded positionally, in this order, to the ``bamread`` reader.

    Returns
    -------
    PyRanges, or the raw ``bamread`` result when ``output_df`` is True.

    Notes
    -----
    Exits the process with status 1, after printing a hint, when ``bamread``
    is not installed or is too old to provide ``read_bam_full``.
    """
    try:
        import bamread
    except ModuleNotFoundError:
        print(
            "bamread must be installed to read bam. Use `conda install -c bioconda bamread` or `pip install bamread` to install it."
        )
        sys.exit(1)

    if sparse:
        df = bamread.read_bam(f, mapq, required_flag, filter_flag)
    else:
        try:
            df = bamread.read_bam_full(f, mapq, required_flag, filter_flag)
        except AttributeError:
            print(
                "bamread version 0.0.6 or higher is required to read bam non-sparsely."
            )
            # Without this exit `df` is unbound and the function would fail
            # below with UnboundLocalError instead of the message above.
            sys.exit(1)

    if output_df:
        return df
    return PyRanges(df)
Exemplo n.º 13
0
def simple_gr2():
    """Return a PyRanges with two chr1 intervals: 1-2 (+, Score 1) and 6-7 (-, Score 2)."""
    table = """Chromosome Start End Strand Score
chr1 1 2 + 1
chr1 6 7 - 2"""
    # Raw string: "\s" is not a valid escape sequence in a plain literal.
    frame = pd.read_table(StringIO(table), sep=r"\s+", header=0)
    return PyRanges(frame)
Exemplo n.º 14
0
def expected_result_overlap_opposite_strand():
    """Return the expected PyRanges: two rows whose Strand and Strand_b differ."""
    table = """Chromosome Start End Name Score Strand Start_b End_b Name_b Score_b Strand_b
chr8	38747226	38747251	U0	0	-	38747236	38747261	U0	0	+
chr1	226987592	226987617	U0	0	+	226987603	226987628	U0	0	-"""

    # Header is space separated and the rows tab separated, so split on any
    # whitespace run. Raw string: "\s" is not a valid escape sequence.
    frame = pd.read_table(StringIO(table), sep=r"\s+")
    return PyRanges(frame)
Exemplo n.º 15
0
def read_gtf_restricted(f,
                        annotation=None,
                        output_df=False,
                        skiprows=0,
                        nrows=None):
    """Read a restricted set of GTF columns, in chunks of 100 000 rows.

    Only seven of the nine tab-separated GTF columns are read (the source
    and frame columns are skipped) and they are renamed as follows:

    - seqname   -> Chromosome (category)
    - feature   -> Feature (category)
    - start     -> Start; 1-based in the file, shifted down by one here
    - end       -> End
    - score     -> Score; left out of a chunk when every row in it holds "."
    - strand    -> Strand (category), "+" or "-"
    - attribute -> parsed by ``_fetch_gene_transcript_exon_id``; the four
      columns it returns are named gene_id, transcript_id, exon_number and
      exon_id

    Lines starting with "#" are treated as comments.

    Parameters
    ----------
    f : path or file-like object accepted by ``pandas.read_csv``
    annotation : optional
        Forwarded unchanged to ``_fetch_gene_transcript_exon_id``.
    output_df : bool, default False
        Return the pandas DataFrame instead of a PyRanges.
    skiprows : int, default 0
        Forwarded to ``pandas.read_csv``.
    nrows : int, default None
        Number of rows to read; None reads all.

    Returns
    -------
    PyRanges or pandas.DataFrame
    """
    dtypes = {
        "Chromosome": "category",
        "Feature": "category",
        "Strand": "category"
    }

    df_iter = pd.read_csv(
        f,
        sep="\t",
        comment="#",
        usecols=[0, 2, 3, 4, 5, 6, 8],
        header=None,
        names="Chromosome Feature Start End Score Strand Attribute".split(),
        dtype=dtypes,
        chunksize=int(1e5),
        skiprows=skiprows,
        nrows=nrows)

    dfs = []
    for df in df_iter:
        cols_to_concat = "Chromosome Start End Strand Feature".split()
        # Keep Score only when the chunk has something other than ".".
        if not (df.Score == ".").all():
            cols_to_concat.append("Score")

        extract = _fetch_gene_transcript_exon_id(df.Attribute, annotation)
        extract.columns = "gene_id transcript_id exon_number exon_id".split()

        extract.exon_number = extract.exon_number.astype(float)

        dfs.append(pd.concat([df[cols_to_concat], extract], axis=1, sort=False))

    df = pd.concat(dfs, sort=False)

    # GTF starts are 1-based. Shift exactly once: the previous code also
    # decremented inside the chunk loop, moving every Start down by two.
    df.loc[:, "Start"] = df.Start - 1

    if output_df:
        return df
    return PyRanges(df)
Exemplo n.º 16
0
def test_instantiation_without_strand(chip_10_plus_one):
    """Smoke test: build a PyRanges from seqnames, starts and ends, with no strands."""
    source = chip_10_plus_one

    # Mixed inputs on purpose: `.values` for seqnames and starts, while ends
    # is passed as the attribute itself.
    pr = PyRanges(
        seqnames=source.Chromosome.values,
        starts=source.Start.values,
        ends=source.End,
    )
Exemplo n.º 17
0
def expected_result_same_strand():
    """Return the expected PyRanges: one chr1 row, 2-4 paired with 1-9, both on "+"."""
    table = """Chromosome Start End Name Score Strand Start_b End_b Name_b Score_b Strand_b
chr1 2 4 U0 0 + 1 9 U0 0 +"""

    # Raw string: "\s" is not a valid escape sequence in a plain literal.
    frame = pd.read_table(StringIO(table), header=0, sep=r"\s+")

    return PyRanges(frame)
Exemplo n.º 18
0
def load_dataset(basename):
    """Load ``example_data/<basename>.bed`` from the pyranges package as a PyRanges.

    The file is read without a header row and its columns are named
    Chromosome, Start, End, Name, Score, Strand.
    """
    relative_path = "example_data/{}.bed".format(basename)
    bed_path = pkg_resources.resource_filename("pyranges", relative_path)

    column_names = "Chromosome Start End Name Score Strand".split()
    frame = pd.read_table(bed_path, header=None, names=column_names)

    return PyRanges(frame)
Exemplo n.º 19
0
def expected_result_regular_overlap_intersection():
    """Return the expected PyRanges for the regular overlap intersection.

    Three rows. Besides Start/End, each carries the coordinates, name, score
    and strand of both inputs in the ``*_a`` and ``*_b`` columns.
    """
    table = """Chromosome Start End Start_a End_a Name_a Score_a Strand Start_b End_b Name_b Score_b Strand_b
chr1 226987603 226987617 226987592 226987617 U0 0 + 226987603 226987628 U0 0 -
chr8 38747236 38747251 38747226 38747251 U0 0 - 38747236 38747261 U0 0 +
chr15 26105515 26105518 26105515 26105540 U0 0 + 26105493 26105518 U0 0 +"""

    # Raw string: "\s" is not a valid escape sequence in a plain literal.
    frame = pd.read_table(StringIO(table), sep=r"\s+")
    return PyRanges(frame)
Exemplo n.º 20
0
def simple_gr1():
    """Return a PyRanges with three chr1 intervals: 3-6 (+), 5-7 (-) and 8-9 (+).

    Column order in the table is Chromosome, Start, End, Strand, Score.
    """
    table = """Chromosome Start End Strand Score
chr1 3 6 + 5
chr1 5 7 - 7
chr1 8 9 + 1"""

    # Raw string: "\s" is not a valid escape sequence in a plain literal.
    frame = pd.read_table(StringIO(table), sep=r"\s+", header=0)
    return PyRanges(frame)
Exemplo n.º 21
0
def simple_gr1():
    """Return a PyRanges with three chr1 intervals: 3-6 (+), 5-7 (-) and 8-9 (+).

    Column order in the table is Chromosome, Start, End, Score, Strand.
    """
    table = """Chromosome Start End Score Strand
chr1 3 6 5 +
chr1 5 7 7 -
chr1 8 9 1 +"""

    # Raw string: "\s" is not a valid escape sequence in a plain literal.
    frame = pd.read_table(StringIO(table), sep=r"\s+", header=0)
    return PyRanges(frame)
Exemplo n.º 22
0
def read_gff3(f, annotation=None, as_df=False, nrows=None, skiprows=0):
    """Read files in the General Feature Format.

    The file is parsed in chunks of 100 000 rows. For each chunk the
    Attribute column is cast to ``str``, expanded by ``to_rows_gff3`` and
    replaced by the columns that helper returns. After the chunks are
    concatenated, Start is shifted down by one.

    Parameters
    ----------
    f : str

        Path to GFF file.

    annotation : default None

        Accepted but not used in this function body.

    as_df : bool, default False

        Whether to return as pandas DataFrame instead of PyRanges.

    nrows : int, default None

        Number of rows to read. Default None, i.e. all.

    skiprows : int, default 0

        Forwarded to ``pandas.read_csv``.

    See Also
    --------

    pyranges.read_gtf : read files in the Gene Transfer Format
    """

    column_names = [
        "Chromosome", "Source", "Feature", "Start", "End", "Score", "Strand",
        "Frame", "Attribute"
    ]
    category_columns = {
        "Chromosome": "category",
        "Feature": "category",
        "Strand": "category"
    }

    reader = pd.read_csv(f,
                         comment="#",
                         sep="\t",
                         header=None,
                         names=column_names,
                         dtype=category_columns,
                         chunksize=100000,
                         skiprows=skiprows,
                         nrows=nrows)

    chunks = []
    for chunk in reader:
        attributes = to_rows_gff3(chunk.Attribute.astype(str))
        core = chunk.drop("Attribute", axis=1)
        chunks.append(pd.concat([core, attributes], axis=1, sort=False))

    df = pd.concat(chunks, sort=False)

    df.loc[:, "Start"] = df.Start - 1

    if as_df:
        return df
    return PyRanges(df)
Exemplo n.º 23
0
def test_instantiation_with_str_for_strands_seqnames(chip_10_plus_one):
    """Smoke test: build a PyRanges with single strings for seqnames and strands."""
    source = chip_10_plus_one

    # Mixed inputs on purpose: plain strings for seqnames and strands,
    # `.values` for starts, and the attribute itself for ends.
    pr = PyRanges(
        seqnames="chr1",
        starts=source.Start.values,
        ends=source.End,
        strands="+",
    )
Exemplo n.º 24
0
def read_bam(f, output_df=False, mapq=0, required_flag=0, filter_flag=1540):
    """Read a BAM file with ``bamread.read_bam``.

    Parameters
    ----------
    f : str
        Path to the BAM file, forwarded to ``bamread.read_bam``.
    output_df : bool, default False
        Return whatever ``bamread.read_bam`` produced instead of wrapping it
        in a PyRanges.
    mapq, required_flag, filter_flag : int
        Forwarded positionally, in this order, to ``bamread.read_bam``.

    Returns
    -------
    PyRanges, or the raw ``bamread`` result when ``output_df`` is True.
    """
    df = bamread.read_bam(f, mapq, required_flag, filter_flag)

    # The previous trailing `return bamread.read_bam(...)` was unreachable:
    # both arms of the if/else already returned.
    if output_df:
        return df
    return PyRanges(df)
Exemplo n.º 25
0
def background():
    """Return the background PyRanges: three intervals on chr1, chr8 and chr15."""
    table = """Chromosome Start End Name Score Strand
chr1	226987603	226987628	U0	0	-
chr8	38747236	38747261	U0	0	+
chr15	26105493	26105518	U0	0	+"""

    # Header is space separated and the rows tab separated, so split on any
    # whitespace run. Raw string: "\s" is not a valid escape sequence.
    # The leftover debug print of the frame was dropped.
    frame = pd.read_table(StringIO(table), sep=r"\s+", header=0)
    return PyRanges(frame)
Exemplo n.º 26
0
def expected_result_self_unstranded(names):
    """Return the expected PyRanges: five headerless chr1 rows, all ending at 9988.

    Parameters
    ----------
    names : sequence of str
        Column names. The last entry is dropped (``names[:-1]``) because the
        rows have five fields.
    """
    table = """chr1    9916    9988    HWI-ST216_313:3:1203:10227:6568 1
chr1    9939    9988    HWI-ST216_313:3:2301:15791:16298        1
chr1    9951    9988    HWI-ST216_313:3:2205:20086:33508        1
chr1    9953    9988    HWI-ST216_313:3:1305:6975:102491        1
chr1    9978    9988    HWI-ST216_313:3:1204:5599:113305        1"""

    # Raw string: "\s" is not a valid escape sequence in a plain literal.
    frame = pd.read_table(StringIO(table), sep=r"\s+", names=names[:-1], header=None)
    return PyRanges(frame)
Exemplo n.º 27
0
def chip():
    """Return the chip PyRanges: three intervals on chr1, chr8 and chr15."""
    table = """Chromosome Start End Name Score Strand
chr1	226987592	226987617	U0	0	+
chr8	38747226	38747251	U0	0	-
chr15	26105515	26105540	U0	0	+"""

    # Header is space separated and the rows tab separated, so split on any
    # whitespace run. Raw string: "\s" is not a valid escape sequence.
    frame = pd.read_table(StringIO(table), sep=r"\s+", header=0)
    return PyRanges(frame)
Exemplo n.º 28
0
def expected_result_no_strand_plus_one(names):
    """Return the expected PyRanges: six headerless chr1 rows, six fields each.

    Parameters
    ----------
    names : sequence of str
        Column names applied to the fields, in order.
    """
    table = """chr1	9916	9988	HWI-ST216_313:3:1203:10227:6568	1	-
chr1	9939	9988	HWI-ST216_313:3:2301:15791:16298	1	+
chr1	9951	9988	HWI-ST216_313:3:2205:20086:33508	1	-
chr1	9953	9988	HWI-ST216_313:3:1305:6975:102491	1	+
chr1	9978	9988	HWI-ST216_313:3:1204:5599:113305	1	-
chr1	110246	110445	HWI-ST216_313:3:1207:4315:142177	1	+"""

    # Raw string: "\s" is not a valid escape sequence in a plain literal.
    frame = pd.read_table(StringIO(table), sep=r"\s+", names=names, header=None)
    return PyRanges(frame)
def expected_result_counterexample13():
    """Return the expected PyRanges for counterexample 13.

    Two chr1 rows, read without a header line and given the columns
    Chromosome, Start, End, Strand, Start_b, End_b, Strand_b, Distance.
    """
    table = """chr1 3538885 3832293 + 0 1 + 3538885
chr1 4426346 9655531 + 0 1 + 4426346"""

    column_names = "Chromosome Start End Strand Start_b End_b Strand_b Distance".split()
    # Raw string: "\s" is not a valid escape sequence in a plain literal.
    frame = pd.read_table(StringIO(table), sep=r"\s+", header=None, names=column_names)
    return PyRanges(frame)
def expected_result_minus():
    """Return the expected PyRanges for the minus-strand case: one chr1 row.

    The row has one more field than there are column names: it starts with
    a row label, 0.
    """
    table = """0 chr1 10241 10440 HWI-ST216_313:3:1302:4516:156396 1 - 9988 10187 HWI-ST216:427:D29R1ACXX:2:1205:6095:16532 1 - 55"""

    column_names = "Chromosome Start End Name Score Strand Start_b End_b Name_b Score_b Strand_b Distance".split()
    # Raw string: "\s" is not a valid escape sequence in a plain literal.
    frame = pd.read_table(StringIO(table), sep=r"\s+", header=None, names=column_names)
    return PyRanges(frame)