예제 #1
0
def test_parse_gtf_lines_without_expand_attributes():
    """Parse the sample lines while leaving the attribute column unexpanded.

    Checks that the keys of the returned mapping equal REQUIRED_COLUMNS and
    spot-checks the seqname, start, end, score and attribute columns.
    """
    result = parse_gtf_lines(gtf_lines, expand_attribute_column=False)

    # on Python 3 a keys() view is not a list, so materialize it first
    column_names = list(result.keys())
    eq_(column_names, REQUIRED_COLUMNS)

    eq_(result["seqname"], ["1", "1"])

    # numeric columns may be NumPy arrays; compare them as plain lists
    expected_coordinates = (
        ("start", [11869, 11869]),
        ("end", [14409, 14409]),
    )
    for column, expected_values in expected_coordinates:
        eq_(list(result[column]), expected_values)

    # NaN never compares equal to anything, so use isnan instead of eq_
    scores = list(result["score"])
    assert np.isnan(scores).all(), "Unexpected scores: %s" % scores

    attribute_values = result["attribute"]
    assert len(attribute_values) == 2
예제 #2
0
def test_parse_gtf_lines_with_expand_attributes():
    """Parse the sample lines with the attribute column expanded.

    Checks that the keys of the returned mapping are the first eight
    required column names followed by the individual attribute names, then
    spot-checks several column values.
    """
    result = parse_gtf_lines(gtf_lines, expand_attribute_column=True)

    # names expected to appear as separate columns after expansion
    attribute_columns = [
        "gene_id",
        "gene_name",
        "gene_source",
        "gene_biotype",
        "transcript_id",
        "transcript_name",
        "transcript_source",
    ]
    # keep only the first eight required names, which excludes 'attribute'
    expected_columns = REQUIRED_COLUMNS[:8] + attribute_columns

    # on Python 3 a keys() view is not a list, so materialize it first
    column_names = list(result.keys())
    eq_(column_names, expected_columns)

    eq_(result["seqname"], ["1", "1"])

    # numeric columns may be NumPy arrays; compare them as plain lists
    expected_coordinates = (
        ("start", [11869, 11869]),
        ("end", [14409, 14409]),
    )
    for column, expected_values in expected_coordinates:
        eq_(list(result[column]), expected_values)

    # NaN never compares equal to anything, so use isnan instead of eq_
    scores = list(result["score"])
    assert np.isnan(scores).all(), "Unexpected scores: %s" % scores

    # the first line is expected to have an empty transcript_id
    eq_(result["gene_id"], ["ENSG00000223972", "ENSG00000223972"])
    eq_(result["transcript_id"], ["", "ENST00000456328"])
from gtfparse import create_missing_features, parse_gtf_lines
import pandas

# Fixture data: two records from the Ensembl 54 human GTF, one stop_codon
# feature and one exon feature. Neither a gene nor a transcript feature is
# listed explicitly, but both records carry gene_id/transcript_id attributes
# from which gene and transcript information could be inferred.
#
# The literal begins and ends with a newline and its second line is a
# "#"-prefixed header, so it holds more than just the two data records.
GTF_DATA = """
# seqname biotype feature start end score strand frame attribute
18\tprotein_coding\tstop_codon\t32630766\t32630768\t.\t-\t0\tgene_id "ENSG00000134779"; transcript_id "ENST00000334295"; exon_number "7"; gene_name "C18orf10"; transcript_name "C18orf10-201";
18\tprotein_coding\texon\t32663078\t32663157\t.\t+\t.\tgene_id "ENSG00000150477"; transcript_id "ENST00000383055"; exon_number "1"; gene_name "KIAA1328"; transcript_name "KIAA1328-202";
"""

# Splitting on "\n" gives five entries: an empty string, the "#" header line,
# the two records, and a trailing empty string.
GTF_LINES = GTF_DATA.split("\n")

# Parsed once at import time and shared by the tests in this module.
# NOTE(review): presumably parse_gtf_lines ignores the blank and "#" entries
# -- confirm against its implementation.
GTF_DICT = parse_gtf_lines(GTF_LINES)
# DataFrame built from the parsed result; passed to create_missing_features.
GTF_DATAFRAME = pandas.DataFrame(GTF_DICT)


def test_create_missing_features_identity():
    """An empty second argument must leave the number of rows unchanged."""
    result = create_missing_features(GTF_DATAFRAME, {})
    original_row_count = len(GTF_DATAFRAME)
    result_row_count = len(result)
    assert original_row_count == result_row_count, (
        "GTF DataFrames should be same size")


def _check_expanded_dataframe(df):
    assert "gene" in set(df["feature"]), \
        "Extended GTF should contain gene feature"
    assert "transcript" in set(df["feature"]), \
        "Extended GTF should contain transcript feature"

    C18orf10_201_transcript_mask = ((df["feature"] == "transcript") &
from gtfparse import create_missing_features, parse_gtf_lines
import pandas

# Fixture data: two records from the Ensembl 54 human GTF, one stop_codon
# feature and one exon feature. Neither a gene nor a transcript feature is
# listed explicitly, but both records carry gene_id/transcript_id attributes
# from which gene and transcript information could be inferred.
#
# The literal begins and ends with a newline and its second line is a
# "#"-prefixed header, so it holds more than just the two data records.
GTF_DATA = """
# seqname biotype feature start end score strand frame attribute
18\tprotein_coding\tstop_codon\t32630766\t32630768\t.\t-\t0\tgene_id "ENSG00000134779"; transcript_id "ENST00000334295"; exon_number "7"; gene_name "C18orf10"; transcript_name "C18orf10-201";
18\tprotein_coding\texon\t32663078\t32663157\t.\t+\t.\tgene_id "ENSG00000150477"; transcript_id "ENST00000383055"; exon_number "1"; gene_name "KIAA1328"; transcript_name "KIAA1328-202";
"""

# Splitting on "\n" gives five entries: an empty string, the "#" header line,
# the two records, and a trailing empty string.
GTF_LINES = GTF_DATA.split("\n")

# Parsed once at import time and shared by the tests in this module.
# NOTE(review): presumably parse_gtf_lines ignores the blank and "#" entries
# -- confirm against its implementation.
GTF_DICT = parse_gtf_lines(GTF_LINES)
# DataFrame built from the parsed result; passed to create_missing_features.
GTF_DATAFRAME = pandas.DataFrame(GTF_DICT)

def test_create_missing_features_identity():
    """An empty second argument must leave the number of rows unchanged."""
    result = create_missing_features(GTF_DATAFRAME, {})
    original_row_count = len(GTF_DATAFRAME)
    result_row_count = len(result)
    assert original_row_count == result_row_count, (
        "GTF DataFrames should be same size")


def _check_expanded_dataframe(df):
    """Assert that *df* lists both "gene" and "transcript" features.

    df must provide "feature" and "transcript_name" columns. Raises
    AssertionError when either feature value is missing from the
    "feature" column.
    """
    assert "gene" in set(df["feature"]), (
        "Extended GTF should contain gene feature")
    assert "transcript" in set(df["feature"]), (
        "Extended GTF should contain transcript feature")

    # Row selector for the transcript feature named "C18orf10-201".
    # NOTE(review): the mask is built but not used in the visible part of
    # this helper -- the remainder may have been cut off; confirm.
    is_transcript_row = df["feature"] == "transcript"
    has_target_name = df["transcript_name"] == "C18orf10-201"
    C18orf10_201_transcript_mask = is_transcript_row & has_target_name
예제 #5
0
def test_parse_gtf_lines_error_too_few_fields():
    """Lines whose tabs were swapped for spaces must raise ParsingError."""
    tab, space = "\t", " "
    # with no tabs left, each line presumably no longer splits into the
    # expected number of fields (hence the test name)
    malformed_lines = [text.replace(tab, space) for text in gtf_lines]
    # pylint: disable=no-value-for-parameter
    with assert_raises(ParsingError):
        parse_gtf_lines(malformed_lines)