Exemplo n.º 1
0
def test_parse_file_gff(tmp_path):
    """Check GFF parsing both without and with a sibling FASTA file."""
    gff_path = tmp_path / "test.gff"
    fasta_path = tmp_path / "test.fasta"
    gff_path.write_text("content")

    # Only the GFF exists at this point, so parse_file is expected to raise.
    with pytest.raises(FileNotFoundError):
        gp.parse_file(gff_path)

    # Once test.fasta sits next to the GFF, the same call must succeed and
    # report the file stem as the name with no records.
    fasta_path.write_text("content")
    parsed = gp.parse_file(gff_path)
    assert parsed == {"name": "test", "records": []}
Exemplo n.º 2
0
def get_sequences(query_file=None, query_ids=None, query_profiles=None):
    """Build a dictionary of query sequences from a file or from accessions.

    The sources are consulted in a fixed order: ``query_file`` is used only
    when no ``query_ids`` were supplied, then ``query_ids``, and finally
    ``query_profiles``.

    Parameters:
        query_file (str): Path to a FASTA, GenBank or EMBL file containing
            the query protein sequences.
        query_ids (list): NCBI sequence accessions.
        query_profiles (list): Pfam profile accessions.
    Raises:
        ValueError: None of query_file, query_ids or query_profiles was given.
    Returns:
        sequences (dict): Query sequences keyed on accession, or None when
            only query_profiles was supplied.
    """
    if query_file and not query_ids:
        parsed = gp.parse_file(query_file)
        extension = Path(query_file).suffix.lower()
        collected = OrderedDict()
        if extension in gp.FASTA_SUFFIXES:
            # FASTA input: every record contributes its id and its sequence
            # rendered as a plain string.
            for record in parsed["records"]:
                collected[record.id] = str(record.seq)
        else:
            # Any other format goes through gp.organisms_to_tuples; fields 0
            # and 4 of each tuple provide the key and the sequence.
            for row in gp.organisms_to_tuples([parsed]):
                collected[row[0]] = row[4]
        return collected

    if query_ids:
        return efetch_sequences(query_ids)

    if query_profiles:
        # Profile queries carry no sequences to return.
        return None

    raise ValueError(
        "Expected 'query_file' or 'query_ids', or 'query_profiles'")
Exemplo n.º 3
0
def test_parse_file_valid(tmp_path):
    """Every supported file extension is accepted by parse_file."""
    expected = {"name": "test", "records": []}
    for filename in ("test.gbk", "test.fasta", "test.embl"):
        candidate = tmp_path / filename
        # The file body is a placeholder; the test only pins the result
        # returned for each extension.
        candidate.write_text("content")
        assert gp.parse_file(candidate) == expected
Exemplo n.º 4
0
def parse_query_sequences(query_file=None, query_ids=None, query_profiles=None):
    """Build a Cluster object describing the query sequences.

    For EMBL/GenBank input the Cluster uses the exact genomic coordinates
    parsed from the file. For every other source a fake Cluster is created
    whose genes are drawn to scale, always on the positive strand and with a
    fixed intergenic distance.

    Sources are checked in order: query_file, then query_ids, then
    query_profiles; the first one supplied is used.

    Raises:
        ValueError: None of query_file, query_ids or query_profiles was given.
    """
    if query_file:
        parsed = gp.parse_file(query_file)
        is_fasta = Path(query_file).suffix.lower() in gp.FASTA_SUFFIXES
        if is_fasta:
            return fasta_seqrecords_to_cluster(parsed["records"])
        # Non-FASTA files: only the first parsed record is converted.
        return seqrecord_to_cluster(parsed["records"][0])

    if query_ids:
        return dict_to_cluster(efetch_sequences(query_ids))

    if query_profiles:
        # Profiles have no sequence, so each accession maps to None.
        return dict_to_cluster(dict.fromkeys(query_profiles))

    raise ValueError("Expected 'query_file', 'query_ids' or 'query_profiles'")
Exemplo n.º 5
0
def test_parse_file_invalid_ext(tmp_path):
    """parse_file raises ValueError for a file with an invalid extension."""
    bad_path = tmp_path / "test.invalid"
    # The file must exist on disk so that the extension, not a missing
    # file, is what the call is exercised against.
    bad_path.write_text("content")

    with pytest.raises(ValueError):
        gp.parse_file(bad_path)