Exemplo n.º 1
0
def read_gene2fam(pangenome, gene2fam, disable_bar=False):
    """
        Adds every clustered gene to its gene family in the pangenome.

        :param pangenome: object providing 'status', 'genes', 'addGeneFamily' and 'getGene'.
        :param gene2fam: dict linking a gene ID to a (family name, is_fragment) pair.
        :param disable_bar: given to tqdm to hide the progress bar.
        :raises Exception: if the annotations are available and the number of clustered genes
            differs from the number of genes of the pangenome.
    """
    logging.getLogger().info(
        f"Adding {len(gene2fam)} genes to the gene families")

    # genes can only be linked to existing objects when the annotations are there.
    annotated = pangenome.status["genomesAnnotated"] in ["Computed", "Loaded"]
    if annotated and len(gene2fam) != len(pangenome.genes):
        # a different count hints at genes sharing the same ID
        raise Exception(
            "Something unexpected happened during clustering "
            "(have less genes clustered than genes in the pangenome). "
            "A probable reason is that two genes in two different organisms have the same IDs;"
            " If you are sure that all of your genes have non identical IDs, "
            "please post an issue at https://github.com/labgem/PPanGGOLiN/"
        )

    progress = tqdm(gene2fam.items(), unit="gene", disable=disable_bar)
    for gene_id, (fam_name, fragment) in progress:
        family_obj = pangenome.addGeneFamily(fam_name)
        # reuse the annotated gene if there is one, else create a bare gene.
        gene_obj = pangenome.getGene(gene_id) if annotated else Gene(gene_id)
        gene_obj.is_fragment = fragment
        family_obj.addGene(gene_obj)
    progress.close()
Exemplo n.º 2
0
def launch_prodigal(fnaFile, org, code):
    """
        Launches Prodigal to annotate the CDS of a fasta file.

        :param fnaFile: name of the fasta file, given to Prodigal with '-i'.
        :param org: organism object; its 'name' is used as locus tag to build the gene IDs.
        :param code: genetic code, given to Prodigal with '-g' and stored in each gene.
        :return: a dict linking each sequence header to the set of Gene objects found on it.
    """
    locustag = org.name
    # Popen arguments must be strings: str() lets callers give the genetic code as an int too.
    cmd = ["prodigal", "-f", "sco", "-g", str(code), "-m", "-c", "-i", fnaFile, "-p", "single", "-q"]
    p = Popen(cmd, stdout=PIPE)

    geneObjs = defaultdict(set)
    c = 0  # number of genes seen so far, all sequences included; used to build unique IDs
    for line in p.communicate()[0].decode().split("\n"):
        if line.startswith("# Sequence Data: "):
            # the header of the sequence being described is in the 'seqhdr="..."' field.
            for data in line.split(";"):
                if data.startswith("seqhdr"):
                    # only the first word of the header is kept
                    header = data.split("=")[1].replace('"', "").split()[0]

        elif line.startswith(">"):
            c += 1
            # fields are separated by '_'; fields 1, 2 and 3 are used as start, stop and strand.
            lineData = line[1:].split("_")  # not considering the '>'
            gene = Gene(ID=locustag + "_CDS_" + str(c).zfill(4))
            gene.fill_annotations(start=lineData[1],
                                  stop=lineData[2],
                                  strand=lineData[3],
                                  geneType="CDS",
                                  genetic_code=code)
            geneObjs[header].add(gene)

    return geneObjs
Exemplo n.º 3
0
def get_genes():
    """Yield from 0 to 5 genes with random start and stop.

    The ID of each gene is its rank as a string; the rank is also its position."""
    n_genes = randint(0, 5)
    for rank in range(n_genes):
        gene = Gene(str(rank))
        # start is drawn before stop
        begin, end = randint(0, 100), randint(0, 100)
        gene.fill_annotations(begin, end, 'x', position=rank)
        yield gene
Exemplo n.º 4
0
def l_genes():
    """Return a list of 6 genes with IDs going from 6 down to 1.

    Each gene uses its ID as start, stop and strand, and ID - 1 as position."""
    genes = []
    for num in reversed(range(1, 7)):
        gene = Gene(num)
        gene.fill_annotations(num, num, num, position=num - 1)
        genes.append(gene)
    return genes
Exemplo n.º 5
0
def create_gene(org, contig, ID, dbxref, start, stop, strand, gene_type, position = None, gene_name = "", product = "", genetic_code = 11):
    """
        Creates a Gene (for a 'CDS') or a RNA (for any other type), adds it to the contig
        and links it to its organism and contig.

        If an element of dbxref contains 'MaGe', its second ':'-separated field is used as ID,
        and the given ID is used as gene name when no name was provided.
    """
    mage_ref = next((ref for ref in dbxref if 'MaGe' in ref), None)
    if mage_ref is not None:
        if gene_name == "":
            gene_name = ID
        ID = mage_ref.split(':')[1]

    # annotations that Gene and RNA objects have in common
    shared = dict(start=start,
                  stop=stop,
                  strand=strand,
                  geneType=gene_type,
                  name=gene_name,
                  product=product)
    if gene_type != "CDS":
        # anything that is not a CDS is stored as a RNA
        feature = RNA(ID)
        feature.fill_annotations(**shared)
        contig.addRNA(feature)
    else:
        feature = Gene(ID)
        feature.fill_annotations(position=position,
                                 genetic_code=genetic_code,
                                 **shared)
        contig.addGene(feature)
    feature.fill_parents(org, contig)
Exemplo n.º 6
0
def create_gene(org, contig, geneCounter, rnaCounter, ID, dbxref, start, stop, strand, gene_type, position = None, gene_name = "", product = "", genetic_code = 11, protein_id = ""):
    """
        Creates a Gene (for a 'CDS') or a RNA (for any other type), adds it to the contig
        and links it to its organism and contig.

        The object ID is built from the organism name and geneCounter (or rnaCounter for a RNA).
        The given ID is only stored as local identifier of a CDS. If an element of dbxref
        contains 'MaGe', its second ':'-separated field replaces the given ID, which is then
        used as gene name when no name was provided.
    """
    mage_ref = next((ref for ref in dbxref if 'MaGe' in ref), None)
    if mage_ref is not None:
        if gene_name == "":
            gene_name = ID
        ID = mage_ref.split(':')[1]

    # annotations that Gene and RNA objects have in common
    shared = dict(start=start,
                  stop=stop,
                  strand=strand,
                  geneType=gene_type,
                  name=gene_name,
                  product=product)
    if gene_type != "CDS":
        # anything that is not a CDS is stored as a RNA
        feature = RNA(org.name + "_RNA_" + str(rnaCounter).zfill(4))
        feature.fill_annotations(**shared)
        contig.addRNA(feature)
    else:
        if ID == "":
            # no identifier was given (e.g. a .gbk file without 'locus_tag'):
            # fall back on the protein_id, even if it is not guaranteed to be unique.
            ID = protein_id
        feature = Gene(org.name + "_CDS_" + str(geneCounter).zfill(4))
        feature.fill_annotations(position=position,
                                 genetic_code=genetic_code,
                                 local_identifier=ID,
                                 **shared)
        contig.addGene(feature)
    feature.fill_parents(org, contig)
Exemplo n.º 7
0
def test_mkBitArray_with_org(o_family):
    """With a single gene in the family, the bitarray must only have
    the bit at the index of the gene's organism set."""
    org_key = "organism"
    gene = Gene(33)
    gene.fill_parents(org_key, None)
    o_family.addGene(gene)

    for rank in (1, 3, 7, 12):
        o_family.mkBitarray({org_key: rank})
        assert o_family.bitarray == 1 << rank
Exemplo n.º 8
0
def t_filled_org(o_org):
    """Add one contig per character of "azerty'" to o_org, each with 0 to 5 genes.

    Return the organism and the total number of genes that were added."""
    total = 0
    for ctg_name in "azerty'":
        contig = o_org.getOrAddContig(ctg_name)
        n_genes = randint(0, 5)
        for pos in range(n_genes):
            gene = Gene("{}-{}".format(ctg_name, pos))
            gene.fill_annotations(6, 1, ctg_name, position=pos)
            contig.addGene(gene)
        total += n_genes

    return o_org, total
Exemplo n.º 9
0
    def _make_gene_pair(org, gene_id1, gene_id2):
        """create 2 genes of the same organism, each one added to its own new family.
        The genes are returned as a tuple."""
        def _gene_with_family(gene_id):
            gene = Gene(gene_id)
            gene.fill_parents(org, None)
            # the family uses the gene ID for both of its arguments
            GeneFamily(gene_id, gene_id).addGene(gene)
            return gene

        return tuple(_gene_with_family(gid) for gid in (gene_id1, gene_id2))
Exemplo n.º 10
0
    def _make_gene_pair(org, gene_id1, gene_id2):
        """build a tuple of 2 genes whose organism is org.
            a new family, built from the gene ID, is created for each gene."""
        ids = (gene_id1, gene_id2)
        genes = tuple(map(Gene, ids))
        for gene_id, gene in zip(ids, genes):
            gene.fill_parents(org, None)
            GeneFamily(gene_id, gene_id).addGene(gene)

        return genes
Exemplo n.º 11
0
 def _make_org_with_genes(org):
     """build an Organism named org with 2 to 10 contigs, called 'k_<index>',
     each holding 2 to 10 genes. Return the organism and the list of all its genes."""
     organism = Organism(org)
     genes = []
     for ctg_idx in range(randint(2, 10)):
         contig = organism.getOrAddContig(f"k_{ctg_idx}")
         n_genes = randint(2, 10)
         for pos in range(n_genes):
             gene = Gene(f"{org}.{contig.name}.{pos}")
             # the rank of the gene in its contig is both its position and its start
             gene.position = gene.start = pos
             contig.addGene(gene)
             genes.append(gene)
     return organism, genes
Exemplo n.º 12
0
def l_genes(o_org, o_contig):
    """ creates a small gene set for testing.

        returns a list of 8 genes that belong to the same contig and
        the same organism. Each gene is 30 long, starts 35 after the
        previous one, and has its own family with the "c-cloud" partition."""
    gene_ids = ("toto", "tata", "titi", "tutu",
                "lolo", "lala", "lili", "lulu")
    genes = []
    for rank, gene_id in enumerate(gene_ids):
        begin = 10 + 35 * rank
        gene = Gene(gene_id)
        gene.fill_annotations(begin, begin + 30, "+", position=rank)
        gene.fill_parents(o_org, o_contig)
        o_contig.addGene(gene)
        gene.family = GeneFamily(rank, gene_id)
        gene.family.addPartition("c-cloud")
        genes.append(gene)
    return genes
Exemplo n.º 13
0
 def _fill_fam_with_genes(o_fam):
     """add 8 genes, named '<family name>_2' to '<family name>_9', to a geneFamily object.
     The genes are returned as a list."""
     genes = [Gene("{}_{}".format(o_fam.name, num)) for num in range(2, 10)]
     for gene in genes:
         o_fam.addGene(gene)
     return genes
Exemplo n.º 14
0
def test_cstr():
    """ A Gene must also be a Feature (so Feature tests are relied on),
    and starts with 'position' and 'family' set to None."""
    gene = Gene(4)
    for cls in (Feature, Gene):
        assert isinstance(gene, cls)

    for attr in ("position", "family"):
        assert hasattr(gene, attr)
        assert getattr(gene, attr) is None
Exemplo n.º 15
0
def test_addGene(o_ctg, l_genes):
    """addGene must refuse what is not a positioned gene, and the genes
    of the contig must be ordered by position."""
    def by_position(gene):
        return gene.position

    # something that is not a gene is refused.
    with pytest.raises(TypeError):
        o_ctg.addGene(33)

    # and so is a gene that has no position yet.
    with pytest.raises(TypeError):
        o_ctg.addGene(Gene(33))

    for gene in l_genes:
        o_ctg.addGene(gene)

    expected = sorted(l_genes, key=by_position)
    assert o_ctg.genes == expected
Exemplo n.º 16
0
def filled_families():
    """
    Build from 3 to 10 families, each one holding a single gene.
    A family and its gene are identified by the same integer.
    Return the list of families and the list of genes, in the same order.
    """
    families, genes = [], []
    for ident in range(randint(3, 10)):
        gene = Gene(ident)
        gene.fill_parents(None, None)

        family = GeneFamily(ident, ident)
        family.addGene(gene)

        genes.append(gene)
        families.append(family)

    return families, genes
Exemplo n.º 17
0
def l_glist(o_org, o_contig):
    """ creates a small testing context, with 4 genes that are all on the same contig in the same organism.

        Each gene is 30 long, starts 35 after the previous one, and its family is set to its own ID."""
    genes = []
    for rank, gene_id in enumerate(("toto", "tata", "titi", "tutu")):
        begin = 10 + 35 * rank
        gene = Gene(gene_id)
        gene.fill_annotations(begin, begin + 30, "+", position=rank)
        gene.fill_parents(o_org, o_contig)
        o_contig.addGene(gene)
        gene.family = gene_id
        genes.append(gene)
    return genes
Exemplo n.º 18
0
def readGeneFamilies(pangenome, h5f, disable_bar=False):
    """
        Reads the 'geneFamilies' table of h5f and adds each gene to its family in the pangenome.
        The 'genesClustered' status of the pangenome is set to 'Loaded' at the end.

        :param pangenome: object providing 'status', 'addGeneFamily' and 'getGene'.
        :param h5f: opened file whose 'root.geneFamilies' table has 'geneFam' and 'gene' columns.
        :param disable_bar: given to tqdm to hide the progress bar.
    """
    table = h5f.root.geneFamilies

    # genes can only be linked to existing objects when the annotations are there.
    annotated = pangenome.status["genomesAnnotated"] in ["Computed", "Loaded"]

    progress = tqdm(range(table.nrows), unit="gene", disable=disable_bar)
    for row in read_chunks(table):
        family = pangenome.addGeneFamily(row["geneFam"].decode())
        gene_id = row["gene"].decode()
        # reuse the annotated gene if there is one, else create a bare gene.
        gene = pangenome.getGene(gene_id) if annotated else Gene(gene_id)
        family.addGene(gene)
        progress.update()
    progress.close()
    pangenome.status["genesClustered"] = "Loaded"
Exemplo n.º 19
0
def test_cstr():
    """An Edge built from 2 genes of the same family and organism links
    the family to itself, and stores the gene pair under their organism."""
    source, target = Gene('source'), Gene('target')

    # both genes get None as organism and contig,
    # and are added to the same (None) family.
    family = GeneFamily(None, None)
    for gene in (source, target):
        gene.fill_parents(None, None)
        family.addGene(gene)

    edge = Edge(source, target)
    assert isinstance(edge, Edge)

    assert edge.source == source.family
    assert edge.target == target.family
    assert dict(edge.organisms) == {None: [(source, target)]}
Exemplo n.º 20
0
def test_cstr_error():
    """Building an Edge must fail while a gene has no family, or when
    the 2 genes do not have the same organism."""
    source = Gene('source')
    target = Gene('target')

    def assert_edge_fails():
        with pytest.raises(Exception):
            Edge(source, target)

    # none of the genes has a family
    assert_edge_fails()

    # only the source gene has a family
    family = GeneFamily(None, None)
    family.addGene(source)
    assert_edge_fails()

    # both genes have a family, but their organisms differ
    family.addGene(target)
    source.fill_parents("", None)
    target.fill_parents(None, None)
    assert_edge_fails()
Exemplo n.º 21
0
def readOrganism(pangenome, orgName, contigDict, circularContigs, link=False):
    """
        Builds an organism from its stored gene rows and adds it to the pangenome.

        :param pangenome: object providing 'getGene' and 'addOrganism'.
        :param orgName: name of the organism to create.
        :param contigDict: dict linking a contig name to the list of rows of its genes.
        :param circularContigs: dict linking a contig name to its circularity.
        :param link: if True, the genes are fetched from the pangenome instead of being created.
        :raises Exception: if the type of a row is neither 'CDS' nor a type containing 'RNA'.
    """
    org = Organism(orgName)
    for contigName, geneList in contigDict.items():
        contig = org.getOrAddContig(contigName,
                                    is_circular=circularContigs[contigName])
        for row in geneList:
            # the type is needed to dispatch the gene at the end of the loop, so it is read
            # for every row and not only when the gene has to be created.
            gene_type = row["type"].decode()
            is_cds = gene_type == "CDS"
            if not is_cds and "RNA" not in gene_type:
                # checked before touching any object, so that nothing is modified by a bad row.
                raise Exception(
                    f"A strange type '{gene_type}', which we do not know what to do with, was met."
                )
            gene_id = row["ID"].decode()
            if link:  # if the gene families are already computed/loaded the gene exists.
                gene = pangenome.getGene(gene_id)
            elif is_cds:  # else creating the gene.
                gene = Gene(gene_id)
            else:
                gene = RNA(gene_id)
            try:
                local = row["local"].decode()
            except ValueError:
                # presumably raised for tables that have no 'local' column - TODO confirm
                local = ""
            gene.fill_annotations(start=row["start"],
                                  stop=row["stop"],
                                  strand=row["strand"].decode(),
                                  geneType=gene_type,
                                  position=row["position"],
                                  genetic_code=row["genetic_code"],
                                  name=row["name"].decode(),
                                  product=row["product"].decode(),
                                  local_identifier=local)
            gene.is_fragment = row["is_fragment"]
            gene.fill_parents(org, contig)
            if is_cds:
                contig.addGene(gene)
            else:
                contig.addRNA(gene)
    pangenome.addOrganism(org)
Exemplo n.º 22
0
def read_org_gff(organism,
                 gff_file_path,
                 circular_contigs,
                 getSeq,
                 pseudo=False):
    """
        Reads a gff file and builds an Organism with its contigs, genes and RNAs.

        :param organism: name given to the created Organism.
        :param gff_file_path: path of the gff file, opened with read_compressed_or_not.
        :param circular_contigs: names of the contigs to flag as circular.
        :param getSeq: if True, the '##FASTA' section is read to fill the dna of genes and RNAs.
            If False, the reading stops when this section is reached.
        :param pseudo: if True, the CDS flagged as pseudogenes are kept too.
        :return: the Organism, and a boolean telling whether a '##FASTA' section was read.
    """
    (GFF_seqname, _,
     GFF_type, GFF_start, GFF_end, _, GFF_strand, _, GFF_attribute) = range(
         0, 9)  # missing values : source, score, frame. They are unused.

    def getGffAttributes(gff_fields):
        """
            Parses the gff attribute's line and outputs the attributes in a dict structure.
            Keys are stored in upper case.
            :param gff_fields: a gff line stored as a list. Each element of the list is a column of the gff.
            :type list:
            :return: attributes:
            :rtype: dict
        """
        attributes = {}
        for att in gff_fields[GFF_attribute].strip().split(';'):
            try:
                # splitting on the first '=' only, so that values containing a '=' are kept.
                (key, value) = att.strip().split('=', 1)
            except ValueError:
                continue  # no '=': we assume that it is a strange, but useless field for our analysis
            attributes[key.upper()] = value
        return attributes

    def getIDAttribute(attributes):
        """
            Gets the ID of the element from which the provided attributes were extracted.
            Logs an error and exits if no ID is found.
            :param attribute:
            :type dict:
            :return: ElementID:
            :rtype: string
        """
        ElementID = attributes.get("ID")
        if not ElementID:
            logging.getLogger().error(
                "Each CDS type of the gff files must own a unique ID attribute. Not the case for file: "
                + gff_file_path)
            exit(1)
        return ElementID

    hasFasta = False
    fastaLines = []  # lines of the '##FASTA' section, joined once at the end
    org = Organism(organism)
    contig = None  # no contig is known before the first 'sequence-region' or feature line
    geneCounter = 0
    rnaCounter = 0
    with read_compressed_or_not(gff_file_path) as gff_file:
        for line in gff_file:
            if hasFasta:
                # everything that follows '##FASTA' is sequence data
                fastaLines.append(line)
                continue
            elif line.startswith('##', 0, 2):
                if line.startswith('FASTA', 2, 7):
                    if not getSeq:  # if getting the sequences is useless...
                        break
                    hasFasta = True
                elif line.startswith('sequence-region', 2, 17):
                    fields = [el.strip() for el in line.split()]
                    contig = org.getOrAddContig(fields[1],
                                                fields[1] in circular_contigs)
                continue
            elif line.startswith('#') or not line.strip():
                # other comments (including the special '#!' refseq lines for versionning softs,
                # assemblies and annotations) and blank lines hold no feature.
                continue
            gff_fields = [el.strip() for el in line.split('\t')]
            attributes = getGffAttributes(gff_fields)
            seqname = gff_fields[GFF_seqname]
            feature_type = gff_fields[GFF_type]
            pseudogene = False
            if feature_type == 'region':
                if seqname in circular_contigs:
                    # flag the contig of this line, which may not be the current one.
                    if contig is None or contig.name != seqname:
                        contig = org.getOrAddContig(seqname)
                    contig.is_circular = True
            elif feature_type == 'CDS' or "RNA" in feature_type:
                # if there is a 'PROTEIN_ID' attribute, it's where the ncbi stores the actual gene ids, so we use that.
                geneID = attributes.get("PROTEIN_ID")
                if geneID is None:  # if its not found, we get the one under the 'ID' field which must exist (otherwise not a gff3 compliant file)
                    geneID = getIDAttribute(attributes)
                if 'NAME' in attributes:
                    name = attributes.pop('NAME')
                else:
                    name = attributes.pop('GENE', "")
                # attribute keys are upper-cased by getGffAttributes, so the upper case form is looked for.
                if "PSEUDO" in attributes or "PSEUDOGENE" in attributes:
                    pseudogene = True
                product = attributes.pop('PRODUCT', "")
                genetic_code = attributes.pop("TRANSL_TABLE", "11")
                if contig is None or contig.name != seqname:
                    contig = org.getOrAddContig(seqname)  # get the current contig
                if feature_type == "CDS" and (not pseudogene or pseudo):
                    gene = Gene(org.name + "_CDS_" + str(geneCounter).zfill(4))

                    # here contig is filled in order, so position is the number of genes already stored in the contig.
                    gene.fill_annotations(start=int(gff_fields[GFF_start]),
                                          stop=int(gff_fields[GFF_end]),
                                          strand=gff_fields[GFF_strand],
                                          geneType=feature_type,
                                          position=len(contig.genes),
                                          name=name,
                                          product=product,
                                          genetic_code=genetic_code,
                                          local_identifier=geneID)
                    gene.fill_parents(org, contig)
                    contig.addGene(gene)
                    geneCounter += 1
                elif "RNA" in feature_type:
                    # '_RNA_' keeps RNA identifiers distinct from the '_CDS_' ones, as both counters start at 0.
                    rna = RNA(org.name + "_RNA_" + str(rnaCounter).zfill(4))
                    rna.fill_annotations(start=int(gff_fields[GFF_start]),
                                         stop=int(gff_fields[GFF_end]),
                                         strand=gff_fields[GFF_strand],
                                         geneType=feature_type,
                                         name=name,
                                         product=product,
                                         local_identifier=geneID)
                    rna.fill_parents(org, contig)
                    contig.addRNA(rna)
                    rnaCounter += 1
    ### GET THE FASTA SEQUENCES OF THE GENES
    if hasFasta and fastaLines:
        contigSequences = read_fasta(org, "".join(fastaLines).split('\n'))
        for contig in org.contigs:
            for gene in contig.genes:
                gene.add_dna(
                    get_dna_sequence(contigSequences[contig.name], gene))
            for rna in contig.RNAs:
                rna.add_dna(get_dna_sequence(contigSequences[contig.name],
                                             rna))
    return org, hasFasta
Exemplo n.º 23
0
def o_gene():
    """Return a Gene built with the integer 4 as ID."""
    gene = Gene(4)
    return gene
Exemplo n.º 24
0
def test_str():
    """str() of a gene must give the ID it was built with."""
    expected = "un truc"
    assert str(Gene(expected)) == expected
Exemplo n.º 25
0
def test_addGene_solo(o_family, lo_genes):
    """Adding a gene to a family must store the gene in the family,
    and set the family of the gene."""
    gene = Gene(33)
    o_family.addGene(gene)
    assert o_family.genes == {gene}
    assert gene.family == o_family
Exemplo n.º 26
0
def lo_genes():
    """Return a list of 4 genes, whose IDs are the strings "0" to "3"."""
    return list(map(Gene, map(str, range(4))))