예제 #1
0
def annotate_organism(orgName, fileName, circular_contigs, code, kingdom, norna, tmpdir, overlap):
    """
        Annotate a single organism from its fasta file and return it.

        The contig sequences are loaded with `read_fasta`, features are obtained from
        `syntaxic_annotation`, filtered through `overlap_filter`, then attached to
        their contig together with their dna sequence.

        :param orgName: name given to the created organism
        :param fileName: fasta file of the organism, compressed or not
        :param circular_contigs: container of contig names to flag as circular
        :param code: forwarded to `syntaxic_annotation`
        :param kingdom: forwarded to `syntaxic_annotation`
        :param norna: forwarded to `syntaxic_annotation`
        :param tmpdir: directory used for the temporary fasta and by the annotation step
        :param overlap: forwarded to `overlap_filter`
        :return: the filled organism
    """
    organism = Organism(orgName)

    fasta_input = read_compressed_or_not(fileName)
    sequences = read_fasta(organism, fasta_input)
    # a compressed input is replaced by a temporary fasta rebuilt from the parsed sequences
    if is_compressed(fileName):
        fasta_input = write_tmp_fasta(sequences, tmpdir)

    predicted = syntaxic_annotation(organism, fasta_input, norna, kingdom, code, tmpdir)
    features_by_contig = overlap_filter(predicted, sequences, overlap)

    for contig_id, features in features_by_contig.items():
        contig = organism.getOrAddContig(contig_id)
        if contig.name in circular_contigs:
            contig.is_circular = True
        for feature in features:
            feature_dna = get_dna_sequence(sequences[contig.name], feature)
            feature.add_dna(feature_dna)
            feature.fill_parents(organism, contig)
            # objects that are neither Gene nor RNA are not stored in the contig
            if isinstance(feature, Gene):
                contig.addGene(feature)
            elif isinstance(feature, RNA):
                contig.addRNA(feature)
    return organism
예제 #2
0
 def _make_org_with_genes(org):
     """Build an organism named `org` with 2 to 10 contigs
     holding 2 to 10 genes each.

     Contigs are named "k_<index>" and genes "<org>.<contig name>.<index>";
     each gene gets its index as both `position` and `start`.
     Returns the organism and the list of every created gene."""
     organism = Organism(org)
     created_genes = []
     contig_count = randint(2, 10)
     for ctg_idx in range(contig_count):
         contig = organism.getOrAddContig(f"k_{ctg_idx}")
         gene_count = randint(2, 10)
         for rank in range(gene_count):
             gene = Gene(f"{org}.{contig.name}.{rank}")
             gene.position = rank
             gene.start = rank
             contig.addGene(gene)
             created_genes.append(gene)
     return organism, created_genes
예제 #3
0
    def addOrganism(self, newOrg):
        """
            adds an organism that did not exist previously in the pangenome if an :class:`ppanggolin.genome.Organism` object is provided. If an organism with the same name exists it will raise an error and the organism already registered is left untouched.
            If a :class:`str` object is provided, will return the corresponding organism that has this name OR create a new one if it does not exist.

            :param newOrg: Organism to add to the pangenome
            :type newOrg: :class:`ppanggolin.genome.Organism` or str
            :return: The created organism
            :rtype: :class:`ppanggolin.genome.Organism`
            :raises KeyError: if an :class:`ppanggolin.genome.Organism` is provided and its name is already used
            :raises TypeError: if the provided `newOrg` is neither a str nor a :class:`ppanggolin.genome.Organism`
        """
        if isinstance(newOrg, Organism):
            # Test for the name BEFORE inserting: inserting first would replace the
            # organism already registered under that name and only then report the clash.
            if newOrg.name in self._orgGetter:
                raise KeyError(
                    f"Redondant organism name was found ({newOrg.name}). All of your organisms must have unique names."
                )
            self._orgGetter[newOrg.name] = newOrg
            return newOrg
        if isinstance(newOrg, str):
            org = self._orgGetter.get(newOrg)
            if org is None:
                # unknown name: create the organism and register it
                org = Organism(newOrg)
                self._orgGetter[org.name] = org
            return org
        raise TypeError(
            "Provide an Organism object or a str that will serve as organism name"
        )
예제 #4
0
def l_orgs():
    """Return a list of 5 to 20 organisms, each named after its index ("0", "1", ...)."""
    how_many = randint(5, 20)
    return [Organism(str(idx)) for idx in range(how_many)]
예제 #5
0
def readOrganism(pangenome, orgName, contigDict, circularContigs, link=False):
    """
        Builds an organism from stored annotation rows and adds it to the pangenome.

        :param pangenome: pangenome receiving the organism; also queried for existing genes when `link` is true
        :param orgName: name of the organism to create
        :param contigDict: maps each contig name to an iterable of rows; a row is indexed by field name
            ("ID", "type", "start", "stop", "strand", "position", "genetic_code", "name", "product",
            "is_fragment" and optionally "local") and its text fields are bytes
        :param circularContigs: maps each contig name to its circularity flag
        :param link: if true, the gene objects are fetched from the pangenome with `getGene` instead of being created
        :raises Exception: if a row has a type that is neither "CDS" nor contains "RNA"
    """
    org = Organism(orgName)
    for contigName, geneList in contigDict.items():
        contig = org.getOrAddContig(contigName,
                                    is_circular=circularContigs[contigName])
        for row in geneList:
            gene_id = row["ID"].decode()
            # The type is needed on every path (annotation and dispatch below), so it is
            # decoded up front; it used to be set only when `link` was false.
            gene_type = row["type"].decode()
            is_cds = gene_type == "CDS"
            if not is_cds and "RNA" not in gene_type:
                # refuse unknown types before any gene object is touched
                raise Exception(
                    f"A strange type '{gene_type}', which we do not know what to do with, was met."
                )
            if link:  # if the gene families are already computed/loaded the gene exists.
                gene = pangenome.getGene(gene_id)
            elif is_cds:  # else creating the gene.
                gene = Gene(gene_id)
            else:
                gene = RNA(gene_id)
            try:
                local = row["local"].decode()
            except ValueError:
                # presumably the row type raises ValueError when the "local" field is absent
                # (older files) -- TODO confirm against the writer
                local = ""
            gene.fill_annotations(start=row["start"],
                                  stop=row["stop"],
                                  strand=row["strand"].decode(),
                                  geneType=gene_type,
                                  position=row["position"],
                                  genetic_code=row["genetic_code"],
                                  name=row["name"].decode(),
                                  product=row["product"].decode(),
                                  local_identifier=local)
            gene.is_fragment = row["is_fragment"]
            gene.fill_parents(org, contig)
            if is_cds:
                contig.addGene(gene)
            else:
                contig.addRNA(gene)
    pangenome.addOrganism(org)
예제 #6
0
 def addOrganism(self, newOrg):
     """
         adds an organism that did not exist previously in the pangenome if an Organism object is provided.
         If an organism with the same name is already registered, a KeyError is raised and the registered organism is left untouched.
         If a str object is provided, will return the corresponding organism OR create a new one.
         Any other kind of object is returned unchanged, without being registered.
     """
     if isinstance(newOrg, Organism):
         # Test for the name BEFORE inserting: inserting first would replace the
         # organism already registered under that name and only then report the clash.
         if newOrg.name in self._orgGetter:
             raise KeyError(
                 f"Redondant organism name was found ({newOrg.name}). All of your organisms must have unique names."
             )
         self._orgGetter[newOrg.name] = newOrg
     elif isinstance(newOrg, str):
         org = self._orgGetter.get(newOrg)
         if org is None:
             # unknown name: create the organism and register it
             org = Organism(newOrg)
             self._orgGetter[org.name] = org
         newOrg = org
     return newOrg
예제 #7
0
def test_cstr():
    """An Organism built from a value is an Organism exposing that value as `name`."""
    given = 4
    organism = Organism(given)
    assert isinstance(organism, Organism)
    assert hasattr(organism, "name")
    assert organism.name == given
예제 #8
0
def o_org():
    """Provide an Organism named "toto"."""
    organism = Organism("toto")
    return organism
예제 #9
0
def test_str():
    """str() of an Organism gives back the name it was built with."""
    label = "ppoiu"
    organism = Organism(label)
    as_text = str(organism)
    assert as_text == label
예제 #10
0
def read_org_gbff(organism,
                  gbff_file_path,
                  circular_contigs,
                  getSeq,
                  pseudo=False):
    """
        Reads a gbff file and fills Organism, Contig and Genes objects based on information contained in this file.

        Only 'CDS', 'rRNA' and 'tRNA' features are considered. Features whose location
        contains 'join', '<' or '>' are ignored, and so are features carrying a
        /transl_except qualifier.

        :param organism: name given to the created organism
        :param gbff_file_path: path of the gbff file, opened through `read_compressed_or_not`
        :param circular_contigs: container of contig identifiers to flag as circular, in addition to
            the records whose LOCUS line contains the word "circular"
        :param getSeq: if true, the sequence of each record is read and the dna of every gene of the contig is filled
        :param pseudo: if true, features carrying a /pseudo qualifier are kept
        :return: the filled organism, and True as there are always sequences in a gbff
    """
    org = Organism(organism)

    logging.getLogger().debug(
        "Extracting genes informations from the given gbff")
    # The handle is closed as soon as the lines are loaded.
    with read_compressed_or_not(gbff_file_path) as gbff_file:
        # revert the order of the file, so that pop() gives the first line first.
        lines = gbff_file.readlines()[::-1]
    geneCounter = 0
    rnaCounter = 0
    while len(lines) != 0:
        line = lines.pop()
        if not line.strip():
            # blank line between records or at the end of the file: nothing to parse
            continue
        # beginning of contig
        if line.startswith('LOCUS'):
            is_circ = False
            if "CIRCULAR" in line.upper(
            ):  #this line contains linear/circular word telling if the dna sequence is circularized or not
                is_circ = True
            contigLocusID = line.split(
            )[1]  #If contigID is not specified in VERSION afterwards like with Prokka, in that case we use the one in LOCUS.
            setContig = False
            while not line.startswith('FEATURES'):
                if line.startswith('VERSION'):
                    contigID = line[12:].strip()
                    if contigID != "":
                        if contigID in circular_contigs:
                            is_circ = True
                        contig = org.getOrAddContig(contigID, is_circ)
                        setContig = True
                line = lines.pop()
        if not setContig:  #if no contig ids were filled after VERSION, we use what was found in LOCUS for the contig ID. Should be unique in a dataset, but if there's an update the contig ID might still be the same even though it should not(?)
            if contigLocusID in circular_contigs:
                is_circ = True
            contig = org.getOrAddContig(contigLocusID, is_circ)
        # start of the feature object.
        dbxref = set()
        gene_name = ""
        product = ""
        locus_tag = ""
        objType = ""
        protein_id = ""
        genetic_code = ""
        usefulInfo = False
        start = None
        end = None
        strand = None
        line = lines.pop()
        while not line.startswith("ORIGIN"):
            # columns 5 to 21 hold the feature key; they are blank on qualifier lines
            currType = line[5:21].strip()
            if currType != "":
                # a new feature starts: save the previous one if it was worth keeping
                if usefulInfo:
                    create_gene(org, contig, geneCounter, rnaCounter,
                                locus_tag, dbxref, start, end, strand, objType,
                                len(contig.genes), gene_name, product,
                                genetic_code, protein_id)
                    if objType == "CDS":
                        geneCounter += 1
                    else:
                        rnaCounter += 1
                usefulInfo = False
                objType = currType
                if objType in ['CDS', 'rRNA', 'tRNA']:
                    dbxref = set()
                    gene_name = ""
                    try:
                        if 'join' not in line[21:]:
                            usefulInfo = True
                            if line[21:].startswith('complement('):
                                strand = "-"
                                # 32 = 21 + len('complement(')
                                start, end = line[32:].replace(')',
                                                               '').split("..")
                            else:
                                strand = "+"
                                start, end = line[21:].strip().split('..')
                            if '>' in start or '<' in start or '>' in end or '<' in end:
                                usefulInfo = False
                    except ValueError:
                        pass
                        #don't know what to do with that, ignoring for now.
                        #there is a protein with a frameshift mecanism.
            elif usefulInfo:  # current info goes to current objtype, if it's useful.
                if line[21:].startswith("/db_xref"):
                    dbxref.add(line.split("=")[1].replace('"', '').strip())
                elif line[21:].startswith("/locus_tag"):
                    locus_tag = line.split("=")[1].replace('"', '').strip()
                elif line[21:].startswith("/protein_id"):
                    protein_id = line.split("=")[1].replace('"', '').strip()
                elif line[21:].startswith('/gene'):  #gene name
                    gene_name = line.split("=")[1].replace('"', '').strip()
                elif line[21:].startswith('/transl_table'):
                    genetic_code = line.split("=")[1].replace('"', '').strip()
                elif line[21:].startswith(
                        '/product'
                ):  #need to loop as it can be more than one line long
                    product = line.split('=')[1].replace('"', '').strip()
                    if line.count(
                            '"'
                    ) == 1:  #then the product line is on multiple lines
                        line = lines.pop()
                        product += line.strip().replace('"', '')
                        # keep reading until the line holding the closing quote
                        while line.count('"') != 1:
                            line = lines.pop()
                            product += line.strip().replace('"', '')
                #if it's a pseudogene, we're not keeping it.
                elif line[21:].startswith("/pseudo") and not pseudo:
                    usefulInfo = False
                #that's probably a 'stop' codon into selenocystein.
                elif line[21:].startswith("/transl_except"):
                    usefulInfo = False
            line = lines.pop()
            #end of contig
        if usefulInfo:  #saving the last element...
            create_gene(org, contig, geneCounter, rnaCounter,
                        locus_tag, dbxref, start, end, strand, objType,
                        len(contig.genes), gene_name, product, genetic_code,
                        protein_id)
            if objType == "CDS":
                geneCounter += 1
            else:
                rnaCounter += 1
        # The sequence section is consumed up to its '//' terminator even when the
        # sequence is not wanted: left in place, those lines would be parsed as the
        # features of a new record by the next iteration.
        line = lines.pop()  #first sequence line.
        seq_chunks = []
        while not line.startswith('//'):
            if getSeq:
                # the first 10 columns hold the base counter
                seq_chunks.append(line[10:].replace(" ", "").strip().upper())
            line = lines.pop()
        if getSeq:
            sequence = "".join(seq_chunks)
            #get each gene's sequence.
            for gene in contig.genes:
                gene.add_dna(get_dna_sequence(sequence, gene))

    return org, True  #There are always fasta sequences in a gbff
예제 #11
0
def read_org_gff(organism,
                 gff_file_path,
                 circular_contigs,
                 getSeq,
                 pseudo=False):
    """
        Reads a gff file and fills Organism, Contig, Gene and RNA objects based on information contained in this file.

        'CDS' features become genes and features whose type contains "RNA" become RNAs.
        If the file embeds a '##FASTA' section and `getSeq` is true, the dna of every
        gene and RNA is filled from it.

        :param organism: name given to the created organism
        :param gff_file_path: path of the gff file, opened through `read_compressed_or_not`
        :param circular_contigs: container of contig names to flag as circular
        :param getSeq: if false, reading stops at the '##FASTA' line and no sequence is loaded
        :param pseudo: if true, CDS flagged with a 'pseudo' or 'pseudogene' attribute are kept
        :return: the filled organism, and whether a fasta section was read
    """
    (GFF_seqname, _,
     GFF_type, GFF_start, GFF_end, _, GFF_strand, _, GFF_attribute) = range(
         0, 9)  #missing values : source, score, frame. They are unused.

    def getGffAttributes(gff_fields):
        """
            Parses the gff attribute's line and outputs the attributes in a dict structure.
            Keys are stored in upper case.
            :param gff_fields: a gff line stored as a list. Each element of the list is a column of the gff.
            :type list:
            :return: attributes:
            :rtype: dict
        """
        attributes_field = [
            f for f in gff_fields[GFF_attribute].strip().split(';')
            if len(f) > 0
        ]
        attributes = {}
        for att in attributes_field:
            try:
                # split on the first '=' only, so that a value containing '=' is kept
                (key, value) = att.strip().split('=', 1)
                attributes[key.upper()] = value
            except ValueError:
                pass  #we assume that it is a strange, but useless field for our analysis
        return attributes

    def getIDAttribute(attributes):
        """
            Gets the ID of the element from which the provided attributes were extracted. Logs an error and exits if no ID is found.
            :param attribute:
            :type dict:
            :return: ElementID:
            :rtype: string
        """
        ElementID = attributes.get("ID")
        if not ElementID:
            logging.getLogger().error(
                "Each CDS type of the gff files must own a unique ID attribute. Not the case for file: "
                + gff_file_path)
            exit(1)
        return ElementID

    hasFasta = False
    fasta_lines = []
    org = Organism(organism)
    contig = None  # no contig is known until a sequence-region or a feature names one
    geneCounter = 0
    rnaCounter = 0
    with read_compressed_or_not(gff_file_path) as gff_file:
        for line in gff_file:
            if hasFasta:
                # everything after '##FASTA' belongs to the embedded fasta
                fasta_lines.append(line)
                continue
            elif line.startswith('##', 0, 2):
                if line.startswith('FASTA', 2, 7):
                    if not getSeq:  #if getting the sequences is useless...
                        break
                    hasFasta = True
                elif line.startswith('sequence-region', 2, 17):
                    fields = [el.strip() for el in line.split()]
                    contig = org.getOrAddContig(
                        fields[1], fields[1] in circular_contigs)
                continue
            elif line.startswith(
                    '#!', 0, 2
            ):  ## special refseq comment lines for versionning softs, assemblies and annotations.
                continue
            gff_fields = [el.strip() for el in line.split('\t')]
            if len(gff_fields) <= GFF_attribute:
                # blank line, plain comment or line without the 9 gff columns
                continue
            attributes = getGffAttributes(gff_fields)
            pseudogene = False
            if gff_fields[GFF_type] == 'region':
                if gff_fields[GFF_seqname] in circular_contigs:
                    # flag the contig named on this line, which is not necessarily the
                    # last one declared by a sequence-region directive
                    contig = org.getOrAddContig(gff_fields[GFF_seqname])
                    contig.is_circular = True
            elif gff_fields[GFF_type] == 'CDS' or "RNA" in gff_fields[GFF_type]:
                geneID = attributes.get(
                    "PROTEIN_ID"
                )  #if there is a 'PROTEIN_ID' attribute, it's where the ncbi stores the actual gene ids, so we use that.
                if geneID is None:  #if its not found, we get the one under the 'ID' field which must exist (otherwise not a gff3 compliant file)
                    geneID = getIDAttribute(attributes)
                try:
                    name = attributes.pop('NAME')
                except KeyError:
                    try:
                        name = attributes.pop('GENE')
                    except KeyError:
                        name = ""
                # attribute keys are upper-cased by getGffAttributes, so the test must be too
                if "PSEUDO" in attributes or "PSEUDOGENE" in attributes:
                    pseudogene = True
                try:
                    product = attributes.pop('PRODUCT')
                except KeyError:
                    product = ""

                try:
                    genetic_code = attributes.pop("TRANSL_TABLE")
                except KeyError:
                    genetic_code = "11"
                if contig is None or contig.name != gff_fields[GFF_seqname]:
                    contig = org.getOrAddContig(
                        gff_fields[GFF_seqname])  #get the current contig
                if gff_fields[GFF_type] == "CDS" and (not pseudogene
                                                      or pseudo):
                    gene = Gene(org.name + "_CDS_" + str(geneCounter).zfill(4))

                    #here contig is filled in order, so position is the number of genes already stored in the contig.
                    gene.fill_annotations(start=int(gff_fields[GFF_start]),
                                          stop=int(gff_fields[GFF_end]),
                                          strand=gff_fields[GFF_strand],
                                          geneType=gff_fields[GFF_type],
                                          position=len(contig.genes),
                                          name=name,
                                          product=product,
                                          genetic_code=genetic_code,
                                          local_identifier=geneID)
                    gene.fill_parents(org, contig)
                    contig.addGene(gene)
                    geneCounter += 1
                elif "RNA" in gff_fields[GFF_type]:
                    # RNAs have their own counter: a distinct prefix keeps their
                    # identifiers from colliding with those of the genes
                    rna = RNA(org.name + "_RNA_" + str(rnaCounter).zfill(4))
                    rna.fill_annotations(start=int(gff_fields[GFF_start]),
                                         stop=int(gff_fields[GFF_end]),
                                         strand=gff_fields[GFF_strand],
                                         geneType=gff_fields[GFF_type],
                                         name=name,
                                         product=product,
                                         local_identifier=geneID)
                    rna.fill_parents(org, contig)
                    contig.addRNA(rna)
                    rnaCounter += 1
    ### GET THE FASTA SEQUENCES OF THE GENES
    if hasFasta and fasta_lines:
        contigSequences = read_fasta(org, "".join(fasta_lines).split('\n'))
        for contig in org.contigs:
            for gene in contig.genes:
                gene.add_dna(
                    get_dna_sequence(contigSequences[contig.name], gene))
            for rna in contig.RNAs:
                rna.add_dna(get_dna_sequence(contigSequences[contig.name],
                                             rna))
    return org, hasFasta
예제 #12
0
def test_addOrganism(o_pang):
    """Adding an Organism returns it and makes it the only organism of the pangenome."""
    organism = Organism("org")
    returned = o_pang.addOrganism(organism)
    assert returned == organism
    assert set(o_pang.organisms) == {organism}