def annotate_organism(orgName, fileName, circular_contigs, code, kingdom, norna, tmpdir, overlap):
    """
    Annotates a single organism from its sequence file.

    The sequences are read with `read_fasta`, features are predicted with `syntaxic_annotation`, filtered
    with `overlap_filter`, and every remaining feature is attached to its contig in a new Organism.

    :param orgName: name given to the created Organism
    :param fileName: path of the sequence file; handed to `read_compressed_or_not` and `is_compressed`
    :param circular_contigs: collection of contig names that must be flagged as circular
    :param code: forwarded to `syntaxic_annotation` (presumably the genetic code -- TODO confirm)
    :param kingdom: forwarded to `syntaxic_annotation`
    :param norna: forwarded to `syntaxic_annotation`
    :param tmpdir: directory handed to `write_tmp_fasta` and `syntaxic_annotation`
    :param overlap: forwarded to `overlap_filter`
    :return: the annotated organism
    """
    organism = Organism(orgName)

    fasta_input = read_compressed_or_not(fileName)
    sequences_by_contig = read_fasta(organism, fasta_input)
    if is_compressed(fileName):
        # For compressed input, the annotation step is fed what write_tmp_fasta returns instead.
        fasta_input = write_tmp_fasta(sequences_by_contig, tmpdir)

    predicted = syntaxic_annotation(organism, fasta_input, norna, kingdom, code, tmpdir)
    kept_by_contig = overlap_filter(predicted, sequences_by_contig, overlap)

    for contig_name, features in kept_by_contig.items():
        contig = organism.getOrAddContig(contig_name)
        if contig.name in circular_contigs:
            contig.is_circular = True
        for feature in features:
            feature.add_dna(get_dna_sequence(sequences_by_contig[contig.name], feature))
            feature.fill_parents(organism, contig)
            # Genes and RNAs are stored separately on the contig; other types are not stored.
            if isinstance(feature, Gene):
                contig.addGene(feature)
            elif isinstance(feature, RNA):
                contig.addRNA(feature)
    return organism
def _make_org_with_genes(org):
    """
    Builds an organism named `org` holding 2 to 10 contigs, each with 2 to 10 genes.

    Contigs are named "k_<index>" and genes "<org>.<contig name>.<index>"; the `position` and `start`
    of each gene are both set to its index within the contig.

    :param org: name of the organism to create
    :return: the organism and the list of every gene created, in creation order
    :rtype: tuple
    """
    organism = Organism(org)
    created_genes = []
    contig_count = randint(2, 10)
    for contig_index in range(contig_count):
        contig = organism.getOrAddContig("k_{}".format(contig_index))
        gene_count = randint(2, 10)
        for rank in range(gene_count):
            gene = Gene("{}.{}.{}".format(org, contig.name, rank))
            gene.position = rank
            gene.start = rank
            contig.addGene(gene)
            created_genes.append(gene)
    return organism, created_genes
def addOrganism(self, newOrg):
    """
    adds an organism that did not exist previously in the pangenome if an :class:`ppanggolin.genome.Organism` object is provided. If an organism with the same name exists it will raise an error.
    If a :class:`str` object is provided, will return the corresponding organism that has this name OR create a new one if it does not exist.

    :param newOrg: Organism to add to the pangenome
    :type newOrg: :class:`ppanggolin.genome.Organism` or str
    :return: The created organism
    :rtype: :class:`ppanggolin.genome.Organism`
    :raises KeyError: if an :class:`ppanggolin.genome.Organism` is provided and its name is already registered
    :raises TypeError: if the provided `newOrg` is neither a str nor a :class:`ppanggolin.genome.Organism`
    """
    if isinstance(newOrg, Organism):
        # Check BEFORE inserting: the registered organism must not be replaced when the call fails.
        if newOrg.name in self._orgGetter:
            raise KeyError(
                f"Redondant organism name was found ({newOrg.name}). All of your organisms must have unique names."
            )
        self._orgGetter[newOrg.name] = newOrg
        return newOrg
    if isinstance(newOrg, str):
        org = self._orgGetter.get(newOrg)
        if org is None:
            org = Organism(newOrg)
            self._orgGetter[org.name] = org
        return org
    raise TypeError(
        "Provide an Organism object or a str that will serve as organism name"
    )
def l_orgs():
    """
    Builds a list of 5 to 20 organisms named after their index ("0", "1", ...).

    :return: the created organisms, in index order
    :rtype: list
    """
    how_many = randint(5, 20)
    return [Organism(str(rank)) for rank in range(how_many)]
def readOrganism(pangenome, orgName, contigDict, circularContigs, link=False):
    """
    Rebuilds an organism from stored annotation rows and adds it to the pangenome.

    :param pangenome: pangenome receiving the organism (and providing existing genes when `link` is True)
    :param orgName: name of the organism to create
    :param contigDict: maps each contig name to an iterable of rows; each row is indexed by field name
        and its textual fields are bytes (they are decoded here)
    :type contigDict: dict
    :param circularContigs: maps each contig name to its circularity flag
    :type circularContigs: dict
    :param link: if True, genes are fetched from the pangenome with `getGene` instead of being created
    :type link: bool
    :raises Exception: if a row has a type that is neither "CDS" nor contains "RNA"
    """
    org = Organism(orgName)
    for contigName, geneList in contigDict.items():
        contig = org.getOrAddContig(contigName, is_circular=circularContigs[contigName])
        for row in geneList:
            # The type is needed in both modes (it decides where the object is stored), so it is
            # decoded up front. It used to be set only when link was False.
            gene_type = row["type"].decode()
            is_cds = gene_type == "CDS"
            if not is_cds and "RNA" not in gene_type:
                # Fail before anything is touched: no object exists for such a row.
                raise Exception(
                    f"A strange type '{gene_type}', which we do not know what to do with, was met."
                )

            identifier = row["ID"].decode()
            if link:
                # if the gene families are already computed/loaded the gene exists.
                gene = pangenome.getGene(identifier)
            elif is_cds:
                gene = Gene(identifier)
            else:
                gene = RNA(identifier)

            try:
                local = row["local"].decode()
            except ValueError:
                # presumably raised when the rows have no "local" field -- TODO confirm
                local = ""

            gene.fill_annotations(start=row["start"],
                                  stop=row["stop"],
                                  strand=row["strand"].decode(),
                                  geneType=gene_type,
                                  position=row["position"],
                                  genetic_code=row["genetic_code"],
                                  name=row["name"].decode(),
                                  product=row["product"].decode(),
                                  local_identifier=local)
            gene.is_fragment = row["is_fragment"]
            gene.fill_parents(org, contig)
            if is_cds:
                contig.addGene(gene)
            else:
                contig.addRNA(gene)
    pangenome.addOrganism(org)
def addOrganism(self, newOrg):
    """
    adds an organism that did not exist previously in the pangenome if an Organism object is provided.
    If a str object is provided, will return the corresponding organism OR create a new one.

    Any other kind of object is returned unchanged and nothing is registered.

    :param newOrg: Organism to add to the pangenome, or the name of the organism to get or create
    :return: the registered organism
    :raises KeyError: if an Organism object is provided and its name is already registered
    """
    if isinstance(newOrg, Organism):
        # Check BEFORE inserting: the registered organism must not be replaced when the call fails.
        if newOrg.name in self._orgGetter:
            raise KeyError(
                f"Redondant organism name was found ({newOrg.name}). All of your organisms must have unique names."
            )
        self._orgGetter[newOrg.name] = newOrg
    elif isinstance(newOrg, str):
        org = self._orgGetter.get(newOrg)
        if org is None:
            org = Organism(newOrg)
            self._orgGetter[org.name] = org
        newOrg = org
    return newOrg
def test_cstr():
    """Checks that an Organism can be built and exposes the value it was given as `name`."""
    expected_name = 4
    organism = Organism(expected_name)
    assert isinstance(organism, Organism)
    assert hasattr(organism, "name")
    assert organism.name == expected_name
def o_org():
    """Provides a new Organism named "toto"."""
    organism = Organism("toto")
    return organism
def test_str():
    """Checks that converting an Organism to str gives back its name."""
    expected = "ppoiu"
    organism = Organism(expected)
    rendered = str(organism)
    assert rendered == expected
def _gbff_qualifier_value(line):
    """
    Extracts the value of a ``/qualifier="value"`` line of a gbff feature table.

    :param line: the qualifier line
    :type line: str
    :return: the text following the first '=' (up to the next '=' if there is one), without double quotes
        and without surrounding whitespace
    :rtype: str
    """
    return line.split("=")[1].replace('"', '').strip()


def read_org_gbff(organism, gbff_file_path, circular_contigs, getSeq, pseudo=False):
    """
    reads a gbff file and fills Organism, Contig and Genes objects based on information contained in this file

    Only 'CDS', 'rRNA' and 'tRNA' features are kept. Features whose location contains 'join', '<' or '>',
    or cannot be split into a start and an end, are skipped, as are features with a /transl_except
    qualifier.

    :param organism: name given to the created Organism
    :param gbff_file_path: path of the gbff file, handed to `read_compressed_or_not`
    :param circular_contigs: collection of contig identifiers that must be flagged as circular
    :param getSeq: if True, the sequence of each record is read and the dna of each gene of the contig is filled
    :type getSeq: bool
    :param pseudo: if False, features carrying a qualifier starting with /pseudo are skipped
    :type pseudo: bool
    :return: the organism, and True (There are always fasta sequences in a gbff)
    :rtype: tuple
    """
    org = Organism(organism)
    logging.getLogger().debug(
        "Extracting genes informations from the given gbff")
    # The lines are stored in reverse order so that list.pop() hands them out in file order.
    # The handle is closed as soon as the lines are loaded.
    with read_compressed_or_not(gbff_file_path) as gbff_file:
        lines = gbff_file.readlines()[::-1]
    geneCounter = 0
    rnaCounter = 0
    while lines:
        line = lines.pop()
        # A record starts at its LOCUS line; anything met in between records is skipped.
        if not line.startswith('LOCUS'):
            continue

        # ---- beginning of contig ----
        # the LOCUS line contains the linear/circular word telling if the dna sequence is circularized or not
        is_circ = "CIRCULAR" in line.upper()
        # If contigID is not specified in VERSION afterwards like with Prokka, we use the one in LOCUS.
        contigLocusID = line.split()[1]
        setContig = False
        while not line.startswith('FEATURES'):
            if line.startswith('VERSION'):
                contigID = line[12:].strip()
                if contigID != "":
                    if contigID in circular_contigs:
                        is_circ = True
                    contig = org.getOrAddContig(contigID, is_circ)
                    setContig = True
            line = lines.pop()
        if not setContig:
            # if no contig ids were filled after VERSION, we use what was found in LOCUS for the contig ID.
            if contigLocusID in circular_contigs:
                is_circ = True
            contig = org.getOrAddContig(contigLocusID, is_circ)

        # ---- feature table ----
        dbxref = set()
        gene_name = ""
        product = ""
        locus_tag = ""
        objType = ""
        protein_id = ""
        genetic_code = ""
        usefulInfo = False
        start = None
        end = None
        strand = None
        line = lines.pop()
        while not line.startswith("ORIGIN"):
            currType = line[5:21].strip()
            content = line[21:]
            if currType != "":
                # A new feature starts: save the previous one if it was worth keeping.
                if usefulInfo:
                    create_gene(org, contig, geneCounter, rnaCounter, locus_tag, dbxref, start, end, strand,
                                objType, len(contig.genes), gene_name, product, genetic_code, protein_id)
                    if objType == "CDS":
                        geneCounter += 1
                    else:
                        rnaCounter += 1
                usefulInfo = False
                objType = currType
                if objType in ('CDS', 'rRNA', 'tRNA'):
                    # Forget the qualifiers of the previous feature so that a feature lacking one of
                    # them does not inherit its predecessor's value.
                    # NOTE(review): genetic_code is deliberately left as it was, so a CDS without
                    # /transl_table keeps reusing the last table seen -- confirm this is wanted.
                    dbxref = set()
                    gene_name = ""
                    product = ""
                    locus_tag = ""
                    protein_id = ""
                    try:
                        if 'join' not in content:
                            usefulInfo = True
                            if content.startswith('complement('):
                                strand = "-"
                                start, end = line[32:].replace(')', '').split("..")
                            else:
                                strand = "+"
                                start, end = content.strip().split('..')
                            if '>' in start or '<' in start or '>' in end or '<' in end:
                                usefulInfo = False
                    except ValueError:
                        # don't know what to do with that location (e.g. a protein with a frameshift
                        # mecanism): really ignore the feature instead of saving it with the
                        # coordinates of the previous one.
                        usefulInfo = False
            elif usefulInfo:
                # current info goes to current objtype, if it's useful.
                if content.startswith("/db_xref"):
                    dbxref.add(_gbff_qualifier_value(line))
                elif content.startswith("/locus_tag"):
                    locus_tag = _gbff_qualifier_value(line)
                elif content.startswith("/protein_id"):
                    protein_id = _gbff_qualifier_value(line)
                elif content.startswith('/gene='):
                    # gene name; the '=' keeps /gene_synonym from overwriting it.
                    gene_name = _gbff_qualifier_value(line)
                elif content.startswith('/transl_table'):
                    genetic_code = _gbff_qualifier_value(line)
                elif content.startswith('/product'):
                    product = _gbff_qualifier_value(line)
                    if line.count('"') == 1:
                        # then the product is on multiple lines: read up to the line holding the closing quote.
                        # NOTE(review): the pieces are glued without a separating space.
                        line = lines.pop()
                        product += line.strip().replace('"', '')
                        while line.count('"') != 1:
                            line = lines.pop()
                            product += line.strip().replace('"', '')
                elif content.startswith("/pseudo") and not pseudo:
                    # if it's a pseudogene, we're not keeping it.
                    usefulInfo = False
                elif content.startswith("/transl_except"):
                    # that's probably a 'stop' codon into selenocystein.
                    usefulInfo = False
            line = lines.pop()

        # ---- end of the feature table: saving the last element ----
        if usefulInfo:
            create_gene(org, contig, geneCounter, rnaCounter, locus_tag, dbxref, start, end, strand,
                        objType, len(contig.genes), gene_name, product, genetic_code, protein_id)
            if objType == "CDS":
                geneCounter += 1
            else:
                rnaCounter += 1

        # ---- sequence ----
        # The sequence lines are always consumed up to the '//' terminator, even when the sequence
        # is not wanted, so that the next iteration starts at the next record.
        chunks = []
        while lines:
            line = lines.pop()
            if line.startswith('//'):
                break
            if getSeq:
                chunks.append(line[10:].replace(" ", "").strip().upper())
        if getSeq:
            sequence = "".join(chunks)
            # get each gene's sequence.
            for gene in contig.genes:
                gene.add_dna(get_dna_sequence(sequence, gene))
    return org, True  # There are always fasta sequences in a gbff
def read_org_gff(organism, gff_file_path, circular_contigs, getSeq, pseudo=False):
    """
    reads a gff file and fills Organism, Contig, Gene and RNA objects based on information contained in this file

    'CDS' features become genes and features whose type contains "RNA" become RNAs. If the file has a
    '##FASTA' section and `getSeq` is True, the dna of every gene and RNA is filled from it.

    :param organism: name given to the created Organism
    :param gff_file_path: path of the gff file, handed to `read_compressed_or_not`
    :param circular_contigs: collection of contig names that must be flagged as circular
    :param getSeq: if False, reading stops at the '##FASTA' line and no sequence is extracted
    :type getSeq: bool
    :param pseudo: if False, CDS carrying a 'pseudo' or 'pseudogene' attribute are skipped
    :type pseudo: bool
    :return: the organism, and whether a fasta section was read
    :rtype: tuple
    """
    # Column indexes of a gff line. Missing values : source, score, frame. They are unused.
    (GFF_seqname, _, GFF_type, GFF_start, GFF_end, _, GFF_strand, _,
     GFF_attribute) = range(0, 9)

    def getGffAttributes(gff_fields):
        """
        Parses the gff attribute's line and outputs the attributes in a dict structure.

        Keys are stored in UPPER CASE.

        :param gff_fields: a gff line stored as a list. Each element of the list is a column of the gff.
        :type list:
        :return: attributes:
        :rtype: dict
        """
        attributes = {}
        for att in gff_fields[GFF_attribute].strip().split(';'):
            if len(att) == 0:
                continue
            try:
                (key, value) = att.strip().split('=')
            except ValueError:
                # we assume that it is a strange, but useless field for our analysis
                continue
            attributes[key.upper()] = value
        return attributes

    def getIDAttribute(attributes):
        """
        Gets the ID of the element from which the provided attributes were extracted. Logs an error and stops the program if no ID is found.

        :param attribute:
        :type dict:
        :return: ElementID:
        :rtype: string
        """
        ElementID = attributes.get("ID")
        if not ElementID:
            logging.getLogger().error(
                "Each CDS type of the gff files must own a unique ID attribute. Not the case for file: "
                + gff_file_path)
            # Same effect as exit(1), without relying on the helper injected by the site module.
            raise SystemExit(1)
        return ElementID

    hasFasta = False
    fastaLines = []
    org = Organism(organism)
    contig = None  # no contig is known until a '##sequence-region' line or a feature names one
    geneCounter = 0
    rnaCounter = 0
    with read_compressed_or_not(gff_file_path) as gff_file:
        for line in gff_file:
            if hasFasta:
                # everything after '##FASTA' belongs to the sequences.
                fastaLines.append(line)
                continue
            if line.startswith('##'):
                if line.startswith('FASTA', 2, 7):
                    if not getSeq:
                        # if getting the sequences is useless...
                        break
                    hasFasta = True
                elif line.startswith('sequence-region', 2, 17):
                    fields = [el.strip() for el in line.split()]
                    contig = org.getOrAddContig(fields[1], fields[1] in circular_contigs)
                continue
            if line.startswith('#') or not line.strip():
                # comments (including the special refseq '#!' lines for versionning softs,
                # assemblies and annotations) and blank lines carry no feature.
                continue

            gff_fields = [el.strip() for el in line.split('\t')]
            attributes = getGffAttributes(gff_fields)
            seqname = gff_fields[GFF_seqname]
            featureType = gff_fields[GFF_type]

            if featureType == 'region':
                if seqname in circular_contigs:
                    # flag the contig the region line is about, which may not be the current one.
                    if contig is None or contig.name != seqname:
                        contig = org.getOrAddContig(seqname)
                    contig.is_circular = True
            elif featureType == 'CDS' or "RNA" in featureType:
                # if there is a 'PROTEIN_ID' attribute, it's where the ncbi stores the actual gene ids, so we use that.
                geneID = attributes.get("PROTEIN_ID")
                if geneID is None:
                    # if its not found, we get the one under the 'ID' field which must exist (otherwise not a gff3 compliant file)
                    geneID = getIDAttribute(attributes)

                name = attributes.pop('NAME', None)
                if name is None:
                    name = attributes.pop('GENE', "")
                # Attribute keys are stored in upper case, so the lookups must be in upper case too.
                pseudogene = "PSEUDO" in attributes or "PSEUDOGENE" in attributes
                product = attributes.pop('PRODUCT', "")
                genetic_code = attributes.pop("TRANSL_TABLE", "11")

                if contig is None or contig.name != seqname:
                    contig = org.getOrAddContig(seqname)  # get the current contig

                if featureType == "CDS" and (not pseudogene or pseudo):
                    gene = Gene(org.name + "_CDS_" + str(geneCounter).zfill(4))
                    # here contig is filled in order, so position is the number of genes already stored in the contig.
                    gene.fill_annotations(start=int(gff_fields[GFF_start]),
                                          stop=int(gff_fields[GFF_end]),
                                          strand=gff_fields[GFF_strand],
                                          geneType=featureType,
                                          position=len(contig.genes),
                                          name=name,
                                          product=product,
                                          genetic_code=genetic_code,
                                          local_identifier=geneID)
                    gene.fill_parents(org, contig)
                    contig.addGene(gene)
                    geneCounter += 1
                elif "RNA" in featureType:
                    # RNAs have their own counter, so they need their own infix: with "_CDS_" the
                    # n-th RNA would get the same identifier as the n-th gene.
                    rna = RNA(org.name + "_RNA_" + str(rnaCounter).zfill(4))
                    rna.fill_annotations(start=int(gff_fields[GFF_start]),
                                         stop=int(gff_fields[GFF_end]),
                                         strand=gff_fields[GFF_strand],
                                         geneType=featureType,
                                         name=name,
                                         product=product,
                                         local_identifier=geneID)
                    rna.fill_parents(org, contig)
                    contig.addRNA(rna)
                    rnaCounter += 1

    # GET THE FASTA SEQUENCES OF THE GENES
    if hasFasta and fastaLines:
        contigSequences = read_fasta(org, "".join(fastaLines).split('\n'))
        for contig in org.contigs:
            for gene in contig.genes:
                gene.add_dna(get_dna_sequence(contigSequences[contig.name], gene))
            for rna in contig.RNAs:
                rna.add_dna(get_dna_sequence(contigSequences[contig.name], rna))
    return org, hasFasta
def test_addOrganism(o_pang):
    """Checks that adding an Organism returns it and makes it the only organism of the pangenome."""
    organism = Organism("org")
    returned = o_pang.addOrganism(organism)
    assert returned == organism
    registered = set(o_pang.organisms)
    assert registered == {organism}