def getGeneSequencesFromFastas(pangenome, fasta_file):
    """
    Add the DNA sequence of every gene and RNA of the pangenome from per-organism fasta files.

    :param pangenome: pangenome whose organisms, contigs, genes and RNAs are already filled.
        Its ``status["geneSequences"]`` entry is set to "Computed" on success.
    :param fasta_file: path of a tab-separated list file; on each line the first column is
        an organism name and the second column is the path of that organism's fasta file.
        Both files are opened through read_compressed_or_not().
    :raises Exception: if an organism of the pangenome has no fasta file in the list.
    :raises KeyError: if a contig of the annotations is absent from its organism's fasta file.
    """
    fastaDict = {}
    # The list file is opened in a 'with' block so that its handle is released once read.
    with read_compressed_or_not(fasta_file) as organisms_file:
        for line in organisms_file:
            elements = [el.strip() for el in line.split("\t")]
            if len(elements) <= 1:
                logging.getLogger().error(
                    "No tabulation separator found in organisms file")
                exit(1)
            org = pangenome.addOrganism(elements[0])
            with read_compressed_or_not(elements[1]) as currFastaFile:
                fastaDict[org] = read_fasta(org, currFastaFile)

    # Every organism of the pangenome must have been given a fasta file.
    organisms_without_fasta = set(pangenome.organisms) - set(fastaDict)
    if organisms_without_fasta:
        missing = len(organisms_without_fasta)
        raise Exception(
            f"Not all of your pangenome's organisms are present within the provided fasta file. {missing} are missing (out of {len(pangenome.organisms)})."
        )

    for org in pangenome.organisms:
        try:
            for contig in org.contigs:
                for gene in contig.genes:
                    gene.add_dna(
                        get_dna_sequence(fastaDict[org][contig.name], gene))
                for rna in contig.RNAs:
                    # The sequence is extracted with the RNA's own coordinates
                    # (the gene loop variable must not be reused here).
                    rna.add_dna(
                        get_dna_sequence(fastaDict[org][contig.name], rna))
        except KeyError as err:
            msg = f"Fasta file for organism {org.name} did not have the contig {contig.name} that was read from the annotation file. "
            msg += f"The provided contigs in the fasta were : { ', '.join(fastaDict[org])}."
            raise KeyError(msg) from err
    pangenome.status["geneSequences"] = "Computed"
def read_org_gbff(organism, gbff_file_path, circular_contigs, getSeq, pseudo=False):
    """
    Read a gbff (GenBank flat file) and fill an Organism from the records it contains.

    Each record is parsed in up to three steps:

    * header (LOCUS ... FEATURES): gives the contig identifier (VERSION value, or the
      LOCUS name when VERSION is empty or absent) and whether the contig is circular;
    * feature table (FEATURES ... ORIGIN): every usable CDS / rRNA / tRNA feature is
      passed to create_gene();
    * sequence (ORIGIN ... //), only when getSeq is true: the contig sequence is
      rebuilt and the DNA of each gene of the contig is extracted from it.

    Features are skipped when their location contains 'join' or a partial marker
    ('<' or '>'), when the location cannot be parsed, when they carry a
    /transl_except qualifier, or when they carry a /pseudo* qualifier and pseudo
    is false.

    :param organism: name given to the Organism that is created.
    :param gbff_file_path: path of the gbff file, opened through read_compressed_or_not().
    :param circular_contigs: collection of contig identifiers to flag as circular, in
        addition to the contigs whose LOCUS line contains the word 'circular'.
    :param getSeq: if true, read the sequences and add their DNA to the genes.
    :param pseudo: if true, features flagged as pseudo are kept.
    :return: the filled Organism, and True (a gbff always holds the sequences).
    :rtype: tuple
    """

    def qualifier_value(qualifier_line):
        """Return what follows the first '=' of a qualifier line, without quotes nor spaces."""
        return qualifier_line.split("=", 1)[1].replace('"', '').strip()

    org = Organism(organism)
    logging.getLogger().debug(
        "Extracting genes informations from the given gbff")
    # The file is read at once and closed; the lines are reversed so that pop()
    # hands them back from the first one to the last one.
    with read_compressed_or_not(gbff_file_path) as gbff_file:
        lines = gbff_file.readlines()[::-1]
    geneCounter = 0
    rnaCounter = 0
    while lines:
        line = lines.pop()
        # A record only starts on a LOCUS line. Anything else found between two records
        # (sequence lines when getSeq is false, '//', trailing blank lines) is skipped.
        if not line.startswith('LOCUS'):
            continue

        # --- header of the record ---
        # The LOCUS line contains the linear/circular word telling the topology.
        is_circ = "CIRCULAR" in line.upper()
        # Fallback identifier, used when VERSION gives none (e.g. Prokka outputs).
        contigLocusID = line.split()[1]
        setContig = False
        while not line.startswith('FEATURES'):
            if line.startswith('VERSION'):
                contigID = line[12:].strip()
                if contigID != "":
                    if contigID in circular_contigs:
                        is_circ = True
                    contig = org.getOrAddContig(contigID, is_circ)
                    setContig = True
            line = lines.pop()
        if not setContig:
            # No identifier was found after VERSION: the LOCUS one is used instead.
            if contigLocusID in circular_contigs:
                is_circ = True
            contig = org.getOrAddContig(contigLocusID, is_circ)

        # --- feature table of the record ---
        # NOTE(review): only dbxref and gene_name are reset when a new feature starts;
        # locus_tag, product, protein_id and genetic_code keep the value of the previous
        # feature when the current one lacks the qualifier. Confirm this is intended.
        dbxref = set()
        gene_name = ""
        product = ""
        locus_tag = ""
        objType = ""
        protein_id = ""
        genetic_code = ""
        usefulInfo = False
        start = None
        end = None
        strand = None
        line = lines.pop()
        while not line.startswith("ORIGIN"):
            # Columns 6-21 hold the feature key; they are blank on qualifier lines.
            currType = line[5:21].strip()
            if currType != "":
                # A new feature starts: the previous one is stored if it was usable.
                if usefulInfo:
                    create_gene(org, contig, geneCounter, rnaCounter, locus_tag,
                                dbxref, start, end, strand, objType,
                                len(contig.genes), gene_name, product,
                                genetic_code, protein_id)
                    if objType == "CDS":
                        geneCounter += 1
                    else:
                        rnaCounter += 1
                usefulInfo = False
                objType = currType
                if objType in ['CDS', 'rRNA', 'tRNA']:
                    dbxref = set()
                    gene_name = ""
                    location = line[21:]
                    try:
                        if 'join' not in location:
                            usefulInfo = True
                            if location.startswith('complement('):
                                strand = "-"
                                start, end = line[32:].replace(')', '').split("..")
                            else:
                                strand = "+"
                                start, end = location.strip().split('..')
                            if '>' in start or '<' in start or '>' in end or '<' in end:
                                # partial feature
                                usefulInfo = False
                    except ValueError:
                        # The location is not a plain 'start..end' (there is, for instance,
                        # a protein with a frameshift mechanism): the feature is ignored
                        # rather than stored with the coordinates of the previous one.
                        usefulInfo = False
            elif usefulInfo:
                # Qualifier line of a feature that is kept so far.
                qualifier = line[21:]
                if qualifier.startswith("/db_xref"):
                    dbxref.add(qualifier_value(line))
                elif qualifier.startswith("/locus_tag"):
                    locus_tag = qualifier_value(line)
                elif qualifier.startswith("/protein_id"):
                    protein_id = qualifier_value(line)
                elif qualifier.startswith('/gene='):
                    # gene name; '/gene=' so that /gene_synonym does not overwrite it.
                    gene_name = qualifier_value(line)
                elif qualifier.startswith('/transl_table'):
                    genetic_code = qualifier_value(line)
                elif qualifier.startswith('/product'):
                    product = qualifier_value(line)
                    if line.count('"') == 1:
                        # Only the opening quote is on this line: the product goes on
                        # over the next lines, up to the one holding the closing quote.
                        line = lines.pop()
                        product += line.strip().replace('"', '')
                        while line.count('"') != 1:
                            line = lines.pop()
                            product += line.strip().replace('"', '')
                elif qualifier.startswith("/pseudo") and not pseudo:
                    # if it's a pseudogene, we're not keeping it.
                    usefulInfo = False
                elif qualifier.startswith("/transl_except"):
                    # that's probably a 'stop' codon read as a selenocysteine.
                    usefulInfo = False
            line = lines.pop()

        # End of the feature table: the last feature is still to be stored.
        if usefulInfo:
            create_gene(org, contig, geneCounter, rnaCounter, locus_tag, dbxref,
                        start, end, strand, objType, len(contig.genes),
                        gene_name, product, genetic_code, protein_id)
            if objType == "CDS":
                geneCounter += 1
            else:
                rnaCounter += 1

        # --- sequence of the record ---
        if getSeq:
            line = lines.pop()  # first sequence line.
            chunks = []
            while not line.startswith('//'):
                # The first 10 columns hold the position counter; the bases follow,
                # written in space-separated groups.
                chunks.append(line[10:].replace(" ", "").strip().upper())
                line = lines.pop()
            sequence = "".join(chunks)
            # get each gene's sequence.
            # NOTE(review): the RNAs of the contig are not given a sequence here,
            # while read_org_gff does it. Confirm whether this is intended.
            for gene in contig.genes:
                gene.add_dna(get_dna_sequence(sequence, gene))
    return org, True  # There are always fasta sequences in a gbff
def read_org_gff(organism, gff_file_path, circular_contigs, getSeq, pseudo=False):
    """
    Read a gff file and fill an Organism with the contigs, genes and RNAs it describes.

    Contigs are created from the '##sequence-region' pragmas and from the first column
    of the feature lines. 'CDS' features become Gene objects and features whose type
    contains 'RNA' become RNA objects. When the file ends with a '##FASTA' section and
    getSeq is true, the DNA of every gene and RNA is extracted from it.

    :param organism: name given to the Organism that is created.
    :param gff_file_path: path of the gff file, opened through read_compressed_or_not().
    :param circular_contigs: collection of the identifiers of the circular contigs.
    :param getSeq: if true, the '##FASTA' section is read; if false, parsing stops there.
    :param pseudo: if true, CDS flagged as pseudogenes are kept; otherwise they are skipped.
    :return: the filled Organism, and whether sequences were read from the file.
    :rtype: tuple
    """
    # Column indexes of a gff line. The unnamed columns (source, score, frame) are unused.
    (GFF_seqname, _, GFF_type, GFF_start, GFF_end, _, GFF_strand, _,
     GFF_attribute) = range(0, 9)

    def getGffAttributes(gff_fields):
        """
        Parses the gff attribute's line and outputs the attributes in a dict structure.

        :param gff_fields: a gff line stored as a list. Each element of the list is a column of the gff.
        :type list:
        :return: attributes, indexed by their UPPER-CASED key.
        :rtype: dict
        """
        attributes = {}
        for att in gff_fields[GFF_attribute].strip().split(';'):
            if len(att) == 0:
                continue
            try:
                (key, value) = att.strip().split('=')
            except ValueError:
                # we assume that it is a strange, but useless field for our analysis
                continue
            attributes[key.upper()] = value
        return attributes

    def getIDAttribute(attributes):
        """
        Gets the ID of the element from which the provided attributes were extracted.
        Logs an error and exits if no ID is found.

        :param attribute:
        :type dict:
        :return: ElementID:
        :rtype: string
        """
        ElementID = attributes.get("ID")
        if not ElementID:
            logging.getLogger().error(
                "Each CDS type of the gff files must own a unique ID attribute. Not the case for file: "
                + gff_file_path)
            exit(1)
        return ElementID

    hasFasta = False
    fastaLines = []
    org = Organism(organism)
    # Stays None until a '##sequence-region' pragma or a feature line names a contig.
    contig = None
    geneCounter = 0
    rnaCounter = 0
    with read_compressed_or_not(gff_file_path) as gff_file:
        for line in gff_file:
            if hasFasta:
                # Everything that follows '##FASTA' belongs to the sequences.
                fastaLines.append(line)
                continue
            if line.startswith('##'):
                if line.startswith('FASTA', 2, 7):
                    if not getSeq:
                        # if getting the sequences is useless...
                        break
                    hasFasta = True
                elif line.startswith('sequence-region', 2, 17):
                    fields = [el.strip() for el in line.split()]
                    contig = org.getOrAddContig(fields[1],
                                                fields[1] in circular_contigs)
                continue
            if line.startswith('#') or not line.strip():
                # '#!' refseq lines (versioning of softs, assemblies and annotations),
                # plain comments and blank lines hold no feature.
                continue

            gff_fields = [el.strip() for el in line.split('\t')]
            attributes = getGffAttributes(gff_fields)
            seqname = gff_fields[GFF_seqname]
            featureType = gff_fields[GFF_type]

            if featureType == 'region':
                if seqname in circular_contigs:
                    if contig is None:
                        contig = org.getOrAddContig(seqname, True)
                    contig.is_circular = True
            elif featureType == 'CDS' or "RNA" in featureType:
                # if there is a 'PROTEIN_ID' attribute, it's where the ncbi stores the
                # actual gene ids, so we use that. Otherwise we get the one under the
                # 'ID' field, which must exist (otherwise not a gff3 compliant file).
                geneID = attributes.get("PROTEIN_ID")
                if geneID is None:
                    geneID = getIDAttribute(attributes)

                name = attributes.get('NAME', attributes.get('GENE', ""))
                # The keys were upper-cased by getGffAttributes, so the flags have to be
                # looked up in upper case as well to ever be found.
                pseudogene = "PSEUDO" in attributes or "PSEUDOGENE" in attributes
                product = attributes.get('PRODUCT', "")
                genetic_code = attributes.get("TRANSL_TABLE", "11")

                if contig is None or contig.name != seqname:
                    contig = org.getOrAddContig(seqname)  # get the current contig

                if featureType == "CDS" and (not pseudogene or pseudo):
                    gene = Gene(org.name + "_CDS_" + str(geneCounter).zfill(4))
                    # here contig is filled in order, so position is the number of
                    # genes already stored in the contig.
                    gene.fill_annotations(start=int(gff_fields[GFF_start]),
                                          stop=int(gff_fields[GFF_end]),
                                          strand=gff_fields[GFF_strand],
                                          geneType=featureType,
                                          position=len(contig.genes),
                                          name=name,
                                          product=product,
                                          genetic_code=genetic_code,
                                          local_identifier=geneID)
                    gene.fill_parents(org, contig)
                    contig.addGene(gene)
                    geneCounter += 1
                elif "RNA" in featureType:
                    # RNAs have their own counter, hence their own '_RNA_' prefix:
                    # with '_CDS_' their identifiers would duplicate those of the genes.
                    rna = RNA(org.name + "_RNA_" + str(rnaCounter).zfill(4))
                    rna.fill_annotations(start=int(gff_fields[GFF_start]),
                                         stop=int(gff_fields[GFF_end]),
                                         strand=gff_fields[GFF_strand],
                                         geneType=featureType,
                                         name=name,
                                         product=product,
                                         local_identifier=geneID)
                    rna.fill_parents(org, contig)
                    contig.addRNA(rna)
                    rnaCounter += 1

    ### GET THE FASTA SEQUENCES OF THE GENES
    if hasFasta and fastaLines:
        contigSequences = read_fasta(org, "".join(fastaLines).split('\n'))
        for contig in org.contigs:
            for gene in contig.genes:
                gene.add_dna(
                    get_dna_sequence(contigSequences[contig.name], gene))
            for rna in contig.RNAs:
                rna.add_dna(get_dna_sequence(contigSequences[contig.name], rna))
    return org, hasFasta