Exemplo n.º 1
0
def getGeneSequencesFromFastas(pangenome, fasta_file):
    fastaDict = {}
    for line in read_compressed_or_not(fasta_file):
        elements = [el.strip() for el in line.split("\t")]
        if len(elements) <= 1:
            logging.getLogger().error(
                "No tabulation separator found in organisms file")
            exit(1)
        org = pangenome.addOrganism(elements[0])
        with read_compressed_or_not(elements[1]) as currFastaFile:
            fastaDict[org] = read_fasta(org, currFastaFile)
    if not set(pangenome.organisms) <= set(fastaDict.keys()):
        missing = len(pangenome.organisms) - len(
            set(pangenome.organisms) & set(fastaDict.keys()))
        raise Exception(
            f"Not all of your pangenome's organisms are present within the provided fasta file. {missing} are missing (out of {len(pangenome.organisms)})."
        )

    for org in pangenome.organisms:
        try:
            for contig in org.contigs:
                for gene in contig.genes:
                    gene.add_dna(
                        get_dna_sequence(fastaDict[org][contig.name], gene))
                for rna in contig.RNAs:
                    rna.add_dna(
                        get_dna_sequence(fastaDict[org][contig.name], gene))
        except KeyError:
            msg = f"Fasta file for organism {org.name} did not have the contig {contig.name} that was read from the annotation file. "
            msg += f"The provided contigs in the fasta were : { ', '.join([contig for contig in fastaDict[org].keys()])}."
            raise KeyError(msg)
    pangenome.status["geneSequences"] = "Computed"
Exemplo n.º 2
0
def read_org_gbff(organism,
                  gbff_file_path,
                  circular_contigs,
                  getSeq,
                  pseudo=False):
    """ reads a gbff file and fills Organism, Contig and Genes objects based on information contained in this file """
    org = Organism(organism)

    logging.getLogger().debug(
        "Extracting genes informations from the given gbff")
    # revert the order of the file, to read the first line first.
    lines = read_compressed_or_not(gbff_file_path).readlines()[::-1]
    geneCounter = 0
    rnaCounter = 0
    while len(lines) != 0:
        line = lines.pop()
        # beginning of contig
        if line.startswith('LOCUS'):
            is_circ = False
            if "CIRCULAR" in line.upper(
            ):  #this line contains linear/circular word telling if the dna sequence is circularized or not
                is_circ = True
            contigLocusID = line.split(
            )[1]  #If contigID is not specified in VERSION afterwards like with Prokka, in that case we use the one in LOCUS.
            setContig = False
            while not line.startswith('FEATURES'):
                if line.startswith('VERSION'):
                    contigID = line[12:].strip()
                    if contigID != "":
                        if contigID in circular_contigs:
                            is_circ = True
                        contig = org.getOrAddContig(contigID, is_circ)
                        setContig = True
                line = lines.pop()
        if not setContig:  #if no contig ids were filled after VERSION, we use what was found in LOCUS for the contig ID. Should be unique in a dataset, but if there's an update the contig ID might still be the same even though it should not(?)
            if contigLocusID in circular_contigs:
                is_circ = True
            contig = org.getOrAddContig(contigLocusID, is_circ)
        # start of the feature object.
        dbxref = set()
        gene_name = ""
        product = ""
        locus_tag = ""
        objType = ""
        protein_id = ""
        genetic_code = ""
        usefulInfo = False
        start = None
        end = None
        strand = None
        line = lines.pop()
        while not line.startswith("ORIGIN"):
            currType = line[5:21].strip()
            if currType != "":
                if usefulInfo:
                    create_gene(org, contig, geneCounter, rnaCounter,
                                locus_tag, dbxref, start, end, strand, objType,
                                len(contig.genes), gene_name, product,
                                genetic_code, protein_id)
                    if objType == "CDS":
                        geneCounter += 1
                    else:
                        rnaCounter += 1
                usefulInfo = False
                objType = currType
                if objType in ['CDS', 'rRNA', 'tRNA']:
                    dbxref = set()
                    gene_name = ""
                    try:
                        if not 'join' in line[21:]:
                            usefulInfo = True
                            if line[21:].startswith('complement('):
                                strand = "-"
                                start, end = line[32:].replace(')',
                                                               '').split("..")
                            else:
                                strand = "+"
                                start, end = line[21:].strip().split('..')
                            if '>' in start or '<' in start or '>' in end or '<' in end:
                                usefulInfo = False
                    except ValueError:
                        pass
                        #don't know what to do with that, ignoring for now.
                        #there is a protein with a frameshift mecanism.
            elif usefulInfo:  # current info goes to current objtype, if it's useful.
                if line[21:].startswith("/db_xref"):
                    dbxref.add(line.split("=")[1].replace('"', '').strip())
                elif line[21:].startswith("/locus_tag"):
                    locus_tag = line.split("=")[1].replace('"', '').strip()
                elif line[21:].startswith("/protein_id"):
                    protein_id = line.split("=")[1].replace('"', '').strip()
                elif line[21:].startswith('/gene'):  #gene name
                    gene_name = line.split("=")[1].replace('"', '').strip()
                elif line[21:].startswith('/transl_table'):
                    genetic_code = line.split("=")[1].replace('"', '').strip()
                elif line[21:].startswith(
                        '/product'
                ):  #need to loop as it can be more than one line long
                    product = line.split('=')[1].replace('"', '').strip()
                    if line.count(
                            '"'
                    ) == 1:  #then the product line is on multiple lines
                        line = lines.pop()
                        product += line.strip().replace('"', '')
                        while line.count('"') != 1:
                            line = lines.pop()
                            product += line.strip().replace('"', '')
                #if it's a pseudogene, we're not keeping it.
                elif line[21:].startswith("/pseudo") and not pseudo:
                    usefulInfo = False
                #that's probably a 'stop' codon into selenocystein.
                elif line[21:].startswith("/transl_except"):
                    usefulInfo = False
            line = lines.pop()
            #end of contig
        if usefulInfo:  #saving the last element...
            create_gene(org, contig, geneCounter, rnaCounter,
                        locus_tag, dbxref, start, end, strand, objType,
                        len(contig.genes), gene_name, product, genetic_code,
                        protein_id)
            if objType == "CDS":
                geneCounter += 1
            else:
                rnaCounter += 1
        if getSeq:
            line = lines.pop()  #first sequence line.
            #if the seq was to be gotten, it would be here.
            sequence = ""
            while not line.startswith('//'):
                sequence += line[10:].replace(" ", "").strip().upper()
                line = lines.pop()
            #get each gene's sequence.
            for gene in contig.genes:
                gene.add_dna(get_dna_sequence(sequence, gene))

    return org, True  #There are always fasta sequences in a gbff
Exemplo n.º 3
0
def read_org_gff(organism,
                 gff_file_path,
                 circular_contigs,
                 getSeq,
                 pseudo=False):
    (GFF_seqname, _,
     GFF_type, GFF_start, GFF_end, _, GFF_strand, _, GFF_attribute) = range(
         0, 9)  #missing values : source, score, frame. They are unused.

    def getGffAttributes(gff_fields):
        """
            Parses the gff attribute's line and outputs the attributes in a dict structure.
            :param gff_fields: a gff line stored as a list. Each element of the list is a column of the gff.
            :type list:
            :return: attributes:
            :rtype: dict
        """
        attributes_field = [
            f for f in gff_fields[GFF_attribute].strip().split(';')
            if len(f) > 0
        ]
        attributes = {}
        for att in attributes_field:
            try:
                (key, value) = att.strip().split('=')
                attributes[key.upper()] = value
            except ValueError:
                pass  #we assume that it is a strange, but useless field for our analysis
        return attributes

    def getIDAttribute(attributes):
        """
            Gets the ID of the element from which the provided attributes were extracted. Raises an error if no ID is found.
            :param attribute:
            :type dict:
            :return: ElementID:
            :rtype: string
        """
        ElementID = attributes.get("ID")
        if not ElementID:
            logging.getLogger().error(
                "Each CDS type of the gff files must own a unique ID attribute. Not the case for file: "
                + gff_file_path)
            exit(1)
        return ElementID

    hasFasta = False
    fastaString = ""
    org = Organism(organism)
    geneCounter = 0
    rnaCounter = 0
    with read_compressed_or_not(gff_file_path) as gff_file:
        for line in gff_file:
            if hasFasta:
                fastaString += line
                continue
            elif line.startswith('##', 0, 2):
                if line.startswith('FASTA', 2, 7):
                    if not getSeq:  #if getting the sequences is useless...
                        break
                    hasFasta = True
                elif line.startswith('sequence-region', 2, 17):
                    fields = [el.strip() for el in line.split()]
                    contig = org.getOrAddContig(
                        fields[1],
                        True if fields[1] in circular_contigs else False)
                continue
            elif line.startswith(
                    '#!', 0, 2
            ):  ## special refseq comment lines for versionning softs, assemblies and annotations.
                continue
            gff_fields = [el.strip() for el in line.split('\t')]
            attributes = getGffAttributes(gff_fields)
            pseudogene = False
            if gff_fields[GFF_type] == 'region':
                if gff_fields[GFF_seqname] in circular_contigs:
                    contig.is_circular = True
            elif gff_fields[GFF_type] == 'CDS' or "RNA" in gff_fields[GFF_type]:
                geneID = attributes.get(
                    "PROTEIN_ID"
                )  #if there is a 'PROTEIN_ID' attribute, it's where the ncbi stores the actual gene ids, so we use that.
                if geneID is None:  #if its not found, we get the one under the 'ID' field which must exist (otherwise not a gff3 compliant file)
                    geneID = getIDAttribute(attributes)
                try:
                    name = attributes.pop('NAME')
                except KeyError:
                    try:
                        name = attributes.pop('GENE')
                    except KeyError:
                        name = ""
                if "pseudo" in attributes or "pseudogene" in attributes:
                    pseudogene = True
                try:
                    product = attributes.pop('PRODUCT')
                except KeyError:
                    product = ""

                try:
                    genetic_code = attributes.pop("TRANSL_TABLE")
                except KeyError:
                    genetic_code = "11"
                if contig.name != gff_fields[GFF_seqname]:
                    contig = org.getOrAddContig(
                        gff_fields[GFF_seqname])  #get the current contig
                if gff_fields[GFF_type] == "CDS" and (not pseudogene or
                                                      (pseudogene and pseudo)):
                    gene = Gene(org.name + "_CDS_" + str(geneCounter).zfill(4))

                    #here contig is filled in order, so position is the number of genes already stored in the contig.
                    gene.fill_annotations(start=int(gff_fields[GFF_start]),
                                          stop=int(gff_fields[GFF_end]),
                                          strand=gff_fields[GFF_strand],
                                          geneType=gff_fields[GFF_type],
                                          position=len(contig.genes),
                                          name=name,
                                          product=product,
                                          genetic_code=genetic_code,
                                          local_identifier=geneID)
                    gene.fill_parents(org, contig)
                    contig.addGene(gene)
                    geneCounter += 1
                elif "RNA" in gff_fields[GFF_type]:
                    rna = RNA(org.name + "_CDS_" + str(rnaCounter).zfill(4))
                    rna.fill_annotations(start=int(gff_fields[GFF_start]),
                                         stop=int(gff_fields[GFF_end]),
                                         strand=gff_fields[GFF_strand],
                                         geneType=gff_fields[GFF_type],
                                         name=name,
                                         product=product,
                                         local_identifier=geneID)
                    rna.fill_parents(org, contig)
                    contig.addRNA(rna)
                    rnaCounter += 1
    ### GET THE FASTA SEQUENCES OF THE GENES
    if hasFasta and fastaString != "":
        contigSequences = read_fasta(org, fastaString.split('\n'))
        for contig in org.contigs:
            for gene in contig.genes:
                gene.add_dna(
                    get_dna_sequence(contigSequences[contig.name], gene))
            for rna in contig.RNAs:
                rna.add_dna(get_dna_sequence(contigSequences[contig.name],
                                             rna))
    return org, hasFasta