Exemplo n.º 1
0
def test_write_header_gene_no_name():
    """
    Check the header written by ``ffunc.write_header`` for a gene with many unknown fields.

    The lstinfo line describes a CDS whose gene name, EC number and extra information
    are all 'NA' (only the product, 'hypothetical protein', is filled). The expected
    header is '>' + gene identifier, followed by 369 (which equals end - start + 1 for
    the 4632..5000 coordinates used here), then the gene name and the remaining
    '|'-separated information, ending with a newline.
    """
    lst_entry = (
        "4632\t5000\tC\tCDS\ttest.0417.00002.0002b_00011\tNA\t"
        "| hypothetical protein | NA | NA"
    )
    expected = ">test.0417.00002.0002b_00011 369 NA | hypothetical protein | NA | NA\n"
    # In-memory buffer standing in for the output file; closed when leaving the block.
    with StringIO() as buffer:
        ffunc.write_header(lst_entry, buffer)
        written = buffer.getvalue()
    assert written == expected
Exemplo n.º 2
0
def test_write_header_gene():
    """
    Check the header written by ``ffunc.write_header`` for a fully described gene.

    The lstinfo line provides start, end, strand, type, gene identifier, gene name
    (yiaD), product, EC number and extra information. The expected header is
    '>' + gene identifier, followed by 1653 (which equals end - start + 1 for the
    4416..6068 coordinates used here), then the gene name and the '|'-separated
    product, EC number and extra information, ending with a newline.
    """
    lst_entry = (
        "4416\t6068\tD\tCDS\ttest.0417.00002.0001i_00005\tyiaD\t"
        "| putative lipoprotein YiaD | 6.3.2.- "
        "| similar to AA sequence:UniProtKB:P37665"
    )
    expected = (
        ">test.0417.00002.0001i_00005 1653 yiaD "
        "| putative lipoprotein YiaD | 6.3.2.- "
        "| similar to AA sequence:UniProtKB:P37665\n"
    )
    # In-memory buffer standing in for the output file; closed when leaving the block.
    with StringIO() as buffer:
        ffunc.write_header(lst_entry, buffer)
        written = buffer.getvalue()
    assert written == expected
Exemplo n.º 3
0
def create_prt(faaseq, lstfile, prtseq):
    """
    Generate .prt file, from sequences in .faa, but changing the headers
    using information in .lst

    **Note:** works if proteins are in increasing order (of number after "_" in their name)
    in faa and tbl (hence lst) files.

    If a header is not in the right format, or a protein exists in the faa file but not
    in lstfile (including when the end of lstfile is reached before finding it),
    conversion is stopped, an error message is logged with the module-level ``logger``
    and False is returned. This function does not delete the partially written prt
    file itself: the caller has to handle it.

    Parameters
    ----------
    faaseq : str
        faa file output of prokka
    lstfile : str
        lstinfo converted from prokka tab file
    prtseq : str
        output file where converted proteins must be saved

    Returns
    -------
    bool :
        True if conversion went well, False otherwise
    """
    with open(faaseq) as faa, open(lstfile) as lst, open(prtseq, "w") as prt:
        for line in faa:
            # not header: inside sequence, copy it to the .prt file
            if not line.startswith(">"):
                prt.write(line)
                continue
            # header line: the gene ID is the number after the last "_" of the first word
            try:
                gen_id = int(line.split()[0].split("_")[-1])
            except ValueError:
                logger.error(
                    f"Unknown header format {line.strip()} in {faaseq}. "
                    f"Gene ID is not a number.")
                return False
            gen_id_lst = 0
            # get line of lst corresponding to the gene ID
            lstline = ""
            while gen_id > gen_id_lst:
                lstline = lst.readline().strip()
                if not lstline:
                    # No more usable line in lst (readline returns "" at end of file):
                    # the protein is missing. Use a sentinel that can never equal gen_id
                    # instead of trying to parse an empty line (which raised IndexError).
                    gen_id_lst = None
                    break
                id_lst = lstline.split("\t")[4].split("_")[-1]
                # don't cast to int if info for a crispr
                if id_lst.isdigit():
                    gen_id_lst = int(id_lst)
            # check that gen_id is the same as the lst line
            if gen_id != gen_id_lst:
                logger.error(
                    f"Missing info for protein {line.strip()} (from {faaseq}) "
                    f"in {lstfile}. If it is actually present "
                    "in the lst file, check that proteins are ordered by increasing "
                    "number in both lst and faa files.")
                return False
            general.write_header(lstline, prt)
    return True
Exemplo n.º 4
0
def create_gen(ffnseq, lstfile, genseq):
    """
    Generate .gen file, from sequences contained in .ffn, but changing the
    headers using the information in .lst

    Genes whose ffn header does not end with a numeric ID (after the last "_") are
    ignored: a message is logged and their sequence is not copied to the .gen file.
    Lines of the lst file whose ID is not numeric (e.g. CRISPR entries) are skipped
    while looking for a gene. If a gene of the ffn file cannot be found in the lst
    file (including when the end of the lst file is reached), an error is logged with
    the module-level ``logger`` and False is returned.

    Parameters
    ----------
    ffnseq : str
        .ffn file generated by prokka
    lstfile : str
        lstfile converted from prokka tbl file
    genseq : str
        output file, to write in Genes directory

    Returns
    -------
    bool :
        True if conversion went well, False otherwise
    """
    write = True  # Write next sequence
    with open(ffnseq) as ffn, open(lstfile) as lst, open(genseq, "w") as gen:
        for line_ffn in ffn:
            # Sequence line: copy it only if the gene it belongs to was kept
            if not line_ffn.startswith(">"):
                if write:
                    gen.write(line_ffn)
                continue
            # Try to get gene ID. If does not work, ignore this gene (it may be a
            # CRISPR, and we ignore them)
            test_gen_id = line_ffn.split()[0].split("_")[-1]
            if not test_gen_id.isdigit():
                logger.log(
                    utils.detail_lvl(),
                    f"Unknown header format for {line_ffn.strip()}. "
                    "This gene will be ignored in .gen output file.")
                write = False
                continue
            write = True
            gen_id = int(test_gen_id)
            # In lst, find the same gene ID as in ffn (some gene IDs in lst can be absent
            # from ffn). As they are ordered by increasing number, read lst lines until
            # an ID >= the ffn ID is found. gen_id_lst stays None if lst is exhausted.
            gen_id_lst = None
            while True:
                lstline = lst.readline().strip()
                if not lstline:
                    # End of lst (readline returns ""): gene is missing
                    gen_id_lst = None
                    break
                id_lst = lstline.split("\t")[4].split("_")[-1]
                # Non numeric ID (crispr): not a gene, go to next lst line
                if not id_lst.isdigit():
                    continue
                gen_id_lst = int(id_lst)
                if gen_id_lst >= gen_id:
                    break
            # If gene ID of ffn not found, write error message and stop
            if gen_id != gen_id_lst:
                logger.error(
                    f"Missing info for gene {line_ffn.strip()} "
                    f"(from {ffnseq}) in {lstfile}. If it is actually present "
                    "in the lst file, check that genes are ordered by increasing "
                    "number in both lst and ffn files."
                )
                return False
            # Same gene ID found: write info in gene file
            general.write_header(lstline, gen)
    return True
Exemplo n.º 5
0
def create_prt(prot_file, res_prot_file, res_lst_file):
    """
    Write the .prt file: protein sequences of ``prot_file`` with gembase headers.

    Sequence lines are copied unchanged. Each header line ('>' line) is replaced by a
    header built by ``gfunc.write_header`` from the next line of ``res_lst_file``
    (one lst line is consumed per protein header). Problems are reported through the
    module-level ``logger`` and make the function return False:

    - no lst line left for a protein header,
    - lst line which does not have exactly 7 tab-separated fields,
    - start or end (first and second fields) which is not an integer,
    - gene size (end - start + 1) which is not a multiple of 3,
    - lst lines remaining once all proteins have been read.

    Parameters
    ----------
    prot_file : str
        .faa file generated by prodigal
    res_prot_file : str
        output file, to write in Proteins directory
    res_lst_file : str
        .lst file to get all gene names in gembase format instead of re-generating them

    Returns
    -------
    bool :
        True if conversion went well, False otherwise
    """
    with open(prot_file, "r") as faa, open(res_prot_file, "w") as r_prt,\
         open(res_lst_file, "r") as r_lst:
        for prot_line in faa:
            # Sequence line: copy as is
            if not prot_line.startswith(">"):
                r_prt.write(prot_line)
                continue

            # Header line: its information is in the next lst line
            lst_entry = r_lst.readline().strip()
            if not lst_entry:
                # lst already exhausted: this protein has no lst information
                logger.error(
                    "No more protein in lst file. We cannot get information on this "
                    "protein ({})! Check that you do not have more proteins than genes "
                    "in prodigal results".format(prot_line.strip()))
                return False

            fields = lst_entry.split("\t")
            if len(fields) != 7:
                logger.error(
                    "Problem in format of lstline ({})".format(lst_entry))
                return False
            # start, end are the 2 first columns, gembase name is the fifth one
            start, end, gem_name = fields[0], fields[1], fields[4]

            try:
                size_gen = int(end) - int(start) + 1
            except ValueError:
                logger.error(
                    "Start and/or end of protein {} position is not a number (start "
                    "= {}; end = {})".format(gem_name, start, end))
                return False

            # A protein has one amino-acid per 3 nucleotides: the gene size must
            # give a whole number once divided by 3
            if not (size_gen / 3).is_integer():
                logger.error(
                    "Gene {} has a number of nucleotides ({}) that is not divisible "
                    "by 3.".format(gem_name, size_gen))
                return False

            gfunc.write_header(lst_entry, r_prt)

        # All proteins read: there must not be any line left in lst
        leftover = r_lst.readline().strip()
        if leftover:
            gem_name = leftover.split("\t")[4]
            logger.error(
                "Protein {} is in .lst file but its sequence is not in the protein "
                "file generated by prodigal.".format(gem_name))
            return False
    return True
Exemplo n.º 6
0
def create_gene_lst(contigs, gen_file, res_gen_file, res_lst_file, gpath,
                    name):
    """
    Generate the .gen file (sequences of the prodigal .ffn file with new headers) and,
    at the same time, the .lst file built from the information of the .ffn headers.

    Each .ffn header must contain exactly 5 fields separated by '#': gene name, start,
    end, strand and information. The contig name is the gene name without its last
    '_'-separated part. A gene is only written when the next header (or the end of the
    file) is reached, once its location is known: 'b' for the first and the last gene
    of a contig, 'i' for the other ones. Each gene is written with ``gfunc.write_gene``
    (to the .lst file), whose return value is given to ``gfunc.write_header`` (to the
    .gen file), followed by the gene sequence.

    A header with a different number of '#'-separated fields, or with a strand which
    is not an integer, raises a ValueError which is not caught here. A contig name
    absent from ``contigs`` is reported with the module-level ``logger``.

    Parameters
    ----------
    contigs : dict
        {original_contig_name: gembase_contig_name}
    gen_file : str
        .ffn file generated by prodigal
    res_gen_file : str
        generated .gen file, to write in Genes directory
    res_lst_file : str
        generated .lst file to write in LSTINFO directory
    gpath : str
        path to the genome given to prodigal. Only used for error message
    name : str
        gembase name of the genome to format

    Returns
    -------
    bool :
        True if conversion went well, False if a contig name is not in ``contigs``
    """
    # Lines of the sequence of the pending gene (gene read but not written yet)
    seq_chunks = []
    # Number of genes read so far: unique over the genome, not reset for each contig
    gene_count = 0
    # Information on the pending gene
    last_contig = ""
    last_contig_num = 0
    last_start = ""
    last_end = ""
    last_strand = ""
    last_info = ""
    last_loc = "b"
    # Gembase contig number of the contig currently read
    cur_contig_num = 0

    with open(gen_file, "r") as ffn, open(res_gen_file, "w") as r_gen,\
         open(res_lst_file, "w") as r_lst:

        def flush_pending(location):
            """Write the pending gene: line in .lst, then header and sequence in .gen."""
            lst_entry = gfunc.write_gene("CDS", gene_count, "NA", "NA",
                                         location, name, last_contig_num,
                                         "NA", last_info, "NA",
                                         last_strand, last_start,
                                         last_end, r_lst)
            gfunc.write_header(lst_entry, r_gen)
            r_gen.write("".join(seq_chunks))

        for raw_line in ffn:
            # Sequence line: keep it for the pending gene
            if not raw_line.startswith(">"):
                seq_chunks.append(raw_line)
                continue

            # Header line: <contig name>_<protein number> # start # end # strand # info
            header = raw_line.strip().split(">")[-1]
            gname, start, end, strand, info = header.split("#")
            contig_name = gname.strip().rsplit("_", 1)[0]

            if contig_name == last_contig:
                # Same contig as pending gene: for now, current gene is inside it
                loc = "i"
            else:
                # New contig: it must be known, to get its gembase contig number
                if contig_name not in contigs:
                    logger.error(
                        f"'{contig_name}' found in {gen_file} does not exist in "
                        f"{gpath}.")
                    return False
                cur_contig_num = contigs[contig_name].split(".")[-1]
                # Pending gene was the last of its contig, current one is the first
                last_loc = "b"
                loc = "b"

            # Write the pending gene, if there is one
            if last_start != "":
                flush_pending(last_loc)

            # Prodigal strands are 1/-1, converted here to D/C
            strand = "D" if int(strand) == 1 else "C"

            # Current gene becomes the pending one
            gene_count += 1
            seq_chunks = []
            last_contig_num = cur_contig_num
            last_contig = contig_name
            last_start, last_end = start, end
            last_strand = strand
            last_loc = loc
            last_info = info

        # Last gene of the genome is at the border of its contig. Nothing to write
        # if no gene was found.
        if last_start != "":
            flush_pending("b")
    return True