Пример #1
0
def _gbk_header_lines(locus, sample_name, dna_length, source_span, date):
    """Return the header lines (LOCUS through the ``source`` feature) of one record.

    Args:
        locus: contig name printed in the LOCUS line.
        sample_name: text used for DEFINITION, DBLINK, SOURCE, ORGANISM and
            the ``/organism`` qualifier.
        dna_length: contig length printed in the LOCUS line.
        source_span: location string of the ``source`` feature.
        date: date text printed at the end of the LOCUS line.

    Returns:
        A list of strings without trailing newlines.
    """
    # NOTE(review): wrap(prefix, indent, width, text) is defined outside this
    # block; presumably it pads `prefix` to `indent` columns and wraps `text`
    # at `width` -- confirm against its definition.
    return [
        "LOCUS       %-18s  %4d bp   DNA           BCT      %-11s"
        % (locus, dna_length, date),
        wrap("DEFINITION  ", 12, 74, sample_name),
        wrap("ACCESSION   ", 12, 74, '.'),
        wrap("VERSION     ", 12, 74, '.' + spaces(10) + "GI:."),
        wrap("DBLINK      ", 12, 74, sample_name),
        wrap("KEYWORDS    ", 12, 74, '.'),
        # SOURCE carries the sample name, like ORGANISM below; it must not
        # repeat the KEYWORDS placeholder.
        wrap("SOURCE    ", 12, 74, sample_name),
        wrap("  ORGANISM  ", 12, 74, sample_name),
        wrap("", 12, 74, "Metagenome"),
        wrap("REFERENCE   ", 12, 74, "1  (bases 1 to XXXXX)"),
        wrap("  AUTHORS   ", 12, 74, "YYYYYY,X."),
        wrap("  CONSRTM   ", 12, 74, "XXXXX"),
        wrap("  TITLE     ", 12, 74, "XXXXX"),
        wrap("  JOURNAL   ", 12, 74, "XXXXX"),
        wrap("   PUBMED   ", 12, 74, "XXXXX"),
        wrap("  REMARK   ", 12, 74, "XXXXX"),
        wrap(
            "COMMENT     ", 12, 74,
            "PROVISIONAL REFSEQ: This record has not yet been subject to final NCBI review   COMPLETENESS: XXXXX"
        ),
        wrap("FEATURES ", 21, 74, "Location/Qualifiers"),
        wrap("     source", 21, 74, source_span),
        # /organism names the organism; the coordinate span belongs only on
        # the `source` location line above.
        wrap("", 21, 74, "/organism=\"" + sample_name + "\""),
        wrap("", 21, 74, "/strain=\"1\""),
        wrap("", 21, 74, "/chromosome=\"1\""),
    ]


def _gbk_feature_lines(attrib, protein_seq):
    """Return the ``gene`` and ``CDS`` feature lines for one annotation.

    Args:
        attrib: mapping that may hold 'start', 'end', 'strand', 'locus_tag'
            and 'product'.
        protein_seq: translation text appended to ``/translation=``.

    Returns:
        A list of strings without trailing newlines.
    """
    if 'start' in attrib and 'end' in attrib:
        location = str(attrib['start']) + ".." + str(attrib['end'])
    else:
        location = "0..0"
    if 'strand' in attrib and attrib['strand'] == '-':
        location = 'complement' + '(' + location + ')'

    if 'locus_tag' in attrib:
        locus_tag = "/locus_tag=" + "\"" + attrib['locus_tag'] + "\""
    else:
        # An empty qualifier still needs the '=' separator.
        locus_tag = "/locus_tag=\"\""

    # NOTE(review): a present product is written as-is while the fallback is
    # quoted; presumably the caller stores it already quoted -- confirm.
    if 'product' in attrib:
        product = "/product=" + attrib['product']
    else:
        product = "/product=\"\""

    return [
        wrap("     gene", 21, 74, location),
        wrap("", 21, 74, locus_tag),
        wrap("     CDS", 21, 74, location),
        wrap("", 21, 74, product),
        wrap("", 21, 74, locus_tag),
        wrap("", 21, 74, "/codon_start=1"),
        wrap("", 21, 74, "/transl_table=11"),
        wrap("", 21, 74, "/translation=" + protein_seq),
    ]


def write_gbk_file(output_file_name, contig_dict, sample_name,
                   nucleotide_seq_dict, protein_seq_dict):
    """Write one GenBank-style record per contig to ``output_file_name``.

    The text is written to ``output_file_name + ".tmp"`` first and renamed to
    the final name only after every record was written, so a failed run never
    leaves a truncated file under the final name.

    Args:
        output_file_name: path of the file to create.
        contig_dict: maps a contig name to an iterable of attribute mappings;
            each must contain 'id' and may contain 'start', 'end', 'strand',
            'locus_tag' and 'product'.
        sample_name: text used for the descriptive header fields.
        nucleotide_seq_dict: maps a contig name to its nucleotide sequence.
        protein_seq_dict: maps an attribute 'id' to its protein sequence.

    Raises:
        KeyError: if an attribute mapping has no 'id' entry.
    """
    date = genbankDate()
    output_file_name_tmp = output_file_name + ".tmp"

    # The context manager closes the handle even when a record fails.
    with open(output_file_name_tmp, 'w') as outputfile:
        for key in contig_dict:
            # Resolve the sequence once per contig, before any feature, so
            # the ORIGIN section can never reuse another contig's sequence
            # and a contig without features still yields a complete record.
            try:
                dna_seq = nucleotide_seq_dict[key]
                dna_seq_formatted = format_sequence_origin(dna_seq)
                dna_length = len(dna_seq)
                source_span = "1.." + str(dna_length)
            except Exception:
                # Deliberate best effort: a missing or unformattable sequence
                # yields an empty ORIGIN instead of aborting the export.
                dna_seq_formatted = ""
                dna_length = 0
                source_span = "0..0"

            lines = _gbk_header_lines(key, sample_name, dna_length,
                                      source_span, date)

            for attrib in contig_dict[key]:
                feature_id = attrib['id']
                try:
                    protein_seq = protein_seq_dict[feature_id]
                except (KeyError, TypeError):
                    # No translation known for this feature.
                    protein_seq = ""
                lines.extend(_gbk_feature_lines(attrib, protein_seq))

            lines.append(wrap("ORIGIN", 21, 74, ""))
            lines.append(dna_seq_formatted)
            lines.append("//")
            outputfile.write("\n".join(lines) + "\n")

    rename(output_file_name_tmp, output_file_name)
def _gbkHeaderLines(locus, sample_name, dna_length, sourceStr, date):
    """Return the header lines (LOCUS through the ``source`` feature) of one record.

    Args:
        locus: contig name printed in the LOCUS line.
        sample_name: text used for DEFINITION, DBLINK, SOURCE, ORGANISM and
            the ``/organism`` qualifier.
        dna_length: contig length printed in the LOCUS line.
        sourceStr: location string of the ``source`` feature.
        date: date text printed at the end of the LOCUS line.

    Returns:
        A list of strings without trailing newlines.
    """
    # NOTE(review): wrap(prefix, indent, width, text) is defined outside this
    # block; presumably it pads `prefix` to `indent` columns and wraps `text`
    # at `width` -- confirm against its definition.
    return [
        "LOCUS       %-18s  %4d bp   DNA           BCT      %-11s" % (locus, dna_length, date),
        wrap("DEFINITION  ", 12, 74, sample_name),
        wrap("ACCESSION   ", 12, 74, '.'),
        wrap("VERSION     ", 12, 74, '.' + " " * 10 + "GI:."),
        wrap("DBLINK      ", 12, 74, sample_name),
        wrap("KEYWORDS    ", 12, 74, '.'),
        # SOURCE carries the sample name, like ORGANISM below; it must not
        # repeat the KEYWORDS placeholder.
        wrap("SOURCE    ", 12, 74, sample_name),
        wrap("  ORGANISM  ", 12, 74, sample_name),
        wrap("", 12, 74, "Metagenome"),
        wrap("REFERENCE   ", 12, 74, "1  (bases 1 to XXXXX)"),
        wrap("  AUTHORS   ", 12, 74, "YYYYYY,X."),
        wrap("  CONSRTM   ", 12, 74, "XXXXX"),
        wrap("  TITLE     ", 12, 74, "XXXXX"),
        wrap("  JOURNAL   ", 12, 74, "XXXXX"),
        wrap("   PUBMED   ", 12, 74, "XXXXX"),
        wrap("  REMARK   ", 12, 74, "XXXXX"),
        wrap("COMMENT     ", 12, 74, "PROVISIONAL REFSEQ: This record has not yet been subject to final NCBI review   COMPLETENESS: XXXXX"),
        wrap("FEATURES ", 21, 74, "Location/Qualifiers"),
        wrap("     source", 21, 74, sourceStr),
        # /organism names the organism; the coordinate span belongs only on
        # the `source` location line above.
        wrap("", 21, 74, "/organism=\"" + sample_name + "\""),
        wrap("", 21, 74, "/strain=\"1\""),
        wrap("", 21, 74, "/chromosome=\"1\""),
    ]


def _gbkFeatureLines(attrib, protein_seq):
    """Return the ``gene`` and ``CDS`` feature lines for one annotation.

    Args:
        attrib: mapping that may hold 'start', 'end', 'strand', 'locus_tag'
            and 'product'.
        protein_seq: translation text appended to ``/translation=``.

    Returns:
        A list of strings without trailing newlines.
    """
    if 'start' in attrib and 'end' in attrib:
        geneLoc = str(attrib['start']) + ".." + str(attrib['end'])
    else:
        geneLoc = "0..0"
    if 'strand' in attrib and attrib['strand'] == '-':
        geneLoc = 'complement' + '(' + geneLoc + ')'

    if 'locus_tag' in attrib:
        locus_tag = "/locus_tag=" + "\"" + attrib['locus_tag'] + "\""
    else:
        # An empty qualifier still needs the '=' separator.
        locus_tag = "/locus_tag=\"\""

    # NOTE(review): a present product is written as-is while the fallback is
    # quoted; presumably the caller stores it already quoted -- confirm.
    if 'product' in attrib:
        product = "/product=" + attrib['product']
    else:
        product = "/product=\"\""

    return [
        wrap("     gene", 21, 74, geneLoc),
        wrap("", 21, 74, locus_tag),
        wrap("     CDS", 21, 74, geneLoc),
        wrap("", 21, 74, product),
        wrap("", 21, 74, locus_tag),
        wrap("", 21, 74, "/codon_start=1"),
        wrap("", 21, 74, "/transl_table=11"),
        wrap("", 21, 74, "/translation=" + protein_seq),
    ]


def writeGbkFile(output_file_name, contig_dict, sample_name, nucleotide_seq_dict, protein_seq_dict):
    """Create the genbank file from the gff, protein and nucleotide sequences.

    One record is written per contig. The text goes to
    ``output_file_name + ".tmp"`` first and is renamed to the final name only
    after every record was written, so a failed run never leaves a truncated
    file under the final name.

    Args:
        output_file_name: path of the file to create.
        contig_dict: maps a contig name to an iterable of attribute mappings;
            each must contain 'id' and may contain 'start', 'end', 'strand',
            'locus_tag' and 'product'.
        sample_name: text used for the descriptive header fields.
        nucleotide_seq_dict: maps a contig name to its nucleotide sequence.
        protein_seq_dict: maps an attribute 'id' to its protein sequence.

    Raises:
        KeyError: if an attribute mapping has no 'id' entry.
    """
    date = genbankDate()
    output_file_name_tmp = output_file_name + ".tmp"

    # The context manager closes the handle even when a record fails.
    with open(output_file_name_tmp, 'w') as outputfile:
        for key in contig_dict:
            # Resolve the sequence once per contig, outside the feature loop:
            # resetting it per feature would blank the ORIGIN section of any
            # contig that carries more than one feature.
            try:
                dna_seq = nucleotide_seq_dict[key]
                dna_seq_formatted = formatSequenceOrigin(dna_seq)
                dna_length = len(dna_seq)
                sourceStr = "1.." + str(dna_length)
            except Exception:
                # Deliberate best effort: a missing or unformattable sequence
                # yields an empty ORIGIN instead of aborting the export.
                dna_seq_formatted = ""
                dna_length = 0
                sourceStr = "0..0"

            lines = _gbkHeaderLines(key, sample_name, dna_length, sourceStr, date)

            for attrib in contig_dict[key]:
                feature_id = attrib['id']
                try:
                    protein_seq = protein_seq_dict[feature_id]
                except (KeyError, TypeError):
                    # No translation known for this feature.
                    protein_seq = ""
                lines.extend(_gbkFeatureLines(attrib, protein_seq))

            lines.append(wrap("ORIGIN", 21, 74, ""))
            lines.append(dna_seq_formatted)
            lines.append("//")
            outputfile.write("\n".join(lines) + "\n")

    rename(output_file_name_tmp, output_file_name)