Пример #1
0
def write_format(file):
    """Round-trip every record of a GenBank file and report mismatches.

    The file is opened twice and read in lockstep: one iterator parses each
    record with a ``RecordParser`` (the parsed record is converted back to
    text with ``str``), the other yields the records without a parser.  Each
    pair is passed to ``do_comparison``.  An ``AssertionError`` raised by the
    comparison is printed together with the record version and processing
    continues with the next record.

    Args:
        file: Path of the GenBank file.  It is opened through ``gzip`` when
            the path contains ``.gz``, otherwise with the builtin ``open``.
    """
    record_parser = GenBank.RecordParser(debug_level=2)

    print("Testing GenBank writing for %s..." % os.path.basename(file))
    # be able to handle gzipped files
    opener = gzip.open if '.gz' in file else open

    # The with-statement closes both handles even when parsing or the
    # comparison raises something other than AssertionError.
    with opener(file, "r") as cur_handle, \
            opener(file, "r") as compare_handle:
        iterator = GenBank.Iterator(cur_handle, record_parser)
        compare_iterator = GenBank.Iterator(compare_handle)

        while True:
            cur_record = next(iterator)
            compare_record = next(compare_iterator)

            # A None record from either iterator is treated as end of input.
            if cur_record is None or compare_record is None:
                break

            output_record = str(cur_record) + "\n"
            try:
                do_comparison(compare_record, output_record)
            except AssertionError as msg:
                # Only failing records are reported, to keep the output short.
                print("\tTesting for %s" % cur_record.version)
                print(msg)
Пример #2
0
def t_write_format():
    """Check that parsed GenBank records can be written back unchanged.

    For every name in ``write_format_files`` the file ``GenBank/<name>`` is
    opened twice and read in lockstep: once through a ``RecordParser`` and
    once without a parser.  The text form of each parsed record is compared
    with the unparsed record via ``do_comparison``; any exception raised by
    the comparison propagates to the caller.
    """
    record_parser = GenBank.RecordParser(debug_level=0)

    for filename in write_format_files:
        print("Testing GenBank writing for %s..." % os.path.basename(filename))
        path = os.path.join("GenBank", filename)

        # Both handles are closed even if do_comparison raises.
        with open(path, "r") as cur_handle, open(path, "r") as compare_handle:
            iterator = GenBank.Iterator(cur_handle, record_parser)
            compare_iterator = GenBank.Iterator(compare_handle)

            while True:
                cur_record = next(iterator)
                compare_record = next(compare_iterator)

                # A None record from either iterator is treated as end of input.
                if cur_record is None or compare_record is None:
                    break

                print("\tTesting for %s" % cur_record.version)

                output_record = str(cur_record) + "\n"
                do_comparison(compare_record, output_record)
Пример #3
0
def t_write_format():
    """Test writing to the different formats.

    Each file listed in ``write_format_files`` is read from the ``GenBank``
    directory through two iterators in parallel: one that parses records
    with a ``RecordParser`` and one without a parser.  The string form of
    the parsed record (plus a trailing newline) is checked against the
    unparsed record with ``do_comparison``; a failure there propagates.
    """
    record_parser = GenBank.RecordParser(debug_level=0)

    for next_file in write_format_files:
        print("Testing GenBank writing for %s..." %
              os.path.basename(next_file))
        file_path = os.path.join("GenBank", next_file)

        # Context managers guarantee the handles are released when a
        # comparison fails part-way through the file.
        with open(file_path, "r") as cur_handle, \
                open(file_path, "r") as compare_handle:
            iterator = GenBank.Iterator(cur_handle, record_parser)
            compare_iterator = GenBank.Iterator(compare_handle)

            while True:
                cur_rec = next(iterator)
                compare_record = next(compare_iterator)

                # Stop as soon as either iterator yields None.
                if cur_rec is None or compare_record is None:
                    break

                print("\tTesting for %s" % cur_rec.version)

                output_record = str(cur_rec) + "\n"
                do_comparison(compare_record, output_record)
Пример #4
0
def write_format(file):
    """Write a GenBank record from a GenBank file and compare them.

    The file is opened twice and iterated in lockstep: one iterator parses
    records with a ``RecordParser`` and the parsed record is converted back
    to text with ``str``; the other iterator has no parser.  Each pair is
    passed to ``do_comparison``.  An ``AssertionError`` from the comparison
    is printed with the record version and the loop moves on.

    Args:
        file: Path of the GenBank file; opened through ``gzip`` when the
            path contains ``.gz``.
    """
    record_parser = GenBank.RecordParser(debug_level=2)

    print("Testing GenBank writing for %s..." % os.path.basename(file))

    # be able to handle gzipped files
    if ".gz" in file:
        def open_handle():
            # Text mode, so this branch yields str lines exactly like the
            # plain open() branch below ("rb" yields bytes on Python 3).
            return gzip.open(file, "rt")
    else:
        def open_handle():
            return open(file)

    # Both handles are closed even when an unexpected exception escapes.
    with open_handle() as cur_handle, open_handle() as compare_handle:
        iterator = GenBank.Iterator(cur_handle, record_parser)
        compare_iterator = GenBank.Iterator(compare_handle)

        while True:
            cur_record = next(iterator)
            compare_record = next(compare_iterator)

            # A None record from either iterator is treated as end of input.
            if cur_record is None or compare_record is None:
                break

            output_record = str(cur_record) + "\n"
            try:
                do_comparison(compare_record, output_record)
            except AssertionError as msg:
                # Only records that fail the comparison are reported.
                print("\tTesting for %s" % cur_record.version)
                print(msg)
Пример #5
0
    def __processGffFilesNotNew(self, changed):
        """Reprocess GFF files listed in ``changed``.

        For each GFF path this method:

        1. runs ``GFFRewriter`` (unknown CV terms and colours) writing to
           ``<gff>.sorted.prepared``;
        2. loads the GFF into a SQLite database named after the GFF file
           (``<name>.db`` in the same directory) with
           ``bp_seqfeature_load.pl``;
        3. reads the organism name from the GenBank file that sits next to
           the GFF (same path with a ``.gbk`` extension);
        4. updates the GBrowse entry and bulk-loads the GFF into Chado with
           ``gmod_bulk_load_gff3.pl``.

        Args:
            changed: Iterable of paths to GFF files.
        """
        dbSettings = settings.DATABASES['default']

        for gff in changed:
            loc = os.path.dirname(gff)
            dbName = os.path.splitext(os.path.basename(gff))[0] + '.db'
            dbName = os.path.join(loc, dbName)

            # NOTE(review): ``genbank_id`` is not defined anywhere in this
            # method; confirm it exists at module level, otherwise this line
            # raises NameError.
            gffRewriter = GFFRewriter(filename=gff,
                                      outfile=gff + ".sorted.prepared",
                                      accession=genbank_id)

            gffRewriter.addUnknownCvTerms({
                'user': dbSettings['USER'],
                'password': dbSettings['PASSWORD'],
                'db': dbSettings['NAME']
            })

            gffRewriter.addColor({
                'user': dbSettings['USER'],
                'password': dbSettings['PASSWORD'],
                'db': 'go'
            })

            # NOTE(review): the rewriter's error state is fetched but never
            # inspected or reported here.
            error = gffRewriter.getError()

            # run the sqlite database loader to be able to add it to GBrowse
            # since the name should be preserved, no changes need to be made
            # to the GBrowse configuration file
            args = ['-a', 'DBI::SQLite', '-c', '-f', '-d', dbName, gff]
            runProgram('bp_seqfeature_load.pl', args)

            parser = GenBank.RecordParser()
            # Swap the extension; joining '.gbk' as a path component would
            # yield '<root>/.gbk' rather than '<root>.gbk'.
            gbk = os.path.splitext(gff)[0] + '.gbk'
            with open(gbk) as gbkHandle:
                record = parser.parse(gbkHandle)
            organismName = record.organism
            organismDir = os.path.basename(loc)

            GenomeDBUtil.editGBrowseEntry(gff, dbName, organismDir,
                                          organismName)

            # now edit the record in Chado
            args = [
                '--organism', organismName, "--gfffile", gff, "--dbname",
                dbSettings['NAME'], "--dbuser",
                dbSettings['USER'], "--dbpass",
                dbSettings['PASSWORD'], "--random_tmp_dir"
            ]
            runProgram('gmod_bulk_load_gff3.pl', args)
Пример #6
0
def genbank_single(filename):
    """Parse the GenBank file at ``filename`` and return the parsed record.

    The file is closed before the function returns.

    >>> record = genbank_single("GFF/NC_001422.gbk")
    >>> record.taxonomy
    ['Viruses', 'ssDNA viruses', 'Microviridae', 'Microvirus']
    >>> cds = record.features[-4]
    >>> cds.key
    'CDS'
    >>> location = LocationFromString(cds.location)
    >>> print location
    2931..3917
    >>> subseq = record_subseq(record, location)
    >>> subseq[0:20]
    Seq('ATGTTTGGTGCTATTGCTGG', Alphabet())
    """
    with open(filename) as handle:
        return GenBank.RecordParser().parse(handle)
Пример #7
0
# Build the path of every test file by prefixing it with gb_file_dir.
files_to_parse = []
for file in test_files:
    files_to_parse.append(os.path.join(gb_file_dir, file))

# parse the bioperl test files
# comment this out for now -- there are a bunch of junky records in here
# that no longer exist in GenBank -- do we really need to support those?
# files_to_parse = [os.path.join(os.getcwd(), 'GenBank', 'bioperl_test.gb')]

# parse the biojava test files
# files_to_parse += [os.path.join(os.getcwd(), 'GenBank', 'biojava_test.gb')]

# test the parsers
feature_parser = GenBank.FeatureParser(debug_level=0)
record_parser = GenBank.RecordParser(debug_level=0)

# Every file is run through both parser types in turn.
all_parsers = [feature_parser, record_parser]
print("Testing parsers...")
for parser in all_parsers:
    for filename in files_to_parse:
        # Missing inputs are reported and skipped rather than aborting the run.
        if not os.path.isfile(filename):
            print("Missing test input file: %s" % filename)
            continue

        handle = open(filename, 'r')
        iterator = GenBank.Iterator(handle, parser)

        # NOTE(review): no exit condition for this loop is visible in this
        # excerpt.
        while True:
            # Silence BiopythonParserWarning only within this with-block.
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", BiopythonParserWarning)
Пример #8
0
def _qualifier_value(feature, index):
    """Return qualifier ``index`` of ``feature`` without newlines, or ''."""
    try:
        return feature.qualifiers[index].value.strip('\n')
    except (IndexError, AttributeError):
        return ''


def ntgenbank():
    """Parse the GenBank entries from ``readfile()`` into a CSV file.

    Every entry except the first (which is expected to be empty) is written
    to the scratch file ``genbank.txt``, parsed with
    ``GenBank.RecordParser`` and, when it has a CDS feature, appended as one
    row to ``result_ntgenbank.csv``.  Entries without a CDS feature produce
    no row.  Fields that are absent from a record are written as empty
    strings.

    The ``print`` calls use the single-argument parenthesised form, which
    behaves the same on Python 2 and Python 3.
    """
    # retrieving all GenBank entries in a list by calling another function
    nuc_genbank = readfile()
    print(len(nuc_genbank))
    print(nuc_genbank[0])
    print("\nParsing started")

    header = ','.join([
        'Name', 'NM', 'NM_version', 'Symbol', 'CDS_start', 'CDS_stop',
        'HGNC', 'MIM', 'EC_number', 'GeneID', 'NP', 'NP_version',
        'gene_synonym', 'AA_seq', 'AA_number', 'Chromosome',
        'Chromosome_map', 'NT_seq', 'Organism',
    ]) + '\n'

    # opening a file to write the output; closed even if parsing fails
    with open('result_ntgenbank.csv', 'w') as output:
        output.write(header)

        # going through all the entries in the list; index 0 is empty
        for n in range(1, len(nuc_genbank)):
            print(n)
            # restore the LOCUS keyword and drop the leading newlines
            test = 'LOCUS  ' + nuc_genbank[n].lstrip('\n')
            # the parser reads from a file, so stage the entry on disk
            with open('genbank.txt', 'w') as query:
                query.write(test)
            parser = GenBank.RecordParser()
            with open('genbank.txt') as handle:
                record = parser.parse(handle)

            nt_seq = record.sequence.strip('\n')  # nucleotide sequence
            nm_and_version = record.version.strip('\n')
            nm = nm_and_version.split('.')[0].strip('\n')
            nm_version = nm_and_version.split('.')[1].strip('\n')

            source = record.features[0]  # contains the fields of source
            try:
                organism = (source.qualifiers[0].value.strip('\n') + ':' +
                            source.qualifiers[2].value.strip('\n'))
            except (IndexError, AttributeError):
                organism = ''
            chrm = _qualifier_value(source, 3)  # chromosome number
            chrm_map = _qualifier_value(source, 4)

            gene = record.features[1]  # contains the fields of gene
            symbol = gene.qualifiers[0].value.strip('\n')  # symbol or gene

            # first feature whose key mentions CDS, if any
            cds = next((feature for feature in record.features
                        if 'CDS' in feature.key), None)
            if cds is None:
                continue

            # CDS start and stop position
            cds_start_stop = cds.location.strip('\n')
            cds_start = cds_start_stop.split('..')[0].strip('\n')
            cds_stop = cds_start_stop.split('..')[1].strip('\n')

            # empty dictionary of the CDS elements of interest, filled in
            # below for those present
            cds_dict = {
                "HGNC": '',
                "MIM:": '',
                "EC_number": '',
                "GeneID": '',
                "product": '',
                "protein_id": '',
                "translation": '',
                "num_aa": '',
                "gene_synonym": ''
            }

            for qualifier in cds.qualifiers:
                # a qualifier is stored under the first dictionary key found
                # in either its key or its value
                for key in cds_dict:
                    if key in qualifier.key or key in qualifier.value:
                        cds_dict[key] = str(qualifier.value)
                        break

            # protein_id is split into accession and version; both stay
            # empty when the CDS carries no usable protein_id
            protein_parts = cds_dict["protein_id"].split('.')
            if len(protein_parts) > 1:
                np_id = protein_parts[0] + '"'
                np_version = '"' + protein_parts[1]
            else:
                np_id = ''
                np_version = ''

            hgnc = cds_dict["HGNC"]
            mim = cds_dict["MIM:"]
            geneid = cds_dict["GeneID"]
            name = cds_dict["product"]
            synonym = cds_dict["gene_synonym"]
            translation = cds_dict["translation"]
            # computed per record so a missing translation can neither
            # raise NameError nor reuse the previous record's length
            num_aa = len(translation) if translation else ''
            if hgnc:
                hgnc = '"' + hgnc.split(':')[2]
            if mim:
                mim = '"' + mim.split(':')[1]
            if geneid:
                geneid = '"' + geneid.split(':')[1]

            gvalue = ','.join([
                name, nm, nm_version, symbol, cds_start, cds_stop, hgnc,
                mim, cds_dict["EC_number"], geneid, np_id, np_version,
                synonym, translation, str(num_aa), str(chrm), chrm_map,
                nt_seq, organism,
            ]) + '\n'
            output.write(gvalue)
    print("Parsing completed")
Пример #9
0
	def __processGffFilesNew(self, newOrganismDirs):
		"""Set up BLAST, GBrowse and Chado data for new organism directories.

		For each directory name (relative to ``NEW_GENOMIC_DATA_DIR``) the
		files directly inside it are scanned for ``.faa``, ``.ffn``,
		``.gff`` and ``.gbk`` files.  ``formatdb`` is run on the protein and
		nucleotide FASTA files when present.  When both a GFF and a GenBank
		file exist, the GFF is rewritten by ``GFFRewriter``, loaded into a
		SQLite database with ``bp_seqfeature_load.pl``, the organism is
		added to Chado and a new GBrowse entry is created.

		Args:
			newOrganismDirs: Iterable of directory names of new organisms.
		"""
		wantedExtensions = ('.faa', '.ffn', '.gff', '.gbk')
		dbSettings = settings.DATABASES['default']

		for newOrganism in newOrganismDirs:
			# start by creating the BLAST database
			newOrganism = os.path.join(NEW_GENOMIC_DATA_DIR, newOrganism)
			print(newOrganism)
			# only the files directly inside the organism directory are used
			organismFiles = next(os.walk(newOrganism))[2]

			# remember one file per wanted extension; stop once all are found
			found = {}
			for organismFile in organismFiles:
				extension = os.path.splitext(organismFile)[1]
				if extension in wantedExtensions:
					found[extension] = organismFile
				if len(found) == len(wantedExtensions):
					break
			faa = found.get('.faa')
			ffn = found.get('.ffn')
			gff = found.get('.gff')
			gbk = found.get('.gbk')

			if faa:
				GenomeDBUtil.runFormatDB(os.path.basename(faa), newOrganism, protein=True)
				self.report.addLogEntry('Ran formatdb successully on ' + faa)
			if ffn:
				GenomeDBUtil.runFormatDB(os.path.basename(ffn), newOrganism, protein=False)
				self.report.addLogEntry('Ran formatdb successully on ' + ffn)

			# process the gff and genbank files for creating the databases
			if not (gff and gbk):
				continue

			# create the sqlite database for GBrowse and create the configuration file
			# for GBrowse hook up
			dbName = os.path.splitext(os.path.basename(gff))[0] + '.db'
			dbName = os.path.join(newOrganism, dbName)
			gff = os.path.join(newOrganism, gff)

			parser = GenBank.RecordParser()
			gbk = os.path.join(newOrganism, gbk)
			# close the GenBank file as soon as the record has been parsed
			with open(gbk) as gbkHandle:
				record = parser.parse(gbkHandle)
			organismName = record.organism
			accession = record.accession[0]
			self.report.addLogEntry('Found organism name ' + organismName)

			# create a brand new GBrowse configuration file
			examiner = GFFExaminer()
			with open(gff) as gffHandle:
				gffIds = examiner.available_limits(gffHandle)['gff_id']
				# list() keeps the first-key lookup valid on Python 2 and 3
				landmark = list(gffIds.keys())[0][0]

			gffRewriter = GFFRewriter(filename=gff, outfile=gff + ".sorted.prepared", accession=accession)

			# NOTE: the addUnknownCvTerms step is disabled for new organisms;
			# only the colour annotation is applied.
			gffRewriter.addColor({
				'user': dbSettings['USER'],
				'password': dbSettings['PASSWORD'],
				'db': 'MyGO'
			})

			error = gffRewriter.getError()
			print(error)

			# from here on work with the rewritten file
			gff = gff + ".sorted.prepared"

			args = ['-a', 'DBI::SQLite', '-c', '-f', '-d', dbName, gff]
			runProgram('bp_seqfeature_load.pl', args)
			self.report.addLogEntry('Successfully created sqlite database for ' + str(gff))

			organismDir = os.path.basename(newOrganism)

			# now edit the record in Chado by first adding the organism and then adding
			# bulk loading the information from gff3
			organismId = GenomeDBUtil.addOrganismToChado(gff, organismName)
			GenomeDBUtil.createNewGBrowseEntry(landmark, dbName, organismDir, organismName, organismId)
			# log only after the entry has actually been created
			self.report.addLogEntry('Added new GBrowse entry for ' + organismName)
Пример #10
0
errorfile = open(datapath+'/Parser_errr.out', 'w')

genomeid = []

###############################################################################

# Parsing genbank files

###############################################################################
print("\nParsing genbank files...")
start = time.time()
for file in glob.glob('*.gbk'):
    print("Parsing file: ",gbkpath+"/"+file)
    try:
        w = re.findall(r"[\w']+",file)
        parser = GenBank.RecordParser()
        record = parser.parse(open(gbkpath+"/"+file))

        genomefile = open(genomepath+"/"+record.locus+".fasta", "w")
        genomefile.write(">" + record.locus + "\n")
        genomefile.write(record.sequence)
        genomefile.close()
        
        definition = record.definition.split(',')
        definition = definition[0]
        trest = re.sub('[^A-Za-z0-9]+', '_', str(definition))
        organismlist = record.organism.split(" ")
        genuslist.append(organismlist[0])
        specieslist.append(organismlist[1])
        
        statsFile = open(statisticspath+'/'+record.locus + '.stats','w')