def __processGffFilesNotNew(self, changed):
    """Reload GFF files that changed for already-known organisms.

    For every path in ``changed`` this method:

    1. parses the sibling GenBank file (same path, ``.gbk`` extension) to
       obtain the organism name and accession,
    2. runs the file through ``GFFRewriter`` (unknown CV terms + colours,
       colour database ``'go'``),
    3. reloads the SQLite database next to the GFF file with
       ``bp_seqfeature_load.pl``,
    4. updates the GBrowse entry, and
    5. bulk loads the GFF into Chado with ``gmod_bulk_load_gff3.pl``.

    NOTE(review): a method with this same name is defined again further
    down in this file (using colour database ``'MyGO'``). If both live in
    the same class, the later definition replaces this one -- confirm
    which is intended.

    Args:
        changed: iterable of paths to GFF files whose content changed.
    """
    dbSettings = settings.DATABASES['default']

    for gff in changed:
        loc = os.path.dirname(gff)
        # The SQLite database sits next to the GFF and shares its base name.
        dbName = os.path.splitext(os.path.basename(gff))[0] + '.db'
        dbName = os.path.join(loc, dbName)

        # The GenBank file shares the GFF's path with a .gbk extension.
        # (The extension must be appended, not joined as a path component,
        # otherwise the result is "<stem>/.gbk".)
        gbk = os.path.splitext(gff)[0] + '.gbk'
        with open(gbk) as gbkHandle:
            record = GenBank.RecordParser().parse(gbkHandle)
        organismName = record.organism
        # The accession is taken from the GenBank record, exactly as
        # __processGffFilesNew does; the previously referenced name
        # `genbank_id` is not defined anywhere in this method.
        accession = record.accession[0]

        gffRewriter = GFFRewriter(
            filename=gff,
            outfile=gff + ".sorted.prepared",
            accession=accession,
        )
        gffRewriter.addUnknownCvTerms({
            'user': dbSettings['USER'],
            'password': dbSettings['PASSWORD'],
            'db': dbSettings['NAME'],
        })
        gffRewriter.addColor({
            'user': dbSettings['USER'],
            'password': dbSettings['PASSWORD'],
            'db': 'go',
        })
        # Surface rewriter problems in the report instead of dropping them.
        error = gffRewriter.getError()
        if error:
            self.report.addLogEntry(
                'GFFRewriter reported for ' + gff + ': ' + str(error))

        # NOTE(review): the rewriter writes to gff + ".sorted.prepared", but
        # the loaders below are handed the original gff path, whereas
        # __processGffFilesNew loads the prepared file -- confirm intent.

        # run the sqlite database loader to be able to add it to GBrowse
        # since the name should be preserved, no changes need to be made
        # to the GBrowse configuration file
        args = ['-a', 'DBI::SQLite', '-c', '-f', '-d', dbName, gff]
        runProgram('bp_seqfeature_load.pl', args)

        organismDir = os.path.basename(loc)
        GenomeDBUtil.editGBrowseEntry(gff, dbName, organismDir, organismName)

        # now edit the record in Chado
        args = [
            '--organism', organismName,
            "--gfffile", gff,
            "--dbname", dbSettings['NAME'],
            "--dbuser", dbSettings['USER'],
            "--dbpass", dbSettings['PASSWORD'],
            "--random_tmp_dir",
        ]
        runProgram('gmod_bulk_load_gff3.pl', args)
def __processFastaFilesNotNew(self, unchanged, changed):
    """(Re)build BLAST databases for FASTA files of known organisms.

    * ``changed`` files always get a freshly formatted BLAST database
      (``.ffn`` -> nucleotide, ``.faa`` -> protein) and a log entry.
    * ``unchanged`` files reuse the existing ``nucleotideDB`` /
      ``proteinDB`` directory from the current data directory when it
      exists (copied to the same sub-directory of the new data
      directory); otherwise the database is formatted from scratch.

    Files with any other extension are ignored.

    NOTE(review): a method with this same name is defined again further
    down in this file. If both live in the same class, the later
    definition replaces this one.

    Args:
        unchanged: iterable of FASTA paths whose content did not change.
        changed: iterable of FASTA paths whose content changed.
    """
    for fasta in changed:
        # os.path.splitext keeps the leading dot on the extension.
        extension = os.path.splitext(fasta)[1]
        fileDir = os.path.dirname(fasta)
        if extension == '.ffn':
            GenomeDBUtil.runFormatDB(os.path.basename(fasta), fileDir,
                                     NEW_GENOMIC_DATA_DIR, protein=False)
            self.report.addLogEntry('Created BLASTn database for ' + fasta + '(replaced old file)...')
        elif extension == '.faa':
            GenomeDBUtil.runFormatDB(os.path.basename(fasta), fileDir,
                                     NEW_GENOMIC_DATA_DIR, protein=True)
            self.report.addLogEntry('Created BLASTp database for ' + fasta + ' (replaced old file)...')

    for fasta in unchanged:
        fileDir = os.path.dirname(fasta)
        prefix = CUR_GENOMIC_DATA_DIR + fileDir
        newPrefix = NEW_GENOMIC_DATA_DIR + fileDir
        extension = os.path.splitext(fasta)[1]

        # since these are unchanged files we wish to simply copy the old
        # database directories over to save cpu time; if they do not exist
        # yet they are created in the new directory instead
        if extension == '.ffn':
            nucleotideDB = os.path.join(prefix, 'nucleotideDB')
            if os.path.isdir(nucleotideDB):
                # copytree refuses an existing destination, so copy into
                # the matching sub-directory rather than onto the organism
                # directory itself (which the proteinDB branch shares).
                shutil.copytree(nucleotideDB,
                                os.path.join(newPrefix, 'nucleotideDB'))
            else:
                # NOTE(review): this call passes the joined directory as a
                # single argument while the other calls pass fileDir and
                # NEW_GENOMIC_DATA_DIR separately -- confirm against
                # GenomeDBUtil.runFormatDB's signature.
                GenomeDBUtil.runFormatDB(os.path.basename(fasta),
                                         NEW_GENOMIC_DATA_DIR + fileDir,
                                         protein=False)
        elif extension == '.faa':
            proteinDB = os.path.join(prefix, 'proteinDB')
            if os.path.isdir(proteinDB):
                shutil.copytree(proteinDB,
                                os.path.join(newPrefix, 'proteinDB'))
            else:
                GenomeDBUtil.runFormatDB(os.path.basename(fasta), fileDir,
                                         NEW_GENOMIC_DATA_DIR, protein=True)
def __processGffFilesNotNew(self, changed):
    """Reload GFF files that changed for already-known organisms.

    For every path in ``changed`` this method:

    1. parses the sibling GenBank file (same path, ``.gbk`` extension) to
       obtain the organism name and accession,
    2. runs the file through ``GFFRewriter`` (unknown CV terms + colours,
       colour database ``'MyGO'``),
    3. reloads the SQLite database next to the GFF file with
       ``bp_seqfeature_load.pl``,
    4. updates the GBrowse entry, and
    5. bulk loads the GFF into Chado with ``gmod_bulk_load_gff3.pl``.

    NOTE(review): a method with this same name is also defined earlier in
    this file (using colour database ``'go'``). If both live in the same
    class, this later definition is the one that takes effect.

    Args:
        changed: iterable of paths to GFF files whose content changed.
    """
    dbSettings = settings.DATABASES['default']

    for gff in changed:
        loc = os.path.dirname(gff)
        # The SQLite database sits next to the GFF and shares its base name.
        dbName = os.path.splitext(os.path.basename(gff))[0] + '.db'
        dbName = os.path.join(loc, dbName)

        # The GenBank file shares the GFF's path with a .gbk extension.
        # (The extension must be appended, not joined as a path component,
        # otherwise the result is "<stem>/.gbk".)
        gbk = os.path.splitext(gff)[0] + '.gbk'
        with open(gbk) as gbkHandle:
            record = GenBank.RecordParser().parse(gbkHandle)
        organismName = record.organism
        # The accession is taken from the GenBank record, exactly as
        # __processGffFilesNew does; the previously referenced name
        # `genbank_id` is not defined anywhere in this method.
        accession = record.accession[0]

        gffRewriter = GFFRewriter(
            filename=gff,
            outfile=gff + ".sorted.prepared",
            accession=accession,
        )
        gffRewriter.addUnknownCvTerms({
            'user': dbSettings['USER'],
            'password': dbSettings['PASSWORD'],
            'db': dbSettings['NAME'],
        })
        gffRewriter.addColor({
            'user': dbSettings['USER'],
            'password': dbSettings['PASSWORD'],
            'db': 'MyGO',
        })
        # Surface rewriter problems in the report instead of dropping them.
        error = gffRewriter.getError()
        if error:
            self.report.addLogEntry(
                'GFFRewriter reported for ' + gff + ': ' + str(error))

        # NOTE(review): the rewriter writes to gff + ".sorted.prepared", but
        # the loaders below are handed the original gff path, whereas
        # __processGffFilesNew loads the prepared file -- confirm intent.

        # run the sqlite database loader to be able to add it to GBrowse
        # since the name should be preserved, no changes need to be made
        # to the GBrowse configuration file
        args = ['-a', 'DBI::SQLite', '-c', '-f', '-d', dbName, gff]
        runProgram('bp_seqfeature_load.pl', args)

        organismDir = os.path.basename(loc)
        GenomeDBUtil.editGBrowseEntry(gff, dbName, organismDir, organismName)

        # now edit the record in Chado
        args = [
            '--organism', organismName,
            "--gfffile", gff,
            "--dbname", dbSettings['NAME'],
            "--dbuser", dbSettings['USER'],
            "--dbpass", dbSettings['PASSWORD'],
            "--random_tmp_dir",
        ]
        runProgram('gmod_bulk_load_gff3.pl', args)
def __processFastaFilesNotNew(self, unchanged, changed):
    """(Re)build BLAST databases for FASTA files of known organisms.

    * ``changed`` files always get a freshly formatted BLAST database
      (``.ffn`` -> nucleotide, ``.faa`` -> protein) and a log entry.
    * ``unchanged`` files reuse the existing ``nucleotideDB`` /
      ``proteinDB`` directory from the current data directory when it
      exists (copied to the same sub-directory of the new data
      directory); otherwise the database is formatted from scratch.

    Files with any other extension are ignored.

    NOTE(review): a method with this same name is also defined earlier in
    this file. If both live in the same class, this later definition is
    the one that takes effect.

    Args:
        unchanged: iterable of FASTA paths whose content did not change.
        changed: iterable of FASTA paths whose content changed.
    """
    for fasta in changed:
        # os.path.splitext keeps the leading dot on the extension.
        extension = os.path.splitext(fasta)[1]
        fileDir = os.path.dirname(fasta)
        if extension == '.ffn':
            GenomeDBUtil.runFormatDB(os.path.basename(fasta), fileDir,
                                     NEW_GENOMIC_DATA_DIR, protein=False)
            self.report.addLogEntry('Created BLASTn database for ' + fasta + '(replaced old file)...')
        elif extension == '.faa':
            GenomeDBUtil.runFormatDB(os.path.basename(fasta), fileDir,
                                     NEW_GENOMIC_DATA_DIR, protein=True)
            self.report.addLogEntry('Created BLASTp database for ' + fasta + ' (replaced old file)...')

    for fasta in unchanged:
        fileDir = os.path.dirname(fasta)
        prefix = CUR_GENOMIC_DATA_DIR + fileDir
        newPrefix = NEW_GENOMIC_DATA_DIR + fileDir
        extension = os.path.splitext(fasta)[1]

        # since these are unchanged files we wish to simply copy the old
        # database directories over to save cpu time; if they do not exist
        # yet they are created in the new directory instead
        if extension == '.ffn':
            nucleotideDB = os.path.join(prefix, 'nucleotideDB')
            if os.path.isdir(nucleotideDB):
                # copytree refuses an existing destination, so copy into
                # the matching sub-directory rather than onto the organism
                # directory itself (which the proteinDB branch shares).
                shutil.copytree(nucleotideDB,
                                os.path.join(newPrefix, 'nucleotideDB'))
            else:
                # NOTE(review): this call passes the joined directory as a
                # single argument while the other calls pass fileDir and
                # NEW_GENOMIC_DATA_DIR separately -- confirm against
                # GenomeDBUtil.runFormatDB's signature.
                GenomeDBUtil.runFormatDB(os.path.basename(fasta),
                                         NEW_GENOMIC_DATA_DIR + fileDir,
                                         protein=False)
        elif extension == '.faa':
            proteinDB = os.path.join(prefix, 'proteinDB')
            if os.path.isdir(proteinDB):
                shutil.copytree(proteinDB,
                                os.path.join(newPrefix, 'proteinDB'))
            else:
                GenomeDBUtil.runFormatDB(os.path.basename(fasta), fileDir,
                                         NEW_GENOMIC_DATA_DIR, protein=True)
def __processGffFilesNew(self, newOrganismDirs):
    """Set up BLAST, GBrowse and Chado data for newly added organisms.

    Each entry of ``newOrganismDirs`` is joined onto
    ``NEW_GENOMIC_DATA_DIR`` and the files directly inside that directory
    are scanned for one ``.faa``, ``.ffn``, ``.gff`` and ``.gbk`` file.

    * ``.faa`` / ``.ffn`` files are formatted into BLAST databases.
    * When both a ``.gff`` and a ``.gbk`` file are present, the GFF is
      rewritten by ``GFFRewriter`` (colours from database ``'MyGO'``),
      loaded into a SQLite database with ``bp_seqfeature_load.pl``, the
      organism is added to Chado and a new GBrowse entry is created.

    Args:
        newOrganismDirs: iterable of organism directory names, relative
            to ``NEW_GENOMIC_DATA_DIR``.
    """
    for newOrganism in newOrganismDirs:
        # start by creating the BLAST database
        newOrganism = os.path.join(NEW_GENOMIC_DATA_DIR, newOrganism)
        print(newOrganism)

        # Only the files directly inside the organism directory matter.
        organismFiles = next(os.walk(newOrganism))[2]

        found = {'.faa': None, '.ffn': None, '.gff': None, '.gbk': None}
        for organismFile in organismFiles:
            extension = os.path.splitext(organismFile)[1]
            if extension in found:
                found[extension] = organismFile
            # Stop scanning as soon as one file of every kind is known.
            if all(found.values()):
                break
        faa = found['.faa']
        ffn = found['.ffn']
        gff = found['.gff']
        gbk = found['.gbk']

        if faa:
            GenomeDBUtil.runFormatDB(os.path.basename(faa), newOrganism, protein=True)
            self.report.addLogEntry('Ran formatdb successully on ' + faa)
        if ffn:
            GenomeDBUtil.runFormatDB(os.path.basename(ffn), newOrganism, protein=False)
            self.report.addLogEntry('Ran formatdb successully on ' + ffn)

        # The GBrowse / Chado steps need both the GFF and the GenBank file.
        if not (gff and gbk):
            continue

        # create the sqlite database for GBrowse and create the
        # configuration file for the GBrowse hook up
        dbName = os.path.splitext(os.path.basename(gff))[0] + '.db'
        dbName = os.path.join(newOrganism, dbName)
        gff = os.path.join(newOrganism, gff)
        gbk = os.path.join(newOrganism, gbk)

        with open(gbk) as gbkHandle:
            record = GenBank.RecordParser().parse(gbkHandle)
        organismName = record.organism
        accession = record.accession[0]
        self.report.addLogEntry('Found organism name ' + organismName)

        # The landmark for the new GBrowse configuration is the first
        # element of the first 'gff_id' key reported by the examiner.
        examiner = GFFExaminer()
        with open(gff) as gffHandle:
            gffIds = examiner.available_limits(gffHandle)['gff_id']
            landmark = list(gffIds.keys())[0][0]

        gffRewriter = GFFRewriter(
            filename=gff,
            outfile=gff + ".sorted.prepared",
            accession=accession,
        )
        # addUnknownCvTerms is deliberately not called here; the call was
        # already disabled in the original code.
        gffRewriter.addColor({
            'user': settings.DATABASES['default']['USER'],
            'password': settings.DATABASES['default']['PASSWORD'],
            'db': 'MyGO',
        })
        error = gffRewriter.getError()
        print(error)

        # From here on work with the rewritten file.
        gff = gff + ".sorted.prepared"
        args = ['-a', 'DBI::SQLite', '-c', '-f', '-d', dbName, gff]
        runProgram('bp_seqfeature_load.pl', args)
        self.report.addLogEntry('Successfully created sqlite database for ' + str(gff))

        organismDir = os.path.basename(newOrganism)

        # now edit the record in Chado by first adding the organism and
        # then creating the GBrowse entry that refers to it
        organismId = GenomeDBUtil.addOrganismToChado(gff, organismName)
        GenomeDBUtil.createNewGBrowseEntry(landmark, dbName, organismDir,
                                           organismName, organismId)
        # Log only once the entry has actually been created.
        self.report.addLogEntry('Added new GBrowse entry for ' + organismName)