def env454run_info_upload(runobj):
    """Upload run metadata to the database via dbUpload.put_run_info(),
    printing how long the call took.

    runobj: the pipeline run object handed to dbUpload.
    """
    my_read_csv = dbUpload(runobj)
    start = time()
    my_read_csv.put_run_info()
    elapsed = time() - start
    # Parenthesized single-argument print behaves identically under
    # Python 2 and is valid Python 3; the redundant str() wrapper was
    # dropped (%s already stringifies).
    print("put_run_info time = %s" % elapsed)
def setUpClass(cls):
    """One-time test fixture: connect to the "test" database on vampsdev,
    relax MySQL integrity checks for bulk fixture inserts, and build the
    Run / dbUpload objects plus one canned fasta record used by the tests.
    """
    cls._connection = dbup.MyConnection(host = "vampsdev", db = "test")
    # Disable unique and foreign-key checks, saving the prior values into
    # @OLD_* session variables (presumably restored in tearDownClass —
    # not visible here), so fixture rows can be inserted in any order.
    msql = "SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0;"
    cls._connection.execute_no_fetch(msql)
    msql = "SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0;"
    cls._connection.execute_no_fetch(msql)
    msql = "SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='TRADITIONAL';"
    cls._connection.execute_no_fetch(msql)
    data_object = fake_data_object.data_object
    # NOTE(review): hard-coded developer path — these tests only run on
    # that machine as written.
    root_dir = '/Users/ashipunova/BPC/py_mbl_sequencing_pipeline/test'
    cls.file_path = os.path.join(root_dir, data_object['general']['platform'], data_object['general']['run'], 'lane_1', 'analysis')
    # Pipeline root is the parent of this test file's directory.
    pi_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..")
    cls._runobj = Run(data_object, pi_path)
    # cls._runobj = Run(data_object, os.path.dirname(os.path.realpath(__file__)))
    cls._my_db_upload = dbup.dbUpload(cls._runobj)
    cls.filenames = []
    cls.seq_id_dict = {}
    cls.fasta_file_path = cls.file_path + "/reads_overlap/ATCACG_NNNNGTATC_3-PERFECT_reads.fa.unique"
    cls.stats_file = cls.file_path + "/unique_file_counts_test"
    cls.fasta = u.SequenceSource(cls.fasta_file_path, lazy_init = False)
    # Pin one known sequence/id pair on the source so tests see a fixed record.
    cls.fasta.seq = "TGGGTTTGAACTACTGAGGGCCGGTACAGAGATGTACCCTTCCCTTCGGGGACTTCAGGAG"
    cls.fasta.id = "D4ZHLFP1:25:B022DACXX:3:1101:14017:2243 1:N:0:ATCACG|frequency:1"
def env454upload_main(runobj, full_upload):
    """
    Run: pipeline dbUpload testing -c test/data/JJH_KCK_EQP_Bv6v4.ini -s env454upload -l debug

    For now upload only Illumina data to env454 from files, assuming that
    all run info is already on env454 (run, run_key, dataset, project,
    run_info_ill tables).
    Tables:
        sequence_ill
        sequence_pdr_info_ill
        taxonomy
        sequence_uniq_info_ill

    runobj:      the pipeline run object handed to dbUpload.
    full_upload: when truthy, also upload the raw sequences per file.
    """
    whole_start = time.time()
    my_env454upload = dbUpload(runobj)
    filenames = my_env454upload.get_fasta_file_names()
    if not filenames:
        # Best effort: log and fall through — the for-loop is simply empty
        # and check_seq_upload() below still runs.
        logger.debug("\nThere is something wrong with fasta files or their names, please check pathes, contents and suffixes in %s." % my_env454upload.fasta_dir)
    # sequences = get_sequences(my_env454upload, filenames)
    for filename in filenames:
        sequences = my_env454upload.make_seq_upper(filename)
        if full_upload:
            env454upload_seq(my_env454upload, filename, sequences)
        # Build the seq-id mapping even without full_upload; presumably
        # env454upload_all_but_seq below relies on it — TODO confirm.
        wrapped = wrapper(my_env454upload.get_seq_id_dict, sequences)
        get_seq_id_dict_time = timeit.timeit(wrapped, number=1)
        logger.debug("get_seq_id_dict() took %s sec to finish" % get_seq_id_dict_time)
    total_seq = env454upload_all_but_seq(my_env454upload, filenames, full_upload)
    my_env454upload.check_seq_upload()
    logger.debug("total_seq = %s" % total_seq)
    whole_elapsed = time.time() - whole_start
    # print() with a single argument is identical in Python 2 and keeps
    # the module importable under Python 3.
    print("The whole upload took %s s" % whole_elapsed)
def test_e_setUpRunInfo(self):
    """After put_run_info(), the highest run_info_ill id should be 10."""
    my_read_csv = dbup.dbUpload(self._runobj)
    my_read_csv.put_run_info()
    sql = "SELECT max(run_info_ill_id) FROM run_info_ill"
    self.assertEqual(self.get_id(sql), 10)
    # print() with a single argument is identical in Python 2 and valid
    # under Python 3.
    print("done with put_run_info")
def env454upload(runobj):
    """
    Run: pipeline dbUpload testing -c test/data/JJH_KCK_EQP_Bv6v4.ini -s env454upload -l debug

    For now upload only Illumina data to env454 from files, assuming that
    all run info is already on env454 (run, run_key, dataset, project,
    run_info_ill tables).
    TODO: 2) Upload env454 data into raw, trim, gast etc tables from files
    """
    whole_start = time()
    # my_read_csv = readCSV(run)
    # my_read_csv.read_csv()
    my_env454upload = dbUpload(runobj)
    filenames = my_env454upload.get_fasta_file_names()
    seq_in_file = 0
    total_seq = 0
    for filename in filenames:
        try:
            logger.debug("\n----------------\nfilename = %s" % filename)
            fasta_file_path = filename
            # Dataset base name = last path component minus its trailing "-" suffix.
            filename_base = "-".join(filename.split("/")[-1].split("-")[:-1])
            if (filename_base == ""):
                # For v4v5 illumia files (no "-" parts): first three "_" fields.
                filename_base = "_".join(filename.split("/")[-1].split("_")[:3])
            run_info_ill_id = my_env454upload.get_run_info_ill_id(filename_base)
            gast_dict = my_env454upload.get_gasta_result(filename)
            read_fasta = u.ReadFasta(fasta_file_path)
            # sequences = read_fasta.sequences
            sequences = [seq.upper() for seq in read_fasta.sequences]  # uppercase for VAMPS compatibility
            if not (len(sequences)):
                continue  # empty fasta: nothing to upload for this file
            read_fasta.close()
            fasta = u.SequenceSource(fasta_file_path, lazy_init = False)
            # Per-file timing accumulators, logged below.
            insert_seq_time = 0
            get_seq_id_dict_time = 0
            insert_pdr_info_time = 0
            insert_taxonomy_time = 0
            insert_sequence_uniq_info_ill_time = 0
            start = time()
            my_env454upload.insert_seq(sequences)
            elapsed = (time() - start)
            insert_seq_time = elapsed
            logger.debug("seq_in_file = %s" % seq_in_file)
            logger.debug("insert_seq() took %s time to finish" % insert_seq_time)
            # print "insert_seq() took ", elapsed, " time to finish"
            start = time()
            my_env454upload.get_seq_id_dict(sequences)
            elapsed = (time() - start)
            get_seq_id_dict_time = elapsed
            logger.debug("get_seq_id_dict() took %s time to finish" % get_seq_id_dict_time)
            # One pass over the fasta: per-record inserts into the pdr-info,
            # taxonomy and uniq-info tables.
            while fasta.next():
                # sequence_ill_id = my_env454upload.get_sequence_id(fasta.seq)
                start = time()
                # print "Inserting pdr info"
                # for attr in dir(fasta):
                #     print "obj.%s = %s" % (attr, getattr(fasta, attr))
                my_env454upload.insert_pdr_info(fasta, run_info_ill_id)
                elapsed = (time() - start)
                insert_pdr_info_time += elapsed
                # print "insert_pdr_info() took ", elapsed, " time to finish"
                start = time()
                # print "Inserting taxonomy"
                my_env454upload.insert_taxonomy(fasta, gast_dict)
                elapsed = (time() - start)
                insert_taxonomy_time += elapsed
                # print "tax_id = ", tax_id ,"; insert_taxonomy() took ", elapsed, " time to finish"
                # print "tax_id = ", tax_id
                start = time()
                # print "Inserting sequence_uniq_info_ill"
                my_env454upload.insert_sequence_uniq_info_ill(fasta, gast_dict)
                elapsed = (time() - start)
                insert_sequence_uniq_info_ill_time += elapsed
            # NOTE(review): placement after the while-loop reconstructed from
            # the collapsed source — fasta.total_seq is read once per file.
            seq_in_file = fasta.total_seq
            my_env454upload.put_seq_statistics_in_file(filename, fasta.total_seq)
            total_seq += seq_in_file
            logger.debug("insert_pdr_info() took %s time to finish" % insert_pdr_info_time)
            logger.debug("insert_taxonomy_time() took %s time to finish" % insert_taxonomy_time)
            logger.debug("insert_sequence_uniq_info_ill() took %s time to finish" % insert_sequence_uniq_info_ill_time)
        except:  # catch everything — logged and re-raised, so failures still abort
            print "\r[pipelineprocessor] Unexpected:"  # handle unexpected exceptions
            print sys.exc_info()[0]  # info about curr exception (type,value,traceback)
            raise  # re-throw caught exception
    # print "total_seq = %s" % total_seq
    my_env454upload.check_seq_upload()
    logger.debug("total_seq = %s" % total_seq)
    whole_elapsed = (time() - whole_start)
    print "The whole_upload took %s s" % whole_elapsed
def env454run_info_upload(runobj):
    """Upload run metadata via dbUpload.put_run_info(), printing how long
    the timed call took.
    """
    my_read_csv = dbUpload(runobj)
    wrapped = wrapper(my_read_csv.put_run_info)
    # print() with a single argument behaves identically under Python 2
    # and is valid Python 3.
    print("put_run_info time = %s" % timeit.timeit(wrapped, number=1))
def file_to_db_upload_main(runobj, full_upload):
    """
    Run: pipeline dbUpload testing -c test/data/JJH_KCK_EQP_Bv6v4.ini -s file_to_db_upload -l debug

    For now upload only Illumina data to env454 from files, assuming that
    all run info is already on env454 (run, run_key, dataset, project,
    run_info_ill tables).
    Tables:
        sequence_ill
        sequence_pdr_info_ill
        taxonomy
        sequence_uniq_info_ill
    reset AUTO_INCREMENT

    runobj:      the pipeline run object handed to dbUpload.
    full_upload: when truthy, also upload the raw sequences per file.
    """
    whole_start = time.time()
    # my_file_to_db_upload = dbUpload(runobj, db_server="vamps2")
    # Narrowed from a bare except: only a missing attribute should fall
    # back to the default database name; any other error must surface.
    try:
        db_name = runobj.database_name
    except AttributeError:
        db_name = "env454"
    my_file_to_db_upload = dbUpload(runobj, db_name=db_name)
    # filenames = my_file_to_db_upload.get_fasta_file_names()
    if not my_file_to_db_upload.filenames:
        err_msg = "\nThere is something wrong with fasta files or their names, please check pathes, contents and suffixes in %s." % my_file_to_db_upload.fasta_dir
        my_file_to_db_upload.all_errors.append(err_msg)
        logger.debug(err_msg)
    # sequences = get_sequences(my_file_to_db_upload, filenames)
    get_and_up_seq_time = time.time()
    total_time = 0
    no_run_info_list = []
    for filename in my_file_to_db_upload.filenames:
        sequences = my_file_to_db_upload.seq.prepare_fasta_dict(filename)
        if not sequences:
            err_msg = "There are 0 sequences in filename = %s" % filename
            logger.debug(err_msg)
            my_file_to_db_upload.all_errors.append(err_msg)
            continue
        if full_upload:
            file_to_db_upload_seq(my_file_to_db_upload, filename, sequences)
        # The seq-id mapping is needed by file_to_db_upload_all_but_seq()
        # below, so it is built even when full_upload is False.
        wrapped = wrapper(my_file_to_db_upload.seq.get_seq_id_dict, sequences)
        get_seq_id_dict_time = timeit.timeit(wrapped, number=1)
        logger.debug("get_seq_id_dict() took %s sec to finish" % get_seq_id_dict_time)
        get_and_up_seq_time_end = time.time() - get_and_up_seq_time
        logger.debug("get_and_up_seq took %s s" % get_and_up_seq_time_end)
        start_c = time.time()
        total_time = total_time + file_to_db_upload_all_but_seq(
            my_file_to_db_upload, filename, no_run_info_list, full_upload)
        logger.debug("file_to_db_upload_all_but_seq() took %s sec to finish" %
                     (time.time() - start_c))
    # doesn't work with mysql 5.6, not needed with no-gap auto_increment
    # my_file_to_db_upload.reset_auto_increment()
    seq_count_msg = my_file_to_db_upload.check_seq_upload()
    projects_and_ids = my_file_to_db_upload.get_projects_and_ids()
    utils = PipelneUtils()
    if db_name == 'vamps2':
        my_email = '*****@*****.**'
    else:
        my_email = '*****@*****.**'
    # Use db_name (not runobj.database_name) so the AttributeError fallback
    # above also covers the email body instead of raising here.
    # NOTE(review): runobj.database_host may be similarly absent — confirm.
    ready_email_body = """Uploaded to %s on %s\nIn this run %s: %s\n%s\n%s """ % (db_name, runobj.database_host, runobj.run, projects_and_ids, my_file_to_db_upload.equal_amnt_files_txt, seq_count_msg)
    my_file_to_db_upload.send_message(my_email, 'Projects uploaded to %s' % db_name, ready_email_body)
    if len(no_run_info_list) > 0:
        err_msg = "ERROR: There is no run info for %s yet, please check if it's in the csv and uploaded to the db" % ", ".join(no_run_info_list)
        utils.print_both(err_msg)
        my_file_to_db_upload.all_errors.append(err_msg)
    logger.debug("From file_to_db_upload_main. ready_email_body: ")
    logger.debug(ready_email_body)
    my_file_to_db_upload.all_errors.extend(my_file_to_db_upload.seq.seq_errors)
    if len(my_file_to_db_upload.all_errors) > 0:
        logger.debug('\n=====\nERRORS: \n' + ';\n'.join(my_file_to_db_upload.all_errors))
    logger.debug("total_time = %s" % total_time)
    whole_elapsed = time.time() - whole_start
    logger.debug("The whole upload took %s s" % whole_elapsed)
def run_info_upload(runobj):
    """Upload run metadata via dbUpload.put_run_info(), logging its duration."""
    logger.debug("Start Run info upload to db")
    uploader = dbUpload(runobj)
    elapsed = timeit.timeit(wrapper(uploader.put_run_info), number=1)
    logger.debug("put_run_info time = %s" % elapsed)
def file_to_db_upload_main(runobj, full_upload):
    """
    Run: pipeline dbUpload testing -c test/data/JJH_KCK_EQP_Bv6v4.ini -s file_to_db_upload -l debug

    For now upload only Illumina data to env454 from files, assuming that
    all run info is already on env454 (run, run_key, dataset, project,
    run_info_ill tables).
    Tables:
        sequence_ill
        sequence_pdr_info_ill
        taxonomy
        sequence_uniq_info_ill
    reset AUTO_INCREMENT

    runobj:      the pipeline run object handed to dbUpload.
    full_upload: when truthy, also upload the raw sequences per file.
    """
    whole_start = time.time()
    # my_file_to_db_upload = dbUpload(runobj, db_server="vamps2")
    # Narrowed from a bare except: only a missing attribute falls back to
    # the default database name; any other error must surface.
    try:
        db_name = runobj.database_name
    except AttributeError:
        db_name = "env454"
    my_file_to_db_upload = dbUpload(runobj, db_name = db_name)
    # filenames = my_file_to_db_upload.get_fasta_file_names()
    if not my_file_to_db_upload.filenames:
        err_msg = "\nThere is something wrong with fasta files or their names, please check pathes, contents and suffixes in %s." % my_file_to_db_upload.fasta_dir
        my_file_to_db_upload.all_errors.append(err_msg)
        logger.debug(err_msg)
    # sequences = get_sequences(my_file_to_db_upload, filenames)
    get_and_up_seq_time = time.time()
    total_time = 0
    no_run_info_list = []
    for filename in my_file_to_db_upload.filenames:
        sequences = my_file_to_db_upload.seq.prepare_fasta_dict(filename)
        if not sequences:
            err_msg = "There are 0 sequences in filename = %s" % filename
            logger.debug(err_msg)
            my_file_to_db_upload.all_errors.append(err_msg)
            continue
        if full_upload:
            file_to_db_upload_seq(my_file_to_db_upload, filename, sequences)
        # Built regardless of full_upload: file_to_db_upload_all_but_seq()
        # below needs the seq-id mapping.
        wrapped = wrapper(my_file_to_db_upload.seq.get_seq_id_dict, sequences)
        get_seq_id_dict_time = timeit.timeit(wrapped, number = 1)
        logger.debug("get_seq_id_dict() took %s sec to finish" % get_seq_id_dict_time)
        get_and_up_seq_time_end = time.time() - get_and_up_seq_time
        logger.debug("get_and_up_seq took %s s" % get_and_up_seq_time_end)
        start_c = time.time()
        total_time = total_time + file_to_db_upload_all_but_seq(my_file_to_db_upload, filename, no_run_info_list, full_upload)
        logger.debug("file_to_db_upload_all_but_seq() took %s sec to finish" % (time.time() - start_c))
    # doesn't work with mysql 5.6, not needed with no-gap auto_increment
    # my_file_to_db_upload.reset_auto_increment()
    seq_count_msg = my_file_to_db_upload.check_seq_upload()
    projects_and_ids = my_file_to_db_upload.get_projects_and_ids()
    utils = PipelneUtils()
    if db_name == 'vamps2':
        my_email = '*****@*****.**'
    else:
        my_email = '*****@*****.**'
    # Use db_name (not runobj.database_name) so the AttributeError fallback
    # above also covers the email body instead of raising here.
    # NOTE(review): runobj.database_host may be similarly absent — confirm.
    ready_email_body = """Uploaded to %s on %s\nIn this run %s: %s\n%s\n%s """ % (db_name, runobj.database_host, runobj.run, projects_and_ids, my_file_to_db_upload.equal_amnt_files_txt, seq_count_msg)
    my_file_to_db_upload.send_message(my_email, 'Projects uploaded to %s' % db_name, ready_email_body)
    if len(no_run_info_list) > 0:
        err_msg = "ERROR: There is no run info for %s yet, please check if it's in the csv and uploaded to the db" % ", ".join(no_run_info_list)
        utils.print_both(err_msg)
        my_file_to_db_upload.all_errors.append(err_msg)
    logger.debug("From file_to_db_upload_main. ready_email_body: ")
    logger.debug(ready_email_body)
    my_file_to_db_upload.all_errors.extend(my_file_to_db_upload.seq.seq_errors)
    if len(my_file_to_db_upload.all_errors) > 0:
        logger.debug('\n=====\nERRORS: \n' + ';\n'.join(my_file_to_db_upload.all_errors))
    logger.debug("total_time = %s" % total_time)
    whole_elapsed = time.time() - whole_start
    logger.debug("The whole upload took %s s" % whole_elapsed)
def run_info_upload(runobj):
    """Time dbUpload.put_run_info() with timeit and log the result."""
    logger.debug("Start Run info upload to db")
    csv_uploader = dbUpload(runobj)
    timed_call = wrapper(csv_uploader.put_run_info)
    duration = timeit.timeit(timed_call, number = 1)
    logger.debug("put_run_info time = %s" % duration)
def env454upload(runobj):
    """
    For now upload only Illumina data to env454 from files, assuming that
    all run info is already on env454 (run, run_key, dataset, project,
    run_info_ill tables).
    TODO: 2) Upload env454 data into raw, trim, gast etc tables from files
    """
    whole_start = time()
    # my_read_csv = readCSV(run)
    # my_read_csv.read_csv()
    my_env454upload = dbUpload(runobj)
    filenames = my_env454upload.get_fasta_file_names()
    seq_in_file = 0
    total_seq = 0
    for filename in filenames:
        try:
            logger.debug("\n----------------\nfilename = %s" % filename)
            fasta_file_path = filename
            # Dataset base name = last path component minus its trailing "-" suffix.
            filename_base = "-".join(filename.split("/")[-1].split("-")[:-1])
            run_info_ill_id = my_env454upload.get_run_info_ill_id(filename_base)
            gast_dict = my_env454upload.get_gasta_result(filename)
            read_fasta = u.ReadFasta(fasta_file_path)
            sequences = read_fasta.sequences
            if not (len(sequences)):
                continue  # empty fasta: nothing to upload for this file
            read_fasta.close()
            fasta = u.SequenceSource(fasta_file_path, lazy_init=False)
            # Per-file timing accumulators, logged below.
            insert_seq_time = 0
            get_seq_id_dict_time = 0
            insert_pdr_info_time = 0
            insert_taxonomy_time = 0
            insert_sequence_uniq_info_ill_time = 0
            start = time()
            my_env454upload.insert_seq(sequences)
            elapsed = time() - start
            insert_seq_time = elapsed
            logger.debug("seq_in_file = %s" % seq_in_file)
            logger.debug("insert_seq() took %s time to finish" % insert_seq_time)
            # print "insert_seq() took ", elapsed, " time to finish"
            start = time()
            my_env454upload.get_seq_id_dict(sequences)
            elapsed = time() - start
            get_seq_id_dict_time = elapsed
            logger.debug("get_seq_id_dict() took %s time to finish" % get_seq_id_dict_time)
            # One pass over the fasta: per-record inserts into the pdr-info,
            # taxonomy and uniq-info tables.
            while fasta.next():
                # sequence_ill_id = my_env454upload.get_sequence_id(fasta.seq)
                start = time()
                # print "Inserting pdr info"
                my_env454upload.insert_pdr_info(fasta, run_info_ill_id)
                elapsed = time() - start
                insert_pdr_info_time += elapsed
                # print "insert_pdr_info() took ", elapsed, " time to finish"
                start = time()
                # print "Inserting taxonomy"
                my_env454upload.insert_taxonomy(fasta, gast_dict)
                elapsed = time() - start
                insert_taxonomy_time += elapsed
                # print "tax_id = ", tax_id ,"; insert_taxonomy() took ", elapsed, " time to finish"
                # print "tax_id = ", tax_id
                start = time()
                # print "Inserting sequence_uniq_info_ill"
                my_env454upload.insert_sequence_uniq_info_ill(fasta, gast_dict)
                elapsed = time() - start
                insert_sequence_uniq_info_ill_time += elapsed
            # NOTE(review): placement after the while-loop reconstructed from
            # the collapsed source — fasta.total_seq is read once per file.
            seq_in_file = fasta.total_seq
            my_env454upload.put_seq_statistics_in_file(filename, fasta.total_seq)
            total_seq += seq_in_file
            logger.debug("insert_pdr_info() took %s time to finish" % insert_pdr_info_time)
            logger.debug("insert_taxonomy_time() took %s time to finish" % insert_taxonomy_time)
            logger.debug("insert_sequence_uniq_info_ill() took %s time to finish" % insert_sequence_uniq_info_ill_time)
        # except Exception, e: # catch all deriving from Exception (instance e)
        #     sys.stderr.write('\r[fastalib] Reading FASTA into memory: %s' % (self.fasta.pos))
        #     frameinfo = getframeinfo(currentframe())
        #     print frameinfo.filename, frameinfo.lineno
        #     print "\r[pipelineprocessor] Exception: ", e.__str__() # address the instance, print e.__str__()
        #     raise # re-throw caught exception
        except:  # catch everything — logged and re-raised, so failures still abort
            print "\r[pipelineprocessor] Unexpected:"  # handle unexpected exceptions
            print sys.exc_info()[0]  # info about curr exception (type,value,traceback)
            raise  # re-throw caught exception
    print "total_seq = %s" % total_seq
    my_env454upload.check_seq_upload()
    logger.debug("total_seq = %s" % total_seq)
    whole_elapsed = time() - whole_start
    print "The whole_upload took %s s" % whole_elapsed