def test_fasta_to_tab_delim(self):
    """make sure we can go from fasta to tab delim"""
    # NOTE: renamed the local from `input` (shadowed the builtin) to
    # `fasta_str`.  Each record is a split-library style header line
    # followed by one sequence line; "10 11 12" deliberately contains
    # spaces (its reported length below is 8).
    fasta_str = """>a RUN1 orig_bc=AAAA new_bc=AAAA bc_diffs=0
123123123
>d RUN1 orig_bc=AAAA new_bc=AAAA bc_diffs=0
atcasdad
>h RUN1 orig_bc=AAAA new_bc=AAAA bc_diffs=0
10 11 12"""
    # Expected columns: seq_run_id, split_lib_run_id, sample id, read id,
    # run tag, orig barcode, new barcode, barcode diffs, sequence length,
    # sequence md5, sequence string.
    exp = ['1\t1\ta\ta\tRUN1\tAAAA\tAAAA\t0\t9\tf5bb0c8de146c67b44babbf4e6584cc0\t123123123',
           '1\t1\td\td\tRUN1\tAAAA\tAAAA\t0\t8\t1fae8caaf715bdc710b99e8c3e843092\tatcasdad',
           '1\t1\th\th\tRUN1\tAAAA\tAAAA\t0\t8\tb4c2a347f5d0453c4fdae6d5c7b5bc78\t10 11 12']
    obs = list(fasta_to_tab_delim(fasta_str.splitlines(), 1, 1))
    self.assertEqual(obs, exp)
def load_split_lib_sequences(data_access,input_dir,analysis_id, seq_run_id, split_lib_input_md5sum): """ This function loads the split-library seqs into DB """ # define the split library file paths using the original fasta input # directory split_lib_seqs = join(input_dir, 'split_libraries', 'seqs.fna') split_lib_hist = join(input_dir, 'split_libraries', 'histograms.txt') split_lib_log = join(input_dir, 'split_libraries', 'split_library_log.txt') # this needs to be a try/except since FASTA files does not have these files try: split_hist_str = open(split_lib_hist).read() split_log_str = open(split_lib_log).read() except IOError: split_hist_str=None split_log_str=None # read in the workflow log file and determine timestamp and svn version of # Qiime used for the analysis svn_version = '1418' # This is temporarily defined, however will use script to dtermine this value run_date=datetime.now().strftime("%d/%m/%Y/%H/%M/%S") print run_date # get the log file data full_log_fp = glob(join(input_dir, 'log*.txt'))[0] full_log_str = open(full_log_fp, 'U').read() log_str = open(full_log_fp, 'U').readlines() split_lib_cmd="Split-libraries was not run due to this being a FASTA-file" pick_otus_cmd='' # from the workflow log file get the split-library and pick-otus cmds for substr in log_str: if 'split_libraries_fastq.py' in substr: split_lib_cmd=substr elif 'parallel_pick_otus_uclust_ref.py' in substr: pick_otus_cmd=substr elif 'split_libraries.py' in substr: split_lib_cmd=substr elif 'pick_otus.py' in substr: pick_otus_cmd=substr # Insert the split-library log information in the DB valid,split_library_run_id=data_access.loadSplitLibInfo(True,analysis_id,\ run_date, split_lib_cmd,\ svn_version, split_log_str, \ split_hist_str, split_lib_input_md5sum) print "Split-Lib ID: %s" % split_library_run_id if not valid: raise ValueError,'Error: Unable to load split-library info to database server!' print "Finished loading the split-library log information!" 
# process and load_fna_data print "starting new fna load" start = time.time() ''' The output values and types for each value are as follows: 0: sequence run id (integer) 1: sample id (text) 2: barcode read group tag (text) 3: read id (text) 4: original barcode (text) 5: new barcode (text) 6: number of barcode diffs (integer) 7: sequence length (integer) 8: sequence md5 hash (text) 9: sequence string (text) ''' # define the data types for oracle types = ['i','i', 's', 's', 's', 's', 's', 'i', 'i', 'fc', 's'] con = data_access.getSFFDatabaseConnection() cur = con.cursor() ### this allows for rebuilding indices but shouldn't be necessary #print 'Rebuilding PK_SPLIT_LIBRARY_READ_MAP...' #cur.execute('alter index "SFF"."PK_SPLIT_LIBRARY_READ_MAP" rebuild ') #cur = con.cursor() open_fasta = open(split_lib_seqs) iterator=0 # using the generator, load the sequences for res in input_set_generator(fasta_to_tab_delim(open_fasta, seq_run_id,\ split_library_run_id), cur, types,\ buffer_size=500): #print str(res) print 'running %i' % (iterator) iterator=iterator+1 valid = data_access.loadFNAFile(True, res) if not valid: raise ValueError, 'Error: Unable to load FNA file into database!' open_fasta.close() end = time.time() print 'Total processor time elapsed: %s' % str(end - start) print 'Finished loading split_library FNA file.' 
try: ### MOVING THIS INTO SEQUENCE LOADING SINCE RELIES ON SPLIT_LIBRARY_READ_MAP # try/except necessary since some datasets are metagenomes, # which do not have OTU failures # Get otu_picking_run_id con = data_access.getSFFDatabaseConnection() cur = con.cursor() statement='select otu_picking_run_id from analysis where analysis_id=%s' % (str(analysis_id)) results = cur.execute(statement) for i in results: otu_picking_run_id=i[0] # get the otu-picking failures file pick_otus_failures = join(input_dir, 'gg_97_otus', 'all_failures.txt') lines = open(pick_otus_failures,'U') otu_failures = [] for line in lines: otu_failures.append('%s\t%s'% (line.strip('\n'),str(otu_picking_run_id))) # define oracle data types types=['s','i'] con=data_access.getSFFDatabaseConnection() cur = con.cursor() set_count = 1 # iterate over OTU failures and load them for input_set in input_set_generator(otu_failures, cur, types, buffer_size=10000): valid = data_access.loadOTUFailuresAll(True, input_set) if not valid: raise ValueError, 'Error: Unable to load OTU failures data into database!' print "loading OTU failure set: %s" % set_count set_count += 1 print 'Successfully loaded the OTU failures into the database!' except: print "Unable to load OTU failures!" print 'End of function' return split_library_run_id