    def test_run_process_sff_gz_through_split_lib_FLX(self):
        """run_process_sff_through_split_lib runs without error: convert to FLX"""

        # remove generated mapping file
        moved_mapping_file = join(self.wf_out, split(self.fasting_mapping_fp)[-1])
        self.files_to_remove.append(moved_mapping_file)

        run_process_sff_through_split_lib(
            0,
            "Fasting_subset",
            sff_input_fp=self.sff_gz_fp,
            mapping_fp=self.fasting_mapping_fp,
            output_dir=self.wf_out,
            command_handler=call_commands_serially,
            params=self.params,
            qiime_config=self.qiime_config,
            convert_to_flx=True,
            write_to_all_fasta=False,
            status_update_callback=no_status_updates,
        )

        # get the file basename
        input_file_basename = splitext(splitext(split(self.sff_fp)[1])[0])[0]

        # get the split-library sequence fpath
        split_lib_seqs_fp = join(self.wf_out, "split_libraries", "seqs.fna")

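        # FLX conversion should leave these _FLX-suffixed files, plus the
        # renamed mapping file, directly in the workflow output directory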
        sff_fp = join(self.wf_out, "Fasting_subset_FLX.sff")
        sff_seqs_fp = join(self.wf_out, "Fasting_subset_FLX.fna")
        sff_qual_fp = join(self.wf_out, "Fasting_subset_FLX.qual")
        sff_flow_fp = join(self.wf_out, "Fasting_subset_FLX.txt")
        new_map_fp = join(self.wf_out, "Fasting_subset_mapping.txt")

        # define files to remove
        self.files_to_remove.append(sff_fp)
        self.files_to_remove.append(sff_seqs_fp)
        self.files_to_remove.append(sff_qual_fp)
        self.files_to_remove.append(sff_flow_fp)

        # get the head of files
        split_lib_head = get_top_fastq_two_lines(open(split_lib_seqs_fp, "U"))
        raw_seq_head = get_top_fastq_two_lines(open(sff_seqs_fp, "U"))
        raw_qual_head = get_top_fastq_two_lines(open(sff_qual_fp, "U"))
        raw_flow_head = get_top_fastq_two_lines(open(sff_flow_fp, "U"))

        # check results
        self.assertEqual("".join(split_lib_head), exp_FLX_split_lib_head)
        self.assertEqual("".join(raw_seq_head), exp_FLX_raw_seq_head)
        self.assertEqual("".join(raw_qual_head), exp_FLX_raw_qual_head)
        self.assertEqual("".join(raw_flow_head), exp_Ti_raw_flow_head)

        # Check that the log file is created and has size > 0
        log_fp = glob(join(self.wf_out, "log*.txt"))[0]
        self.assertTrue(getsize(log_fp) > 0)

    def test_make_study_sffs(self):
        """Runs the original SFF through split-lib, then runs the filtered one
        through split-lib and verifies the outputs are the same."""

        # run the original SFF through split-lib
        run_process_sff_through_split_lib(0, 'Fasting_subset',
            sff_input_fp=self.sff_fp,
            mapping_fp=self.fasting_mapping_fp,
            output_dir=self.wf_out,
            command_handler=call_commands_serially,
            params=self.params,
            qiime_config=self.qiime_config,
            convert_to_flx=False,
            write_to_all_fasta=False,
            status_update_callback=no_status_updates)
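
        # create a per-study SFF: make_study_sffs should write the filtered
        # SFF and a remapped mapping file under per_library_output, with the
        # study ID (609 for the test study) embedded in the file names below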
        per_library_output = join(self.wf_out, 'per_study_sff')
        create_dir(per_library_output)
        make_study_sffs(per_library_output, 609)

        # run the new SFF through split-lib
        new_map_fp = join(per_library_output,
                          'study_609_run_Fasting_subset_mapping.txt')
        new_sff_fp = join(per_library_output, 'filtered_sffs',
                          'study_609_Fasting_subset.sff')

        new_processed_output = join(self.wf_out, 'filtered_wf_da')
        create_dir(new_processed_output)

        run_process_sff_through_split_lib(0, 'Fasting_subset',
            sff_input_fp=new_sff_fp,
            mapping_fp=new_map_fp,
            output_dir=new_processed_output,
            command_handler=call_commands_serially,
            params=self.params,
            qiime_config=self.qiime_config,
            convert_to_flx=False,
            write_to_all_fasta=False,
            status_update_callback=no_status_updates)
      
        # parse the split-library fasta files
        old_split_lib_seqs = MinimalFastaParser(open(join(self.wf_out,
                                'split_libraries', 'seqs.fna'), 'U'))
        new_split_lib_seqs = MinimalFastaParser(open(join(new_processed_output,
                                'split_libraries', 'seqs.fna'), 'U'))
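
        # The two seqs.fna files should contain the same reads, but their
        # headers differ in bookkeeping: split_libraries appends a per-read
        # counter to the sample ID (e.g. '_1'), and the per-study file also
        # carries a dotted prefix (presumably the study ID). Both are
        # stripped below so reads can be matched by key.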
     
        # convert the original split-lib fasta into a dict for easy lookup,
        # dropping the per-read counter from the first header field
        old_seqs = {}
        for key, val in old_split_lib_seqs:
            new_key_head = key.split(' ')
            new_key_head[0] = new_key_head[0].split('_')[0]
            old_seqs[' '.join(new_key_head)] = val

        # convert the new split-lib fasta into a dict for easy lookup,
        # additionally dropping the dotted prefix
        new_seqs = {}
        for key, val in new_split_lib_seqs:
            new_key_head = key.split(' ')
            new_key_head[0] = new_key_head[0].split('_')[0].split('.')[1]
            new_seqs[' '.join(new_key_head)] = val

        # check that the seqs and headers match
        for key in old_seqs:
            self.assertEqual(old_seqs[key], new_seqs[key])
     
        # check that the output histograms are equal
        old_histogram = open(join(self.wf_out, 'split_libraries',
                                  'histograms.txt')).readlines()
        new_histogram = open(join(new_processed_output, 'split_libraries',
                                  'histograms.txt')).readlines()
        self.assertEqual(old_histogram, new_histogram)

        # check that the output log files are equal; only the first 23 lines
        # are compared, since the trailing lines presumably differ in
        # run-specific details
        old_log = open(join(self.wf_out, 'split_libraries',
                            'split_library_log.txt')).readlines()
        new_log = open(join(new_processed_output, 'split_libraries',
                            'split_library_log.txt')).readlines()
        self.assertEqual(old_log[:23], new_log[:23])

    def test_submit_processed_data_to_db(self):
        """run_process_sff_through_pick_otus runs without error"""
        
        self.files_to_remove.append(join(self.wf_out,'Fasting_subset.fna'))
        self.files_to_remove.append(join(self.wf_out,'Fasting_subset.qual'))
        self.files_to_remove.append(join(self.wf_out,'Fasting_subset.txt'))
        
        # remove generated mapping file
        moved_mapping_file=join(self.wf_out,split(self.fasting_mapping_fp)[-1])
        self.files_to_remove.append(moved_mapping_file)
        
        # process the sequence data first before loading
        run_process_sff_through_split_lib(0, 'Fasting_subset',
            sff_input_fp=self.sff_fp,
            mapping_fp=self.fasting_mapping_fp,
            output_dir=self.wf_out,
            command_handler=call_commands_serially,
            params=self.params,
            qiime_config=self.qiime_config,
            convert_to_flx=False,
            write_to_all_fasta=False,
            status_update_callback=no_status_updates)
        
        # get the file basename
        input_file_basename = splitext(split(self.sff_fp)[1])[0]
        
        # get key filepaths
        otu_fp = join(self.wf_out,'picked_otus','seqs_otus.txt')
        split_lib_seqs_fp = join(self.wf_out,'split_libraries','seqs.fna')
        
        # run chained OTU-picking
        run_chain_pick_otus(split_lib_seqs_fp,
                            output_dir=self.gg_out,
                            command_handler=call_commands_serially,
                            params=self.params,
                            qiime_config=self.qiime_config,
                            parallel=False,
                            status_update_callback=no_status_updates)
                                 
        input_fname = splitext(split(self.sff_fp)[-1])[0]
        db_input_fp = join(self.wf_out,input_fname)

        # submit the data
        analysis_id, input_dir, seq_run_id, split_lib_input_md5sum = \
            submit_sff_and_split_lib(data_access,db_input_fp+'.fna',
                                     self.study_id)
        # load the OTU mapping
        load_otu_mapping(data_access,self.wf_out,analysis_id)
        
        # load split-lib sequences
        split_library_id=load_split_lib_sequences(data_access,input_dir,
                                                analysis_id, seq_run_id,
                                                split_lib_input_md5sum)
        
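        # The loads are verified below by querying the tables directly with
        # raw SQL, keeping the checks independent of the data_access helper
        # methods used for loading.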
        # TEST raw sequence data load
        # expected results
        print 'Analysis ID is: %s' % str(analysis_id)
        print 'Testing the FLOW_DATA loading!'
        exp_sff_md5='314f4000857668d45a413d2e94a755fc'
        exp_num_seqs=22
        exp_read_id='FLP3FBN01ELBSX'
        exp_instr_code='GS FLX'
        exp_sff_fname='Fasting_subset'
        
        # define the query to pull data from DB
        con = data_access.getSFFDatabaseConnection()
        cur = con.cursor()
        seq_run_info="""select j.seq_run_id,f.sff_filename,f.number_of_reads,f.md5_checksum,
              h.instrument_code
              from analysis j
              inner join seq_run_to_sff_file s on j.seq_run_id=s.seq_run_id
              inner join sff_file f on f.sff_file_id=s.sff_file_id
              inner join split_library_read_map slrm on j.seq_run_id=slrm.seq_run_id
              inner join sequencing_run h on h.seq_run_id=s.seq_run_id"""
        seq_run_info+=" where j.analysis_id=%s and slrm.sequence_name=\'test.PCx634_1\'" % (str(analysis_id))
        results = cur.execute(seq_run_info)
        
        # get observed values
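        # (each query should match exactly one row; if several came back,
        # this loop and the similar ones below would keep only the last row)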
        for data in results:
            obs_seq_run_id,obs_sff_filename,obs_num_of_reads,obs_sff_md5,\
            obs_instrument_code = data
        
        # check results
        self.assertEqual(obs_sff_filename,exp_sff_fname)
        self.assertEqual(obs_num_of_reads,exp_num_seqs)
        self.assertEqual(obs_sff_md5,exp_sff_md5)
        self.assertEqual(obs_instrument_code,exp_instr_code)
        
        # TEST split-library data load
        # expected results
        print 'Testing Split-Library Data'
        exp_split_lib_seq='CTGGGCCGTGTCTCAGTCCCAATGTGGCCGTTTACCCTCTCAGGCCGGCTACGCATCATCGCCTTGGTGGGCCGTTACCTCACCAACTAGCTAATGCGCCGCAGGTCCATCCATGTTCACGCCTTGATGGGCGCTTTAATATACTGAGCATGCGCTCTGTATACCTATCCGGTTTTAGCTACCGTTTCCAGCAGTTATCCCGGACACATGGGCTAGG'
        exp_split_lib_md5='2c67e0acf745bef73e26c36f0b3bd00a'
        exp_split_lib_seq_md5='008918f7469f8e33d5dd6e01075d5194'

        # define the query to pull data from DB
        split_lib_info="""select distinct j.seq_run_id,slrm.ssu_sequence_id,l.command,l.md5_checksum,
              s.sequence_string,s.md5_checksum
              from analysis j
              inner join split_library_read_map slrm on j.seq_run_id=slrm.seq_run_id and j.split_library_run_id=slrm.split_library_run_id
              inner join ssu_sequence s on slrm.ssu_sequence_id=s.ssu_sequence_id
              inner join split_library_run l on j.split_library_run_id=l.split_library_run_id"""
        split_lib_info+=" where j.analysis_id=%s and slrm.sequence_name=\'test.PCx634_1\'" % (str(analysis_id))
        results = cur.execute(split_lib_info)
        
        # get observed values
        for data in results:
            obs_seq_run_id,obs_ssu_seq_id,obs_split_lib_cmd,obs_split_lib_md5,\
            obs_split_lib_seq,obs_split_lib_seq_md5 = data
        
        # check results                                            
        self.assertEqual(obs_split_lib_md5,exp_split_lib_md5)
        self.assertEqual(obs_split_lib_seq,exp_split_lib_seq)
        self.assertEqual(obs_split_lib_seq_md5,exp_split_lib_seq_md5)
        
        # TEST OTU-table data load
        # expected results
        print 'Testing OTU Data!'
        exp_otu_md5='0b8edcf8a4275730001877496b41cf55'
        exp_threshold=97
        
        # define the query to pull data from DB
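        # the query walks analysis -> split_library_read_map -> otu_table ->
        # gg_plus_denovo_reference -> otu_picking_run, confirming the OTU rows
        # link back to both a reference sequence and the picking run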
        otu_info="""select distinct j.seq_run_id,slrm.ssu_sequence_id,ot.reference_id,gr.ssu_sequence_id,
            ot.reference_id,j.otu_picking_run_id,p.command,p.md5_sum_input_file,
            p.threshold
            from analysis j
            inner join split_library_read_map slrm on j.seq_run_id=slrm.seq_run_id and j.split_library_run_id=slrm.split_library_run_id
            inner join otu_table ot on j.otu_run_set_id=ot.otu_run_set_id
            inner join gg_plus_denovo_reference gr on ot.reference_id=gr.reference_id
            inner join otu_picking_run p on j.otu_picking_run_id=p.otu_picking_run_id"""
        otu_info+=" where j.analysis_id=%s and slrm.sequence_name=\'test.PCx634_2\'" % (str(analysis_id))
        results = cur.execute(otu_info)
        
        # get observed values
        for data in results:
            obs_seq_run_id,obs_ssu_seq_id,obs_otu_id,obs_otu_ssu_id,\
            obs_prokmsa,obs_otu_picking_run_id,obs_pick_otu_cmd,\
            obs_otu_md5,obs_threshold = data
        
        # check results  
        self.assertEqual(obs_otu_md5,exp_otu_md5)
        self.assertEqual(obs_threshold,exp_threshold)
        
        # TEST OTU-failures data load
        # define the query to pull data from DB
        otu_fail_info="""select distinct j.seq_run_id,f.ssu_sequence_id
              from analysis j
              inner join split_library_read_map slrm on j.seq_run_id=slrm.seq_run_id
              inner join otu_picking_failures f on slrm.ssu_sequence_id=f.ssu_sequence_id"""
        otu_fail_info+=" where j.analysis_id=%s and slrm.sequence_name=\'test.PCx634_14\'" % (str(analysis_id))
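        # test.PCx634_14 should be a read that failed OTU picking, so the
        # query is expected to return a row with nonzero IDs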
        results = cur.execute(otu_fail_info)
        
        # get observed values
        for data in results:
            obs_seq_run_id,obs_ssu_id= data
        
        # check results  
        self.assertNotEqual(obs_seq_run_id,0)
        self.assertNotEqual(obs_ssu_id,0)
        
        # delete the loaded data
        valid=data_access.deleteTestAnalysis(True,analysis_id)
        if not valid:
            print "Error: Could not delete data from DB!"