def test_find_new_analysis_seeds2(self):
        '''
        Check find_new_analysis_seeds when the project list file names projectA

        Writes 'projectA' to the project list file, runs
        find_new_analysis_seeds for the PrimaryAnalysis pipeline and expects
        no available experiments, projectA among the seeded ones and exactly
        one SEEDED experiment entry (sampleA_MISEQ) in the pipeline seed table.
        '''
        project_name_file = os.path.join(self.temp_dir,
                                         'project_name_list.txt')
        with open(project_name_file, 'w') as fp:
            fp.write('projectA')

        available_exps, seeded_exps = \
          find_new_analysis_seeds(
            dbconfig_path=self.dbconfig,
            pipeline_name='PrimaryAnalysis',
            project_name_file=project_name_file,
            species_name_list=['HG38'],
            fastq_type='demultiplexed_fastq',
            library_source_list=['TRANSCRIPTOMIC_SINGLE_CELL']
          )
        self.assertIsNone(available_exps)
        self.assertIn('projectA', seeded_exps)

        pla = PipelineAdaptor(**{'session_class': self.session_class})
        pla.start_session()
        try:
            _, exp_data = pla.fetch_pipeline_seed_with_table_data(
                            pipeline_name='PrimaryAnalysis',
                            table_name='experiment',
                            status='SEEDED')
        finally:
            pla.close_session()                                                 # release the session even if the fetch fails
        exp_records = exp_data.to_dict(orient='records')
        # assertEqual, not assertTrue: assertTrue(len(x), 1) uses 1 as the
        # failure message and passes for any non-empty list
        self.assertEqual(len(exp_records), 1)
        self.assertEqual(exp_records[0]['experiment_igf_id'], 'sampleA_MISEQ')
    def test_find_new_analysis_seeds1(self):
        '''
        Check find_new_analysis_seeds when the project list file is empty

        With an empty project list, projectA is expected among the available
        experiments, nothing is expected to be seeded and the pipeline seed
        table should hold no SEEDED experiment entries.
        '''
        seed_list_path = os.path.join(self.temp_dir, 'project_name_list.txt')
        with open(seed_list_path, 'w') as handle:
            handle.write('')                                                    # empty file: no project selected for seeding

        seed_args = dict(
            dbconfig_path=self.dbconfig,
            pipeline_name='PrimaryAnalysis',
            project_name_file=seed_list_path,
            species_name_list=['HG38'],
            fastq_type='demultiplexed_fastq',
            library_source_list=['TRANSCRIPTOMIC_SINGLE_CELL'],
        )
        available_exps, seeded_exps = find_new_analysis_seeds(**seed_args)
        self.assertTrue('projectA' in available_exps)
        self.assertTrue(seeded_exps is None)

        adaptor = PipelineAdaptor(session_class=self.session_class)
        adaptor.start_session()
        seeded_data, _ = adaptor.fetch_pipeline_seed_with_table_data(
            pipeline_name='PrimaryAnalysis',
            table_name='experiment',
            status='SEEDED')
        adaptor.close_session()
        self.assertEqual(len(seeded_data.index), 0)
 def test_load_new_pipeline_data(self):
     '''
     Load the pipeline data file and check that the demultiplexing_fastq
     pipeline record can be fetched back by name
     '''
     load_new_pipeline_data(dbconfig=self.dbconfig,
                            data_file=self.data_file)
     adaptor = PipelineAdaptor(session_class=self.session_class)
     adaptor.start_session()
     pipeline_record = adaptor.fetch_pipeline_records_pipeline_name(
         pipeline_name='demultiplexing_fastq')
     adaptor.close_session()
     self.assertEqual(pipeline_record.pipeline_name, 'demultiplexing_fastq')
示例#4
0
 def test_fetch_pipeline_seed_with_table_data(self):
     '''
     Fetch seeds of the demultiplexing_fastq pipeline and check that the
     table data has one record per seed and a seqrun_igf_id column
     '''
     adaptor = PipelineAdaptor(session_class=self.session_class)
     adaptor.start_session()
     seed_frame, table_frame = adaptor.fetch_pipeline_seed_with_table_data(
         pipeline_name='demultiplexing_fastq')
     adaptor.close_session()
     table_records = table_frame.to_dict(orient='records')
     self.assertIsInstance(table_records, list)
     seed_records = seed_frame.to_dict(orient='records')
     self.assertEqual(len(table_records), len(seed_records))
     column_names = list(table_frame.columns)
     self.assertTrue('seqrun_igf_id' in column_names)
示例#5
0
 def test_seed_new_experiments(self):
     '''
     Seed new experiments for PrimaryAnalysis without a project filter and
     expect exactly one entry, IGFQ000123_avik_10-4-2018_Miseq, in the
     first returned value
     '''
     pl = PipelineAdaptor(**{'session_class': self.session_class})
     pl.start_session()
     try:
         new_exps, _ = \
           pl.seed_new_experiments(
             pipeline_name='PrimaryAnalysis',
             species_name_list=['HG38'],
             fastq_type='demultiplexed_fastq',
           )
     finally:
         pl.close_session()     # the session was previously left open
     self.assertEqual(len(new_exps), 1)
     self.assertEqual(new_exps[0], 'IGFQ000123_avik_10-4-2018_Miseq')
示例#6
0
 def test_create_pipeline_seed(self):
     '''
     create_pipeline_seed must raise ValueError for a seed entry that only
     carries seed_id and seed_table
     '''
     # NOTE(review): presumably rejected because pipeline_name is missing -
     # confirm against PipelineAdaptor.create_pipeline_seed
     incomplete_seeds = [dict(seed_id='1', seed_table='seqrun')]
     adaptor = PipelineAdaptor(session_class=self.session_class)
     adaptor.start_session()
     with self.assertRaises(ValueError):
         adaptor.create_pipeline_seed(data=incomplete_seeds)
     adaptor.close_session()
def load_new_pipeline_data(data_file, dbconfig):
    '''
    A method for loading new data for pipeline table

    :param data_file: A file with the pipeline records, parsed by read_json_data
    :param dbconfig: A database configuration file, parsed by read_dbconf_json
    :returns: None
    '''
    formatted_data = read_json_data(data_file)
    dbparam = read_dbconf_json(dbconfig)
    pp = PipelineAdaptor(**dbparam)
    pp.start_session()
    try:
        pp.store_pipeline_data(data=formatted_data)
    finally:
        pp.close_session()  # always release the session, also when storing fails
示例#8
0
 def test_fetch_pipeline_seed_with_table_data(self):
     '''
     Fetch seeds of the alignment pipeline joined to the experiment table
     and check record counts plus the ids of the first experiment record
     '''
     adaptor = PipelineAdaptor(session_class=self.session_class)
     adaptor.start_session()
     seed_frame, table_frame = adaptor.fetch_pipeline_seed_with_table_data(
         table_name='experiment', pipeline_name='alignment')
     adaptor.close_session()
     table_records = table_frame.to_dict(orient='records')
     self.assertIsInstance(table_records, list)
     seed_records = seed_frame.to_dict(orient='records')
     self.assertEqual(len(table_records), len(seed_records))
     first_record = table_records[0]
     exp_id = first_record['experiment_igf_id']
     project_id = first_record['project_igf_id']
     self.assertEqual(exp_id, 'IGF00001_HISEQ4000')
     self.assertEqual(project_id, 'IGFP0001_test_22-8-2017_rna_sc')
     column_names = list(table_frame.columns)
     self.assertTrue('experiment_igf_id' in column_names)
def find_new_analysis_seeds(dbconfig_path, pipeline_name, project_name_file,
                            species_name_list, fastq_type,
                            library_source_list):
    '''
    A utils method for finding and seeding new experiments for analysis

    :param dbconfig_path: A database configuration file
    :param pipeline_name: Pipeline name
    :param project_name_file: A file containing the list of projects for seeding pipeline,
                              one project name per line; blank lines are ignored and a
                              file without any name means no project filter (None)
    :param species_name_list: A list of species to consider for seeding analysis
    :param fastq_type: Fastq collection type
    :param library_source_list: A list of library source info to consider for seeding analysis
    :returns: The two values returned by PipelineAdaptor.seed_new_experiments:
              available experiments or None and seeded experiments or None
    :raises IOError: If project_name_file does not exist
    '''
    if not os.path.exists(project_name_file):
        raise IOError('File {0} not found'.format(project_name_file))

    with open(project_name_file, 'r') as fp:
        stripped_lines = (line.strip() for line in fp)
        # skip blank lines so that an empty or trailing line is never
        # passed on as an empty project name
        project_list = [name for name in stripped_lines if name]
    if not project_list:
        project_list = None

    dbparam = read_dbconf_json(dbconfig_path)
    pl = PipelineAdaptor(**dbparam)
    pl.start_session()
    try:
        available_exps, seeded_exps = \
          pl.seed_new_experiments(
            pipeline_name=pipeline_name,
            species_name_list=species_name_list,
            fastq_type=fastq_type,
            project_list=project_list,
            library_source_list=library_source_list
          )
    finally:
        pl.close_session()  # release the session even if seeding fails
    return available_exps, seeded_exps
示例#10
0
 def test_update_pipeline_seed(self):
     '''
     Exercise update_pipeline_seed on demultiplexing_fastq seqrun seeds

     An update without a status must raise ValueError; updates carrying
     status RUNNING must go through and seed 1 must afterwards be returned
     with status RUNNING.
     '''
     adaptor = PipelineAdaptor(session_class=self.session_class)
     adaptor.start_session()

     # update without a status entry is rejected
     seeds_without_status = [
         dict(pipeline_name='demultiplexing_fastq',
              seed_id='2',
              seed_table='seqrun'),
     ]
     with self.assertRaises(ValueError):
         adaptor.update_pipeline_seed(data=seeds_without_status)

     # seed 2 set to RUNNING; default fetch must stay consistent
     running_seed_two = [
         dict(pipeline_name='demultiplexing_fastq',
              seed_id='2',
              seed_table='seqrun',
              status='RUNNING'),
     ]
     adaptor.update_pipeline_seed(data=running_seed_two)
     seed_frame, table_frame = adaptor.fetch_pipeline_seed_with_table_data(
         pipeline_name='demultiplexing_fastq')
     table_count = len(table_frame.to_dict(orient='records'))
     seed_count = len(seed_frame.to_dict(orient='records'))
     self.assertEqual(table_count, seed_count)

     # seed 1 set to RUNNING and looked up among the RUNNING seeds
     running_seed_one = [
         dict(pipeline_name='demultiplexing_fastq',
              seed_id='1',
              seed_table='seqrun',
              status='RUNNING'),
     ]
     adaptor.update_pipeline_seed(data=running_seed_one)
     running_frame, _ = adaptor.fetch_pipeline_seed_with_table_data(
         pipeline_name='demultiplexing_fastq', status='RUNNING')
     adaptor.close_session()
     seed_one_rows = running_frame.loc[running_frame.seed_id == 1]
     self.assertEqual(seed_one_rows['status'].values[0], 'RUNNING')
示例#11
0
 def test_seed_new_experiments1(self):
     '''
     Seed PrimaryAnalysis for one listed project and library source

     The first returned value is expected to be empty, and the pipeline
     seed table must then hold exactly one SEEDED experiment,
     IGF103923_MISEQ.
     '''
     pl = PipelineAdaptor(**{'session_class': self.session_class})
     pl.start_session()
     try:
         new_exps, _ = \
           pl.seed_new_experiments(
             pipeline_name='PrimaryAnalysis',
             species_name_list=['HG38'],
             fastq_type='demultiplexed_fastq',
             project_list=['IGFQ000123_avik_10-4-2018_Miseq'],
             library_source_list=['TRANSCRIPTOMIC_SINGLE_CELL']
           )
     finally:
         pl.close_session()
     self.assertFalse(new_exps)

     pl = PipelineAdaptor(**{'session_class': self.session_class})
     pl.start_session()
     try:
         (_, exp_data) = pl.fetch_pipeline_seed_with_table_data(
                           pipeline_name='PrimaryAnalysis',
                           table_name='experiment',
                           status='SEEDED')
     finally:
         pl.close_session()     # the second session was previously left open
     seeded_ids = list(exp_data['experiment_igf_id'].values)
     self.assertEqual(len(seeded_ids), 1)
     self.assertEqual(seeded_ids[0], 'IGF103923_MISEQ')
  def run(self):
    '''
    Run method for changing the status of a single pipeline seed

    :param igf_session_class: A database session class
    :param pipeline_name: Name of the pipeline
    :param igf_id: An id used only for the error message
    :param task_id: Asana task name which receives the status comment
    :param seed_id: Seed id, converted to int before the update
    :param seed_table: Seed table name
    :param new_status: New status, upper-cased before the update
    '''
    igf_id = None                                                               # pre-bound: the error handler formats it even if a required param is missing
    try:
      igf_session_class = self.param_required('igf_session_class')              # set by base class
      pipeline_name = self.param_required('pipeline_name')
      igf_id = self.param_required('igf_id')
      task_id = self.param_required('task_id')
      seed_id = self.param_required('seed_id')
      seed_table = self.param_required('seed_table')
      new_status = self.param_required('new_status')

      pa = PipelineAdaptor(**{'session_class':igf_session_class})
      pa.start_session()                                                        # connect to db
      try:
        pa.update_pipeline_seed(\
          data=[{'pipeline_name':pipeline_name,
                 'seed_id':int(seed_id),
                 'seed_table':seed_table,
                 'status':new_status.upper()}])                                 # update seed record in db
      finally:
        pa.close_session()                                                      # close db connection, also after a failed update
      message = \
        'changing status in {0} for seed {1} as {2}'.\
        format(\
          pipeline_name,
          seed_id,
          new_status.upper())                                                   # format message
      self.post_message_to_slack(message, reaction='pass')                      # send message to slack
      self.comment_asana_task(task_name=task_id, comment=message)               # send message to asana
    except Exception as e:
      message = \
        'seqrun: {2}, Error in {0}: {1}'.\
          format(\
            self.__class__.__name__,
            e,
            igf_id)
      self.warning(message)
      self.post_message_to_slack(message,reaction='fail')                       # post msg to slack for failed jobs
      raise                                                                     # re-raise the original error
示例#13
0
 def test_fetch_pipeline_records_pipeline_name(self):
     '''
     Fetch the demultiplexing_fastq pipeline record by name and check that
     its pipeline_id is 1
     '''
     pl = PipelineAdaptor(**{'session_class': self.session_class})
     pl.start_session()
     try:
         pl_data = pl.fetch_pipeline_records_pipeline_name(
             pipeline_name='demultiplexing_fastq')
         self.assertEqual(pl_data.pipeline_id, 1)
     finally:
         pl.close_session()     # the session was previously left open
示例#14
0
  def run(self):
    '''
    Run method for the seed job factory class of the all pipelines

    Fetches the pipeline seeds via get_pipeline_seeds, stores the seed
    records as list of dictionaries in the sub_tasks param and switches the
    status of the fetched pipeline seeds from seeded to running.

    :param igf_session_class: A database session class
    :param pipeline_name: Name of the pipeline
    :param seed_id_label: A text label for the seed_id, default seed_id
    :param seqrun_id_label: A text for seqrun_id column name, default seqrun_id
    :param seqrun_date_label: A text label for the seqrun date, default seqrun_date
    :param seqrun_igf_id_label: A text label for sequencing run igf id, default seqrun_igf_id
    :param seeded_label: A text label for the status seeded in pipeline_seed table, default SEEDED
    :param running_label: A text label for the status running in the pipeline_seed table, default RUNNING
    :param seed_status_label: A text label for the pipeline_seed status column name, default status
    :param experiment_id_label: A text label for the experiment_id, default experiment_id
    :param pipeseed_mode: A text label for pipeline mode, default demultiplexing
                          Allowed values are

                             demultiplexing
                             alignment

    :returns: None, the seed records are passed on via the sub_tasks param
    '''
    dbconnected=False
    pipeline_name=None                                                          # pre-bound: the error handler formats it even if a required param is missing
    try:
      igf_session_class = self.param_required('igf_session_class')              # set by base class
      pipeline_name = self.param_required('pipeline_name')
      # NOTE(review): the *_id/_date label params below are required but are
      # not forwarded to get_pipeline_seeds, which uses its own defaults -
      # confirm whether they should be passed through
      seed_id_label = self.param_required('seed_id_label')
      seqrun_id_label = self.param_required('seqrun_id_label')
      seeded_label=self.param_required('seeded_label')
      running_label=self.param_required('running_label')
      seqrun_date_label=self.param_required('seqrun_date_label')
      seqrun_igf_id_label=self.param_required('seqrun_igf_id_label')
      seed_status_label=self.param_required('seed_status_label')
      experiment_id_label = self.param_required('experiment_id_label')
      pipeseed_mode=self.param_required('pipeseed_mode')

      if pipeseed_mode not in ('demultiplexing','alignment'):
        raise ValueError('Pipeseed_mode {0} not supported'.format(pipeseed_mode))

      pipeseeds_data,seed_data=get_pipeline_seeds(\
                                 pipeseed_mode=pipeseed_mode,
                                 pipeline_name=pipeline_name,
                                 igf_session_class=igf_session_class)           # fetch pipeseed data from db
      if len(seed_data.index)>0:
        seed_data=seed_data.\
                  to_dict(orient='records')                                     # convert dataframe to list of dictionaries
        self.param('sub_tasks',seed_data)                                       # set sub_tasks param for the data flow
        # NOTE(review): a dict passed to map turns every status other than
        # seeded_label into NaN - presumably only seeded entries are fetched
        pipeseeds_data[seed_status_label]=pipeseeds_data[seed_status_label].\
                                          map({seeded_label:running_label})     # update seed records in pipeseed table, changed status to RUNNING
        pa = PipelineAdaptor(**{'session_class':igf_session_class})             # get db adaptor
        pa.start_session()                                                      # connect to db
        dbconnected=True
        pa.update_pipeline_seed(data=pipeseeds_data.to_dict(orient='records'),
                                autosave=False)                                 # set pipeline seeds as running
        pa.commit_session()                                                     # save changes to db
        pa.close_session()                                                      # close db connection
        dbconnected=False
        message='Total {0} new job found for {1}, pipeline: {2}'.\
                format(len(seed_data),self.__class__.__name__,pipeline_name)    # format msg for slack
        self.post_message_to_slack(message,reaction='pass')                     # send update to slack
      else:
        message='{0}, {1}: no new job created'.format(self.__class__.__name__,\
                                                      pipeline_name)            # format msg for the no-seed case
        self.warning(message)
        self.post_message_to_slack(message,reaction='sleep')                    # tell slack that nothing was seeded

    except Exception as e:
      message='Error in {0},{1}: {2}'.format(self.__class__.__name__,\
                                             pipeline_name, e)                  # format slack msg
      self.warning(message)
      self.post_message_to_slack(message,reaction='fail')                       # send msg to slack
      if dbconnected:
        pa.rollback_session()                                                   # remove changes from db
        pa.close_session()
      raise                                                                     # mark worker as failed
def get_pipeline_seeds(pipeseed_mode,
                       pipeline_name,
                       igf_session_class,
                       seed_id_label='seed_id',
                       seqrun_date_label='seqrun_date',
                       seqrun_id_label='seqrun_id',
                       experiment_id_label='experiment_id',
                       seqrun_igf_id_label='seqrun_igf_id'):
    '''
    A utils function for fetching pipeline seed information

    :param pipeseed_mode: A string info about pipeseed mode, allowed values are
                            demultiplexing
                            alignment

    :param pipeline_name: A string info about pipeline name
    :param igf_session_class: A database session class for pipeline seed lookup
    :param seed_id_label: Seed id column of the pipeseed data, default seed_id
    :param seqrun_date_label: Column added for the seqrun date in demultiplexing mode, default seqrun_date
    :param seqrun_id_label: Seqrun id column used for the join in demultiplexing mode, default seqrun_id
    :param experiment_id_label: Experiment id column used for the join in alignment mode, default experiment_id
    :param seqrun_igf_id_label: Seqrun igf id column used for the date lookup, default seqrun_igf_id
    :returns: Two Pandas dataframes, first with pipeseed entries and second with seed info
              (all values converted to string, empty if there is nothing to join)
    :raises ValueError: If pipeseed_mode is not supported
    :raises AttributeError: If the database lookup does not return two dataframes
    '''
    seed_tables = {'demultiplexing': 'seqrun',
                   'alignment': 'experiment'}
    if pipeseed_mode not in seed_tables:
        raise ValueError(
            'Pipeseed_mode {0} not supported'.format(pipeseed_mode))

    table_name = seed_tables[pipeseed_mode]
    pa = PipelineAdaptor(**{'session_class': igf_session_class})  # get db adaptor
    pa.start_session()  # connect to db
    try:
        pipeseeds_data, table_data = \
            pa.fetch_pipeline_seed_with_table_data(pipeline_name,
                                                   table_name=table_name)
    finally:
        pa.close_session()  # the session was previously never closed

    if not isinstance(pipeseeds_data, pd.DataFrame) or \
       not isinstance(table_data, pd.DataFrame):
        # format the message itself; calling .format on the exception
        # object failed before the intended message could be raised
        raise AttributeError(
            'Expecting a pandas dataframe of pipeseed data and received {0}, {1}'.
            format(type(pipeseeds_data), type(table_data)))

    seed_data = pd.DataFrame()
    if len(pipeseeds_data.index) > 0 and \
       len(table_data.index) > 0:
        if pipeseed_mode == 'demultiplexing':
            table_id_label = seqrun_id_label
        else:
            table_id_label = experiment_id_label

        # both join keys are cast to int so that the merge compares like with like
        pipeseeds_data[seed_id_label] = pipeseeds_data[seed_id_label].map(int)
        table_data[table_id_label] = table_data[table_id_label].map(int)
        merged_data = pd.merge(pipeseeds_data,
                               table_data,
                               how='inner',
                               left_on=[seed_id_label],
                               right_on=[table_id_label])  # join dataframes
        if pipeseed_mode == 'demultiplexing':
            merged_data[seqrun_date_label] = \
                merged_data[seqrun_igf_id_label].\
                map(lambda x: _get_date_from_seqrun(seqrun_igf_id=x))  # get seqrun date from seqrun id

        # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1;
        # use whichever element-wise method this pandas version provides
        if hasattr(pd.DataFrame, 'map'):
            seed_data = merged_data.map(str)
        else:
            seed_data = merged_data.applymap(str)
    return pipeseeds_data, seed_data