def create_or_update_analysis_collection(self,
                                             file_path,
                                             dbsession,
                                             withdraw_existing_collection=True,
                                             autosave_db=True,
                                             force=True,
                                             remove_file=False):
        '''
        A method for creating or updating an analysis file collection in db.
        Required elements will be collected from the database if the base_path
        element is given.

        :param file_path: File path to load as a db collection
        :param dbsession: An active database session
        :param withdraw_existing_collection: Remove the existing collection group, default True
        :param autosave_db: Save changes to the database, default True
        :param force: A toggle for removing the existing file collection, default True
        :param remove_file: A toggle for removing the existing file from disk, default False
        '''
        try:
            ca = CollectionAdaptor(**{'session': dbsession})

            collection_exists = \
              ca.get_collection_files(
                collection_name=self.collection_name,
                collection_type=self.collection_type)
            if len(collection_exists.index) > 0 and \
                withdraw_existing_collection:
                remove_data = [{
                    'name': self.collection_name,
                    'type': self.collection_type
                }]
                ca.remove_collection_group_info(
                    data=remove_data, autosave=autosave_db
                )  # removing all existing collection groups for the collection name and type

            fa = FileAdaptor(**{'session': dbsession})
            file_exists = fa.check_file_records_file_path(
                file_path=file_path)  # check if file already present in db
            if file_exists and force:
                fa.remove_file_data_for_file_path(
                    file_path=file_path,
                    remove_file=remove_file,
                    autosave=autosave_db
                )  # remove entry from file table and disk

            collection_data = [{
                'name': self.collection_name,
                'type': self.collection_type,
                'table': self.collection_table,
                'file_path': file_path
            }]
            ca.load_file_and_create_collection(
                data=collection_data,
                calculate_file_size_and_md5=True,
                autosave=autosave_db
            )  # load file, collection and create collection group
        except Exception:
            raise
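A minimal usage sketch for the method above (assumed; not part of the original
source): `analysis` stands for an instance of the class defining this method,
with collection_name, collection_type and collection_table already set, and
`session` is an active database session; all values are illustrative.

def _example_register_analysis_file(analysis, session):
    analysis.create_or_update_analysis_collection(
        file_path='/path/to/analysis_report.html',                              # placeholder file to register
        dbsession=session,
        withdraw_existing_collection=True,                                      # drop old collection groups first
        autosave_db=True,                                                       # commit as part of the call
        force=True,                                                             # replace any existing file record
        remove_file=False)                                                      # keep the replaced file on disk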
Example #2
 def setUp(self):
   self.dbconfig = 'data/dbconfig.json'
   dbparam=read_dbconf_json(self.dbconfig)
   base = BaseAdaptor(**dbparam)
   self.engine = base.engine
   self.dbname=dbparam['dbname']
   Base.metadata.drop_all(self.engine)
   if os.path.exists(self.dbname):
     os.remove(self.dbname)
   Base.metadata.create_all(self.engine)
   self.session_class=base.get_session_class()
   base.start_session()
   project_data=[{'project_igf_id':'ProjectA'}]
   pa=ProjectAdaptor(**{'session':base.session})
   pa.store_project_and_attribute_data(data=project_data)                      # load project data
   sample_data=[{'sample_igf_id':'SampleA',
                 'project_igf_id':'ProjectA'}]                                 # sample data
   sa=SampleAdaptor(**{'session':base.session})
   sa.store_sample_and_attribute_data(data=sample_data)                        # store sample data
   experiment_data=[{'experiment_igf_id':'ExperimentA',
                     'sample_igf_id':'SampleA',
                     'library_name':'SampleA',
                     'platform_name':'MISEQ',
                     'project_igf_id':'ProjectA'}]                             # experiment data
   ea=ExperimentAdaptor(**{'session':base.session})
   ea.store_project_and_attribute_data(data=experiment_data)
   self.temp_dir=get_temp_dir()
   temp_files=['a.csv','b.csv']
   for temp_file in temp_files:
     with open(os.path.join(self.temp_dir,temp_file),'w') as fp:
       fp.write('A')
   collection_data=[{'name':'ExperimentA',
                     'type':'AnalysisA_html',
                     'table':'experiment',
                     'file_path':os.path.join(self.temp_dir,temp_file)}
                     for temp_file in temp_files]
   ca=CollectionAdaptor(**{'session':base.session})
   ca.load_file_and_create_collection(data=collection_data,
                                      calculate_file_size_and_md5=False)
   base.close_session()
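A matching tearDown sketch for the fixture above (assumed; not part of the
original snippet), mirroring the cleanup that setUp performs before creating
the schema:

 def tearDown(self):
   Base.metadata.drop_all(self.engine)                                          # drop tables created in setUp
   if os.path.exists(self.dbname):
     os.remove(self.dbname)                                                     # delete the sqlite db file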
Example #3
 def setUp(self):
     self.dbconfig = 'data/dbconfig.json'
     dbparam = read_dbconf_json(self.dbconfig)
     base = BaseAdaptor(**dbparam)
     self.engine = base.engine
     self.dbname = dbparam['dbname']
     Base.metadata.create_all(self.engine)
     self.session_class = base.get_session_class()
     self.json_file_path = 'data/reset_samplesheet_md5/seqrun1_file_md5.json'
     json_data = pd.DataFrame([{
         'file_md5': '1e7531158974b5a5b7cbb7dde09ac779',
         'seqrun_file_name': 'SampleSheet.csv'
     }, {
         'file_md5': '2b22f945bc9e7e390af5432425783a03',
         'seqrun_file_name': 'RTAConfiguration.xml'
     }])
     with open(self.json_file_path, 'w') as jp:
          json.dump(json_data.to_dict(orient='records'), jp, indent=4)
     self.initial_json_md5 = calculate_file_checksum(
         filepath=self.json_file_path)
     self.correct_samplesheet_md5 = '259ed03f2e8c45980de121f7c3a70565'
     self.json_collection_name = 'seqrun1'
     self.json_collection_type = 'ILLUMINA_BCL_MD5'
     self.seqrun_path = 'data/reset_samplesheet_md5'
     self.seqrun_input_list = 'data/reset_samplesheet_md5/seqrun_input_list.txt'
     ca = CollectionAdaptor(**{'session_class': self.session_class})
     ca.start_session()
     data = pd.DataFrame([{
         'name': self.json_collection_name,
         'type': self.json_collection_type,
         'table': 'seqrun',
         'file_path': self.json_file_path,
     }])
     ca.load_file_and_create_collection(data, autosave=True, hasher='md5')
     ca.close_session()
     with open(self.seqrun_input_list, 'w') as fp:
         fp.write(self.json_collection_name)
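A similar tearDown sketch for this fixture (assumed; not in the original),
dropping the tables and removing the files written by setUp:

 def tearDown(self):
      Base.metadata.drop_all(self.engine)                                       # drop tables created in setUp
      for path in (self.json_file_path, self.seqrun_input_list):
          if os.path.exists(path):
              os.remove(path)                                                   # remove files written by setUp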
Example #4
  def run(self):
    try:
      fastq_file = self.param_required('fastq_file')
      fastq_dir = self.param_required('fastq_dir')
      igf_session_class = self.param_required('igf_session_class')
      fastqc_exe = self.param_required('fastqc_exe')
      tag = self.param_required('tag')
      seqrun_igf_id = self.param_required('seqrun_igf_id')
      seqrun_date = self.param_required('seqrun_date')
      flowcell_id = self.param_required('flowcell_id')
      fastqc_options = self.param('fastqc_options')
      base_results_dir = self.param_required('base_results_dir')
      project_name = self.param_required('project_name')
      force_overwrite = self.param('force_overwrite')
      fastqc_dir_label = self.param('fastqc_dir_label')
      required_collection_table = self.param('required_collection_table')
      sample_name = self.param('sample_name')
      hpc_location = self.param('hpc_location')
      fastqc_collection_type = self.param('fastqc_collection_type')
      use_ephemeral_space = self.param('use_ephemeral_space')
      store_file = self.param('store_file')

      lane_index_info = os.path.basename(fastq_dir)                             # get the lane and index length info
      fastq_file_label = os.path.basename(fastq_file).replace('.fastq.gz','')
      collection_name = None
      collection_table = None
      if tag=='known' and store_file:                                           # fetch sample name for known fastq, if its not defined
        base = BaseAdaptor(**{'session_class':igf_session_class})
        base.start_session()                                                    # connect to db

        ca = CollectionAdaptor(**{'session':base.session})
        (collection_name,collection_table) = \
          ca.fetch_collection_name_and_table_from_file_path(\
            file_path=fastq_file)                                               # fetch collection name and table info

        if collection_table != required_collection_table:
          raise ValueError(
            'Expected collection table {0} and got {1}, {2}'.\
              format(
                required_collection_table,
                collection_table,
                fastq_file))

        ra = RunAdaptor(**{'session':base.session})
        sample = ra.fetch_sample_info_for_run(run_igf_id=collection_name)
        sample_name = sample['sample_igf_id']
        base.close_session()

      fastqc_result_dir = \
        os.path.join(\
          base_results_dir,
          project_name,
          seqrun_date,
          flowcell_id,
          lane_index_info,
          tag)                                                                  # result dir path is generic
      if sample_name is not None:
        fastqc_result_dir = \
          os.path.join(\
            fastqc_result_dir,
            sample_name)                                                        # add sample name to dir path if its available

      fastqc_result_dir = \
        os.path.join(\
          fastqc_result_dir,
          fastq_file_label,
          fastqc_dir_label)                                                     # keep multiple files under same dir

      if os.path.exists(fastqc_result_dir) and force_overwrite:
        remove_dir(fastqc_result_dir)                                           # remove existing output dir if force_overwrite is true

      if not os.path.exists(fastqc_result_dir):
        os.makedirs(fastqc_result_dir,mode=0o775)                               # create output dir if its not present

      temp_work_dir = \
        get_temp_dir(use_ephemeral_space=use_ephemeral_space)                   # get a temp work dir
      if not os.path.exists(fastq_file):
        raise IOError('fastq file {0} not found'.format(fastq_file))            # raise if fastq file path is missing

      fastqc_output = \
        os.path.join(\
          temp_work_dir,
          fastq_file_label)
      os.mkdir(fastqc_output)                                                   # create fastqc output dir
      fastqc_param = \
        self.format_tool_options(fastqc_options)                                # format fastqc params
      fastqc_cmd = \
        [fastqc_exe, '-o',fastqc_output, '-d',temp_work_dir ]                   # fastqc base parameters
      fastqc_cmd.extend(fastqc_param)                                           # add additional parameters
      fastqc_cmd.append(fastq_file)                                             # fastqc input file
      subprocess.check_call(' '.join(fastqc_cmd),shell=True)                    # run fastqc

      fastqc_zip = None
      fastqc_html = None
      for root, _, files in os.walk(top=fastqc_output):
        for file in files:
          if fnmatch.fnmatch(file, '*.zip'):
            input_fastqc_zip = os.path.join(root,file)
            copy2(input_fastqc_zip,fastqc_result_dir)
            fastqc_zip = os.path.join(fastqc_result_dir,file)

          if fnmatch.fnmatch(file, '*.html'):
            input_fastqc_html = os.path.join(root,file)
            copy2(input_fastqc_html,fastqc_result_dir)
            fastqc_html = os.path.join(fastqc_result_dir,file)

      if fastqc_html is None or fastqc_zip is None:
        raise ValueError('Missing required values, fastqc zip: {0}, fastqc html: {1}'.\
                         format(fastqc_zip,fastqc_html))

      if tag=='known' and store_file:
        if collection_name is None:
          raise ValueError('couldn\'t retrieve collection name for {0}'.\
                           format(fastq_file))

        fastqc_files = \
          [{'name':collection_name,
            'type':fastqc_collection_type,
            'table':required_collection_table,
            'file_path':fastqc_zip,
            'location':hpc_location},
           {'name':collection_name,
            'type':fastqc_collection_type,
            'table':required_collection_table,
            'file_path':fastqc_html,
            'location':hpc_location},
          ]
        ca = CollectionAdaptor(**{'session_class':igf_session_class})
        ca.start_session()
        ca.load_file_and_create_collection(data=fastqc_files)                 # store fastqc files to db
        ca.close_session()

      self.param('dataflow_params',
                 {'fastqc_html':fastqc_html,
                  'lane_index_info':lane_index_info,
                  'sample_name':sample_name,
                  'fastqc':{'fastq_dir':fastq_dir,
                            'fastqc_zip':fastqc_zip,
                            'fastqc_html':fastqc_html}})                        # set dataflow params
    except Exception as e:
      message = \
        'seqrun: {2}, Error in {0}: {1}'.\
        format(\
          self.__class__.__name__,
          e,
          seqrun_igf_id)
      self.warning(message)
      self.post_message_to_slack(message,reaction='fail')                       # post msg to slack for failed jobs
      raise
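For reference, a sketch of the FastQC command assembled above, using
placeholder paths (the executable path, options list, and fastq file name are
illustrative; '-o' and '-d' are FastQC's output and temp directory flags):

fastqc_exe = '/usr/local/bin/fastqc'                                            # placeholder executable path
fastqc_param = ['--threads', '1']                                               # assumed output of format_tool_options
temp_work_dir = '/tmp/work'                                                     # scratch dir for FastQC
fastqc_output = '/tmp/work/SampleA_R1'                                          # per-file output dir
fastq_file = '/path/SampleA_R1.fastq.gz'                                        # input read file

fastqc_cmd = [fastqc_exe, '-o', fastqc_output, '-d', temp_work_dir]             # base parameters, as in run() above
fastqc_cmd.extend(fastqc_param)                                                 # additional parameters
fastqc_cmd.append(fastq_file)                                                   # input file
print(' '.join(fastqc_cmd))
# /usr/local/bin/fastqc -o /tmp/work/SampleA_R1 -d /tmp/work --threads 1 /path/SampleA_R1.fastq.gz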
Example #5
import os, argparse                                                             # imports assumed from the igf_data package; not shown in the original snippet
from igf_data.utils.dbutils import read_dbconf_json, read_json_data
from igf_data.igfdb.collectionadaptor import CollectionAdaptor

parser = argparse.ArgumentParser()                                              # assumed parser setup; the snippet referenced `args` without defining it
parser.add_argument('--dbconfig_path', required=True,
                    help='Database configuration json file')
parser.add_argument('--collection_file_data', required=True,
                    help='Collection data json file')
parser.add_argument('--calculate_checksum', default=False, action='store_true',
                    help='Calculate file size and md5 for the collection files')
args = parser.parse_args()

dbconfig_path = args.dbconfig_path
collection_file_data = args.collection_file_data
calculate_checksum = args.calculate_checksum

if __name__ == '__main__':
    try:
        dbconnected = False
        if not os.path.exists(dbconfig_path):
            raise IOError('Dbconfig file {0} not found'.format(dbconfig_path))

        if not os.path.exists(collection_file_data):
            raise IOError('Collection data json file {0} not found'.format(
                collection_file_data))

        dbparam = read_dbconf_json(dbconfig_path)  # read db config
        collection_data = read_json_data(
            collection_file_data)  # read collection data json
        ca = CollectionAdaptor(**dbparam)
        ca.start_session()  # connect to database
        dbconnected = True
        ca.load_file_and_create_collection(
            data=collection_data,
            calculate_file_size_and_md5=calculate_checksum,
            autosave=True)  # load data and commit changes
        ca.close_session()
        dbconnected = False
    except Exception as e:
        if dbconnected:
            ca.rollback_session()
            ca.close_session()
        raise ValueError('Error: {0}'.format(e))
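For reference, a minimal sketch of the collection data json this script
expects; the fields mirror the name/type/table/file_path entries passed to
load_file_and_create_collection in the other examples (all values are
placeholders):

collection_data = [{
    'name': 'ExperimentA',                                                      # collection name
    'type': 'AnalysisA_html',                                                   # collection type
    'table': 'experiment',                                                      # table the collection links to
    'file_path': '/path/to/report.html'                                         # file to register
}]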
Example #6
  def run(self):
    try:
      project_igf_id = self.param_required('project_igf_id')
      sample_igf_id = self.param_required('sample_igf_id')
      file_list = self.param_required('file_list')
      remote_user = self.param_required('remote_user')
      remote_host = self.param_required('remote_host')
      remote_project_path = self.param_required('remote_project_path')
      dir_labels = self.param_required('dir_labels')
      igf_session_class = self.param_required('igf_session_class')
      force_overwrite = self.param('force_overwrite')
      collect_remote_file = self.param('collect_remote_file')
      collection_name = self.param('collection_name')
      collection_type = self.param('collection_type')
      collection_table = self.param('collection_table')
      file_location = self.param('file_location')
      use_ephemeral_space = self.param('use_ephemeral_space')
      destination_output_path = \
        os.path.join(
          remote_project_path,
          project_igf_id)                                                       # get base destination path
      if isinstance(dir_labels, list) and \
         len(dir_labels) > 0:
        destination_output_path=\
          os.path.join(destination_output_path,
                       *dir_labels)

      if collect_remote_file:
        if collection_name is None or \
           collection_type is None:
          raise ValueError('Name and type are required for db collection')

      output_file_list = list()
      temp_work_dir = \
        get_temp_dir(use_ephemeral_space=use_ephemeral_space)                   # get temp dir
      for file in file_list:
        if not os.path.exists(file):
          raise IOError('file {0} not found'.\
                        format(file))

        if os.path.isfile(file):
          copy2(
            file,
            os.path.join(
              temp_work_dir,
              os.path.basename(file)))                                          # copy file to a temp dir
          dest_file_path = \
            os.path.join(
              destination_output_path,
              os.path.basename(file))                                           # get destination file path
          os.chmod(
            os.path.join(
              temp_work_dir,
              os.path.basename(file)),
            mode=0o764)                                                         # set file permission
        elif os.path.isdir(file):
          copytree(\
            file,
            os.path.join(
              temp_work_dir,
              os.path.basename(file)))                                          # copy dir to a temp dir
          dest_file_path=destination_output_path
          for root,dirs,files in os.walk(temp_work_dir):
            for dir_name in dirs:
              os.chmod(
                os.path.join(root,dir_name),
                mode=0o775)
            for file_name in files:
              os.chmod(
                os.path.join(root,file_name),
                mode=0o764)                                                     # changing file and dir permissions for remote files
        else:
          raise ValueError('Unknown source file type: {0}'.\
                           format(file))

        copy_remote_file(\
          source_path=os.path.join(temp_work_dir,
                                   os.path.basename(file)),
          destinationa_path=dest_file_path,
          destination_address='{0}@{1}'.format(remote_user,remote_host),
          force_update=force_overwrite
        )                                                                       # copy file to remote
        if os.path.isdir(file):
          dest_file_path=\
            os.path.join(\
              dest_file_path,
              os.path.basename(file))                                           # fix for dir input

        output_file_list.append(dest_file_path)

      remove_dir(dir_path=temp_work_dir)                                        # remove temp dir
      self.param('dataflow_params',
                 {'status': 'done',
                  'output_list':output_file_list})                              # add dataflow params
      if collect_remote_file:
        data=list()
        remove_data_list=[{'name':collection_name,
                           'type':collection_type}]
        for file in output_file_list:
          data.append(
            {'name':collection_name,
             'type':collection_type,
             'table':collection_table,
             'file_path':file,
             'location':file_location
            }
          )

        ca = CollectionAdaptor(**{'session_class':igf_session_class})
        ca.start_session()
        try:
          ca.remove_collection_group_info(
            data=remove_data_list,
            autosave=False)                                                     # remove existing data before loading new collection
          ca.load_file_and_create_collection(
            data=data,
            autosave=False,
            calculate_file_size_and_md5=False)                                  # load remote files to db
          ca.commit_session()                                                   # commit changes
          ca.close_session()
        except:
          ca.rollback_session()                                                 # rollback changes
          ca.close_session()
          raise

    except Exception as e:
      message = \
        'project: {2}, sample:{3}, Error in {0}: {1}'.\
        format(
          self.__class__.__name__,
          e,
          project_igf_id,
          sample_igf_id)
      self.warning(message)
      self.post_message_to_slack(message,reaction='fail')                       # post msg to slack for failed jobs
      raise