class RunTasks(luigi.Task):
    """ Main entrypoint for this module. """
    start_date = luigi.DateSecondParameter(default=(dt.datetime.today() - dt.timedelta(days=1)))
    load_date = luigi.DateSecondParameter(default=dt.datetime.today())
    pl = Pipeline("Load2Sites", "RunTasks")

    def output(self):
        return PipelineTasks(self.pl.etl_name, self.pl.task_name,
                             self.load_date.date(), self.start_date.date())

    def run(self):
        try:
            yield [
                TransformLoadSites(start_date=self.start_date,
                                   load_date=self.load_date)
            ]
        except Exception as e:
            sys.exit(e)

        self.pl.log_insert(self.load_date, self.start_date)
class EvaluateLogisticRegression(luigi.Task):
    date = luigi.DateSecondParameter(default=datetime.now())
    task_complete = False

    def requires(self):
        return [SplitTrainTest(), TrainLogisticRegression(self.date)]

    def run(self):
        df_test = s3.read_parquet(self.input()[0][1].path)
        y_test = df_test.loc[:, "ttj_sub_12"]
        X_test = df_test.drop(["ttj", "ttj_sub_12"], axis="columns")

        lg = s3.read_pickle(self.input()[1].path)
        metrics = evaluate(lg, X_test, y_test)

        model_info_to_db(
            engine=get_db_engine(),
            model=lg,
            metrics=metrics,
            features=X_test.columns.tolist(),
            date=self.date,
            model_path=self.input()[1].path,
            train_data_path=self.input()[0][2].path,
            test_data_path=self.input()[0][3].path,
        )

        # NOTE: Set the task as completed manually. Using the built-in
        # luigi.contrib.postgres.CopyToTable task would be the right approach.
        self.task_complete = True

    def complete(self):
        return self.task_complete
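# A minimal, self-contained sketch of the manual completion-flag pattern used
# above (the class name and the elided work step are illustrative, not from the
# original code). Note that the flag lives in memory only, so the task looks
# incomplete again in a fresh scheduler process; a persistent marker (target
# file or marker table) avoids that.
import luigi


class FlagCompletedTask(luigi.Task):
    task_complete = False  # in-memory completion marker

    def run(self):
        # ... do the actual work here ...
        self.task_complete = True  # flip the flag once run() finishes

    def complete(self):
        # Luigi calls this to decide whether run() still needs to happen.
        return self.task_complete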
class GetUserFromOmie(luigi.Task):
    date = luigi.DateSecondParameter()

    def run(self):
        key = OmieAPI().key
        secret = OmieAPI().secret

        headers = {"Content-type": "application/json"}
        data = {
            "call": "ListarUsuarios",
            "app_key": key,
            "app_secret": secret,
            "param": [{
                "pagina": 1,
                "registros_por_pagina": 100
            }]
        }

        response = requests.post(
            'https://app.omie.com.br/api/v1/crm/usuarios/',
            headers=headers,
            json=data)

        with self.output().open('w') as outfile:
            json.dump(response.json(), outfile, indent=4, ensure_ascii=False)

    def output(self):
        path = f"data/users_{str(self.date)}.json"
        return luigi.LocalTarget(path)
class EvaluateRecommendationsRF(luigi.Task):
    date = luigi.DateSecondParameter(default=datetime.now())
    task_complete = False

    def requires(self):
        return [TrainRandomForest(self.date), EvaluateRandomForest(self.date)]

    def run(self):
        params = yaml.load(open("./conf/base/parameters.yml"),
                           Loader=yaml.FullLoader)["evaluation_params"]
        model = s3.read_pickle(self.input()[0].path)
        model_id, test_path, train_path = get_model_info_by_path(
            self.input()[0].path)

        df_train = s3.read_parquet(train_path)
        df_test = s3.read_parquet(test_path)

        rec_error = get_aggregate_recommendation_error(
            df_train,
            df_test,
            model,
            params["set_size"],
            params["num_recs"],
            params["percent_sample"],
        )
        write_recommendation_eval(get_db_engine(), rec_error, model_id, params)

        self.task_complete = True

    def complete(self):
        return self.task_complete
class TimestampPartitionMixin(object):
    """
    This mixin provides its task with a formatted date partition value property.

    The partition value is the `date` parameter, formatted by the `partition_format` parameter.

    It can be used by HivePartitionTasks and tasks which invoke downstream HivePartitionTasks.
    """
    date = luigi.DateSecondParameter(
        default=datetime.datetime.utcnow(),
        description='Date/time for the data partition. Default is UTC now. '
        'Note that though this is a DateParameter, it also supports datetime objects, and so can '
        'be used to create time-based data partitions.',
    )
    partition_format = luigi.Parameter(
        config_path={
            'section': 'course-list',
            'name': 'partition_format'
        },
        default='%Y-%m-%d',
        description='Format string for the course list table partition\'s `date` parameter. '
        'Must result in a filename-safe string, or your partitions will fail to be created.\n'
        'The default value of "%Y-%m-%d" changes daily, and so causes a new course partition to be '
        'created once a day. For example, use "%Y-%m-%dT%H" to update hourly, though beware of load on '
        'the edX REST API. See strftime for options.',
    )

    @property
    def partition_value(self):
        """Partition based on the task's date and partition format strings."""
        return unicode(self.date.strftime(self.partition_format))
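# A quick standalone illustration of what `partition_value` evaluates to for a
# given `date` and `partition_format` (not part of the edX pipeline code; on
# Python 3 the `unicode()` call in the mixin would simply be `str()`).
import datetime

date = datetime.datetime(2013, 2, 1, 18, 42, 27)
print(date.strftime('%Y-%m-%d'))     # daily partition  -> '2013-02-01'
print(date.strftime('%Y-%m-%dT%H'))  # hourly partition -> '2013-02-01T18'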
class Globals(luigi.Config):
    '''
    Global variables. Set using luigi configuration file.
    '''
    # Path to installation of exempliphi
    PIPELINE_ROOT = luigi.Parameter(
        default=os.path.split(os.path.dirname(os.path.realpath(__file__)))[0])
    # Directory to write output to
    OUTPUT_DIR = luigi.Parameter(default=os.path.join(
        os.path.split(os.path.dirname(os.path.realpath(__file__)))[0], 'output'))
    # Number of cores available in computing environment
    NUM_THREADS = luigi.IntParameter(default=1)
    # Conda environment holding dependencies
    PRIMARY_CONDA_ENV = luigi.Parameter(default='exempliphi')
    # Path to conda executable. Default expects it to be in PATH.
    CONDA_EXE = luigi.Parameter(default='conda')
    # Lower boundary of insert size
    INSERT_SIZE_LB = luigi.IntParameter(default=250)  # Defaults were recommendations from Dr. Ken Frey
    # Upper boundary of insert size
    INSERT_SIZE_UB = luigi.IntParameter(default=500)
    # NCBI nt blast database
    NT = luigi.Parameter()
    # Number of workers to use when running pipeline
    NUM_WORKERS = luigi.IntParameter(default=10)
    # What cluster scheduling software do you use? SGE, SLURM, or False for single node
    CLUSTER = luigi.Parameter(default=False)
    # Date of run
    RUN_DATE = luigi.DateSecondParameter(default=datetime.now())
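# A minimal sketch of how a luigi.Config subclass like Globals is typically
# consumed: instantiating it resolves each parameter from the [Globals]
# section of luigi.cfg, falling back to the defaults above. NT has no default,
# so it must be supplied via configuration or the command line; the values
# shown in the comments are illustrative only.
g = Globals()
print(g.NUM_THREADS)        # 1 unless overridden, e.g. NUM_THREADS=16 under [Globals]
print(g.PRIMARY_CONDA_ENV)  # 'exempliphi'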
class All(luigi.WrapperTask):
    timestamp = luigi.DateSecondParameter(default=datetime.datetime.now())

    def run(self):
        print("Running All")

    def requires(self):
        return TransformedDataset()
class Send_Results_To_MAS(PipelineTask):
    annotation_accession = luigi.Parameter(default='')
    database = luigi.Parameter()
    tool = luigi.Parameter()
    run_time = luigi.DateSecondParameter(default=datetime.now())
    run_locally = True

    def requires(self):
        if self.tool == 'blastp':
            return Blastp(
                annotation_accession=self.annotation_accession,
                database=self.database,
                run_time=self.run_time,
                mas_server=self.mas_server
            )
        elif self.tool == 'hhsearch':
            return HHsearch(
                annotation_accession=self.annotation_accession,
                database=self.database,
                run_time=self.run_time,
                mas_server=self.mas_server
            )
        elif self.tool == 'rpsblast':
            return RPSBlast(
                annotation_accession=self.annotation_accession,
                database=self.database,
                run_time=self.run_time,
                mas_server=self.mas_server
            )

    def out_file_path(self, temp=False):
        return {}

    def out_dir(self, temp=False):
        folder = '{}-temp'.format(self.database) if temp else self.database
        return os.path.join(self.pipeline_out_dir(), self.task_family, self.tool, folder)

    def do_task(self):
        r = requests.post(
            self.mas_server + reverse('upload_results'),
            auth=(self.g.MAS_USERNAME, self.g.MAS_PASSWORD),
            files=[('result', open(self.input()['results'].path))],
            data={
                'tool': self.tool,
                'accession': self.annotation_accession,
                'database': self.database,
                'status': 0
            },
            verify=self.g.MAS_CRT
        )

        if r.status_code != 200:
            raise requests.ConnectionError(
                'Request to post results to MAS server failed (status code = %i):\n%s' % (r.status_code, r.text)
            )
class HHblits(PipelineTask):
    '''
    Run HHblits on each protein
    '''
    annotation_accession = luigi.Parameter(default='')
    protein_id = luigi.Parameter(default='')
    iterations = luigi.IntParameter(default=3)
    database = luigi.Parameter()
    run_time = luigi.DateSecondParameter()
    tool = 'hhsearch'

    # At this point this is the only database option. The database parameter is for HHsearch!
    uniclust = luigi.Parameter()
    uniclust_cpu = luigi.Parameter()

    def __init__(self, *args, **kwargs):
        super(HHblits, self).__init__(*args, **kwargs)
        if self.uniclust_cpu:
            self.n_cpu = self.uniclust_cpu

    def out_dir(self, temp=False):
        '''
        Returns the directory which will hold the files output by this task.
        '''
        base_name = '{}_{}'.format(self.task_family, self.annotation_accession)
        folder = '{}-temp'.format(base_name) if temp else base_name
        return os.path.join(self.pipeline_out_dir(), self.task_family, folder)

    def requires(self):
        return Pull_Protein(
            annotation_accession=self.annotation_accession,
            run_time=self.run_time,
            tool=self.tool,
            database=self.database,
            mas_server=self.mas_server
        )

    def out_file_path(self, temp=False):
        name = self.annotation_accession
        return {
            'alignment': os.path.join(self.out_dir(temp), '{}.a3m'.format(name))
        }

    def do_task(self):
        db_path = self.uniclust
        in_file = self.input()['fasta'].path

        self._run_command([
            'hhblits',
            '-i', in_file,
            '-oa3m', self.out_file_path(True)['alignment'],
            '-n', str(self.iterations),
            '-cpu', str(self.n_cpu),
            '-d', db_path
        ])
class RPSBlast(PipelineTask):
    '''
    Generate rpsblast (CD-Search) results for a given proteome
    '''
    annotation_accession = luigi.Parameter(default='')
    e_value = luigi.FloatParameter(default=0.0001)
    database = luigi.Parameter()
    run_time = luigi.DateSecondParameter()
    tool = 'rpsblast'

    # Available Database Choices
    cdd = luigi.Parameter()
    cdd_cpu = luigi.Parameter()

    def __init__(self, *args, **kwargs):
        super(RPSBlast, self).__init__(*args, **kwargs)
        if self.database == 'cdd':
            self.n_cpu = self.cdd_cpu

    def requires(self):
        return Pull_Protein(
            annotation_accession=self.annotation_accession,
            run_time=self.run_time,
            tool=self.tool,
            database=self.database,
            mas_server=self.mas_server
        )

    def out_file_path(self, temp=False):
        name = self.annotation_accession
        return {
            'results': os.path.join(self.out_dir(temp), '{}_{}_rpsblast_results.xml'.format(name, self.database))
        }

    def out_dir(self, temp=False):
        folder = '{}-temp'.format(self.database) if temp else self.database
        return os.path.join(self.pipeline_out_dir(), self.task_family, folder)

    def do_task(self):
        if self.database == 'cdd':
            db_path = self.cdd
        else:
            raise ValueError('Invalid database ' + self.database)

        self._run_command([
            'rpsblast',
            '-query', self.input()['fasta'].path,
            '-db', db_path,
            '-evalue', str(self.e_value),  # stringify the float for the external command
            '-out', self.out_file_path(True)['results'],
            '-outfmt', '5',
            '-num_threads', str(self.n_cpu)
        ])
class HHsearch(PipelineTask):
    protein_id = luigi.Parameter(default='')
    annotation_accession = luigi.Parameter(default='')
    run_time = luigi.DateSecondParameter()
    tool = 'hhsearch'
    database = luigi.Parameter()

    # Available Database Choices
    pdb = luigi.Parameter()
    pdb_cpu = luigi.Parameter()

    def __init__(self, *args, **kwargs):
        super(HHsearch, self).__init__(*args, **kwargs)
        if self.pdb_cpu:
            self.n_cpu = self.pdb_cpu

    def out_dir(self, temp=False):
        '''
        Returns the directory which will hold the files output by this task.
        '''
        base_name = '{}_{}'.format(self.task_family, self.annotation_accession)
        folder = '{}-temp'.format(base_name) if temp else base_name
        return os.path.join(self.pipeline_out_dir(), self.task_family, folder)

    def requires(self):
        return HHblits(
            annotation_accession=self.annotation_accession,
            run_time=self.run_time,
            database=self.database,
            mas_server=self.mas_server
        )

    def out_file_path(self, temp=False):
        name = self.annotation_accession
        return {
            'results': os.path.join(self.out_dir(temp), '{}.hhr'.format(name))
        }

    def do_task(self):
        if self.database == 'pdb':
            db_path = self.pdb
        else:
            raise ValueError('Invalid database ' + self.database)

        self._run_command([
            'hhsearch',
            '-i', self.input()['alignment'].path,
            '-d', db_path,
            '-o', self.out_file_path(True)['results'],
            '-cpu', str(self.n_cpu)
        ])
class CdxIndexer(luigi.contrib.hadoop_jar.HadoopJarJobTask):
    input_file = luigi.Parameter()
    cdx_service = luigi.Parameter()
    # This is used to add a timestamp to the output file, so this task can always be re-run:
    timestamp = luigi.DateSecondParameter(default=datetime.datetime.now())
    meta_flag = ''

    task_namespace = "access.index"

    num_reducers = 5

    def requires(self):
        return CopyToHDFS(input_file=self.input_file, prefix="/9_processing/warcs2cdx/")

    def ssh(self):
        return {
            'host': 'mapred',
            'key_file': '~/.ssh/id_rsa',
            'username': '******'
        }

    def jar(self):
        # dir_path = os.path.dirname(os.path.realpath(__file__))
        # return os.path.join(dir_path, "../jars/warc-hadoop-recordreaders-3.0.0-SNAPSHOT-job.jar")
        # Note that when using ssh to submit jobs, this needs to be a JAR on the remote server:
        return "/home/access/github/ukwa-manage/tasks/jars/warc-hadoop-recordreaders-3.0.0-SNAPSHOT-job.jar"

    def main(self):
        return "uk.bl.wa.hadoop.mapreduce.cdx.ArchiveCDXGenerator"

    def args(self):
        return [
            "-Dmapred.compress.map.output=true",
            "-Dmapred.output.compress=true",
            "-Dmapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec",
            "-i", self.input(),
            "-o", self.output(),
            "-r", self.num_reducers,
            "-w",
            "-h",
            "-m", self.meta_flag,
            "-t", self.cdx_service,
            "-c", "CDX N b a m s k r M S V g"
        ]

    def output(self):
        timestamp = self.timestamp.isoformat()
        timestamp = timestamp.replace(':', '-')
        file_prefix = os.path.splitext(os.path.basename(self.input_file))[0]
        return state_file(self.timestamp, 'warcs2cdx',
                          '%s-submitted-%s.txt' % (file_prefix, timestamp),
                          on_hdfs=True)
class snowplow_enriched_upload_data(luigi.Task):
    dataset_date = luigi.DateParameter(default=date.today() - timedelta(days=1))
    # force_run = luigi.BoolParameter()
    _start = luigi.DateSecondParameter(default=datetime.utcnow())
    file_root = luigi.Parameter()
    credentials = GoogleCredentials.get_application_default()

    def run(self):
        client = gcs.GCSClient(oauth_credentials=self.credentials)
        client.put(self.input().path, self.output().path)

    def output(self):
        return gcs.GCSTarget("gs://snowplow_tracker/%s_%s.json.gz" %
                             (self.file_root, self.dataset_date.strftime("%Y%m%d")))
class TrainRandomForest(luigi.Task):
    date = luigi.DateSecondParameter(default=datetime.now())

    def requires(self):
        return SplitTrainTest(self.date)

    def output(self):
        return S3Target(
            s3.path(S3.MODELS +
                    "{date:%Y/%m/%d/random_forest_T%H%M%S.pkl}".format(date=self.date)),
            client=s3.create_client(),
        )

    def run(self):
        df_train = s3.read_parquet(self.input()[0].path)
        y_train = df_train.loc[:, "ttj_sub_12"]
        X_train = df_train.drop(["ttj", "ttj_sub_12"], axis="columns")

        grid = yaml.load(open("./conf/base/parameters.yml"),
                         Loader=yaml.FullLoader)["rf_small_grid"]
        model = self.train_rf_cv(X_train, y_train, scoring_metric="f1", grid=grid)

        s3.write_pickle(model, self.output().path)

    def train_rf_cv(self, X, y, scoring_metric, grid=dict()):
        """
        Runs grid search on a random forest classifier

        :param X: Feature matrix of training set
        :param y: Target vector of training set
        :param scoring_metric: Single metric from which we choose the best classifier
        :param grid: Cross validation grid
        :return: Best trained model after grid search
        """
        rf = RandomForestClassifier(random_state=0, n_jobs=-1, class_weight="balanced")
        rf_grid_search = GridSearchCV(rf, grid, scoring=scoring_metric, cv=5, refit=True)
        rf_grid_search.fit(X, y)

        return rf_grid_search.best_estimator_
class process_raw_snowplow_event_data(luigi.Task):
    dataset_date = luigi.DateParameter(default=date.today() - timedelta(days=1))
    # force_run = luigi.BoolParameter()
    _start = luigi.DateSecondParameter(default=datetime.utcnow())
    file_root = luigi.Parameter()

    def download_s3_file(self, s3_filename):
        local_filename = "/Users/samuel.peltz/etl/%s" % s3_filename
        s3_file_full_path = re.compile("snowplow-enrich-output/enriched/archive/run=" +
                                       self.dataset_date.strftime("%Y-%m-%d") +
                                       r"-\d{2}-\d{2}-\d{2}/*.")
        try:
            s3.download_file(Bucket=os.environ.get('SP_BUCKET'),
                             Key=s3_file_full_path,
                             Filename=local_filename)
        except Exception as e:
            logger.error("%s - Could not retrieve %s because: %s" %
                         ("download_s3_file()", s3_file_full_path, e))
            raise
        return local_filename

    def list_files(self, sp_bucket):
        files = []
        response = s3.list_objects_v2(Bucket=os.environ.get('SP_BUCKET'))
        while True:
            files.extend([o['Key'] for o in response['Contents']])
            if not response['IsTruncated']:
                break
            else:
                response = s3.list_objects_v2(
                    Bucket=os.environ.get('SP_BUCKET'),
                    ContinuationToken=response['NextContinuationToken'])

        pattern = re.compile(r"snowplow-enrich-output/enriched/archive/run=" +
                             self.dataset_date.strftime("%Y-%m-%d") +
                             r"-\d{2}-\d{2}-\d{2}/part-\d{5}\.*")
        for thisfile in files:
            if re.match(pattern, thisfile):
                s3.download_file(Bucket=os.environ.get('SP_BUCKET'), Key=thisfile)
        return files

    def output(self):
        return luigi.LocalTarget("/Users/samuel.peltz/etl/%s_%s.json.gz" %
                                 (self.file_root, self.dataset_date.strftime("%Y%m%d")))
class Run_Pipeline_For_Proteins(luigi.WrapperTask):
    input_list = luigi.ListParameter()
    run_time = luigi.DateSecondParameter(default=datetime.now())
    mas_server = luigi.Parameter()

    def requires(self):
        job_array = []
        for search_params in self.input_list:
            if set(search_params) != {'accession', 'tool', 'database'}:
                raise KeyError('Incorrect dict keys for pipeline parameters')

            job_array.append(
                Send_Results_To_MAS(
                    mas_server=self.mas_server,
                    annotation_accession=search_params['accession'],
                    database=search_params['database'],
                    tool=search_params['tool']
                )
            )

        return job_array
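# A hedged sketch of how this wrapper task expects input_list to be shaped:
# each entry must carry exactly the keys 'accession', 'tool', and 'database'.
# The server URL, accession, and database values below are purely illustrative.
import luigi

luigi.build(
    [Run_Pipeline_For_Proteins(
        mas_server='https://mas.example.org',
        input_list=[
            {'accession': 'EXAMPLE_0001', 'tool': 'blastp', 'database': 'swissprot'},
            {'accession': 'EXAMPLE_0001', 'tool': 'hhsearch', 'database': 'pdb'},
        ],
    )],
    local_scheduler=True,
)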
class Pull_Protein(PipelineTask):
    '''
    Pull a single protein through MAS's REST API
    '''
    annotation_accession = luigi.Parameter()
    run_locally = True
    database = luigi.Parameter()
    run_time = luigi.DateSecondParameter()
    tool = luigi.Parameter()

    def __init__(self, *args, **kwargs):
        super(Pull_Protein, self).__init__(*args, **kwargs)
        self.task_id = '{}_{}_{}'.format(self.get_task_family(), self.run_time, self.annotation_accession)

    def out_file_path(self, temp=False):
        return {
            'fasta': os.path.join(self.out_dir(temp), '%s.faa' % self.annotation_accession)
        }

    def do_task(self):
        # Get protein sequence from MAS's REST API
        response = requests.get(
            self.mas_server + reverse('get_protein', kwargs={'accession': self.annotation_accession}),
            auth=(self.g.MAS_USERNAME, self.g.MAS_PASSWORD),
            verify=self.g.MAS_CRT
        )

        if response.status_code != 200:
            self.logger.error(
                'Response status code = {}. Response text = {}.'.format(response.status_code, response.text)
            )
            raise requests.ConnectionError('Request to get protein sequence from MAS server failed')

        # Write sequence file
        protein_seq = Seq(response.json()['sequence'], IUPAC.IUPACProtein)
        rec = SeqRecord(protein_seq, id=self.annotation_accession, description='')
        SeqIO.write(rec, self.out_file_path(True)['fasta'], 'fasta')
class snowplow_enriched_insert_data(bigquery.BigqueryLoadTask):
    dataset_date = luigi.DateParameter(default=date.today() - timedelta(days=1))
    # force_run = luigi.BoolParameter()
    _start = luigi.DateSecondParameter(default=datetime.utcnow())
    file_root = luigi.Parameter()
    credentials = GoogleCredentials.get_application_default()

    source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
    write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

    def source_uris(self):
        return [x.path for x in luigi.task.flatten(self.input())]

    def output(self):
        return bigquery.BigqueryTarget("realself-main", "snowplow", "Events")

    def complete(self):
        return check_partition_modified(
            table="%s.%s" % (self.output().table.dataset_id, self.file_root),
            partition=self.dataset_date.strftime("%Y%m%d"),
            threshold=60,
            time_ref=self._start)
class ExtractNameId(luigi.Task):
    date = luigi.DateSecondParameter()

    def requires(self):
        return [GetUserFromOmie(self.date)]

    def get_name_id(self):
        with self.input()[0].open('r') as json_file:
            users = json.load(json_file)

        users_ids = []
        for cadastro in users['cadastros']:
            users_ids.append({
                'code': cadastro['nCodigo'],
                'name': cadastro['cNome']
            })
        return users_ids

    def run(self):
        df = pd.DataFrame(self.get_name_id())
        df.to_parquet(self.output().path, index=False)

    def output(self):
        path = f"data/users_{str(self.date)}_ids.parquet"
        return luigi.LocalTarget(path)
def test_parse(self):
    ds = luigi.DateSecondParameter().parse('2013-02-01T184227')
    self.assertEqual(ds, datetime.datetime(2013, 2, 1, 18, 42, 27))
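# A quick illustration of the round trip the test above relies on: luigi's
# DateSecondParameter serializes datetimes with the '%Y-%m-%dT%H%M%S' format
# and parses them back (standalone sketch, not part of the test suite).
import datetime
import luigi

param = luigi.DateSecondParameter()
dt_value = datetime.datetime(2013, 2, 1, 18, 42, 27)
assert param.serialize(dt_value) == '2013-02-01T184227'
assert param.parse('2013-02-01T184227') == dt_value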
class ListDateParamTask(luigi.Task):
    param1 = luigi.Parameter()
    param2 = luigi.DateSecondParameter(default=datetime.now())
    param3 = luigi.Parameter(default=['something'])
class ExtractSitesSummary(luigi.Task):
    """ Extracts Site Summary data. """
    start_date = luigi.DateSecondParameter(default=(dt.datetime.today() - dt.timedelta(days=1)))
    load_date = luigi.DateSecondParameter(default=dt.datetime.today())
    extract_performance = ExtractSitesPerformance()
    tmp_files = CleanUpTempFiles()
    pl = Pipeline("LoadSites", "ExtractSitesSummary")

    # This class ensures that the status of the APICA Checks is up-to-date.
    check_status = CheckStatus()

    # Database connections
    conn = Connection()
    pw_src_cur = conn.pw_src.return_cursor()
    pw_tar_cur = conn.pw_tar.return_cursor()

    def output(self):
        return PipelineTasks(self.pl.etl_name, self.pl.task_name,
                             self.load_date.date(), self.start_date.date())

    def run(self):
        processing_time_span = self.get_processing_timespan(self.start_date.date())

        # Remove the previous day's temp files.
        self.tmp_files.remove()

        check_ids = self.get_multi_url_check_ids()
        if len(check_ids) == 0:
            self.inform_luigi_processing_completed()
            return

        for check_id in check_ids:
            url = self.get_url(processing_time_span, check_id[0])
            json_dict_summary_result = self.get_data(url, check_id[0])

            if json_dict_summary_result is not None and len(json_dict_summary_result) > 0:
                self.extract_performance.run(json_dict_summary_result, self.start_date.date())
            else:
                # Write empty performance data since there is no header info for the check id.
                empty_result = []
                fact_path = self.extract_performance.get_file_path(self.start_date.date(), check_id[0])
                self.save_data(fact_path, empty_result)

            # Save the header results.
            full_path = self.get_file_path(check_id[0])
            self.save_data(full_path, json_dict_summary_result)

        self.pl.tasks_insert(self.load_date, self.start_date)

    def get_processing_timespan(self, start_date=''):
        if start_date == '':
            start_date = (dt.datetime.today() - dt.timedelta(days=1)).date()

        date_time_span = dict()
        date_time_span['start_day'] = start_date
        date_time_span['end_day'] = start_date
        date_time_span['to_hour'] = 'T23:59:59'
        date_time_span['from_hour'] = 'T00:00:00'

        return date_time_span

    def get_multi_url_check_ids(self):
        """
        Obtain the Check IDs that have multiple URLs associated with them.
        :return: List of check IDs
        """
        try:
            check_id_list = self.pw_src_cur.execute(var.GET_MULTI_URL_CHECKS).fetchall()
        except Exception as e:
            error_msg = var.GET_MULTI_URL_CHECKS_ERROR_MSG.format(e)
            self.pl.logger.error(error_msg)
            sys.exit(error_msg)

        return check_id_list

    def get_url(self, processing_time_span, check_id):
        """
        Builds the URL.
        :param processing_time_span: Time block for desired data.
        :param check_id: Check Id for the particular client.
        :return: URL that constitutes the Get API call.
        """
        ts_for_api = var.UTC_TIMESPAN.format(processing_time_span['start_day'],
                                             processing_time_span['from_hour'],
                                             processing_time_span['end_day'],
                                             processing_time_span['to_hour'])
        url = var.URL_QUERY_INFO.format(var.BASE_URL, str(check_id), ts_for_api, var.AUTH_TICKET)
        return url

    def get_data(self, url, check_id):
        """
        Executes the Get API call.
        :param url: URL that constitutes the Get API call.
        :param check_id: Check Id for the particular client.
        :return: Site performance data for the particular client.
        """
        try:
            response = urllib.request.urlopen(url)
        except Exception as e:
            error_msg = "FAILED to obtain data from URL. ERROR MESSAGE: {} --URL: {} ".format(e, url)
            self.pl.logger.exception(error_msg)
            self.check_status.determine_check_id_status(check_id)
            return None

        # The HTTP call returns a JSON array of monitors.
        json_list_result = response.read()
        json_dict_result = json.loads(json_list_result.decode('utf-8'))

        if not json_dict_result:
            self.pl.logger.info("No data to process.")
            return None

        return json_dict_result

    def get_file_path(self, check_id):
        prefix = var.SUMMARIES_HEADERS.format(check_id)
        filename = get_filename(self.start_date.date(), prefix, 'json')
        return os.path.join(var.DATA_SITES_PTH, filename)

    def save_data(self, full_path, json_dict_result):
        with open(full_path, 'w') as outfile:
            json.dump(json_dict_result, outfile)
class TransformLoadSites(luigi.Task):
    start_date = luigi.DateSecondParameter(default=(dt.datetime.today() - dt.timedelta(days=1)))
    load_date = luigi.DateSecondParameter(default=dt.datetime.today())
    pl = Pipeline("Load2Sites", "TransformSitesSummary")
    site_manager = SiteManager()

    # Database connections
    conn = Connection()
    pw_src_cur = conn.pw_src.return_cursor()
    plat_cur = conn.plat_src.return_cursor()
    pw_tar_cur = conn.pw_tar.return_cursor()
    pw_tar_sa = conn.sa_create_engine(conn.pw_tar_cstr)
    pw_src_conn = conn.pw_src.return_conn()

    def output(self):
        return PipelineTasks(self.pl.etl_name, self.pl.task_name,
                             self.load_date.date(), self.start_date.date())

    def requires(self):
        return ExtractSitesSummary(start_date=self.start_date, load_date=self.load_date)

    def run(self):
        check_ids = self.get_multi_url_check_ids()
        url_org_id_mappings_df = self.get_url_org_id_mappings()

        for check_id in check_ids:
            # Process Dim information for check id.
            header_data = self.get_data(check_id[0], 'headers')
            if header_data is not None:
                headers = []
                for dict_data in header_data:
                    if self.is_valid_header(dict_data):
                        headers.append(self.transform_header_data(dict_data))

                transformed_headers = pd.DataFrame(headers, columns=var.HEADER_COLUMNS)
                is_deleted = self.delete_dim_records(transformed_headers['ResultID'])
                if is_deleted:
                    self.load_data(transformed_headers, var.DIM_SITES_TABLE)

            # Process Fact information for check id.
            performance_data = self.get_data(check_id[0], 'performance')
            performance_results = self.transform_performance_data(performance_data,
                                                                  url_org_id_mappings_df)
            transformed_results = pd.DataFrame(performance_results, columns=var.SITES_COLUMNS)
            is_facts_deleted = self.delete_fact_records(transformed_results['ResultID'].iloc[0])
            if is_facts_deleted:
                self.load_data(transformed_results, var.FACT_SITES_TABLE)

        self.pl.tasks_insert(self.load_date, self.start_date)

    def get_data(self, check_id, file_type):
        """
        Get the data from previously saved json files.
        :param check_id: Check Id for the particular client.
        :param file_type: header file or performance file (details)
        :return: Json object.
        """
        prefix = ''
        if file_type == 'headers':
            prefix = 'Summaries-Headers-{}'.format(check_id)
        else:
            prefix = 'SitesPerformance-{}'.format(check_id)

        filename = get_filename(self.start_date.date(), prefix, 'json')
        full_path = os.path.join(var.DATA_SITES_PTH, filename)

        data = []
        with open(full_path) as data_file:
            data = json.load(data_file)

        return data

    def delete_dim_records(self, result_ids):
        """
        If this date is being re-run, the existing Dim records must be deleted.
        :param result_ids: List of Result IDs
        :return: True if the deletion was successful, otherwise False. This is to ensure
                 that the insertion is not executed if an error occurs.
        """
        if len(result_ids) == 0:
            return True

        # Build the list for the IN clause.
        in_values = ''
        for rid in result_ids:
            in_values = in_values + "'{}',".format(rid)

        # Drop the last ',' from the string.
        in_values = in_values[0:len(in_values) - 1]

        try:
            self.pw_tar_cur.execute(var.DELETE_DIM_SITES.format(in_values)).commit()
        except Exception as e:
            error_msg = "Unable to delete from DimSites table. ResultIDs: {}. ERROR Message: {}"\
                .format(in_values, e)
            self.pl.logger.exception(error_msg)
            return False

        return True

    def delete_fact_records(self, result_id):
        """
        If this date is being re-run, the existing fact records must be deleted.
        :param result_id: Result ID being processed.
        :return: True if the deletion was successful, otherwise False. This is to ensure
                 that the insertion is not executed if an error occurs.
        """
        try:
            self.pw_tar_cur.execute(var.DELETE_SITES.format(result_id)).commit()
        except Exception as e:
            error_msg = "Unable to delete from FactSites table. ResultID: {}. ERROR Message: {}"\
                .format(result_id, e)
            self.pl.logger.exception(error_msg)
            return False

        return True

    def load_data(self, df, table_name):
        """
        Saves records to the database.
        :param df: The pandas dataframe that is to be saved.
        :param table_name: The table the data is to be saved to.
        :return: True if the records were successfully saved.
        """
        try:
            df.to_sql(table_name, schema='dw', if_exists='append', con=self.pw_tar_sa, index=None)
        except Exception as e:
            err_msg = "ERROR occurred while inserting new checks to StgChecks table. Actual message: {}".format(e)
            self.pl.logger.error(err_msg)
            sys.exit(err_msg)

        return True

    def get_multi_url_check_ids(self):
        """
        Obtain the Check IDs that have multiple URLs associated with them.
        :return: List of check IDs
        """
        try:
            check_id_list = self.pw_src_cur.execute(var.GET_MULTI_URL_CHECKS).fetchall()
        except Exception as e:
            error_msg = var.GET_MULTI_URL_CHECKS_ERROR_MSG.format(e)
            self.pl.logger.error(error_msg)
            sys.exit(error_msg)

        return check_id_list

    def is_valid_header(self, dict_data):
        """
        Validates the numeric Dim data returned.
        :param dict_data: A row of Dim data.
        :return: True if all fields contain digits; otherwise, False.
        """
        is_valid = True
        if 'check_id' in dict_data and not type(dict_data['check_id']) is int:
            is_valid = False
        if 'value' in dict_data and not type(dict_data['value']) is int:
            is_valid = False
        if 'result_code' in dict_data and not type(dict_data['result_code']) is int:
            is_valid = False
        if 'attempts' in dict_data and not type(dict_data['attempts']) is int:
            is_valid = False

        return is_valid

    def transform_performance_data(self, performance_data, url_org_id_mappings_df):
        """
        Validates and transforms the Performance data so that it can be saved to the database.
        If Check_ID, Result ID, or TimestampUTC contains invalid data, the entire set is rejected.
        If the url is not provided, the performance result for that record is not saved.
        :param performance_data: Performance data for a specified Result ID
        :param url_org_id_mappings_df: url to organization mapping.
        :return: Transformed data.
        """
        processed_check_results = []
        data = performance_data[0]
        check_results = data['check_results']

        for check_result in check_results:
            is_valid_check_result = True

            check_id = 0
            if 'check_id' in check_result and type(check_result['check_id']) is int:
                check_id = check_result['check_id']
            else:
                is_valid_check_result = False

            result_id = ''
            if 'result_id' in check_result and check_result['result_id'] is not None:
                result_id = check_result['result_id'][0:60]
            else:
                is_valid_check_result = False

            time_stamp_utc = ''
            if 'time_stamp_utc' in check_result and check_result['time_stamp_utc'] is not None:
                time_stamp_utc = check_result['time_stamp_utc']
            else:
                is_valid_check_result = False

            url_results = check_result['url_results']
            if not is_valid_check_result or len(url_results) == 0:
                return processed_check_results

            for url_result in url_results:
                is_valid_record = True

                url_number = -1
                if 'url_number' in url_result and type(url_result['url_number']) is int:
                    url_number = url_result['url_number']

                url_domain_name = ''
                url_hash = 0x00
                if 'url' in url_result and url_result['url'] is not None:
                    url = url_result['url'][0:2083]
                    url_domain_name = self.get_url_name(url)
                    url_domain_name = url_domain_name.lower().strip()
                    url_hash = self.site_manager.get_hash(url_domain_name)
                else:
                    is_valid_record = False

                organization_id = 0
                tenant_id = 0
                if len(url_domain_name) > 0:
                    result = url_org_id_mappings_df[url_org_id_mappings_df['URL'] == url_domain_name]
                    if len(result) > 0:
                        organization_id = result['OrganizationID']
                        tenant_id = result['ParentID']

                elapsed_ms = -1
                if 'elapsed_ms' in url_result and type(url_result['elapsed_ms']) is int:
                    elapsed_ms = url_result['elapsed_ms']

                received_bytes = -1
                if 'received_bytes' in url_result and type(url_result['received_bytes']) is int:
                    received_bytes = url_result['received_bytes']

                http_method = 'Not Provided'
                if 'http_method' in url_result and url_result['http_method'] is not None:
                    http_method = url_result['http_method']

                http_status_code = -1
                if 'http_status_code' in url_result and type(url_result['http_status_code']) is int:
                    http_status_code = url_result['http_status_code']

                dns_lookup_duration_ms = -1
                if 'dns_lookup_duration_ms' in url_result and type(url_result['dns_lookup_duration_ms']) is int:
                    dns_lookup_duration_ms = url_result['dns_lookup_duration_ms']

                connect_duration_ms = -1
                if 'connect_duration_ms' in url_result and type(url_result['connect_duration_ms']) is int:
                    connect_duration_ms = url_result['connect_duration_ms']

                send_duration_ms = -1
                if 'send_duration_ms' in url_result and type(url_result['send_duration_ms']) is int:
                    send_duration_ms = url_result['send_duration_ms']

                wait_duration_ms = -1
                if 'wait_duration_ms' in url_result and type(url_result['wait_duration_ms']) is int:
                    wait_duration_ms = url_result['wait_duration_ms']

                receive_duration_ms = -1
                if 'receive_duration_ms' in url_result and type(url_result['receive_duration_ms']) is int:
                    receive_duration_ms = url_result['receive_duration_ms']

                headers = 'Not Provided'
                if 'headers' in url_result and url_result['headers'] is not None:
                    tmp = str(url_result['headers'])
                    headers = tmp[0:2083]

                multiple_timings = 'Not Provided'
                if 'multiple_timings' in url_result and url_result['multiple_timings'] is not None:
                    multiple_timings = str(url_result['multiple_timings'])

                if is_valid_record:
                    result = {
                        "CheckID": check_id,
                        "ResultID": result_id,
                        "URLNumber": url_number,
                        "URL": url_domain_name,
                        "URLHash": url_hash,
                        "TenantID": int(tenant_id),
                        "OrganizationID": int(organization_id),
                        "TimestampUTC": time_stamp_utc,
                        "ElapsedMS": elapsed_ms,
                        "ReceivedBytes": received_bytes,
                        "HTTPMethod": http_method,
                        "HTTPStatusCode": http_status_code,
                        "DNSLookupDurationMS": dns_lookup_duration_ms,
                        "ConnectDurationMS": connect_duration_ms,
                        "SendDurationMS": send_duration_ms,
                        "WaitDurationMS": wait_duration_ms,
                        "ReceiveDurationMS": receive_duration_ms,
                        "Headers": headers,
                        "MultipleTimings": multiple_timings
                    }
                    processed_check_results.append(result)

        return processed_check_results

    def transform_header_data(self, dict_data):
        check_id = 0
        if 'check_id' in dict_data:
            check_id = int(dict_data['check_id'])

        result_id = ''
        if 'identifier' in dict_data:
            result_id = dict_data['identifier'][0:60]

        message = ''
        if 'message' in dict_data:
            message = dict_data['message']

        attempts = 0
        if 'attempts' in dict_data:
            attempts = int(dict_data['attempts'])

        result_code = 0
        if 'result_code' in dict_data:
            result_code = int(dict_data['result_code'])

        timestamp_utc = ''
        if 'timestamp_utc' in dict_data:
            timestamp_utc = dict_data['timestamp_utc']

        severity = ''
        if 'severity' in dict_data:
            severity = dict_data['severity']

        value = 0
        if 'value' in dict_data:
            value = dict_data['value']

        unit = ''
        if 'unit' in dict_data:
            unit = dict_data['unit']

        header = {
            "ResultID": result_id,
            "CheckID": check_id,
            "TimeStampUTC": timestamp_utc,
            "Message": message,
            "Attempts": attempts,
            "ResultCode": result_code,
            "Severity": severity,
            "Value": value,
            "Unit": unit
        }
        return header

    def get_url_org_id_mappings(self):
        try:
            url_orgid_mappings_tuples = self.plat_cur.execute(
                var.GET_URL_ORGID_MAPPINGS_WITH_HASH).fetchall()
        except Exception as e:
            error_msg = var.URL_ORGID_MAPPINGS_ERROR_MSG.format(e)
            self.pl.logger.error(error_msg)
            sys.exit(error_msg)

        # Convert to lists.
        mapping_list = []
        for row in url_orgid_mappings_tuples:
            a_row = list()
            a_row.append(row[0])
            a_row.append(row[1])
            a_row.append(row[2])
            mapping_list.append(a_row)

        mappings = pd.DataFrame(mapping_list, columns=var.URL_ORGID_MAPPINGS_COLUMNS)
        return mappings

    def get_url_name(self, url):
        """ Remove the http scheme and other parts from the url. """
        if len(url) == 0:
            return ''

        url_info = urllib.parse.urlsplit(url)
        return url_info.hostname.rstrip()
class SplitTrainTest(luigi.Task):
    date = luigi.DateSecondParameter(default=datetime.now())

    def requires(self):
        return CreateModellingTable()

    def output(self):
        return [
            S3Target(s3.path(S3.MODELLING + "train.parquet"), client=s3.create_client()),
            S3Target(s3.path(S3.MODELLING + "test.parquet"), client=s3.create_client()),
            S3Target(
                s3.path(S3.MODELS +
                        "{date:%Y/%m/%d/train_T%H%M%S.parquet}".format(date=self.date)),
                client=s3.create_client(),
            ),
            S3Target(
                s3.path(S3.MODELS +
                        "{date:%Y/%m/%d/test_T%H%M%S.parquet}".format(date=self.date)),
                client=s3.create_client(),
            ),
        ]

    def run(self):
        df_modelling = s3.read_parquet(self.input().path)

        df_train, df_test = self.train_test_split(df_modelling)
        df_train, df_test = self.scale_numeric_feats(df_train, df_test)

        # NOTE: Save both datasets twice.
        #       - One set that is tied to a trained model
        #       - One set that gets overwritten with the current one
        s3.write_parquet(df_train, self.output()[0].path)
        s3.write_parquet(df_test, self.output()[1].path)
        s3.write_parquet(df_train, self.output()[2].path)
        s3.write_parquet(df_test, self.output()[3].path)

    def train_test_split(self, df_modelling):
        """
        Split the modelling table into training and test sets.
        Keep the most recent year of data as the test set.

        :param df_modelling: Full modelling dataframe
        :return: tuple(df_train, df_test)
        """
        cutoff = df_modelling["exit_date"].max() - np.timedelta64(1, "Y")
        df_train = df_modelling[df_modelling["exit_date"] < cutoff].drop(
            "exit_date", axis="columns")
        df_test = df_modelling[df_modelling["exit_date"] >= cutoff].drop(
            "exit_date", axis="columns")

        # Resample into the training set to keep an 80/20 ratio if the test set is larger
        test_size = round((len(df_train) + len(df_test)) * 0.2)
        if len(df_test) > test_size:
            df_sample = df_test.sample(n=(len(df_test) - test_size), random_state=1)
            df_test = df_test.drop(df_sample.index)
            df_train = df_train.append(df_sample)

        return df_train, df_test

    def scale_numeric_feats(self, df_train, df_test):
        """
        Scale numeric features of the training and test sets.
        Scale the test set with the training scaler to prevent data leakage.

        :param df_train: Training dataframe
        :param df_test: Test dataframe
        :return: tuple(df_train, df_test)
        """
        scaler = MinMaxScaler()
        num_cols = list(df_train.select_dtypes(include=[np.number]))
        scaler.fit(df_train[num_cols])
        df_train[num_cols] = scaler.transform(df_train[num_cols])
        df_test[num_cols] = scaler.transform(df_test[num_cols])

        return df_train, df_test
class Blastp(PipelineTask):
    '''
    Generate blastp results for a given proteome
    '''
    annotation_accession = luigi.Parameter(default='')
    e_value = luigi.FloatParameter(default=0.01)
    database = luigi.Parameter()
    run_time = luigi.DateSecondParameter()
    tool = 'blastp'

    # Available Database Choices
    swissprot = luigi.Parameter()
    nr = luigi.Parameter()
    internal = luigi.Parameter()  # specific

    # CPU to use for each job
    swissprot_cpu = luigi.Parameter()
    nr_cpu = luigi.Parameter()
    internal_cpu = luigi.Parameter()

    def __init__(self, *args, **kwargs):
        super(Blastp, self).__init__(*args, **kwargs)
        if self.database == 'nr':
            self.n_cpu = self.nr_cpu
        elif self.database == 'swissprot':
            self.n_cpu = self.swissprot_cpu
        elif self.database == 'internal':
            self.n_cpu = self.internal_cpu

    def requires(self):
        return Pull_Protein(
            annotation_accession=self.annotation_accession,
            run_time=self.run_time,
            tool=self.tool,
            database=self.database,
            mas_server=self.mas_server
        )

    def out_file_path(self, temp=False):
        name = self.annotation_accession
        return {
            'results': os.path.join(self.out_dir(temp), '{}_{}_blastp_results.xml'.format(name, self.database))
        }

    def out_dir(self, temp=False):
        folder = '{}-temp'.format(self.database) if temp else self.database
        return os.path.join(self.pipeline_out_dir(), self.task_family, folder)

    def do_task(self):
        if self.database == 'nr':
            db_path = self.nr
        elif self.database == 'swissprot':
            db_path = self.swissprot
        elif self.database == 'internal':
            db_path = self.internal
        else:
            raise ValueError('Invalid database ' + self.database)

        self._run_command([
            'blastp',
            '-query', self.input()['fasta'].path,
            '-db', db_path,
            '-evalue', str(self.e_value),
            '-outfmt', '5',
            '-out', self.out_file_path(True)['results'],
            '-num_threads', str(self.n_cpu)
        ])
class ExtractCapacities(GomusScraperTask):
    """Extract all capacities from the fetched gomus pages."""

    today = luigi.DateSecondParameter(default=dt.datetime.today())

    popover_pattern = regex.compile(r'''
        <script> \s*
            \$\("\#info-\d+"\)\.popover\(
                ( \{ \s*
                    (?<elem> \w+ \s* : \s* '(?:\\.|[^\\\'])*' \s*){0}
                    (?:(?&elem) , \s*)*
                    (?&elem)
                \} )
            \);
        \s* </script>
        ''', flags=regex.X)

    def output(self):
        return luigi.LocalTarget(f'{self.output_dir}/gomus/capacities.csv', format=UTF8)

    def requires(self):
        return FetchCapacities(today=self.today)

    def run(self):
        with self.input().open() as input_:
            df_htmls = pd.read_csv(input_)

        capacities = [
            self.extract_capacities(html_path)
            for html_path in self.tqdm(df_htmls['file_path'],
                                       desc="Extracting capacities")
        ]

        df_capacities = pd.DataFrame(columns=[
            'quota_id', 'date', 'time', 'max', 'sold', 'reserved',
            'available', 'last_updated'
        ])
        if capacities:
            df_capacities = pd.concat([df_capacities, *capacities])
        df_capacities['last_updated'] = self.today

        with self.output().open('w') as output:
            df_capacities.to_csv(output, index=False)

    def extract_capacities(self, html_path):
        with open(html_path) as file:
            src = file.read()
        dom: html.HtmlElement = html.fromstring(src)

        quota_id, min_date = self.extract_header(dom)
        logger.debug("Scraping capacities from quota_id=%s for min_date=%s",
                     quota_id, min_date)
        capacities = self.create_zero_data(min_date)

        def load_data(data):
            return pd.DataFrame(
                data,
                columns=[*capacities.index.names, *capacities.columns],
                dtype=object).set_index(capacities.index.names)

        basic_capacities = load_data(self.extract_basic_capacities(dom))
        capacities.update(basic_capacities)

        detailed_capacities = load_data(
            self.extract_detailed_capacities(src, min_date))
        capacities.update(detailed_capacities)

        capacities = capacities.reset_index()
        capacities.insert(0, 'quota_id', quota_id)

        return capacities

    def create_zero_data(self, min_date: dt.date):
        df = pd.DataFrame(columns=['max', 'sold', 'reserved', 'available'])
        dates = [min_date + dt.timedelta(days=days) for days in range(0, 7)]
        times = list(
            self.create_time_range(delta=dt.timedelta(minutes=SLOT_LENGTH_MINUTES)))
        return df.reindex(pd.MultiIndex.from_product([dates, times],
                                                     names=['date', 'time']),
                          fill_value=0)

    def extract_header(self, dom: html.HtmlElement):
        """Extract general information from the DOM, e.g. quota ID or date."""
        quota_id = self.parse_int(
            dom, '//body/div[2]/div[2]/div[2]/div/div/ol/li[2]/a/div')
        min_date = self.parse_date(
            dom,
            '//body/div[2]/div[2]/div[3]/div/div[1]/div/div[2]/form/div[2]/'
            'div/div/input/@value')
        return quota_id, min_date

    def extract_basic_capacities(self, dom: html.HtmlElement):
        """
        Extract basic capacity values from the DOM.

        These are the values from the table indicating the availabilities for
        each slot. Generally, this is only a subset of the data returned by
        extract_detailed_capacities(). However, in some cases, gomus displays
        (defect) negative values in the table and does not provide details
        about them, so this method is required to record the defect values
        anyway.
        """
        cells = dom.xpath(
            '//body/div[2]/div[2]/div[3]/div/div[2]/div/div[2]/table/tbody/'
            'tr[position()>1]/td[position()>1]')
        for cell in cells:
            datetime = dt.datetime.fromtimestamp(int(cell.get('data-timestamp')))
            available = int(cell.text_content().strip())
            yield dict(date=datetime.date(),
                       time=datetime.time(),
                       max=available,
                       available=available)

    def extract_detailed_capacities(self, src: str, min_date: dt.date):
        """Extract capacity details from the hovercards in the HTML source."""
        js_infos = [match[0] for match in self.popover_pattern.findall(src)]
        infos = [js2py.eval_js(f'd = {js}') for js in js_infos]
        for info in infos:
            yield self.extract_capacity(info, min_date)

    def extract_capacity(self, info, min_date):
        """Extract capacity details from a single hovercard info."""
        title: html.HtmlElement = html.fromstring(info['title'])
        content: html.HtmlElement = html.fromstring(info['content'])

        datetime = self.parse_date(title, relative_base=min_date)

        return dict(date=datetime.date(),
                    time=datetime.time(),
                    max=self.parse_int(content, '//tbody[1]/tr[1]/td[2]'),
                    sold=self.parse_int(content, '//tbody[1]/tr[2]/td[2]'),
                    reserved=self.parse_int(content, '//tbody[1]/tr[3]/td[2]'),
                    available=self.parse_int(content, '//tfooter[1]/tr/td[2]'))

    @staticmethod
    def create_time_range(delta: dt.timedelta) -> Iterable[dt.time]:
        assert delta.days == 0
        time = npt.nptime()
        while True:
            yield time
            prev_time = time
            time += delta
            if time <= prev_time:
                break