def run(self):
    client = MongoClient(util.mongo_host, util.mongo_port)
    try:
        jobs.update_job_status(str(self.job), util.conn_string, jobs.IN_PROGRESS,
                               "Running ProviderAssertion Batch %s" % self.batch)

        pipeline_config = config.get_pipeline_config(self.pipeline, util.conn_string)
        jobs.update_job_status(str(self.job), util.conn_string, jobs.IN_PROGRESS,
                               "Running Solr query")
        docs = solr_data.query(self.solr_query,
                               rows=util.row_count,
                               start=self.start,
                               solr_url=util.solr_url,
                               tags=pipeline_config.report_tags,
                               report_type_query=pipeline_config.report_type_query,
                               mapper_inst=util.report_mapper_inst,
                               mapper_url=util.report_mapper_url,
                               mapper_key=util.report_mapper_key,
                               cohort_ids=pipeline_config.cohort)

        term_matcher = TermFinder(pipeline_config.terms,
                                  pipeline_config.include_synonyms,
                                  pipeline_config.include_descendants,
                                  pipeline_config.include_ancestors,
                                  pipeline_config.vocabulary)

        pa_filters = provider_assertion_filters
        if pipeline_config.sections and len(pipeline_config.sections) > 0:
            pa_filters[SECTIONS_FILTER] = pipeline_config.sections

        with self.output().open('w') as outfile:
            jobs.update_job_status(str(self.job), util.conn_string, jobs.IN_PROGRESS,
                                   "Finding terms with TermFinder")
            for doc in docs:
                terms_found = term_matcher.get_term_full_text_matches(
                    doc["report_text"], pa_filters)
                for term in terms_found:
                    inserted = mongo_writer(client, self.pipeline, self.job,
                                            self.batch, pipeline_config, term,
                                            doc, "ProviderAssertion")
                    outfile.write(str(inserted))
                    outfile.write('\n')
                del terms_found
        del docs
    except Exception as ex:
        traceback.print_exc(file=sys.stderr)
        jobs.update_job_status(str(self.job), util.conn_string, jobs.WARNING,
                               ''.join(traceback.format_stack()))
        print(ex)
    finally:
        client.close()
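
# mongo_writer is defined elsewhere in the package; each batch task in this
# file writes the id it returns to the batch output file. A minimal sketch of
# a plausible implementation, based only on the call sites here -- the
# database name, collection name, and copied fields are assumptions, not
# confirmed by this source:
def mongo_writer_sketch(client, pipeline_id, job_id, batch, pipeline_config,
                        obj, doc, pipeline_type):
    db = client['nlp']                       # assumed database name
    record = dict(obj)                       # the match produced by the task
    record.update({
        'pipeline_type': pipeline_type,      # e.g. "ProviderAssertion"
        'pipeline_id': pipeline_id,
        'job_id': job_id,
        'batch': batch,
        'report_id': doc['report_id'],       # assumed Solr document field
    })
    # pymongo's insert_one returns a result object; its inserted_id is the
    # value the caller stringifies into the batch output file
    return db['pipeline_results'].insert_one(record).inserted_id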
def run(self):
    task_family_name = str(self.task_family)
    if self.task_name == "ClarityNLPLuigiTask":
        self.task_name = task_family_name
    client = util.mongo_client()
    try:
        with self.output().open('w') as temp_file:
            temp_file.write("start writing custom task")
            jobs.update_job_status(str(self.job), util.conn_string, jobs.IN_PROGRESS,
                                   "Running Batch %s" % self.batch)

            self.pipeline_config = config.get_pipeline_config(self.pipeline,
                                                              util.conn_string)
            jobs.update_job_status(str(self.job), util.conn_string, jobs.IN_PROGRESS,
                                   "Running Solr query")
            self.docs = solr_data.query(self.solr_query,
                                        rows=util.row_count,
                                        start=self.start,
                                        solr_url=util.solr_url,
                                        tags=self.pipeline_config.report_tags,
                                        mapper_inst=util.report_mapper_inst,
                                        mapper_url=util.report_mapper_url,
                                        mapper_key=util.report_mapper_key,
                                        types=self.pipeline_config.report_types,
                                        sources=self.pipeline_config.sources,
                                        filter_query=self.pipeline_config.filter_query,
                                        cohort_ids=self.pipeline_config.cohort,
                                        job_results_filters=self.pipeline_config.job_results)

            # Cache each document in memory and/or Redis so later lookups by
            # report id can skip another Solr round trip.
            for d in self.docs:
                doc_id = d[util.solr_report_id_field]
                if util.use_memory_caching == "true":
                    k = keys.hashkey(doc_id)
                    document_cache[k] = d
                if util.use_redis_caching == "true":
                    util.write_to_redis_cache("doc:" + doc_id, json.dumps(d))

            jobs.update_job_status(str(self.job), util.conn_string, jobs.IN_PROGRESS,
                                   "Running %s main task" % self.task_name)
            self.run_custom_task(temp_file, client)
            temp_file.write("Done writing custom task!")

        self.docs = list()
    except Exception as ex:
        traceback.print_exc(file=sys.stderr)
        jobs.update_job_status(str(self.job), util.conn_string, jobs.WARNING,
                               ''.join(traceback.format_stack()))
        print(ex)
    finally:
        client.close()
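
# run() above is a template method: it fetches and caches the batch's
# documents, then delegates to run_custom_task(). A minimal hypothetical
# subclass for illustration -- the class name and its matching criterion
# are invented, but the run_custom_task(temp_file, client) signature matches
# the call in run() above:
class CoughFinderTask(ClarityNLPLuigiTask):
    task_name = "CoughFinderTask"

    def run_custom_task(self, temp_file, mongo_client):
        # self.docs was populated by the Solr query in run()
        for doc in self.docs:
            if 'cough' in doc['report_text'].lower():  # toy criterion
                # a real task would persist results via mongo_client, much
                # like mongo_writer in the term-matching tasks in this file
                temp_file.write(str(doc[util.solr_report_id_field]) + '\n')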
def run(self):
    client = MongoClient(util.mongo_host, util.mongo_port)
    try:
        jobs.update_job_status(str(self.job), util.conn_string, jobs.IN_PROGRESS,
                               "Running MeasurementFinder Batch %s" % self.batch)

        pipeline_config = config.get_pipeline_config(self.pipeline, util.conn_string)
        jobs.update_job_status(str(self.job), util.conn_string, jobs.IN_PROGRESS,
                               "Running Solr query")
        docs = solr_data.query(self.solr_query,
                               rows=util.row_count,
                               start=self.start,
                               solr_url=util.solr_url,
                               tags=pipeline_config.report_tags,
                               mapper_inst=util.report_mapper_inst,
                               mapper_url=util.report_mapper_url,
                               mapper_key=util.report_mapper_key,
                               cohort_ids=pipeline_config.cohort)

        filters = dict()
        if pipeline_config.sections and len(pipeline_config.sections) > 0:
            filters[SECTIONS_FILTER] = pipeline_config.sections

        with self.output().open('w') as outfile:
            jobs.update_job_status(str(self.job), util.conn_string, jobs.IN_PROGRESS,
                                   "Finding terms with MeasurementFinder")
            # TODO incorporate sections and filters
            for doc in docs:
                meas_results = run_measurement_finder_full(doc["report_text"],
                                                           pipeline_config.terms)
                for meas in meas_results:
                    inserted = mongo_writer(client, self.pipeline, self.job,
                                            self.batch, pipeline_config, meas,
                                            doc, "MeasurementFinder")
                    outfile.write(str(inserted))
                    outfile.write('\n')
                del meas_results
        del docs
    except Exception as ex:
        traceback.print_exc(file=sys.stderr)
        jobs.update_job_status(str(self.job), util.conn_string, jobs.WARNING,
                               ''.join(traceback.format_stack()))
        print(ex)
    finally:
        client.close()
def initialize_task_and_get_documents(pipeline_id, job_id, owner):
    jobs.update_job_status(str(job_id), util.conn_string, jobs.IN_PROGRESS,
                           "Initializing task -- pipeline: %s, job: %s, owner: %s"
                           % (str(pipeline_id), str(job_id), str(owner)))
    pipeline_config = config.get_pipeline_config(pipeline_id, util.conn_string)

    # Expand the configured terms with synonyms, descendants, and ancestors
    # before building the Solr query.
    added = copy.copy(pipeline_config.terms)
    for term in pipeline_config.terms:
        related_terms = get_related_terms(util.conn_string, term,
                                          pipeline_config.include_synonyms,
                                          pipeline_config.include_descendants,
                                          pipeline_config.include_ancestors,
                                          escape=False)
        if related_terms and len(related_terms) > 0:
            added.extend(related_terms)

    solr_query = config.get_query(custom_query=pipeline_config.custom_query,
                                  terms=added)
    total_docs = solr_data.query_doc_size(solr_query,
                                          mapper_inst=util.report_mapper_inst,
                                          mapper_url=util.report_mapper_url,
                                          mapper_key=util.report_mapper_key,
                                          solr_url=util.solr_url,
                                          types=pipeline_config.report_types,
                                          filter_query=pipeline_config.filter_query,
                                          tags=pipeline_config.report_tags,
                                          report_type_query=pipeline_config.report_type_query,
                                          sources=pipeline_config.sources,
                                          cohort_ids=pipeline_config.cohort,
                                          job_results_filters=pipeline_config.job_results)
    jobs.update_job_status(str(job_id), util.conn_string,
                           jobs.STATS + "_PIPELINE_" + str(pipeline_id) + "_SOLR_DOCS",
                           str(total_docs))

    doc_limit = config.get_limit(total_docs, pipeline_config)
    jobs.update_job_status(str(job_id), util.conn_string,
                           jobs.STATS + "_PIPELINE_" + str(pipeline_id) + "_DOCUMENT_LIMIT",
                           str(doc_limit))
    jobs.update_job_status(str(job_id), util.conn_string,
                           jobs.STATS + "_PIPELINE_" + str(pipeline_id) + "_EVALUATED_DOCS",
                           str(min(doc_limit, total_docs)))

    ranges = range(0, (doc_limit + int(util.row_count)), int(util.row_count))
    return solr_query, total_docs, doc_limit, ranges
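
# The `ranges` returned above are the Solr `start` offsets, one per batch of
# util.row_count documents (the extra row_count term covers the final partial
# batch). A sketch of how a Luigi wrapper task might fan them out --
# PipelineBatchTask is a hypothetical stand-in for the concrete batch tasks
# whose run() methods appear in this file:
def requires(self):
    solr_query, total_docs, doc_limit, ranges = \
        initialize_task_and_get_documents(self.pipeline, self.job, self.owner)
    for batch_no, start in enumerate(ranges):
        if start >= total_docs:
            break  # no documents left at this offset
        # each batch re-runs the query with rows=util.row_count, start=<offset>
        yield PipelineBatchTask(pipeline=self.pipeline, job=self.job,
                                start=start, batch=batch_no,
                                solr_query=solr_query)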
def run(self):
    task_family_name = str(self.task_family)
    if self.task_name == "ClarityNLPLuigiTask":
        self.task_name = task_family_name
    client = MongoClient(util.mongo_host, util.mongo_port)
    try:
        jobs.update_job_status(str(self.job), util.conn_string, jobs.IN_PROGRESS,
                               "Running Batch %s" % self.batch)

        self.pipeline_config = config.get_pipeline_config(self.pipeline,
                                                          util.conn_string)
        jobs.update_job_status(str(self.job), util.conn_string, jobs.IN_PROGRESS,
                               "Running Solr query")
        self.docs = solr_data.query(self.solr_query,
                                    rows=util.row_count,
                                    start=self.start,
                                    solr_url=util.solr_url,
                                    tags=self.pipeline_config.report_tags,
                                    mapper_inst=util.report_mapper_inst,
                                    mapper_url=util.report_mapper_url,
                                    mapper_key=util.report_mapper_key,
                                    types=self.pipeline_config.report_types,
                                    filter_query=self.pipeline_config.filter_query)

        with self.output().open('w') as temp_file:
            jobs.update_job_status(str(self.job), util.conn_string, jobs.IN_PROGRESS,
                                   "Running %s main task" % self.task_name)
            self.run_custom_task(temp_file, client)
            temp_file.write("Done writing custom task!")

        self.docs = list()
    except Exception as ex:
        traceback.print_exc(file=sys.stderr)
        jobs.update_job_status(str(self.job), util.conn_string, jobs.WARNING,
                               ''.join(traceback.format_stack()))
        print(ex)
    finally:
        client.close()
def initialize_task_and_get_documents(pipeline_id, job_id, owner):
    jobs.update_job_status(str(job_id), util.conn_string, jobs.IN_PROGRESS,
                           "Initializing task -- pipeline: %s, job: %s, owner: %s"
                           % (str(pipeline_id), str(job_id), str(owner)))
    pipeline_config = config.get_pipeline_config(pipeline_id, util.conn_string)
    jobs.update_job_status(str(job_id), util.conn_string, jobs.IN_PROGRESS,
                           "Getting related terms")

    added = copy.copy(pipeline_config.terms)
    for term in pipeline_config.terms:
        related_terms = get_related_terms(util.conn_string, term,
                                          pipeline_config.include_synonyms,
                                          pipeline_config.include_descendants,
                                          pipeline_config.include_ancestors,
                                          escape=False)
        if related_terms and len(related_terms) > 0:
            added.extend(related_terms)

    jobs.update_job_status(str(job_id), util.conn_string, jobs.IN_PROGRESS,
                           "Getting Solr doc size")
    solr_query = config.get_query(custom_query=pipeline_config.custom_query,
                                  terms=added)
    total_docs = solr_data.query_doc_size(solr_query,
                                          mapper_inst=util.report_mapper_inst,
                                          mapper_url=util.report_mapper_url,
                                          mapper_key=util.report_mapper_key,
                                          solr_url=util.solr_url,
                                          types=pipeline_config.report_types,
                                          filter_query=pipeline_config.filter_query,
                                          tags=pipeline_config.report_tags,
                                          report_type_query=pipeline_config.report_type_query,
                                          cohort_ids=pipeline_config.cohort)

    doc_limit = config.get_limit(total_docs, pipeline_config)
    ranges = range(0, (doc_limit + int(util.row_count)), int(util.row_count))

    jobs.update_job_status(str(job_id), util.conn_string, jobs.IN_PROGRESS,
                           "Running batch tasks")
    return solr_query, total_docs, doc_limit, ranges
def initialize_task_and_get_documents(pipeline_id, job_id, owner):
    jobs.update_job_status(str(job_id), util.conn_string, jobs.IN_PROGRESS,
                           "Initializing task -- pipeline: %s, job: %s, owner: %s"
                           % (str(pipeline_id), str(job_id), str(owner)))
    pipeline_config = config.get_pipeline_config(pipeline_id, util.conn_string)

    added = copy.copy(pipeline_config.terms)
    for term in pipeline_config.terms:
        related_terms = get_related_terms(util.conn_string, term,
                                          pipeline_config.include_synonyms,
                                          pipeline_config.include_descendants,
                                          pipeline_config.include_ancestors,
                                          escape=False)
        if related_terms and len(related_terms) > 0:
            added.extend(related_terms)

    solr_query = config.get_query(custom_query=pipeline_config.custom_query,
                                  terms=added)
    total_docs = solr_data.query_doc_size(solr_query,
                                          mapper_inst=util.report_mapper_inst,
                                          mapper_url=util.report_mapper_url,
                                          mapper_key=util.report_mapper_key,
                                          solr_url=util.solr_url,
                                          types=pipeline_config.report_types,
                                          filter_query=pipeline_config.filter_query,
                                          tags=pipeline_config.report_tags,
                                          report_type_query=pipeline_config.report_type_query,
                                          sources=pipeline_config.sources,
                                          cohort_ids=pipeline_config.cohort,
                                          job_results_filters=pipeline_config.job_results)
    jobs.update_job_status(str(job_id), util.conn_string,
                           jobs.STATS + "_PIPELINE_" + str(pipeline_id) + "_SOLR_DOCS",
                           str(total_docs))

    doc_limit = config.get_limit(total_docs, pipeline_config)
    jobs.update_job_status(str(job_id), util.conn_string,
                           jobs.STATS + "_PIPELINE_" + str(pipeline_id) + "_DOCUMENT_LIMIT",
                           str(doc_limit))
    jobs.update_job_status(str(job_id), util.conn_string,
                           jobs.STATS + "_PIPELINE_" + str(pipeline_id) + "_EVALUATED_DOCS",
                           str(min(doc_limit, total_docs)))

    ranges = range(0, (doc_limit + int(util.row_count)), int(util.row_count))
    return solr_query, total_docs, doc_limit, ranges
def run(self):
    client = MongoClient(util.mongo_host, util.mongo_port)
    current_doc = None
    try:
        jobs.update_job_status(str(self.job), util.conn_string, jobs.IN_PROGRESS,
                               "Running ValueExtractor Batch %s" % self.batch)

        pipeline_config = config.get_pipeline_config(self.pipeline, util.conn_string)
        jobs.update_job_status(str(self.job), util.conn_string, jobs.IN_PROGRESS,
                               "Running Solr query")
        docs = solr_data.query(self.solr_query,
                               rows=util.row_count,
                               start=self.start,
                               solr_url=util.solr_url,
                               tags=pipeline_config.report_tags,
                               mapper_inst=util.report_mapper_inst,
                               mapper_url=util.report_mapper_url,
                               mapper_key=util.report_mapper_key,
                               cohort_ids=pipeline_config.cohort)

        filters = dict()
        if pipeline_config.sections and len(pipeline_config.sections) > 0:
            filters[SECTIONS_FILTER] = pipeline_config.sections

        with self.output().open('w') as outfile:
            jobs.update_job_status(str(self.job), util.conn_string, jobs.IN_PROGRESS,
                                   "Finding terms with ValueExtractor")
            # TODO incorporate sections and filters
            for doc in docs:
                current_doc = doc
                result = run_value_extractor_full(pipeline_config.terms,
                                                  doc["report_text"],
                                                  float(pipeline_config.minimum_value),
                                                  float(pipeline_config.maximum_value),
                                                  pipeline_config.case_sensitive)
                if result:
                    for val in result:
                        inserted = mongo_writer(client, self.pipeline, self.job,
                                                self.batch, pipeline_config, val,
                                                doc, "ValueExtractor")
                        outfile.write(str(inserted))
                        outfile.write('\n')
                    del result
                else:
                    outfile.write("no matches!\n")
        del docs
    except AssertionError as a:
        print(a)
        print(current_doc)
    except Exception as ex:
        traceback.print_exc(file=sys.stderr)
        # guard against failures raised before the first document is read
        report_id = current_doc['report_id'] if current_doc else 'unknown'
        jobs.update_job_status(str(self.job), util.conn_string, jobs.WARNING,
                               'Report ID: ' + report_id + '\n' +
                               ''.join(traceback.format_stack()))
        print(ex)
        # print(current_doc)
    finally:
        client.close()
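
# Hypothetical direct call to run_value_extractor_full, mirroring the
# argument order used above (terms, report text, minimum value, maximum
# value, case sensitivity); the terms, text, and bounds are invented, and
# the structure of the returned matches is an assumption based on how the
# loop above iterates and stores them:
values = run_value_extractor_full(['temperature', 'temp'],
                                  'Temp 101.3 F, down from 102.5 F overnight.',
                                  96.0, 106.0, False)
for v in values or []:
    print(v)  # one extracted value match per entry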