Example 1
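A batch task run() method that marks the job in progress, queries Solr for the batch's documents, runs TermFinder over each report's text with the pipeline's term set and optional section filter, and writes every ProviderAssertion match to MongoDB through mongo_writer.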
    def run(self):

        client = MongoClient(util.mongo_host, util.mongo_port)

        try:
            jobs.update_job_status(
                str(self.job), util.conn_string, jobs.IN_PROGRESS,
                "Running ProviderAssertion Batch %s" % self.batch)

            pipeline_config = config.get_pipeline_config(
                self.pipeline, util.conn_string)

            jobs.update_job_status(str(self.job), util.conn_string,
                                   jobs.IN_PROGRESS, "Running Solr query")
            docs = solr_data.query(
                self.solr_query,
                rows=util.row_count,
                start=self.start,
                solr_url=util.solr_url,
                tags=pipeline_config.report_tags,
                report_type_query=pipeline_config.report_type_query,
                mapper_inst=util.report_mapper_inst,
                mapper_url=util.report_mapper_url,
                mapper_key=util.report_mapper_key,
                cohort_ids=pipeline_config.cohort)
            term_matcher = TermFinder(pipeline_config.terms,
                                      pipeline_config.include_synonyms,
                                      pipeline_config.include_descendants,
                                      pipeline_config.include_ancestors,
                                      pipeline_config.vocabulary)
            # Note: pa_filters aliases the shared provider_assertion_filters
            # dict, so the sections filter assigned here persists on that
            # object after this call.
            pa_filters = provider_assertion_filters
            if pipeline_config.sections and len(pipeline_config.sections) > 0:
                pa_filters[SECTIONS_FILTER] = pipeline_config.sections

            with self.output().open('w') as outfile:
                jobs.update_job_status(str(self.job), util.conn_string,
                                       jobs.IN_PROGRESS,
                                       "Finding terms with TermFinder")
                for doc in docs:
                    terms_found = term_matcher.get_term_full_text_matches(
                        doc["report_text"], pa_filters)
                    for term in terms_found:
                        inserted = mongo_writer(client, self.pipeline,
                                                self.job, self.batch,
                                                pipeline_config, term, doc,
                                                "ProviderAssertion")
                        outfile.write(str(inserted))
                        outfile.write('\n')
                    del terms_found
            del docs
        except Exception as ex:
            traceback.print_exc(file=sys.stderr)
            jobs.update_job_status(str(self.job), util.conn_string,
                                   jobs.WARNING,
                                   ''.join(traceback.format_stack()))
            print(ex)
        finally:
            client.close()
Example 2
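A run() method for custom ClarityNLP Luigi tasks: it resolves the task name, queries Solr, optionally caches each document in memory or Redis keyed by its report id, and then hands off to run_custom_task() while recording job status along the way.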
    def run(self):
        task_family_name = str(self.task_family)
        if self.task_name == "ClarityNLPLuigiTask":
            self.task_name = task_family_name
        client = util.mongo_client()

        try:
            with self.output().open('w') as temp_file:
                temp_file.write("start writing custom task")
                jobs.update_job_status(str(self.job), util.conn_string,
                                       jobs.IN_PROGRESS,
                                       "Running Batch %s" % self.batch)

                self.pipeline_config = config.get_pipeline_config(
                    self.pipeline, util.conn_string)
                jobs.update_job_status(str(self.job), util.conn_string,
                                       jobs.IN_PROGRESS, "Running Solr query")
                self.docs = solr_data.query(
                    self.solr_query,
                    rows=util.row_count,
                    start=self.start,
                    solr_url=util.solr_url,
                    tags=self.pipeline_config.report_tags,
                    mapper_inst=util.report_mapper_inst,
                    mapper_url=util.report_mapper_url,
                    mapper_key=util.report_mapper_key,
                    types=self.pipeline_config.report_types,
                    sources=self.pipeline_config.sources,
                    filter_query=self.pipeline_config.filter_query,
                    cohort_ids=self.pipeline_config.cohort,
                    job_results_filters=self.pipeline_config.job_results)

                for d in self.docs:
                    doc_id = d[util.solr_report_id_field]
                    if util.use_memory_caching == "true":
                        k = keys.hashkey(doc_id)
                        document_cache[k] = d
                    if util.use_redis_caching == "true":
                        util.write_to_redis_cache("doc:" + doc_id,
                                                  json.dumps(d))
                jobs.update_job_status(str(self.job), util.conn_string,
                                       jobs.IN_PROGRESS,
                                       "Running %s main task" % self.task_name)
                self.run_custom_task(temp_file, client)
                temp_file.write("Done writing custom task!")

            self.docs = list()
        except Exception as ex:
            traceback.print_exc(file=sys.stderr)
            jobs.update_job_status(str(self.job), util.conn_string,
                                   jobs.WARNING,
                                   ''.join(traceback.format_stack()))
            print(ex)
        finally:
            client.close()
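The caching branch above relies on a cachetools-style document_cache and keys.hashkey. A minimal, self-contained sketch of that pattern (the cache type, size, and sample values are assumptions for illustration, not taken from ClarityNLP):

from cachetools import LRUCache, keys

# Assumed stand-in for the module-level document_cache used in the task.
document_cache = LRUCache(maxsize=1000)

doc_id = "report-123"                    # hypothetical Solr report id
doc = {"report_text": "example text"}    # hypothetical document
k = keys.hashkey(doc_id)                 # hashable cache key, as in run()
document_cache[k] = doc                  # write path used by the task
cached = document_cache.get(k)           # later reads reuse the same key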
Example 3
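The MeasurementFinder batch task, following the same query-and-write pattern as Example 1 but running run_measurement_finder_full over each report's text.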
    def run(self):
        client = MongoClient(util.mongo_host, util.mongo_port)

        try:
            jobs.update_job_status(
                str(self.job), util.conn_string, jobs.IN_PROGRESS,
                "Running MeasurementFinder Batch %s" % self.batch)

            pipeline_config = config.get_pipeline_config(
                self.pipeline, util.conn_string)

            jobs.update_job_status(str(self.job), util.conn_string,
                                   jobs.IN_PROGRESS, "Running Solr query")
            docs = solr_data.query(self.solr_query,
                                   rows=util.row_count,
                                   start=self.start,
                                   solr_url=util.solr_url,
                                   tags=pipeline_config.report_tags,
                                   mapper_inst=util.report_mapper_inst,
                                   mapper_url=util.report_mapper_url,
                                   mapper_key=util.report_mapper_key,
                                   cohort_ids=pipeline_config.cohort)

            filters = dict()
            if pipeline_config.sections and len(pipeline_config.sections) > 0:
                filters[SECTIONS_FILTER] = pipeline_config.sections

            with self.output().open('w') as outfile:
                jobs.update_job_status(str(self.job), util.conn_string,
                                       jobs.IN_PROGRESS,
                                       "Finding terms with MeasurementFinder")
                # TODO incorporate sections and filters
                for doc in docs:
                    meas_results = run_measurement_finder_full(
                        doc["report_text"], pipeline_config.terms)
                    for meas in meas_results:
                        inserted = mongo_writer(client, self.pipeline,
                                                self.job, self.batch,
                                                pipeline_config, meas, doc,
                                                "MeasurementFinder")
                        outfile.write(str(inserted))
                        outfile.write('\n')
                    del meas_results
            del docs
        except Exception as ex:
            traceback.print_exc(file=sys.stderr)
            jobs.update_job_status(str(self.job), util.conn_string,
                                   jobs.WARNING,
                                   ''.join(traceback.format_stack()))
            print(ex)
        finally:
            client.close()
Example 4
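Initializes a pipeline run: expands the configured terms with related terms, builds the Solr query, counts matching documents, records per-pipeline statistics on the job, and returns the query together with the document limit and batch start offsets.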
def initialize_task_and_get_documents(pipeline_id, job_id, owner):
    jobs.update_job_status(
        str(job_id), util.conn_string, jobs.IN_PROGRESS,
        "Initializing task -- pipeline: %s, job: %s, owner: %s" %
        (str(pipeline_id), str(job_id), str(owner)))

    pipeline_config = config.get_pipeline_config(pipeline_id, util.conn_string)
    added = copy.copy(pipeline_config.terms)

    for term in pipeline_config.terms:
        related_terms = get_related_terms(util.conn_string,
                                          term,
                                          pipeline_config.include_synonyms,
                                          pipeline_config.include_descendants,
                                          pipeline_config.include_ancestors,
                                          escape=False)
        if related_terms and len(related_terms) > 0:
            added.extend(related_terms)

    solr_query = config.get_query(custom_query=pipeline_config.custom_query,
                                  terms=added)
    total_docs = solr_data.query_doc_size(
        solr_query,
        mapper_inst=util.report_mapper_inst,
        mapper_url=util.report_mapper_url,
        mapper_key=util.report_mapper_key,
        solr_url=util.solr_url,
        types=pipeline_config.report_types,
        filter_query=pipeline_config.filter_query,
        tags=pipeline_config.report_tags,
        report_type_query=pipeline_config.report_type_query,
        sources=pipeline_config.sources,
        cohort_ids=pipeline_config.cohort,
        job_results_filters=pipeline_config.job_results)
    jobs.update_job_status(
        str(job_id), util.conn_string,
        jobs.STATS + "_PIPELINE_" + str(pipeline_id) + "_SOLR_DOCS",
        str(total_docs))
    doc_limit = config.get_limit(total_docs, pipeline_config)
    jobs.update_job_status(
        str(job_id), util.conn_string,
        jobs.STATS + "_PIPELINE_" + str(pipeline_id) + "_DOCUMENT_LIMIT",
        str(doc_limit))
    jobs.update_job_status(
        str(job_id), util.conn_string,
        jobs.STATS + "_PIPELINE_" + str(pipeline_id) + "_EVALUATED_DOCS",
        str(min(doc_limit, total_docs)))
    ranges = range(0, (doc_limit + int(util.row_count)), int(util.row_count))

    return solr_query, total_docs, doc_limit, ranges
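The returned ranges value enumerates batch start offsets in steps of util.row_count, up to and including doc_limit; each offset later becomes the start parameter of one batch's Solr query. A quick illustration with assumed numbers:

# Illustrative values only; row_count and doc_limit come from configuration.
doc_limit = 2500
row_count = 1000
batch_starts = list(range(0, doc_limit + row_count, row_count))
print(batch_starts)   # [0, 1000, 2000, 3000]
# Each start offset reads up to row_count documents (rows=row_count) from Solr.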
Example 5
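A leaner variant of Example 2's custom task run(): it skips document caching, and its Solr query passes only report types and a filter query rather than sources, cohort, or job-result filters.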
    def run(self):
        task_family_name = str(self.task_family)
        if self.task_name == "ClarityNLPLuigiTask":
            self.task_name = task_family_name
        client = MongoClient(util.mongo_host, util.mongo_port)

        try:
            jobs.update_job_status(str(self.job), util.conn_string,
                                   jobs.IN_PROGRESS,
                                   "Running Batch %s" % self.batch)

            self.pipeline_config = config.get_pipeline_config(
                self.pipeline, util.conn_string)
            jobs.update_job_status(str(self.job), util.conn_string,
                                   jobs.IN_PROGRESS, "Running Solr query")
            self.docs = solr_data.query(
                self.solr_query,
                rows=util.row_count,
                start=self.start,
                solr_url=util.solr_url,
                tags=self.pipeline_config.report_tags,
                mapper_inst=util.report_mapper_inst,
                mapper_url=util.report_mapper_url,
                mapper_key=util.report_mapper_key,
                types=self.pipeline_config.report_types,
                filter_query=self.pipeline_config.filter_query)

            with self.output().open('w') as temp_file:
                jobs.update_job_status(str(self.job), util.conn_string,
                                       jobs.IN_PROGRESS,
                                       "Running %s main task" % self.task_name)
                self.run_custom_task(temp_file, client)
                temp_file.write("Done writing custom task!")

            self.docs = list()
        except Exception as ex:
            traceback.print_exc(file=sys.stderr)
            jobs.update_job_status(str(self.job), util.conn_string,
                                   jobs.WARNING,
                                   ''.join(traceback.format_stack()))
            print(ex)
        finally:
            client.close()
Example 6
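Another initialize_task_and_get_documents variant that reports each stage through IN_PROGRESS status updates instead of recording STATS entries, and builds the batch ranges without casting util.row_count to int.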
def initialize_task_and_get_documents(pipeline_id, job_id, owner):
    jobs.update_job_status(
        str(job_id), util.conn_string, jobs.IN_PROGRESS,
        "Initializing task -- pipeline: %s, job: %s, owner: %s" %
        (str(pipeline_id), str(job_id), str(owner)))

    pipeline_config = config.get_pipeline_config(pipeline_id, util.conn_string)

    jobs.update_job_status(str(job_id), util.conn_string, jobs.IN_PROGRESS,
                           "Getting related terms")
    added = copy.copy(pipeline_config.terms)

    for term in pipeline_config.terms:
        related_terms = get_related_terms(util.conn_string,
                                          term,
                                          pipeline_config.include_synonyms,
                                          pipeline_config.include_descendants,
                                          pipeline_config.include_ancestors,
                                          escape=False)
        if related_terms and len(related_terms) > 0:
            added.extend(related_terms)

    jobs.update_job_status(str(job_id), util.conn_string, jobs.IN_PROGRESS,
                           "Getting Solr doc size")
    solr_query = config.get_query(custom_query=pipeline_config.custom_query,
                                  terms=added)
    total_docs = solr_data.query_doc_size(
        solr_query,
        mapper_inst=util.report_mapper_inst,
        mapper_url=util.report_mapper_url,
        mapper_key=util.report_mapper_key,
        solr_url=util.solr_url,
        types=pipeline_config.report_types,
        filter_query=pipeline_config.filter_query,
        tags=pipeline_config.report_tags,
        report_type_query=pipeline_config.report_type_query,
        cohort_ids=pipeline_config.cohort)
    doc_limit = config.get_limit(total_docs, pipeline_config)
    ranges = range(0, (doc_limit + util.row_count), util.row_count)
    jobs.update_job_status(str(job_id), util.conn_string, jobs.IN_PROGRESS,
                           "Running batch tasks")

    return solr_query, total_docs, doc_limit, ranges
Example 7
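The same initialization routine as Example 4: term expansion, Solr document count, per-pipeline statistics, and batch start offsets.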
def initialize_task_and_get_documents(pipeline_id, job_id, owner):
    jobs.update_job_status(str(job_id), util.conn_string, jobs.IN_PROGRESS,
                           "Initializing task -- pipeline: %s, job: %s, owner: %s" % (str(pipeline_id), str(job_id),
                                                                                      str(owner)))

    pipeline_config = config.get_pipeline_config(pipeline_id, util.conn_string)
    added = copy.copy(pipeline_config.terms)

    for term in pipeline_config.terms:
        related_terms = get_related_terms(util.conn_string, term,
                                          pipeline_config.include_synonyms,
                                          pipeline_config.include_descendants,
                                          pipeline_config.include_ancestors,
                                          escape=False)
        if related_terms and len(related_terms) > 0:
            added.extend(related_terms)

    solr_query = config.get_query(custom_query=pipeline_config.custom_query, terms=added)
    total_docs = solr_data.query_doc_size(solr_query, mapper_inst=util.report_mapper_inst,
                                          mapper_url=util.report_mapper_url,
                                          mapper_key=util.report_mapper_key, solr_url=util.solr_url,
                                          types=pipeline_config.report_types, filter_query=pipeline_config.filter_query,
                                          tags=pipeline_config.report_tags,
                                          report_type_query=pipeline_config.report_type_query,
                                          sources=pipeline_config.sources,
                                          cohort_ids=pipeline_config.cohort,
                                          job_results_filters=pipeline_config.job_results)
    jobs.update_job_status(str(job_id), util.conn_string, jobs.STATS + "_PIPELINE_" + str(pipeline_id) + "_SOLR_DOCS",
                           str(total_docs))
    doc_limit = config.get_limit(total_docs, pipeline_config)
    jobs.update_job_status(str(job_id), util.conn_string, jobs.STATS + "_PIPELINE_" + str(pipeline_id) +
                           "_DOCUMENT_LIMIT",
                           str(doc_limit))
    jobs.update_job_status(str(job_id), util.conn_string, jobs.STATS + "_PIPELINE_" + str(pipeline_id) +
                           "_EVALUATED_DOCS",
                           str(min(doc_limit, total_docs)))
    ranges = range(0, (doc_limit + int(util.row_count)), int(util.row_count))

    return solr_query, total_docs, doc_limit, ranges
Example 8
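The ValueExtractor batch task: it runs run_value_extractor_full over each document with the pipeline's minimum and maximum values, writes extracted values to MongoDB, and keeps the current document on hand for error reporting.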
    def run(self):
        client = MongoClient(util.mongo_host, util.mongo_port)

        current_doc = None
        try:
            jobs.update_job_status(
                str(self.job), util.conn_string, jobs.IN_PROGRESS,
                "Running ValueExtractor Batch %s" % self.batch)

            pipeline_config = config.get_pipeline_config(
                self.pipeline, util.conn_string)

            jobs.update_job_status(str(self.job), util.conn_string,
                                   jobs.IN_PROGRESS, "Running Solr query")
            docs = solr_data.query(self.solr_query,
                                   rows=util.row_count,
                                   start=self.start,
                                   solr_url=util.solr_url,
                                   tags=pipeline_config.report_tags,
                                   mapper_inst=util.report_mapper_inst,
                                   mapper_url=util.report_mapper_url,
                                   mapper_key=util.report_mapper_key,
                                   cohort_ids=pipeline_config.cohort)

            filters = dict()
            if pipeline_config.sections and len(pipeline_config.sections) > 0:
                filters[SECTIONS_FILTER] = pipeline_config.sections

            with self.output().open('w') as outfile:
                jobs.update_job_status(str(self.job), util.conn_string,
                                       jobs.IN_PROGRESS,
                                       "Finding terms with ValueExtractor")
                # TODO incorporate sections and filters
                for doc in docs:
                    current_doc = doc
                    result = run_value_extractor_full(
                        pipeline_config.terms, doc["report_text"],
                        float(pipeline_config.minimum_value),
                        float(pipeline_config.maximum_value),
                        pipeline_config.case_sensitive)
                    if result:
                        for val in result:
                            inserted = mongo_writer(client, self.pipeline,
                                                    self.job, self.batch,
                                                    pipeline_config, val, doc,
                                                    "ValueExtractor")
                            outfile.write(str(inserted))
                            outfile.write('\n')
                        del result
                    else:
                        outfile.write("no matches!\n")
            del docs
        except AssertionError as a:
            print(a)
            print(current_doc)
        except Exception as ex:
            traceback.print_exc(file=sys.stderr)
            jobs.update_job_status(
                str(self.job), util.conn_string, jobs.WARNING,
                'Report ID: ' +
                (current_doc['report_id'] if current_doc else 'unknown') +
                '\n' + ''.join(traceback.format_stack()))
            print(ex)
            # print(current_doc)
        finally:
            client.close()