Example #1
def phenotype_subjects(job_id: str, phenotype_final: bool):
    client = util.mongo_client()
    db = client[util.mongo_db]
    res = []
    # db.phenotype_results.aggregate([  {"$match":{"job_id":{"$eq":10201}, "phenotype_final":{"$eq":true}}},
    #  {"$group" : {_id:"$subject", count:{$sum:1}}} ])
    try:
        q = [{
            "$match": {
                "phenotype_final": {
                    "$eq": phenotype_final
                },
                "job_id": {
                    "$eq": int(job_id)
                }
            }
        }, {
            "$group": {
                "_id": "$subject",
                "count": {
                    "$sum": 1
                }
            }
        }]
        res = list(db.phenotype_results.aggregate(q))
        res = sorted(res, key=lambda r: r['count'], reverse=True)
    except Exception as e:
        log(e, ERROR)
    finally:
        client.close()

    return res
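A brief usage sketch, assuming MongoDB is reachable through util.mongo_client() and that job 10201 (borrowed from the commented shell query above) has final phenotype results:

# Counts of final phenotype results per subject, highest count first.
subject_counts = phenotype_subjects('10201', True)
for row in subject_counts[:10]:
    print(row['_id'], row['count'])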
Example #2
def get_predicate_boolean(expression: nlpql_parserParser.PredicateBooleanContext, define_name, final):
    log(expression)
    operator = ""
    entities = list()

    op = PhenotypeOperations(define_name, operator, entities, final=final, raw_text=get_pretty_text(expression))
    return op
Example #3
def lookup_phenotype_results_by_id(id_list: list):
    client = util.mongo_client()
    db = client[util.mongo_db]
    obj = dict()
    obj['results'] = list()
    obj['indexes'] = dict()

    try:
        # db.phenotype_results.find({"_id": { $in: [ObjectId("5b117352bcf26f020e392a9c"),
        # ObjectId("5b117352bcf26f020e3926e2")]}})
        # TODO TODO TODO
        ids = [ObjectId(x) for x in id_list]
        res = db.phenotype_results.find({"_id": {"$in": ids}})
        obj['results'] = list(res)
        for n, o in enumerate(obj['results']):
            o = display_mapping(o)
            obj['indexes'][str(o['_id'])] = n

    except Exception as e:
        log(e, ERROR)
        traceback.print_exc(file=sys.stdout)
        obj['success'] = False
    finally:
        client.close()

    return obj
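A usage sketch reusing the two ObjectId strings from the commented shell query above (hypothetical documents):

out = lookup_phenotype_results_by_id(['5b117352bcf26f020e392a9c',
                                      '5b117352bcf26f020e3926e2'])
# 'indexes' maps each _id string to its position in 'results'
for _id, pos in out['indexes'].items():
    print(_id, out['results'][pos])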
Example #4
def handle_tuple(context, phenotype: PhenotypeModel, define_name, final):
    log('tuple')

    obj = get_obj_context(context.getChild(0).getChild(1), to_string=True)
    num_children = len(context.children)
    op_raw_text = ''
    tuple_name = '{}_Step1'.format(define_name)
    if num_children == 2:
        operation = parse_operation(context.getChild(1), tuple_name, final)
        if operation:
            if not phenotype.operations:
                phenotype.operations = list()
            phenotype.operations.append(operation)
            op_raw_text = operation.get('raw_text')

    else:
        operation = None

    raw_text = '''
        Tuple {}
        {}
    '''

    if not operation:
        where = ''
    else:
        where = 'where {}'.format(op_raw_text)
    tuple_str = json.dumps(obj, indent=4)
    pe = PhenotypeEntity(tuple_name, 'define', final=final, tuple_=True, tuple_object=obj, tuple_predicate=operation,
                         raw_text=raw_text.format(tuple_str, where), tuple_raw_text='Tuple {}'.format(tuple_str))

    if not phenotype.tuples:
        phenotype.tuples = list()

    phenotype.tuples.append(pe)
Example #5
def get_not_expression(expression: nlpql_parserParser.ExpressionContext, define_name, final):
    log(expression)
    entities = list()
    operator = ""

    op = PhenotypeOperations(define_name, operator, entities, final=final, raw_text=get_pretty_text(expression))
    return op
Example #6
def show_help():
    log(get_version())
    log("""
    USAGE: python3 ./{0} 

    OPTIONS:

         -i, --infile             <quoted string> (required) input filename
                                                  matrix market format
        [-r, --min_docs_per_term] <positive int>  minimum number of docs per
                                                  term, default is 3
                                                  (minimum allowable row sum
                                                  in result matrix)
        [-c, --min_terms_per_doc] <positive int>  minimum number of terms per
                                                  document, default is 5
                                                  (minimum allowable column sum
                                                  in result matrix)
        [-p, --precision]         <positive int>  precision of result matrix
                                                  (default is 4 digits)
        [-hvzbw]

    FLAGS:

        -h, --help           log this information and exit.
        -v, --version        log version information and exit.
        -z, --selftest       Run self-tests and exit.
        -b, --boolean        Replace nonzero entries in input matrix with 1
                             (i.e. use 1 if value present, 0 if not)
        -w, --weights        Compute tf-idf weights in result matrix
                             (i.e. output a term-document matrix instead of
                             a term-frequency matrix)

    """.format(_MODULE_NAME))
Example #7
def handle_data_entity(context, phenotype: PhenotypeModel, define_name, final):
    log('data entity')
    pe = PhenotypeEntity(define_name, 'define', final=final)
    call = get_method_call(context.getChild(0))
    # hasSepsis = PhenotypeEntity('hasSepsis', 'define',
    #                             library='ClarityNLP',
    #                             funct='ProviderAssertion',
    #                             named_arguments={
    #                                 "termsets": ['Sepsis'],
    #                                 "documentsets": [
    #                                     'ProviderNotes',
    #                                     "Radiology"
    #                                 ]
    #                             })
    pe["funct"] = call["funct"]
    pe["library"] = call["library"]
    named_args = call["named_arguments"]
    args = call["arguments"]
    if named_args:
        pe["named_arguments"] = named_args
    if args and len(args) > 0:
        pe["arguments"] = args
    if not phenotype.data_entities:
        phenotype.data_entities = list()

    phenotype.data_entities.append(pe)
Example #8
def insert_pipeline_config(pipeline: PipelineConfig, connection_string: str):
    conn = psycopg2.connect(connection_string)
    cursor = conn.cursor()
    pipeline_id = -1

    try:
        if pipeline:
            pipeline_json = pipeline.to_json()
            cursor.execute("""
                          INSERT INTO
                          nlp.pipeline_config(owner, config, pipeline_type, name, description, date_created)
                          VALUES(%s, %s, %s, %s, %s, current_timestamp) RETURNING pipeline_id
                          """,
                           (pipeline.owner, pipeline_json, pipeline.config_type, pipeline.name, pipeline.description))

            pipeline_id = cursor.fetchone()[0]
            conn.commit()

    except Exception as ex:
        log('failed to insert pipeline')
        log(ex)
    finally:
        conn.close()

    return pipeline_id
Example #9
def get_pipeline_config(pipeline_id, connection_string):
    conn = psycopg2.connect(connection_string)
    cursor = conn.cursor()

    try:
        cursor.execute("""
                     SELECT  *
                     FROM    nlp.pipeline_config
                     WHERE   pipeline_id = %s
                     """, [str(pipeline_id)])

        row = cursor.fetchone()
        if row:
            obj = PipelineConfig.from_json(row[2])
            if obj:
                return obj
            else:
                return get_default_config()
        else:
            log("no rows returned")
    except Exception as ex:
        log(ex)
    finally:
        conn.close()

    return get_default_config()
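A retrieval sketch; the pipeline id is hypothetical and util.conn_string is assumed to point at the NLP database (a missing row falls back to get_default_config()):

config = get_pipeline_config(42, util.conn_string)
print(config.name, config.config_type, config.owner)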
Example #10
    def custom_cleanup(self, pipeline_id, job, owner, pipeline_type, pipeline_config, client, db):
        log('removing intermediate n-gram records')
        db.phenotype_results.remove({
            "nlpql_feature": pipeline_config.name,
            "job_id": job,
            "phenotype_final": False
        })
Example #11
def try_match_anchored_left(sentence_index, start, end, start_offsets, end_offsets, words):
    """If no match, try removing final words one at a time. This is less
    preferable than the previous loop, which anchors at the right to
    the terminating symbol.
    """

    if TRACE: log("\tTrying match_anchored_left...", DEBUG)
    
    candidates = []

    new_start = start + start_offsets[0]
    for i in reversed(range(1, len(words))):
        test_text = ' '.join(words[0:i])
        if TRACE: log("\t\ttrial text:\t\t->{0}<-".format(test_text), DEBUG)
        if test_text in synonym_map:
            # found exact match, so save all associated concepts
            new_end = start + end_offsets[i-1]
            #if TRACE: print("\t\tspan: {0}-{1}".format(new_start, new_end))
            for c in synonym_map[test_text]:
                cid = concept_to_cid_map[c]
                treecode_list = graph.treecode_list(cid)
                sh = SectionHeader(sentence_index, new_start, new_end, test_text, c, treecode_list)
                candidates.append(sh)
            break

    if TRACE:
        print_num_found(len(candidates))

    return candidates
Example #12
def try_match_anchored_right(sentence_index, start, end, start_offsets, end_offsets, words):
    """
    Remove initial words one at a time and try for exact match. This
    finds word sequences anchored at the right, near the terminating
    symbol.
    """

    if TRACE: log("\tTrying match_anchored_right...", DEBUG)
    
    candidates = []

    new_end = start + end_offsets[-1]
    for i in range(1, len(words)):
        test_text = ' '.join(words[i:])
        if TRACE: log("\t\ttrial text:\t\t->{0}<-".format(test_text), DEBUG)
        if test_text in synonym_map:
            # found exact match, so save all associated concepts
            new_start = start + start_offsets[i]
            for c in synonym_map[test_text]:
                cid = concept_to_cid_map[c]
                treecode_list = graph.treecode_list(cid)
                sh = SectionHeader(sentence_index, new_start, new_end, test_text, c, treecode_list)
                candidates.append(sh)
            break

    if TRACE:
        print_num_found(len(candidates))
        
    return candidates
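A standalone sketch (plain Python, no module globals) of the two trimming directions used by these matchers: try_match_anchored_left drops trailing words, try_match_anchored_right drops leading words, and each stops at the first trial text found in synonym_map:

words = ['history', 'of', 'present', 'illness']

# anchored left: trial texts from dropping trailing words (longest first)
left_trials = [' '.join(words[:i]) for i in reversed(range(1, len(words)))]
# -> ['history of present', 'history of', 'history']

# anchored right: trial texts from dropping leading words
right_trials = [' '.join(words[i:]) for i in range(1, len(words))]
# -> ['of present illness', 'present illness', 'illness']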
Example #13
def phenotype_subject_results(job_id: str, phenotype_final: bool,
                              subject: str):
    client = util.mongo_client()
    db = client[util.mongo_db]
    res = []
    try:
        query = {
            "job_id": int(job_id),
            "phenotype_final": phenotype_final,
            "subject": subject
        }

        temp = list(db["phenotype_results"].find(query))
        for r in temp:
            obj = r.copy()
            for k in r.keys():
                val = r[k]
                if isinstance(val, (int, float)) and math.isnan(val):
                    del obj[k]
            res.append(obj)

    except Exception as e:
        log(e, ERROR)
    finally:
        client.close()

    return res
Example #14
    def requires(self):
        try:
            self.solr_query, total_docs, doc_limit, ranges = initialize_task_and_get_documents(
                self.pipeline, self.job, self.owner)

            task = registered_pipelines[str(self.pipelinetype)]
            if task.parallel_task:
                matches = [
                    task(pipeline=self.pipeline,
                         job=self.job,
                         start=n,
                         solr_query=self.solr_query,
                         batch=n) for n in ranges
                ]
            else:
                matches = [
                    task(pipeline=self.pipeline,
                         job=self.job,
                         start=0,
                         solr_query=self.solr_query,
                         batch=0)
                ]

            return matches
        except Exception as ex:
            traceback.print_exc(file=sys.stderr)
            jobs.update_job_status(str(self.job), util.conn_string,
                                   jobs.WARNING,
                                   ''.join(traceback.format_stack()))
            log(ex)
        return list()
Example #15
def handle_operation(context, phenotype: PhenotypeModel, define_name, final):
    log('operation')

    # SepsisState = PhenotypeOperations('SepsisState', 'OR', ['onVentilator', 'hasSepsis'], final=True)

    # : notOperator=(NOT | BANG) expression
    # | expression logicalOperator expression
    # | predicate IS NOT? BOOL
    expression_context = context.getChild(1)
    first = expression_context.getChild(0)

    res = None
    if type(first) == nlpql_parserParser.NotOperatorContext:
        res = get_not_expression(expression_context, define_name, final)
    elif type(first) == nlpql_parserParser.ExpressionContext:
        res = get_logical_expression(expression_context, define_name, final)
    elif type(first) == nlpql_parserParser.PredicateBooleanContext:
        res = get_predicate_boolean(expression_context, define_name, final)
    elif type(first) == nlpql_parserParser.PredicateContext:
        res = get_predicate_expression(expression_context.getChild(0), define_name, final)

    if not phenotype.operations:
        phenotype.operations = list()

    if res:
        phenotype.operations.append(res)
Example #16
def _get_datetime_window(custom_args, data_earliest, data_latest):
    """
    Extract and parse the datetime_start and datetime_end custom
    arguments. These specify the start/end datetime filters for the CQL
    results.
    """

    datetime_start = None
    datetime_end = None

    if _ARG_TIME_START in custom_args:
        time_start = custom_args[_ARG_TIME_START]
        datetime_start = tc.parse_time_command(time_start, data_earliest,
                                               data_latest)

    if _ARG_TIME_END in custom_args:
        time_end = custom_args[_ARG_TIME_END]
        datetime_end = tc.parse_time_command(time_end, data_earliest,
                                             data_latest)

    if _TRACE:
        log('\n*** datetime_start: {0}'.format(datetime_start))
        log('***   datetime_end: {0}'.format(datetime_end))

    return (datetime_start, datetime_end)
Example #17
def _log_error(msg):
    """
    Print an error message to the log file.
    """

    log('TermProximityTask error: {0}'.format(msg))
    return
Example #18
def _get_custom_arg(str_key, str_variable_name, job_id, custom_arg_dict):
    """
    Extract a value at the given key from the given dict, or return None
    if not found. Attempt to read from environmental variables, if known.
    """

    value = None
    if str_key in custom_arg_dict:
        value = custom_arg_dict[str_key]

    if value is None:
        if str_key in util.properties:
            value = util.properties[str_key]

    # treat empty strings as None
    if str == type(value) and 0 == len(value):
        value = None

    # echo in job status and in log file
    msg = 'CQLExecutionTask: {0} == {1}'.format(str_variable_name, value)
    data_access.update_job_status(job_id, util.conn_string,
                                  data_access.IN_PROGRESS, msg)
    # write msg to log file
    log(msg)

    return value
Example #19
def run_measurement_finder(filename):

    measurements = list()
    try:
        p = Popen(
            ['java', '-jar', FULL_JAR, '-f', filename, '-m', MODEL_FULL_DIR],
            stdout=PIPE,
            stderr=STDOUT)
        json_str = ''
        for line in p.stdout:
            json_str += str(line, 'utf-8')

        json_obj = json.loads(json_str)
        results = json_obj['results']
        for res in results:
            meas_count = int(res["measurementCount"])
            if meas_count > 0:
                # print("found %d measurements" % meas_count)
                meas_results = res['measurements']
                for meas in meas_results:
                    meas_obj = Measurement.from_dict(meas)
                    meas_obj.sentence = res['sentence']
                    measurements.append(meas_obj)
    except Exception as e:
        log(e, ERROR)

    return measurements
Example #20
def _process_bundle(name, bundle_obj, result_type_str):
    """
    Process a DSTU2 or DSTU3 resource bundle returned from the CQL Engine.
    """

    if _TRACE: log('Decoding BUNDLE resource...')

    # this bundle should be a string representation of a list of dicts
    obj_type = type(bundle_obj)
    assert str == obj_type

    try:
        obj = json.loads(bundle_obj)
    except json.decoder.JSONDecodeError as e:
        log('\t{0}: String conversion (bundle) failed with error: "{1}"'.
            format(_MODULE_NAME, e))
        return []

    # now find out what type of obj was created from the string
    obj_type = type(obj)
    assert list == obj_type

    rts = result_type_str.lower()

    bundled_objs = []
    for elt in obj:
        result = None
        if rts.endswith('stu2') or rts.endswith('stu3') or rts.endswith('list'):
            result = _process_resource(elt)
        if result is not None:
            # insert the name as a new 'cql_feature' field
            result[KEY_CQL_FEATURE] = name
            bundled_objs.append(result)

    return bundled_objs
Example #21
def scrape_common_verbs():
    """
    Scrape a list of the 100 most common English verbs.
    """
    
    page = requests.get(url_common)
    if 200 != page.status_code:
        log('Error getting url: ' + url_common)
        sys.exit(-1)

    soup = BeautifulSoup(page.text, 'html.parser')

    verbs = []

    # get all 'a' elements, look for the first one with href='/dictionary/be'
    a_elts = soup.find_all('a')

    start=0
    for a in a_elts:
        if 'be' == a.get_text().strip().lower():
            break
        else:
            start += 1

    # skip the verb 'be', which has a complex entry
    for i in range(start+1, start+100):
        verb = a_elts[i].get_text().strip().lower()
        verbs.append(verb)

    return verbs
Example #22
def _convert_units(value, units, is_area_measurement, is_vol_measurement):
    """
    Given a token value in the indicated units, convert to mm and return
    the new value.
    """

    convert_cm = units.startswith('cm') or    \
                 units.startswith('centi') or \
                 units.startswith('cc')
    convert_in = units.startswith('in')
    is_area = is_area_measurement or _is_area_unit(units)
    is_volume = is_vol_measurement or _is_vol_unit(units)

    if _TRACE:
        log('_convert_units::is_area: {0}'.format(is_area))
        log('_convert_units::is_volume: {0}'.format(is_volume))

    if convert_cm:
        # convert from cm to mm
        value = value * _CM_TO_MM
        if is_area:
            value = value * _CM_TO_MM
        elif is_volume:
            value = value * _CM_TO_MM_SQ
    elif convert_in:
        # convert from inches to mm
        value = value * _IN_TO_MM
        if is_area:
            value = value * _IN_TO_MM
        elif is_volume:
            value = value * _IN_TO_MM_SQ

    return value
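A few worked conversions as a sketch, assuming the module constants are _CM_TO_MM == 10.0, _CM_TO_MM_SQ == 100.0, and _IN_TO_MM == 25.4 (as the names suggest):

_convert_units(2.5, 'cm', False, False)    # linear:  2.5 cm  -> 25.0 mm
_convert_units(3.0, 'cm2', True, False)    # area:    3.0 cm2 -> 300.0 mm2  (10 * 10)
_convert_units(4.0, 'cc', False, True)     # volume:  4.0 cc  -> 4000.0 mm3 (10 * 100)
_convert_units(1.0, 'in', False, False)    # linear:  1.0 in  -> 25.4 mm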
Example #23
def get_descendants(conn_string, concept, vocabulary):
    conn = psycopg2.connect(conn_string)
    cursor = conn.cursor()

    if vocabulary is None:
        vocabulary = "SNOMED"

    try:
        cursor.execute(
            """ SELECT concept_name
        FROM nlp.concept_ancestor INNER JOIN nlp.concept on concept_id = descendant_concept_id
        WHERE ancestor_concept_id in (SELECT concept_id from nlp.concept where lower(concept_name) = %s
        AND vocabulary_id=%s AND invalid_reason IS null) AND vocabulary_id=%s AND invalid_reason is null
        order by max_levels_of_separation asc
        """, (concept.lower(), vocabulary, vocabulary))

        result = cursor.fetchall()
        return result

    except Exception as ex:
        log('Failed to get descendants')
        log(str(ex))

    finally:
        conn.close()

    return list()
Example #24
def _run(json_obj):

    results = []

    # assumes we either have a list of objects or a single obj
    obj_type = type(json_obj)
    if list == obj_type:
        #log('**** SAW A LIST ****')
        for e in json_obj:
            result_obj = crp.decode_top_level_obj(e)
            if result_obj is None:
                continue
            if list is not type(result_obj):
                results.append(result_obj)
            else:
                results.extend(result_obj)
    elif dict == obj_type:
        #log('**** SAW A DICT ****')
        result_obj = crp.decode_top_level_obj(json_obj)
        if result_obj is not None:
            if list is not type(result_obj):
                results.append(result_obj)
            else:
                results.extend(result_obj)

    #log('found {0} results'.format(len(results)))

    for counter, obj in enumerate(results):
        log('Result {0}: {1}'.format(counter, obj))
Example #25
def get_synonyms(conn_string, concept, vocabulary):
    conn = psycopg2.connect(conn_string)
    cursor = conn.cursor()

    if vocabulary is None:
        vocabulary = "SNOMED"

    try:
        cursor.execute(
            """ SELECT concept_synonym_name
        FROM nlp.concept_synonym s INNER JOIN nlp.concept c on c.concept_id = s.concept_id
        WHERE lower(concept_name) = %s and c.vocabulary_id=%s and invalid_reason is null
        order by concept_synonym_name
        """, (concept.lower(), vocabulary))

        result = cursor.fetchall()

        return result

    except Exception as ex:
        log('Failed to get synonyms')
        log(str(ex))

    finally:
        conn.close()

    return list()
Example #26
def get_related_terms(conn_string,
                      concept,
                      vocabulary,
                      get_synonyms_bool=True,
                      get_descendants_bool=False,
                      get_ancestors_bool=False,
                      escape=True):
    related_terms = []
    escaped = []
    if get_synonyms_bool:
        res = get_synonyms(conn_string, concept, vocabulary)
        if res:
            for r in res:
                related_terms.append(r[0])
                escaped.append(re.escape(r[0]))
    if get_descendants_bool:
        res = get_descendants(conn_string, concept, vocabulary)
        if res:
            for r in res:
                related_terms.append(r[0])
                escaped.append(re.escape(r[0]))
    if get_ancestors_bool:
        res = get_ancestors(conn_string, concept, vocabulary)
        if res:
            for r in res:
                related_terms.append(r[0])
                escaped.append(re.escape(r[0]))

    if len(related_terms) > 0:
        log(related_terms)

    if escape:
        return list(set(escaped))
    else:
        return list(set(related_terms))
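A usage sketch, assuming an OMOP-style vocabulary (nlp.concept, nlp.concept_synonym, nlp.concept_ancestor) is reachable through util.conn_string; 'Sepsis' is an illustrative concept:

terms = get_related_terms(util.conn_string, 'Sepsis', 'SNOMED',
                          get_synonyms_bool=True,
                          get_descendants_bool=True,
                          escape=False)
print(terms)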
Example #27
def delete_job_by_id(job_id: int):
    log('deleting job now ' + str(job_id))
    flag = delete_job(str(job_id), util.conn_string)
    if flag == 1:
        return "Successfully deleted Job!"
    else:
        return "Unable to delete Job!"
Example #28
def get_section_source():
    """GET source file for sections and synonyms"""
    try:
        file_path = get_sec_tag_source_tags()
        return send_file(file_path)
    except Exception as ex:
        log(ex)
        return "Failed to retrieve sections source file"
Example #29
def get_document_set_attributes(model):
    tags = dict()
    types = dict()
    custom_query = dict()
    filter_query = dict()
    source = dict()
    # Clarity.createDocumentSet({
    #     report_tags: [...],      (optional)
    #     report_types: [...],     (optional)
    #     filter_query: "...",     (optional)
    #     query: "..."
    # })
    if model.document_sets:
        for d in model.document_sets:
            log(d)
            if d['library'] == "Clarity" or d["library"] == "ClarityNLP":
                args = d['arguments']
                named_args = d['named_arguments']
                funct = d['funct']
                doc_set_name = d['name']
                if funct == "createReportTagList":
                    if len(args) == 1 and type(args[0]) == list:
                        tags[doc_set_name] = args[0]
                    else:
                        tags[doc_set_name] = args
                elif funct == "createDocumentSet":
                    if named_args:
                        if "report_tags" in named_args:
                            arg_report_tags = named_args["report_tags"]
                            if len(arg_report_tags) == 1 and type(arg_report_tags[0]) == list:
                                tags[doc_set_name] = arg_report_tags[0]
                            else:
                                tags[doc_set_name] = arg_report_tags
                        if "report_types" in named_args:
                            types[doc_set_name] = named_args["report_types"]
                        if "filter_query" in named_args:
                            fq = normalize_query_quotes(named_args["filter_query"])
                            filter_query[doc_set_name] = fq
                        if "query" in named_args:
                            query = normalize_query_quotes(named_args["query"])
                            custom_query[doc_set_name] = query
                        if "source" in named_args:
                            if type(named_args["source"]) == str:
                                source[doc_set_name] = named_args["source"].split(",")
                            else:
                                source[doc_set_name] = named_args["source"]
                        elif "sources" in named_args:
                            if type(named_args["sources"]) == str:
                                source[doc_set_name] = named_args["sources"].split(",")
                            else:
                                source[doc_set_name] = named_args["sources"]
                elif funct == "createReportTypeList":
                    if len(args) == 1 and type(args[0]) == list:
                        types[doc_set_name] = args[0]
                    else:
                        types[doc_set_name] = args

    return tags, types, custom_query, filter_query, source
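A sketch of the document-set shape this function walks over; the field names come from the accesses above, while the concrete values and the _Model stand-in are hypothetical:

class _Model:
    document_sets = [{
        'library': 'Clarity',
        'funct': 'createDocumentSet',
        'name': 'ProviderNotes',
        'arguments': [],
        'named_arguments': {
            'report_types': ['Physician', 'Nurse'],
            'filter_query': 'subject:12345'
        }
    }]

tags, types, custom_query, filter_query, source = get_document_set_attributes(_Model())
# types        -> {'ProviderNotes': ['Physician', 'Nurse']}
# filter_query -> {'ProviderNotes': <normalized 'subject:12345'>}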
Example #30
    def run_custom_task(self, pipeline_id, job, owner, pipeline_type,
                        pipeline_config, client, db):
        group_key = get_config_string(pipeline_config,
                                      "group_by",
                                      default='report_type')
        log('run custom task collector')
        q = [{
            "$match": {
                "nlpql_feature": {
                    "$eq": pipeline_config.name
                },
                "job_id": {
                    "$eq": job
                }
            }
        }, {
            "$group": {
                "_id": ("$" + group_key),
                "avg_word_cnt": {
                    "$avg": "$words"
                },
                "avg_grade_level": {
                    "$avg": "$grade_level"
                },
                "avg_sentences": {
                    "$avg": "$sentences"
                },
                "avg_long_words": {
                    "$avg": "$long_words"
                },
                "avg_polysyllable_words": {
                    "$avg": "$polysyllable_words"
                }
            }
        }]

        results = list(db.phenotype_results.aggregate(q))

        group_by = "text_stats_group_by_field_" + group_key
        for r in results:
            pipeline_mongo_writer(
                client,
                pipeline_id,
                pipeline_type,
                job,
                0,
                pipeline_config,
                None, {
                    group_by: r['_id'],
                    'average_word_count': r['avg_word_cnt'],
                    'average_grade_level': r['avg_grade_level'],
                    'average_sentences': r['avg_sentences'],
                    'average_long_words': r['avg_long_words'],
                    'average_polysyllable_words': r['avg_polysyllable_words']
                },
                phenotype_final=True)
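For reference, each element of results from the aggregation above has the shape below (field names from the $group stage, values illustrative); the loop then remaps these onto the average_* keys passed to pipeline_mongo_writer:

{
    '_id': 'Discharge summary',          # value of the group_by field, e.g. report_type
    'avg_word_cnt': 312.4,
    'avg_grade_level': 8.7,
    'avg_sentences': 21.3,
    'avg_long_words': 45.2,
    'avg_polysyllable_words': 17.9
}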