def test_get_term_parent_hierarchies(self):
    lookup_table = {"parents": {"a": ["b"], "b": ["c"], "d": ["e", "f"],
                                "g": ["h", "i"], "i": ["j"]}}

    self.assertCountEqual([["z"]],
                          pipeline_helpers.get_term_parent_hierarchies("z", lookup_table))
    self.assertCountEqual([["c"]],
                          pipeline_helpers.get_term_parent_hierarchies("c", lookup_table))
    self.assertCountEqual([["b", "c"]],
                          pipeline_helpers.get_term_parent_hierarchies("b", lookup_table))
    self.assertCountEqual([["a", "b", "c"]],
                          pipeline_helpers.get_term_parent_hierarchies("a", lookup_table))
    self.assertCountEqual([["d", "e"], ["d", "f"]],
                          pipeline_helpers.get_term_parent_hierarchies("d", lookup_table))
    self.assertCountEqual([["g", "h"], ["g", "i", "j"]],
                          pipeline_helpers.get_term_parent_hierarchies("g", lookup_table))
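# For reference, the test above implies a traversal of
# ``lookup_table["parents"]`` that returns one hierarchy per path from the
# queried term to a root, branching when a term has multiple parents. The
# following is a minimal sketch written against the test's expectations
# only; it is not the actual ``pipeline_helpers.get_term_parent_hierarchies``
# implementation and assumes the parent mapping is acyclic.
def _sketch_get_term_parent_hierarchies(term_id, lookup_table):
    hierarchies = [[term_id]]
    while True:
        extended = []
        changed = False
        for hierarchy in hierarchies:
            # Extend each partial hierarchy with the parents of its last
            # element, creating one branch per parent.
            parents = lookup_table["parents"].get(hierarchy[-1], [])
            if parents:
                changed = True
                for parent in parents:
                    extended.append(hierarchy + [parent])
            else:
                extended.append(hierarchy)
        hierarchies = extended
        if not changed:
            return hierarchies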
def find_match(map_result, ontology_lookup_table):
    mapping_output = map_result["mapping_output"]
    input_to_ontology_mapping = map_result["input_to_ontology_mapping"]
    ontology_to_input_mapping = map_result["ontology_to_input_mapping"]
    input_term_label_map = map_result["input_term_label"]
    ontology_term_label_map = map_result["ontology_term_label"]

    for input_term, mapping_object in mapping_output.items():
        input_term_id = get_term_id(input_term)
        input_term_label = get_term_label(input_term)

        # Standardize sample to lowercase and with punctuation treatment.
        sample = input_term_label.lower()
        sample = helpers.punctuation_treatment(sample)

        # Tokenize sample and remove stop words and 1-letter words
        sample_tokens = word_tokenize(sample)

        # Get "cleaned_sample"
        cleaned_sample = ""
        for token in sample_tokens:
            # Ignore dates
            if helpers.is_date(token) or helpers.is_number(token):
                continue
            # Ignore single letter
            if helpers.is_single_letter(token):
                continue

            # Some preprocessing
            token = helpers.preprocess(token)

            lemma = helpers.singularize_token(token, ontology_lookup_table, [])
            lemma = helpers.spelling_correction(lemma, ontology_lookup_table, [])
            lemma = helpers.abbreviation_normalization_token(lemma, ontology_lookup_table, [])
            lemma = helpers.non_English_normalization_token(lemma, ontology_lookup_table, [])

            cleaned_sample = helpers.get_cleaned_sample(cleaned_sample, lemma,
                                                        ontology_lookup_table)
            cleaned_sample = re.sub(' +', ' ', cleaned_sample)
            cleaned_sample = helpers.abbreviation_normalization_phrase(
                cleaned_sample, ontology_lookup_table, [])
            cleaned_sample = helpers.non_English_normalization_phrase(
                cleaned_sample, ontology_lookup_table, [])

        cleaned_sample = helpers.remove_duplicate_tokens(cleaned_sample)

        # Attempt full term match
        full_term_match = helpers.map_term(sample, ontology_lookup_table)
        if not full_term_match:
            # Attempt full term match with cleaned sample
            full_term_match = helpers.map_term(cleaned_sample, ontology_lookup_table)
        if not full_term_match:
            # Attempt full term match using suffixes
            full_term_match = helpers.map_term(sample, ontology_lookup_table,
                                               consider_suffixes=True)
        if not full_term_match:
            # Attempt full term match with cleaned sample using suffixes
            full_term_match = helpers.map_term(cleaned_sample, ontology_lookup_table,
                                               consider_suffixes=True)

        if full_term_match:
            ontology_term_id = full_term_match["id"].upper()
            ontology_term_label = full_term_match["term"]
            ontology_term = "%s:%s" % (ontology_term_label, ontology_term_id)

            mapping_output[input_term] = ontology_term
            input_to_ontology_mapping[input_term_id] = ontology_term_id
            ontology_to_input_mapping[ontology_term_id] = input_term_id
            ontology_term_label_map[ontology_term_id] = full_term_match["term"]
            input_term_label_map[input_term_id] = input_term_label
        else:
            # Attempt various component matches
            component_matches = []
            covered_tokens = set()

            for i in range(5, 0, -1):
                for gram_chunk in helpers.get_gram_chunks(cleaned_sample, i):
                    concat_gram_chunk = " ".join(gram_chunk)
                    gram_tokens = word_tokenize(concat_gram_chunk)
                    gram_permutations = list(
                        OrderedDict.fromkeys(permutations(concat_gram_chunk.split())))

                    # gram_tokens covered in prior component match
                    if set(gram_tokens) <= covered_tokens:
                        continue

                    for gram_permutation in gram_permutations:
                        gram_permutation_str = " ".join(gram_permutation)
                        component_match = helpers.map_term(gram_permutation_str,
                                                           ontology_lookup_table)
                        if not component_match:
                            # Try again with suffixes
                            component_match = helpers.map_term(gram_permutation_str,
                                                               ontology_lookup_table,
                                                               consider_suffixes=True)
                        if component_match:
                            component_matches.append(component_match)
                            covered_tokens.update(gram_tokens)
                            break

            # We should not consider component matches that are
            # ancestral to other component matches.
            ancestors = set()
            for component_match in component_matches:
                component_match_hierarchies = helpers.get_term_parent_hierarchies(
                    component_match["id"], ontology_lookup_table)

                for component_match_hierarchy in component_match_hierarchies:
                    # We do not need the first element
                    component_match_hierarchy.pop(0)

                    ancestors |= set(component_match_hierarchy)

            matched_components = []
            for component_match in component_matches:
                if component_match["id"] not in ancestors:
                    matched_component = "%s:%s" % (component_match["term"],
                                                   component_match["id"])
                    matched_components.append(matched_component)

            # TODO: revisit this step.
            #       We do need it, but perhaps the function could be
            #       simplified?
            if len(matched_components):
                matched_components = helpers.retain_phrase(matched_components)

            if matched_components:
                if len(matched_components) == 1:
                    single_value = matched_components[0]
                    onto_term_id = get_term_id(single_value)
                    onto_term_label = get_term_label(single_value)

                    mapping_output[input_term] = single_value
                    input_to_ontology_mapping[input_term_id] = onto_term_id
                    if onto_term_id not in ontology_to_input_mapping:
                        ontology_to_input_mapping[onto_term_id] = input_term_id
                    ontology_term_label_map[onto_term_id] = onto_term_label
                    input_term_label_map[input_term_id] = input_term_label
                else:
                    mapping_output[input_term] = matched_components
                    input_to_ontology_mapping[input_term_id] = \
                        [onto_term_id for onto_term_id in map(
                            lambda s: get_term_id(s), matched_components)]
                    input_term_label_map[input_term_id] = input_term_label
                    for ontology_term in matched_components:
                        onto_term_id = get_term_id(ontology_term)
                        onto_term_label = get_term_label(ontology_term)
                        ontology_term_label_map[onto_term_id] = onto_term_label

    return map_result
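# Illustrative note on the ``map_result`` contract assumed by ``find_match``
# above: the function reads and mutates these five keys in place, then
# returns the same object. The comments below describe each key's role as
# inferred from the accesses above; the concrete contents come from the
# caller.
#
#   map_result = {
#       "mapping_output": {},             # input term -> matched ontology term(s)
#       "input_to_ontology_mapping": {},  # input term ID -> ontology term ID(s)
#       "ontology_to_input_mapping": {},  # ontology term ID -> input term ID
#       "input_term_label": {},           # input term ID -> input term label
#       "ontology_term_label": {},        # ontology term ID -> ontology term label
#   }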
def classify_sample(sample, matched_terms_with_ids, lookup_table, classification_lookup_table):
    """TODO..."""

    # LexMapr and IFSAC buckets mapped to the parental hierarchies of
    # each element in ``matched_terms_with_ids``.
    lexmapr_hierarchy_buckets = []
    ifsac_hierarchy_buckets = []
    # Lowest-level mapping for each element in ``matched_terms_with_ids``.
    lexmapr_final_buckets = []
    ifsac_final_buckets = []
    # IFSAC labels corresponding to the buckets in
    # ``ifsac_final_buckets``.
    ifsac_final_labels = []

    if matched_terms_with_ids:
        for matched_term_with_id in matched_terms_with_ids:
            [_, term_id] = matched_term_with_id.split(":", 1)
            matched_term_hierarchies = get_term_parent_hierarchies(term_id, lookup_table)

            for matched_term_hierarchy in matched_term_hierarchies:
                lexmapr_hierarchy_bucket = \
                    classify_sample_helper(matched_term_hierarchy,
                                           classification_lookup_table["buckets_lexmapr"])

                if lexmapr_hierarchy_bucket:
                    lexmapr_hierarchy_buckets.append(lexmapr_hierarchy_bucket)

                    lexmapr_final_bucket_level = min(lexmapr_hierarchy_bucket.keys())
                    lexmapr_final_bucket = lexmapr_hierarchy_bucket[lexmapr_final_bucket_level]
                    if lexmapr_final_bucket not in lexmapr_final_buckets:
                        lexmapr_final_buckets.append(lexmapr_final_bucket)

                ifsac_hierarchy_bucket = \
                    classify_sample_helper(matched_term_hierarchy,
                                           classification_lookup_table["buckets_ifsactop"])

                if ifsac_hierarchy_bucket:
                    ifsac_hierarchy_buckets.append(ifsac_hierarchy_bucket)

                    ifsac_final_bucket_level = min(ifsac_hierarchy_bucket.keys())
                    ifsac_final_bucket = ifsac_hierarchy_bucket[ifsac_final_bucket_level]
                    if ifsac_final_bucket not in ifsac_final_buckets:
                        ifsac_final_buckets.append(ifsac_final_bucket)

                        # ``ifsac_final_bucket`` is a one-item
                        # dictionary of the following format:
                        # ``{bucket_id: bucket_label}``.
                        ifsac_final_bucket_id = list(ifsac_final_bucket.keys())[0]

                        ifsac_final_label = \
                            classification_lookup_table["ifsac_labels"][ifsac_final_bucket_id]
                        ifsac_final_labels.append(ifsac_final_label)

        if not ifsac_final_labels or set(ifsac_final_labels) == {"food"}:
            # Attempt to find a classification using ifsac_default
            default_classification = ""
            sample_tokens = word_tokenize(sample)
            sample_tokens = list(map(lambda token: singularize(token), sample_tokens))
            for bucket, label in classification_lookup_table["ifsac_default"].items():
                bucket_tokens = word_tokenize(bucket)
                bucket_tokens = list(map(lambda token: singularize(token), bucket_tokens))
                if not (set(bucket_tokens) - set(sample_tokens)):
                    default_classification = label

            if default_classification:
                ifsac_final_buckets.append("Default classification")
                ifsac_final_labels.append(default_classification)

        ifsac_final_labels = \
            refine_ifsac_final_labels(sample, ifsac_final_labels,
                                      classification_lookup_table["ifsac_refinement"])

    return {
        "lexmapr_hierarchy_buckets": lexmapr_hierarchy_buckets,
        "lexmapr_final_buckets": lexmapr_final_buckets,
        "ifsac_final_buckets": ifsac_final_buckets,
        "ifsac_final_labels": ifsac_final_labels
    }
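# Illustrative notes on the ``classification_lookup_table`` contents that
# ``classify_sample`` assumes, inferred from the lookups above (the actual
# contents are loaded elsewhere, by
# ``pipeline_resources.get_classification_resources``):
#
#   * ``["buckets_lexmapr"]`` and ``["buckets_ifsactop"]`` are consulted by
#     ``classify_sample_helper``, which returns a dict keyed by hierarchy
#     depth whose values are one-item ``{bucket_id: bucket_label}`` dicts.
#   * ``["ifsac_labels"]`` maps bucket IDs to IFSAC label strings.
#   * ``["ifsac_default"]`` maps bucket phrases to default labels; a default
#     applies when every token of its phrase appears in the sample.
#   * ``["ifsac_refinement"]`` is passed to ``refine_ifsac_final_labels``.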
def run(args):
    """
    Main text mining pipeline.
    """
    # If the user specified a profile, we must retrieve args specified
    # by the profile, unless they were explicitly overridden.
    if args.profile:
        args = pipeline_resources.get_profile_args(args)

    # To contain all resources, and their variations, that samples are
    # matched to. Start by adding pre-defined resources from
    # lexmapr.predefined_resources.
    # TODO: These pre-defined resources are the remnants of early
    #       LexMapr development. We should eventually move to only
    #       adding terms from online ontologies to lookup tables.
    lookup_table = pipeline_resources.get_predefined_resources()

    # To contain resources fetched from online ontologies, if any.
    # Will eventually be added to ``lookup_table``.
    ontology_lookup_table = None
    if args.config:
        # Fetch online ontology terms specified in config file.
        ontology_lookup_table = pipeline_resources.get_config_resources(args.config,
                                                                        args.no_cache)
    elif args.profile:
        # Fetch online ontology terms specified in profile.
        ontology_lookup_table = pipeline_resources.get_profile_resources(args.profile)

    if ontology_lookup_table:
        # Merge ``ontology_lookup_table`` into ``lookup_table``
        lookup_table = helpers.merge_lookup_tables(lookup_table, ontology_lookup_table)

    # To contain resources used in classification.
    classification_lookup_table = None
    if args.bucket:
        classification_lookup_table = pipeline_resources.get_classification_resources()

    # Output file column headings
    output_fields = ["Sample_Id", "Sample_Desc", "Cleaned_Sample", "Matched_Components"]

    if args.full:
        output_fields += ["Match_Status(Macro Level)", "Match_Status(Micro Level)"]

    if args.bucket:
        if args.full:
            output_fields += ["LexMapr Classification (Full List)", "LexMapr Bucket",
                              "Third Party Bucket", "Third Party Classification"]
        else:
            output_fields += ["Third Party Classification"]

    # Main output file
    fw = open(args.output, 'w') if args.output else sys.stdout
    fw.write('\t'.join(output_fields))

    # Input file
    fr = open(args.input_file, "r")
    _, ext = os.path.splitext(args.input_file)
    if ext == ".csv":
        fr_reader = csv.reader(fr, delimiter=",")
    elif ext == ".tsv":
        fr_reader = csv.reader(fr, delimiter="\t")
    else:
        raise ValueError("Should not reach here")
    # Skip header
    next(fr_reader)

    # Iterate over samples for matching to ontology terms
    for row in fr_reader:
        sample_id = row[0].strip()
        original_sample = " ".join(row[1:]).strip()
        cleaned_sample = ""
        matched_components = []
        macro_status = "No Match"
        micro_status = []
        lexmapr_classification = []
        lexmapr_bucket = []
        third_party_bucket = []
        third_party_classification = []

        # Standardize sample to lowercase and with punctuation
        # treatment.
        sample = original_sample.lower()
        sample = helpers.punctuation_treatment(sample)

        sample_tokens = word_tokenize(sample)

        # Get ``cleaned_sample``
        for tkn in sample_tokens:
            # Ignore dates
            if helpers.is_date(tkn) or helpers.is_number(tkn):
                continue

            # Some preprocessing
            tkn = helpers.preprocess(tkn)

            lemma = helpers.singularize_token(tkn, lookup_table, micro_status)
            lemma = helpers.spelling_correction(lemma, lookup_table, micro_status)
            lemma = helpers.abbreviation_normalization_token(lemma, lookup_table, micro_status)
            lemma = helpers.non_English_normalization_token(lemma, lookup_table, micro_status)

            cleaned_sample = helpers.get_cleaned_sample(cleaned_sample, lemma, lookup_table)
            cleaned_sample = re.sub(' +', ' ', cleaned_sample)
            cleaned_sample = helpers.abbreviation_normalization_phrase(cleaned_sample,
                                                                       lookup_table,
                                                                       micro_status)
            cleaned_sample = helpers.non_English_normalization_phrase(cleaned_sample,
                                                                      lookup_table,
                                                                      micro_status)

        cleaned_sample = helpers.remove_duplicate_tokens(cleaned_sample)

        # Attempt full term match
        full_term_match = helpers.map_term(sample, lookup_table)
        if not full_term_match:
            # Attempt full term match with cleaned sample
            full_term_match = helpers.map_term(cleaned_sample, lookup_table)
            if full_term_match:
                micro_status.insert(0, "Used Cleaned Sample")
        if not full_term_match:
            # Attempt full term match using suffixes
            full_term_match = helpers.map_term(sample, lookup_table, consider_suffixes=True)
        if not full_term_match:
            # Attempt full term match with cleaned sample using suffixes
            full_term_match = helpers.map_term(cleaned_sample, lookup_table,
                                               consider_suffixes=True)
            if full_term_match:
                micro_status.insert(0, "Used Cleaned Sample")

        if full_term_match:
            matched_components.append(full_term_match["term"] + ":" + full_term_match["id"])
            macro_status = "Full Term Match"
            micro_status += full_term_match["status"]

            if args.bucket:
                classification_result = classify_sample(sample, matched_components,
                                                        lookup_table,
                                                        classification_lookup_table)
                lexmapr_classification = classification_result["lexmapr_hierarchy_buckets"]
                lexmapr_bucket = classification_result["lexmapr_final_buckets"]
                third_party_bucket = classification_result["ifsac_final_buckets"]
                third_party_classification = classification_result["ifsac_final_labels"]
        else:
            # Attempt various component matches
            component_matches = []
            covered_tokens = set()

            for i in range(5, 0, -1):
                for gram_chunk in helpers.get_gram_chunks(cleaned_sample, i):
                    concat_gram_chunk = " ".join(gram_chunk)
                    gram_tokens = word_tokenize(concat_gram_chunk)
                    gram_permutations = list(
                        OrderedDict.fromkeys(permutations(concat_gram_chunk.split())))

                    # gram_tokens covered in prior component match
                    if set(gram_tokens) <= covered_tokens:
                        continue

                    for gram_permutation in gram_permutations:
                        gram_permutation_str = " ".join(gram_permutation)
                        component_match = helpers.map_term(gram_permutation_str, lookup_table)

                        if not component_match:
                            # Try again with suffixes
                            component_match = helpers.map_term(gram_permutation_str,
                                                               lookup_table,
                                                               consider_suffixes=True)

                        if component_match:
                            component_matches.append(component_match)
                            covered_tokens.update(gram_tokens)
                            break

            # We should not consider component matches that are
            # ancestral to other component matches.
            ancestors = set()
            for component_match in component_matches:
                component_match_hierarchies = \
                    helpers.get_term_parent_hierarchies(component_match["id"], lookup_table)

                for component_match_hierarchy in component_match_hierarchies:
                    # We do not need the first element
                    component_match_hierarchy.pop(0)

                    ancestors |= set(component_match_hierarchy)

            for component_match in component_matches:
                if component_match["id"] not in ancestors:
                    matched_component = component_match["term"] + ":" + component_match["id"]
                    matched_components.append(matched_component)

            # TODO: revisit this step.
            #       We do need it, but perhaps the function could be
            #       simplified?
            if len(matched_components):
                matched_components = helpers.retainedPhrase(matched_components)

            # Finalize micro_status
            # TODO: This is ugly, so revisit after revisiting
            #       ``retainedPhrase``.
            micro_status_covered_matches = set()
            for component_match in component_matches:
                possible_matched_component = \
                    component_match["term"] + ":" + component_match["id"]
                if possible_matched_component in matched_components:
                    if possible_matched_component not in micro_status_covered_matches:
                        micro_status_covered_matches.add(possible_matched_component)
                        micro_status.append("{%s: %s}" % (component_match["term"],
                                                          component_match["status"]))

            if matched_components:
                macro_status = "Component Match"

                if args.bucket:
                    classification_result = classify_sample(sample, matched_components,
                                                            lookup_table,
                                                            classification_lookup_table)
                    lexmapr_classification = \
                        classification_result["lexmapr_hierarchy_buckets"]
                    lexmapr_bucket = classification_result["lexmapr_final_buckets"]
                    third_party_bucket = classification_result["ifsac_final_buckets"]
                    third_party_classification = classification_result["ifsac_final_labels"]

        # Write to row
        fw.write("\n" + sample_id + "\t" + original_sample + "\t" + cleaned_sample
                 + "\t" + str(matched_components))

        if args.full:
            fw.write("\t" + macro_status + "\t" + str(micro_status))

        if args.bucket:
            if args.full:
                fw.write("\t" + str(lexmapr_classification) + "\t" + str(lexmapr_bucket)
                         + "\t" + str(third_party_bucket))
            fw.write("\t" + str(sorted(third_party_classification)))

    fw.write('\n')

    # Output file closed
    if fw is not sys.stdout:
        fw.close()
    # Input file closed
    fr.close()
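# A minimal, illustrative driver for ``run``: the real CLI wiring lives
# elsewhere in LexMapr, so this is a sketch only. The attribute names mirror
# those read by ``run`` above; the values (including the input file name)
# are assumptions made for the example.
def _example_run():
    from argparse import Namespace

    example_args = Namespace(
        input_file="samples.tsv",  # hypothetical tab- or comma-delimited input
        output=None,               # None writes results to stdout
        config=None,               # optional ontology config file
        profile=None,              # optional named profile
        no_cache=False,            # when True, skip cached ontology fetches
        full=False,                # when True, add match-status columns
        bucket=False               # when True, add classification columns
    )
    run(example_args)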