Example #1
def extract_cohesive_groups1(table_name, attrs):
    # Gather tokens from the curated table name and attribute names
    tokens = set()
    ctb = nlp.curate_string(table_name)
    tokens |= set(ctb.split(' '))
    for attr in attrs:
        cattr = nlp.curate_string(attr)
        tokens |= set(cattr.split(' '))
    # Keep non-stopword tokens that have an embedding; look each vector up once
    english_stopwords = set(stopwords.words('english'))
    token_vector = []
    for t in tokens:
        if t in english_stopwords or len(t) <= 1:
            continue
        v = glove_api.get_embedding_for_word(t)
        if v is not None:
            token_vector.append((t, v))

    threshold = 0.5

    # Every pair of tokens above the similarity threshold joins a single group
    group = set()
    for a, b in itertools.combinations(token_vector, 2):
        sim = glove_api.semantic_distance(a[1], b[1])
        if sim > threshold:
            group.add(a[0])
            group.add(b[0])

    return [(threshold, group)]
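For readers without the project set up, here is a minimal, self-contained sketch of the same pairwise-threshold grouping. The stub embedding table and the cosine helper are illustrative assumptions standing in for glove_api; only the grouping logic mirrors extract_cohesive_groups1 above.

import itertools
import math

# Hypothetical 3-d vectors, purely for demonstration
STUB_EMBEDDINGS = {
    'employee': (0.9, 0.1, 0.0),
    'staff': (0.85, 0.15, 0.05),
    'salary': (0.1, 0.9, 0.2),
}

def cosine(u, v):
    dot = sum(a * b for a, b in zip(u, v))
    norm_u = math.sqrt(sum(a * a for a in u))
    norm_v = math.sqrt(sum(b * b for b in v))
    return dot / (norm_u * norm_v)

def pairwise_group(tokens, threshold=0.5):
    # Same idea as extract_cohesive_groups1: every pair of tokens whose
    # vectors clear the threshold is merged into a single group
    group = set()
    vecs = [(t, STUB_EMBEDDINGS[t]) for t in tokens if t in STUB_EMBEDDINGS]
    for (ta, va), (tb, vb) in itertools.combinations(vecs, 2):
        if cosine(va, vb) > threshold:
            group.add(ta)
            group.add(tb)
    return [(threshold, group)]

print(pairwise_group(['employee', 'staff', 'salary']))
# [(0.5, {'employee', 'staff'})] -- 'salary' is not close enough to either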
Example #2
def find_relation_class_attr_name_sem_matchings(network, kr_handlers):
    # Retrieve attribute (field) names
    st = time.time()
    names = []
    seen_fields = set()
    for (db_name, source_name, field_name, _) in network.iterate_values():
        orig_field_name = field_name
        if field_name not in seen_fields:
            seen_fields.add(field_name)  # mark as seen
            field_name = nlp.camelcase_to_snakecase(field_name)
            field_name = field_name.replace('-', ' ')
            field_name = field_name.replace('_', ' ')
            field_name = field_name.lower()
            svs = []
            for token in field_name.split():
                if token not in stopwords.words('english'):
                    sv = glove_api.get_embedding_for_word(token)
                    if sv is not None:
                        svs.append(sv)
            names.append(
                ('attribute', (db_name, source_name, orig_field_name), svs))

    num_attributes_inserted = len(names)

    # Retrieve class names
    for kr_name, kr_handler in kr_handlers.items():
        all_classes = kr_handler.classes()
        for cl in all_classes:
            original_cl_name = cl
            cl = nlp.camelcase_to_snakecase(cl)
            cl = cl.replace('-', ' ')
            cl = cl.replace('_', ' ')
            cl = cl.lower()
            svs = []
            for token in cl.split():
                if token not in stopwords.words('english'):
                    sv = glove_api.get_embedding_for_word(token)
                    if sv is not None:
                        svs.append(sv)
            names.append(('class', (kr_name, original_cl_name), svs))

    matchings = []
    for idx_rel in range(0, num_attributes_inserted):  # attributes...
        for idx_class in range(num_attributes_inserted, len(names)):  # ...compared only against classes
            svs_rel = names[idx_rel][2]
            svs_cla = names[idx_class][2]
            semantic_sim = SS.compute_semantic_similarity(svs_rel, svs_cla)
            if semantic_sim > 0.8:
                # match format: (db_name, source_name, field_name) -> class_name
                match = (names[idx_rel][1], names[idx_class][1])
                matchings.append(match)
    et = time.time()
    print("Time to relation-class (sem): " + str(et - st))
    return matchings
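Several examples in this listing delegate scoring to SS.compute_semantic_similarity, whose body is not shown here. The sketch below is only one plausible way to score two bags of word vectors (mean of each vector's best cosine match on the other side); it is an assumption, not the project's implementation.

import math

def cosine(u, v):
    dot = sum(a * b for a, b in zip(u, v))
    return dot / (math.sqrt(sum(a * a for a in u)) * math.sqrt(sum(b * b for b in v)))

def bag_similarity(svs_a, svs_b):
    # Mean, over vectors in svs_a, of the best cosine match found in svs_b
    if not svs_a or not svs_b:
        return 0.0
    best_matches = [max(cosine(a, b) for b in svs_b) for a in svs_a]
    return sum(best_matches) / len(best_matches)

print(bag_similarity([(1.0, 0.0)], [(0.8, 0.6)]))  # 0.8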
Example #3
def extract_cohesive_groups(table_name,
                            attrs,
                            sem_sim_threshold=0.7,
                            group_size_cutoff=0):
    def does_it_keep_group_coherent(running_group, a, b, threshold):
        # a and b keep the group coherent only if both are similar to every
        # member already in the group
        if len(running_group) == 0:
            return True
        av = glove_api.get_embedding_for_word(a)
        bv = glove_api.get_embedding_for_word(b)
        for el in running_group:
            elv = glove_api.get_embedding_for_word(el)
            if glove_api.semantic_distance(elv, av) <= threshold:
                return False
            if glove_api.semantic_distance(elv, bv) <= threshold:
                return False
        return True

    tokens = set()
    ctb = nlp.curate_string(table_name)
    tokens |= set(ctb.split(' '))
    for attr in attrs:
        cattr = nlp.curate_string(attr)
        tokens |= set(cattr.split(' '))
    english_stopwords = set(stopwords.words('english'))
    tokens = [t for t in tokens if t not in english_stopwords and len(t) > 1]

    running_groups = [set()]
    for a, b in itertools.combinations(tokens, 2):
        av = glove_api.get_embedding_for_word(a)
        bv = glove_api.get_embedding_for_word(b)
        if av is None or bv is None:
            continue
        sim = glove_api.semantic_distance(av, bv)
        if sim > sem_sim_threshold:  # try to add to existing group
            added_to_existing_group = False
            for running_group in running_groups:
                ans = does_it_keep_group_coherent(running_group, a, b,
                                                  sem_sim_threshold)
                if ans:  # Add to as many groups as necessary
                    added_to_existing_group = True
                    running_group.add(a)
                    running_group.add(b)
            if not added_to_existing_group:
                running_group = set()
                running_group.add(a)
                running_group.add(b)
                running_groups.append(running_group)

    return [(sem_sim_threshold, group) for group in running_groups
            if len(group) > group_size_cutoff]
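To see the running-group construction in isolation, the sketch below replays the loop from extract_cohesive_groups over three stub tokens; the two-dimensional vectors and the cosine helper are illustrative assumptions, not the project's glove_api.

import itertools
import math

VEC = {'car': (1.0, 0.0), 'auto': (0.95, 0.05), 'price': (0.0, 1.0)}

def cos(u, v):
    dot = sum(x * y for x, y in zip(u, v))
    return dot / (math.sqrt(sum(x * x for x in u)) * math.sqrt(sum(y * y for y in v)))

def coherent(group, a, b, th):
    # a and b keep a group coherent only if both clear the threshold
    # against every current member
    return all(cos(VEC[el], VEC[a]) > th and cos(VEC[el], VEC[b]) > th
               for el in group)

groups = [set()]
for a, b in itertools.combinations(VEC, 2):
    if cos(VEC[a], VEC[b]) > 0.7:
        placed = False
        for g in groups:
            if coherent(g, a, b, 0.7):
                g.update((a, b))
                placed = True
        if not placed:
            groups.append({a, b})
print([g for g in groups if g])  # [{'car', 'auto'}]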
Example #4
def find_sem_coh_matchings(network, kr_handlers):
    matchings = []
    # Extract cohesive token groups for every relation
    table_groups = dict()
    for db, t, attrs in SS.read_table_columns(None, network=network):
        groups = SS.extract_cohesive_groups(t, attrs)
        table_groups[(db, t)] = groups  # (score, [set()])

    names = []
    # Retrieve class names
    for kr_name, kr_handler in kr_handlers.items():
        all_classes = kr_handler.classes()
        for cl in all_classes:
            original_cl_name = cl
            cl = nlp.camelcase_to_snakecase(cl)
            cl = cl.replace('-', ' ')
            cl = cl.replace('_', ' ')
            cl = cl.lower()
            svs = []
            for token in cl.split():
                if token not in stopwords.words('english'):
                    sv = glove_api.get_embedding_for_word(token)
                    if sv is not None:
                        svs.append(sv)
            names.append(('class', (kr_name, original_cl_name), svs))

    for db_table_info, groups in table_groups.items():
        db_name, table_name = db_table_info
        class_seen = []  # to filter out already seen classes
        for g_score, g_tokens in groups:
            g_svs = []
            for t in g_tokens:
                sv = glove_api.get_embedding_for_word(t)
                if sv is not None:
                    g_svs.append(sv)
            for _, class_info, class_svs in names:
                kr_name, class_name = class_info
                sim = SS.compute_semantic_similarity(class_svs, g_svs)
                if sim > g_score and class_name not in class_seen:
                    class_seen.append(class_name)
                    match = ((db_name, table_name, "_"), (kr_name, class_name))
                    matchings.append(match)
                """
                similar = SS.groupwise_semantic_sim(class_svs, g_svs, 0.7)
                if similar:
                    class_seen.append(class_name)
                    match = ((db_name, table_name, "_"), (kr_name, class_name))
                    matchings_special.append(match)
                continue
                """

    return matchings, table_groups  #, matchings_special
Example #5
def extract_cohesive_groups2(table_name, attrs):
    def maybe_add_new_set(groups, current):
        # Filter out duplicate sets, and subsumed sets as well
        _, current_set = current
        for _, set_attrs in groups:
            if len(current_set) == len(set_attrs) and len(current_set - set_attrs) == 0:
                return  # repeated set: return without adding
            if len(current_set) > len(set_attrs):
                if len(set_attrs - current_set) == 0:
                    return  # an existing set is contained in current: skip
            else:
                if len(current_set - set_attrs) == 0:
                    return  # current is contained in an existing set: skip
        groups.append(current)  # otherwise add and finish

    groups = []
    tokens = set()
    ctb = nlp.curate_string(table_name)
    tokens |= set(ctb.split(' '))
    for attr in attrs:
        cattr = nlp.curate_string(attr)
        tokens |= set(cattr.split(' '))
    tokens = [
        t for t in tokens if t not in stopwords.words('english') and len(t) > 1
    ]
    threshold = 0.7
    for anchor in tokens:
        anchor_v = glove_api.get_embedding_for_word(anchor)
        if anchor_v is None:
            continue
        # current keeps (score, set-of-tokens): the cohesiveness score and
        # the tokens that honor it
        current = (threshold, set())
        for t in tokens:
            if anchor == t:  # not interested in self-comparison
                continue
            t_v = glove_api.get_embedding_for_word(t)
            if t_v is not None:
                ss = glove_api.semantic_distance(anchor_v, t_v)
                if ss > current[0]:
                    new_set = current[1]
                    new_set.add(anchor)
                    new_set.add(t)
                    current = (threshold, new_set)
        if len(current[1]) > 0:
            maybe_add_new_set(groups, current)
    return groups
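The subsumption filter in maybe_add_new_set can be exercised in isolation. This self-contained sketch reproduces the same duplicate/subset test with Python's set operators; note that it also preserves the original's quirk of discarding a new set that strictly contains an existing one.

def maybe_add(groups, current):
    # Same test as maybe_add_new_set, written with set operators
    _, cur = current
    for _, existing in groups:
        if cur == existing:
            return  # exact duplicate
        if cur >= existing or cur <= existing:
            return  # one set subsumes the other
    groups.append(current)

gs = []
maybe_add(gs, (0.7, {'a', 'b'}))
maybe_add(gs, (0.7, {'a', 'b', 'c'}))  # contains the first set -> skipped
maybe_add(gs, (0.7, {'x', 'y'}))
print(gs)  # [(0.7, {'a', 'b'}), (0.7, {'x', 'y'})]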
Example #6
def test_find_semantic_sim():
    # Load ontology
    om = SSAPI(None, None, None, None)
    # Load parsed ontology
    om.add_krs([("dbpedia", "cache_onto/schemaorg.pkl")], parsed=True)

    # Load glove model
    print("Loading language model...")
    path_to_glove_model = "../glove/glove.6B.100d.txt"
    glove_api.load_model(path_to_glove_model)
    print("Loading language model...OK")

    print("Loading ontology classes...")
    names = []
    # Load classes
    for kr_name, kr_handler in om.kr_handlers.items():
        all_classes = kr_handler.classes()
        for cl in all_classes:
            cl = nlp.camelcase_to_snakecase(cl)
            cl = cl.replace('-', ' ')
            cl = cl.replace('_', ' ')
            cl = cl.lower()
            svs = []
            for token in cl.split():
                if token not in stopwords.words('english'):
                    sv = glove_api.get_embedding_for_word(token)
                    if sv is not None:
                        svs.append(sv)
            names.append(('class', cl, svs))
    print("Loading ontology classes...OK")

    while True:
        # Get words
        i = input("Enter words separated by spaces to get their similarity (EXIT to quit): ")
        tokens = i.split(' ')
        if tokens[0] == "EXIT":
            print("bye!")
            break
        svs = []
        for t in tokens:
            sv = glove_api.get_embedding_for_word(t)
            if sv is not None:
                svs.append(sv)
            else:
                print("No vector for: " + str(t))
        for _, cl, vecs in names:
            sim = SS.compute_semantic_similarity(svs, vecs)
            if sim > 0.4:
                print(str(cl) + " -> " + str(sim))
Example #7
def does_it_keep_group_coherent(running_group, a, b, threshold):
    # a and b keep the group coherent only if both are similar to every
    # member already in the group
    if len(running_group) == 0:
        return True
    av = glove_api.get_embedding_for_word(a)
    bv = glove_api.get_embedding_for_word(b)
    for el in running_group:
        elv = glove_api.get_embedding_for_word(el)
        if glove_api.semantic_distance(elv, av) <= threshold:
            return False
        if glove_api.semantic_distance(elv, bv) <= threshold:
            return False
    return True
Example #8
def get_semantic_vectors_for(tokens):
    s_vectors = []
    for t in tokens:
        vec = glove_api.get_embedding_for_word(t)
        if vec is not None:
            s_vectors.append(vec)
    return s_vectors
Example #9
def generate_table_vectors(path_to_serialized_model, network=False):
    table_vectors = dict()

    for db_name, table_name, cols in read_table_columns(
            path_to_serialized_model, network=network):
        semantic_vectors = []
        seen_tokens = set()
        for c in cols:
            c = c.replace('_', ' ')
            tokens = c.split(' ')
            for token in tokens:
                token = token.lower()
                if token not in stopwords.words('english'):
                    if token not in seen_tokens:
                        seen_tokens.add(token)
                        vec = glove_api.get_embedding_for_word(token)
                        if vec is not None:
                            semantic_vectors.append(vec)
        print("Table: " + str(table_name) + " has: " +
              str(len(semantic_vectors)) + " semantic vectors")
        table_vectors[(db_name, table_name)] = semantic_vectors
    return table_vectors
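generate_table_vectors only builds the per-table vector bags; it does not show how two tables are then compared. Purely as an assumption (not necessarily how SS compares tables), one simple option is a single cosine between the two bags' centroids:

import math

def centroid(vectors):
    n = len(vectors)
    return [sum(v[i] for v in vectors) / n for i in range(len(vectors[0]))]

def table_similarity(vecs_a, vecs_b):
    # Single cosine between the mean vectors of the two bags
    if not vecs_a or not vecs_b:
        return 0.0
    ca, cb = centroid(vecs_a), centroid(vecs_b)
    dot = sum(x * y for x, y in zip(ca, cb))
    na = math.sqrt(sum(x * x for x in ca))
    nb = math.sqrt(sum(y * y for y in cb))
    return dot / (na * nb) if na and nb else 0.0

print(table_similarity([(1.0, 0.0), (0.0, 1.0)], [(1.0, 1.0)]))  # ~1.0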
Example #10
def _get_kr_classes_vectors(self):
    class_vectors = dict()
    for kr_name, kr in self.kr_handlers.items():
        for class_name in kr.classes_id():
            # Get bag-of-words representation of the class
            success, ret = kr.bow_repr_of(class_name, class_id=True)
            if success:
                label, bow = ret
                seen_tokens = set()  # filter out already-seen tokens
                sem_vectors = []
                for el in bow:
                    el = el.replace('_', ' ')
                    tokens = el.split(' ')
                    for token in tokens:
                        token = token.lower()
                        if token not in stopwords.words('english') \
                                and token not in seen_tokens:
                            seen_tokens.add(token)
                            sem_vector = glove_api.get_embedding_for_word(token)
                            if sem_vector is not None:
                                sem_vectors.append(sem_vector)
                if len(sem_vectors) > 0:  # otherwise no context generated for this class
                    class_vectors[kr.name_of_class(class_name)] = sem_vectors
            else:
                print(ret)
    return class_vectors
Example #11
def find_matching_to_text(network,
                          semantic_sim_threshold=0.5,
                          sensitivity_neg_signal=0.5,
                          negative_signal_threshold=0.4,
                          penalize_unknown_word=False,
                          add_exact_matches=True,
                          reference_name="",
                          reference_gen=None):
    # Retrieve attribute (field) names
    st = time.time()
    names = []
    seen_fields = set()
    for (db_name, source_name, field_name, _) in network.iterate_values():
        orig_field_name = field_name
        key_seen = source_name + field_name
        if key_seen not in seen_fields:
            seen_fields.add(key_seen)  # mark as seen
            field_name = nlp.camelcase_to_snakecase(field_name)
            field_name = field_name.replace('-', ' ')
            field_name = field_name.replace('_', ' ')
            field_name = field_name.lower()
            svs = []
            for token in field_name.split():
                if token not in stopwords.words('english'):
                    sv = glove_api.get_embedding_for_word(token)
                    if sv is not None:
                        svs.append(sv)
            names.append(
                ('attribute', (db_name, source_name, orig_field_name), svs))

    num_attributes_inserted = len(names)

    # Retrieve class names from the reference generator
    # (callers must pass reference_gen; the default of None would fail here)
    for cl in reference_gen:
        original_cl_name = cl
        cl = cl.replace('-', ' ')
        cl = cl.replace('_', ' ')
        cl = cl.lower()
        svs = []
        for token in cl.split():
            if token not in stopwords.words('english'):
                sv = glove_api.get_embedding_for_word(token)
                if sv is not None:
                    svs.append(sv)
        names.append(('class', (reference_name, original_cl_name), svs))

    print("N equals: " + str(len(names)))

    pos_matchings = []
    neg_matchings = []
    for idx_class in range(num_attributes_inserted, len(names)):  # classes...
        for idx_rel in range(0, num_attributes_inserted):  # ...compared only against attributes
            ban_index1, ban_index2 = get_ban_indexes(names[idx_rel][1][2],
                                                     names[idx_class][1][1])
            svs_rel = remove_banned_vectors(ban_index1, names[idx_rel][2])
            svs_cla = remove_banned_vectors(ban_index2, names[idx_class][2])
            semantic_sim, strong_signal = SS.compute_semantic_similarity(
                svs_rel,
                svs_cla,
                penalize_unknown_word=penalize_unknown_word,
                add_exact_matches=add_exact_matches,
                signal_strength_threshold=sensitivity_neg_signal)
            if strong_signal and semantic_sim > semantic_sim_threshold:
                # match format: (db_name, source_name, field_name) -> class_name
                match = (names[idx_rel][1], names[idx_class][1])
                pos_matchings.append(match)
                continue  # FIXME: one matching per entity
            elif strong_signal and semantic_sim < negative_signal_threshold:
                match = (names[idx_rel][1], names[idx_class][1])
                neg_matchings.append(match)
    et = time.time()
    print("l52: " + str(et - st))
    return pos_matchings, neg_matchings
Example #12
def find_relation_class_name_sem_matchings(network, kr_handlers):
    # Retrieve relation names
    st = time.time()
    names = []
    seen_sources = set()
    for (db_name, source_name, _, _) in network.iterate_values():
        original_source_name = source_name
        if source_name not in seen_sources:
            seen_sources.add(source_name)  # mark as seen
            source_name = source_name.replace('-', ' ')
            source_name = source_name.replace('_', ' ')
            source_name = source_name.lower()
            svs = []
            for token in source_name.split():
                if token not in stopwords.words('english'):
                    sv = glove_api.get_embedding_for_word(token)
                    # append even None, to apply penalization later
                    svs.append(sv)
            names.append(('relation', (db_name, original_source_name), svs))

    num_relations_inserted = len(names)

    # Retrieve class names
    for kr_name, kr_handler in kr_handlers.items():
        all_classes = kr_handler.classes()
        for cl in all_classes:
            original_cl_name = cl
            cl = nlp.camelcase_to_snakecase(cl)
            cl = cl.replace('-', ' ')
            cl = cl.replace('_', ' ')
            cl = cl.lower()
            svs = []
            for token in cl.split():
                if token not in stopwords.words('english'):
                    sv = glove_api.get_embedding_for_word(token)
                    # append even None, to apply penalization later
                    svs.append(sv)
            names.append(('class', (kr_name, original_cl_name), svs))

    matchings = []
    for idx_rel in range(0, num_relations_inserted):  # relations...
        for idx_class in range(num_relations_inserted, len(names)):  # ...compared only against classes
            svs_rel = names[idx_rel][2]
            svs_cla = names[idx_class][2]
            semantic_sim = SS.compute_semantic_similarity(
                svs_rel,
                svs_cla,
                penalize_unknown_word=True,
                add_exact_matches=False)
            if semantic_sim > 0.5:
                # match format: (db_name, source_name, "_") -> class_name
                match = ((names[idx_rel][1][0], names[idx_rel][1][1], "_"),
                         names[idx_class][1])
                matchings.append(match)
    et = time.time()
    print("Time to relation-class (sem): " + str(et - st))
    return matchings
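Example #12 deliberately appends None for out-of-vocabulary tokens so that the scorer can penalize them later. The following is a hedged sketch of what such penalization could look like; it is an assumption about penalize_unknown_word, not the project's actual scoring code.

import math

def cosine(u, v):
    dot = sum(a * b for a, b in zip(u, v))
    return dot / (math.sqrt(sum(a * a for a in u)) * math.sqrt(sum(b * b for b in v)))

def penalized_similarity(svs_a, svs_b):
    # Each None (out-of-vocabulary token) contributes a zero score,
    # dragging the average down
    known_b = [v for v in svs_b if v is not None]
    scores = []
    for v in svs_a:
        if v is None or not known_b:
            scores.append(0.0)
        else:
            scores.append(max(cosine(v, w) for w in known_b))
    return sum(scores) / len(scores) if scores else 0.0

vecs_a = [(1.0, 0.0), None]  # one known word, one out-of-vocabulary
vecs_b = [(1.0, 0.0)]
print(penalized_similarity(vecs_a, vecs_b))  # 0.5: the None halves the score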