Example #1
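All examples below appear to come from the same module. Based on how the names are used, they presumably rely on the following imports; glove_api and nlp are project-internal modules, and the signatures sketched in the comments are assumptions inferred from usage (note that semantic_distance is used as a similarity, higher meaning more similar, despite its name):

import itertools

import numpy as np
from nltk.corpus import stopwords

# Assumed project-internal helpers (signatures inferred from usage):
#   glove_api.get_embedding_for_word(word)  -> np.ndarray or None
#   glove_api.semantic_distance(v1, v2)     -> float, higher = more similar
#   nlp.curate_string(s)                    -> cleaned, space-separated string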
def does_it_keep_group_coherent(running_group, a, b, threshold):
    # An empty group is trivially coherent with any new pair.
    if len(running_group) == 0:
        return True
    av = glove_api.get_embedding_for_word(a)
    bv = glove_api.get_embedding_for_word(b)
    # Both words must clear the threshold against every current member,
    # not just the first one inspected.
    for el in running_group:
        elv = glove_api.get_embedding_for_word(el)
        if glove_api.semantic_distance(elv, av) <= threshold:
            return False
        if glove_api.semantic_distance(elv, bv) <= threshold:
            return False
    return True
Example #2
def extract_cohesive_groups1(table_name, attrs):
    # Tokenize the table name and the attribute names into one token set.
    tokens = set()
    ctb = nlp.curate_string(table_name)
    tokens |= set(ctb.split(' '))
    for attr in attrs:
        cattr = nlp.curate_string(attr)
        tokens |= set(cattr.split(' '))

    # Keep non-stopword tokens longer than one character that have an embedding;
    # compute the stopword set and each embedding only once.
    english_stopwords = set(stopwords.words('english'))
    token_vector = []
    for t in tokens:
        if t in english_stopwords or len(t) <= 1:
            continue
        v = glove_api.get_embedding_for_word(t)
        if v is not None:
            token_vector.append((t, v))

    threshold = 0.5

    # Any pair of tokens above the similarity threshold joins the single group.
    group = set()
    for a, b in itertools.combinations(token_vector, 2):
        sim = glove_api.semantic_distance(a[1], b[1])
        if sim > threshold:
            group.add(a[0])
            group.add(b[0])

    return [(threshold, group)]
Example #3
def groupwise_semantic_sim(sv1, sv2, threshold):
    to_ret = False  # default: an empty cross product yields False
    for a, b in itertools.product(sv1, sv2):
        sim = glove_api.semantic_distance(a, b)
        if sim < threshold:
            # Terminate as soon as one pair falls below the threshold.
            return False
        to_ret = True  # at least one pair was compared, so the default flips to True
    return to_ret
Example #4
def compute_internal_cohesion(sv):
    # Internal cohesion: mean pairwise similarity across all vectors in sv.
    semantic_sim_array = []
    for a, b in itertools.combinations(sv, 2):
        sem_sim = glove_api.semantic_distance(a, b)
        semantic_sim_array.append(sem_sim)
    coh = 0
    if len(semantic_sim_array) > 0:  # guard against an empty slice
        coh = np.mean(semantic_sim_array)
    return coh
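As a quick standalone illustration of the cohesion computation above, the following sketch stands in for glove_api.semantic_distance with plain cosine similarity; both the stub and the toy vectors are assumptions for demonstration only:

import itertools
import numpy as np

def cosine_similarity(v1, v2):
    # Stand-in for glove_api.semantic_distance in this sketch (an assumption).
    return float(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))

vectors = [np.array([1.0, 0.0]), np.array([0.9, 0.1]), np.array([0.0, 1.0])]
pairwise = [cosine_similarity(a, b) for a, b in itertools.combinations(vectors, 2)]
print(np.mean(pairwise))  # internal cohesion: the mean of the 3 pairwise similarities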
Example #5
def compute_sem_distance_with(x, sv):
    # Mean similarity between vector x and every vector in sv.
    semantic_sim_array = []
    for el in sv:
        if x is not None and el is not None:
            sem_sim = glove_api.semantic_distance(x, el)
            semantic_sim_array.append(sem_sim)
    ssim = 0
    if len(semantic_sim_array) > 0:  # guard against an empty slice
        ssim = np.mean(semantic_sim_array)
    return ssim
Example #6
def compute_internal_cohesion_elementwise(x, sv):
    # Mean similarity between vector x and every vector in sv.
    semantic_sim_array = []
    for el in sv:
        if x is not None and el is not None:
            sem_sim = glove_api.semantic_distance(x, el)
            semantic_sim_array.append(sem_sim)
    coh = 0
    if len(semantic_sim_array) > 0:  # guard against an empty slice
        coh = np.mean(semantic_sim_array)
    return coh
Example #7
def extract_cohesive_groups(table_name,
                            attrs,
                            sem_sim_threshold=0.7,
                            group_size_cutoff=0):
    def does_it_keep_group_coherent(running_group, a, b, threshold):
        # An empty group is trivially coherent with any new pair.
        if len(running_group) == 0:
            return True
        av = glove_api.get_embedding_for_word(a)
        bv = glove_api.get_embedding_for_word(b)
        # Both words must clear the threshold against every current member,
        # not just the first one inspected.
        for el in running_group:
            elv = glove_api.get_embedding_for_word(el)
            if glove_api.semantic_distance(elv, av) <= threshold:
                return False
            if glove_api.semantic_distance(elv, bv) <= threshold:
                return False
        return True

    tokens = set()
    ctb = nlp.curate_string(table_name)
    tokens |= set(ctb.split(' '))
    for attr in attrs:
        cattr = nlp.curate_string(attr)
        tokens |= set(cattr.split(' '))
    english_stopwords = set(stopwords.words('english'))
    tokens = [t for t in tokens if t not in english_stopwords and len(t) > 1]

    running_groups = [set()]
    for a, b in itertools.combinations(tokens, 2):
        av = glove_api.get_embedding_for_word(a)
        bv = glove_api.get_embedding_for_word(b)
        if av is None or bv is None:
            continue
        sim = glove_api.semantic_distance(av, bv)
        if sim > sem_sim_threshold:  # try to add to existing group
            added_to_existing_group = False
            for running_group in running_groups:
                ans = does_it_keep_group_coherent(running_group, a, b,
                                                  sem_sim_threshold)
                if ans:  # Add to as many groups as necessary
                    added_to_existing_group = True
                    running_group.add(a)
                    running_group.add(b)
            if not added_to_existing_group:
                # Seed a brand-new group with this pair.
                running_groups.append({a, b})

    return [(sem_sim_threshold, group) for group in running_groups
            if len(group) > group_size_cutoff]
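A hypothetical call to the function above might look like the following sketch; the table and attribute names are invented, and it assumes the embedding model behind glove_api has already been loaded:

# Hypothetical usage of extract_cohesive_groups (invented names; assumes
# the embedding model behind glove_api is already loaded).
groups = extract_cohesive_groups('employee_salary',
                                 ['first_name', 'last_name', 'annual_pay'],
                                 sem_sim_threshold=0.7,
                                 group_size_cutoff=1)
for threshold, group in groups:
    print(threshold, sorted(group))  # the groups found depend on the embeddings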
Example #8
def extract_cohesive_groups2(table_name, attrs):
    def maybe_add_new_set(groups, current):
        # Filter duplicate sets, and subsumed sets as well.
        _, current_set = current
        for _, set_attrs in groups:
            if len(current_set) == len(set_attrs) and len(current_set - set_attrs) == 0:
                return  # exact duplicate: return without adding
            if len(current_set) > len(set_attrs):
                if len(set_attrs - current_set) == 0:
                    return  # one set subsumes the other: skip
            else:
                if len(current_set - set_attrs) == 0:
                    return  # current is subsumed by an existing set: skip
        groups.append(current)  # otherwise add and finish

    groups = []
    tokens = set()
    ctb = nlp.curate_string(table_name)
    tokens |= set(ctb.split(' '))
    for attr in attrs:
        cattr = nlp.curate_string(attr)
        tokens |= set(cattr.split(' '))
    english_stopwords = set(stopwords.words('english'))
    tokens = [t for t in tokens if t not in english_stopwords and len(t) > 1]

    threshold = 0.7
    for anchor in tokens:
        # Look up the anchor embedding once per anchor, not once per pair.
        anchor_v = glove_api.get_embedding_for_word(anchor)
        if anchor_v is None:
            continue
        # current keeps (score, set): the cohesiveness score and the tokens that honor it.
        current = (threshold, set())
        for t in tokens:
            if anchor == t:  # not interested in self-comparison
                continue
            t_v = glove_api.get_embedding_for_word(t)
            if t_v is not None:
                ss = glove_api.semantic_distance(anchor_v, t_v)
                if ss > current[0]:
                    current[1].add(anchor)
                    current[1].add(t)
        if len(current[1]) > 0:
            maybe_add_new_set(groups, current)
    return groups
Example #9
def compute_semantic_similarity_min_average(sv1, sv2):
    # For each vector in sv1, take its minimum similarity against sv2,
    # then average those minima (a pessimistic, asymmetric aggregate).
    global_sim = []
    for v1 in sv1:
        local_sim = [glove_api.semantic_distance(v1, v2) for v2 in sv2]
        if len(local_sim) == 0:
            continue
        global_sim.append(min(local_sim))
    gs = 0
    if len(global_sim) > 0:  # the mean of a single value is the value itself
        gs = np.mean(global_sim)
    return gs
Example #10
def compute_semantic_similarity(sv1,
                                sv2,
                                penalize_unknown_word=False,
                                add_exact_matches=True):
    accum = []
    for a, b in itertools.product(sv1, sv2):
        if a is not None and b is not None:
            # Skip identical vectors unless exact matches are allowed to count.
            if add_exact_matches or not (a == b).all():
                sim = glove_api.semantic_distance(a, b)
                accum.append(sim)
        elif penalize_unknown_word:  # one side is None: count the pair as similarity 0
            accum.append(0)
    sim = 0
    if len(accum) > 0:
        sim = np.mean(accum)
    return sim
Example #11
def compute_semantic_similarity_median(sv1, sv2):
    # Median of per-vector medians: robust to outlier similarities.
    global_sim = []
    for v1 in sv1:
        local_sim = [glove_api.semantic_distance(v1, v2) for v2 in sv2]
        ls = 0
        if len(local_sim) > 0:  # the median of a single value is the value itself
            ls = np.median(local_sim)
        global_sim.append(ls)
    gs = 0
    if len(global_sim) > 0:
        gs = np.median(global_sim)
    return gs
Example #12
def compute_semantic_similarity(sv1,
                                sv2,
                                penalize_unknown_word=False,
                                add_exact_matches=True,
                                signal_strength_threshold=0.5):
    total_comparisons = 0
    skipped_comparisons = 0
    accum = []
    for a, b in itertools.product(sv1, sv2):
        if a is not None and b is not None:
            # Skip identical vectors unless exact matches are allowed to count.
            if add_exact_matches or not (a == b).all():
                total_comparisons += 1
                sim = glove_api.semantic_distance(a, b)
                accum.append(sim)
            else:
                skipped_comparisons += 1
        elif penalize_unknown_word:  # one side is None: count the pair as similarity 0
            skipped_comparisons += 1
            accum.append(0)
    sim = 0
    if len(accum) > 0:
        sim = np.mean(accum)

    strong_signal = False
    # With no real comparisons we cannot judge semantics: the words are not in the dict.
    if total_comparisons == 0:
        # Exception: if either side has more than two words, assume that many
        # words convey enough "meaning" to treat the signal as strong anyway.
        if len(sv1) > 2 or len(sv2) > 2:
            return sim, True
        return sim, strong_signal
    # total_comparisons > 0 here, so the denominator is never zero.
    total_of_all_comparisons = skipped_comparisons + total_comparisons
    ratio_of_strong_signal = total_comparisons / total_of_all_comparisons

    # If not many comparisons were skipped, then this is a strong signal.
    if ratio_of_strong_signal >= signal_strength_threshold:
        strong_signal = True

    return sim, strong_signal
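To illustrate the two-value return contract of this variant, here is a hypothetical usage sketch; the words are invented and, as above, it assumes the embedding model behind glove_api is loaded:

# Hypothetical usage (invented words; assumes glove_api is initialized).
sv1 = [glove_api.get_embedding_for_word(w) for w in ['salary', 'pay']]
sv2 = [glove_api.get_embedding_for_word(w) for w in ['wage', 'compensation']]
sim, strong_signal = compute_semantic_similarity(sv1, sv2, penalize_unknown_word=True)
if strong_signal:  # enough real comparisons were made to trust the score
    print('semantic similarity:', sim)
else:
    print('too many unknown words to judge the similarity')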