Example #1
File: Utils.py Project: lwheng/fyp
 def jaccard(self, inputA, inputB):
   if type(inputA) == str or type(inputA) == unicode:
     inputA_tokens = inputA.lower().split()
     inputB_tokens = inputB.lower().split()
     return distance.jaccard_distance(set(inputA_tokens), set(inputB_tokens))
   elif type(inputA) == nltk.text.Text:
     inputA_tokens = map(lambda x: x.lower(), inputA.vocab().keys())
     inputB_tokens = map(lambda x: x.lower(), inputB.vocab().keys())
     return distance.jaccard_distance(set(inputA_tokens), set(inputB_tokens))
   elif type(inputA) == list:
     inputA_tokens = map(lambda x: x.lower(), inputA)
     inputB_tokens = map(lambda x: x.lower(), inputB)
     return distance.jaccard_distance(set(inputA_tokens), set(inputB_tokens))
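Most of the examples on this page reduce to the same core call: NLTK's jaccard_distance(A, B), which computes 1 - |A ∩ B| / |A ∪ B| over two sets. A minimal, self-contained sketch (the strings are made up for illustration):

from nltk.metrics.distance import jaccard_distance

a = set("night")   # {'n', 'i', 'g', 'h', 't'}
b = set("nacht")   # {'n', 'a', 'c', 'h', 't'}
# the intersection {'n', 'h', 't'} has 3 members and the union has 7,
# so the distance is 1 - 3/7 ≈ 0.571
print(jaccard_distance(a, b))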
Example #2
def calculate(item, manifesto_bow):
    article_id, article_dict = item
    if data_io.DATA_FIELD in article_dict and len(
            article_dict[data_io.DATA_FIELD]) > 0:
        printv('\tProcessing ' + str(article_id) + ' with ' +
               str(len(article_dict[data_io.DATA_FIELD])) + ' words.')
        if SKIP_BOW_CREATION:
            distance = jaccard_distance(set(manifesto_bow),
                                        set(article_dict[data_io.DATA_FIELD]))
        else:
            article_bow = create_clean_bow(article_dict[data_io.DATA_FIELD])
            distance = jaccard_distance(set(manifesto_bow), set(article_bow))
        return (article_dict[data_io.DATE_FIELD] + '-' + article_id, distance)
Example #3
def main():
    assert len(sys.argv) == 4
    reader = Seq2SeqDatasetReader(source_tokenizer=NoOpTokenizer(), target_tokenizer=NoOpTokenizer())
    train = reader.read(sys.argv[1])
    val = reader.read(sys.argv[2])
    test = reader.read(sys.argv[3])

    generator = Generator()
    rules, rules_anon, rules_ground, semantics, entities = load_all_2018(generator, GRAMMAR_DIR)
    anonymizer = Anonymizer(*entities)


    neighbors = []
    for x in itertools.chain(train, val):
        command = str(x["source_tokens"][1:-1][0])
        form = str(x["target_tokens"][1:-1][0])
        anon_command = anonymizer(command)
        neighbors.append((anon_command, form))

    test_pairs = []
    for x in test:
        test_pairs.append((str(x["source_tokens"][1:-1][0]), str(x["target_tokens"][1:-1][0])))

    print("Check grammar membership")
    naive_parser = GrammarBasedParser(rules_anon)
    anon_parser = AnonymizingParser(naive_parser, anonymizer)

    correct, parsed = bench_parser(anon_parser, test_pairs)
    print("Got {} of {} ({:.2f})".format(parsed, len(test_pairs), 100.0 * parsed / len(test_pairs)))

    print("Jaccard distance")
    sweep_thresh(neighbors, test_pairs, anonymizer, lambda x, y: jaccard_distance(set(x.split()), set(y.split())),
                 [0.1 * i for i in range(11)])
    print("Edit distance")
    sweep_thresh(neighbors, test_pairs, anonymizer, editdistance.eval)
Example #4
    def combineClusters(clusters, **twitter_stream_settings):
        def getHashtagSet(vector):
            return set([
                word for dimension in vector for word in dimension.split()
                if word.startswith('#')
            ])

        def getClusterInt(id):
            return int(id.split('_')[1])

        mergedClustersMap = {}
        for cluster in [
                clusters[v] for v in sorted(clusters, key=getClusterInt)
        ]:
            mergedClusterId = None
            for mergedCluster in mergedClustersMap.itervalues():
                clusterHashtags = getHashtagSet(cluster)
                mergedClusterHashtags = getHashtagSet(mergedCluster)
                threshold = twitter_stream_settings[
                    'cluster_merging_jaccard_distance_threshold']
                if clusterHashtags.union(mergedClusterHashtags) and \
                        jaccard_distance(clusterHashtags,
                                         mergedClusterHashtags) <= 1 - threshold:
                    mergedCluster.mergeCluster(cluster)
                    mergedCluster.mergedClustersList.append(cluster.clusterId)
                    mergedClusterId = mergedCluster.clusterId
                    break
            if mergedClusterId is None:
                mergedCluster = StreamCluster.getClusterObjectToMergeFrom(
                    cluster)
                mergedCluster.mergedClustersList = [cluster.clusterId]
                mergedClustersMap[mergedCluster.clusterId] = mergedCluster
        return mergedClustersMap
Example #5
 def jaccard(self, entry, gram_number):
     spellings = self.words[self.words.str.startswith(entry[0])]
     distances = ((jaccard_distance(set(ngrams(entry, gram_number)),
                                    set(ngrams(word, gram_number))), word)
                  for word in spellings)
     closest = min(distances)
     return closest[1]
Example #6
def mainFunction(listaTagsTreino, listaFrasesTreino,
                 listaFrasesDesenvolvimento):
    results = []
    bestSentences = []
    i = 0
    while i < len(listaFrasesDesenvolvimento):
        j = 0
        best = 1000
        tagId = "VOID"
        bestSentence = ""
        while j < len(listaFrasesTreino):
            # It is really a distance and not a similarity measure (1-similarity)
            result = jaccard_distance(
                set(listaFrasesTreino[j].split()),
                set(listaFrasesDesenvolvimento[i].split()))
            #result = edit_distance(listaFrasesTreino[j].split(), listaFrasesDesenvolvimento[i].split())
            #print(result)
            if result < best:
                tagId = listaTagsTreino[j]
                bestSentence = listaFrasesTreino[j]
                best = result
            j = j + 1
        results.append(tagId)
        bestSentences.append(bestSentence)
        i = i + 1
    return results, bestSentences
Example #7
def jaccard_team(input_team, all_teams):
    """
	Trova il giocatore corrispondente a quello inserito dall'user.

	:param input_player: str

	:param all_players: list of str


	:return jac_player: str

	"""

    dist = 10
    tri_guess = set(ngrams(input_team[:3].upper(), 2))
    jac_team = ''

    for tm in all_teams:
        p = tm.replace(' ', '')
        trit = set(ngrams(p, 2))
        jd = jaccard_distance(tri_guess, trit)
        if not jd:
            return tm
        elif jd < dist:
            dist = jd
            jac_team = tm

    return jac_team
Example #8
 def compute_lcs(self, doc1, doc2):
     LCS, MCLCS1, MCLCSN = self.lcs(doc1, doc2)
     jaccard_score = 1 - jaccard_distance(set(doc1), set(doc2))
     #score =  ( LCS +   MCLCSN + jaccard_score)/3.0
     score = 0.1 * LCS + 0.3 * MCLCSN + 0.6 * jaccard_score
     #score = jaccard_score
     return score
Example #9
    def testLSH(self):
        strings = [
                   "abcdefghijklmnopqrstuvwxyz",
                   "abcdefghijklmnopqrstuvw",
                   "defghijklmnopqrstuvw",
                   "zyxwvutsrqponmlkjihgfedcba",
                   "1abcdefghijklmnopuvw1",
                   "123456789",
                   "012345678",
                   "234567890",
                   ]
        for i, a in enumerate(strings):
            for j, b in enumerate(strings[i+1:]):
                print "'%s' (%d) <=> (%d)'%s': %f" % (a, i,j+i+1, b, 1-jaccard_distance(set(a),set(b)))

        random.seed(12345)
        lsh = LSHCache(shingler=Shingler(1))
        self.assertListEqual([set(),
                              set([0]),
                              set([0,1]),
                              set([0,1,2]),
                              set([0,1,2,3]),
                              set(),
                              set([5]),
                              set([5,6])], lsh.insert_batch(strings))
Example #10
def jaccard_player(input_player, all_players):
    """
	Trova il giocatore corrispondente a quello inserito dall'user.

	:param input_player: str

	:param all_players: list of str


	:return jac_player: str

	"""

    dist = 10
    tri_guess = set(ngrams(input_player.upper(), 3))
    jac_player = ''

    for pl in all_players:
        p = pl.replace(' ', '')
        trit = set(ngrams(p, 3))
        jd = jaccard_distance(tri_guess, trit)
        if not jd:
            return pl
        elif jd < dist:
            dist = jd
            jac_player = pl

    return jac_player
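Examples 7 and 10 rely on the same early exit: jaccard_distance returns 0.0 for identical n-gram sets, so "if not jd" short-circuits on a perfect match. Note that spaces are stripped from the candidates but not from the input, so even an exact multi-word match can score a small nonzero distance. A hypothetical call (the player list is invented; assumes ngrams and jaccard_distance are imported as in the snippet):

players = ['CRISTIANO RONALDO', 'RONALDINHO', 'ROBERTO CARLOS']
print(jaccard_player('ronaldinho', players))  # trigram sets match exactly, returns 'RONALDINHO'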
Example #11
def jaccard(string1, string2):
    '''
    Jaccard distance
    '''
    return jaccard_distance(
        set(string1.split()),
        set(string2.split())
    )
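Example 11 works at the token level rather than the character level: each string is split on whitespace and the resulting word sets are compared. For instance (sentences invented; assumes jaccard_distance is imported):

s1 = "the cat sat on the mat"
s2 = "the cat lay on the rug"
# the word sets share {'the', 'cat', 'on'}: 3 of the 7 union members,
# so the call returns 1 - 3/7 ≈ 0.571
print(jaccard(s1, s2))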
Example #12
def get_jaccard_sim(text1, text2):
#    countvect = TfidfVectorizer(stop_words='english', ngram_range=(1,2), binary=True)
#    countvect.fit([text1, text2])
    t1 = _remove_tags(clean_tweet(text1))
    t2 = _remove_tags(clean_tweet(text2))
    tokens1 = t1.split()
    tokens2 = t2.split()
    
    return 1-jaccard_distance(set(tokens1), set(tokens2))
Example #13
def extract_misc_attribute_businesses(misc_attributes_from_question,
                                      extracted_business_dictionary, question):

    #print 'misc attribute', misc_attributes_from_question
    candidate_businesses = {}
    distances_from_attributes = {}

    for attribute in misc_attributes_from_question:
        for token in question.split():
            # keep the smallest character-set distance between the attribute
            # and any token (the original compared against the whole question,
            # leaving `token` unused)
            distance = jaccard_distance(set(attribute), set(token))
            if (attribute not in distances_from_attributes
                    or distance < distances_from_attributes[attribute]):
                distances_from_attributes[attribute] = distance

    sorted_distances = sorted(distances_from_attributes.items(),
                              key=operator.itemgetter(1))

    #print('Highest distances')
    #pprint (sorted_distances)

    for i in range(len(sorted_distances[:1])):
        best_attribute = sorted_distances[i]

        for business in extracted_business_dictionary:
            if business['attributes']:
                for attribute in business['attributes']:
                    if str(attribute.split(':')[0]) == ''.join(
                            best_attribute[0].split()):
                        eliminated = ['No', 'None', 'False']
                        if attribute.split(':')[1] not in eliminated:
                            #print 'Present',business
                            candidate_businesses[
                                business['business_id']] = business['stars']

    #print 'Candidate Businesses', candidate_businesses

    sorted_businesses = sorted(candidate_businesses.items(),
                               key=operator.itemgetter(1))[::-1]

    #sorted_businesses = [dict1 for dict1 in extracted_business_dictionary]

    #Extract businesses sorted by ratings
    sorted_business_ids = [
        business_id for business_id, ratings in sorted_businesses
    ]

    ranked_businesses = []
    for business_id in sorted_business_ids:
        ranked_businesses += [
            dictionary for dictionary in extracted_business_dictionary
            if dictionary['business_id'] == business_id
        ]

    if len(ranked_businesses) == 0:
        return extracted_business_dictionary, 'No'

    else:
        return ranked_businesses, 'Yes'
Example #14
def jaccard(entries, gram_number):
    outcomes = []
    for entry in entries:
        spellings = spellings_series[spellings_series.str.startswith(entry[0])]
        distances = ((jaccard_distance(set(ngrams(entry, gram_number)),
                                       set(ngrams(word, gram_number))), word)
                     for word in spellings)
        closest = min(distances)
        outcomes.append(closest[1])
    return outcomes
Example #15
def answer_nine(entries=['cormulent', 'incendenece', 'validrate']):
    gram_num = 3
    recommendations = []
    for entry in entries:
        words = correct_series[correct_series.str.startswith(entry[0])]
        distances = ((jaccard_distance(set(ngrams(entry, gram_num)),
                                       set(ngrams(word, gram_num))), word)
                     for word in words)
        closest = min(distances)
        recommendations.append(closest[1])
    return recommendations  # Your answer here
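Examples 5, 14 and 15 follow the same spell-correction pattern: build character n-gram sets for the misspelled entry and for every candidate, then keep the candidate with the smallest Jaccard distance. A self-contained sketch with an invented candidate list:

from nltk.metrics.distance import jaccard_distance
from nltk.util import ngrams

entry = 'cormulent'
candidates = ['corpulent', 'corresponding', 'comment']  # illustrative only
distances = ((jaccard_distance(set(ngrams(entry, 3)),
                               set(ngrams(w, 3))), w) for w in candidates)
print(min(distances))  # (0.6, 'corpulent'): the most shared trigrams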
Example #16
def getSuggestedWords(search_tokens):
    suggestion = []
    for word in search_tokens:
        if not (d.check(word)):
            poss_suggest = d.suggest(word)[0:4]
            # fine-tune to pick best suggestion using jaccard_distance
            dists = [jaccard_distance(set(w), set(word)) for w in poss_suggest]
            suggestion.append(poss_suggest[dists.index(min(dists))])
        else:
            suggestion.append(word)
    return suggestion
Example #17
def answer_nine(entries=['cormulent', 'incendenece', 'validrate']):
    from nltk.metrics.distance import jaccard_distance
    from nltk.util import ngrams

    # return # Your answer here
    results = []
    for entry in entries:
        temp = [(jaccard_distance(set(ngrams(entry, 3)), set(ngrams(w, 3))), w)
                for w in correct_spellings if w[0] == entry[0]]
        recommended_ = sorted(temp, key=lambda val: val[0])[0][1]
        results.append(recommended_)
    return results
Example #18
def jaccard(misspelled_words, gram_number):
    correct_spellings = pd.Series(words.words())
    outcomes = []
    for entry in misspelled_words:
        words_starting_with = correct_spellings[correct_spellings.str.startswith(entry[0])]
        scoreWordPairs = [(word, jaccard_distance(
                                    set(ngrams(word, gram_number)),
                                    set(ngrams(entry, gram_number))
                                    )
                            ) for word in words_starting_with]
        closest = min(scoreWordPairs, key=lambda x: x[1])
        outcomes.append(closest[0])
    return outcomes
Example #19
def jaccard_result(name_to_fix: str, all_options: list, ngrams_length: int):

    name_to_correct = name_to_fix.lower().replace(' ', '')
    n_in = set(ngrams(name_to_correct, ngrams_length))

    out_opts = [pl.lower().replace(' ', '') for pl in all_options]
    n_outs = [set(ngrams(pl, ngrams_length)) for pl in out_opts]

    distances = [jaccard_distance(n_in, n_out) for n_out in n_outs]

    if len(set(distances)) == 1 and distances[0] == 1:
        return jaccard_result(name_to_correct, all_options, ngrams_length - 1)
    else:
        return np.array(all_options)[np.argsort(distances)][:3]
Example #20
def answer_ten(entries=['cormulent', 'incendenece', 'validrate']):
    from nltk.metrics.distance import (
        jaccard_distance, )
    from nltk.util import ngrams
    spellings_series = pd.Series(correct_spellings)
    correct = []
    for entry in entries:
        spellings = spellings_series[spellings_series.str.startswith(entry[0])]
        distances = ((jaccard_distance(set(ngrams(entry, 4)),
                                       set(ngrams(word, 4))), word)
                     for word in spellings)
        closest = min(distances)
        correct.append(closest[1])

    return correct
Example #21
def jaccard_result(in_opt: str, all_opt: list, ngrm: int) -> str:

    in_opt = in_opt.lower().replace(' ', '')
    n_in = set(ngrams(in_opt, ngrm))

    out_opts = [pl.lower().replace(' ', '') for pl in all_opt]
    n_outs = [set(ngrams(pl, ngrm)) for pl in out_opts]

    distances = [jaccard_distance(n_in, n_out) for n_out in n_outs]

    if len(set(distances)) == 1:
        return jaccard_result(in_opt, all_opt, ngrm - 1) if ngrm > 2 else ''
    else:
        idx = int(np.argmin(distances))
        return all_opt[idx]
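Example 21's fallback deserves a note: when every option is equally distant (typically because the input is too short to share any n-grams), it retries with a smaller n, giving up once n would drop below 2. A hypothetical run (team names invented; assumes numpy, ngrams and jaccard_distance are imported as in the snippet):

teams = ['Inter', 'Juventus', 'Milan']
print(jaccard_result('juve', teams, 3))   # shared trigrams pick 'Juventus'
print(jaccard_result('zz', teams, 3))     # no overlap at n=3 or n=2, returns ''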
Example #22
def doYouMean(keyword):
    from nltk.corpus import words
    correct_spellings = words.words()

    from nltk.metrics.distance import jaccard_distance
    from nltk.util import ngrams
    result = ''
    for key in keyword.split():
        if len(key) > 1:
            temp = [(jaccard_distance(set(ngrams(key, 2)), set(ngrams(w, 2))), w)
                    for w in correct_spellings if w[0] == key[0]]
            result += sorted(temp, key=lambda val: val[0])[0][1] + ' '
        else:
            result += key + ' '
    return result
Example #23
def answer_nine(entries=['cormulent', 'incendenece', 'validrate']):
    from nltk.metrics.distance import (edit_distance, jaccard_distance)
    from nltk.util import ngrams

    df = pd.Series(data=correct_spellings)
    res = []
    for entry in entries:
        words = [word for word in df if word.startswith(entry[0])]
        distances = ((jaccard_distance(set(ngrams(entry, 3)),
                                       set(ngrams(word, 3))), word)
                     for word in words)

        closest = min(distances)

        res.append(closest[1])
    return res
Example #25
def JaccardSimAndMasiDis(text1, text2, stop_words=False):
    word1list = word_tokenize(text1)
    word2list = word_tokenize(text2)

    if stop_words:
        word1list = [
            word.lower() for word in word1list if word not in StopWords
        ]
        word2list = [
            word.lower() for word in word2list if word not in StopWords
        ]

    word1set = set(word1list)
    word2set = set(word2list)

    return 1 - jaccard_distance(
        word1set, word2set)  #, 1 - masi_distance(word1set, word2set)
Example #26
def predict_labels(test_questions, known_questions, coarseness):
    labels = []
    n_known_questions = len(known_questions)
    for test_question in test_questions:
        smallest_dist = 5000
        closest_question = None
        for i in range(n_known_questions):
            dist = jaccard_distance(set(test_question),
                                    set(known_questions[i].question))
            if dist < smallest_dist:
                smallest_dist = dist
                closest_question = known_questions[i]
        if closest_question is not None:
            labels.append(get_label(closest_question, coarseness))
        else:
            labels.append(None)
    return labels
Example #27
def answer_ten(entries=['cormulent', 'incendenece', 'validrate']):

    from nltk.metrics.distance import jaccard_distance
    from nltk.util import ngrams
    correct_words = []

    for e in entries:
        min_dist = 1
        closest = None
        for c in correct_spellings:
            if c[0] == e[0]:
                d = jaccard_distance(set(ngrams(e, n=4)), set(ngrams(c, n=4)))
                if d < min_dist:
                    min_dist = d
                    closest = c
        correct_words = correct_words + [closest]

    return correct_words
Example #28
def answer_nine(entries=['cormulent', 'incendenece', 'validrate']):
    from nltk.metrics.distance import jaccard_distance
    from nltk.util import ngrams
    # set gram number
    N_g = 3
    
    spelling_df = pd.Series(correct_spellings)
    
    vals_returned = []
    for word in entries:
        spell = spelling_df[spelling_df.str.startswith(word[0])]
        dist_calced = ((jaccard_distance(set(ngrams(word, N_g)), set(ngrams(var, N_g))), var) for var in spell)
        #closest_val = min(dist_calced)
        #vals_returned.append(closest_val[1])
        vals_returned.append(min(dist_calced)[1])
    
    # This function should return a list of length three: 
    #   ['cormulent_reccomendation', 'incendenece_reccomendation', 'validrate_reccomendation'].
    return vals_returned
Example #29
def get_realtion_info(relation_candidate, remain_sentence):  # [name, relation, target_entity, target_entity_keyid]
    # temp_relations = ccksNeo.get_entity_info_by_keyid(entity_keyid)  # info for this entity
    # entity name, path, target entity
    # print(temp_relations)
    relation_info = []
    for candidate in relation_candidate:
        # for key, value in temp_relations.items():  # path name, target entity
        segmentor1 = Segmentor()
        segmentor1.load("./ltpdata/ltp_data_v3.4.0/cws.model")

        temp = list(segmentor1.segment(remain_sentence))
        segmentor1.release()
        guanxideci = jieba.cut(candidate[0])
        for word in guanxideci:
            if word in model and word in temp:
                temp.remove(word)
        '''
        segmentor2 = Segmentor()
        segmentor2.load("./ltpdata/ltp_data_v3.4.0/cws.model")
        temp2 = list(segmentor2.segment(candidate[1]))
        segmentor2.release()
        '''
        ##################jaccard
        temp2 = [candidate[1]]
        set1 = set(temp)
        set2 = set(temp2)
        jaccard = jaccard_distance(set1, set2)
        edit = difflib.SequenceMatcher(None, question, candidate[1]).ratio()
        print(temp, temp2)
        w2v = serviceWord2vec.get_similarity(temp, list(jieba.cut(candidate[1])))
        '''

        if key == c_relation_name:
            is_correct = 1
        else:
            is_correct = 0
        '''
        #
        relation_info.append([candidate[0], candidate[1], candidate[2], candidate[3], jaccard, edit, w2v])
        # entity, path name, target entity, jaccard distance, edit distance, vector similarity
    # print(relation_info)
    return relation_info
Example #30
def jaccard(entries, gram_number):
    """find the closet words to each entry

    Args:
     entries: collection of words to match
     gram_number: number of n-grams to use

    Returns:
     list: words with the closest jaccard distance to entries
    """
    outcomes = []
    for entry in entries:
        spellings = spellings_series[spellings_series.str.startswith(entry[0])]
        distances = ((jaccard_distance(set(ngrams(entry, gram_number)),
                                       set(ngrams(word, gram_number))), word)
                     for word in spellings)
        # distances is a generator
        closest = min(distances)
        outcomes.append(closest[1])
    return outcomes
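A hypothetical call for this function, assuming spellings_series is a pandas Series of correctly spelled words (here built from the NLTK 'words' corpus, which must be downloaded):

import pandas as pd
from nltk.corpus import words
from nltk.util import ngrams
from nltk.metrics.distance import jaccard_distance

spellings_series = pd.Series(words.words())
# for each entry, picks the candidate with the smallest trigram distance
print(jaccard(['cormulent', 'incendenece', 'validrate'], 3))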
Example #31
def answer_ten(entries=['cormulent', 'incendenece', 'validrate']):

    from nltk.metrics.distance import jaccard_distance
    from nltk.util import ngrams

    # build sets of n-grams of adjacent letters, e.g.
    # print(set(ngrams('cormulent', 3)))

    best = []
    for i, entry in enumerate(entries):
        spellings_check = [w for w in correct_spellings if w[0] == entry[0]]
        distances = [(entry, word,
                      jaccard_distance(set(ngrams(entry, 4)),
                                       set(ngrams(word, 4))))
                     for word in spellings_check]
        distances.sort(key=lambda tup: tup[2])
        best.append(distances[0])
    recommended = [word for _, word, _ in best]

    return recommended
Example #32
def jaccard_result(in_opt: str, all_opt: list, ngrm: int) -> str:
    """
    Fix user input.
    """

    in_opt = in_opt.lower().replace(' ', '')
    n_in = set(ngrams(in_opt, ngrm))

    out_opts = [pl.lower().replace(' ', '').replace('+', '') for pl in all_opt]
    n_outs = [set(ngrams(pl, ngrm)) for pl in out_opts]

    if in_opt in out_opts:
        return all_opt[out_opts.index(in_opt)]

    distances = [jaccard_distance(n_in, n_out) for n_out in n_outs]

    if len(set(distances)) == 1:
        return jaccard_result(in_opt, all_opt, ngrm - 1) if ngrm > 2 else ''
    else:
        return all_opt[np.argmin(distances)]
Example #33
def jaccard_distance_similarity(lhs, entities):
    min_similarity = [
        ('', '', 1000000.0),
    ]

    for entity in entities:
        similarity = distance.jaccard_distance(
            set(ngrams(lhs, n=2)), set(ngrams(entity['author'], n=2)))
        if similarity < min_similarity[0][2]:
            min_similarity = [
                (entity['author'], entity['url'], similarity),
            ]
        elif min_similarity[0][2] == similarity:
            min_similarity.append(
                (entity['author'], entity['url'], similarity))

    print(
        'Jaccard distance with {0} over {1} entities yields {2} closest match(es) at minimum distance {3}'
        .format(lhs, len(entities), len(min_similarity), min_similarity[0][2]))

    return min_similarity
Example #34
a = wordnet.synsets('tone')[4]
b = wordnet.synsets('color')[0]

wordnet.similarity(a, b)

a = ['this', 'is', 'a', 'test']
b = ['this', 'was', 'a', 'test']

edit_distance(a, b)

jaccard_distance(set(a), set(b))

masi_distance(set(a), set(b))

from pattern.web import DBPedia

sparql = '\n'.join((
    'prefix dbo: <http://dbpedia.org/ontology/>',
    'select ?person ?place where {',
Example #35
@author: space
'''
import argparse
import logging
import random
import itertools as it
import functools as ft
from lsh import LSHCache, XORHashFamily, MultiplyHashFamily, Shingler
from nltk.metrics.distance import jaccard_distance, masi_distance, edit_distance

minhash_choices = { 'xor': XORHashFamily,
                    'multiply': MultiplyHashFamily,
                  }

similarity_choices = { 'jaccard': lambda a,b,s: 1 - jaccard_distance(set(s.shingle(a)), set(s.shingle(b))),
                       'masi': lambda a,b,s: 1 - masi_distance(set(s.shingle(a)), set(s.shingle(b))),
                       'edit': lambda a,b,s: 1 - float(edit_distance(a,b))/max(len(a),len(b)),
                       'edit_transposition': lambda a,b,s: 1-float(edit_distance(a,b,True))/max(len(a),len(b)) }

generator_choices = { 'combinations': it.combinations,
                      'combinations_replacement': it.combinations_with_replacement,
                      'permutations': it.permutations }

def parse_args(argv=None):
    parser = argparse.ArgumentParser(description="Analyze performance of LSH over a mock generated data set")

    lsh_group = parser.add_argument_group('LSH Cache parameters')
    
    lsh_group.add_argument("-b", "--num-bands", type=int,
                        help="""number of bands in LSH cache""")
Example #36
def jacquard_sim(text1, text2):
    set1 = set(tokenizer(text1))
    set2 = set(tokenizer(text2))
    sim = jaccard_distance(set1, set2)  # note: a distance, not a similarity
    return sim
Example #37
def search_misawa(meigens, targetSentence, retR=False,
        method='masi', model=None, dictionary=None):
    """
    MASI距離によりベストなミサワを探す関数
    - IN  : 名言リスト、解析対象文章
    - OUT : 画像のURL
    """
    targetWords = mecab_func.breakdown_into_validwords(targetSentence)
    
    if len(targetWords) <= 2 or len(targetWords) >= 30:
        logger.warning("bad tweet for misawa-recommend")
        if retR:
            return 1., None
        else:
            return (1.)

    # the input sentence is analyzable; search for the best match
    hit = False
    minr = 1.0
    matched_inf = {}
    cnt = 0

    for meigen in meigens:

        words = meigen['words']

        if method == 'jaccard':
            # similarity by Jaccard distance: smaller means more similar
            r = jaccard_distance(set(targetWords), set(words))
        elif method == 'masi':
            # similarity by MASI distance: smaller means more similar
            r = masi_distance(set(targetWords), set(words))
        elif method[0:3] in ['lsi', 'lda', 'LSI', 'LDA']:
            # cosine similarity, negated so smaller means more similar
            vec = model[dictionary.doc2bow(targetWords)]
            r = -1.*matutils.cossim(meigen[method], vec)
        elif method[0:3] in ['d2v', 'doc']:
            # cosine similarity, negated so smaller means more similar
            r = -1.*d2v_similarity(targetWords, words, model)

        if r < minr:
            hit = True
            minr = r
            matched_inf = meigen
        cnt = cnt + 1

    # edge case: the distance to every quote is 1.0
    if not hit:
        logger.info("no best match")
        if retR:
            return 1., None
        else:
            return (1.)

    logger.info("========calculation report========")
    logger.info("method: %s [r = %f]" % (method, minr))
    logger.info("input : %s %s" % (targetSentence.replace('\n', ' '), targetWords))
    logger.info('meigen: %s %s' % (matched_inf['body'].replace('\n', ' '), matched_inf['words']))

    if retR:
        # return value: MASI distance, full matched-quote info
        return minr, matched_inf
    else:
        # report
        # return value: image URL
        return matched_inf
Example #38
 def __init__(self,combo):
     self.f1,self.f2 = combo
     self.f1_set = set(self.clean_name(self.f1.name))
     self.f2_set = set(self.clean_name(self.f2.name))
     self.distance = jaccard_distance(self.f1_set,self.f2_set)
Example #39
# -*- coding: utf-8 -*-

from nltk.metrics.distance import jaccard_distance, masi_distance
from prettytable import PrettyTable

fields = ['X', 'Y', 'Jaccard(X,Y)', 'MASI(X,Y)']
pt = PrettyTable(fields)
[pt.set_field_align(f, 'l') for f in fields]

for z in range(4):
    X = set()
    for x in range(z, 4):
        Y = set()
        for y in range(1, 3):
            X.add(x)
            Y.add(y)
            pt.add_row([list(X), list(Y), round(jaccard_distance(X, Y), 2),
                       round(masi_distance(X, Y), 2)])
print(pt)
Example #40
# Features
top_entry = json_response[0]
true_matches = [bool(song['Match']) for song in json_response[1:]]

FEATURE = 'SongName'
NGRAMS = 2
top_entry_value = preproc(top_entry[FEATURE])
print 'Comparing song name to top match reference:', top_entry[FEATURE]
top_entry_word_bigrams = set(ngrams(word_tokenize(top_entry_value), NGRAMS))

matches = []
for song in json_response[1:]:

    this_value = preproc(song[FEATURE])
    print '\t%s' % song[FEATURE]

    this_word_bigrams = set(ngrams(word_tokenize(this_value), NGRAMS))
    wbg_distance = jaccard_distance(top_entry_word_bigrams, this_word_bigrams)
    print '\t\tWord bigrams + Jaccard:\t'+str(wbg_distance)

    is_this_match = is_match(wbg_distance)
    print '\t\tMatch?', is_this_match
    matches.append(is_this_match)

cm = ConfusionMatrix(true_matches, matches)
print 'Confusion matrix'
print cm

print 'Accuracy:', accuracy(true_matches, matches)
Example #41
def get_values(entities, domain):
    _random, bayes_random = {}, {}
    bayes_no_variation, bayes_variation = {}, {}
    siddharthan, deemter  = {}, {}

    for _id in entities:
        evaluation = p.load(open(os.path.join(properties.evaluation_dir, _id)))

        for fold in evaluation:
            if fold not in bayes_random:
                _random[fold] = {'y_real':[], 'y_pred':[], 'string':[], 'jaccard':[]}
                bayes_random[fold] = {'y_real':[], 'y_pred':[], 'string':[], 'jaccard':[]}
                bayes_no_variation[fold] = {'y_real':[], 'y_pred':[], 'string':[], 'jaccard':[]}
                bayes_variation[fold] = {'y_real':[], 'y_pred':[], 'string':[], 'jaccard':[]}
                siddharthan[fold] = {'y_real':[], 'y_pred':[], 'string':[], 'jaccard':[]}
                deemter[fold] = {'y_real':[], 'y_pred':[], 'string':[], 'jaccard':[]}

            for item in evaluation[fold]:
                item_domain = get_domain(item['features']['fname'])

                if domain == item_domain or domain == '':
                    string_real = item['real']['reference']
                    string_random = item['random']['reference']
                    string_bayes_random = item['bayes_random']['reference'][0][0]
                    string_bayes_no_variation = item['bayes_no_variation']['reference'][0][0]
                    string_bayes_variation = item['bayes_variation']['reference'][0][0]
                    string_siddharthan = item['siddharthan']['reference']
                    string_deemter = item['deemter']['reference']

                    dist_random = edit_distance(string_random, string_real)
                    dist_bayes_random = edit_distance(string_bayes_random, string_real)
                    dist_bayes_no_variation = edit_distance(string_bayes_no_variation, string_real)
                    dist_bayes_variation = edit_distance(string_bayes_variation, string_real)
                    dist_siddharthan = edit_distance(string_siddharthan, string_real)
                    dist_deemter = edit_distance(string_deemter, string_real)

                    tokens_real = set(nltk.word_tokenize(string_real))
                    tokens_random = set(nltk.word_tokenize(string_random))
                    tokens_bayes_random = set(nltk.word_tokenize(string_bayes_random))
                    tokens_bayes_no_variation = set(nltk.word_tokenize(string_bayes_no_variation))
                    tokens_bayes_variation = set(nltk.word_tokenize(string_bayes_variation))
                    tokens_siddharthan = set(nltk.word_tokenize(string_siddharthan))
                    tokens_deemter = set(nltk.word_tokenize(string_deemter))

                    jaccard_random = jaccard_distance(tokens_random, tokens_real)
                    jaccard_bayes_random = jaccard_distance(tokens_bayes_random, tokens_real)
                    jaccard_bayes_no_variation = jaccard_distance(tokens_bayes_no_variation, tokens_real)
                    jaccard_bayes_variation = jaccard_distance(tokens_bayes_variation, tokens_real)
                    jaccard_siddharthan = jaccard_distance(tokens_siddharthan, tokens_real)
                    jaccard_deemter = jaccard_distance(tokens_deemter, tokens_real)

                    bayes_random[fold]['y_real'].append(item['real']['label'])
                    bayes_random[fold]['y_pred'].append(item['bayes_random']['label'][0])
                    bayes_random[fold]['string'].append(dist_bayes_random)
                    bayes_random[fold]['jaccard'].append(jaccard_bayes_random)

                    bayes_no_variation[fold]['y_real'].append(item['real']['label'])
                    bayes_no_variation[fold]['y_pred'].append(item['bayes_no_variation']['label'][0])
                    bayes_no_variation[fold]['string'].append(dist_bayes_no_variation)
                    bayes_no_variation[fold]['jaccard'].append(jaccard_bayes_no_variation)

                    bayes_variation[fold]['y_real'].append(item['real']['label'])
                    bayes_variation[fold]['y_pred'].append(item['bayes_variation']['label'][0])
                    bayes_variation[fold]['string'].append(dist_bayes_variation)
                    bayes_variation[fold]['jaccard'].append(jaccard_bayes_variation)

                    _random[fold]['y_real'].append(item['real']['label'])
                    _random[fold]['y_pred'].append(item['random']['label'])
                    _random[fold]['string'].append(dist_random)
                    _random[fold]['jaccard'].append(jaccard_random)

                    siddharthan[fold]['y_real'].append(item['real']['label'])
                    siddharthan[fold]['y_pred'].append(item['siddharthan']['label'])
                    siddharthan[fold]['string'].append(dist_siddharthan)
                    siddharthan[fold]['jaccard'].append(jaccard_siddharthan)

                    deemter[fold]['y_real'].append(item['real']['label'])
                    deemter[fold]['y_pred'].append(item['deemter']['label'])
                    deemter[fold]['string'].append(dist_deemter)
                    deemter[fold]['jaccard'].append(jaccard_deemter)
    return _random, bayes_random, bayes_no_variation, bayes_variation, siddharthan, deemter
Example #42
def jaccard_unigram_distance(affil_1, affil_2):
    """Unigram distance between two strings"""
    affil_set_1 = set(w_tokenizer.tokenize(affil_1['Name']))
    affil_set_2 = set(w_tokenizer.tokenize(affil_2['Name']))
    return jaccard_distance(affil_set_1, affil_set_2)
Example #43
def jacquard_sim(set1, set2):
    sim = jaccard_distance(set1, set2)  # note: a distance, not a similarity
    return sim
Example #44
File: Utils.py Project: lwheng/fyp
 def jaccard(self, inputA, inputB):
   # Returns the Jaccard distance: smaller means more similar
   a = inputA.lower()
   b = inputB.lower()
   return distance.jaccard_distance(set(a.split()), set(b.split()))
Example #45
def ner_jaccard(ne1, ne2):
    if len(ne1) == 0 or len(ne2) == 0:
        return 1

    return 1 - jaccard_distance(set(ne1), set(ne2))
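Unlike most snippets above, Example 45 returns a similarity (1 minus the distance) and defaults to 1 when either entity list is empty. A hypothetical usage with invented entity lists (assumes jaccard_distance is imported):

print(ner_jaccard(['Paris', 'France'], ['Paris', 'Texas']))  # 1 - 2/3 ≈ 0.33
print(ner_jaccard([], ['Paris']))                            # empty input, returns 1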