Example #1
def test_word_similarity():
    from sematch.semantic.similarity import WordNetSimilarity
    wns = WordNetSimilarity()
    dog = wns.word2synset('dog')
    cat = wns.word2synset('cat')
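    # word2synset returns the list of WordNet synsets for a word; dog[0] and cat[0] pick the first sense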
    # Measuring semantic similarity between concepts using Path method
    assert wns.similarity(dog[0], cat[0], 'path') is not None  # 0.2
    # Computing English word similarity using Li method
    assert wns.word_similarity('dog', 'cat',
                               'li') is not None  # 0.449327301063
    # Computing Spanish word similarity using Lin method
    assert wns.monol_word_similarity('perro', 'gato', 'spa',
                                     'lin') is not None  # 0.876800984373
    # Computing Chinese word similarity using Wu & Palmer method
    assert wns.monol_word_similarity('狗', '猫', 'cmn',
                                     'wup') is not None  # 0.857142857143
    # Computing Spanish and English word similarity using Resnik method
    assert wns.crossl_word_similarity('perro', 'cat', 'spa', 'eng',
                                      'res') is not None  # 7.91166650904
    # Computing Spanish and Chinese word similarity using Jiang & Conrath method
    assert wns.crossl_word_similarity('perro', '猫', 'spa', 'cmn',
                                      'jcn') is not None  # 0.31023804699
    # Computing Chinese and English word similarity using WPath method
    assert wns.crossl_word_similarity('狗', 'cat', 'cmn', 'eng',
                                      'wpath') is not None  # 0.593666388463
Example #2
def controlledSetWordNetSimilarity(self, word, similarWords):
    wns = WordNetSimilarity()
    for similarWord in similarWords.copy():
        # 0.9996 is the threshold that controls how strict the control set is
        if wns.word_similarity(word, similarWord, 'li') < 0.9996:
            similarWords.discard(similarWord)
    return similarWords
Example #3
def test_classification_evaluation():
    from sematch.evaluation import AspectEvaluation
    from sematch.application import SimClassifier, SimSVMClassifier
    from sematch.semantic.similarity import WordNetSimilarity
    evaluation = AspectEvaluation()
    X, y = evaluation.load_dataset()
    wns = WordNetSimilarity()
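    # no method name is passed to word_similarity below, so the library's default measure is used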
    word_sim = lambda x, y: wns.word_similarity(x, y)
    simclassifier = SimClassifier.train(zip(X, y), word_sim)
    evaluation.evaluate(X, y, simclassifier)
    simSVMclassifier = SimSVMClassifier.train(X, y, word_sim)
    evaluation.evaluate(X, y, simSVMclassifier)
Example #4
def test_wordsim_evaluation():
    from sematch.evaluation import WordSimEvaluation
    from sematch.semantic.similarity import WordNetSimilarity
    wordsim_eval = WordSimEvaluation()
    wns = WordNetSimilarity()
    #define similarity metrics
    lin = lambda x, y: wns.word_similarity(x, y, 'lin')
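    # the 0.8 passed to word_similarity_wpath is presumably the WPath method's k parameter (assumption)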
    wpath = lambda x, y: wns.word_similarity_wpath(x, y, 0.8)
    #evaluate similarity metrics
    assert wordsim_eval.evaluate_multiple_metrics({'lin':lin, 'wpath':wpath}, 'noun_simlex') is not None
    # perform Steiger's Z significance test
    assert wordsim_eval.statistical_test('wpath', 'lin', 'noun_simlex') is not None
Example #5
def semantic_matching(trend_one, trend_two):
    threshold = 0.3
    trend_one_processed = text_processing(trend_one, keep_spaces=True)
    trend_two_processed = text_processing(trend_two, keep_spaces=True)
    # The options are Wordnet, YAGO and DBpedia (only the first seems usable)
    wns = WordNetSimilarity()
    matches = list({
        x['original']
        for x in trend_one_processed for y in trend_two_processed
        if wns.word_similarity(x['processed'], y['processed'], 'li') > threshold
    })

    if len(matches) == 0: return 'No matches'
    return matches
Example #6
def test_wordsim_evaluation():
    from sematch.evaluation import WordSimEvaluation
    from sematch.semantic.similarity import WordNetSimilarity
    wordsim_eval = WordSimEvaluation()
    wns = WordNetSimilarity()
    #define similarity metrics
    lin = lambda x, y: wns.word_similarity(x, y, 'lin')
    wpath = lambda x, y: wns.word_similarity_wpath(x, y, 0.8)
    #evaluate similarity metrics
    assert wordsim_eval.evaluate_multiple_metrics({
        'lin': lin,
        'wpath': wpath
    }, 'noun_simlex') is not None
    # perform Steiger's Z significance test
    assert wordsim_eval.statistical_test('wpath', 'lin',
                                         'noun_simlex') is not None
Example #7
def test_query_ned():
    from sematch.nlp import FeatureExtractor
    from sematch.nlp import EntityFeature
    from sematch.nlp import SpaCyNLP
    from sematch.utility import FileIO
    from sematch.semantic.relatedness import TextRelatedness
    from sematch.nel import EntityDisambiguation
    import itertools
    sy = SpaCyNLP()
    features = EntityFeature.load(
        feature_dict_file='models/query_features.json')
    extractor = FeatureExtractor(features, sy.pos_tag)
    ned = EntityDisambiguation(extractor)
    rel = TextRelatedness()
    from sematch.semantic.similarity import WordNetSimilarity
    wns = WordNetSimilarity()
    #print wns.word_similarity('cooling', 'air_conditioner', 'li')
    #similarity = lambda x,y : rel.text_similarity(x,y, model='lsa')

    query = FileIO.read_json_file('dataset/ned/query_ned_cleaned.txt')
    query = [q for q in query if extractor.context_features(q['query'])]
    print(len(query))
    import warnings
    warnings.filterwarnings("ignore")
    metrics = ['path', 'wup', 'res', 'lin', 'jcn', 'wpath']
    for m in metrics:
        print(m)
        similarity = lambda x, y: wns.word_similarity(x, y, m)
        for k in range(1, 21):
            gold = []
            predict = []
            for q in query:
                gold.append(q['gold'])
                #e = ned.text_disambiguate(q['query'], q['candidate'], similarity)
                e = ned.word_disambiguate(q['query'],
                                          q['candidate'],
                                          similarity,
                                          K=k)
                predict.append(e)
            from sklearn.metrics import precision_recall_fscore_support
            #from sklearn.metrics import classification_report
            #print classification_report(gold, predict)
            print(precision_recall_fscore_support(gold,
                                                  predict,
                                                  average='weighted')[2])
Example #8
def test_wordnet_similarity():
    from sematch.semantic.similarity import WordNetSimilarity
    wns = WordNetSimilarity()
    dog = wns.word2synset('dog')
    cat = wns.word2synset('cat')
    # Measuring semantic similarity between concepts using Path method
    assert wns.similarity(dog[0], cat[0], 'path') is not None  # 0.2
    # Computing English word similarity using Li method
    assert wns.word_similarity('dog', 'cat', 'li') is not None  # 0.449327301063
    # Computing Spanish word similarity using Lin method
    assert wns.monol_word_similarity('perro', 'gato', 'spa', 'lin') is not None  # 0.876800984373
    # Computing Chinese word similarity using Wu & Palmer method
    assert wns.monol_word_similarity('狗', '猫', 'cmn', 'wup') is not None  # 0.857142857143
    # Computing Spanish and English word similarity using Resnik method
    assert wns.crossl_word_similarity('perro', 'cat', 'spa', 'eng', 'res') is not None  # 7.91166650904
    # Computing Spanish and Chinese word similarity using Jiang & Conrath method
    assert wns.crossl_word_similarity('perro', '猫', 'spa', 'cmn', 'jcn') is not None  # 0.31023804699
    # Computing Chinese and English word similarity using WPath method
    assert wns.crossl_word_similarity('狗', 'cat', 'cmn', 'eng', 'wpath') is not None  # 0.593666388463
Example #9
def test_simcat_classifier():
    from sematch.classification import SimCatClassifier
    from sematch.evaluation import ABSAEvaluation
    from sematch.semantic.similarity import WordNetSimilarity
    # defining similarity metric
    wns = WordNetSimilarity()
    sim_metric_jcn = lambda x, y: wns.word_similarity(x, y, 'jcn')
    sim_metric_wpath = lambda x, y: wns.word_similarity_wpath(x, y, 0.9)
    # loading the dataset
    absa_eval = ABSAEvaluation()
    X_train_16, y_train_16 = absa_eval.load_dataset('eval/aspect/ABSA16_Restaurants_Train_SB1_v2.xml')
    X_test_16, y_test_16 = absa_eval.load_dataset('eval/aspect/ABSA16_Restaurants_Train_SB1_v2.xml')
    # train the classifiers
    sim_jcn_classifier = SimCatClassifier.train(zip(X_train_16, y_train_16), sim_metric_jcn)
    sim_wpath_classifier = SimCatClassifier.train(zip(X_train_16, y_train_16), sim_metric_wpath)
    # evaluate the classifiers
    #absa_eval.evaluate(X_test_16, y_test_16, sim_jcn_classifier)
    #absa_eval.evaluate(X_test_16, y_test_16, sim_wpath_classifier)
    assert sim_jcn_classifier is not None
    assert sim_wpath_classifier is not None
Example #10
def test_query_ned():
    from sematch.nlp import FeatureExtractor
    from sematch.nlp import EntityFeature
    from sematch.nlp import SpaCyNLP
    from sematch.utility import FileIO
    from sematch.semantic.relatedness import TextRelatedness
    from sematch.nel import EntityDisambiguation
    import itertools
    sy = SpaCyNLP()
    features = EntityFeature.load(feature_dict_file='models/query_features.json')
    extractor = FeatureExtractor(features, sy.pos_tag)
    ned = EntityDisambiguation(extractor)
    rel = TextRelatedness()
    from sematch.semantic.similarity import WordNetSimilarity
    wns = WordNetSimilarity()
    #print wns.word_similarity('cooling', 'air_conditioner', 'li')
    #similarity = lambda x,y : rel.text_similarity(x,y, model='lsa')

    query = FileIO.read_json_file('dataset/ned/query_ned_cleaned.txt')
    query = [q for q in query if extractor.context_features(q['query'])]
    print(len(query))
    import warnings
    warnings.filterwarnings("ignore")
    metrics = ['path', 'wup', 'res', 'lin', 'jcn', 'wpath']
    for m in metrics:
        print(m)
        similarity = lambda x, y: wns.word_similarity(x, y, m)
        for k in range(1, 21):
            gold = []
            predict = []
            for q in query:
                gold.append(q['gold'])
                #e = ned.text_disambiguate(q['query'], q['candidate'], similarity)
                e = ned.word_disambiguate(q['query'], q['candidate'], similarity, K=k)
                predict.append(e)
            from sklearn.metrics import precision_recall_fscore_support
            #from sklearn.metrics import classification_report
            #print classification_report(gold, predict)
            print(precision_recall_fscore_support(gold, predict, average='weighted')[2])
Example #11
# pip install sematch
# nltk.download('wordnet_ic')
# On Python 3 you also need to edit the sematch library's sparql module and convert its Python 2 print statement to a print() call.
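# A minimal one-time setup sketch (assuming the standard NLTK downloader; the omw-1.4 line is an
# assumption, only needed for the multilingual methods):
# import nltk
# nltk.download('wordnet')      # WordNet data
# nltk.download('wordnet_ic')   # information-content files used by res/lin/jcn
# nltk.download('omw-1.4')      # Open Multilingual WordNet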
from sematch.semantic.similarity import WordNetSimilarity
import pandas as pd

wns = WordNetSimilarity()

words = ['artist', 'musician', 'scientist', 'physicist', 'actor', 'movie']
sim_matrix = [[wns.word_similarity(w1, w2, 'wpath') for w1 in words]
              for w2 in words]
df = pd.DataFrame(sim_matrix, index=words, columns=words)
print(df)

print(wns.word_similarity("Dog", "Cat"))
Example #12
class fmodel(object):
    def __init__(self):
        self.out = {}
        self.keras = keras_similar()
        self.classifier = Qclassifier()
        self.spell = Spelling()
        self.wn = WordNetSimilarity()
        self.en_nlp = spacy.load("en_core_web_md")
        self.stopwords_en = []
        with open(os.path.join(os.path.dirname(os.path.realpath(__file__)),
            'utils', 'stopwords_en.txt')) as f:

            self.stopwords_en = f.read().splitlines()

    def ent_nltk(self, sentence):
        ne_tree = ne_chunk(pos_tag(word_tokenize(sentence)))
        iob_tagged = tree2conlltags(ne_tree)
        ents = [[0, 0, 10]]
        for i in range(len(iob_tagged)):
            each = iob_tagged[i]

            if each[2] != 'O':
                if ents[-1][2] == (i - 1):
                    ents[-1][0] += " " + each[0]
                    ents[-1][2] = i
                else:
                    ents.append([each[0], each[2][2:], i])
        if len(ents) > 1:
            ents = ents[1:]
            ents = [ent[0] for ent in ents]
        else:
            ents = []

        return ents

    def mini_similar(self, q1, q2):
        self.out = {'sim': 0, 'sim_per': 0.0, 'keras': 0, 'class': ["", ""],
                    'f_class': 0, "sentiment": [0, 0, 0],
                    "keywords": [[""], [""]], "numbers": [[], []],
                    "entities": [[], []], "max_keywords": 0, "keywords_sim": 0}
        regex = re.compile('[^a-zA-Z0-9]')
        q1 = regex.sub('', q1)
        q2 = regex.sub('', q2)
        if q1 == q2:
            self.out['sim'] = 1
            self.out['sim_per'] = 100
            return self.out
        else:
            s1 = self.wn.word_similarity(q1, q2, 'lin')
            print(s1)

            if s1 > 0.9:
                self.out['sim'] = 1
                self.out['sim_per'] = 100
                return self.out

            elif s1 > 0.8:
                self.out['sim'] = 1
                self.out['sim_per'] = s1  # max([s1,s2,s3])
                return self.out
        return self.out

    def is_one_word(self, q1, q2):
        l1 = q1
        l2 = q2
        flag1 = False
        flag2 = False
        stop = True
        word1 = ""
        word2 = ""

        if len(l1) != len(l2):
            return False
        else:
            for i in range(len(l1)):
                if l1[i].text != l2[i].text or l1[i].lemma_ != l2[i].lemma_:
                    if flag2:
                        return False
                    elif l1[i].text in self.stopwords_en and l2[i].text in self.stopwords_en:
                        word1 = l1[i].text
                        word2 = l2[i].text
                        flag1 = True
                    else:
                        word1 = l1[i].lemma_
                        word2 = l2[i].lemma_
                        flag1 = True
                        flag2 = True
        if flag1:
            self.out = self.mini_similar(word1, word2)
            return True

    def similar(self, text, challenge):
        if not isinstance(text, str) or not isinstance(challenge, str):
            q1 = text
            q2 = challenge
        else:
            q1 = normalizr.normalize(text, normalizations)
            q2 = normalizr.normalize(challenge, normalizations)

        q1 = self.spell.correct_str(q1,True)
        q2 = self.spell.correct_str(q2,True)

        if (len(q1.split()) == 1 and len(q2.split()) == 1) or (q1 == q2):
            return self.mini_similar(q1, q2)
        regex = re.compile(u'/')  # [^a-zA-Z]')
        q1 = regex.sub('', q1)
        q2 = regex.sub('', q2)

        self.out = {'sim': 0, 'sim_per': 0.0, 'keras': 0.0, 'class': ["", ""],
                    'f_class': 0, "sentiment": [0, 0, 0],
                    "keywords": [[""], [""]], "numbers": [[], []],
                    "entities": [[], []], "max_keywords": 0,
                    "keywords_sim": 0.0}
        q1_neg_list = list(set(mark_negation(q1.split())[0]))
        q2_neg_list = list(set(mark_negation(q2.split())[0]))

        if q1 == "" or q2 == "":
            return self.out


        sq1 = self.en_nlp(q1)
        sq2 = self.en_nlp(q2)

        if self.is_one_word(sq1, sq2):
            return self.out
        count = 0

        start_time = time.time()

        entsq1 = self.ent_nltk(q1)
        entsq2 = self.ent_nltk(q2)

        self.out['entities'][1] = entsq2
        self.out['entities'][0] = entsq1

        for ent in sq1.ents:
            if ent.text not in entsq1:
                # self.out['entities'][0].append([ent.label_, ent.text])
                self.out['entities'][0].append(ent.text)

        for ent in sq2.ents:
            if ent.text not in entsq2:
                # self.out['entities'][1].append((ent.label_, ent.text))
                self.out['entities'][1].append(ent.text)

        if self.out['entities'][0]:

            if self.out['entities'][1]:
                if len(self.out['entities'][0]) != len(self.out['entities'][1]):
                    return self.out

                self.out['max_keywords'] += len(
                    set(self.out['entities'][0] + self.out['entities'][1]))


                for each in self.out['entities'][0]:
                    if each in self.out['entities'][1]:
                        count += 1
                    else:
                        return self.out
            else:
                return self.out

        elif self.out['entities'][1]:
            return self.out

        elapsed_time = time.time() - start_time

        self.out['keras'] = self.keras.similar(q1, q2)

        self.out['sentiment'][0] = get_sentiment_values(q1)[1]['compound']
        self.out['sentiment'][1] = get_sentiment_values(q2)[1]['compound']
        self.out['sentiment'][2] = abs(
            self.out['sentiment'][0] - self.out['sentiment'][1])

        if (abs(self.out['sentiment'][0]) > 0.3 and abs(
                self.out['sentiment'][1]) > 0.3):
            if self.out['sentiment'][2] >= 0.6:
                return self.out

        start_time = time.time()
        self.out['class'][0] = self.classifier.classify_question(sq1)
        self.out['class'][1] = self.classifier.classify_question(sq2)

        self.out['f_class'] = (self.out['class'][0] == self.out['class'][1])

        self.out['keywords'][0], self.out['numbers'][0] = extract_features(sq1)
        self.out['keywords'][1], self.out['numbers'][1] = extract_features(sq2)

        self.out['max_keywords'] += len(
            set(self.out['keywords'][0] + self.out['keywords'][1]))

        if self.out['class'][0] > 0 and self.out['class'][1] > 0:
            self.out['max_keywords'] += 1

        for each in self.out['keywords'][0]:
            if each in self.out['keywords'][1]:
                if (each in q1_neg_list and each not in q2_neg_list) or (
                        each in q2_neg_list and each not in q1_neg_list):
                    self.out['max_keywords'] += 1
                else:
                    if each in self.stopwords_en:
                        count += 0.30
                        # self.out['max_keywords'] -= 1
                    else:
                        count += 1

        if self.out['numbers'][0]:
            self.out['max_keywords'] += 1

            if self.out['numbers'][1]:
                self.out['max_keywords'] += 1
                if self.out['numbers'][1] != self.out['numbers'][0]:
                    return self.out

        elif self.out['numbers'][1]:
            self.out['max_keywords'] += 1

        if self.out['class'][0] > 0 and self.out['class'][1] > 0:
            self.out['max_keywords'] += 1

            if self.out['f_class']:
                if self.out['max_keywords'] > 1:
                    count += 1
                else:
                    count += 0.35

        # keywords_s1= [x for x in keywords_s1 if x not in keywords_s2]
        # keywords_s3= [x for x in keywords_s2 if x not in keywords_s1]
        if self.out['max_keywords'] < 1:
            self.out['keywords_sim'] = 0
        else:
            self.out['keywords_sim'] = (count / self.out['max_keywords']) * 100
            self.out['sim_per'] = (self.out['keywords_sim'] + self.out['keras']) / 2.0
            # print(self.out['keywords_sim'], count, self.out['max_keywords'])

        '''
        k_value = []
        s_value = []

        k = 100.0
        s = 30.0
        k_step = 10.0
        s_step = 4.0

        self.out["sim_per"] = (self.out['keywords_sim'] + self.out['keras']) / 2

        for i in range(7):
            k -= k_step
            s += s_step
            k_value.append(k)
            s_value.append(s)
        '''
        s_value = [34.0, 40.0, 50.0, 55.0, 60.0, 60.0, 60.0]
        k_value = [90.0, 85.0, 80.0, 75.0, 70.0, 60.0, 30.0]

        if self.out['keras'] >= k_value[0]:
            if self.out['keywords_sim'] >= s_value[0]:
                self.out['sim'] = 1
                return self.out

        elif self.out['keras'] > k_value[1]:
            if self.out['keywords_sim'] >= s_value[1]:
                self.out['sim'] = 1
                return self.out

        elif self.out['keras'] > k_value[2]:
            if self.out['keywords_sim'] >= s_value[2]:
                self.out['sim'] = 1
                return self.out

        elif self.out['keras'] > k_value[3]:
            if self.out['keywords_sim'] >= s_value[3]:
                self.out['sim'] = 1
                return self.out
        elif self.out['keras'] > k_value[4]:
            if self.out['keywords_sim'] >= s_value[4]:
                self.out['sim'] = 1
                return self.out
        elif self.out['keras'] > k_value[5]:
            if self.out['keywords_sim'] >= s_value[5]:
                self.out['sim'] = 1
                return self.out
        elif self.out['keras'] > k_value[6]:
            if self.out['keywords_sim'] >= s_value[6]:
                self.out['sim'] = 1
                return self.out
        



        return self.out

    def similarr(self, text, questions=list()):

        answer, max_similarity = None, 0
        if not text or len(questions) == 0:
            return answer, max_similarity
        for question in questions:
            try:
                result = self.similar(text.lower(),
                                      question.get('question').lower())
            except:
                result = self.similar(text, question.get('question'))

            if result.get('sim') == 1:
                confidence = result.get('sim_per')
                if max_similarity <= confidence <= 100:
                    max_similarity = confidence
                    answer = question.get('id')
                    # print("round stop\n")
                if max_similarity >= 95:
                    break

        # print('[Stop]')
        return answer, max_similarity

    def get_suggestions(self, text=None, texts=list()):
        res = []
        s = []
        min_confidence = 45

        for each in texts:
            result = self.similar(text, each.get('question').lower())
            if result.get('sim') == 1:
                confidence = result.get('sim_per')
                if 100 >= confidence > min_confidence:
                    if each.get('rich_text'):
                        response = each.get('rich_text')
                    else:
                        flow = int(each.get('response').replace('flow-', ''))
                        flow = Flow.objects.filter(id=flow).values('id', 'name',
                                                                   'category__name')
                        if flow.exists():
                            response = [{'flow': flow}]
                        else:
                            response = None
                    if response:
                        res.append((confidence, each.get('id'), response,
                                    each.get('question')))
            s = sorted(res, key=operator.itemgetter(0), reverse=True)[:3]
        suggestions = []
        for e in s:
            if e[2]:
                messages = []
                for m in e[2]:
                    messages.append({'message': format_message(m)})
                suggestions.append({'confidence': e[0], 'id': e[1],
                                    'message': messages})
        return suggestions
Example #13
class TextPreprocessor(BaseEstimator, TransformerMixin):
    """
    Transform input text into feature representation
    """
    def __init__(self,
                 corpus,
                 feature_num=10,
                 model='onehot',
                 wn_method='path',
                 vec_file='models/GoogleNews-vectors-negative300.bin',
                 binary=True):
        """
        :param corpus: use a corpus to train a vector representation
        :param feature_num: number of dimensions
        :param model: onehot or wordnet or word2vec or both
        """
        self._model = model
        self._wn_method = wn_method
        self._features = self.extract_features(corpus, feature_num)
        self._wns = WordNetSimilarity() if model in ('wordnet', 'both') else None
        self._wvs = WordVecSimilarity(vec_file, binary) if model in ('word2vec', 'both') else None

    def fit(self, X, y=None):
        return self

    def inverse_transform(self, X):
        return X

    def extract_features(self, corpus, feature_num=10):
        cat_word = {}
        for sent, cat in corpus:
            cat_word.setdefault(cat,
                                []).extend(lemmatization(word_tokenize(sent)))
        features = {cat: Counter(cat_word[cat]) for cat in cat_word}
        feature_words = []
        for c, f in features.items():
            words, counts = zip(*f.most_common(feature_num))
            feature_words.extend(list(words))
        feature_words = set(feature_words)
        return feature_words

    def similarity(self, tokens, feature, method='wordnet'):
        if method == 'wordnet':
            sim = lambda x: self._wns.word_similarity(feature, x, self._wn_method)
        else:
            sim = lambda x: self._wvs.word_similarity(feature, x)
        # list() keeps this working on Python 3, where map() returns an iterator
        return max(list(map(sim, tokens)) + [0.0])

    def unigram_features(self, tokens):
        words = set(tokens)
        features = {}
        for f in self._features:
            features['contains({})'.format(f)] = (f in words)
        return features

    def wordnet_features(self, tokens):
        words = set(tokens)
        features = {}
        for f in self._features:
            features['wns({})'.format(f)] = self.similarity(words, f)
        return features

    def word2vec_features(self, tokens):
        words = set(tokens)
        features = {}
        for f in self._features:
            features['w2v({})'.format(f)] = self.similarity(words,
                                                            f,
                                                            method='word2vec')
        return features

    def semantic_features(self, tokens):
        words = set(tokens)
        features = {}
        for f in self._features:
            features['wns({})'.format(f)] = self.similarity(words, f)
            features['w2v({})'.format(f)] = self.similarity(words,
                                                            f,
                                                            method='word2vec')
        return features

    def transform(self, X):
        tokenize = lambda x: lemmatization(word_tokenize(x))
        # materialize lists so the result can be reused on Python 3
        X_tokens = [tokenize(x) for x in X]
        if self._model == 'onehot':
            return [self.unigram_features(t) for t in X_tokens]
        elif self._model == 'wordnet':
            return [self.wordnet_features(t) for t in X_tokens]
        elif self._model == 'word2vec':
            return [self.word2vec_features(t) for t in X_tokens]
        elif self._model == 'both':
            return [self.semantic_features(t) for t in X_tokens]
Example #14
from sematch.semantic.similarity import WordNetSimilarity

import codecs

wns = WordNetSimilarity()
poems = codecs.open('generatedpoems.txt', 'r', encoding='utf-8')
data = open('data.txt', 'a')
for x in poems:
    temp_words = x.split(" ")
    total = 0
    count = 0
    for y in range(len(temp_words) - 1):
        total += wns.word_similarity(temp_words[y], temp_words[y + 1], 'li')
        count += 1
    if count:  # guard against lines with fewer than two words
        total /= count
    data.write(str(total) + '\n')
data.close()
poems.close()
#print wns.word_similarity(w1, w2, 'li')
Example #15
from sematch.semantic.similarity import WordNetSimilarity

L1 = []
L2 = []
L3 = []
wns = WordNetSimilarity()

# Computing English word similarity using the Li method.
# word_similarity() compares two words at a time, so the three terms are checked pairwise.
group = ['programmer', 'coder', 'software engineer']
if min(wns.word_similarity(a, b, 'li') for a in group for b in group if a != b) > 0.7:
    L1.extend(group)

group = ['software program', 'computer software', 'software system']
if min(wns.word_similarity(a, b, 'li') for a in group for b in group if a != b) > 0.7:
    L1.extend(group)
Example #16
def yhmh_nlp(url, trigger_words):
    text, triggers = parse_my_url(url, trigger_words)
    print("triggers2: %s" % (triggers))
    if text == "" or len(triggers) == 0:
        return ""

    client = language.LanguageServiceClient()

    if isinstance(text, six.binary_type):
        text = text.decode('utf-8')

    # Instantiates a plain text document.
    document = types.Document(content=text,
                              type=enums.Document.Type.PLAIN_TEXT)

    # Detects entities in the document. You can also analyze HTML with:
    #   document.type == enums.Document.Type.HTML
    entities = client.analyze_entities(document).entities
    verbose = True
    counter = 0
    counter2 = 0
    text_output_array = pd.DataFrame(np.zeros((len(entities), 3)))

    for entity in entities:
        entity_type = enums.Entity.Type(entity.type)
        if len(entity.name) < 25 and '.' not in entity.name:
            text_output_array.iloc[counter, 0] = entity.name
            text_output_array.iloc[counter, 1] = entity_type.name
            text_output_array.iloc[counter, 2] = entity.salience
            counter += 1
        else:
            counter2 += 1

    celebrity_status = 0
    if len(entities) > 0:
        if (entities[0].metadata.get('wikipedia_url', '-') != '-'
                and text_output_array.iloc[0, 1] == 'PERSON'):
            celebrity_status = 1
        elif (len(entities) > 1
                and entities[1].metadata.get('wikipedia_url', '-') != '-'
                and text_output_array.iloc[1, 1] == 'PERSON'):
            celebrity_status = 1
        else:
            celebrity_status = 0

    text_output_array = text_output_array.iloc[0:len(entities) - counter2, :]

    # Detects the sentiment of the text
    #sentiment = client.analyze_sentiment(document=document).document_sentiment

    wns = WordNetSimilarity()

    keywords_target = pd.Series.to_list(text_output_array[0])
    #keywords_target = list(set(keywords_target))

    #seen = set(keywords_target)
    #keywords_target = []
    #for x in keywords_target:
    #    if x not in seen:
    #        keywords_target.append(x)
    #        seen.add(x)
    #
    #keywords_target=seen
    forbidden_keywords = [
        'medicine', 'drug', 'fun', 'hospital', 'suicide', 'death', 'mental',
        'health', 'illness', 'insta', 'man', 'woman', 'family', 'people',
        'many', 'place', 'same', 'others', 'brain', 'all', 'end', 'statement',
        'lot', 'condolences'
    ]

    regex = re.compile(r'([A-Z]([a-z])+)')
    selected_files = list(filter(regex.search, keywords_target))
    res = list(set(keywords_target) - set(selected_files))

    regex = re.compile(r'^@')
    selected_files = list(filter(regex.search, res))
    res = list(set(res) - set(selected_files))

    regex = re.compile(r"\b[A-Z][A-Z]+\b")
    selected_files = list(filter(regex.search, res))
    res = list(set(res) - set(selected_files))

    regex = re.compile(r'([A-Z]([a-z])+)')
    selected_files = list(filter(regex.search, res))
    res = list(set(res) - set(selected_files))
    for key in range(len(res)):
        if ' ' in res[key]:
            res[key] = res[key].split(' ')[0]

    for x in range(len(res)):
        for y in range(len(forbidden_keywords)):
            if res[x] == forbidden_keywords[y]:
                res[x] = []
    res = list(filter(None, res))

    res_dictionary = Counter(res)

    res_output = res_dictionary.most_common(10)
    res_output = dict(res_output)
    res_output = list(res_output.keys())

    print(res_output)
    res = res_output[0:num_keywords]
    database = pd.read_csv(
        CURATED_LIST
    )  #('/Users/vmutai/Projects/HMH/admin/microblog/app/yhmh_curated_articles.csv')

    if celebrity_status == 1:
        database = database[database.celebrity == 1]
    elif celebrity_status == 0:
        database = database[database.celebrity == 0]
    similarity_ranks = pd.DataFrame(np.zeros(database.shape[0]))
    for z in range(database.shape[0]):
        newlist = []
        N_rows = len(res)
        keywords_source = database.iloc[z, 4:4 + num_keywords]
        keywords_source = pd.Series.tolist(keywords_source)
        N_cols = len(keywords_source)
        #similarity_list = pd.DataFrame(np.zeros((N_rows, N_cols)))
        foo = [1]
        for x in range(len(res)):
            for y in range(len(keywords_source)):
                value = wns.word_similarity(res[x], keywords_source[y], 'lin')
                #similarity_matrix.at[x,y]=value
                foo.append(value)
        matrix_average = sum(foo) / np.count_nonzero(foo)
        similarity_ranks.at[z, 0] = matrix_average
    maximum = pd.DataFrame.idxmax(similarity_ranks)
    url_to_return = pd.Series.tolist(database.iloc[maximum, 0])
    print(url_to_return)

    title = pd.Series.tolist(database.iloc[maximum, 1])

    def output(title, res_output, url_to_return):
        a = {
            'header': title[0],
            'keywords_list': res_output,
            'url_recommendation': url_to_return[0]
        }
        print("JSON DUMP")
        print(a)

        try:
            return json.dumps(a)
        except:
            return "awesome2!"

    json_output = output(title, res_output, url_to_return)
    print(json_output)

    return json_output
Example #17
import sys

import numpy
from nltk.corpus import wordnet_ic
from nltk.corpus.reader.wordnet import information_content
from nltk.wsd import lesk
from sematch.semantic.similarity import WordNetSimilarity

brown_ic = wordnet_ic.ic('ic-brown.dat')
wns = WordNetSimilarity()
# arg1 and arg2: predicates represented in strings separated by underscores
# e.g. cast_member or star
preA = sys.argv[1].split("_")
preB = sys.argv[2].split("_")
# arg3: pairwise similarity matrix in which rows are separated by underscore
# e.g. 0.6_0.5, or 0.6,0.7_0.3,0.4
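# Hypothetical invocation (the script name is illustrative, not from the source):
#   python predicate_similarity.py cast_member star 0.6_0.5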
data = []
for a in preA:
    row = []
    for b in preB:
        wdsim = wns.word_similarity(a, b, 'wup')
        row.append(wdsim)
    data.append(row)
data = numpy.matrix(data)
#max values in rows
Amax = data.max(1)
icA = []
for i in range(len(preA)):
    try:
        if lesk(preA, preA[i]) is None:
            # preA is not in WordNet
            icA.append(1)
        elif information_content(lesk(preA, preA[i]), brown_ic) == float('inf'):
            icA.append(1)
        else:
            icA.append(information_content(lesk(preA, preA[i]), brown_ic))
Example #18
        matriceListe = []
        matricelistePaire = []
        matricelistePaireSort = []
        matricelistePaireAction = []
        matricelistePaireObject = []

        for word in sorted_terms_lists:
            tokens = word
            for index, row in listcara.iterrows():
                abstractNumber = 'abs{}'.format(i)
                listaction = row['Colonne3']
                listaction = re.sub(r'\([^)]*\)', '', listaction)

                # comparison between the tags and the TRIZ classes

                indiceSimAction = wns.word_similarity(word, str(listaction))

                if indiceSimAction == 0 or word.isdigit():
                    # nothing to do for this pair
                    continue

                else:
                    valeurs = []

                    valeurs = [
                        i, NumberBrevet, word, listaction, indiceSimAction,
                        abstract, urlEspacenet
                    ]

                    ligne = ",".join(str(v) for v in valeurs) + "\n"
Example #19
        clean.append(t)
    return clean


cleanCleanCat1 = cleanTexts(categoryList1)
cleanCleanCat2 = cleanTexts(categoryList2)

wns = WordNetSimilarity()
similarCategories = []
for cat in cleanCleanCat1:
    sims = []
    for t in cleanCleanCat2:
        TextSim = []
        for w in cat:
            # wdsSim=[1 if w == wr else wns.word_similarity(w, wr, 'li') for wr in t]
            wdsSim = [wns.word_similarity(w, wr, 'li') for wr in t]
            TextSim.extend(wdsSim)
        sims.append((cleanCleanCat2.index(t), sum(TextSim)))
    if max(sims, key=lambda x: x[1])[1] > 0:
        similarCategories.append(
            (max(sims, key=lambda x: x[1])[0], max(sims,
                                                   key=lambda x: x[1])[1]))
    else:
        similarCategories.append('')
    print('{0} texts out of {1} done'.format(
        cleanCleanCat1.index(cat) + 1, len(cleanCleanCat1)))

with open('S:/path/In-market audiences_sim.csv',
          'w',
          newline='',
          encoding='utf-8') as csvfile:
Example #20
from sematch.semantic.similarity import WordNetSimilarity
wns = WordNetSimilarity()

# Computing English word similarity using Li method
wns.word_similarity('dog', 'cat', 'li')  # 0.449327301063
# Computing Spanish word similarity using Lin method
wns.monol_word_similarity('perro', 'gato', 'spa', 'lin')  # 0.876800984373
# Computing Chinese word similarity using Wu & Palmer method
wns.monol_word_similarity('狗', '猫', 'cmn', 'wup')  # 0.857142857143
# Computing Spanish and English word similarity using Resnik method
wns.crossl_word_similarity('perro', 'cat', 'spa', 'eng', 'res')  # 7.91166650904
# Computing Spanish and Chinese word similarity using Jiang & Conrath method
wns.crossl_word_similarity('perro', '猫', 'spa', 'cmn', 'jcn')  # 0.31023804699
# Computing Chinese and English word similarity using WPath method
wns.crossl_word_similarity('狗', 'cat', 'cmn', 'eng', 'wpath')  # 0.593666388463