def test_word_similarity():
    """Smoke test: every WordNetSimilarity metric family must return a score."""
    from sematch.semantic.similarity import WordNetSimilarity
    engine = WordNetSimilarity()
    dog_synsets = engine.word2synset('dog')
    cat_synsets = engine.word2synset('cat')
    # Concept-level similarity, Path method (expected ~0.2)
    assert engine.similarity(dog_synsets[0], cat_synsets[0], 'path') is not None
    # English word similarity, Li method (expected ~0.449327301063)
    assert engine.word_similarity('dog', 'cat', 'li') is not None
    # Spanish word similarity, Lin method (expected ~0.876800984373)
    assert engine.monol_word_similarity('perro', 'gato', 'spa', 'lin') is not None
    # Chinese word similarity, Wu & Palmer method (expected ~0.857142857143)
    assert engine.monol_word_similarity('狗', '猫', 'cmn', 'wup') is not None
    # Spanish-English cross-lingual similarity, Resnik method (expected ~7.91166650904)
    assert engine.crossl_word_similarity('perro', 'cat', 'spa', 'eng', 'res') is not None
    # Spanish-Chinese cross-lingual similarity, Jiang & Conrath method (expected ~0.31023804699)
    assert engine.crossl_word_similarity('perro', '猫', 'spa', 'cmn', 'jcn') is not None
    # Chinese-English cross-lingual similarity, WPath method (expected ~0.593666388463)
    assert engine.crossl_word_similarity('狗', 'cat', 'cmn', 'eng', 'wpath') is not None
def controlledSetWordNetSimilarity(self, word, similarWords):
    """Prune `similarWords` in place, keeping only candidates whose Li
    similarity to `word` reaches the control threshold.

    :param word: the reference word
    :param similarWords: a mutable set of candidate words; pruned in place
    :return: the same (now pruned) set
    """
    measure = WordNetSimilarity()
    # 0.9996 controls how strict the control set is.
    rejected = [candidate for candidate in similarWords.copy()
                if measure.word_similarity(word, candidate, 'li') < 0.9996]
    for candidate in rejected:
        similarWords.discard(candidate)
    return similarWords
def test_classification_evaluation():
    """Train and evaluate both similarity-based aspect classifiers on one dataset."""
    from sematch.evaluation import AspectEvaluation
    from sematch.application import SimClassifier, SimSVMClassifier
    from sematch.semantic.similarity import WordNetSimilarity

    evaluator = AspectEvaluation()
    X, y = evaluator.load_dataset()
    wns = WordNetSimilarity()

    def word_sim(a, b):
        # Default WordNet word-to-word similarity
        return wns.word_similarity(a, b)

    # Plain similarity classifier
    sim_clf = SimClassifier.train(zip(X, y), word_sim)
    evaluator.evaluate(X, y, sim_clf)
    # SVM over similarity features
    svm_clf = SimSVMClassifier.train(X, y, word_sim)
    evaluator.evaluate(X, y, svm_clf)
def test_wordsim_evaluation():
    """Evaluate Lin vs. WPath (k=0.8) on noun_simlex and run Steiger's Z test."""
    from sematch.evaluation import WordSimEvaluation
    from sematch.semantic.similarity import WordNetSimilarity

    evaluator = WordSimEvaluation()
    wns = WordNetSimilarity()
    # Similarity metrics under comparison
    metrics = {
        'lin': lambda a, b: wns.word_similarity(a, b, 'lin'),
        'wpath': lambda a, b: wns.word_similarity_wpath(a, b, 0.8),
    }
    # Both correlation scores must be produced
    assert evaluator.evaluate_multiple_metrics(metrics, 'noun_simlex') is not None
    # Steiger's Z significance test between the two correlations
    assert evaluator.statistical_test('wpath', 'lin', 'noun_simlex') is not None
def semantic_matching(trend_one, trend_two):
    """Return the original tokens of `trend_one` whose processed form is
    semantically similar (Li similarity > 0.3) to some token of `trend_two`.

    Returns the string 'No matches' when nothing clears the threshold.
    """
    treshold = 0.3
    first = text_processing(trend_one, keep_spaces=True)
    second = text_processing(trend_two, keep_spaces=True)
    # Knowledge-base options are WordNet, YAGO and DBpedia; only WordNet
    # proved usable here.
    wns = WordNetSimilarity()
    matched = set()
    for left in first:
        for right in second:
            if wns.word_similarity(left['processed'], right['processed'], 'li') > treshold:
                matched.add(left['original'])
    matches = list(matched)
    if not matches:
        return 'No matches'
    return matches
def test_wordsim_evaluation():
    """Compare Lin and WPath (k=0.8) metrics on the noun_simlex benchmark."""
    from sematch.evaluation import WordSimEvaluation
    from sematch.semantic.similarity import WordNetSimilarity
    wordsim_eval = WordSimEvaluation()
    wns = WordNetSimilarity()

    def lin(a, b):
        # Lin information-content similarity
        return wns.word_similarity(a, b, 'lin')

    def wpath(a, b):
        # WPath similarity with k = 0.8
        return wns.word_similarity_wpath(a, b, 0.8)

    # Evaluate both metrics in one pass
    assert wordsim_eval.evaluate_multiple_metrics(
        {'lin': lin, 'wpath': wpath}, 'noun_simlex') is not None
    # Steiger's Z significance test between the two metrics
    assert wordsim_eval.statistical_test('wpath', 'lin', 'noun_simlex') is not None
def test_query_ned():
    """Sweep WordNet similarity metrics and K values for word-level entity
    disambiguation on the query NED dataset, printing weighted F1 per run.

    Fixes: the Python 2 `print` statements were syntax errors under Python 3
    (the rest of the file uses `print(...)`), and the per-metric lambda now
    binds `m` as a default argument to avoid late-binding surprises.
    """
    from sematch.nlp import FeatureExtractor
    from sematch.nlp import EntityFeature
    from sematch.nlp import SpaCyNLP
    from sematch.utility import FileIO
    from sematch.semantic.relatedness import TextRelatedness
    from sematch.nel import EntityDisambiguation
    import itertools
    sy = SpaCyNLP()
    features = EntityFeature.load(
        feature_dict_file='models/query_features.json')
    extractor = FeatureExtractor(features, sy.pos_tag)
    ned = EntityDisambiguation(extractor)
    rel = TextRelatedness()
    from sematch.semantic.similarity import WordNetSimilarity
    wns = WordNetSimilarity()
    # print(wns.word_similarity('cooling', 'air_conditioner', 'li'))
    # similarity = lambda x, y: rel.text_similarity(x, y, model='lsa')
    query = FileIO.read_json_file('dataset/ned/query_ned_cleaned.txt')
    # Keep only queries for which context features can be extracted.
    query = [q for q in query if extractor.context_features(q['query'])]
    print(len(query))
    import warnings
    warnings.filterwarnings("ignore")
    metrics = ['path', 'wup', 'res', 'lin', 'jcn', 'wpath']
    for m in metrics:
        print(m)
        # Default-arg binding freezes the current metric for this closure.
        similarity = lambda x, y, m=m: wns.word_similarity(x, y, m)
        for k in range(1, 21):
            gold = []
            predict = []
            for q in query:
                gold.append(q['gold'])
                # e = ned.text_disambiguate(q['query'], q['candidate'], similarity)
                e = ned.word_disambiguate(q['query'], q['candidate'],
                                          similarity, K=k)
                predict.append(e)
            from sklearn.metrics import precision_recall_fscore_support
            # from sklearn.metrics import classification_report
            # print(classification_report(gold, predict))
            # [2] is the weighted F1 score.
            print(precision_recall_fscore_support(gold, predict,
                                                  average='weighted')[2])
def test_wordnet_similarity():
    """Each supported similarity API must yield a non-None score."""
    from sematch.semantic.similarity import WordNetSimilarity
    wns = WordNetSimilarity()
    dog = wns.word2synset('dog')
    cat = wns.word2synset('cat')
    # Synset-level Path similarity (~0.2)
    assert wns.similarity(dog[0], cat[0], 'path') is not None
    # English, Li method (~0.449327301063)
    assert wns.word_similarity('dog', 'cat', 'li') is not None
    monolingual_cases = [
        ('perro', 'gato', 'spa', 'lin'),  # Spanish, Lin (~0.876800984373)
        ('狗', '猫', 'cmn', 'wup'),        # Chinese, Wu & Palmer (~0.857142857143)
    ]
    for w1, w2, lang, method in monolingual_cases:
        assert wns.monol_word_similarity(w1, w2, lang, method) is not None
    crosslingual_cases = [
        ('perro', 'cat', 'spa', 'eng', 'res'),  # Resnik (~7.91166650904)
        ('perro', '猫', 'spa', 'cmn', 'jcn'),    # Jiang & Conrath (~0.31023804699)
        ('狗', 'cat', 'cmn', 'eng', 'wpath'),    # WPath (~0.593666388463)
    ]
    for w1, w2, lang1, lang2, method in crosslingual_cases:
        assert wns.crossl_word_similarity(w1, w2, lang1, lang2, method) is not None
def test_simcat_classifier():
    """Train SimCat classifiers with JCN and WPath metrics and check they build."""
    from sematch.classification import SimCatClassifier
    from sematch.evaluation import ABSAEvaluation
    from sematch.semantic.similarity import WordNetSimilarity
    # Metric variants: Jiang-Conrath and WPath (k = 0.9)
    wns = WordNetSimilarity()
    jcn = lambda a, b: wns.word_similarity(a, b, 'jcn')
    wpath = lambda a, b: wns.word_similarity_wpath(a, b, 0.9)
    # Load the ABSA 2016 restaurants dataset (train file reused for test here)
    absa_eval = ABSAEvaluation()
    dataset_path = 'eval/aspect/ABSA16_Restaurants_Train_SB1_v2.xml'
    X_train_16, y_train_16 = absa_eval.load_dataset(dataset_path)
    X_test_16, y_test_16 = absa_eval.load_dataset(dataset_path)
    # Train one classifier per metric
    clf_jcn = SimCatClassifier.train(zip(X_train_16, y_train_16), jcn)
    clf_wpath = SimCatClassifier.train(zip(X_train_16, y_train_16), wpath)
    # Full evaluation stays disabled (slow):
    # absa_eval.evaluate(X_test_16, y_test_16, clf_jcn)
    # absa_eval.evaluate(X_test_16, y_test_16, clf_wpath)
    assert clf_jcn is not None
    assert clf_wpath is not None
def test_query_ned():
    """Grid-search similarity metric x K for word-level entity disambiguation
    on the query NED dataset; prints the weighted F1 for every combination.

    Fixes: Python 2 `print` statements (syntax errors on Python 3) converted
    to `print(...)`; the metric lambda binds `m` via a default argument so the
    closure cannot pick up a later loop value.
    """
    from sematch.nlp import FeatureExtractor
    from sematch.nlp import EntityFeature
    from sematch.nlp import SpaCyNLP
    from sematch.utility import FileIO
    from sematch.semantic.relatedness import TextRelatedness
    from sematch.nel import EntityDisambiguation
    import itertools
    sy = SpaCyNLP()
    features = EntityFeature.load(feature_dict_file='models/query_features.json')
    extractor = FeatureExtractor(features, sy.pos_tag)
    ned = EntityDisambiguation(extractor)
    rel = TextRelatedness()
    from sematch.semantic.similarity import WordNetSimilarity
    wns = WordNetSimilarity()
    # print(wns.word_similarity('cooling', 'air_conditioner', 'li'))
    # similarity = lambda x, y: rel.text_similarity(x, y, model='lsa')
    query = FileIO.read_json_file('dataset/ned/query_ned_cleaned.txt')
    # Drop queries that yield no context features.
    query = [q for q in query if extractor.context_features(q['query'])]
    print(len(query))
    import warnings
    warnings.filterwarnings("ignore")
    metrics = ['path', 'wup', 'res', 'lin', 'jcn', 'wpath']
    for m in metrics:
        print(m)
        # Bind m now; otherwise the closure would see the loop's final value.
        similarity = lambda x, y, m=m: wns.word_similarity(x, y, m)
        for k in range(1, 21):
            gold = []
            predict = []
            for q in query:
                gold.append(q['gold'])
                # e = ned.text_disambiguate(q['query'], q['candidate'], similarity)
                e = ned.word_disambiguate(q['query'], q['candidate'],
                                          similarity, K=k)
                predict.append(e)
            from sklearn.metrics import precision_recall_fscore_support
            # from sklearn.metrics import classification_report
            # print(classification_report(gold, predict))
            # Index [2] selects the weighted F1 component.
            print(precision_recall_fscore_support(gold, predict,
                                                  average='weighted')[2])
# pip install sematch
# nltk.download('wordnet_ic')
# Note: under Python 3 you must patch sematch's sparql module, which still
# contains a Python 2 print statement.
from sematch.semantic.similarity import WordNetSimilarity
import pandas as pd

wns = WordNetSimilarity()
words = ['artist', 'musician', 'scientist', 'physicist', 'actor', 'movie']
# Pairwise WPath similarity: one row per word, one column per word.
sim_matrix = [[wns.word_similarity(col_word, row_word, 'wpath')
               for col_word in words]
              for row_word in words]
df = pd.DataFrame(sim_matrix, index=words, columns=words)
print(df)
# Default metric, case left as-is on purpose.
print(wns.word_similarity("Dog", "Cat"))
class fmodel(object):
    """Question-similarity model combining a Keras model, a question
    classifier, spell correction, spaCy parsing and WordNet similarity.

    NOTE(review): reconstructed from whitespace-mangled source — the exact
    nesting of a few if/elif chains in `similar` is inferred; confirm against
    the original file. Results are accumulated in the `self.out` dict.
    """

    def __init__(self):
        # `out` holds the most recent comparison result (see mini_similar/similar).
        self.out = {}
        self.keras = keras_similar()          # neural sentence-similarity scorer
        self.classifier = Qclassifier()       # question-type classifier
        self.spell = Spelling()               # spelling corrector
        self.wn = WordNetSimilarity()         # WordNet word similarity
        self.en_nlp = spacy.load("en_core_web_md")
        self.stopwords_en = []
        # English stopword list shipped next to this module.
        with open(os.path.join(os.path.dirname(os.path.realpath(__file__)),
                               'utils', 'stopwords_en.txt')) as f:
            self.stopwords_en = f.read().splitlines()

    def ent_nltk(self, sentence):
        """Extract named-entity surface strings from `sentence` using NLTK's
        ne_chunk; adjacent entity tokens are merged into one string."""
        ne_tree = ne_chunk(pos_tag(word_tokenize(sentence)))
        iob_tagged = tree2conlltags(ne_tree)
        # Sentinel entry [0, 0, 10]: index 10 can never equal i-1 on the first
        # token, so the first real entity always starts a new group.
        ents = [[0, 0, 10]]
        for i in range(len(iob_tagged)):
            each = iob_tagged[i]
            if each[2] != 'O':
                if ents[-1][2] == (i - 1):
                    # Continues the previous entity: append the token.
                    ents[-1][0] += " " + each[0]
                    ents[-1][2] = i
                else:
                    # New entity: store token, label (minus IOB prefix), index.
                    ents.append([each[0], each[2][2:], i])
        if len(ents) > 1:
            ents = ents[1:]                  # drop the sentinel
            ents = [ent[0] for ent in ents]  # keep surface strings only
        else:
            ents = []
        return ents

    def mini_similar(self, q1, q2):
        """Compare two single-word (or already-collapsed) strings.

        Resets and returns `self.out`; sim=1 when the strings are equal after
        stripping non-alphanumerics or when their Lin similarity is high.
        """
        self.out = {'sim': 0, 'sim_per': 0.0, 'keras': 0, 'class': ["", ""],
                    'f_class': 0, "sentiment": [0, 0, 0],
                    "keywords": [[""], [""]], "numbers": [[], []],
                    "entities": [[], []], "max_keywords": 0, "keywords_sim": 0}
        # Strip everything except letters and digits before comparing.
        regex = re.compile('[^a-zA-Z0-9]')
        q1 = regex.sub('', q1)
        q2 = regex.sub('', q2)
        if q1 == q2:
            self.out['sim'] = 1
            self.out['sim_per'] = 100
            return self.out
        else:
            # WordNet Lin similarity between the two words.
            s1 = self.wn.word_similarity(q1, q2, 'lin')
            print(s1)
            if s1 > 0.9:
                # Practically identical.
                self.out['sim'] = 1
                self.out['sim_per'] = 100
                return self.out
            elif s1 > 0.8:
                # Similar; report the raw score as the percentage.
                self.out['sim'] = 1
                self.out['sim_per'] = s1  # max([s1,s2,s3])
                return self.out
        return self.out

    def is_one_word(self, q1, q2):
        """Given two spaCy docs of equal length, detect whether they differ in
        at most one token position; if so, score that word pair via
        mini_similar and report True (result left in self.out)."""
        l1 = q1
        l2 = q2
        flag1 = False   # at least one differing token found
        flag2 = False   # a differing non-stopword token found
        stop = True     # NOTE(review): unused — kept for fidelity
        word1 = ""
        word2 = ""
        if len(l1) != len(l2):
            return False
        else:
            for i in range(len(l1)):
                if l1[i].text != l2[i].text or l1[i].lemma_ != l2[i].lemma_:
                    if (flag2):
                        # Second substantive difference: not a one-word edit.
                        return False
                    elif l1[i].text in self.stopwords_en and l2[i].text in self.stopwords_en:
                        # Both differing tokens are stopwords: remember them
                        # but don't count as a substantive difference.
                        word1 = l1[i].text
                        word2 = l2[i].text
                        flag1 = True
                    else:
                        # Substantive difference: compare by lemma.
                        word1 = l1[i].lemma_
                        word2 = l2[i].lemma_
                        flag1 = True
                        flag2 = True
            if flag1:
                self.out = self.mini_similar(word1, word2)
            return True

    def similar(self, text, challenge):
        """Full similarity pipeline for two questions.

        Populates and returns `self.out`: entity agreement, sentiment gap,
        question-class match, keyword overlap and the Keras score are combined
        into `sim_per`, and `sim` is set via a keras/keyword threshold ladder.
        """
        if not isinstance(text, str) or not isinstance(challenge, str):
            q1 = text
            q2 = challenge
        else:
            # Normalize raw strings (normalizr config defined elsewhere).
            q1 = normalizr.normalize(text, normalizations)
            q2 = normalizr.normalize(challenge, normalizations)
        q1 = self.spell.correct_str(q1, True)
        q2 = self.spell.correct_str(q2, True)
        # Single-word inputs (or exact equality) short-circuit to mini_similar.
        if (len(q1.split()) == 1 and len(q2.split()) == 1) or (q1 == q2):
            return self.mini_similar(q1, q2)
        regex = re.compile(u'/')  # [^a-zA-Z]')
        q1 = regex.sub('', q1)
        q2 = regex.sub('', q2)
        # Fresh result skeleton for this comparison.
        self.out = {'sim': 0, 'sim_per': 0.0, 'keras': 0.0, 'class': ["", ""],
                    'f_class': 0, "sentiment": [0, 0, 0],
                    "keywords": [[""], [""]], "numbers": [[], []],
                    "entities": [[], []], "max_keywords": 0,
                    "keywords_sim": 0.0}
        # Tokens that appear in a negated context (NLTK mark_negation).
        q1_neg_list = list(set(mark_negation(q1.split())[0]))
        q2_neg_list = list(set(mark_negation(q2.split())[0]))
        if q1 == "" or q2 == "":
            return self.out
        sq1 = self.en_nlp(q1)
        sq2 = self.en_nlp(q2)
        # One-word-edit fast path: result already in self.out.
        if self.is_one_word(sq1, sq2):
            return self.out
        count = 0
        start_time = time.time()
        # Entities from NLTK, then augmented with spaCy entities not found.
        entsq1 = self.ent_nltk(q1)
        entsq2 = self.ent_nltk(q2)
        self.out['entities'][1] = entsq2
        self.out['entities'][0] = entsq1
        for ent in sq1.ents:
            if ent.text not in entsq1:
                # self.out['entities'][0].append([ent.label_, ent.text])
                self.out['entities'][0].append(ent.text)
        for ent in sq2.ents:
            if ent.text not in entsq2:
                # self.out['entities'][1].append((ent.label_, ent.text))
                self.out['entities'][1].append(ent.text)
        # Entity sets must match exactly, otherwise the questions differ.
        if self.out['entities'][0]:
            if self.out['entities'][1]:
                if (len(self.out['entities'][0]) != len(self.out['entities'][1])):
                    return self.out
                self.out['max_keywords'] += len(
                    set(self.out['entities'][0] + self.out['entities'][1]))
                for each in self.out['entities'][0]:
                    if (each in self.out['entities'][1]):
                        count += 1
                    else:
                        return self.out
            else:
                return self.out
        elif self.out['entities'][1]:
            return self.out
        elapsed_time = time.time() - start_time
        # Neural similarity score.
        self.out['keras'] = self.keras.similar(q1, q2)
        # VADER-style compound sentiment for each question.
        self.out['sentiment'][0] = get_sentiment_values(q1)[1]['compound']
        self.out['sentiment'][1] = get_sentiment_values(q2)[1]['compound']
        self.out['sentiment'][2] = abs(
            self.out['sentiment'][0] - self.out['sentiment'][1])
        # Strongly opposite sentiment means "not the same question".
        if (abs(self.out['sentiment'][0]) > 0.3 and abs(
                self.out['sentiment'][1]) > 0.3):
            if self.out['sentiment'][2] >= 0.6:
                return self.out
        start_time = time.time()
        # Question-type classification and keyword/number extraction.
        self.out['class'][0] = self.classifier.classify_question(sq1)
        self.out['class'][1] = self.classifier.classify_question(sq2)
        self.out['f_class'] = (self.out['class'][0] == self.out['class'][1])
        self.out['keywords'][0], self.out['numbers'][0] = extract_features(sq1)
        self.out['keywords'][1], self.out['numbers'][1] = extract_features(sq2)
        self.out['max_keywords'] += len(
            set(self.out['keywords'][0] + self.out['keywords'][1]))
        if self.out['class'][0] > 0 and self.out['class'][1] > 0:
            self.out['max_keywords'] += 1
        # Count shared keywords; negation mismatches penalize instead.
        for each in self.out['keywords'][0]:
            if each in self.out['keywords'][1]:
                if (each in q1_neg_list and each not in q2_neg_list) or (
                        each in q2_neg_list and each not in q1_neg_list):
                    # Same word but one side is negated: widen the denominator.
                    self.out['max_keywords'] += 1
                else:
                    if (each in self.stopwords_en):
                        # Stopword matches count less.
                        count += 0.30
                        # self.out['max_keywords'] -= 1
                    else:
                        count += 1
        # Numbers must agree when both sides contain them.
        if self.out['numbers'][0]:
            self.out['max_keywords'] += 1
            if self.out['numbers'][1]:
                self.out['max_keywords'] += 1
            if self.out['numbers'][1] != self.out['numbers'][0]:
                return self.out
        elif self.out['numbers'][1]:
            self.out['max_keywords'] += 1
        if self.out['class'][0] > 0 and self.out['class'][1] > 0:
            self.out['max_keywords'] += 1
        # Matching question classes add a partial credit.
        if self.out['f_class']:
            if self.out['max_keywords'] > 1:
                count += 1
            else:
                count += 0.35
        # keywords_s1= [x for x in keywords_s1 if x not in keywords_s2]
        # keywords_s3= [x for x in keywords_s2 if x not in keywords_s1]
        if self.out['max_keywords'] < 1:
            self.out['keywords_sim'] = 0
        else:
            self.out['keywords_sim'] = (count / self.out['max_keywords']) * 100
        # Final percentage: mean of keyword overlap and keras score.
        self.out['sim_per'] = (self.out['keywords_sim'] + self.out['keras']) / 2.0
        # print(self.out['keywords_sim'], count, self.out['max_keywords'])
        '''
        k_value = []
        s_value = []
        k = 100.0
        s = 30.0
        k_step = 10.0
        s_step = 4.0
        self.out["sim_per"] = (self.out['keywords_sim'] + self.out['keras']) / 2
        for i in range(7):
            k -= k_step
            s += s_step
            k_value.append(k)
            s_value.append(s)
        '''
        # Threshold ladder: the higher the keras score, the lower the keyword
        # overlap required to declare a match.
        s_value = [34.0, 40.0, 50.0, 55.0, 60.0, 60.0, 60.0]
        k_value = [90.0, 85.0, 80.0, 75.0, 70.0, 60.0, 30.0]
        if self.out['keras'] >= k_value[0]:
            if self.out['keywords_sim'] >= s_value[0]:
                self.out['sim'] = 1
            return self.out
        elif self.out['keras'] > k_value[1]:
            if self.out['keywords_sim'] >= s_value[1]:
                self.out['sim'] = 1
            return self.out
        elif self.out['keras'] > k_value[2]:
            if self.out['keywords_sim'] >= s_value[2]:
                self.out['sim'] = 1
            return self.out
        elif self.out['keras'] > k_value[3]:
            if self.out['keywords_sim'] >= s_value[3]:
                self.out['sim'] = 1
            return self.out
        elif self.out['keras'] > k_value[4]:
            if self.out['keywords_sim'] >= s_value[4]:
                self.out['sim'] = 1
            return self.out
        elif self.out['keras'] > k_value[5]:
            if self.out['keywords_sim'] >= s_value[5]:
                self.out['sim'] = 1
            return self.out
        elif self.out['keras'] > k_value[6]:
            if self.out['keywords_sim'] >= s_value[6]:
                self.out['sim'] = 1
            return self.out
        return self.out

    def similarr(self, text, questions=list()):
        """Return (question_id, confidence) of the best match for `text` among
        `questions` (dicts with 'question' and 'id' keys).

        NOTE(review): mutable default argument `questions=list()` is shared
        across calls — it is only read here, but worth confirming upstream.
        """
        answer, max_similarity = None, 0
        if not text or len(questions) == 0:
            return answer, max_similarity
        for question in questions:
            try:
                result = self.similar(text.lower(),
                                      question.get('question').lower())
            except:
                # Fall back to raw strings (e.g. non-str inputs).
                result = self.similar(text, question.get('question'))
            if result.get('sim') == 1:
                confidence = result.get('sim_per')
                if max_similarity <= confidence <= 100:
                    max_similarity = confidence
                    answer = question.get('id')
            # print("round stop\n")
            if max_similarity >= 95:
                # Good enough — stop scanning.
                break
                # print('[Stop]')
        return answer, max_similarity

    def get_suggestions(self, text=None, texts=list()):
        """Return up to 3 suggestion dicts for `text`, each with confidence,
        question id and formatted messages; candidates must score above 45."""
        res = []
        s = []
        min_confidence = 45
        for each in texts:
            result = self.similar(text, each.get('question').lower())
            if result.get('sim') == 1:
                confidence = result.get('sim_per')
                if 100 >= confidence > min_confidence:
                    if each.get('rich_text'):
                        response = each.get('rich_text')
                    else:
                        # Response like 'flow-<id>' points at a Flow object.
                        flow = int(each.get('response').replace('flow-', ''))
                        flow = Flow.objects.filter(id=flow).values(
                            'id', 'name', 'category__name')
                        if flow.exists():
                            response = [{'flow': flow}]
                        else:
                            response = None
                    if response:
                        res.append((confidence, each.get('id'), response,
                                    each.get('question')))
        # Keep the top three by confidence.
        s = sorted(res, key=operator.itemgetter(0), reverse=True)[:3]
        suggestions = []
        for e in s:
            if e[2]:
                messages = []
                for m in e[2]:
                    messages.append({'message': format_message(m)})
                suggestions.append({'confidence': e[0], 'id': e[1],
                                    'message': messages})
        return suggestions
class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Transform input text into a feature representation (one-hot, WordNet
    similarity, word2vec similarity, or both similarity families).

    Fixes for Python 3: `dict.iteritems()` -> `items()`, and the
    `max(map(...) + [0.0])` idiom now materializes the map first (in Python 3
    `map` returns an iterator, which cannot be concatenated with a list).
    `transform` likewise returns lists instead of lazy map objects.
    """

    def __init__(self, corpus, feature_num=10, model='onehot',
                 wn_method='path',
                 vec_file='models/GoogleNews-vectors-negative300.bin',
                 binary=True):
        """
        :param corpus: iterable of (sentence, category) pairs used to pick
            the most frequent per-category feature words
        :param feature_num: number of feature words kept per category
        :param model: 'onehot', 'wordnet', 'word2vec' or 'both'
        :param wn_method: WordNet similarity method name
        :param vec_file: path to the binary word2vec model
        :param binary: whether `vec_file` is in binary format
        """
        self._model = model
        self._wn_method = wn_method
        self._features = self.extract_features(corpus, feature_num)
        # Lazily instantiate only the similarity back-ends that are needed.
        self._wns = WordNetSimilarity() \
            if model == 'wordnet' or model == 'both' else None
        self._wvs = WordVecSimilarity(vec_file, binary) \
            if model == 'word2vec' or model == 'both' else None

    def fit(self, X, y=None):
        """No-op: this transformer is configured entirely in __init__."""
        return self

    def inverse_transform(self, X):
        """Identity — the transformation is not invertible."""
        return X

    def extract_features(self, corpus, feature_num=10):
        """Return the set of the `feature_num` most common lemmatized words
        per category across `corpus`."""
        cat_word = {}
        for sent, cat in corpus:
            cat_word.setdefault(cat, []).extend(
                lemmatization(word_tokenize(sent)))
        features = {cat: Counter(cat_word[cat]) for cat in cat_word}
        feature_words = []
        for c, f in features.items():  # was iteritems() (Python 2 only)
            words, counts = zip(*f.most_common(feature_num))
            feature_words.extend(list(words))
        feature_words = set(feature_words)
        return feature_words

    def similarity(self, tokens, feature, method='wordnet'):
        """Max similarity between `feature` and any token in `tokens`;
        0.0 when `tokens` is empty."""
        if method == 'wordnet':
            sim = lambda x: self._wns.word_similarity(feature, x,
                                                      self._wn_method)
        else:
            sim = lambda x: self._wvs.word_similarity(feature, x)
        # list(...) keeps this working on Python 3 where map is lazy;
        # the appended 0.0 guards against an empty token set.
        return max(list(map(sim, tokens)) + [0.0])

    def unigram_features(self, tokens):
        """Boolean 'contains(w)' feature per feature word."""
        words = set(tokens)
        features = {}
        for f in self._features:
            features['contains({})'.format(f)] = (f in words)
        return features

    def wordnet_features(self, tokens):
        """WordNet-similarity feature per feature word."""
        words = set(tokens)
        features = {}
        for f in self._features:
            features['wns({})'.format(f)] = self.similarity(words, f)
        return features

    def word2vec_features(self, tokens):
        """word2vec-similarity feature per feature word."""
        words = set(tokens)
        features = {}
        for f in self._features:
            features['w2v({})'.format(f)] = self.similarity(
                words, f, method='word2vec')
        return features

    def semantic_features(self, tokens):
        """Both WordNet and word2vec similarity features per feature word."""
        words = set(tokens)
        features = {}
        for f in self._features:
            features['wns({})'.format(f)] = self.similarity(words, f)
            features['w2v({})'.format(f)] = self.similarity(
                words, f, method='word2vec')
        return features

    def transform(self, X):
        """Map each raw text in X to its feature dict for the chosen model."""
        tokenize = lambda x: lemmatization(word_tokenize(x))
        X_tokens = list(map(tokenize, X))  # materialize for Python 3
        if self._model == 'onehot':
            return list(map(self.unigram_features, X_tokens))
        elif self._model == 'wordnet':
            return list(map(self.wordnet_features, X_tokens))
        elif self._model == 'word2vec':
            return list(map(self.word2vec_features, X_tokens))
        elif self._model == 'both':
            return list(map(self.semantic_features, X_tokens))
from sematch.semantic.similarity import WordNetSimilarity
import codecs

wns = WordNetSimilarity()
# Score each generated poem line by the mean Li similarity of adjacent word
# pairs and append one score per line to data.txt.
# Fixes: `with` blocks guarantee the files close even on error, and a line
# with fewer than two words no longer raises ZeroDivisionError (scores 0).
with codecs.open('generatedpoems.txt', 'r', encoding='utf-8') as poems, \
        open('data.txt', 'a') as data:
    for line in poems:
        temp_words = line.split(" ")
        total = 0
        count = 0
        for y in range(len(temp_words) - 1):
            total += wns.word_similarity(temp_words[y], temp_words[y + 1], 'li')
            count += 1
        # Guard: single-word (or empty) lines previously divided by zero.
        average = total / count if count else 0
        data.write(str(average) + '\n')
# print(wns.word_similarity(w1, w2, 'li'))
from sematch.semantic.similarity import WordNetSimilarity

L1 = []
L2 = []
L3 = []
wns = WordNetSimilarity()


def _group_if_similar(words, bucket, threshold=0.7):
    """Append `words` to `bucket` when every pair of them scores above
    `threshold` using the Li WordNet similarity measure.

    Fixes over the original snippet: `else: continue` outside a loop was a
    SyntaxError; 'software engineer' was passed where the metric NAME belongs
    in word_similarity(w1, w2, method); the second similarity result was never
    assigned; and the checked literal 'softwrae program' (typo) did not match
    the appended 'software program'.
    """
    pairs = [(a, b) for i, a in enumerate(words) for b in words[i + 1:]]
    if all(wns.word_similarity(a, b, 'li') > threshold for a, b in pairs):
        bucket.extend(words)


# Computing English word similarity using the Li method
_group_if_similar(['programmer', 'coder', 'software engineer'], L1)
_group_if_similar(['software program', 'computer software', 'software system'], L1)
def yhmh_nlp(url, trigger_words):
    """Analyze the page at `url` with Google Cloud NL, distill its keywords,
    and return a JSON recommendation from the curated-article database.

    Fixes: `text is ""` compared identity instead of equality (and is a
    SyntaxWarning on modern Python) -> `text == ""`; the celebrity check now
    guards `entities[1]` so a single-entity result no longer raises
    IndexError.

    NOTE(review): `num_keywords`, `CURATED_LIST`, `parse_my_url` and the
    google.cloud `language`/`types`/`enums` names are defined outside this
    chunk — confirm they are in scope at the call site.
    """
    text, triggers = parse_my_url(url, trigger_words)
    print("triggers2: %s" % (triggers))
    # BUG FIX: was `text is ""` (identity test on a str literal).
    if text == "" or len(triggers) == 0:
        return ""
    client = language.LanguageServiceClient()
    if isinstance(text, six.binary_type):
        text = text.decode('utf-8')
    # Instantiates a plain text document.
    document = types.Document(content=text,
                              type=enums.Document.Type.PLAIN_TEXT)
    # Detects entities in the document. You can also analyze HTML with:
    # document.type == enums.Document.Type.HTML
    entities = client.analyze_entities(document).entities
    verbose = True
    counter = 0
    counter2 = 0
    # Columns: entity name, entity type name, salience.
    text_output_array = pd.DataFrame(np.zeros((len(entities), 3)))
    for entity in entities:
        entity_type = enums.Entity.Type(entity.type)
        # Skip very long names and anything containing a dot (URLs etc.).
        if len(entity.name) < 25 and '.' not in entity.name:
            text_output_array.iloc[counter, 0] = entity.name
            text_output_array.iloc[counter, 1] = entity_type.name
            text_output_array.iloc[counter, 2] = entity.salience
            counter += 1
        else:
            counter2 += 1
    # A leading PERSON entity with a Wikipedia page marks a "celebrity" page.
    celebrity_status = 0
    if len(entities) > 0:
        if entities[0].metadata.get('wikipedia_url', '-') != '-' \
                and text_output_array.iloc[0, 1] == 'PERSON':
            celebrity_status = 1
        # BUG FIX: guard entities[1] — one-entity results raised IndexError.
        elif len(entities) > 1 \
                and entities[1].metadata.get('wikipedia_url', '-') \
                and text_output_array.iloc[1, 1] == 'PERSON':
            celebrity_status = 1
        else:
            celebrity_status = 0
    # Trim the rows that were skipped above.
    text_output_array = text_output_array.iloc[0:len(entities) - counter2, :]
    # Detects the sentiment of the text
    # sentiment = client.analyze_sentiment(document=document).document_sentiment
    wns = WordNetSimilarity()
    keywords_target = pd.Series.to_list(text_output_array[0])
    # keywords_target = list(set(keywords_target))
    # seen = set(keywords_target)
    # keywords_target = []
    # for x in keywords_target:
    #     if x not in seen:
    #         keywords_target.append(x)
    #         seen.add(x)
    # keywords_target = seen
    forbidden_keywords = [
        'medicine', 'drug', 'fun', 'hospital', 'suicide', 'death', 'mental',
        'health', 'illness', 'insta', ',man', 'woman', 'family', 'people',
        'many', 'place', 'same', 'others', 'brain', 'all', 'end', 'statement',
        'lot', 'condolences'
    ]
    # Successive regex passes strip capitalized words, @handles and acronyms.
    regex = re.compile(r'([A-Z]([a-z])+)')
    selected_files = list(filter(regex.search, keywords_target))
    res = list(set(keywords_target) - set(selected_files))
    regex = re.compile(r'^@')
    selected_files = list(filter(regex.search, res))
    res = list(set(keywords_target) - set(selected_files))
    regex = re.compile(r"\b[A-Z][A-Z]+\b")
    selected_files = list(filter(regex.search, res))
    res = list(set(res) - set(selected_files))
    regex = re.compile(r'([A-Z]([a-z])+)')
    selected_files = list(filter(regex.search, res))
    res = list(set(res) - set(selected_files))
    # Keep only the first token of multi-word keywords.
    for key in range(len(res)):
        if ' ' in res[key]:
            res[key] = res[key].split(' ')[0]
    # Blank out forbidden keywords, then drop the blanks.
    for x in range(len(res)):
        for y in range(len(forbidden_keywords)):
            if res[x] == forbidden_keywords[y]:
                res[x] = []
    res = list(filter(None, res))
    # Ten most frequent keywords, in frequency order.
    res_dictionary = Counter(res)
    res_output = res_dictionary.most_common(10)
    res_output = dict(res_output)
    res_output = list(res_output.keys())
    print(res_output)
    res = res_output[0:num_keywords]
    database = pd.read_csv(
        CURATED_LIST
    )  # ('/Users/vmutai/Projects/HMH/admin/microblog/app/yhmh_curated_articles.csv')
    # Restrict candidates to the matching celebrity bucket.
    if celebrity_status == 1:
        database = database[database.celebrity == 1]
    elif celebrity_status == 0:
        database = database[database.celebrity == 0]
    # Rank every curated article by mean pairwise Lin similarity between its
    # keywords and the page's keywords.
    similarity_ranks = pd.DataFrame(np.zeros(database.shape[0]))
    for z in range(database.shape[0]):
        newlist = []
        N_rows = len(res)
        keywords_source = database.iloc[z, 4:4 + num_keywords]
        keywords_source = pd.Series.tolist(keywords_source)
        N_cols = len(keywords_source)
        # similarity_list = pd.DataFrame(np.zeros((N_rows, N_cols)))
        foo = [1]  # seed keeps count_nonzero >= 1 (avoids divide-by-zero)
        for x in range(len(res)):
            for y in range(len(keywords_source)):
                value = wns.word_similarity(res[x], keywords_source[y], 'lin')
                # similarity_matrix.at[x, y] = value
                foo.append(value)
        matrix_average = sum(foo) / np.count_nonzero(foo)
        similarity_ranks.at[z, 0] = matrix_average
    maximum = pd.DataFrame.idxmax(similarity_ranks)
    url_to_return = pd.Series.tolist(database.iloc[maximum, 0])
    print(url_to_return)
    title = pd.Series.tolist(database.iloc[maximum, 1])

    def output(title, res_output, url_to_return):
        # Package the recommendation; fall back to a sentinel string when the
        # payload is not JSON-serializable.
        a = {
            'header': title[0],
            'keywords_list': res_output,
            'url_recommendation': url_to_return[0]
        }
        print("JSON DUMP")
        print(a)
        try:
            return json.dumps(a)
        except:
            return "awesome2!"

    json_output = output(title, res_output, url_to_return)
    print(json_output)
    return json_output
from nltk.corpus import wordnet_ic
from nltk.corpus.reader.wordnet import information_content

# Brown-corpus information content table for IC-based measures.
brown_ic = wordnet_ic.ic('ic-brown.dat')
wns = WordNetSimilarity()  # NOTE(review): WordNetSimilarity import not visible in this chunk
# arg1 and arg2: predicates represented in strings separated by underscores
# e.g. cast_member or star
preA = sys.argv[1].split("_")
preB = sys.argv[2].split("_")
# arg3: pairwise similarity matrix in which rows are separated by underscore
# e.g. 0.6_0.5, or 0.6,0.7_0.3,0.4
# Wu & Palmer similarity for every (a, b) token pair.
data = []
for a in preA:
    row = []
    for b in preB:
        wdsim = wns.word_similarity(a, b, 'wup')
        row.append(wdsim)
    data.append(row)
data = numpy.matrix(data)
# max values in rows
Amax = data.max(1)
# Information content per token of preA; tokens WordNet doesn't know (or
# with infinite IC) default to 1.
icA = []
for i in range(len(preA)):
    try:
        if lesk(preA, preA[i]) is None:  # preA is not in WordNet
            icA.append(1)
        elif information_content(lesk(preA, preA[i]), brown_ic) == 'inf':
            icA.append(1)
        else:
            icA.append(information_content(lesk(preA, preA[i]), brown_ic))
        # NOTE(review): this chunk is truncated — the `except` clause for the
        # `try` above (and any further processing) lies outside this view.
# Accumulators for the patent/TRIZ matching pass.
matriceListe = []
matricelistePaire = []
matricelistePaireSort = []
matricelistePaireAction = []
matricelistePaireObject = []
# NOTE(review): `sorted_terms_lists`, `listcara`, `wns`, `i`, `NumberBrevet`,
# `abstract` and `urlEspacenet` are defined outside this chunk — confirm scope.
for word in sorted_terms_lists:
    tokens = word
    for index, row in listcara.iterrows():
        # NOTE(review): 'abs'.format(...) has no placeholder, so this always
        # yields 'abs' — probably meant 'abs{}'.format(i); confirm intent.
        abstractNumber = 'abs'.format(str((i)))
        listaction = row['Colonne3']
        # Strip any parenthesized text from the action label.
        listaction = re.sub(r'\([^)]*\)', '', listaction)
        # Comparison between the tag and the TRIZ action class.
        indiceSimAction = wns.word_similarity(word, str(listaction))
        if indiceSimAction == 0 or word.isdigit() == True:
            # print "rien a faire "  (French: "nothing to do")
            continue
        else:
            # Build one CSV row for this (word, action) match.
            valeurs = []
            valeurs = [
                i, NumberBrevet, word, listaction, indiceSimAction, abstract,
                urlEspacenet
            ]
            ligne = ",".join(str(v) for v in valeurs) + "\n"
            # NOTE(review): chunk truncated — the consumer of `ligne`
            # (presumably a file write) lies outside this view.
    # NOTE(review): tail of a text-cleaning function whose definition starts
    # outside this chunk; indentation assumed.
    clean.append(t)
    return clean


cleanCleanCat1 = cleanTexts(categoryList1)
cleanCleanCat2 = cleanTexts(categoryList2)
wns = WordNetSimilarity()
# For each category in list 1, find the best-matching category in list 2 by
# summed pairwise Li word similarity.
similarCategories = []
for cat in cleanCleanCat1:
    sims = []
    for t in cleanCleanCat2:
        TextSim = []
        for w in cat:
            # wdsSim=[1 if w == wr else wns.word_similarity(w, wr, 'li') for wr in t]
            wdsSim = [wns.word_similarity(w, wr, 'li') for wr in t]
            TextSim.extend(wdsSim)
        # Record (candidate index, total similarity) for this candidate.
        sims.append((cleanCleanCat2.index(t), sum(TextSim)))
    if max(sims, key=lambda x: x[1])[1] > 0:
        # Keep the best candidate's index and score.
        similarCategories.append(
            (max(sims, key=lambda x: x[1])[0], max(sims, key=lambda x: x[1])[1]))
    else:
        # No candidate scored above zero.
        similarCategories.append('')
    print('{0} texts out of {1} done'.format(
        cleanCleanCat1.index(cat) + 1, len(cleanCleanCat1)))
with open('S:/path/In-market audiences_sim.csv', 'w', newline='',
          encoding='utf-8') as csvfile:
    # NOTE(review): chunk truncated — the CSV-writing body lies outside this view.
from sematch.semantic.similarity import WordNetSimilarity

wns = WordNetSimilarity()
# English word similarity, Li measure               -> ~0.449327301063
wns.word_similarity('dog', 'cat', 'li')
# Spanish (monolingual), Lin measure                -> ~0.876800984373
wns.monol_word_similarity('perro', 'gato', 'spa', 'lin')
# Chinese (monolingual), Wu & Palmer measure        -> ~0.857142857143
wns.monol_word_similarity('狗', '猫', 'cmn', 'wup')
# Spanish-English (cross-lingual), Resnik measure   -> ~7.91166650904
wns.crossl_word_similarity('perro', 'cat', 'spa', 'eng', 'res')
# Spanish-Chinese (cross-lingual), Jiang & Conrath  -> ~0.31023804699
wns.crossl_word_similarity('perro', '猫', 'spa', 'cmn', 'jcn')
# Chinese-English (cross-lingual), WPath measure    -> ~0.593666388463
wns.crossl_word_similarity('狗', 'cat', 'cmn', 'eng', 'wpath')