def test_word_similarity():
    from sematch.semantic.similarity import WordNetSimilarity
    wns = WordNetSimilarity()
    dog = wns.word2synset('dog')
    cat = wns.word2synset('cat')
    # Measuring semantic similarity between concepts using the Path method
    assert wns.similarity(dog[0], cat[0], 'path') is not None  # 0.2
    # Computing English word similarity using the Li method
    assert wns.word_similarity('dog', 'cat', 'li') is not None  # 0.449327301063
    # Computing Spanish word similarity using the Lin method
    assert wns.monol_word_similarity('perro', 'gato', 'spa', 'lin') is not None  # 0.876800984373
    # Computing Chinese word similarity using the Wu & Palmer method
    assert wns.monol_word_similarity('狗', '猫', 'cmn', 'wup') is not None  # 0.857142857143
    # Computing Spanish and English word similarity using the Resnik method
    assert wns.crossl_word_similarity('perro', 'cat', 'spa', 'eng', 'res') is not None  # 7.91166650904
    # Computing Spanish and Chinese word similarity using the Jiang & Conrath method
    assert wns.crossl_word_similarity('perro', '猫', 'spa', 'cmn', 'jcn') is not None  # 0.31023804699
    # Computing Chinese and English word similarity using the WPath method
    assert wns.crossl_word_similarity('狗', 'cat', 'cmn', 'eng', 'wpath') is not None  # 0.593666388463
from sematch.semantic.similarity import WordNetSimilarity


def map_subjects(subjects: list, filter_dis=0.2):
    # Pair the subject lists against each other and keep only the (i, j)
    # word pairs whose similarity exceeds filter_dis.
    # Example input: [['中国人', '安乐死'], ['太阳', '很好']]
    wns = WordNetSimilarity()
    pair = []       # similarity scores of the kept pairs
    pair_idxs = []  # index pairs to return: ([row, column], [row, column])
    for index, value in enumerate(subjects):
        i = index + 1
        while i < len(subjects):
            # compare the current list with each following list
            com_value = subjects[i]
            for v in value:
                for cv in com_value:
                    pair_distance = wns.monol_word_similarity(v, cv, 'cmn', 'wup')
                    # print(f'{v} -> {cv}: {pair_distance}')
                    if pair_distance > filter_dis:
                        pair.append(pair_distance)
                        pair_idxs.append(([index, value.index(v)], [i, com_value.index(cv)]))
            i += 1
    return pair_idxs
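# A minimal usage sketch for map_subjects above, reusing the sample input from
# its comment; the exact index pairs returned depend on the Chinese WordNet
# data installed locally, so the printed result is only illustrative.
if __name__ == '__main__':
    subjects = [['中国人', '安乐死'], ['太阳', '很好']]
    matched = map_subjects(subjects, filter_dis=0.2)
    print(matched)  # e.g. [([0, 0], [1, 0]), ...] for word pairs above the threshold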
def controlledSetWordNetSimilarity(self, word, similarWords):
    wns = WordNetSimilarity()
    for similarWord in similarWords.copy():
        # Threshold controlling how strict the control set is: keep only
        # candidates whose Li similarity to the target word is >= 0.9996.
        if wns.word_similarity(word, similarWord, 'li') < 0.9996:
            similarWords.discard(similarWord)
    return similarWords
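# A self-contained sketch of the same control-set filtering idea as
# controlledSetWordNetSimilarity above, without the enclosing class; the 0.9996
# threshold is copied from the method and the example words are only illustrative.
from sematch.semantic.similarity import WordNetSimilarity


def filter_control_set(word, candidates, threshold=0.9996):
    wns = WordNetSimilarity()
    # keep only candidates whose Li similarity to the target word reaches the threshold
    return {c for c in candidates if wns.word_similarity(word, c, 'li') >= threshold}

# filter_control_set('dog', {'dog', 'puppy', 'banana'}) keeps only near-synonyms of 'dog'.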
def __init__(self, wsd_method='maxsim', sim_name='wpath'):
    '''
    wsd_method options: ['random_sense', 'first', 'frequent', 'maxsim', 'graph', 'lesk', 'naive']
    sim_name options: ['path', 'lch', 'wup', 'li', 'res', 'lin', 'jcn', 'wpath']
    '''
    self._method = wsd_method
    self._sim_name = sim_name
    self._wn_sim = WordNetSimilarity()
def test_language():
    from sematch.semantic.similarity import WordNetSimilarity
    wns = WordNetSimilarity()
    # check the supported languages
    assert wns.languages() is not None
    # find the language code
    assert wns.languages('English') is not None
    assert wns.languages('chinese_simplified') is not None
    assert wns.languages('spanish') is not None
def __init__(self):
    self.out = {}
    self.keras = keras_similar()
    self.classifier = Qclassifier()
    self.spell = Spelling()
    self.wn = WordNetSimilarity()
    self.en_nlp = spacy.load("en_core_web_md")
    self.stopwords_en = []
    with open(os.path.join(os.path.dirname(os.path.realpath(__file__)),
                           'utils', 'stopwords_en.txt')) as f:
        self.stopwords_en = f.read().splitlines()
def test_classification_evaluation():
    from sematch.evaluation import AspectEvaluation
    from sematch.application import SimClassifier, SimSVMClassifier
    from sematch.semantic.similarity import WordNetSimilarity
    evaluation = AspectEvaluation()
    X, y = evaluation.load_dataset()
    wns = WordNetSimilarity()
    word_sim = lambda x, y: wns.word_similarity(x, y)
    simclassifier = SimClassifier.train(zip(X, y), word_sim)
    evaluation.evaluate(X, y, simclassifier)
    simSVMclassifier = SimSVMClassifier.train(X, y, word_sim)
    evaluation.evaluate(X, y, simSVMclassifier)
def semantic_matching(trend_one, trend_two):
    threshold = 0.3
    trend_one_processed = text_processing(trend_one, keep_spaces=True)
    trend_two_processed = text_processing(trend_two, keep_spaces=True)
    # The options are WordNet, YAGO and DBpedia (only the first seems usable)
    wns = WordNetSimilarity()
    matches = list({
        x['original']
        for x in trend_one_processed
        for y in trend_two_processed
        if wns.word_similarity(x['processed'], y['processed'], 'li') > threshold
    })
    if len(matches) == 0:
        return 'No matches'
    return matches
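# A self-contained sketch of the thresholded matching idea used by
# semantic_matching above, skipping the project-specific text_processing step
# (plain token lists are passed in; the 0.3 threshold is copied from the function).
from sematch.semantic.similarity import WordNetSimilarity


def match_tokens(tokens_one, tokens_two, threshold=0.3):
    wns = WordNetSimilarity()
    # keep tokens from the first list that are similar enough to any token in the second
    return sorted({
        a for a in tokens_one for b in tokens_two
        if wns.word_similarity(a, b, 'li') > threshold
    })

# match_tokens(['storm', 'flood'], ['hurricane', 'rain']) returns the matching tokens.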
def test_sim_graph():
    from sematch.semantic.graph import SimGraph
    from sematch.semantic.similarity import WordNetSimilarity
    from sematch.nlp import Extraction, lemmatization
    from sematch.sparql import EntityFeatures
    from collections import Counter
    tom_cruise = EntityFeatures().features('http://dbpedia.org/resource/Tom_Cruise')
    words = Extraction().extract_words_sent(tom_cruise['abstract'])
    words = list(set(lemmatization(words)))
    wns = WordNetSimilarity()
    word_graph = SimGraph(words, wns.word_similarity)
    word_scores = word_graph.page_rank()
    words, scores = zip(*Counter(word_scores).most_common(10))
    assert words is not None
def test_wordsim_evaluation():
    from sematch.evaluation import WordSimEvaluation
    from sematch.semantic.similarity import WordNetSimilarity
    wordsim_eval = WordSimEvaluation()
    wns = WordNetSimilarity()
    # define similarity metrics
    lin = lambda x, y: wns.word_similarity(x, y, 'lin')
    wpath = lambda x, y: wns.word_similarity_wpath(x, y, 0.8)
    # evaluate similarity metrics
    assert wordsim_eval.evaluate_multiple_metrics({
        'lin': lin,
        'wpath': wpath
    }, 'noun_simlex') is not None
    # perform Steiger's Z significance test
    assert wordsim_eval.statistical_test('wpath', 'lin', 'noun_simlex') is not None
def test_wordsim_evaluation():
    from sematch.evaluation import WordSimEvaluation
    from sematch.semantic.similarity import WordNetSimilarity
    evaluation = WordSimEvaluation()
    print(evaluation.dataset_names())
    wns = WordNetSimilarity()
    # define similarity metrics
    wpath = lambda x, y: wns.word_similarity_wpath(x, y, 0.8)
    # evaluate similarity metrics
    print(evaluation.evaluate_metric('wpath', wpath, 'noun_simlex'))
    # perform Steiger's Z significance test
    print(evaluation.statistical_test('wpath', 'path', 'noun_simlex'))
    wpath_es = lambda x, y: wns.monol_word_similarity(x, y, 'spa', 'path')
    wpath_en_es = lambda x, y: wns.crossl_word_similarity(x, y, 'eng', 'spa', 'wpath')
    print(evaluation.evaluate_metric('wpath_es', wpath_es, 'rg65_spanish'))
    print(evaluation.evaluate_metric('wpath_en_es', wpath_en_es, 'rg65_EN-ES'))
def test_query_ned():
    from sematch.nlp import FeatureExtractor
    from sematch.nlp import EntityFeature
    from sematch.nlp import SpaCyNLP
    from sematch.utility import FileIO
    from sematch.semantic.relatedness import TextRelatedness
    from sematch.semantic.similarity import WordNetSimilarity
    from sematch.nel import EntityDisambiguation
    from sklearn.metrics import precision_recall_fscore_support
    import itertools
    import warnings

    sy = SpaCyNLP()
    features = EntityFeature.load(feature_dict_file='models/query_features.json')
    extractor = FeatureExtractor(features, sy.pos_tag)
    ned = EntityDisambiguation(extractor)
    rel = TextRelatedness()
    wns = WordNetSimilarity()
    # print(wns.word_similarity('cooling', 'air_conditioner', 'li'))
    # similarity = lambda x, y: rel.text_similarity(x, y, model='lsa')
    query = FileIO.read_json_file('dataset/ned/query_ned_cleaned.txt')
    query = [q for q in query if extractor.context_features(q['query'])]
    print(len(query))
    warnings.filterwarnings("ignore")
    metrics = ['path', 'wup', 'res', 'lin', 'jcn', 'wpath']
    for m in metrics:
        print(m)
        similarity = lambda x, y: wns.word_similarity(x, y, m)
        for k in range(1, 21):
            gold = []
            predict = []
            for q in query:
                gold.append(q['gold'])
                # e = ned.text_disambiguate(q['query'], q['candidate'], similarity)
                e = ned.word_disambiguate(q['query'], q['candidate'], similarity, K=k)
                predict.append(e)
            # from sklearn.metrics import classification_report
            # print(classification_report(gold, predict))
            print(precision_recall_fscore_support(gold, predict, average='weighted')[2])
def __init__(self,
             corpus,
             feature_num=10,
             model='onehot',
             wn_method='path',
             vec_file='models/GoogleNews-vectors-negative300.bin',
             binary=True):
    """
    :param corpus: use a corpus to train a vector representation
    :param feature_num: number of dimensions
    :param model: 'onehot', 'wordnet', 'word2vec' or 'both'
    """
    self._model = model
    self._wn_method = wn_method
    self._features = self.extract_features(corpus, feature_num)
    self._wns = WordNetSimilarity() if model == 'wordnet' or model == 'both' else None
    self._wvs = WordVecSimilarity(vec_file, binary) if model == 'word2vec' or model == 'both' else None
def yhmh_nlp(url, trigger_words):
    text, triggers = parse_my_url(url, trigger_words)
    print("triggers2: %s" % (triggers))
    if text == "" or len(triggers) == 0:
        return ""
    client = language.LanguageServiceClient()
    if isinstance(text, six.binary_type):
        text = text.decode('utf-8')
    # Instantiates a plain text document.
    document = types.Document(content=text, type=enums.Document.Type.PLAIN_TEXT)
    # Detects entities in the document. You can also analyze HTML with:
    # document.type == enums.Document.Type.HTML
    entities = client.analyze_entities(document).entities
    verbose = True
    counter = 0
    counter2 = 0
    text_output_array = pd.DataFrame(np.zeros((len(entities), 3)))
    for entity in entities:
        entity_type = enums.Entity.Type(entity.type)
        if len(entity.name) < 25 and '.' not in entity.name:
            text_output_array.iloc[counter, 0] = entity.name
            text_output_array.iloc[counter, 1] = entity_type.name
            text_output_array.iloc[counter, 2] = entity.salience
            counter += 1
        else:
            counter2 += 1
    celebrity_status = 0
    if len(entities) > 0:
        if entities[0].metadata.get('wikipedia_url', '-') != '-' and \
                text_output_array.iloc[0, 1] == 'PERSON':
            celebrity_status = 1
        elif len(entities) > 1 and entities[1].metadata.get('wikipedia_url', '-') != '-' and \
                text_output_array.iloc[1, 1] == 'PERSON':
            celebrity_status = 1
        else:
            celebrity_status = 0
    text_output_array = text_output_array.iloc[0:len(entities) - counter2, :]
    # Detects the sentiment of the text
    # sentiment = client.analyze_sentiment(document=document).document_sentiment
    wns = WordNetSimilarity()
    keywords_target = pd.Series.to_list(text_output_array[0])
    # keywords_target = list(set(keywords_target))
    # seen = set(keywords_target)
    # keywords_target = []
    # for x in keywords_target:
    #     if x not in seen:
    #         keywords_target.append(x)
    #         seen.add(x)
    #
    # keywords_target = seen
    forbidden_keywords = [
        'medicine', 'drug', 'fun', 'hospital', 'suicide', 'death', 'mental',
        'health', 'illness', 'insta', ',man', 'woman', 'family', 'people',
        'many', 'place', 'same', 'others', 'brain', 'all', 'end', 'statement',
        'lot', 'condolences'
    ]
    regex = re.compile(r'([A-Z]([a-z])+)')
    selected_files = list(filter(regex.search, keywords_target))
    res = list(set(keywords_target) - set(selected_files))
    regex = re.compile(r'^@')
    selected_files = list(filter(regex.search, res))
    res = list(set(keywords_target) - set(selected_files))
    regex = re.compile(r"\b[A-Z][A-Z]+\b")
    selected_files = list(filter(regex.search, res))
    res = list(set(res) - set(selected_files))
    regex = re.compile(r'([A-Z]([a-z])+)')
    selected_files = list(filter(regex.search, res))
    res = list(set(res) - set(selected_files))
    for key in range(len(res)):
        if ' ' in res[key]:
            res[key] = res[key].split(' ')[0]
    for x in range(len(res)):
        for y in range(len(forbidden_keywords)):
            if res[x] == forbidden_keywords[y]:
                res[x] = []
    res = list(filter(None, res))
    res_dictionary = Counter(res)
    res_output = res_dictionary.most_common(10)
    res_output = dict(res_output)
    res_output = list(res_output.keys())
    print(res_output)
    res = res_output[0:num_keywords]
    database = pd.read_csv(
        CURATED_LIST
    )  # ('/Users/vmutai/Projects/HMH/admin/microblog/app/yhmh_curated_articles.csv')
    if celebrity_status == 1:
        database = database[database.celebrity == 1]
    elif celebrity_status == 0:
        database = database[database.celebrity == 0]
    similarity_ranks = pd.DataFrame(np.zeros(database.shape[0]))
    for z in range(database.shape[0]):
        newlist = []
        N_rows = len(res)
        keywords_source = database.iloc[z, 4:4 + num_keywords]
        keywords_source = pd.Series.tolist(keywords_source)
        N_cols = len(keywords_source)
        # similarity_list = pd.DataFrame(np.zeros((N_rows, N_cols)))
        foo = [1]
        for x in range(len(res)):
            for y in range(len(keywords_source)):
                value = wns.word_similarity(res[x], keywords_source[y], 'lin')
                # similarity_matrix.at[x, y] = value
                foo.append(value)
        matrix_average = sum(foo) / np.count_nonzero(foo)
        similarity_ranks.at[z, 0] = matrix_average
    maximum = pd.DataFrame.idxmax(similarity_ranks)
    url_to_return = pd.Series.tolist(database.iloc[maximum, 0])
    print(url_to_return)
    title = pd.Series.tolist(database.iloc[maximum, 1])

    def output(title, res_output, url_to_return):
        a = {
            'header': title[0],
            'keywords_list': res_output,
            'url_recommendation': url_to_return[0]
        }
        print("JSON DUMP")
        print(a)
        try:
            return json.dumps(a)
        except:
            return "awesome2!"

    json_output = output(title, res_output, url_to_return)
    print(json_output)
    return json_output
def test_synset_expand():
    from sematch.semantic.similarity import WordNetSimilarity
    wns = WordNetSimilarity()
    cat = wns.word2synset('cat')[0]
    assert wns.synset_expand(cat) is not None
from flask import Flask, json, request, render_template as template
from sematch.application import Matcher
from sematch.semantic.similarity import ConceptSimilarity, WordNetSimilarity
from sematch.semantic.similarity import YagoTypeSimilarity, EntitySimilarity
from sematch.semantic.graph import DBpediaDataTransform, Taxonomy
import os

DEBUG = True
SECRET_KEY = 'Secret_development_key'
DATA_FILE = 'data/data.txt'

app = Flask(__name__)
app.config.from_object(__name__)

wn_sim = WordNetSimilarity()
yago_sim = YagoTypeSimilarity()
matcher = Matcher()
dbpedia_sim = ConceptSimilarity(Taxonomy(DBpediaDataTransform()),
                                'models/dbpedia_type_ic.txt')
entity = EntitySimilarity()

from search import text_lsa, text_tfidf, data


@app.route('/api/text_search')
def text_search():
    query = request.args.get('query')
    result = text_tfidf.search(query)
    result_data = []
from datetime import datetime
from csv import DictReader
from math import exp, log, sqrt
from random import random, shuffle
import pickle
import sys
import string

import numpy as np
from sematch.semantic.similarity import WordNetSimilarity
from config import path

wns = WordNetSimilarity()

# Make sure these characters are treated as punctuation; all of them are
# already part of string.punctuation, so this is effectively a no-op.
string.punctuation += '!!()?.,'

# from gensim.models import Word2Vec
# model = Word2Vec.load_word2vec_format(path + 'GoogleNews-vectors-negative300.bin', binary=True)  # C binary format
# print(model.vocab)
model = None


def remove_punctuation(x):
    new_line = [w for w in list(x) if w not in string.punctuation]
    new_line = ''.join(new_line)
    return new_line
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sematch.semantic.similarity import WordNetSimilarity

WNS = WordNetSimilarity()

# NOTE: For reference see:
# https://pdfs.semanticscholar.org/1374/617e135eaa772e52c9a2e8253f49483676d6.pdf


def random_sentences(num_rand_sentences, df_main):
    """Select num_rand_sentences rows at random from the DataFrame.

    Args:
        num_rand_sentences (int): the number of sentences to select at random
        df_main (pd.DataFrame): source DataFrame holding the sentence columns

    Returns:
        list: list of sentences
    """
    size = num_rand_sentences
    indices = np.random.randint(0, df_main.shape[0], size)
    tokenized_subset = df_main['tokenized_sentence'].dropna()
    sentence_subset = df_main['sentence'].dropna()
    lecture_subset = df_main['lecture'].dropna()
    start_time_subset = df_main['start_time'].dropna()
    end_time_subset = df_main['end_time'].dropna()
    random_tokenized_sentences = map(lambda x: tokenized_subset[x], indices)
    random_normal_sentences = map(lambda x: sentence_subset[x], indices)