Example #1
def test_word_similarity():
    from sematch.semantic.similarity import WordNetSimilarity
    wns = WordNetSimilarity()
    dog = wns.word2synset('dog')
    cat = wns.word2synset('cat')
    # Measuring semantic similarity between concepts using Path method
    assert wns.similarity(dog[0], cat[0], 'path') is not None  # 0.2
    # Computing English word similarity using Li method
    assert wns.word_similarity('dog', 'cat',
                               'li') is not None  # 0.449327301063
    # Computing Spanish word similarity using Lin method
    assert wns.monol_word_similarity('perro', 'gato', 'spa',
                                     'lin') is not None  # 0.876800984373
    # Computing Chinese word similarity using Wu & Palmer method
    assert wns.monol_word_similarity('狗', '猫', 'cmn',
                                     'wup') is not None  # 0.857142857143
    # Computing Spanish and English word similarity using Resnik method
    assert wns.crossl_word_similarity('perro', 'cat', 'spa', 'eng',
                                      'res') is not None  # 7.91166650904
    # Computing Spanish and Chinese word similarity using Jiang & Conrath method
    assert wns.crossl_word_similarity('perro', '猫', 'spa', 'cmn',
                                      'jcn') is not None  # 0.31023804699
    # Computing Chinese and English word similarity using WPath method
    assert wns.crossl_word_similarity('狗', 'cat', 'cmn', 'eng',
                                      'wpath') is not None  # 0.593666388463
Example #2
from sematch.semantic.similarity import WordNetSimilarity


def map_subjects(subjects: list, filter_dis=0.2):
    # Map the subjects: keep the (i, j) index pairs whose similarity passes filter_dis.
    wns = WordNetSimilarity()
    # enumerate the pairings and compute their similarities,
    # e.g. for an input like [['中国人', '安乐死'], ['太阳', '很好']]
    pair = []
    # the index pairs to return
    pair_idxs = []
    for index, value in enumerate(subjects):
        i = index + 1
        while i < len(subjects):
            # compare the current list with each following list
            com_value = subjects[i]
            for v in value:
                for cv in com_value:
                    pair_distance = wns.monol_word_similarity(
                        v, cv, 'cmn', 'wup')
                    # print(f'{v} -> {cv}:  {pair_distance}')
                    if pair_distance > filter_dis:
                        pair.append(pair_distance)
                        # pairing index: (row, column)
                        pair_idxs.append(
                            ([index, value.index(v)], [i,
                                                       com_value.index(cv)]))
            i += 1

    return pair_idxs
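A minimal usage sketch for map_subjects, reusing the sample input from the comment inside the function. The exact index pairs returned depend on the Wu & Palmer scores that monol_word_similarity produces, so no concrete output is assumed here.

# Hedged usage sketch: 'subjects' reuses the example input from the comment above.
subjects = [['中国人', '安乐死'], ['太阳', '很好']]
pairs = map_subjects(subjects, filter_dis=0.2)
# Each element is a ([row, column], [row, column]) pair of indexes into 'subjects'
# whose Wu & Palmer similarity exceeds filter_dis.
print(pairs)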
Example #3
    def controlledSetWordNetSimilarity(self, word, similarWords):
        wns = WordNetSimilarity()
        for similarWord in similarWords.copy():
            if wns.word_similarity(word, similarWord, 'li') < 0.9996:
                # 0.9996 is the threshold controlling how strict the controlled set is
                similarWords.discard(similarWord)
        return similarWords
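A hedged standalone sketch of the same filter, since the enclosing class is not shown in the snippet above: keep only the candidate words whose Li similarity to the target word reaches the 0.9996 threshold. The candidate set here is invented purely for illustration.

# Hypothetical standalone version of the filter above; 'candidates' is made up for illustration.
from sematch.semantic.similarity import WordNetSimilarity

wns = WordNetSimilarity()
candidates = {'puppy', 'domestic_dog', 'cat'}
controlled = {w for w in candidates if wns.word_similarity('dog', w, 'li') >= 0.9996}
print(controlled)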
Example #4
    def __init__(self, wsd_method='maxsim', sim_name='wpath'):
        '''
        wsd_methods = ['random_sense','first','frequent','maxsim', 'graph', 'lesk', 'naive']
        sim_name = ['path', 'lch', 'wup', 'li', 'res', 'lin', 'jcn', 'wpath']
        '''
        self._method = wsd_method
        self._sim_name = sim_name
        self._wn_sim = WordNetSimilarity()
Example #5
def test_language():
    from sematch.semantic.similarity import WordNetSimilarity
    wns = WordNetSimilarity()
    # check the supported languages
    assert wns.languages() is not None
    # find the language code
    assert wns.languages('English') is not None
    assert wns.languages('chinese_simplified') is not None
    assert wns.languages('spanish') is not None
Example #6
    def __init__(self):
        self.out = {}
        self.keras = keras_similar()
        self.classifier = Qclassifier()
        self.spell = Spelling()
        self.wn = WordNetSimilarity()
        self.en_nlp = spacy.load("en_core_web_md")
        self.stopwords_en = []
        with open(os.path.join(os.path.dirname(os.path.realpath(__file__)),
                               'utils', 'stopwords_en.txt')) as f:
            self.stopwords_en = f.read().splitlines()
Example #7
def test_classification_evaluation():
    from sematch.evaluation import AspectEvaluation
    from sematch.application import SimClassifier, SimSVMClassifier
    from sematch.semantic.similarity import WordNetSimilarity
    evaluation = AspectEvaluation()
    X, y = evaluation.load_dataset()
    wns = WordNetSimilarity()
    word_sim = lambda x, y: wns.word_similarity(x, y)
    simclassifier = SimClassifier.train(zip(X, y), word_sim)
    evaluation.evaluate(X, y, simclassifier)
    simSVMclassifier = SimSVMClassifier.train(X, y, word_sim)
    evaluation.evaluate(X, y, simSVMclassifier)
Example #8
def semantic_matching(trend_one, trend_two):
    threshold = 0.3
    trend_one_processed = text_processing(trend_one, keep_spaces=True)
    trend_two_processed = text_processing(trend_two, keep_spaces=True)
    # The options are Wordnet, YAGO and DBpedia (only the first seems usable)
    wns = WordNetSimilarity()
    matches = list({
        x['original']
        for x in trend_one_processed for y in trend_two_processed
        if wns.word_similarity(x['processed'], y['processed'], 'li') > threshold
    })

    if len(matches) == 0: return 'No matches'
    return matches
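A hedged call sketch for semantic_matching. text_processing is defined elsewhere in the source project; the comprehension above only assumes it returns, per token, a dict with 'original' and 'processed' keys, and the two trend strings below are purely illustrative.

# Hypothetical inputs; any two short texts work as trends.
print(semantic_matching('dogs playing in the park', 'a cat sleeping at home'))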
Example #9
def test_sim_graph():
    from sematch.semantic.graph import SimGraph
    from sematch.semantic.similarity import WordNetSimilarity
    from sematch.nlp import Extraction, lemmatization
    from sematch.sparql import EntityFeatures
    from collections import Counter
    tom_cruise = EntityFeatures().features(
        'http://dbpedia.org/resource/Tom_Cruise')
    words = Extraction().extract_words_sent(tom_cruise['abstract'])
    words = list(set(lemmatization(words)))
    wns = WordNetSimilarity()
    word_graph = SimGraph(words, wns.word_similarity)
    word_scores = word_graph.page_rank()
    words, scores = zip(*Counter(word_scores).most_common(10))
    assert words is not None
Example #10
def test_wordsim_evaluation():
    from sematch.evaluation import WordSimEvaluation
    from sematch.semantic.similarity import WordNetSimilarity
    wordsim_eval = WordSimEvaluation()
    wns = WordNetSimilarity()
    # define similarity metrics
    lin = lambda x, y: wns.word_similarity(x, y, 'lin')
    wpath = lambda x, y: wns.word_similarity_wpath(x, y, 0.8)
    # evaluate similarity metrics
    assert wordsim_eval.evaluate_multiple_metrics({
        'lin': lin,
        'wpath': wpath
    }, 'noun_simlex') is not None
    # perform Steiger's Z significance test
    assert wordsim_eval.statistical_test('wpath', 'lin',
                                         'noun_simlex') is not None
Example #11
def test_wordsim_evaluation():
    from sematch.evaluation import WordSimEvaluation
    from sematch.semantic.similarity import WordNetSimilarity
    evaluation = WordSimEvaluation()
    print(evaluation.dataset_names())
    wns = WordNetSimilarity()
    # define similarity metrics
    wpath = lambda x, y: wns.word_similarity_wpath(x, y, 0.8)
    # evaluate similarity metrics
    print(evaluation.evaluate_metric('wpath', wpath, 'noun_simlex'))
    # perform Steiger's Z significance test
    print(evaluation.statistical_test('wpath', 'path', 'noun_simlex'))
    wpath_es = lambda x, y: wns.monol_word_similarity(x, y, 'spa', 'path')
    wpath_en_es = lambda x, y: wns.crossl_word_similarity(
        x, y, 'eng', 'spa', 'wpath')
    print(evaluation.evaluate_metric('wpath_es', wpath_es, 'rg65_spanish'))
    print(evaluation.evaluate_metric('wpath_en_es', wpath_en_es, 'rg65_EN-ES'))
Example #12
def test_query_ned():
    from sematch.nlp import FeatureExtractor
    from sematch.nlp import EntityFeature
    from sematch.nlp import SpaCyNLP
    from sematch.utility import FileIO
    from sematch.semantic.relatedness import TextRelatedness
    from sematch.nel import EntityDisambiguation
    import itertools
    sy = SpaCyNLP()
    features = EntityFeature.load(
        feature_dict_file='models/query_features.json')
    extractor = FeatureExtractor(features, sy.pos_tag)
    ned = EntityDisambiguation(extractor)
    rel = TextRelatedness()
    from sematch.semantic.similarity import WordNetSimilarity
    wns = WordNetSimilarity()
    #print wns.word_similarity('cooling', 'air_conditioner', 'li')
    #similarity = lambda x,y : rel.text_similarity(x,y, model='lsa')

    query = FileIO.read_json_file('dataset/ned/query_ned_cleaned.txt')
    query = [q for q in query if extractor.context_features(q['query'])]
    print(len(query))
    import warnings
    warnings.filterwarnings("ignore")
    metrics = ['path', 'wup', 'res', 'lin', 'jcn', 'wpath']
    for m in metrics:
        print(m)
        similarity = lambda x, y: wns.word_similarity(x, y, m)
        for k in range(1, 21):
            gold = []
            predict = []
            for q in query:
                gold.append(q['gold'])
                #e = ned.text_disambiguate(q['query'], q['candidate'], similarity)
                e = ned.word_disambiguate(q['query'],
                                          q['candidate'],
                                          similarity,
                                          K=k)
                predict.append(e)
            from sklearn.metrics import precision_recall_fscore_support
            #from sklearn.metrics import classification_report
            #print classification_report(gold, predict)
            print(precision_recall_fscore_support(gold,
                                                  predict,
                                                  average='weighted')[2])
Example #13
    def __init__(self,
                 corpus,
                 feature_num=10,
                 model='onehot',
                 wn_method='path',
                 vec_file='models/GoogleNews-vectors-negative300.bin',
                 binary=True):
        """
        :param corpus: use a corpus to train a vector representation
        :param feature_num: number of dimensions
        :param model: onehot or wordnet or word2vec or both
        """
        self._model = model
        self._wn_method = wn_method
        self._features = self.extract_features(corpus, feature_num)
        self._wns = WordNetSimilarity() if model in ('wordnet', 'both') else None
        self._wvs = (WordVecSimilarity(vec_file, binary)
                     if model in ('word2vec', 'both') else None)
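A hedged instantiation sketch. The enclosing class is not shown in the snippet, so CorpusVectorizer below is only a placeholder name; per the docstring above, the word2vec arguments matter only when model is 'word2vec' or 'both'.

# 'CorpusVectorizer' is a placeholder for the class whose __init__ is shown above.
corpus = ['the cat sat on the mat', 'dogs and cats are similar pets']
wn_model = CorpusVectorizer(corpus, feature_num=10, model='wordnet', wn_method='wup')
w2v_model = CorpusVectorizer(corpus,
                             feature_num=10,
                             model='word2vec',
                             vec_file='models/GoogleNews-vectors-negative300.bin',
                             binary=True)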
Example #14
def yhmh_nlp(url, trigger_words):
    text, triggers = parse_my_url(url, trigger_words)
    print("triggers2: %s" % (triggers))
    if text == "" or len(triggers) == 0:
        return ""

    client = language.LanguageServiceClient()

    if isinstance(text, six.binary_type):
        text = text.decode('utf-8')

    # Instantiates a plain text document.
    document = types.Document(content=text,
                              type=enums.Document.Type.PLAIN_TEXT)

    # Detects entities in the document. You can also analyze HTML with:
    #   document.type == enums.Document.Type.HTML
    entities = client.analyze_entities(document).entities
    verbose = True
    counter = 0
    counter2 = 0
    text_output_array = pd.DataFrame(np.zeros((len(entities), 3)))

    for entity in entities:
        entity_type = enums.Entity.Type(entity.type)
        if len(entity.name) < 25 and '.' not in entity.name:
            text_output_array.iloc[counter, 0] = entity.name
            text_output_array.iloc[counter, 1] = entity_type.name
            text_output_array.iloc[counter, 2] = entity.salience
            counter += 1
        else:
            counter2 += 1

    celebrity_status = 0
    if len(entities) > 0:
        if entities[0].metadata.get(
                'wikipedia_url',
                '-') != '-' and text_output_array.iloc[0, 1] == 'PERSON':
            celebrity_status = 1
        elif len(entities) > 1 and entities[1].metadata.get(
                'wikipedia_url',
                '-') != '-' and text_output_array.iloc[1, 1] == 'PERSON':
            celebrity_status = 1
        else:
            celebrity_status = 0

    text_output_array = text_output_array.iloc[0:len(entities) - counter2, :]

    # Detects the sentiment of the text
    #sentiment = client.analyze_sentiment(document=document).document_sentiment

    wns = WordNetSimilarity()

    keywords_target = pd.Series.to_list(text_output_array[0])
    #keywords_target = list(set(keywords_target))

    #seen = set(keywords_target)
    #keywords_target = []
    #for x in keywords_target:
    #    if x not in seen:
    #        keywords_target.append(x)
    #        seen.add(x)
    #
    #keywords_target=seen
    forbidden_keywords = [
        'medicine', 'drug', 'fun', 'hospital', 'suicide', 'death', 'mental',
        'health', 'illness', 'insta', 'man', 'woman', 'family', 'people',
        'many', 'place', 'same', 'others', 'brain', 'all', 'end', 'statement',
        'lot', 'condolences'
    ]

    regex = re.compile(r'([A-Z]([a-z])+)')
    selected_files = list(filter(regex.search, keywords_target))
    res = list(set(keywords_target) - set(selected_files))

    regex = re.compile(r'^@')
    selected_files = list(filter(regex.search, res))
    res = list(set(keywords_target) - set(selected_files))

    regex = re.compile(r"\b[A-Z][A-Z]+\b")
    selected_files = list(filter(regex.search, res))
    res = list(set(res) - set(selected_files))

    regex = re.compile(r'([A-Z]([a-z])+)')
    selected_files = list(filter(regex.search, res))
    res = list(set(res) - set(selected_files))
    for key in range(len(res)):
        if ' ' in res[key]:
            res[key] = res[key].split(' ')[0]

    for x in range(len(res)):
        for y in range(len(forbidden_keywords)):
            if res[x] == forbidden_keywords[y]:
                res[x] = []
    res = list(filter(None, res))

    res_dictionary = Counter(res)

    res_output = res_dictionary.most_common(10)
    res_output = dict(res_output)
    res_output = list(res_output.keys())

    print(res_output)
    res = res_output[0:num_keywords]
    database = pd.read_csv(
        CURATED_LIST
    )  #('/Users/vmutai/Projects/HMH/admin/microblog/app/yhmh_curated_articles.csv')

    if celebrity_status == 1:
        database = database[database.celebrity == 1]
    elif celebrity_status == 0:
        database = database[database.celebrity == 0]
    similarity_ranks = pd.DataFrame(np.zeros(database.shape[0]))
    for z in range(database.shape[0]):
        newlist = []
        N_rows = len(res)
        keywords_source = database.iloc[z, 4:4 + num_keywords]
        keywords_source = pd.Series.tolist(keywords_source)
        N_cols = len(keywords_source)
        #similarity_list = pd.DataFrame(np.zeros((N_rows, N_cols)))
        foo = [1]
        for x in range(len(res)):
            for y in range(len(keywords_source)):
                value = wns.word_similarity(res[x], keywords_source[y], 'lin')
                #similarity_matrix.at[x,y]=value
                foo.append(value)
        matrix_average = sum(foo) / np.count_nonzero(foo)
        similarity_ranks.at[z, 0] = matrix_average
    maximum = pd.DataFrame.idxmax(similarity_ranks)
    url_to_return = pd.Series.tolist(database.iloc[maximum, 0])
    print(url_to_return)

    title = pd.Series.tolist(database.iloc[maximum, 1])

    def output(title, res_output, url_to_return):
        a = {
            'header': title[0],
            'keywords_list': res_output,
            'url_recommendation': url_to_return[0]
        }
        print("JSON DUMP")
        print(a)

        try:
            return json.dumps(a)
        except:
            return "awesome2!"

    json_output = output(title, res_output, url_to_return)
    print(json_output)

    return json_output
Example #15
def test_synset_expand():
    from sematch.semantic.similarity import WordNetSimilarity
    wns = WordNetSimilarity()
    cat = wns.word2synset('cat')[0]
    assert wns.synset_expand(cat) is not None
Example #16
from flask import Flask, json, request, render_template as template
from sematch.application import Matcher
from sematch.semantic.similarity import ConceptSimilarity, WordNetSimilarity
from sematch.semantic.similarity import YagoTypeSimilarity, EntitySimilarity
from sematch.semantic.graph import DBpediaDataTransform, Taxonomy

import os

DEBUG = True
SECRET_KEY = 'Secret_development_key'
DATA_FILE = 'data/data.txt'

app = Flask(__name__)
app.config.from_object(__name__)

wn_sim = WordNetSimilarity()
yago_sim = YagoTypeSimilarity()
matcher = Matcher()
dbpedia_sim = ConceptSimilarity(Taxonomy(DBpediaDataTransform()),
                                'models/dbpedia_type_ic.txt')
entity = EntitySimilarity()

from search import text_lsa, text_tfidf, data


@app.route('/api/text_search')
def text_search():
    query = request.args.get('query')
    result = text_tfidf.search(query)
    result_data = []
Example #17
from datetime import datetime
from csv import DictReader
from math import exp, log, sqrt
from random import random, shuffle
import pickle
import sys
import string
import numpy as np
from sematch.semantic.similarity import WordNetSimilarity
from config import path
wns = WordNetSimilarity()

# extend the punctuation set used by remove_punctuation below
string.punctuation += '!!()?.,'

# from gensim.models import Word2Vec
# model = Word2Vec.load_word2vec_format(path+'GoogleNews-vectors-negative300.bin', binary=True)  # C binary format
# print model.vocab
model = None


def remove_punctuation(x):
    new_line = [w for w in list(x) if w not in string.punctuation]
    new_line = ''.join(new_line)
    return new_line
Example #18
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

from sematch.semantic.similarity import WordNetSimilarity

WNS = WordNetSimilarity()


# NOTE: For reference see: https://pdfs.semanticscholar.org/1374/617e135eaa772e52c9a2e8253f49483676d6.pdf

def random_sentences(num_rand_sentences, df_main):
    """Select num_rand_sentences at random from the DataFrame

    Args:
        num_rand_sentences (int): the number of sentences to select at random
        df_main (pandas.DataFrame): the DataFrame holding the sentences

    Return:
         list: list of sentences
    """
    size = num_rand_sentences
    indices = np.random.randint(0, df_main.shape[0], size)

    tokenized_subset = df_main['tokenized_sentence'].dropna()
    sentence_subset = df_main['sentence'].dropna()
    lecture_subset = df_main['lecture'].dropna()
    start_time_subset = df_main['start_time'].dropna()
    end_time_subset = df_main['end_time'].dropna()

    random_tokenized_sentences = map(lambda x: tokenized_subset[x], indices)
    random_normal_sentences = map(lambda x: sentence_subset[x], indices)