Example #1
def test_word_similarity():
    from sematch.semantic.similarity import WordNetSimilarity
    wns = WordNetSimilarity()
    dog = wns.word2synset('dog')
    cat = wns.word2synset('cat')
    # Measuring semantic similarity between concepts using Path method
    assert wns.similarity(dog[0], cat[0], 'path') is not None  # 0.2
    # Computing English word similarity using Li method
    assert wns.word_similarity('dog', 'cat',
                               'li') is not None  # 0.449327301063
    # Computing Spanish word similarity using Lin method
    assert wns.monol_word_similarity('perro', 'gato', 'spa',
                                     'lin') is not None  #0.876800984373
    # Computing Chinese word similarity using Wu & Palmer method
    assert wns.monol_word_similarity('狗', '猫', 'cmn',
                                     'wup') is not None  # 0.857142857143
    # Computing Spanish and English word similarity using Resnik method
    assert wns.crossl_word_similarity('perro', 'cat', 'spa', 'eng',
                                      'res') is not None  #7.91166650904
    # Computing Spanish and Chinese word similarity using Jiang & Conrath method
    assert wns.crossl_word_similarity('perro', '猫', 'spa', 'cmn',
                                      'jcn') is not None  #0.31023804699
    # Computing Chinese and English word similarity using WPath method
    assert wns.crossl_word_similarity('狗', 'cat', 'cmn', 'eng',
                                      'wpath') is not None  #0.593666388463
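A brief follow-on note (not part of the original test): word2synset returns a list of WordNet synsets, so the first assertion compares the first sense of each word; the same synset pair can be scored with any of the other metrics, e.g.

# hedged sketch reusing the objects created in the test above
# wns.similarity(dog[0], cat[0], 'wup')  # Wu & Palmer on the same synset pair
# wns.similarity(dog[0], cat[0], 'lin')  # Lin (information-content based)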
Example #2
from sematch.semantic.similarity import WordNetSimilarity


def map_subjects(subjects: list, filter_dis=0.2):
    # mapping the subjects, filter the i,j in M
    wns = WordNetSimilarity()
    # enumerate pairing and calculate distances
    # [['中国人', '安乐死'], ['太阳', '很好']]
    pair = []
    # return the indexes pairing
    pair_idxs = []
    for index, value in enumerate(subjects):
        i = index + 1
        while i < len(subjects):
            # compare list : next list
            com_value = subjects[i]
            for v in value:
                for cv in com_value:
                    pair_distance = wns.monol_word_similarity(
                        v, cv, 'cmn', 'wup')
                    # print(f'{v} -> {cv}:  {pair_distance}')
                    if pair_distance > filter_dis:
                        pair.append(pair_distance)
                        # pairing index: (row, column)
                        pair_idxs.append(
                            ([index, value.index(v)], [i,
                                                       com_value.index(cv)]))
            i += 1

    return pair_idxs
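A minimal usage sketch for map_subjects (added here, not in the original snippet); it assumes WordNetSimilarity has been imported as above and reuses the sample input from the comment:

# hedged usage sketch
subjects = [['中国人', '安乐死'], ['太阳', '很好']]
matched = map_subjects(subjects, filter_dis=0.2)
print(matched)  # list of ([row, col], [row, col]) index pairs whose Wu & Palmer score exceeds 0.2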
Example #3
 def controlledSetWordNetSimilarity(self, word, similarWords):
     wns = WordNetSimilarity()
     for similarWord in similarWords.copy():
         if wns.word_similarity(
                 word, similarWord, 'li'
         ) < 0.9996:  # Variable to control accuracy of controlset
             similarWords.discard(similarWord)
     return similarWords
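A hedged usage sketch for the method above; `helper` is a hypothetical instance of whatever class defines controlledSetWordNetSimilarity, and similarWords must be a set because of the discard call:

# hypothetical call -- 'helper' and the candidate set are made up for illustration
# candidates = {'car', 'automobile', 'banana'}
# kept = helper.controlledSetWordNetSimilarity('car', candidates)
# only candidates with Li similarity to 'car' of at least 0.9996 remain in the set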
Example #4
 def __init__(self, wsd_method='maxsim', sim_name='wpath'):
     '''
     wsd_methods = ['random_sense','first','frequent','maxsim', 'graph', 'lesk', 'naive']
     sim_name = ['path', 'lch', 'wup', 'li', 'res', 'lin', 'jcn', 'wpath']
     '''
     self._method = wsd_method
     self._sim_name = sim_name
     self._wn_sim = WordNetSimilarity()
Example #5
def test_language():
    from sematch.semantic.similarity import WordNetSimilarity
    wns = WordNetSimilarity()
    #check the supported languages
    assert wns.languages() is not None
    #find the language code
    assert wns.languages('English') is not None
    assert wns.languages('chinese_simplified') is not None
    assert wns.languages('spanish') is not None
Example #6
def test_classification_evaluation():
    from sematch.evaluation import AspectEvaluation
    from sematch.application import SimClassifier, SimSVMClassifier
    from sematch.semantic.similarity import WordNetSimilarity
    evaluation = AspectEvaluation()
    X, y = evaluation.load_dataset()
    wns = WordNetSimilarity()
    word_sim = lambda x, y: wns.word_similarity(x, y)
    simclassifier = SimClassifier.train(zip(X, y), word_sim)
    evaluation.evaluate(X, y, simclassifier)
    simSVMclassifier = SimSVMClassifier.train(X, y, word_sim)
    evaluation.evaluate(X, y, simSVMclassifier)
Example #7
    def __init__(self):
        self.out = {}
        self.keras = keras_similar()
        self.classifier = Qclassifier()
        self.spell=Spelling()
        self.wn = WordNetSimilarity()
        self.en_nlp = spacy.load("en_core_web_md")
        self.stopwords_en=[]
        with open(os.path.join(os.path.dirname(os.path.realpath(__file__)),
            'utils', 'stopwords_en.txt')) as f:

            self.stopwords_en = f.read().splitlines()
Example #8
def test_wordsim_evaluation():
    from sematch.evaluation import WordSimEvaluation
    from sematch.semantic.similarity import WordNetSimilarity
    wordsim_eval = WordSimEvaluation()
    wns = WordNetSimilarity()
    #define similarity metrics
    lin = lambda x, y: wns.word_similarity(x, y, 'lin')
    wpath = lambda x, y: wns.word_similarity_wpath(x, y, 0.8)
    #evaluate similarity metrics
    assert wordsim_eval.evaluate_multiple_metrics({'lin':lin, 'wpath':wpath}, 'noun_simlex') is not None
    # perform Steiger's Z significance test
    assert wordsim_eval.statistical_test('wpath', 'lin', 'noun_simlex') is not None
Example #9
def semantic_matching(trend_one, trend_two):
    threshold = 0.3
    trend_one_processed = text_processing(trend_one, keep_spaces=True)
    trend_two_processed = text_processing(trend_two, keep_spaces=True)
    # The options are Wordnet, YAGO and DBpedia (only the first seems usable)
    wns = WordNetSimilarity()
    matches = list({
        x['original']
        for x in trend_one_processed for y in trend_two_processed
        if wns.word_similarity(x['processed'], y['processed'], 'li') > threshold
    })

    if len(matches) == 0: return 'No matches'
    return matches
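A hedged usage sketch (not in the original); from the code above, text_processing is assumed to return a list of dicts with 'original' and 'processed' keys for each token:

# hypothetical call with made-up trend strings
# matches = semantic_matching('dogs playing in the park', 'cats sleeping in a garden')
# print(matches)  # matched original tokens, or the string 'No matches'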
Example #10
def test_wordsim_evaluation():
    from sematch.evaluation import WordSimEvaluation
    from sematch.semantic.similarity import WordNetSimilarity
    wordsim_eval = WordSimEvaluation()
    wns = WordNetSimilarity()
    #define similarity metrics
    lin = lambda x, y: wns.word_similarity(x, y, 'lin')
    wpath = lambda x, y: wns.word_similarity_wpath(x, y, 0.8)
    #evaluate similarity metrics
    assert wordsim_eval.evaluate_multiple_metrics({
        'lin': lin,
        'wpath': wpath
    }, 'noun_simlex') is not None
    # perform Steiger's Z significance test
    assert wordsim_eval.statistical_test('wpath', 'lin',
                                         'noun_simlex') is not None
Example #11
def test_wordsim_evaluation():
    from sematch.evaluation import WordSimEvaluation
    from sematch.semantic.similarity import WordNetSimilarity
    evaluation = WordSimEvaluation()
    print(evaluation.dataset_names())
    wns = WordNetSimilarity()
    # define similarity metrics
    wpath = lambda x, y: wns.word_similarity_wpath(x, y, 0.8)
    # evaluate similarity metrics
    print(evaluation.evaluate_metric('wpath', wpath, 'noun_simlex'))
    # perform Steiger's Z significance test
    print(evaluation.statistical_test('wpath', 'path', 'noun_simlex'))
    wpath_es = lambda x, y: wns.monol_word_similarity(x, y, 'spa', 'path')
    wpath_en_es = lambda x, y: wns.crossl_word_similarity(
        x, y, 'eng', 'spa', 'wpath')
    print(evaluation.evaluate_metric('wpath_es', wpath_es, 'rg65_spanish'))
    print(evaluation.evaluate_metric('wpath_en_es', wpath_en_es, 'rg65_EN-ES'))
Example #12
def test_query_ned():
    from sematch.nlp import FeatureExtractor
    from sematch.nlp import EntityFeature
    from sematch.nlp import SpaCyNLP
    from sematch.utility import FileIO
    from sematch.semantic.relatedness import TextRelatedness
    from sematch.nel import EntityDisambiguation
    import itertools
    sy = SpaCyNLP()
    features = EntityFeature.load(
        feature_dict_file='models/query_features.json')
    extractor = FeatureExtractor(features, sy.pos_tag)
    ned = EntityDisambiguation(extractor)
    rel = TextRelatedness()
    from sematch.semantic.similarity import WordNetSimilarity
    wns = WordNetSimilarity()
    #print wns.word_similarity('cooling', 'air_conditioner', 'li')
    #similarity = lambda x,y : rel.text_similarity(x,y, model='lsa')

    query = FileIO.read_json_file('dataset/ned/query_ned_cleaned.txt')
    query = [q for q in query if extractor.context_features(q['query'])]
    print(len(query))
    import warnings
    warnings.filterwarnings("ignore")
    metrics = ['path', 'wup', 'res', 'lin', 'jcn', 'wpath']
    for m in metrics:
        print(m)
        similarity = lambda x, y: wns.word_similarity(x, y, m)
        for k in range(1, 21):
            gold = []
            predict = []
            for q in query:
                gold.append(q['gold'])
                #e = ned.text_disambiguate(q['query'], q['candidate'], similarity)
                e = ned.word_disambiguate(q['query'],
                                          q['candidate'],
                                          similarity,
                                          K=k)
                predict.append(e)
            from sklearn.metrics import precision_recall_fscore_support
            #from sklearn.metrics import classification_report
            #print classification_report(gold, predict)
            print(precision_recall_fscore_support(gold,
                                                  predict,
                                                  average='weighted')[2])
Example #13
 def __init__(self,
              corpus,
              feature_num=10,
              model='onehot',
              wn_method='path',
              vec_file='models/GoogleNews-vectors-negative300.bin',
              binary=True):
     """
     :param corpus: use a corpus to train a vector representation
     :param feature_num: number of dimensions
     :param model: onehot or wordnet or word2vec or both
     """
     self._model = model
     self._wn_method = wn_method
     self._features = self.extract_features(corpus, feature_num)
     self._wns = WordNetSimilarity() if model in ('wordnet', 'both') else None
     self._wvs = WordVecSimilarity(vec_file, binary) if model in ('word2vec', 'both') else None
Example #14
def test_simcat_classifier():
    from sematch.classification import SimCatClassifier
    from sematch.evaluation import ABSAEvaluation
    from sematch.semantic.similarity import WordNetSimilarity
    # defining similarity metric
    wns = WordNetSimilarity()
    sim_metric_jcn = lambda x, y: wns.word_similarity(x, y, 'jcn')
    sim_metric_wpath = lambda x, y: wns.word_similarity_wpath(x, y, 0.9)
    # loading the dataset
    absa_eval = ABSAEvaluation()
    X_train_16, y_train_16 = absa_eval.load_dataset('eval/aspect/ABSA16_Restaurants_Train_SB1_v2.xml')
    X_test_16, y_test_16 = absa_eval.load_dataset('eval/aspect/ABSA16_Restaurants_Train_SB1_v2.xml')
    # train the classifiers
    sim_jcn_classifier = SimCatClassifier.train(zip(X_train_16, y_train_16), sim_metric_jcn)
    sim_wpath_classifier = SimCatClassifier.train(zip(X_train_16, y_train_16), sim_metric_wpath)
    # evaluate the classifiers
    #absa_eval.evaluate(X_test_16, y_test_16, sim_jcn_classifier)
    #absa_eval.evaluate(X_test_16, y_test_16, sim_wpath_classifier)
    assert sim_jcn_classifier is not None
    assert sim_wpath_classifier is not None
Example #15
def test_query_ned():
    from sematch.nlp import FeatureExtractor
    from sematch.nlp import EntityFeature
    from sematch.nlp import SpaCyNLP
    from sematch.utility import FileIO
    from sematch.semantic.relatedness import TextRelatedness
    from sematch.nel import EntityDisambiguation
    import itertools
    sy = SpaCyNLP()
    features = EntityFeature.load(feature_dict_file='models/query_features.json')
    extractor = FeatureExtractor(features, sy.pos_tag)
    ned = EntityDisambiguation(extractor)
    rel = TextRelatedness()
    from sematch.semantic.similarity import WordNetSimilarity
    wns = WordNetSimilarity()
    #print wns.word_similarity('cooling', 'air_conditioner', 'li')
    #similarity = lambda x,y : rel.text_similarity(x,y, model='lsa')

    query = FileIO.read_json_file('dataset/ned/query_ned_cleaned.txt')
    query = [q for q in query if extractor.context_features(q['query'])]
    print(len(query))
    import warnings
    warnings.filterwarnings("ignore")
    metrics = ['path', 'wup', 'res', 'lin', 'jcn', 'wpath']
    for m in metrics:
        print(m)
        similarity = lambda x, y: wns.word_similarity(x, y, m)
        for k in range(1, 21):
            gold = []
            predict = []
            for q in query:
                gold.append(q['gold'])
                #e = ned.text_disambiguate(q['query'], q['candidate'], similarity)
                e = ned.word_disambiguate(q['query'], q['candidate'], similarity, K=k)
                predict.append(e)
            from sklearn.metrics import precision_recall_fscore_support
            #from sklearn.metrics import classification_report
            #print classification_report(gold, predict)
            print(precision_recall_fscore_support(gold, predict, average='weighted')[2])
Example #16
def test_wordnet_similarity():
    from sematch.semantic.similarity import WordNetSimilarity
    wns = WordNetSimilarity()
    dog = wns.word2synset('dog')
    cat = wns.word2synset('cat')
    # Measuring semantic similarity between concepts using Path method
    assert wns.similarity(dog[0], cat[0], 'path') is not None # 0.2
    # Computing English word similarity using Li method
    assert wns.word_similarity('dog', 'cat', 'li') is not None  # 0.449327301063
    # Computing Spanish word similarity using Lin method
    assert wns.monol_word_similarity('perro', 'gato', 'spa', 'lin') is not None  # 0.876800984373
    # Computing Chinese word similarity using Wu & Palmer method
    assert wns.monol_word_similarity('狗', '猫', 'cmn', 'wup') is not None  # 0.857142857143
    # Computing Spanish and English word similarity using Resnik method
    assert wns.crossl_word_similarity('perro', 'cat', 'spa', 'eng', 'res') is not None  # 7.91166650904
    # Computing Spanish and Chinese word similarity using Jiang & Conrath method
    assert wns.crossl_word_similarity('perro', '猫', 'spa', 'cmn', 'jcn') is not None  # 0.31023804699
    # Computing Chinese and English word similarity using WPath method
    assert wns.crossl_word_similarity('狗', 'cat', 'cmn', 'eng', 'wpath') is not None  # 0.593666388463
Example #17
def test_language():
    from sematch.semantic.similarity import WordNetSimilarity
    wns = WordNetSimilarity()
    #check the supported languages
    assert wns.languages() is not None
    #find the language code
    assert wns.languages('English') is not None
    assert wns.languages('chinese_simplified') is not None
    assert wns.languages('spanish') is not None
Example #18
def test_sim_graph():
    from sematch.semantic.graph import SimGraph
    from sematch.semantic.similarity import WordNetSimilarity
    from sematch.nlp import Extraction, lemmatization
    from sematch.sparql import EntityFeatures
    from collections import Counter
    tom_cruise = EntityFeatures().features(
        'http://dbpedia.org/resource/Tom_Cruise')
    words = Extraction().extract_words_sent(tom_cruise['abstract'])
    words = list(set(lemmatization(words)))
    wns = WordNetSimilarity()
    word_graph = SimGraph(words, wns.word_similarity)
    word_scores = word_graph.page_rank()
    words, scores = zip(*Counter(word_scores).most_common(10))
    assert words is not None
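A hedged variant of the same keyword-ranking pattern that skips the SPARQL call and ranks a small hand-picked word list instead of a DBpedia abstract:

from sematch.semantic.graph import SimGraph
from sematch.semantic.similarity import WordNetSimilarity
from collections import Counter

# hedged sketch: PageRank over a WordNet similarity graph of a made-up word list
wns = WordNetSimilarity()
word_graph = SimGraph(['actor', 'film', 'producer', 'pilot', 'aircraft'],
                      wns.word_similarity)
top_words, top_scores = zip(*Counter(word_graph.page_rank()).most_common(3))
print(top_words)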
Example #19
from sematch.semantic.similarity import WordNetSimilarity
wns = WordNetSimilarity()

# Computing English word similarity using Li method
wns.word_similarity('dog', 'cat', 'li')  # 0.449327301063
# Computing Spanish word similarity using Lin method
wns.monol_word_similarity('perro', 'gato', 'spa', 'lin')  #0.876800984373
# Computing Chinese word similarity using Wu & Palmer method
wns.monol_word_similarity('狗', '猫', 'cmn', 'wup')  # 0.857142857143
# Computing Spanish and English word similarity using Resnik method
wns.crossl_word_similarity('perro', 'cat', 'spa', 'eng', 'res')  #7.91166650904
# Computing Spanish and Chinese word similarity using Jiang & Conrath method
wns.crossl_word_similarity('perro', '猫', 'spa', 'cmn', 'jcn')  #0.31023804699
# Computing Chinese and English word similarity using WPath method
wns.crossl_word_similarity('狗', 'cat', 'cmn', 'eng', 'wpath')  #0.593666388463
Example #20
def test_synset_expand():
    from sematch.semantic.similarity import WordNetSimilarity
    wns = WordNetSimilarity()
    cat = wns.word2synset('cat')[0]
    assert wns.synset_expand(cat) is not None
Example #21
def yhmh_nlp(url, trigger_words):
    text, triggers = parse_my_url(url, trigger_words)
    print("triggers2: %s" % (triggers))
    if text == "" or len(triggers) == 0:
        return ""

    client = language.LanguageServiceClient()

    if isinstance(text, six.binary_type):
        text = text.decode('utf-8')

    # Instantiates a plain text document.
    document = types.Document(content=text,
                              type=enums.Document.Type.PLAIN_TEXT)

    # Detects entities in the document. You can also analyze HTML with:
    #   document.type == enums.Document.Type.HTML
    entities = client.analyze_entities(document).entities
    verbose = True
    counter = 0
    counter2 = 0
    text_output_array = pd.DataFrame(np.zeros((len(entities), 3)))

    for entity in entities:
        entity_type = enums.Entity.Type(entity.type)
        if len(entity.name) < 25 and '.' not in entity.name:
            text_output_array.iloc[counter, 0] = entity.name
            text_output_array.iloc[counter, 1] = entity_type.name
            text_output_array.iloc[counter, 2] = entity.salience
            counter += 1
        else:
            counter2 += 1

    celebrity_status = 0
    if len(entities) > 0:
        if entities[0].metadata.get(
                'wikipedia_url',
                '-') != '-' and text_output_array.iloc[0, 1] == 'PERSON':
            celebrity_status = 1
        elif len(entities) > 1 and entities[1].metadata.get(
                'wikipedia_url',
                '-') != '-' and text_output_array.iloc[1, 1] == 'PERSON':
            celebrity_status = 1
        else:
            celebrity_status = 0

    text_output_array = text_output_array.iloc[0:len(entities) - counter2, :]

    # Detects the sentiment of the text
    #sentiment = client.analyze_sentiment(document=document).document_sentiment

    wns = WordNetSimilarity()

    keywords_target = pd.Series.to_list(text_output_array[0])
    #keywords_target = list(set(keywords_target))

    #seen = set(keywords_target)
    #keywords_target = []
    #for x in keywords_target:
    #    if x not in seen:
    #        keywords_target.append(x)
    #        seen.add(x)
    #
    #keywords_target=seen
    forbidden_keywords = [
        'medicine', 'drug', 'fun', 'hospital', 'suicide', 'death', 'mental',
        'health', 'illness', 'insta', ',man', 'woman', 'family', 'people',
        'many', 'place', 'same', 'others', 'brain', 'all', 'end', 'statement',
        'lot', 'condolences'
    ]

    regex = re.compile(r'([A-Z]([a-z])+)')
    selected_files = list(filter(regex.search, keywords_target))
    res = list(set(keywords_target) - set(selected_files))

    regex = re.compile(r'^@')
    selected_files = list(filter(regex.search, res))
    res = list(set(keywords_target) - set(selected_files))

    regex = re.compile(r"\b[A-Z][A-Z]+\b")
    selected_files = list(filter(regex.search, res))
    res = list(set(res) - set(selected_files))

    regex = re.compile(r'([A-Z]([a-z])+)')
    selected_files = list(filter(regex.search, res))
    res = list(set(res) - set(selected_files))
    for key in range(len(res)):
        if ' ' in res[key]:
            res[key] = res[key].split(' ')[0]

    for x in range(len(res)):
        for y in range(len(forbidden_keywords)):
            if res[x] == forbidden_keywords[y]:
                res[x] = []
    res = list(filter(None, res))

    res_dictionary = Counter(res)

    res_output = res_dictionary.most_common(10)
    res_output = dict(res_output)
    res_output = list(res_output.keys())

    print(res_output)
    res = res_output[0:num_keywords]
    database = pd.read_csv(
        CURATED_LIST
    )  #('/Users/vmutai/Projects/HMH/admin/microblog/app/yhmh_curated_articles.csv')

    if celebrity_status == 1:
        database = database[database.celebrity == 1]
    elif celebrity_status == 0:
        database = database[database.celebrity == 0]
    similarity_ranks = pd.DataFrame(np.zeros(database.shape[0]))
    for z in range(database.shape[0]):
        newlist = []
        N_rows = len(res)
        keywords_source = database.iloc[z, 4:4 + num_keywords]
        keywords_source = pd.Series.tolist(keywords_source)
        N_cols = len(keywords_source)
        #similarity_list = pd.DataFrame(np.zeros((N_rows, N_cols)))
        foo = [1]
        for x in range(len(res)):
            for y in range(len(keywords_source)):
                value = wns.word_similarity(res[x], keywords_source[y], 'lin')
                #similarity_matrix.at[x,y]=value
                foo.append(value)
        matrix_average = sum(foo) / np.count_nonzero(foo)
        similarity_ranks.at[z, 0] = matrix_average
    maximum = pd.DataFrame.idxmax(similarity_ranks)
    url_to_return = pd.Series.tolist(database.iloc[maximum, 0])
    print(url_to_return)

    title = pd.Series.tolist(database.iloc[maximum, 1])

    def output(title, res_output, url_to_return):
        a = {
            'header': title[0],
            'keywords_list': res_output,
            'url_recommendation': url_to_return[0]
        }
        print("JSON DUMP")
        print(a)

        try:
            return json.dumps(a)
        except:
            return "awesome2!"

    json_output = output(title, res_output, url_to_return)
    print(json_output)

    return json_output
Example #22
ResultTemplateFlask = os.path.join(ResultPathContent,
                                   'Trizifiier').replace('\\', '/')
bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()
if not os.path.exists(ResultTemplateFlask):  # create the templates and DataFormat folders
    os.mkdir(ResultTemplateFlask)
if not os.path.exists(ResultTemplateFlask + '/templates'):
    os.mkdir(ResultTemplateFlask + '/templates')
if not os.path.exists(ResultTemplateFlask + '/DataFormat'):
    os.mkdir(ResultTemplateFlask + '/DataFormat')
# temporary directory added here
temporar = configFile.temporPath
wns = WordNetSimilarity()
i = 0
# build file list
#direct = os.path.normpath(ResultBiblioPath)
#direct = os.path.normpath(ResultClaimsPath)
direct = os.path.normpath(ResultAbstractPath)

# lists the URL of each txt document in the folder of the submitted query; Fr/En/Unk split the files by language
Fr, En, Unk = GenereListeFichiers(direct)


def convert_tag(tag):
    tag_dict = {'N': 'n', 'J': 'a', 'R': 'r', 'V': 'v'}
    try:
        return tag_dict[tag[0]]
    except KeyError:
        return None
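A short illustration of convert_tag (added for clarity; not part of the original file):

# convert_tag maps Penn Treebank tag prefixes to WordNet POS letters
# convert_tag('NN')  -> 'n'   (noun)
# convert_tag('VBD') -> 'v'   (verb)
# convert_tag('DT')  -> None  (no WordNet counterpart; the KeyError branch returns None)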
Example #23
class WSD:

    def __init__(self, wsd_method='maxsim', sim_name='wpath'):
        '''
        wsd_methods = ['random_sense','first','frequent','maxsim', 'graph', 'lesk', 'naive']
        sim_name = ['path', 'lch', 'wup', 'li', 'res', 'lin', 'jcn', 'wpath']
        '''
        self._method = wsd_method
        self._sim_name = sim_name
        self._wn_sim = WordNetSimilarity()

    def disambiguate_graph(self, sentence):
        words_origin = word_tokenize(sentence)
        #extract words that have a synset in WordNet, currently support NOUN.
        words = [w for w in words_origin if self._wn_sim.word2synset(w)]
        # map words to synsets
        words_synsets = {w:self._wn_sim.word2synset(w) for w in words}
        # construct sets list
        synsets = list(itertools.chain.from_iterable([words_synsets[w] for w in words]))
        # remove duplicate synsets
        synsets = list(set(synsets))
        # define semantic similarity metric
        sim_metric = lambda x, y: self._wn_sim.similarity(x, y, self._sim_name)
        # construct similarity graphs
        sim_graph = SimGraph(synsets, sim_metric)
        # get pagerank scores of synsets
        rank_scores = sim_graph.page_rank()
        results = []
        for w in words_origin:
            if w in words:
                candidate_scores = {s:rank_scores[s] for s in words_synsets[w]}
                results.append((w, Counter(candidate_scores).most_common(1)[0][0]))
            else:
                results.append((w, None))
        return results

    def classify(self, featureset):
        context = featureset['context']
        senses = featureset['senses']
        return self.max_senses(context, senses)

    def context2words(self, sent):
        words = word_tokenize(sent.lower())
        words = [w for w in words if len(w) > 2]
        return lemmatization(words)

    def random_sense(self, word):
        senses = self._wn_sim.word2synset(word)
        return random.choice(senses)

    def first_sense(self, word):
        senses = self._wn_sim.word2synset(word)
        return senses[0]

    def word_sense_similarity(self, word, sense):
        word_senses = self._wn_sim.word2synset(word)
        scorer = lambda x:self._wn_sim.similarity(x, sense, self._sim_name)
        sim_scores = list(map(scorer, word_senses)) + [0.0]
        return max(sim_scores)

    def max_senses(self, context, senses):
        if len(senses) == 1:
            return senses[0]
        context_words = self.context2words(context)
        result = {}
        for ss in senses:
            scorer = lambda x: self.word_sense_similarity(x, ss)
            sim_score = sum(map(scorer, context_words))
            result[ss] = sim_score
        return Counter(result).most_common(1)[0][0]

    def max_sim(self, context, word):
        senses = self._wn_sim.word2synset(word)
        return self.max_senses(context, senses)

    def lesk(self, context, word):
        from nltk.wsd import lesk as nltk_lesk
        context_words = self.context2words(context)
        return nltk_lesk(context_words, word, 'n')
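A hedged usage sketch for the WSD class above, assuming its module-level imports (word_tokenize, itertools, random, Counter, SimGraph, WordNetSimilarity, lemmatization) are in place:

# hypothetical driver code, not from the original module
# wsd = WSD(wsd_method='maxsim', sim_name='wpath')
# print(wsd.disambiguate_graph('The cat chased the dog across the yard'))
# print(wsd.max_sim('I deposited cash at the bank', 'bank'))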
Example #24
class TextPreprocessor(BaseEstimator, TransformerMixin):
    """
    Transform input text into feature representation
    """
    def __init__(self,
                 corpus,
                 feature_num=10,
                 model='onehot',
                 wn_method='path',
                 vec_file='models/GoogleNews-vectors-negative300.bin',
                 binary=True):
        """
        :param corpus: use a corpus to train a vector representation
        :param feature_num: number of dimensions
        :param model: onehot or wordnet or word2vec or both
        """
        self._model = model
        self._wn_method = wn_method
        self._features = self.extract_features(corpus, feature_num)
        self._wns = WordNetSimilarity() if model in ('wordnet', 'both') else None
        self._wvs = WordVecSimilarity(vec_file, binary) if model in ('word2vec', 'both') else None

    def fit(self, X, y=None):
        return self

    def inverse_transform(self, X):
        return X

    def extract_features(self, corpus, feature_num=10):
        cat_word = {}
        for sent, cat in corpus:
            cat_word.setdefault(cat,
                                []).extend(lemmatization(word_tokenize(sent)))
        features = {cat: Counter(cat_word[cat]) for cat in cat_word}
        feature_words = []
        for c, f in features.items():
            words, counts = zip(*f.most_common(feature_num))
            feature_words.extend(list(words))
        feature_words = set(feature_words)
        return feature_words

    def similarity(self, tokens, feature, method='wordnet'):
        if method == 'wordnet':
            sim = lambda x: self._wns.word_similarity(feature, x, self._wn_method)
        else:
            sim = lambda x: self._wvs.word_similarity(feature, x)
        return max(list(map(sim, tokens)) + [0.0])

    def unigram_features(self, tokens):
        words = set(tokens)
        features = {}
        for f in self._features:
            features['contains({})'.format(f)] = (f in words)
        return features

    def wordnet_features(self, tokens):
        words = set(tokens)
        features = {}
        for f in self._features:
            features['wns({})'.format(f)] = self.similarity(words, f)
        return features

    def word2vec_features(self, tokens):
        words = set(tokens)
        features = {}
        for f in self._features:
            features['w2v({})'.format(f)] = self.similarity(words,
                                                            f,
                                                            method='word2vec')
        return features

    def semantic_features(self, tokens):
        words = set(tokens)
        features = {}
        for f in self._features:
            features['wns({})'.format(f)] = self.similarity(words, f)
            features['w2v({})'.format(f)] = self.similarity(words,
                                                            f,
                                                            method='word2vec')
        return features

    def transform(self, X):
        tokenize = lambda x: lemmatization(word_tokenize(x))
        X_tokens = [tokenize(x) for x in X]
        if self._model == 'onehot':
            return [self.unigram_features(t) for t in X_tokens]
        elif self._model == 'wordnet':
            return [self.wordnet_features(t) for t in X_tokens]
        elif self._model == 'word2vec':
            return [self.word2vec_features(t) for t in X_tokens]
        elif self._model == 'both':
            return [self.semantic_features(t) for t in X_tokens]
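A hedged usage sketch for TextPreprocessor; `corpus` is assumed (from extract_features) to be an iterable of (sentence, category) pairs, and the word2vec model path is only needed for the 'word2vec'/'both' modes:

# hypothetical driver code, not from the original module
# corpus = [('the soup was cold', 'food'), ('the waiter was rude', 'service')]
# prep = TextPreprocessor(corpus, feature_num=5, model='wordnet')
# X = prep.transform(['the bread was stale'])  # list of feature dicts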
Example #25
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

from sematch.semantic.similarity import WordNetSimilarity

WNS = WordNetSimilarity()


# NOTE: For reference see: https://pdfs.semanticscholar.org/1374/617e135eaa772e52c9a2e8253f49483676d6.pdf

def random_sentences(num_rand_sentences, df_main):
    """Select num_rand_sentences at random from the Dataframe

    Args:
        num_rand_sentences (int): the number of sentences to select at random

    Return:
         list: list of sentences
    """
    size = num_rand_sentences
    indices = np.random.randint(0, df_main.shape[0], size)

    tokenized_subset = df_main['tokenized_sentence'].dropna()
    sentence_subset = df_main['sentence'].dropna()
    lecture_subset = df_main['lecture'].dropna()
    start_time_subset = df_main['start_time'].dropna()
    end_time_subset = df_main['end_time'].dropna()

    random_tokenized_sentences = [tokenized_subset[x] for x in indices]
    random_normal_sentences = [sentence_subset[x] for x in indices]
Example #26
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
#
#Predicate semantic similarity in Python2
import numpy
import json
import sys
from sematch.semantic.similarity import WordNetSimilarity
from nltk.wsd import lesk
from nltk.corpus import wordnet_ic
from nltk.corpus.reader.wordnet import information_content

brown_ic = wordnet_ic.ic('ic-brown.dat')
wns = WordNetSimilarity()
# arg1 and arg2: predicates represented in strings separated by underscores
# e.g. cast_member or star
preA = sys.argv[1].split("_")
preB = sys.argv[2].split("_")
# arg3: pairwise similarity matrix in which rows are separated by underscore
# e.g. 0.6_0.5, or 0.6,0.7_0.3,0.4
data = []
for a in preA:
    row = []
    for b in preB:
        wdsim = wns.word_similarity(a, b, 'wup')
        row.append(wdsim)
    data.append(row)
data = numpy.matrix(data)
#max values in rows
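The script is cut off here; a hedged guess at the step the comment announces (best match per row, then an aggregate score):

# hedged completion, not in the original source
# row_max = data.max(1)                       # max similarity per row
# print(json.dumps({'similarity': float(numpy.mean(row_max))}))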
Example #27
        t = re.sub(" ?(f|ht)(tp)(s?)(://)(.*)[.|/](.*)", ' ', t)
        t = re.sub("@\w+ ?", ' ', t)
        t = re.sub("[^\w\s]|[\d]", ' ', t)
        t = re.sub(stop, ' ', t)
        t = re.sub("\s+", ' ', t)
        t = t.split()
        t = [w for w in t if w.isalpha()]
        t = [wordnet_lemmatizer.lemmatize(w) for w in t]
        clean.append(t)
    return clean


cleanCleanCat1 = cleanTexts(categoryList1)
cleanCleanCat2 = cleanTexts(categoryList2)

wns = WordNetSimilarity()
similarCategories = []
for cat in cleanCleanCat1:
    sims = []
    for t in cleanCleanCat2:
        TextSim = []
        for w in cat:
            # wdsSim=[1 if w == wr else wns.word_similarity(w, wr, 'li') for wr in t]
            wdsSim = [wns.word_similarity(w, wr, 'li') for wr in t]
            TextSim.extend(wdsSim)
        sims.append((cleanCleanCat2.index(t), sum(TextSim)))
    if max(sims, key=lambda x: x[1])[1] > 0:
        similarCategories.append(
            (max(sims, key=lambda x: x[1])[0], max(sims,
                                                   key=lambda x: x[1])[1]))
    else:
Example #28
# pip install sematch
# nltk.download('wordnet_ic')
# If you are using Python 3, you also need to edit one of the sematch library files (sparql) to change its print statement.
from sematch.semantic.similarity import WordNetSimilarity
import pandas as pd

wns = WordNetSimilarity()

words = ['artist', 'musician', 'scientist', 'physicist', 'actor', 'movie']
sim_matrix = [[wns.word_similarity(w1, w2, 'wpath') for w1 in words]
              for w2 in words]
df = pd.DataFrame(sim_matrix, index=words, columns=words)
print(df)

print(wns.word_similarity("Dog", "Cat"))
Example #29
from sematch.semantic.similarity import WordNetSimilarity

L1 = []
L2 = []
L3 = []
wns = WordNetSimilarity()

# Computing English word similarity using the Li method
x = wns.word_similarity('programmer', 'coder', 'li')
if x > 0.7:
    L1.append('programmer')
    L1.append('coder')
    L1.append('software engineer')

# Computing English word similarity using the Li method
x = wns.word_similarity('software program', 'computer software', 'li')
if x > 0.7:
    L1.append('software program')
    L1.append('computer software')
    L1.append('software system')
Example #30
from sematch.semantic.similarity import WordNetSimilarity

import codecs

wns = WordNetSimilarity()
poems = codecs.open('generatedpoems.txt', 'r', encoding='utf-8')
data = open('data.txt', 'a')
for x in poems:
    temp_words = x.split(" ")
    total = 0
    count = 0
    for y in range(len(temp_words) - 1):
        total += wns.word_similarity(temp_words[y], temp_words[y + 1], 'li')
        count += 1
    if count:
        total /= count
    data.write(str(total) + '\n')
data.close()
poems.close()
#print wns.word_similarity(w1, w2, 'li')
Example #31
class fmodel(object):
    def __init__(self):
        self.out = {}
        self.keras = keras_similar()
        self.classifier = Qclassifier()
        self.spell=Spelling()
        self.wn = WordNetSimilarity()
        self.en_nlp = spacy.load("en_core_web_md")
        self.stopwords_en=[]
        with open(os.path.join(os.path.dirname(os.path.realpath(__file__)),
            'utils', 'stopwords_en.txt')) as f:

            self.stopwords_en = f.read().splitlines()

    def ent_nltk(self, sentence):
        ne_tree = ne_chunk(pos_tag(word_tokenize(sentence)))
        iob_tagged = tree2conlltags(ne_tree)
        ents = [[0, 0, 10]]
        for i in range(len(iob_tagged)):
            each = iob_tagged[i]

            if each[2] != 'O':
                if ents[-1][2] == (i - 1):
                    ents[-1][0] += " " + each[0]
                    ents[-1][2] = i
                else:
                    ents.append([each[0], each[2][2:], i])
        if len(ents) > 1:
            ents = ents[1:]
            ents = [ent[0] for ent in ents]
        else:
            ents = []

        return ents

    def mini_similar(self, q1, q2):
        self.out = {'sim': 0, 'sim_per': 0.0, 'keras': 0, 'class': ["", ""],
                    'f_class': 0, "sentiment": [0, 0, 0],
                    "keywords": [[""], [""]], "numbers": [[], []],
                    "entities": [[], []], "max_keywords": 0, "keywords_sim": 0}
        regex = re.compile('[^a-zA-Z0-9]')
        q1 = regex.sub('', q1)
        q2 = regex.sub('', q2)
        if q1 == q2:
            self.out['sim'] = 1
            self.out['sim_per'] = 100
            return self.out
        else:
            s1 = self.wn.word_similarity(q1, q2, 'lin')
            print(s1)

            if s1 > 0.9:
                self.out['sim'] = 1
                self.out['sim_per'] = 100
                return self.out

            elif s1 > 0.8:
                self.out['sim'] = 1
                self.out['sim_per'] = s1  # max([s1,s2,s3])
                return self.out
        return self.out

    def is_one_word(self, q1, q2):
        l1 = q1
        l2 = q2
        flag1 = False
        flag2 = False
        stop = True
        word1 = ""
        word2 = ""

        if len(l1)!=len(l2):

            return False
        else:
            for i in range(len(l1)):
                    if l1[i].text != l2[i].text or l1[i].lemma_ != l2[i].lemma_: 
                        if(flag2):
                            return False
                            
                        elif l1[i].text in self.stopwords_en and l2[i].text in self.stopwords_en:
                            word1 = l1[i].text
                            word2 = l2[i].text
                            flag1 = True                                                   

                        else:
                            word1 = l1[i].lemma_
                            word2 = l2[i].lemma_
                            flag1 = True
                            flag2 = True
        if flag1:
            self.out = self.mini_similar(word1,word2)
            return True
            

    def similar(self, text, challenge):
        if not isinstance(text, str) or not isinstance(challenge, str):
            q1 = text
            q2 = challenge
        else:
            q1 = normalizr.normalize(text, normalizations)
            q2 = normalizr.normalize(challenge, normalizations)

        q1 = self.spell.correct_str(q1,True)
        q2 = self.spell.correct_str(q2,True)

        if (len(q1.split()) == 1 and len(q2.split()) == 1) or (q1 == q2):
            return self.mini_similar(q1, q2)
        regex = re.compile(u'/')  # [^a-zA-Z]')
        q1 = regex.sub('', q1)
        q2 = regex.sub('', q2)

        self.out = {'sim': 0, 'sim_per': 0.0, 'keras': 0.0, 'class': ["", ""],
                    'f_class': 0, "sentiment": [0, 0, 0],
                    "keywords": [[""], [""]], "numbers": [[], []],
                    "entities": [[], []], "max_keywords": 0,
                    "keywords_sim": 0.0}
        q1_neg_list = list(set(mark_negation(q1.split())[0]))
        q2_neg_list = list(set(mark_negation(q2.split())[0]))

        if q1 == "" or q2 == "":
            return self.out


        sq1 = self.en_nlp(q1)
        sq2 = self.en_nlp(q2)

        if self.is_one_word(sq1, sq2):
            return self.out
        count = 0

        start_time = time.time()

        entsq1 = self.ent_nltk(q1)
        entsq2 = self.ent_nltk(q2)

        self.out['entities'][1] = entsq2
        self.out['entities'][0] = entsq1

        for ent in sq1.ents:
            if ent.text not in entsq1:
                # self.out['entities'][0].append([ent.label_, ent.text])
                self.out['entities'][0].append(ent.text)

        for ent in sq2.ents:
            if ent.text not in entsq2:
                # self.out['entities'][1].append((ent.label_, ent.text))
                self.out['entities'][1].append(ent.text)

        if self.out['entities'][0]:

            if self.out['entities'][1]:
                if(len(self.out['entities'][0])!= len(self.out['entities'][1])):
                    return self.out

                self.out['max_keywords'] += len(
                    set(self.out['entities'][0] + self.out['entities'][1]))


                for each in self.out['entities'][0]:
                    if(each in self.out['entities'][1]):
                        count += 1
                    else:
                        return self.out
            else:
                return self.out

        elif self.out['entities'][1]:
            return self.out

        elapsed_time = time.time() - start_time

        self.out['keras'] = self.keras.similar(q1, q2)

        self.out['sentiment'][0] = get_sentiment_values(q1)[1]['compound']
        self.out['sentiment'][1] = get_sentiment_values(q2)[1]['compound']
        self.out['sentiment'][2] = abs(
            self.out['sentiment'][0] - self.out['sentiment'][1])

        if (abs(self.out['sentiment'][0]) > 0.3 and abs(
                self.out['sentiment'][1]) > 0.3):
            if self.out['sentiment'][2] >= 0.6:
                return self.out

        start_time = time.time()
        self.out['class'][0] = self.classifier.classify_question(sq1)
        self.out['class'][1] = self.classifier.classify_question(sq2)

        self.out['f_class'] = (self.out['class'][0] == self.out['class'][1])

        self.out['keywords'][0], self.out['numbers'][0] = extract_features(sq1)
        self.out['keywords'][1], self.out['numbers'][1] = extract_features(sq2)

        self.out['max_keywords'] += len(
            set(self.out['keywords'][0] + self.out['keywords'][1]))

        if self.out['class'][0] > 0 and self.out['class'][1] > 0:
            self.out['max_keywords'] += 1

        for each in self.out['keywords'][0]:
            if each in self.out['keywords'][1]:
                if (each in q1_neg_list and each not in q2_neg_list) or (
                                each in q2_neg_list and each not in q1_neg_list):
                    self.out['max_keywords'] += 1
                else:
                    if(each in self.stopwords_en):
                        count += 0.30
                        #self.out['max_keywords'] -= 1
                    else:      
                        count+=1

        if self.out['numbers'][0]:
            self.out['max_keywords'] += 1

            if self.out['numbers'][1]:
                self.out['max_keywords'] += 1
                if self.out['numbers'][1] != self.out['numbers'][0]:
                    return self.out

        elif self.out['numbers'][1]:
            self.out['max_keywords'] += 1

        if self.out['class'][0] > 0 and self.out['class'][1] > 0:
            self.out['max_keywords'] += 1

            if self.out['f_class']:
                if self.out['max_keywords'] > 1:
                    count += 1
                else:
                    count += 0.35

        # keywords_s1= [x for x in keywords_s1 if x not in keywords_s2]
        # keywords_s3= [x for x in keywords_s2 if x not in keywords_s1]
        if self.out['max_keywords'] < 1:
            self.out['keywords_sim'] = 0
        else:
            self.out['keywords_sim'] = (count / self.out['max_keywords']) * 100
            self.out['sim_per'] = (self.out['keywords_sim']+self.out['keras'])/2.0
            #print(self.out['keywords_sim'],count,self.out['max_keywords'])

        '''
        k_value = []
        s_value = []

        k = 100.0
        s = 30.0
        k_step = 10.0
        s_step = 4.0

        self.out["sim_per"] = (self.out['keywords_sim'] + self.out['keras']) / 2

        for i in range(7):
            k -= k_step
            s += s_step
            k_value.append(k)
            s_value.append(s)
        '''
        s_value = [34.0, 40.0, 50.0, 55.0, 60.0, 60.0, 60.0]
        k_value = [90.0, 85.0, 80.0, 75.0, 70.0, 60.0, 30.0]

        if self.out['keras'] >= k_value[0]:
            if self.out['keywords_sim'] >= s_value[0]:
                self.out['sim'] = 1
                return self.out

        elif self.out['keras'] > k_value[1]:
            if self.out['keywords_sim'] >= s_value[1]:
                self.out['sim'] = 1
                return self.out

        elif self.out['keras'] > k_value[2]:
            if self.out['keywords_sim'] >= s_value[2]:
                self.out['sim'] = 1
                return self.out

        elif self.out['keras'] > k_value[3]:
            if self.out['keywords_sim'] >= s_value[3]:
                self.out['sim'] = 1
                return self.out
        elif self.out['keras'] > k_value[4]:
            if self.out['keywords_sim'] >= s_value[4]:
                self.out['sim'] = 1
                return self.out
        elif self.out['keras'] > k_value[5]:
            if self.out['keywords_sim'] >= s_value[5]:
                self.out['sim'] = 1
                return self.out
        elif self.out['keras'] > k_value[6]:
            if self.out['keywords_sim'] >= s_value[6]:
                self.out['sim'] = 1
                return self.out

        return self.out

    def similarr(self, text, questions=list()):

        answer, max_similarity = None, 0
        if not text or len(questions) == 0:
            return answer, max_similarity
        for question in questions:
            try:
                result = self.similar(text.lower(),
                                      question.get('question').lower())
            except:
                result = self.similar(text, question.get('question'))

            if result.get('sim') == 1:
                confidence = result.get('sim_per')
                if max_similarity <= confidence <= 100:
                    max_similarity = confidence
                    answer = question.get('id')
                    # print("round stop\n")
                if max_similarity >= 95:
                    break

        # print('[Stop]')
        return answer, max_similarity

    def get_suggestions(self, text=None, texts=list()):
        res = []
        s = []
        min_confidence = 45

        for each in texts:
            result = self.similar(text, each.get('question').lower())
            if result.get('sim') == 1:
                confidence = result.get('sim_per')
                if 100 >= confidence > min_confidence:
                    if each.get('rich_text'):
                        response = each.get('rich_text')
                    else:
                        flow = int(each.get('response').replace('flow-', ''))
                        flow = Flow.objects.filter(id=flow).values('id', 'name',
                                                                   'category__name')
                        if flow.exists():
                            response = [{'flow': flow}]
                        else:
                            response = None
                    if response:
                        res.append((confidence, each.get('id'), response,
                                    each.get('question')))
            s = sorted(res, key=operator.itemgetter(0), reverse=True)[:3]
        suggestions = []
        for e in s:
            if e[2]:
                messages = []
                for m in e[2]:
                    messages.append({'message': format_message(m)})
                suggestions.append({'confidence': e[0], 'id': e[1],
                                    'message': messages})
        return suggestions
Example #32
def test_synset_expand():
    from sematch.semantic.similarity import WordNetSimilarity
    wns = WordNetSimilarity()
    cat = wns.word2synset('cat')[0]
    assert wns.synset_expand(cat) is not None
Example #33
from sematch.semantic.similarity import WordNetSimilarity
# import jieba
# import synonyms
# import jieba.posseg as pseg

wns = WordNetSimilarity()
wns.monol_word_similarity('狗', '猫', 'cmn', 'wup')
# print(wns.word_similarity('dog', 'cat', 'li'))
# print(wns.monol_word_similarity('忧患', '安乐', 'cmn', 'wup'))
print(wns.monol_word_similarity('狗', '猫', 'cmn', 'wup'))
print(wns.monol_word_similarity('猫', '狗', 'cmn', 'wup'))
# print(wns.monol_word_similarity('电脑', '键盘', 'cmn', 'wup'))
# print(wns.monol_word_similarity('电脑', '电脑', 'cmn', 'wup'))
# print(wns.monol_word_similarity('国家', '国家', 'cmn', 'wup'))
#
# def parse_token(data):
#     # words = []
#     # for d in data:
#     #     # jieba.enable_paddle()
#     seg_data = pseg.cut(data, use_paddle=True) #default
#     # per_word = [str(word) for word in seg_data if not str(word) in jieba_sp_words]
#     # for word, flag in seg_data:
#     #     print(f'{word}, {flag}')
#     # words.append(seg_data)
#     return seg_data
#
#
# def word_flag(sentence:list):
#     for word,flag in sentence:
#         return word,flag
#
Example #34
from sematch.semantic.similarity import WordNetSimilarity

wn_sim = WordNetSimilarity()

w1 = 'gil'
lang1 = 'pol'
w2 = "sowa"
lang2 = 'pol'
result = []
# for sim_type in ['path','lch','wup','li','res','lin','jcn','wpath','zhou']:

for sim_type in ['path', 'wup', 'li', 'res', 'lin', 'jcn', 'wpath', 'zhou']:
    sim = wn_sim.crossl_word_similarity(w1, w2, lang1, lang2, sim_type)
    tmp = {'name': sim_type, 'sim': sim}
    result.append(tmp)
    print(tmp)

avg = (result[0]['sim'] + result[1]['sim'] + result[2]['sim'] +
       result[3]['sim'] / 10 + result[4]['sim'] + result[5]['sim'] +
       result[6]['sim']) / 7
print("average from other methods: " + str(avg))
Example #35
from datetime import datetime
from csv import DictReader
from math import exp, log, sqrt
from random import random, shuffle
import pickle
import sys
import string
import numpy as np
from sematch.semantic.similarity import WordNetSimilarity
from config import path
wns = WordNetSimilarity()

# str.__add__ returns a new string without modifying string.punctuation in place,
# so the original chain of __add__ calls had no effect; extend it explicitly instead
# (string was already imported above).
string.punctuation += '!!()?.,'

# from gensim.models import Word2Vec
# model = Word2Vec.load_word2vec_format(path+'GoogleNews-vectors-negative300.bin', binary=True)  # C binary format
# print model.vocab
model = None


def remove_punctuation(x):
    new_line = [w for w in list(x) if w not in string.punctuation]
    new_line = ''.join(new_line)
    return new_line
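A quick illustration of remove_punctuation (added for clarity):

# remove_punctuation drops punctuation characters one by one and re-joins the rest
# remove_punctuation("don't stop, now!")  -> 'dont stop now'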
Example #36
from flask import Flask, json, request, render_template as template
from sematch.application import Matcher
from sematch.semantic.similarity import ConceptSimilarity, WordNetSimilarity
from sematch.semantic.similarity import YagoTypeSimilarity, EntitySimilarity
from sematch.semantic.graph import DBpediaDataTransform, Taxonomy

import os

DEBUG = True
SECRET_KEY = 'Secret_development_key'
DATA_FILE = 'data/data.txt'

app = Flask(__name__)
app.config.from_object(__name__)

wn_sim = WordNetSimilarity()
yago_sim = YagoTypeSimilarity()
matcher = Matcher()
dbpedia_sim = ConceptSimilarity(Taxonomy(DBpediaDataTransform()),
                                'models/dbpedia_type_ic.txt')
entity = EntitySimilarity()

from search import text_lsa, text_tfidf, data


@app.route('/api/text_search')
def text_search():
    query = request.args.get('query')
    result = text_tfidf.search(query)
    result_data = []