Exemplo n.º 1
0
 def expand_query_term_cluster(self, q, G, cluster_dict, k_relevant_words):
     """Expand query *q* with the heaviest cluster neighbours of each term.

     Each token (or its Porter stem) is looked up in the clusters of
     ``cluster_dict``; the other members of the first matching cluster
     are ranked by the weight of the edge they share with the token in
     graph ``G``, and the ``k_relevant_words`` heaviest neighbours are
     appended to the query.

     Returns the expanded query as a single whitespace-joined string.
     """
     upd_query = utils.get_tokenized_query(q)
     porter = PorterStemmer()
     res = [w for w in upd_query]
     for qw in upd_query:
         qw_stem = porter.stem(qw)
         # Find the first cluster containing the raw token or its stem.
         list_neighbors = None
         for cluster in cluster_dict.values():
             if qw in cluster or qw_stem in cluster:
                 list_neighbors = [
                     i for i in cluster
                     if i != qw and i != qw_stem
                 ]
                 break
         if list_neighbors is None:
             continue
         # Rank neighbours by the edge weight they share with the term.
         # BUG FIX: the original appended the tuple (stem(qw), i) itself
         # as the "weight" when the (qw, i) edge was missing, making the
         # sort below compare floats against tuples (TypeError).  Look up
         # the stemmed edge's weight instead and fall back to 0 when
         # neither edge exists.
         weight_list = []
         for i in list_neighbors:
             if (qw, i) in G.edges:
                 weight = G.edges[(qw, i)]['weight']
             elif (qw_stem, i) in G.edges:
                 weight = G.edges[(qw_stem, i)]['weight']
             else:
                 weight = 0
             weight_list.append((i, weight))
         final_res = sorted(weight_list, key=lambda x: x[1],
                            reverse=True)[:k_relevant_words]
         for u, v in final_res:
             res.append(u)
     return ' '.join(res)
Exemplo n.º 2
0
    def get_expanded_query(self, q, args=None):
        """Expand query *q* with GloVe nearest neighbours.

        For every token whose lowercased form has a GloVe vector, the
        ``self.topn`` closest vocabulary words (Euclidean distance) are
        appended, skipping candidates whose Porter stem equals the
        token's stem.  When ``self.replace`` is true the original token
        is kept only if no neighbour was added; otherwise the original
        tokens are always kept.
        """
        import heapq  # local import: file's import block is not editable here

        if not Glove.glove:
            print('INFO: Glove: Loading word vectors in {} ...'.format(
                Glove.vectorfile))
            Glove.glove = load_glove_model(Glove.vectorfile)

        upd_query = utils.get_tokenized_query(q)
        res = []
        if not self.replace:
            res = [w for w in upd_query]
        ps = PorterStemmer()
        for qw in upd_query:
            found_flag = False
            qw_stem = ps.stem(qw)
            qw_lower = qw.lower()
            if qw_lower in Glove.glove:
                # BUG FIX: membership was tested with qw.lower() but the
                # vector was fetched with qw, raising KeyError for any
                # capitalised token; use the lowercased key throughout.
                qvec = Glove.glove[qw_lower]
                # heapq.nsmallest avoids sorting the entire vocabulary
                # just to take the topn closest words.
                neighbours = heapq.nsmallest(
                    self.topn, Glove.glove.keys(),
                    key=lambda word: scipy.spatial.distance.euclidean(
                        Glove.glove[word], qvec))
                for u in neighbours:
                    if ps.stem(u) != qw_stem:
                        found_flag = True
                        res.append(u)
            if not found_flag and self.replace:
                res.append(qw)
        return ' '.join(res)
Exemplo n.º 3
0
    def get_expanded_query(self, q, args=None):
        """Expand query *q* with word2vec nearest neighbours.

        For every token present in the word2vec vocabulary, the
        ``self.topn`` most similar words are appended, skipping
        candidates whose Porter stem equals the token's stem.  When
        ``self.replace`` is true the original token is kept only if no
        neighbour was added; otherwise the original tokens are always
        kept.
        """
        # Lazily load the vectors once per process (class-level cache).
        if not Word2Vec.word2vec:
            print('INFO: Word2Vec: Loading word vectors in {} ...'.format(
                Word2Vec.vectorfile))
            Word2Vec.word2vec = gensim.models.KeyedVectors.load_word2vec_format(
                Word2Vec.vectorfile)

        upd_query = utils.get_tokenized_query(q)
        res = []
        if not self.replace:
            res = [w for w in upd_query]
        ps = PorterStemmer()
        for qw in upd_query:
            found_flag = False
            qw_stem = ps.stem(qw)
            if qw in Word2Vec.word2vec.vocab:
                neighbours = Word2Vec.word2vec.most_similar(positive=[qw],
                                                            topn=self.topn)
                # most_similar yields (word, similarity) pairs; only the
                # word is used here.
                for word, _score in neighbours:
                    if ps.stem(word) != qw_stem:
                        found_flag = True
                        res.append(word)
            if not found_flag and self.replace:
                res.append(qw)
        return ' '.join(res)
Exemplo n.º 4
0
    def get_expanded_query(self, q, args=None):
        """Expand query *q* with word2vec neighbours of its concepts.

        Concepts detected in *q* (threshold 0.1) are mapped to their
        ``e_``-prefixed, underscore-joined vocabulary form; for each one
        found in the vocabulary, the ``self.topn`` most similar entries
        are appended with their namespace prefix stripped and
        underscores turned back into spaces.  The concept itself is
        always appended; the original tokens are kept unless
        ``self.replace`` is set.
        """
        if not Word2Vec.word2vec:
            print('INFO: Word2Vec: Loading word vectors in {} ...'.format(
                Word2Vec.vectorfile))
            Word2Vec.word2vec = gensim.models.KeyedVectors.load(
                Word2Vec.vectorfile)

        query_concepts = self.get_concepts(q, 0.1)
        upd_query = utils.get_tokenized_query(q)
        res = []
        if not self.replace:
            res = [w for w in upd_query]
        for c in query_concepts:
            c_lower_e = "e_" + c.replace(" ", "_").lower()
            if c_lower_e in Word2Vec.word2vec.vocab:
                w = Word2Vec.word2vec.most_similar(positive=[c_lower_e],
                                                   topn=self.topn)
                for u, v in w:
                    # BUG FIX: str.replace removed *every* "e_"/"c_"
                    # occurrence (mangling words such as "e_the_end");
                    # strip only the 2-character namespace prefix.
                    if u.startswith(("e_", "c_")):
                        u = u[2:]
                    res.append(u.replace("_", " "))

            res.append(c)
        return ' '.join(res)
Exemplo n.º 5
0
    def get_expanded_query(self, q, args=None):
        """Expand query *q* with WordNet synonyms.

        For every token, lemma names from all of its synsets are
        collected, de-duplicated, and truncated to ``self.topn``;
        those whose Porter stem differs from the token's stem are
        appended.  When ``self.replace`` is true the original token is
        kept only if no synonym was added; otherwise the original
        tokens are always kept.
        """
        upd_query = utils.get_tokenized_query(q)
        ps = PorterStemmer()
        res = []
        if not self.replace:
            res = [w for w in upd_query]
        for w in upd_query:
            found_flag = False
            w_stem = ps.stem(w)
            # FIX: the original initialised `synonyms` once outside this
            # loop and reset it *inside* the loop that was iterating it —
            # fragile and confusing; scope the list to each query word.
            synonyms = []
            for syn in wordnet.synsets(w):
                for lemma in syn.lemmas():
                    synonyms.append(lemma.name())
            # NOTE: set() order is not deterministic, so which topn
            # synonyms survive can vary between runs (as in the original).
            synonyms = list(set(synonyms))[:self.topn]
            for s in synonyms:
                if ps.stem(s) != w_stem:
                    found_flag = True
                    res.append(s)

            if not found_flag and self.replace:
                res.append(w)
        return ' '.join(res)
Exemplo n.º 6
0
    def get_expanded_query(self, q, args=None):
        """Append the concepts detected in *q* (threshold 0.1) to the
        query and return the result as one whitespace-joined string.

        The original tokens are kept unless ``self.replace`` is set, in
        which case only the concepts are returned.
        """
        concepts = self.get_concepts(q, 0.1)
        tokens = utils.get_tokenized_query(q)
        expanded = [] if self.replace else list(tokens)
        expanded.extend(concepts)
        return ' '.join(expanded)
Exemplo n.º 7
0
    def get_expanded_query(self, q, args=None):
        """Expand query *q* with related terms from the ConceptNet API.

        For every token, up to ``self.topn`` edges are fetched from
        ``api.conceptnet.io``; for each all-English edge whose near-side
        label matches the token, the far-side label is appended when it
        is not already present and stems differently from the token.
        With ``self.replace`` set, tokens for which nothing was found
        (or whose HTTP lookup failed) are kept as-is; otherwise all
        original tokens are always kept.
        """
        upd_query = utils.get_tokenized_query(q)
        res = []
        if not self.replace:
            res = [w for w in upd_query]
        ps = PorterStemmer()
        # FIX: the original reused `q` as the loop variable, shadowing
        # the method parameter; renamed to `term`.
        for term in upd_query:
            term_stem = ps.stem(term)
            found_flag = False
            try:
                obj = requests.get(
                    'http://api.conceptnet.io/c/en/' + term).json()
            except Exception:  # narrowed from bare except: (kept Ctrl-C alive)
                if self.replace:
                    res.append(term)
                continue
            # Slicing replaces the original min(len(edges), topn) dance.
            for edge in obj['edges'][:self.topn]:
                start = edge['start']
                end = edge['end']
                # Skip edges that lack a language tag or are not English
                # (the original checked this twice, via dead locals plus
                # a redundant re-check).
                try:
                    if start['language'] != 'en' or end['language'] != 'en':
                        continue
                except KeyError:
                    continue
                if start['label'].lower() == term:
                    candidate = end['label']
                elif end['label'].lower() == term:
                    candidate = start['label']
                else:
                    continue
                if candidate not in res and term_stem != ps.stem(candidate):
                    found_flag = True
                    res.append(candidate)
            if not found_flag and self.replace:
                res.append(term)
        return ' '.join(res)
Exemplo n.º 8
0
    def get_expanded_query(self, q, args=None):
        """Expand query *q* by swapping each valid token for synonyms of
        its most common WordNet part of speech.

        ``self.get_synonym`` is called with the human-readable POS name;
        its results are joined with spaces.  When no synonym is found and
        ``self.replace`` is set, the token itself is kept.  Without
        ``self.replace`` the original tokens are prepended as well.
        """
        pos_dict = {
            'n': 'noun',
            'v': 'verb',
            'a': 'adjective',
            's': 'satellite adj',
            'r': 'adverb'
        }
        upd_query = utils.get_tokenized_query(q)
        q_ = []
        if not self.replace:
            q_ = [w for w in upd_query]
        for w in upd_query:
            if utils.valid(w):
                # Compute the synsets once (the original called
                # wordnet.synsets(w) twice); default to noun when the
                # word is unknown to WordNet.
                synsets = wordnet.synsets(w)
                pos = synsets[0].pos() if synsets else 'n'
                syn = self.get_synonym(w, pos_dict[pos])
                if not syn and self.replace:
                    q_.append(w)
                else:
                    q_.append(' '.join(syn))
            # NOTE(review): tokens failing utils.valid() are dropped even
            # when self.replace is true — confirm this is intended.

        return ' '.join(q_)