Example #1
    def process_text(self, urls, documents):
        tp = TextPreprocess()

        word2vec_list_docs = []
        final_urls = []
        for i, text in enumerate(documents):
            doc = tp.preprocess(text)
            if self.word_vec is None:
                # No in-memory model: look up vectors for frequent terms
                # (term frequency > 5) in the Elasticsearch index.
                terms = [term for term in doc.keys() if doc[term] > 5]
                results = get_documents_by_id(terms, ["term", "vector"],
                                              "word_phrase_to_vec", "terms",
                                              self.es)
                word_vec_doc = [res["vector"] for res in results]
            else:
                # In-memory model: keep only terms that have a known vector.
                word_vec_doc = [
                    self.word_vec[term] for term in doc.keys()
                    if self.word_vec.get(term) is not None
                ]

            if word_vec_doc:
                # Represent the document as the element-wise mean of its
                # term vectors; documents with no usable terms are skipped,
                # so final_urls stays aligned with word2vec_list_docs.
                m_word_vec = np.array(word_vec_doc).mean(axis=0)
                word2vec_list_docs.append(m_word_vec.tolist())
                final_urls.append(urls[i])

        self.documents = final_urls

        self.word2vec = np.array(word2vec_list_docs)

        return [self.documents, self.word2vec]
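
The core step in this example is representing a document as the element-wise mean of its term vectors. Below is a minimal, self-contained sketch of that averaging, assuming an in-memory word_vec dictionary (term to vector) as in the else branch above; the sample terms and vectors are illustrative only, not from the source project.

    import numpy as np

    # Hypothetical in-memory model: term -> embedding vector.
    word_vec = {
        "apache": [1.0, 0.0],
        "spark": [0.0, 1.0],
    }

    doc_terms = ["apache", "spark", "unseen"]

    # Keep only terms with a known embedding, as in the else branch above.
    vectors = [word_vec[t] for t in doc_terms if word_vec.get(t) is not None]

    if vectors:
        # The document vector is the mean of its term vectors.
        doc_vector = np.array(vectors).mean(axis=0)
        print(doc_vector.tolist())  # prints [0.5, 0.5]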
Example #2
    def get(self, word):
        if self.word_vec is None:
            # No in-memory model: look the term up in the
            # word_phrase_to_vec Elasticsearch index.
            results = get_documents_by_id([word], ["term"],
                                          "word_phrase_to_vec", "terms",
                                          self.es)
            if results is None:
                return None
            else:
                return results[0]["term"][0]
        else:
            return self.word_vec.get(word)
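
This get method uses a two-tier lookup: with no in-memory model loaded (self.word_vec is None) the term is fetched from the word_phrase_to_vec Elasticsearch index, otherwise a plain dictionary lookup suffices. Below is a sketch of the same pattern with the Elasticsearch call stubbed out; the lookup and es_lookup names, the stub's return shape, and the sample data are assumptions for illustration.

    def lookup(word, word_vec, es_lookup):
        # Prefer the in-memory table; fall back to the external store.
        if word_vec is None:
            results = es_lookup([word])
            if results is None:
                return None
            return results[0]["term"][0]
        return word_vec.get(word)

    # In-memory hit:
    print(lookup("spark", {"spark": [0.0, 1.0]}, es_lookup=lambda ids: None))
    # Fallback path with a stub that misses (stands in for an ES miss):
    print(lookup("spark", None, es_lookup=lambda ids: None))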
Example #3
    def process(self,
                documents,
                mapping=None,
                es_index='memex',
                es_doc_type='page',
                es=None):
        [data_tf, corpus, urls] = getTermFrequency(documents, mapping,
                                                   es_index, es_doc_type, es)

        # Rebind documents to the URLs actually processed by
        # getTermFrequency, so documents[i] lines up with data_tf below.
        documents = urls

        word2vec_list_docs = []
        urls = []
        for i, doc in enumerate(data_tf):
            if self.word_vec is None:
                # No in-memory model: fetch term vectors from the
                # Elasticsearch index.
                results = get_documents_by_id(doc.keys(), ["term", "vector"],
                                              "word_phrase_to_vec", "terms",
                                              self.es)
                word_vec_doc = [res["vector"][0] for res in results]
            else:
                # In-memory model: keep frequent terms (term frequency > 5)
                # that have a known vector.
                word_vec_doc = [
                    self.word_vec[term] for term in doc.keys()
                    if doc[term] > 5 and self.word_vec.get(term) is not None
                ]

            if word_vec_doc:
                # Mean of the term vectors; documents with no usable terms
                # are skipped, keeping urls aligned with word2vec_list_docs.
                m_word_vec = np.array(word_vec_doc).mean(axis=0)
                word2vec_list_docs.append(m_word_vec.tolist())
                urls.append(documents[i])

        self.documents = urls

        self.word2vec = np.array(word2vec_list_docs)

        return [self.documents, self.word2vec]
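
A detail shared by these process methods: documents whose terms have no usable vectors are skipped entirely, so the returned URL list stays index-aligned with the rows of the word2vec matrix. Below is a self-contained sketch of that alignment logic, with hypothetical term-frequency data standing in for data_tf.

    import numpy as np

    word_vec = {"apache": [2.0, 0.0], "spark": [0.0, 4.0]}
    data_tf = [{"apache": 7, "spark": 9}, {"unseen": 12}]  # 2nd doc: unknown terms only
    documents = ["http://a.example", "http://b.example"]

    urls, word2vec_list_docs = [], []
    for i, doc in enumerate(data_tf):
        vecs = [word_vec[t] for t in doc
                if doc[t] > 5 and word_vec.get(t) is not None]
        if vecs:  # drop documents with no usable terms
            word2vec_list_docs.append(np.array(vecs).mean(axis=0).tolist())
            urls.append(documents[i])

    print(urls)                           # ['http://a.example']
    print(np.array(word2vec_list_docs))   # [[1. 2.]]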