Example #1
    def load_trie(self, trie_cache_file):
        '''
        Load a prebuilt trie from file or create a new one
        :return: the loaded or newly built trie
        '''
        trie = None

        if os.path.isfile(trie_cache_file):
            print('Start loading trie from %s' % trie_cache_file)
            with open(trie_cache_file, 'rb') as f:
                trie = pickle.load(f)

        else:
            print('Trie not found, creating %s' % trie_cache_file)
            listwords = []
            dict_files = [self.wordlist]
            for dict_file in dict_files:
                print(dict_file)
                # collect the preprocessed tokens of every non-empty line
                with open(dict_file, 'r', encoding='utf8') as file:
                    for line in file:
                        tokens = nlp.preprocessText(line)
                        if len(tokens) > 0:
                            listwords.append(tokens)

            # build the trie and cache it for the next run
            trie = MyTrie(listwords)
            with open(trie_cache_file, 'wb') as f:
                pickle.dump(trie, f)
        return trie

    def get_Topics_npFilter(self, topicdocs):
        ptopicdocs = nlp.preprocessed_docs(topicdocs)
        docs = [' '.join(doc.tokens) for doc in ptopicdocs]

        if not self.trained:
            self.train()
        matrix = self.model.transform(docs).todense()
        topic_dic = {}
        for i, doci in enumerate(ptopicdocs):
            chunks = self.npchunk(doci.sentences)

            # map every non-zero TF-IDF entry back to its vocabulary term
            tfidf_dic = {}
            for (x, y) in zip(matrix[i].tolist()[0], itertools.count()):
                if x > 0.0:
                    tfidf_dic[self.i2w[y]] = x

            # keep only the noun-phrase chunks that are TF-IDF features
            temptokens1 = []
            for chunk in chunks:
                if chunk in tfidf_dic:
                    temptokens1.append((tfidf_dic[chunk], ' '.join(
                        nlp.preprocessText(chunk,
                                           stemming=False,
                                           stopwords_removal=False))))

            topic_dic[doci.id] = temptokens1

        return topic_dic
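
A minimal sketch of the chunk-filtering idea in get_Topics_npFilter above, with plain dicts standing in for the trained model and the self.i2w index (all names and scores below are illustrative, not from the original code):

row = [0.0, 0.42, 0.17, 0.0]   # one TF-IDF row for a document
i2w = {0: 'model', 1: 'neural network', 2: 'training', 3: 'loss'}

# map every non-zero entry back to its vocabulary term
tfidf_dic = {i2w[j]: score for j, score in enumerate(row) if score > 0.0}

# keep only the noun-phrase chunks that survived as TF-IDF features
chunks = ['neural network', 'gradient descent', 'training']
keyphrases = [(tfidf_dic[c], c) for c in chunks if c in tfidf_dic]
print(keyphrases)  # [(0.42, 'neural network'), (0.17, 'training')]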
Example #3
def getConcepts(filename):
    '''Read a gold-standard concept list, one phrase per line.'''
    gold_list = set()

    with open(filename, encoding='utf8') as f:
        for line in f:
            gold_list.add(' '.join(
                nlp.preprocessText(line.replace("\n", "").strip().lower())))

    return list(gold_list)
Example #4
    def extract(self, testset, OUTPUT_FOL=None):
        if OUTPUT_FOL is None:
            raise ValueError('OUTPUT_FOL must point to an output folder')
        if not os.path.exists(OUTPUT_FOL):
            os.makedirs(OUTPUT_FOL)

        for doc in testset:
            # vectorize the preprocessed tokens and pickle one vector per doc
            tokens = nlp.preprocessText(doc.text)
            vec = self.vectorize(tokens).tolist()
            with open(os.path.join(OUTPUT_FOL, doc.id + ".txt.phrases"),
                      'wb') as f_:
                pickle.dump(vec, f_)
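
A hedged sketch of reading one of the pickled vectors back in; the folder name and document id below are hypothetical, the ".txt.phrases" naming follows extract() above:

import os
import pickle

OUTPUT_FOL = 'output'                      # hypothetical output folder
with open(os.path.join(OUTPUT_FOL, 'doc1.txt.phrases'), 'rb') as f_:
    vec = pickle.load(f_)                  # plain Python list of floats
print(len(vec))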
Example #5
    def train_D2V(self, ldocuments):
        '''
        Train a Doc2Vec model on the given documents
        '''
        document_dict = {}
        id2num_dict = {}
        documents = []
        for doc in ldocuments:
            # assign each document a running integer tag for Doc2Vec
            doc_num = len(document_dict)
            id2num_dict[doc.id] = doc_num

            words = nlp.preprocessText(doc.text)
            tagged_doc = TaggedDocument(words=words, tags=[doc_num])
            document_dict[doc.id] = (doc_num, tagged_doc)

            documents.append(tagged_doc)

        # d2v_model = Doc2Vec(size=self.config['d2v_vector_length'], window=self.config['d2v_window_size'], min_count=self.config['d2v_min_count'], workers=4, alpha=0.025, min_alpha=0.025) # use fixed documents rate
        # gensim < 4 keyword names (vector_size / epochs on gensim >= 4)
        d2v_model = Doc2Vec(size=300,
                            window=5,
                            min_count=3,
                            workers=10,
                            iter=30)
        d2v_model.build_vocab(documents)
        if self.pretrained_w2v_path:
            if self.pretrained_w2v_path.endswith('bin'):
                d2v_model.intersect_word2vec_format(self.pretrained_w2v_path,
                                                    binary=True)
            else:
                d2v_model.intersect_word2vec_format(self.pretrained_w2v_path,
                                                    binary=False)

        # a single train() call replaces the old manual per-epoch loop;
        # an explicit epochs count is required by gensim >= 1.0
        d2v_model.train(documents,
                        total_examples=len(documents),
                        epochs=d2v_model.iter)

        # store the model to mmap-able files
        d2v_model.save(self.model_path)

        if self.output_path is not None:
            if not os.path.exists(self.output_path):
                os.makedirs(self.output_path)

            # dump one document vector per file, keyed by document id
            for doc_id, (doc_num, _) in document_dict.items():
                with open(
                        os.path.join(self.output_path,
                                     doc_id + ".txt.phrases"), 'wb') as f_:
                    pickle.dump(d2v_model.docvecs[doc_num].tolist(), f_)

        return d2v_model
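
A hedged sketch of reusing the model saved by train_D2V above: gensim's Doc2Vec.load restores it and infer_vector produces a vector for unseen, pre-tokenised text (the path below is illustrative; in the example it is self.model_path):

from gensim.models.doc2vec import Doc2Vec

d2v_model = Doc2Vec.load('models/d2v.model')    # hypothetical path
tokens = ['keyphrase', 'extraction', 'with', 'doc2vec']
vec = d2v_model.infer_vector(tokens)
print(len(vec))                                 # 300 with size=300 above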
Example #6
    def scan(self, sentence, min_length=1, max_length=5):
        keyword_list = []
        tokens = nlp.preprocessText(sentence)

        ngrams = []
        for i in range(min_length, max_length + 1):
            ngrams += nltk.ngrams(tokens, i)

        for ngram in ngrams:
            phrase = ' '.join(ngram)
            if self.search(phrase):
                keyword_list.append(phrase)

        return keyword_list

    def npchunk(self, doc):
        npchunklist = []
        for sen in doc:
            # extract noun phrases with spaCy, then preprocess each chunk
            ichunklist = list(nlp_spacy(sen).noun_chunks)
            ichunklist = [
                nlp.preprocessText(str(ichunk.text)) for ichunk in ichunklist
            ]
            ichunklist = [ichunk for ichunk in ichunklist if len(ichunk) > 0]
            for ichunk in ichunklist:
                if len(ichunk) <= 3:
                    # short chunks are kept as a single phrase
                    npchunklist.append(' '.join(ichunk))
                else:
                    # longer chunks are split into overlapping trigrams
                    for nc in nltk.ngrams(ichunk, 3):
                        npchunklist.append(' '.join(nc))

        return list(set(npchunklist))
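
The scan method above asks, for every n-gram, whether it is a known keyword via MyTrie.search. A self-contained sketch of the same lookup with a plain set standing in for the trie (the vocabulary below is made up):

import nltk

known = {'machine learning', 'neural network', 'learning'}   # toy trie stand-in

tokens = 'deep neural network for machine learning'.split()
found = []
for n in range(1, 4):                      # n-grams of length 1..3
    for ngram in nltk.ngrams(tokens, n):
        phrase = ' '.join(ngram)
        if phrase in known:
            found.append(phrase)
print(found)   # ['learning', 'neural network', 'machine learning']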

def getGlobalngrams(grams, documents, threshold):

    # concatenate the whole collection into a single pseudo-document
    singlecorpus = ""
    for doc in documents:
        singlecorpus += ' ' + doc.text + '\n'

    ncorpus = ' '.join(nlp.preprocessText(singlecorpus))
    tf = TfidfVectorizer(analyzer='word',
                         ngram_range=grams,
                         stop_words=nlp.stopwords)
    tfidf_matrix = tf.fit_transform([ncorpus])
    feature_names = tf.get_feature_names()  # get_feature_names_out() on newer scikit-learn
    doc = tfidf_matrix.todense()
    # keep every n-gram scoring above the threshold, highest first
    temptokens = zip(doc.tolist()[0], itertools.count())
    temptokens = [(x, y) for (x, y) in temptokens if x > threshold]
    tokindex = heapq.nlargest(len(temptokens), temptokens)
    global1grams = dict([(feature_names[y], x) for (x, y) in tokindex])
    topindex = [(feature_names[y], x) for (x, y) in tokindex]
    with open('data/file' + str(grams[0]) + ".txt", 'w') as f:
        for key in global1grams:
            f.write(key + "," + str(global1grams[key]) + "\n")

    return global1grams, topindex
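
A hedged usage sketch of the same corpus-level n-gram scoring on a toy string, without the nlp preprocessing helpers; get_feature_names_out is the newer scikit-learn spelling of the call used above:

from sklearn.feature_extraction.text import TfidfVectorizer

corpus = 'neural keyphrase extraction and neural networks for keyphrase extraction'
tf = TfidfVectorizer(analyzer='word', ngram_range=(2, 2))
row = tf.fit_transform([corpus]).toarray()[0]
names = tf.get_feature_names_out()    # get_feature_names() on older releases
top = sorted(zip(row, names), reverse=True)[:3]
print(top)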