Example #1
import logging

# TextProcessing is the project's own text-cleaning helper used throughout these examples


class PreprocessCorpusFile:
    def __init__(self, file_in='', file_out='', dir='', n_docs=-1):
        self.file_in = file_in
        self.file_out = file_out
        self.n_docs = n_docs

        self.tp = TextProcessing(dir=dir)

        self.process_corpus()

    def process_corpus(self):
        fin = open(self.file_in, 'r')
        fout = open(self.file_out, 'w')

        i = 0
        for line in fin:
            # cleaning the line
            stemmed_tokens = self.tp.clean_line(line)

            # write to file
            fout.write(' '.join(stemmed_tokens))
            fout.write('\n')

            i += 1
            if self.n_docs != -1 and i >= self.n_docs:
                break
            if i % 1000 == 0:
                logging.debug('Sentence %s processed' % i)

        # close input and output files
        fin.close()
        fout.close()
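A minimal usage sketch for PreprocessCorpusFile (the file names and working directory are illustrative): cleaning runs from the constructor, so building the object is enough to rewrite the corpus with one stemmed, space-separated document per line.

import logging

logging.basicConfig(level=logging.DEBUG)

# clean the raw corpus and write the stemmed version to corpus.txt
PreprocessCorpusFile(file_in='raw_corpus.txt',
                     file_out='corpus.txt',
                     dir='./data/',
                     n_docs=10000)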
Example #2
import logging

from gensim.corpora import Dictionary
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words


class Sentences:
    def __init__(self, corpus_file, n_docs=-1):
        self.corpus_file = corpus_file
        self.n_docs = n_docs

        self.tp = TextProcessing(dir='')

        self.dictionary = Dictionary()
        self.tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
        self.en_stop = get_stop_words('en')
        self.p_stemmer = PorterStemmer()

    def __iter__(self):
        logging.info("Loading corpus in file %s" % self.corpus_file)

        i = 0
        for line in open(self.corpus_file, 'r'):
            # cleaning the line
            stemmed_tokens = self.tp.clean_line(line)

            # add the line's tokens to the dictionary
            self.dictionary.add_documents([stemmed_tokens])

            # yield the tokenized document
            yield stemmed_tokens

            # count number of documents and break if > n_docs
            i += 1
            if self.n_docs != -1 and i >= self.n_docs:
                break
            if i % 1000 == 0:
                logging.debug('Document %s loaded' % i)
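A small sketch of how the Sentences iterator might be consumed (the corpus path is illustrative): each pass over the object re-reads the corpus file, yields one tokenized document per line, and grows the internal dictionary as a side effect, so a first pass can be used just to build the dictionary.

sentences = Sentences('corpus.txt', n_docs=5000)

# first pass: populate sentences.dictionary
for tokens in sentences:
    pass

print('dictionary size:', len(sentences.dictionary))
sentences.dictionary.save('myDictionary')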
Example #3
import numpy as np

from gensim.corpora import Dictionary
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence


class MyWord2Vec:
    def __init__(self, dir=''):
        self.dir = dir
        self.dictionary = Dictionary.load(dir + 'myDictionary')

        self.tp = TextProcessing(dir=dir)

        self.size = 100

    def load_corpus(self, file_name, num_docs):
        texts = []
        i = 0
        for line in open(file_name, 'r'):
            # cleaning the line
            stemmed_tokens = self.tp.clean_line(line)

            # add tokens to list
            texts.append(stemmed_tokens)

            # count number of documents and break if > num_docs
            i += 1
            if num_docs != -1 and i >= num_docs:
                break

        # return the list of tokenized documents
        return texts

    def train_model(self, file_name='corpus.txt', num_docs=-1, size=100):
        self.size = size

        # generate corpus by streaming sentences straight from the file
        # (alternative: corpus = self.load_corpus(file_name, num_docs))
        # LineSentence expects limit=None, not -1, to read the whole file
        limit = None if num_docs == -1 else num_docs
        corpus = LineSentence(file_name, limit=limit)

        # generate Word2Vec model
        model = Word2Vec(corpus, size=size, window=5, min_count=10, workers=3)
        return model

    def update_model(self, model, file_name, num_docs=-1):
        # generate new corpus
        corpus = self.load_corpus(file_name, num_docs)

        # grow the vocabulary and continue training on the new documents
        # (gensim's Word2Vec has no update(); build_vocab(update=True) + train is used instead)
        model.build_vocab(corpus, update=True)
        model.train(corpus, total_examples=len(corpus), epochs=model.epochs)

    def get_word_embedding(self, model, word):
        if word in model.wv.vocab:
            vec = model.wv[word]
        else:
            w_clean = self.tp.clean_word(word)
            if w_clean in model.wv.vocab:
                vec = model.wv[w_clean]
            else:
                vec = np.zeros(self.size)

        return vec

    def get_sentence_embedding(self, model, line):
        words = self.tp.clean_line(line)
        vec = np.zeros(self.size)

        n_words = 0
        for w in words:
            if w in model.wv:
                vec += model.wv[w]
                n_words += 1

        if n_words > 0:
            return vec / n_words
        else:
            return vec

    def save_model(self, model):
        model.save(self.dir + 'myW2Vmodel')
        #self.dictionary.save('myDictionary')

    def load_model(self):
        model = Word2Vec.load(self.dir + 'myW2Vmodel')
        return model
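A usage sketch for MyWord2Vec, assuming the gensim 3.x API used above (size=, model.wv.vocab) and an existing 'myDictionary' file in dir; the corpus path and parameters are illustrative.

w2v = MyWord2Vec(dir='./data/')

model = w2v.train_model(file_name='./data/corpus.txt', num_docs=100000, size=100)
w2v.save_model(model)

# look up a single word and average a whole sentence
word_vec = w2v.get_word_embedding(model, 'learning')
sent_vec = w2v.get_sentence_embedding(model, 'machine learning on large corpora')
print(word_vec.shape, sent_vec.shape)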
Example #4
import logging

from gensim.corpora import Dictionary
from gensim.models import LdaModel, LdaMulticore
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words

# Sentences is the streaming corpus class from Example #2


class LDA:
    def __init__(self, dir='', load_dict=False):
        self.dir = dir
        self.tp = TextProcessing(dir=self.dir)

        # create empty dictionary:
        #self.dictionary = Dictionary()
        self.dictionary = Dictionary.load(dir + 'myDictionary')
        self.save_dict = True

        self.tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
        self.en_stop = get_stop_words('en')
        self.p_stemmer = PorterStemmer()

    def clean_line(self, line):
        raw = line.lower()
        tokens = self.tokenizer.tokenize(raw)

        # remove stop words from tokens
        stopped_tokens = [i for i in tokens if not i in self.en_stop]

        # stem tokens
        r = []
        for i in stopped_tokens:
            try:
                r.append(self.clean_word(i))
            except:
                logging.info("Can't process word %s" % i)
        return r

    def clean_word(self, word):
        stemmed_word = self.p_stemmer.stem(word)
        return stemmed_word

    def load_corpus(self, file_name, num_docs):
        logging.info("Loading corpus in file %s" % file_name)
        texts = []
        i = 0
        for line in open(file_name, 'r'):
            # cleaning the line
            stemmed_tokens = self.tp.clean_line(line)

            # add tokens to list
            texts.append(stemmed_tokens)

            # count number of documents and break if > num_docs
            i += 1
            if num_docs != -1 and i >= num_docs:
                break
            if i % 1000 == 0:
                logging.debug('Document %s loaded' % i)

        # the id <-> term dictionary is loaded once in __init__ ('myDictionary'),
        # so here we only convert the tokenized documents into a
        # bag-of-words (document-term) corpus
        return [self.dictionary.doc2bow(text) for text in texts]

    def train_model(self,
                    file_name='corpus.txt',
                    num_docs=-1,
                    num_topics=50,
                    passes=20,
                    multicore=False):
        # generate LDA model
        if not multicore:
            corpus = self.load_corpus(file_name, num_docs)
            ldamodel = LdaModel(corpus,
                                num_topics=num_topics,
                                id2word=self.dictionary,
                                passes=passes)
        else:
            # stream the tokenized documents and convert them to bag-of-words
            sentences = Sentences(file_name, num_docs)
            corpus = [self.dictionary.doc2bow(tokens) for tokens in sentences]
            ldamodel = LdaMulticore(corpus,
                                    num_topics=num_topics,
                                    id2word=self.dictionary,
                                    passes=passes,
                                    workers=3)

        return ldamodel

    def update_model(self, ldamodel, file_name, num_docs=-1):
        # generate new corpus
        corpus = self.load_corpus(file_name, num_docs)

        # generate LDA model
        ldamodel.update(corpus)

    def get_document_topics(self, ldamodel, text, n=1):
        text = self.tp.clean_line(text)
        bow = self.dictionary.doc2bow(text)

        if n == 1:
            return ldamodel.get_document_topics(bow, minimum_probability=0)

        list_d = []
        keys = set()
        for _ in range(n):
            d = dict(ldamodel.get_document_topics(bow))
            list_d.append(d)
            for k in d.keys():
                keys.add(k)

        probs = []
        for k in keys:
            mean = 0
            for i in range(n):
                if k in list_d[i].keys():
                    mean += list_d[i][k]
            probs.append((k, mean / n))
        return probs

    def show_topic_words(self, ldamodel, topic_id, topn=10):
        terms = ldamodel.get_topic_terms(topic_id, topn=topn)
        r = []
        for w_id, p in terms:
            print(self.dictionary[w_id], ' \t ', p)
            r.append((self.dictionary[w_id], p))
        return r

    def save_model(self, ldamodel):
        ldamodel.save(self.dir + 'myLDAmodel')

    def load_model(self):
        return LdaModel.load(self.dir + 'myLDAmodel')
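A usage sketch for the LDA wrapper, assuming the pre-built 'myDictionary' file in dir that __init__ loads and the Sentences class from Example #2 for the multicore path; paths and topic counts are illustrative.

lda = LDA(dir='./data/')

ldamodel = lda.train_model(file_name='./data/corpus.txt',
                           num_docs=50000,
                           num_topics=50,
                           passes=20,
                           multicore=True)
lda.save_model(ldamodel)

# inspect one topic and score an unseen document
lda.show_topic_words(ldamodel, topic_id=0, topn=10)
print(lda.get_document_topics(ldamodel, 'an unseen piece of text', n=5))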