Example #1
def create_dictionaries(model=None,
                        combined=None):
    ''' This function does a number of jobs:
        1- Creates a word to index mapping
        2- Creates a word to vector mapping
        3- Transforms the Training and Testing Dictionaries

    '''
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.vocab.keys(),
                            allow_update=True)
        w2indx = {v: k+1 for k, v in gensim_dict.items()}  # index of every word with frequency over 10
        w2vec = {word: model[word] for word in w2indx.keys()}  # word vector of every word with frequency over 10

        def parse_dataset(combined):
            ''' Words become integers
            '''
            data=[]
            for sentence in combined:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0)
                data.append(new_txt)
            return data
        combined = parse_dataset(combined)
        combined = sequence.pad_sequences(combined, maxlen=maxlen)  # index sequence per sentence; words with frequency below 10 get index 0
        return w2indx, w2vec, combined
    else:
        print('No data provided...')
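A minimal driver for the function above, as a hedged sketch: it assumes gensim < 4.0 (where the vectors still expose `.vocab`), Keras' `sequence` module, and a module-level `maxlen`; the toy sentences are illustrative only.

# Imports assumed by the snippet above, plus toy data for this sketch.
from gensim.corpora.dictionary import Dictionary
from gensim.models import Word2Vec
from keras.preprocessing import sequence  # supplies pad_sequences used above

maxlen = 20  # assumed module-level global read by create_dictionaries

sentences = [['this', 'movie', 'was', 'great'],
             ['terrible', 'plot', 'and', 'worse', 'acting']]
w2v = Word2Vec(sentences, size=50, min_count=1)  # `size=` is the pre-4.0 keyword
w2indx, w2vec, padded = create_dictionaries(model=w2v.wv, combined=sentences)
print(padded.shape)  # (2, 20): one row of word indices per sentence, zero-padded to maxlen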
Example #2
class WordCorpus(BaseCorpus):
    """\
    Wrapper around a `gensim.corpora.dictionary.Dictionary`.

    This is a light-weight alternative to `CableCorpus` to create an initial
    word dictionary::

        wd = WordCorpus()
        wd.add_text('ref-1', 'bla bla')
        # add more texts
        wd.dct.filter_extremes()

        corpus = CableCorpus('/my/directory/', wd.dct)
        corpus.add_text('ref-1', 'bla bla')
        # add more texts
        corpus.close()
    """
    def __init__(self, dct=None, tokenizer=None):
        """\
        Initializes the wrapper.

        `dct`
            An existing Dictionary or ``None`` if a new Dictionary should be
            created (default)
        `tokenizer`
            A tokenizer function or ``None``, see `BaseCorpus`
        """
        super(WordCorpus, self).__init__(tokenizer)
        self.dct = Dictionary() if dct is None else dct

    def add_words(self, reference_id, words):
        self.dct.doc2bow(words, True)
Example #3
def build_dictionaries_from_splits(splits_template, n, save_pickle_tup=None):
    ''' Builds all 3 dictionaries from splits. If provided, `save_pickle_tup` must
        be a 3-tuple of the picklefile names in the following order:
        
        (title, body, tags)
        
        If `save_pickle_tup[i]` is None, the corresponding dictionary will not be saved.
    '''
    utitledict, ubodydict, utagdict = Dictionary(), Dictionary(), Dictionary()
    for eid in range(n):
        for row in row_stream(splits_template % eid):
            ID, title, body, tags = row
            utitledict.doc2bow(title.split(), allow_update=True)
            ubodydict.doc2bow(body.split(), allow_update=True)
            utagdict.doc2bow(tags.split(), allow_update=True)
    
    assert ubodydict.num_docs == utitledict.num_docs == utagdict.num_docs
    print("Before filtering...")
    print("utitledict:", utitledict)
    print("ubodydict:", ubodydict)
    print("utagdict:", utagdict)
    
    if save_pickle_tup:
        assert len(save_pickle_tup) == 3
        if save_pickle_tup[0]:
            print("saving utitledict...")
            utitledict.save(save_pickle_tup[0])
        if save_pickle_tup[1]:
            print("saving ubodydict...")
            ubodydict.save(save_pickle_tup[1])
        if save_pickle_tup[2]:
            print("saving utagdict...")
            utagdict.save(save_pickle_tup[2])
            
    return (utitledict, ubodydict, utagdict)
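A hypothetical invocation of the function above; the split-file template, the pickle names, and the `row_stream` helper (yielding `(ID, title, body, tags)` rows) are placeholders, not real paths.

# Hypothetical call with placeholder file names.
titledict, bodydict, tagdict = build_dictionaries_from_splits(
    'split_%d.csv', n=4,
    save_pickle_tup=('title.dict', 'body.dict', None))  # None: don't save the tag dictionary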
Example #4
def doc_to_gensim(doc, lemmatize=True,
                  filter_stops=True, filter_punct=True, filter_nums=False):
    """
    Convert a single ``spacy.Doc`` into a gensim dictionary and bag-of-words document.

    Args:
        doc (``spacy.Doc``)
        lemmatize (bool): if True, use lemmatized strings for words; otherwise,
            use the original form of the string as it appears in ``doc``
        filter_stops (bool): if True, remove stop words from word list
        filter_punct (bool): if True, remove punctuation from word list
        filter_nums (bool): if True, remove numbers from word list

    Returns:
        :class:`gensim.Dictionary <gensim.corpora.dictionary.Dictionary>`:
            integer word ID to word string mapping
        list((int, int)): bag-of-words document, a list of (integer word ID, word count)
            2-tuples
    """
    gdict = Dictionary()
    words = extract.words(doc,
                          filter_stops=filter_stops,
                          filter_punct=filter_punct,
                          filter_nums=filter_nums)
    if lemmatize is True:
        gdoc = gdict.doc2bow((word.lemma_ for word in words), allow_update=True)
    else:
        gdoc = gdict.doc2bow((word.orth_ for word in words), allow_update=True)

    return (gdict, gdoc)
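A hedged usage sketch for `doc_to_gensim`: it assumes spaCy with the `en_core_web_sm` model installed and that `extract` above is textacy's `textacy.extract` module.

# Imports assumed by the snippet above (the sketch only adds spaCy).
import spacy
from gensim.corpora.dictionary import Dictionary
from textacy import extract

nlp = spacy.load('en_core_web_sm')
doc = nlp('The quick brown fox jumps over the lazy dog. The dog sleeps.')
gdict, gdoc = doc_to_gensim(doc, lemmatize=True)
print(gdict.token2id)  # word string -> integer id
print(gdoc)            # [(word id, count), ...] for this single document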
def create_dictionaries(train=None,
                        test=None,
                        model=None):
    ''' This function does a number of jobs:
        1- Creates a word to index mapping
        2- Creates a word to vector mapping
        3- Transforms the Training and Testing Dictionaries

    '''
    if (train is not None) and (model is not None) and (test is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.vocab.keys(),
                            allow_update=True)
        w2indx = {v: k+1 for k, v in gensim_dict.items()}
        w2vec = {word: model[word] for word in w2indx.keys()}

        def parse_dataset(data):
            ''' Words become integers
            '''
            for key in data.keys():
                txt = data[key].lower().replace('\n', '').split()
                new_txt = []
                for word in txt:
                    try:
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0)
                data[key] = new_txt
            return data
        train = parse_dataset(train)
        test = parse_dataset(test)
        return w2indx, w2vec, train, test
    else:
        print('No data provided...')
Example #6
    def _load_vocab(self, fname):
        logging.info("loading plain-text file:{}".format(fname))
        src_file = codecs.open(fname, 'rb', 'utf-8')
        dictionary = Dictionary()

        num_instances = 0
        for term in src_file:
            dictionary.doc2bow(term.strip().lower().encode('utf-8').split(), allow_update=True)
            num_instances += 1

        logging.info("processed {} instances".format(num_instances))
        self.dictionary = dictionary
Example #7
def get_corpus_dictionary():
    """Crafts a toy corpus and the dictionary associated."""
    # Toy corpus.
    corpus = [
        ['carrot', 'salad', 'tomato'],
        ['carrot', 'salad', 'dish'],
        ['tomato', 'dish'],
        ['tomato', 'salad'],

        ['car', 'break', 'highway'],
        ['highway', 'accident', 'car'],
        ['moto', 'break'],
        ['accident', 'moto', 'car']
    ]

    dictionary = Dictionary(corpus)

    # Transforming corpus with dictionary.
    corpus = [dictionary.doc2bow(doc) for doc in corpus]

    # Building reverse index.
    for (token, uid) in dictionary.token2id.items():
        dictionary.id2token[uid] = token

    return corpus, dictionary
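The toy builder above is self-contained apart from the `Dictionary` import, so it can be exercised directly:

from gensim.corpora.dictionary import Dictionary  # import assumed by the snippet above

corpus, dictionary = get_corpus_dictionary()
print(dictionary)              # e.g. Dictionary(9 unique tokens: ['carrot', 'salad', ...])
print(corpus[0])               # bag-of-words of the first document: [(0, 1), (1, 1), (2, 1)]
print(dictionary.id2token[0])  # reverse lookup filled in by the loop above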
Example #8
def preprocess_corpora(corpora, stopwords, allowed_pos, max_doc=float('inf'), no_above=0.5, no_below=1, keep_n=None):
    """Lemmatize and clean the corpora, then build a gensim Dictionary from them.

    :rtype : gensim.corpora.dictionary.Dictionary
    :param corpora: dict mapping a document id to its raw text
    :param stopwords: collection of stop words to remove
    :param allowed_pos: POS tags passed to the lemmatizer as ``allowed_tags``
    :param max_doc: maximum number of documents to process
    :return: the Dictionary, with the bag-of-words corpus attached as ``dictionary.corpus``
    """
    logging.info('Lemmatizing the corpora...')
    count = 0
    corpus_num = len(corpora)
    processed_corpora = []
    corpus_id2orig_id = []

    for index, corpus in corpora.items():
        count += 1
        if count > max_doc:
            break
        if corpus is None:  # skip if corpus is None
            continue

        print('\r', count, '/', corpus_num, end='')
        cleaned_corpus = clean_text(corpus)  # delete irrelevant characters
        corpus = []
        tokens = lemmatize(content=cleaned_corpus, allowed_tags=allowed_pos)
        for token in tokens:
            word, pos = token.split('/')
            corpus.append(word)

        # convert compound word into one token
        corpus = convert_compound(corpus)

        # filter stop words, long words, and non-english words
        corpus = [w for w in corpus if w not in stopwords and 2 <= len(w) <= 15 and w.islower()]
        processed_corpora.append(corpus)
        corpus_id2orig_id.append(index)

    print('\n')

    logging.info('Creating dictionary and corpus...')
    dictionary = Dictionary(processed_corpora)
    dictionary.corpus_id2orig_id = corpus_id2orig_id

    logging.info('Filtering unimportant terms...')
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=keep_n)
    dictionary.compactify()

    logging.info('Generating corpus...')
    dictionary.corpus = [dictionary.doc2bow(corpus) for corpus in processed_corpora]
    dictionary.id2token = revdict(dictionary.token2id)

    return dictionary
Example #9
def create_mapping_dicts(wrd_embedding, filter_corpus=False, bodies=None,
                         headlines=None): 
    """Generate word:index, word:vector, index:word dictionaries. 

    Args: 
    ----
        wrd_embedding: gensim.models.word2vec.Word2Vec fitted model
        filter_corpus (optional): boolean  
            Filter the corpus to only those words seen in the bodies/headlines. 
        bodies (optional): list of lists 
            Must be passed in if `filter_corpus` is True. 
        headlines (optional): list of lists  
            Must be passed in if `filter_corpus` is True. 

    Return: 
    ------
        word_idx_dct: dict
        idx_word_dct: dict
        word_vector_dct: dict
    """

    if filter_corpus:
        if (not bodies or not headlines): 
            excep_str = "Must pass in bodies and headlines with filter_corpus True!"
            raise Exception(excep_str)
        else: 
            wrd_embedding = _filter_corpus(bodies, headlines, wrd_embedding)

    gensim_dct = Dictionary()
    gensim_dct.doc2bow(wrd_embedding.vocab.keys(), allow_update=True)

    # Leave index 0 for the newline character
    word_idx_dct = {wrd: (idx + 1) for idx, wrd in gensim_dct.items()}
    idx_word_dct = {(idx + 1): wrd for idx, wrd in gensim_dct.items()}
    word_idx_dct['\n'] = 0
    idx_word_dct[0] = '\n'

    word_vector_dct = {wrd: wrd_embedding[wrd] for idx, wrd in gensim_dct.items()}
    vec_dim = next(len(value) for value in word_vector_dct.values())
    word_vector_dct['\n'] = np.zeros((vec_dim))

    return word_idx_dct, idx_word_dct, word_vector_dct 
Example #10
def create_mapping_dicts(wrd_embedding, reviews=None, vocab_size=None):
    """Generate word:index, word:vector, index:word dictionaries. 

    Args: 
    ----
        wrd_embedding: gensim.models.word2vec.Word2Vec fitted model
        reviews (optional): np.array (or array-like) of lists of strings
            Used to filter the vocabulary, either to only those words in `reviews`
            or the most common `vocab_size` words in `reviews` that are also in 
            the `wrd_embedding`.
        vocab_size (optional): int
            Keep only `vocab_size` most common words from the reviews. 

    Return: 
    ------
        word_idx_dct: dict
        idx_word_dct: dict
        word_vector_dct: dict
    """

    if reviews is not None: 
        wrd_embedding = _filter_corpus(wrd_embedding, reviews, vocab_size)

    gensim_dct = Dictionary()
    gensim_dct.doc2bow(wrd_embedding.vocab.keys(), allow_update=True)

    # Leave index 0 for masking the padding, 1 for the end of sequence
    # character (EOS), and 2 for unknown words (denoted 'UNK')
    wrd_idx_dct = {wrd: (idx + 3) for idx, wrd in gensim_dct.items()}
    idx_wrd_dct = {(idx + 3): wrd for idx, wrd in gensim_dct.items()}
    wrd_idx_dct['EOS'] = 1
    idx_wrd_dct[1] = 'EOS'
    wrd_idx_dct['UNK'] = 2
    idx_wrd_dct[2] = 'UNK'

    wrd_vector_dct = {wrd: wrd_embedding[wrd] for idx, wrd in gensim_dct.items()}
    embedding_dim = wrd_embedding.vector_size
    wrd_vector_dct['EOS'] = np.zeros((embedding_dim))
    wrd_vector_dct['UNK'] = np.zeros((embedding_dim))

    return wrd_idx_dct, idx_wrd_dct, wrd_vector_dct 
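A hedged sketch of driving the function above, assuming gensim < 4.0 (where the vectors still expose `.vocab`); it mainly shows that indices 0-2 stay reserved for padding, 'EOS' and 'UNK'.

# Imports assumed by the snippet above, plus a toy embedding for this sketch.
import numpy as np
from gensim.corpora.dictionary import Dictionary
from gensim.models import Word2Vec

reviews = [['good', 'coffee'], ['terrible', 'service', 'coffee']]
w2v = Word2Vec(reviews, size=32, min_count=1)  # `size=` is the pre-4.0 keyword
wrd_idx_dct, idx_wrd_dct, wrd_vector_dct = create_mapping_dicts(w2v.wv)
print(wrd_idx_dct['EOS'], wrd_idx_dct['UNK'])  # 1 2 -- real words start at index 3
print(wrd_vector_dct['coffee'].shape)          # (32,)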
Example #11
class tip_rec:

	def __init__(self, num_topics = 15):
		self.numtopics = num_topics
		self.topic_dict = dict(enumerate(np.zeros(num_topics)))
		self.user_dict = {}
		self.model = None
		self.worddict = {}
		self.mydict = None


	def train(self, df):
		self.user_dict = {el:self.topic_dict.copy() for el in df.sender.unique()}
		cv = CV(stop_words='english')
		X = cv.fit_transform(df['context'])
		vocab = cv.vocabulary_  # term -> column index assigned by the vectorizer
		self.worddict = {idx: term for term, idx in vocab.items()}  # id2word keyed by column index
		self.mydict = Dictionary()
		self.mydict = self.mydict.from_corpus(matutils.Sparse2Corpus(X, documents_columns=False), id2word=self.worddict)
		self.model = LatentDA.LdaModel(matutils.Sparse2Corpus(X, documents_columns=False), num_topics=self.numtopics, passes=20, id2word=self.worddict)
		for i in df.iterrows():
			if i[1]['context'] == '':
				continue
			else:
				values = self.model[self.mydict.doc2bow(i[1]['context'].split())]
				for val in values:
					if val[0] in self.user_dict[i[1].sender].keys():
						if i[1].amt == '':
							continue
						self.user_dict[i[1].sender][val[0]] += val[1] * float(i[1].amt)
						continue
					self.user_dict[i[1].sender][val[0]] = val[1]
		for i in self.user_dict.keys():
			norm_const = sum(self.user_dict[i].values())
			for j in self.user_dict[i].keys():
				self.user_dict[i][j] = self.user_dict[i][j] / norm_const

	def predict(self, text, username = ''):
		topics = self.model[self.mydict.doc2bow(text.split())]
		doc_aff = np.zeros(self.numtopics)
		for i in topics:
			doc_aff[i[0]] = i[1]
		if username == '':
			returndict = {}
			for user in self.user_dict.keys():
				user_aff = np.array(list(self.user_dict[user].values()))
				score = np.linalg.norm(user_aff - doc_aff)
				returndict[user] = score
			return returndict
		else:
			user_aff = np.array(list(self.user_dict[username].values()))
			score = np.linalg.norm(user_aff - doc_aff)
			return (username, score)
Example #12
def cluster_questions(topic_num,
                      res_path,
                      q_path=r'datasets\DialogQA\Qall.txt',
                      a_path=r'datasets\DialogQA\Aall.txt'):
    with open(a_path, 'r', encoding='utf-8') as f:
        common_texts = [text.split() for text in f.readlines()]

    with open(q_path, 'r', encoding='utf-8') as f:
        questions = [text for text in f.readlines()]

    common_dictionary = Dictionary(common_texts)
    common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]

    lda = LdaModel(common_corpus, num_topics=topic_num)

    questions_clustered = [[] for i in range(topic_num)]
    print('Questions : ', len(questions))
    perp = lda.log_perplexity(common_corpus)
    for i, q in enumerate(questions):
        other_corpus = [common_dictionary.doc2bow(common_texts[i])]
        vector = lda[other_corpus]
        # print(vector[0])
        max_prob = 0
        for (idx, prob) in vector[0]:
            # print(idx)
            if prob > max_prob:
                topic = idx
                max_prob = prob
        questions_clustered[topic].append(q)
        # print(topic)
    if not os.path.exists(res_path):
        os.makedirs(res_path)
    for top in range(topic_num):
        with open(res_path + str(top) + '.txt', 'w', encoding='utf-8') as f:
            for quest in questions_clustered[top]:
                f.write(quest)
                # f.write('\n')

    return perp
Example #13
def create_dictionaries(model=None, combined=None):
    ''' This function does a number of jobs:
        1- Creates a word to index mapping
        2- Creates a word to vector mapping
        3- Transforms the Training and Testing Dictionaries

    '''
    if (combined is not None) and (model is not None):
        # build a Dictionary() from the word2vec vocabulary, model.vocab.keys()
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.vocab.keys(), allow_update=True)
        #  words with frequency below 10 map to 0, hence k+1
        w2indx = {v: k + 1
                  for k, v in gensim_dict.items()
                  }  # index of every word with frequency over 10, (k->v)=>(v->k)
        w2vec = {word: model[word]
                 for word in w2indx.keys()
                 }  # word vector of every word with frequency over 10, (word->model(word))

        def parse_dataset(combined):  # closure, used only inside this function
            ''' Words become integers
            '''
            data = []
            for sentence in combined:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0)  # words with frequency below 10 map to 0
                data.append(new_txt)
            return data  # word=>index

        combined = parse_dataset(combined)
        combined = sequence.pad_sequences(
            combined, maxlen=maxlen)  # index sequence per sentence; words with frequency below 10 get index 0
        return w2indx, w2vec, combined
    else:
        print('No data provided...')
Example #14
class LDATextEncoder(TextEncoder):
    def __init__(self, language="english", encoding_length=20):
        self.name = "LDA"
        self.model = None
        self.num_topics = encoding_length
        self.dictionary = None
        super().__init__(language=language, encoding_length=encoding_length)

    def fit(self, docs):
        docs = self.preprocess_docs(docs)
        self.dictionary = Dictionary(docs)
        corpus = [self.dictionary.doc2bow(doc) for doc in docs]
        self.model = LdaModel(corpus,
                              id2word=self.dictionary,
                              num_topics=self.num_topics,
                              minimum_probability=0.0)
        return self

    def transform(self, docs):
        docs = self.preprocess_docs(docs)
        docs = [self.dictionary.doc2bow(doc) for doc in docs]
        return np.array([self.model[doc] for doc in docs])[:, :, 1]
def create_dictionaries(model=None, combined=None):
    ''' This function does a number of jobs:
        1- Creates a word to index mapping
        2- Creates a word to vector mapping
        3- Transforms the Training and Testing Dictionaries
        4- Returns the concatenated vectors of all words
    '''
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        # collect the keys, i.e. the word set of the dictionary
        gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
        # word => index mapping
        w2indx = {v: k + 1 for k, v in gensim_dict.items()}
        # word => word-vector mapping
        w2vec = {word: model[word] for word in w2indx.keys()}

        def parse_dataset(combined):
            ''' Words become integers
            '''
            data = []
            for sentence in combined:
                new_txt = []
                sentences = sentence.split(' ')
                for word in sentences:
                    try:
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0)
                data.append(new_txt)
            return data

        combined = parse_dataset(combined)
        # pad with zeros
        combined = sequence.pad_sequences(combined)
        global input_length
        input_length = len(combined[0])
        return w2indx, w2vec, combined
    else:
        print('error: the model or the combined data set is empty')
Example #16
def gensim_lda(pd_df_yelp, text_rev):  #gensim lda
    common_dict = Dictionary(text_rev)
    common_corpus = [common_dict.doc2bow(text) for text in text_rev]
    lda = LdaModel(common_corpus)
    topics = [lda.get_document_topics(doc) for doc in common_corpus]
    topicIDs = [topic[0][0] for topic in topics]
    topic_prob_list = [lda.show_topic(topicID) for topicID in topicIDs]
    topic_prob_list_split = [list(zip(*item)) for item in topic_prob_list]
    topic_prob_list_words = [list(map(lambda topID: dict(common_dict)[int(topID)],item[0]))\
     for item in topic_prob_list_split]
    topic_prob_list_prob = list(
        map(lambda item: list(item[1]), topic_prob_list_split))
    return (topic_prob_list_words, topic_prob_list_prob)
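A quick check of the function above; the first argument is never used inside the body, so a placeholder is passed, and the reviews are toy token lists.

# Imports assumed by the snippet above.
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel

reviews = [['great', 'food', 'great', 'service'],
           ['food', 'was', 'cold'],
           ['friendly', 'service']]
words, probs = gensim_lda(None, reviews)  # pd_df_yelp is unused, so None works here
print(words[0])  # terms of the topic picked for the first review
print(probs[0])  # matching topic-term probabilities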
Example #17
def create_dictionaries(data, model, feature):
    gensim_dict = Dictionary()
    gensim_dict.doc2bow(model.vocab.keys(), allow_update=True)
    w2idx = {v: k + 1 for k, v in gensim_dict.items()}
    w2idxl = {v.lower(): k + 1 for k, v in gensim_dict.items()}
    #w2vec = {word: model[word.lower()] for word in w2idx.keys()}
    w2vec = {}
    for word in w2idx.keys():
        if feature == 'bow':
            try:
                w2vec[word.lower()] = model[word]
            except KeyError:
                w2vec[word.lower()] = [0] * model.vector_size
        else:
            try:
                w2vec[word] = model[word]
            except KeyError:
                w2vec[word] = [0] * model.vector_size

    def parse_dataset(data, feature):
        for key in data.keys():
            if feature == 'bow':
                txt = data[key].lower().replace('\n', '').split()
            else:
                txt = data[key].replace('\n', '').split()
            new_txt = []
            for word in txt:
                try:
                    if feature == 'bow':
                        new_txt.append(w2idxl[word])
                    else:
                        new_txt.append(w2idx[word])
                except:
                    new_txt.append(0)
            data[key] = new_txt
        return data

    out = parse_dataset(data, feature)
    return w2idx, w2vec, out
Example #18
def get_topic_words(sent, stop_words, cnt=15):
    sent = re.sub(r'[\r\n]', '', sent)
    wlst = jieba.lcut(sent)
    ls = []
    for w in wlst:
        if w not in stop_words:
            ls.append(w)

    di = Dictionary([ls])
    corpus = [di.doc2bow(text) for text in [ls]]
    lda = LdaModel(corpus, id2word=di, num_topics=1)
    tp = lda.print_topics(num_words=cnt)[0][1]
    return re.findall('"(.+?)"', tp)
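A hedged usage sketch for the function above; it needs the jieba package, and the stop-word set below is only a tiny placeholder.

# Imports assumed by the snippet above.
import re
import jieba
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel

stop_words = {'的', '了', '和'}  # placeholder stop-word set
sent = '自然语言处理是人工智能的一个重要方向,主题模型可以从文本中发现隐藏的主题。'
print(get_topic_words(sent, stop_words, cnt=5))  # e.g. ['主题', '文本', ...]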
Example #19
def train_model_lda_gensim():
    # convert the articles into lists of tokens
    common_dictionary = Dictionary(common_texts)
    print(type(common_texts))
    print(common_texts[0])

    # convert the texts into bag-of-words form
    common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]

    # fit the LDA model with 10 topics
    lda = LdaModel(common_corpus, num_topics=10)
    # inspect the result
    lda.print_topic(1, topn=2)
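The `common_texts` toy corpus used above ships with gensim; one optional tweak (an assumption, not part of the snippet) is to pass `id2word` so that `print_topic` shows words instead of raw term ids.

from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel

common_dictionary = Dictionary(common_texts)
common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]
lda = LdaModel(common_corpus, id2word=common_dictionary, num_topics=10)
print(lda.print_topic(1, topn=2))  # e.g. '0.192*"graph" + 0.120*"trees"'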
Example #20
def create_dictionaries(model=None, combined=None):
    ''' This function does a number of jobs:
        1- Creates a word to index mapping
        2- Creates a word to vector mapping
        3- Transforms the Training and Testing Dictionaries

    '''
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
        #  words with frequency below 10 map to 0, hence k+1
        w2indx = {v: k + 1
                  for k, v in gensim_dict.items()
                  }  # index of every word with frequency over 10, (k->v)=>(v->k)
        w2vec = {word: model[word]
                 for word in w2indx.keys()
                 }  # word vector of every word with frequency over 10, (word->model(word))

        def parse_dataset(combined):  # closure, used only inside this function
            ''' Words become integers
            '''
            data = []
            for sentence in combined:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0)  # words with frequency below 10 map to 0
                data.append(new_txt)
            return data  # word=>index

        combined = parse_dataset(combined)
        combined = sequence.pad_sequences(
            combined, maxlen=maxlen)  # index sequence per sentence; words with frequency below 10 get index 0
        return w2indx, w2vec, combined
    else:
        print('No data provided...')
Example #21
def create_dictionaries(model=None, combined=None):
    """ 
    This function does a number of jobs:
        1- Creates a word to index mapping
        2- Creates a word to vector mapping
        3- Transforms the Training and Testing Dictionaries

    """
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()

        gensim_dict.doc2bow(reduce(lambda x, y: x + y, combined),
                            allow_update=True)

        w2indx = {v: k + 1 for k, v in gensim_dict.items()}  # index of every word with frequency over 10
        w2vec = {word: model[word]
                 for word in w2indx.keys()}  # word vector of every word with frequency over 10

        def parse_dataset(combined):
            ''' Words become integers
            '''
            data = []
            for sentence in combined:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0)
                data.append(new_txt)
            return data

        combined = parse_dataset(combined)
        combined = sequence.pad_sequences(
            combined, maxlen=maxlen)  # index sequence per sentence; words with frequency below 10 get index 0
        return w2indx, w2vec, combined
    else:
        print('No data provided...')
Example #22
def create_dictionaries(model=None, combined=None):
    ''' This function does a number of jobs:
        1- Creates a word to index mapping
        2- Creates a word to vector mapping
        3- Transforms the Training and Testing Dictionaries
        4- Returns the concatenated vectors of all words
    '''
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        # keys
        gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
        w2indx = {v: k + 1 for k, v in gensim_dict.items()}  # index of every word with frequency over 10
        w2vec = {word: model[word]
                 for word in w2indx.keys()}  # word vector of every word with frequency over 10

        def parse_dataset(combined):
            ''' Words become integers
            '''
            data = []
            for sentence in combined:
                new_txt = []
                sentences = sentence.split(' ')
                for word in sentences:
                    try:
                        #word = np.unicode(word, errors='ignore')
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0)
                data.append(new_txt)
            return data

        combined = parse_dataset(combined)
        # combined = sequence.pad_sequences(combined, maxlen=maxlen)  # index sequence per sentence; words with frequency below 10 get index 0
        combined = sequence.pad_sequences(
            combined)  # index sequence per sentence; words with frequency below 10 get index 0
        return w2indx, w2vec, combined
    else:
        print('No data provided...')
Example #23
def create_dictionaries(model=None,
                        combined=None):
    ''' This function does a number of jobs:
        1- Creates a word to index mapping
        2- Creates a word to vector mapping
        3- Transforms the Training and Testing Dictionaries
    '''
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(),
                            allow_update=True)
        # words that have a word vector get a non-zero index
        w2indx = {v: k + 1 for k, v in gensim_dict.items()}
        # collect the corresponding word vectors into the word vector matrix
        w2vec = {word: model[word] for word in w2indx.keys()}

        # a word without a word vector is indexed 0; return the indices of the words
        def parse_dataset(combined):
            ''' Words become integers
            '''
            data = []
            for sentence in combined:
                new_txt = []
                for word in list(sentence):
                    try:
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0)
                data.append(new_txt)
            return data

        combined = parse_dataset(combined)
        # unify the length of the sentence with the pad_sequences function of keras
        combined = sequence.pad_sequences(combined, maxlen=maxlen)
        # return the index mapping, the word vector matrix, and the index-encoded sentences of unified length
        return w2indx, w2vec, combined
    else:
        print('No data provided...')
Example #24
def keywords(corpus):
    docs=[preprocess(doc) for doc in corpus]
    dictionary = Dictionary(docs)
    c = [dictionary.doc2bow(doc) for doc in docs]
    tfidf = TfidfModel(c)
    result=[]    
    for s in c:
        tfidf_weights = tfidf[s]
        r=[]
        sorted_tfidf_weights = sorted(tfidf_weights, key=lambda w: w[1], reverse=True)
        for term_id, weight in sorted_tfidf_weights:
            r.append([dictionary.get(term_id), weight])
        result.append(r)
    return result
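A hedged usage sketch: `preprocess` in the snippet above is an external helper, so a minimal stand-in (lower-cased whitespace tokenization) is assumed here.

# Imports assumed by the snippet above, plus a stand-in for the external helper.
from gensim.corpora.dictionary import Dictionary
from gensim.models import TfidfModel


def preprocess(doc):
    return doc.lower().split()  # stand-in tokenizer, not the original helper


docs = ['The cat sat on the mat',
        'Dogs and cats make good pets',
        'The stock market fell sharply today']
for doc_keywords in keywords(docs):
    print(doc_keywords[:3])  # top three [term, tf-idf weight] pairs per document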
Example #25
def lda(domain):

    common_texts = normalize(domain=domain)

    common_dictionary = Dictionary(common_texts)
    common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]

    lda = LdaModel(common_corpus,
                   num_topics=2,
                   per_word_topics=True,
                   id2word=common_dictionary)

    # print(common_dictionary.token2id)
    return lda
Example #26
def lda_vector(dataset: list, refer_dictionary=None, refer_lda_model=None):

    if refer_dictionary is None:
        refer_docs = [
          [token for (i, token) in enumerate(sample['essay_lemma']) if sample['essay_is_stop'][i] is False
           and token not in [',', '.', '?']] for sample in dataset
        ]
        refer_dictionary = Dictionary(refer_docs)
        refer_doc2bow = [refer_dictionary.doc2bow(text) for text in refer_docs]
        refer_lda_model = LdaModel(corpus=refer_doc2bow, id2word=refer_dictionary, num_topics=10, dtype=np.float64, passes=10, minimum_probability=0.0)

    doc = [
        [token for (i, token) in enumerate(sample['essay_lemma']) if sample['essay_is_stop'][i] is False
         and token not in [',', '.', '?']] for sample in dataset
    ]
    doc_bow_s = [refer_dictionary.doc2bow(text) for text in doc]
    doc_vecs = [refer_lda_model[doc_bow] for doc_bow in doc_bow_s]

    for (sample, doc_vec) in zip(dataset, doc_vecs):
        for topic_prob in doc_vec:
            sample['topic'+str(topic_prob[0] + 1)] = topic_prob[1]

    return refer_dictionary, refer_lda_model
Example #27
    def buildDic(self, model=None, words=None):
        '''
        Build the dictionaries.
        :param model:   the word2vec model
        :param words:   all of the text content after jieba word segmentation
        :return:        the index of each word (word -> index), the word vectors (word -> vector),
                        and the word-index sequence for each sentence
        '''
        if (model is not None) and (words is not None):
            # initialize a dictionary (renamed from `dict` so the builtin is not shadowed)
            gensim_dict = Dictionary()
            # model.wv.vocab.keys() holds every word in the word2vec model; with allow_update=True
            # each word's frequency is incremented every time it appears
            # convert to the bag-of-words form
            gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
            # rebuild the mapping: key is the word, value is its index (k is the index, v the word in the dictionary)
            w2indx = {v: k + 1 for k, v in gensim_dict.items()}
            # key is the word, value is the corresponding word vector
            w2vec = {word: model[word] for word in w2indx.keys()}

            # get the word-index sequence for one sentence
            def parseDataset(words):
                data = []
                for sentence in words:
                    new_txt = []
                    for word in sentence:
                        try:
                            new_txt.append(w2indx[word])
                        except:
                            new_txt.append(0)
                    data.append(new_txt)
                return data

            combined = parseDataset(words)
            # unify the dimension of the variable-length sequences
            combined = sequence.pad_sequences(combined, maxlen=self.maxlen)
            return w2indx, w2vec, combined
        else:
            print("failed to load the model or the data")
def create_dictionaries(model=None, combined=None):
    ''' This function does three things:
        1- Creates a word to index mapping
        2- Creates a word to vector mapping
        3- Transforms the training and testing dictionaries
    '''
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
        #  words with frequency below 10 map to 0, hence v -> k+1
        w2indx = {v: k + 1
                  for k, v in gensim_dict.items()
                  }  # index of every word with frequency over 10, (k->v)=>(v->k)
        w2vec = {word: model[word]
                 for word in w2indx.keys()
                 }  # word vector of every word with frequency over 10, (word->model(word))

        def parse_dataset(combined):  # closure, used only inside this function; converts the words in combined into their indices
            ''' Words become integers
            '''
            data = []
            for sentence in combined:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0)  # words with frequency below 10 map to 0
                data.append(new_txt)
            return data  # word=>index

        combined = parse_dataset(combined)
        combined = sequence.pad_sequences(
            combined, maxlen=maxlen)  # index sequence per sentence; words with frequency below 10 get index 0
        return w2indx, w2vec, combined
    else:
        print('No data provided...')
Example #29
def create_dictionaries(model=None, combined=None):

    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
        #  words with frequency below 10 map to 0, hence k+1
        w2indx = {v: k + 1
                  for k, v in gensim_dict.items()
                  }  # index of every word with frequency over 10, (k->v)=>(v->k)
        w2vec = {word: model[word]
                 for word in w2indx.keys()
                 }  # word vector of every word with frequency over 10, (word->model(word))

        def parse_dataset(combined):  # closure, used only inside this function
            ''' Words become integers
            '''
            data = []
            for sentence in combined:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0)  # words with frequency below 10 map to 0
                data.append(new_txt)
            return data  # word=>index

        combined = parse_dataset(combined)
        combined = sequence.pad_sequences(
            combined, maxlen=maxlen)  # index sequence per sentence; words with frequency below 10 get index 0

        f12.write(str(combined))
        f12.write('\n')

        return w2indx, w2vec, combined
    else:
        print('No data provided...')
Example #30
def word2vec_train(tokenizedtalkfile, vocabularyfile):
    wordlist = []
    for line in open(tokenizedtalkfile, 'r'):
        talkwords = []
        for word in line.split(' '):
            if word.find('\n') != -1:
                word = word.replace('\n', '')
            talkwords.append(word)
        wordlist.append(talkwords)
    print('Start Training ...')
    start = time.time()
    model = Word2Vec(size=50, min_count=1, window=7, workers=4, sg=1, iter=5)
    model.build_vocab(wordlist)
    model.train(wordlist, total_examples=model.corpus_count, epochs=model.iter)  # recent gensim versions need the explicit counts
    model.save('corpus_word2vec_model.pkl')
    end = time.time()
    print('Training Time: %.5f' % (end - start))
    model = Word2Vec.load('corpus_word2vec_model.pkl')
    gensim_dict = Dictionary()
    gensim_dict.doc2bow(model.vocab.keys(), allow_update=True)
    word2index = {v: k for k, v in gensim_dict.items()}
    with open(vocabularyfile, 'w') as vocabFile:
        for item in word2index.keys():
            vocabFile.write(item + '\t' + str(word2index[item]) + '\n')
Example #31
def topic_extraction(corpus, ntopics):
    # gensim lda
    common_dictionary = Dictionary(corpus)
    common_corpus = [common_dictionary.doc2bow(text) for text in corpus]
    lda = LdaModel(common_corpus,
                   num_topics=ntopics,
                   iterations=800,
                   random_state=1)
    features = lda.get_document_topics(common_corpus, minimum_probability=0)
    lda_list = []
    for f in features:
        lda_list.append([b[1] for b in f])
    lda_df = pd.DataFrame(lda_list)
    lda_df = lda_df.reset_index(drop=True)
    return lda_df
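A hedged usage sketch for the function above; `pd` is assumed to be pandas, and the documents are toy token lists.

# Imports assumed by the snippet above.
import pandas as pd
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel

docs = [['rain', 'flood', 'storm'],
        ['election', 'vote', 'senate'],
        ['storm', 'wind', 'rain']]
lda_df = topic_extraction(docs, ntopics=2)
print(lda_df.shape)  # (3, 2): one row per document, one column per topic probability
print(lda_df)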
Example #32
def create_mapping_dicts(wrd_embedding, filter_corpus=False, bodies=None,
                         headlines=None): 
    """Generate word:index, word:vector, index:word dictionaries. 

    Args: 
    ----
        wrd_embedding: gensim.models.word2vec.Word2Vec fitted model
        filter_corpus (optional): boolean  
            Filter the corpus to only those words seen in the articles. Use
            to speed up iteration during initial building/training phases.
        bodies (optional): list of lists 
            Must be passed in if `filter_corpus` is True. 
        headlines (optional): list of lists  
            Must be passed in if `filter_corpus` is True. 

    Return: 
    ------
        word_idx_dct: dict
        idx_word_dct: dict
        word_vector_dct: dict
    """

    if filter_corpus:
        if (not bodies or not headlines): 
            raise Exception('Must pass in bodies and headlines with filter_corpus as True!')
        else: 
            wrd_embedding = _filter_corpus(bodies, headlines, wrd_embedding)

    gensim_dct = Dictionary()
    gensim_dct.doc2bow(wrd_embedding.vocab.keys(), allow_update=True)

    word_idx_dct = {wrd: idx for idx, wrd in gensim_dct.items()}
    idx_word_dct = {idx: wrd for idx, wrd in gensim_dct.items()}
    word_vector_dct = {wrd: wrd_embedding[wrd] for idx, wrd in gensim_dct.items()}

    return word_idx_dct, idx_word_dct, word_vector_dct 
Example #33
def create_dictionaries(model=None,
                        combined=None):
    ''' This function does a number of jobs:
        1- Creates a word to index mapping
        2- Creates a word to vector mapping
        3- Transforms the Training and Testing Dictionaries
    '''
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(),
                            allow_update=True)
        #  words with frequency below 10 map to 0, so k + 1
        w2indx = {v: k + 1 for k, v in gensim_dict.items()}  # index of every word with freq > 10, (k->v)=>(v->k)
        w2vec = {word: model[word] for word in
                 w2indx.keys()}  # word vector of every word with freq > 10, (word->model(word))

        def parse_dataset(combined):
            ''' Words become integers
            '''
            data = []
            for sentence in combined:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0)  # words with freq < 10 map to 0
                data.append(new_txt)
            return data  # word=>index

        combined = parse_dataset(combined)
        combined = sequence.pad_sequences(combined,
                                          maxlen=maxlen)  # index to every word in every sentence, when freq < 10, index = 0
        return w2indx, w2vec, combined
    else:
        print('No data provided...')
def create_dictionaries(model=None, combined=None):
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
        w2indx = {v: k + 1 for k, v in gensim_dict.items()}
        w2vec = {word: model[word] for word in w2indx.keys()}

        def parse_dataset(combined):
            data = []
            for sentence in combined:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0)
                data.append(new_txt)
            return data

        combined = parse_dataset(combined)
        combined = sequence.pad_sequences(combined, maxlen=maxlen)
        return w2indx, w2vec, combined
    else:
        print('No data provided...')
Example #35
    def create_dictionaries(cls, model=None, combined=None):
        """ This function does a number of jobs:
            1- Creates a word to index mapping
            2- Creates a word to vector mapping
            3- Transforms the Training and Testing Dictionaries
        """
        def _parse_dataset(sentences):
            """Words become integers
                Represent each word in every sentence by the index of that word in the
                word-vector vocabulary; words that do not appear in the index are marked as 0.
            """
            data = []
            for sentence in sentences:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except KeyError:
                        new_txt.append(0)
                data.append(new_txt)
            return data

        if combined is not None and model is not None:
            gensim_dict = Dictionary()
            gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
            w2indx = {v: k + 1
                      for k, v in gensim_dict.items()}  # index of every word with frequency over 5
            w2vec = {word: model[word]
                     for word in w2indx.keys()}  # word vector of every word with frequency over 5
            combined = _parse_dataset(combined)
            combined = sequence.pad_sequences(
                combined,
                maxlen=cls.maxlen)  # index sequence per sentence; words with frequency below 5 get index 0
            return w2indx, w2vec, combined
        else:
            print('No data provided...')
Example #36
def create_dictionaries(model=None, combined=None):
    # Build the dictionaries: 1- create a word-to-index mapping 2- create a word-to-vector mapping 3- transform the training and testing dictionaries

    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
        w2indx = {v: k + 1
                  for k, v in gensim_dict.items()}  # word => index
        f = open("../model/word2index.txt", 'w',
                 encoding='utf8')  # this is where the word2index.txt file is generated
        for key in w2indx:
            f.write(str(key))
            f.write(' ')
            f.write(str(w2indx[key]))
            f.write('\n')
        f.close()
        w2vec = {word: model[word] for word in w2indx.keys()}  # word => vector

        def parse_dataset(combined):  # parse the data set; a closure (a function inside a function) used locally
            data = []
            for sentence in combined:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0)
                data.append(new_txt)
            return data  # word => index

        combined = parse_dataset(combined)
        combined = sequence.pad_sequences(combined, maxlen=maxlen)
        # words with frequency below 10 get index 0
        return w2indx, w2vec, combined
    else:
        print('No data provided...')
Example #37
def get_lda_model_byDomains(domains):
    """ Create an LDA model from the given links
    :param domains: names of the VK communities
    """

    common_texts = normilize_texts(domains[0])

    for i in range(1, len(domains)):
        common_texts += normilize_texts(domains[i])

    common_dictionary = Dictionary(common_texts)
    common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]
    lda = LdaModel(common_corpus, num_topics=len(domains))

    return lda
Example #38
def transform_data(model, x_train, y_train, x_test, y_test):
    gensim_dict = Dictionary()
    gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)

    w2indx = {v: k + 1 for k, v in gensim_dict.items()}
    w2vec = {word: model[word] for word in w2indx.keys()}

    def parse_data(x, y):

        for key in range(len(y)):
            txt = x[key].lower().replace('\n', '').split()
            new_txt = []
            for word in txt:
                try:
                    new_txt.append(w2indx[word])
                except:
                    new_txt.append(0)
            x[key] = new_txt
        return x, y

    x_train, y_train = parse_data(x_train, y_train)
    x_test, y_test = parse_data(x_test, y_test)

    return w2indx, w2vec, x_train, y_train, x_test, y_test
Example #39
def pre_process_lda(data_train):
    stoplist = load_stopwords(stopword_path)
    text_data = []
    for document in data_train:
        doc = document.lower().strip()
        words = tokenizer.tokenize(doc)
        docs = [
            word for word in words if (word not in stoplist and len(word) > 1)
        ]
        text_data.append(docs)

    dictionary = Dictionary(text_data)
    corpus = [dictionary.doc2bow(text) for text in text_data]

    return corpus, dictionary
def create_dictionaries(train=None, test=None, model=None):
    if (train is not None) and (model is not None) and (test is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.vocab.keys(), allow_update=True)
        w2indx = {v: k + 1 for k, v in gensim_dict.items()}
        w2vec = {word: model[word] for word in w2indx.keys()}

        def parse_dataset(data):
            for key in data.keys():
                txt = data[key].lower().replace('\n', '').split()
                new_txt = []
                for word in txt:
                    try:
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0)
                data[key] = new_txt
            return data

        train = parse_dataset(train)
        test = parse_dataset(test)
        return w2indx, w2vec, train, test
    else:
        print('No data provided...')
Example #41
def create_dictionaries(model=None, combined=None):
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
        w2indx = {v: k + 1
                  for k, v in gensim_dict.items()
                  }  # index of every word with frequency over 10, (k->v)=>(v->k)
        f = open("word2index.txt", 'w', encoding='utf8')
        for key in w2indx:
            f.write(str(key))
            f.write(' ')
            f.write(str(w2indx[key]))
            f.write('\n')
        f.close()
        w2vec = {word: model[word]
                 for word in w2indx.keys()
                 }  # word vector of every word with frequency over 10, (word->model(word))

        def parse_dataset(combined):
            data = []
            for sentence in combined:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0)
                data.append(new_txt)
            return data  # word=>index

        combined = parse_dataset(combined)  # [[1,2,3...],[]]
        combined = sequence.pad_sequences(
            combined, maxlen=maxlen)  # index sequence per sentence; words with frequency below 10 get index 0
        return w2indx, w2vec, combined
    else:
        print('No data provided...')
Example #42
    def __init__(self, strategy="GREEDY", seed=2020, max_iter=20):
        """
        This class produces a baseline BM25 ranking and uses LDA topic modelling
        in combination with the general re-ranking procedure of Huang and Hu (2009)
        """
        self.seed = seed
        self.max_iter = max_iter
        self.utils = Utils()

        # Amount of documents to rank and rerank
        self.N= 100

        # Select a strategy for weighing final topics
        self.strategy = strategy
    
        # K to use in TOP-K-AVG strategy
        self.top_k = 10 

        # TODO ideally we don't want to first rank every time for the reranking 
        self.baseline = BaselineBM25(k=self.N)
        self.baseline.rank()

        # For each topic, the system outputs N retrieved articles.
        self.batch_hits = self.baseline.get_batch_hits()

        # Read index to retrieve document contents
        # N.B. the `contents` field is currently empty; we stored "raw" instead.
        self.index_loc = self.baseline.get_index_loc()
        reader = IndexReader(self.index_loc)

        # Vocabulary in index
        #vocabulary = [ term.term for term in reader.terms()]
        #print(f"{len(vocabulary)} terms in vocabulary")

        # Topics and the retrieved articles are represented as the keyword sequences
        self.topics = self.baseline.get_topics()
        self.topic_keywords = { id: topic['title'].lower().split() for (id, topic) in self.topics.items() } 
        self.query_ids = self.baseline.get_query_ids()

        # Next line returns preprocessed documents per query 
        docs_per_query = { query_id: [ reader.analyze( reader.doc(hit.docid).raw()) for hit in hits] for query_id, hits in self.batch_hits.items() }

        # Prepare bag-of-words dataset for gensim
        self.X = defaultdict(list)
        for id in self.query_ids:
            dictionary = Dictionary(docs_per_query[id])
            # Dictionary expects a list of lists, elements being lists of tokens
            self.X[id] = [dictionary.doc2bow(doc) for doc in docs_per_query[id]]
Example #43
class DigestedDocumentCollection(CorpusABC):
    """A bag-of-words representation of a corpus (collection of documents).

    This serves as direct input to modeling functions.  It is output from
    preprocessing functions.

    Parameters
    ----------
    corpus: A collection of tokenized documents
        Each document is a list of tokens, tokenized and normalized strings
        (either utf8 or unicode) (e.g. output of topik.SimpleTokenizer)

    Readers iterate over tuples (id, content), but discard id in return (for compatibility with Gensim.)

    """
    def __init__(self, tokenized_corpus):
        self.corpus = tokenized_corpus
        self.dict = Dictionary(tokenized_corpus.get_generator_without_id())
        super(DigestedDocumentCollection, self).__init__()

    def __iter__(self):
        """Discards id field - for compatibility with Gensim."""
        for _id, doc_tokens in self.corpus:
            yield self.dict.doc2bow(doc_tokens)

    def __len__(self):
        return len(self.corpus)

    def get_id2word_dict(self):
        return self.dict

    def save(self, filename):
        self.corpus.save(filename)

    @classmethod
    def load(cls, filename):
        return cls(load_persisted_corpus(filename))

    @property
    def persistor(self):
        return self.corpus.persistor

    @property
    def filter_string(self):
        return self.corpus.filter_string
Example #44
module_path = os.path.dirname(__file__)  # needed because sample data files are located in the same folder
datapath = lambda fname: os.path.join(module_path, 'test_data', fname)

# set up vars used in testing ("Deerwester" from the web tutorial)
texts = [['human', 'interface', 'computer'],
         ['survey', 'user', 'computer', 'system', 'response', 'time'],
         ['eps', 'user', 'interface', 'system'],
         ['system', 'human', 'system', 'eps'],
         ['user', 'response', 'time'],
         ['trees'],
         ['graph', 'trees'],
         ['graph', 'minors', 'trees'],
         ['graph', 'minors', 'survey']]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
boolean_document_based = ['u_mass']
sliding_window_based = ['c_v', 'c_uci', 'c_npmi']


def testfile():
    # temporary data will be stored to this file
    return os.path.join(tempfile.gettempdir(), 'gensim_models.tst')

def checkCoherenceMeasure(topics1, topics2, coherence):
    """Check provided topic coherence algorithm on given topics"""
    if coherence in boolean_document_based:
        cm1 = CoherenceModel(topics=topics1, corpus=corpus, dictionary=dictionary, coherence=coherence)
        cm2 = CoherenceModel(topics=topics2, corpus=corpus, dictionary=dictionary, coherence=coherence)
    else:
        cm1 = CoherenceModel(topics=topics1, texts=texts, dictionary=dictionary, coherence=coherence)
        cm2 = CoherenceModel(topics=topics2, texts=texts, dictionary=dictionary, coherence=coherence)
    return cm1.get_coherence() > cm2.get_coherence()
class WikiCorpus(interfaces.CorpusABC):
    """
    Treat a wikipedia articles dump (*articles.xml.bz2) as a (read-only) corpus.
    
    The documents are extracted on-the-fly, so that the whole (massive) dump
    can stay compressed on disk.
    
    >>> wiki = WikiCorpus('enwiki-20100622-pages-articles.xml.bz2') # create word->word_id, takes almost 7h
    >>> wiki.saveAsText('wiki_en_vocab200k') # another 7.5h, creates a file in MatrixMarket format plus file with id->word
    
    """
    def __init__(self, fname, noBelow = 20, keep_words = 200000, dictionary = None):
        """
        Initialize the corpus. This scans the corpus once, to determine its 
        vocabulary (only the first `keep_words` most frequent words that 
        appear in at least `noBelow` documents are kept).
        """
        self.fname = fname
        if dictionary is None:
            self.dictionary = Dictionary(self.getArticles())
            self.dictionary.filterExtremes(noBelow = noBelow, noAbove = 0.1, keepN = keep_words)
        else:
            self.dictionary = dictionary

    
    def __len__(self):
        return self.numDocs


    def __iter__(self):
        """
        The function that defines a corpus -- iterating over the corpus yields 
        vectors, one for each document.
        """
        for docNo, text in enumerate(self.getArticles()):
            yield self.dictionary.doc2bow(text, allowUpdate = False)

        
    def saveDictionary(self, fname):
        """
        Store id->word mapping to a file, in format `id[TAB]word_utf8[TAB]document frequency[NEWLINE]`.
        """
        logger.info("saving dictionary mapping to %s" % fname)
        fout = open(fname, 'w')
        for token, tokenId in sorted(self.dictionary.token2id.iteritems()):
            fout.write("%i\t%s\t%i\n" % (tokenId, token, self.dictionary.docFreq[tokenId]))
        fout.close()
    
    
    @staticmethod
    def loadDictionary(fname):
        """
        Load previously stored mapping between words and their ids.
        
        The result can be used as the `id2word` parameter for input to transformations.
        """
        result = {}
        for lineNo, line in enumerate(open(fname)):
            cols = line[:-1].split('\t')
            if len(cols) == 2:
                wordId, word = cols
            elif len(cols) == 3:
                wordId, word, docFreq = cols
            else:
                continue
            result[int(wordId)] = word # docFreq not used
        return result
    
    
    def saveAsText(self, fname):
        """
        Store the corpus to disk, in a human-readable text format.
        
        This actually saves two files:
        
        1. Document-term co-occurence frequency counts (bag-of-words), as 
           a Matrix Market file `fname_bow.mm`.
        2. Token to integer mapping, as a text file `fname_wordids.txt`.
        
        """
        self.saveDictionary(fname + '_wordids.txt')
        matutils.MmWriter.writeCorpus(fname + '_bow.mm', self, progressCnt = 10000)
        
    
    def getArticles(self):
        """
        Iterate over the dump, returning text version of each article.
        
        Only articles of sufficient length are returned (short articles & redirects
        etc are ignored).
        """
        articles, intext = 0, False
        for lineno, line in enumerate(bz2.BZ2File(self.fname)):
            if line.startswith('      <text'):
                intext = True
                line = line[line.find('>') + 1 : ]
                lines = [line]
            elif intext:
                lines.append(line)
            pos = line.find('</text>') # can be on the same line as <text>
            if pos >= 0:
                intext = False
                if not lines:
                    continue
                lines[-1] = line[:pos]
                text = filterWiki(''.join(lines))
                if len(text) > ARTICLE_MIN_CHARS: # article redirects are pruned here
                    articles += 1
                    yield tokenize(text) # split text into tokens
        
        self.numDocs = articles # cache corpus length
Example #46
logging.info('load the dictionary')
id2word, word2id = utils.loadDictionary(working_corpus + word_ids_extension)
dictionary = Dictionary(word2id=word2id, id2word=id2word)

logging.info('load the log_ent model')
log_ent = LogEntropyModel.load(results_path + norm_model)

logging.info('load the LSI model')
lsi = LsiModel.load(results_path + trans_model)

for key in articles.iterkeys():

    logging.info('current term: %s' % key)

    term_list = articles[key].keys()
    text_list = [dictionary.doc2bow(article['text'], allowUpdate=False, returnMissingWords=False) 
            for article in articles[key].values()]
    sim_matrix = np.zeros((len(text_list), len(text_list)))

    logging.info('transform the textlist')
    text_list = lsi[log_ent[text_list]]

    logging.info('compute similarity matrix')
    for i, par1 in enumerate(text_list):
        for j, par2 in enumerate(text_list):
            sim_matrix[i, j] = matutils.cossim(par1, par2)
    matrices[key] = {}
    matrices[key]['term_list'] = term_list
    matrices[key]['sim_matrix'] = sim_matrix
    assert np.shape(sim_matrix)[0] == len(term_list)
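
A minimal, self-contained sketch of the pairwise cosine-similarity loop above, using toy sparse (id, weight) vectors in place of the LSI-projected articles:

# Sketch of the pairwise cossim loop above, on toy sparse vectors (illustrative data only).
import numpy as np
from gensim import matutils

vecs = [[(0, 0.9), (1, 0.1)], [(0, 0.8), (1, 0.3)], [(1, 1.0)]]
sim = np.zeros((len(vecs), len(vecs)))
for i, v1 in enumerate(vecs):
    for j, v2 in enumerate(vecs):
        sim[i, j] = matutils.cossim(v1, v2)
# sim is symmetric, with ones on the diagonal for identical vectors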
    
term_lists = []
idx = []   # row positions of messages containing 'req'; used below to index the dataframe
for i in range(len(df)):
    df['msg'][i] = df['msg'][i].lower()
    j = df['msg'][i].find('req')
    if j > -1:
        df['msg'][i] = df['msg'][i][j:] 
        idx.append(i)
        terms = df['msg'][i].split()
        terms = terms[5:]
        filtered_terms = [t for t in terms if len(t) > 0]
        term_lists.append(filtered_terms)

# Merge term lists into the main dataframe    
d = {'terms':term_lists}
term_df = DataFrame(data=d,columns=['terms'],index=df.index[idx])
df = df.join(term_df)

# Create corpus for topic modeling
corpora_dict = Dictionary(term_lists)
corpus = [corpora_dict.doc2bow(msg) for msg in term_lists]

# Perform topic modeling
lda = LdaModel(corpus=corpus,id2word=corpora_dict,num_topics=5)

# Print out top terms for each topic
topics = lda.show_topics()
for i, topic in enumerate(topics, 1):
    print "Topic %d: %s" % (i, topic)
Exemplo n.º 48
0
class CableCorpus(BaseCorpus):
    """\
    The cable corpus consists of several files which are written into a directory.

    * a dictionary with a ``<word id> <word> <frequency>`` mapping saved under "wordids.pickle"
    * a JSON file with a ``<cable reference id> <document number>`` mapping under "id2docid.json"
    * a `Matrix Market format <http://math.nist.gov/MatrixMarket/formats.html>`_ vector space model file "bow.mm"

    CAUTION: The corpus overrides any existing files with the same file name in the specified directory.

    By default, the corpus builds the word dictionary while it creates the vector space model,
    which may lead to a noisy, not very useful vector space model. To filter out certain words,
    the corpus may be initialized with a pre-generated word dictionary. To make that dictionary
    immutable, the property ``allow_dict_updates`` should be set to ``False`` (updates are allowed
    by default); the resulting vector space model then contains only words which are present in
    the word dictionary.

    Example to reduce the clutter::

        corpus = CableCorpus('/my/directory/')
        # Add some texts here
        corpus.add_text('ref-1', u'bla bla bla')
        corpus.add_text('ref-2', u'bla bla blub')
        ...
        corpus.dct.filter_extremes()
        corpus.close()

        from gensim.corpora.dictionary import Dictionary

        # Load previously created dict
        dct = Dictionary.load_from_text('/my/directory/cables_wordids.txt')
        # Create another corpus with the previously word dict
        corpus = CableCorpus('/my/directory/', dct, allow_dict_updates=False)
        # Add some texts
        ....
        corpus.close()
    """
    def __init__(self, path, dct=None, tokenizer=None, allow_dict_updates=True, prefix=None):
        """\
        Initializes the cable corpus.
        
        `path`
            Directory where the generated files are stored.
        `dct`
            An existing `gensim.corpora.dictionary.Dictionary`.
            If it's ``None`` (default), a new dictionary will be created.
        `tokenizer`
            A function to tokenize/normalize/clean-up/remove stop words from strings.
            If it's ``None`` (default), a default function will be used to tokenize texts.
        `allow_dict_updates`
            Indicates whether unknown words should be added to the dictionary (default ``True``).
        `prefix`
            A prefix for the generated file names.
        """
        super(CableCorpus, self).__init__(tokenizer)
        if not os.path.isdir(path):
            raise IOError('Expected a directory path')
        self.dct = Dictionary() if dct is None else dct
        self._path = path
        self._prefix = prefix or 'cables_'
        self._mw = IncrementalMmWriter(os.path.join(path, self._prefix + 'bow.mm'))
        self.allow_dict_updates = allow_dict_updates
        self._cables = []

    def add_words(self, reference_id, words):
        self._cables.append(reference_id)
        self._mw.add_vector(self.dct.doc2bow(words, self.allow_dict_updates))

    def close(self):
        self._mw.close()
        self.dct.save(os.path.join(self._path, self._prefix + 'wordids.pickle'))
        json_filename = os.path.join(self._path, self._prefix + 'id2docid.json')
        json.dump(dict(zip(self._cables, count())), open(json_filename, 'wb'))
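
A small sketch of the reference-id to document-number mapping that `close()` serializes: `dict(zip(cables, count()))` pairs each cable reference with its 0-based row in "bow.mm", in the order `add_words()` was called. The reference ids below are made up for illustration.

# Illustration of the id2docid mapping written by CableCorpus.close(); toy reference ids.
from itertools import count

cables = ['ref-1', 'ref-2', 'ref-3']        # order in which add_words() was called
id2docid = dict(zip(cables, count()))
assert id2docid == {'ref-1': 0, 'ref-2': 1, 'ref-3': 2}
# id2docid['ref-2'] is the row index of that cable's vector in the Matrix Market file bow.mm
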
Exemplo n.º 49
0
File: lda.py  Project: freygit/36
class LDA(object):

    def __init__(self, topics = 10, 
                 worker = 3, 
                 pretrained_model = None, 
                 dictionary = None):
        """
        Initialize LDA model training.
        Args:
            topics -- number of topics
            worker -- degree of parallelism, usually the number of CPU cores minus one
            pretrained_model -- a pre-trained model; online updates are supported, so a previously trained model can be loaded and refined
            dictionary -- words are mapped to integer ids during training, so the model is paired with a word-to-id dictionary
        Example:
            >>> lda = LDA(topics = 20, worker = 2, 
                          pretrained_model = model_file, 
                          dictionary = dictionary_file)
            >>> corpus = read_file(corpus_file) # [['word1', 'word2'], ['word3', 'word4']]
            >>> lda.update(corpus)
            >>> lda.save(model_file, dictionary_file)
            >>> topics = lda.inference(['word5', 'word6'])
        """

        self._topics = topics
        self._workers = worker
        self._model = None
        self._common_dictionary = None
        if pretrained_model and dictionary:
            self._model = LdaModel.load(pretrained_model)
            self._common_dictionary = Dictionary.load(dictionary)

    def save(self, model_file, dictionary_file):
        """
        Save the trained model together with its dictionary.
        Args:
            model_file -- path of the model file
            dictionary_file -- path of the dictionary file
        Returns:
            None
        """

        if self._model:
            self._model.save(model_file)
        if self._common_dictionary:
            self._common_dictionary.save(dictionary_file)

    def update(self, corpus = [[]]):
        """
        Online update: refine the existing model with new documents, or train a new
        model if none has been loaded yet.
        Args:
            corpus -- list of tokenized documents used for the update
        """

        if not self._model and len(corpus) > 0:
            self._common_dictionary = Dictionary(corpus)
            corpus_data =  [self._common_dictionary.doc2bow(sentence) for sentence in corpus]
            self._model = LdaModel(corpus_data, num_topics=self._topics, id2word=self._common_dictionary)
        elif self._model and len(corpus) > 0:
            self._common_dictionary.add_documents(corpus)
            new_corpus_data =  [self._common_dictionary.doc2bow(sentence) for sentence in corpus]
            self._model.update(new_corpus_data)

    def inference(self, document = []):
        """
        Infer the topic distribution of a new document.
        Args:
            document -- the document, given as a list of words
        Returns:
            a list of (topic id, probability) pairs describing the topic distribution
        """
        if self._model:
            doc =  [self._common_dictionary.doc2bow(document)]
            return self._model.get_document_topics(doc)
        return []

    @property
    def model(self):
        return self._model

    @property
    def dictionary(self):
        return self._common_dictionary
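
A hedged usage sketch of the LDA wrapper above, training from scratch on a toy corpus; the tokens, topic count, and file names are placeholders, not values from the original project.

# Usage sketch for the LDA wrapper above (toy corpus; file names are placeholders).
corpus = [['word1', 'word2', 'word1'], ['word3', 'word4'], ['word1', 'word4']]
lda = LDA(topics=2, worker=2)
lda.update(corpus)                          # first call builds the dictionary and trains the model
for dist in lda.inference(['word1', 'word4']):
    print(dist)                             # [(topic_id, probability), ...] for the new document
# lda.save('lda.model', 'lda.dict')         # persist the model and its dictionary
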
Exemplo n.º 50
0
class TextCorpus(interfaces.CorpusABC):
    """
    Helper class to simplify the pipeline of getting bag-of-words vectors (= a
    gensim corpus) from plain text.

    This is an abstract base class: override the `get_texts()` method to match
    your particular input.

    Given a filename (or a file-like object) in the constructor, the corpus object
    will be automatically initialized with a dictionary in `self.dictionary` and
    will support the `iter` corpus method. You only need to provide a correct
    `get_texts` implementation.

    """
    def __init__(self, input=None):
        super(TextCorpus, self).__init__()
        self.input = input
        self.dictionary = Dictionary()
        self.metadata = False
        if input is not None:
            self.dictionary.add_documents(self.get_texts())
        else:
            logger.warning("No input document stream provided; assuming "
                           "dictionary will be initialized some other way.")


    def __iter__(self):
        """
        The function that defines a corpus.

        Iterating over the corpus must yield sparse vectors, one for each document.
        """
        for text in self.get_texts():
            if self.metadata:
                yield (self.dictionary.doc2bow(text[0], allow_update=False), text[1])
            else:
                yield self.dictionary.doc2bow(text, allow_update=False)


    def getstream(self):
        return getstream(self.input)


    def get_texts(self):
        """
        Iterate over the collection, yielding one document at a time. A document
        is a sequence of words (strings) that can be fed into `Dictionary.doc2bow`.

        Override this function to match your input (parse input files, do any
        text preprocessing, lowercasing, tokenizing etc.). There will be no further
        preprocessing of the words coming out of this function.
        """
        # Instead of raising NotImplementedError, let's provide a sample implementation:
        # assume documents are lines in a single file (one document per line).
        # Yield each document as a list of lowercase tokens, via `utils.tokenize`.
        length = 0
        for lineno, line in enumerate(getstream(self.input)):
            length += 1
            yield list(utils.tokenize(line, lowercase=True))
        self.length = length


    def __len__(self):
        return self.length # will throw if corpus not initialized
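
A short sketch of the intended usage of `TextCorpus`: subclass it and override `get_texts()`. The toy subclass and documents below are assumptions for illustration.

# Sketch: subclass TextCorpus and override get_texts() (toy in-memory documents).
class ListCorpus(TextCorpus):
    def get_texts(self):
        for doc in self.input:              # here `input` is simply a list of strings
            yield doc.lower().split()

corpus = ListCorpus(['Human machine interface', 'Graph of trees', 'Human trees'])
for bow in corpus:                          # one sparse bag-of-words vector per document
    print(bow)
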
class DefaultJsonCorpus(object):
    """
    A default JSON corpus based on gensim TextCorpus. It assumes a file or list of JSON as input.
    The methods provided by gensim TextCorpus are needed for the GenSim training.
    Any corpus provided to DocumentSimilarity should provide the methods given in this class.
    """
    def __init__(self, input=None,create_dictionary=True):
        super(DefaultJsonCorpus, self).__init__()
        self.input = input
        self.dictionary = Dictionary()
        self.metadata = False
        if create_dictionary:
            self.dictionary.add_documents(self.get_texts())


    def __iter__(self):
        for text in self.get_texts():
            yield self.dictionary.doc2bow(text, allow_update=False)

    def getstream(self):
        return utils.file_or_filename(self.input)

    def __len__(self):
        if not hasattr(self, 'length'):
            # cache the corpus length
            self.length = sum(1 for _ in self.get_texts())
        return self.length

    def get_json(self):
        if isinstance(self.input,list):
            for j in self.input:
                yield j
        else:
            with self.getstream() as lines:
                for line in lines:
                    line = line.rstrip()
                    j = json.loads(line)
                    yield j

    def get_texts(self,raw=False):
        """
        Yield either the raw text or the tokenized text of each document, depending on `raw`.
        """
        for j in self.get_json():
            text = j["text"]
            if raw:
                yield text
            else:
                yield utils.tokenize(text, deacc=True, lowercase=True)

    def get_meta(self):
        """
        Return a JSON object with metadata for each document. It must contain:
        id - the id of this document
        and optionally a title and tags. Tags are used as ground truth when scoring document similarity results.
        """
        doc_id = 0
        for j in self.get_json():
            m = copy.deepcopy(j)
            m['id'] = long(m['id'])
            m['corpus_seq_id'] = doc_id
            doc_id += 1
            yield m

    def get_dictionary(self):
        return self.dictionary
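
A brief usage sketch of `DefaultJsonCorpus` with an in-memory list of JSON-like documents; the documents themselves are made-up examples.

# Usage sketch for DefaultJsonCorpus with an in-memory list of JSON-like dicts (toy data).
docs = [{'id': 1, 'text': 'human machine interface'},
        {'id': 2, 'text': 'graph of trees'}]
corpus = DefaultJsonCorpus(docs)
print(len(corpus))                          # 2
for bow in corpus:                          # one bag-of-words vector per document
    print(bow)
for meta in corpus.get_meta():              # original fields plus the assigned corpus_seq_id
    print(meta)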