import os

import numpy as np
import pandas as pd
from gensim.corpora import Dictionary


def make_item_descriptions(max_sentence_length=None):
    descriptions = pd.read_csv(os.path.join('data', 'descriptions.csv')).rename(
        columns={'movie': 'item'})
    texts = descriptions.description
    texts = texts.apply(lambda x: x.strip().split())
    dictionary = Dictionary(texts.values)
    dictionary.filter_extremes()
    eos_id = len(dictionary.keys())
    # convert each description to a list of token ids, dropping unknown words
    texts = texts.apply(
        lambda x: dictionary.doc2idx(x, unknown_word_index=eos_id))
    texts = texts.apply(lambda x: np.array([a for a in x if a != eos_id]))
    max_sentence_length = max(
        texts.apply(len)) if max_sentence_length is None else min(
            max(texts.apply(len)), max_sentence_length)
    # truncate and pad every description to max_sentence_length with the EOS id
    texts = texts.apply(lambda x: x[:max_sentence_length])
    texts = texts.apply(lambda x: np.pad(x, (0, max_sentence_length - len(x)),
                                         'constant', constant_values=(0, eos_id)))
    # change types
    texts = texts.apply(lambda x: x.astype(np.int32))
    descriptions.id = descriptions.id.astype(np.int32)
    return descriptions.id.values, texts.values, len(dictionary.keys()) + 1
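# A minimal usage sketch for make_item_descriptions. It assumes a data/descriptions.csv
# with 'id', 'movie' and 'description' columns; that layout is inferred from the code
# above, not confirmed by the original source.
item_ids, item_texts, n_word = make_item_descriptions(max_sentence_length=50)
print(item_ids.shape)   # one id per item
print(item_texts[0])    # first description as a padded array of token ids
print(n_word)           # vocabulary size including the EOS/padding id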
import numpy as np
import pandas as pd
from gensim.corpora import Dictionary
from sklearn.model_selection import train_test_split


class DataGenerator(object):
    def __init__(self,
                 positive_dataset: pd.DataFrame,
                 negative_dataset: pd.DataFrame,
                 test_size: float,
                 random_state: int = 123,
                 max_sentence_length: int = None):
        self.dataset = pd.concat([positive_dataset, negative_dataset], axis=0)
        self.dataset['review'] = self.dataset['review'].apply(
            lambda x: x.strip().split())
        self.dictionary = Dictionary(self.dataset['review'].values)
        self.dataset['review'] = self.dataset['review'].apply(
            self.dictionary.doc2idx)
        self.max_sentence_length = max_sentence_length
        if self.max_sentence_length is not None:
            self.dataset['review'] = self.dataset['review'].apply(
                lambda x: x[:self.max_sentence_length])
        else:
            self.max_sentence_length = max(self.dataset['review'].apply(len))
        # pad every review to max_sentence_length with the EOS id
        eos_id = len(self.dictionary.keys())
        self.dataset['review'] = self.dataset['review'].apply(
            lambda x: np.pad(x, (0, self.max_sentence_length - len(x)),
                             'constant', constant_values=(0, eos_id)))
        # change types
        self.dataset['review'] = self.dataset['review'].apply(
            lambda x: x.astype(np.int32))
        self.dataset['label'] = self.dataset['label'].astype(np.int32)
        # train/test split
        self.train, self.test = train_test_split(self.dataset,
                                                 test_size=test_size,
                                                 random_state=random_state)

    def get_train_dataset(self):
        return list(
            zip(self.train['review'].values, self.train['label'].values))

    def get_test_dataset(self):
        return list(zip(self.test['review'].values, self.test['label'].values))

    def get_max_sentence_length(self):
        return self.max_sentence_length

    def get_n_word(self):
        return len(self.dictionary.keys()) + 1
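# A minimal usage sketch for DataGenerator. It assumes two DataFrames that each carry a
# raw-text 'review' column and an integer 'label' column; the toy data below is built
# purely for illustration and is not from the original project.
positive = pd.DataFrame({'review': ['a great movie', 'loved every minute'],
                         'label': [1, 1]})
negative = pd.DataFrame({'review': ['a dull movie', 'waste of time'],
                         'label': [0, 0]})

gen = DataGenerator(positive, negative, test_size=0.5, max_sentence_length=10)
train = gen.get_train_dataset()        # list of (padded token-id array, label) pairs
print(gen.get_max_sentence_length())   # effective length after truncation/padding
print(gen.get_n_word())                # vocabulary size including the padding id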
import gensim
import numpy as np
from gensim.corpora import Dictionary

# fit an LDA model, n_topics = 5
# `documents` is assumed to be a list of tokenized texts
news_dictionary = Dictionary(documents)
news_dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=5000,
                                keep_tokens=None)
corpus = [news_dictionary.doc2bow(text) for text in documents]
lda = gensim.models.LdaModel(corpus, num_topics=5, id2word=news_dictionary)
lda.show_topics()

# convert the gensim corpus to a sparse document-term matrix for the coherence measure
corpus_dense = gensim.matutils.corpus2csc(corpus,
                                          num_terms=len(news_dictionary.keys()))
corpus_dense = corpus_dense.astype(int)
corpus_dense = corpus_dense.transpose()  # rows become documents, columns terms
print(corpus_dense.shape)


# implements the UMass coherence in Mimno et al. 2011 - Optimizing Semantic Coherence in Topic Models
def cooccur_df_ws(w1, w2, corpus_dense, w2ids):
    """Returns the co-document frequency of two words."""
    w1_id, w2_id = w2ids.token2id.get(w1), w2ids.token2id.get(w2)
    co_freq_array = (corpus_dense[:, [w1_id, w2_id]] > 0).sum(axis=1).A1
    return np.count_nonzero(co_freq_array == 2)
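# A sketch of how cooccur_df_ws might be combined into the UMass coherence of one topic,
# following Mimno et al. 2011: sum over ordered top-word pairs of
# log((co-document frequency + 1) / document frequency of the earlier word).
# The doc_freq helper and the per-topic loop below are illustrative assumptions,
# not part of the original snippet.
def doc_freq(w, corpus_dense, w2ids):
    """Number of documents containing word w."""
    w_id = w2ids.token2id.get(w)
    return np.count_nonzero((corpus_dense[:, w_id] > 0).toarray())


def umass_coherence(topic_words, corpus_dense, w2ids):
    score = 0.0
    for i in range(1, len(topic_words)):
        for j in range(i):
            co_df = cooccur_df_ws(topic_words[i], topic_words[j],
                                  corpus_dense, w2ids)
            score += np.log((co_df + 1) /
                            doc_freq(topic_words[j], corpus_dense, w2ids))
    return score


# Example: score the top 10 words of each fitted topic
for topic_id in range(lda.num_topics):
    top_words = [news_dictionary[w_id]
                 for w_id, _ in lda.get_topic_terms(topic_id, topn=10)]
    print(topic_id, umass_coherence(top_words, corpus_dense, news_dictionary))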
from gensim.corpora import Dictionary


class InfoGain(object):
    """
    Computes the information gain of tags (features). The input is a corpus in which
    every sample is labelled with a category. We first compute the global entropy,
    then the conditional entropy of each tag; the difference is the information gain.

    Example:
    >>> ig = InfoGain(corpus_file)
    >>> ig.compute()
    >>> ig.save(ig_file)
    >>> print(ig['word'])  # look up the information gain of one word
    """

    def __init__(self, corpus_file):
        """
        Args:
            corpus_file -- corpus file; the first column of each line is the category,
                           the remaining columns are tags
        """
        corpus = []
        categories = []
        self._category_distribution = {}  # number of samples per category
        self._words_cate = {}  # per-word (tag/feature) counts of samples in each category
        self._words_sample_count = {}
        self._info_gain = {}
        with open(corpus_file, 'r') as documents:
            for line in documents:
                words = line.strip().split()
                if len(words) <= 1:
                    continue
                categories.append(words[0])
                corpus.append(words[1:])
                if words[0] not in self._category_distribution:
                    self._category_distribution[words[0]] = 0
                self._category_distribution[words[0]] += 1
                # count word (tag/feature) / category co-occurrences for the conditional entropy
                for word in set(words[1:]):
                    if word not in self._words_cate:
                        self._words_cate[word] = {}
                        self._words_sample_count[word] = 0
                    if words[0] not in self._words_cate[word]:
                        self._words_cate[word][words[0]] = 0
                    self._words_cate[word][words[0]] += 1
                    self._words_sample_count[word] += 1
        self._common_dictionary = Dictionary(corpus)
        self._corpus = corpus
        self._categories = categories

    def compute(self):
        """
        Compute the information gain of every word (tag/feature),
        starting from the global entropy.
        """
        system_entropy = compute_entropy(len(self._corpus),
                                         self._category_distribution)
        # conditional entropy of each word
        # note: iterate over the tokens themselves (token2id keys), not the integer ids
        for word in self._common_dictionary.token2id:
            category_distribution = {}
            if word not in self._words_cate:
                continue
            # entropy of the category distribution over samples containing the word
            entropy1 = compute_entropy(self._words_sample_count[word],
                                       self._words_cate[word])
            for cate in self._category_distribution:
                category_distribution[cate] = self._category_distribution[cate]
                if cate in self._words_cate[word]:
                    category_distribution[cate] -= self._words_cate[word][cate]
            # entropy of the category distribution over samples not containing the word
            entropy2 = compute_entropy(
                len(self._corpus) - self._words_sample_count[word],
                category_distribution)
            # conditional entropy of the word
            condition_entropy = (
                self._words_sample_count[word] * entropy1 / len(self._corpus) +
                (len(self._corpus) - self._words_sample_count[word]) *
                entropy2 / len(self._corpus))
            # information gain
            self._info_gain[word] = system_entropy - condition_entropy

    def save(self, ig_file_name, sort=False):
        """
        Save to file in the format: word information_gain

        Args:
            ig_file_name -- output file path
            sort -- sort by information gain in descending order before writing
                    (default: unsorted)
        """
        with open(ig_file_name, 'w') as ig_file:
            if not sort:
                for word in self._info_gain:
                    ig_file.write("%s %.2f\n" % (word, self._info_gain[word]))
            else:
                for item in sorted(self._info_gain.items(),
                                   key=lambda x: x[1], reverse=True):
                    ig_file.write("%s %.2f\n" % (item[0], item[1]))

    def __getitem__(self, word):
        if word not in self._info_gain:
            return 0.0
        return self._info_gain[word]
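# InfoGain calls a module-level compute_entropy helper that is not shown in this snippet.
# Given how it is used (a total sample count plus a {category: count} mapping), a
# plausible implementation is the Shannon entropy below; this is an assumption for
# illustration, not the original author's code.
import math


def compute_entropy(total, category_counts):
    """Shannon entropy (base 2) of a categorical distribution given raw counts."""
    if total <= 0:
        return 0.0
    entropy = 0.0
    for count in category_counts.values():
        if count <= 0:
            continue
        p = count / total
        entropy -= p * math.log(p, 2)
    return entropy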
                # ... continuation of a loop over the rows of an existing CSV
                # (the start of the loop is not shown in this fragment)
                continue
            if row[0] not in eids:
                eids.add(row[0])
                if len(row) == 8:
                    row.append(",".join(topic_word_list_result[row[0]]))
                else:
                    row[8] = ",".join(topic_word_list_result[row[0]])
                csvData.append(row)
    with open(filename, 'w', encoding='utf-8') as writeFile:
        writer = csv.writer(writeFile)
        writer.writerows(csvData)
    print("Write to database successfully")


print("corpus length:", len(pubs_corpus))
print("dict length:", len(pubs_dictionary.keys()))

topic_word_list_result = dict()
topic_dict = topic_to_lemmatized_word_list(lda)
for id in pubs_eids:
    cur_corpus = id_to_corpus.get(id, None)
    if cur_corpus is not None:
        # list all topic indices for this document
        candidate_topics = lda.get_document_topics(cur_corpus)
        # select the index with the highest probability
        best_topic_index = select_highest_prob_topic(candidate_topics)
        if best_topic_index == -1:
            print("no topic document:", id)
            topic_word_list_result[id] = ["unknown"]
        else:
            topic_word_list_result[id] = topic_dict[best_topic_index]
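# select_highest_prob_topic is called above but not defined in this snippet. A plausible
# version (an assumption for illustration, not the original code) picks the topic id with
# the highest probability from lda.get_document_topics output and returns -1 when the
# list is empty, matching the -1 check above.
def select_highest_prob_topic(candidate_topics):
    """candidate_topics: list of (topic_id, probability) pairs."""
    if not candidate_topics:
        return -1
    best_topic, _ = max(candidate_topics, key=lambda pair: pair[1])
    return best_topic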
    docs_per_topic = defaultdict(list)
    # Create a dictionary with docs per topic
    for topic_id in topics:
        # Uncomment the following line if you want to return the actual docs
        # docs_per_topic[topic_id].append(X[np.where(argmax == topic_id)])
        docs_per_topic[topic_id].append(np.where(argmax == topic_id))
    return docs_per_topic


# Initialization of the term dictionary; in this case for the 9 two- and three-word documents
print(common_texts)
common_dictionary = Dictionary(common_texts)
print(common_dictionary)         # prints 12 unique tokens
print(common_dictionary.keys())  # prints the 12 token ids

# Create a corpus from a list of texts
# BoW: Bag-of-Words representation for each document: (token_id[int], token_count[float]) tuples
X = [common_dictionary.doc2bow(text) for text in common_texts]
print("X: ", X)
# X can also be a sparse matrix with shape (n_docs, n_terms); that may be handier
lda = LdaModel(corpus=X, num_topics=10, alpha='symmetric')

# Full topic-term matrix
print(lda.get_topics())

# TODO figure out how to use LdaState; need to initialize it first I guess
# Get posterior probabilities over topics
# print(lda.LdaState().get_lambda())
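# The function body at the top of this snippet assumes `topics`, `argmax`, and `X`
# already exist. One way to build the per-document topic assignments from the fitted
# model is shown below; this is an illustration under that assumption, not necessarily
# the original pipeline.
import numpy as np

doc_topics = [lda.get_document_topics(bow, minimum_probability=0.0) for bow in X]
argmax = np.array([max(dist, key=lambda pair: pair[1])[0] for dist in doc_topics])
topics = np.unique(argmax)
print(argmax)  # most probable topic id for each document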
])
# Bigrams and trigrams are joined by underscores

# -
# ### Remove rare and common tokens, and limit vocabulary

# +
dictionary = Dictionary(all_tokens)
dictionary.filter_extremes(no_below=30, no_above=0.5, keep_n=20000)

# Look at the top 100 and bottom 100 tokens
temp = dictionary[0]  # Accessing one item initializes dictionary.id2token
token_counts = pd.DataFrame(
    np.array([[token_id, dictionary.id2token[token_id], dictionary.cfs[token_id]]
              for token_id in dictionary.keys()
              if token_id in dictionary.cfs.keys()
              and token_id in dictionary.id2token.keys()]),
    columns=['id', 'token', 'count'])
token_counts['count'] = token_counts['count'].astype('int')
token_counts['count'].describe()
token_counts = token_counts.sort_values('count')

plt.rcParams.update({'figure.figsize': (5, 3.5), 'figure.dpi': 200})
token_counts['count'].head(5000).hist(bins=100)
plt.suptitle("Counts for 5,000 least frequent included words")
plt.show()

display(token_counts.head(50))

plt.rcParams.update({'figure.figsize': (5, 3.5), 'figure.dpi': 200})
token_counts['count'].tail(1000).hist(bins=100)