def line2doc(self, line):
    words = self.line2words(line)

    if self.use_wordids:
        # get all distinct terms in this document, ignore unknown words
        uniq_words = set(words).intersection(iterkeys(self.word2id))

        # the following creates a unique list of words *in the same order*
        # as they were in the input. when iterating over the documents,
        # the (word, count) pairs will appear in the same order as they
        # were in the input (bar duplicates), which looks better.
        # if this was not needed, we might as well have used useWords = set(words)
        use_words, marker = [], set()
        for word in words:
            if (word in uniq_words) and (word not in marker):
                use_words.append(word)
                marker.add(word)

        # construct a list of (wordIndex, wordFrequency) 2-tuples
        # using list.count is suboptimal but speed of this whole function is irrelevant
        doc = list(zip(map(self.word2id.get, use_words), map(words.count, use_words)))
    else:
        uniq_words = set(words)

        # construct a list of (word, wordFrequency) 2-tuples
        # using list.count is suboptimal but that's irrelevant at this point
        doc = list(zip(uniq_words, map(words.count, uniq_words)))

    # return the document, then forget it and move on to the next one
    # note that this way, only one doc is stored in memory at a time, not the whole corpus
    return doc

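# --- Hypothetical usage sketch for line2doc above. ToyCorpus, its vocabulary
# and its whitespace-based line2words are illustrative assumptions, not part
# of the original code; iterkeys is assumed to come from six, as used by the
# method itself.
from six import iterkeys


class ToyCorpus(object):
    use_wordids = True

    def __init__(self, word2id):
        self.word2id = word2id

    def line2words(self, line):
        # naive tokenizer: lowercase and split on whitespace
        return line.lower().split()

    # reuse the function defined above as a method of this stand-in class
    line2doc = line2doc


corpus = ToyCorpus({'human': 0, 'computer': 1, 'interface': 2})
print(corpus.line2doc("Human computer interface human unknownword"))
# expected: [(0, 2), (1, 1), (2, 1)] -- ids in input order, unknown words dropped
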
def doc2bow(self, document, allow_update=False, return_missing=False):
    """
    Convert `document` (a list of words) into the bag-of-words format = list
    of `(token_id, token_count)` 2-tuples. Each word is assumed to be a
    **tokenized and normalized** utf-8 encoded string. No further preprocessing
    is done on the words in `document`; apply tokenization, stemming etc. before
    calling this method.

    If `allow_update` is set, then also update dictionary in the process: create
    ids for new words. At the same time, update document frequencies -- for each
    word appearing in this document, increase its document frequency (`self.dfs`)
    by one.

    If `allow_update` is **not** set, this function is `const`, aka read-only.
    """
    result = {}
    missing = {}
    if isinstance(document, string_types):
        raise TypeError("doc2bow expects an array of utf8 tokens on input, not a string")
    document = sorted(utils.to_utf8(token) for token in document)
    # construct (word, frequency) mapping. in python3 this is done simply
    # using Counter(), but here i use itertools.groupby() for the job
    for word_norm, group in itertools.groupby(document):
        frequency = len(list(group))  # how many times does this word appear in the input document
        tokenid = self.token2id.get(word_norm, None)
        if tokenid is None:
            # first time we see this token (~normalized form)
            if return_missing:
                missing[word_norm] = frequency
            if not allow_update:
                # if we aren't allowed to create new tokens, continue with the next unique token
                continue
            tokenid = len(self.token2id)
            # new id = number of ids made so far; NOTE this assumes there are no gaps in the id sequence!
            self.token2id[word_norm] = tokenid

        # update how many times a token appeared in the document
        result[tokenid] = frequency

    if allow_update:
        self.num_docs += 1
        self.num_pos += len(document)
        self.num_nnz += len(result)
        # increase document count for each unique token that appeared in the document
        for tokenid in iterkeys(result):
            self.dfs[tokenid] = self.dfs.get(tokenid, 0) + 1

    # return tokenids, in ascending id order
    result = sorted(iteritems(result))
    if return_missing:
        return result, missing
    else:
        return result

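# --- Usage sketch for doc2bow above, assuming it is the method of gensim's
# corpora.Dictionary that this docstring describes (the surrounding class is
# not shown here); the token ids in the comments are examples only.
from gensim.corpora import Dictionary

dictionary = Dictionary()
bow = dictionary.doc2bow("human computer interface human".split(), allow_update=True)
print(bow)                  # e.g. [(0, 1), (1, 2), (2, 1)] -- (token_id, token_count)
print(dictionary.token2id)  # the mapping grew because allow_update=True
print(dictionary.doc2bow("human robot".split(), return_missing=True))
# e.g. ([(1, 1)], {b'robot': 1}) -- without allow_update, unseen tokens are only reported as missing
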
def doc2bow(self, document, allow_update=False, return_missing=False):
    """
    Convert `document` (a list of words) into the bag-of-words format = list
    of `(token_id, token_count)` 2-tuples. Each word is assumed to be a
    **tokenized and normalized** utf-8 encoded string. No further preprocessing
    is done on the words in `document`; apply tokenization, stemming etc. before
    calling this method.

    If `allow_update` or `self.allow_update` is set, then also update dictionary
    in the process: update overall corpus statistics and document frequencies.
    For each id appearing in this document, increase its document frequency
    (`self.dfs`) by one.
    """
    result = {}
    missing = {}  # stays empty here: restricted_hash assigns an id to every token, so nothing is ever "missing"
    document = sorted(document)  # sort (and convert to a plain list) so that itertools.groupby below can group identical tokens
    for word_norm, group in itertools.groupby(document):
        frequency = len(list(group))  # how many times does this word appear in the input document
        tokenid = self.restricted_hash(word_norm)
        result[tokenid] = result.get(tokenid, 0) + frequency
        if self.debug:
            # increment document count for each unique token that appeared in the document
            self.dfs_debug[word_norm] = self.dfs_debug.get(word_norm, 0) + 1

    if allow_update or self.allow_update:
        self.num_docs += 1
        self.num_pos += len(document)
        self.num_nnz += len(result)
        if self.debug:
            # increment document count for each unique tokenid that appeared in the document
            # done here, because several words may map to the same tokenid
            for tokenid in iterkeys(result):
                self.dfs[tokenid] = self.dfs.get(tokenid, 0) + 1

    # return tokenids, in ascending id order
    result = sorted(iteritems(result))
    if return_missing:
        return result, missing
    else:
        return result

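# --- Usage sketch for the hashing doc2bow above, assuming it belongs to
# gensim's corpora.HashDictionary, which maps each token to an id via a
# restricted hash instead of a growing token2id table; the ids shown are
# examples only and depend on id_range and the hash function.
from gensim.corpora import HashDictionary

hash_dict = HashDictionary(id_range=32000)
print(hash_dict.doc2bow("human computer interface human".split()))
# e.g. [(10608, 1), (12466, 1), (31002, 2)] -- ids are hash buckets, so
# distinct tokens can occasionally collide on the same id
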
def print_debug(id2token, u, s, topics, num_words=10, num_neg=None):
    if num_neg is None:
        # by default, print half as many salient negative words as positive
        num_neg = num_words // 2

    logger.info('computing word-topic salience for %i topics' % len(topics))
    topics, result = set(topics), {}
    # TODO speed up by block computation
    for uvecno, uvec in enumerate(u):
        uvec = numpy.abs(numpy.asarray(uvec).flatten())
        udiff = uvec / numpy.sqrt(numpy.sum(numpy.dot(uvec, uvec)))
        for topic in topics:
            result.setdefault(topic, []).append((udiff[topic], uvecno))

    logger.debug("printing %i+%i salient words" % (num_words, num_neg))
    for topic in sorted(iterkeys(result)):
        weights = sorted(result[topic], key=lambda x: -abs(x[0]))
        _, most = weights[0]
        if u[most, topic] < 0.0:
            # the most significant word has a negative sign => flip sign of u[most]
            normalize = -1.0
        else:
            normalize = 1.0

        # order features according to salience; ignore near-zero entries in u
        pos, neg = [], []
        for weight, uvecno in weights:
            if normalize * u[uvecno, topic] > 0.0001:
                pos.append('%s(%.3f)' % (id2token[uvecno], u[uvecno, topic]))
                if len(pos) >= num_words:
                    break

        for weight, uvecno in weights:
            if normalize * u[uvecno, topic] < -0.0001:
                neg.append('%s(%.3f)' % (id2token[uvecno], u[uvecno, topic]))
                if len(neg) >= num_neg:
                    break

        logger.info('topic #%s(%.3f): %s, ..., %s' % (topic, s[topic], ', '.join(pos), ', '.join(neg)))

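# --- Hypothetical smoke test for print_debug above, using random data with
# the shapes the function itself indexes: u has one row per term and one
# column per topic, s holds one singular value per topic. Setting up logging
# and a module-level logger here is an assumption (the original module is
# expected to define its own logger); iterkeys is assumed to come from six.
import logging

import numpy
from six import iterkeys

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

num_terms, num_topics = 8, 2
u = numpy.random.rand(num_terms, num_topics) - 0.5   # stand-in for left singular vectors
s = numpy.array([2.5, 1.0])                          # stand-in for singular values
id2token = {termid: "word%d" % termid for termid in range(num_terms)}

print_debug(id2token, u, s, topics=[0, 1], num_words=3, num_neg=2)
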
def __str__(self):
    some_keys = list(itertools.islice(iterkeys(self.token2id), 5))
    return "Dictionary(%i unique tokens: %s%s)" % (len(self), some_keys, '...' if len(self) > 5 else '')

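# --- Quick illustration of the __str__ output above, assuming it is gensim's
# corpora.Dictionary method; the exact token order shown is an example.
from gensim.corpora import Dictionary

texts = [
    "human computer interface".split(),
    "graph of trees and minors".split(),
]
print(Dictionary(texts))
# e.g. Dictionary(8 unique tokens: ['computer', 'human', 'interface', 'and', 'graph']...)
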