def build(self):
    '''
    Register every (token, word) pair produced by the tokenizer
    over the texts of the corpus.
    The tokenizer's originalWords flag is switched on for the duration of
    the processing and set back to False afterwards, even if processing fails.
    '''
    self.__initDataStructures()
    text2tokens = resolve(self.text2tokens)
    # todo solve this problem, if only a single copy of the tokenizer exists
    #  in the context, than originalWords should be reset back to false
    #  because most of the use cases assume it is false, possible solution is copying
    text2tokens.originalWords = True
    try:
        for txto in resolve(self.corpus):
            for token, word in text2tokens(txto.text):
                self.__register(token, word)
    finally:
        # the tokenizer may be shared (see todo above), so the flag must
        # not stay switched on when an exception interrupts the loop
        text2tokens.originalWords = False
def worldvecCro(type='word2vec', corpus='iter0_cronews_final'):
    '''
    Create corpus text vectors based on aggregated word vectors.
    :param type: if it starts with 'word2vec', vectors are built by
        'word2vec_builder', otherwise by 'glove_vectors_builder';
        if it ends with 'avg', True is passed as the last argument of
        WordVecAggregator, otherwise None
    :param corpus: corpus passed to 'corpus_text_vectors_builder'
    :return: result of 'corpus_text_vectors_builder'
    '''
    from pytopia.resource.word_vec_aggregator.WordVecAggregator import WordVecAggregator
    if type.startswith('word2vec'):
        builderId, vectorFile = 'word2vec_builder', 'word2vec.hrwac.cbow.vectors.bin'
    else:
        builderId, vectorFile = 'glove_vectors_builder', 'glove.hrwac.300d.txt'
    wordVectors = resolve(builderId)(vectorFile)
    avg = None
    if type.endswith('avg'):
        avg = True
    text2vec = WordVecAggregator('croelect_alphanum_stopword_tokenizer',
                                 wordVectors, None, avg)
    buildTextVectors = resolve('corpus_text_vectors_builder')
    return buildTextVectors(vectorizer=text2vec, corpus=corpus)
def worldvec(type='word2vec', corpus='us_politics'):
    '''
    Create corpus text vectors based on aggregated word vectors.
    :param type: if it starts with 'word2vec', vectors are built by
        'word2vec_builder', otherwise by 'glove_vectors_builder';
        if it ends with 'avg', True is passed as the last argument of
        WordVecAggregator, otherwise None
    :param corpus: corpus passed to 'corpus_text_vectors_builder'
    :return: result of 'corpus_text_vectors_builder'
    '''
    from pytopia.resource.word_vec_aggregator.WordVecAggregator import WordVecAggregator
    if type.startswith('word2vec'):
        builderId, vectorFile = 'word2vec_builder', 'GoogleNews-vectors-negative300.bin'
    else:
        builderId, vectorFile = 'glove_vectors_builder', 'glove.6B.300d.txt'
    wordVectors = resolve(builderId)(vectorFile)
    avg = None
    if type.endswith('avg'):
        avg = True
    text2vec = WordVecAggregator('alphanum_gtar_stopword_tokenizer',
                                 wordVectors, None, avg)
    buildTextVectors = resolve('corpus_text_vectors_builder')
    return buildTextVectors(vectorizer=text2vec, corpus=corpus)
def printDocumentTitles(topic, topDocs=10, corpus='us_politics'):
    '''
    Print titles of the texts of a topic, one title per line.
    :param topic: (modelId, topicId)
    :param topDocs: passed as 'top' to topicTexts() of the corpus topic index
    :param corpus: corpus id
    '''
    modelId, topicId = topic
    buildIndex = resolve('corpus_topic_index_builder')
    topicIndex = buildIndex(corpus=corpus, model=modelId)
    textIds = []
    for textId, _ in topicIndex.topicTexts(topicId, top=topDocs):
        textIds.append(textId)
    for txto in resolve(corpus).getTexts(textIds):
        print(txto.title)
def __createTopicIdIndex(self):
    '''
    Create the map from topic ids of the model to indices
    [0, ... , NumTopics], in the order the model lists the ids.
    The map is necessary since topic ids can in general be any objects.
    '''
    topicIds = resolve(self.model).topicIds()
    id2ind = {}
    for ind, topicId in enumerate(topicIds):
        id2ind[topicId] = ind
    self.__id2ind = id2ind
def build(self):
    '''
    Vectorize the corpus texts and store the vectors as rows of a matrix.
    The matrix is created when the first vector is produced: np.ndarray
    for ndarray vectors, sparse_type matrix for scipy sparse vectors.
    The row of a text is determined by the corpus index.
    '''
    index = self.corpus_index
    texts = resolve(self.corpus)
    self._m = None  # matrix for storing corpus vectors
    numRows = len(index)
    for txto in texts:
        if self.verbose:
            print(txto.id)
        start = time()
        vec = self.vectorizer(txto)
        if self.verbose:
            print(' time 2 vectorize %.4f' % (time() - start))
        if self._m is None:
            # first vector, it defines the type and width of the matrix
            if isinstance(vec, np.ndarray):
                self._m = np.empty(shape=(numRows, vec.shape[0]), dtype=vec.dtype)
                self.sparse = False
            elif isinstance(vec, spmatrix):
                self._m = sparse_type((numRows, vec.shape[1]), dtype=vec.dtype)
                self.sparse = True
            else:
                raise Exception('Unsuported vector type: %s' % str(type(vec)))
        start = time()
        self._m[index.id2index(txto.id)] = vec
        if self.verbose:
            print(' time 2 write 2 matrix %.4f' % (time() - start))
def croelectTopicFeatures(topic, table=None):
    '''
    Extract features of a topic from the parsed topic table
    and from the label in the model's topic description.
    :param topic: (modelId, topicId), modelId must start with 'croelect_'
    :param table: not used
    :return: map with keys 'num_themes', 'table_mixed',
        'label_noiseonly' and 'label_noise'
    '''
    mid, tid = topic
    model = resolve(mid)
    prefix = 'croelect_'
    assert mid.startswith(prefix)
    # topics in the table are labeled with model id without the prefix
    parsedTopic = croelectTableParse().getTopic('%s.%d' % (mid[len(prefix):], tid))
    features = {
        'num_themes': len(parsedTopic.themes),
        'table_mixed': parsedTopic.mixed,
    }
    label = unicode(model.description.topic[tid].label).lower().strip()
    noise = u'šum'
    features['label_noiseonly'] = (label == noise)
    features['label_noise'] = (noise in label)
    return features
def getAllCroelectTopics():
    '''
    List topics of all the models in croelectModelIds.
    :return: list of (modelId, topicId), topic ids of a model are sorted
    '''
    topics = []
    for mid in croelectModelIds:
        for tid in sorted(resolve(mid).topicIds()):
            topics.append((mid, tid))
    return topics
def uspolTopicFeatures(topic):
    '''
    Extract features of a topic from the parsed topic table
    and from the label in the model's topic description.
    :param topic: (modelId, topicId)
    :return: map with keys 'num_themes', 'table_mixed', 'label_mixed',
        'label_mixonly', 'label_noise' and 'stopwords'
    '''
    parser = tableParse()
    mid, tid = topic
    parsedTopic = parser.getTopic('%s.%d' % (mid, tid))
    features = {
        'num_themes': len(parsedTopic.themes),
        'table_mixed': parsedTopic.mixed,
    }
    # lowercased and stripped label is the basis for label features
    label = str(resolve(mid).description.topic[tid].label).lower().strip()
    features['label_mixed'] = label.startswith(('mix:', 'mixture:'))
    features['label_mixonly'] = label in ('mix', 'mixture')
    features['label_noise'] = label.endswith('et al')
    features['stopwords'] = (label == 'stopwords')
    return features
def __call__(self, topic):
    '''
    Calculate coherence of the topic's string representation.
    :param topic: (modelId, topicId)
    :return: result of calculateCoherence
    '''
    modelId, topicId = topic
    topicString = resolve(modelId).topic2string(topicId, topw=self.topWords)
    return self.calculateCoherence(topicString)
def __buildOld(self):
    '''
    Build the text-topic matrix by inferring topics for each text
    of the bow corpus separately.
    Tokenizer and dictionary of the model are used unless they are
    defined on this object.
    '''
    model, corpus = resolve(self.model, self.corpus)
    text2tokens = resolve(self.text2tokens or model.text2tokens)
    dictionary = resolve(self.dictionary or model.dictionary)
    self.__createTopicIdIndex()
    # turn corpus 2 bow corpus, make corpus index
    bowCorpus = resolve('bow_corpus_builder')(corpus, text2tokens, dictionary)
    topics = np.zeros((len(bowCorpus), model.numTopics()), dtype=np.float32)
    self.__topics = topics
    for row, bowTxt in enumerate(bowCorpus):
        topicVec = model.inferTopics(bowTxt, format='bow')
        for topicId, col in self.__id2ind.iteritems():
            topics[row, col] = topicVec[topicId]
def inferTopics(self, txt, batch=False, format='tokens'):
    '''
    Calculate document-topic proportions for text(s).
    :param txt: text or iterable of texts
    :param batch: if True, txt is an iterable of texts, otherwise a single text
    :param format: format of a single text: 'tokens' - list of tokens,
        'bow' - list of (wordId, wordCount); other values raise an Exception
    :return: if a single text, single doc-topic vector,
        else a list of doc-topic vectors
    '''
    texts = list(txt) if batch else [txt]
    if format == 'tokens':
        dictionary = resolve(self.dictionary)
        texts = [dictionary.tokens2bow(t) for t in texts]
    elif format != 'bow':  # 'bow' texts are already in correct format
        raise Exception('format %s not supported' % format)
    # first part of the result 2-tuple is a list of vectors
    vectors = self.model.inference(texts, collect_sstats=False)[0]
    for vec in vectors:
        vec /= vec.sum()  # normalize to prob. distribution
    return vectors if batch else vectors[0]
def __call__(self, topic):
    '''
    Apply the measure to the normalized weights of the topic's texts
    and the uniform distribution over the same number of texts.
    :param topic: (modelId, topicId)
    :return: result of self.measure
    '''
    modelId, topicId = topic
    model = resolve(modelId)
    buildIndex = resolve('corpus_topic_index_builder')
    topicIndex = buildIndex(corpus=model.corpus, model=model)
    weightedTexts = topicIndex.topicTexts(topicId, sorted=None)
    # take document weights (second column) and normalize
    weights = weightedTexts[:, 1].astype(np.float64)
    topicDist = weights / weights.sum()
    numDocs = weightedTexts.shape[0]
    uniformDist = np.repeat(1.0 / numDocs, numDocs)
    return self.measure(topicDist, uniformDist)
def __init__(self, selector, mapper, score, mapperIsFactory=True,
             timer=False, useTopic=False):
    '''
    :param selector: returns list of documents for the topic
    :param mapper: a mapper function on (selected) topic documents,
        returning vectors or scalar, or alternatively a 'factory' -
        callable that accepts 'dictionary', 'text2tokens' and 'corpus'
        parameters (attributes of a topic model to which processed topic
        belongs) and builds such a mapper. This is for creating a
        customized mapper for each model.
    :param score: score function on the matrix/vector of transformed documents
    :param mapperIsFactory: if True treat mapper as a factory as described above
    :param timer: stored on the object, not used in the constructor
    :param useTopic: if True, topic is also sent as parameter to mapper
    '''
    # public attributes are assigned before IdComposer.__init__ is called
    self.selector = selector
    self.score = deduceId(score)
    self.mapper = resolveId(mapper)
    IdComposer.__init__(self)
    # private state: the resolved mapper and the original score object
    self.__factory = mapperIsFactory
    self.__mapper = resolve(mapper)
    self.__score = score
    self.__timer = timer
    self.__useTopic = useTopic
def docuDistStats(vectorizers, distances, corpus='us_politics', sampleSize=100000,
                  rndSeed=54778, savePath='.', models=None):
    '''
    Sample pairs of corpus texts, calculate distances between the vectors
    of the paired texts and pass the array of distances to statistics(),
    once for each combination of vectorizer and distance.
    :param vectorizers: vectorizer or list of vectorizers; a vectorizer has
        'id' attribute and is called with a text id, or with
        (text id, model) if models are given
    :param distances: distance function or list of distance functions,
        called with two vectors
    :param corpus: corpus with the texts
    :param sampleSize: number of text pairs to sample; if it is larger than
        the number of all the pairs, all the pairs are used
    :param rndSeed: seed for the numpy random generator
    :param savePath: folder joined with the label of a
        (vectorizer, distance, models) combination and passed to statistics()
    :param models: optional list of model ids passed to vectorizers
    '''
    from numpy import triu_indices
    from numpy.random import choice, seed
    ids = [txto.id for txto in resolve(corpus)]
    N = len(ids)
    print('corpus indexed, size %d' % N)
    # sample pairs of ids
    pairs = triu_indices(N, 1)
    numPairs = len(pairs[0])
    print('pairs array created')
    seed(rndSeed)
    # sampling without replacement fails if the sample is larger than
    # the population, so sample size is capped for small corpora
    indSample = choice(numPairs, min(sampleSize, numPairs), replace=False)
    print('sampling')
    idPairs = [(ids[pairs[0][i]], ids[pairs[1][i]]) for i in indSample]
    # release the index arrays of all the pairs before vectors are created
    pairs = None
    import gc
    gc.collect()
    # create pair distances
    print('calculating distances')
    if not isinstance(distances, list):
        distances = [distances]
    if not isinstance(vectorizers, list):
        vectorizers = [vectorizers]
    mlabel = '' if models is None else '_'.join(models)
    for vectorizer in vectorizers:
        vectors = {}  # cache of text vectors, shared among distances
        for distance in distances:
            fname = 'vectorizer[%s]_distance[%s]_models[%s]_stats' % \
                    (vectorizer.id, distance.__name__, mlabel)
            dists = np.empty(len(idPairs), dtype=np.float64)
            print(fname)
            for i, p in enumerate(idPairs):
                id1, id2 = p
                if not models:
                    if id1 not in vectors:
                        vectors[id1] = vectorizer(id1)
                    if id2 not in vectors:
                        vectors[id2] = vectorizer(id2)
                    dists[i] = distance(vectors[id1], vectors[id2])
                else:
                    # NOTE(review): dists[i] is overwritten for each model,
                    # only the distance for the last model is kept -
                    # confirm this is intended
                    for m in models:
                        vectors[id1] = vectorizer(id1, m)
                        vectors[id2] = vectorizer(id2, m)
                        dists[i] = distance(vectors[id1], vectors[id2])
                #if i % 10000 == 0: print ' %d distances calculated' % i
            statistics(dists, join(savePath, fname))
def wordprob(corpus='us_politics', text2tokens='RsssuckerTxt2Tokens',
             dict='us_politics_dict'):
    '''
    Create corpus text vectors using TextProbVectorizer.
    :param corpus: corpus passed to 'corpus_text_vectors_builder'
    :param text2tokens: tokenizer of the vectorizer
    :param dict: dictionary of the vectorizer
    :return: result of 'corpus_text_vectors_builder'
    '''
    from pytopia.resource.text_prob_vector.TextProbVectorizer import TextProbVectorizer
    buildTextVectors = resolve('corpus_text_vectors_builder')
    return buildTextVectors(
        vectorizer=TextProbVectorizer(text2tokens=text2tokens, dictionary=dict),
        corpus=corpus)
def __call__(self, topic):
    '''
    Select ids of the topic's texts, depending on the threshold:
    'above-random' - texts with weight above 1/numTopics of the model,
    number between 0 and 1 (exclusive) - texts with weight above the number,
    integer - the first 'threshold' entries of the topic's texts.
    :param topic: (modelId, topicId)
    :return: list of text ids
    :raise ValueError: if the threshold is none of the above
    '''
    mid, tid = topic
    model = resolve(mid)
    ctiBuilder = resolve('corpus_topic_index_builder')
    cti = ctiBuilder(corpus=model.corpus, model=model)
    topicTexts = cti.topicTexts(tid)
    threshold = self.__threshold
    if threshold == 'above-random':
        rnd = 1.0 / model.numTopics()
        return [textId for textId, w in topicTexts if w > rnd]
    if 0.0 < threshold < 1.0:
        return [textId for textId, w in topicTexts if w > threshold]
    if isinstance(threshold, (int, long)):
        return [textId for textId, _ in topicTexts[:threshold]]
    # without this, an unsupported threshold surfaced as UnboundLocalError
    raise ValueError('unsupported threshold: %r' % (threshold,))
def __call__(self, txto):
    '''
    Create vector of relative frequencies of dictionary tokens in the text.
    Tokens that are not in the dictionary are ignored.
    :param txto: text object, tokens are extracted from its 'text' attribute
    :return: float32 ndarray of length dictionary.maxIndex() + 1, counts of
        tokens divided by the number of counted tokens; all-zero vector
        if the text contains no dictionary tokens
    '''
    d, t2t = resolve(self.dictionary, self.text2tokens)
    vec = np.zeros(d.maxIndex() + 1, np.float32)
    numTokens = 0
    for tok in t2t(txto.text):
        if tok in d:
            vec[d.token2index(tok)] += 1
            numTokens += 1
    # dividing by zero tokens would turn the zero vector into NaNs
    if numTokens > 0:
        vec /= numTokens
    return vec
def build(self, sparse=False):
    '''
    Create doc-wordCount matrix from the bow corpus and fit the
    tf-idf transformer (with sublinear tf) on it, storing the transformed matrix.
    :param sparse: if True, use sparse matrix for doc-wordCount matrix,
        which saves memory but is much slower.
    '''
    buildBowCorpus = resolve('bow_corpus_builder')
    bowCorpus = buildBowCorpus(corpus=self.corpus, text2tokens=self.text2tokens,
                               dictionary=self.dictionary)
    dictionary = resolve(self.dictionary)
    # TODO use bowCorpus.corpusMatrix() instead of manually building
    numDocs = len(bowCorpus)
    numWords = dictionary.maxIndex() + 1
    makeMatrix = dok_matrix if sparse else np.zeros
    counts = makeMatrix((numDocs, numWords), dtype=np.uint32)
    for row, bow in enumerate(bowCorpus):
        counts[row] = bow2Vector(bow, numWords, sparse=sparse)
    transformer = TfidfTransformer(sublinear_tf=True)
    self.__tfidf = transformer.fit_transform(counts)
def build(self):
    '''
    Create maps between text ids and indices. Indices are assigned to
    ids in the order of first occurrence in the corpus, starting from 0,
    repeated ids are skipped. Length is the number of distinct ids.
    '''
    id2index, index2id = {}, {}
    self.__id2index, self.__index2id = id2index, index2id
    for txto in resolve(self.corpus):
        textId = txto.id
        if textId in id2index:
            continue
        # next free index equals the number of ids indexed so far
        nextIndex = len(id2index)
        id2index[textId] = nextIndex
        index2id[nextIndex] = textId
    self.__length = len(id2index)
def wordDocs(self, word):
    '''
    :param word: string or word index, string is converted to index
        using the dictionary
    :return: list of (textId, wordCount), for all the texts
        where the word appears
    '''
    dictionary = resolve(self.dictionary)
    wordIndex = dictionary.token2index(word) if isinstance(word, basestring) else word
    corpusIndex = self.corpus_index
    result = []
    for docIndex, wordCount in self._word2doc[wordIndex]:
        # corpus index maps document index to text id
        result.append((corpusIndex[docIndex], wordCount))
    return result
def tfIdfMatrix(t, threshold=0.1):
    '''
    Create matrix where rows are tf-idf vectors of selected documents
    of the topic.
    :param t: (modelId, topicId), where modelId is in context
    :param threshold: if strictly between 0 and 1, use only documents with
        topic weight above the threshold, otherwise use the documents in
        topicTexts[:threshold]
    :return: matrix of tfidf vectors, as ndarray
    '''
    modelId, topicId = t
    model = resolve(modelId)
    buildIndex = resolve('corpus_topic_index_builder')
    topicIndex = buildIndex(corpus=model.corpus, model=model)
    topicTexts = topicIndex.topicTexts(topicId)
    if 0.0 < threshold < 1.0:
        textIds = [textId for textId, weight in topicTexts if weight > threshold]
    else:
        textIds = [textId for textId, _ in topicTexts[:threshold]]
    buildTfidf = resolve('corpus_tfidf_builder')
    tfidf = buildTfidf(corpus=model.corpus, dictionary=model.dictionary,
                       text2tokens=model.text2tokens)
    rows = []
    for textId in textIds:
        rows.append(np.array(tfidf[textId]))
    return np.array(rows)
def __getCorpusTopicIndex(self, modelId):
    '''
    Return corpus topic index for the model and this object's corpus,
    building and caching it on the first request.
    '''
    cache = self.__ctiCache
    if modelId in cache:
        return cache[modelId]
    # todo there is already memcache at pytopia level,
    #  the problem is logging the accesses that slows things down
    buildIndex = resolve('corpus_topic_index_builder')
    cache[modelId] = buildIndex(corpus=self.corpus, model=modelId)
    return cache[modelId]
def destemWords(words, top=True, corpusId='us_politics', text2tokens=RsssuckerTxt2Tokens()):
    '''
    Print words that the inverse tokenizer associates with tokens.
    :param words: string of whitespace separated tokens
    :param top: if True, print a single line with the first word of
        allWords() for each token, otherwise print, one line per token,
        the token followed by the result of allWords()
    :param corpusId: corpus of the inverse tokenizer
    :param text2tokens: tokenizer of the inverse tokenizer
    '''
    buildInverseTokenizer = resolve('inverse_tokenizer_builder')
    itok = buildInverseTokenizer(corpusId, text2tokens, True)
    tokens = words.split()
    if not top:
        for w in tokens:
            print('%s %s' % (w, itok.allWords(w)))
        return
    print(' '.join(itok.allWords(w)[0] for w in tokens))
def build(self):
    '''
    Build the matrix of topic weights of corpus texts.
    If native vectorization is used and the model provides corpus topic
    vectors of the required shape (corpus size x number of topics),
    a copy of these vectors is used. Otherwise topics are inferred,
    in a batch, from the bow corpus.
    '''
    model = resolve(self.model)
    if self._useNativeVectorization():
        stored = model.corpusTopicVectors()
        if stored is not None:
            # todo: test for this case
            corpusIndex = resolve('corpus_index_builder')(self.corpus)
            required = (len(corpusIndex), model.numTopics())
            # todo maybe use == to compare indices (speed issue)
            if required == stored.shape:
                self.__topics = np.copy(stored)
                # todo remove __id2ind variable from the class
                self.__id2ind = dict((id_, i) for i, id_ in enumerate(model.topicIds()))
                return
            self._logger().info('corpusTopicVectors shape mismatch, '
                                'stored %s, required %s \n'
                                'proceeding to infer text topics'
                                % (stored.shape, str(required)))
    # stored vectors were not used, infer topics of the texts
    corpus = resolve(self.corpus)
    text2tokens = resolveIds(self.text2tokens or model.text2tokens)
    dictionary = resolveIds(self.dictionary or model.dictionary)
    # turn corpus 2 bow corpus, make corpus index
    bowCorpus = resolve('bow_corpus_builder')(corpus, text2tokens, dictionary)
    self.__topics = np.zeros((len(bowCorpus), model.numTopics()), dtype=np.float32)
    self.__createTopicIdIndex()
    topicVectors = model.inferTopics(bowCorpus, batch=True, format='bow')
    for row, txtVec in enumerate(topicVectors):
        for topicId, col in self.__id2ind.iteritems():
            self.__topics[row, col] = txtVec[topicId]
def contrastCoherences(coh1, coh2, topics, tableParse, ltopics, coh1Top=True,
                       coh2Top=False, sort=None, per1=0.9, per2=0.1, topWords=10):
    '''
    Display topics with good rank by one coherence measure
    and bad rank by another measure.
    :param coh1, coh2: coherence measures, callables on topics with 'id' attribute
    :param topics: list of labeled topics, i.e. (topic, label) pairs
    :param tableParse: parse with getTopic() returning object with 'themes'
    :param ltopics: list of (topic, label), source of printed labels
    :param coh1Top, coh2Top: if True select topics with score at or above
        the percentile, otherwise at or below
    :param sort: if set, selected topics are sorted descending,
        by coh1 if sort equals coh1, else by coh2
    :param per1, per2: percentiles that define what is good and bad rank -
        by default, take per1 percentile by coh1 (or above)
        and bottom per2 percentile by coh2 (or below)
    :param topWords: top words for topic label
    '''
    from numpy import percentile
    res1 = [coh1(t) for t, tl in topics]
    res2 = [coh2(t) for t, tl in topics]
    perc1 = percentile(res1, per1 * 100.0)
    perc2 = percentile(res2, per2 * 100.0)
    print('coh_top %s' % (coh1.id,))
    print('coh_bot %s' % (coh2.id,))

    def passes(score, perc, above):
        # above: keep scores at or over percentile, else at or under
        return score >= perc if above else score <= perc

    selected = []
    for (topic, _), score1, score2 in zip(topics, res1, res2):
        if passes(score1, perc1, coh1Top) and passes(score2, perc2, coh2Top):
            selected.append(topic)
    if sort:
        sortBy = coh1 if sort == coh1 else coh2
        selected = sorted(selected, key=sortBy, reverse=True)
    topic2label = {}
    for t, l in ltopics:
        topic2label[t] = l
    for topic in selected:
        mi, ti = topic
        model = resolve(mi)
        label = topic2label[topic]
        themes = tableParse.getTopic(topicLabel(topic)).themes
        semtopics = u';'.join(th for th in themes)
        print('%15s: %s , %s , [%s]' % (topicLabel(topic),
                                         model.topic2string(ti, topWords),
                                         label, semtopics))
def createTextPerLine(corpus, folder, maxTexts=None, seed=None):
    '''
    Create text per line corpus from some pytopia corpus and save it to
    a file, whose name is composed of corpus id and (if set) the params.
    :param corpus: pytopia corpus
    :param folder: folder to store the new text corpus to
    :param maxTexts, seed: see corpus2textPerLine
    '''
    corpus = resolve(corpus)
    sizeLabel = '_[:%s]' % str(maxTexts) if maxTexts else ''
    seedLabel = '_seed[%s]' % str(seed) if seed else ''
    fname = '%s%s%s.txt' % (corpus.id, sizeLabel, seedLabel)
    corpus2textPerLine(corpus, path.join(folder, fname),
                       maxTexts=maxTexts, seed=seed)
def __palmettoScorer(self):
    '''
    Create PalmettoCoherence scorer from this object's type, topWords and
    additional params. If 'standard' param is True, inverse tokenizer is
    used as the word transform, otherwise no transform is used.
    '''
    from pytopia.topic_functions.coherence.palmetto_coherence import PalmettoCoherence
    params = self.__p
    wordTransform = None
    if 'standard' in params and params['standard'] == True:
        # for 'standard' palmetto, original index stores regular words, not stems
        # so inverse tokenization has to be performed
        buildInverseTokenizer = resolve('inverse_tokenizer_builder')
        wordTransform = buildInverseTokenizer(self.corpus, self.text2tokens, True)
    self.__scorer = PalmettoCoherence(self.type, topWords=self.topWords,
                                      wordTransform=wordTransform, **params)
def __call__(self, textId, model=None):
    '''
    Create vector of the text's topics, concatenating the vectors
    obtained for each of the models.
    :param textId: id of the text
    :param model: topic model id, used if this object has no models defined
    :return: concatenated vector, as ndarray
    '''
    modelIds = self.__models if self.models else [model]
    vecs = []
    for modelId in modelIds:
        topicVals = self.__getCorpusTopicIndex(modelId).textTopics(textId)
        vecs.append(self.__topics2vector(topicVals, resolve(modelId)))
    return np.concatenate(vecs)
def corpusMatrix(self, sparse=True, dtype=np.uint32):
    '''
    Create matrix from bow corpus. Rows correspond to documents, in the
    order of iteration, columns to dictionary indices of words,
    and row values are produced by bow2Vector.
    The 'dict' attribute is replaced with the resolved dictionary.
    :param sparse: whether to create scipy.sparse dok_matrix or np.ndarray
    :param dtype: type of matrix elements
    '''
    from pytopia.corpus.tools import bow2Vector
    self.dict = resolve(self.dict)
    numDocs = len(self)
    numWords = self.dict.maxIndex() + 1
    makeMatrix = dok_matrix if sparse else np.zeros
    matrix = makeMatrix((numDocs, numWords), dtype=dtype)
    for row, bow in enumerate(self):
        matrix[row] = bow2Vector(bow, numWords, sparse=sparse)
    return matrix