def build(self):
     self.__initDataStructures()
     text2tokens = resolve(self.text2tokens)
     text2tokens.originalWords = True
     for txto in resolve(self.corpus):
         for token, word in text2tokens(txto.text):
             self.__register(token, word)
     # TODO: if only a single copy of the tokenizer exists in the context,
     # then originalWords should be reset back to False, because most use
     # cases assume it is False; a possible solution is copying the tokenizer.
     text2tokens.originalWords = False
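# As the TODO above suggests, a possible fix is to mutate a copy of the
# tokenizer instead of the shared context-wide instance. A minimal sketch
# (assuming the tokenizer object is safe to shallow-copy):
import copy

def buildWithCopiedTokenizer(self):  # hypothetical variant of build()
    self.__initDataStructures()
    text2tokens = copy.copy(resolve(self.text2tokens))
    text2tokens.originalWords = True  # only the copy is modified
    for txto in resolve(self.corpus):
        for token, word in text2tokens(txto.text):
            self.__register(token, word)
    # no reset needed: the shared tokenizer's originalWords stays False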
def worldvecCro(type='word2vec', corpus='iter0_cronews_final'):
    '''
    Build a text-to-vector mapper for a Croatian corpus from aggregated
    word embeddings (word2vec or glove); an 'avg'-suffixed type presumably
    averages the word vectors instead of summing them.
    '''
    from pytopia.resource.word_vec_aggregator.WordVecAggregator import WordVecAggregator
    if type.startswith('word2vec'):
        w2vec = resolve('word2vec_builder')('word2vec.hrwac.cbow.vectors.bin')
    else:
        w2vec = resolve('glove_vectors_builder')('glove.hrwac.300d.txt')
    avg = True if type.endswith('avg') else None
    text2vec = WordVecAggregator('croelect_alphanum_stopword_tokenizer', w2vec,
                                 None, avg)
    mapper = resolve('corpus_text_vectors_builder')(vectorizer=text2vec,
                                                    corpus=corpus)
    return mapper
def worldvec(type='word2vec', corpus='us_politics'):
    '''
    Build a text-to-vector mapper for the us_politics corpus from aggregated
    word embeddings (word2vec or glove); an 'avg'-suffixed type presumably
    averages the word vectors instead of summing them.
    '''
    from pytopia.resource.word_vec_aggregator.WordVecAggregator import WordVecAggregator
    if type.startswith('word2vec'):
        w2vec = resolve('word2vec_builder')(
            'GoogleNews-vectors-negative300.bin')
    else:
        w2vec = resolve('glove_vectors_builder')('glove.6B.300d.txt')
    avg = True if type.endswith('avg') else None
    text2vec = WordVecAggregator('alphanum_gtar_stopword_tokenizer', w2vec,
                                 None, avg)
    mapper = resolve('corpus_text_vectors_builder')(vectorizer=text2vec,
                                                    corpus=corpus)
    return mapper
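# Usage sketch: hypothetical ids, assuming a pytopia context where the
# 'us_politics' corpus and the builders above are registered, and that the
# returned mapper is callable on text ids (as in docuDistStats below):
textVectors = worldvec(type='word2vec_avg')
vec = textVectors('someTextId')  # placeholder text id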
def printDocumentTitles(topic, topDocs=10, corpus='us_politics'):
    '''
    Print titles of the top documents for a topic.
    :param topic: (modelId, topicId)
    :param topDocs: number of top-weighted documents to print
    :param corpus: id of the corpus to take the documents from
    '''
    mid, tid = topic
    ctiBuilder = resolve('corpus_topic_index_builder')
    cti = ctiBuilder(corpus=corpus, model=mid)
    wtexts = cti.topicTexts(tid, top=topDocs)
    txtIds = [id_ for id_, _ in wtexts]
    corpus = resolve(corpus)
    idTexts = corpus.getTexts(txtIds)
    for txto in idTexts:
        print txto.title
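# Usage sketch (hypothetical model id, assuming the model and the
# 'us_politics' corpus are registered in the context):
printDocumentTitles(('uspolModel1', 3), topDocs=5)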
 def __createTopicIdIndex(self):
     '''
     Since topic ids can in general be arbitrary objects, a mapping between
     ids and indices [0, ..., numTopics-1] must be created.
     '''
     model = resolve(self.model)
     self.__id2ind = {tid: i for i, tid in enumerate(model.topicIds())}
 def build(self):
     ci = self.corpus_index
     corpus = resolve(self.corpus)
      self._m = None  # matrix for storing corpus vectors
     rows = len(ci)
     for txto in corpus:
         if self.verbose: print txto.id
         t0 = time()
         vec = self.vectorizer(txto)
         if self.verbose: print ' time 2 vectorize %.4f' % (time() - t0)
         if self._m is None:  # init matrix
             if isinstance(vec, np.ndarray):
                 cols = vec.shape[0]
                 self._m = np.empty(shape=(rows, cols), dtype=vec.dtype)
                 self.sparse = False
             elif isinstance(vec, spmatrix):
                 cols = vec.shape[1]
                 self._m = sparse_type((rows, cols), dtype=vec.dtype)
                 self.sparse = True
             else:
                  raise Exception('Unsupported vector type: %s' %
                                  str(type(vec)))
         t0 = time()
         # if self.sparse and not(isinstance(vec, sparse_type)):
         #     vec = sparse_type(vec)
         r = ci.id2index(txto.id)
         self._m[r] = vec
         if self.verbose:
             print ' time 2 write 2 matrix %.4f' % (time() - t0)
def croelectTopicFeatures(topic, table=None):
    '''
    Extract features of a topic from the topic-to-semantic-topic table and
    the topic description.
    :param topic: (modelId, topicId)
    '''
    mid, tid = topic
    model = resolve(mid)
    assert mid.startswith('croelect_')
    mid = mid[9:]  # strip the 'croelect_' prefix
    parse = croelectTableParse()
    topicLabel = '%s.%d' % (mid, tid)

    f = {}
    ptopic = parse.getTopic(topicLabel)
    # num_themes
    f['num_themes'] = len(ptopic.themes)
    # table_mixed
    f['table_mixed'] = ptopic.mixed
    l = unicode(model.description.topic[tid].label).lower().strip()
    # label_noiseonly: label is exactly 'šum' ('noise' in Croatian)
    f['label_noiseonly'] = (l == u'šum')
    # label_noise: label contains 'šum'
    f['label_noise'] = (u'šum' in l)
    return f
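# Usage sketch (hypothetical ids; assumes a registered croelect model and a
# parsed semantic-topic table); the values shown are illustrative:
features = croelectTopicFeatures(('croelect_model1', 7))
# e.g. {'num_themes': 2, 'table_mixed': False,
#       'label_noiseonly': False, 'label_noise': True}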
def getAllCroelectTopics():
    '''
    Return all topics of croelect models as (modelId, topicId).
    '''
    return [(mid, tid) for mid in croelectModelIds
            for tid in sorted(resolve(mid).topicIds())]
def uspolTopicFeatures(topic):
    '''
    Extract features of a topic from the topic-to-semantic-topic table and
    the topic description.
    :param topic: (modelId, topicId)
    '''
    parse = tableParse()
    mid, tid = topic
    topicLabel = '%s.%d' % (mid, tid)
    f = {}
    ptopic = parse.getTopic(topicLabel)
    # num_themes
    f['num_themes'] = len(ptopic.themes)
    # table_mixed
    f['table_mixed'] = ptopic.mixed
    model = resolve(mid)
    l = str(model.description.topic[tid].label).lower().strip()
    # label_mixed
    f['label_mixed'] = l.startswith('mix:') or l.startswith('mixture:')
    # label_mixonly
    f['label_mixonly'] = l in ('mix', 'mixture')
    # label_noise
    f['label_noise'] = l.endswith('et al')
    # stopwords
    f['stopwords'] = (l == 'stopwords')
    return f
 def __call__(self, topic):
     '''
     :param topic: (modelId, topicId)
     '''
     mid, tid = topic
     model = resolve(mid)
     return self.calculateCoherence(model.topic2string(tid, topw=self.topWords))
 def __buildOld(self):
     model, corpus = resolve(self.model, self.corpus)
     txt2tok = resolve(
         self.text2tokens if self.text2tokens else model.text2tokens)
     dict = resolve(
         self.dictionary if self.dictionary else model.dictionary)
     self.__createTopicIdIndex()
     # turn corpus 2 bow corpus, make corpus index
     bowBuilder = resolve('bow_corpus_builder')
     bowCorpus = bowBuilder(corpus, txt2tok, dict)
     self.__topics = np.zeros((len(bowCorpus), model.numTopics()),
                              dtype=np.float32)
     for i, bowTxt in enumerate(bowCorpus):
         tvec = model.inferTopics(bowTxt, format='bow')
         for tid, ti in self.__id2ind.iteritems():
             self.__topics[i, ti] = tvec[tid]
 def inferTopics(self, txt, batch=False, format='tokens'):
     '''
     Calculate document-topic proportions for text(s).
     :param txt: text or iterable of texts
      :param batch: if True, txt is an iterable of texts, otherwise a single text
      :param format: format of a single text:
          'tokens' - list of tokens, 'bow' - list of (wordId, wordCount)
      :return: a single doc-topic vector, or a list of doc-topic vectors if batch
     '''
     if batch: texts = [t for t in txt]
     else: texts = [txt]
     if format == 'tokens':
         dict_ = resolve(self.dictionary)
         texts = [dict_.tokens2bow(t) for t in texts]
     elif format == 'bow':
         pass  # already in correct format
     else:
         raise Exception('format %s not supported' % format)
     result = self.model.inference(texts, collect_sstats=False)
      # first element of the result 2-tuple is the list of doc-topic vectors
      vectors = result[0]
     for vec in vectors:
         vec /= vec.sum()  # normalize to prob. distribution
     if batch: return vectors
     else: return vectors[0]
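# Usage sketch (hypothetical: ldaModel is a built instance of this class,
# with its dictionary registered in the context):
vec = ldaModel.inferTopics(['senate', 'vote', 'bill'], format='tokens')
# bow1, bow2 are placeholder lists of (wordId, wordCount) pairs
vecs = ldaModel.inferTopics([bow1, bow2], batch=True, format='bow')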
 def __call__(self, topic):
     '''
     :param topic: (modelId, topicId)
     :return:
     '''
     mid, tid = topic
     model = resolve(mid)
     ctiBuilder = resolve('corpus_topic_index_builder')
     cti = ctiBuilder(corpus=model.corpus, model=model)
     topicTexts = cti.topicTexts(tid, sorted=None)
      # take document weights and normalize to a probability distribution
      tdist = topicTexts[:, 1].astype(np.float64)
      tdist /= tdist.sum()
      # compare against the uniform distribution over the topic's documents
      numDocs = topicTexts.shape[0]
      tvac = np.repeat(1.0/numDocs, numDocs)
     return self.measure(tdist, tvac)
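# self.measure can be any distance between probability distributions; a
# self-contained sketch using KL divergence (not necessarily the measure
# the original uses):
import numpy as np
from scipy.stats import entropy

tdist = np.array([0.7, 0.2, 0.1])
uniform = np.repeat(1.0/3, 3)
print(entropy(tdist, uniform))  # KL(tdist || uniform); 0 iff tdist is uniform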
 def __init__(self,
              selector,
              mapper,
              score,
              mapperIsFactory=True,
              timer=False,
              useTopic=False):
     '''
     :param selector: returns list of documents for the topic
     :param mapper: a mapper function on (selected) topic documents,
              returning vectors or a scalar,
             or alternatively a 'factory' - callable that accepts 'dictionary',
             'text2tokens' and 'corpus' parameters (attributes of
              a topic model to which processed topic belongs) and
             builds such a mapper.
             This is for creating a customized mapper for each model.
     :param mapperIsFactory: if True treat mapper as a factory as described above
     :param score: score function on the matrix/vector of transformed documents
     :param useTopic: if True, topic is also sent as parameter to mapper
     '''
     self.selector, self.score = selector, deduceId(score)
     self.mapper = resolveId(mapper)
     IdComposer.__init__(self)
     self.__factory = mapperIsFactory
     self.__mapper = resolve(mapper)
     self.__score = score
     self.__timer = timer
     self.__useTopic = useTopic
def docuDistStats(vectorizers,
                  distances,
                  corpus='us_politics',
                  sampleSize=100000,
                  rndSeed=54778,
                  savePath='.',
                  models=None):
    '''
    Compute statistics of distances between pairs of corpus documents,
    over a random sample of document pairs.
    :param vectorizers: vectorizer or list of vectorizers mapping text ids to vectors
    :param distances: distance function or list of distance functions
    :param sampleSize: number of document pairs to sample
    :param rndSeed: random seed for sampling
    :param savePath: folder where the statistics files are saved
    :param models: optional list of model ids passed to the vectorizers
    '''
    from numpy import triu_indices
    from numpy.random import choice, seed
    corpus = resolve(corpus)
    ids, id2txt = [], {}
    for txto in corpus:
        ids.append(txto.id)
        id2txt[txto.id] = txto
    N = len(ids)
    print 'corpus indexed, size %d' % N
    # sample pairs of ids
    pairs = triu_indices(N, 1)
    numPairs = len(pairs[0])
    print 'pairs array created'
    seed(rndSeed)
    indSample = choice(numPairs, sampleSize, replace=False)
    print 'sampling'
    idPairs = [(ids[pairs[0][i]], ids[pairs[1][i]]) for i in indSample]
    pairs = None  # free the large index arrays
    import gc
    gc.collect()
    # create pair distances
    print 'calculating distances'
    if not isinstance(distances, list): distances = [distances]
    if not isinstance(vectorizers, list): vectorizers = [vectorizers]
    mlabel = '' if models is None else '_'.join(m for m in models)
    for vectorizer in vectorizers:
        vectors = {}
        for distance in distances:
            fname = 'vectorizer[%s]_distance[%s]_models[%s]_stats' % \
                    (vectorizer.id, distance.__name__, mlabel)
            dists = np.empty(len(idPairs), dtype=np.float64)
            print fname
            for i, p in enumerate(idPairs):
                id1, id2 = p
                if not models:
                    if id1 not in vectors: vectors[id1] = vectorizer(id1)
                    if id2 not in vectors: vectors[id2] = vectorizer(id2)
                    dists[i] = distance(vectors[id1], vectors[id2])
                else:
                    for m in models:
                        vectors[id1] = vectorizer(id1, m)
                        vectors[id2] = vectorizer(id2, m)
                        # note: with multiple models, dists[i] keeps only
                        # the distance for the last model in the list
                        dists[i] = distance(vectors[id1], vectors[id2])
                #if i % 10000 == 0: print '  %d distances calculated' % i
            statistics(dists, join(savePath, fname))
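# Usage sketch (assumes a populated pytopia context; cosine distance comes
# from scipy, wordprob is defined below):
from scipy.spatial.distance import cosine
docuDistStats(vectorizers=wordprob(), distances=cosine, sampleSize=1000)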
def wordprob(corpus='us_politics',
             text2tokens='RsssuckerTxt2Tokens',
             dict='us_politics_dict'):
    from pytopia.resource.text_prob_vector.TextProbVectorizer import TextProbVectorizer
    vectorizer = TextProbVectorizer(text2tokens=text2tokens, dictionary=dict)
    textVectors = resolve('corpus_text_vectors_builder')(vectorizer=vectorizer,
                                                         corpus=corpus)
    return textVectors
 def __call__(self, topic):
     '''
     :param topic: (modelId, topicId)
     :return:
     '''
     mid, tid = topic
     model = resolve(mid)
     ctiBuilder = resolve('corpus_topic_index_builder')
     cti = ctiBuilder(corpus=model.corpus, model=model)
     topicTexts = cti.topicTexts(tid)
     if self.__threshold == 'above-random':
         rnd = 1.0 / model.numTopics()
         texts = [textId for textId, w in topicTexts if w > rnd]
     elif 0.0 < self.__threshold < 1.0:
         texts = [textId for textId, w in topicTexts if w > self.__threshold]
      elif isinstance(self.__threshold, (int, long)):
          texts = [textId for textId, _ in topicTexts[:self.__threshold]]
      else:
          raise ValueError('invalid threshold: %s' % str(self.__threshold))
      return texts
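# The three supported threshold forms (a sketch; TopDocumentSelector is a
# hypothetical name for the class this __call__ belongs to):
sel = TopDocumentSelector(threshold='above-random')  # weight > 1/numTopics
sel = TopDocumentSelector(threshold=0.25)            # weight > 0.25
sel = TopDocumentSelector(threshold=50)              # top 50 documents
docIds = sel(('someModelId', 4))  # placeholder (modelId, topicId)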
 def __call__(self, txto):
     d, t2t = resolve(self.dictionary, self.text2tokens)
     vec = np.zeros(d.maxIndex() + 1, np.float32)
     numTokens = 0
     for tok in t2t(txto.text):
         if tok in d:
             vec[d.token2index(tok)] += 1
             numTokens += 1
      if numTokens: vec /= numTokens  # guard against empty texts
     return vec
 def build(self, sparse=False):
     '''
     :param sparse: if True, use sparse matrix for doc-wordCount matrix,
         which saves memory but is much slower.
     :return:
     '''
     bowBuilder = resolve('bow_corpus_builder')
     bowCorpus = bowBuilder(corpus=self.corpus,
                            text2tokens=self.text2tokens,
                            dictionary=self.dictionary)
     dict = resolve(self.dictionary)
     # TODO use bowCorpus.corpusMatrix() instead of manually building
     rows, cols = len(bowCorpus), dict.maxIndex() + 1
     if sparse: counts = dok_matrix((rows, cols), dtype=np.uint32)
     else: counts = np.zeros((rows, cols), dtype=np.uint32)
     for i, bow in enumerate(bowCorpus):
         counts[i] = bow2Vector(bow, cols, sparse=sparse)
     tfidf = TfidfTransformer(sublinear_tf=True)
     self.__tfidf = tfidf.fit_transform(counts)
 def build(self):
     self.__id2index, self.__index2id = {}, {}
     corpus = resolve(self.corpus)
     i = 0
     for txto in corpus:
         if txto.id not in self.__id2index:
             self.__id2index[txto.id] = i
             self.__index2id[i] = txto.id
             i += 1
     self.__length = i
 def wordDocs(self, word):
     '''
     :param word: string or word index
     :return: list of (textId, wordCount), for all the texts where the word appears
     '''
     d = resolve(self.dictionary)
     if isinstance(word, basestring): wi = d.token2index(word)
     else: wi = word
     ci = self.corpus_index
     return [(ci[di], wc) for di, wc in self._word2doc[wi]]
def tfIdfMatrix(t, threshold=0.1):
    '''
    Create a matrix whose rows are tf-idf vectors of top documents for the topic.
    :param t: (modelId, topicId), where modelId is in context
    :param threshold: use only documents with topic weight above the threshold
        (a float in (0, 1)), or the top `threshold` documents (an int)
    :return: matrix of tfidf vectors, as ndarray
    '''
    mid, tid = t
    model = resolve(mid)
    ctiBuilder = resolve('corpus_topic_index_builder')
    cti = ctiBuilder(corpus=model.corpus, model=model)
    topicTexts = cti.topicTexts(tid)
    if 0.0 < threshold < 1.0:
        texts = [textId for textId, w in topicTexts if w > threshold]
    else: texts = [textId for textId, w in topicTexts[:threshold]]
    tfidfBuilder = resolve('corpus_tfidf_builder')
    tfidf = tfidfBuilder(corpus=model.corpus, dictionary=model.dictionary,
                         text2tokens=model.text2tokens)
    vecs = [ np.array(tfidf[txtid]) for txtid in texts ]
    return np.array(vecs)
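# Usage sketch (hypothetical model id; assumes the model, its corpus and
# the tfidf builder are registered in the context):
m = tfIdfMatrix(('uspolModel1', 12), threshold=0.25)
print(m.shape)  # (number of selected documents, vocabulary size)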
 def __getCorpusTopicIndex(self, modelId):
     '''
     Build corpus topic index or retrieve from cache.
     '''
      if modelId not in self.__ctiCache:
          # TODO: there is already a memcache at the pytopia level; the
          # problem is that logging the accesses slows things down
         ctiBuilder = resolve('corpus_topic_index_builder')
         cti = ctiBuilder(corpus=self.corpus, model=modelId)
         self.__ctiCache[modelId] = cti
     return self.__ctiCache[modelId]
def destemWords(words,
                top=True,
                corpusId='us_politics',
                text2tokens=RsssuckerTxt2Tokens()):
    itb = resolve('inverse_tokenizer_builder')
    itok = itb(corpusId, text2tokens, True)
    if top:
        print ' '.join(itok.allWords(w)[0] for w in words.split())
    else:
        for w in words.split():
            print w, itok.allWords(w)
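# Usage sketch ('polit presid' are example stems; assumes the inverse
# tokenizer resources are registered in the context):
destemWords('polit presid')             # first original word per stem
destemWords('polit presid', top=False)  # all original words per stem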
 def build(self):
     model = resolve(self.model)
     storedVectorsUsed = False
     if self._useNativeVectorization():
         textTopics = model.corpusTopicVectors()
         if textTopics is not None:
             # todo: test for this case
             ci = resolve('corpus_index_builder')(self.corpus)
             shape = (len(ci), model.numTopics())
             # todo maybe use == to compare indices (speed issue)
             if shape == textTopics.shape:
                 self.__topics = np.copy(textTopics)
                 storedVectorsUsed = True
                 # todo remove __id2ind variable from the class
                 self.__id2ind = {
                     id_: i
                     for i, id_ in enumerate(model.topicIds())
                 }
             else:
                 self._logger().info('corpusTopicVectors shape mismatch, '
                                     'stored %s, required %s \n'
                                     'proceeding to infer text topics' %
                                     (textTopics.shape, str(shape)))
     if not storedVectorsUsed:
         corpus = resolve(self.corpus)
         txt2tok = resolveIds(
             self.text2tokens if self.text2tokens else model.text2tokens)
         dict = resolveIds(
             self.dictionary if self.dictionary else model.dictionary)
         # turn corpus 2 bow corpus, make corpus index
         bowBuilder = resolve('bow_corpus_builder')
         bowCorpus = bowBuilder(corpus, txt2tok, dict)
         self.__topics = np.zeros((len(bowCorpus), model.numTopics()),
                                  dtype=np.float32)
         self.__createTopicIdIndex()
         tvec = model.inferTopics(bowCorpus, batch=True, format='bow')
         for i, txtVec in enumerate(tvec):
             for tid, ti in self.__id2ind.iteritems():
                 self.__topics[i, ti] = txtVec[tid]
def contrastCoherences(coh1,
                       coh2,
                       topics,
                       tableParse,
                       ltopics,
                       coh1Top=True,
                       coh2Top=False,
                       sort=None,
                       per1=0.9,
                       per2=0.1,
                       topWords=10):
    '''
     Display topics ranked highly by one coherence measure and
      poorly by another measure.
     :param per1, per2: percentiles that define good and bad rank -
         take topics at or above the per1 percentile by coh1 and
         at or below the per2 percentile by coh2
     :param topics: list of labeled topics
     :param topWords: number of top words used for the topic label
    '''
    from numpy import percentile
    res1 = [coh1(t) for t, tl in topics]
    res2 = [coh2(t) for t, tl in topics]
    perc1 = percentile(res1, per1 * 100.0)
    perc2 = percentile(res2, per2 * 100.0)
    print 'coh_top', coh1.id
    print 'coh_bot', coh2.id
    selected = []
    selector = lambda score, perc, above: score >= perc if above else score <= perc
    selector1 = lambda score: selector(score, perc1, coh1Top)
    selector2 = lambda score: selector(score, perc2, coh2Top)
    for i, t in enumerate(topics):
        if selector1(res1[i]) and selector2(res2[i]):
            topic = t[0]
            selected.append(topic)
    if sort:
        if sort == coh1:
            selected = sorted(selected, key=lambda t: coh1(t), reverse=True)
        else:
            selected = sorted(selected, key=lambda t: coh2(t), reverse=True)
    topic2label = {t: l for t, l in ltopics}
    for topic in selected:
        mi, ti = topic
        model = resolve(mi)
        label = topic2label[topic]
        semtopics = u';'.join(
            th for th in tableParse.getTopic(topicLabel(topic)).themes)
        print '%15s: %s , %s , [%s]' % (topicLabel(topic),
                                        model.topic2string(
                                            ti, topWords), label, semtopics)
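# How the percentile cutoffs behave, as a self-contained illustration:
from numpy import percentile
scores = [0.1, 0.2, 0.4, 0.5, 0.9]
cut1 = percentile(scores, 90)  # only topics scoring >= cut1 pass selector1
cut2 = percentile(scores, 10)  # only topics scoring <= cut2 pass selector2
print(cut1, cut2)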
def createTextPerLine(corpus, folder, maxTexts=None, seed=None):
    '''
    Create a text-per-line corpus from a pytopia corpus and save it to a file,
     composing the file name from the params.
    :param folder: folder to store the new corpus file in
    :param maxTexts, seed: see corpus2textPerLine
    '''
    corpus = resolve(corpus)
    fname = '%s%s%s.txt' % (corpus.id,
                            '' if not maxTexts else '_[:%s]' % str(maxTexts),
                            '' if not seed else '_seed[%s]' % str(seed))
    txtFile = path.join(folder, fname)
    corpus2textPerLine(corpus, txtFile, maxTexts=maxTexts, seed=seed)
 def __palmettoScorer(self):
     from pytopia.topic_functions.coherence.palmetto_coherence import PalmettoCoherence
      if self.__p.get('standard', False):
         # for 'standard' palmetto, original index stores regular words, not stems
         # so inverse tokenization has to be performed
         itb = resolve('inverse_tokenizer_builder')
         itok = itb(self.corpus, self.text2tokens, True)
     else:
         itok = None
     coh = PalmettoCoherence(self.type,
                             topWords=self.topWords,
                             wordTransform=itok,
                             **self.__p)
     self.__scorer = coh
 def __call__(self, textId, model=None):
     '''
      :param textId: id of a text in the corpus
      :param model: topic model id; used only if self.models is not set
      :return: concatenation of the text's doc-topic vectors, one per model
     '''
     if self.models: models = self.__models
     else: models = [model]
     vecs = []
     for modelId in models:
         cti = self.__getCorpusTopicIndex(modelId)
         topicVals = cti.textTopics(textId)
         model = resolve(modelId)
         vecs.append(self.__topics2vector(topicVals, model))
     return np.concatenate(vecs)
 def corpusMatrix(self, sparse=True, dtype=np.uint32):
     '''
      Create a matrix from the bow corpus.
      Rows are indices of documents, columns are dictionary indices of words,
      values are counts of the word in the document.
      :param sparse: whether to return a scipy.sparse matrix or an np.ndarray
     '''
     from pytopia.corpus.tools import bow2Vector
     self.dict = resolve(self.dict)
     rows, cols = len(self), self.dict.maxIndex() + 1
     if sparse: matrix = dok_matrix((rows, cols), dtype=dtype)
     else: matrix = np.zeros((rows, cols), dtype=dtype)
     for i, bow in enumerate(self):
         matrix[i] = bow2Vector(bow, cols, sparse=sparse)
     return matrix
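# Usage sketch (assumes bowCorpus is a built instance of this bow corpus
# class, with its dictionary resolvable in the context):
m = bowCorpus.corpusMatrix(sparse=False)
print(m.shape)  # (number of documents, dictionary size)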