def hal(self, wordset="verbs", zsaxes=(0, 1), rectify=False,
        basepath="/auto/k8/huth/storydata/story+books+wiki+15w-densehal-mat",
        debug=False):
    """HAL semantic model (without dimensionality reduction).
    """
    from text.story.util.HalModel import make_hal_wordset_model, verb_set, make_hal_sm, english1000
    haltf = tables.openFile(basepath + ".hf5")
    halmat = np.array(haltf.root.halmat.read())
    halvocab = cPickle.load(open(basepath + "-vocab"))

    ## Choose a wordset
    if wordset == "verbs":
        wordset = verb_set
    elif wordset == "cmuverbs":
        wordset = verb_set[:23]
    elif wordset == "english1000":
        wordset = english1000

    halsm = make_hal_sm(halmat, halvocab, wordset)
    for axis in zsaxes:
        halsm.zscore(axis)
    if rectify:
        halsm.rectify()

    halstimseqs = mapdict(self.wordseqs, lambda ds: makelsa(ds, halsm))
    #return mapdict(halstimseqs, lambda s: s.chunksums())
    if debug:
        return halstimseqs
    return self.downsample(halstimseqs)
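# Hypothetical usage sketch (not from the original source; assumes `stim` is
# an instance of this class with `wordseqs` already populated):
#
#     halfeats = stim.hal(wordset="english1000", zsaxes=(0, 1), rectify=True)
#     # halfeats maps each story to its HAL feature sequence, downsampled to TRs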
def word2vec(self, modelfile="/auto/k8/huth/GoogleNews-vectors-negative300.bin",
             norm=False):
    """GenSim / word2vec model.
    """
    model = self.get_word2vec_model(modelfile, norm)
    #modeldims = model["test"].shape[0]
    #model.data = np.zeros((modeldims,))
    w2vstims = mapdict(self.wordseqs, lambda ds: makelsa(ds, model))
    return self.downsample(w2vstims)
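# Hypothetical usage sketch; whether `norm=True` length-normalizes the vectors
# depends on get_word2vec_model, which is defined elsewhere in this class:
#
#     w2vfeats = stim.word2vec(norm=True)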
def nmflsa(self):
    """NMF LSA model based on newLSA.
    """
    tf = tables.openFile("/auto/k6/huth/nmf-lsa.hf5")
    vocab = tf.root.vocab.read()
    data = tf.root.data.read()
    nmodel = SemanticModel(data, vocab)
    wordstimseqs = mapdict(self.wordseqs, lambda ds: makelsa(ds, nmodel))
    #return mapdict(wordstimseqs, lambda s: s.chunksums())
    return self.downsample(wordstimseqs)
def commonwords(self, num=100,
                basepath="/auto/k8/huth/storydata/stories-wbooks-lsa-2-vocab"):
    """Common word indicator model. Based on the old LSA model fitting,
    which used less data.
    """
    vocab = cPickle.load(open(basepath))
    counts = cPickle.load(open(basepath + "-Rcounts"))
    selwords = np.argsort(counts)[-num:]  # indices of the `num` most frequent words
    wmodel = SemanticModel(np.eye(num), list(np.array(vocab)[selwords]))
    wordstimseqs = mapdict(self.wordseqs, lambda ds: makelsa(ds, wmodel))
    #return mapdict(wordstimseqs, lambda s: s.chunksums())
    return self.downsample(wordstimseqs)
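# Illustration only (not part of the pipeline): the indicator model gives each
# selected word a one-hot row of np.eye(num), so projecting a word sequence
# yields a binary occurrence time series. A minimal standalone sketch:
#
#     import numpy as np
#     vocab = ["the", "and", "dog"]
#     eye = np.eye(len(vocab))                       # one one-hot row per word
#     windex = dict((w, i) for i, w in enumerate(vocab))
#     seq = ["dog", "the", "cat"]
#     feats = np.array([eye[windex[w]] for w in seq if w in windex])
#     # feats has one row per known word, all zeros except a single 1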
def co(self, wordset="english1000", zsaxes=(0, 1), rectify=False,
       basepath="/auto/k8/huth/storydata/comodels/complete2-15w-denseco-mat",
       debug=False):
    """Co-occurrence-based semantic model (without dimensionality reduction).
    """
    cosm = self.get_co_model(wordset, zsaxes, rectify, basepath)
    costimseqs = mapdict(self.wordseqs, lambda ds: makelsa(ds, cosm))
    #return mapdict(costimseqs, lambda s: s.chunksums())
    if debug:
        return costimseqs
    return self.downsample(costimseqs)
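# Hypothetical usage sketch: passing debug=True returns the word-level feature
# sequences before downsampling, which is useful for inspecting raw projections:
#
#     cofeats_raw = stim.co(wordset="english1000", debug=True)
#     cofeats = stim.co(wordset="english1000")   # downsampled to TR times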
def commonwords2(self, num=100,
                 basepath="/auto/k8/huth/storydata/comodels/complete2-15w-denseco-mat"):
    """Common word indicator model. Based on the newer co-occurrence model
    fitting, which uses more data.
    """
    cotf = tables.openFile(basepath + ".hf5")
    counts = cotf.root.wordcounts.read()
    covocab = cPickle.load(open(basepath + "-vocab"))
    selwords = np.argsort(counts)[-num:]
    wmodel = SemanticModel(np.eye(num), list(np.array(covocab)[selwords]))
    wordstimseqs = mapdict(self.wordseqs, lambda ds: makelsa(ds, wmodel))
    return self.downsample(wordstimseqs)
def allwords(self):
    """All word indicator model.
    """
    from text.textcore import Corpus
    corpus_file = "/auto/k5/huth/corpora/story/raw-transcripts/stories1.tar.gz"
    corpus = Corpus(corpus_file, split_documents=200)
    corpus_file1 = "/auto/k5/huth/corpora/story/raw-transcripts/stories2.tar.gz"
    corpus.append_corpus(corpus_file1)
    storyvocab = sorted(list(set(corpus.get_vocabulary())))
    num = len(storyvocab)
    wmodel = SemanticModel(np.eye(num), list(np.array(storyvocab)))
    wordstimseqs = mapdict(self.wordseqs, lambda ds: makelsa(ds, wmodel))
    #return mapdict(wordstimseqs, lambda s: s.chunksums())
    return self.downsample(wordstimseqs)
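# Hypothetical usage sketch; note that np.eye(num) above is a dense num x num
# identity, so memory grows quadratically with the corpus vocabulary size:
#
#     allwordfeats = stim.allwords()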
def newlsa(self, ndim, rectify, entweight, entcutoff=5,
           basepath="/auto/k6/huth/lsamats6/", debug=False):
    """New LSA semantic model.
    """
    lsasm = self.get_newlsa_model(ndim, rectify, entweight, entcutoff, basepath)
    lsastimseqs = mapdict(self.wordseqs, lambda ds: makelsa(ds, lsasm))
    if debug:
        return lsastimseqs
    return self.downsample(lsastimseqs)
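# Hypothetical usage sketch; the parameter roles (ndim = retained dimensions,
# entweight / entcutoff = entropy weighting of the counts) are assumptions
# based on the get_newlsa_model signature:
#
#     newlsafeats = stim.newlsa(ndim=300, rectify=False, entweight=True)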
def lsa(self, ndim, rectify, zsaxes=(1,),
        basepath="/auto/k8/huth/storydata/stories-wbooks-lsa-2", debug=False):
    """LSA semantic model.
    """
    vocab = cPickle.load(open(basepath + "-vocab"))
    lsasm = SemanticModel(None, None)
    lsasm.load_ascii_root(basepath + "-Vt", vocab)
    lsasm.data = lsasm.data[:ndim]  # keep only the first ndim LSA dimensions
    for axis in zsaxes:
        lsasm.zscore(axis)
    if rectify:
        lsasm.rectify()
    lsastimseqs = mapdict(self.wordseqs, lambda ds: makelsa(ds, lsasm))
    #return mapdict(lsastimseqs, lambda s: s.chunksums())
    if debug:
        return lsastimseqs
    return self.downsample(lsastimseqs)
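# Hypothetical usage sketch (assumes `stim` as above): keeps the first ndim
# rows of Vt, z-scores along each axis in zsaxes, and optionally rectifies:
#
#     lsafeats = stim.lsa(ndim=300, rectify=True)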