def addintoitem(self, item, vector, weight=1):
    """Accumulate a weighted, normalised vector into *item*'s context vector.

    Unknown items are registered first: they receive a fresh random index
    vector, a zero global frequency, and empty context and association
    vectors. NOTE(review): for a new item the caller-supplied *vector* is
    replaced by the freshly generated random index vector before the
    context update — confirm this shadowing is intended.
    """
    if not self.contains(item):
        # New item: initialise every per-item space. From here on,
        # `vector` refers to the new random index vector.
        vector = sparsevectors.newrandomvector(self.dimensionality,
                                               self.denseness)
        self.indexspace[item] = vector
        self.globalfrequency[item] = 0
        self.contextspace[item] = sparsevectors.newemptyvector(
            self.dimensionality)
        self.associationspace[item] = sparsevectors.newemptyvector(
            self.dimensionality)
    # Fold the (normalised) vector into the item's context, scaled by weight.
    self.contextspace[item] = sparsevectors.sparseadd(
        self.contextspace[item],
        sparsevectors.normalise(vector),
        weight)
def additem(self, item, vector="dummy"):
    """Register *item* in the space with an index vector and empty aggregates.

    If no vector is supplied, a random index vector is generated. Already
    known items are left untouched. Increments the item count `bign` for
    each genuinely new item.
    """
    # Fix: the original used `vector is "dummy"`, an identity comparison
    # against a string literal. That relies on CPython string interning
    # (and raises SyntaxWarning on modern Pythons); use equality instead.
    if vector == "dummy":
        vector = sparsevectors.newrandomvector(self.dimensionality,
                                               self.denseness)
    if not self.contains(item):
        self.indexspace[item] = vector
        self.globalfrequency[item] = 1
        self.contextspace[item] = sparsevectors.newemptyvector(self.dimensionality)
        self.attributespace[item] = sparsevectors.newemptyvector(self.dimensionality)
        self.morphologyspace[item] = sparsevectors.newemptyvector(self.dimensionality)
        # self.textspace[item] = sparsevectors.newemptyvector(self.dimensionality)
        # self.utterancespace[item] = sparsevectors.newemptyvector(self.dimensionality)
        # self.authorspace[item] = sparsevectors.newemptyvector(self.dimensionality)
        self.bign += 1
def importgavagaiwordspace(self, vectorfile: str, threshold=5):
    """Import items from a Gavagai-format word-space dump file.

    Each line is expected to look like
        ("word" #S<dim>;<index-pairs>: #S<dim>;<context-pairs>: <freq>)
    where pairs are ';'-separated "<position><value>" entries. Items whose
    frequency exceeds *threshold* are added to the index space, their
    context vectors are installed, and the language model is updated.
    Unreadable files are logged, not raised.
    """
    vectorpattern = re.compile(
        r"\(\"(.*)\" #S(\d+);([\d\+\-\;]+): #S\d+;(.+): (\d+)\)",
        re.IGNORECASE)
    itempattern = re.compile(r"(\d+)\+?(\-?[\d\.e\-]+)$")

    def parsepairs(pairstring, dim, antal, string, idx):
        # Shared parser for both the index and context pair lists
        # (previously duplicated inline). Malformed pairs are logged
        # and skipped rather than aborting the whole import.
        vec = sparsevectors.newemptyvector(dim)
        for ii in pairstring.split(";"):
            try:
                item = itempattern.match(ii)
                vec[int(item.group(1))] = float(item.group(2))
            except (AttributeError, ValueError):
                # AttributeError: pattern did not match (item is None);
                # ValueError: position/value not parseable as a number.
                # (Was a bare `except:` that swallowed everything.)
                logger(
                    "{} {} {} {}".format(antal, string, ii, idx),
                    error)
        return vec

    antal = 0      # lines seen
    antalkvar = 0  # items retained after threshold filtering
    try:
        with open(vectorfile, 'rt', errors="replace") as gavagaispace:
            for line in gavagaispace:
                antal += 1
                vectors = vectorpattern.match(line)
                if vectors:
                    string = str(vectors.group(1))
                    dim = int(vectors.group(2))
                    idx = vectors.group(3)
                    ctx = vectors.group(4)
                    freq = int(vectors.group(5))
                    if freq > threshold:
                        antalkvar += 1
                        # logger("{} {} {} {} {}".format(antal, antalkvar, string, freq, idx), debug)
                        self.additem(
                            string, parsepairs(idx, dim, antal, string, idx))
                        self.contextspace[string] = parsepairs(
                            ctx, dim, antal, string, idx)
                        self.observedfrequency[string] = freq
                        self.languagemodel.additem(string, freq)
    except IOError:
        logger("Could not read from >>" + vectorfile + "<<", error)
def textvector(self, string, frequencyweighting=True, loglevel=False):
    """Build a sparse vector for *string*.

    With self.window > 0, sums normalised vectors for every character
    n-gram of that width; otherwise tokenises with nltk and sums the
    (normalised) index vectors of the words, learning unseen words on
    the fly. Weighting by frequency can be switched off.
    """
    accumulated = sparsevectors.newemptyvector(self.dimensionality)
    if self.window > 0:
        # Character n-gram mode: slide a window of self.window characters.
        span = self.window
        for pos in range(len(string) - span + 1):
            gram = string[pos:pos + span]
            gramvector = self.makevector(gram)
            factor = self.frequencyweight(gram) if frequencyweighting else 1
            logger(gram + " " + str(factor), loglevel)
            if loglevel:
                logger(str(sparsevectors.sparsecosine(
                    accumulated,
                    sparsevectors.normalise(gramvector))), loglevel)
            accumulated = sparsevectors.sparseadd(
                accumulated, sparsevectors.normalise(gramvector), factor)
    else:
        # Word mode: tokenise and sum index vectors; a set collapses
        # repeats when binary frequencies are requested.
        tokens = nltk.word_tokenize(string)
        wordlist = set(tokens) if self.binaryfrequencies else tokens
        for word in wordlist:
            factor = self.frequencyweight(word) if frequencyweighting else 1
            if word in self.indexspace:
                self.observe(word)
            else:
                self.additem(word)
            accumulated = sparsevectors.sparseadd(
                accumulated,
                sparsevectors.normalise(self.indexspace[word]),
                factor)
    return accumulated
def additem(self, item, vector="dummy"):
    """Register *item* with an index vector and an empty context vector.

    A random index vector is generated when none is supplied. Items
    already in the space are left untouched.
    """
    # Fix: `vector is "dummy"` was an identity comparison with a string
    # literal — implementation-dependent and a SyntaxWarning on modern
    # Pythons. Equality is the correct test.
    if vector == "dummy":
        vector = sparsevectors.newrandomvector(self.dimensionality,
                                               self.denseness)
    if not self.contains(item):
        self.indexspace[item] = vector
        self.contextspace[item] = sparsevectors.newemptyvector(
            self.dimensionality)
def sequencevector(self, sequence, initialvector=None, loglevel=False):
    """Sum normalised window vectors over *sequence*.

    Starts from *initialvector* (an empty vector if None) and adds the
    normalised onesequencevector of every window produced by
    self.windows(sequence).
    """
    result = (sparsevectors.newemptyvector(self.dimensionality)
              if initialvector is None else initialvector)
    windowlist = self.windows(sequence)
    logger(str(windowlist), loglevel)
    for onewindow in windowlist:
        windowvector = self.onesequencevector(onewindow, None, loglevel)
        result = sparsevectors.sparseadd(
            result, sparsevectors.normalise(windowvector))
    return result
def postriplevector(self, text, poswindow=3):
    """Build a sparse vector from part-of-speech n-gram patterns of *text*.

    Tags *text* with nltk.pos_tag, slides a window of *poswindow* tags,
    and for each window permutes a running vector by each tag's (lazily
    created) permutation, summing the running vector per window.
    """
    poses = nltk.pos_tag(text)
    # NOTE(review): the "+ 1 + 2" lets the range run past the last full
    # window, so the final two slices are shorter than poswindow —
    # confirm partial trailing windows are intended.
    windows = [poses[ii:ii + poswindow]
               for ii in range(len(poses) - poswindow + 1 + 2)]
    # Seed from the permutation table's "vector" entry; presumably a
    # designated base vector set up elsewhere — verify at initialisation.
    # NOTE(review): onevector is NOT reset between windows, so each
    # window's contribution compounds all earlier permutations.
    onevector = self.pospermutations["vector"]
    vector = sparsevectors.newemptyvector(self.dimensionality)
    for sequence in windows:
        for item in sequence:
            # item is a (token, tag) pair; unseen tags get a fresh
            # random permutation, cached for reuse.
            if item[1] not in self.pospermutations:
                self.pospermutations[item[1]] = sparsevectors.createpermutation(self.dimensionality)
            onevector = sparsevectors.permute(onevector, self.pospermutations[item[1]])
        vector = sparsevectors.sparseadd(vector, onevector)
    return vector
def additem(self, item, vector=None):
    """Add a new item to the space.

    Installs an index vector (randomly generated unless one is supplied
    or already recorded), an empty context vector, and a zeroed observed
    frequency; marks the space as changed. Items already present are
    left untouched. Normally called from observe() but also at times
    from addintoitem.
    """
    if item in self.indexspace:
        # Already known — nothing to initialise.
        return
    if vector is None:
        vector = sparsevectors.newrandomvector(self.dimensionality,
                                               self.denseness)
    self.indexspace[item] = vector
    self.contextspace[item] = sparsevectors.newemptyvector(self.dimensionality)
    self.changed = True
    self.observedfrequency[item] = 0
def rolevector(roledict, initialvector=None, loglevel=False):
    """Compose a vector from a role -> items mapping.

    For every (role, item) pair, observes the item in ctxspace and adds
    the normalised, role-operated index vector of the item into the
    running total (starting from *initialvector*, or an empty vector).
    """
    if initialvector is None:
        initialvector = sparsevectors.newemptyvector(dimensionality)
    for role, items in roledict.items():
        for item in items:
            ctxspace.observe(item, False, debug)
            previous = initialvector
            operated = ctxspace.useoperator(ctxspace.indexspace[item], role)
            initialvector = sparsevectors.sparseadd(
                previous, sparsevectors.normalise(operated))
            if loglevel:
                # Log how much this item moved the running vector.
                logger(
                    role + " " + item + " " +
                    str(sparsevectors.sparsecosine(previous, initialvector)),
                    loglevel)
    return initialvector
def tweetvector(string):
    """Vectorise *string* as a frequency-weighted sum of its character n-grams.

    Cached n-grams reuse their index vector from ngramspace; unseen ones
    are built fresh via stringspace.makevector. Returns an empty vector
    when the window size is not positive.
    """
    uvector = sparsevectors.newemptyvector(ngramspace.dimensionality)
    if window > 0:
        for pos in range(len(string) - window + 1):
            gram = string[pos:pos + window]
            if ngramspace.contains(gram):
                # Known n-gram: reuse the cached index vector.
                # (Deliberately no learning at classification time.)
                gramvector = ngramspace.indexspace[gram]
            else:
                # Unknown n-gram: compute a vector without caching it.
                gramvector = stringspace.makevector(gram)
            uvector = sparsevectors.sparseadd(
                uvector,
                sparsevectors.normalise(gramvector),
                ngramspace.frequencyweight(gram))
    return uvector
def tokenvector(tokenlist, initialvector=None, weights=True, loglevel=False):
    """Sum the normalised context vectors of *tokenlist* into one vector.

    Each token is observed in ctxspace and its context vector is added
    with a frequency-derived weight; construction-grammar features
    (prefixed "JiK") and all tokens when *weights* is False get weight 1.
    """
    if initialvector is None:
        initialvector = sparsevectors.newemptyvector(dimensionality)
    for token in tokenlist:
        if weights and not str(token).startswith("JiK"):
            weight = ctxspace.languagemodel.frequencyweight(token, True)
        else:
            # cxg features should not be weighted the same way lex feats are.
            weight = 1
        ctxspace.observe(token, True)
        before = initialvector
        initialvector = sparsevectors.sparseadd(
            before,
            sparsevectors.normalise(ctxspace.contextspace[token]),
            weight)
        if loglevel:
            logger(
                token + " " + str(weight) + " " +
                str(sparsevectors.sparsecosine(before, initialvector)),
                loglevel)
    return initialvector
def textvector(self, words, frequencyweighting=True, binaryfrequencies=False,
               loglevel=False):
    """Build a document vector from an already tokenised word list.

    Bumps the document counter, optionally collapses repeats (binary
    frequencies), learns unseen words, updates document frequencies, and
    sums normalised index vectors with optional frequency weighting.
    """
    self.docs += 1
    uvector = sparsevectors.newemptyvector(self.dimensionality)
    # A set collapses repeats when binary frequencies are requested.
    wordlist = set(words) if binaryfrequencies else words
    for word in wordlist:
        factor = self.frequencyweight(word) if frequencyweighting else 1
        if word in self.indexspace:
            self.observe(word)
        else:
            self.additem(word)
        self.df[word] += 1
        uvector = sparsevectors.sparseadd(
            uvector,
            sparsevectors.normalise(self.indexspace[word]),
            factor)
    return uvector
]: # "["hearts", "turtle", "cat", "rabbit", "queen", "and", "off"]: n = {} for v in vectorrepository: n[v] = sparsevectors.sparsecosine(space.indexspace[probe], vectorrepository[v]) m = sorted(sentencerepository, key=lambda k: n[k], reverse=True) for mc in m: if n[mc] > 0.0001: print(probe, mc, n[mc], sentencerepository[mc]) print(space.contexttoindexneighbourswithweights(probe)) for v in vectorrepository: print(v, sentencerepository[v], sep="\t", end="\t") # print(v, vectorrepository[v]) ww = nltk.word_tokenize(sentencerepository[v]) vec = sparsevectors.newemptyvector(dimensionality) # for www in ww: # print(www, space.indexspace[www], space.globalfrequency[www], space.frequencyweight(www), sparsevectors.sparsecosine(space.indexspace[www], vectorrepository[v])) nvn = {} for www in ww: nvn[www] = sparsevectors.sparsecosine(space.indexspace[www], vectorrepository[v]) vec = sparsevectors.sparseadd( vec, sparsevectors.normalise(space.indexspace[www]), space.frequencyweight(www)) m = sorted(ww, key=lambda k: nvn[k], reverse=True)[:5] for mc in m: if nvn[mc] > 0.0001: print(mc, nvn[mc], sep=":", end="\t") print()
random.shuffle( filenamelist ) # if we shuffle here the weights won't be as good i mean overtrained filenamelist = filenamelist[:testbatchsize] logger("Going on with a file list of " + str(testbatchsize) + " items.", monitor) if textcategorisation: logger("Text target space", monitor) if authorcategorisation: logger("Author target space", monitor) if gendercategorisation: logger("Gender target space", monitor) for cat in categories: categorytable[cat] = cat # redundant redundancy redundanciness targetspace[cat] = sparsevectors.newemptyvector( ngramspace.dimensionality) targets.add(cat) logger("Started training files.", monitor) authorindex = 0 textindex = 0 testvectorantal = 0 trainvectorantal = 0 for file in filenamelist: authorindex += 1 authornametable[authorindex] = file.split(".")[0].split("/")[-1] logger("Starting training " + str(authorindex) + " " + file, debug) e = xml.etree.ElementTree.parse(file).getroot() trainvectors[authorindex] = [] testvectors[authorindex] = [] thesevectors = []
random.shuffle(filenamelist) split = int(len(filenamelist) * testtrainfraction) testfiles = filenamelist[:split] else: testfiles = filenamelist logger("Start building vectors for " + str(len(testfiles)) + " test files.", monitor) authorindex = 0 testitemspace = SemanticSpace() nn = 0 for file in testfiles: authorname = file.split(".")[0].split("/")[-1] authorindex += 1 logger("Reading " + str(authorindex) + " " + file, monitor) workingvector = sparsevectors.newemptyvector(dimensionality) e = xml.etree.ElementTree.parse(file).getroot() for b in e.iter("document"): origtext = b.text avector = sparsevectors.newemptyvector(dimensionality) if fulltext: avector = sparsevectors.normalise( stringspace.textvector(origtext, frequencyweighting)) if generalise: newtext = squintinglinguist.generalise(origtext) avector = sparsevectors.sparseadd( avector, sparsevectors.normalise( stringspace.textvector(newtext, frequencyweighting))) if featurise:
def newemptyvector(self):
    """Return a fresh empty sparse vector of this space's dimensionality."""
    return sparsevectors.newemptyvector(self.dimensionality)