def textvector(self, string, frequencyweighting=True, loglevel=False):
    uvector = sparsevectors.newemptyvector(self.dimensionality)
    if self.window > 0:
        windows = [string[ii:ii + self.window]
                   for ii in range(len(string) - self.window + 1)]
        for sequence in windows:
            thisvector = self.makevector(sequence)
            if frequencyweighting:
                factor = self.frequencyweight(sequence)
            else:
                factor = 1
            logger(sequence + " " + str(factor), loglevel)
            if loglevel:
                logger(str(sparsevectors.sparsecosine(
                    uvector, sparsevectors.normalise(thisvector))), loglevel)
            uvector = sparsevectors.sparseadd(
                uvector, sparsevectors.normalise(thisvector), factor)
    else:
        words = nltk.word_tokenize(string)
        if self.binaryfrequencies:
            wordlist = set(words)  # a set, not a list: each word counts once
        else:
            wordlist = words
        for w in wordlist:
            if frequencyweighting:
                factor = self.frequencyweight(w)
            else:
                factor = 1
            if w not in self.indexspace:
                self.additem(w)
            else:
                self.observe(w)
            uvector = sparsevectors.sparseadd(
                uvector, sparsevectors.normalise(self.indexspace[w]), factor)
    return uvector
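# Illustration of the character-window slicing textvector() uses above
# (self-contained, not part of the class): a window of 3 over "cats"
# yields the overlapping character trigrams.
#
#     >>> s, w = "cats", 3
#     >>> [s[ii:ii + w] for ii in range(len(s) - w + 1)]
#     ['cat', 'ats']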
def contextneighbours(self, item: str, number: int=10, weights: bool=False,
                      filtertag: bool=False, threshold: int=-1) -> list:
    '''
    Return the items from the contextspace most similar to the given item,
    i.e. items which have similar neighbours to the item. Specify the number
    of items desired (0 will give all), whether weights are desired, whether
    only items with the same tag are desired, and whether thresholding to a
    certain horizon is desired.
    '''
    neighbourhood = {}
    if filtertag:
        targetset = self.tagged[self.tag[item]]
    else:
        targetset = self.contextspace
    for i in targetset:
        if i == item:
            continue
        k = sparsevectors.sparsecosine(self.contextspace[item],
                                       self.contextspace[i])
        if k > threshold:
            neighbourhood[i] = k
    if not number:
        number = len(neighbourhood)
    if weights:
        r = sorted(neighbourhood.items(),
                   key=lambda kv: kv[1], reverse=True)[:number]
    else:
        r = sorted(neighbourhood,
                   key=lambda k: neighbourhood[k], reverse=True)[:number]
    return r
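# Hypothetical usage sketch for contextneighbours(); the instance name and
# vocabulary below are made up for illustration, not taken from the source:
#
#     space = SemanticSpace()      # assumed owner class of this method
#     ...                          # observe some text first
#     for word, cosine in space.contextneighbours("cat", number=5, weights=True):
#         print(word, cosine)
#
# With weights=True each element is an (item, cosine) pair; with weights=False
# only the item names are returned, in the same descending-similarity order.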
def contexttoindexneighbours(self, item, number=10, weights=False,
                             permutationname="nil"):
    """
    Return the items whose index vectors are most similar to the given
    item's context vector, i.e. items which have occurred in contexts
    with the item.
    """
    permutation = self.permutationcollection[permutationname]
    neighbourhood = {}
    for i in self.indexspace:
        neighbourhood[i] = sparsevectors.sparsecosine(
            self.contextspace[item],
            sparsevectors.permute(self.indexspace[i], permutation))
    if not number:
        number = len(neighbourhood)
    if weights:
        r = sorted(neighbourhood.items(),
                   key=lambda kv: kv[1], reverse=True)[:number]
    else:
        r = sorted(neighbourhood,
                   key=lambda k: neighbourhood[k], reverse=True)[:number]
    return r
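# Sketch of the permutation device permute() presumably applies: in
# random-indexing practice a named permutation rotates a sparse vector's
# dimensions, so the same item bound under different roles or directions
# lands in different positions. The {dimension: weight} dict representation
# and the rotation are assumptions; sparsevectors.permute itself is not
# shown in this code.
def sketch_permute(vector: dict, rotation: int, dimensionality: int) -> dict:
    return {(d + rotation) % dimensionality: w for d, w in vector.items()}

# sketch_permute({0: 1.0, 3: -1.0}, 1, 4) -> {1: 1.0, 0: -1.0}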
def processsentences(sents, testing=True):
    # Builds four layered vectors per sentence: a lexical index vector (idx),
    # a pos-sequence vector (seq), their combination (comb) seeding the
    # construction-feature vector (cxg), which in turn seeds the semantic
    # role vector (sem).
    global sentencerepository, vectorrepositoryidx, featurerepository, index, \
        ticker, sequencelabels, vectorrepositoryseq, vectorrepositorycxg, \
        vectorrepositorysem
    for s in sents:
        index += 1
        key = "s" + str(index)
        if s in sentencerepository.values():
            continue
        fs = featurise(s)
        logger(s, debug)
        fcxg = fs["features"]
        fpos = fs["pos"]
        fsem = fs["roles"]
        fwds = fs["words"]
        logger(fwds, debug)
        logger(fpos, debug)
        logger(fcxg, debug)
        logger(fsem, debug)
        vecidx = tokenvector(fwds, None, True, debug)
        vecseq = seq.sequencevector(fpos, None, debug)
        vecis = sparsevectors.sparseadd(vecidx, vecseq, 1, True)
        logger("idx - comb\t" + str(sparsevectors.sparsecosine(vecidx, vecis)), debug)
        logger("seq - comb\t" + str(sparsevectors.sparsecosine(vecseq, vecis)), debug)
        veccxg = tokenvector(fcxg, vecis, False, debug)
        logger("comb - cxg\t" + str(sparsevectors.sparsecosine(vecis, veccxg)), debug)
        logger("idx - cxg\t" + str(sparsevectors.sparsecosine(vecidx, veccxg)), debug)
        logger("seq - cxg\t" + str(sparsevectors.sparsecosine(veccxg, vecseq)), debug)
        vecsem = rolevector(fsem, veccxg, debug)
        logger("idx - sem\t" + str(sparsevectors.sparsecosine(vecidx, vecsem)), debug)
        logger("seq - sem\t" + str(sparsevectors.sparsecosine(vecseq, vecsem)), debug)
        logger("comb - sem\t" + str(sparsevectors.sparsecosine(vecis, vecsem)), debug)
        logger("cxg - sem\t" + str(sparsevectors.sparsecosine(veccxg, vecsem)), debug)
        sentencerepository[key] = s
        vectorrepositoryidx[key] = vecidx
        vectorrepositoryseq[key] = vecseq
        vectorrepositorycxg[key] = veccxg
        vectorrepositorysem[key] = vecsem
        featurerepository[key] = fs
        logger(str(key) + ":" + str(s) + "->" + str(fs), debug)
        if ticker > 1000:
            logger(str(index) + " sentences processed", monitor)
            squintinglinguist.restartCoreNlpClient()
            ticker = 0
        ticker += 1
def indextocontextneighbours(self, item, number=10, weights=False,
                             permutationname="nil"):
    permutation = self.permutationcollection[permutationname]
    neighbourhood = {}
    for i in self.contextspace:
        neighbourhood[i] = sparsevectors.sparsecosine(
            sparsevectors.permute(self.indexspace[item], permutation),
            self.contextspace[i])
    if weights:
        r = sorted(neighbourhood.items(),
                   key=lambda kv: kv[1], reverse=True)[:number]
    else:
        r = sorted(neighbourhood,
                   key=lambda k: neighbourhood[k], reverse=True)[:number]
    return r
def rolevector(roledict, initialvector=None, loglevel=False):
    if initialvector is None:
        initialvector = sparsevectors.newemptyvector(dimensionality)
    for role in roledict:
        for item in roledict[role]:
            ctxspace.observe(item, False, debug)
            tmp = initialvector
            initialvector = sparsevectors.sparseadd(
                initialvector,
                sparsevectors.normalise(
                    ctxspace.useoperator(ctxspace.indexspace[item], role)))
            if loglevel:
                logger(role + " " + item + " " +
                       str(sparsevectors.sparsecosine(tmp, initialvector)),
                       loglevel)
    return initialvector
def tokenvector(tokenlist, initialvector=None, weights=True, loglevel=False):
    if initialvector is None:
        initialvector = sparsevectors.newemptyvector(dimensionality)
    for item in tokenlist:
        if not weights or str(item).startswith("JiK"):
            # cxg features should not be weighted the way lexical features are
            weight = 1
        else:
            weight = ctxspace.languagemodel.frequencyweight(item, True)
        ctxspace.observe(item, True)
        tmp = initialvector
        initialvector = sparsevectors.sparseadd(
            initialvector,
            sparsevectors.normalise(ctxspace.contextspace[item]),
            weight)
        if loglevel:
            logger(item + " " + str(weight) + " " +
                   str(sparsevectors.sparsecosine(tmp, initialvector)),
                   loglevel)
    return initialvector
def similarity(self, item, anotheritem):
    return sparsevectors.sparsecosine(self.contextspace[item],
                                      self.contextspace[anotheritem])
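# A minimal sketch of the cosine measure these methods rely on, assuming
# (as the dict-style lookups throughout suggest) that sparsevectors
# represents vectors as {dimension: weight} dicts. The actual sparsevectors
# module is not shown here, so this is an illustration, not its implementation.
import math

def sketch_sparsecosine(u: dict, v: dict) -> float:
    dot = sum(weight * v[dim] for dim, weight in u.items() if dim in v)
    norm_u = math.sqrt(sum(weight * weight for weight in u.values()))
    norm_v = math.sqrt(sum(weight * weight for weight in v.values()))
    if norm_u == 0 or norm_v == 0:
        return 0.0
    return dot / (norm_u * norm_v)

# sketch_sparsecosine({1: 1.0, 5: 2.0}, {5: 1.0}) -> 2.0 / sqrt(5) ~ 0.894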
if False:  # show that lexical stats and frequency weighting work
    for i in space.items():
        print(i, space.globalfrequency[i], space.bign,
              space.frequencyweight(i), sep="\t")
if False:
    for probe in ["jussi", "boat", "fun"]:
        # ["hearts", "turtle", "cat", "rabbit", "queen", "and", "off"]
        n = {}
        for v in vectorrepository:
            n[v] = sparsevectors.sparsecosine(space.indexspace[probe],
                                              vectorrepository[v])
        m = sorted(sentencerepository, key=lambda k: n[k], reverse=True)
        for mc in m:
            if n[mc] > 0.0001:
                print(probe, mc, n[mc], sentencerepository[mc])
        print(space.contexttoindexneighbourswithweights(probe))
for v in vectorrepository:
    print(v, sentencerepository[v], sep="\t", end="\t")
    # print(v, vectorrepository[v])
    ww = nltk.word_tokenize(sentencerepository[v])
    vec = sparsevectors.newemptyvector(dimensionality)
    # for www in ww:
    #     print(www, space.indexspace[www], space.globalfrequency[www],
    #           space.frequencyweight(www),
    #           sparsevectors.sparsecosine(space.indexspace[www],
    #                                      vectorrepository[v]))
    nvn = {}
    for www in ww:
def similarityM(self, item, anotheritem):
    return sparsevectors.sparsecosine(self.morphologyspace[item],
                                      self.morphologyspace[anotheritem])
def indextocontextsimilarity(self, item, anotheritem):
    if self.contains(item):
        return sparsevectors.sparsecosine(self.indexspace[item],
                                          self.contextspace[anotheritem])
    else:
        return 0.0
def similarity(self, vector, anothervector):
    return sparsevectors.sparsecosine(vector, anothervector)
logger("Cycle " + str(cycle) + " of " + str(cycles) + "tests.") items = list(targetspace.items()) logger( "Calculating neighbours for " + str(len(testitemspace.items())) + " test items and " + str(len(targetspace.items())) + " target items.", monitor) neighbours = {} for item in testers: neighbours[item] = {} for otheritem in targetspace.items(): if testitemspace.name[item] == targetspace.name[otheritem]: continue neighbours[item][otheritem] = sparsevectors.sparsecosine( testitemspace.indexspace[item], targetspace.indexspace[otheritem]) logger("Done calculating neighbours", monitor) logger("Pool depth " + str(itempooldepth), monitor) if averagelinkage: logger("Averagelinkage", monitor) if votelinkage: logger("Votelinkage", monitor) confusion = ConfusionMatrix() primeconfusion = ConfusionMatrix() targetscore = {} for item in testers: sortedneighbours = sorted(neighbours[item], key=lambda hh: neighbours[item][hh], reverse=True)[:itempooldepth]
for i in vecs.indexspace:
    p = 0
    n = 0
    negscore = "--"
    ampSscore = "--"
    ampTscore = "--"
    ampGscore = "--"
    dtscore = "--"
    wereon = False
    if vecs.globalfrequency[i] > 1:
        wereon = True
        ns = dict(vecs.contextneighbourswithweights(i, number))
        negscore = sparsevectors.sparsecosine(vecs.indexspace["JiKnegation"],
                                              vecs.contextspace[i])
        ampSscore = sparsevectors.sparsecosine(vecs.indexspace["JiKampsurprise"],
                                               vecs.contextspace[i])
        ampTscore = sparsevectors.sparsecosine(vecs.indexspace["JiKamptruly"],
                                               vecs.contextspace[i])
        ampGscore = sparsevectors.sparsecosine(vecs.indexspace["JiKampgrade"],
                                               vecs.contextspace[i])
        dtscore = sparsevectors.sparsecosine(vecs.indexspace["JiKhedge"],
                                             vecs.contextspace[i])
    if str(i).startswith("JiK"):
        ns = {}
        wereon = True
        for j in vecs.contextspace:
            ns[j] = sparsevectors.sparsecosine(vecs.indexspace[i],
                                               vecs.contextspace[j])
    if wereon:
        k = sorted(ns.items(), key=lambda kv: kv[1], reverse=True)[:number]
        for witem in k:
            if witem[0] in posattitudewordset:
                p += 1
            if witem[0] in negattitudewordset:
def similarityA(self, item, anotheritem):
    return sparsevectors.sparsecosine(self.attributespace[item],
                                      self.attributespace[anotheritem])
fvector = sparsevectors.normalise(fvector)
pvector = sparsevectors.normalise(stringspace.postriplevector(origtext))
avector = sparsevectors.sparseadd(
    pvector,
    sparsevectors.sparseadd(mvector,
                            sparsevectors.sparseadd(fvector, tvector)))
vector = fvector
tn = {}
mn = {}
fn = {}
an = {}
nofn = 3
for otheritem in fullspace.items():
    if otheritem == newtest:
        continue
    tn[otheritem] = sparsevectors.sparsecosine(tvector,
                                               fullspace.indexspace[otheritem])
    mn[otheritem] = sparsevectors.sparsecosine(mvector,
                                               fullspace.indexspace[otheritem])
    fn[otheritem] = sparsevectors.sparsecosine(fvector,
                                               fullspace.indexspace[otheritem])
    an[otheritem] = sparsevectors.sparsecosine(avector,
                                               fullspace.indexspace[otheritem])
logger(str(newtest) + "\t" + textdepot[newtest], debug)
tnn = sorted(tn, key=lambda i: tn[i], reverse=True)[:nofn]
logger(str(tnn), debug)
for o in tnn:
    logger("\t" + str(o) + "\t" + str(tn[o]) + "\t" + textdepot[o], debug)
mnn = sorted(mn, key=lambda i: mn[i], reverse=True)[:nofn]
logger(str(mnn), debug)
for o in mnn:
    logger("\t" + str(o) + "\t" + str(mn[o]) + "\t" + textdepot[o], debug)
def runbatchtest(fraction, n: int = 100):
    logger("{} {} {}".format(n, fraction, ticker), monitor)
    keylist = list(vectorrepositoryall.keys())[:n]
    random.shuffle(keylist)
    split = int(len(keylist) * fraction)
    train = keylist[:split]
    test = keylist[split:]
    logger("{} train vs {} test".format(len(train), len(test)), monitor)
    ones = []
    nils = []
    dummymaxconfusionmatrix = ConfusionMatrix()
    dummyrandomconfusionmatrix = ConfusionMatrix()
    centroidconfusionmatrix = ConfusionMatrix()
    poolconfusionmatrix = ConfusionMatrix()
    for trainitem in train:  # build the class centroids from the training split
        if illness[trainitem] == "1":
            ones.append(vectorrepositoryall[trainitem])
        else:
            nils.append(vectorrepositoryall[trainitem])
    onecentroid = sparsevectors.centroid(ones)
    nilcentroid = sparsevectors.centroid(nils)
    if len(nils) > len(ones):
        dummymaxguess = "0"
    else:
        dummymaxguess = "1"
    # factor = len(ones) / len(nils)  # no, bad idea, go for fifty-fifty
    factor = 1 / 2
    for testitem in test:
        dummymaxconfusionmatrix.addconfusion(illness[testitem], dummymaxguess)
        if random.random() > factor:
            dummyrandomguess = "0"
        else:
            dummyrandomguess = "1"
        dummyrandomconfusionmatrix.addconfusion(illness[testitem],
                                                dummyrandomguess)
        probe = vectorrepositoryall[testitem]
        resultc = "0"
        i1 = sparsevectors.sparsecosine(probe, onecentroid)
        n1 = sparsevectors.sparsecosine(probe, nilcentroid)
        if i1 > n1:
            resultc = "1"
        centroidconfusionmatrix.addconfusion(illness[testitem], resultc)
        probeneighbours = {}
        for targetitem in train:
            probeneighbours[targetitem] = sparsevectors.sparsecosine(
                probe, vectorrepositoryall[targetitem])
        sortedfriends = sorted(probeneighbours,
                               key=lambda hh: probeneighbours[hh],
                               reverse=True)[:pooldepth]
        illity = 0
        result = "0"
        for friend in sortedfriends:
            if illness[friend] == "1":
                illity += 1
        if illity > pooldepth * factor:
            result = "1"
        nullity = pooldepth - illity
        poolconfusionmatrix.addconfusion(illness[testitem], result)
        print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(
            testitem, illness[testitem], resultc, i1, n1, result, illity,
            nullity, pooldepth))
    print("RANDOM ----------------")
    dummyrandomconfusionmatrix.evaluate()
    print("MAX ----------------")
    dummymaxconfusionmatrix.evaluate()
    print("CENTROID ----------------")
    centroidconfusionmatrix.evaluate()
    print("NEIGHBOURS --------------")
    poolconfusionmatrix.evaluate()
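# Sketch of the centroid computation runbatchtest() depends on, under the
# same {dimension: weight} dict assumption as the cosine sketch above;
# sparsevectors.centroid itself is not shown in this code.
from collections import defaultdict

def sketch_centroid(vectors: list) -> dict:
    acc = defaultdict(float)
    for vec in vectors:
        for dim, weight in vec.items():
            acc[dim] += weight
    return {dim: weight / len(vectors) for dim, weight in acc.items()}

# sketch_centroid([{0: 1.0}, {0: 3.0, 2: 2.0}]) -> {0: 2.0, 2: 1.0}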
def contextneighbours(self, item, number=10):
    n = {}
    for i in self.contextspace:
        n[i] = sparsevectors.sparsecosine(self.contextspace[item],
                                          self.contextspace[i])
    return sorted(n, key=lambda k: n[k], reverse=True)[:number]
averagerankofauthorhit = 0
logger("Average linkage: " + str(averagelinkage) + " pool depth " +
       str(itempooldepth), monitor)
for authorindex in testvectors:
    logger(str(authorindex) + "\t" +
           str(facittable[authornametable[authorindex]]) + "===============",
           debug)
    targetscore = {}
    for target in targets:
        targetscore[target] = 0
    for testfile in testvectors[authorindex]:
        if averagelinkage:
            # take all test sentences and sum their scores
            for target in targets:
                targetscore[target] += sparsevectors.sparsecosine(
                    targetspace[target], testfile[1])
        elif maxlinkage:
            # use only the closest sentence to match scores
            for target in targets:
                a = sparsevectors.sparsecosine(targetspace[target], testfile[1])
                if a > targetscore[target]:
                    targetscore[target] = a
    sortedtargets = sorted(targets, key=lambda ia: targetscore[ia],
                           reverse=True)
    for rank in range(len(sortedtargets)):
        if sortedtargets[rank] == authorindex:
            averagerankofauthorhit += rank + 1
    targetvote = {}
    for target in targets:
        for cat in categories:
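# Toy contrast of the two linkage strategies in the fragment above (the
# scores are invented): average linkage accumulates a candidate's similarity
# over every test sentence, max linkage keeps only the best-matching one.
#
#     scores = [0.2, 0.7, 0.4]
#     sum(scores)   # average-linkage accumulator -> 1.3
#     max(scores)   # max-linkage score           -> 0.7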
def contexttoindexneighbourswithweights(self, item, number=10):
    n = {}
    for i in self.contextspace:
        n[i] = sparsevectors.sparsecosine(self.indexspace[item],
                                          self.contextspace[i])
    return sorted(n.items(), key=lambda kv: kv[1], reverse=True)[:number]
def similarity(self, item, anotheritem):
    # should be based on contextspace
    return sparsevectors.sparsecosine(self.indexspace[item],
                                      self.indexspace[anotheritem])