def textvector(self, string, frequencyweighting=True, loglevel=False):
    """Build a sparse vector for *string*.

    When self.window > 0 the string is represented by its character
    n-gram windows (each rendered with self.makevector); otherwise it is
    tokenized with nltk and represented by its word index vectors.
    Frequency weighting scales each contribution unless disabled.
    """
    acc = sparsevectors.newemptyvector(self.dimensionality)
    if self.window > 0:
        # Slide a character window of width self.window over the string.
        for gram in (string[i:i + self.window]
                     for i in range(len(string) - self.window + 1)):
            gramvector = sparsevectors.normalise(self.makevector(gram))
            factor = self.frequencyweight(gram) if frequencyweighting else 1
            logger(gram + " " + str(factor), loglevel)
            if loglevel:
                logger(str(sparsevectors.sparsecosine(acc, gramvector)),
                       loglevel)
            acc = sparsevectors.sparseadd(acc, gramvector, factor)
    else:
        words = nltk.word_tokenize(string)
        # Binary frequencies: count each word once per text.
        wordlist = set(words) if self.binaryfrequencies else words
        for w in wordlist:
            factor = self.frequencyweight(w) if frequencyweighting else 1
            if w in self.indexspace:
                self.observe(w)
            else:
                self.additem(w)
            acc = sparsevectors.sparseadd(
                acc, sparsevectors.normalise(self.indexspace[w]), factor)
    return acc
def processfile(file):
    """Read *file*, sentence-split it, parse each sentence semantically and
    accumulate per-sentence utterance vectors plus one text vector.

    Side effects: fills the module-level sentencestorage, utterancespace
    and textspace tables. Returns the accumulated text vector.

    Fixes: regex patterns are now raw strings ('\\s+' was a deprecated
    escape sequence), the file handle is released right after reading, and
    the bare except around the parser is narrowed to Exception.
    """
    global sentencestorage, utterancespace
    sentenceindex = 0
    textvector = wordspace.newemptyvector()
    with open(file, "r", encoding="utf-8") as textfile:
        rawtext = textfile.read().lower()
    # Collapse newlines, quotes and runs of whitespace into single spaces.
    rawtext = re.sub(r'\n', ' ', rawtext)
    rawtext = re.sub(r'\"', ' ', rawtext)
    rawtext = re.sub(r'\s+', ' ', rawtext)
    sents = sent_tokenize(rawtext)
    for sentence in sents:
        sentenceindex += 1
        sentencestorage[sentenceindex] = sentence
        allsurfacewords = nltk.word_tokenize(sentence)
        wordspace.chkwordspace(allsurfacewords, debug)
        analyses = []
        try:
            analyses = semanticdependencyparse.semanticdepparse(
                sentence.lower(), debug)
        except Exception:
            # Parser failures are logged and the sentence is skipped
            # (analyses stays empty).
            logger("PARSE ERROR " + str(sentenceindex) + "\t" + sentence,
                   error)
        kk = 0
        for analysis in analyses:
            words = analysis.values()
            wordspace.checkwordspacelist(words, debug)
            # Create a permutation for any previously unseen semantic role.
            for role in analysis:
                if role not in wordspace.permutationcollection:
                    wordspace.permutationcollection[
                        role] = sparsevectors.createpermutation(
                            wordspace.dimensionality)
            u = getvector(analysis, sentence)
            # NOTE(review): this discourse-context loop is dead code —
            # sentencesequence is 0 so the while condition is never true.
            # Kept (not removed) in case sentencesequence was meant to be
            # a configurable window size; confirm intent.
            win = 1
            sentencesequence = 0
            startindexforthistext = 0
            while win < sentencesequence:
                if sentenceindex - win > startindexforthistext:
                    u = sparsevectors.sparseadd(
                        u,
                        sparsevectors.permute(
                            sparsevectors.normalise(
                                utterancespace[sentenceindex - win]),
                            wordspace.permutationcollection["discourse"]))
                win += 1
            # NOTE(review): additional analyses of the same sentence get
            # fresh sentence indices, so utterancespace indices drift past
            # sentencestorage's — confirm this is intended.
            if kk > 0:
                sentenceindex += 1
            utterancespace[sentenceindex] = u
            textvector = sparsevectors.sparseadd(textvector, u, 1)
            kk += 1
    textspace[file] = textvector
    return textvector
def observecollocation(self, item, otheritem, operator="nil"):
    """Record a co-occurrence: add otheritem's normalised index vector
    into item's context vector, creating either item if unseen.

    The operator argument is accepted for interface compatibility but is
    not used.
    """
    for thing in (item, otheritem):
        if not self.contains(thing):
            self.additem(thing)
    self.contextspace[item] = sparsevectors.sparseadd(
        self.contextspace[item],
        sparsevectors.normalise(self.indexspace[otheritem]))
def applyoperator(self, item, operator, constant, weight):
    """Add the *constant* vector, permuted by *operator*, into item's
    context vector and into either its morphology or attribute vector.

    Improvement: the permuted, normalised constant vector is identical in
    all the additions, so it is computed once instead of up to three times.
    """
    increment = sparsevectors.normalise(
        sparsevectors.permute(self.constantcollection[constant],
                              self.permutationcollection[operator]))
    self.contextspace[item] = sparsevectors.sparseadd(
        self.contextspace[item], increment, weight)
    if operator == "morphology":
        self.morphologyspace[item] = sparsevectors.sparseadd(
            self.morphologyspace[item], increment, weight)
    else:
        # Any operator other than "morphology" is treated as an attribute.
        self.attributespace[item] = sparsevectors.sparseadd(
            self.attributespace[item], increment, weight)
def addintoitem(self, item, vector, weight=1, operator=None):
    """Accumulate *vector* (optionally permuted by *operator*) into the
    context vector of *item*, creating the item if it is unseen, and mark
    the space as changed."""
    if not self.contains(item):
        self.additem(item)
    contribution = (vector if operator is None
                    else sparsevectors.permute(vector, operator))
    self.contextspace[item] = sparsevectors.sparseadd(
        self.contextspace[item],
        sparsevectors.normalise(contribution),
        weight)
    self.changed = True
def addintoitem(self, item, vector, weight=1):
    """Accumulate *vector* into the context vector of *item*, initialising
    the item's index, frequency and context entries when unseen.

    Bug fix: the original reassigned the *vector* parameter to the fresh
    random index vector, so for a previously unseen item the caller's
    vector was silently discarded and the random index vector was added
    to the context instead — inconsistent with the existing-item path and
    with the sibling addintoitem variant. The fresh index vector now has
    its own name.
    """
    if not self.contains(item):
        newindex = sparsevectors.newrandomvector(self.dimensionality,
                                                 self.denseness)
        self.indexspace[item] = newindex
        self.globalfrequency[item] = 0
        self.contextspace[item] = sparsevectors.newemptyvector(
            self.dimensionality)
    self.contextspace[item] = sparsevectors.sparseadd(
        self.contextspace[item], sparsevectors.normalise(vector), weight)
def sequencevector(self, sequence, initialvector=None, loglevel=False):
    """Fold the windows of *sequence* into one sparse vector, starting
    from *initialvector* (a fresh empty vector when omitted)."""
    acc = (sparsevectors.newemptyvector(self.dimensionality)
           if initialvector is None else initialvector)
    windowlist = self.windows(sequence)
    logger(str(windowlist), loglevel)
    for w in windowlist:
        windowvector = self.onesequencevector(w, None, loglevel)
        acc = sparsevectors.sparseadd(acc,
                                      sparsevectors.normalise(windowvector))
    return acc
def postriplevector(self, text, poswindow=3):
    # Encode the part-of-speech n-gram structure of *text* as a sparse
    # vector: each POS tag in each window applies its permutation to a
    # running vector, which is then added into the result.
    poses = nltk.pos_tag(text)
    # NOTE(review): the "+ 2" lets window start indices run past the last
    # full window; Python slicing clamps, so the final two windows are
    # shorter than poswindow — confirm the short tail windows are intended.
    windows = [poses[ii:ii + poswindow] for ii in range(len(poses) - poswindow + 1 + 2)]
    # Seed for the permutation chain — presumably a designated entry of
    # the permutation table; TODO confirm "vector" holds a vector here.
    onevector = self.pospermutations["vector"]
    vector = sparsevectors.newemptyvector(self.dimensionality)
    for sequence in windows:
        for item in sequence:
            # item is a (token, tag) pair from nltk.pos_tag; tag
            # permutations are created lazily on first sight.
            if item[1] not in self.pospermutations:
                self.pospermutations[item[1]] = sparsevectors.createpermutation(self.dimensionality)
            # NOTE(review): onevector is never reset between windows, so
            # permutations accumulate across the whole text — verify this
            # is the intended encoding.
            onevector = sparsevectors.permute(onevector, self.pospermutations[item[1]])
        vector = sparsevectors.sparseadd(vector, onevector)
    return vector
def processsentences(sents, testing=True):
    # Featurise each sentence and store five vector representations of it
    # (lexical index, POS sequence, their sum, construction-grammar, and
    # semantic-role) in the module-level repositories, logging pairwise
    # cosines along the way.
    # NOTE(review): the `testing` parameter and the declared global
    # `sequencelabels` are unused in this body — confirm before removing.
    global sentencerepository, vectorrepositoryidx, featurerepository, index, ticker, sequencelabels, vectorrepositoryseq
    for s in sents:
        index += 1
        key = "s" + str(index)
        # Skip sentences already stored; note the index number is still
        # consumed for duplicates.
        if s in sentencerepository.values():
            continue
        fs = featurise(s)
        logger(s, debug)
        fcxg = fs["features"]
        fpos = fs["pos"]
        fsem = fs["roles"]
        fwds = fs["words"]
        logger(fwds, debug)
        logger(fpos, debug)
        logger(fcxg, debug)
        logger(fsem, debug)
        # Lexical (index) vector and POS-sequence vector, then their sum.
        vecidx = tokenvector(fwds, None, True, debug)
        vecseq = seq.sequencevector(fpos, None, debug)
        vecis = sparsevectors.sparseadd(vecidx, vecseq, 1, True)
        logger("idx - comb\t" + str(sparsevectors.sparsecosine(vecidx, vecis)), debug)
        logger("seq - comb\t" + str(sparsevectors.sparsecosine(vecseq, vecis)), debug)
        # Construction-grammar features folded on top of the combined vector.
        veccxg = tokenvector(fcxg, vecis, False, debug)
        logger("comb - cxg\t" + str(sparsevectors.sparsecosine(vecis, veccxg)), debug)
        logger("idx - cxg\t" + str(sparsevectors.sparsecosine(vecidx, veccxg)), debug)
        logger("seq - cxg\t" + str(sparsevectors.sparsecosine(veccxg, vecseq)), debug)
        # Semantic-role vector built on top of the cxg vector.
        vecsem = rolevector(fsem, veccxg, debug)
        logger("idx - sem\t" + str(sparsevectors.sparsecosine(vecidx, vecsem)), debug)
        logger("seq - sem\t" + str(sparsevectors.sparsecosine(vecseq, vecsem)), debug)
        logger("comb - sem\t" + str(sparsevectors.sparsecosine(vecis, vecsem)), debug)
        logger("cxg - sem\t" + str(sparsevectors.sparsecosine(veccxg, vecsem)), debug)
        sentencerepository[key] = s
        vectorrepositoryidx[key] = vecidx
        vectorrepositoryseq[key] = vecseq
        vectorrepositorycxg[key] = veccxg
        vectorrepositorysem[key] = vecsem
        featurerepository[key] = fs
        logger(str(key) + ":" + str(s) + "->" + str(fs), debug)
        # Restart the CoreNLP client periodically, presumably to bound its
        # memory use — TODO confirm.
        if ticker > 1000:
            logger(str(index) + " sentences processed", monitor)
            squintinglinguist.restartCoreNlpClient()
            ticker = 0
        ticker += 1
def rolevector(roledict, initialvector=None, loglevel=False):
    """Fold a role -> items mapping into one sparse vector: each item's
    index vector is transformed by its role's operator, normalised, and
    added onto *initialvector* (fresh empty vector when omitted)."""
    vec = (sparsevectors.newemptyvector(dimensionality)
           if initialvector is None else initialvector)
    for role, items in roledict.items():
        for item in items:
            ctxspace.observe(item, False, debug)
            previous = vec
            vec = sparsevectors.sparseadd(
                vec,
                sparsevectors.normalise(
                    ctxspace.useoperator(ctxspace.indexspace[item], role)))
            if loglevel:
                logger(
                    role + " " + item + " " +
                    str(sparsevectors.sparsecosine(previous, vec)), loglevel)
    return vec
def tweetvector(string):
    """Represent *string* as the frequency-weighted sum of its character
    n-gram vectors; cached n-grams come from ngramspace, unseen ones are
    composed on the fly by stringspace."""
    accumulated = sparsevectors.newemptyvector(ngramspace.dimensionality)
    if window <= 0:
        return accumulated
    for start in range(len(string) - window + 1):
        gram = string[start:start + window]
        if ngramspace.contains(gram):
            piece = ngramspace.indexspace[gram]
        else:
            piece = stringspace.makevector(gram)
        accumulated = sparsevectors.sparseadd(
            accumulated,
            sparsevectors.normalise(piece),
            ngramspace.frequencyweight(gram))
    return accumulated
def makevector(self, string):
    """Compose a holistic sparse vector for *string* by shifting and
    adding per-character index vectors.

    The string is processed back to front so that strings sharing a
    prefix end up with similar vectors. Unseen characters get a fresh
    random ternary vector (half +1s, half -1s at random positions).
    """
    result = {}
    for ch in reversed(string):
        if ch not in self.indexspace:
            # Draw denseness distinct positions, shuffle, and split them
            # half +1 / half -1. (Call order of random.sample and
            # random.shuffle is kept so seeded runs reproduce.)
            positions = random.sample(list(range(self.dimensionality)),
                                      self.denseness)
            random.shuffle(positions)
            half = self.denseness // 2
            charvec = {}
            for p in positions[:half]:
                charvec[p] = 1
            for p in positions[half:]:
                charvec[p] = -1
            self.indexspace[ch] = charvec
            self.globalfrequency[ch] = 1
            self.bign += 1
        result = sparsevectors.sparseadd(
            sparsevectors.sparseshift(result, self.dimensionality),
            self.indexspace[ch])
    return result
def tokenvector(tokenlist, initialvector=None, weights=True, loglevel=False):
    """Fold the context vectors of *tokenlist* into one sparse vector on
    top of *initialvector* (fresh empty vector when omitted).

    Items whose string form starts with "JiK" (cxg features) always get
    weight 1; other items get language-model frequency weights when
    *weights* is on.
    """
    vec = (sparsevectors.newemptyvector(dimensionality)
           if initialvector is None else initialvector)
    for token in tokenlist:
        if weights and not str(token).startswith("JiK"):
            w = ctxspace.languagemodel.frequencyweight(token, True)
        else:
            w = 1
        ctxspace.observe(token, True)
        before = vec
        vec = sparsevectors.sparseadd(
            before,
            sparsevectors.normalise(ctxspace.contextspace[token]),
            w)
        if loglevel:
            logger(
                token + " " + str(w) + " " +
                str(sparsevectors.sparsecosine(before, vec)), loglevel)
    return vec
def textvector(self, words, frequencyweighting=True, binaryfrequencies=False, loglevel=False):
    # Build a sparse vector for the already-tokenized *words*, adding each
    # word's normalised index vector (frequency-weighted unless disabled),
    # and bump the document counter.
    self.docs += 1
    uvector = sparsevectors.newemptyvector(self.dimensionality)
    if binaryfrequencies:
        wordlist = set(words)  # not a list, a set but hey
    else:
        wordlist = words
    for w in wordlist:
        if frequencyweighting:
            factor = self.frequencyweight(w)
        else:
            factor = 1
        if w not in self.indexspace:
            self.additem(w)
        else:
            self.observe(w)
            # NOTE(review): placement reconstructed from a collapsed
            # source line — as written, df is bumped only for words that
            # already exist, so a brand-new word gets no df entry here;
            # confirm that additem initialises self.df, and whether this
            # increment was meant to apply to every word.
            self.df[w] += 1
        uvector = sparsevectors.sparseadd(
            uvector, sparsevectors.normalise(self.indexspace[w]), factor)
    return uvector
def getvector(roleworddict, sentencestring):
    # Build one composite sparse vector for a sentence from four sources:
    # (1) role-permuted index vectors of the role fillers, (2) word-order
    # windows, (3) POS-tag windows, and (4) coarse stylistic markers.
    uvector = {}  # vector for test item
    # (1) Each role filler's index vector, permuted by its role.
    for role in roleworddict:
        item = roleworddict[role]
        uvector = sparsevectors.sparseadd(
            uvector,
            sparsevectors.permute(
                sparsevectors.normalise(wordspace.indexspace[item]),
                wordspace.permutationcollection[role]),
            wordspace.frequencyweight(item))
    # (2) Word-order windows folded with the "sequence" permutation.
    lexicalwindow = 1
    if lexicalwindow > 0:
        wds = word_tokenize(sentencestring.lower())
        windows = [
            wds[i:i + lexicalwindow]
            for i in range(len(wds) - lexicalwindow + 1)
        ]
        for sequence in windows:
            thisvector = {}
            for item in sequence:
                thisvector = sparsevectors.sparseadd(
                    sparsevectors.permute(
                        thisvector,
                        wordspace.permutationcollection["sequence"]),
                    wordspace.indexspace[item],
                    wordspace.frequencyweight(item))
            uvector = sparsevectors.sparseadd(
                uvector, sparsevectors.normalise(thisvector))
    # (3) POS-tag windows, same folding scheme.
    pos = 1
    if pos > 0:
        wds = word_tokenize(sentencestring)
        posanalyses = nltk.pos_tag(wds)
        poslist = [i[1] for i in posanalyses]
        # NOTE(review): these windows reuse lexicalwindow as their width —
        # confirm a separate POS window size was not intended.
        windows = [
            poslist[i:i + lexicalwindow]
            for i in range(len(poslist) - lexicalwindow + 1)
        ]
        for sequence in windows:
            thisvector = {}
            for item in sequence:
                thisvector = sparsevectors.sparseadd(
                    sparsevectors.permute(
                        thisvector,
                        wordspace.permutationcollection["sequence"]),
                    wordspace.indexspace[item],
                    wordspace.frequencyweight(item))
            uvector = sparsevectors.sparseadd(
                uvector, sparsevectors.normalise(thisvector))
    # (4) Stylistic markers: sentence length, adverbs, negations, hedges,
    # amplifiers.
    style = True
    if style:
        wds = word_tokenize(sentencestring)
        # NOTE(review): cpw (characters per word) is computed but unused.
        cpw = len(sentencestring) / len(wds)
        wps = len(wds)
        sl = True
        if sl:
            if wps > 8:
                uvector = sparsevectors.sparseadd(uvector,
                                                  longsentencevector)
            if wps < 5:
                uvector = sparsevectors.sparseadd(uvector,
                                                  shortsentencevector)
        posanalyses = nltk.pos_tag(wds)
        poslist = [i[1] for i in posanalyses]
        for poses in poslist:
            # One adverbvector contribution per adverb tag occurrence.
            if poses == "RB" or poses == "RBR" or poses == "RBS":
                uvector = sparsevectors.sparseadd(uvector, adverbvector)
        for w in wds:
            if w in negationlist:
                uvector = sparsevectors.sparseadd(uvector, negationvector)
            if w in hedgelist:
                uvector = sparsevectors.sparseadd(uvector, hedgevector)
            if w in amplifierlist:
                uvector = sparsevectors.sparseadd(uvector, amplifiervector)
    # attitude terms
    # verb stats
    # seq newordgrams
    # verb classes use wordspace!
    # sent sequences
    return uvector
# NOTE(review): top-level reporting fragment; `probe`, `mc`, `n`, `space`,
# and the repositories are defined earlier in the script, outside this
# excerpt. Indentation reconstructed from a collapsed source line.
print(probe, mc, n[mc], sentencerepository[mc])
print(space.contexttoindexneighbourswithweights(probe))
for v in vectorrepository:
    print(v, sentencerepository[v], sep="\t", end="\t")
    # print(v, vectorrepository[v])
    ww = nltk.word_tokenize(sentencerepository[v])
    vec = sparsevectors.newemptyvector(dimensionality)
    # for www in ww:
    #     print(www, space.indexspace[www], space.globalfrequency[www], space.frequencyweight(www), sparsevectors.sparsecosine(space.indexspace[www], vectorrepository[v]))
    # Cosine of each word's index vector against the sentence vector.
    nvn = {}
    for www in ww:
        nvn[www] = sparsevectors.sparsecosine(space.indexspace[www], vectorrepository[v])
        vec = sparsevectors.sparseadd(
            vec, sparsevectors.normalise(space.indexspace[www]),
            space.frequencyweight(www))
    # Print the five words most similar to the sentence vector.
    # NOTE(review): the loop variable `mc` clobbers the outer `mc` above.
    m = sorted(ww, key=lambda k: nvn[k], reverse=True)[:5]
    for mc in m:
        if nvn[mc] > 0.0001:
            print(mc, nvn[mc], sep=":", end="\t")
    print()
if False:
    # Disabled debugging dump of the whole vector space.
    for w in space.items():
        print(w, space.globalfrequency[w], space.indexspace[w], sep="\t")
        print("\t\t", space.contextspace[w])
# show that constructional items work the same way
# show that permuted semantic roles work "semantic grep"
# NOTE(review): fragment of a training loop — the enclosing iterations
# (per-author file loop and the `for b in ...` document loop) begin outside
# this excerpt, so the original nesting cannot be recovered; the statements
# are reproduced flat. Confirm structure against the full file.
targetspace[textindex] = sparsevectors.newemptyvector(
    ngramspace.dimensionality)
categorytable[textindex] = facittable[
    authornametable[authorindex]]  # name space collision for keys
avector = tweetvector(b.text)
thesevectors.append((targetlabel, avector))
if len(thesevectors) > 0:
    # Shuffle and split this author's vectors into test and train parts.
    random.shuffle(thesevectors)
    split = int(len(thesevectors) * testtrainfraction)
    testvectors[authorindex] = thesevectors[:split]
    testvectorantal += len(testvectors[authorindex])
    trainvectors[authorindex] = thesevectors[split:]
    trainvectorantal += len(trainvectors[authorindex])
    # Accumulate each training vector into its label's target vector.
    for tv in trainvectors[authorindex]:
        targetspace[tv[0]] = sparsevectors.sparseadd(
            targetspace[tv[0]], tv[1])
logger("Done training files.", monitor)
if outputmodel:
    # output character patterns to be able to generate new tweetvectors for separate testing on trained data
    stringspace.saveelementspace(charactervectorspacefilename)
    # output model here with info about the category of each model item
    with open(categorymodelfilename, "wb") as outfile:
        pickle.dump(targetspace, outfile)
logger(
    "Testing targetspace with " + str(len(targetspace)) + " categories, " +
    str(testvectorantal) + " test items and " + str(trainvectorantal) +
    " training cases. ", monitor)
confusion = ConfusionMatrix()
# NOTE(review): fragment of a corpus-loading section; `textfile`,
# `textindex`, the spaces and depots are defined outside this excerpt.
e = xml.etree.ElementTree.parse(textfile).getroot()
for b in e.iter("document"):
    textindex += 1
    # Raw-text vector.
    tvector = sparsevectors.normalise(
        stringspace.textvector(b.text, frequencyweighting))
    textspace.additem(textindex, tvector)
    # Vector of the linguistically generalised text.
    newtext = squintinglinguist.generalise(b.text)
    mvector = sparsevectors.normalise(
        stringspace.textvector(newtext, frequencyweighting))
    modifiedtextspace.additem(textindex, mvector)
    # Frequency-weighted sum of feature vectors.
    features = squintinglinguist.featurise(b.text)
    fvector = sparsevectors.newemptyvector(dimensionality)
    for feature in features:
        fv = stringspace.getvector(feature)
        fvector = sparsevectors.sparseadd(fvector,
                                          sparsevectors.normalise(fv),
                                          stringspace.frequencyweight(feature))
    fvector = sparsevectors.normalise(fvector)
    squintfeaturespace.additem(textindex, fvector)
    # POS-pattern vector, then the sum of all four representations.
    pvector = sparsevectors.normalise(stringspace.postriplevector(b.text))
    avector = sparsevectors.sparseadd(
        pvector,
        sparsevectors.sparseadd(mvector,
                                sparsevectors.sparseadd(fvector, tvector)))
    fullspace.additem(textindex, avector)
    textdepot[textindex] = b.text
    modifiedtextdepot[textindex] = newtext
    featuredepot[textindex] = features
logger("Done making " + str(textindex) + " vectors.", monitor)
matrix = False
if matrix:
    # NOTE(review): the body of this branch continues beyond this excerpt.
# NOTE(review): fragment of a per-author file loop; `file`, `authorindex`,
# `nn` and the boolean switches (fulltext, generalise, featurise,
# postriples) are defined outside this excerpt.
authorname = file.split(".")[0].split("/")[-1]
authorindex += 1
logger("Reading " + str(authorindex) + " " + file, monitor)
workingvector = sparsevectors.newemptyvector(dimensionality)
e = xml.etree.ElementTree.parse(file).getroot()
for b in e.iter("document"):
    origtext = b.text
    avector = sparsevectors.newemptyvector(dimensionality)
    # Each enabled representation is folded into avector.
    if fulltext:
        avector = sparsevectors.normalise(
            stringspace.textvector(origtext, frequencyweighting))
    if generalise:
        newtext = squintinglinguist.generalise(origtext)
        avector = sparsevectors.sparseadd(
            avector,
            sparsevectors.normalise(
                stringspace.textvector(newtext, frequencyweighting)))
    if featurise:
        features = squintinglinguist.featurise(origtext)
        for feature in features:
            fv = stringspace.getvector(feature)
            avector = sparsevectors.sparseadd(
                avector, sparsevectors.normalise(fv),
                stringspace.frequencyweight(feature))
    if postriples:
        posttriplevector = stringspace.postriplevector(origtext)
        avector = sparsevectors.sparseadd(
            avector, sparsevectors.normalise(posttriplevector))
    workingvector = sparsevectors.sparseadd(
        workingvector, sparsevectors.normalise(avector))
    nn += 1
def doallthefiles(rangelimit=4000):
    """Merge per-category tagging files into combined corpora and vectors.

    For every file index up to *rangelimit* that has a file for every
    category, merge the category files, write one output file per tag
    condition (word+pos, word+dep, ... as listed in ``conditions``),
    count the condition vocabularies, and finally write composite sparse
    vectors for each condition.

    Fixes over the previous version:
    - ``except OSError`` replaces the bare ``except`` around ``os.stat``;
    - incomplete indices are dropped *after* the category loop (the old
      code ``del``-eted ``filelist[ix]`` inside the loop, so a later
      missing-or-present category for the same index hit a ``KeyError``);
    - the seven per-condition output files are managed by an ``ExitStack``
      so they are closed even if writing fails;
    - the per-condition bookkeeping is table-driven instead of seven
      copy-pasted stanzas.
    """
    from contextlib import ExitStack  # stdlib; local to leave module imports untouched

    conditions = ["wp", "wd", "wn", "wdp", "wnp", "wnd", "wndp"]
    # Which columns of a merged item each condition combines:
    # 0 = word, 1/2/3 = the three tag layers.
    columns = {
        "wp": (0, 1),
        "wd": (0, 2),
        "wn": (0, 3),
        "wnp": (0, 1, 2),
        "wnd": (0, 1, 3),
        "wdp": (0, 2, 3),
        "wndp": (0, 1, 2, 3),
    }

    filelist = {}
    seenfile = {}
    antal_frag = 0
    for ix in range(rangelimit):
        filelist[ix] = {}
        seenfile[ix] = True
        for cat in categories:
            fn = "{}{}.of_{:0>4d}.json.txt".format(path, cat, ix)
            try:
                os.stat(fn)
                filelist[ix][cat] = fn
            except OSError:
                seenfile[ix] = None
                filelist[ix][cat] = None
                logger(
                    "index {} did not match up {} file: {}".format(
                        ix, cat, fn), error)
        if seenfile[ix] is None:
            # Drop incomplete indices once every category has been probed.
            del filelist[ix]
    logger("antal filer: {}".format(len(filelist)), monitor)

    vocabulary = {cc: Counter() for cc in conditions}
    vocabulary_words = Counter()
    vocabulary_labels = Counter()

    for fileindex in filelist:
        if not seenfile[fileindex]:
            continue
        zippy = mergefiles(filelist[fileindex][categories[0]],
                           filelist[fileindex][categories[1]],
                           filelist[fileindex][categories[2]],
                           filelist[fileindex][categories[3]])
        with ExitStack() as stack:
            outfiles = {
                cc: stack.enter_context(
                    open(
                        '{}{}/new_{:0>4d}.txt'.format(outpath, cc,
                                                      fileindex), "w+"))
                for cc in conditions
            }
            for fragment in zippy:
                antal_frag += 1
                outfrag = {cc: [] for cc in conditions}
                for oneitem in fragment:
                    vocabulary_words.update([oneitem[0]])
                    vocabulary_labels.update(
                        [oneitem[1], oneitem[2], oneitem[3]])
                    for cc in conditions:
                        parts = [oneitem[col] for col in columns[cc]]
                        vocabulary[cc].update([joinstring.join(parts)])
                        outfrag[cc].append("".join(parts))
                for cc in conditions:
                    outfiles[cc].write(" ".join(outfrag[cc]) + "\n")
    logger("antal fragment: {}".format(antal_frag), monitor)

    # Random embeddings for sufficiently frequent plain words.
    vocab_words = {w for w, c in vocabulary_words.items() if c >= MINCOUNT}
    logger("antal ord std: {}".format(len(vocab_words)), monitor)
    embeddings = {
        w: sparsevectors.newrandomvector(dimensionality, density)
        for w in vocab_words
    }

    # Random embeddings for sufficiently frequent tags.
    vocab_labels = {w for w, c in vocabulary_labels.items() if c >= MINCOUNT}
    logger("antal tag tot: {}".format(len(vocab_labels)), monitor)
    labelembeddings = {}
    for w in vocab_labels:
        try:
            labelembeddings[w] = sparsevectors.newrandomvector(
                dimensionality, labeldensity)
        except IndexError:
            logger("Indexerror: {}".format(w), error)

    # Per-condition composite vectors: the word's vector plus its tags'.
    for cc in conditions:
        vocab_cc = {w for w, c in vocabulary[cc].items() if c >= MINCOUNT}
        compositeembeddings = {}
        logger("antal ord i {}: {}".format(cc, len(vocab_cc)), monitor)
        with open('{}{}/vocab.words.txt'.format(outpath, cc), "w+") as f:
            for wdl in sorted(vocab_cc):
                pieces = wdl.split(joinstring)
                wd = "".join(pieces)
                f.write('{}\n'.format(wd))
                # NOTE(review): like the original, this raises KeyError if a
                # composite passes MINCOUNT while its word or one of its
                # tags did not — confirm whether that should be guarded.
                vv = embeddings[pieces[0]]
                for ll in pieces[1:]:
                    vv = sparsevectors.sparseadd(vv, labelembeddings[ll])
                compositeembeddings[wd] = sparsevectors.listify(
                    sparsevectors.normalise(vv), dimensionality)
        with open('{}{}/compositevectors.txt'.format(outpath, cc),
                  "w+") as f:
            for www in compositeembeddings:
                f.write("{} {}\n".format(
                    www, " ".join(map(str, compositeembeddings[www]))))