def processsentences(ftag: str = "erisk", batch: int = 200):
    """Vectorise every sentence already held in the global sentencerepository.

    For each stored sentence the linguistic front end (squintinglinguist)
    produces a feature bundle; four sparse vectors are built from it — surface
    words, part-of-speech sequence, construction features, semantic roles —
    each layered on top of the previous one, and the final role vector is
    stored in vectorrepositoryall under the sentence key.  Every 100 sentences
    the accumulated vectors are dumped to a tab-separated file and the CoreNLP
    client is restarted; every `batch` sentences an intermediate evaluation
    (runbatchtest) is executed, and once more after the loop completes.

    NOTE(review): a second, signature-incompatible `processsentences(sents,
    testing)` is defined later in this module and will shadow this one at
    import time — confirm which definition callers actually intend.

    :param ftag: tag embedded in the names of the periodic vector-dump files.
    :param batch: interval (in sentences) between intermediate evaluations.
    """
    # All working state is module-level: sentence store, gold labels
    # (illness), the output vector store, and the progress counter.
    global sentencerepository, illness, vectorrepositoryall, featurerepository, ticker, fraction
    for key in sentencerepository:
        sentence = sentencerepository[key]
        fs = squintinglinguist.featurise(sentence)
        logger(sentence, debug)
        try:
            fcxg = fs["features"]
            fpos = fs["pos"]
            fsem = fs["roles"]
            fwds = fs["words"]
            # Representations are chained: each builder receives the previous
            # vector and layers its own information on top of it.
            vecidx = tokenvector(fwds, None, True, debug)
            vecseq = seq.sequencevector(fpos, vecidx, debug)
            veccxg = tokenvector(fcxg, vecseq, False, debug)
            vecsem = rolevector(fsem, veccxg, debug)
            # Only the final, fully layered role vector is kept.
            vectorrepositoryall[key] = vecsem
        except KeyError:
            # Deliberate best effort: a sentence whose feature bundle lacks
            # one of the expected fields is skipped (no vector stored).
            pass
        ticker += 1
        logger("{} {}: {} -> {}".format(ticker, key, sentence, fs), debug)
        if ticker % 100 == 0:
            logger(str(ticker) + " sentences processed", monitor)
            # Checkpoint: dump every vector produced so far, one line per
            # sentence: key <TAB> gold label <TAB> vector.
            with open("{}{}.{}.vectors".format(outputdirectory, ftag, ticker), "w+") as outputfile:
                for item in vectorrepositoryall:
                    outputfile.write("{}\t{}\t{}\n".format(
                        item, illness[item], vectorrepositoryall[item]))
            # Periodic restart — presumably to curb CoreNLP server memory
            # growth on long runs; TODO confirm.
            squintinglinguist.restartCoreNlpClient()
        if ticker % batch == 0:
            runbatchtest(fraction, batch)
    logger(str(ticker) + " sentences processed", monitor)
    # Final evaluation over everything processed in this run.
    runbatchtest(fraction, len(sentencerepository))
def processsentences(sents, testing=True):
    """Featurise and vectorise a batch of new sentences, caching all results.

    Each sentence gets a fresh key "s<index>".  Five parallel module-level
    repositories are populated per sentence: the raw text, the word-index
    vector, the PoS-sequence vector, the construction vector, the semantic
    role vector, and the full feature bundle.  Pairwise cosine similarities
    between the intermediate representations are emitted at debug level.

    NOTE(review): this shadows the earlier `processsentences(ftag, batch)`
    definition in this module — confirm which one callers intend.
    NOTE(review): `testing` is never read in this body — looks vestigial;
    at least one caller appears to pass `index` in this position.

    :param sents: iterable of sentence strings to process.
    :param testing: unused in the visible body.
    """
    global sentencerepository, vectorrepositoryidx, featurerepository, index, ticker, sequencelabels, vectorrepositoryseq
    for s in sents:
        index += 1
        key = "s" + str(index)
        # Skip sentences already cached.  NOTE(review): `index` was bumped
        # before this check, so duplicates leave gaps in the key sequence,
        # and `.values()` makes this an O(n) scan per sentence.
        if s in sentencerepository.values():
            continue
        fs = featurise(s)
        logger(s, debug)
        fcxg = fs["features"]
        fpos = fs["pos"]
        fsem = fs["roles"]
        fwds = fs["words"]
        logger(fwds, debug)
        logger(fpos, debug)
        logger(fcxg, debug)
        logger(fsem, debug)
        # Word and PoS-sequence vectors are built independently, then summed
        # into a combined vector; construction and role vectors are layered
        # on top of that combination.
        vecidx = tokenvector(fwds, None, True, debug)
        vecseq = seq.sequencevector(fpos, None, debug)
        vecis = sparsevectors.sparseadd(vecidx, vecseq, 1, True)
        # Diagnostic cosines between every pair of representations.
        logger("idx - comb\t" + str(sparsevectors.sparsecosine(vecidx, vecis)), debug)
        logger("seq - comb\t" + str(sparsevectors.sparsecosine(vecseq, vecis)), debug)
        veccxg = tokenvector(fcxg, vecis, False, debug)
        logger("comb - cxg\t" + str(sparsevectors.sparsecosine(vecis, veccxg)), debug)
        logger("idx - cxg\t" + str(sparsevectors.sparsecosine(vecidx, veccxg)), debug)
        logger("seq - cxg\t" + str(sparsevectors.sparsecosine(veccxg, vecseq)), debug)
        vecsem = rolevector(fsem, veccxg, debug)
        logger("idx - sem\t" + str(sparsevectors.sparsecosine(vecidx, vecsem)), debug)
        logger("seq - sem\t" + str(sparsevectors.sparsecosine(vecseq, vecsem)), debug)
        logger("comb - sem\t" + str(sparsevectors.sparsecosine(vecis, vecsem)), debug)
        logger("cxg - sem\t" + str(sparsevectors.sparsecosine(veccxg, vecsem)), debug)
        # Cache every representation under the same key.  (Subscript
        # assignment mutates the module-level dicts; only rebound names such
        # as `index`/`ticker` need the `global` declaration above.)
        sentencerepository[key] = s
        vectorrepositoryidx[key] = vecidx
        vectorrepositoryseq[key] = vecseq
        vectorrepositorycxg[key] = veccxg
        vectorrepositorysem[key] = vecsem
        featurerepository[key] = fs
        logger(str(key) + ":" + str(s) + "->" + str(fs), debug)
        # Restart the CoreNLP client roughly every 1000 sentences —
        # presumably to cap server memory; TODO confirm.
        if ticker > 1000:
            logger(str(index) + " sentences processed", monitor)
            squintinglinguist.restartCoreNlpClient()
            ticker = 0
        ticker += 1
# Build one aggregate vector for an author from an XML file of <document>
# elements, combining up to four representations per document according to
# the boolean flags fulltext / generalise / featurise / postriples.
# NOTE(review): this span is the interior of a larger routine — `file`,
# `workingvector`, `nn`, `authorindex`, `authorname`, and the flags are all
# defined outside the visible code; confirm their initialisation there.
# NOTE(review): the flag named `featurise` shadows the module-level
# `featurise(...)` function used elsewhere in this file — verify intent.
e = xml.etree.ElementTree.parse(file).getroot()
for b in e.iter("document"):
    origtext = b.text
    # Start from an empty vector and add in each enabled representation.
    avector = sparsevectors.newemptyvector(dimensionality)
    if fulltext:
        # Surface-text vector of the raw document.
        avector = sparsevectors.normalise(
            stringspace.textvector(origtext, frequencyweighting))
    if generalise:
        # Vector of the linguistically generalised rewriting of the text.
        newtext = squintinglinguist.generalise(origtext)
        avector = sparsevectors.sparseadd(
            avector, sparsevectors.normalise(
                stringspace.textvector(newtext, frequencyweighting)))
    if featurise:
        # One weighted vector per extracted linguistic feature.
        features = squintinglinguist.featurise(origtext)
        for feature in features:
            fv = stringspace.getvector(feature)
            avector = sparsevectors.sparseadd(
                avector, sparsevectors.normalise(fv),
                stringspace.frequencyweight(feature))
    if postriples:
        # Part-of-speech triple vector of the raw text.
        posttriplevector = stringspace.postriplevector(origtext)
        avector = sparsevectors.sparseadd(
            avector, sparsevectors.normalise(posttriplevector))
    # Accumulate this document's (normalised) vector into the running
    # per-author vector.
    workingvector = sparsevectors.sparseadd(
        workingvector, sparsevectors.normalise(avector))
    nn += 1
# Register the finished author vector and its display name.
testitemspace.additem(authorindex, workingvector)
testitemspace.name[authorindex] = authorname
logger("Done building " + str(nn) + " vectors.", monitor)
# Process every input tweet file, then (optionally) run a small probe test:
# four hurricane-related probe sentences are vectorised and compared against
# the stored sentence repository under each representation.
# NOTE(review): this span ends mid-loop (`d1` has no visible consumer) — the
# remainder of the neighbour-collection logic lies outside the visible code.
for f in files:
    logger(f, monitor)
    sentences = simpletextfilereader.doonetweetfile(f, targetterms)
    # NOTE(review): the second positional argument lands in the `testing`
    # slot of `processsentences(sents, testing=True)` — passing `index`
    # there looks unintended; confirm against the intended signature.
    processsentences(sentences, index)
    # Persist the word space after each file, tagged with the running index.
    space.outputwordspace(outputdirectory + "/" + str(index) + ".wordspace")
pindex = 0
if runtest == True:
    for probe in [
            "i am afraid of the hurricane",
            "i said i was afraid the hurricane",
            "getting as far away from this hurricane as possible",
            "the storm is a bitch"]:
        pindex += 1
        feats = featurise(probe)
        pkey = "p" + str(pindex)
        # Standalone vectors for each representation of the probe…
        vecidx = tokenvector(feats["words"], None, True)
        vecseq = seq.sequencevector(feats["pos"], None)
        veccxg = tokenvector(feats["features"], None, False)
        vecsem = rolevector(feats["roles"], None)
        # …and a layered "total" vector (idx -> seq -> cxg -> roles).
        vec1 = seq.sequencevector(feats["pos"], vecidx)
        vec2 = tokenvector(feats["features"], vec1, False)
        vectot = rolevector(feats["roles"], vec2)
        # Neighbour accumulators, one per representation.
        neighboursByIdx = {}
        neighboursBySeq = {}
        neighboursByCxg = {}
        neighboursBySem = {}
        neighboursByTot = {}
        for v in sentencerepository:
            # NOTE(review): compares the probe's *word-index* vector against
            # the stored *semantic-role* vectors — verify this pairing.
            d1 = space.similarity(vecidx, vectorrepositorysem[v])
# Populate four parallel vector spaces from an XML file of <document>
# elements: raw text, generalised text, linguistic features, and the sum of
# all representations (plus PoS triples), keyed by a running text index.
# NOTE(review): `textindex` is incremented but initialised outside the
# visible code; `featuredepot` is created here but never filled in this
# span — presumably populated later.
textdepot = {}
modifiedtextdepot = {}
featuredepot = {}
e = xml.etree.ElementTree.parse(textfile).getroot()
for b in e.iter("document"):
    textindex += 1
    # Raw surface-text vector.
    tvector = sparsevectors.normalise(
        stringspace.textvector(b.text, frequencyweighting))
    textspace.additem(textindex, tvector)
    # Vector of the linguistically generalised rewriting.
    newtext = squintinglinguist.generalise(b.text)
    mvector = sparsevectors.normalise(
        stringspace.textvector(newtext, frequencyweighting))
    modifiedtextspace.additem(textindex, mvector)
    # Feature vector: weighted sum of one vector per extracted feature.
    features = squintinglinguist.featurise(b.text)
    fvector = sparsevectors.newemptyvector(dimensionality)
    for feature in features:
        fv = stringspace.getvector(feature)
        fvector = sparsevectors.sparseadd(fvector,
                                          sparsevectors.normalise(fv),
                                          stringspace.frequencyweight(feature))
    fvector = sparsevectors.normalise(fvector)
    squintfeaturespace.additem(textindex, fvector)
    # Combined vector: PoS triples + generalised + features + raw text.
    pvector = sparsevectors.normalise(stringspace.postriplevector(b.text))
    avector = sparsevectors.sparseadd(
        pvector,
        sparsevectors.sparseadd(mvector,
                                sparsevectors.sparseadd(fvector, tvector)))
    fullspace.additem(textindex, avector)
    # Keep the raw and generalised texts for later inspection.
    textdepot[textindex] = b.text
    modifiedtextdepot[textindex] = newtext