示例#1
0
def processsentences(ftag: str = "erisk", batch: int = 200):
    """Featurise every sentence in the global repository and build layered
    sparse vectors (words -> pos sequence -> constructions -> semantic roles).

    Every 100 sentences the accumulated vectors are dumped to a tagged file
    and the CoreNLP client is restarted; every ``batch`` sentences a batch
    test is run, and one final batch test covers the whole repository.

    :param ftag: tag embedded in the vector dump-file name.
    :param batch: interval (in sentences) between ``runbatchtest`` calls.
    """
    global sentencerepository, illness, vectorrepositoryall, featurerepository, ticker, fraction
    # Iterate key/value pairs directly instead of re-indexing by key.
    for key, sentence in sentencerepository.items():
        fs = squintinglinguist.featurise(sentence)
        logger(sentence, debug)
        try:
            fcxg = fs["features"]
            fpos = fs["pos"]
            fsem = fs["roles"]
            fwds = fs["words"]
            # Each layer is seeded with the previous layer's vector, so the
            # representation accumulates: words -> seq -> cxg -> roles.
            vecidx = tokenvector(fwds, None, True, debug)
            vecseq = seq.sequencevector(fpos, vecidx, debug)
            veccxg = tokenvector(fcxg, vecseq, False, debug)
            vecsem = rolevector(fsem, veccxg, debug)
            vectorrepositoryall[key] = vecsem
        except KeyError:
            # Deliberate best effort: a sentence whose featurisation lacks
            # one of the expected keys simply gets no vector.
            pass
        ticker += 1
        logger("{} {}: {} -> {}".format(ticker, key, sentence, fs), debug)
        if ticker % 100 == 0:
            logger(str(ticker) + " sentences processed", monitor)
            with open("{}{}.{}.vectors".format(outputdirectory, ftag, ticker),
                      "w+") as outputfile:
                # Dump everything accumulated so far, one item per line.
                for item, vector in vectorrepositoryall.items():
                    outputfile.write("{}\t{}\t{}\n".format(
                        item, illness[item], vector))
            # Periodic restart keeps the external NLP client from
            # accumulating state/resources over a long run.
            squintinglinguist.restartCoreNlpClient()
        if ticker % batch == 0:
            runbatchtest(fraction, batch)
    logger(str(ticker) + " sentences processed", monitor)
    runbatchtest(fraction, len(sentencerepository))
def processsentences(sents, testing=True):
    """Register each previously unseen sentence in ``sents``.

    For every new sentence: featurise it, build four vector layers
    (index / sequence / construction / role), log pairwise cosine
    similarities between the layers, and file everything in the
    module-level repositories under a fresh ``"s<index>"`` key.
    Sentences already present in ``sentencerepository`` are skipped
    (the running index still advances for them).
    """
    global sentencerepository, vectorrepositoryidx, featurerepository, index, ticker, sequencelabels, vectorrepositoryseq

    def _logcosines(pairs):
        # Emit one debug line per (label, lhs, rhs) similarity pair.
        for label, lhs, rhs in pairs:
            logger(label + "\t" + str(sparsevectors.sparsecosine(lhs, rhs)),
                   debug)

    for sentence in sents:
        index += 1
        key = "s" + str(index)
        if sentence in sentencerepository.values():
            continue
        featureset = featurise(sentence)
        logger(sentence, debug)
        cxgfeats = featureset["features"]
        posfeats = featureset["pos"]
        rolefeats = featureset["roles"]
        wordfeats = featureset["words"]
        for feats in (wordfeats, posfeats, cxgfeats, rolefeats):
            logger(feats, debug)
        vecidx = tokenvector(wordfeats, None, True, debug)
        vecseq = seq.sequencevector(posfeats, None, debug)
        veccomb = sparsevectors.sparseadd(vecidx, vecseq, 1, True)
        _logcosines([("idx - comb", vecidx, veccomb),
                     ("seq - comb", vecseq, veccomb)])
        veccxg = tokenvector(cxgfeats, veccomb, False, debug)
        _logcosines([("comb - cxg", veccomb, veccxg),
                     ("idx - cxg", vecidx, veccxg),
                     ("seq - cxg", veccxg, vecseq)])
        vecsem = rolevector(rolefeats, veccxg, debug)
        _logcosines([("idx - sem", vecidx, vecsem),
                     ("seq - sem", vecseq, vecsem),
                     ("comb - sem", veccomb, vecsem),
                     ("cxg - sem", veccxg, vecsem)])
        sentencerepository[key] = sentence
        vectorrepositoryidx[key] = vecidx
        vectorrepositoryseq[key] = vecseq
        vectorrepositorycxg[key] = veccxg
        vectorrepositorysem[key] = vecsem
        featurerepository[key] = featureset
        logger(str(key) + ":" + str(sentence) + "->" + str(featureset), debug)
        # After roughly every 1000 sentences, restart the external NLP
        # client and reset the counter.
        if ticker > 1000:
            logger(str(index) + " sentences processed", monitor)
            squintinglinguist.restartCoreNlpClient()
            ticker = 0
        ticker += 1
示例#3
0
    # NOTE(review): this fragment is the interior of an enclosing scope that
    # begins outside the visible excerpt; `file`, the flags (`fulltext`,
    # `generalise`, `featurise`, `postriples`), `workingvector`, `nn`,
    # `authorindex` and `authorname` are presumably bound there — confirm.
    e = xml.etree.ElementTree.parse(file).getroot()

    # Build one sparse vector per <document> element and accumulate them all
    # into `workingvector` (one vector per author, by the look of it).
    for b in e.iter("document"):
        origtext = b.text
        avector = sparsevectors.newemptyvector(dimensionality)
        if fulltext:
            # Surface-text vector. NOTE(review): this *overwrites* avector
            # rather than adding to it — only later branches add.
            avector = sparsevectors.normalise(
                stringspace.textvector(origtext, frequencyweighting))
        if generalise:
            # Add a vector of the linguistically generalised text.
            newtext = squintinglinguist.generalise(origtext)
            avector = sparsevectors.sparseadd(
                avector,
                sparsevectors.normalise(
                    stringspace.textvector(newtext, frequencyweighting)))
        if featurise:
            # Add one frequency-weighted vector per extracted feature.
            # NOTE(review): `featurise` shadows the helper of the same name
            # used elsewhere — here it appears to be a boolean flag; verify.
            features = squintinglinguist.featurise(origtext)
            for feature in features:
                fv = stringspace.getvector(feature)
                avector = sparsevectors.sparseadd(
                    avector, sparsevectors.normalise(fv),
                    stringspace.frequencyweight(feature))
        if postriples:
            # Add a part-of-speech-triple vector of the original text.
            posttriplevector = stringspace.postriplevector(origtext)
            avector = sparsevectors.sparseadd(
                avector, sparsevectors.normalise(posttriplevector))
        # Fold this document's vector into the running author vector.
        workingvector = sparsevectors.sparseadd(
            workingvector, sparsevectors.normalise(avector))
    nn += 1
    testitemspace.additem(authorindex, workingvector)
    testitemspace.name[authorindex] = authorname
logger("Done building " + str(nn) + " vectors.", monitor)
示例#4
0
# For each input file: extract sentences, vectorise them, dump the current
# word space, and optionally probe the space with a few fixed test sentences.
# NOTE(review): this loop is truncated at the end of the excerpt — the body
# of the innermost `for v in sentencerepository:` loop continues elsewhere.
for f in files:
    logger(f, monitor)
    sentences = simpletextfilereader.doonetweetfile(f, targetterms)
    processsentences(sentences, index)
    space.outputwordspace(outputdirectory + "/" + str(index) + ".wordspace")
    pindex = 0
    if runtest == True:
        for probe in [
                "i am afraid of the hurricane",
                "i said i was afraid the hurricane",
                "getting as far away from this hurricane as possible",
                "the storm is a bitch"
        ]:
            pindex += 1
            feats = featurise(probe)
            pkey = "p" + str(pindex)
            # Independent vectors for each layer (no seeding) ...
            vecidx = tokenvector(feats["words"], None, True)
            vecseq = seq.sequencevector(feats["pos"], None)
            veccxg = tokenvector(feats["features"], None, False)
            vecsem = rolevector(feats["roles"], None)
            # ... and a cumulative chain seeded layer by layer.
            vec1 = seq.sequencevector(feats["pos"], vecidx)
            vec2 = tokenvector(feats["features"], vec1, False)
            vectot = rolevector(feats["roles"], vec2)
            neighboursByIdx = {}
            neighboursBySeq = {}
            neighboursByCxg = {}
            neighboursBySem = {}
            neighboursByTot = {}
            for v in sentencerepository:
                # NOTE(review): pairs the *index* probe vector with the
                # *semantic-role* repository — looks like it should be
                # vectorrepositoryidx; confirm against the full file.
                d1 = space.similarity(vecidx, vectorrepositorysem[v])
# Per-document stores: raw text, generalised text, and extracted features.
# NOTE(review): `featuredepot` is never written within this excerpt — the
# loop body is presumably truncated; confirm against the full file.
textdepot = {}
modifiedtextdepot = {}
featuredepot = {}

# Build four parallel vector spaces from the documents in `textfile`:
# surface text, generalised text, squint-features, and their combination.
e = xml.etree.ElementTree.parse(textfile).getroot()
for b in e.iter("document"):
    textindex += 1
    # Surface-text vector.
    tvector = sparsevectors.normalise(
        stringspace.textvector(b.text, frequencyweighting))
    textspace.additem(textindex, tvector)
    # Generalised-text vector.
    newtext = squintinglinguist.generalise(b.text)
    mvector = sparsevectors.normalise(
        stringspace.textvector(newtext, frequencyweighting))
    modifiedtextspace.additem(textindex, mvector)
    # Feature vector: one frequency-weighted vector per extracted feature.
    features = squintinglinguist.featurise(b.text)
    fvector = sparsevectors.newemptyvector(dimensionality)
    for feature in features:
        fv = stringspace.getvector(feature)
        fvector = sparsevectors.sparseadd(fvector, sparsevectors.normalise(fv),
                                          stringspace.frequencyweight(feature))
    fvector = sparsevectors.normalise(fvector)
    squintfeaturespace.additem(textindex, fvector)
    # Combined space: pos-triple + generalised + feature + surface vectors.
    pvector = sparsevectors.normalise(stringspace.postriplevector(b.text))
    avector = sparsevectors.sparseadd(
        pvector,
        sparsevectors.sparseadd(mvector,
                                sparsevectors.sparseadd(fvector, tvector)))
    fullspace.additem(textindex, avector)
    textdepot[textindex] = b.text
    modifiedtextdepot[textindex] = newtext