예제 #1
0
 def addintoitem(self, item, vector, weight=1):
     """Accumulate *vector* (normalised, scaled by *weight*) into item's
     context vector.

     Unseen items are registered first: they receive a fresh random
     index vector (the caller-supplied *vector* is replaced in that
     case), a zero global frequency, and empty context and association
     vectors.
     """
     if not self.contains(item):
         # New item: discard the incoming vector, register fresh state.
         vector = sparsevectors.newrandomvector(self.dimensionality,
                                                self.denseness)
         self.indexspace[item] = vector
         self.globalfrequency[item] = 0
         self.contextspace[item] = sparsevectors.newemptyvector(
             self.dimensionality)
         self.associationspace[item] = sparsevectors.newemptyvector(
             self.dimensionality)
     self.contextspace[item] = sparsevectors.sparseadd(
         self.contextspace[item],
         sparsevectors.normalise(vector),
         weight)
예제 #2
0
    def additem(self, item, vector="dummy"):
        """Register *item* in the space with index vector *vector*.

        If no vector is supplied, a fresh random index vector is
        generated. Already-known items are left untouched (though a
        generated default vector is still created and discarded).
        """
        # Bug fix: the original used `vector is "dummy"`, an identity
        # test against a string literal — fragile and a SyntaxWarning
        # since Python 3.8. Equality is the correct comparison.
        if vector == "dummy":
            vector = sparsevectors.newrandomvector(self.dimensionality, self.denseness)
        if not self.contains(item):
            self.indexspace[item] = vector
            self.globalfrequency[item] = 1
            self.contextspace[item] = sparsevectors.newemptyvector(self.dimensionality)
            self.attributespace[item] = sparsevectors.newemptyvector(self.dimensionality)
            self.morphologyspace[item] = sparsevectors.newemptyvector(self.dimensionality)
            self.bign += 1
 def importgavagaiwordspace(self, vectorfile: str, threshold=5):
     """Load a Gavagai-format word space from *vectorfile*.

     Each line is matched against the Gavagai vector pattern; items
     whose observed frequency exceeds *threshold* are added to the
     space with their index and context vectors and registered with
     the language model. Malformed coordinate entries are logged and
     skipped; an unreadable file is logged as an error.

     Fixes over the original: the two bare ``except:`` clauses now
     catch only the expected failures (no regex match, bad number),
     the duplicated parse loop is factored into a helper, and the
     context-vector error log now reports the coordinate string that
     actually failed (the original always logged the index string).
     """
     vectorpattern = re.compile(
         r"\(\"(.*)\" #S(\d+);([\d\+\-\;]+): #S\d+;(.+): (\d+)\)",
         re.IGNORECASE)
     itempattern = re.compile(r"(\d+)\+?(\-?[\d\.e\-]+)$")
     antal = 0        # lines read
     antalkvar = 0    # items kept (above threshold)

     def fillvector(target, coordinates, string, lineno):
         # Parse ";"-separated "index+value" entries into *target*.
         for entry in coordinates.split(";"):
             try:
                 parts = itempattern.match(entry)
                 target[int(parts.group(1))] = float(parts.group(2))
             except (AttributeError, ValueError):
                 # AttributeError: entry did not match the pattern;
                 # ValueError: index/value not parseable as a number.
                 logger(
                     "{} {} {} {}".format(
                         lineno, string, entry, coordinates), error)

     try:
         with open(vectorfile, 'rt', errors="replace") as gavagaispace:
             for line in gavagaispace:
                 antal += 1
                 vectors = vectorpattern.match(line)
                 if not vectors:
                     continue
                 string = str(vectors.group(1))
                 dim = int(vectors.group(2))
                 idx = vectors.group(3)
                 ctx = vectors.group(4)
                 freq = int(vectors.group(5))
                 if freq > threshold:
                     antalkvar += 1
                     idxvector = sparsevectors.newemptyvector(dim)
                     fillvector(idxvector, idx, string, antal)
                     self.additem(string, idxvector)
                     ctxvector = sparsevectors.newemptyvector(dim)
                     fillvector(ctxvector, ctx, string, antal)
                     self.contextspace[string] = ctxvector
                     self.observedfrequency[string] = freq
                     self.languagemodel.additem(string, freq)
     except IOError:
         logger("Could not read from >>" + vectorfile + "<<", error)
예제 #4
0
 def textvector(self, string, frequencyweighting=True, loglevel=False):
     """Build a text vector for *string*.

     With a positive character window the string is decomposed into
     overlapping character n-grams whose (optionally frequency
     weighted) vectors are summed. Otherwise the string is tokenised
     into words and the words' index vectors are summed, learning any
     previously unseen word on the fly.
     """
     accumulator = sparsevectors.newemptyvector(self.dimensionality)
     if self.window > 0:
         span = self.window
         for pos in range(len(string) - span + 1):
             ngram = string[pos:pos + span]
             ngramvector = self.makevector(ngram)
             weight = self.frequencyweight(ngram) if frequencyweighting else 1
             logger(ngram + " " + str(weight), loglevel)
             if loglevel:
                 logger(str(sparsevectors.sparsecosine(
                     accumulator,
                     sparsevectors.normalise(ngramvector))), loglevel)
             accumulator = sparsevectors.sparseadd(
                 accumulator, sparsevectors.normalise(ngramvector), weight)
     else:
         tokens = nltk.word_tokenize(string)
         # Binary frequencies: each distinct word counts once.
         vocabulary = set(tokens) if self.binaryfrequencies else tokens
         for word in vocabulary:
             weight = self.frequencyweight(word) if frequencyweighting else 1
             if word in self.indexspace:
                 self.observe(word)
             else:
                 self.additem(word)
             accumulator = sparsevectors.sparseadd(
                 accumulator,
                 sparsevectors.normalise(self.indexspace[word]), weight)
     return accumulator
예제 #5
0
 def additem(self, item, vector="dummy"):
     """Register *item* with index vector *vector* (random if omitted).

     Known items keep their existing vectors; a generated default
     vector is still created (and discarded) in that case.
     """
     # Bug fix: the original used `vector is "dummy"`, an identity
     # test against a string literal — fragile and a SyntaxWarning
     # since Python 3.8. Equality is the correct comparison.
     if vector == "dummy":
         vector = sparsevectors.newrandomvector(self.dimensionality,
                                                self.denseness)
     if not self.contains(item):
         self.indexspace[item] = vector
         self.contextspace[item] = sparsevectors.newemptyvector(
             self.dimensionality)
예제 #6
0
 def sequencevector(self, sequence, initialvector=None, loglevel=False):
     """Sum normalised window vectors of *sequence* onto *initialvector*.

     Starts from an empty vector when none is given; returns the
     accumulated result.
     """
     if initialvector is None:
         initialvector = sparsevectors.newemptyvector(self.dimensionality)
     windowlist = self.windows(sequence)
     logger(str(windowlist), loglevel)
     accumulated = initialvector
     for window in windowlist:
         windowvector = self.onesequencevector(window, None, loglevel)
         accumulated = sparsevectors.sparseadd(
             accumulated, sparsevectors.normalise(windowvector))
     return accumulated
예제 #7
0
 def postriplevector(self, text, poswindow=3):
     # Build a vector encoding part-of-speech tag sequences of *text*.
     # *text* is handed to nltk.pos_tag, so presumably a token list —
     # TODO confirm against callers.
     poses = nltk.pos_tag(text)
     # NOTE(review): the "+ 2" extends the range past the usual sliding
     # window bound, so the last two windows are shorter than poswindow
     # (slicing past the end truncates) — confirm this is intentional.
     windows = [poses[ii:ii + poswindow] for ii in range(len(poses) - poswindow + 1 + 2)]
     # Start from the base "vector" permutation seed.
     onevector = self.pospermutations["vector"]
     vector = sparsevectors.newemptyvector(self.dimensionality)
     for sequence in windows:
         for item in sequence:
             # Lazily create a permutation for a previously unseen tag.
             if item[1] not in self.pospermutations:
                 self.pospermutations[item[1]] = sparsevectors.createpermutation(self.dimensionality)
             onevector = sparsevectors.permute(onevector, self.pospermutations[item[1]])
         # NOTE(review): onevector is never reset, so permutations
         # accumulate across windows rather than per window — verify
         # this is the intended encoding.
         vector = sparsevectors.sparseadd(vector, onevector)
     return vector
예제 #8
0
 def additem(self, item, vector=None):
     """Register *item* in the space.

     A random index vector is generated unless one is supplied or one
     is already recorded for the item. The item's context vector and
     observed frequency are (re)initialised on every call and the
     space is marked as changed. Normally called from observe() but
     also at times from addintoitem.
     """
     if item not in self.indexspace:
         self.indexspace[item] = (
             vector if vector is not None
             else sparsevectors.newrandomvector(self.dimensionality,
                                                self.denseness))
     # Context vector and frequency are reset even for known items.
     self.contextspace[item] = sparsevectors.newemptyvector(self.dimensionality)
     self.changed = True
     self.observedfrequency[item] = 0
예제 #9
0
def rolevector(roledict, initialvector=None, loglevel=False):
    """Accumulate role-operated index vectors for every item in *roledict*.

    For each role, every item filed under it is observed in the context
    space, and its index vector — transformed by the role operator and
    normalised — is added onto the running vector. Returns the
    accumulated vector.
    """
    if initialvector is None:
        initialvector = sparsevectors.newemptyvector(dimensionality)
    accumulated = initialvector
    for role, items in roledict.items():
        for item in items:
            ctxspace.observe(item, False, debug)
            previous = accumulated
            operated = ctxspace.useoperator(ctxspace.indexspace[item], role)
            accumulated = sparsevectors.sparseadd(
                accumulated, sparsevectors.normalise(operated))
            if loglevel:
                logger(
                    role + " " + item + " " +
                    str(sparsevectors.sparsecosine(previous, accumulated)),
                    loglevel)
    return accumulated
예제 #10
0
def tweetvector(string):
    """Vector for a tweet: weighted sum of its character n-gram vectors.

    Known n-grams are looked up in the ngramspace cache; unknown ones
    are synthesised from the string space (and deliberately not added
    to the cache). With a non-positive window the empty vector is
    returned unchanged.
    """
    uvector = sparsevectors.newemptyvector(ngramspace.dimensionality)
    if window > 0:
        for start in range(len(string) - window + 1):
            sequence = string[start:start + window]
            if ngramspace.contains(sequence):
                # Cached n-gram: reuse its index vector.
                thisvector = ngramspace.indexspace[sequence]
            else:
                # Unknown n-gram: build a vector on the fly.
                thisvector = stringspace.makevector(sequence)
            factor = ngramspace.frequencyweight(sequence)
            uvector = sparsevectors.sparseadd(
                uvector, sparsevectors.normalise(thisvector), factor)
    return uvector
예제 #11
0
def tokenvector(tokenlist, initialvector=None, weights=True, loglevel=False):
    """Sum the normalised context vectors of the tokens in *tokenlist*.

    Tokens are weighted by the language model unless *weights* is False
    or the token carries the "JiK" prefix (construction-grammar
    features are always weighted 1). Each token is also observed into
    the context space as a side effect.
    """
    if initialvector is None:
        initialvector = sparsevectors.newemptyvector(dimensionality)
    result = initialvector
    for token in tokenlist:
        # cxg features should not be weighted the same way lex feats are
        if not weights or str(token).startswith("JiK"):
            weight = 1
        else:
            weight = ctxspace.languagemodel.frequencyweight(token, True)
        ctxspace.observe(token, True)
        before = result
        result = sparsevectors.sparseadd(
            result,
            sparsevectors.normalise(ctxspace.contextspace[token]), weight)
        if loglevel:
            logger(
                token + " " + str(weight) + " " +
                str(sparsevectors.sparsecosine(before, result)), loglevel)
    return result
예제 #12
0
 def textvector(self,
                words,
                frequencyweighting=True,
                binaryfrequencies=False,
                loglevel=False):
     """Sum the index vectors of *words* into one text vector.

     With binary frequencies each distinct word contributes once;
     otherwise every occurrence contributes. Unseen words are added to
     the space, known ones are observed, and per-word document
     frequencies are updated. Increments the document counter.
     """
     self.docs += 1
     textvec = sparsevectors.newemptyvector(self.dimensionality)
     # not a list, a set but hey
     vocabulary = set(words) if binaryfrequencies else words
     for word in vocabulary:
         weight = self.frequencyweight(word) if frequencyweighting else 1
         if word in self.indexspace:
             self.observe(word)
         else:
             self.additem(word)
         self.df[word] += 1
         textvec = sparsevectors.sparseadd(
             textvec, sparsevectors.normalise(self.indexspace[word]),
             weight)
     return textvec
예제 #13
0
    ]:  # "["hearts", "turtle", "cat", "rabbit", "queen", "and", "off"]:
        n = {}
        for v in vectorrepository:
            n[v] = sparsevectors.sparsecosine(space.indexspace[probe],
                                              vectorrepository[v])
        m = sorted(sentencerepository, key=lambda k: n[k], reverse=True)
        for mc in m:
            if n[mc] > 0.0001:
                print(probe, mc, n[mc], sentencerepository[mc])
        print(space.contexttoindexneighbourswithweights(probe))

# For each stored sentence vector: print the sentence, then the five
# words of the sentence whose index vectors are most similar (by sparse
# cosine) to the stored vector, skipping near-zero similarities.
for v in vectorrepository:
    print(v, sentencerepository[v], sep="\t", end="\t")
    #    print(v, vectorrepository[v])
    ww = nltk.word_tokenize(sentencerepository[v])
    vec = sparsevectors.newemptyvector(dimensionality)
    #    for www in ww:
    #        print(www, space.indexspace[www], space.globalfrequency[www], space.frequencyweight(www), sparsevectors.sparsecosine(space.indexspace[www], vectorrepository[v]))
    # nvn maps each word to its cosine against the stored vector; vec
    # accumulates a frequency-weighted word-vector sum but is not used
    # further in this span — NOTE(review): possibly leftover code.
    nvn = {}
    for www in ww:
        nvn[www] = sparsevectors.sparsecosine(space.indexspace[www],
                                              vectorrepository[v])
        vec = sparsevectors.sparseadd(
            vec, sparsevectors.normalise(space.indexspace[www]),
            space.frequencyweight(www))
    # Top five words by cosine, highest first.
    m = sorted(ww, key=lambda k: nvn[k], reverse=True)[:5]
    for mc in m:
        if nvn[mc] > 0.0001:
            print(mc, nvn[mc], sep=":", end="\t")
    print()
예제 #14
0
    random.shuffle(
        filenamelist
    )  # if we shuffle here the weights won't be as good i mean overtrained
    filenamelist = filenamelist[:testbatchsize]
# Announce the (possibly truncated) file list and which categorisation
# target spaces are active for this run.
logger("Going on with a file list of " + str(testbatchsize) + " items.",
       monitor)

if textcategorisation:
    logger("Text target space", monitor)
if authorcategorisation:
    logger("Author target space", monitor)
if gendercategorisation:
    # NOTE(review): only the gender branch initialises per-category
    # target vectors here — confirm text/author target spaces are set
    # up elsewhere.
    logger("Gender target space", monitor)
    for cat in categories:
        categorytable[cat] = cat  # redundant redundancy redundanciness
        targetspace[cat] = sparsevectors.newemptyvector(
            ngramspace.dimensionality)
        targets.add(cat)

# Counters for the training pass over the file list below.
logger("Started training files.", monitor)
authorindex = 0
textindex = 0
testvectorantal = 0
trainvectorantal = 0
for file in filenamelist:
    authorindex += 1
    authornametable[authorindex] = file.split(".")[0].split("/")[-1]
    logger("Starting training " + str(authorindex) + " " + file, debug)
    e = xml.etree.ElementTree.parse(file).getroot()
    trainvectors[authorindex] = []
    testvectors[authorindex] = []
    thesevectors = []
예제 #15
0
    random.shuffle(filenamelist)
    split = int(len(filenamelist) * testtrainfraction)
    testfiles = filenamelist[:split]
else:
    testfiles = filenamelist

# Prepare for building one vector per test file: a fresh semantic space
# for the test items and counters for the loop below.
logger("Start building vectors for " + str(len(testfiles)) + " test files.",
       monitor)
authorindex = 0
testitemspace = SemanticSpace()
nn = 0
for file in testfiles:
    authorname = file.split(".")[0].split("/")[-1]
    authorindex += 1
    logger("Reading " + str(authorindex) + " " + file, monitor)
    workingvector = sparsevectors.newemptyvector(dimensionality)
    e = xml.etree.ElementTree.parse(file).getroot()

    for b in e.iter("document"):
        origtext = b.text
        avector = sparsevectors.newemptyvector(dimensionality)
        if fulltext:
            avector = sparsevectors.normalise(
                stringspace.textvector(origtext, frequencyweighting))
        if generalise:
            newtext = squintinglinguist.generalise(origtext)
            avector = sparsevectors.sparseadd(
                avector,
                sparsevectors.normalise(
                    stringspace.textvector(newtext, frequencyweighting)))
        if featurise:
예제 #16
0
 def newemptyvector(self):
     """Return a fresh empty sparse vector of this space's dimensionality."""
     dimension = self.dimensionality
     return sparsevectors.newemptyvector(dimension)