def contexttoindexneighbours(self,
                              item,
                              number=10,
                              weights=False,
                              permutationname="nil"):
     """
     Return the items whose index vectors are most similar to the given item's context vector. I.e. items which
     have occurred in contexts with the item.

     item: the probe item; its context vector is compared against every index vector.
     number: how many neighbours to return; a falsy value (None, 0) means "all".
     weights: if True, return (item, cosine) pairs; otherwise just the items.
     permutationname: name of the permutation applied to each index vector
         before the cosine comparison.
     """
     permutation = self.permutationcollection[permutationname]
     # Hoist the loop-invariant context-vector lookup out of the scoring loop.
     contextvector = self.contextspace[item]
     neighbourhood = {}
     for i in self.indexspace:
         neighbourhood[i] = sparsevectors.sparsecosine(
             contextvector,
             sparsevectors.permute(self.indexspace[i], permutation))
     if not number:
         number = len(neighbourhood)
     if weights:
         # Sort the (item, cosine) pairs directly on the cosine component.
         r = sorted(neighbourhood.items(),
                    key=lambda kv: kv[1],
                    reverse=True)[:number]
     else:
         r = sorted(neighbourhood,
                    key=neighbourhood.get,
                    reverse=True)[:number]
     return r
 def useoperator(self, vector, operator):
     """Return vector permuted by the named operator's permutation.

     A falsy operator leaves the vector untouched. An operator not yet
     known to this space is registered first via addoperator().
     """
     if not operator:
         return vector
     if not self.isoperator(operator):
         self.addoperator(operator)
     return sparsevectors.permute(vector, self.permutationcollection[operator])
# Example #3 (0) — scraped sample separator
 def addintoitem(self, item, vector, weight=1, operator=None):
     """Accumulate a normalised (and optionally permuted) vector into the
     item's context vector, registering the item first if it is new.

     NOTE(review): unlike useoperator/applyoperator, `operator` here is a
     permutation object handed straight to sparsevectors.permute, not a
     name looked up in permutationcollection — confirm against callers.
     """
     if not self.contains(item):
         self.additem(item)
     contribution = vector if operator is None else sparsevectors.permute(vector, operator)
     self.contextspace[item] = sparsevectors.sparseadd(
         self.contextspace[item],
         sparsevectors.normalise(contribution),
         weight)
     self.changed = True
# Example #4 (0) — scraped sample separator
 def applyoperator(self, item, operator, constant, weight):
     """Add the operator-permuted, normalised constant vector into the
     item's context space, and additionally into its morphology space
     (for the "morphology" operator) or its attribute space (any other
     operator).

     The original duplicated the whole permute/normalise expression in
     three places with inconsistent indentation; the branches differed
     only in which target space they updated.
     """
     def permutedconstant():
         # Fresh copy per use, in case downstream code mutates the result.
         return sparsevectors.normalise(
             sparsevectors.permute(self.constantcollection[constant],
                                   self.permutationcollection[operator]))
     self.contextspace[item] = sparsevectors.sparseadd(
         self.contextspace[item], permutedconstant(), weight)
     if operator == "morphology":
         target = self.morphologyspace
     else:
         target = self.attributespace
     target[item] = sparsevectors.sparseadd(target[item], permutedconstant(), weight)
# Example #5 (0) — scraped sample separator
 def indextocontextneighbours(self, item, number=10, weights=False, permutationname="nil"):
     """
     Return the items whose context vectors are most similar to the given
     item's (permuted) index vector — the converse of contexttoindexneighbours.

     number: how many neighbours to return; a falsy value means "all".
     weights: if True, return (item, cosine) pairs; otherwise just the items.
     permutationname: permutation applied once to the probe index vector.
     """
     permutation = self.permutationcollection[permutationname]
     # Hoist the loop-invariant permuted probe vector: the original
     # recomputed this permutation once per item in contextspace.
     probe = sparsevectors.permute(self.indexspace[item], permutation)
     neighbourhood = {}
     for i in self.contextspace:
         neighbourhood[i] = sparsevectors.sparsecosine(probe, self.contextspace[i])
     if not number:
         # Consistent with contexttoindexneighbours: falsy number => all items.
         number = len(neighbourhood)
     if weights:
         r = sorted(neighbourhood.items(), key=lambda kv: kv[1], reverse=True)[:number]
     else:
         r = sorted(neighbourhood, key=neighbourhood.get, reverse=True)[:number]
     return r
# Example #6 (0) — scraped sample separator
 def postriplevector(self, text, poswindow=3):
     """Encode the POS-tag sequence of text as a sparse vector by chaining
     tag permutations over sliding windows and summing the running state.

     NOTE(review): the running 'onevector' is never reset between windows,
     and the range extends 2 past the last full window (yielding short
     tail windows) — presumably intentional, but confirm.
     """
     tagged = nltk.pos_tag(text)
     vector = sparsevectors.newemptyvector(self.dimensionality)
     onevector = self.pospermutations["vector"]
     for start in range(len(tagged) - poswindow + 1 + 2):
         for token in tagged[start:start + poswindow]:
             tag = token[1]
             if tag not in self.pospermutations:
                 self.pospermutations[tag] = sparsevectors.createpermutation(self.dimensionality)
             onevector = sparsevectors.permute(onevector, self.pospermutations[tag])
         vector = sparsevectors.sparseadd(vector, onevector)
     return vector
def processfile(file):
    """Read a text file, sentence-tokenise it, parse each sentence into
    role:word analyses, and accumulate one utterance vector per analysis
    plus an aggregate text vector (stored in textspace[file] and returned).
    """
    global sentencestorage, utterancespace
    sentenceindex = 0
    textvector = wordspace.newemptyvector()
    with open(file, "r", encoding="utf-8") as textfile:
        rawtext = textfile.read().lower()
        # Collapse newlines, straight double quotes, and whitespace runs
        # into single spaces before sentence tokenisation.
        rawtext = re.sub('\n', ' ', rawtext)
        rawtext = re.sub('\"', ' ', rawtext)
        rawtext = re.sub('\s+', ' ', rawtext)  # NOTE(review): non-raw pattern; prefer r'\s+'
        sents = sent_tokenize(rawtext)
        for sentence in sents:
            sentenceindex += 1
            sentencestorage[sentenceindex] = sentence
            allsurfacewords = nltk.word_tokenize(sentence)
            wordspace.chkwordspace(allsurfacewords, debug)
            analyses = []
            try:
                analyses = semanticdependencyparse.semanticdepparse(
                    sentence.lower(), debug)
            except:
                # NOTE(review): bare except silently turns ANY parser failure
                # (even KeyboardInterrupt) into an empty analysis list; the
                # sentence is logged and skipped.
                logger("PARSE ERROR " + str(sentenceindex) + "\t" + sentence,
                       error)
            kk = 0
            for analysis in analyses:
                words = analysis.values()
                wordspace.checkwordspacelist(words, debug)
                # Ensure every semantic role seen has a permutation.
                for role in analysis:
                    if role not in wordspace.permutationcollection:
                        wordspace.permutationcollection[
                            role] = sparsevectors.createpermutation(
                                wordspace.dimensionality)
                u = getvector(analysis, sentence)
                win = 1
                sentencesequence = 0
                startindexforthistext = 0
                # NOTE(review): sentencesequence is hard-coded to 0, so this
                # discourse-context loop never executes — looks like disabled
                # or dead code; confirm before removing.
                while win < sentencesequence:
                    if sentenceindex - win > startindexforthistext:
                        u = sparsevectors.sparseadd(
                            u,
                            sparsevectors.permute(
                                sparsevectors.normalise(
                                    utterancespace[sentenceindex - win]),
                                wordspace.permutationcollection["discourse"]))
                    win += 1
                # NOTE(review): bumping sentenceindex for each extra analysis
                # gives every analysis its own utterancespace slot, but
                # desynchronises the index from sentencestorage keys — confirm
                # this is intended.
                if kk > 0:
                    sentenceindex += 1
                utterancespace[sentenceindex] = u
                textvector = sparsevectors.sparseadd(textvector, u, 1)
                kk += 1
        textspace[file] = textvector
    return textvector
# Example #8 (0) — scraped sample separator
 def onesequencevector(self, subsequence, accumulator=None, loglevel=False):
     """Fold a sequence of labels into a vector by successively permuting
     the accumulator with each label's permutation (creating permutations
     for unseen labels on the fly).

     With accumulator=None the fold starts from self.sequencelabel.
     loglevel is accepted for interface compatibility but unused here.
     """
     acc = self.sequencelabel if accumulator is None else accumulator
     # Iterative equivalent of the original head/tail recursion.
     for element in subsequence:
         if element not in self.permutations:
             self.permutations[element] = sparsevectors.createpermutation(
                 self.dimensionality)
             self.changed = True
         acc = sparsevectors.permute(acc, self.permutations[element])
     return acc
def getvector(roleworddict, sentencestring):
    """Compose a sparse utterance vector for one analysed sentence.

    Combines (1) role-permuted index vectors for each role:word pair,
    (2) sequence vectors over sliding word windows, (3) sequence vectors
    over sliding POS-tag windows, and (4) stylistic markers (sentence
    length, adverbs, negations, hedges, amplifiers).
    """

    def windowvector(items):
        # Fold one window into a sparse vector: permute the running state
        # by the "sequence" permutation, then add the item's index vector.
        # (Extracted: the original duplicated this loop verbatim for the
        # lexical and the POS window passes.)
        acc = {}
        for item in items:
            acc = sparsevectors.sparseadd(
                sparsevectors.permute(
                    acc,
                    wordspace.permutationcollection["sequence"]),
                wordspace.indexspace[item],
                wordspace.frequencyweight(item))
        return acc

    def slidingwindows(items, size):
        # All contiguous windows of the given size.
        return [items[i:i + size] for i in range(len(items) - size + 1)]

    uvector = {}  # vector for test item
    for role in roleworddict:
        item = roleworddict[role]
        uvector = sparsevectors.sparseadd(
            uvector,
            sparsevectors.permute(
                sparsevectors.normalise(wordspace.indexspace[item]),
                wordspace.permutationcollection[role]),
            wordspace.frequencyweight(item))
    lexicalwindow = 1
    if lexicalwindow > 0:
        wds = word_tokenize(sentencestring.lower())
        for window in slidingwindows(wds, lexicalwindow):
            uvector = sparsevectors.sparseadd(
                uvector, sparsevectors.normalise(windowvector(window)))
    pos = 1
    if pos > 0:
        wds = word_tokenize(sentencestring)
        poslist = [i[1] for i in nltk.pos_tag(wds)]
        # NOTE(review): reuses lexicalwindow (not a dedicated POS window
        # size) for the tag windows — confirm this is intended.
        for window in slidingwindows(poslist, lexicalwindow):
            uvector = sparsevectors.sparseadd(
                uvector, sparsevectors.normalise(windowvector(window)))
    style = True
    if style:
        wds = word_tokenize(sentencestring)
        wps = len(wds)  # words per sentence
        # (A chars-per-word ratio was computed here before but never used.)
        sl = True
        if sl:
            if wps > 8:
                uvector = sparsevectors.sparseadd(uvector, longsentencevector)
            if wps < 5:
                uvector = sparsevectors.sparseadd(uvector, shortsentencevector)
        posanalyses = nltk.pos_tag(wds)
        poslist = [i[1] for i in posanalyses]
        for poses in poslist:
            if poses in ("RB", "RBR", "RBS"):
                uvector = sparsevectors.sparseadd(uvector, adverbvector)
        for w in wds:
            if w in negationlist:
                uvector = sparsevectors.sparseadd(uvector, negationvector)
            if w in hedgelist:
                uvector = sparsevectors.sparseadd(uvector, hedgevector)
            if w in amplifierlist:
                uvector = sparsevectors.sparseadd(uvector, amplifiervector)

    # attitude terms
    # verb stats
    # seq newordgrams
    # verb classes use wordspace!
    # sent sequences
    return uvector