def __init__(self,
             dimensionality: int = 2000,
             denseness: int = 10,
             name: str = "no name"):
    """Create an empty space with its lookup tables and default permutations.

    Args:
        dimensionality: Stored as ``self.dimensionality``; also the size
            handed to every permutation created here.
        denseness: Stored as ``self.denseness``.
        name: Label stored as ``self.name``.
    """
    # Caller-supplied settings.
    self.name = name
    self.dimensionality = dimensionality
    self.denseness = denseness

    # Lookup tables, all keyed by string and empty at start.
    self.indexspace = {}         # string -> sparse vector
    self.contextspace = {}       # string -> denser vector
    self.tag = {}                # string -> string
    self.tagged = {}             # string -> list of str
    self.observedfrequency = {}  # string -> int

    # Default operators: "nil" is the identity index list; "before" and
    # "after" come from sparsevectors.createpermutation, created in that order.
    identity = list(range(self.dimensionality))
    before = sparsevectors.createpermutation(self.dimensionality)
    after = sparsevectors.createpermutation(self.dimensionality)
    self.permutationcollection = {
        "nil": identity,
        "before": before,
        "after": after,
    }

    # Fixed defaults that are not exposed as constructor arguments.
    self.constantdenseness = 10
    self.languagemodel = LanguageModel()
    self.poswindow = 3
    self.changed = False
Exemplo n.º 2
0
 def postriplevector(self, text, poswindow=3):
     """Encode the part-of-speech windows of ``text`` as one summed vector.

     ``text`` is tagged with ``nltk.pos_tag``. Each window of consecutive
     tags is turned into a vector by permuting the seed vector
     ``self.pospermutations["vector"]`` once per tag, in order, and the
     window vectors are summed with ``sparsevectors.sparseadd``.

     Tags without an entry in ``self.pospermutations`` get a new permutation
     from ``sparsevectors.createpermutation`` on first sight.

     Args:
         text: Token sequence passed unchanged to ``nltk.pos_tag``.
         poswindow: Number of consecutive tags per full window.

     Returns:
         The sum of all window vectors, starting from
         ``sparsevectors.newemptyvector(self.dimensionality)``.
     """
     poses = nltk.pos_tag(text)
     # The extra "+ 2" lets the window start run past the last full window;
     # slicing truncates, so the tail of the text yields shorter windows.
     windows = [poses[start:start + poswindow]
                for start in range(len(poses) - poswindow + 1 + 2)]
     seed = self.pospermutations["vector"]
     vector = sparsevectors.newemptyvector(self.dimensionality)
     for sequence in windows:
         # Restart from the seed for every window. Carrying the permuted
         # vector over from the previous window made a window's encoding
         # depend on everything before it, so identical tag windows at
         # different positions never produced the same vector.
         onevector = seed
         for item in sequence:
             tag = item[1]
             if tag not in self.pospermutations:
                 self.pospermutations[tag] = sparsevectors.createpermutation(
                     self.dimensionality)
             onevector = sparsevectors.permute(onevector,
                                               self.pospermutations[tag])
         vector = sparsevectors.sparseadd(vector, onevector)
     return vector
def processfile(file):
    """Vectorise every sentence of one text file and return the text vector.

    The file is read as UTF-8, lower-cased, stripped of double quotes and
    whitespace-normalised, then split into sentences with ``sent_tokenize``.
    Each sentence is stored in ``sentencestorage`` under a running index,
    its surface words are registered with ``wordspace``, and it is parsed
    with ``semanticdependencyparse.semanticdepparse``. Every analysis the
    parser returns is turned into a vector by ``getvector``, stored in
    ``utterancespace`` and added to the text vector.

    A sentence whose parse raises is logged and contributes no vector.
    When a sentence has several analyses, each analysis after the first
    takes the next index, so ``utterancespace`` indices can run ahead of
    the number of sentences.

    Args:
        file: Path of the text file; also the key used in ``textspace``.

    Returns:
        The accumulated text vector (the same object stored in
        ``textspace[file]``).
    """
    global sentencestorage, utterancespace
    # Discourse window: vectors of up to ``sentencesequence - 1`` preceding
    # utterances would be mixed into each new one. With the value 0 the
    # window loop below never runs, exactly as before.
    sentencesequence = 0
    startindexforthistext = 0

    sentenceindex = 0
    textvector = wordspace.newemptyvector()
    with open(file, "r", encoding="utf-8") as textfile:
        rawtext = textfile.read().lower()

    # Drop double quotes, then collapse every whitespace run (newlines
    # included) into a single space.
    rawtext = re.sub(r"\s+", " ", rawtext.replace('"', " "))

    for sentence in sent_tokenize(rawtext):
        sentenceindex += 1
        sentencestorage[sentenceindex] = sentence
        allsurfacewords = nltk.word_tokenize(sentence)
        wordspace.chkwordspace(allsurfacewords, debug)
        try:
            analyses = semanticdependencyparse.semanticdepparse(
                sentence.lower(), debug)
        except Exception:
            # Best effort: a sentence that cannot be parsed is reported and
            # skipped. KeyboardInterrupt/SystemExit are deliberately not
            # caught so the run can still be aborted.
            analyses = []
            logger("PARSE ERROR " + str(sentenceindex) + "\t" + sentence,
                   error)
        for kk, analysis in enumerate(analyses):
            words = analysis.values()
            wordspace.checkwordspacelist(words, debug)
            # Make sure every role used by this analysis has a permutation.
            for role in analysis:
                if role not in wordspace.permutationcollection:
                    wordspace.permutationcollection[role] = (
                        sparsevectors.createpermutation(
                            wordspace.dimensionality))
            u = getvector(analysis, sentence)
            for win in range(1, sentencesequence):
                if sentenceindex - win > startindexforthistext:
                    previous = sparsevectors.normalise(
                        utterancespace[sentenceindex - win])
                    u = sparsevectors.sparseadd(
                        u,
                        sparsevectors.permute(
                            previous,
                            wordspace.permutationcollection["discourse"]))
            # Additional analyses of the same sentence get their own index.
            if kk > 0:
                sentenceindex += 1
            utterancespace[sentenceindex] = u
            textvector = sparsevectors.sparseadd(textvector, u, 1)

    textspace[file] = textvector
    return textvector
Exemplo n.º 4
0
 def onesequencevector(self, subsequence, accumulator=None, loglevel=False):
     """Fold ``subsequence`` into one vector by successive permutation.

     Starting from ``accumulator`` (``self.sequencelabel`` when it is
     ``None``), the vector is permuted once per element, in order, with the
     permutation stored for that element in ``self.permutations``. An
     element without a stored permutation gets a new one from
     ``sparsevectors.createpermutation`` and sets ``self.changed`` to True.

     Args:
         subsequence: The items to fold in, in order. Any iterable of
             hashable items works, including an empty tuple or string.
         accumulator: Start vector; ``None`` selects ``self.sequencelabel``.
         loglevel: Accepted for interface compatibility; not used here.

     Returns:
         ``accumulator`` itself when ``subsequence`` is empty, otherwise the
         result of the last ``sparsevectors.permute`` call.
     """
     if accumulator is None:
         accumulator = self.sequencelabel
     # Iterate instead of recursing on subsequence[1:]: no recursion-depth
     # limit, no repeated slicing, and empty non-list sequences end the fold
     # cleanly instead of raising IndexError.
     for head in subsequence:
         if head not in self.permutations:
             self.permutations[head] = sparsevectors.createpermutation(
                 self.dimensionality)
             self.changed = True
         accumulator = sparsevectors.permute(accumulator,
                                             self.permutations[head])
     return accumulator
Exemplo n.º 5
0
 def addoperator(self, item):
     """Store a newly created permutation under the key ``item``.

     The permutation comes from ``sparsevectors.createpermutation`` called
     with ``self.dimensionality``. An existing entry for ``item`` in
     ``self.permutationcollection`` is replaced.

     Args:
         item: Key under which the permutation is stored.
     """
     permutation = sparsevectors.createpermutation(self.dimensionality)
     self.permutationcollection[item] = permutation