def addconstant(self, item):
    self.changed = True
    self.additem(
        item,
        sparsevectors.newrandomvector(
            self.dimensionality,
            self.dimensionality // self.constantdenseness))
Example #2
 def additem(self, item, vector="dummy"):
     if vector is "dummy":
         vector = sparsevectors.newrandomvector(self.dimensionality,
                                                self.denseness)
     if not self.contains(item):
         self.indexspace[item] = vector
         self.contextspace[item] = sparsevectors.newemptyvector(
             self.dimensionality)
Example #3
def addintoitem(self, item, vector, weight=1):
    if not self.contains(item):
        # New item: give it its own index vector and an empty context,
        # then fall through and add the passed-in vector as usual
        # (the original overwrote `vector` here, discarding the argument).
        self.indexspace[item] = sparsevectors.newrandomvector(
            self.dimensionality, self.denseness)
        self.globalfrequency[item] = 0
        self.contextspace[item] = sparsevectors.newemptyvector(
            self.dimensionality)
    self.contextspace[item] = sparsevectors.sparseadd(
        self.contextspace[item], sparsevectors.normalise(vector), weight)
Example #4
def additem(self, item, vector=None):
    """
    Add a new item to the space. Add a randomly generated index vector
    (unless one is given as an argument or is already recorded in index
    space); add an empty context vector and prep the LanguageModel to
    accommodate the item. Normally called from observe(), but at times
    also from addintoitem.
    """
    if item not in self.indexspace:
        if vector is None:
            vector = sparsevectors.newrandomvector(self.dimensionality, self.denseness)
        self.indexspace[item] = vector
    self.contextspace[item] = sparsevectors.newemptyvector(self.dimensionality)
    self.changed = True
    self.observedfrequency[item] = 0
Example #5
    def additem(self, item, vector="dummy"):
        if vector is "dummy":
            vector = sparsevectors.newrandomvector(self.dimensionality, self.denseness)
        if not self.contains(item):
            self.indexspace[item] = vector
            self.globalfrequency[item] = 1
            self.contextspace[item] = sparsevectors.newemptyvector(self.dimensionality)
            self.attributespace[item] = sparsevectors.newemptyvector(self.dimensionality)
            self.morphologyspace[item] = sparsevectors.newemptyvector(self.dimensionality)
#            self.textspace[item] = sparsevectors.newemptyvector(self.dimensionality)
#            self.utterancespace[item] = sparsevectors.newemptyvector(self.dimensionality)
#            self.authorspace[item] = sparsevectors.newemptyvector(self.dimensionality)
            self.bign += 1
Example #6
def additemintoitem(self, item, otheritem, weight=1, operator=None):
    """
    Update the context vector of item by adding in the index vector of
    otheritem, multiplied by the scalar weight. If item is unknown, add it
    to the space. If otheritem is unknown, add only an index vector for it.
    :param item: str
    :param otheritem: str
    :param weight: float
    :param operator: permutation (list), passed through to addintoitem
    :return: None
    """
    if not self.contains(item):
        self.additem(item)
    if otheritem not in self.indexspace:
        self.indexspace[otheritem] = sparsevectors.newrandomvector(self.dimensionality, self.denseness)
    self.addintoitem(item, self.indexspace[otheritem], weight, operator)
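Concretely, assuming sparse vectors print as {index: value} dicts and normalise() scales to unit Euclidean length: with weight=2, an index vector {3: 1.0, 17: -1.0} normalises to {3: 0.707, 17: -0.707}, and sparse-adding that into a context vector {3: 1.0} yields {3: 2.414, 17: -1.414}.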
Example #7
def __init__(self,
             dimensionality=2000,
             window=3,
             sequencelabel=None,
             permutations=None):
    self.window = window
    self.changed = False
    self.dimensionality = dimensionality
    if sequencelabel is None:
        self.sequencelabel = sparsevectors.newrandomvector(
            dimensionality, dimensionality // 10)
        self.changed = True
    else:
        self.sequencelabel = sequencelabel
    # None default avoids the shared-mutable-default-argument pitfall.
    self.permutations = {} if permutations is None else permutations
    self.error = True
    self.debug = False
    self.monitor = False
Example #8
                                       str(seenw[something]) + "\n")
        wordstatsoutfile.flush()
        wordstatsoutfile.close()
    logger(
        "Computing ngram frequency-based weights with " + str(i) + " files " +
        str(file), monitor)
    e = xml.etree.ElementTree.parse(file).getroot()
    for b in e.iter("document"):
        string = str(b.text).replace("\n", "")
        words = word_tokenize(string)
        windows = [
            string[ii:ii + window] for ii in range(len(string) - window + 1)
        ]
        for sequence in windows:
            seen[sequence] += 1
            if seen[sequence] == 1:
                thisvector = stringspace.makevector(sequence)
                itemj = {}
                itemj["string"] = sequence
                itemj["indexvector"] = thisvector
                pickle.dump(itemj, ngramvectoroutfile)
        for word in words:
            seenw[word] += 1
            if seenw[word] == 1:
                itemj = {}
                itemj["string"] = sequence
                itemj["indexvector"] = sparsevectors.newrandomvector(
                    dimensionality, denseness)
                pickle.dump(itemj, wordvectoroutfile)
Example #9
def additem(self, item):
    self.indexspace[item] = sparsevectors.newrandomvector(self.dimensionality, self.denseness)
    self.globalfrequency[item] = 1
    self.bign += 1
Example #10
def doallthefiles(rangelimit=4000):
    filelist = {}
    seenfile = {}
    antal_frag = 0
    for ix in range(rangelimit):
        filelist[ix] = {}
        seenfile[ix] = True
        for cat in categories:
            fn = "{}{}.of_{:0>4d}.json.txt".format(path, cat, ix)
            try:
                os.stat(fn)
                filelist[ix][cat] = fn
            except OSError:
                # Missing category file: drop this index entirely and move on
                # (without the break, later categories would hit a KeyError
                # after the del).
                seenfile[ix] = None
                del filelist[ix]
                logger(
                    "index {} did not match up {} file: {}".format(
                        ix, cat, fn), error)
                break
    logger("antal filer: {}".format(len(filelist)), monitor)
    conditions = ["wp", "wd", "wn", "wdp", "wnp", "wnd", "wndp"]
    vocabulary = {}
    vocabulary_words = Counter()
    vocabulary_labels = Counter()
    vocabulary["wp"] = Counter()
    vocabulary["wd"] = Counter()
    vocabulary["wn"] = Counter()
    vocabulary["wnp"] = Counter()
    vocabulary["wnd"] = Counter()
    vocabulary["wdp"] = Counter()
    vocabulary["wndp"] = Counter()
    outfrag = {}
    for fileindex in filelist:
        if seenfile[fileindex]:
            zippy = mergefiles(filelist[fileindex][categories[0]],
                               filelist[fileindex][categories[1]],
                               filelist[fileindex][categories[2]],
                               filelist[fileindex][categories[3]])
            wp_f = open(
                '{}{}/new_{:0>4d}.txt'.format(outpath, "wp", fileindex), "w+")
            wd_f = open(
                '{}{}/new_{:0>4d}.txt'.format(outpath, "wd", fileindex), "w+")
            wn_f = open(
                '{}{}/new_{:0>4d}.txt'.format(outpath, "wn", fileindex), "w+")
            wnp_f = open(
                '{}{}/new_{:0>4d}.txt'.format(outpath, "wnp", fileindex), "w+")
            wnd_f = open(
                '{}{}/new_{:0>4d}.txt'.format(outpath, "wnd", fileindex), "w+")
            wdp_f = open(
                '{}{}/new_{:0>4d}.txt'.format(outpath, "wdp", fileindex), "w+")
            wndp_f = open(
                '{}{}/new_{:0>4d}.txt'.format(outpath, "wndp", fileindex),
                "w+")
            for fragment in zippy:
                antal_frag += 1
                for cc in conditions:
                    outfrag[cc] = []
                for oneitem in fragment:
                    vocabulary_words.update([oneitem[0]])
                    vocabulary_labels.update([oneitem[1]])
                    vocabulary_labels.update([oneitem[2]])
                    vocabulary_labels.update([oneitem[3]])
                    vocabulary["wp"].update(
                        [joinstring.join([oneitem[0], oneitem[1]])])
                    outfrag["wp"].append("".join([oneitem[0], oneitem[1]]))
                    vocabulary["wd"].update(
                        [joinstring.join([oneitem[0], oneitem[2]])])
                    outfrag["wd"].append("".join([oneitem[0], oneitem[2]]))
                    vocabulary["wn"].update(
                        [joinstring.join([oneitem[0], oneitem[3]])])
                    outfrag["wn"].append("".join([oneitem[0], oneitem[3]]))
                    vocabulary["wnp"].update([
                        joinstring.join([oneitem[0], oneitem[1], oneitem[2]])
                    ])
                    outfrag["wnp"].append("".join(
                        [oneitem[0], oneitem[1], oneitem[2]]))
                    vocabulary["wnd"].update([
                        joinstring.join([oneitem[0], oneitem[1], oneitem[3]])
                    ])
                    outfrag["wnd"].append("".join(
                        [oneitem[0], oneitem[1], oneitem[3]]))
                    vocabulary["wdp"].update([
                        joinstring.join([oneitem[0], oneitem[2], oneitem[3]])
                    ])
                    outfrag["wdp"].append("".join(
                        [oneitem[0], oneitem[2], oneitem[3]]))
                    vocabulary["wndp"].update([
                        joinstring.join(
                            [oneitem[0], oneitem[1], oneitem[2], oneitem[3]])
                    ])
                    outfrag["wndp"].append("".join(
                        [oneitem[0], oneitem[1], oneitem[2], oneitem[3]]))
                wp_f.write(" ".join(outfrag["wp"]) + "\n")
                wd_f.write(" ".join(outfrag["wd"]) + "\n")
                wn_f.write(" ".join(outfrag["wn"]) + "\n")
                wnp_f.write(" ".join(outfrag["wnp"]) + "\n")
                wnd_f.write(" ".join(outfrag["wnd"]) + "\n")
                wdp_f.write(" ".join(outfrag["wdp"]) + "\n")
                wndp_f.write(" ".join(outfrag["wndp"]) + "\n")
            wn_f.close()
            wd_f.close()
            wp_f.close()
            wnd_f.close()
            wnp_f.close()
            wdp_f.close()
            wndp_f.close()

    logger("antal fragment: {}".format(antal_frag), monitor)
    vocab_words = {w for w, c in vocabulary_words.items() if c >= MINCOUNT}
    size_vocab = len(vocab_words)
    logger("antal ord std: {}".format(size_vocab), monitor)
    embeddings = {}
    for w in vocab_words:
        embeddings[w] = sparsevectors.newrandomvector(dimensionality, density)

    vocab_labels = {w for w, c in vocabulary_labels.items() if c >= MINCOUNT}
    size_vocab = len(vocab_labels)
    logger("antal tag tot: {}".format(size_vocab), monitor)
    labelembeddings = {}
    for w in vocab_labels:
        try:
            labelembeddings[w] = sparsevectors.newrandomvector(
                dimensionality, labeldensity)
        except IndexError:
            logger("Indexerror: {}".format(w), error)
    for cc in conditions:
        vocab_words = {w for w, c in vocabulary[cc].items() if c >= MINCOUNT}
        size_vocab = len(vocab_words)
        compositeembeddings = {}
        logger("antal ord i {}: {}".format(cc, size_vocab), monitor)
        with open('{}{}/vocab.words.txt'.format(outpath, cc), "w+") as f:
            for wdl in sorted(list(vocab_words)):
                wd = "".join(wdl.split(joinstring))
                f.write('{}\n'.format(wd))
                vv = embeddings[wdl.split(joinstring)[0]]
                for ll in wdl.split(joinstring)[1:]:
                    vv = sparsevectors.sparseadd(vv, labelembeddings[ll])
                compositeembeddings[wd] = sparsevectors.listify(
                    sparsevectors.normalise(vv), dimensionality)
        with open('{}{}/compositevectors.txt'.format(outpath, cc), "w+") as f:
            for www in compositeembeddings:
                f.write("{} {}\n".format(
                    www, " ".join(map(str, compositeembeddings[www]))))
Example #11
def addconstant(self, item):
    self.additem(
        item,
        sparsevectors.newrandomvector(self.dimensionality,
                                      self.dimensionality // 10))
Example #12
path = "/home/jussi/aktuellt/2018.recfut/tf_ner/data/recfut/"
# read words file
if __name__ == '__main__':
    # Load vocab
    with open(path + "vocab.words.txt", "r+") as f:
        word_to_idx = {line.strip(): idx for idx, line in enumerate(f)}
    size_vocab = len(word_to_idx)

    print("antal ord {}".format(size_vocab))

    # Array of zeros
    embeddings = np.zeros((size_vocab, dimensionality))

    for word in word_to_idx:
        vector = sparsevectors.newrandomvector(dimensionality, density)
        word_idx = word_to_idx[word]
        embeddings[word_idx] = sparsevectors.listify(vector, dimensionality)

    np.savez_compressed(path + 'randomindex.npz', embeddings=embeddings)
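The compressed archive written above can be read back with np.load; the array is keyed by the name passed to np.savez_compressed:

with np.load(path + 'randomindex.npz') as data:
    embeddings = data['embeddings']    # shape (size_vocab, dimensionality)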

Example #13
def addconstant(self, item):
    self.constantcollection[item] = sparsevectors.newrandomvector(self.dimensionality, self.denseness)
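None of the examples on this page include the sparsevectors module itself. The following is a minimal sketch of what the helpers used above might look like, assuming vectors are {index: value} dicts and that the second argument of newrandomvector is the number of nonzero components (as the dimensionality // 10 calls suggest); it only mirrors the call shapes seen above and is not the real implementation.

import random

def newemptyvector(dimensionality):
    # An all-zero sparse vector is just an empty dict.
    return {}

def newrandomvector(dimensionality, denseness):
    # Ternary random index vector: `denseness` positions set to +1 or -1.
    nonzeros = random.sample(range(dimensionality), int(denseness))
    return {i: random.choice([1, -1]) for i in nonzeros}

def normalise(vector):
    # Scale to unit Euclidean length; leave the zero vector untouched.
    norm = sum(x * x for x in vector.values()) ** 0.5
    return {i: x / norm for i, x in vector.items()} if norm else dict(vector)

def sparseadd(one, other, weight=1):
    # Return one + weight * other without mutating either argument.
    out = dict(one)
    for i, x in other.items():
        out[i] = out.get(i, 0) + weight * x
    return out

def listify(vector, dimensionality):
    # Expand a sparse vector into a dense list of floats.
    dense = [0.0] * dimensionality
    for i, x in vector.items():
        dense[i] = float(x)
    return dense

v = newrandomvector(2000, 2000 // 10)    # as in Example #7's sequence label
print(len(v))                            # 200 nonzero components

context = sparseadd({3: 1.0}, normalise({3: 1.0, 17: -1.0}), weight=2)
print(context)                           # {3: 2.414..., 17: -1.414...}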