Пример #1
0
def buildDictionary(occurs, wordDictSizeMax=30000, refDictSizeMax=30000):
    """Assign integer ids to the highest-valued keys of ``occurs``.

    Keys are split into ``it.Reference`` instances and everything else. Each
    group is ordered by descending value and cut to its own size limit, then
    the surviving keys are numbered consecutively, references first.

    Args:
        occurs: Mapping from token to a sortable value (presumably an
            occurrence count -- confirm against the caller).
        wordDictSizeMax: Maximum number of non-reference keys to keep.
        refDictSizeMax: Maximum number of ``it.Reference`` keys to keep.

    Returns:
        A tuple ``(tokenMapper, refIds)`` where ``tokenMapper`` is a
        ``BiDict`` filled through ``insert(token, id)`` and ``refIds`` is the
        set of ids that were given to ``it.Reference`` tokens.
    """

    def byValueDesc(pairs):
        # sorted() is stable, so ties keep the iteration order of ``occurs``.
        return sorted(pairs, key=lambda pair: pair[1], reverse=True)

    refList = []
    fullList = []
    for key, value in occurs.items():
        bucket = refList if isinstance(key, it.Reference) else fullList
        bucket.append((key, value))

    fullSource = (
        byValueDesc(refList)[:refDictSizeMax]
        + byValueDesc(fullList)[:wordDictSizeMax]
    )

    refIds = set()
    tokenMapper = BiDict()
    # Numbering starts at 3; ids 0-2 are never handed out here
    # (presumably reserved for special tokens -- TODO confirm).
    nextTokenId = 3
    for tok, _ in fullSource:
        tokenId = tokenMapper.getFirst(tok)
        # Identity check: None is a singleton and must not be compared with ==.
        if tokenId is None:
            tokenId = nextTokenId
            tokenMapper.insert(tok, tokenId)
            nextTokenId += 1

        if isinstance(tok, it.Reference):
            refIds.add(tokenId)

    return (tokenMapper, refIds)
Пример #2
0
 def __init__(self, vec, getter):
     """Collect the values ``getter`` extracts from ``vec`` and register them.

     Elements of ``vec`` that are None, and elements for which ``getter``
     returns None, are skipped. Every remaining value is passed to
     ``self.handleWord``; a value that is a list is registered item by item.

     Args:
         vec: Iterable of objects to extract values from.
         getter: Callable applied once to each non-None element of ``vec``.
     """
     vals = []
     for o in vec:
         if o is None:
             continue
         # Call getter exactly once per element and reuse the result.
         extracted = getter(o)
         if extracted is not None:
             vals.append(extracted)

     self.samples = len(vals)
     self.map = BiDict()
     self.counterMap = {}
     ctr = 2  # 1 is unk, 0 is padding
     for v in vals:
         # A list value contributes each of its items as a separate word.
         words = v if isinstance(v, list) else [v]
         for w in words:
             ctr = self.handleWord(w, ctr)
Пример #3
0
    def restrictTo(self, limit):
        """Replace ``self.map`` with a renumbered map of the top ``limit`` entries.

        Entries of ``self.counterMap`` are ranked by descending value. For
        each of the first ``limit`` keys, the token returned by
        ``self.map.getSecond(key)`` is inserted into a fresh ``BiDict`` with
        consecutive numbers starting at 2.

        NOTE(review): ``self.counterMap`` is not modified here, so its keys
        presumably still refer to the previous numbering -- confirm that this
        is intended before calling the method more than once.

        Args:
            limit: Number of highest-valued entries to keep (used as a slice
                bound).
        """
        ranked = sorted(
            self.counterMap.items(), key=lambda entry: entry[1], reverse=True
        )

        shrunk = BiDict()
        for fresh_id, (old_key, _) in enumerate(ranked[:limit], start=2):
            shrunk.insert(self.map.getSecond(old_key), fresh_id)
        self.map = shrunk