import sys from fifo import Fifo from lru import LRU from opt import Opt def read_file(file_name): with open(file_name, "r") as f: result = map(lambda line: int(line.replace("\n", "")), f.readlines()) return result if __name__ == "__main__": args = sys.argv slots = args[1] file_name = args[2] entries = list(read_file(file_name)) fifo = Fifo(int(slots)) lru = LRU(int(slots)) opt = Opt(int(slots), entries) for index, entrie in enumerate(entries): fifo.add(entrie) lru.add(entrie) opt.add(entrie, index) print( "{} quadros, {} refs: FIFO: {} PFs, LRU: {} PFs, OPT: {} PFs" .format(slots, len(entries), fifo.falts, lru.falts, opt.falts))
class Synonyms(object): def __init__(self, jieba_path, lru_cap): self.psg = Jieba(jieba_path) self.cache = LRU(lru_cap) def load_vecs(self, load_fn, w2vFile): """ params: load_fn | function - must return dict && 2 dims numpy.array """ self.word2idx, self.weights = load_fn(w2vFile) self.idx2word = {v: k for k, v in self.word2idx.items()} print(f"load vector completed, vec size - {self.weights.shape}") print("build kd tree") self.kdtree = KDTree(self.weights, leaf_size=10, metric="euclidean") self.vec_len = self.weights.shape[1] def _word2vec(self, word, ignore_type="random"): """ ignore_type | string - "zeros" or "random" """ if word in self.word2idx: return self.weights[self.word2idx[word]] else: return np.random.uniform(low=-2, high=2, size=self.vec_len) def _text2vec(self, text, ret_num=5): vectors = [] for w in text: c = [] if w in self.word2idx: c.append(self._word2vec(w)) for nw, _ in self.nearby(w, ret_num=ret_num): c.append(self._word2vec(nw)) r = np.average(c, axis=0) vectors.append(r) else: vectors.append( np.random.uniform(low=-2, high=2, size=self.vec_len)) return np.sum(vectors, axis=0) def nearby(self, word, ret_num=10, ignore_type="random"): word = word.strip() # cache rets = self.cache.search(word) if rets is not None: for ret in rets: yield ret vec = self._word2vec(word, ignore_type) [distances], [points] = self.kdtree.query(np.array([vec]), k=ret_num, return_distance=True) rets = [] for (x, y) in zip(points, distances): w = self.idx2word[x] if w == word: s = 1.0 else: s = cosine(vec, self.weights[x]) rets.append((w, min(s, 1.0))) rets = list(sorted(rets, key=lambda x: x[1], reverse=True)) self.cache.add(word, rets) for ret in rets: yield ret def similarity(self, s1, s2, top=5): if s1 == s2: return 1. s1 = self.psg.segment(s1) s2 = self.psg.segment(s2) assert len(s1) > 0 and len( s2) > 0, "The length of s1 and s2 should > 0." return cosine(self._text2vec(s1, top), self._text2vec(s2, top))