def fill_redis():
    """Bulk-load tokenised sentences and sentence links into Redis.

    Two tab-separated files are read:

    * ``sentences.csv`` -- ``id<TAB>lang<TAB>sentence`` per line.  Each
      sentence is tokenised with ``pyplus1.lang_parse``; when the parser
      yields any tokens they are stored comma-joined in the Redis hash
      named after the language, under the integer sentence id.
    * ``data/links.csv`` -- ``nlang<TAB>nid<TAB>tlang<TAB>tid`` per line.
      Each link is packed into one integer and added to the Redis set
      named ``"<first>-<second>"``, where the two language codes are put
      in sorted order.

    Raises:
        ValueError: if a line does not split into the expected number of
            fields, or an id field is not an integer.
    """
    redisdb = redis.Redis(REDIS_SERVER, REDIS_PORT)

    with open("sentences.csv") as senf:
        for line in senf:
            sen_id, lang, sen = line.strip().split('\t')
            tokens = pyplus1.lang_parse(lang, sen)
            # Sentences the parser produces nothing for are not stored.
            if tokens:
                redisdb.hset(lang, int(sen_id), ','.join(tokens))

    with open("data/links.csv") as linkf:
        for line in linkf:
            # Drop only the line terminator so empty leading fields are
            # not shifted.
            nlang, nid, tlang, tid = line.rstrip('\r\n').split('\t')
            # The ids arrive as text; they must be integers before any
            # bit arithmetic (str << int raises TypeError).
            nid = int(nid)
            tid = int(tid)
            # Pack the pair into one integer: the id belonging to the
            # first language of the set name goes in the low bits, the
            # other id is shifted up by 32.
            # NOTE(review): this assumes ids fit in 32 bits -- confirm.
            if nlang < tlang:
                # nlang sorts before tlang
                merged = nid | (tid << 32)
                setname = "%s-%s" % (nlang, tlang)
            else:
                merged = tid | (nid << 32)
                setname = "%s-%s" % (tlang, nlang)
            redisdb.sadd(setname, merged)
def add_sentence(self, lang, id, sentence):
    """Tokenise *sentence* and store the tokens in the Redis hash for *lang*.

    Args:
        lang: name of the Redis hash; also passed to
            ``pyplus1.lang_parse`` as the language.
        id: hash field under which the joined tokens are stored.
        sentence: text to tokenise; it is UTF-8 encoded before parsing.
    """
    words = pyplus1.lang_parse(lang, sentence.encode('utf-8'))
    # Use the same ',' separator as fill_redis() so every entry in the
    # per-language hash has one format, whichever code path wrote it.
    # NOTE(review): this previously joined with '.'; confirm that readers
    # of the hash split on ','.
    self.redisdb.hset(lang, id, ','.join(words))
def parse_text(self, text):
    """Tokenise every item of *text* and register each token.

    Each item yielded by iterating *text* is UTF-8 encoded and parsed with
    ``pyplus1.lang_parse`` using ``self.tlang`` (also UTF-8 encoded) as the
    language.  Every resulting token is passed to ``self.add_word``.

    Args:
        text: iterable of strings, handled one item at a time.
    """
    for raw_line in text:
        lang_bytes = self.tlang.encode('utf-8')
        line_bytes = raw_line.encode('utf-8')
        for token in pyplus1.lang_parse(lang_bytes, line_bytes):
            self.add_word(token)