def train(filename, parser):
    """Build a word -> {tag_id: weight} table for one corpus file and cache it.

    For every (tag_id_list, txt) pair yielded by parser(filename), each
    segmented word of txt gets 1 + log(term_frequency) added to the weight of
    every tag in tag_id_list (expanded with parent tags via PTAG).  The result
    is written with tofromfile to CACHE_PATH/<basename>; an existing cache
    file short-circuits the whole run.

    filename -- path to the corpus file to train on
    parser   -- callable yielding (tag_id_list, txt) pairs for that file
    """
    fname = basename(filename)
    cache_path = join(CACHE_PATH, fname)
    if exists(cache_path):
        # Already trained on this file; the cached table is authoritative.
        return
    word2tag_count = {}
    for tag_id_list, txt in parser(filename):
        if not txt.strip():
            continue
        tag_id_set = set(tag_id_list)
        if not tag_id_set:
            continue
        # Expand every tag with its ancestors so parent tags get credit too.
        # Iterate over a snapshot: mutating tag_id_set while looping is unsafe.
        for tid in tuple(tag_id_set):
            tag_id_set.update(PTAG.get(tid, ()))
        # Term frequency of each segmented word in this text; iterate the
        # segmenter directly instead of materializing a throwaway list.
        word2count = defaultdict(int)
        for word in seg_txt(utf8_ftoj(str(txt))):
            word2count[word] += 1
        for word, count in word2count.iteritems():
            # setdefault replaces the membership-test + assignment pattern.
            tag_weight = word2tag_count.setdefault(word, {})
            # 1 + log(tf): sub-linear term-frequency weighting; hoisted out of
            # the tag loop because it does not depend on the tag.
            weight = 1 + log(float(count))
            for tag_id in tag_id_set:
                tag_weight[tag_id] = tag_weight.get(tag_id, 0) + weight
    tofromfile.tofile(cache_path, word2tag_count)
def train(filename, parser):
    """Train per-word tag weights for one corpus file and cache the table.

    parser(filename) must yield (tag_id_list, txt) pairs; every segmented
    word of txt accumulates 1 + log(tf) for each tag (including parent tags
    looked up in PTAG).  The finished table is cached under CACHE_PATH and an
    existing cache file makes this a no-op.
    """
    cache_path = join(CACHE_PATH, basename(filename))
    if exists(cache_path):
        return
    word2tag_count = {}
    for tag_id_list, txt in parser(filename):
        if not txt.strip():
            continue
        tag_id_set = set(tag_id_list)
        if not tag_id_set:
            continue
        # Pull in ancestor tags; iterate a tuple copy while mutating the set.
        for tid in tuple(tag_id_set):
            tag_id_set.update(PTAG.get(tid, ()))
        # Count occurrences of each segmented word in this text.
        word2count = defaultdict(int)
        word_list = list(seg_txt(utf8_ftoj(str(txt))))
        for word in word_list:
            word2count[word] += 1
        for word, freq in word2count.iteritems():
            if word not in word2tag_count:
                word2tag_count[word] = {}
            counts = word2tag_count[word]
            for tag in tag_id_set:
                if tag not in counts:
                    counts[tag] = 0
                # Sub-linear term-frequency weight.
                counts[tag] += 1 + log(float(freq))
    tofromfile.tofile(cache_path, word2tag_count)
def main():
    """Merge every available per-corpus .idf file into one Df and dump it.

    The merged document-frequency table is written to ZDATA_PATH/data/idf
    via idf_dumps; missing .idf inputs are silently skipped.
    """
    df = Df()

    def merge(filename):
        # Skip corpora whose .idf file has not been generated yet.
        path = join(ZDATA_PATH_TRAIN_IDF, "%s.idf" % filename)
        if not exists(path):
            return
        print(path)
        df.extend_by_file(path)

    merge("zhihu.js")
    merge("review.txt")
    for periodical in glob(join(ZDATA_PATH_TRAIN_IDF, "wanfang", "Periodical_*")):
        merge(periodical)
    PATH = join(ZDATA_PATH, "data")
    if not exists(PATH):
        makedirs(PATH)
    tofile(join(PATH, "idf"), idf_dumps(df._count, df._df))
def main():
    """Collect all per-corpus .idf files into a single Df and write it out.

    Output goes to ZDATA_PATH/data/idf (directory created on demand);
    inputs that do not exist yet are skipped without error.
    """
    df = Df()

    def merge(filename):
        idf_path = join(ZDATA_PATH_TRAIN_IDF, "%s.idf" % filename)
        if exists(idf_path):
            print(idf_path)
            df.extend_by_file(idf_path)

    for name in ("zhihu.js", "review.txt"):
        merge(name)
    for wanfang_file in glob(join(ZDATA_PATH_TRAIN_IDF, "wanfang", "Periodical_*")):
        merge(wanfang_file)
    PATH = join(ZDATA_PATH, "data")
    if not exists(PATH):
        makedirs(PATH)
    tofile(
        join(PATH, "idf"),
        idf_dumps(df._count, df._df),
    )
def tofile(self, path):
    """Serialize this object's backing dict to *path*.

    Delegates the actual writing to the module-level tofile() helper.
    """
    payload = self._dict
    tofile(path, payload)
def tofile(self):
    """Dump the tag/word vocabularies and the word-id -> tag-id map to DATA_DIR."""
    # Materialize the generator first so a failure leaves nothing half-written.
    mapping = list(self.txt_tag_generator())
    out_dir = DATA_DIR
    self.tag2id.tofile(join(out_dir, "tag2id"))
    self.word2id.tofile(join(out_dir, "word2id"))
    tofile(join(out_dir, "word_id2tag_id"), mapping)
def tofile(self, f):
    """Serialize (document count, df table) to *f*.

    The frequency mapping is copied into a plain dict before being handed to
    the module-level tofile() helper.
    """
    snapshot = dict(self._df.iteritems())
    tofile(f, (self._count, snapshot))
def tofile(self):
    """Write tag2id, word2id and the word_id -> tag_id list under DATA_DIR."""
    word_id2tag_id = list(self.txt_tag_generator())
    # Both vocabularies serialize themselves next to the mapping file.
    for name, vocab in (('tag2id', self.tag2id), ('word2id', self.word2id)):
        vocab.tofile(join(DATA_DIR, name))
    tofile(join(DATA_DIR, 'word_id2tag_id'), word_id2tag_id)
def tofile(self, f):
    """Write this object's count and frequency table to file *f*."""
    # Pack count plus a plain-dict copy of the frequency table.
    payload = (self._count, dict(self._df.iteritems()))
    tofile(f, payload)