import codecs
import os
import time
from collections import Counter

import numpy as np

# Project-local helpers (util, logger, bianma, datapath, load_vec,
# get_sub_dic, segword4oneline, segword4onetext) are assumed to be
# defined or imported elsewhere in this module.


def gen_samples(ulog_d, ulog_diff, prefix, outpath):
    # Generate positive & negative samples for myrec: the d-log supplies
    # positive pairs (label 1), the diff-log negative pairs (label 0).
    # Each output line has the form "uid+filename\t<label>".
    logger.info("generate posi & neg samples for myrec...")
    if '.json' in ulog_d:
        dlog = util.loadjson(ulog_d)
        difflog = util.loadjson(ulog_diff)
    else:
        dlog = util.load2dic(ulog_d)
        difflog = util.load2dic(ulog_diff)
    posisam = []
    negsam = []
    logger.info("gen posi samples...")
    for k in dlog.keys():
        fns = dlog[k]
        if fns:
            for fn in fns:
                posisam.append("%s+%s\t%d" % (k, fn.lower(), 1))
    print(len(posisam))
    util.list2txt(posisam, outpath + '/' + prefix + '_posi.txt')
    del dlog
    del posisam
    logger.info("gen neg samples...")
    for k in difflog.keys():
        fns = difflog[k]
        if fns:
            for fn in fns:
                negsam.append("%s+%s\t%d" % (k, fn.lower(), 0))
    print(len(negsam))
    util.list2txt(negsam, outpath + '/' + prefix + '_neg.txt')

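def _demo_gen_samples():
    # Hypothetical usage sketch -- the paths and prefix below are assumptions,
    # not real data. Writes ./samples/myrec_posi.txt (label 1) and
    # ./samples/myrec_neg.txt (label 0).
    gen_samples('data/dlog.json', 'data/dbdiff.json', 'myrec', './samples')
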
def seg4file_book(infile, respath):
    '''
    Segment words in an article or novel: multiple sentences per line, one
    file being one complete novel or full text. This method was originally
    written for a series of novels with rather peculiar formatting.
    :param infile:
    :param respath:
    :return:
    '''
    alllines = []
    fsize = util.get_FileSize(infile)
    logger.info("doing %s and it is %.2f MB" % (infile, float(fsize)))
    with codecs.open(infile, 'rU', encoding=bianma, errors='replace') as f:
        for line in f:
            l = line.strip()
            if l:
                alllines.append(l)
    fulltext = ''.join(alllines)
    del alllines
    if fulltext:
        t2 = time.time()
        wordres = segword4onetext(fulltext)
        timecha = time.time() - t2
        logger.info("%s done. totally %.2f MB cost %.1f secs" % (infile, fsize, timecha))
        logger.info(" -------- %.2f MB/sec -------- %.2f MB/min" % (
            float(fsize) / float(timecha), float(fsize) * 60 / float(timecha)))
        if wordres:
            filename = os.path.split(infile)[1]
            util.list2txt(wordres, os.path.join(respath, filename))
        else:
            raise Exception("Word Seg Res is None, Please Check Your Input File!!!")

def seg4file_1line1sent(infile, respath):
    '''
    Segment words in a single file. No sentence splitting is involved: each
    input line is assumed to already be one sentence, e.g. one line of
    keywords, or simply one title per line.
    '''
    res = []
    t0 = time.time()
    cnt = 0
    fsize = util.get_FileSize(infile)
    logger.info("now doing file : %s and its size is %.2f MB" % (infile, fsize))
    with codecs.open(infile, 'rU', encoding=bianma, errors='replace') as f:
        for line in f:
            cnt += 1
            if cnt % 50000 == 0:  # roughly one minute of work per 50k lines
                logger.info("doing %s line: %d" % (infile, cnt))
            wseg = segword4oneline(line)
            if wseg:
                res.append(' '.join(wseg))
    timecha = time.time() - t0
    logger.info("file %s done. there are totally %d lines, %.2f MB, cost %.1f secs" % (infile, cnt, fsize, timecha))
    logger.info(" -------- %.2f MB/sec -------- %.2f MB/min" % (
        float(fsize) / float(timecha), float(fsize) * 60 / float(timecha)))
    if res:
        filename = os.path.split(infile)[1]
        util.list2txt(res, os.path.join(respath, filename))

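def _demo_seg4file():
    # Hypothetical usage sketch -- input paths are assumptions. seg4file_book
    # joins the whole file into one text before segmenting; the 1line1sent
    # variant segments each line as an already-split sentence.
    seg4file_book('corpus/novel1.txt', './segres_books')
    seg4file_1line1sent('corpus/titles.txt', './segres_titles')
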
def get_samplevec_gensimmodel(vecpath1, vecpath2, samplefile, prefix, respath='./', stopcnt=100, progress_per=10000):
    # Build the vector representation for each sample pair: uid+fn ==> [uvec + fnvec]
    data, labels, realexamp = [], [], []
    logger.info('loading vecfile : %s' % vecpath1)
    # muser = Doc2Vec.load(usermodel)
    v_user = load_vec(vecpath1)
    logger.info('loading vecfile : %s' % vecpath2)
    v_file = load_vec(vecpath2)
    samples = util.load2list(samplefile)
    for cnt, exam in enumerate(samples):
        if cnt % progress_per == 0:
            print("getting example vecs : %d" % cnt)
        if stopcnt and stopcnt == cnt:
            break
        exam = exam.strip().split()
        label0 = exam[1]
        uid = '*dt_' + exam[0].split("+")[0]
        fn = '*dt_' + exam[0].split("+")[1]
        if uid in v_user and fn in v_file:
            uvec = list(v_user[uid])
            fvec = list(v_file[fn])
            sampvec = uvec + fvec  # concatenate user vector and file vector
            realexamp.append(exam[0])
            data.append(sampvec)
            labels.append(label0)
    del v_file
    del v_user
    # NOTE: labels are collected but not written out; they can be recovered by
    # matching realexamples back against samplefile.
    np.savetxt('%s/exampvecs_%s.txt' % (respath, prefix), np.array(data))
    util.list2txt(realexamp, '%s/realexamples_%s.txt' % (respath, prefix))

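def _demo_get_samplevec():
    # Hypothetical usage sketch -- the vector files and sample file below are
    # assumptions. Each sample line "uid+fn\t<label>" becomes the concatenation
    # [user_vec + file_vec]; passing stopcnt=0 disables the early-stop cutoff.
    get_samplevec_gensimmodel('vec/users.vec', 'vec/files.vec',
                              'samples/myrec_posi.txt', 'posi',
                              respath='./vecs', stopcnt=0)
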
def get_userlist(path, logpath=None):
    # Get the user id list, returned as a list. If the cached file exists,
    # load it; otherwise build it from the first column of the log and cache it.
    if os.path.exists(path):
        return util.load2list(path)
    else:
        ul = util.load2list(logpath, get1column=0)
        util.list2txt(ul, path)
        return ul

def filter_fns(inpath, outpath):
    # Lowercase and de-duplicate the filename list.
    inli = get_fnlist(inpath, '')
    res = []
    for fn in inli:
        res.append(fn.lower())
    a = list(set(res))
    util.list2txt(a, outpath)
    print(len(a))
    return a

def get_fnlist(path, logpath):
    # Get the filename list, returned as a list. If the cached file exists,
    # load it; otherwise build it from the log (skipping the id column) and cache it.
    if os.path.exists(path):
        return util.load2list(path)
    else:
        ul = util.load2list(logpath, to1column=True, start=1)
        res = list(set(ul))
        util.list2txt(res, path)
        return res

def get_highquality_ulog(inpath, outpath, actmin=2, actmax=300):
    # Keep only high-quality user histories: action count > actmin and
    # < actmax (too many actions probably means a crawler).
    oldulog = util.load2list(inpath)
    newulog = []
    for l in oldulog:
        ws = l.strip().split()[1:]  # the first token of each line is the id
        if actmax > len(ws) > actmin:
            newulog.append(l)
    util.list2txt(newulog, outpath)

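def _demo_get_highquality_ulog():
    # Hypothetical usage sketch -- paths are assumptions. Keeps only users
    # with more than 2 and fewer than 300 actions; extremely active "users"
    # are treated as likely crawlers and dropped.
    get_highquality_ulog('data/ulog_all.txt', 'data/ulog_hq.txt', actmin=2, actmax=300)
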
def seg4file_1line1text(infile, resprefix='', to1line=False, hastitle=False, spliter=None):
    '''
    Segment words in a single file, with sentence splitting. Each input line
    is assumed to be one complete piece of text, e.g. a full document or an
    abstract (unlike a novel, where the whole file is the full text).
    Several segmentation modes are supported:
    1: one line in, one line out, i.e. one text is segmented onto a single
       line (to1line=True, hastitle=False)
    2 (default): one line in, multiple lines out, i.e. one text is first split
       into sentences (one per line) and each sentence is then segmented
       (to1line=False, hastitle=False)
    3: the first column of each input line is a title, the rest is text; the
       text is segmented onto one line (like mode 1), and the output looks
       like "title [segmented text]" (to1line=True, hastitle=True)
    4: the first column of each input line is a title, the rest is text; the
       text is segmented into multiple lines (like mode 2), and every output
       line is prefixed with the title (to1line=False, hastitle=True)
    Results are stored in the 'segres' folder under the input file's directory.
    Segmentation speed: about 8.5 MB/min on a single core (XEON).
    :param infile:
    :return:
    '''
    alllines = []
    fsize = util.get_FileSize(infile)
    mode01 = 'l1' if to1line else 'l0'
    mode02 = 't1' if hastitle else 't0'
    mode = mode01 + mode02
    logger.info("doing %s and it is %.2f MB and split mode is %s" % (infile, float(fsize), mode))
    t2 = time.time()
    cnt = -1
    with codecs.open(infile, 'rU', encoding=bianma, errors='replace') as f:
        for line in f:
            cnt += 1
            if cnt % 50000 == 0:
                print("processed line %d" % cnt)
            l = line.strip()
            if not l:
                continue
            title = ''
            if hastitle:
                title = l.split(spliter)[0]
                l = ''.join(l.split(spliter)[1:])
            if to1line:
                wseg = segword4oneline(l, convert=True)
                if wseg:
                    alllines.append(' '.join([title] + wseg))
            else:
                textseg = segword4onetext(l)
                for ts in textseg:
                    alllines.append("%s %s" % (title, ts))
    timecha = time.time() - t2
    logger.info("%s done. totally %.2f MB cost %.1f secs" % (infile, fsize, timecha))
    logger.info(" -------- %.2f MB/sec -------- %.2f MB/min" % (
        float(fsize) / float(timecha), float(fsize) * 60 / float(timecha)))
    if alllines:
        segresfolder = os.path.join(os.path.split(infile)[0], 'segres')
        filename = os.path.splitext(os.path.split(infile)[1])[0]
        if not os.path.exists(segresfolder):
            os.mkdir(segresfolder)
        util.list2txt(alllines, os.path.join(segresfolder, filename + resprefix + '.txt'))
    else:
        raise Exception("Word Seg Res is None, Please Check Your Input File!!!")

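def _demo_seg4file_1line1text():
    # Hypothetical usage sketch -- paths and the tab spliter are assumptions.
    # Mode 2 (default): split each line into sentences, one segmented sentence
    # per output line.
    seg4file_1line1text('corpus/abstracts.txt')
    # Mode 3: first column is a title, the remaining text is segmented onto
    # one line prefixed by that title.
    seg4file_1line1text('corpus/titled_docs.txt', resprefix='_t1l1',
                        to1line=True, hastitle=True, spliter='\t')
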
def getfiledtop(cnter, filedfile, top=50):
    '''
    Rank the words listed in filedfile by their frequency counts in cnter.
    :param cnter: counter of all words
    :type cnter: Counter
    :param filedfile:
    :param top:
    :return:
    '''
    worddic = {}
    inwords = util.load2list(filedfile)
    for i in inwords:
        if i in cnter:  # dict.has_key() was removed in Python 3
            worddic[i] = cnter[i]
    newcnter = Counter(worddic)
    top = min(len(newcnter), top)
    topnwords = ["%s %d" % (i, c) for (i, c) in newcnter.most_common(top)]
    respath = "%s_top%d.txt" % (os.path.splitext(filedfile)[0], top)
    util.list2txt(topnwords, respath)
    return topnwords

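def _demo_getfiledtop():
    # Hypothetical usage sketch -- the toy counter and the word-list file are
    # assumptions. Ranks the listed words by global frequency and writes e.g.
    # "fieldwords_top50.txt" next to the input list.
    allwords = Counter('a a a b b c'.split())
    getfiledtop(allwords, 'data/fieldwords.txt', top=50)
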
def get_intersec_log(user_interseclist, alllog_b, alllog_d, prefix, rootpath=datapath):
    '''
    Get the users shared by the d-log and the b-log, then extract and store
    those users' d-log, b-log, and b-minus-d log separately.
    :param user_interseclist:
    :type user_interseclist:
    :param alllog_b:
    :type alllog_b:
    :param alllog_d:
    :type alllog_d:
    :param prefix:
    :type prefix:
    :return:
    :rtype:
    '''
    blog = util.load2dic(alllog_b)
    # dlog = util.loadjson(alllog_d)
    dlog = util.load2dic(alllog_d)
    userb = blog.keys()
    userd = dlog.keys()
    if not os.path.exists(user_interseclist):
        logger.info("calculating two logs` intersection user...")
        uintersec = list(set(userb).intersection(set(userd)))
        util.list2txt(uintersec, user_interseclist)
    else:
        logger.info("loading two logs` intersection user file : %s" % user_interseclist)
        uintersec = util.load2list(user_interseclist)
    interseced_d = get_sub_dic(dlog, uintersec)
    interseced_b = get_sub_dic(blog, uintersec)
    del dlog
    del blog
    # interseced_dbdiff = get_dic_diff(interseced_b, interseced_d)
    logger.info("saving results...")
    util.savejson("%s/%s_posi.json" % (rootpath, prefix), interseced_d)
    util.savejson("%s/%s_neg.json" % (rootpath, prefix), interseced_b)
    # util.savejson("%s/%s_dbdiff.json" % (rootpath, prefix), interseced_dbdiff)
    logger.info("done!")

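def _demo_get_intersec_log():
    # Hypothetical usage sketch -- log paths are assumptions. Users present in
    # both logs are cached to the intersection file on the first run; their
    # d-log entries are saved as positives and their b-log entries as negatives.
    get_intersec_log('data/users_intersec.txt', 'data/blog_all.txt',
                     'data/dlog_all.txt', 'myrec')
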
def get_w_v_bycode(vecfilepath, dic_code_kws, respath):
    '''
    :param vecfilepath:
    :type vecfilepath: str
    :param dic_code_kws:
    :type dic_code_kws: dict
    :return:
    :rtype:
    '''
    vect = load_vec(vecfilepath)
    # Pre-compute L2-normalized vectors. init_sims()/vectors_norm/vocab are
    # the gensim < 4.0 KeyedVectors API; gensim 4.x replaced them with
    # get_vector(w, norm=True) and key_to_index.
    vect.init_sims()
    for k in dic_code_kws.keys():
        print("for code %s" % k)
        if '_' in k:
            basepath = respath + '/' + k.split('_')[0]
        else:
            basepath = respath + '/others'
        if not os.path.exists(basepath):
            os.mkdir(basepath)
        resfileword = basepath + '/words_' + k + '.txt'
        resfilevec = basepath + '/vecs_' + k + '.txt'
        if os.path.exists(resfileword) and os.path.exists(resfilevec):
            print("file %s already exists, skipping code %s..." % (resfileword, k))
            continue
        curkws = dic_code_kws[k]
        if len(curkws) > 100:
            words = []
            vecs = []
            curkws_uniq = list(set(curkws))  # de-duplicate
            for w in curkws_uniq:
                if w in vect:
                    words.append(w)
                    vec_norm = vect.vectors_norm[vect.vocab[w].index]
                    vecs.append(vec_norm)
            if words:
                print("saving data for code %s got %d results" % (k, len(words)))
                util.list2txt(words, resfileword)
                np.savetxt(resfilevec, np.array(vecs))
    print("get words & vecs by code done!")

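def _demo_get_w_v_bycode():
    # Hypothetical usage sketch -- the vector file and the code->keywords
    # mapping are assumptions. Codes with more than 100 keywords produce a
    # words_<code>.txt and a vecs_<code>.txt (L2-normalized vectors) under
    # respath/<code prefix>/.
    code_kws = {'TP_391': ['keyword%d' % i for i in range(150)]}
    get_w_v_bycode('vec/words.vec', code_kws, './bycode')
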
def mergefns(path1, path2, respath):
    # Merge two filename lists into their de-duplicated union.
    la = util.load2list(path1)
    lb = util.load2list(path2)
    res = list(set(la).union(set(lb)))
    util.list2txt(res, respath)