Example #1
def gen_samples(ulog_d, ulog_diff, prefix, outpath):
    logger.info("generate posi & neg samples for myrec...")
    if '.json' in ulog_d:
        dlog = util.loadjson(ulog_d)
        difflog = util.loadjson(ulog_diff)
    else:
        dlog = util.load2dic(ulog_d)
        difflog = util.load2dic(ulog_diff)
    posisam = []
    negsam = []
    logger.info("gen posi samples...")
    for k, fns in dlog.items():
        if fns:
            for fn in fns:
                posisam.append("%s+%s\t%d" % (k, fn.lower(), 1))
    print(len(posisam))
    util.list2txt(posisam, outpath + '/' + prefix + '_posi.txt')
    del dlog
    del posisam
    logger.info("gen neg samples...")
    for k, fns in difflog.items():
        if fns:
            for fn in fns:
                negsam.append("%s+%s\t%d" % (k, fn.lower(), 0))
    print(len(negsam))
    util.list2txt(negsam, outpath + '/' + prefix + '_neg.txt')
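The emitted lines have the form uid+fn<TAB>label. A minimal, self-contained sketch of parsing one such line back apart (the line content below is hypothetical):

# Parse one sample line of the form "uid+fn\tlabel" (hypothetical content).
line = "user42+report.pdf\t1"
pair, label = line.strip().split("\t")
uid, fn = pair.split("+", 1)  # split on the first '+' only
print(uid, fn, int(label))    # user42 report.pdf 1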
Example #2
def seg4file_book(infile, respath):
    '''
    Segment an article or novel: a line may hold several sentences, and one
    file is one whole novel / full text.
    This method was originally written for a series of novels whose file
    format was rather unusual.
    :param infile: input file path
    :param respath: output directory for the segmentation result
    :return:
    '''
    alllines = []
    fsize = util.get_FileSize(infile)
    logger.info("doing %s  and its %.2fMB" %(infile,float(fsize)))
    with codecs.open(infile, 'rU', encoding=bianma, errors='replace') as f:
        for line in f:
            l = line.strip()
            if l:
                alllines.append(l)
    fulltext = ''.join(alllines)
    del alllines
    if fulltext:
        t2 = time.time()
        wordres = segword4onetext(fulltext)
        timecha = time.time() - t2
        logger.info("%s done. %.2f MB in total, cost %.1f secs" % (infile, fsize, timecha))
        logger.info(" --------   %.2f MB/sec --------    %.2fMB/min" % (
            float(fsize) / float(timecha), float(fsize) * 60 / float(timecha)))
        if wordres:
            filename = os.path.split(infile)[1]
            util.list2txt(wordres, os.path.join(respath, filename))
        else:
            raise Exception("Word Seg Res is None, Please Check Your Input File!!!")
Example #3
def seg4file_1line1sent(infile, respath):
    '''
    Segment a single file. No sentence splitting is involved here: each
    input line is assumed to be one sentence already, e.g. a line of
    keywords or simply one title per line.
    '''
    res = []
    t0 = time.time()
    cnt = 0
    fsize = util.get_FileSize(infile)
    logger.info("now doing file : %s and its size is %.2f MB" % (infile, fsize))
    with codecs.open(infile, 'rU', encoding=bianma, errors='replace') as f:
        for line in f:
            cnt += 1
            if cnt % 50000 == 0:  # roughly one minute per 50k lines
                logger.info("doing %s line: %d" % (infile, cnt))
            wseg = segword4oneline(line)
            if wseg:
                res.append(' '.join(wseg))
    timecha = time.time() - t0
    logger.info("file %s done. there are totaly %d lines %.2f MB cost %.1f secs" % (infile, cnt, fsize, timecha))
    logger.info(" --------   %.2f MB/sec --------    %.2fMB/min" % (
    float(fsize) / float(timecha), float(fsize) * 60 / float(timecha)))
    if res:
        filename = os.path.split(infile)[1]
        util.list2txt(res, os.path.join(respath, filename))
Example #4
def get_samplevec_gensimmodel(vecpath1,
                              vecpath2,
                              samplefile,
                              prefix,
                              respath='./',
                              stopcnt=100,
                              progress_per=10000):
    # Build sample vectors from the sample file: uid+fn ==> [uvec + fnvec]
    data, labels, realexamp = [], [], []
    logger.info('loading vecfile : %s' % vecpath1)
    # muser=Doc2Vec.load(usermodel)
    v_user = load_vec(vecpath1)
    logger.info('loading vecfile : %s' % vecpath2)
    v_file = load_vec(vecpath2)
    samples = util.load2list(samplefile)
    for cnt, exam in enumerate(samples):
        if cnt % progress_per == 0:
            print("getting example vecs : %d" % cnt)
        if stopcnt and stopcnt == cnt:
            break
        exam = exam.strip().split()
        label0 = exam[1]
        uid = '*dt_' + exam[0].split("+")[0]
        fn = '*dt_' + exam[0].split("+")[1]
        if uid in v_user and fn in v_file:
            uvec = list(v_user[uid])
            fvec = list(v_file[fn])
            sampvec = uvec + fvec  # concatenate user and file vectors
            realexamp.append(exam[0])
            data.append(sampvec)
            labels.append(label0)
    del v_file
    del v_user
    np.savetxt('%s/exampvecs_%s.txt' % (respath, prefix), np.array(data))
    util.list2txt(realexamp, '%s/realexamples_%s.txt' % (respath, prefix))
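The core of this function is just concatenating a user vector and a file vector into one sample vector; a self-contained numpy sketch of that step:

import numpy as np

uvec = np.array([0.1, 0.2, 0.3])        # toy user vector
fvec = np.array([0.4, 0.5, 0.6])        # toy file vector
sampvec = np.concatenate([uvec, fvec])  # same effect as list(uvec) + list(fvec)
print(sampvec.shape)                    # (6,)
# The matrix written by np.savetxt can be read back with np.loadtxt.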
Example #5
def get_userlist(path, logpath=None):
    # Fetch the user id list; returns a list
    if os.path.exists(path):
        return util.load2list(path)
    else:
        ul = util.load2list(logpath, get1column=0)
        util.list2txt(ul, path)
        return ul
Example #6
def filter_fns(inpath, outpath):
    inli = get_fnlist(inpath, '')
    res = [fn.lower() for fn in inli]
    a = list(set(res))
    util.list2txt(a, outpath)
    print(len(a))
    return a
Example #7
def get_fnlist(path, logpath):
    # Fetch the file-name list; returns a list
    if os.path.exists(path):
        return util.load2list(path)
    else:
        ul = util.load2list(logpath, to1column=True, start=1)
        res = list(set(ul))
        util.list2txt(res, path)
        return res
Example #8
def get_highquality_ulog(inpath, outpath, actmin=2, actmax=300):
    # Keep high-quality user histories: more than actmin and fewer than actmax actions (too many actions likely means a crawler)
    oldulog = util.load2list(inpath)
    newulog = []
    for l in oldulog:
        ws = l.strip().split()[1:]  # the first token of each line is the user id
        if actmax > len(ws) > actmin:
            newulog.append(l)
    util.list2txt(newulog, outpath)
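A quick self-contained check of the filtering rule on toy log lines (contents hypothetical):

actmin, actmax = 2, 300
lines = [
    "u1 a b",                                      # 2 actions: dropped (needs > actmin)
    "u2 a b c",                                    # 3 actions: kept
    "u3 " + " ".join(str(i) for i in range(400)),  # 400 actions: dropped, likely a crawler
]
kept = [l for l in lines if actmax > len(l.strip().split()[1:]) > actmin]
print(len(kept))  # 1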
Example #9
def seg4file_1line1text(infile, resprefix='', to1line=False, hastitle=False, spliter=None):
    '''
    Segment a single file, with sentence splitting. Each input line is
    assumed to be one complete piece of text, e.g. a full article or an
    abstract (unlike a novel, where the whole file is the text).
    Several modes are supported:
    1: one line in, one line out, i.e. one text segmented onto a single line (to1line=True, hastitle=False)
    2 (default): one line in, many lines out, i.e. the text is first split into sentences, one per line, then each sentence is segmented (to1line=False, hastitle=False)
    3: the first column is a title, the rest is text; the text is segmented onto one line (like mode 1), output looks like "title [segmented text]" (to1line=True, hastitle=True)
    4: the first column is a title, the rest is text; the text is split onto many lines (like mode 2), and every output line is prefixed with the title (to1line=False, hastitle=True)
    Results are written to a 'segres' folder under the input file's directory.
    Segmentation speed: about 8.5 MB/min on a single core (XEON).
    :param infile: input file path
    :return:
    '''
    alllines = []
    fsize = util.get_FileSize(infile)
    mode01 = 'l1' if to1line else 'l0'
    mode02 = 't1' if hastitle else 't0'
    mode = mode01 + mode02
    logger.info("doing %s  and its %.2fMB and split mode is %s" %(infile,float(fsize),mode))
    t2 = time.time()
    cnt = -1
    with codecs.open(infile, 'r', encoding=bianma, errors='replace') as f:
        for line in f:
            cnt += 1
            if cnt % 50000 == 0:
                print("processed line %d" % cnt)
            l = line.strip()
            if not l:
                continue
            title = ''
            if hastitle:
                title = l.split(spliter)[0]
                l = ''.join(l.split(spliter)[1:])
            if to1line:
                wseg = segword4oneline(l, convert=True)
                if wseg:
                    alllines.append(' '.join([title] + wseg))
            else:
                textseg = segword4onetext(l)
                for ts in textseg:
                    alllines.append("%s %s" % (title, ts))
    timecha = time.time() - t2
    logger.info("%s done. totaly  %.2f MB cost %.1f secs" % (infile, fsize, timecha))
    logger.info(" --------   %.2f MB/sec --------    %.2fMB/min" % (
        float(fsize) / float(timecha), float(fsize) * 60 / float(timecha)))
    if alllines:
        segresfolder = os.path.join(os.path.split(infile)[0], 'segres')
        filename = os.path.splitext(os.path.split(infile)[1])[0]
        if not os.path.exists(segresfolder):
            os.mkdir(segresfolder)
        util.list2txt(alllines, os.path.join(segresfolder, filename + resprefix + '.txt'))
    else:
        raise Exception("Word Seg Res is None, Please Check Your Input File!!!")
Example #10
def getfiledtop(cnter, filedfile, top=50):
    '''
    Rank the words from filedfile by their frequency in cnter.
    :param cnter: counter of all words
    :type cnter: Counter
    :param filedfile: file with one word per line
    :param top: number of top words to keep
    :return: list of "word count" strings
    '''
    worddic = {}
    inwords = util.load2list(filedfile)
    for i in inwords:
        if i in cnter:  # dict.has_key() was removed in Python 3
            worddic[i] = cnter[i]
    newcnter = Counter(worddic)
    top = min(len(newcnter), top)
    topnwords = ["%s %d" % (i, c) for (i, c) in newcnter.most_common(top)]
    respath = "%s_top%d.txt" % (os.path.splitext(filedfile)[0], top)
    util.list2txt(topnwords, respath)
    return topnwords
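The ranking step relies on Counter.most_common, which returns (word, count) pairs sorted by count; a toy check:

from collections import Counter

cnter = Counter({'alpha': 5, 'beta': 3, 'gamma': 1})
inwords = ['beta', 'gamma', 'delta']  # 'delta' is absent from the counter
sub = Counter({w: cnter[w] for w in inwords if w in cnter})
print(sub.most_common(2))  # [('beta', 3), ('gamma', 1)]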
Example #11
def get_intersec_log(user_interseclist,
                     alllog_b,
                     alllog_d,
                     prefix,
                     rootpath=datapath):
    '''
    Find the users shared by the d-log and the b-log, then extract and
    separately save those users' d-log, b-log, and b-minus-d log.
    :param user_interseclist: path of the cached intersection user list
    :param alllog_b: path of the full b-log
    :param alllog_d: path of the full d-log
    :param prefix: prefix for the output files
    :return:
    '''
    blog = util.load2dic(alllog_b)
    # dlog=util.loadjson(alllog_d)
    dlog = util.load2dic(alllog_d)
    userb = blog.keys()
    userd = dlog.keys()
    if not os.path.exists(user_interseclist):
        logger.info("caculating two logs` intersection user...")
        uintersec = list(set(userb).intersection(set(userd)))
        util.list2txt(uintersec, user_interseclist)
    else:
        logger.info("loading two logs` intersection user file : %s" %
                    user_interseclist)
        uintersec = util.load2list(user_interseclist)
    interseced_d = get_sub_dic(dlog, uintersec)
    interseced_b = get_sub_dic(blog, uintersec)
    del dlog
    del blog
    # interseced_dbdiff = get_dic_diff(interseced_b, interseced_d)
    logger.info("saving ress...")
    util.savejson("%s/%s_posi.json" % (rootpath, prefix), interseced_d)
    util.savejson("%s/%s_neg.json" % (rootpath, prefix), interseced_b)
    # util.savejson("%s/%s_dbdiff.json" %(rootpath,prefix), interseced_dbdiff)
    logger.info("done!")
Example #12
def get_w_v_bycode(vecfilepath, dic_code_kws, respath):
    '''
    For each code in dic_code_kws, collect its keywords' normalized word
    vectors and save the words and vectors under respath, grouped by code.
    :param vecfilepath:
    :type vecfilepath: str
    :param dic_code_kws:
    :type dic_code_kws: dict
    :return:
    :rtype:
    '''
    vect = load_vec(vecfilepath)
    vect.init_sims()  # precompute unit-length vectors (pre-4.0 gensim API)
    for k in dic_code_kws.keys():
        print("for code %s" % k)
        if '_' in k:
            basepath = respath + '/' + k.split('_')[0]
        else:
            basepath = respath + '/others'
        if not os.path.exists(basepath):
            os.mkdir(basepath)
        resfileword = basepath + '/words_' + k + '.txt'
        resfilevec = basepath + '/vecs_' + k + '.txt'
        if os.path.exists(resfileword) and os.path.exists(resfilevec):
            print("file %s already exists,skip code %s..." % (resfileword, k))
            continue
        curkws = dic_code_kws[k]
        if len(curkws) > 100:  # only process codes with more than 100 keywords
            words = []
            vecs = []
            curkws_uniq = list(set(curkws))  # de-duplicate
            for w in curkws_uniq:
                if w in vect:
                    words.append(w)
                    vec_norm = vect.vectors_norm[vect.vocab[w].index]  # pre-4.0 gensim lookup
                    vecs.append(vec_norm)
            if words:
                print("saving data for code %s get res %d" % (k, len(words)))
                util.list2txt(words, resfileword)
                np.savetxt(resfilevec, np.array(vecs))
    print("get words & vecs by code done!")
Example #13
def mergefns(path1, path2, respath):
    la = util.load2list(path1)
    lb = util.load2list(path2)
    res = list(set(la).union(set(lb)))
    util.list2txt(res, respath)