Exemplo n.º 1
0
def get_userlist(path, logpath=None):
    """Return the list of user ids, caching it at *path*.

    If *path* exists it is loaded directly; otherwise the first column of
    *logpath* is extracted, written to *path*, and returned.
    """
    if not os.path.exists(path):
        users = util.load2list(logpath, get1column=0)
        util.list2txt(users, path)
        return users
    return util.load2list(path)
Exemplo n.º 2
0
def get_fnlist(path, logpath):
    """Return the list of file names, caching the deduplicated result at *path*."""
    if os.path.exists(path):
        return util.load2list(path)
    raw = util.load2list(logpath, to1column=True, start=1)
    uniq = list(set(raw))  # dedupe; order is not preserved
    util.list2txt(uniq, path)
    return uniq
Exemplo n.º 3
0
def get_samplevec_gensimmodel(vecpath1,
                              vecpath2,
                              samplefile,
                              prefix,
                              respath='./',
                              stopcnt=100,
                              progress_per=10000):
    """Build a concatenated [user-vec + file-vec] row for each sample.

    Each line of *samplefile* is "uid+fn label"; a sample is kept only when
    both its uid and fn (each prefixed with '*dt_') are present in the two
    vector stores.  The vectors go to ``exampvecs_<prefix>.txt`` and the
    kept sample ids to ``realexamples_<prefix>.txt`` under *respath*.
    Processing stops after *stopcnt* lines when *stopcnt* is truthy.

    NOTE(review): labels are collected but never persisted — confirm
    whether that is intentional.
    """
    vectors, labels, kept_ids = [], [], []
    logger.info('loading vecfile : %s' % vecpath1)
    users = load_vec(vecpath1)
    logger.info('loading vecfile : %s' % vecpath2)
    files = load_vec(vecpath2)
    for idx, line in enumerate(util.load2list(samplefile)):
        if idx % progress_per == 0:
            print("getting example vecs : %d" % idx)
        if stopcnt and stopcnt == idx:
            break
        fields = line.strip().split()
        pair, label = fields[0], fields[1]
        uid = '*dt_' + pair.split("+")[0]
        fn = '*dt_' + pair.split("+")[1]
        if uid in users and fn in files:
            kept_ids.append(pair)
            vectors.append(list(users[uid]) + list(files[fn]))  # concatenation
            labels.append(label)
    del files
    del users
    np.savetxt('%s/exampvecs_%s.txt' % (respath, prefix), np.array(vectors))
    util.list2txt(kept_ids, '%s/realexamples_%s.txt' % (respath, prefix))
Exemplo n.º 4
0
def data2csv():
    """Convert the whitespace-separated fn-feature text file to CSV.

    Each valid line of the input has 14 fields; the first 12 (fn id plus
    11 feature columns) are kept and written to fn18_5w_features.csv.
    Non-string entries and lines with a wrong field count are skipped.
    """
    fnfeatpath = './data/highq_5w/fn18_5w_features.txt'
    fnfeas = uc.load2list(fnfeatpath)
    # BUG FIX: the original unpacked 13 empty-list literals into 12 names,
    # which raises ValueError before any work is done.  Build the columns
    # from a single name list instead so the counts cannot drift apart.
    cols = ['fns', 'cites', 'cites_w', 'authcodes', 'fundcodes',
            'jigoucodes', 'productcodes', 'dates', 'pages', 'downs',
            'citeds', 'ifs']
    table = {c: [] for c in cols}
    for line in fnfeas:
        if isinstance(line, str):
            fields = line.split()
            # Keep only complete 14-field records; the last two fields
            # are ignored, matching the original column selection.
            if len(fields) == 14:
                for col, val in zip(cols, fields):
                    table[col].append(val)
    exs = pd.DataFrame(table)
    exs.to_csv('./data/highq_5w/fn18_5w_features.csv')
Exemplo n.º 5
0
def get_highquality_ulog(inpath, outpath, actmin=2, actmax=300):
    """Keep only "high quality" user histories.

    A line is kept when its action count (tokens after the leading user id)
    is strictly between *actmin* and *actmax* — too many actions likely
    means a crawler.
    """
    kept = []
    for line in util.load2list(inpath):
        actions = line.strip().split()[1:]  # first token is the user id
        if actmin < len(actions) < actmax:
            kept.append(line)
    util.list2txt(kept, outpath)
Exemplo n.º 6
0
def getfiledtop(cnter, filedfile, top=50):
    '''
    Rank the words of *filedfile* by their frequency in *cnter*.

    :param cnter: counter of all words
    :type cnter: Counter
    :param filedfile: path to a file with one word per line
    :param top: maximum number of top words to return
    :return: list of "word count" strings, also written to
             <filedfile>_top<N>.txt
    '''
    worddic = {}
    inwords = util.load2list(filedfile)
    for w in inwords:
        # BUG FIX: dict.has_key() was removed in Python 3; use `in`.
        if w in cnter:
            worddic[w] = cnter[w]
    newcnter = Counter(worddic)
    top = min(len(newcnter), top)
    topnwords = ["%s %d" % (w, c) for (w, c) in newcnter.most_common(top)]
    respath = "%s_top%d.txt" % (os.path.splitext(filedfile)[0], top)
    util.list2txt(topnwords, respath)
    return topnwords
Exemplo n.º 7
0
def get_intersec_log(user_interseclist,
                     alllog_b,
                     alllog_d,
                     prefix,
                     rootpath=datapath):
    '''
    Intersect the user sets of the d and b logs, then save each group's
    sub-log: the d side as "<prefix>_posi.json" and the b side as
    "<prefix>_neg.json" under *rootpath*.

    :param user_interseclist: path of the cached intersection user list
                              (created here if it does not exist yet)
    :param alllog_b: path of the b log
    :param alllog_d: path of the d log
    :param prefix: file-name prefix for the saved results
    '''
    blog = util.load2dic(alllog_b)
    dlog = util.load2dic(alllog_d)
    if os.path.exists(user_interseclist):
        logger.info("loading two logs` intersection user file : %s" %
                    user_interseclist)
        shared_users = util.load2list(user_interseclist)
    else:
        logger.info("caculating two logs` intersection user...")
        shared_users = list(set(blog.keys()).intersection(set(dlog.keys())))
        util.list2txt(shared_users, user_interseclist)
    sub_d = get_sub_dic(dlog, shared_users)
    sub_b = get_sub_dic(blog, shared_users)
    # Free the full logs before serialising the (smaller) subsets.
    del dlog
    del blog
    logger.info("saving ress...")
    util.savejson("%s/%s_posi.json" % (rootpath, prefix), sub_d)
    util.savejson("%s/%s_neg.json" % (rootpath, prefix), sub_b)
    logger.info("done!")
Exemplo n.º 8
0
def get_vec_gensimmodel(vecpath1,
                        samplefile,
                        prefix,
                        respath='./',
                        stopcnt=100,
                        progress_per=10000):
    """Pickle a {sample_id: user-vector} mapping.

    For each id in *samplefile*, look up '*dt_<id>' in the vector store
    loaded from *vecpath1* and, if present, store its vector.  The result
    is dumped to <respath>/<prefix>.pkl.  *stopcnt* is currently unused
    (the early-stop is disabled).
    """
    resdata = {}
    logger.info('loading vecfile : %s' % vecpath1)
    vectors = load_vec(vecpath1)
    for cnt, raw in enumerate(util.load2list(samplefile)):
        if cnt % progress_per == 0:
            print("getting example vecs : %d" % cnt)
        examid = raw.strip()
        key = '*dt_' + examid
        if key in vectors:
            resdata[examid] = list(vectors[key])
    with open(respath + '/' + prefix + '.pkl', 'wb') as f:
        pickle.dump(resdata, f)
Exemplo n.º 9
0
def mergefns(path1, path2, respath):
    """Write the set-union of two list files to *respath* (order unspecified)."""
    merged = set(util.load2list(path1)) | set(util.load2list(path2))
    util.list2txt(list(merged), respath)
Exemplo n.º 10
0
def exam2traindata_(examples,
                    vecdic_users,
                    vecdic_fns,
                    fn_features,
                    process_per=50000,
                    earlystop=100,
                    resname='traindata_nofnvec_01.pkl',
                    encoding=True,
                    sample=''):
    """Build train/test feature matrices from "uid+fn+label" example lines.

    :param examples: path of the example file, one "uid+fn+label" per line
    :param vecdic_users: {uid: vector} dict, or path of its pickle
    :param vecdic_fns: unused here (fn vectors are stubbed with [0])
    :param fn_features: fn feature table — csv path or DataFrame
    :param process_per: progress-log interval during feature extraction
    :param earlystop: unused; kept for interface compatibility
    :param resname: pickle file name for the result; falsy skips saving
    :param encoding: if True, one-hot/multi-hot encode the fn features
    :param sample: resampling strategy: '' (none), 'up', 'down' or
                   'simpledown'
    :return: x_train, x_test, y_train, y_test
    """
    examps = uc.load2list(examples)
    if sample:
        # Split each example into X ("uid+fn") and Y (int label) so the
        # imblearn samplers can rebalance the classes.
        examps_X, examps_Y = [], []
        for i in examps:
            (uid, fn, label) = i.strip().split('+')
            examps_X.append('%s+%s' % (uid, fn))
            examps_Y.append(int(label))
        logger.info("raw example y:")
        logger.info(Counter(examps_Y))
        # BUG FIX: the original compared strings with `is`, which tests
        # object identity and only matches by accident of interning;
        # use `==` for value equality.
        if sample == 'up':
            ros = RandomOverSampler(random_state=0)
            X_resampled, y_resampled = ros.fit_sample(
                np.array(examps_X).reshape(-1, 1), examps_Y)
            print(X_resampled.shape)
            X_resampled = X_resampled.reshape(-1)
            print(X_resampled.shape)
        elif sample == 'down':
            rus = RandomUnderSampler(random_state=0, replacement=True)
            X_resampled, y_resampled = rus.fit_sample(
                np.array(examps_X).reshape(-1, 1), examps_Y)
            print(X_resampled.shape)
            X_resampled = X_resampled.reshape(-1)
            print(X_resampled.shape)
        elif sample == 'simpledown':
            # The first 641683 examples are positive, the rest negative;
            # keep only as many positives as there are negatives.
            X_resampled, y_resampled = examps_X[:287464] + examps_X[
                641683:], examps_Y[:287464] + examps_Y[641683:]
        else:
            logger.info('sample methord not found : %s' % sample)
            X_resampled, y_resampled = examps_X, examps_Y
        examps_X, examps_Y = X_resampled, y_resampled
        logger.info("after example y:")
        logger.info(Counter(examps_Y))
        # Re-join X and Y into "uid+fn+label" strings for extraction below.
        examps = ['%s+%s' % (x, str(y)) for x, y in zip(examps_X, examps_Y)]

    logger.info('generating features for model...')
    train_exs, test_exs = train_test_split(examps,
                                           test_size=0.3,
                                           random_state=2)

    # Load the user-vector dict and the fn feature table (path or object).
    vecdicu = uc.pickle_load(vecdic_users) if isinstance(vecdic_users,
                                                         str) else vecdic_users
    logger.info('loadding fn_features...')
    fnfeats = pd.read_csv(fn_features) if isinstance(fn_features,
                                                     str) else fn_features
    # Map fn id -> row index of its feature row.
    fn_indexdic = {}
    for index, fn in enumerate(list(fnfeats.fns)):
        fn_indexdic[fn] = index

    fe_nums4onehot = ['cites', 'cites_w', 'citeds', 'downs', 'pages', 'ifs']
    fe_strs4onehot = ['productcodes']
    fe_strs4mulhot = ['fundcodes']
    if encoding:
        logger.info('encoding onehot for number features')
        onehots_num, onehots_num_model = col2onehot_numbers(
            fnfeats[fe_nums4onehot])
        logger.info('encoding multihot for str features')
        mulhots_str, mulhots_model = col2multibinar(fnfeats[fe_strs4mulhot[0]])

    logger.info('training data split get traindata %d, testdata %d' %
                (len(train_exs), len(test_exs)))

    def examples2x_y(exampls, ifencoding=encoding):
        # Convert example strings to (X, Y).  Examples whose fn has no
        # feature row are silently skipped, so the output may be shorter
        # than the input.
        X, Y = [], []
        for index, ex in enumerate(exampls):
            if index % process_per == 0:
                logger.info('examples2x_y process %d' % index)
            (uid, fn, label) = ex.strip().split('+')
            if fn in fn_indexdic:
                fnindex = fn_indexdic[fn]
            else:
                continue
            uidvec = np.array(vecdicu[uid])
            fnvec = [0]  # fn vectors intentionally stubbed out
            if ifencoding:
                x = np.concatenate((uidvec, fnvec, onehots_num[fnindex],
                                    mulhots_str[fnindex]))
            else:
                feature_notencoded = list(fnfeats.iloc[fnindex][[
                    'fns', 'cites', 'cites_w', 'citeds', 'downs', 'pages',
                    'ifs'
                ]])
                # Drop the leading 'fns' id; keep only numeric features.
                x = np.concatenate((uidvec, fnvec, feature_notencoded[1:]))
            X.append(x)
            Y.append(int(label))
        return X, Y

    x_train, y_train = examples2x_y(train_exs)  # training-set features
    x_test, y_test = examples2x_y(test_exs)  # test-set features
    logger.info('training data actrauly get traindata %d, testdata %d' %
                (len(y_train), len(y_test)))
    alldata = [x_train, x_test, y_train, y_test]
    if resname:
        traindatapath = os.path.join(path.path_datahighq5w, resname)
        if not os.path.exists(traindatapath):
            uc.pickle_dump(alldata, traindatapath)
        else:
            # Never overwrite an existing result file; save under a new name.
            newtraindatapath = os.path.join(path.path_datahighq5w,
                                            'newres_%s' % resname)
            logger.info(
                'triandatapath %s allready exists,save this batch traindata to %s'
                % (traindatapath, newtraindatapath))
            uc.pickle_dump(alldata, newtraindatapath)
    return x_train, x_test, y_train, y_test
Exemplo n.º 11
0
def exam2features(examples,
                  vecdic_users,
                  vecdic_fns,
                  fn_features,
                  respath=None,
                  process_per=10000,
                  earlystop=100):
    """Build an InputFeature2 object for each "uid+fn+label" example line.

    Combines, per example: the user vector, the fn vector, one-hot encoded
    numeric fn features, a multi-hot fundcode encoding and a one-hot
    productcode encoding.

    :param examples: path of the example file, one "uid+fn+label" per line
    :param vecdic_users: {uid: vector} dict, or path of its pickle
    :param vecdic_fns: {fn: vector} dict, or path of its pickle
    :param fn_features: fn feature table — csv path or DataFrame
    :param respath: if given, pickle the feature list to this path
    :param process_per: progress-log interval
    :param earlystop: stop after this many examples when truthy
    :return: list of InputFeature2
    """
    examps = uc.load2list(examples)
    # Inputs may be passed either as already-loaded objects or as paths.
    vecdicu = uc.pickle_load(vecdic_users) if isinstance(vecdic_users,
                                                         str) else vecdic_users
    vecdicf = uc.pickle_load(vecdic_fns) if isinstance(vecdic_fns,
                                                       str) else vecdic_fns
    logger.info('loadding fn_features...')
    fnfeats = pd.read_csv(fn_features) if isinstance(fn_features,
                                                     str) else fn_features
    # Column groups to encode; each encoder also returns its fitted model.
    fe_nums4onehot = ['cites', 'cites_w', 'citeds', 'downs', 'pages', 'ifs']
    fe_strs4onehot = ['productcodes']
    fe_strs4mulhot = ['fundcodes']
    logger.info('encoding onehot for number features')
    onehots_num, onehots_num_model = col2onehot_numbers(
        fnfeats[fe_nums4onehot])
    logger.info('encoding onehot for str features')
    onehots_str, onehots_str_model = col2onehot_str(fnfeats[fe_strs4onehot[0]])
    logger.info('encoding multihot for str features')
    mulhots_str, mulhots_model = col2multibinar(fnfeats[fe_strs4mulhot[0]])
    # Map fn id -> row index into the encoded feature arrays.
    fn_indexdic = {}
    for index, fn in enumerate(list(fnfeats.fns)):
        fn_indexdic[fn] = index
    features = []
    logger.info('generating features for model...')
    for index, ex in enumerate(examps):
        if index % process_per == 0:
            logger.info('process %d' % index)
        (uid, fn, label) = ex.strip().split('+')
        # NOTE(review): unlike exam2traindata_, an fn missing from the
        # feature table raises KeyError here rather than being skipped.
        fnindex = fn_indexdic[fn]
        uidvec = np.array(vecdicu[uid])
        fnvec = np.array(vecdicf[fn])
        features.append(
            InputFeature2(uniqid=index,
                          uid=uid,
                          fn=fn,
                          uidvec=uidvec,
                          fnvec=fnvec,
                          fnnumsf=onehots_num[fnindex],
                          fnfuncode=mulhots_str[fnindex],
                          fnprodcode=onehots_str[fnindex],
                          label=label))
        if index < 10:  # log the first 10 examples for inspection
            logger.info("\n*** Example ***")
            logger.info('uniqid=%d' % index)
            logger.info('uid=%s' % uid)
            logger.info('fn=%s' % fn)
            logger.info('uidvec=%s' % (' '.join([str(i) for i in uidvec])))
            logger.info('fnvec=%s' % (' '.join([str(i) for i in fnvec])))
            logger.info('fnnumsf=%s' %
                        (' '.join([str(i) for i in onehots_num[fnindex]])))
            logger.info('fnfuncode=%s' %
                        (' '.join([str(i) for i in mulhots_str[fnindex]])))
            logger.info('fnprodcode=%s' %
                        (' '.join([str(i) for i in onehots_str[fnindex]])))
            logger.info('label=%s' % str(label))
        if earlystop and index == earlystop - 1:
            break
    if respath:
        uc.pickle_dump(features, respath=respath)
    return features