Пример #1
0
def cacualte(date,nowDate):
    """Print recall and precision (in percent) of one day's recommendations.

    The recommendations are read from ``'recommendNews' + date + '.txt'`` and
    the reference rows from ``newsTest.txt``. Both files must provide the
    ``userid`` and ``newsId`` columns; ``newsTest.txt`` must also provide
    ``time``.

    A "hit" is a row of the selected day in ``newsTest.txt`` whose
    ``(userid, newsId)`` pair also occurs in the recommendation file.
    Recall is hits / number of test rows, precision is hits / number of
    recommendation rows. Nothing is returned; both values are printed.

    Args:
        date: Suffix used to build the recommendation file name.
        nowDate: Day to evaluate, in whatever string format
            ``handleTime.strTimetoMkTime`` accepts.
    """
    recommended = pd.read_table('recommendNews' + date + '.txt',
                                encoding='utf-8')
    clicks = pd.read_table('newsTest.txt', encoding='utf-8')

    day_start = handleTime.strTimetoMkTime(nowDate)
    day_end = handleTime.dateAfterdays(day_start, 1)

    # astype() returns a new Series, so the result has to be stored back;
    # the original call discarded it and therefore had no effect.
    clicks['time'] = clicks['time'].astype('int64')
    in_day = (clicks['time'] >= day_start) & (clicks['time'] < day_end)
    clicks = clicks[in_day]

    test_count = clicks['newsId'].count()
    rcm_count = recommended['newsId'].count()

    # One set lookup per test row replaces the nested iterrows() loops.
    recommended_pairs = set(zip(recommended['userid'], recommended['newsId']))
    intersection = sum(
        1 for pair in zip(clicks['userid'], clicks['newsId'])
        if pair in recommended_pairs)

    def percent(hits, total):
        # An empty denominator means there is nothing to measure: report 0
        # instead of dividing by zero.
        if not total:
            return 0.0
        return float(hits) / total * 100

    print('recall', percent(intersection, test_count))
    print('precison', percent(intersection, rcm_count))
Пример #2
0
def getHotNews(date,recommendK):
    """Return the ``recommendK`` most frequent news ids for ``date``.

    When ``handleTime.getDayByStr(date)`` is greater than 21 the rows are
    taken from ``newsTest.txt`` and limited to the one-day window that ends
    at ``date``. Otherwise the rows are taken from ``newsTrain.txt`` and
    limited to those later than ``'2014-3-20'``.

    Args:
        date: Date string understood by ``handleTime``.
        recommendK: Maximum number of news ids to return.

    Returns:
        A ``pandas.Series`` of news ids ordered from most to least frequent,
        truncated to ``recommendK`` entries.
    """
    if handleTime.getDayByStr(date) > 21:
        data = pd.read_table('newsTest.txt', encoding='utf-8')
        # astype() is not in-place; store the converted column back.
        data['time'] = data['time'].astype('int64')
        nowtime = handleTime.strTimetoMkTime(date)
        befortime = handleTime.dateBeforDays(nowtime, 1)
        data = data[(data['time'] >= befortime) & (data['time'] < nowtime)]
    else:
        data = pd.read_table('newsTrain.txt', encoding='utf-8')
        data['time'] = data['time'].astype('int64')
        data = data[data['time'] > handleTime.strTimetoMkTime('2014-3-20')]

    # value_counts() sorts by frequency (descending), so its index is the
    # popularity ranking.
    ranked = data['newsId'].value_counts()
    return pd.Series(ranked.index)[:recommendK]
Пример #3
0
def findDays():  # Compare the news of the past T days with the current day's news
    """Print how many recent training news ids also occur in the test data.

    Test side: rows of ``newsTest.txt`` with ``time`` before ``'2014-3-22'``.
    Train side: rows of ``newsTrain.txt`` with ``time`` at or after
    ``'2014-3-16'``.

    The function prints the per-news counts of the test side, then a
    True/False tally telling how many distinct training news ids are
    (True) or are not (False) present among the test news ids.
    """
    test_clicks = pd.read_table('newsTest.txt', encoding='utf-8')
    # astype() returns a copy; assign it so the conversion takes effect.
    test_clicks['time'] = test_clicks['time'].astype('int64')
    test_end = handleTime.strTimetoMkTime('2014-3-22')
    test_counts = test_clicks[test_clicks['time'] < test_end]
    test_counts = test_counts['newsId'].value_counts()
    print(test_counts, 'test')

    train_clicks = pd.read_table('newsTrain.txt', encoding='utf-8')
    train_clicks['time'] = train_clicks['time'].astype('int64')
    train_start = handleTime.strTimetoMkTime('2014-3-16')
    train_counts = train_clicks[train_clicks['time'] >= train_start]
    train_counts = train_counts['newsId'].value_counts()

    overlap = train_counts.index.isin(test_counts.index)
    print(pd.Series(overlap).value_counts())





#cacualte()
Пример #4
0
def recommendOneDay(nowDate, kk):
    """Run the recommender for every user seen in the test data up to a day.

    Collects the distinct ``userid`` values of all ``newsTest.txt`` rows
    whose ``time`` lies before the start of the day after ``nowDate`` and
    passes them to ``interstNews.recommendNewsForUsers``.

    Args:
        nowDate: Day to recommend for, in a format accepted by
            ``handleTime.strTimetoMkTime``.
        kk: Forwarded as ``recommendK`` to the recommender.
    """
    clicks = pd.read_table('newsTest.txt', encoding='utf-8')
    # astype() is not in-place; keep the converted column.
    clicks['time'] = clicks['time'].astype('int64')

    # Start of the following day (exclusive upper bound for nowDate).
    next_day = handleTime.dateAfterdays(
        handleTime.strTimetoMkTime(nowDate), 1)

    active_users = clicks[clicks['time'] < next_day]['userid'].value_counts()
    interstNews.recommendNewsForUsers(active_users.index,
                                      recommendK=kk,
                                      Date=nowDate,
                                      Kvalue=260)
Пример #5
0
def filterlooked(similarNews, date):
    """Drop news a user has already viewed from that user's candidates.

    The viewing history is chosen the same way as the source data for the
    popularity ranking: when ``handleTime.getDayByStr(date)`` is greater
    than 21, the rows of ``newsTest.txt`` from the one-day window ending at
    ``date``; otherwise the rows of ``newsTrain.txt`` later than
    ``'2014-3-20'``.

    ``similarNews`` is modified in place: viewed news ids are removed from
    each user's collection, and a user whose collection becomes empty
    through this filtering is removed from the mapping.

    Args:
        similarNews: Mapping ``userid -> collection of news ids``. The
            collection must support iteration and ``remove`` (for example
            ``set`` or ``list``).
        date: Date string understood by ``handleTime``.

    Returns:
        The same ``similarNews`` object after filtering.
    """
    if handleTime.getDayByStr(date) > 21:
        data = pd.read_table('newsTest.txt', encoding='utf-8')
        # astype() is not in-place; store the converted column back.
        data['time'] = data['time'].astype('int64')
        nowtime = handleTime.strTimetoMkTime(date)
        befortime = handleTime.dateBeforDays(nowtime, 1)
        data = data[(data['time'] >= befortime) & (data['time'] < nowtime)]
    else:
        data = pd.read_table('newsTrain.txt', encoding='utf-8')
        data['time'] = data['time'].astype('int64')
        data = data[data['time'] > handleTime.strTimetoMkTime('2014-3-20')]

    # Index the history once: userid -> set of news ids already viewed.
    seen_by_user = {}
    for user, news in zip(data['userid'], data['newsId']):
        seen_by_user.setdefault(user, set()).add(news)

    emptied_users = []
    for user, candidates in similarNews.items():
        seen = seen_by_user.get(user)
        if not seen:
            continue
        # Snapshot the matches first: removing from a collection while
        # iterating over it raises (set) or skips elements (list).
        already_seen = [news for news in candidates if news in seen]
        if not already_seen:
            continue
        for news in already_seen:
            candidates.remove(news)
        if len(candidates) == 0:
            emptied_users.append(user)

    # Keys are deleted only after the loop; deleting during .items()
    # iteration raises RuntimeError.
    for user in emptied_users:
        del similarNews[user]
    return similarNews
Пример #6
0
def caculateHotNews(date,hotNews):
    """Print recall and precision (in percent) of a hot-news recommendation.

    Every user with at least one ``newsTest.txt`` row on ``date`` is assumed
    to receive the full ``hotNews`` list, so the number of recommendations
    is ``len(hotNews) * number_of_users``. A hit is a row of that day whose
    ``newsId`` is contained in ``hotNews``. Nothing is returned; both values
    are printed.

    Args:
        date: Day to evaluate, in a format accepted by
            ``handleTime.strTimetoMkTime``.
        hotNews: The recommended news ids (a ``pandas.Series``; other
            list-like inputs are accepted as well).
    """
    data = pd.read_table('newsTest.txt', encoding='utf-8')
    day_start = handleTime.strTimetoMkTime(date)
    day_end = handleTime.dateAfterdays(day_start, 1)

    # astype() returns a new Series; assign it so the cast is applied.
    data['time'] = data['time'].astype('int64')
    data = data[(data['time'] >= day_start) & (data['time'] < day_end)]

    user_count = data['userid'].nunique()
    hotnewsCount = pd.Series(hotNews).count() * user_count
    userNewsCount = data['newsId'].count()

    # Summing the boolean mask yields 0 when nothing matches, whereas
    # value_counts()[True] raised KeyError in that case.
    intersection = int(data['newsId'].isin(hotNews).sum())

    def percent(hits, total):
        # Report 0 rather than dividing by zero on an empty day or list.
        if not total:
            return 0.0
        return float(hits) / total * 100

    print('recall', percent(intersection, userNewsCount))
    print('precison', percent(intersection, hotnewsCount))
Пример #7
0
def getSimilarityToNews(similarityUsers, nowDate='2014-3-21', K=5, days=1):
    """Score news by the similarity of the neighbouring users who viewed it.

    The ``K`` entries of ``similarityUsers`` with the highest values are
    taken as neighbours. For every row of the recent history that belongs
    to a neighbour, the neighbour's similarity value is added to the score
    of that row's news id.

    The recent history is the part of ``newsTest.txt`` strictly between
    ``nowDate - days`` and ``nowDate``. When the day number of ``nowDate``
    is at most ``20 + days``, the rows of ``newsTrain.txt`` at or after
    ``nowDate - days`` are included as well.

    Args:
        similarityUsers: ``pandas.Series`` mapping userid (index) to a
            similarity value.
        nowDate: Reference date string understood by ``handleTime``.
        K: Number of most similar users to use.
        days: Length of the look-back window in days.

    Returns:
        dict mapping news id to the accumulated similarity score.
    """
    kUser = similarityUsers.sort_values(ascending=False)[:K]

    dateDay = handleTime.getDayByStr(nowDate)
    nowDate = handleTime.strTimetoMkTime(nowDate)
    windowStart = handleTime.dateBeforDays(nowDate, days)

    newsInterst = dict()

    def addWeights(clicks):
        # Column-wise iteration keeps the original dtypes; iterrows()
        # would upcast each row to one common dtype.
        for newsId, userId in zip(clicks['newsId'], clicks['userid']):
            newsInterst[newsId] = newsInterst.get(newsId, 0) + kUser.loc[userId]

    newsTest = pd.read_table('newsTest.txt', encoding='utf-8')
    # astype() is not in-place; the original discarded its result.
    newsTest['time'] = newsTest['time'].astype('int64')
    # Select the news of the past `days` days.
    newsFromTest = newsTest[(newsTest['time'] < nowDate)
                            & (newsTest['time'] > windowStart)]
    # Align the userid dtype with the neighbour index so that isin() and
    # the label lookup compare like with like. assign() builds a new
    # frame, which avoids writing into a slice of newsTest.
    newsFromTest = newsFromTest.assign(
        userid=newsFromTest['userid'].astype(kUser.index.dtype))
    addWeights(newsFromTest[newsFromTest['userid'].isin(kUser.index)])

    if dateDay <= (20 + days):
        newsTrain = pd.read_table('newsTrain.txt', encoding='utf-8')
        newsTrain['time'] = newsTrain['time'].astype('int64')
        # NOTE(review): there is no upper bound on `time` here and the
        # lower bound is inclusive, unlike the test window above — confirm
        # that both are intended.
        newsFromTrain = newsTrain[newsTrain['time'] >= windowStart]
        addWeights(newsFromTrain[newsFromTrain['userid'].isin(kUser.index)])

    return newsInterst
Пример #8
0
import handleTime
import pandas as pd

#f=os.open('user_click_data.txt',flags=,encoding='utf-8');
#f=BytesIO(open('user_click_data.txt').read().encode('utf-8'));
#np.loadtxt(f,delimiter='\t');,parse_dates=['time'],date_parser=time.localtime(int())
#np.loadtxt(io.StringIO(''.join(line for line in open('user_click_data.txt', encoding='utf-8') if not line.lstrip().startswith('!'))))
#data=pd.read_table('user_click_data.txt',names=['userid','newsId','time','newsTitle','newsContent','newsTime'],parse_dates=['time'],date_parser=gettime,nrows=4);
# Split the raw click log into a training file (rows before the division
# time) and a test file (rows at or after it). Only the userid, newsId and
# time columns are written out.
data = pd.read_table(
    'user_click_data.txt',
    names=['userid', 'newsId', 'time', 'newsTitle', 'newsContent', 'newsTime'])
divisionTimeStr = '2014-03-21 00:00:00'
divisionTime = handleTime.strTimetoMkTime(divisionTimeStr)
print(divisionTime)
# astype() returns a new Series; assign it back so the comparison below
# really runs on integers (the original call discarded the result).
data['time'] = data['time'].astype('int64')
frist20dayData = data[data['time'] < divisionTime]
last10dayData = data[data['time'] >= divisionTime]
frist20dayData.to_csv('newsTrain.txt',
                      sep='\t',
                      encoding='utf-8',
                      index=False,
                      columns=['userid', 'newsId', 'time'])
last10dayData.to_csv('newsTest.txt',
                     sep='\t',
                     encoding='utf-8',
                     index=False,
                     columns=['userid', 'newsId', 'time'])
#print(time.localtime(int(testTime)))

#print(data['time'].dtype);