def cacualte(date, nowDate):
    # Evaluate the recommendations stored in recommendNews<date>.txt against the
    # clicks that actually happened on the day starting at nowDate.
    dataRcm = pd.read_table('recommendNews' + date + '.txt', encoding='utf-8')
    datatest = pd.read_table('newsTest.txt', encoding='utf-8')
    time = handleTime.strTimetoMkTime(nowDate)
    timeafter = handleTime.dateAfterdays(time, 1)
    datatest['time'] = datatest['time'].astype(int)
    datatest = datatest[datatest['time'] < timeafter]
    datatest = datatest[datatest['time'] >= time]
    testCount = pd.Series.count(datatest['newsId'])
    rcmCount = pd.Series.count(dataRcm['newsId'])
    intersection = 0
    # Group the recommended news ids by user.
    usersNews = dict()
    for index, row in dataRcm.iterrows():
        if row['userid'] not in usersNews:
            usersNews[row['userid']] = set()
        usersNews[row['userid']].add(row['newsId'])
    # Count how many recommended items were actually clicked by the same user.
    for u, v in usersNews.items():
        datatesttemp = datatest[datatest['userid'] == u]
        for t in v:
            for index, row in datatesttemp.iterrows():
                if row['newsId'] == t:
                    intersection += 1
    print('recall', float(intersection) / testCount * 100)
    print('precision', float(intersection) / rcmCount * 100)
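
# Usage sketch (assumption, not called anywhere in the original code): evaluate
# the recommendations previously written to recommendNews<date>.txt against the
# clicks of that same day. The date string format follows the rest of this file.
def exampleEvaluateRecommendations(date='2014-3-22'):
    cacualte(date, date)  # prints recall and precision in percent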
def getHotNews(date, recommendK):
    # Return the recommendK most clicked news ids from the day before `date`.
    if handleTime.getDayByStr(date) > 21:
        # The date falls in the test period (after 2014-03-21): count clicks
        # from the previous day of the test set.
        data = pd.read_table('newsTest.txt', encoding='utf-8')
        data['time'] = data['time'].astype(int)
        nowtime = handleTime.strTimetoMkTime(date)
        befortime = handleTime.dateBeforDays(nowtime, 1)
        data = data[data['time'] < nowtime]
        data = data[data['time'] >= befortime]
    else:
        # Otherwise fall back to the tail of the training set.
        data = pd.read_table('newsTrain.txt', encoding='utf-8')
        data['time'] = data['time'].astype(int)
        data = data[data['time'] > handleTime.strTimetoMkTime('2014-3-20')]
    data = data['newsId'].value_counts()
    data = pd.Series(data.index)
    data = data[:recommendK]
    return data
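
# Usage sketch (assumption): fetch the ten most clicked news ids from the day
# before 2014-3-22; getHotNews returns them as a pandas Series of news ids.
def exampleGetHotNews():
    return getHotNews('2014-3-22', recommendK=10)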
def findDays():
    # Compare the news clicked on the current day against the news seen in the
    # preceding T days, to measure how much the two sets overlap.
    datatest = pd.read_table('newsTest.txt', encoding='utf-8')
    time = handleTime.strTimetoMkTime('2014-3-22')
    datatest['time'] = datatest['time'].astype(int)
    datatest = datatest[datatest['time'] < time]
    datatest = datatest['newsId'].value_counts()
    print(datatest, 'test')
    datatrain = pd.read_table('newsTrain.txt', encoding='utf-8')
    time = handleTime.strTimetoMkTime('2014-3-16')
    datatrain['time'] = datatrain['time'].astype(int)
    datatrain = datatrain[datatrain['time'] >= time]
    datatrain = datatrain['newsId'].value_counts()
    # Count how many of the training-period news ids also appear in the test period.
    result = datatrain.index.isin(datatest.index)
    print(pd.Series(result).value_counts())
def recommendOneDay(nowDate, kk):
    # Generate recommendations for every user who clicked anything before the
    # end of nowDate.
    users = pd.read_table('newsTest.txt', encoding='utf-8')
    users['time'] = users['time'].astype(int)
    divsionTime = handleTime.strTimetoMkTime(nowDate)
    divsionTime = handleTime.dateAfterdays(divsionTime, 1)  # midnight of the following day
    users = users[users['time'] < divsionTime]
    users = users['userid'].value_counts()
    interstNews.recommendNewsForUsers(users.index, recommendK=kk, Date=nowDate, Kvalue=260)
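
# Usage sketch (assumption): generate recommendations for all users seen up to
# the end of 2014-3-22. interstNews.recommendNewsForUsers lives in the separate
# interstNews module and is assumed to persist its own output.
def exampleRecommendOneDay():
    recommendOneDay('2014-3-22', kk=10)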
def filterlooked(similarNews, date):
    # Remove from each user's candidate set the news items the user already read
    # in the previous day, and drop users whose candidate set becomes empty.
    if handleTime.getDayByStr(date) > 21:
        data = pd.read_table('newsTest.txt', encoding='utf-8')
        data['time'] = data['time'].astype(int)
        nowtime = handleTime.strTimetoMkTime(date)
        befortime = handleTime.dateBeforDays(nowtime, 1)
        data = data[data['time'] < nowtime]
        data = data[data['time'] >= befortime]
    else:
        data = pd.read_table('newsTrain.txt', encoding='utf-8')
        data['time'] = data['time'].astype(int)
        data = data[data['time'] > handleTime.strTimetoMkTime('2014-3-20')]
    # Iterate over copies so the dict and the candidate sets can be modified
    # safely while looping.
    for u, v in list(similarNews.items()):
        temp = data[data['userid'] == u]
        for t in list(v):
            for index, row in temp.iterrows():
                if row['newsId'] == t:
                    similarNews[u].remove(t)
                    break  # the item is already removed; avoid removing it twice
        if len(similarNews[u]) == 0:
            del similarNews[u]
    return similarNews
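
# Usage sketch (assumption): filter already-read items from a small candidate
# dict. Keys are user ids and values are sets of candidate news ids, matching
# the structure filterlooked expects; the ids below are hypothetical.
def exampleFilterLooked():
    candidates = {5218791: {100648598, 100648599}}  # hypothetical user/news ids
    return filterlooked(candidates, '2014-3-22')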
def caculateHotNews(date, hotNews):
    # Evaluate the hot-news list against the clicks that happened on `date`.
    data = pd.read_table('newsTest.txt', encoding='utf-8')
    time = handleTime.strTimetoMkTime(date)
    afterTime = handleTime.dateAfterdays(time, 1)
    data['time'] = data['time'].astype(int)
    data = data[data['time'] < afterTime]
    data = data[data['time'] >= time]
    userlist = data['userid'].value_counts()
    # Every user receives the same hot-news list, so the number of
    # recommendations made is |hotNews| * |users|.
    hotnewsCount = pd.Series.count(hotNews) * pd.Series.count(userlist.index)
    userNewsCount = pd.Series.count(data['newsId'])
    # Number of clicked items that also appear in the hot-news list.
    intersection = data['newsId'].isin(hotNews).sum()
    print('recall', float(intersection) / userNewsCount * 100)
    print('precision', float(intersection) / hotnewsCount * 100)
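
# Usage sketch (assumption): evaluate the hot-news baseline for one test day by
# combining getHotNews (defined above) with caculateHotNews.
def exampleEvaluateHotNewsBaseline(date='2014-3-22', topK=10):
    hotNews = getHotNews(date, topK)
    caculateHotNews(date, hotNews)  # prints recall and precision in percent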
def getSimilarityToNews(similarityUsers, nowDate='2014-3-21', K=5, days=1):
    # Given a Series of user-similarity scores indexed by user id, take the
    # top-K most similar users and accumulate an interest score for every news
    # item they clicked during the past `days` days.
    orderUsers = similarityUsers.sort_values(ascending=False)
    kUser = orderUsers[:K]
    dateDay = handleTime.getDayByStr(nowDate)
    nowDate = handleTime.strTimetoMkTime(nowDate)
    newsTest = pd.read_table('newsTest.txt', encoding='utf-8')
    newsTest['time'] = newsTest['time'].astype(int)
    # Keep only the test-set clicks from the past `days` days.
    newsFromTest = newsTest[newsTest['time'] < nowDate]
    newsFromTest = newsFromTest[newsFromTest['time'] > handleTime.dateBeforDays(nowDate, days)]
    # Match the dtype of the similarity index before checking membership.
    testUserIds = newsFromTest['userid'].astype(kUser.index.dtype)
    newsSimFromTest = newsFromTest[testUserIds.isin(kUser.index)]
    newsInterst = dict()
    for index, row in newsSimFromTest.iterrows():
        if row['newsId'] not in newsInterst:
            newsInterst[row['newsId']] = 0
        newsInterst[row['newsId']] += kUser[row['userid']]
    # If the window reaches back into the training period, also count clicks
    # from the training set.
    if dateDay <= (20 + days):
        newsTrain = pd.read_table('newsTrain.txt', encoding='utf-8')
        newsTrain['time'] = newsTrain['time'].astype(int)
        divisionTime = handleTime.dateBeforDays(nowDate, days)
        newsFromTrain = newsTrain[newsTrain['time'] >= divisionTime]
        newsSimFromTrain = newsFromTrain[newsFromTrain['userid'].isin(kUser.index)]
        for index, row in newsSimFromTrain.iterrows():
            if row['newsId'] not in newsInterst:
                newsInterst[row['newsId']] = 0
            newsInterst[row['newsId']] += kUser[row['userid']]
    return newsInterst
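
# Usage sketch (assumption): build interest scores for candidate news from a
# toy user-similarity Series. The user ids and scores below are hypothetical;
# in the real pipeline the Series would come from a user-similarity computation.
def exampleGetSimilarityToNews():
    similarityUsers = pd.Series({5218791: 0.9, 5312428: 0.4})  # hypothetical ids/scores
    return getSimilarityToNews(similarityUsers, nowDate='2014-3-22', K=2, days=1)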
import handleTime
import interstNews  # provides recommendNewsForUsers, used by recommendOneDay above
import pandas as pd

# Load the raw click log and split it on 2014-03-21 00:00:00 into the first
# 20 days (training set) and the last 10 days (test set).
data = pd.read_table(
    'user_click_data.txt',
    names=['userid', 'newsId', 'time', 'newsTitle', 'newsContent', 'newsTime'])
divisionTimeStr = '2014-03-21 00:00:00'
divisionTime = handleTime.strTimetoMkTime(divisionTimeStr)
print(divisionTime)
data['time'] = data['time'].astype('int')
frist20dayData = data[data['time'] < divisionTime]
last10dayData = data[data['time'] >= divisionTime]
frist20dayData.to_csv('newsTrain.txt', sep='\t', encoding='utf-8',
                      index=False, columns=['userid', 'newsId', 'time'])
last10dayData.to_csv('newsTest.txt', sep='\t', encoding='utf-8',
                     index=False, columns=['userid', 'newsId', 'time'])