def recommendByCF(userId):
    '''Recommend news to a user from the trained collaborative-filtering result.

    Reads the module-level ``ratings_df`` (what the user already rated) and
    ``predicted_user_df`` (model predictions).

    input:
        @userId: id of the user to recommend to
    output:
        a list of at most 1000 newsId ordered by descending prediction,
        or False when the user has no ratings or when anything goes wrong.
    '''
    ranked_df = None
    try:
        user_rated_newsId = ratings_df.filter(
            col('userId') == userId).select(
                col('newsId')).rdd.flatMap(lambda x: x).collect()
        print(user_rated_newsId)
        # Nothing rated yet: there is no basis for a CF recommendation.
        if not user_rated_newsId:
            return False

        # NOTE(review): predicted_user_df is not filtered by userId here, so
        # predictions made for other users can be returned as well -- confirm
        # that this is intended.
        predicted_not_rated_df = predicted_user_df.filter(
            ~predicted_user_df.newsId.isin(user_rated_newsId))
        ranked_df = predicted_not_rated_df.sort(
            predicted_not_rated_df.prediction, ascending=False)
        # Only the sorted frame is read twice (show + take), so it is the
        # only one worth caching; it is released again in ``finally``.
        ranked_df.cache()
        ranked_df.show(20, truncate=False)
        recommendNewsList = ranked_df.select(
            col('newsId')).rdd.flatMap(lambda x: x).take(1000)
        # %s instead of %d for the id: a non-integer userId must not turn a
        # successful recommendation into a formatting error.
        Log().write("Recommend to user %s by CF success.Recommend %d news" %
                    (userId, len(recommendNewsList)))
        return recommendNewsList
    except Exception as e:
        # %s keeps the handler itself from raising on non-integer ids, which
        # would hide the original error.
        Log().write(
            "Something wrong when recommend to user(%s) by CF.Err info : %s" %
            (userId, e), "ERROR")
        return False
    finally:
        if ranked_df is not None:
            try:
                ranked_df.unpersist()
            except Exception as e:
                # Releasing the cache is best effort; never mask the result.
                Log().write(
                    "Could not unpersist CF result of user(%s).Err info : %s"
                    % (userId, e), "ERROR")
def sendByPost(self, dataList, url='127.0.0.1', port=8887):
    '''POST every element of dataList as JSON to the /sendactions endpoint.

    input:
        @dataList: iterable of JSON-serialisable items; each item is sent in
                   its own request
        @url: host name or address of the server (no scheme, no port)
        @port: port of the server
    Progress is logged every 10 requests and once more at the end.
    Errors raised by urllib2 are not caught here and stop the loop.
    '''
    # Local import keeps this method self-contained; urllib2 responses are
    # not context managers in Python 2, so closing() is needed for ``with``.
    from contextlib import closing

    # When urllib2 tries to proxy "127.0.0.1/", the proxy gives up and
    # returns a 504 error, so install an opener with an empty proxy map.
    # install_opener() is process-wide.
    proxy_build = urllib2.ProxyHandler({})
    opener = urllib2.build_opener(proxy_build)
    urllib2.install_opener(opener)

    requrl = 'http://' + url + ':' + str(port) + '/sendactions'
    Log().write("Begin send news to server")
    sent = 0
    for sent, d in enumerate(dataList, 1):
        req = urllib2.Request(requrl)
        req.add_header('Content-Type', 'application/json')
        # Close each response explicitly so that sockets are not left open
        # until garbage collection when many items are sent.
        with closing(urllib2.urlopen(req, json.dumps(d))):
            pass
        if sent % 10 == 0:
            Log().write('Already send %d actions to server' % sent)
    Log().write('send %d actions to server' % sent)
def recommendByTag(userId, userTagList=[u'娱乐', u'新闻']):
    '''Recommend unread news for each tag the user is interested in.

    A news item counts as read when ``ratings_df`` holds a rating > 0 for it
    (a pushed but unread item has rating 0).  Candidates come from the
    module-level ``sorted_news_df``.

    input:
        @userId: id of the user to recommend to
        @userTagList: tags to recommend for.  The default list is only read,
                      never mutated.
    output:
        a dict whose keys are tags and whose values are lists of at most 500
        news ids, ex: {'fun': [1, 2, 3], 'phone': [5, 6, 7]};
        False when userTagList is empty or when anything goes wrong.
    '''
    # Only a frame created by this call may be unpersisted afterwards; the
    # shared sorted_news_df must stay cached for other callers.
    own_df = None
    try:
        # Check the cheap condition first so no Spark job runs for nothing.
        if not userTagList:
            return False

        user_readed_newsId = ratings_df.filter(
            col('userId') == userId).filter(col('rating') > 0).select(
                col('newsId')).rdd.flatMap(lambda x: x).collect()
        if user_readed_newsId:
            own_df = sorted_news_df.filter(
                ~sorted_news_df.id.isin(user_readed_newsId))
            user_not_readed_df = own_df
        else:
            user_not_readed_df = sorted_news_df
        # Queried once per tag below, so caching pays off.
        user_not_readed_df.cache()

        returnDict = {}
        total = 0
        for tag in userTagList:
            recommendListByTag = user_not_readed_df.filter(
                col('tag').like(u'%%%s%%' % tag)).select(
                    'id').rdd.flatMap(lambda x: x).take(500)
            # setdefault: for a repeated tag the first result is kept.
            returnDict.setdefault(tag, recommendListByTag)
            total += len(recommendListByTag)
        # %s instead of %d for the id so a non-integer userId cannot turn a
        # successful recommendation into a formatting error.
        Log().write(
            "Recommend to user %s by tag %s success. Recommend %d news" %
            (userId, userTagList, total))
        return returnDict
    except Exception as e:
        Log().write(
            "Something wrong when recommend to user(%s) by tag(%s).Err info : %s"
            % (userId, str(userTagList), e), "ERROR")
        return False
    finally:
        if own_df is not None:
            try:
                own_df.unpersist()
            except Exception as e:
                # Releasing the cache is best effort; never mask the result.
                Log().write(
                    "Could not unpersist tag candidates of user(%s).Err info : %s"
                    % (userId, e), "ERROR")
def append(self, onelineData, fileName=fileName):
    '''Append one record to the file as a single JSON line.

    input:
        @onelineData: JSON-serialisable data, ex: {"a": 1, "b": 2}.
                      When it is a dict with a 'title' key, the title is
                      written to the log.
        @fileName: path of the file to append to
    '''
    # Serialise before touching the file: a value that cannot be dumped
    # then leaves neither an open handle nor a partial line behind.
    data = json.dumps(onelineData)
    if isinstance(onelineData, dict) and 'title' in onelineData:
        Log().write(u"Server get and save new : %s" % onelineData['title'])
    # ``with`` closes the file even when the write fails.
    with open(fileName, 'a') as f:
        f.write(data + '\n')
def getNewFromFile(self, folder, source=u'凤凰新闻'):
    '''Read news from a folder; each file is one news item.

    The part of the file name before the first '.' is the title and the file
    content (lines stripped and concatenated) is the news text.  Files whose
    encoding chardet cannot detect are skipped.

    input:
        @folder: the folder that contains the news files
        @source: value stored under 'from' for every news item
    output:
        @result: a list of dicts, ex:
                 [{title:***, tag:****, publishTime:***, from:***,
                   content:****, pv:***, id:*****}, *****]
                 tag is two random entries of tagList joined by ',',
                 ex 'joy,military'; pv and publishTime are random too;
                 id counts the successfully read files from 0.
    '''
    debug = False
    result = []
    i = 0
    for new_file in os.listdir(folder):
        full_file = os.path.join(folder, new_file)
        # ``with`` closes the handle on every path, including the
        # ``continue`` below for undetectable encodings.
        with open(full_file, 'r') as f:
            content = ''.join(line.strip() for line in f)

        codetype = chardet.detect(content)['encoding']
        if codetype is None:
            continue

        one_new = {}
        # NOTE(review): a title that itself contains '.' is cut at the
        # first dot -- kept as is for backward compatibility.
        one_new['title'] = new_file.split('.')[0]
        one_new['content'] = content.decode(codetype, 'ignore')
        one_new['tag'] = ','.join(random.sample(tagList, 2))
        one_new['from'] = source
        one_new['pv'] = random.randint(10, 10000)
        one_new['publishTime'] = random.sample(timeList, 1)[0]
        one_new['id'] = i
        result.append(one_new)

        Debug().debug(one_new['content'], isdebug=debug)
        Debug().debug(one_new['title'], isdebug=debug)
        i += 1
        if i % 10 == 0:
            Log().write('Finish read %d news' % i)
    return result
def append(self, onelineData, fileName=fileName, structureList=structureList):
    '''Append one user action to the file as a single JSON line.

    input:
        @onelineData: JSON-serialisable data,
                      ex: {"userId": 1, "newsId": 2, "rating": 10}.
                      When it is a dict with a 'userId' key, the user and
                      news ids are written to the log.
        @fileName: path of the file to append to
        @structureList: not used by this method; kept for callers that
                        pass it
    '''
    # Serialise before touching the file: a value that cannot be dumped
    # then leaves neither an open handle nor a partial line behind.
    data = json.dumps(onelineData)
    if isinstance(onelineData, dict) and 'userId' in onelineData:
        # Format the values directly into the unicode template: wrapping
        # them in str() fails on non-ASCII unicode ids under Python 2.
        Log().write(u'user %s read new %s' %
                    (onelineData['userId'], onelineData['newsId']))
    # ``with`` closes the file even when the write fails.
    with open(fileName, 'a') as f:
        f.write(data + '\n')
labelCol='rating', metricName='rmse')
# NOTE(review): the line above is the tail of a call that starts before this
# excerpt; presumably it builds ``reg_eval`` used below -- confirm.

# Candidate ALS ranks; each one is fitted on training_df and scored on
# validation_df, and the rank with the smallest RMSE is kept.
ranks = [4, 8, 12]
errors = []
models = []
# Despite its name, ``err`` is the index of the current rank inside ``ranks``.
err = 0
min_err = float('inf')
# Stays -1 when no error compares below min_err; ranks[-1] is then used
# below.
best_rank = -1
for rank in ranks:
    als.setRank(rank)
    model = als.fit(training_df)
    predict_df = model.transform(validation_df)
    # Presumably meant to drop rows without a usable prediction before
    # evaluating; relies on how Spark compares NaN -- TODO confirm.
    predict_ratings_df = predict_df.filter(
        predict_df.prediction != float('nan'))
    error = reg_eval.evaluate(predict_ratings_df)
    Log().write("When training als model , for %d rank the rmse is %f" %
                (rank, error))
    if error < min_err:
        min_err = error
        best_rank = err
    err += 1
    errors.append(error)
    models.append(model)

# Refit with the selected rank on all ratings.
als.setRank(ranks[best_rank])
Log().write("The best model was traind with rank %d" % ranks[best_rank])
all_rating_model = als.fit(ratings_df)
# NOTE(review): transform() is applied to ratings_df itself, so predictions
# presumably exist only for (user, news) pairs that are already rated --
# confirm this is what the recommendation code expects.
predict_df = all_rating_model.transform(ratings_df)
predicted_user_df = predict_df.filter(col('prediction') != float('nan'))
predicted_user_df.cache()
# NOTE(review): the two statements below are the tail of a definition that
# starts before this excerpt; they are reproduced unchanged.
f.write('\n')
f.close()


class SendNewsHandler(tornado.web.RequestHandler):
    '''Tornado handler that stores every posted JSON document via SaveData2File.'''

    def post(self):
        '''Decode the request body as JSON and append it with SaveData2File.'''
        data = tornado.escape.json_decode(self.request.body)
        SaveData2File().append(data)

    def get(self):
        '''Set a 'username' cookie and print the 'name' query argument.

        The cookie expires 900 seconds from now.  'name' is read without a
        default value.
        ex: http://127.0.0.1:8888/sendnews?name=hp
        '''
        self.set_cookie('username', 'peng', expires=time.time() + 900)
        nowamagic = self.get_argument('name')
        print nowamagic


def make_app():
    '''Build the Tornado application that routes /sendnews to SendNewsHandler.'''
    return tornado.web.Application([
        (r"/sendnews", SendNewsHandler),
    ])


if __name__ == "__main__":
    # Listen on port 8888 and run the IO loop until the process is stopped.
    app = make_app()
    port = 8888
    app.listen(port)
    Log().write('Begin to listen port %d' % port)
    tornado.ioloop.IOLoop.current().start()
# NOTE(review): the three statements below are the tail of a definition that
# starts before this excerpt; they are reproduced unchanged.
f.write(data)
f.write('\n')
f.close()


class SendActionsHandler(tornado.web.RequestHandler):
    '''Tornado handler that stores every posted JSON document via SaveData2File.'''

    def post(self):
        '''Decode the request body as JSON and append it with SaveData2File.'''
        data = tornado.escape.json_decode(self.request.body)
        SaveData2File().append(data)

    def get(self):
        '''Set a 'username' cookie and print the 'name' query argument.

        The cookie expires 900 seconds from now.  'name' is read without a
        default value.
        ex: http://127.0.0.1:8887/sendactions?name=hp
        '''
        self.set_cookie('username', 'peng', expires=time.time() + 900)
        nowamagic = self.get_argument('name')
        print nowamagic


def make_app():
    '''Build the Tornado application that routes /sendactions to SendActionsHandler.'''
    return tornado.web.Application([
        (r"/sendactions", SendActionsHandler),
    ])


if __name__ == "__main__":
    # Listen on port 8887 and run the IO loop until the process is stopped.
    app = make_app()
    port = 8887
    app.listen(port)
    Log().write('Save action server Begin to listen port %d' % port)
    tornado.ioloop.IOLoop.current().start()