def backfill(includeClass=True):
    newsCollection = getCollection()
    allNews = newsCollection.find({})
    count = 0
    for news in allNews:
        count += 1
        modified = False
        if count % 50 == 0:
            print 'backfill on %i news' % count
        # fix news documents that have no description
        if 'description' not in news or not news['description']:
            # if the title exists, use it as the description
            if 'title' in news and news['title']:
                news['description'] = news['title']
                # also attach a class to the news
                if includeClass:
                    news['class'] = classify(news['title'])
                modified = True
            # otherwise fall back to the url
            elif 'url' in news and news['url']:
                news['description'] = news['url']
                if includeClass:
                    news['class'] = classify(news['url'])
                modified = True
            # otherwise fall back to the text
            elif 'text' in news and news['text']:
                news['description'] = news['text']
                if includeClass:
                    news['class'] = classify(news['text'])
                modified = True
            # nothing usable: leave the description empty
            else:
                news['description'] = ''
        else:
            # description already present; only the class may need backfilling
            if includeClass:
                if 'title' in news and news['title']:
                    news['class'] = classify(news['title'])
                    modified = True
                elif 'url' in news and news['url']:
                    news['class'] = classify(news['url'])
                    modified = True
                elif 'text' in news and news['text']:
                    news['class'] = classify(news['text'])
                    modified = True
        if modified:
            newsCollection.replace_one({'digest': news['digest']}, news, upsert=True)
    print 'backfill completed. %i news' % count
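# Hedged illustration of the fallback order backfill() applies (title, then url, then
# text). _firstNonEmptyField is not part of the original module; it is just a compact,
# self-contained restatement of that chain.
def _firstNonEmptyField(news, fields=('title', 'url', 'text')):
    for field in fields:
        if news.get(field):
            return news[field]
    return ''

assert _firstNonEmptyField({'title': '', 'url': 'http://x', 'text': 'body'}) == 'http://x'
assert _firstNonEmptyField({}) == ''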
def getNews(userID, pageID):
    pageID = int(pageID)
    pageStartIndex = 0           # inclusive
    pageEndIndex = PAGINATION    # exclusive
    if pageID > 0:
        pageStartIndex = pageID * PAGINATION
        pageEndIndex = (pageID + 1) * PAGINATION

    slicedNewsList = []
    if redisClient.get(userID) is not None:
        # Redis stores values as strings; pickle.loads turns the cached string back
        # into the list of news digests for this user
        cachedNewsDigests = pickle.loads(redisClient.get(userID))
        # slicing is safe even when the indices run past the end of the list: an
        # out-of-range slice simply yields [], and the Mongo query below then
        # returns an empty list as well
        slicedDigests = cachedNewsDigests[pageStartIndex:pageEndIndex]
        slicedNewsList = list(mongoDB.getCollection().find(
            {'digest': {
                '$in': slicedDigests
            }}))
    else:
        pagesOfNews = list(mongoDB.getNews())
        # reorder news based on the user's preference model
        pagesOfNews = reorderPagesOfNews(userID, pagesOfNews)
        cachedNewsDigests = map(lambda x: x['digest'], pagesOfNews)
        # pickle.dumps turns the digest list into a string so it can be cached in Redis
        redisClient.set(userID, pickle.dumps(cachedNewsDigests))
        redisClient.expire(userID, REDIS['DIGEST_EXPIRATION'])
        # as above, an out-of-range slice simply yields []
        slicedNewsList = pagesOfNews[pageStartIndex:pageEndIndex]

    # convert publishedAt from datetime to a display string
    for newsObj in slicedNewsList:
        newsTime = newsObj['publishedAt']
        strTime = datetime.strftime(newsTime, '%Y-%m-%d %H:%M')
        newsObj['publishedAt'] = strTime.decode('utf-8')

    return json.loads(dumps(slicedNewsList))
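# A minimal, self-contained sketch of the pagination math used above. The page size
# of 10 is an assumption for illustration; the real value is the PAGINATION constant.
def _pageSlice(pageID, pageSize=10):
    # page 0 -> [0, 10), page 1 -> [10, 20), ...
    start = pageID * pageSize
    end = (pageID + 1) * pageSize
    return start, end

digests = ['d%d' % i for i in range(25)]
start, end = _pageSlice(2)
assert digests[start:end] == ['d20', 'd21', 'd22', 'd23', 'd24']
# past the end of the cached list: Python slicing returns [] instead of raising,
# which is why getNews() needs no explicit bounds checks
start, end = _pageSlice(5)
assert digests[start:end] == []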
def getContentObj(videoSnippit, name, mediaType):
    collection = getCollection('pre')
    result = collection.find_one({"name": name})
    responseIMDB = None
    if result is None:
        responseIMDB = findMedia(name)
    if responseIMDB is not None:
        return {
            "name": responseIMDB['name'],
            "img": responseIMDB['img'],
            "type": mediaType
        }
    else:
        return {
            "name": name,
            "img": videoSnippit['thumbnails']['high']['url'],
            "type": mediaType
        }
def getNewsDistribution():
    # distribution by news class
    classAmountDict = {}
    for newsClass in NEWSCLASSES:
        amount = mongoDB.getCollection().find({'class': newsClass}).count()
        classAmountDict[newsClass] = amount
    classAmountList = sorted(classAmountDict.items(),
                             key=operator.itemgetter(1),
                             reverse=True)
    classes = [pair[0] for pair in classAmountList]
    classAmounts = [pair[1] for pair in classAmountList]

    # distribution by news source
    sourceAmountDict = {}
    for source in NEWSSOURCES:
        amount = mongoDB.getCollection().find({'source': source}).count()
        sourceAmountDict[source] = amount
    sourceAmountList = sorted(sourceAmountDict.items(),
                              key=operator.itemgetter(1),
                              reverse=True)
    # e.g. sourceAmountList = [('six', 6), ('six2', 6), ('two', 2), ('one', 1)]

    # build the pie chart data: sources below 7.6% are lumped into 'other sources'
    datalist = []
    total = sum([pair[1] for pair in sourceAmountList])
    remin = 100.0
    for source in sourceAmountList:
        percentile = float(source[1]) / total * 100
        percentile = math.floor(percentile * 10) / 10
        if percentile < 7.6:
            remin = math.floor(remin * 10) / 10
            datalist.append({'name': 'other sources', 'y': remin})
            break
        else:
            datalist.append({'name': source[0], 'y': percentile})
            remin = remin - percentile

    newsDistribution = {
        'class': {
            'chart': {
                'type': 'column'
            },
            'title': {
                'text': 'News Distribution By Class'
            },
            'xAxis': {
                'categories': classes,
                'labels': {
                    'style': {
                        'fontFamily': 'Verdana, sans-serif',
                        'fontSize': '13px'
                    }
                }
            },
            'yAxis': {
                'title': {
                    'text': 'News Amount'
                }
            },
            'series': [{
                'data': classAmounts
            }],
            'legend': {
                'enabled': False
            },
            'credits': {
                'enabled': False
            },
        },
        'source': {
            'chart': {
                'plotBackgroundColor': None,
                'plotBorderWidth': None,
                'plotShadow': False,
                'type': 'pie'
            },
            'title': {
                'text': 'News Distribution By Source'
            },
            'tooltip': {
                'pointFormat': '{series.name}: <b>{point.percentage:.1f}%</b>'
            },
            'plotOptions': {
                'pie': {
                    'allowPointSelect': True,
                    'cursor': 'pointer',
                    'dataLabels': {
                        'enabled': False
                    },
                    'showInLegend': True
                }
            },
            'series': [{
                'name': 'source',
                'colorByPoint': True,
                'data': datalist
            }]
        }
    }
    return json.loads(dumps(newsDistribution))
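# Hedged, self-contained sketch of the 'other sources' grouping rule above. The 7.6%
# cutoff and the one-decimal flooring mirror getNewsDistribution(); _groupSmallSources
# and the sample counts are illustrative only and not part of the original module.
import math

def _groupSmallSources(sourceAmountList, cutoff=7.6):
    total = sum(pair[1] for pair in sourceAmountList)
    datalist = []
    remaining = 100.0
    for name, amount in sourceAmountList:
        share = math.floor(float(amount) / total * 100 * 10) / 10
        if share < cutoff:
            # the first source under the cutoff absorbs everything left over
            datalist.append({'name': 'other sources',
                             'y': math.floor(remaining * 10) / 10})
            break
        datalist.append({'name': name, 'y': share})
        remaining -= share
    return datalist

# large sources keep their own slice; the tail below the cutoff collapses into one
print(_groupSmallSources([('six', 6), ('six2', 6), ('two', 2), ('one', 1)]))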
def handler(news):
    if news is None or not isinstance(news, dict):
        warn('news deduper cannot handle a broken news object')
        scraperToDeduperClient.ackMessage()
        return
    if 'text' not in news or not news['text']:
        warn('news %s has no content' % news['digest'])
        scraperToDeduperClient.ackMessage()
        return

    # parse publishedAt into a datetime that Mongo accepts
    news['publishedAt'] = parser.parse(news['publishedAt'])

    # compute the day boundaries of the publish date
    publishedTime = news['publishedAt']
    publishedDayBegin = datetime.datetime(publishedTime.year, publishedTime.month,
                                          publishedTime.day, 0, 0, 0, 0)
    publishedDayEnd = publishedDayBegin + datetime.timedelta(days=1)

    # fetch all news published on the same day from MongoDB
    newsListOnThatDay = list(getCollection().find(
        {'publishedAt': {
            '$gte': publishedDayBegin,
            '$lt': publishedDayEnd
        }}))

    if newsListOnThatDay is not None and len(newsListOnThatDay) > 0:
        newsArray = [
            str(newsOnThatDay['text'].encode('utf-8'))
            for newsOnThatDay in newsListOnThatDay
        ]
        newsArray.insert(0, news['text'])

        # compute TF-IDF over the incoming news plus all same-day news; row 0 times
        # the transpose gives the similarity of the incoming news to every document
        tfidf = TfidfVectorizer().fit_transform(newsArray)
        firstDoc_sim = tfidf[0] * tfidf.T
        _, colSize = firstDoc_sim.shape

        # drop this news if a sufficiently similar one already exists
        for col in range(1, colSize):
            if firstDoc_sim[0, col] > TFIDF['SAME_NEWS_SIMILARITY_THRESHOLD']:
                warn('news deduper found news %s similar to an existing one. Ignore.'
                     % news['digest'])
                scraperToDeduperClient.ackMessage()
                return

    # Data cleaning on the news description.
    # Reason: the news classifier service needs a description for machine learning.
    # Problem: some news have no description, which crashes vocabulary_processor.
    # Solution: fill the description with the title, or with the text if the title is missing.
    if 'description' not in news or not news['description']:
        if 'title' in news and news['title']:
            news['description'] = news['title']
        else:
            news['description'] = news['text']

    # classify this news
    if 'title' in news:
        news['class'] = classify(news['title'])

    # no similar news found, save this news to the database
    getCollection().replace_one({'digest': news['digest']}, news, upsert=True)
    scraperToDeduperClient.ackMessage()
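# Hedged, standalone sketch of the TF-IDF similarity check in handler(), on toy strings.
# The 0.9 threshold stands in for TFIDF['SAME_NEWS_SIMILARITY_THRESHOLD'] and is an
# assumption; _isDuplicate is not part of the original module.
from sklearn.feature_extraction.text import TfidfVectorizer

def _isDuplicate(newText, existingTexts, threshold=0.9):
    if not existingTexts:
        return False
    docs = [newText] + list(existingTexts)
    tfidf = TfidfVectorizer().fit_transform(docs)
    # TfidfVectorizer L2-normalises each row, so the dot products of row 0 against
    # every row are cosine similarities
    similarities = (tfidf[0] * tfidf.T).toarray()[0]
    return any(sim > threshold for sim in similarities[1:])

print(_isDuplicate('stocks rally as markets open higher',
                   ['stocks rally as markets open much higher',
                    'local team wins the championship game']))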
def parseContent(videoSnippet):
    collection = getCollection('pre')
    # use the video snippet title to get the media name, then the duration to
    # determine whether this is a movie or an episode
    title = videoSnippet['title']
    name = getMediaName(title)
    if name is None:
        return None
    mediaType = getMediaType(videoSnippet['resourceId']['videoId'], title)

    if mediaType == 'MOVIE':
        print(title, 'is movie')
        contentData = getContentObj(videoSnippet, name, 'M')
        contentMeta = {
            "name": contentData['name'],
            "img": contentData['img'],
            "type": contentData['type'],
            "id": contentData['name'].replace(' ', '').lower(),
        }
        collection.insert_one(contentMeta)
    elif mediaType == 'EPISODE':
        print(title, 'is episode')
        contentData = getContentObj(videoSnippet, name, 'E')
        instalment = getInstalments(title)
        contentId = contentData['name'].replace(' ', '').lower()
        if collection.find_one({"id": contentId}) is None:
            # first episode seen for this show: create the document with one season
            contentMeta = {
                "name": contentData['name'],
                "img": contentData['img'],
                "type": contentData['type'],
                "id": contentId,
                "seasons": [{
                    "season": instalment[0],
                    "episodes": [{
                        "title": title,
                        "episode": instalment[1],
                        "id": videoSnippet['resourceId']['videoId']
                    }]
                }]
            }
            collection.insert_one(contentMeta)
        else:
            # show already exists: push the episode onto the matching season
            collection.update_one(
                {
                    "id": contentId,
                    "seasons.season": instalment[0]
                },
                {
                    "$push": {
                        "seasons.$.episodes": {
                            "title": title,
                            "episode": instalment[1],
                            "id": videoSnippet['resourceId']['videoId']
                        }
                    }
                })
    else:
        print('Ignoring', title)
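# Hedged illustration of the document shape parseContent() maintains in the 'pre'
# collection, with made-up values. The field names follow the code above; nothing
# here touches the database.
exampleShow = {
    "name": "Some Show",
    "img": "https://example.invalid/thumb.jpg",
    "type": "E",
    "id": "someshow",
    "seasons": [{
        "season": 1,
        "episodes": [{"title": "Some Show S01E01", "episode": 1, "id": "abc123"}]
    }]
}
# The episode append above relies on Mongo's positional operator: "seasons.$.episodes"
# resolves only when the filter also matches "seasons.season", i.e. the season entry
# must already exist in the document for the $push to take effect.
appendEpisodeFilter = {"id": "someshow", "seasons.season": 1}
appendEpisodeUpdate = {
    "$push": {
        "seasons.$.episodes": {"title": "Some Show S01E02", "episode": 2, "id": "def456"}
    }
}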