# Example #1
def backfill(includeClass=True):
    """Backfill missing 'description' and (optionally) 'class' fields on every
    stored news document.

    For each document: if 'description' is empty, copy the first non-empty
    value among 'title', 'url', 'text' into it; when ``includeClass`` is True,
    also (re)compute 'class' from that same source field. Modified documents
    are upserted back into the collection keyed on their 'digest'.

    Args:
        includeClass: when True, run the classifier on the chosen source
            field and store the result in news['class'].
    """
    newsCollection = getCollection()
    allNews = newsCollection.find({})
    count = 0
    # Fields to try, in priority order, as the description/classification
    # source (the original duplicated this cascade twice).
    sourceFields = ('title', 'url', 'text')
    for news in allNews:
        count += 1
        modified = False

        if count % 50 == 0:
            # Progress log every 50 documents.
            print('backfill on %i news' % count)

        # First non-empty source field, or None when all are missing/empty.
        source = next((news[f] for f in sourceFields if news.get(f)), None)

        if 'description' not in news or not news['description']:
            if source is not None:
                news['description'] = source
                if includeClass:
                    news['class'] = classify(source)
                modified = True
            else:
                # Nothing to copy from: leave description explicitly empty
                # (not marked modified, so the document is not rewritten).
                news['description'] = ''
        elif includeClass and source is not None:
            # Description already present; only refresh the class.
            news['class'] = classify(source)
            modified = True

        if modified:
            newsCollection.replace_one({'digest': news['digest']},
                                       news,
                                       upsert=True)

    print('backfill completed. %i news' % count)
# Example #2
def getNews(userID, pageID):
    """Return one page (PAGINATION items) of news for ``userID``.

    The user's ordered list of news digests is cached in Redis; on a cache
    miss the full news list is fetched from Mongo, reordered by the user's
    preference model, and the digest list is cached with an expiration.
    The requested page slice is then resolved to full news documents.

    Args:
        userID: cache key identifying the user.
        pageID: zero-based page index (string or int).

    Returns:
        JSON-safe list of news dicts, 'publishedAt' formatted as
        'YYYY-MM-DD HH:MM'.
    """
    pageID = int(pageID)
    pageStartIndex = 0  # inclusive
    pageEndIndex = PAGINATION  # exclusive
    if pageID > 0:
        pageStartIndex = pageID * PAGINATION
        pageEndIndex = (pageID + 1) * PAGINATION

    slicedNewsList = []
    # Fetch the cached value exactly once: the original issued two GETs,
    # wasting a round trip and racing against key expiration between them.
    cachedValue = redisClient.get(userID)
    if cachedValue is not None:
        # Redis stores a serialized blob; pickle.loads restores the list.
        cachedNewsDigests = pickle.loads(cachedValue)
        # Out-of-range slices simply yield [] -> slicedNewsList stays [].
        slicedDigests = cachedNewsDigests[pageStartIndex:pageEndIndex]
        slicedNewsList = list(mongoDB.getCollection().find(
            {'digest': {
                '$in': slicedDigests
            }}))
    else:
        pagesOfNews = list(mongoDB.getNews())
        # Reorder news based on the user's preference model.
        pagesOfNews = reorderPagesOfNews(userID, pagesOfNews)
        # List comprehension rather than bare map(): a map object is not
        # picklable on Python 3.
        cachedNewsDigests = [x['digest'] for x in pagesOfNews]

        # Cache the digest ordering and let it expire.
        redisClient.set(userID, pickle.dumps(cachedNewsDigests))
        redisClient.expire(userID, REDIS['DIGEST_EXPIRATION'])
        slicedNewsList = pagesOfNews[pageStartIndex:pageEndIndex]

    # Convert publishedAt from datetime to a display string.
    for newsObj in slicedNewsList:
        newsTime = newsObj['publishedAt']
        strTime = datetime.strftime(newsTime, '%Y-%m-%d %H:%M')
        # NOTE(review): str.decode exists only on Python 2; on Python 3 this
        # raises AttributeError — confirm the target runtime before removing.
        newsObj['publishedAt'] = strTime.decode('utf-8')

    return json.loads(dumps(slicedNewsList))
# Example #3
def getContentObj(videoSnippit, name, mediaType):
    """Build a minimal content metadata dict: name, image URL, and type.

    Checks the 'pre' collection for the title first; only when it is absent
    is IMDB queried via findMedia(). Falls back to the raw name and the
    video snippet's thumbnail when no IMDB match is available.
    """
    collection = getCollection('pre')
    cached = collection.find_one({"name": name})

    # Only hit IMDB when the title is not already stored locally.
    imdbInfo = findMedia(name) if cached is None else None

    if imdbInfo is not None:
        # Prefer the canonical IMDB name and artwork.
        return {
            "name": imdbInfo['name'],
            "img": imdbInfo['img'],
            "type": mediaType
        }

    # Fallback: keep the parsed name and use the YouTube thumbnail.
    return {
        "name": name,
        "img": videoSnippit['thumbnails']['high']['url'],
        "type": mediaType
    }
# Example #4
def getNewsDistribution():
    # compute distribution based on class
    classAmountDict = {}
    for newsClass in NEWSCLASSES:
        amount = mongoDB.getCollection().find({'class': newsClass}).count()
        classAmountDict[newsClass] = amount
    classAmountList = sorted(classAmountDict.items(),
                             key=operator.itemgetter(1),
                             reverse=True)
    classes = [pair[0] for pair in classAmountList]
    classAmounts = [pair[1] for pair in classAmountList]

    # compute distribution based on source
    sourceAmountDict = {}
    for source in NEWSSOURCES:
        amount = mongoDB.getCollection().find({'source': source}).count()
        sourceAmountDict[source] = amount
    sourceAmountList = sorted(sourceAmountDict.items(),
                              key=operator.itemgetter(1),
                              reverse=True)
    # sourceAmountList = [('six', 6), ('six2', 6), ('two', 2), ('one', 1)]

    datalist = []
    total = sum([pair[1] for pair in sourceAmountList])
    remin = 100.0
    for source in sourceAmountList:
        percentile = float(source[1]) / total * 100
        percentile = math.floor(percentile * 10) / 10
        # print "%s: %f" % (source[0],percentile)
        if percentile < 7.6:
            remin = math.floor(remin * 10) / 10
            datalist.append({'name': 'other sources', 'y': remin})
            break
        else:
            datalist.append({'name': source[0], 'y': percentile})
            remin = remin - percentile

    newsDistribution = {
        'class': {
            'chart': {
                'type': 'column'
            },
            'title': {
                'text': 'News Distribution By Class'
            },
            'xAxis': {
                'categories': classes,
                'labels': {
                    'style': {
                        'fontFamily': 'Verdana, sans-serif',
                        'fontSize': '13px'
                    }
                }
            },
            'yAxis': {
                'title': {
                    'text': 'News Amount'
                }
            },
            'series': [{
                'data': classAmounts
            }],
            'legend': {
                'enabled': False
            },
            'credits': {
                'enabled': False
            },
        },
        'source': {
            'chart': {
                'plotBackgroundColor': None,
                'plotBorderWidth': None,
                'plotShadow': False,
                'type': 'pie'
            },
            'title': {
                'text': 'news distribution by news sources'
            },
            'tooltip': {
                'pointFormat': '{series.name}: <b>{point.percentage:.1f}%</b>'
            },
            'plotOptions': {
                'pie': {
                    'allowPointSelect': True,
                    'cursor': 'pointer',
                    'dataLabels': {
                        'enabled': False
                    },
                    'showInLegend': True
                }
            },
            'series': [{
                'name': 'source',
                'colorByPoint': 'true',
                'data': datalist
            }]
        }
    }

    return json.loads(dumps(newsDistribution))
# Example #5
def handler(news):
    """Deduplicate and persist one scraped news message.

    Drops the message when it is malformed, lacks text, or is too similar
    (TF-IDF cosine similarity) to any news already stored for the same
    publish day; otherwise fills a missing description, classifies the
    title, and upserts the document keyed by 'digest'. The queue message
    is acked on every path.
    """
    if news is None or not isinstance(news, dict):
        warn('news deduper dont handle broken news object')
        scraperToDeduperClient.ackMessage()
        return
    if 'text' not in news or not news['text']:
        warn('news %s dont have content' % news['digest'])
        scraperToDeduperClient.ackMessage()
        return

    # Normalize the publish timestamp to a datetime Mongo accepts.
    news['publishedAt'] = parser.parse(news['publishedAt'])
    publishedTime = news['publishedAt']
    # [publishedDayBegin, publishedDayEnd) spans the whole publish day.
    publishedDayBegin = datetime.datetime(publishedTime.year,
                                          publishedTime.month,
                                          publishedTime.day, 0, 0, 0, 0)
    publishedDayEnd = publishedDayBegin + datetime.timedelta(days=1)

    # Fetch all news already stored for that day.
    newsListOnThatDay = list(getCollection().find(
        {'publishedAt': {
            '$gte': publishedDayBegin,
            '$lt': publishedDayEnd
        }}))
    if newsListOnThatDay:
        # Incoming text is row 0; existing texts follow. Texts are passed
        # through unchanged: the original str(text.encode('utf-8')) round
        # trip produces "b'...'" garbage on Python 3 and was a no-op on
        # Python 2.
        newsArray = [
            newsOnThatDay['text'] for newsOnThatDay in newsListOnThatDay
        ]
        newsArray.insert(0, news['text'])

        # TF-IDF rows are L2-normalized, so the dot product of row 0 with
        # every row is the cosine similarity.
        tfidf = TfidfVectorizer().fit_transform(newsArray)
        firstDoc_sim = tfidf[0] * tfidf.T

        _, colSize = firstDoc_sim.shape

        # Drop this news if a sufficiently similar one already exists.
        for col in range(1, colSize):
            if firstDoc_sim[0, col] > TFIDF['SAME_NEWS_SIMILARITY_THRESHOLD']:
                warn(
                    'news deduper found news %s similar to existing one. Ignore.'
                    % news['digest'])
                scraperToDeduperClient.ackMessage()
                return

    # The downstream classifier's vocabulary processor crashes on empty
    # descriptions, so backfill a missing description from the title,
    # falling back to the text (guaranteed non-empty by the guard above).
    if 'description' not in news or not news['description']:
        if 'title' in news and news['title']:
            news['description'] = news['title']
        else:
            news['description'] = news['text']

    # Classify only when the title is non-empty: the original called
    # classify() on empty/None titles, inconsistent with every other
    # truthiness check in this service.
    if 'title' in news and news['title']:
        news['class'] = classify(news['title'])

    # No similar news found: save to the database and ack.
    getCollection().replace_one({'digest': news['digest']}, news, upsert=True)
    scraperToDeduperClient.ackMessage()
# Example #6
def parseContent(videoSnippet):
    """Parse one playlist video snippet and upsert it into the 'pre' collection.

    Derives the media name from the video title, then classifies the video
    as a movie or an episode via its duration. Movies are stored as flat
    documents; episodes are grouped under their show document by season,
    appending to an existing season's episode list when the show already
    exists.

    Returns:
        None in all cases; videos with no extractable name or an
        unrecognized media type are ignored.
    """
    collection = getCollection('pre')

    title = videoSnippet['title']
    name = getMediaName(title)
    if name is None:
        return None

    # The video duration (looked up via the video id) decides movie vs.
    # episode.
    mediaType = getMediaType(videoSnippet['resourceId']['videoId'], title)

    if mediaType == 'MOVIE':
        print(title, 'is movie')
        contentData = getContentObj(videoSnippet, name, 'M')
        contentMeta = {
            "name": contentData['name'],
            "img": contentData['img'],
            "type": contentData['type'],
            "id": contentData['name'].replace(' ', '').lower(),
        }
        collection.insert_one(contentMeta)

    elif mediaType == 'EPISODE':
        print(title, 'is episode')
        contentData = getContentObj(videoSnippet, name, 'E')
        # instalment is (season, episode) — derived from the title.
        instalment = getInstalments(title)
        # Normalized document id, computed once (the original recomputed
        # this expression three times).
        contentId = contentData['name'].replace(' ', '').lower()
        episodeDoc = {
            "title": title,
            "episode": instalment[1],
            "id": videoSnippet['resourceId']['videoId']
        }
        if collection.find_one({"id": contentId}) is None:
            # First episode seen for this show: create the show document
            # with a single season containing this episode.
            contentMeta = {
                "name": contentData['name'],
                "img": contentData['img'],
                "type": contentData['type'],
                "id": contentId,
                "seasons": [{
                    "season": instalment[0],
                    "episodes": [episodeDoc]
                }]
            }
            collection.insert_one(contentMeta)
        else:
            # Show exists: push the episode onto the matching season.
            # NOTE(review): if this season subdocument does not exist yet,
            # the filter matches nothing and the episode is silently
            # dropped — confirm whether a new season should be $push-ed.
            collection.update_one(
                {
                    "id": contentId,
                    "seasons.season": instalment[0]
                }, {"$push": {
                    "seasons.$.episodes": episodeDoc
                }})
    else:
        print('Ignoring', title)