/
mongoDbDailyNews.py
63 lines (53 loc) · 1.7 KB
/
mongoDbDailyNews.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 31 17:41:04 2020
@author: nikhilmatta
"""
from pymongo import MongoClient
from apscheduler.schedulers.blocking import BlockingScheduler
from scraper import data, getLinks, getArticleData
from model import loadModel, getSentiment, getSummary
#loadModel()
HOST = 'localhost'
PORT = 27017
client = MongoClient(HOST, PORT)
database = client.dailyNews
articles = database.articlesData
def storeArticles():
for i in range(len(data['links'])):
link = data['links'][i]
category = data['category'][i]
date = data['date'][i]
image = data['image'][i]
headline = data['headline'][i]
article = data['article'][i]
try:
articleSummary = getSummary(data['article'][i])
articleSentiment = getSentiment(data['article'][i])
articleData = {'link':link,'category':category,'date':date,'image':image,'headline':headline,'article':article,'summary':articleSummary,'sentiment':articleSentiment}
articles.update(articleData,articleData,upsert= True)
print(i)
except:pass
def clearLists():
data['links'].clear()
data['category'].clear()
data['date'].clear()
data['image'].clear()
data['headline'].clear()
data['article'].clear()
data['summary'].clear()
data['sentiment'].clear()
def scraperScheduler():
getLinks()
getArticleData()
storeArticles()
clearLists()
def startScheduler():
scheduler = BlockingScheduler()
scheduler.add_job(scraperScheduler, 'interval', hours=0.5)
scheduler.start()
if __name__ == '__main__':
loadModel()
scraperScheduler()
startScheduler()