class Scheduler: KWE_DATE_FORMAT = "%Y-%m-%d %H" KWE_DATE_FILE = 'kwe_date.txt' def _read_kwe_date(self): if os.path.isfile(self.KWE_DATE_FILE): with open(self.KWE_DATE_FILE, 'r') as f: return datetime.strptime(f.readline().strip(), self.KWE_DATE_FORMAT) return datetime(2019, 1, 23, 0) def _set_kwe_date(self, date): self.kwe_latest = date with open(self.KWE_DATE_FILE, 'w') as f: f.write(date.strftime(self.KWE_DATE_FORMAT)) def __init__(self): self.continue_schedule = False self.local_db = DBHandler() self.all_synonyms = set() self.trustpilot = TrustPilotCrawler() self.reddit = RedditScraper() self.scrapers = { 'trustpilot': TrustPilotCrawler(), 'reddit': RedditScraper() } # TODO: Make Environment Variables for API info self.kwe_api = f'http://{os.environ["KWE_API_HOST"]}/' self.kwe_api_key = {'Authorization': os.environ['KWE_API_KEY']} self.sa_api = f'http://{os.environ["SA_API_HOST"]}/prediction/' self.sa_api_key = {'Authorization': os.environ['SA_API_KEY']} self.synonym_api = f'http://{os.environ["GATEWAY_API_HOST"]}/api/synonyms' self.synonym_api_key = {'Authorization': os.environ['GATEWAY_API_KEY']} self.sentiment_categories = [{ 'category': 'positive', 'upper_limit': 1, 'lower_limit': 0.55 }, { 'category': 'negative', 'upper_limit': 0.45, 'lower_limit': 0 }, { 'category': 'neutral', 'upper_limit': 0.55, 'lower_limit': 0.45 }] self.kwe_interval = timedelta(hours=1) self.kwe_latest = self._read_kwe_date() self.continue_schedule = True self.schedule_thread = Thread() self.crawler_schedule_thread = Thread() self.begin_schedule() logging.info( f'Initiated scheduler, will create snapshots from {self.kwe_latest}' ) def begin_schedule(self): # TODO: # This should be run as a separate thread. # Start a KWE scheduling thread. # Repeat the following: # Fetch the next synonym from the synonym queue. # With this synonym, request from the crawler and # scraper all text posts saved with a relation to # this synonym. # Conduct sentiment analysis for every post immediately. # Store text+sentiment+date in a local database. # When the KWE scheduler decides it's time (once every hour), # fetch all text+sentiment+date from the local database. # Conduct KWE for all texts with the same sentiment. # In the main database, save the top-5-keywords with a relation # to both the synonym and their sentiment. """ for scraper in self.scrapers.keys(): self.scrapers[scraper].begin_crawl() """ self.reddit.begin_crawl() # self.trustpilot.begin_crawl() self.continue_schedule = True self.schedule_thread = Thread(target=self._threaded_schedule, name='Scheduler') def run(self): self.schedule_thread.start() def _save_snapshot(self, synonym, spans_from, spans_to): snapshot = self.create_snapshot(synonym, spans_from, spans_to) if snapshot: snapshot.save_remotely() @retry(delay=0.5, backoff=2, max_delay=60) def _threaded_schedule(self): while True: if not self.continue_schedule: return # Retrieve active synonyms from gateway logger.info('Updating synonyms') self.update_synonyms(self.fetch_all_synonyms().keys()) # Get and commit new posts logger.info('Retrieving posts') self.commit_reviews(self.retrieve_posts()) # Get and update sentiments for new posts logger.info('Fetching unsentimented posts') posts = self.fetch_new_posts(limit=10000) logger.info(f'{len(posts)} new posts fetched') if posts: sentiments = self.calculate_sentiments(posts) try: self.local_db.update_sentiments(sentiments) except Exception as e: print( f'Scheduler._threaded_schedule: Exception encountered with local_db.update_sentiments: {e}' ) traceback.print_exc() # TODO: Handle local_db.update_sentiments exceptions # Perform keyword extraction and save snapshots from current interval if datetime.utcnow() > self.kwe_latest + (2 * self.kwe_interval): logger.info(f'Current snapshot date: {self.kwe_latest}') jobs = [] with ThreadPoolExecutor(max_workers=30) as executor: for synonym in self.all_synonyms: jobs.append( executor.submit( self._save_snapshot, synonym, self.kwe_latest, self.kwe_latest + self.kwe_interval)) futures.wait(jobs) logger.info( f'Finished {len(jobs)} futures for date {self.kwe_latest}') self._set_kwe_date(self.kwe_latest + self.kwe_interval) else: sleep(5) def calculate_sentiments(self, posts): """ :param posts: { id : integer, text : string, } """ # Extract the post contents id_list = [] content_list = [] for id, content in posts.items(): id_list.append(id) content_list.append(content) # Call the SentimentAnalysis API try: predictions = json.loads( requests.post(self.sa_api, json=dict(data=content_list)).text) except Exception as e: print( f'Scheduler.calculate_sentiments: Exception encountered with SA API: {e}' ) traceback.print_exc() # TODO: Handle SA API exceptions return [] # Combine predictions with posts results = [{ 'id': id_list[i], 'sentiment': predictions['predictions'][i] } for i in range(0, len(predictions['predictions']))] return results def retrieve_posts(self): # Get posts from each scraper reddit_buffer = self.reddit.get_buffer_contents() trustpilot_buffer = self.trustpilot.get_buffer_contents() return {'trustpilot': trustpilot_buffer, 'reddit': reddit_buffer} def commit_reviews(self, reviews): # Get reviews from each crawler tp_reviews = reviews['trustpilot'] reddit_reviews = reviews['reddit'] # Commit reviews to the database try: for review in tp_reviews: self.local_db.commit_trustpilot( identifier=review['id'], synonym=review['synonym'], contents=review['text'], user=review['author'], date=review['date'], num_user_ratings=review['num_ratings']) except Exception as e: print( f'Scheduler.commit_reviews: Exception encountered while commiting trustpilot posts to database: {e}' ) traceback.print_exc() # TODO: Handle [db_handler].commit_trustpilot exceptions try: for review in reddit_reviews: self.local_db.commit_reddit(unique_id=review['id'], synonyms=review['synonyms'], text=review['text'], author=review['author'], date=review['date'], subreddit=review['subreddit']) except Exception as e: print( f'Scheduler.commit_reviews: Exception encountered while commiting reddit posts to database: {e}' ) traceback.print_exc() # TODO: Handle [db_handler].commit_reddit exceptions def fetch_all_synonyms(self): try: synonyms = requests.get(self.synonym_api, headers=self.synonym_api_key).json() return synonyms except Exception as e: print( f'Scheduler.fetch_all_synonyms: Exception encountered with synonym api: {e}' ) return {synonym: -1 for synonym in self.all_synonyms} def fetch_new_posts(self, synonym=None, with_sentiment=False, limit=None): """ Returns all newly crawled posts from the crawler and scraper that relate to this synonym. :param synonym : string :param with_sentiment : boolean - if set to false, only returns rows where sentiment = NULL. :param limit : integer """ try: posts = self.local_db.get_new_posts(synonym, with_sentiment, limit) return posts except Exception as e: logger.error( f'Exception encountered while retrieving posts from database: {e}' ) traceback.print_exc() # TODO: Handle [db_handler].get_new_posts exceptions return {} def create_snapshot(self, synonym, from_time=datetime.min, to_time=datetime.now()): """ :param synonym: string :param from_time: datetime :param to_time: datetime """ statistics = dict() posts = list() try: posts = self.local_db.get_kwe_posts(synonym, from_time, to_time) except Exception as e: print( f'Scheduler.create_snapshot: Exception encountered while retrieving posts from database: {e}' ) traceback.print_exc() # TODO: Handle [db_handler].get_kwe_posts exception if posts: avg_sentiment = mean([p["sentiment"] for p in posts]) splits = [{ "sentiment_category": sc["category"], "posts": [ p["content"] for p in posts if sc["upper_limit"] >= p["sentiment"] >= sc["lower_limit"] ] } for sc in self.sentiment_categories] # For each split of posts, compute keywords and number of posts for split in splits: keywords = [] num_posts = len(split["posts"]) # Only requests keywords if there are posts if num_posts: try: logger.info(f'Performing KWE on posts for {synonym}') response = requests.post( self.kwe_api, json=dict(posts=split["posts"]), headers=self.kwe_api_key).json() keywords = response.get('keywords', []) except Exception as e: print( f'Scheduler.create_snapshot: Exception encountered with KWE API: {e}' ) traceback.print_exc() return None statistics[split['sentiment_category']] = { "keywords": keywords, "posts": num_posts } else: return None return Snapshot(spans_from=from_time, spans_to=to_time, sentiment=avg_sentiment, synonym=synonym, statistics=statistics) def update_synonyms(self, synonyms): if set(synonyms) == self.all_synonyms: return try: self.local_db.commit_synonyms(synonyms) except Exception as e: print( f'Scheduler.update_synonyms: Exception encountered while commiting synonyms to database: {e}' ) traceback.print_exc() # TODO: Handle [db_handler].commit_synonyms exceptions return self.all_synonyms = self.all_synonyms.union(synonyms) # Update scraper synonyms self.reddit.use_synonyms(self.all_synonyms) self.trustpilot.use_synonyms(self.all_synonyms) def add_synonym(self, synonym): self.add_synonyms([synonym]) def add_synonyms(self, synonyms): self.update_synonyms(list(self.all_synonyms.union(synonyms)))