class ArticleScrapingTask:
    def __init__(self, scraper, min_word_count_heuristic=100):
        """
        @type scraper: Scraper
        @type min_word_count_heuristic: int
        """
        self._scraper = scraper
        self._min_word_count_heuristic = min_word_count_heuristic
        self._logger = Logger(self.__class__.__name__)

    def run(self, article):
        """
        @type article: Article
        """
        if article.url:
            self._logger.info('Scraping %s.', article.url)
            for parser in self._scraper.scrape(article.url):
                assert isinstance(parser, ArticleParser)
                # The final URL of the article may differ from the original; during
                # scraping, the scraper passes the final URL to each constructed parser.
                article.url = parser.url
                title = parser.get_title()
                publish_date = parser.get_publish_date()
                preview_image_url = parser.get_preview_image_url()
                body = parser.get_body()
                if title:
                    article.title = title
                if publish_date:
                    article.publish_date = publish_date
                if preview_image_url:
                    article.preview_image_url = preview_image_url
                if body and self._is_article_body(body):
                    article.body = body
                elif article.description and self._is_article_body(article.description):
                    article.body = article.description
                else:
                    break
                return True
        return False

    def _is_article_body(self, body):
        return self._count_words(body) > self._min_word_count_heuristic

    @staticmethod
    def _count_words(s):
        return len(s.split())
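# Minimal usage sketch (not part of the project): it assumes a Scraper whose
# scrape(url) yields ArticleParser objects and an Article object carrying url,
# title, publish_date, preview_image_url, body and description attributes, as
# implied by ArticleScrapingTask.run() above. All constructions and the URL below
# are illustrative placeholders.
def scrape_single_article():
    scraper = Scraper()  # hypothetical construction
    task = ArticleScrapingTask(scraper, min_word_count_heuristic=100)
    article = Article(url='http://example.com/some-story')  # hypothetical construction
    if task.run(article):
        print('Scraped "%s" (%d words).' % (article.title, len(article.body.split())))
    else:
        print('Could not extract an article body from %s.' % article.url)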
def sync_stats():
    """Fetch latest stats from IEDCR website"""
    try:
        logger = Logger.create_logger("tasks")
        logger.info("Starting sync of stats data")

        if Meta.is_stats_syncing():
            logger.info("A stats sync is already in progress")
            return

        Meta.set_stats_syncing(True)

        provider = DataProvider()
        data = provider.get_stats()
        stat = Stat.get()

        # iteratively update the data
        for attr, value in data.items():
            setattr(stat, attr, value)
        stat.save()

        Meta.set_stats_syncing(False)
        logger.info("Stats sync complete")
    except Exception as e:
        Meta.set_stats_syncing(False)
        logger.error(f"Stats sync failed with error: {e}")
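# Sketch only: how the sync tasks in this module might be run periodically. The
# project's actual scheduling mechanism is not shown here; APScheduler is just one
# common choice, and the 30-minute interval is an arbitrary example value. Both
# tasks guard themselves via the Meta flags, so overlapping triggers are harmless.
from apscheduler.schedulers.background import BackgroundScheduler

def start_scheduler():
    scheduler = BackgroundScheduler()
    scheduler.add_job(sync_stats, "interval", minutes=30)
    scheduler.add_job(sync_district_data, "interval", minutes=30)
    scheduler.start()
    return scheduler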
class OMDBController:
    def __init__(self):
        self.http_requests = Requester()
        self.logger = Logger().get_logger()
        self.config = Config()

    def omdb_request(self, query_string):
        self.logger.debug(f'query_string received: {query_string}')
        res = self.http_requests.get_url(
            url=f'{self.config.omdb_base_url}?{query_string}',
            jsonify=True,
            params={
                'apikey': self.config.omdb_api_key,
                'i': self.config.omdb_id,
            },
        )
        return res
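# Illustrative usage sketch (not from the project): builds a query string with
# urllib.parse.urlencode and passes it to omdb_request(). 't' and 'plot' are
# standard OMDb query fields; the surrounding wiring is assumed.
from urllib.parse import urlencode

def fetch_movie_by_title(title):
    controller = OMDBController()
    query_string = urlencode({'t': title, 'plot': 'short'})
    return controller.omdb_request(query_string)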
class MarvelController:
    def __init__(self):
        self.http_requests = Requester()
        self.logger = Logger().get_logger()
        self.config = Config()

    def marvel_request(self, query_string):
        self.logger.debug(f'query_string received: {query_string}')
        url = f'{self.config.marvel_base_url}/v1/public/characters?{query_string}'
        self.logger.debug(f'requesting: {url}')
        res = self.http_requests.get_url(
            url=url,
            jsonify=True,
            params={
                'apikey': self.config.marvel_public_key,
            },
        )
        return res
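# Note/sketch: for server-side calls the Marvel API normally also expects a 'ts'
# value and an md5 'hash' of ts + private key + public key alongside 'apikey'.
# This helper is illustrative only and assumes a marvel_private_key field on
# Config, which is not shown in the project code above.
import hashlib
import time

def marvel_auth_params(config):
    ts = str(int(time.time()))
    digest = hashlib.md5(
        (ts + config.marvel_private_key + config.marvel_public_key).encode('utf-8')
    ).hexdigest()
    return {'ts': ts, 'apikey': config.marvel_public_key, 'hash': digest}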
class TopicScrapingTask:
    def __init__(self, topic_scraper, article_scraping_task):
        """
        @type topic_scraper: Scraper
        @type article_scraping_task: ArticleScrapingTask
        """
        self._topic_scraper = topic_scraper
        self._article_scraping_task = article_scraping_task
        self._logger = Logger(self.__class__.__name__)

    def run(self, topic_url):
        for parser in self._topic_scraper.scrape(topic_url):
            assert isinstance(parser, TopicParser)
            self._logger.info('Scraping topic at %s.', topic_url)
            articles = []
            for article in parser.get_articles():
                try:
                    if self._article_scraping_task.run(article):
                        articles.append(article)
                    else:
                        self._logger.warn('Could not parse article body at %s', article.url)
                except IOError as e:
                    self._logger.error('Failed scraping article: %s', e)
                    continue
            return articles
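# Wiring sketch (illustrative, not from the project): composes the two scraping
# tasks above. The Scraper constructions and the topic URL are placeholders.
def scrape_topic(topic_url):
    article_task = ArticleScrapingTask(Scraper(), min_word_count_heuristic=100)
    topic_task = TopicScrapingTask(Scraper(), article_task)
    return topic_task.run(topic_url)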
class KeywordAlgorithm(Algorithm):
    name = 'keyword'

    def __init__(self):
        Algorithm.__init__(self)
        self._score_mapper = ScoreMapper()
        self._logger = Logger(self.__class__.__name__)

    def train(self, articles, states):
        self._params.extractor = KeywordFeatureExtractor(finder=KeywordFinder(),
                                                         text_key=lambda a: a.title)
        features = np.array(self._params.extractor.train_extract(articles))
        scores = np.array(self._score_mapper.map_batch_score(states))
        regression = LinearRegression(fit_intercept=True)
        n_features = features.shape[1]
        self._logger.info('Feature space uses %d keywords', n_features)
        if n_features >= 100:
            # Search over the number of retained principal components; the
            # n_components passed to PCA here is only a starting value and is
            # overridden by the grid search.
            param_grid = {'pca__n_components': list(range(50, n_features, 50))}
            pca = PCA(n_components=100)
            pipeline = Pipeline([('pca', pca), ('regression', regression)])
            # top_item_scorer is assumed to be a scorer callable accepted by
            # GridSearchCV's scoring parameter.
            clf = GridSearchCV(pipeline, param_grid, n_jobs=1, verbose=0, cv=3,
                               scoring=top_item_scorer)
        else:
            clf = regression
        self._params.classifier = clf
        self._params.classifier.fit(features, scores)

    def score(self, articles):
        self._logger.info('Feature space uses %d keywords',
                          self._params.extractor.keyword_count())
        features = self._params.extractor.extract(articles)
        return self._params.classifier.predict(np.array(features))
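# Sketch only: top_item_scorer is referenced above but not defined in this snippet.
# With current scikit-learn, a custom scorer for GridSearchCV(scoring=...) is usually
# built with make_scorer from a plain metric; the metric below (does the model rank
# the truly best item first?) is just an illustrative guess at its behaviour.
import numpy as np
from sklearn.metrics import make_scorer

def _top_item_match(y_true, y_pred):
    # 1.0 if the highest-predicted item is also the highest-scored item, else 0.0
    return float(np.argmax(y_pred) == np.argmax(y_true))

example_top_item_scorer = make_scorer(_top_item_match, greater_is_better=True)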
class WorkerThread(threading.Thread):
    def __init__(self, worker_id, task_queue, completed_queue):
        """
        @type worker_id: int
        @type task_queue: Queue.Queue
        @type completed_queue: Queue.Queue
        """
        self._name = '%s-%d' % (self.__class__.__name__, worker_id)
        threading.Thread.__init__(self, name=self._name)
        self._logger = Logger(self._name)
        self._id = worker_id
        self._task_queue = task_queue
        self._completed_queue = completed_queue
        self._continue = True

    def stop(self):
        self._continue = False

    def run(self):
        while self._continue:
            self.work()
        self.exit()

    def work(self):
        raise NotImplementedError

    def exit(self):
        self._logger.info('Exiting.')

    @classmethod
    def initializer(cls, *args, **kwargs):
        class _WorkerThread(cls):
            def __init__(self, worker_id, task_queue, completed_queue):
                cls.__init__(self, worker_id, task_queue, completed_queue, *args, **kwargs)
        return _WorkerThread
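# Illustrative subclass (not part of the project): shows how work() is meant to be
# overridden and how initializer() bakes extra constructor arguments into a
# pool-friendly class. A None sentinel is used to unblock the blocking queue get()
# so stop() can take effect; scraping_task and the queue payloads are assumptions.
class ScrapeWorkerThread(WorkerThread):
    def __init__(self, worker_id, task_queue, completed_queue, scraping_task):
        WorkerThread.__init__(self, worker_id, task_queue, completed_queue)
        self._scraping_task = scraping_task

    def work(self):
        article = self._task_queue.get()
        if article is None:  # sentinel: finish after the current loop iteration
            self.stop()
            return
        if self._scraping_task.run(article):
            self._completed_queue.put(article)

# Typical wiring through initializer():
#   WorkerClass = ScrapeWorkerThread.initializer(scraping_task)
#   worker = WorkerClass(0, task_queue, completed_queue)
#   worker.start()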
class BaseModel(db.Model):
    __abstract__ = True
    logger = Logger.create_logger(__name__)

    def save(self):
        """save the item to database"""
        try:
            db.session.add(self)
            db.session.commit()
        except Exception as e:
            # roll back so the session stays usable after a failed commit
            db.session.rollback()
            self.logger.error(f"Error while saving to database: {e}")

    def delete(self):
        """delete the item from database"""
        try:
            db.session.delete(self)
            db.session.commit()
        except Exception as e:
            db.session.rollback()
            self.logger.error(f"Error while deleting from database: {e}")
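# Illustrative model (not from the project): shows how a concrete table would extend
# BaseModel with Flask-SQLAlchemy columns; the table and field names are examples only.
class Note(BaseModel):
    __tablename__ = "notes"

    id = db.Column(db.Integer, primary_key=True)
    text = db.Column(db.String(255), nullable=False)

# usage: Note(text="hello").save() persists the row and note.delete() removes it;
# both log instead of raising if the commit fails.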
from application import app
from application.logger import Logger

Logger().init()

# api
import application.api.status_api
import application.api.example_api

# if run with cli this is NOT executed
if __name__ == '__main__':
    app.logger.info('start application: [{0}] @ {1}:{2} in DEBUG={3}'.format(
        app.config['APP_NAME'],
        app.config['HTTP_HOST'],
        app.config['HTTP_PORT'],
        app.config['DEBUG']))
    app.run(host=app.config['HTTP_HOST'],
            port=app.config['HTTP_PORT'],
            debug=app.config['DEBUG'])
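# Sketch of one of the side-effect-only API modules imported above (their real
# contents are not shown here). Each module is assumed to register its routes
# directly on the shared app object, which is why importing it is enough; the
# route path and payload below are illustrative guesses.
# application/api/status_api.py
from flask import jsonify
from application import app

@app.route('/status')
def status():
    return jsonify(status='ok', app=app.config['APP_NAME'])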
import requests
from requests.auth import HTTPBasicAuth

from application.logger import Logger
from application.config import Config

logger = Logger().get_logger()


class Requester:
    def __init__(self):
        self.config = Config()

    def get_url(self, url, jsonify, headers=None, verify=True, params=None):
        if headers is None:
            headers = {'content': 'application/json'}

        # make GET request
        try:
            response = requests.get(url, headers=headers, verify=verify, params=params)
            response.raise_for_status()
        except requests.exceptions.HTTPError as e:
            logger.exception(f'Requester.get_url Exception: {e}')
            raise e

        # jsonify response if requested, otherwise return the raw response
        if jsonify:
            return response.json()
        return response
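# Usage sketch (illustrative): jsonify=True returns the decoded JSON body, while
# jsonify=False returns the underlying requests.Response. The URL is a placeholder
# endpoint that echoes its query parameters back as JSON.
def demo_requester():
    requester = Requester()
    data = requester.get_url('https://httpbin.org/get', jsonify=True, params={'q': 'test'})
    print(data['args'])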
def sync_district_data():
    """Fetch latest data from IEDCR reports"""
    try:
        # For some unknown reason, Logger.create_logger(__name__), where
        # __name__ == "application.tasks", doesn't bind the handler. After some
        # debugging, I found that any name prefixed with "application.*" doesn't
        # work here: Logger.create_logger() assumes a handler is already bound,
        # although it isn't. The other parts don't have this problem; for example,
        # when the logger is created inside the DataProvider module, the name
        # "application.provider.*" works fine. This is a weird issue that needs a
        # closer look later; for now the logger is simply named "tasks".
        logger = Logger.create_logger("tasks")
        logger.info("Starting sync of district data")

        if Meta.is_district_syncing():
            logger.info("A district sync is already in progress")
            return

        # set updating state to true
        Meta.set_district_syncing(True)

        # download and get updated data
        provider = DataProvider()
        # returns a list of tuples: [...(districtName, count)]
        new_data = provider.sync_district_data()

        last_updated = Meta.get_last_district_sync()

        # flag to monitor whether the fetched data has changed
        has_updated = False

        # difference between the current time and the last updated time
        update_delta = datetime.utcnow() - last_updated

        # check the data against database records and update as necessary
        for pair in new_data:
            # ignore blank data
            if pair[0] == "" or pair[1] == "":
                continue

            district = District.find_by_name(pair[0])
            if district:
                if district.count != pair[1]:
                    # count changed from last record
                    # - save previous count
                    # - update new count
                    district.prev_count = district.count
                    district.count = pair[1]
                    has_updated = True
                else:
                    # count did not change
                    # - make count and prev_count the same only if the last change
                    #   was at least a day ago
                    if update_delta.days >= 1:
                        district.prev_count = district.count
                district.save()
            else:
                new_district = District(pair[0], pair[1])
                new_district.save()
                has_updated = True

        # set updating state to False as the update is finished
        Meta.set_district_syncing(False)

        logger.debug(f"Has updated = {has_updated}")
        if has_updated:
            # set the last updated time to now only if more than 24 hours have
            # passed; the 24-hour window helps to better calculate
            # new_count - prev_count
            if update_delta.days >= 1:
                Meta.set_last_district_sync()
                logger.info("Updated last sync time")
            logger.info("District sync complete (fetched new data)")
            return

        logger.info("District sync complete (already up-to-date)")
    except Exception as e:
        Meta.set_district_syncing(False)
        logger.error(f"District sync failed with error: {e}")
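# Interface sketch only: the Meta helpers used by the sync tasks are not defined in
# this snippet. The real project presumably persists these flags in the database;
# this in-memory stand-in just documents the contract the tasks rely on.
class MetaSketch:
    _district_syncing = False
    _last_district_sync = datetime.utcnow()

    @classmethod
    def is_district_syncing(cls):
        return cls._district_syncing

    @classmethod
    def set_district_syncing(cls, value):
        cls._district_syncing = value

    @classmethod
    def get_last_district_sync(cls):
        return cls._last_district_sync

    @classmethod
    def set_last_district_sync(cls):
        cls._last_district_sync = datetime.utcnow()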
class DataProvider:
    logger = Logger.create_logger(__name__)

    def __init__(self):
        self.stats_data_source = "https://corona.gov.bd/lang/en"
        self.district_report_url = os.environ.get("REPORT_URL")
        self.trans_table = str.maketrans("০১২৩৪৫৬৭৮৯", "0123456789")

    def get_stats(self):
        """Fetch the latest statistics like total positive cases, deaths etc"""
        page = requests.get(self.stats_data_source)
        soup = bs(page.content, "html.parser")
        counts = soup.select(".live-update-box-wrap-h1>b")

        # replace Bangla digits with English (a no-op for English digits) and convert to int
        for i in range(len(counts)):
            counts[i] = int(counts[i].text.translate(self.trans_table))

        data_dict = {
            "positive_24": counts[0],
            "positive_total": counts[1],
            "death_24": counts[2],
            "death_total": counts[3],
            "recovered_24": counts[4],
            "recovered_total": counts[5],
            "test_24": counts[6],
            "test_total": counts[7],
        }
        self.logger.debug(data_dict)
        return data_dict

    def parse_district_data(self):
        """Parse the Google Sheets report to get district data"""
        page = requests.get(self.district_report_url)
        soup = bs(page.content, "html.parser")
        table = soup.find("table")
        rows = table.find_all("tr")

        result = []
        for rindex, row in enumerate(rows):
            # ignore the first two rows and the last row because they are headers/totals
            if rindex < 2 or rindex == len(rows) - 1:
                continue
            data = []
            for col in row.find_all("td"):
                # ignore division names column
                if col.has_attr("rowspan"):
                    continue
                data.append(self.sanitize(col.text))
            result.append(data)
        return result

    def sanitize(self, s):
        """Sanitize a string by replacing invalid characters with correct ones
        and converting it to int if applicable."""
        mapping = {"’": "'"}
        for key, val in mapping.items():
            s = s.replace(key, val)
        if s.isdigit():
            s = int(s)
        return s

    def sync_district_data(self):
        return self.parse_district_data()
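# Quick usage sketch (illustrative): exercises both code paths of DataProvider.
# REPORT_URL must point at the published report table for district parsing to work;
# its exact value is environment-specific and not shown in the project.
if __name__ == "__main__":
    provider = DataProvider()
    print(provider.get_stats())               # national totals scraped from corona.gov.bd
    print(provider.sync_district_data()[:5])  # first few parsed district rows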