Пример #1
0
class ArticleScrapingTask:
    """Fill in an article's fields from the page scraped at its URL."""

    def __init__(self, scraper, min_word_count_heuristic=100):
        """
        @type scraper: Scraper
        @type min_word_count_heuristic: int
        """
        self._logger = Logger(self.__class__.__name__)
        self._min_word_count_heuristic = min_word_count_heuristic
        self._scraper = scraper

    def run(self, article):
        """
        Scrape the article's URL and copy the parsed fields onto the article.

        Returns True when a body (or, failing that, the description) long
        enough to pass the word-count heuristic was stored, False otherwise.

        @type article: Article
        """
        if not article.url:
            return False

        self._logger.info('Scraping %s.', article.url)

        for parser in self._scraper.scrape(article.url):
            assert isinstance(parser, ArticleParser)

            # The scraper hands the final URL to every parser it constructs,
            # and that URL may differ from the one we started with.
            article.url = parser.url

            title = parser.get_title()
            publish_date = parser.get_publish_date()
            preview_image_url = parser.get_preview_image_url()
            body = parser.get_body()

            # Only overwrite fields for which the parser found something.
            optional_fields = (
                ('title', title),
                ('publish_date', publish_date),
                ('preview_image_url', preview_image_url),
            )
            for field_name, value in optional_fields:
                if value:
                    setattr(article, field_name, value)

            if body and self._is_article_body(body):
                article.body = body
                return True

            description = article.description
            if description and self._is_article_body(description):
                article.body = description
                return True

            # Neither candidate looks like an article; later parsers are not tried.
            return False

        return False

    def _is_article_body(self, body):
        """Tell whether the text has strictly more words than the configured minimum."""
        word_count = self._count_words(body)
        return word_count > self._min_word_count_heuristic

    @staticmethod
    def _count_words(s):
        """Count the whitespace-separated tokens in s."""
        tokens = s.split()
        return len(tokens)
Пример #2
0
def sync_stats():
    """Fetch latest stats from IEDCR website.

    Does nothing when another stats sync is flagged as in progress. Any error
    raised while syncing is logged and swallowed, so the function always
    returns None.
    """
    # Created outside the try block so the error handler below can always
    # rely on `logger` being bound.
    logger = Logger.create_logger("tasks")

    # True only once this run has claimed the "syncing" flag. A failure that
    # happens earlier must not clear a flag that belongs to another run.
    flag_claimed = False
    try:
        logger.info("Starting sync of stats data")
        if Meta.is_stats_syncing():
            logger.info("A stats sync is already in progress")
            return

        flag_claimed = True
        Meta.set_stats_syncing(True)

        provider = DataProvider()
        data = provider.get_stats()

        stat = Stat.get()

        # copy every fetched value onto the stored record
        for attr, value in data.items():
            setattr(stat, attr, value)

        stat.save()
        logger.info("Stats sync complete")
    except Exception as e:
        logger.error(f"Stats sync failed with error: {e}")
    finally:
        if flag_claimed:
            Meta.set_stats_syncing(False)
Пример #3
0
class OMDBController:
    """Issue GET requests against the configured OMDB endpoint."""

    def __init__(self):
        self.http_requests = Requester()
        self.logger = Logger().get_logger()
        self.config = Config()

    def omdb_request(self, query_string):
        """Send query_string to the OMDB base URL and return the requester's result.

        The API key and id from the configuration are passed as request
        parameters alongside the caller's query string.
        """
        self.logger.debug(f'query_string received: {query_string}')

        settings = self.config
        endpoint = f'{settings.omdb_base_url}?{query_string}'
        credentials = {
            'apikey': settings.omdb_api_key,
            'i': settings.omdb_id,
        }
        return self.http_requests.get_url(url=endpoint, jsonify=True, params=credentials)
Пример #4
0
 def __init__(self, topic_scraper, article_scraping_task):
     """
     Keep references to the topic scraper and the per-article task, and
     create a logger named after the concrete class.

     @type topic_scraper: Scraper
     @type article_scraping_task: ArticleScrapingTask
     """
     self._logger = Logger(self.__class__.__name__)
     self._article_scraping_task = article_scraping_task
     self._topic_scraper = topic_scraper
Пример #5
0
class MarvelController:
    """Issue GET requests against the Marvel characters endpoint."""

    def __init__(self):
        self.http_requests = Requester()
        self.logger = Logger().get_logger()
        self.config = Config()

    def marvel_request(self, query_string):
        """Query `/v1/public/characters` with query_string and return the requester's result.

        The configured public key is sent as the `apikey` request parameter.
        """
        self.logger.debug(f'\n\nquery_string received: {query_string}')

        # Build the URL once; it is logged at debug level instead of being
        # printed to stdout.
        url = f'{self.config.marvel_base_url}/v1/public/characters?{query_string}'
        self.logger.debug(f'request url: {url}')

        return self.http_requests.get_url(
            url=url,
            jsonify=True,
            params={
                'apikey': self.config.marvel_public_key,
            },
        )
Пример #6
0
 def __init__(self, scraper, min_word_count_heuristic=100):
     """
     Keep the scraper and the word-count threshold, and create a logger
     named after the concrete class.

     @type scraper: Scraper
     @type min_word_count_heuristic: int
     """
     self._logger = Logger(self.__class__.__name__)
     self._min_word_count_heuristic = min_word_count_heuristic
     self._scraper = scraper
Пример #7
0
class TopicScrapingTask:
    """Scrape a topic page and run the article task on every article it lists."""

    def __init__(self, topic_scraper, article_scraping_task):
        """
        @type topic_scraper: Scraper
        @type article_scraping_task: ArticleScrapingTask
        """
        self._topic_scraper = topic_scraper
        self._article_scraping_task = article_scraping_task
        self._logger = Logger(self.__class__.__name__)

    def run(self, topic_url):
        """
        Scrape topic_url and return the articles that were scraped successfully.

        Only the first parser produced by the topic scraper is used. When the
        scraper produces no parser at all, the method returns None.

        @type topic_url: str
        """
        for parser in self._topic_scraper.scrape(topic_url):
            assert isinstance(parser, TopicParser)
            self._logger.info('Scraping topic at %s.', topic_url)

            articles = []
            for article in parser.get_articles():
                try:
                    if self._article_scraping_task.run(article):
                        articles.append(article)
                    else:
                        self._logger.warn('Could not parse article body at %s', article.url)

                # `as` form is valid on both Python 2.6+ and Python 3; an
                # article that fails with an I/O error is logged and skipped.
                except IOError as e:
                    self._logger.error('Failed scraping article: %s' % e)

            return articles
Пример #8
0
class KeywordAlgorithm(Algorithm):
    """Score articles with a linear regression over keyword features of their titles."""

    name = 'keyword'

    def __init__(self):
        Algorithm.__init__(self)
        self._logger = Logger(self.__class__.__name__)
        self._score_mapper = ScoreMapper()

    def train(self, articles, states):
        """Fit the keyword extractor and the regression model on the given articles.

        With 100 or more keyword features the regression is wrapped in a PCA
        pipeline whose component count is picked by a grid search; with fewer
        features the plain linear regression is fitted directly.
        """
        params = self._params
        params.extractor = KeywordFeatureExtractor(finder=KeywordFinder(), text_key=lambda a: a.title)

        features = np.array(params.extractor.train_extract(articles))
        scores = np.array(self._score_mapper.map_batch_score(states))

        n_features = features.shape[1]
        self._logger.info('Feature space uses %d keywords', n_features)

        estimator = LinearRegression(fit_intercept=True)
        if n_features >= 100:
            steps = [('pca', PCA(n_components=100)), ('regression', estimator)]
            search_space = {'pca__n_components': range(50, n_features, 50)}
            estimator = GridSearchCV(
                Pipeline(steps),
                search_space,
                n_jobs=1,
                verbose=0,
                cv=3,
                score_func=top_item_scorer,
            )

        params.classifier = estimator
        params.classifier.fit(features, scores)

    def score(self, articles):
        """Predict a score for each article using the trained extractor and model."""
        params = self._params
        self._logger.info('Feature space uses %d keywords', params.extractor.keyword_count())

        feature_matrix = np.array(params.extractor.extract(articles))
        return params.classifier.predict(feature_matrix)
Пример #9
0
class WorkerThread(threading.Thread):
    """Thread that keeps calling work() until it is asked to stop."""

    def __init__(self, worker_id, task_queue, completed_queue):
        """
        @type worker_id: int
        @type task_queue: Queue.Queue
        @type completed_queue: Queue.Queue
        """
        # The thread name doubles as the logger name: "<ClassName>-<id>".
        self._name = '%s-%d' % (self.__class__.__name__, worker_id)

        threading.Thread.__init__(self, name=self._name)

        self._continue = True
        self._id = worker_id
        self._task_queue = task_queue
        self._completed_queue = completed_queue
        self._logger = Logger(self._name)

    def stop(self):
        """Ask the run loop to finish once the current work() call returns."""
        self._continue = False

    def run(self):
        """Call work() repeatedly until stop() was requested, then call exit()."""
        while True:
            if not self._continue:
                break
            self.work()
        self.exit()

    def work(self):
        """One unit of work; subclasses must override this."""
        raise NotImplementedError

    def exit(self):
        """Log that the worker is leaving its run loop."""
        self._logger.info('Exiting.')

    @classmethod
    def initializer(cls, *args, **kwargs):
        """Return a subclass whose constructor appends the given extra arguments.

        The returned class takes the usual (worker_id, task_queue,
        completed_queue) and forwards them, followed by *args and **kwargs,
        to this class's __init__.
        """
        extra_args, extra_kwargs = args, kwargs

        class _WorkerThread(cls):
            def __init__(self, worker_id, task_queue, completed_queue):
                cls.__init__(self, worker_id, task_queue, completed_queue, *extra_args, **extra_kwargs)

        return _WorkerThread
Пример #10
0
    def __init__(self, worker_id, task_queue, completed_queue):
        """
        Name the thread "<ClassName>-<worker_id>", create a logger with the
        same name and keep the two queues.

        @type worker_id: int
        @type task_queue: Queue.Queue
        @type completed_queue: Queue.Queue
        """
        self._name = '%s-%d' % (self.__class__.__name__, worker_id)

        threading.Thread.__init__(self, name=self._name)

        self._continue = True
        self._id = worker_id
        self._task_queue = task_queue
        self._completed_queue = completed_queue
        self._logger = Logger(self._name)
Пример #11
0
class BaseModel(db.Model):
    """Abstract base model adding best-effort save() and delete() helpers."""

    __abstract__ = True
    logger = Logger.create_logger(__name__)

    def save(self):
        """Save the item to the database.

        A failure is logged rather than raised. The session is rolled back so
        it stays usable for later operations.
        """
        try:
            db.session.add(self)
            db.session.commit()
        except Exception as e:
            self.logger.error(f"Error while saving to database: {e}")
            # Without a rollback the session remains in a failed state and
            # later operations on it would fail as well.
            db.session.rollback()

    def delete(self):
        """Delete the item from the database.

        A failure is logged rather than raised. The session is rolled back so
        it stays usable for later operations.
        """
        try:
            db.session.delete(self)
            db.session.commit()
        except Exception as e:
            self.logger.error(f"Error while deleting from database: {e}")
            # See save(): discard the failed transaction.
            db.session.rollback()
Пример #12
0
 def __init__(self):
     """Run the base Algorithm initializer, then create the logger and score mapper."""
     Algorithm.__init__(self)
     self._logger = Logger(self.__class__.__name__)
     self._score_mapper = ScoreMapper()
Пример #13
0
from application import app
from application.logger import Logger

Logger().init()

# API modules are imported after the logger has been initialised.
import application.api.status_api
import application.api.example_api

# Not executed when the application is started through the CLI.
if __name__ == '__main__':
    settings = app.config
    app_name = settings['APP_NAME']
    host = settings['HTTP_HOST']
    port = settings['HTTP_PORT']
    debug = settings['DEBUG']

    app.logger.info(f'start application: [{app_name}] @ {host}:{port} in DEBUG={debug}')
    app.run(host=host, port=port, debug=debug)
Пример #14
0
import requests
from requests.auth import HTTPBasicAuth

from application.logger import Logger
from application.config import Config
logger = Logger().get_logger()


class Requester:
    def __init__(self):
        """Create the requester with its own Config instance."""
        self.config = Config()

    def get_url(self,
                url,
                jsonify,
                headers={'content': 'application/json'},
                verify=True,
                params=None):
        # make GET request
        try:
            response = requests.get(url,
                                    headers=headers,
                                    verify=verify,
                                    params=params)
            response.raise_for_status()
        except requests.exceptions.HTTPError as e:
            logger.exception(f'Requester.get_url Exception: {e}')
            raise e

        # jsonify response
        if jsonify:
Пример #15
0
def sync_district_data():
    """Fetch latest data from IEDCR reports.

    Does nothing when another district sync is flagged as in progress. Any
    error raised while syncing is logged and swallowed, so the function
    always returns None.
    """
    # For some unknown reason, Logger.create_logger(__name__), where
    # __name__ == "application.tasks", doesn't bind the handler. After some
    # debugging, it turned out that any name prefixed with "application."
    # doesn't work here: Logger.create_logger() assumes a handler is already
    # bound, although it's not.
    #
    # Elsewhere this causes no problem. For example, when the logger is
    # created inside the DataProvider module, the name
    # "application.provider.*" works fine.
    #
    # This is a weird issue to be looked into later. For now the logger is
    # named "tasks". It is created outside the try block so the error handler
    # below can always rely on `logger` being bound.
    logger = Logger.create_logger("tasks")

    # True only once this run has claimed the "syncing" flag. A failure that
    # happens earlier must not clear a flag that belongs to another run.
    flag_claimed = False
    try:
        logger.info("Starting sync of district data")
        if Meta.is_district_syncing():
            logger.info("A district sync is already in progress")
            return

        # set updating state to true
        flag_claimed = True
        Meta.set_district_syncing(True)

        # download and get updated data; each entry is indexed as
        # (district name, count)
        provider = DataProvider()
        new_data = provider.sync_district_data()
        last_updated = Meta.get_last_district_sync()

        # flag to monitor if fetched data has changed
        has_updated = False

        # difference between the current time and the last updated time
        update_delta = datetime.utcnow() - last_updated

        # check the data against database records and update as necessary
        for pair in new_data:
            # ignore blank data
            if pair[0] == "" or pair[1] == "":
                continue

            district = District.find_by_name(pair[0])
            if district:
                if district.count != pair[1]:
                    # count changed from last record
                    # - save previous count
                    # - update new count
                    district.prev_count = district.count
                    district.count = pair[1]
                    has_updated = True
                elif update_delta.days >= 1:
                    # count did not change
                    # - make count and prev_count same only if the last
                    #   change was at least 1 day ago
                    district.prev_count = district.count

                district.save()
            else:
                new_district = District(pair[0], pair[1])
                new_district.save()
                has_updated = True

        logger.debug(f"Has updated = {has_updated}")
        if has_updated:
            # set last updated time to now if more than 24hrs
            # the 24hrs constant window helps to better calculate new_count - prev_count
            if update_delta.days >= 1:
                Meta.set_last_district_sync()
                logger.info("Updated last sync time")
            logger.info("District sync complete (fetched new data)")
        else:
            logger.info("District sync complete (already up-to-date)")
    except Exception as e:
        logger.error(f"District sync failed with error: {e}")
    finally:
        # set updating state to False, whether the sync finished or failed
        if flag_claimed:
            Meta.set_district_syncing(False)
Пример #16
0
 def __init__(self):
     """Create the HTTP requester, the logger and the configuration used by the controller."""
     self.http_requests = Requester()
     self.logger = Logger().get_logger()
     self.config = Config()
Пример #17
0
class DataProvider:
    """Download and parse the statistics page and the district report."""

    logger = Logger.create_logger(__name__)

    # Seconds to wait for each HTTP request, so that a stalled server cannot
    # block a sync indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        self.stats_data_source = "https://corona.gov.bd/lang/en"
        # Taken from the REPORT_URL environment variable; None when unset.
        self.district_report_url = os.environ.get("REPORT_URL")
        self.trans_table = str.maketrans("০১২৩৪৫৬৭৮৯", "0123456789")

    def get_stats(self):
        """Fetch the latest statistics like total positive cases, deaths etc.

        Returns a dict of eight integer counters. Raises IndexError when the
        page has fewer than eight counters and ValueError when one of them is
        not a number.
        """
        page = requests.get(self.stats_data_source, timeout=self.REQUEST_TIMEOUT)
        soup = bs(page.content, "html.parser")
        elements = soup.select(".live-update-box-wrap-h1>b")

        # int() accepts any Unicode decimal digits, so Bangla numerals are
        # parsed without an explicit translation step.
        counts = [int(element.text) for element in elements]

        data_dict = {
            "positive_24": counts[0],
            "positive_total": counts[1],
            "death_24": counts[2],
            "death_total": counts[3],
            "recovered_24": counts[4],
            "recovered_total": counts[5],
            "test_24": counts[6],
            "test_total": counts[7],
        }

        self.logger.debug(data_dict)
        return data_dict

    def parse_district_data(self):
        """Parse the Google Sheets to get district data.

        Returns one list of sanitized cell values per data row of the first
        table on the report page.
        """
        page = requests.get(self.district_report_url, timeout=self.REQUEST_TIMEOUT)
        soup = bs(page.content, "html.parser")
        table = soup.find("table")
        rows = table.find_all("tr")
        result = []

        last_index = len(rows) - 1
        for rindex, row in enumerate(rows):
            # ignore first two rows and last row because they are headers/totals
            if rindex < 2 or rindex == last_index:
                continue

            # cells carrying a rowspan hold division names and are skipped
            data = [
                self.sanitize(col.text)
                for col in row.find_all("td")
                if not col.has_attr("rowspan")
            ]
            result.append(data)

        return result

    def sanitize(self, s):
        """Sanitize a cell string.

        - replaces invalid chars with correct ones
        - converts to int when the string consists only of decimal digits
        """
        mapping = {"’": "'"}
        for key, val in mapping.items():
            s = s.replace(key, val)

        # isdecimal() rather than isdigit(): isdigit() is also true for
        # characters such as "²", which int() cannot convert.
        if s.isdecimal():
            return int(s)
        return s

    def sync_district_data(self):
        """Return the freshly parsed district data (see parse_district_data)."""
        return self.parse_district_data()