def scrap_links(self, link_config_map):
    """
    Each RSS feed has a set of links to be scraped. This function receives
    a generator yielding those links together with the URL of the source,
    which is used to look up the scraper config; it then calls the base
    scraper for each link and dumps the result to the database.

    :param link_config_map: Generator yielding (source_url, link, link_id)
        tuples, where source_url is used to look up the scraper config and
        link is a dict carrying the URL to scrape under the 'link' key.
    :return: None.
    """
    driver = ChromeDriverManager()
    config = ConfigurationManager.get_config()
    mysql_manager = MySQLManager(host=config.get_aws_rds_host(),
                                 port=config.get_aws_rds_port(),
                                 username=config.get_aws_rds_username(),
                                 pwd=config.get_aws_rds_password(),
                                 db_name=config.get_aws_rds_db_name())
    Session = mysql_manager.get_session()
    session = Session()
    for source_url, link, link_id in link_config_map:
        try:
            scrapper_config = self._source_id_scrapper_config_map[source_url]
            wait_driver = driver.get_waiter_driver(link['link'])
            scraper = Scapper(uri=link,
                              config=scrapper_config,
                              wait_driver=wait_driver,
                              link_id=link_id)
            dump_to_scraperdb(data=scraper.get_data(), db_session=session)
        except Exception as e:
            # Surface the failure instead of silently swallowing it.
            print(f"Failed to scrape RSS link or dump it to the database: {e}")
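# A minimal sketch (not part of the pipeline above) of how a
# link_config_map generator could be produced. The (source_url, link,
# link_id) tuple shape and the link['link'] key are assumptions read off
# the way scrap_links consumes the generator; build_link_config_map and
# its input rows are hypothetical.
def build_link_config_map(rows):
    """Yield (source_url, link, link_id) tuples for scrap_links."""
    for source_url, link_url, link_id in rows:
        # scrap_links expects the second element to be a dict with a
        # 'link' key holding the URL to scrape.
        yield source_url, {'link': link_url}, link_id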
def main():
    from pprint import pprint
    from selenium import webdriver
    from selenium.webdriver.support.ui import WebDriverWait
    # from scraper.models.scraper import dump_to_scraperdb

    option = webdriver.ChromeOptions()
    option.add_argument('--headless')

    # Dummy config object - will be replaced later with Flask configs.
    # Better to pass a single driver instance in than to load a new
    # driver on every call.
    class current_config:
        driver = webdriver.Chrome(
            options=option,
            executable_path="chromedriver_linux64/chromedriver")

    uri = "https://www.indiatoday.in/world/story/pakistan-plane-crash-watch-moment-when-pia-flight-crashed-in-karachi-1680875-2020-05-22?utm_source=rss"
    driver = current_config.driver
    driver.get(uri)
    wait_driver = WebDriverWait(driver, 10)

    os.environ[ConfigurationConstants.CONFIGURATION_FILE_ENV_VAR] = "config.ini"
    config: 'Configuration' = ConfigurationManager.get_config()
    config = config.get_scraper_configs().get("SCRAPER_INDIA_TODAY", None)

    scrape = Scapper(uri=uri, config=config, wait_driver=wait_driver, link_id=1)
    data = scrape.get_data()
    pprint(data)
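# Hedged sketch: WebDriverWait is normally paired with an expected
# condition rather than used bare. Below is one way the wait driver built
# in main() could block until content is present; the <h1> locator is an
# assumption for illustration, not the project's actual selector.
def wait_for_headline(wait_driver):
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    # Waits up to the timeout configured above (10 s) for an <h1> element.
    return wait_driver.until(
        EC.presence_of_element_located((By.TAG_NAME, 'h1')))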
def load_config(self, scraper_name: str, filepath: str = "config.ini") -> None:
    """Point the configuration manager at filepath and load the scraper
    config registered under scraper_name (None if it is not registered)."""
    os.environ[ConfigurationConstants.CONFIGURATION_FILE_ENV_VAR] = filepath
    config: 'Configuration' = ConfigurationManager.get_config()
    self._config = config.get_scraper_configs().get(scraper_name, None)
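# Usage sketch: the scraper name mirrors the "SCRAPER_INDIA_TODAY" key used
# in main() above; the `scraper` instance itself is hypothetical.
#
#   scraper.load_config("SCRAPER_INDIA_TODAY", filepath="config.ini")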
def __init__(self, **kwargs):
    super().__init__(**kwargs)
    self._output_file = None
    self._last_time_file = None
    # Validate subclass-supplied parameters before loading configuration.
    self.check_child_class_parameters()
    self._config = ConfigurationManager.get_config()
def get_params():
    parser = argparse.ArgumentParser()
    parser.add_argument('--config', dest='config', required=True, type=str)
    parser.add_argument('--url', dest='url', required=True, type=str)
    arg_params = parser.parse_args()
    url, config = arg_params.url, arg_params.config
    # Point the configuration environment at the file passed on the CLI,
    # then load the parsed Configuration object.
    sce.set_conf_env_file(config)
    config = ConfigurationManager.get_config()
    return url, config
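# Invocation sketch (the module path is hypothetical):
#
#   python -m scraper.run --config config.ini --url "https://example.com/feed"
#
# get_params() then returns the URL string plus the loaded Configuration.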
def dump_to_scraperdb(data, db_session):
    assert data["headline"] is not None
    record = Scraper(rss_feed_id=data["rss_feed_id"],
                     uri=data["uri"]["link"],
                     headline=data["headline"],
                     story_kicker=data["story_kicker"],
                     article=data["article"])
    try:
        db_session.add(record)
        db_session.flush()
        db_session.commit()
    except Exception:
        # Roll back on any failure so the session is left in a usable state.
        db_session.rollback()
    finally:
        db_session.close()
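# Usage sketch, reusing the MySQLManager session factory from scrap_links
# above; the data values are made up, but the keys mirror exactly what
# dump_to_scraperdb reads.
def example_dump(mysql_manager):
    Session = mysql_manager.get_session()
    session = Session()
    dump_to_scraperdb(
        data={
            "rss_feed_id": 1,
            "uri": {"link": "https://example.com/story"},
            "headline": "Example headline",
            "story_kicker": "Example kicker",
            "article": "Full article text.",
        },
        db_session=session)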
import datetime

from sqlalchemy import (BigInteger, Column, DateTime, DECIMAL, ForeignKey,
                        String, VARCHAR)
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.sql.functions import current_timestamp
import sqlalchemy.dialects.mysql as mysql_t

from digital_reporter import db
from digital_reporter.modules.db.managers.MySQL import MySQLManager
from digital_reporter.modules.utilities.configurations.configuration_manager import ConfigurationManager

config = ConfigurationManager.get_config()
Base = declarative_base()


class RSSSource(db.Model):
    __tablename__ = 'rss_source'

    id = db.Column(BigInteger, primary_key=True, autoincrement=True)
    scrapper_config_id = db.Column(
        BigInteger,
        ForeignKey('scrapper_config.id', ondelete='CASCADE', onupdate='CASCADE'),
        nullable=True)
    url = db.Column(VARCHAR(3000), unique=True)
    created_at = db.Column(mysql_t.TIMESTAMP(fsp=6),
                           nullable=False,
                           server_default=current_timestamp(6))


class RSSFeed(db.Model):
    __tablename__ = 'rss_feed'

    id = db.Column(BigInteger, primary_key=True, autoincrement=True)
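# Hedged sketch: with the Flask-SQLAlchemy `db` imported above, the models
# can be created and populated once an application context exists; the URL
# below is a placeholder, not a real feed.
#
#   db.create_all()
#   db.session.add(RSSSource(url="https://example.com/rss"))
#   db.session.commit()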