Example #1
    def scrap_links(self, link_config_map):
        """
        Each rss feed has set of links to be scrapped. This functions is passed a generator object with
        those links and the respective url to get scrapper config. Call the base scrapper and wait for
        the magic.
        :param link_config_map: Generator object, containing the link to be scrapped and the url of source
        to get the scrapper config.
        :return: Give me some time to decide. Update if you see anything getting returned and not mentioned
        here.
        """
        driver = ChromeDriverManager()
        config = ConfigurationManager.get_config()
        mysql_manager = MySQLManager(host=config.get_aws_rds_host(),
                                     port=config.get_aws_rds_port(),
                                     username=config.get_aws_rds_username(),
                                     pwd=config.get_aws_rds_password(),
                                     db_name=config.get_aws_rds_db_name())

        # get_session() presumably returns a SQLAlchemy sessionmaker; one session is
        # instantiated here for the dump calls that follow (the manager code is not shown).
        S = mysql_manager.get_session()
        s = S()

        for source_url, link, link_id in link_config_map:
            try:
                scrapper_config = self._source_id_scrapper_config_map[
                    source_url]
                wait_driver = driver.get_waiter_driver(link['link'])
                scraper = Scapper(uri=link,
                                  config=scrapper_config,
                                  wait_driver=wait_driver,
                                  link_id=link_id)
                dump_to_scraperdb(data=scraper.get_data(), db_session=s)
            except Exception as e:
                print(
                    "Something went wrong while scraping RSS links or dumping them "
                    f"to the database: {e}"
                )
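
For reference, a minimal sketch of the generator shape that scrap_links() iterates over as (source_url, link, link_id) tuples. The feed attribute names below are placeholders for illustration; the real producer of this generator is not shown in these snippets, and each link is assumed to be a dict with at least a 'link' key, as used above.

def build_link_config_map(feeds):
    # Yield tuples in the order scrap_links() unpacks them.
    for feed in feeds:
        for link in feed.links:
            yield feed.source_url, link, link.get('id')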
Example #2
def main():
    import os
    from pprint import pprint
    from selenium import webdriver
    from selenium.webdriver.support.ui import WebDriverWait
    # ConfigurationConstants, ConfigurationManager and Scapper are assumed to be
    # imported from the project package; their import paths are omitted in this snippet.
    # from scraper.models.scraper import dump_to_scraperdb

    # dummy config object - will be replaced later with flask configs
    option = webdriver.ChromeOptions()
    option.add_argument('--headless')

    # part of dummy

    class current_config:
        driver = webdriver.Chrome(
            options=option,
            executable_path="chromedriver_linux64/chromedriver")
        # driver = webdriver.Chrome(
        #     executable_path="chromedriver_linux64/chromedriver")

    # Better to pass a single driver instance in than to load a new driver on every call
    uri = "https://www.indiatoday.in/world/story/pakistan-plane-crash-watch-moment-when-pia-flight-crashed-in-karachi-1680875-2020-05-22?utm_source=rss"
    driver = current_config.driver
    driver.get(uri)
    wait_driver = WebDriverWait(driver, 10)

    os.environ[
        ConfigurationConstants.CONFIGURATION_FILE_ENV_VAR] = "config.ini"
    config: 'Configuration' = ConfigurationManager.get_config()
    config = config.get_scraper_configs().get("SCRAPER_INDIA_TODAY", None)
    scrape = Scapper(uri=uri,
                     config=config,
                     wait_driver=wait_driver,
                     link_id=1)
    data = scrape.get_data()
    pprint(data)
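
A side note on the driver construction above: on Selenium 4+ the executable_path keyword is deprecated in favour of a Service object. A minimal, self-contained sketch of the equivalent construction, assuming the same local chromedriver path:

def build_driver_selenium4():
    from selenium import webdriver
    from selenium.webdriver.chrome.service import Service
    option = webdriver.ChromeOptions()
    option.add_argument('--headless')
    # Same chromedriver location as in the dummy config above.
    return webdriver.Chrome(
        service=Service("chromedriver_linux64/chromedriver"),
        options=option)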
Example #3
    def load_config(self,
                    scraper_name: str,
                    filepath: str = "config.ini") -> None:
        os.environ[
            ConfigurationConstants.CONFIGURATION_FILE_ENV_VAR] = filepath
        config: 'Configuration' = ConfigurationManager.get_config()
        self._config = config.get_scraper_configs().get(scraper_name, None)
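
A hypothetical call to load_config(), reusing the scraper name and config path that appear in Example #2; the owning scraper instance (here called scraper) is assumed to expose this method:

scraper.load_config(scraper_name="SCRAPER_INDIA_TODAY", filepath="config.ini")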
Example #4
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        self._output_file = None
        self._last_time_file = None

        # Presumably validates that the concrete subclass supplied the parameters it
        # requires (an assumption; the method body is not shown in this snippet).
        self.check_child_class_parameters()
        self._config = ConfigurationManager.get_config()
Example #5
import argparse


def get_params():
    parser = argparse.ArgumentParser()
    parser.add_argument('--config', dest='config', required=True, type=str)
    parser.add_argument('--url', dest='url', required=True, type=str)

    arg_params = parser.parse_args()
    url, config = arg_params.url, arg_params.config
    # `sce` is assumed to be the project's configuration-environment helper module,
    # imported elsewhere in the original file.
    sce.set_conf_env_file(config)
    config = ConfigurationManager.get_config()
    return url, config
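
An illustrative invocation of the script that calls get_params(); the script name run.py and the feed URL are placeholders:

# python run.py --config config.ini --url "https://example.com/rss"
url, config = get_params()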
Example #6
def dump_to_scraperdb(data, db_session):

    config: "Configuration" = ConfigurationManager.get_config()

    assert data["headline"] is not None
    f = Scraper(rss_feed_id=data["rss_feed_id"],
                uri=data["uri"]["link"],
                headline=data["headline"],
                story_kicker=data["story_kicker"],
                article=data["article"])
    try:
        db_session.add(f)
        db_session.flush()
        db_session.commit()
    except Exception:
        # Roll back on any failure so the session is left in a usable state.
        db_session.rollback()
    finally:
        db_session.close()
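
A sketch of the dict shape that dump_to_scraperdb() expects, inferred from the keys it reads above; the values are placeholders:

sample_data = {
    "rss_feed_id": 1,
    "uri": {"link": "https://example.com/story"},
    "headline": "Example headline",
    "story_kicker": "Example kicker",
    "article": "Full article text...",
}
# dump_to_scraperdb(data=sample_data, db_session=session)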
Example #7
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, String, BigInteger, DateTime, ForeignKey, VARCHAR, DECIMAL
import datetime
from digital_reporter.modules.utilities.configurations.configuration_manager import ConfigurationManager
from digital_reporter.modules.db.managers.MySQL import MySQLManager
import sqlalchemy.dialects.mysql as mysql_t
from sqlalchemy.sql.functions import current_timestamp
from digital_reporter import db

config = ConfigurationManager.get_config()

Base = declarative_base()


class RSSSource(db.Model):
    __tablename__ = 'rss_source'
    id = db.Column(BigInteger, primary_key=True, autoincrement=True)
    scrapper_config_id = db.Column(BigInteger,
                                   ForeignKey('scrapper_config.id',
                                              ondelete='CASCADE',
                                              onupdate='CASCADE'),
                                   nullable=True)
    url = db.Column(VARCHAR(3000), unique=True)
    created_at = db.Column(mysql_t.TIMESTAMP(fsp=6),
                           nullable=False,
                           server_default=current_timestamp(6))


class RSSFeed(db.Model):
    __tablename__ = 'rss_feed'
    id = db.Column(BigInteger, primary_key=True, autoincrement=True)