def dump_index(event: dict, context,
               index_group_factory: IndexGroupFactory = IndexGroupFactory,
               downloader_factory: DownloaderFactory = DownloaderFactory,
               scraper_factory: ScraperFactory = ScraperFactory):
    data = decode_pubsub_data(event)
    source = data["source"]
    index = data["index"]
    date = date_or_now(data)
    scrap_stocks = should_scrap_stocks(data)

    logging.info("Dump index for '{}' from '{}' on {}".format(index, source, date))

    index_group = index_group_factory.createFor(source, index)
    index_storage = IndexStorage(DUMP_FOLDER, index_group, date=date,
                                 storage_repository=S3Repository(BUCKET_NAME))

    downloader = downloader_factory.create(source)
    downloader.dump_index(index_group, index_storage)

    scraper = scraper_factory.create(source)
    scraper.read_stocks(index_group, index_storage)
    scraper.scrap_index(index_group, index_storage)

    if scrap_stocks:
        send_scrap_messages(index_group, date)
    else:
        logging.info("don't scrap stocks because of pub/sub message")

    index_storage.compress()
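# Illustration only (not part of the deployed function): building such an
# event locally. The "source" and "index" keys are read directly above; the
# "date" and "scrap_stocks" keys are assumptions about what date_or_now() and
# should_scrap_stocks() look for, not confirmed by this file.
#
#   import base64, json
#   payload = {"source": "onvista", "index": "DAX", "scrap_stocks": False}
#   event = {"data": base64.b64encode(json.dumps(payload).encode("utf-8"))}
#   dump_index(event, context=None)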
def read_stocks(indexGroup, index_storage: IndexStorage):
    pages = index_storage.storage_repository.list(
        index_storage.getStoragePath("list", ""))

    for page in pages:
        content = index_storage.storage_repository.load(
            index_storage.getDatedPath() + page)

        if content:
            soup = BeautifulSoup(content, 'html.parser')

            article = soup.find("article", {"class": "top-flop-box"})
            table = article.find("table")

            for row in table.findAll("tr"):
                columns = row.findAll("td")

                if len(columns) == 0:
                    continue

                firstCol = columns[0]
                link = firstCol.find("a")

                if link is not None and link.get("href") and link.get("href").startswith("/"):
                    matches = re.search(r'\/aktien\/(.*)-Aktie-(.*)', link.get("href"))
                    if matches is None:
                        # skip hrefs that don't follow the ".../<name>-Aktie-<id>" pattern
                        continue
                    name = matches.group(1)
                    stock_id = matches.group(2)
                    field = firstCol.find("span").get_text().strip()

                    indexGroup.add_stock(stock_id, name, field)
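# Example of what the regex above extracts from a matching href (hypothetical
# link, using Adidas as an illustration):
#   href="/aktien/Adidas-Aktie-DE000A1EWWW0"
#   group(1) -> "Adidas"        (name)
#   group(2) -> "DE000A1EWWW0"  (stock_id)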
def test_base_path_of_index(self):
    # given:
    index_group = IndexGroup("isin", "index_name", "source_id", "source")
    date = datetime.strptime("2018-01-01", "%Y-%m-%d")

    # when:
    index_storage = IndexStorage("/tests/dump", index_group, date, get_history=False)
    base_path = index_storage.getDatedPath()

    # then:
    self.assertEqual("/tests/dump/index_name/2018-01-01/", base_path)
def test_storage_path_of_index(self):
    # given:
    index_group = IndexGroup("isin", "index_name", "source_id", "source")
    date = datetime.strptime("2018-01-01", "%Y-%m-%d")

    # when:
    index_storage = IndexStorage("/tests/dump", index_group, date, get_history=False)
    storage_path = index_storage.getStoragePath("profile", "html")

    # then:
    self.assertEqual(
        "/tests/dump/index_name/2018-01-01/index_name.source.profile.html",
        storage_path)
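# Taken together, the two tests above pin the storage layout used by the dump
# code:
#   getDatedPath()            -> {base}/{index_name}/{YYYY-MM-DD}/
#   getStoragePath(part, sfx) -> {base}/{index_name}/{YYYY-MM-DD}/{index_name}.{source}.{part}.{sfx}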
def dump_stock(event: dict, context):
    data = decode_pubsub_data(event)
    logging.info("data: {}".format(data))

    source = data["source"]
    index_group = new_index_group(data["index_group"])
    stock = new_stock(data["stock"], index_group)
    date = date_or_now(data)

    logging.info("source: {}".format(source))
    logging.info("index_group: {}".format(index_group))
    logging.info("stock: {}".format(stock))
    logging.info("date: {}".format(date))

    index_storage = IndexStorage(DUMP_FOLDER, index_group, date=date,
                                 storage_repository=S3Repository(BUCKET_NAME))
    stock_storage = StockStorage(index_storage, stock,
                                 storage_repository=S3Repository(BUCKET_NAME))

    downloader = DownloaderFactory.create(source)
    downloader.dump_stock(stock, stock_storage)

    scraper = ScraperFactory.create(source)
    scraper.scrap(stock, stock_storage)

    stock_storage.store()
    stock_storage.compress()
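# A minimal sketch of the decode_pubsub_data() helper used by both functions
# above, assuming the standard background-function encoding (base64-encoded
# JSON in event["data"]); the repository's actual implementation may differ.
#
#   def decode_pubsub_data(event: dict) -> dict:
#       import base64, json
#       return json.loads(base64.b64decode(event["data"]).decode("utf-8"))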
def get_allianz_stock_storage(get_history=False, date=None):
    # resolve "now" at call time; a datetime.now() default argument would be
    # evaluated only once, when the module is imported
    date = date or datetime.now()
    indexGroup = IndexGroup("DE0008469008", "DAX", "DAX", "onvista")
    index_storage = IndexStorage("resources", indexGroup, date=date,
                                 get_history=get_history)
    stock = Stock("DE0008404005", "Allianz", indexGroup)
    return StockStorage(index_storage, stock)
def get_vw_stock_storage(get_history=False, date=None):
    # see get_allianz_stock_storage: "now" is resolved at call time
    date = date or datetime.now()
    indexGroup = IndexGroup("DE0008469008", "DAX", "DAX", "onvista")
    index_storage = IndexStorage("resources", indexGroup, date=date,
                                 get_history=get_history)
    stock = Stock("DE0007664039", "Volkswagen-VZ", indexGroup)
    return StockStorage(index_storage, stock)
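# Example: pin a fixture to a fixed date so resource lookups in tests are
# reproducible (illustration only):
#   storage = get_allianz_stock_storage(date=datetime.strptime("2018-01-01", "%Y-%m-%d"))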
def write_index_report(index_group: IndexGroup, index_storage: IndexStorage,
                       rating_entities: list):
    template = Template(filename="libs/templates/index-rating-overview.html")

    report = template.render(index_group=index_group,
                             rating_entities=rating_entities,
                             source=index_storage.source,
                             report_date=index_storage.date_str)

    index_storage.storage_repository.store(
        index_storage.getStoragePath("", "html"), report)
def dump_index(index_group: IndexGroup, index_storage: IndexStorage):
    main_file = index_storage.getStoragePath("profil", "html")
    storage_repository = index_storage.storage_repository

    dl.download(WEBSITE + "/index/" + index_group.isin, main_file, storage_repository)

    notation = get_notation(main_file, storage_repository)
    download_history_by_notation(notation, index_storage)

    # get_links maps link captions on the profile page (e.g. "Einzelwerte",
    # the list of constituents) to their URLs
    links = get_links(main_file, storage_repository)
    download_stock_list(links["Einzelwerte"], index_storage)
def read_stocks(indexGroup, index_storage: IndexStorage):
    with open(index_storage.getStoragePath("list", "html"), mode="r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, 'html.parser')

        article = soup.find("div", {"id": "index-list-container"})
        table = article.find("table")

        for row in table.findAll("tr"):
            columns = row.findAll("td")

            if len(columns) == 0:
                continue

            firstCol = columns[0]
            link = firstCol.find("a")

            if link is not None and link.get("href") and link.get("href").startswith("/"):
                matches = re.search(r'\/aktien\/(.*)-aktie', link.get("href"))
                if matches is None:
                    # skip hrefs that don't follow the ".../<name>-aktie" pattern
                    continue
                name = matches.group(1)
                stock_id = firstCol.get_text().strip().split("\n")[1]

                indexGroup.add_stock(stock_id, name)
def dump_index(indexGroup: IndexGroup, indexStorage: IndexStorage):
    main_file = indexStorage.getStoragePath("profil", "html")
    storage_repository = indexStorage.storage_repository

    dl.download(f"{WEBSITE}/index/{indexGroup.sourceId}", main_file, storage_repository)

    dl.download(f"{WEBSITE}/index/{indexGroup.sourceId}/werte",
                indexStorage.getStoragePath("list", "html"), storage_repository)
# indexGroup = IndexGroupFactory.createFor(SOURCE, "NASDAQ")
# indexGroup = IndexGroupFactory.createFor(SOURCE, "S&P 500")
# indexGroup = IndexGroupFactory.createFor(SOURCE, "Nikkei")
# indexGroup = IndexGroupFactory.createFor(SOURCE, "Hang-Seng")
# indexGroup = IndexGroupFactory.createFor(SOURCE, "S&P-TSX-Composite")
# indexGroup = IndexGroupFactory.createFor(SOURCE, "AEX")
# indexGroup = IndexGroupFactory.createFor(SOURCE, "OBX")
# indexGroup = IndexGroupFactory.createFor(SOURCE, "PTX")
# indexGroup = IndexGroupFactory.createFor(SOURCE, "RTS")
# indexGroup = IndexGroupFactory.createFor(SOURCE, "OMXS-30")
# indexGroup = IndexGroupFactory.createFor(SOURCE, "IBEX-35")
# indexGroup = IndexGroupFactory.createFor(SOURCE, "SOLACTIVE-ORGANIC-FOOD")

# date = datetime.strptime("22.11.2020", "%d.%m.%Y")
date = datetime.now()

index_storage = IndexStorage("dump", indexGroup, date=date)
index_storage.uncompress()

downloader = DownloaderFactory.create(SOURCE)
downloader.dump_index(indexGroup, index_storage)

scraper = ScraperFactory.create(SOURCE)
scraper.read_stocks(indexGroup, index_storage)
scraper.scrap_index(indexGroup, index_storage)

index_storage.compress()


def thread_body(queue: Queue):
    while True:
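# The fragment above is cut off mid-definition. A typical body for such a
# worker, assuming it drains per-stock jobs from the queue, might look like
# this (hypothetical sketch, not the repository's actual code):
#       stock = queue.get()
#       if stock is None:  # sentinel value signals shutdown
#           break
#       ...process the stock...
#       queue.task_done()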