def _check_url_presence(self, urls):
    # Fetch the subset of the given URLs that already exist in the Scrapes
    # table. The IN clause is built by string interpolation, so the URLs are
    # assumed to come from trusted pipeline code, not user input.
    scrapes_table = read_sql_df(
        columns=['url'],
        table='BaseScrapes',
        query="SELECT Url FROM Scrapes WHERE Url IN ({})".format(
            ','.join("'{}'".format(url) for url in urls)))
    # True only when the query returned rows and every requested URL was found.
    return not scrapes_table.empty and set(urls).issubset(set(scrapes_table['url']))

def sql(self):
    # Yield one rendered SQL statement per tracked coin symbol.
    df = read_sql_df(['coin_id', 'previous_rank', 'rank', 'symbol'])
    for coin in df['symbol'].values:
        yield self.sql_template.format(
            coin=coin,
            daily_trends_table=_daily_trends_table,
            values_table=_values_table)
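
# Hedged illustration of how sql() renders its template. The template text and
# the _demo_sql_render helper are hypothetical; the real self.sql_template is
# defined on the task class, and only the {coin}, {daily_trends_table} and
# {values_table} placeholders are taken from the call above.
def _demo_sql_render():
    sql_template = ("INSERT INTO {values_table} (Symbol, Date, Value) "
                    "SELECT Symbol, Date, Value FROM {daily_trends_table} "
                    "WHERE Symbol = '{coin}'")  # hypothetical template
    return sql_template.format(coin='BTC',
                               daily_trends_table='DailyTrends',
                               values_table='CoinValues')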

def transform(self, df):
    # Attach each coin's coin_id by joining the scraped frame to the coins
    # table on symbol; rows without a matching symbol are dropped (inner join).
    coins = read_sql_df(['coin_id', 'rank', 'previous_rank', 'symbol', 'name'],
                        table=_coins_table)
    complete_dataset = df.merge(coins[['symbol', 'coin_id']],
                                on='symbol', how='inner')
    # Discard index artifacts and columns made redundant by the merge;
    # errors='ignore' guards against artifact columns that are not always
    # present in the incoming frame.
    complete_dataset = complete_dataset.drop(
        ['level_0', 'Unnamed: 0', 'index', 'name', 'id'],
        axis=1, errors='ignore')
    # Stamp every row with the run date (YYYY-MM-DD).
    complete_dataset['Date'] = '{:%Y-%m-%d}'.format(self.date_hour.date())
    return complete_dataset
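
# Hedged, self-contained illustration of the transform above on in-memory
# frames; the values are made up and read_sql_df is replaced with a literal
# coins frame. _demo_transform is a hypothetical helper, not part of the task.
def _demo_transform():
    import pandas as pd
    scraped = pd.DataFrame({'symbol': ['BTC', 'ETH', 'XYZ'],
                            'Unnamed: 0': [0, 1, 2],
                            'price': [67000.0, 3500.0, 0.01]})
    coins = pd.DataFrame({'symbol': ['BTC', 'ETH'], 'coin_id': [1, 1027]})
    out = scraped.merge(coins[['symbol', 'coin_id']], on='symbol', how='inner')
    out = out.drop(['level_0', 'Unnamed: 0', 'index', 'name', 'id'],
                   axis=1, errors='ignore')
    out['Date'] = '2024-01-01'
    return out  # XYZ is dropped by the inner join; coin_id is attached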

def _get_not_scraped(self, urls):
    # Return the URLs that have no row in the Scrapes table yet. The set of
    # scraped URLs is built once rather than once per URL.
    cols = get_table_columns('Scrapes')
    scraped = read_sql_df(columns=cols, table='Scrapes')
    scraped_urls = set(scraped['url'])
    return [u for u in urls if u not in scraped_urls]

def run(self):
    # Fetch this source's single BaseScrapes row; the {table} placeholder
    # (escaped here as {{table}}) is left for read_sql_df to substitute.
    base_scrape = read_sql_df(
        columns=get_table_columns('BaseScrapes'),
        table='BaseScrapes',
        query="SELECT * FROM {{table}} WHERE ScrapeSource='{source}'".format(
            source=self.source))
    # Exactly one matching row is expected.
    source, url, subsections = base_scrape.values[0]
    # A source without subsections is scraped in a single top-level pass;
    # otherwise every subsection under the base URL is scraped.
    if subsections is None:
        df = self.scrape_top_level(url)
    else:
        df = self._scrape_multiple(url, subsections)
    s3_write(df, 'parquet', self.output().path)

def run(self):
    # Generic export: read the configured table and write it to object storage
    # in the configured output format.
    df = read_sql_df(columns=self.columns, table=self.table)
    s3_write(df, self.output_format, self.output_path)
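
# Hedged sketch of the helper API these tasks rely on. read_sql_df,
# get_table_columns and s3_write are defined elsewhere in the repo; the
# signatures below are inferred from the call sites (columns/table/query
# keywords, a {table} placeholder in explicit queries, and a
# (df, format, path) writer). The SQLite engine and default table name are
# stand-ins, not the project's real configuration.
import pandas as pd
from sqlalchemy import create_engine, inspect

_engine = create_engine('sqlite:///example.db')  # assumed connection
_DEFAULT_TABLE = 'Coins'  # assumed default; sql() above omits the table arg

def read_sql_df(columns=None, table=None, query=None):
    # With an explicit query, substitute the {table} placeholder; otherwise
    # select the requested columns (or everything) from the table.
    table = table or _DEFAULT_TABLE
    if query is None:
        query = 'SELECT {cols} FROM {table}'.format(
            cols=', '.join(columns) if columns else '*', table=table)
    else:
        query = query.format(table=table)
    df = pd.read_sql(query, _engine)
    if columns:
        df.columns = columns  # normalise names, e.g. 'Url' to 'url'
    return df

def get_table_columns(table):
    # Column names as recorded in the database schema.
    return [col['name'] for col in inspect(_engine).get_columns(table)]

def s3_write(df, output_format, path):
    # pandas writes straight to s3:// paths when s3fs is installed.
    if output_format == 'parquet':
        df.to_parquet(path)
    else:
        df.to_csv(path, index=False)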