def download(self, area_name, year, month, page=1):
    """ Downloads the results page for the given search conditions and saves the HTML to a file. """
    download_url = Config.get_url(area_name)
    download_file = Config.get_download_file(area_name, year, month, page)
    save_path = Config.get_download_path(download_file)
    if os.path.exists(save_path):
        _logger.info("skip download, file exists {}".format(download_file))
        return
    form_data = self.get_form_data(year, month, page)
    req = urllib.request.Request(download_url, form_data)
    try:
        html_data = urllib.request.urlopen(req).read()
    except IncompleteRead as e:
        html_data = e.partial
    time.sleep(self.crawl_interval)
    self.check_html_no_data(html_data)
    if self.page_found:
        with open(save_path, mode="wb") as f:
            f.write(html_data)
        _logger.info("save {}".format(download_file))

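# get_form_data builds the POST body passed to urllib.request.Request above.
# A minimal sketch, assuming the search form takes year/month/page fields;
# the field names "year", "month", and "page" are assumptions, not confirmed here.
import urllib.parse

def get_form_data_sketch(year, month, page):
    params = {"year": year, "month": month, "page": page}  # hypothetical field names
    # Request() expects the POST body as bytes, so encode the urlencoded string
    return urllib.parse.urlencode(params).encode("utf-8")
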
def test_append():
    html_path = Config.test_resource("daikoku1.html")
    parser = Parser("daikoku").parse_html(html_path)
    html_path2 = Config.test_resource("isogo1.html")
    parser2 = Parser("isogo").parse_html(html_path2)
    parser.append(parser2)
    timestamps = parser.get_timestamps()
    print(timestamps)
    assert timestamps['choka']
    assert timestamps['newsline']

def initial_load(self, csv, table_name):
    """ Creates the table and performs an initial load of the CSV data. """
    results = pd.read_csv(Config.get_datastore_path(csv), index_col=0)
    results.to_sql(table_name, self.db.engine, if_exists="replace")
    self.load_counts[csv] = len(results.index)

def test_html_no_data():
    html_path = Config.test_resource("not_found1.html")
    with open(html_path, encoding='euc_jp', errors='ignore') as f:
        html = f.read()
    download = Download().load_config().check_html_no_data(html)
    assert not download.page_found

def run(self):
    """
    Reads each downloaded fishing-results HTML file under the data directory,
    extracts the fishing results, and saves them in CSV format.
    """
    html_files = Config.list_download_dirs()
    for html_file in html_files:
        point = Config.get_point_from_html_filename(html_file)
        if not point:
            continue
        _logger.info("read {}".format(html_file))
        html_path = Config.get_download_path(html_file)
        parser = Parser(point).parse_html(html_path)
        self.append(parser)
    self.cleansing_fishing_summary()
    self.export()

def test_honmoku_html_parser():
    html_path = Config.test_resource("honmoku1.html")
    parser = Parser("honmoku").parse_html(html_path)
    timestamps = parser.get_timestamps()
    print(timestamps)
    assert timestamps['choka']
    assert timestamps['newsline']

def test_cleansing_comment():
    comment_path = Config.test_resource("comment1.txt")
    with open(comment_path, encoding='utf-8') as f:
        comment = f.read()
    comment = Converter.clensing_summary_comment(comment)
    assert comment
    print(comment)

    comment_path = Config.test_resource("comment2.txt")
    with open(comment_path, encoding='utf-8') as f:
        comment = f.read()
    comment2 = Converter.clensing_summary_comment(comment)
    assert comment2
    print(comment2)

def reset_load_file(self, filename):
    """ Deletes the CSV load file. """
    load_path = Config.get_datastore_path(filename)
    if os.path.exists(load_path):
        os.remove(load_path)
    self.load_counts[filename] = 0

def export_data(self, df, filename, format='csv'):
    """ Saves the collected fishing-results data frame to CSV. """
    # Convert the date column with a fixed format for CSV output.
    df['Date'] = df['Date'].apply(lambda x: x.strftime('%Y-%m-%d'))
    export_path = Config.get_datastore_path(filename)
    df.to_csv(export_path)

def __init__(self, db_name=config.ChokaDB):
    """ Initializes the SQLite3 database connection and each model definition. """
    self.db_path = Config.get_db_path(db_name)
    self.db = ds.connect('sqlite:///{}'.format(self.db_path))
    self.target = pd.DataFrame(columns=['Target', 'Species'])
    self.area = pd.DataFrame(columns=['Point', 'PointName'])

def test_append_load():
    datastore = Datastore(TEST_DB).reset_database()
    parser = Parser("daikoku").parse_html(
        Config.test_resource("daikoku1.html"))
    parser.export('csv')
    datastore.csv_import()
    parser2 = Parser("isogo").parse_html(Config.test_resource("isogo1.html"))
    parser2.export('csv')
    datastore.csv_import()
    assert datastore.load_counts['choka.csv'] == 19
    parser3 = Parser("honmoku").parse_html(
        Config.test_resource("honmoku1.html"))
    parser3.export('csv')
    datastore.csv_import()
    assert datastore.load_counts['choka.csv'] == 54

def append_load(self, csv, table_name, index_columns):
    """ Loads every CSV record into the table, updating records that already exist. """
    results = pd.read_csv(Config.get_datastore_path(csv), index_col=0)
    for result in results.to_dict(orient='records'):
        self.upsert_row(table_name, index_columns, result)
    self.load_counts[csv] = len(results.index)
    return self

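# upsert_row is defined elsewhere in the repository. A minimal sketch of the
# insert-or-update it performs, assuming the `dataset` library (imported as ds),
# whose Table.upsert(row, keys) updates a matching row or inserts a new one:
def upsert_row_sketch(db, table_name, index_columns, row):
    # `keys` lists the columns used to locate an existing record to update
    db[table_name].upsert(row, keys=index_columns)
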
def test_daikoku_html_parser():
    html_path = Config.test_resource("daikoku1.html")
    parser = Parser("daikoku").parse_html(html_path)
    timestamps = parser.get_timestamps()
    print(parser.choka.columns)
    print(parser.comment.columns)
    print(parser.newsline.columns)
    assert timestamps['choka']
    assert timestamps['newsline']

def test_get_path():
    assert Config.get_datastore_path("choka.csv")
    assert Config.get_download_path("choka_daikoku_2021_04_001.html")
    assert Config.test_resource("daikoku1.html")
    assert Config.get_url("daikoku")
    assert Config.get_download_file("daikoku", 2021, 4)
    assert Config.get_db_path()
    assert Config.get_config_path("config.toml")

def reset_download(self):
    """ Deletes the downloaded HTML files. """
    download_dir = Config.get_download_path("")
    _logger.info("initialize {}".format(download_dir))
    download_files = os.listdir(download_dir)
    for download_file in download_files:
        if download_file.endswith(".html"):
            os.remove(os.path.join(download_dir, download_file))

def test_initial_export():
    datastore = Datastore(TEST_DB).reset_database()
    html_path = Config.test_resource("daikoku1.html")
    parser = Parser("daikoku").parse_html(html_path)
    parser.export('csv')
    datastore.csv_import()
    assert datastore.load_counts == {
        'choka.csv': 12,
        'comment.csv': 1,
        'newsline.csv': 9
    }

def main(self):
    """ Main entry point. Dispatches processing by command-line argument. """
    args = self.parser()
    self.set_envoronment(args)
    log_path = None
    if self.log_enable:
        log_path = Config.get_ap_log_path()
    logging.basicConfig(
        filename=log_path,
        level=getattr(logging, 'INFO'),
        format='%(asctime)s [%(levelname)s] %(module)s %(message)s',
        datefmt='%Y/%m/%d %H:%M:%S',
    )
    if self.show:
        Config.show_config()
        return
    elif self.export:
        Exporter().run(self.time)
    elif self.loadmaster:
        loader = MasterLoader().load_config()
        if loader:
            loader.run()
    elif self.init:
        Datastore().reset_database()
        MasterLoader().load_config().run()
        Download().reset_download()
        return
    else:
        downloader = Download(self.page).load_config(
            self.config_path).check_config()
        if downloader:
            downloader.run(self.month, self.keep)
            Parser().run()
            Datastore().csv_import()
    return

def load_config(self, config_path=Config.get_config_path()):
    """ Reads the config file and registers the homepage download parameters. """
    config_toml = toml.load(open(config_path, encoding='utf-8'))
    if 'area' in config_toml:
        self.areas = config_toml.get('area')
    if 'interval' in config_toml:
        self.crawl_interval = config_toml['interval']
    if 'max_page' in config_toml:
        self.max_page = config_toml['max_page']
    return self

def test_toml_multibyte():
    config_path = Config.get_config_path("config.toml")
    config_toml = toml.load(open(config_path, encoding='utf-8'))
    df = pd.DataFrame(columns=['Target', 'Species'])
    if 'target' in config_toml:
        targets = config_toml['target']
        for target in targets:
            target_name = target['name']
            for species in target['species']:
                values = {'Target': target_name, 'Species': species}
                df = df.append(values, ignore_index=True)
    print(df)

def parser(self):
    """ Parses the command-line options. """
    parser = argparse.ArgumentParser(description=Description)
    parser.add_argument("-c", "--config", type=str,
                        default=Config.get_config_path(),
                        help="<path>\\config.toml")
    parser.add_argument("-m", "--month", type=int, default=0,
                        help="last n months before downloading")
    parser.add_argument("-p", "--page", type=int, default=config.MaxPage,
                        help="max number of pages to visit on the homepage")
    parser.add_argument("-i", "--init", action="store_true",
                        help="initialize database")
    parser.add_argument("-k", "--keep", action="store_true",
                        help="keep old download files")
    parser.add_argument("-l", "--log", action="store_true",
                        help="write log to file")
    parser.add_argument("-s", "--show", action="store_true",
                        help="show config parameters")
    parser.add_argument("-e", "--export", action="store_true",
                        help="export csv data")
    parser.add_argument("--loadmaster", action="store_true",
                        help="import master data")
    parser.add_argument("-t", "--time", type=str, default="1day",
                        help="time period to export")
    return parser.parse_args()

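# Example invocations implied by the options above and the dispatch in main();
# the entry-point script name "main.py" is an assumption, not taken from the source:
#   python main.py --init           # reset database, reload master data, clear downloads
#   python main.py -m 2 -p 3        # download the last 2 months, up to 3 pages per area
#   python main.py -e -t 7day       # export the last 7 days of history to CSV
#   python main.py -s               # show the current config parameters
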
def __init__(self, db_name=config.ChokaDB):
    """ Initializes the SQLite3 database connection and each model definition. """
    self.db_path = Config.get_db_path(db_name)
    self.db = ds.connect('sqlite:///{}'.format(self.db_path))
    self.tables = [
        Table('fishing_results', ['Date', 'Point', 'Species'], 'choka.csv'),
        Table('fishing_comments', ['Date', 'Point'], 'comment.csv'),
        Table('fishing_newslines', ['Date', 'Time', 'Point'], 'newsline.csv'),
    ]
    self.load_counts = dict()

def load_config(self, config_path=Config.get_config_path()):
    """ Reads the config file and registers the target species and fishing-area master data. """
    config_toml = toml.load(open(config_path, encoding='utf-8'))
    # Load the target species.
    if 'target' in config_toml:
        targets = config_toml['target']
        for target in targets:
            target_name = target['name']
            for species in target['species']:
                values = {'Target': target_name, 'Species': species}
                self.target = self.target.append(values, ignore_index=True)
    # Load the fishing areas.
    if 'area' in config_toml:
        areas = config_toml['area']
        for area in areas:
            values = {'Point': area['name'], 'PointName': area['label']}
            self.area = self.area.append(values, ignore_index=True)
    return self

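# For reference, toml.load on config.toml should return a structure like the
# dict below. The keys mirror the lookups in the two load_config methods above;
# the sample values ('大黒', 'アジ', the interval, etc.) are illustrative assumptions only.
sample_config = {
    'interval': 5,       # crawl wait passed to time.sleep(), so seconds
    'max_page': 3,
    'area': [
        {'name': 'daikoku', 'label': '大黒'},      # becomes Point / PointName
    ],
    'target': [
        {'name': 'アジ', 'species': ['マアジ']},    # becomes Target / Species rows
    ],
}
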
def run(self, interval):
    """ Exports history data for the given period from SQLite3 to CSV. """
    last_timestamp = self.get_last_time(interval)
    if not last_timestamp:
        _logger.error("--time parse {} by 'n[day|month|year]'".format(interval))
        return None
    last_date = last_timestamp.strftime('%Y-%m-%d')
    ds = Datastore()
    for table_name in ds.get_table_names():
        sql = "select * from {} where Date >= :start".format(table_name)
        df = pd.read_sql_query(sql, index_col=['Date'], con=ds.db.engine,
                               params={"start": last_date})
        df = df.drop(columns=['index'])
        df = self.cleansing_data(table_name, df)
        export_path = Config.get_export_path(table_name)
        df.to_csv(export_path)
        _logger.info("save {}".format(export_path))

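# get_last_time is defined elsewhere; the error message above implies it parses
# period strings such as "1day", "3month", or "1year" (the default --time is "1day").
# A minimal sketch, assuming python-dateutil is available; the regex and the
# None-on-failure contract are assumptions based on how run() uses the result.
import re
from datetime import datetime
from dateutil.relativedelta import relativedelta

def get_last_time_sketch(interval):
    match = re.fullmatch(r'(\d+)(day|month|year)', interval)
    if not match:
        return None  # run() logs an error and aborts in this case
    value, unit = int(match.group(1)), match.group(2)
    return datetime.now() - relativedelta(**{unit + 's': value})
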
def test_not_found():
    html_path = Config.test_resource("not_found1.html")
    assert not Parser("daikoku").parse_html(html_path)

def test_export():
    html_path = Config.test_resource("daikoku1.html")
    parser = Parser("daikoku").parse_html(html_path)
    parser.export('csv')

def test_daikoku_only_newsline_parser():
    html_path = Config.test_resource("daikoku1_newsline.html")
    parser = Parser("daikoku").parse_html(html_path)
    timestamps = parser.get_timestamps()
    assert not timestamps['choka']
    assert timestamps['newsline']

def test_get_point_from_html_filename():
    assert Config.get_point_from_html_filename(
        "choka_daikoku_2021_04_001.html") == "daikoku"
    assert Config.get_point_from_html_filename("hoge.html") is None

def test_get_download_file():
    assert Config.get_download_file("daikoku", 2021, 4) == \
        "choka_daikoku_2021_04_001.html"

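# The expected filename above implies a fixed naming pattern. A minimal sketch of
# what Config.get_download_file likely produces; the implementation itself is an
# assumption, though the default page=1 matches the three-argument call in test_get_path.
def get_download_file_sketch(area_name, year, month, page=1):
    # e.g. ("daikoku", 2021, 4) -> "choka_daikoku_2021_04_001.html"
    return "choka_{}_{}_{:02d}_{:03d}.html".format(area_name, year, month, page)
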
def test_get_url():
    assert Config.get_url("daikoku") == \
        'http://daikoku.yokohama-fishingpiers.jp/choka.php'

def __init__(self, db_name=config.ChokaDB):
    """ Initializes the SQLite3 database connection and each model definition. """
    self.db_path = Config.get_db_path(db_name)
    self.db = ds.connect('sqlite:///{}'.format(self.db_path))