Example #1
    def download(self, area_name, year, month, page=1):
        """
        指定した検索条件でホームページをダウンロードして、CSV に保存します
        """
        download_url = Config.get_url(area_name)
        download_file = Config.get_download_file(area_name, year, month, page)
        save_path = Config.get_download_path(download_file)
        if os.path.exists(save_path):
            _logger.info(
                "skip download for file exist {}".format(download_file))
            return

        form_data = self.get_form_data(year, month, page)
        req = urllib.request.Request(download_url, form_data)
        try:
            html_data = urllib.request.urlopen(req).read()
        except IncompleteRead as e:
            html_data = e.partial
        time.sleep(self.crawl_interval)

        self.check_html_no_data(html_data)
        if self.page_found:
            with open(save_path, mode="wb") as f:
                f.write(html_data)
            _logger.info("save {}".format(download_file))
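
A usage sketch for the method above; the chained `Download().load_config()` construction mirrors Examples #4 and #17, while the area name, year, and month are hypothetical values:

# Minimal usage sketch (hypothetical arguments):
downloader = Download().load_config()
downloader.download("daikoku", 2021, 4)           # first result page
downloader.download("daikoku", 2021, 4, page=2)   # an explicit later page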
Example #2
def test_append():
    html_path = Config.test_resource("daikoku1.html")
    parser = Parser("daikoku").parse_html(html_path)
    html_path2 = Config.test_resource("isogo1.html")
    parser2 = Parser("isogo").parse_html(html_path2)
    parser.append(parser2)
    timestamps = parser.get_timestamps()
    print(timestamps)
    assert timestamps['choka']
    assert timestamps['newsline']
Example #3
    def initial_load(self, csv, table_name):
        """
        Creates the table and performs the initial load of the CSV data.
        """
        results = pd.read_csv(Config.get_datastore_path(csv), index_col=0)
        results.to_sql(table_name, self.db.engine, if_exists="replace")
        self.load_counts[csv] = len(results.index)
Example #4
def test_html_no_data():
    html_path = Config.test_resource("not_found1.html")
    with open(html_path, encoding='euc_jp', errors='ignore') as f:
        html = f.read()
    download = Download().load_config().check_html_no_data(html)
    assert not download.page_found
Example #5
    def run(self):
        """
        Reads the downloaded fishing-results HTML files under the data
        directory in order, extracts the fishing results, and saves them
        in CSV format.
        """
        html_files = Config.list_download_dirs()
        for html_file in html_files:
            point = Config.get_point_from_html_filename(html_file)
            if not point:
                continue
            _logger.info("read {}".format(html_file))
            html_path = Config.get_download_path(html_file)
            parser = Parser(point).parse_html(html_path)
            self.append(parser)
        self.cleansing_fishing_summary()
        self.export()
Example #6
def test_honmoku_html_parser():
    html_path = Config.test_resource("honmoku1.html")
    parser = Parser("honmoku").parse_html(html_path)
    timestamps = parser.get_timestamps()
    print(timestamps)
    assert timestamps['choka']
    assert timestamps['newsline']
Example #7
def test_cleansing_comment():
    comment_path = Config.test_resource("comment1.txt")
    with open(comment_path, encoding='utf-8') as f:
        comment = f.read()
    comment = Converter.clensing_summary_comment(comment)
    assert comment
    print(comment)

    comment_path = Config.test_resource("comment2.txt")
    with open(comment_path, encoding='utf-8') as f:
        comment = f.read()
    comment2 = Converter.clensing_summary_comment(comment)
    assert comment2
    print(comment2)
Example #8
    def reset_load_file(self, filename):
        """
        Deletes the CSV load file.
        """
        load_path = Config.get_datastore_path(filename)
        if os.path.exists(load_path):
            os.remove(load_path)
        self.load_counts[filename] = 0
Example #9
    def export_data(self, df, filename, format='csv'):
        """
        Saves the fetched fishing-results DataFrame as CSV.
        """
        # Pin the Date column to a fixed format when converting to CSV
        df['Date'] = df['Date'].apply(lambda x: x.strftime('%Y-%m-%d'))
        export_path = Config.get_datastore_path(filename)
        df.to_csv(export_path)
Example #10
    def __init__(self, db_name=config.ChokaDB):
        """
        Connects to the SQLite3 database and initializes each model
        definition.
        """
        self.db_path = Config.get_db_path(db_name)
        self.db = ds.connect('sqlite:///{}'.format(self.db_path))
        self.target = pd.DataFrame(columns=['Target', 'Species'])
        self.area = pd.DataFrame(columns=['Point', 'PointName'])
Example #11
def test_append_load():
    datastore = Datastore(TEST_DB).reset_database()
    parser = Parser("daikoku").parse_html(
        Config.test_resource("daikoku1.html"))
    parser.export('csv')
    datastore.csv_import()

    parser2 = Parser("isogo").parse_html(Config.test_resource("isogo1.html"))
    parser2.export('csv')
    datastore.csv_import()
    assert datastore.load_counts['choka.csv'] == 19

    parser3 = Parser("honmoku").parse_html(
        Config.test_resource("honmoku1.html"))
    parser3.export('csv')
    datastore.csv_import()
    assert datastore.load_counts['choka.csv'] == 54
Example #12
    def append_load(self, csv, table_name, index_columns):
        """
        Registers every CSV record into the table; existing records are
        updated (upsert).
        """
        results = pd.read_csv(Config.get_datastore_path(csv), index_col=0)
        for result in results.to_dict(orient='records'):
            self.upsert_row(table_name, index_columns, result)
        self.load_counts[csv] = len(results.index)
        return self
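
A hedged usage sketch: the table name and index columns are copied from the `Table('fishing_results', ...)` definition in Example #21, and it assumes choka.csv has already been exported by a Parser as in Example #11:

# Usage sketch; assumes choka.csv already exists in the datastore directory.
datastore = Datastore(TEST_DB)
datastore.append_load('choka.csv', 'fishing_results',
                      ['Date', 'Point', 'Species'])
print(datastore.load_counts['choka.csv'])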
Example #13
def test_daikoku_html_parser():
    html_path = Config.test_resource("daikoku1.html")
    parser = Parser("daikoku").parse_html(html_path)
    timestamps = parser.get_timestamps()
    print(parser.choka.columns)
    print(parser.comment.columns)
    print(parser.newsline.columns)
    assert timestamps['choka']
    assert timestamps['newsline']
Example #14
def test_get_path():
    assert Config.get_datastore_path("choka.csv")
    assert Config.get_download_path("choka_daikoku_2021_04_001.html")
    assert Config.test_resource("daikoku1.html")
    assert Config.get_url("daikoku")
    assert Config.get_download_file("daikoku", 2021, 4)
    assert Config.get_db_path()
    assert Config.get_config_path("config.toml")
Example #15
    def reset_download(self):
        """
        Deletes the downloaded HTML files from the download directory.
        """
        download_dir = Config.get_download_path("")
        _logger.info("initialize {}".format(download_dir))
        download_files = os.listdir(download_dir)
        for download_file in download_files:
            if download_file.endswith(".html"):
                os.remove(os.path.join(download_dir, download_file))
Example #16
def test_initial_export():
    datastore = Datastore(TEST_DB).reset_database()
    html_path = Config.test_resource("daikoku1.html")
    parser = Parser("daikoku").parse_html(html_path)
    parser.export('csv')
    datastore.csv_import()
    assert datastore.load_counts == {
        'choka.csv': 12,
        'comment.csv': 1,
        'newsline.csv': 9
    }
Example #17
    def main(self):
        """
        メイン処理。コマンド引数別に処理する
        """
        args = self.parser()
        self.set_envoronment(args)
        log_path = None
        if self.log_enable:
            log_path = Config.get_ap_log_path()
        logging.basicConfig(
            filename=log_path,
            level=logging.INFO,
            format='%(asctime)s [%(levelname)s] %(module)s %(message)s',
            datefmt='%Y/%m/%d %H:%M:%S',
        )
        if self.show:
            Config.show_config()
            return

        elif self.export:
            Exporter().run(self.time)

        elif self.loadmaster:
            loader = MasterLoader().load_config()
            if loader:
                loader.run()

        elif self.init:
            Datastore().reset_database()
            MasterLoader().load_config().run()
            Download().reset_download()
            return

        else:
            downloader = Download(self.page).load_config(
                self.config_path).check_config()
            if downloader:
                downloader.run(self.month, self.keep)
                Parser().run()
                Datastore().csv_import()
            return
Example #18
    def load_config(self, config_path=Config.get_config_path()):
        """
        Reads the config file and registers the homepage download
        parameters.
        """
        config_toml = toml.load(open(config_path, encoding='utf-8'))
        if 'area' in config_toml:
            self.areas = config_toml.get('area')
        if 'interval' in config_toml:
            self.crawl_interval = config_toml['interval']
        if 'max_page' in config_toml:
            self.max_page = config_toml['max_page']
        return self
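
The keys read above imply a config.toml of roughly the following shape. This is an illustrative sketch only (all values are invented); the [[target]] tables are the ones consumed by the MasterLoader variant in Example #22:

import toml

# Illustrative config matching the keys read by load_config();
# every value below is made up for the example.
sample = """
interval = 3
max_page = 5

[[area]]
name = "daikoku"
label = "Daikoku"

[[target]]
name = "aji"
species = ["aji"]
"""
config_toml = toml.loads(sample)
assert config_toml['max_page'] == 5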
Example #19
def test_toml_multibyte():
    config_path = Config.get_config_path("config.toml")
    config_toml = toml.load(open(config_path, encoding='utf-8'))
    df = pd.DataFrame(columns=['Target', 'Species'])

    if 'target' in config_toml:
        targets = config_toml['target']
        for target in targets:
            target_name = target['name']
            for species in target['species']:
                values = {'Target': target_name, 'Species': species}
                df = df.append(values, ignore_index=True)
    print(df)
Example #20
    def parser(self):
        """
        Parses the command-line options.
        """
        parser = argparse.ArgumentParser(description=Description)
        parser.add_argument("-c",
                            "--config",
                            type=str,
                            default=Config.get_config_path(),
                            help="<path>\\config.toml")
        parser.add_argument("-m",
                            "--month",
                            type=int,
                            default=0,
                            help="download results from the last n months")
        parser.add_argument("-p",
                            "--page",
                            type=int,
                            default=config.MaxPage,
                            help="maximum number of homepage pages to visit")
        parser.add_argument("-i",
                            "--init",
                            action="store_true",
                            help="initialize the database")
        parser.add_argument("-k",
                            "--keep",
                            action="store_true",
                            help="keep old download files")
        parser.add_argument("-l",
                            "--log",
                            action="store_true",
                            help="write the log to a file")
        parser.add_argument("-s",
                            "--show",
                            action="store_true",
                            help="show the config parameters")
        parser.add_argument("-e",
                            "--export",
                            action="store_true",
                            help="export CSV data")
        parser.add_argument("--loadmaster",
                            action="store_true",
                            help="import master data")
        parser.add_argument("-t",
                            "--time",
                            type=str,
                            default="1day",
                            help="time period to export")
        return parser.parse_args()
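
For orientation, a few hypothetical command lines these options would accept (the entry-point script name is assumed, not taken from the source):

# Hypothetical invocations:
#   python main.py --show            # print the config parameters
#   python main.py -m 1 -p 3 --log   # download last month, up to 3 pages
#   python main.py -e -t 7day        # export the last 7 days to CSV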
Example #21
    def __init__(self, db_name=config.ChokaDB):
        """
        Connects to the SQLite3 database and initializes each model
        definition.
        """
        self.db_path = Config.get_db_path(db_name)
        self.db = ds.connect('sqlite:///{}'.format(self.db_path))
        self.tables = [
            Table('fishing_results', ['Date', 'Point', 'Species'],
                  'choka.csv'),
            Table('fishing_comments', ['Date', 'Point'], 'comment.csv'),
            Table('fishing_newslines', ['Date', 'Time', 'Point'],
                  'newsline.csv'),
        ]
        self.load_counts = dict()
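
The `Table` type itself is not shown in these examples; a minimal sketch that would satisfy the three-argument construction above, assuming it is a plain record:

from collections import namedtuple

# Assumed shape only: (table name, upsert index columns, source CSV file).
Table = namedtuple('Table', ['name', 'index_columns', 'csv'])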
Example #22
    def load_config(self, config_path=Config.get_config_path()):
        """
        設定ファイルを読み、ホームページダウンロードパラメータを登録します
        """
        config_toml = toml.load(open(config_path, encoding='utf-8'))

        """魚種ターゲットの読み込み"""
        if 'target' in config_toml:
            targets = config_toml['target']
            for target in targets:
                target_name = target['name']
                for species in target['species']:
                    values = {'Target': target_name, 'Species': species}
                    self.target = self.target.append(values, ignore_index=True)

        """魚種ターゲットの読み込み"""
        if 'area' in config_toml:
            areas = config_toml['area']
            for area in areas:
                values = {'Point': area['name'], 'PointName': area['label']}
                self.area = self.area.append(values, ignore_index=True)

        return self
Example #23
    def run(self, interval):
        """
        SQLite3 から指定した期間の履歴データをCSVにエクスポートします
        """
        last_timestamp = self.get_last_time(interval)
        if not last_timestamp:
            _logger.error(
                "cannot parse --time {}; expected 'n[day|month|year]'"
                .format(interval))
            return None

        last_date = last_timestamp.strftime('%Y-%m-%d')
        ds = Datastore()
        for table_name in ds.get_table_names():
            sql = "select * from {} where Date >= :start".format(table_name)
            df = pd.read_sql_query(sql,
                                   index_col=['Date'],
                                   con=ds.db.engine,
                                   params={"start": last_date})
            df = df.drop(columns=['index'])
            df = self.cleansing_data(table_name, df)
            export_path = Config.get_export_path(table_name)
            df.to_csv(export_path)
            _logger.info("save {}".format(export_path))
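
A one-line usage sketch: the period format follows the 'n[day|month|year]' pattern from the error message above, and the bare construction mirrors `Exporter().run(self.time)` in Example #17:

Exporter().run("7day")  # export the last 7 days of history to CSV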
Example #24
def test_not_found():
    html_path = Config.test_resource("not_found1.html")
    assert not Parser("daikoku").parse_html(html_path)
Example #25
def test_export():
    html_path = Config.test_resource("daikoku1.html")
    parser = Parser("daikoku").parse_html(html_path)
    parser.export('csv')
Example #26
def test_daikoku_only_newsline_parser():
    html_path = Config.test_resource("daikoku1_newsline.html")
    parser = Parser("daikoku").parse_html(html_path)
    timestamps = parser.get_timestamps()
    assert not timestamps['choka']
    assert timestamps['newsline']
Example #27
def test_get_point_from_html_filename():
    assert Config.get_point_from_html_filename(
        "choka_daikoku_2021_04_001.html") == "daikoku"
    assert Config.get_point_from_html_filename("hoge.html") is None
Example #28
def test_get_download_file():
    assert Config.get_download_file("daikoku", 2021,
                                    4) == "choka_daikoku_2021_04_001.html"
Example #29
def test_get_url():
    assert Config.get_url(
        "daikoku") == 'http://daikoku.yokohama-fishingpiers.jp/choka.php'
Example #30
    def __init__(self, db_name=config.ChokaDB):
        """
        Connects to the SQLite3 database and initializes each model
        definition.
        """
        self.db_path = Config.get_db_path(db_name)
        self.db = ds.connect('sqlite:///{}'.format(self.db_path))