def on_finished(self, data, args):
    Logger.d(TAG, 'on_finished')
    print(args)
    print(f'len(data) : {len(data)}')

    items = [format_data(item._json) for item in data]
    for item in items:
        item.update({'screen_name': args["screen_name"]})
    for item, d in zip(items, data):
        item['created_at'] = int(d.created_at.timestamp())
        item['id'] = d.id
        if 'media' in item['entities']:
            # Download each attached media file locally, upload it to S3,
            # and record the resulting S3 paths on the item.
            try:
                s3_media_urls = []
                medias = item['entities']['media']
                for media in medias:
                    media_url = media['media_url']
                    filename = media_url.split("/")[-1]
                    local_tmp_filepath = f'/tmp/{filename}'
                    urllib.request.urlretrieve(media_url, local_tmp_filepath)
                    s3_filepath = os.path.join(
                        DataLocationConfig.TWITTER_MEDIAFILE_DIR,
                        args["screen_name"],
                        filename)
                    S3.save_file(local_tmp_filepath, s3_filepath, 'i-app')
                    os.remove(local_tmp_filepath)
                    Logger.d(TAG, f'Uploaded media file to {s3_filepath}')
                    s3_media_urls.append(s3_filepath)
                item['s3_media_urls'] = s3_media_urls
            except Exception as e:
                print(e)
    DynamoDB.put_items(
        AWSConfig.DYNAMODB_TWITTER_USER_TWEET_TABLE_NAME,
        items,
    )

def on_failed(
    self,
    e: Exception,
    args: Dict,
) -> None:
    Logger.d(TAG, f'on_failed : {args["code"]} : {e}')
    global g_fail_cnt
    g_fail_cnt += 1

def on_finished(
    self,
    data: pd.DataFrame,
    args: Dict,
) -> None:
    Logger.d('edinet_docinfo_crawl', f'{args["date"]} : {len(data)}')
    # print(data)
    filepath = os.path.join(
        DataLocationConfig.EDINET_SECURITIES_REPORT_DOCINFO_DIR,
        f'{args["date"].replace("-", "_")}.csv')
    data.to_csv(filepath, index=False)

def on_finished(self, data, args): Logger.d(TAG, f'on_finished : {args["keyword"]} : {len(data)}') print('=' * 100) items = [format_data(item._json) for item in data] [item.update({'keyword': args["keyword"]}) for item in items] for item, d in zip(items, data): item['created_at'] = int(d.created_at.timestamp()) DynamoDB.put_items( AWSConfig.DYNAMODB_TWITTER_TABLE_NAME, items, )
def on_finished(self, data, args):
    Logger.d(TAG, 'on_finished')
    Logger.d(TAG, '=' * 100)
    for d in data:
        d['datetime_keyword'] = d['datetime'].strftime(
            '%Y%m%d_%H%M%S') + '_' + d['keyword']
        d['datetime'] = int(d['datetime'].timestamp())
        d['date'] = d['date'].strftime("%Y-%m-%d")
    DynamoDB.put_items(
        AWSConfig.DYNAMODB_TWITTER_TREND_TABLE_NAME,
        data,
    )

def on_finished(self, data, args): Logger.d(TAG, f'on_finished : {args["topic"]}') Logger.d(TAG, '=' * 100) for d in data: d['published_date'] = datetime( *d['published_parsed'][:6]).strftime("%Y-%m-%d") del d['published_parsed'] d['topic'] = args['topic'] print(f'len(data) : {len(data)}') DynamoDB.put_items( AWSConfig.DYNAMODB_GOOGLE_RSS_NEWS_TABLE_NAME, data, )
def on_finished(
    self,
    data: pd.DataFrame,
    args: Dict,
) -> None:
    filepath = os.path.join(
        DataLocationConfig.STOCKPRICE_STOOQ_CONCAT_BASEDIR,
        f'{args["code"]}.csv')
    data.to_csv(filepath)
    Logger.d(TAG, f'Saved data to {filepath}')

    global g_last_success_code
    global g_fail_cnt
    g_last_success_code = args["code"]
    g_fail_cnt = 0

def on_finished(self, data, args):
    json_data_list = [{
        'keyword': keyword,
        'date': args['datetime'].strftime('%Y-%m-%d'),
        'datetime': args['datetime'].strftime('%Y-%m-%d %H:%M:%S')
    } for keyword in data[0].tolist()]
    Logger.d(TAG, f'on_finished : len(data) : {len(json_data_list)}')
    DynamoDB.put_items(
        AWSConfig.DYNAMODB_GOOGLE_TREND_NAME,
        json_data_list,
    )

def main():
    df_stocklist = pd.read_csv(DataLocationConfig.STOCKLIST_FILE)
    Logger.d(TAG, df_stocklist['銘柄コード'].unique())
    codes = df_stocklist['銘柄コード'].unique()

    sc = StooqCrawler()
    while sc.check_restriction() is False:
        Logger.d(TAG, 'stooq restriction detected, waiting 1 hour')
        time.sleep(60 * 60)

    global g_reached_end
    global g_last_success_code
    while not g_reached_end:
        crawl(sc, codes, g_last_success_code + 1)

def crawl(crawler, codes, start_code):
    for code in codes:
        if code < start_code:
            continue
        crawler.run(code=code, callback=Callback())
        time.sleep(20)

        if code == max(codes):
            global g_reached_end
            g_reached_end = True

        global g_fail_cnt
        if g_fail_cnt >= MAX_FAIL_CNT:
            Logger.d(
                TAG,
                'stooq restriction detected, waiting until the restriction is lifted.')
            while crawler.check_restriction() is False:
                time.sleep(60 * 60)
            g_fail_cnt = 0
            return

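# A minimal sketch (an assumption, not taken from this repo) of the callback
# interface the crawlers above appear to expect: each crawler's run() presumably
# invokes on_finished(data, args) on success and on_failed(e, args) on error.
# The class name BaseCallback and the NotImplementedError bodies are hypothetical;
# only the two method signatures are inferred from the handlers in this section.
class BaseCallback:
    def on_finished(self, data, args) -> None:
        raise NotImplementedError

    def on_failed(self, e: Exception, args) -> None:
        raise NotImplementedError
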
def main():
    df_stocklist = pd.read_csv(
        DataLocationConfig.STOCKLIST_FILE
    )
    Logger.d(TAG, df_stocklist['銘柄コード'].unique())
    codes = df_stocklist['銘柄コード'].unique()

    STOCKPRICE_FILEPATH_FMT = 's3://fin-app/stockprice_concat/{code}.csv'

    for code in codes[100:101]:
        # NOTE: code is overridden to 1382 here, so only that ticker is processed.
        code = 1382
        try:
            df = pd.read_csv(
                STOCKPRICE_FILEPATH_FMT.format(code=code)
            )
        except Exception as e:
            Logger.e(TAG, f'failed to load csv file from s3 : {e}')
            continue

        df['日付'] = pd.to_datetime(df['日付'])
        df = df.set_index('日付')
        df = df.rename(columns={
            '始値': 'open',
            '高値': 'high',
            '安値': 'low',
            '終値': 'close'
        })
        df.sort_index(inplace=True)
        df['last_close'] = df['close'].shift(1)
        df.dropna(inplace=True)
        print(df.tail())

        df['stop_high_low'] = df.apply(lambda x: check_stop_high_low(
            x['last_close'],
            x['high'],
            x['low']
        ), axis=1)
        print(df[df['stop_high_low'] == 1])

def on_failed(
    self,
    e: Exception,
    args: Dict,
) -> None:
    Logger.d('edinet_docinfo_crawl', f'{args["date"]} : {e}')

def on_failed(self, e, args): Logger.d(TAG, f'on_failed : {args["keyword"]} : {e}') print('=' * 100)
def main():
    df_stocklist = pd.read_csv(
        DataLocationConfig.STOCKLIST_FILE
    )
    Logger.d(TAG, df_stocklist['銘柄コード'].unique())
    codes = df_stocklist['銘柄コード'].unique()

    STOCKPRICE_FILEPATH_FMT = 's3://fin-app/stockprice_concat/{code}.csv'
    METADATA_LOCAL_FILEPATH = '/tmp/DAILY_WINDOW-120d_STRIDE-30d_WIDTH-0.5_stockprice_metadata.csv'
    METADATA_S3_FILEPATH = os.path.join(
        DataLocationConfig.STOCKPRICE_CANDLECHART_BASEDIR.replace('s3://fin-app/', ''),
        'metadata/DAILY_WINDOW-120d_STRIDE-30d_WIDTH-0.5/stockprice_metadata.csv'
    )

    s3_filepath_list = []
    start_dt_str_list = []
    end_dt_str_list = []
    code_list = []
    change_rate_list = []
    for code in tqdm(codes[:]):
        Logger.i(TAG, code)
        # List the candlestick chart images already uploaded for this ticker.
        files = S3.get_filelist(
            basedir=os.path.join(
                DataLocationConfig.STOCKPRICE_CANDLECHART_BASEDIR.replace('s3://fin-app/', ''),
                # 'DAILY_WINDOW-120d_STRIDE-30d_WIDTH-0.5/1301'
                f'DAILY_WINDOW-120d_STRIDE-30d_WIDTH-0.5/{code}'
            )
        )
        start_dt_str = [file.split('/')[-1].replace('.png', '').split('_')[0]
                        for file in files]
        end_dt_str = [file.split('/')[-1].replace('.png', '').split('_')[1]
                      for file in files]
        s3_filepath_list += files
        start_dt_str_list += start_dt_str
        end_dt_str_list += end_dt_str
        code_list += [code] * len(files)
        Logger.i(TAG, f'len(files) : {len(files)}')
        Logger.i(TAG, f'len(s3_filepath_list) : {len(s3_filepath_list)}')

        try:
            df = pd.read_csv(
                STOCKPRICE_FILEPATH_FMT.format(code=code)
            )
        except Exception as e:
            Logger.e(TAG, f'failed to load csv file from s3 : {e}')
            change_rate_list += [None] * len(files)
            continue

        df['日付'] = pd.to_datetime(df['日付'])
        df = df.set_index('日付')
        df = df.rename(columns={
            '始値': 'open',
            '高値': 'high',
            '安値': 'low',
            '終値': 'close'
        })
        MAX_DT = df.index.max()

        for sds, eds in zip(start_dt_str, end_dt_str):
            if len(df[sds:eds]) == 0:
                change_rate_list.append(None)
                continue
            # Walk back from the window end date until a date with price data is found.
            edt = datetime.strptime(eds, '%Y-%m-%d')
            for i in range(119):
                try:
                    df.loc[edt]
                    break
                except Exception:
                    edt -= timedelta(days=1)
                    continue
                    # raise Exception('')
            # Average close-price change over the 30 days following the window end.
            change_rate_start_dt = edt + timedelta(days=1)
            change_rate_end_dt = change_rate_start_dt + timedelta(days=30)
            if change_rate_end_dt > MAX_DT or len(df[change_rate_start_dt:change_rate_end_dt]) == 0:
                change_rate_list.append(None)
                continue
            change_rate = \
                (df[change_rate_start_dt:change_rate_end_dt]['close'] - df.loc[edt]['close']).mean() / \
                df.loc[edt]['close']
            change_rate_list.append(change_rate)

        if code % 10 == 0:
            # Periodically checkpoint the metadata built so far to a local CSV.
            df_meta = pd.DataFrame({
                's3_filepath': s3_filepath_list,
                'code': code_list,
                'start_dt': start_dt_str_list,
                'end_dt': end_dt_str_list,
                'change_rate_30d': change_rate_list,
            })
            df_meta.to_csv(
                METADATA_LOCAL_FILEPATH,
                index=False
            )
            Logger.i(TAG, f'len(df_meta) : {len(df_meta)}')

    df_meta = pd.DataFrame({
        's3_filepath': s3_filepath_list,
        'code': code_list,
        'start_dt': start_dt_str_list,
        'end_dt': end_dt_str_list,
        'change_rate_30d': change_rate_list,
    })
    df_meta.to_csv(
        METADATA_LOCAL_FILEPATH,
        index=False
    )
    S3.save_file(
        local_filepath=METADATA_LOCAL_FILEPATH,
        s3_filepath=METADATA_S3_FILEPATH,
    )

def main():
    df_stocklist = pd.read_csv(
        DataLocationConfig.STOCKLIST_FILE
    )
    Logger.d(TAG, df_stocklist['銘柄コード'].unique())
    codes = df_stocklist['銘柄コード'].unique()

    STOCKPRICE_FILEPATH_FMT = 's3://fin-app/stockprice_concat/{code}.csv'

    STRIDE_DAYS = 30
    WINDOW_DAYS = 30 * 4
    STRIDE_D_TD = timedelta(days=STRIDE_DAYS)
    WINDOW_D_TD = timedelta(days=WINDOW_DAYS)
    WIDTH = 0.5
    S3_CANDLECHART_FILEPATH_FMT = os.path.join(
        DataLocationConfig.STOCKPRICE_CANDLECHART_BASEDIR.replace('s3://fin-app/', ''),
        f'DAILY_WINDOW-{WINDOW_DAYS}d_STRIDE-{STRIDE_DAYS}d_WIDTH-{WIDTH}',
        '{code}',
        '{start_dt}_{end_dt}.png'
    )
    LOCAL_CANDLECHART_FILEPATH_FMT = os.path.join(
        '/tmp',
        f'WINDOW-{WINDOW_DAYS}d_STRIDE-{STRIDE_DAYS}d',
        '{code}',
        '{start_dt}_{end_dt}.png'
    )

    for code in codes[2:]:
        # if code < 1515:
        #     continue
        try:
            df = pd.read_csv(
                STOCKPRICE_FILEPATH_FMT.format(code=code)
            )
        except Exception as e:
            Logger.e(TAG, f'failed to load csv file from s3 : {e}')
            continue

        df['日付'] = pd.to_datetime(df['日付'])
        df = df.set_index('日付')
        df = df.rename(columns={
            '始値': 'open',
            '高値': 'high',
            '安値': 'low',
            '終値': 'close'
        })

        MIN_DT = df.index.min()
        MAX_DT = df.index.max()
        start_dt = MIN_DT
        end_dt = MIN_DT + WINDOW_D_TD
        try:
            # Slide a WINDOW_DAYS-wide window over the price history in STRIDE_DAYS
            # steps, rendering one candlestick chart per window and uploading it to S3.
            while end_dt <= MAX_DT:
                start_dt_str = start_dt.strftime('%Y-%m-%d')
                end_dt_str = end_dt.strftime('%Y-%m-%d')
                df_sliced = df[start_dt_str:end_dt_str]
                s3_filepath = S3_CANDLECHART_FILEPATH_FMT.format(
                    code=code,
                    start_dt=start_dt_str,
                    end_dt=end_dt_str,
                )
                local_filepath = LOCAL_CANDLECHART_FILEPATH_FMT.format(
                    code=code,
                    start_dt=start_dt_str,
                    end_dt=end_dt_str
                )
                if not os.path.exists(os.path.dirname(local_filepath)):
                    os.makedirs(os.path.dirname(local_filepath))
                local_filepath = creaet_candle_chart(
                    opens=df_sliced.open,
                    closes=df_sliced.close,
                    highs=df_sliced.high,
                    lows=df_sliced.low,
                    width=WIDTH,
                    filepath=local_filepath
                )
                S3.save_file(
                    local_filepath=local_filepath,
                    s3_filepath=s3_filepath,
                )
                Logger.i(TAG, f'Saved candle chart image to {s3_filepath}')
                os.remove(local_filepath)
                start_dt += STRIDE_D_TD
                end_dt += STRIDE_D_TD
        except Exception as e:
            Logger.e(TAG, f'{e}')
            continue