Example #1
    def on_finished(self, data, args):
        Logger.d(TAG, 'on_finished')
        print(args)
        print(f'len(data) : {len(data)}')

        items = [format_data(item._json) for item in data]
        for item, d in zip(items, data):
            item['screen_name'] = args['screen_name']
            item['created_at'] = int(d.created_at.timestamp())
            item['id'] = d.id
            if 'media' in item['entities']:
                try:
                    s3_media_urls = []
                    medias = item['entities']['media']
                    for media in medias:
                        media_url = media['media_url']
                        filename = media_url.split("/")[-1]
                        local_tmp_filepath = f'/tmp/{filename}'
                        urllib.request.urlretrieve(media_url,
                                                   local_tmp_filepath)
                        s3_filepath = os.path.join(
                            DataLocationConfig.TWITTER_MEDIAFILE_DIR,
                            args["screen_name"], filename)
                        S3.save_file(local_tmp_filepath, s3_filepath, 'i-app')
                        os.remove(local_tmp_filepath)
                        Logger.d(TAG, f'Uploaded media file to {s3_filepath}')
                        s3_media_urls.append(s3_filepath)
                    item['s3_media_urls'] = s3_media_urls
                except Exception as e:
                    Logger.e(TAG, f'failed to save media file : {e}')

        DynamoDB.put_items(
            AWSConfig.DYNAMODB_TWITTER_USER_TWEET_TABLE_NAME,
            items,
        )
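
The format_data helper used by both Twitter callbacks is not part of the excerpt. A minimal sketch of what it might do, assuming its only contract is taking a tweet's raw JSON and returning a DynamoDB-safe dict (the real helper may differ):

def format_data(raw):
    # Hypothetical sketch: pass the tweet's raw JSON through, dropping
    # values DynamoDB cannot store and stringifying floats (DynamoDB
    # numbers must be Decimal, so strings are a safe fallback).
    item = {}
    for key, value in raw.items():
        if value is None or value == '':
            continue
        if isinstance(value, float):
            value = str(value)
        item[key] = value
    return item
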
Example #2
    def on_failed(
        self,
        e: Exception,
        args: Dict,
    ) -> None:
        Logger.d(TAG, f'on_failed : {args["code"]} : {e}')
        global g_fail_cnt
        g_fail_cnt += 1
Example #3
    def on_finished(
        self,
        data: pd.DataFrame,
        args: Dict,
    ) -> None:
        Logger.d('edinet_docinfo_crawl', f'{args["date"]} : {len(data)}')
        # print(data)
        filepath = os.path.join(
            DataLocationConfig.EDINET_SECURITIES_REPORT_DOCINFO_DIR,
            f'{args["date"].replace("-", "_")}.csv')
        data.to_csv(filepath, index=False)
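
Every example in this collection implements the same two-method contract: the crawler calls on_finished with the fetched payload or on_failed with the exception, passing its run arguments through in both cases. A minimal sketch of the base class this implies; the actual class is not shown, so treat it as an assumption:

from typing import Any, Dict

class Callback:
    # Hypothetical base class inferred from the examples in this collection.
    def on_finished(self, data: Any, args: Dict) -> None:
        raise NotImplementedError

    def on_failed(self, e: Exception, args: Dict) -> None:
        raise NotImplementedError
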
Example #4
    def on_finished(self, data, args):
        Logger.d(TAG, f'on_finished : {args["keyword"]} : {len(data)}')

        print('=' * 100)
        items = [format_data(item._json) for item in data]
        for item, d in zip(items, data):
            item['keyword'] = args['keyword']
            item['created_at'] = int(d.created_at.timestamp())
        DynamoDB.put_items(
            AWSConfig.DYNAMODB_TWITTER_TABLE_NAME,
            items,
        )
Example #5
    def on_finished(self, data, args):
        Logger.d(TAG, 'on_finished')

        Logger.d(TAG, '=' * 100)
        for d in data:
            d['datetime_keyword'] = d['datetime'].strftime(
                '%Y%m%d_%H%M%S') + '_' + d['keyword']
            d['datetime'] = int(d['datetime'].timestamp())
            d['date'] = d['date'].strftime("%Y-%m-%d")

        DynamoDB.put_items(
            AWSConfig.DYNAMODB_TWITTER_TREND_TABLE_NAME,
            data,
        )
Example #6
    def on_finished(self, data, args):
        Logger.d(TAG, f'on_finished : {args["topic"]}')

        Logger.d(TAG, '=' * 100)
        for d in data:
            d['published_date'] = datetime(
                *d['published_parsed'][:6]).strftime("%Y-%m-%d")
            del d['published_parsed']
            d['topic'] = args['topic']

        print(f'len(data) : {len(data)}')
        DynamoDB.put_items(
            AWSConfig.DYNAMODB_GOOGLE_RSS_NEWS_TABLE_NAME,
            data,
        )
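
published_parsed is the entry timestamp that feedparser pre-parses into a time.struct_time, which is why unpacking its first six fields into datetime works. For illustration:

import time
from datetime import datetime

# struct_time fields are (year, month, day, hour, minute, second, ...),
# so the first six construct a datetime directly:
t = time.struct_time((2024, 1, 2, 3, 4, 5, 0, 0, 0))
print(datetime(*t[:6]).strftime('%Y-%m-%d'))  # -> 2024-01-02
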
Example #7
    def on_finished(
        self,
        data: pd.DataFrame,
        args: Dict,
    ) -> None:

        filepath = os.path.join(
            DataLocationConfig.STOCKPRICE_STOOQ_CONCAT_BASEDIR,
            f'{args["code"]}.csv')
        data.to_csv(filepath)
        Logger.d(TAG, f'Saved data to {filepath}')
        global g_last_success_code
        global g_fail_cnt
        g_last_success_code = args["code"]
        g_fail_cnt = 0
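
This callback and the crawl loop in Examples #9 and #10 below coordinate through module-level state that is never initialized in the excerpts. A plausible setup, inferred from how the globals are used; the MAX_FAIL_CNT value is an assumption:

g_last_success_code = 0   # last stock code saved successfully; crawling resumes after it
g_fail_cnt = 0            # consecutive failures, reset to 0 on success
g_reached_end = False     # set once the highest code has been crawled
MAX_FAIL_CNT = 5          # assumed threshold before failures are treated as a rate limit
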
Example #8
    def on_finished(self, data, args):

        json_data_list = [{
            'keyword': keyword,
            'date': args['datetime'].strftime('%Y-%m-%d'),
            'datetime': args['datetime'].strftime('%Y-%m-%d %H:%M:%S'),
        } for keyword in data[0].tolist()]

        Logger.d(TAG, f'on_finished : len(data) : {len(json_data_list)}')
        DynamoDB.put_items(
            AWSConfig.DYNAMODB_GOOGLE_TREND_NAME,
            json_data_list,
        )
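
DynamoDB.put_items appears throughout these examples but is never defined. A minimal sketch of such a wrapper built on boto3's batch_writer; the real wrapper may differ, and region and credentials are assumed to come from the environment:

import boto3

class DynamoDB:
    # Hypothetical sketch of the project's DynamoDB wrapper.
    _resource = boto3.resource('dynamodb')

    @classmethod
    def put_items(cls, table_name, items):
        # batch_writer groups puts into BatchWriteItem calls and
        # automatically retries unprocessed items.
        table = cls._resource.Table(table_name)
        with table.batch_writer() as batch:
            for item in items:
                batch.put_item(Item=item)
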
Example #9
def main():
    df_stocklist = pd.read_csv(DataLocationConfig.STOCKLIST_FILE)

    codes = df_stocklist['銘柄コード'].unique()
    Logger.d(TAG, codes)

    sc = StooqCrawler()

    while not sc.check_restriction():
        Logger.d(TAG, 'stooq restriction detected, waiting 1 hour')
        time.sleep(60 * 60)

    while not g_reached_end:
        crawl(sc, codes, g_last_success_code + 1)
Example #10
def crawl(crawler, codes, start_code):

    for code in codes:
        if code < start_code:
            continue
        crawler.run(code=code, callback=Callback())
        time.sleep(20)
        if code == max(codes):
            global g_reached_end
            g_reached_end = True

        global g_fail_cnt
        if g_fail_cnt >= MAX_FAIL_CNT:
            Logger.d(
                TAG,
                'stooq restriction detected, waiting until the restriction is lifted')
            while not crawler.check_restriction():
                time.sleep(60 * 60)
            g_fail_cnt = 0
            return
Example #11
def main():
    df_stocklist = pd.read_csv(
        DataLocationConfig.STOCKLIST_FILE
    )

    codes = df_stocklist['銘柄コード'].unique()
    Logger.d(TAG, codes)

    STOCKPRICE_FILEPATH_FMT = 's3://fin-app/stockprice_concat/{code}.csv'

    for code in codes[100:101]:
        code = 1382  # debug override: pin to a single known code
        try:
            df = pd.read_csv(
                STOCKPRICE_FILEPATH_FMT.format(code=code)
            )
        except Exception as e:
            Logger.e(TAG, f'failed to load csv file from s3 : {e}')
            continue
        df['日付'] = pd.to_datetime(df['日付'])
        df = df.set_index('日付')
        df = df.rename(columns={
            '始値': 'open',
            '高値': 'high',
            '安値': 'low',
            '終値': 'close'
        })
        df.sort_index(inplace=True)

        df['last_close'] = df['close'].shift(1)
        df.dropna(inplace=True)

        print(df.tail())

        df['stop_high_low'] = df.apply(lambda x: check_stop_high_low(
            x['last_close'],
            x['high'],
            x['low']
        ), axis=1)
        print(df[df['stop_high_low'] == 1])
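
check_stop_high_low is not defined in the excerpt. Japanese exchanges cap daily moves in yen bands keyed to the previous close, so a sketch of the check might look like this; the band table is an abbreviated subset of the TSE schedule and the fallback value is an assumption:

def check_stop_high_low(last_close, high, low):
    # Hypothetical sketch: return 1 if the day hit its daily price limit
    # (stop-high or stop-low) relative to the previous close, else 0.
    bands = [(100, 30), (200, 50), (500, 80), (700, 100), (1000, 150),
             (1500, 300), (2000, 400), (3000, 500), (5000, 700)]
    limit = 1000  # assumed fallback beyond the abbreviated table
    for threshold, width in bands:
        if last_close < threshold:
            limit = width
            break
    return 1 if high >= last_close + limit or low <= last_close - limit else 0
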
Example #12
    def on_failed(
        self,
        e: Exception,
        args: Dict,
    ) -> None:
        Logger.d('edinet_docinfo_crawl', f'{args["date"]} : {e}')
Example #13
    def on_failed(self, e, args):
        Logger.d(TAG, f'on_failed : {args["keyword"]} : {e}')
        print('=' * 100)
Example #14
def main():

    df_stocklist = pd.read_csv(
        DataLocationConfig.STOCKLIST_FILE
    )

    codes = df_stocklist['銘柄コード'].unique()
    Logger.d(TAG, codes)

    STOCKPRICE_FILEPATH_FMT = 's3://fin-app/stockprice_concat/{code}.csv'

    METADATA_LOCAL_FILEPATH = '/tmp/DAILY_WINDOW-120d_STRIDE-30d_WIDTH-0.5_stockprice_metadata.csv'
    METADATA_S3_FILEPATH = os.path.join(
        DataLocationConfig.STOCKPRICE_CANDLECHART_BASEDIR.replace('s3://fin-app/', ''),
        'metadata/DAILY_WINDOW-120d_STRIDE-30d_WIDTH-0.5/stockprice_metadata.csv'
    )

    s3_filepath_list = []
    start_dt_str_list = []
    end_dt_str_list = []
    code_list = []
    change_rate_list = []
    for code in tqdm(codes):
        Logger.i(TAG, code)
        files = S3.get_filelist(
            basedir=os.path.join(
                DataLocationConfig.STOCKPRICE_CANDLECHART_BASEDIR.replace('s3://fin-app/', ''),
                # 'DAILY_WINDOW-120d_STRIDE-30d_WIDTH-0.5/1301'
                f'DAILY_WINDOW-120d_STRIDE-30d_WIDTH-0.5/{code}'
            )
        )
        start_dt_str = [file.split('/')[-1].replace('.png', '').split('_')[0] for file in files]
        end_dt_str = [file.split('/')[-1].replace('.png', '').split('_')[1] for file in files]

        s3_filepath_list += files
        start_dt_str_list += start_dt_str
        end_dt_str_list += end_dt_str
        code_list += [code]*len(files)

        Logger.i(TAG, f'len(files) : {len(files)}')
        Logger.i(TAG, f'len(s3_filepath_list) : {len(s3_filepath_list)}')

        try:
            df = pd.read_csv(
                STOCKPRICE_FILEPATH_FMT.format(code=code)
            )
        except Exception as e:
            Logger.e(TAG, f'failed to load csv file from s3 : {e}')
            change_rate_list += [None]*len(files)
            continue

        df['日付'] = pd.to_datetime(df['日付'])
        df = df.set_index('日付')
        df = df.rename(columns={
            '始値': 'open',
            '高値': 'high',
            '安値': 'low',
            '終値': 'close'
        })
        MAX_DT = df.index.max()

        for sds, eds in zip(start_dt_str, end_dt_str):
            if len(df[sds:eds]) == 0:
                change_rate_list.append(None)
                continue

            # walk back (at most 119 days) to the latest trading day in the index
            edt = datetime.strptime(eds, '%Y-%m-%d')
            for _ in range(119):
                if edt in df.index:
                    break
                edt -= timedelta(days=1)
            if edt not in df.index:
                change_rate_list.append(None)
                continue
            change_rate_start_dt = edt + timedelta(days=1)
            change_rate_end_dt = change_rate_start_dt + timedelta(days=30)
            if change_rate_end_dt > MAX_DT or len(df[change_rate_start_dt:change_rate_end_dt]) == 0:
                change_rate_list.append(None)
                continue

            base_close = df.loc[edt]['close']
            change_rate = (
                df[change_rate_start_dt:change_rate_end_dt]['close'] - base_close
            ).mean() / base_close
            change_rate_list.append(change_rate)

        # periodically checkpoint the metadata collected so far
        if code % 10 == 0:
            df_meta = pd.DataFrame({
                's3_filepath': s3_filepath_list,
                'code': code_list,
                'start_dt': start_dt_str_list,
                'end_dt': end_dt_str_list,
                'change_rate_30d': change_rate_list,
            })
            df_meta.to_csv(
                METADATA_LOCAL_FILEPATH,
                index=False,
            )
            Logger.i(TAG, f'len(df_meta) : {len(df_meta)}')

    df_meta = pd.DataFrame({
        's3_filepath': s3_filepath_list,
        'code': code_list,
        'start_dt': start_dt_str_list,
        'end_dt': end_dt_str_list,
        'change_rate_30d': change_rate_list,
    })
    df_meta.to_csv(
        METADATA_LOCAL_FILEPATH,
        index=False,
    )

    S3.save_file(
        local_filepath=METADATA_LOCAL_FILEPATH,
        s3_filepath=METADATA_S3_FILEPATH,
    )
Example #15
def main():
    df_stocklist = pd.read_csv(
        DataLocationConfig.STOCKLIST_FILE
    )

    codes = df_stocklist['銘柄コード'].unique()
    Logger.d(TAG, codes)

    STOCKPRICE_FILEPATH_FMT = 's3://fin-app/stockprice_concat/{code}.csv'

    STRIDE_DAYS = 30
    WINDOW_DAYS = 30 * 4
    STRIDE_D_TD = timedelta(days=STRIDE_DAYS)
    WINDOW_D_TD = timedelta(days=WINDOW_DAYS)

    WIDTH = 0.5

    S3_CANDLECHART_FILEPATH_FMT = os.path.join(
        DataLocationConfig.STOCKPRICE_CANDLECHART_BASEDIR.replace('s3://fin-app/', ''),
        f'DAILY_WINDOW-{WINDOW_DAYS}d_STRIDE-{STRIDE_DAYS}d_WIDTH-{WIDTH}',
        '{code}',
        '{start_dt}_{end_dt}.png'
    )
    LOCAL_CANDLECHART_FILEPATH_FMT = os.path.join(
        '/tmp',
        f'WINDOW-{WINDOW_DAYS}d_STRIDE-{STRIDE_DAYS}d',
        '{code}',
        '{start_dt}_{end_dt}.png'
    )

    for code in codes[2:]:
        # if code < 1515:
        #     continue
        try:
            df = pd.read_csv(
                STOCKPRICE_FILEPATH_FMT.format(code=code)
            )
        except Exception as e:
            Logger.e(TAG, f'failed to load csv file from s3 : {e}')
            continue
        df['日付'] = pd.to_datetime(df['日付'])
        df = df.set_index('日付')
        df = df.rename(columns={
            '始値': 'open',
            '高値': 'high',
            '安値': 'low',
            '終値': 'close'
        })
        MIN_DT = df.index.min()
        MAX_DT = df.index.max()

        start_dt = MIN_DT
        end_dt = MIN_DT + WINDOW_D_TD

        try:
            while end_dt <= MAX_DT:
                start_dt_str = start_dt.strftime('%Y-%m-%d')
                end_dt_str = end_dt.strftime('%Y-%m-%d')

                df_sliced = df[start_dt_str:end_dt_str]

                s3_filepath = S3_CANDLECHART_FILEPATH_FMT.format(
                    code=code,
                    start_dt=start_dt_str,
                    end_dt=end_dt_str,
                )
                local_filepath = LOCAL_CANDLECHART_FILEPATH_FMT.format(
                    code=code,
                    start_dt=start_dt_str,
                    end_dt=end_dt_str
                )
                os.makedirs(os.path.dirname(local_filepath), exist_ok=True)

                local_filepath = creaet_candle_chart(
                    opens=df_sliced.open,
                    closes=df_sliced.close,
                    highs=df_sliced.high,
                    lows=df_sliced.low,
                    width=WIDTH,
                    filepath=local_filepath
                )

                S3.save_file(
                    local_filepath=local_filepath,
                    s3_filepath=s3_filepath,
                )

                Logger.i(TAG, f'Saved candle chart image to {s3_filepath}')

                os.remove(local_filepath)

                start_dt += STRIDE_D_TD
                end_dt += STRIDE_D_TD
        except Exception as e:
            Logger.e(TAG, f'{e}')
            continue
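
creaet_candle_chart (the name, including its spelling, is kept from the source) is not shown either. A minimal matplotlib sketch matching the call signature above; figure size, colors, and the axis-free styling are assumptions:

import matplotlib
matplotlib.use('Agg')  # headless backend for batch jobs
import matplotlib.pyplot as plt

def creaet_candle_chart(opens, closes, highs, lows, width, filepath):
    # Hypothetical sketch: draw one candle per row, save to filepath,
    # and return the path as the caller above expects.
    fig, ax = plt.subplots(figsize=(4, 4))
    for i, (o, c, h, l) in enumerate(zip(opens, closes, highs, lows)):
        color = 'red' if c >= o else 'blue'
        ax.vlines(i, l, h, color=color, linewidth=1)  # wick
        ax.bar(i, abs(c - o), width, bottom=min(o, c), color=color)  # body
    ax.axis('off')
    fig.savefig(filepath, bbox_inches='tight')
    plt.close(fig)
    return filepath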