def download_prices_from_s3(bucket: ServiceResource, dir_prices: Path,
                            remote_dir_prices: Path, missing_rics: List[str],
                            logger: logging.Logger) -> None:
    """Download the price CSVs for *missing_rics* from S3 into *dir_prices*.

    Files already present locally are skipped. A missing remote object
    (HTTP 404) is logged as critical; other S3 client errors are now logged
    as errors (they were previously swallowed silently) and the remaining
    RICs are still attempted.

    :param bucket: boto3 S3 bucket resource to download from.
    :param dir_prices: local destination directory (created if absent).
    :param remote_dir_prices: remote prefix holding the price files.
    :param missing_rics: RICs whose files should be fetched.
    :param logger: logger for progress and error reporting.
    """
    dir_prices.mkdir(parents=True, exist_ok=True)
    for ric in missing_rics:
        remote_filename = ric2filename(remote_dir_prices, ric, 'csv.gz')
        basename = remote_filename.name
        dest = dir_prices / Path(basename)
        if dest.is_file():
            logger.debug('skip downloading {}'.format(basename))
            continue
        logger.debug('start downloading {}'.format(basename))
        try:
            bucket.download_file(Key=str(remote_filename), Filename=str(dest))
        except ClientError as e:
            code = e.response.get('Error', {}).get('Code', '')
            if str(code) == str(HTTPStatus.NOT_FOUND.value):
                logger.critical('{} is not found'.format(str(remote_filename)))
            else:
                # Previously any non-404 client error was silently ignored;
                # log it so failed downloads are visible, but keep going.
                logger.error('failed to download {}: {}'.format(basename, e))
        logger.debug('end downloading {}'.format(basename))
def upload_prices_to_s3(bucket: ServiceResource, local_dir: Path,
                        remote_dir: Path, rics: List[str]) -> None:
    """Upload the local price CSVs for *rics* to S3, skipping keys that
    already exist remotely.

    :param bucket: boto3 S3 bucket resource to upload into.
    :param local_dir: directory holding the local ``csv.gz`` files.
    :param remote_dir: remote prefix under which files are stored.
    :param rics: RICs whose files should be uploaded.
    """
    for ric in rics:
        local_filename = ric2filename(local_dir, ric, 'csv.gz')
        key = str(remote_dir / Path(local_filename.name))
        # The previous code listed every object under the prefix and only
        # inspected the first entry, relying on listing order; test for an
        # exact key match anywhere in the (lazy) listing instead.
        if any(obj.key == key for obj in bucket.objects.filter(Prefix=key)):
            continue
        with local_filename.open(mode='rb') as body:
            bucket.put_object(Key=key, Body=body)
def insert_prices(session: Session, dir_prices: Path, missing_rics: List[str],
                  dir_resources: Path, logger: Logger) -> None:
    """Parse the gzipped Reuters price CSV of each RIC in *missing_rics* and
    bulk-insert price rows, close markers, and derived sequences (raw,
    close-referenced, and standardized values over short and long windows).

    Fixes applied:
    - ``SeqType.NomMovRefShort`` / ``NomMovRefLong`` typos (would raise
      ``AttributeError``) corrected to ``NormMovRefShort`` / ``NormMovRefLong``.
    - The degenerate-range normalization branches now call ``.to_dict()``
      like every other branch, so the bulk insert receives dicts throughout.
    - The short-sequence degeneracy check now compares the *short* max/min
      (it previously mixed ``max_mov_ref_long_val`` with
      ``min_mov_ref_short_val``).

    :param session: SQLAlchemy session used for bulk inserts.
    :param dir_prices: directory containing the ``csv.gz`` price files.
    :param missing_rics: RICs whose prices have not been imported yet.
    :param dir_resources: resource directory (``ric.csv``, closing times).
    :param logger: logger for progress reporting.
    """
    ct = ClosingTime(dir_resources)
    insert_instruments(session, dir_resources / Path('ric.csv'), logger)
    seqtypes = [
        SeqType.RawShort, SeqType.RawLong,
        SeqType.MovRefShort, SeqType.MovRefLong,
        SeqType.NormMovRefShort, SeqType.NormMovRefLong,
        SeqType.StdShort, SeqType.StdLong
    ]
    for ric in missing_rics:
        filename = ric2filename(dir_prices, ric, extension='csv.gz')
        price_seqs = dict((seqtype, []) for seqtype in seqtypes)
        with gzip.open(filename, mode='rt') as f:
            dataframe = pandas.read_table(f, delimiter=',')
            # Fall back to 'Close Bid' when the 'Last' column is entirely empty.
            column = 'Close Bid' if int(
                dataframe[['Last']].dropna().count()) == 0 else 'Last'
            mean = float(dataframe[[column]].mean())
            std = float(dataframe[[column]].std())
            f.seek(0)
            N = sum(1 for _ in f) - 1  # number of data rows (header excluded)
            f.seek(0)
            reader = csv.reader(f, delimiter=',')
            next(reader)  # skip the header row
            fields = next(reader)
            ric = fields[0]
            stock_exchange = session \
                .query(Instrument.exchange) \
                .filter(Instrument.ric == ric) \
                .scalar()
            if stock_exchange is None:
                stock_exchange = 'TSE'
            get_close_utc = ct.func_get_close_t(stock_exchange)
            logger.info('start importing {}'.format(f.name))
            f.seek(0)
            # NOTE(review): rewinding the file while reusing the same
            # csv.reader assumes the reader has no read-ahead buffering;
            # behavior preserved from the original code — confirm.
            column_names = next(reader)
            # Some indices contain an additional column.
            shift = 1 if column_names[1] == 'Alias Underlying RIC' else 0
            prices = []
            close_prices = []
            raw_short_vals = []
            raw_long_vals = []
            raw_mov_ref_long_vals = []
            raw_mov_ref_short_vals = []
            std_long_vals = []
            std_short_vals = []
            prev_row_t = None
            max_mov_ref_long_val = float('-inf')
            min_mov_ref_long_val = float('inf')
            max_mov_ref_short_val = float('-inf')
            min_mov_ref_short_val = float('inf')
            for _ in tqdm(range(N)):
                fields = next(reader)
                ric = fields[0]
                t = fields[2 + shift].replace('Z', '+0000')
                utc_offset = int(fields[3 + shift])
                if ric == Code.SPX.value:
                    utc_offset += 1
                last = fields[8 + shift].strip()
                close_bid = fields[14 + shift].strip()
                if last == '' and close_bid == '':
                    continue
                val = Decimal(close_bid if last == '' else last)
                std_val = (float(val) - mean) / std
                try:
                    t = datetime.strptime(t, REUTERS_DATETIME_FORMAT)
                except ValueError:
                    logger.info('ValueError: {}, {}, {}'.format(ric, t, val))
                    continue
                if prev_row_t is not None:
                    if prev_row_t == t:
                        continue
                    close_time = get_close_utc(utc_offset)
                    close_datetime = datetime(t.year, t.month, t.day,
                                              close_time.hour,
                                              close_time.minute,
                                              tzinfo=UTC)
                    # A closing time was crossed since the previous row:
                    # record the close and extend the long (daily) windows.
                    if prev_row_t < close_datetime and close_datetime <= t:
                        close_prices.append(Close(ric, t).to_dict())
                        if len(raw_long_vals) > 1:
                            raw_mov_ref_long_val = \
                                float(val) - raw_long_vals[0]
                            raw_mov_ref_long_vals = \
                                [raw_mov_ref_long_val] + raw_mov_ref_long_vals \
                                if len(raw_mov_ref_long_vals) < N_LONG_TERM \
                                else [raw_mov_ref_long_val] + raw_mov_ref_long_vals[:-1]
                            price_seqs[SeqType.MovRefLong] \
                                .append(PriceSeq(ric, SeqType.MovRefLong, t,
                                                 raw_mov_ref_long_vals).to_dict())
                            max_mov_ref_long_val = max(max_mov_ref_long_val,
                                                       raw_mov_ref_long_val)
                            min_mov_ref_long_val = min(min_mov_ref_long_val,
                                                       raw_mov_ref_long_val)
                        raw_long_vals = [float(val)] + raw_long_vals \
                            if len(raw_long_vals) < N_LONG_TERM \
                            else [float(val)] + raw_long_vals[:-1]
                        price_seqs[SeqType.RawLong] \
                            .append(PriceSeq(ric, SeqType.RawLong, t,
                                             raw_long_vals).to_dict())
                        std_long_vals = [std_val] + std_long_vals \
                            if len(std_long_vals) < N_LONG_TERM \
                            else [std_val] + std_long_vals[:-1]
                        price_seqs[SeqType.StdLong] \
                            .append(PriceSeq(ric, SeqType.StdLong, t,
                                             std_long_vals).to_dict())
                prices.append(Price(ric, t, utc_offset, val).to_dict())
                if len(raw_short_vals) > 1 and len(raw_long_vals) > 2:
                    # Reference the previous close; when this row *is* the
                    # close, raw_long_vals[0] is the row itself, so use [1].
                    raw_mov_ref_short_val = float(val) - \
                        raw_long_vals[1 if t == close_datetime else 0]
                    raw_mov_ref_short_vals = \
                        [raw_mov_ref_short_val] + raw_mov_ref_short_vals \
                        if len(raw_mov_ref_short_vals) < N_SHORT_TERM \
                        else [raw_mov_ref_short_val] + raw_mov_ref_short_vals[:-1]
                    price_seqs[SeqType.MovRefShort] \
                        .append(PriceSeq(ric, SeqType.MovRefShort, t,
                                         raw_mov_ref_short_vals).to_dict())
                    max_mov_ref_short_val = max(max_mov_ref_short_val,
                                                raw_mov_ref_short_val)
                    min_mov_ref_short_val = min(min_mov_ref_short_val,
                                                raw_mov_ref_short_val)
                raw_short_vals = [float(val)] + raw_short_vals \
                    if len(raw_short_vals) < N_SHORT_TERM \
                    else [float(val)] + raw_short_vals[:-1]
                price_seqs[SeqType.RawShort] \
                    .append(PriceSeq(ric, SeqType.RawShort, t,
                                     raw_short_vals).to_dict())
                std_short_vals = [std_val] + std_short_vals \
                    if len(std_short_vals) < N_SHORT_TERM \
                    else [std_val] + std_short_vals[:-1]
                price_seqs[SeqType.StdShort] \
                    .append(PriceSeq(ric, SeqType.StdShort, t,
                                     std_short_vals).to_dict())
                prev_row_t = t
            session.execute(Price.__table__.insert(), prices)
            session.execute(Close.__table__.insert(), close_prices)
            for seqtype in seqtypes:
                if seqtype == SeqType.NormMovRefShort:
                    # Min-max scale MovRefShort into [-1, 1]; emit None
                    # sequences when the observed range is degenerate.
                    if isclose(max_mov_ref_short_val, min_mov_ref_short_val):
                        price_seqs[seqtype] = [
                            PriceSeq(ric, SeqType.NormMovRefShort,
                                     p['t'], None).to_dict()
                            for p in price_seqs[SeqType.MovRefShort]]
                    else:
                        mid = max_mov_ref_short_val + min_mov_ref_short_val
                        rng = max_mov_ref_short_val - min_mov_ref_short_val
                        price_seqs[seqtype] = [
                            PriceSeq(ric, SeqType.NormMovRefShort, p['t'],
                                     [(2 * v - mid) / rng
                                      for v in p['vals']]).to_dict()
                            for p in price_seqs[SeqType.MovRefShort]]
                elif seqtype == SeqType.NormMovRefLong:
                    if isclose(max_mov_ref_long_val, min_mov_ref_long_val):
                        price_seqs[seqtype] = [
                            PriceSeq(ric, SeqType.NormMovRefLong,
                                     p['t'], None).to_dict()
                            for p in price_seqs[SeqType.MovRefLong]]
                    else:
                        mid = max_mov_ref_long_val + min_mov_ref_long_val
                        rng = max_mov_ref_long_val - min_mov_ref_long_val
                        price_seqs[seqtype] = [
                            PriceSeq(ric, SeqType.NormMovRefLong, p['t'],
                                     [(2 * v - mid) / rng
                                      for v in p['vals']]).to_dict()
                            for p in price_seqs[SeqType.MovRefLong]]
                session.execute(PriceSeq.__table__.insert(),
                                price_seqs[seqtype])
            session.commit()
            logger.info('end importing {}'.format(ric))
def test_ric2filename_lowercase():
    # Uppercase letters are lowered and 'c' becomes '#'.
    actual = ric2filename(Path('/somewhere/'), 'JNIc1', 'csv.gz')
    assert actual == Path('/somewhere/jni#c1.csv.gz')
def test_ric2filename_equal():
    # The '=' character passes through unchanged.
    assert ric2filename(Path('/somewhere/'), 'EUR=', 'csv.gz') \
        == Path('/somewhere/eur=.csv.gz')
def test_ric2filename_underscore():
    # Every '.' in the RIC is replaced by '_'.
    filename = ric2filename(Path('/somewhere/'), '.IRAIL.T', 'csv.gz')
    assert filename == Path('/somewhere/_irail_t.csv.gz')
def test_ric2filename_period():
    # A leading '.' also maps to '_'.
    assert ric2filename(Path('/somewhere/'), '.JSD', 'csv.gz') \
        == Path('/somewhere/_jsd.csv.gz')
def test_ric2filename_lowercase(self):
    # Uppercase letters are lowered and 'c' becomes '#'.
    expected = Path('/somewhere/jni#c1.csv.gz')
    self.assertEqual(ric2filename(Path('/somewhere/'), 'JNIc1', 'csv.gz'),
                     expected)
def test_ric2filename_underscore(self):
    # Every '.' in the RIC is replaced by '_'.
    actual = ric2filename(Path('/somewhere/'), '.IRAIL.T', 'csv.gz')
    self.assertEqual(actual, Path('/somewhere/_irail_t.csv.gz'))
def test_ric2filename_equal(self):
    # The '=' character passes through unchanged.
    self.assertEqual(ric2filename(Path('/somewhere/'), 'EUR=', 'csv.gz'),
                     Path('/somewhere/eur=.csv.gz'))
def test_ric2filename_period(self):
    # A leading '.' also maps to '_'.
    actual = ric2filename(Path('/somewhere/'), '.JSD', 'csv.gz')
    self.assertEqual(actual, Path('/somewhere/_jsd.csv.gz'))