def populate_raw_data(tickers, fields, raw_path):
    """Download quarterly fundamentals per ticker and write one CSV per SID.

    :param tickers: dict mapping ticker symbol -> SID; each SID names the
        output file ``<raw_path>/<sid>.csv``.
    :param fields: list of Sharadar column names to request from Quandl.
    :param raw_path: directory the per-SID CSV files are written to.

    Tickers unknown to Quandl are reported on stdout and skipped; all other
    errors propagate.
    """
    quandl_tools.set_api_key()

    for ticker, sid in tickers.items():
        try:
            print("fetching data for: {} {}".format(DS_NAME, ticker))
            df = quandl.get_table(
                DS_NAME,
                calendardate={'gte': START_DATE, 'lte': END_DATE},
                ticker=ticker,
                qopts={'columns': ['dimension', 'datekey'] + fields})

            # Keep only As-Reported Quarterly (ARQ) numbers, then shape the
            # frame: 'datekey' becomes the 'Date' column and the now-constant
            # 'dimension' column is dropped.
            df = df[df.dimension == "ARQ"]
            df = df.rename(columns={"datekey": "Date"})
            df = df.drop(["dimension"], axis=1)

            # write raw file: raw/<sid>.csv
            df.to_csv(os.path.join(raw_path, "{}.csv".format(sid)))
        except quandl.errors.quandl_error.NotFoundError:
            print("error with ticker: {}".format(ticker))
def download(
    bundle=KERNEL_BUNDLE,
    start_date='2007-01-01',
    tickers=None,
    fields=None,
    dimensions=None,
):
    """Top-level driver for the fundamentals download.

    With every parameter left at its default the full download takes a
    couple of hours; restrict ``start_date``, ``tickers``, ``fields`` or
    ``dimensions`` to reduce the volume. Each requested field is fetched
    for every available dimension, so the result spans fields X dimensions.

    :param bundle: bundle whose universe of tickers is used,
        sharadar-prices by default
    :param start_date: first date of the set
    :param tickers: list of tickers; all tickers when None
    :param fields: list of fields; all fields when None
    :param dimensions: list of dimensions; all dimensions when None
        (skipping MRs)
    :return: whatever ``download_fundamendals_data`` produces.
    """
    quandl_tools.set_api_key()
    return download_fundamendals_data(
        bundle=bundle,
        start_date=start_date,
        tickers=tickers,
        fields=fields,
        dimensions=dimensions,
    )
def populate_raw_data(tickers, fields, dimensions, raw_path):
    """Download fundamentals per ticker, one dimension per field, to CSVs.

    :param tickers: dict mapping ticker symbol -> SID; each SID names the
        output file ``<raw_path>/<sid>.csv``.
    :param fields: list of Sharadar column names to request.
    :param dimensions: reporting dimension to use for each field; must be
        the same length as ``fields`` (field i is taken from dimension i).
    :param raw_path: directory the per-SID CSV files are written to.
    :raises ValueError: if ``fields`` and ``dimensions`` differ in length.
    """
    # Explicit validation instead of assert: asserts vanish under `python -O`.
    if len(fields) != len(dimensions):
        raise ValueError("fields and dimensions must have the same length")

    quandl_tools.set_api_key()

    for ticker, sid in tickers.items():
        try:
            print("fetching data for: {} {}".format(DS_NAME, ticker))
            df = quandl.get_table(
                DS_NAME,
                calendardate={'gte': START_DATE, 'lte': END_DATE},
                ticker=ticker,
                qopts={'columns': ['dimension', 'datekey'] + fields})

            df = df.rename(columns={'datekey': 'Date'}).set_index('Date')

            # Pick each field out of its required reporting dimension and
            # recombine on the Date index.
            series = [df[df.dimension == dim][field]
                      for field, dim in zip(fields, dimensions)]
            df = pd.concat(series, axis=1)

            # write raw file: raw/<sid>.csv
            df.to_csv(os.path.join(raw_path, "{}.csv".format(sid)))
        except quandl.errors.quandl_error.NotFoundError:
            print("error with ticker: {}".format(ticker))
def populate_raw_data(tickers, input_fields, output_fields, raw_path):
    """Download insider-trade rows per ticker, aggregate by filing date,
    and write one CSV per SID.

    :param tickers: dict mapping ticker symbol -> SID; each SID names the
        output file ``<raw_path>/<sid>.csv``.
    :param input_fields: columns requested from Quandl.
    :param output_fields: columns (post-aggregation) written to the CSV.
    :param raw_path: directory the per-SID CSV files are written to.
    """
    quandl_tools.set_api_key()

    for ticker, sid in tickers.items():
        try:
            time.sleep(0.1)  # light pause between API calls (rate limiting)
            print("fetching data for: {} {}".format(DS_NAME, ticker))
            rawData = quandl.get_table(
                DS_NAME,
                filingdate={'gte': START_DATE, 'lte': END_DATE},
                ticker=ticker,
                qopts={'columns': input_fields},
                paginate=True)

            df = fnProcessInsiderTrades(rawData, nDaysDiff=3)

            # Only write a file when there is something to aggregate; an
            # empty frame would otherwise produce an inconsistent CSV.
            if not df.empty:
                # One row per filing date, values summed across trades.
                df = df.groupby('filingdate').sum().reset_index()
                df = df.rename(columns={"filingdate": "Date"})
                df = df[output_fields]

                # NOTE(review): this sets the index *name* to the literal
                # string 'None' (not the None singleton), so the CSV's first
                # header cell reads "None" — presumably matching a consumer's
                # expected format; confirm before changing.
                df = df.rename_axis('None', axis=0)

                # write raw file: raw/<sid>.csv
                df.to_csv(os.path.join(raw_path, "{}.csv".format(sid)))
        except quandl.errors.quandl_error.NotFoundError:
            print("error with ticker: {}".format(ticker))
# df['unadjusted_close'].loc[(df.date<='2016-06-23') & (df.ticker=='SH')] = df['unad'] # get last updated # quandl.get_table('SHARADAR/SFP', lastupdated={'gte':'2017-11-03'}, ticker=['SH','SPY'],) # get more than 10,000 rows of data # quandl.get_table('SHARADAR/SFP', date={'gte':startDate,'lte':endDate}, ticker=['SH','SPY'], paginate=True) # get specific date range # quandl.get_table('SHARADAR/SFP', date={'gte':'2017-01-01', 'lte':'2017-10-30'}, ticker=['SH','SPY'], paginate=True) if __name__ == '__main__': # custom pandas settings setPandas() # set quandl API key quandl_tools.set_api_key() startDate = pd.to_datetime(config.get('start_date'), format='%Y%m%d').tz_localize(pytz.utc) endDate = pd.to_datetime(config.get('end_date'), format='%Y%m%d').tz_localize(pytz.utc) # bulk data download df = fnDownloadData(startDate=startDate, endDate=endDate) # df.head()