def test():
    from contendo_utils import BigqueryUtils
    import os
    os.chdir('/Users/ysherman/Documents/GitHub/results/trends')
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "{}/sportsight-tests.json".format(os.environ["HOME"])
    query = 'SELECT Code, Name, Sector, count(*) count FROM `sportsight-tests.Finance_Data.indices_company_list` left join unnest(Components) group by 1,2,3 having count>0 order by count desc, name'
    bqu = BigqueryUtils()
    gtrend = GoogleTrendImport()
    itemsDict = bqu.execute_query_to_dict(query)
    print('Getting {} items for finance'.format(itemsDict['nRows']))
    trendsDict = {'Finance': 7, 'Financial-Markets': 1163}
    for categoryName, category in trendsDict.items():
        filename = gtrend.get_trend_for_list(itemsDict['Rows'], 'Code', category, categoryName)
        datasetId = 'Trends_Data'
        bqu.create_dataset(datasetId)
        bqu.create_table_from_local_file(filename, datasetId, 'daily_trends', writeDisposition='WRITE_APPEND')
    return 'Done'
class InsightsGenerator:
    def __init__(self, root='.'):
        self.root = root
        self.icm = icm.InsightsConfigurationManager()
        self.queryDict = {}
        self.bqu = BigqueryUtils()
        self.TRUE = True

    def get_twoanswers_query(self, domain):
        queryKey = '{}.TwoAnswersQuery'.format(domain)
        if queryKey not in self.queryDict:
            statsPrepQuery = open(
                self.root + '/Queries/{StatsPrepQuery}'.format(**self.icm.domainsDict[domain]), 'r').read()
            twoAnswersQuestionQuery = open(
                self.root + '/Queries/{TwoAnswersQuestionQuery}'.format(**self.icm.domainsDict[domain]), 'r').read()
            self.queryDict[queryKey] = '{},\n{}\nSELECT * from twoQuestionsFinal'.format(
                statsPrepQuery, twoAnswersQuestionQuery)
        #print(self.queryDict[domain])
        return self.queryDict[queryKey]

    def get_lists_query(self, domain):
        queryKey = '{}.ListsQuery'.format(domain)
        if queryKey not in self.queryDict:
            listsQuery = open(
                self.root + '/Queries/{ListsQuery}'.format(**self.icm.domainsDict[domain]), 'r').read()
            self.queryDict[queryKey] = listsQuery
        #print(self.queryDict[domain])
        return self.queryDict[queryKey]

    def get_onelist_query(self, domain):
        queryKey = '{}.OneListQuery'.format(domain)
        if queryKey not in self.queryDict:
            listsQuery = open(
                self.root + '/Queries/{OneListQuery}'.format(**self.icm.domainsDict[domain]), 'r').read()
            self.queryDict[queryKey] = listsQuery
        #print(self.queryDict[domain])
        return self.queryDict[queryKey]

    def get_twoquestions_dataset_and_table(self, contentConfigCode):
        return 'temp', 'finance_questions_' + contentConfigCode

    def get_lists_dataset_and_table(self, contentConfigCode):
        return 'temp', 'finance_lists_' + contentConfigCode

    def trend_teams_filter(self, top=30, minTrend=0):
        query = 'SELECT TeamId, Trend FROM `sportsight-tests.Baseball1.teams_trend` where Trend>{} order by trend desc limit {}'.format(minTrend, top)
        teamsDF = self.bqu.execute_query_to_df(query)
        teamsList = list(teamsDF['TeamId'])
        inst = {}
        inst['teamIDs'] = str(teamsList).replace('[', '(').replace(']', ')')
        #return 'stat1.TeamCode in {teamIDs} or stat2.TeamCode in {teamIDs}'.format(**inst)
        return 'TeamCode in {teamIDs}'.format(**inst)

    def filter(self, cond):
        return cond

    def one_team_filter(self, teamCode):
        return '"{}" in (stat1.TeamCode, stat2.TeamCode)'.format(teamCode)

    def compare_teams_filter(self, team1, team2):
        return '"{}" in (stat1.TeamCode, stat2.TeamCode) AND "{}" in (stat1.TeamCode, stat2.TeamCode)'.format(team1, team2)

    def one_player_filter(self, playerCode):
        return '"{}" in (stat1.PlayerCode, stat2.PlayerCode)'.format(playerCode)

    def property_compare(self, property, value):
        return '{} = "{}"'.format(property, value)

    def marketcap_between(self, min, max):
        return 'MarketCap between {} and {}'.format(min, max)

    def condition(self, cond):
        return cond

    def calc_filter(self, filter):
        if filter == True:
            retFilter = filter
        else:
            try:
                execStr = 'self.' + filter
                retFilter = eval(execStr)
            except Exception as e:
                print("Error while evaluating '{}', error: {}".format(execStr, e))
                retFilter = True
        return retFilter

    def two_answers_generator(self, contentConfigCode):
        #
        # Save the insights configuration to BQ
        configTableId = self.icm.save_configuration_to_bigquery(contentConfigCode)
        #
        # read the query, configure and run it.
        instructions = self.icm.get_content_config(contentConfigCode)
        instructions['InsightsConfigurationTable'] = configTableId
        instructions['StatFilter'] = self.calc_filter(instructions['StatFilter'])
        instructions['QuestionsFilter'] = self.calc_filter(instructions['QuestionsFilter'])
        query = self.get_twoanswers_query(instructions['SportCode'])
        query = ProUtils.format_string(query, instructions)
        #print("Running query:\n" + query, flush=True)
        #
        # Execute the query.
        dataset_id, table_id = self.get_twoquestions_dataset_and_table(contentConfigCode)
        queryFile = 'results/queries/{}.sql'.format(table_id)
        f = open(queryFile, 'w')
        f.write(query)
        f.close()
        nQuestions = self.bqu.execute_query_with_schema_and_target(query, dataset_id, table_id)
        return nQuestions

    def lists_generator(self, contentConfigCode):
        #
        # Save the insights configuration to BQ
        configTableId = self.icm.save_configuration_to_bigquery(contentConfigCode)
        #
        # read the query, configure and run it.
        instructions = self.icm.get_content_config(contentConfigCode)
        instructions['InsightsConfigurationTable'] = configTableId
        instructions['StatFilter'] = self.calc_filter(instructions['StatFilter'])
        instructions['QuestionsFilter'] = self.calc_filter(instructions['QuestionsFilter'])
        query = self.get_lists_query(instructions['SportCode'])
        query = ProUtils.format_string(query, instructions)
        #print("Running query:\n" + query, flush=True)
        #
        # Execute the query.
        dataset_id, table_id = self.get_lists_dataset_and_table(contentConfigCode)
        queryFile = 'results/queries/{}.sql'.format(table_id)
        f = open(queryFile, 'w')
        f.write(query)
        f.close()
        nItems = self.bqu.execute_query_with_schema_and_target(query, dataset_id, table_id)
        return nItems
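# Example usage sketch; 'Finance.Lists.Demo' is a placeholder configuration code
# and must match an entry managed by InsightsConfigurationManager:
if __name__ == '__main__':
    generator = InsightsGenerator(root='.')
    numQuestions = generator.two_answers_generator('Finance.Lists.Demo')
    print('Generated {} two-answer questions'.format(numQuestions))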
class GetStockNews:
    def __init__(self):
        stocknews_apikey = 'oywghpku7talnwtde1k4h5eqonrgze6i1v6fzmcq'
        self.stocknews_url_template = "https://stocknewsapi.com/api/v1?tickers={ticker}&items={nitems}&date={fromdate_MMDDYYYY}-today&sortby={sortby}&token={stocknews_apikey}".replace(
            '{stocknews_apikey}', stocknews_apikey)
        self.bqu = BigqueryUtils()
        self.bucketName = 'sport-uploads'
        #nltk.download('punkt')  # 1 time download of the sentence tokenizer

    def get_stocknews_byticker(self, tickersList, nitems=50, daysback=30, sortby='trending'):
        assert (sortby in ['trending', 'algo'])
        tickers = str(tickersList).replace('[', '').replace(']', '').replace("'", '').replace(' ', '')
        urlInstructions = {
            'ticker': tickers,
            'nitems': nitems,
            'fromdate_MMDDYYYY': (date.today() - datetime.timedelta(days=daysback)).strftime('%m%d%Y'),
            'sortby': sortby,
            'today': date.today(),
        }
        outfileName = 'Finance/temp/{ticker}-{nitems}-{fromdate_MMDDYYYY}-{sortby}-{today}.json'.format(**urlInstructions)
        text = self.bqu.read_string_from_gcp(self.bucketName, outfileName)
        if text is None:
            url = self.stocknews_url_template.format(**urlInstructions)
            print(url)
            response = requests.request("GET", url)
            text = response.text
            self.bqu.upload_string_to_gcp(response.text, self.bucketName, outfileName)
        data = json.loads(text)
        newsDict = data['data']
        sentimentDict = {'Count': 0, 'Negative': 0, 'Positive': 0, 'Neutral': 0, 'Weighted': 0}
        sentimentWeight = {'Negative': -1, 'Positive': 1, 'Neutral': 0}
        count = 0
        newsFeed = []
        startTime = dt.utcnow()
        for newsItem in newsDict:
            count += 1
            newItem = {
                key: newsItem[key]
                for key in ['title', 'news_url', 'text', 'sentiment', 'source_name', 'topics']
            }
            newItem['index'] = count
            itemDate = dt.strptime(newsItem['date'], '%a, %d %b %Y %H:%M:%S %z')
            delta = startTime.date() - itemDate.date()
            if delta.days <= 3 or count <= 3:
                newItem['date'] = str(itemDate.date())
                if False:  # suspend getting the summary
                    article = Article(newItem['news_url'])
                    # Do some NLP
                    try:
                        article.download()  # Downloads the link’s HTML content
                        article.parse()  # Parse the article
                        article.nlp()  # Keyword extraction wrapper
                        newItem['Summary'] = article.summary.replace('\n', '\n')
                    except Exception as e:
                        print('Error occurred:', e)
                        newItem['Summary'] = "<...>"
                    #print(newItem['Summary'])
                newsFeed.append(newItem)
            if delta.days <= 3:
                deltaWeight = 1
            elif delta.days <= 7:
                deltaWeight = 0.5
            elif delta.days <= 14:
                deltaWeight = 0.25
            elif delta.days <= 30:
                deltaWeight = 0.125
            else:
                deltaWeight = 0.05
            sentiment = newsItem['sentiment']
            sentimentDict[sentiment] += 1
            sentimentDict['Count'] += 1
            sentimentDict['Weighted'] += sentimentWeight[sentiment] * deltaWeight
        retDict = {
            'NumItems': len(newsFeed),
            'Sentiment': sentimentDict,
            'Newsfeed': newsFeed,
        }
        return retDict
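# Example usage sketch; 'AAPL' is only a sample ticker:
if __name__ == '__main__':
    newsGetter = GetStockNews()
    newsSummary = newsGetter.get_stocknews_byticker(['AAPL'], nitems=10, daysback=7, sortby='algo')
    print(newsSummary['NumItems'], newsSummary['Sentiment'])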
class MsfImportMlb:
    def __init__(self):
        self.seasons = ['2019-regular', '2017-regular', '2017-playoff', '2018-regular', '2018-playoff']
        #self.seasons = ['2019-regular']
        apikey = '98de7b49-a696-4ed7-8efa-94b28a'
        self.msf = MySportsFeeds(version="2.0")
        self.msf.authenticate(apikey, "MYSPORTSFEEDS")
        self.bqu = BigqueryUtils()

    def get_seasonal_stats(self):
        start_time = dt.now()
        for feed in ['seasonal_games', 'seasonal_team_stats', 'seasonal_player_stats']:
            outfile_json = 'results/MLB/msf-mlb-{}-{}.json'.format(feed, dt.now().strftime('%Y%m%dT%H%M%S'))
            with open(outfile_json, 'w') as jsonfile:
                for season in self.seasons:
                    params = {
                        'league': 'mlb',
                        'season': season,
                        'feed': feed,
                        'format': 'json',
                    }
                    print('Starting msf {}-{}, delta-time: {}'.format(season, feed, dt.now() - start_time))
                    seasondata = self.msf.msf_get_data(**params)
                    outjson = json.dumps({'Season': season, 'Seasondata': seasondata})
                    jsonfile.write(outjson)
                    jsonfile.write('\n')
                    #delete_query = 'delete from `Baseball1.{}` where season="{}"'.format(feed, season)
                    #self.bqu.execute_query(delete_query)
                jsonfile.close()
            print('Starting upload of file {}, delta-time: {}'.format(outfile_json, dt.now() - start_time))
            uri = self.bqu.upload_file_to_gcp('sport-uploads', outfile_json, outfile_json)
            print('Starting table creation, delta-time: {}'.format(dt.now() - start_time))
            ret = self.bqu.create_table_from_gcp_file(uri, 'Baseball1', feed, 'WRITE_TRUNCATE')

    def get_game_days_stats(self):
        start_time = dt.now()
        for statObject in ['team', 'player']:
            feed = 'daily_{}_gamelogs'.format(statObject)
            #
            # get the missing game-days
            query = 'SELECT season,gameDay FROM `Baseball1.missing_{}_gamelogs` group by 1,2'.format(statObject)
            games_df = self.bqu.execute_query_to_df(query)
            print(games_df.shape)
            if (games_df.shape[0] == 0):
                return
            #
            # loop over missing game days
            for i, game in games_df.iterrows():
                #
                # open the main file.
                mainfile_name = 'results/MLB/msf-mlb-dayfeeds-{}-{}.json'.format(feed, dt.now().strftime('%Y%m%d'))
                mainfile = open(mainfile_name, 'w')
                params = {
                    'league': 'mlb',
                    'date': game['gameDay'],
                    'season': game['season'],
                    'feed': feed,
                    'format': 'json',
                }
                outfile_json = 'results/MLB/dayfeeds/msf-mlb-{feed}-{season}-{date}.json'.format(**params)
                if (not os.path.exists(outfile_json) or True):  # and (os.path.getsize(outfile_json)>0):
                    print('Getting msf #{}, {}, delta-time: {}'.format(i, outfile_json, dt.now() - start_time))
                    jsonfile = open(outfile_json, 'w')
                    #
                    # Getting the data from MySportsFeeds
                    try:
                        seasondata = self.msf.msf_get_data(**params)
                    except Exception as e:
                        print('msf_get_data returned with error {}'.format(e))
                        continue
                    except Warning as w:
                        print('msf_get_data returned with warning {}'.format(w))
                        continue
                    jsonfile.write(json.dumps(seasondata))
                    jsonfile.close()
                else:
                    print('Reading msf #{}, {}, delta-time: {}'.format(i, outfile_json, dt.now() - start_time))
                    #
                    # loading the JSON from already existing file.
                    try:
                        jsonfile = open(outfile_json, 'r')
                        seasondata = json.load(jsonfile)
                    except Exception as e:
                        print('Error loading JSON from file {}'.format(e))
                        continue
                dayfeed = {
                    'gamelogs': seasondata['gamelogs'],
                    'lastUpdatedOn': seasondata['lastUpdatedOn'],
                    'season': params['season']
                }
                mainfile.write(json.dumps(dayfeed) + '\n')
                mainfile.close()
                #
                # upload file and update table.
                try:
                    print('Starting upload of file {}, delta-time: {}'.format(outfile_json, dt.now() - start_time))
                    uri = self.bqu.upload_file_to_gcp('sport-uploads', mainfile_name, outfile_json + dt.now().strftime('.%Y%m%dT%H%M%S'))
                    print('Starting table creation from {}, delta-time: {}'.format(uri, dt.now() - start_time))
                    ret = self.bqu.create_table_from_gcp_file(uri, 'Baseball1', '{}_{}'.format(feed, game['gameDay']), 'WRITE_TRUNCATE')
                except Exception as e:
                    print('Error while uploading table {}'.format(e))
                    continue

    def get_game_pbp(self):
        start_time = dt.now()
        query = 'SELECT * FROM `sportsight-tests.Baseball1.missing_pbp_bydate`'
        datesDF = self.bqu.execute_query_to_df(query)
        print(datesDF.shape)
        if (datesDF.shape[0] == 0):
            return
        feed = 'game_playbyplay'
        pbpFilePattern = '/Users/ysherman/Documents/GitHub/results/MLB/pbp/msf-pbp-{}-{}.json'
        for i, dayGames in datesDF.iterrows():
            dayGames = dict(dayGames)
            games = dayGames['games']
            date = dayGames['date'].strftime('%Y-%m-%d')
            outfile_json = 'results/MLB/temp/{}-{}.json'.format(feed, date)
            jsonfile = open(outfile_json, 'w')
            for game in games:
                pbpFileName = pbpFilePattern.format(game['id'], date)
                print(pbpFileName)
                if (not os.path.exists(pbpFileName) or True):
                    params = {}
                    params['season'] = dayGames['season']
                    params['matchname'] = game['matchname']
                    params['game'] = game['id']
                    params['format'] = 'json'
                    params['league'] = 'mlb'
                    params['feed'] = feed
                    while True:
                        try:
                            print('Getting for day {}, game-id: {}, {}, season: {}, feed: {}, delta-time: {}'.format(
                                i, game['id'], game['matchname'], dayGames['season'], feed, dt.now() - start_time))
                            seasondata = self.msf.msf_get_data(**params)
                            break
                        except Exception as e:
                            print("Error: {}".format(e))
                        except Warning as w:
                            print("Error - Warning: {}".format(w))
                        except:
                            print("Unknown Error")
                        time.sleep(10)
                    outfile = open(pbpFileName, 'w')
                    outfile.write(json.dumps(seasondata))
                    outfile.close()
                else:
                    try:
                        print('Reading file {}'.format(pbpFileName))
                        pbpfile = open(pbpFileName, 'r')
                        seasondata = json.load(pbpfile)
                        pbpfile.close()
                    except Exception as e:
                        print('Error loading JSON from file {}'.format(e))
                        continue
                seasondata = self.pbp_to_bigqery_form(seasondata)
                seasondata['season'] = dayGames['season']
                seasondata['gameid'] = game['id']
                seasondata['gamename'] = game['matchname']
                outjson = json.dumps(seasondata)
                jsonfile.write(outjson)
                jsonfile.write('\n')
            #
            # uploading file for the day
            jsonfile.close()
            try:
                print('Starting upload of file {}, delta-time: {}'.format(outfile_json, dt.now() - start_time))
                uri = self.bqu.upload_file_to_gcp('sport-uploads', outfile_json, outfile_json)
                print('Starting table creation, delta-time: {}'.format(dt.now() - start_time))
                ret = self.bqu.create_table_from_gcp_file(uri, 'Baseball1', '{}_{}'.format(feed, date.replace('-', '')), 'WRITE_TRUNCATE')
            except Exception as e:
                print('Error while uploading table {}'.format(e))
                continue

    def pbp_atbatsubplay_new(self, atBatSubPlay):
        newAtBatSubPlay = []
        for key, value in atBatSubPlay.items():
            if value is None:
                continue
                #newAtBatSubPlay.append({'key': key, 'value': '', 'type': 'NULL'})
            elif key in ['retrievedAtLocation', 'pitchedLocation']:
                newAtBatSubPlay.append({'key': key + '-x', 'value': str(value['x']), 'type': 'INTEGER'})
                newAtBatSubPlay.append({'key': key + '-y', 'value': str(value['y']), 'type': 'INTEGER'})
            elif type(value) == dict:
                newAtBatSubPlay.append({'key': key, 'value': str(value['id']), 'type': 'PlayerId'})
            else:
                newAtBatSubPlay.append({'key': key, 'value': str(value), 'type': type(value).__name__})
        return newAtBatSubPlay

    def pbp_atbatplaystatus_new(self, atBatPlayStatus):
        playerRoles = ['batter', 'catcher', 'centerFielder', 'firstBaseman', 'firstBaseRunner', 'leftFielder',
                       'outFielder', 'pitcher', 'rightFielder', 'secondBaseman', 'secondBaseRunner', 'shortStop',
                       'thirdBaseman', 'thirdBaseRunner']
        runnerRoles = ['firstBaseRunner', 'secondBaseRunner', 'thirdBaseRunner']
        nRunners = 0
        newAtBatPlayStatus = {}
        for key, value in atBatPlayStatus.items():
            if key in runnerRoles:
                if value is not None:
                    nRunners += 1
            if value is None:
                value = {'id': -1}
            if key in playerRoles:
                newAtBatPlayStatus[key] = value['id']
            else:
                newAtBatPlayStatus[key] = value
        newAtBatPlayStatus['numRunners'] = nRunners
        return newAtBatPlayStatus

    def pbp_to_bigqery_form(self, pbpDict):
        newAtBats = []
        atBatCounter = 0
        playCounter = 0
        for atBat in pbpDict['atBats']:
            newAtBatPlays = []
            atBatPlayCounter = 0
            for atBatPlay in atBat['atBatPlay']:
                newAtBatPlay = {}
                try:
                    if type(atBatPlay) != dict:
                        continue
                    for key, value in atBatPlay.items():
                        if key == 'description':
                            newAtBatPlay[key] = value
                        elif key == 'playStatus':
                            newAtBatPlay[key] = self.pbp_atbatplaystatus_new(value)
                        else:
                            newAtBatPlay['atBatSubPlay'] = {'name': key, 'properties': self.pbp_atbatsubplay_new(value)}
                except Exception as e:
                    print('Error {} with atBatCounter = {}, atBatPlayCounter={}, key={}, atbatsubplay={}, atBatPlay={}'.format(
                        e, atBatCounter, atBatPlayCounter, key, value, atBatPlay))
                atBatPlayCounter += 1
                playCounter += 1
                newAtBatPlay['index'] = atBatPlayCounter
                newAtBatPlay['playindex'] = playCounter
                newAtBatPlays.append(newAtBatPlay)
            atBatCounter += 1
            atBat['index'] = atBatCounter
            atBat['atBatPlay'] = newAtBatPlays
            newAtBats.append(atBat)
        pbpDict['atBats'] = newAtBats
        return pbpDict
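# Example usage sketch driving the three MLB import entry points in sequence:
if __name__ == '__main__':
    importer = MsfImportMlb()
    importer.get_seasonal_stats()
    importer.get_game_days_stats()
    importer.get_game_pbp()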
class EODHistoricalDataImport:
    def __init__(self):
        #ProducerConsumersEngine.__init__(self, self.import_daily_quotes)
        self.bqu = BigqueryUtils()
        self.main_dir = '/Users/ysherman/Documents/GitHub/results/Finance/EODHistoricalData/daily-quotes/'
        #self.AV_api_key = 'NN4P0527XD25VT1Q'
        #self.WTD_apikey = 'kDxr9tfB8fYVUV0wnkNzZN4W3IQZO48hLOpFKJ2NIiHbHSgKsTyMt4jzW3Cm'
        self.EODHD_apikey = '5d5d1d7259ef23.41685254'
        #self.alpha_url = 'https://www.alphavantage.co/query?function={AnalyticFunction}&symbol={Symbol}&outputsize=compact&apikey={api_key}&datatype=csv'.replace('{api_key}', api_key)
        self.EOD_Symbols_Query = "SELECT Code, Exchange, Type FROM `sportsight-tests.Finance_Data.eod_exchange_symbols_list` where Exchange in ('COMM', 'NYSE', 'NASDAQ', 'INDX') order by exchange desc"
        self.FUNDAMENTALS_Query = "SELECT Code, Exchange, Type FROM `sportsight-tests.Finance_Data.eod_exchange_symbols_list` where Exchange in ('NYSE', 'NASDAQ') AND type='Common Stock' order by exchange desc"
        self.EOD_DAYBULK_URL = 'https://eodhistoricaldata.com/api/eod-bulk-last-day/{}?api_token=5d5d1d7259ef23.41685254&filter=extended&date={}'

    def get_eod_daily_bulk(self, startTime):
        csv_dir = self.main_dir + 'dated-files/'
        if not os.path.exists(csv_dir):
            os.mkdir(csv_dir)
        #datesPD = self.bqu.execute_query_to_df("SELECT distinct format_date('%Y-%m-%d', timestamp) as Date FROM `sportsight-tests.Finance_Data.daily_stock_history_*` where timestamp<=parse_date('%x', '09/25/18') order by Date desc limit 230")
        #datesList = list(datesPD['Date'])
        datesList = ['2019-09-03']  #list(datesPD['Date'])
        for stockDate in datesList:
            dailyDF = pd.DataFrame()
            for exchange in ['COMM', 'INDX', 'NASDAQ', 'NYSE']:
                url = self.EOD_DAYBULK_URL.format(exchange, stockDate)
                print(url, dt.now() - startTime)
                try:
                    stockDF = pd.read_csv(url).fillna(0)[[
                        'Date', 'Open', 'High', 'Low', 'Close', 'Adjusted_close', 'Volume', 'Code'
                    ]][:-1]
                    #print(stockDF.shape, stockDF.columns)
                    stockDF.rename(columns={'Code': 'Symbol'}, inplace=True)
                    stockDF['Exchange'] = exchange
                    #dtd = dt.strptime(stockDate, '%Y-%m-%d')
                    #print(dtd, type(dtd))
                    #finalDate = dtd.date()
                    #print(finalDate, type(finalDate))
                    #builtDate = dt.date(dtd.year, dtd.month, dtd.day)
                    #print(builtDate, type(builtDate))
                    #stockDF['Date'] = date(2019,8,25)
                    stockDF['Volume'] = stockDF['Volume'].astype(int)
                    stockDF.to_csv(csv_dir + '{}-{}.csv'.format(stockDate, exchange), index=False)
                    dailyDF = dailyDF.append(stockDF)
                except Exception as e:
                    print("Error {}".format(e))
                #break
            #tableId = 'Finance_Data.eod_history_data_{}'.format(date.replace('-', ''))
            if dailyDF.shape[0] > 0:
                datasetId = 'Finance_Data'
                tableId = 'eod_daily_history_1year'
                delQuery = "delete from `{}.{}` where Date=PARSE_DATE('%Y-%m-%d', '{}')".format(datasetId, tableId, stockDate)
                #print(delQuery)
                #print(schema)
                self.bqu.execute_query(delQuery)
                print('Writing table {}, size {}, delta time {}'.format(tableId, dailyDF.shape, dt.now() - startTime))
                schema = self.bqu.get_table_schema(datasetId, tableId)
                dailyDF.to_gbq('{}.{}'.format(datasetId, tableId), table_schema=schema, if_exists='append')
            #break
        print('Done', dt.now() - startTime)

    def get_fundamentals_data(self, startTime):
        fundamentals_dir = self.main_dir + 'fundamentals/'
        if not os.path.exists(fundamentals_dir):
            os.mkdir(fundamentals_dir)
        jsonFileName = 'fundamentals-{}.json'.format(dt.now().strftime('%Y-%m-%dT%H%M%S'))
        outfileName = fundamentals_dir + jsonFileName
        outfile = open(outfileName, 'w')
        stocksDict = self.bqu.execute_query_to_dict(self.FUNDAMENTALS_Query)
        print('Getting {} stocks data'.format(stocksDict['nRows']))
        count = 0
        successCount = 0
        for stock in stocksDict['Rows']:
            count += 1
            print(count, stock, dt.now() - startTime)
            try:
                retDict = eod.get_fundamental_data(stock['Code'], 'US', api_key=self.EODHD_apikey)
                relevantKeys = ['General', 'Highlights', 'Valuation', 'SharesStats']
                stockData = {x: retDict[x] for x in relevantKeys if x in retDict}
                stockData['Time'] = dt.now().strftime('%Y-%m-%dT%H:%M:%S')
                technicalsDict = {}
                for key, value in retDict['Technicals'].items():
                    if key[0] in '0123456789':
                        technicalsDict['T' + key] = value
                    else:
                        technicalsDict[key] = value
                stockData['Technicals'] = technicalsDict
                json.dump(stockData, outfile)
                outfile.write('\n')
                successCount += 1
            except Exception as e:
                print("Error {}".format(e))
                #break
            #break
        outfile.close()
        if successCount > 0:
            datasetId = 'Finance_Data'
            tableId = 'fundamentals_daily_{}'.format(startTime.strftime('%Y%m%d'))
            self.bqu.create_dataset(datasetId)
            uri = self.bqu.upload_file_to_gcp('sport-uploads', outfileName, 'Finance/EOD/Fundamentals/{}'.format(jsonFileName))
            ret = self.bqu.create_table_from_gcp_file(uri, datasetId, tableId, 'WRITE_TRUNCATE')
        print('Done', successCount, dt.now() - startTime)

    def get_eod_quote(self, comp, startTime):
        stockCode = '{Code}-{Exchange}-{Type}'.format(**comp)
        csv_file = comp['CSVDir'] + '{}.csv'.format(stockCode)
        if (not os.path.exists(csv_file)):  # and (os.path.getsize(outfile_json)>0):
            print('{}. Getting {}, delta time: {}'.format(comp['i'] + 1, stockCode, dt.now() - startTime))
            try:
                symbol = comp['Code']
                exchange = comp['Exchange']
                if exchange in ['NYSE', 'NASDAQ']:
                    exchange = 'US'
                svDF = eod.get_eod_data(symbol, exchange, api_key=self.EODHD_apikey)
                print(svDF.shape)
                svDF['Symbol'] = comp['Code']
                svDF['Exchange'] = comp['Exchange']
                # svDF['Date'] = svDF.index
                svDF.to_csv(csv_file)
                return True
            except Exception as e:
                print('Error {}, Stock: {}'.format(e, stockCode))
                return False

    def import_indices_fundamentals(self):
        outfileName = '{}/tmp/indices.json'.format(os.environ['HOME'])
        outfile = open(outfileName, 'w')
        for index in ['DJI', 'GSPC']:
            indexData = eod.get_fundamental_data(index, 'INDX', self.EODHD_apikey)
            newIndex = {}
            newIndex['General'] = indexData['General']
            complist = []
            for key, value in indexData['Components'].items():
                component = value
                component['Index'] = int(key)
                complist.append(component)
            newIndex['Components'] = complist
            newIndex['NumComponents'] = len(complist)
            json.dump(newIndex, outfile)
            outfile.write('\n')
        outfile.close()
        self.bqu.create_table_from_local_file(outfileName, 'Finance_Data', 'indices_company_list')

    def import_daily_quotes(self, configurations, startTime):
        print("Starting import_daily_quotes")
        comp_df = self.bqu.execute_query_to_df(self.EOD_Symbols_Query)
        csv_dir = self.main_dir + '{}/'.format(startTime.strftime('%Y-%m-%d'))
        if not os.path.exists(csv_dir):
            os.mkdir(csv_dir)
        print('getting {} companies, delta time: {} '.format(comp_df.shape[0], dt.now() - startTime))
        for i, comp in comp_df.iterrows():
            comp['CSVDir'] = csv_dir
            comp['i'] = i
            ret = self.get_eod_quote(comp, startTime)
            if ret:
                continue
            # the producer/consumer path below is disabled by the unconditional continue
            continue
            jobData = self.JobData(self.get_eod_quote, dict(comp))
            print(jobData.instructions)
            try:
                continue
                self.jobsQueue.put(jobData)
            except Exception as e:
                print("Error {} in queue.put".format(e))
                break
            break

    def create_dated_quote_files(self, dirdate):
        os.chdir(self.main_dir)
        extension = 'csv'
        all_filenames = [i for i in glob.glob('{}/*.{}'.format(dirdate, extension))]
        print(len(all_filenames))
        outfiles = {}
        mydialect = csv.Dialect
        mydialect.lineterminator = '\n'
        mydialect.quoting = csv.QUOTE_MINIMAL
        mydialect.quotechar = '|'
        count = 0
        main_start_time = dt.now()
        for csvFileName in all_filenames:
            infile = open(csvFileName, 'r')
            linereader = csv.reader(infile, delimiter=',')
            firstrow = True
            for line in linereader:
                if firstrow:
                    firstrow = False
                    topLine = line
                    continue
                date = line[1]
                if date not in outfiles:
                    outfile = open('dated-files/{}.csv'.format(date), 'w')
                    outFileWriter = csv.writer(outfile, delimiter=',', dialect=mydialect)
                    outFileWriter.writerow(topLine[1:])
                    outfiles[date] = {'outfile': outfile, 'outFileWriter': outFileWriter}
                else:
                    outFileWriter = outfiles[date]['outFileWriter']
                outFileWriter.writerow(line[1:])
                #break
            count += 1
            infile.close()
            if count % 1000 == 0:
                print('Done reading: {} of {} Total-delta: {}\n'.format(count, len(all_filenames), dt.now() - main_start_time))

    def upload_dated_quote_files(self, startdate):
        os.chdir(self.main_dir)
        extension = 'csv'
        all_filenames = [i for i in glob.glob('dated-files/2019-08*.{}'.format(extension))]
        print(all_filenames)
        bqu = BigqueryUtils()
        main_start_time = dt.now()
        count = 0
        for csvFileName in all_filenames:
            datadate = csvFileName.split('/')[1].split('.')[0].replace('-', '')
            if (datadate < startdate.replace('-', '')):
                continue
            tableId = 'daily_stock_history_{}'.format(datadate)
            print(tableId, dt.now() - main_start_time)
            csvFile = open(csvFileName, 'rb')
            bqu.create_table_from_local_file(csvFile, 'Finance_Data', tableId)
            csvFile.close()
            count += 1
            if count % 20 == 0:
                print('Done reading: {} of {} Total-delta: {}\n'.format(count, len(all_filenames), dt.now() - main_start_time))
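# Example usage sketch for a daily end-of-day run:
if __name__ == '__main__':
    runStart = dt.now()
    eodImport = EODHistoricalDataImport()
    eodImport.get_eod_daily_bulk(runStart)
    eodImport.get_fundamentals_data(runStart)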
class SimpleStatsGenerator():
    #
    # read in the configurations
    def __init__(self, root):
        #
        # get the initial configuration
        self.root = root
        self.configsheet_url = 'https://docs.google.com/spreadsheets/d/1gwtQlzk0iA4qyLzqaYEk5SggOqNZtJnSSfwnZYDNlAw/export?format=csv&gid={SheetId}&run=1'
        sourceConfigDF = pd.read_csv(self.configsheet_url.replace('{SheetId}', '284194018')).fillna('')
        sourceConfigDF['enriched'] = False
        self.sourcesConfigDict = ProUtils.pandas_df_to_dict(sourceConfigDF, 'Configname')
        self.sport_configs = {}
        self.TRUE = True
        #
        # read IMDB title definitions
        titleTypesDF = pd.read_csv(self.configsheet_url.replace('{SheetId}', '1802180540')).fillna('')
        self.titletypesConfigDict = ProUtils.pandas_df_to_dict(titleTypesDF, 'TitleType')
        #print(sourceConfig)
        self.consumerStatus = multiprocessing.Queue()
        self.sentinel = 'Done'
        self.bqUtils = BigqueryUtils()

    def get_source_configuration(self, configName):
        sourceConfig = self.sourcesConfigDict[configName]
        if sourceConfig['DoIT'] != 'y' or sourceConfig['enriched'] == True:
            return sourceConfig
        sheetId = sourceConfig['SportSheetId']
        #
        # read all relevant metrics
        if sheetId not in self.sport_configs.keys():
            self.sport_configs[sheetId] = pd.read_csv(
                self.configsheet_url.replace('{SheetId}', str(sourceConfig['SportSheetId']))).fillna('')
            self.sport_configs[sheetId]['SportCode'] = sourceConfig['SportCode']
        sourceConfig['StatsDefDict'] = ProUtils.pandas_df_to_dict(self.sport_configs[sheetId], 'StatName')
        if 'query' not in sourceConfig.keys():
            sourceConfig['query'] = open(self.root + '/Queries/' + sourceConfig['QueryFile'], 'r').read()
        sourceConfig['enriched'] = True
        self.sourcesConfigDict[configName] = sourceConfig
        return sourceConfig

    def queryExecutor(self, i, query_jobs):
        #
        # execute a list of query jobs
        #print('Start executor %d' % i)
        startTime = dt.now()
        for queryJob in iter(query_jobs.get, self.sentinel):
            #
            # to enforce the schema is correct, we first copy the empty table from the schema template
            # and then append the result to this empty table
            try:
                nRows = self.bqUtils.execute_query_with_schema_and_target(**queryJob['params'])
                print(
                    'Returned for Statname: {} ({} rows), StatObject: {}, StatTimeframe: {}, Deltatime: {}'.format(
                        queryJob['StatName'], nRows, queryJob['StatObject'], queryJob['StatTimeframe'],
                        dt.now() - startTime),
                    flush=True)
                query_jobs.task_done()
                queryFile = 'results/queries/{}.sql'.format(queryJob['params']['targetTable'])
                f = open(queryFile, 'w')
                f.write(queryJob['params']['query'])
                f.close()
            except Exception as e:
                queryFile = 'errors/{}.sql'.format(queryJob['params']['targetTable'])
                f = open(queryFile, 'w')
                f.write(queryJob['params']['query'])
                f.close()
                # print(queryJob['query'], flush=True)
                print(
                    'Error {} with Statname: {}, StatObject: {}, StatTimeframe: {}'.format(
                        e, queryJob['StatName'], queryJob['StatObject'], queryJob['StatTimeframe']),
                    flush=True)
        #print('Consumer {} terminates, Deltatime: {}'.format(str(i), dt.now() - startTime), flush=True)

    def queriesGenerator(self, queriesQueue, numExecutors, configurations=[]):
        startTime = dt.now()
        #
        # Make sure the target dataset exists
        # (targetDataset and targetTableFormat are assumed to be module-level settings defined elsewhere)
        self.bqUtils.create_dataset(targetDataset)
        #
        # if there are only partial list of configurations
        if len(configurations) == 0:
            configurations = self.sourcesConfigDict.keys()
        #
        # loop over all configurations and generate
        #print(configurations)
        for sourceConfigName in configurations:
            #
            # get the source configuration
            sourceConfig = self.get_source_configuration(sourceConfigName)
            #
            # make sure it is required.
            if sourceConfig['DoIT'] != 'y':
                continue
            #
            # call the relevant generation function.
            print("running configuration {}".format(sourceConfigName))
            generatorFunc = eval('self.{}'.format(sourceConfig['generatorFunc']))
            generatorFunc(queriesQueue, sourceConfig, startTime)
        #
        # Set the sentinel for all processes.
        for i in range(numExecutors):
            queriesQueue.put(self.sentinel)  # indicate sentinel

    def financeQueriesGenerator(self, queriesQueue, sourceConfig, startTime):
        #
        # target table definitions
        financeTableFormat = 'Stat_Finance_{StatSource}_{StatName}_{StatObject}_Rolling_{RollingDays}'
        financeStatsDataset = 'Finance_Stats'
        self.bqUtils.create_dataset(financeStatsDataset)
        #
        # create jobs for all relevant metrics.
        for statDef in sourceConfig['StatsDefDict'].values():
            if statDef['Doit'] != 'y':
                continue
            #print('Metric: {}, Sport:{}, Delta time: {}'.format(statDef['StatName'], statDef['SportCode'], dt.now() - startTime), flush=True)
            for statObject in statDef['StatObject'].split(',')[:1]:
                for rollingDays in statDef['RollingDaysList'].split(','):
                    _statDef = statDef.copy()
                    _statDef['StatObject'] = statObject
                    rollingDaysInst = {'RollingDays': rollingDays}
                    query = sourceConfig['query']
                    query = ProUtils.format_string(query, _statDef)
                    query = ProUtils.format_string(query, sourceConfig)
                    query = ProUtils.format_string(query, rollingDaysInst)
                    #print(query)
                    #
                    # define the destination table
                    instructions = _statDef
                    instructions['StatTimeframe'] = sourceConfig['StatTimeframe']
                    instructions['StatSource'] = sourceConfig['StatSource']
                    instructions['RollingDays'] = rollingDays
                    targetTable = ProUtils.format_string(financeTableFormat, instructions).replace('.', '_').replace('-', '_')
                    jobDefinition = {
                        'params': {
                            'query': query,
                            'targetDataset': financeStatsDataset,
                            'targetTable': targetTable,
                        },
                        'StatName': _statDef['StatName'],
                        'StatObject': statObject,
                        'StatTimeframe': '{}_Rollingdays'.format(rollingDays)
                    }
                    queriesQueue.put(jobDefinition)

    def imdbQueriesGenerator(self, queriesQueue, sourceConfig, startTime):
        #
        # create jobs for all relevant metrics.
        for statDef in sourceConfig['StatsDefDict'].values():
            if statDef['Doit'] != 'y':
                continue
            #print('Metric: {}, Sport:{}, Delta time: {}'.format(statDef['StatName'], statDef['SportCode'], dt.now() - startTime), flush=True)
            for titleType in statDef['TitleType'].split(','):
                titletypeConfig = self.titletypesConfigDict[titleType]
                if statDef['Genres'] == 'y':
                    genresList = titletypeConfig['GenresList'].split(',')
                else:
                    genresList = ['All']
                for genre in genresList:
                    _statDef = statDef.copy()
                    query = sourceConfig['query']
                    if genre == 'All':
                        _statDef['StatCondition'] = ''
                    else:
                        _statDef['StatCondition'] = 'AND STRPOS(Genres, "{}")>0'.format(genre)
                        _statDef['StatName'] = '{}.{}'.format(statDef['StatName'], genre)
                    _statDef['TitleType'] = titleType
                    _statDef['Genre'] = genre
                    _statDef['StatObject'] = titleType
                    query = ProUtils.format_string(query, _statDef)
                    query = ProUtils.format_string(query, sourceConfig)
                    query = ProUtils.format_string(query, titletypeConfig)
                    #print(query)
                    #
                    # define the destination table
                    instructions = _statDef
                    instructions['StatTimeframe'] = sourceConfig['StatTimeframe']
                    instructions['StatSource'] = sourceConfig['StatSource']
                    targetTable = ProUtils.format_string(targetTableFormat, instructions).replace('.', '_').replace('-', '_')
                    jobDefinition = {
                        'params': {
                            'query': query,
                            'targetDataset': targetDataset,
                            'targetTable': targetTable,
                        },
                        'StatName': _statDef['StatName'],
                        'StatObject': titleType,
                        'StatTimeframe': sourceConfig['StatTimeframe']
                    }
                    queriesQueue.put(jobDefinition)

    def imdbQuestionsDefGenerator(self):
        #
        # create jobs for all relevant metrics.
        questionsList = []
        sourceConfig = self.get_source_configuration('Entertainmant.IMDB')
        for statDef in sourceConfig['StatsDefDict'].values():
            for titleType in statDef['TitleType'].split(','):
                titletypeConfig = self.titletypesConfigDict[titleType]
                if statDef['Genres'] == 'y':
                    genresList = titletypeConfig['GenresList'].split(',')
                else:
                    genresList = ['All']
                for genre in genresList:
                    questionDef = {}
                    questionDef['QuestionCode'] = '{}.{}'.format(titleType, statDef['StatName'])
                    questionDef['StatName'] = statDef['StatName']
                    questionDef['StatObject'] = titleType
                    questionDef['Genre'] = ''
                    questionDef['TitleType'] = titleType
                    questionDef['Level'] = 'Easy'
                    questionDef['Value1Template'] = statDef['Value1Template']
                    questionDef['Value2Template'] = statDef['Value2Template']
                    questionDef['ObjectDisplayName'] = titletypeConfig['ObjectDisplayName']
                    questionDef['QuestionNObjects'] = ''
                    if genre != 'All':
                        questionDef['QuestionCode'] = '{}.{}'.format(questionDef['QuestionCode'], genre)
                        questionDef['StatName'] = '{}.{}'.format(questionDef['StatName'], genre)
                        questionDef['Genre'] = genre + ' '
                    questionDef['Question2Objects'] = ProUtils.format_string(statDef['Question2Objects'], questionDef)
                    questionsList.append(questionDef)
        keys = [
            'QuestionCode', 'StatName', 'Genre', 'Level', 'ObjectDisplayName', 'Question2Objects',
            'QuestionNObjects', 'StatObject', 'TitleType', 'Value1Template', 'Value2Template'
        ]
        questionsDF = pd.DataFrame(questionsList, columns=keys)
        questionsDF.to_csv('imdb_questionsList.csv')

    def days_range(self, interval, prev):
        instructions = {}
        startDate = (dt.today() - timedelta(days=interval + prev - 1))
        endDate = (dt.today() - timedelta(days=prev))
        condTemplate = '{DateProperty} BETWEEN "{StartDate}" and "{EndDate}"'
        condInst = {
            'StartDate': startDate.strftime('%Y%m%d'),
            'EndDate': endDate.strftime('%Y%m%d')
        }
        instructions['StatCondition'] = ProUtils.format_string(condTemplate, condInst)
        instructions['DaysRange'] = '{}...{}'.format(startDate.strftime('%Y-%m-%d'), endDate.strftime('%Y-%m-%d'))
        return instructions

    def games_days_range(self, interval, prev):
        instructions = {}
        startDate = (dt.today() - timedelta(days=interval + prev - 1))
        endDate = (dt.today() - timedelta(days=prev))
        condTemplate = '{DateProperty} BETWEEN "{StartDate}" and "{EndDate}"'
        condInst = {
            'StartDate': startDate.strftime('%Y%m%d'),
            'EndDate': endDate.strftime('%Y%m%d')
        }
        instructions['StatCondition'] = ProUtils.format_string(condTemplate, condInst)
        instructions['DaysRange'] = 'N/A'
        return instructions

    def sportsQueriesGenerator(self, queriesQueue, sourceConfig, startTime):
        #
        # create jobs for all relevant metrics.
        for statDef in sourceConfig['StatsDefDict'].values():
            if statDef['Doit'] != 'y':
                continue
            #print('Metric: {}, Sport:{}, Delta time: {}'.format(statDef['StatName'], statDef['SportCode'], dt.now() - startTime), flush=True)
            sourceDefinitions = definitions[sourceConfig['StatSource']]
            for statObject in statDef['StatObject'].split(','):
                for statTimeframe in sourceConfig['StatTimeframe'].split(','):
                    query = sourceConfig['query']
                    query = query.replace('{StatObject}', statObject)
                    query = query.replace('{StatTimeframe}', statTimeframe)
                    if sourceConfig['StatCondition'] != '':
                        query = ProUtils.format_string(query, eval("self." + sourceConfig['StatCondition']))
                    else:
                        query = ProUtils.format_string(query, {'StatCondition': True})
                    query = ProUtils.format_string(query, sourceDefinitions['StatObject'][statObject])
                    query = ProUtils.format_string(query, statDef)
                    query = ProUtils.format_string(query, sourceConfig)
                    #print(query)
                    #
                    # define the destination table
                    instructions = statDef
                    instructions['StatObject'] = statObject
                    instructions['StatTimeframe'] = statTimeframe
                    instructions['StatSource'] = sourceConfig['StatSource']
                    targetTable = ProUtils.format_string(targetTableFormat, instructions).replace('.', '_')
                    jobDefinition = {
                        'params': {
                            'query': query,
                            'targetDataset': targetDataset,
                            'targetTable': targetTable,
                        },
                        'StatName': statDef['StatName'],
                        'StatObject': statObject,
                        'StatTimeframe': statTimeframe
                    }
                    queriesQueue.put(jobDefinition)

    def complexQueriesGenerator(self, queriesQueue, sourceConfig, startTime):
        #
        # create jobs for all relevant metrics.
        for statDef in sourceConfig['StatsDefDict'].values():
            if statDef['Doit'] != 'y':
                continue
            #print('Metric: {}, Sport:{}, Delta time: {}'.format(statDef['StatName'], statDef['SportCode'], dt.now() - startTime), flush=True)
            inst = {}
            inst['StatTimeframes'] = ProUtils.commastring_to_liststring(statDef['StatTimeframes'])
            inst['StatObjects'] = ProUtils.commastring_to_liststring(statDef['StatObjects'])
            inst['NumeratorStatNames'] = ProUtils.commastring_to_liststring(statDef['NumeratorStatNames'])
            inst['DenominatorStatNames'] = ProUtils.commastring_to_liststring(statDef['DenominatorStatNames'])
            query = sourceConfig['query']
            query = ProUtils.format_string(query, inst)
            query = ProUtils.format_string(query, statDef)
            query = ProUtils.format_string(query, sourceConfig)
            #print(query)
            #
            # define the destination table
            instructions = statDef
            instructions['StatObject'] = statDef['StatObjects'].replace(',', '_')
            instructions['StatTimeframe'] = statDef['StatTimeframes'].replace(',', '_')
            instructions['StatSource'] = sourceConfig['StatSource']
            targetTable = ProUtils.format_string(targetTableFormat, instructions).replace('.', '_')
            jobDefinition = {
                'params': {
                    'query': query,
                    'targetDataset': targetDataset,
                    'targetTable': targetTable,
                },
                'StatName': statDef['StatName'],
                'StatObject': instructions['StatObject'],
                'StatTimeframe': instructions['StatTimeframe']
            }
            queriesQueue.put(jobDefinition)

    def run(self, configurations=[], numExecutors=0):
        #
        # main method
        startTime = dt.now()
        queriesQueue = multiprocessing.JoinableQueue()  # start a joinable queue to pass messages
        if numExecutors == 0:
            numExecutors = multiprocessing.cpu_count() * 8
        producer = multiprocessing.Process(
            name='QueriesGenerator',
            target=self.queriesGenerator,
            args=(queriesQueue, numExecutors,),
            kwargs={'configurations': configurations})
        producer.start()
        queriesQueue.join()
        #
        # initiate consumers
        # consumer will execute the job
        consumers = [
            multiprocessing.Process(name='QueriesExecutor', target=self.queryExecutor, args=(i, queriesQueue,))
            for i in range(numExecutors)
        ]
        for c in consumers:
            c.start()
        while True:
            if any(c.is_alive() for c in consumers):
                time.sleep(1)
            else:
                print('Done')
                break
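# Example usage sketch; 'Finance.EOD' is a placeholder configuration name that
# must exist in the source-configuration sheet:
if __name__ == '__main__':
    statsGenerator = SimpleStatsGenerator(root='.')
    statsGenerator.run(configurations=['Finance.EOD'])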
class GetStocksData:
    def __init__(self):
        self.bqu = BigqueryUtils()
        self.companiesDF = None
        self.stocksDF = None
        self.resourceDir = 'resource'
        self.companiesDataFileName = '{}/companies_{}.csv'.format(self.resourceDir, date.today())
        self.stocksDataFileName = '{}/stocks_{}.csv'.format(self.resourceDir, date.today())
        self.companiesURL = 'gs://sport-uploads/Finance/companies_fundamentals.csv'
        self.stocksURL = 'gs://sport-uploads/Finance/eod_stocks_data.csv'

    def get_stockdata_by_dates(self, stocklist, from_date, to_date):
        #
        # get updated company data
        if self.stocksDF is None:
            if os.path.exists(self.stocksDataFileName):
                self.stocksDF = pd.read_csv(self.stocksDataFileName)
            else:
                stocksQuery = """SELECT * FROM `sportsight-tests.Finance_Data.eod_daily_history_1year` order by Symbol, Date"""
                self.stocksDF = self.bqu.execute_query_to_df(stocksQuery)
                if not os.path.exists(self.resourceDir):
                    os.mkdir(self.resourceDir)
                self.stocksDF.to_csv(self.stocksDataFileName)
                #url = self.bqu.upload_file_to_gcp('sport-uploads', self.stocksDataFileName, self.stocksURL.replace('gs://sport-uploads/', ''))
        if len(stocklist) > 0:
            symbol_condition = 'Symbol in {tickersString} and '.format(tickersString=str(stocklist))
        else:
            symbol_condition = ''
        stocksQuery = '{symbol_condition} Date >= "{from_date}" and Date <= "{to_date}"'.format(
            symbol_condition=symbol_condition, from_date=from_date, to_date=to_date)
        stockDataDF = self.stocksDF.query(stocksQuery)
        stockDataDF.index = pd.to_datetime(stockDataDF['Date'])
        stockDataDF.rename_axis("date", axis='index', inplace=True)
        return stockDataDF

    # get list of stock action x days to date
    def get_stockdata_by_cal_days(self, stocklist, numdays, to_date):
        from_date = to_date - datetime.timedelta(days=numdays - 1)
        return self.get_stockdata_by_dates(stocklist, from_date, to_date)

    def get_stock_fundamentals(self, stocklist=[], index=None):
        #
        # get updated company data
        if self.companiesDF is None:
            if os.path.exists(self.companiesDataFileName):
                self.companiesDF = pd.read_csv(self.companiesDataFileName)
            else:
                companiesQuery = """SELECT * FROM `sportsight-tests.Finance_Data.all_company_data` WHERE MarketCapitalizationMln > 1000"""
                self.companiesDF = self.bqu.execute_query_to_df(companiesQuery, fillna=0)
                if not os.path.exists(self.resourceDir):
                    os.mkdir(self.resourceDir)
                self.companiesDF.to_csv(self.companiesDataFileName)
                #url = self.bqu.upload_file_to_gcp('sport-uploads', self.companiesDataFileName, self.companiesURL.replace('gs://sport-uploads/', ''))
        if len(stocklist) > 0:
            where_condition = 'Symbol in {tickersString}'.format(tickersString=str(stocklist))
        elif index in ['DJI', 'SNP']:
            where_condition = 'is{index}'.format(index=index)
        else:
            return self.companiesDF
        return self.companiesDF.query(where_condition)
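# Example usage sketch; the tickers are samples only:
if __name__ == '__main__':
    stocksData = GetStocksData()
    recentDF = stocksData.get_stockdata_by_cal_days(['AAPL', 'MSFT'], 30, date.today())
    print(recentDF.shape)
    print(stocksData.get_stock_fundamentals(index='DJI').shape)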
def one_list_generator(listName, listConfigDict, startTime=dt.now()):
    listsDefDict = ProUtils.get_dict_from_jsonfile('lists_config.json')
    finquery = ProUtils.get_string_from_file('queries/top_lists_query.sql')
    #
    # read the query, configure and run it.
    instructions = {}
    if listName in listsDefDict.keys():
        listConfig = listsDefDict[listName]
    else:
        raise NotFound('List {} does not exist'.format(listName))
    instructions['StatName'] = listConfig['StatName']
    instructions['RollingDaysCondition'] = 'StatRollingDays="{}"'.format(listConfig['RollingDays'])
    if 'Sector' in listConfigDict:
        instructions['SectorCondition'] = 'Sector="{}"'.format(listConfigDict['Sector'])
    else:
        instructions['SectorCondition'] = 'TRUE'
    if listConfigDict.get('Index', '') in ['DJI', 'SNP']:
        instructions['IndexCondition'] = 'is' + listConfigDict['Index']
    else:
        instructions['IndexCondition'] = 'isSNP'
    minMarketCap = listConfigDict.get('MarketCapMin', 100)
    maxMarketCap = listConfigDict.get('MarketCapMax', 1000000000)
    instructions['MarketCapCondition'] = 'MarketCap BETWEEN {} AND {}'.format(minMarketCap, maxMarketCap)
    instructions['ListSize'] = min(listConfigDict.get('ListSize', 5), 10)
    #query = self.get_onelist_query(listConfigDict['Domain'])
    query = ProUtils.format_string(finquery, instructions)
    #print("Running query:\n" + query, flush=True)
    #return
    #
    # Execute the query.
    print('Starting get-top-list for {} query execution'.format(instructions), dt.now() - startTime)
    bqu = BigqueryUtils()
    listDF = bqu.execute_query_to_df(query)
    print(list(listDF['Symbol']))
    #listDF = listDF.query('TopBottom=="TOP"')
    #print(listDF.columns, listDF.shape, dt.now()-startTime)
    listDict = ProUtils.pandas_df_to_dict(listDF, 'TopRank')
    #
    # getting additional info
    print('Starting get_stock_fundamentals for {}'.format('SNP'), dt.now() - startTime)
    getstocks = GetStocksData()
    companiesDF = getstocks.get_stock_fundamentals(index='SNP')
    symbolList = list(companiesDF['Symbol'])
    print('Starting StockMetricsCalculator for {}, {} companies'.format(symbolList, len(symbolList)), dt.now() - startTime)
    smc = StockMetricsCalculator(symbolList)
    print('Done StockMetricsCalculator', dt.now() - startTime)
    gsn = GetStockNews()
    for key, stockDict in listDict.items():
        stockDict['InterestingStatements'] = get_statements_for_ticker(stockDict['Symbol'], smc)
        stockDict['RelevantNews'] = gsn.get_stocknews_byticker(stockDict['Symbol'])
    listDict['Description'] = listConfig['QuestionDescription']
    print(listDict, dt.now() - startTime)
    return json.dumps(listDict)
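# Example usage sketch; 'TopGainers' is a placeholder list name that must exist
# in lists_config.json:
if __name__ == '__main__':
    listJson = one_list_generator('TopGainers', {'Index': 'DJI', 'ListSize': 5})
    print(listJson)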
class InsightsPackaging:
    def __init__(self, root='.'):
        self.icm = icm.InsightsConfigurationManager()
        self.bqUtils = BigqueryUtils()
        self.questionsReaderQuery = open(root + '/Queries/SportQuestionsReaderQuery.sql', 'r').read()

    def two_answers_reader(self, contentConfigCode):
        configDef = self.icm.get_content_config(contentConfigCode)
        #
        # read the questions
        query = ProUtils.format_string(self.questionsReaderQuery, configDef)
        questionsDF = self.bqUtils.execute_query_to_df(query)
        #
        # find all metrics within slot
        nSlots = configDef['NumSlots']
        slotStatGroups = {}
        slotStatGroupKeys = {}
        for i in range(1, nSlots + 1):
            slotDF = questionsDF.query('slotNum == %d' % i)
            slotStatGroups[i] = slotDF.groupby(['QuestionCode', 'StatObject']).groups
            slotStatGroupKeys[i] = set(slotStatGroups[i].keys())
        return questionsDF, slotStatGroups, slotStatGroupKeys

    def two_answers_question_generator(self, questionDict, configDef):
        #print(questionDict)
        stat1 = questionDict['Stat1']
        stat2 = questionDict['Stat2']
        questionTemplate = stat1['Question2Objects']
        questionInstructions = stat1
        timeFrameTexts = configDef['TimeframeText'].split(',')
        loc = random.randint(0, len(timeFrameTexts) - 1)
        questionInstructions['Timeframe'] = timeFrameTexts[loc]
        questionText = ProUtils.format_string(questionTemplate, questionInstructions)
        templateDict = self.icm.templateDefsDict
        outQuestion = {
            'QuestionText': questionText,
            'Answer1': stat1['StatObjectName'],
            'Answer2': stat2['StatObjectName'],
            'Value1': str(eval(templateDict[stat1['Value1Template']]['Template'].replace('{value}', "stat1['StatValue']"))),
            'Value2': str(eval(templateDict[stat2['Value1Template']]['Template'].replace('{value}', "stat2['StatValue']"))),
        }
        questionKeys = [
            'ContentDefCode', 'SportCode', 'StatSource', 'slotNum', 'rankDiff', 'StatObject', 'StatTimeframe',
            'LeagueCode', 'SeasonCode', 'CompetitionStageCode', 'MatchStageCode', 'QuestionCode', 'StatCode',
            'Description', 'numRanks', 'rankItemsCount', 'valueRange', 'internalDenseRank', 'objectsCount',
            'minValue', 'maxValue'
        ]
        statKeys = [
            'StatObjectName', 'StatFunction', 'MatchCode', 'TeamCode', 'PlayerCode', 'StatValue', 'Count',
            'DenseRank', 'TeamName', 'PlayerName'
        ]
        ProUtils.update_dict(outQuestion, stat1, questionKeys)
        ProUtils.update_dict(outQuestion, questionDict, questionKeys)
        ProUtils.update_dict(outQuestion, stat1, statKeys, '1')
        ProUtils.update_dict(outQuestion, stat2, statKeys, '2')
        return outQuestion

    def two_answers_package_generator(self, contentConfigCode):
        configDef = self.icm.get_content_config(contentConfigCode)
        numPackages = configDef['NumPackages']
        numSlots = configDef['NumSlots']
        outputDF = pd.DataFrame()
        questionsDF, slotStatGroups, slotStatGroupKeys = self.two_answers_reader(contentConfigCode)
        for packageNo in range(1, numPackages + 1):
            selectedStats = set()
            package = []
            packageId = '{}-{}-{}'.format(contentConfigCode, packageNo, int(dt.timestamp(dt.now()) * 1000))
            for slotNo in range(1, numSlots + 1):
                while True:
                    try:
                        remainingStatCombinations = slotStatGroupKeys[slotNo] - selectedStats
                        statComb = random.sample(remainingStatCombinations, 1)[0]
                        break
                    except ValueError:
                        selectedStats.clear()
                        continue
                    except Exception as e:
                        print("Error selecting a new stat in slot #{}: {}, {}".format(slotNo, e, type(e)))
                selectedStats.add(statComb)
                questionGroup = slotStatGroups[slotNo][statComb]
                questionIndex = questionGroup[random.randint(0, len(questionGroup) - 1)]
                questionDict = dict(questionsDF.iloc[questionIndex])
                newQuestion = self.two_answers_question_generator(questionDict, configDef)
                newQuestion['PackageId'] = packageId
                newQuestion['Timestamp'] = dt.now()
                package.append(newQuestion)
            #print(package)
            packageDF = pd.DataFrame(package)
            #print(packageDF)
            outputDF = outputDF.append(packageDF)
        #
        # write to BigQuery
        #print(outputDF)
        tableId = 'Sportsight_Packages.two_answers_all_V3'
        outputDF.to_gbq(tableId, if_exists='append')
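# Example usage sketch; 'MLB.TwoAnswers' is a placeholder content configuration code:
if __name__ == '__main__':
    packager = InsightsPackaging(root='.')
    packager.two_answers_package_generator('MLB.TwoAnswers')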