def test():
    from contendo_utils import BigqueryUtils
    import os
    os.chdir('/Users/ysherman/Documents/GitHub/results/trends')
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "{}/sportsight-tests.json".format(os.environ["HOME"])
    query = 'SELECT Code, Name, Sector, count(*) count FROM `sportsight-tests.Finance_Data.indices_company_list` left join unnest(Components) group by 1,2,3 having count>0 order by count desc, name'
    bqu = BigqueryUtils()
    gtrend = GoogleTrendImport()
    itemsDict = bqu.execute_query_to_dict(query)
    print('Getting {} items for finance'.format(itemsDict['nRows']))
    trendsDict = {'Finance': 7, 'Financial-Markets': 1163}
    for categoryName, category in trendsDict.items():
        filename = gtrend.get_trend_for_list(itemsDict['Rows'], 'Code', category, categoryName)
        datasetId = 'Trends_Data'
        bqu.create_dataset(datasetId)
        bqu.create_table_from_local_file(filename, datasetId, 'daily_trends', writeDisposition='WRITE_APPEND')
    return 'Done'
class InsightsGenerator:
    def __init__(self, root='.'):
        self.root = root
        self.icm = icm.InsightsConfigurationManager()
        self.queryDict = {}
        self.bqu = BigqueryUtils()
        self.TRUE = True

    def get_twoanswers_query(self, domain):
        queryKey = '{}.TwoAnswersQuery'.format(domain)
        if queryKey not in self.queryDict:
            statsPrepQuery = open(
                self.root + '/Queries/{StatsPrepQuery}'.format(**self.icm.domainsDict[domain]), 'r').read()
            twoAnswersQuestionQuery = open(
                self.root + '/Queries/{TwoAnswersQuestionQuery}'.format(**self.icm.domainsDict[domain]), 'r').read()
            self.queryDict[queryKey] = '{},\n{}\nSELECT * from twoQuestionsFinal'.format(
                statsPrepQuery, twoAnswersQuestionQuery)
        #print(self.queryDict[domain])
        return self.queryDict[queryKey]

    def get_lists_query(self, domain):
        queryKey = '{}.ListsQuery'.format(domain)
        if queryKey not in self.queryDict:
            listsQuery = open(
                self.root + '/Queries/{ListsQuery}'.format(**self.icm.domainsDict[domain]), 'r').read()
            self.queryDict[queryKey] = listsQuery
        #print(self.queryDict[domain])
        return self.queryDict[queryKey]

    def get_onelist_query(self, domain):
        queryKey = '{}.OneListQuery'.format(domain)
        if queryKey not in self.queryDict:
            listsQuery = open(
                self.root + '/Queries/{OneListQuery}'.format(**self.icm.domainsDict[domain]), 'r').read()
            self.queryDict[queryKey] = listsQuery
        #print(self.queryDict[domain])
        return self.queryDict[queryKey]

    def get_twoquestions_dataset_and_table(self, contentConfigCode):
        return 'temp', 'finance_questions_' + contentConfigCode

    def get_lists_dataset_and_table(self, contentConfigCode):
        return 'temp', 'finance_lists_' + contentConfigCode

    def trend_teams_filter(self, top=30, minTrend=0):
        query = 'SELECT TeamId, Trend FROM `sportsight-tests.Baseball1.teams_trend` where Trend>{} order by trend desc limit {}'.format(minTrend, top)
        teamsDF = self.bqu.execute_query_to_df(query)
        teamsList = list(teamsDF['TeamId'])
        inst = {}
        inst['teamIDs'] = str(teamsList).replace('[', '(').replace(']', ')')
        #return 'stat1.TeamCode in {teamIDs} or stat2.TeamCode in {teamIDs}'.format(**inst)
        return 'TeamCode in {teamIDs}'.format(**inst)

    def filter(self, cond):
        return cond

    def one_team_filter(self, teamCode):
        return '"{}" in (stat1.TeamCode, stat2.TeamCode)'.format(teamCode)

    def compare_teams_filter(self, team1, team2):
        return '"{}" in (stat1.TeamCode, stat2.TeamCode) AND "{}" in (stat1.TeamCode, stat2.TeamCode)'.format(team1, team2)

    def one_player_filter(self, playerCode):
        return '"{}" in (stat1.PlayerCode, stat2.PlayerCode)'.format(playerCode)

    def property_compare(self, property, value):
        return '{} = "{}"'.format(property, value)

    def marketcap_between(self, min, max):
        return 'MarketCap between {} and {}'.format(min, max)

    def condition(self, cond):
        return cond

    def calc_filter(self, filter):
        if filter == True:
            retFilter = filter
        else:
            try:
                execStr = 'self.' + filter
                retFilter = eval(execStr)
            except Exception as e:
                print("Error while evaluating '{}', error: {}".format(execStr, e))
                retFilter = True
        return retFilter

    def two_answers_generator(self, contentConfigCode):
        #
        # Save the insights configuration to BQ
        configTableId = self.icm.save_configuration_to_bigquery(contentConfigCode)
        #
        # read the query, configure and run it.
        instructions = self.icm.get_content_config(contentConfigCode)
        instructions['InsightsConfigurationTable'] = configTableId
        instructions['StatFilter'] = self.calc_filter(instructions['StatFilter'])
        instructions['QuestionsFilter'] = self.calc_filter(instructions['QuestionsFilter'])
        query = self.get_twoanswers_query(instructions['SportCode'])
        query = ProUtils.format_string(query, instructions)
        #print("Running query:\n" + query, flush=True)
        #
        # Execute the query.
        dataset_id, table_id = self.get_twoquestions_dataset_and_table(contentConfigCode)
        queryFile = 'results/queries/{}.sql'.format(table_id)
        f = open(queryFile, 'w')
        f.write(query)
        f.close()
        nQuestions = self.bqu.execute_query_with_schema_and_target(query, dataset_id, table_id)
        return nQuestions

    def lists_generator(self, contentConfigCode):
        #
        # Save the insights configuration to BQ
        configTableId = self.icm.save_configuration_to_bigquery(contentConfigCode)
        #
        # read the query, configure and run it.
        instructions = self.icm.get_content_config(contentConfigCode)
        instructions['InsightsConfigurationTable'] = configTableId
        instructions['StatFilter'] = self.calc_filter(instructions['StatFilter'])
        instructions['QuestionsFilter'] = self.calc_filter(instructions['QuestionsFilter'])
        query = self.get_lists_query(instructions['SportCode'])
        query = ProUtils.format_string(query, instructions)
        #print("Running query:\n" + query, flush=True)
        #
        # Execute the query.
        dataset_id, table_id = self.get_lists_dataset_and_table(contentConfigCode)
        queryFile = 'results/queries/{}.sql'.format(table_id)
        f = open(queryFile, 'w')
        f.write(query)
        f.close()
        nItems = self.bqu.execute_query_with_schema_and_target(query, dataset_id, table_id)
        return nItems
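# Example usage sketch; 'Finance.Lists.Demo' is a placeholder configuration code
# and must match an entry managed by InsightsConfigurationManager:
if __name__ == '__main__':
    generator = InsightsGenerator(root='.')
    numQuestions = generator.two_answers_generator('Finance.Lists.Demo')
    print('Generated {} two-answer questions'.format(numQuestions))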
class GetStockNews:
    def __init__(self):
        stocknews_apikey = 'oywghpku7talnwtde1k4h5eqonrgze6i1v6fzmcq'
        self.stocknews_url_template = "https://stocknewsapi.com/api/v1?tickers={ticker}&items={nitems}&date={fromdate_MMDDYYYY}-today&sortby={sortby}&token={stocknews_apikey}".replace(
            '{stocknews_apikey}', stocknews_apikey)
        self.bqu = BigqueryUtils()
        self.bucketName = 'sport-uploads'
        #nltk.download('punkt')  # 1 time download of the sentence tokenizer

    def get_stocknews_byticker(self, tickersList, nitems=50, daysback=30, sortby='trending'):
        assert (sortby in ['trending', 'algo'])
        tickers = str(tickersList).replace('[', '').replace(']', '').replace("'", '').replace(' ', '')
        urlInstructions = {
            'ticker': tickers,
            'nitems': nitems,
            'fromdate_MMDDYYYY': (date.today() - datetime.timedelta(days=daysback)).strftime('%m%d%Y'),
            'sortby': sortby,
            'today': date.today(),
        }
        outfileName = 'Finance/temp/{ticker}-{nitems}-{fromdate_MMDDYYYY}-{sortby}-{today}.json'.format(**urlInstructions)
        text = self.bqu.read_string_from_gcp(self.bucketName, outfileName)
        if text is None:
            url = self.stocknews_url_template.format(**urlInstructions)
            print(url)
            response = requests.request("GET", url)
            text = response.text
            self.bqu.upload_string_to_gcp(response.text, self.bucketName, outfileName)
        data = json.loads(text)
        newsDict = data['data']
        sentimentDict = {'Count': 0, 'Negative': 0, 'Positive': 0, 'Neutral': 0, 'Weighted': 0}
        sentimentWeight = {'Negative': -1, 'Positive': 1, 'Neutral': 0}
        count = 0
        newsFeed = []
        startTime = dt.utcnow()
        for newsItem in newsDict:
            count += 1
            newItem = {
                key: newsItem[key]
                for key in ['title', 'news_url', 'text', 'sentiment', 'source_name', 'topics']
            }
            newItem['index'] = count
            itemDate = dt.strptime(newsItem['date'], '%a, %d %b %Y %H:%M:%S %z')
            delta = startTime.date() - itemDate.date()
            if delta.days <= 3 or count <= 3:
                newItem['date'] = str(itemDate.date())
                if False:  # suspend getting the summary
                    article = Article(newItem['news_url'])
                    # Do some NLP
                    try:
                        article.download()  # Downloads the link’s HTML content
                        article.parse()  # Parse the article
                        article.nlp()  # Keyword extraction wrapper
                        newItem['Summary'] = article.summary.replace('\n', '\n')
                    except Exception as e:
                        print('Error occurred:', e)
                        newItem['Summary'] = "<...>"
                    #print(newItem['Summary'])
                newsFeed.append(newItem)
            if delta.days <= 3:
                deltaWeight = 1
            elif delta.days <= 7:
                deltaWeight = 0.5
            elif delta.days <= 14:
                deltaWeight = 0.25
            elif delta.days <= 30:
                deltaWeight = 0.125
            else:
                deltaWeight = 0.05
            sentiment = newsItem['sentiment']
            sentimentDict[sentiment] += 1
            sentimentDict['Count'] += 1
            sentimentDict['Weighted'] += sentimentWeight[sentiment] * deltaWeight
        retDict = {
            'NumItems': len(newsFeed),
            'Sentiment': sentimentDict,
            'Newsfeed': newsFeed,
        }
        return retDict
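# Example usage sketch; 'AAPL' is only a sample ticker:
if __name__ == '__main__':
    newsGetter = GetStockNews()
    newsSummary = newsGetter.get_stocknews_byticker(['AAPL'], nitems=10, daysback=7, sortby='algo')
    print(newsSummary['NumItems'], newsSummary['Sentiment'])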
class MsfImportMlb:
    def __init__(self):
        self.seasons = ['2019-regular', '2017-regular', '2017-playoff', '2018-regular', '2018-playoff']
        #self.seasons = ['2019-regular']
        apikey = '98de7b49-a696-4ed7-8efa-94b28a'
        self.msf = MySportsFeeds(version="2.0")
        self.msf.authenticate(apikey, "MYSPORTSFEEDS")
        self.bqu = BigqueryUtils()

    def get_seasonal_stats(self):
        start_time = dt.now()
        for feed in ['seasonal_games', 'seasonal_team_stats', 'seasonal_player_stats']:
            outfile_json = 'results/MLB/msf-mlb-{}-{}.json'.format(feed, dt.now().strftime('%Y%m%dT%H%M%S'))
            with open(outfile_json, 'w') as jsonfile:
                for season in self.seasons:
                    params = {
                        'league': 'mlb',
                        'season': season,
                        'feed': feed,
                        'format': 'json',
                    }
                    print('Starting msf {}-{}, delta-time: {}'.format(season, feed, dt.now() - start_time))
                    seasondata = self.msf.msf_get_data(**params)
                    outjson = json.dumps({'Season': season, 'Seasondata': seasondata})
                    jsonfile.write(outjson)
                    jsonfile.write('\n')
                    #delete_query = 'delete from `Baseball1.{}` where season="{}"'.format(feed, season)
                    #self.bqu.execute_query(delete_query)
                jsonfile.close()
            print('Starting upload of file {}, delta-time: {}'.format(outfile_json, dt.now() - start_time))
            uri = self.bqu.upload_file_to_gcp('sport-uploads', outfile_json, outfile_json)
            print('Starting table creation, delta-time: {}'.format(dt.now() - start_time))
            ret = self.bqu.create_table_from_gcp_file(uri, 'Baseball1', feed, 'WRITE_TRUNCATE')

    def get_game_days_stats(self):
        start_time = dt.now()
        for statObject in ['team', 'player']:
            feed = 'daily_{}_gamelogs'.format(statObject)
            #
            # get the missing game-days
            query = 'SELECT season,gameDay FROM `Baseball1.missing_{}_gamelogs` group by 1,2'.format(statObject)
            games_df = self.bqu.execute_query_to_df(query)
            print(games_df.shape)
            if (games_df.shape[0] == 0):
                return
            #
            # loop over missing game days
            for i, game in games_df.iterrows():
                #
                # open the main file.
                mainfile_name = 'results/MLB/msf-mlb-dayfeeds-{}-{}.json'.format(feed, dt.now().strftime('%Y%m%d'))
                mainfile = open(mainfile_name, 'w')
                params = {
                    'league': 'mlb',
                    'date': game['gameDay'],
                    'season': game['season'],
                    'feed': feed,
                    'format': 'json',
                }
                outfile_json = 'results/MLB/dayfeeds/msf-mlb-{feed}-{season}-{date}.json'.format(**params)
                if (not os.path.exists(outfile_json) or True):  # and (os.path.getsize(outfile_json)>0):
                    print('Getting msf #{}, {}, delta-time: {}'.format(i, outfile_json, dt.now() - start_time))
                    jsonfile = open(outfile_json, 'w')
                    #
                    # Getting the data from MySportsFeeds
                    try:
                        seasondata = self.msf.msf_get_data(**params)
                    except Exception as e:
                        print('msf_get_data returned with error {}'.format(e))
                        continue
                    except Warning as w:
                        print('msf_get_data returned with warning {}'.format(w))
                        continue
                    jsonfile.write(json.dumps(seasondata))
                    jsonfile.close()
                else:
                    print('Reading msf #{}, {}, delta-time: {}'.format(i, outfile_json, dt.now() - start_time))
                    #
                    # loading the JSON from already existing file.
                    try:
                        jsonfile = open(outfile_json, 'r')
                        seasondata = json.load(jsonfile)
                    except Exception as e:
                        print('Error loading JSON from file {}'.format(e))
                        continue
                dayfeed = {
                    'gamelogs': seasondata['gamelogs'],
                    'lastUpdatedOn': seasondata['lastUpdatedOn'],
                    'season': params['season']
                }
                mainfile.write(json.dumps(dayfeed) + '\n')
                mainfile.close()
                #
                # upload file and update table.
                try:
                    print('Starting upload of file {}, delta-time: {}'.format(outfile_json, dt.now() - start_time))
                    uri = self.bqu.upload_file_to_gcp('sport-uploads', mainfile_name, outfile_json + dt.now().strftime('.%Y%m%dT%H%M%S'))
                    print('Starting table creation from {}, delta-time: {}'.format(uri, dt.now() - start_time))
                    ret = self.bqu.create_table_from_gcp_file(uri, 'Baseball1', '{}_{}'.format(feed, game['gameDay']), 'WRITE_TRUNCATE')
                except Exception as e:
                    print('Error while uploading table {}'.format(e))
                    continue

    def get_game_pbp(self):
        start_time = dt.now()
        query = 'SELECT * FROM `sportsight-tests.Baseball1.missing_pbp_bydate`'
        datesDF = self.bqu.execute_query_to_df(query)
        print(datesDF.shape)
        if (datesDF.shape[0] == 0):
            return
        feed = 'game_playbyplay'
        pbpFilePattern = '/Users/ysherman/Documents/GitHub/results/MLB/pbp/msf-pbp-{}-{}.json'
        for i, dayGames in datesDF.iterrows():
            dayGames = dict(dayGames)
            games = dayGames['games']
            date = dayGames['date'].strftime('%Y-%m-%d')
            outfile_json = 'results/MLB/temp/{}-{}.json'.format(feed, date)
            jsonfile = open(outfile_json, 'w')
            for game in games:
                pbpFileName = pbpFilePattern.format(game['id'], date)
                print(pbpFileName)
                if (not os.path.exists(pbpFileName) or True):
                    params = {}
                    params['season'] = dayGames['season']
                    params['matchname'] = game['matchname']
                    params['game'] = game['id']
                    params['format'] = 'json'
                    params['league'] = 'mlb'
                    params['feed'] = feed
                    while True:
                        try:
                            print('Getting for day {}, game-id: {}, {}, season: {}, feed: {}, delta-time: {}'.format(
                                i, game['id'], game['matchname'], dayGames['season'], feed, dt.now() - start_time))
                            seasondata = self.msf.msf_get_data(**params)
                            break
                        except Exception as e:
                            print("Error: {}".format(e))
                        except Warning as w:
                            print("Error - Warning: {}".format(w))
                        except:
                            print("Unknown Error")
                        time.sleep(10)
                    outfile = open(pbpFileName, 'w')
                    outfile.write(json.dumps(seasondata))
                    outfile.close()
                else:
                    try:
                        print('Reading file {}'.format(pbpFileName))
                        pbpfile = open(pbpFileName, 'r')
                        seasondata = json.load(pbpfile)
                        pbpfile.close()
                    except Exception as e:
                        print('Error loading JSON from file {}'.format(e))
                        continue
                seasondata = self.pbp_to_bigqery_form(seasondata)
                seasondata['season'] = dayGames['season']
                seasondata['gameid'] = game['id']
                seasondata['gamename'] = game['matchname']
                outjson = json.dumps(seasondata)
                jsonfile.write(outjson)
                jsonfile.write('\n')
            #
            # uploading file for the day
            jsonfile.close()
            try:
                print('Starting upload of file {}, delta-time: {}'.format(outfile_json, dt.now() - start_time))
                uri = self.bqu.upload_file_to_gcp('sport-uploads', outfile_json, outfile_json)
                print('Starting table creation, delta-time: {}'.format(dt.now() - start_time))
                ret = self.bqu.create_table_from_gcp_file(uri, 'Baseball1', '{}_{}'.format(feed, date.replace('-', '')), 'WRITE_TRUNCATE')
            except Exception as e:
                print('Error while uploading table {}'.format(e))
                continue

    def pbp_atbatsubplay_new(self, atBatSubPlay):
        newAtBatSubPlay = []
        for key, value in atBatSubPlay.items():
            if value is None:
                continue
                #newAtBatSubPlay.append({'key': key, 'value': '', 'type': 'NULL'})
            elif key in ['retrievedAtLocation', 'pitchedLocation']:
                newAtBatSubPlay.append({'key': key + '-x', 'value': str(value['x']), 'type': 'INTEGER'})
                newAtBatSubPlay.append({'key': key + '-y', 'value': str(value['y']), 'type': 'INTEGER'})
            elif type(value) == dict:
                newAtBatSubPlay.append({'key': key, 'value': str(value['id']), 'type': 'PlayerId'})
            else:
                newAtBatSubPlay.append({'key': key, 'value': str(value), 'type': type(value).__name__})
        return newAtBatSubPlay

    def pbp_atbatplaystatus_new(self, atBatPlayStatus):
        playerRoles = ['batter', 'catcher', 'centerFielder', 'firstBaseman', 'firstBaseRunner', 'leftFielder',
                       'outFielder', 'pitcher', 'rightFielder', 'secondBaseman', 'secondBaseRunner', 'shortStop',
                       'thirdBaseman', 'thirdBaseRunner']
        runnerRoles = ['firstBaseRunner', 'secondBaseRunner', 'thirdBaseRunner']
        nRunners = 0
        newAtBatPlayStatus = {}
        for key, value in atBatPlayStatus.items():
            if key in runnerRoles:
                if value is not None:
                    nRunners += 1
            if value is None:
                value = {'id': -1}
            if key in playerRoles:
                newAtBatPlayStatus[key] = value['id']
            else:
                newAtBatPlayStatus[key] = value
        newAtBatPlayStatus['numRunners'] = nRunners
        return newAtBatPlayStatus

    def pbp_to_bigqery_form(self, pbpDict):
        newAtBats = []
        atBatCounter = 0
        playCounter = 0
        for atBat in pbpDict['atBats']:
            newAtBatPlays = []
            atBatPlayCounter = 0
            for atBatPlay in atBat['atBatPlay']:
                newAtBatPlay = {}
                try:
                    if type(atBatPlay) != dict:
                        continue
                    for key, value in atBatPlay.items():
                        if key == 'description':
                            newAtBatPlay[key] = value
                        elif key == 'playStatus':
                            newAtBatPlay[key] = self.pbp_atbatplaystatus_new(value)
                        else:
                            newAtBatPlay['atBatSubPlay'] = {'name': key, 'properties': self.pbp_atbatsubplay_new(value)}
                except Exception as e:
                    print('Error {} with atBatCounter = {}, atBatPlayCounter={}, key={}, atbatsubplay={}, atBatPlay={}'.format(
                        e, atBatCounter, atBatPlayCounter, key, value, atBatPlay))
                atBatPlayCounter += 1
                playCounter += 1
                newAtBatPlay['index'] = atBatPlayCounter
                newAtBatPlay['playindex'] = playCounter
                newAtBatPlays.append(newAtBatPlay)
            atBatCounter += 1
            atBat['index'] = atBatCounter
            atBat['atBatPlay'] = newAtBatPlays
            newAtBats.append(atBat)
        pbpDict['atBats'] = newAtBats
        return pbpDict
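# Example usage sketch driving the three MLB import entry points in sequence:
if __name__ == '__main__':
    importer = MsfImportMlb()
    importer.get_seasonal_stats()
    importer.get_game_days_stats()
    importer.get_game_pbp()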
class EODHistoricalDataImport:
    def __init__(self):
        #ProducerConsumersEngine.__init__(self, self.import_daily_quotes)
        self.bqu = BigqueryUtils()
        self.main_dir = '/Users/ysherman/Documents/GitHub/results/Finance/EODHistoricalData/daily-quotes/'
        #self.AV_api_key = 'NN4P0527XD25VT1Q'
        #self.WTD_apikey = 'kDxr9tfB8fYVUV0wnkNzZN4W3IQZO48hLOpFKJ2NIiHbHSgKsTyMt4jzW3Cm'
        self.EODHD_apikey = '5d5d1d7259ef23.41685254'
        #self.alpha_url = 'https://www.alphavantage.co/query?function={AnalyticFunction}&symbol={Symbol}&outputsize=compact&apikey={api_key}&datatype=csv'.replace('{api_key}', api_key)
        self.EOD_Symbols_Query = "SELECT Code, Exchange, Type FROM `sportsight-tests.Finance_Data.eod_exchange_symbols_list` where Exchange in ('COMM', 'NYSE', 'NASDAQ', 'INDX') order by exchange desc"
        self.FUNDAMENTALS_Query = "SELECT Code, Exchange, Type FROM `sportsight-tests.Finance_Data.eod_exchange_symbols_list` where Exchange in ('NYSE', 'NASDAQ') AND type='Common Stock' order by exchange desc"
        self.EOD_DAYBULK_URL = 'https://eodhistoricaldata.com/api/eod-bulk-last-day/{}?api_token=5d5d1d7259ef23.41685254&filter=extended&date={}'

    def get_eod_daily_bulk(self, startTime):
        csv_dir = self.main_dir + 'dated-files/'
        if not os.path.exists(csv_dir):
            os.mkdir(csv_dir)
        #datesPD = self.bqu.execute_query_to_df("SELECT distinct format_date('%Y-%m-%d', timestamp) as Date FROM `sportsight-tests.Finance_Data.daily_stock_history_*` where timestamp<=parse_date('%x', '09/25/18') order by Date desc limit 230")
        #datesList = list(datesPD['Date'])
        datesList = ['2019-09-03']  #list(datesPD['Date'])
        for stockDate in datesList:
            dailyDF = pd.DataFrame()
            for exchange in ['COMM', 'INDX', 'NASDAQ', 'NYSE']:
                url = self.EOD_DAYBULK_URL.format(exchange, stockDate)
                print(url, dt.now() - startTime)
                try:
                    stockDF = pd.read_csv(url).fillna(0)[[
                        'Date', 'Open', 'High', 'Low', 'Close', 'Adjusted_close', 'Volume', 'Code'
                    ]][:-1]
                    #print(stockDF.shape, stockDF.columns)
                    stockDF.rename(columns={'Code': 'Symbol'}, inplace=True)
                    stockDF['Exchange'] = exchange
                    #dtd = dt.strptime(stockDate, '%Y-%m-%d')
                    #print(dtd, type(dtd))
                    #finalDate = dtd.date()
                    #print(finalDate, type(finalDate))
                    #builtDate = dt.date(dtd.year, dtd.month, dtd.day)
                    #print(builtDate, type(builtDate))
                    #stockDF['Date'] = date(2019,8,25)
                    stockDF['Volume'] = stockDF['Volume'].astype(int)
                    stockDF.to_csv(csv_dir + '{}-{}.csv'.format(stockDate, exchange), index=False)
                    dailyDF = dailyDF.append(stockDF)
                except Exception as e:
                    print("Error {}".format(e))
                #break
            #tableId = 'Finance_Data.eod_history_data_{}'.format(date.replace('-', ''))
            if dailyDF.shape[0] > 0:
                datasetId = 'Finance_Data'
                tableId = 'eod_daily_history_1year'
                delQuery = "delete from `{}.{}` where Date=PARSE_DATE('%Y-%m-%d', '{}')".format(datasetId, tableId, stockDate)
                #print(delQuery)
                #print(schema)
                self.bqu.execute_query(delQuery)
                print('Writing table {}, size {}, delta time {}'.format(tableId, dailyDF.shape, dt.now() - startTime))
                schema = self.bqu.get_table_schema(datasetId, tableId)
                dailyDF.to_gbq('{}.{}'.format(datasetId, tableId), table_schema=schema, if_exists='append')
            #break
        print('Done', dt.now() - startTime)

    def get_fundamentals_data(self, startTime):
        fundamentals_dir = self.main_dir + 'fundamentals/'
        if not os.path.exists(fundamentals_dir):
            os.mkdir(fundamentals_dir)
        jsonFileName = 'fundamentals-{}.json'.format(dt.now().strftime('%Y-%m-%dT%H%M%S'))
        outfileName = fundamentals_dir + jsonFileName
        outfile = open(outfileName, 'w')
        stocksDict = self.bqu.execute_query_to_dict(self.FUNDAMENTALS_Query)
        print('Getting {} stocks data'.format(stocksDict['nRows']))
        count = 0
        successCount = 0
        for stock in stocksDict['Rows']:
            count += 1
            print(count, stock, dt.now() - startTime)
            try:
                retDict = eod.get_fundamental_data(stock['Code'], 'US', api_key=self.EODHD_apikey)
                relevantKeys = ['General', 'Highlights', 'Valuation', 'SharesStats']
                stockData = {x: retDict[x] for x in relevantKeys if x in retDict}
                stockData['Time'] = dt.now().strftime('%Y-%m-%dT%H:%M:%S')
                technicalsDict = {}
                for key, value in retDict['Technicals'].items():
                    if key[0] in '0123456789':
                        technicalsDict['T' + key] = value
                    else:
                        technicalsDict[key] = value
                stockData['Technicals'] = technicalsDict
                json.dump(stockData, outfile)
                outfile.write('\n')
                successCount += 1
            except Exception as e:
                print("Error {}".format(e))
                #break
            #break
        outfile.close()
        if successCount > 0:
            datasetId = 'Finance_Data'
            tableId = 'fundamentals_daily_{}'.format(startTime.strftime('%Y%m%d'))
            self.bqu.create_dataset(datasetId)
            uri = self.bqu.upload_file_to_gcp('sport-uploads', outfileName, 'Finance/EOD/Fundamentals/{}'.format(jsonFileName))
            ret = self.bqu.create_table_from_gcp_file(uri, datasetId, tableId, 'WRITE_TRUNCATE')
        print('Done', successCount, dt.now() - startTime)

    def get_eod_quote(self, comp, startTime):
        stockCode = '{Code}-{Exchange}-{Type}'.format(**comp)
        csv_file = comp['CSVDir'] + '{}.csv'.format(stockCode)
        if (not os.path.exists(csv_file)):  # and (os.path.getsize(outfile_json)>0):
            print('{}. Getting {}, delta time: {}'.format(comp['i'] + 1, stockCode, dt.now() - startTime))
            try:
                symbol = comp['Code']
                exchange = comp['Exchange']
                if exchange in ['NYSE', 'NASDAQ']:
                    exchange = 'US'
                svDF = eod.get_eod_data(symbol, exchange, api_key=self.EODHD_apikey)
                print(svDF.shape)
                svDF['Symbol'] = comp['Code']
                svDF['Exchange'] = comp['Exchange']
                # svDF['Date'] = svDF.index
                svDF.to_csv(csv_file)
                return True
            except Exception as e:
                print('Error {}, Stock: {}'.format(e, stockCode))
                return False

    def import_indices_fundamentals(self):
        outfileName = '{}/tmp/indices.json'.format(os.environ['HOME'])
        outfile = open(outfileName, 'w')
        for index in ['DJI', 'GSPC']:
            indexData = eod.get_fundamental_data(index, 'INDX', self.EODHD_apikey)
            newIndex = {}
            newIndex['General'] = indexData['General']
            complist = []
            for key, value in indexData['Components'].items():
                component = value
                component['Index'] = int(key)
                complist.append(component)
            newIndex['Components'] = complist
            newIndex['NumComponents'] = len(complist)
            json.dump(newIndex, outfile)
            outfile.write('\n')
        outfile.close()
        self.bqu.create_table_from_local_file(outfileName, 'Finance_Data', 'indices_company_list')

    def import_daily_quotes(self, configurations, startTime):
        print("Starting import_daily_quotes")
        comp_df = self.bqu.execute_query_to_df(self.EOD_Symbols_Query)
        csv_dir = self.main_dir + '{}/'.format(startTime.strftime('%Y-%m-%d'))
        if not os.path.exists(csv_dir):
            os.mkdir(csv_dir)
        print('getting {} companies, delta time: {} '.format(comp_df.shape[0], dt.now() - startTime))
        for i, comp in comp_df.iterrows():
            comp['CSVDir'] = csv_dir
            comp['i'] = i
            ret = self.get_eod_quote(comp, startTime)
            if ret:
                continue
            # the producer/consumer path below is disabled by the unconditional continue
            continue
            jobData = self.JobData(self.get_eod_quote, dict(comp))
            print(jobData.instructions)
            try:
                continue
                self.jobsQueue.put(jobData)
            except Exception as e:
                print("Error {} in queue.put".format(e))
                break
            break

    def create_dated_quote_files(self, dirdate):
        os.chdir(self.main_dir)
        extension = 'csv'
        all_filenames = [i for i in glob.glob('{}/*.{}'.format(dirdate, extension))]
        print(len(all_filenames))
        outfiles = {}
        mydialect = csv.Dialect
        mydialect.lineterminator = '\n'
        mydialect.quoting = csv.QUOTE_MINIMAL
        mydialect.quotechar = '|'
        count = 0
        main_start_time = dt.now()
        for csvFileName in all_filenames:
            infile = open(csvFileName, 'r')
            linereader = csv.reader(infile, delimiter=',')
            firstrow = True
            for line in linereader:
                if firstrow:
                    firstrow = False
                    topLine = line
                    continue
                date = line[1]
                if date not in outfiles:
                    outfile = open('dated-files/{}.csv'.format(date), 'w')
                    outFileWriter = csv.writer(outfile, delimiter=',', dialect=mydialect)
                    outFileWriter.writerow(topLine[1:])
                    outfiles[date] = {'outfile': outfile, 'outFileWriter': outFileWriter}
                else:
                    outFileWriter = outfiles[date]['outFileWriter']
                outFileWriter.writerow(line[1:])
                #break
            count += 1
            infile.close()
            if count % 1000 == 0:
                print('Done reading: {} of {} Total-delta: {}\n'.format(count, len(all_filenames), dt.now() - main_start_time))

    def upload_dated_quote_files(self, startdate):
        os.chdir(self.main_dir)
        extension = 'csv'
        all_filenames = [i for i in glob.glob('dated-files/2019-08*.{}'.format(extension))]
        print(all_filenames)
        bqu = BigqueryUtils()
        main_start_time = dt.now()
        count = 0
        for csvFileName in all_filenames:
            datadate = csvFileName.split('/')[1].split('.')[0].replace('-', '')
            if (datadate < startdate.replace('-', '')):
                continue
            tableId = 'daily_stock_history_{}'.format(datadate)
            print(tableId, dt.now() - main_start_time)
            csvFile = open(csvFileName, 'rb')
            bqu.create_table_from_local_file(csvFile, 'Finance_Data', tableId)
            csvFile.close()
            count += 1
            if count % 20 == 0:
                print('Done reading: {} of {} Total-delta: {}\n'.format(count, len(all_filenames), dt.now() - main_start_time))
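# Example usage sketch for a daily end-of-day run:
if __name__ == '__main__':
    runStart = dt.now()
    eodImport = EODHistoricalDataImport()
    eodImport.get_eod_daily_bulk(runStart)
    eodImport.get_fundamentals_data(runStart)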
class SimpleStatsGenerator():
    #
    # read in the configurations
    def __init__(self, root):
        #
        # get the initial configuration
        self.root = root
        self.configsheet_url = 'https://docs.google.com/spreadsheets/d/1gwtQlzk0iA4qyLzqaYEk5SggOqNZtJnSSfwnZYDNlAw/export?format=csv&gid={SheetId}&run=1'
        sourceConfigDF = pd.read_csv(self.configsheet_url.replace('{SheetId}', '284194018')).fillna('')
        sourceConfigDF['enriched'] = False
        self.sourcesConfigDict = ProUtils.pandas_df_to_dict(sourceConfigDF, 'Configname')
        self.sport_configs = {}
        self.TRUE = True
        #
        # read IMDB title definitions
        titleTypesDF = pd.read_csv(self.configsheet_url.replace('{SheetId}', '1802180540')).fillna('')
        self.titletypesConfigDict = ProUtils.pandas_df_to_dict(titleTypesDF, 'TitleType')
        #print(sourceConfig)
        self.consumerStatus = multiprocessing.Queue()
        self.sentinel = 'Done'
        self.bqUtils = BigqueryUtils()

    def get_source_configuration(self, configName):
        sourceConfig = self.sourcesConfigDict[configName]
        if sourceConfig['DoIT'] != 'y' or sourceConfig['enriched'] == True:
            return sourceConfig
        sheetId = sourceConfig['SportSheetId']
        #
        # read all relevant metrics
        if sheetId not in self.sport_configs.keys():
            self.sport_configs[sheetId] = pd.read_csv(
                self.configsheet_url.replace('{SheetId}', str(sourceConfig['SportSheetId']))).fillna('')
            self.sport_configs[sheetId]['SportCode'] = sourceConfig['SportCode']
        sourceConfig['StatsDefDict'] = ProUtils.pandas_df_to_dict(self.sport_configs[sheetId], 'StatName')
        if 'query' not in sourceConfig.keys():
            sourceConfig['query'] = open(self.root + '/Queries/' + sourceConfig['QueryFile'], 'r').read()
        sourceConfig['enriched'] = True
        self.sourcesConfigDict[configName] = sourceConfig
        return sourceConfig

    def queryExecutor(self, i, query_jobs):
        #
        # execute a list of query jobs
        #print('Start executor %d' % i)
        startTime = dt.now()
        for queryJob in iter(query_jobs.get, self.sentinel):
            #
            # to enforce the schema is correct, we first copy the empty table from the schema template
            # and then append the result to this empty table
            try:
                nRows = self.bqUtils.execute_query_with_schema_and_target(**queryJob['params'])
                print(
                    'Returned for Statname: {} ({} rows), StatObject: {}, StatTimeframe: {}, Deltatime: {}'.format(
                        queryJob['StatName'], nRows, queryJob['StatObject'], queryJob['StatTimeframe'],
                        dt.now() - startTime),
                    flush=True)
                query_jobs.task_done()
                queryFile = 'results/queries/{}.sql'.format(queryJob['params']['targetTable'])
                f = open(queryFile, 'w')
                f.write(queryJob['params']['query'])
                f.close()
            except Exception as e:
                queryFile = 'errors/{}.sql'.format(queryJob['params']['targetTable'])
                f = open(queryFile, 'w')
                f.write(queryJob['params']['query'])
                f.close()
                # print(queryJob['query'], flush=True)
                print(
                    'Error {} with Statname: {}, StatObject: {}, StatTimeframe: {}'.format(
                        e, queryJob['StatName'], queryJob['StatObject'], queryJob['StatTimeframe']),
                    flush=True)
        #print('Consumer {} terminates, Deltatime: {}'.format(str(i), dt.now() - startTime), flush=True)

    def queriesGenerator(self, queriesQueue, numExecutors, configurations=[]):
        startTime = dt.now()
        #
        # Make sure the target dataset exists
        # (targetDataset and targetTableFormat are assumed to be module-level settings defined elsewhere)
        self.bqUtils.create_dataset(targetDataset)
        #
        # if there are only partial list of configurations
        if len(configurations) == 0:
            configurations = self.sourcesConfigDict.keys()
        #
        # loop over all configurations and generate
        #print(configurations)
        for sourceConfigName in configurations:
            #
            # get the source configuration
            sourceConfig = self.get_source_configuration(sourceConfigName)
            #
            # make sure it is required.
            if sourceConfig['DoIT'] != 'y':
                continue
            #
            # call the relevant generation function.
            print("running configuration {}".format(sourceConfigName))
            generatorFunc = eval('self.{}'.format(sourceConfig['generatorFunc']))
            generatorFunc(queriesQueue, sourceConfig, startTime)
        #
        # Set the sentinel for all processes.
        for i in range(numExecutors):
            queriesQueue.put(self.sentinel)  # indicate sentinel

    def financeQueriesGenerator(self, queriesQueue, sourceConfig, startTime):
        #
        # target table definitions
        financeTableFormat = 'Stat_Finance_{StatSource}_{StatName}_{StatObject}_Rolling_{RollingDays}'
        financeStatsDataset = 'Finance_Stats'
        self.bqUtils.create_dataset(financeStatsDataset)
        #
        # create jobs for all relevant metrics.
        for statDef in sourceConfig['StatsDefDict'].values():
            if statDef['Doit'] != 'y':
                continue
            #print('Metric: {}, Sport:{}, Delta time: {}'.format(statDef['StatName'], statDef['SportCode'], dt.now() - startTime), flush=True)
            for statObject in statDef['StatObject'].split(',')[:1]:
                for rollingDays in statDef['RollingDaysList'].split(','):
                    _statDef = statDef.copy()
                    _statDef['StatObject'] = statObject
                    rollingDaysInst = {'RollingDays': rollingDays}
                    query = sourceConfig['query']
                    query = ProUtils.format_string(query, _statDef)
                    query = ProUtils.format_string(query, sourceConfig)
                    query = ProUtils.format_string(query, rollingDaysInst)
                    #print(query)
                    #
                    # define the destination table
                    instructions = _statDef
                    instructions['StatTimeframe'] = sourceConfig['StatTimeframe']
                    instructions['StatSource'] = sourceConfig['StatSource']
                    instructions['RollingDays'] = rollingDays
                    targetTable = ProUtils.format_string(financeTableFormat, instructions).replace('.', '_').replace('-', '_')
                    jobDefinition = {
                        'params': {
                            'query': query,
                            'targetDataset': financeStatsDataset,
                            'targetTable': targetTable,
                        },
                        'StatName': _statDef['StatName'],
                        'StatObject': statObject,
                        'StatTimeframe': '{}_Rollingdays'.format(rollingDays)
                    }
                    queriesQueue.put(jobDefinition)

    def imdbQueriesGenerator(self, queriesQueue, sourceConfig, startTime):
        #
        # create jobs for all relevant metrics.
        for statDef in sourceConfig['StatsDefDict'].values():
            if statDef['Doit'] != 'y':
                continue
            #print('Metric: {}, Sport:{}, Delta time: {}'.format(statDef['StatName'], statDef['SportCode'], dt.now() - startTime), flush=True)
            for titleType in statDef['TitleType'].split(','):
                titletypeConfig = self.titletypesConfigDict[titleType]
                if statDef['Genres'] == 'y':
                    genresList = titletypeConfig['GenresList'].split(',')
                else:
                    genresList = ['All']
                for genre in genresList:
                    _statDef = statDef.copy()
                    query = sourceConfig['query']
                    if genre == 'All':
                        _statDef['StatCondition'] = ''
                    else:
                        _statDef['StatCondition'] = 'AND STRPOS(Genres, "{}")>0'.format(genre)
                        _statDef['StatName'] = '{}.{}'.format(statDef['StatName'], genre)
                    _statDef['TitleType'] = titleType
                    _statDef['Genre'] = genre
                    _statDef['StatObject'] = titleType
                    query = ProUtils.format_string(query, _statDef)
                    query = ProUtils.format_string(query, sourceConfig)
                    query = ProUtils.format_string(query, titletypeConfig)
                    #print(query)
                    #
                    # define the destination table
                    instructions = _statDef
                    instructions['StatTimeframe'] = sourceConfig['StatTimeframe']
                    instructions['StatSource'] = sourceConfig['StatSource']
                    targetTable = ProUtils.format_string(targetTableFormat, instructions).replace('.', '_').replace('-', '_')
                    jobDefinition = {
                        'params': {
                            'query': query,
                            'targetDataset': targetDataset,
                            'targetTable': targetTable,
                        },
                        'StatName': _statDef['StatName'],
                        'StatObject': titleType,
                        'StatTimeframe': sourceConfig['StatTimeframe']
                    }
                    queriesQueue.put(jobDefinition)

    def imdbQuestionsDefGenerator(self):
        #
        # create jobs for all relevant metrics.
        questionsList = []
        sourceConfig = self.get_source_configuration('Entertainmant.IMDB')
        for statDef in sourceConfig['StatsDefDict'].values():
            for titleType in statDef['TitleType'].split(','):
                titletypeConfig = self.titletypesConfigDict[titleType]
                if statDef['Genres'] == 'y':
                    genresList = titletypeConfig['GenresList'].split(',')
                else:
                    genresList = ['All']
                for genre in genresList:
                    questionDef = {}
                    questionDef['QuestionCode'] = '{}.{}'.format(titleType, statDef['StatName'])
                    questionDef['StatName'] = statDef['StatName']
                    questionDef['StatObject'] = titleType
                    questionDef['Genre'] = ''
                    questionDef['TitleType'] = titleType
                    questionDef['Level'] = 'Easy'
                    questionDef['Value1Template'] = statDef['Value1Template']
                    questionDef['Value2Template'] = statDef['Value2Template']
                    questionDef['ObjectDisplayName'] = titletypeConfig['ObjectDisplayName']
                    questionDef['QuestionNObjects'] = ''
                    if genre != 'All':
                        questionDef['QuestionCode'] = '{}.{}'.format(questionDef['QuestionCode'], genre)
                        questionDef['StatName'] = '{}.{}'.format(questionDef['StatName'], genre)
                        questionDef['Genre'] = genre + ' '
                    questionDef['Question2Objects'] = ProUtils.format_string(statDef['Question2Objects'], questionDef)
                    questionsList.append(questionDef)
        keys = [
            'QuestionCode', 'StatName', 'Genre', 'Level', 'ObjectDisplayName', 'Question2Objects',
            'QuestionNObjects', 'StatObject', 'TitleType', 'Value1Template', 'Value2Template'
        ]
        questionsDF = pd.DataFrame(questionsList, columns=keys)
        questionsDF.to_csv('imdb_questionsList.csv')

    def days_range(self, interval, prev):
        instructions = {}
        startDate = (dt.today() - timedelta(days=interval + prev - 1))
        endDate = (dt.today() - timedelta(days=prev))
        condTemplate = '{DateProperty} BETWEEN "{StartDate}" and "{EndDate}"'
        condInst = {
            'StartDate': startDate.strftime('%Y%m%d'),
            'EndDate': endDate.strftime('%Y%m%d')
        }
        instructions['StatCondition'] = ProUtils.format_string(condTemplate, condInst)
        instructions['DaysRange'] = '{}...{}'.format(startDate.strftime('%Y-%m-%d'), endDate.strftime('%Y-%m-%d'))
        return instructions

    def games_days_range(self, interval, prev):
        instructions = {}
        startDate = (dt.today() - timedelta(days=interval + prev - 1))
        endDate = (dt.today() - timedelta(days=prev))
        condTemplate = '{DateProperty} BETWEEN "{StartDate}" and "{EndDate}"'
        condInst = {
            'StartDate': startDate.strftime('%Y%m%d'),
            'EndDate': endDate.strftime('%Y%m%d')
        }
        instructions['StatCondition'] = ProUtils.format_string(condTemplate, condInst)
        instructions['DaysRange'] = 'N/A'
        return instructions

    def sportsQueriesGenerator(self, queriesQueue, sourceConfig, startTime):
        #
        # create jobs for all relevant metrics.
        for statDef in sourceConfig['StatsDefDict'].values():
            if statDef['Doit'] != 'y':
                continue
            #print('Metric: {}, Sport:{}, Delta time: {}'.format(statDef['StatName'], statDef['SportCode'], dt.now() - startTime), flush=True)
            sourceDefinitions = definitions[sourceConfig['StatSource']]
            for statObject in statDef['StatObject'].split(','):
                for statTimeframe in sourceConfig['StatTimeframe'].split(','):
                    query = sourceConfig['query']
                    query = query.replace('{StatObject}', statObject)
                    query = query.replace('{StatTimeframe}', statTimeframe)
                    if sourceConfig['StatCondition'] != '':
                        query = ProUtils.format_string(query, eval("self." + sourceConfig['StatCondition']))
                    else:
                        query = ProUtils.format_string(query, {'StatCondition': True})
                    query = ProUtils.format_string(query, sourceDefinitions['StatObject'][statObject])
                    query = ProUtils.format_string(query, statDef)
                    query = ProUtils.format_string(query, sourceConfig)
                    #print(query)
                    #
                    # define the destination table
                    instructions = statDef
                    instructions['StatObject'] = statObject
                    instructions['StatTimeframe'] = statTimeframe
                    instructions['StatSource'] = sourceConfig['StatSource']
                    targetTable = ProUtils.format_string(targetTableFormat, instructions).replace('.', '_')
                    jobDefinition = {
                        'params': {
                            'query': query,
                            'targetDataset': targetDataset,
                            'targetTable': targetTable,
                        },
                        'StatName': statDef['StatName'],
                        'StatObject': statObject,
                        'StatTimeframe': statTimeframe
                    }
                    queriesQueue.put(jobDefinition)

    def complexQueriesGenerator(self, queriesQueue, sourceConfig, startTime):
        #
        # create jobs for all relevant metrics.
        for statDef in sourceConfig['StatsDefDict'].values():
            if statDef['Doit'] != 'y':
                continue
            #print('Metric: {}, Sport:{}, Delta time: {}'.format(statDef['StatName'], statDef['SportCode'], dt.now() - startTime), flush=True)
            inst = {}
            inst['StatTimeframes'] = ProUtils.commastring_to_liststring(statDef['StatTimeframes'])
            inst['StatObjects'] = ProUtils.commastring_to_liststring(statDef['StatObjects'])
            inst['NumeratorStatNames'] = ProUtils.commastring_to_liststring(statDef['NumeratorStatNames'])
            inst['DenominatorStatNames'] = ProUtils.commastring_to_liststring(statDef['DenominatorStatNames'])
            query = sourceConfig['query']
            query = ProUtils.format_string(query, inst)
            query = ProUtils.format_string(query, statDef)
            query = ProUtils.format_string(query, sourceConfig)
            #print(query)
            #
            # define the destination table
            instructions = statDef
            instructions['StatObject'] = statDef['StatObjects'].replace(',', '_')
            instructions['StatTimeframe'] = statDef['StatTimeframes'].replace(',', '_')
            instructions['StatSource'] = sourceConfig['StatSource']
            targetTable = ProUtils.format_string(targetTableFormat, instructions).replace('.', '_')
            jobDefinition = {
                'params': {
                    'query': query,
                    'targetDataset': targetDataset,
                    'targetTable': targetTable,
                },
                'StatName': statDef['StatName'],
                'StatObject': instructions['StatObject'],
                'StatTimeframe': instructions['StatTimeframe']
            }
            queriesQueue.put(jobDefinition)

    def run(self, configurations=[], numExecutors=0):
        #
        # main method
        startTime = dt.now()
        queriesQueue = multiprocessing.JoinableQueue()  # start a joinable queue to pass messages
        if numExecutors == 0:
            numExecutors = multiprocessing.cpu_count() * 8
        producer = multiprocessing.Process(
            name='QueriesGenerator',
            target=self.queriesGenerator,
            args=(queriesQueue, numExecutors,),
            kwargs={'configurations': configurations})
        producer.start()
        queriesQueue.join()
        #
        # initiate consumers
        # consumer will execute the job
        consumers = [
            multiprocessing.Process(name='QueriesExecutor', target=self.queryExecutor, args=(i, queriesQueue,))
            for i in range(numExecutors)
        ]
        for c in consumers:
            c.start()
        while True:
            if any(c.is_alive() for c in consumers):
                time.sleep(1)
            else:
                print('Done')
                break
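# Example usage sketch; 'Finance.EOD' is a placeholder configuration name that
# must exist in the source-configuration sheet:
if __name__ == '__main__':
    statsGenerator = SimpleStatsGenerator(root='.')
    statsGenerator.run(configurations=['Finance.EOD'])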
class GetStocksData:
    def __init__(self):
        self.bqu = BigqueryUtils()
        self.companiesDF = None
        self.stocksDF = None
        self.resourceDir = 'resource'
        self.companiesDataFileName = '{}/companies_{}.csv'.format(self.resourceDir, date.today())
        self.stocksDataFileName = '{}/stocks_{}.csv'.format(self.resourceDir, date.today())
        self.companiesURL = 'gs://sport-uploads/Finance/companies_fundamentals.csv'
        self.stocksURL = 'gs://sport-uploads/Finance/eod_stocks_data.csv'

    def get_stockdata_by_dates(self, stocklist, from_date, to_date):
        #
        # get updated company data
        if self.stocksDF is None:
            if os.path.exists(self.stocksDataFileName):
                self.stocksDF = pd.read_csv(self.stocksDataFileName)
            else:
                stocksQuery = """SELECT * FROM `sportsight-tests.Finance_Data.eod_daily_history_1year` order by Symbol, Date"""
                self.stocksDF = self.bqu.execute_query_to_df(stocksQuery)
                if not os.path.exists(self.resourceDir):
                    os.mkdir(self.resourceDir)
                self.stocksDF.to_csv(self.stocksDataFileName)
                #url = self.bqu.upload_file_to_gcp('sport-uploads', self.stocksDataFileName, self.stocksURL.replace('gs://sport-uploads/', ''))
        if len(stocklist) > 0:
            symbol_condition = 'Symbol in {tickersString} and '.format(tickersString=str(stocklist))
        else:
            symbol_condition = ''
        stocksQuery = '{symbol_condition} Date >= "{from_date}" and Date <= "{to_date}"'.format(
            symbol_condition=symbol_condition, from_date=from_date, to_date=to_date)
        stockDataDF = self.stocksDF.query(stocksQuery)
        stockDataDF.index = pd.to_datetime(stockDataDF['Date'])
        stockDataDF.rename_axis("date", axis='index', inplace=True)
        return stockDataDF

    # get list of stock action x days to date
    def get_stockdata_by_cal_days(self, stocklist, numdays, to_date):
        from_date = to_date - datetime.timedelta(days=numdays - 1)
        return self.get_stockdata_by_dates(stocklist, from_date, to_date)

    def get_stock_fundamentals(self, stocklist=[], index=None):
        #
        # get updated company data
        if self.companiesDF is None:
            if os.path.exists(self.companiesDataFileName):
                self.companiesDF = pd.read_csv(self.companiesDataFileName)
            else:
                companiesQuery = """SELECT * FROM `sportsight-tests.Finance_Data.all_company_data` WHERE MarketCapitalizationMln > 1000"""
                self.companiesDF = self.bqu.execute_query_to_df(companiesQuery, fillna=0)
                if not os.path.exists(self.resourceDir):
                    os.mkdir(self.resourceDir)
                self.companiesDF.to_csv(self.companiesDataFileName)
                #url = self.bqu.upload_file_to_gcp('sport-uploads', self.companiesDataFileName, self.companiesURL.replace('gs://sport-uploads/', ''))
        if len(stocklist) > 0:
            where_condition = 'Symbol in {tickersString}'.format(tickersString=str(stocklist))
        elif index in ['DJI', 'SNP']:
            where_condition = 'is{index}'.format(index=index)
        else:
            return self.companiesDF
        return self.companiesDF.query(where_condition)
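# Example usage sketch; the tickers are samples only:
if __name__ == '__main__':
    stocksData = GetStocksData()
    recentDF = stocksData.get_stockdata_by_cal_days(['AAPL', 'MSFT'], 30, date.today())
    print(recentDF.shape)
    print(stocksData.get_stock_fundamentals(index='DJI').shape)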
def one_list_generator(listName, listConfigDict, startTime=dt.now()):
    listsDefDict = ProUtils.get_dict_from_jsonfile('lists_config.json')
    finquery = ProUtils.get_string_from_file('queries/top_lists_query.sql')
    #
    # read the query, configure and run it.
    instructions = {}
    if listName in listsDefDict.keys():
        listConfig = listsDefDict[listName]
    else:
        raise NotFound('List {} does not exist'.format(listName))
    instructions['StatName'] = listConfig['StatName']
    instructions['RollingDaysCondition'] = 'StatRollingDays="{}"'.format(listConfig['RollingDays'])
    if 'Sector' in listConfigDict:
        instructions['SectorCondition'] = 'Sector="{}"'.format(listConfigDict['Sector'])
    else:
        instructions['SectorCondition'] = 'TRUE'
    if listConfigDict.get('Index', '') in ['DJI', 'SNP']:
        instructions['IndexCondition'] = 'is' + listConfigDict['Index']
    else:
        instructions['IndexCondition'] = 'isSNP'
    minMarketCap = listConfigDict.get('MarketCapMin', 100)
    maxMarketCap = listConfigDict.get('MarketCapMax', 1000000000)
    instructions['MarketCapCondition'] = 'MarketCap BETWEEN {} AND {}'.format(minMarketCap, maxMarketCap)
    instructions['ListSize'] = min(listConfigDict.get('ListSize', 5), 10)
    #query = self.get_onelist_query(listConfigDict['Domain'])
    query = ProUtils.format_string(finquery, instructions)
    #print("Running query:\n" + query, flush=True)
    #return
    #
    # Execute the query.
    print('Starting get-top-list for {} query execution'.format(instructions), dt.now() - startTime)
    bqu = BigqueryUtils()
    listDF = bqu.execute_query_to_df(query)
    print(list(listDF['Symbol']))
    #listDF = listDF.query('TopBottom=="TOP"')
    #print(listDF.columns, listDF.shape, dt.now()-startTime)
    listDict = ProUtils.pandas_df_to_dict(listDF, 'TopRank')
    #
    # getting additional info
    print('Starting get_stock_fundamentals for {}'.format('SNP'), dt.now() - startTime)
    getstocks = GetStocksData()
    companiesDF = getstocks.get_stock_fundamentals(index='SNP')
    symbolList = list(companiesDF['Symbol'])
    print('Starting StockMetricsCalculator for {}, {} companies'.format(symbolList, len(symbolList)), dt.now() - startTime)
    smc = StockMetricsCalculator(symbolList)
    print('Done StockMetricsCalculator', dt.now() - startTime)
    gsn = GetStockNews()
    for key, stockDict in listDict.items():
        stockDict['InterestingStatements'] = get_statements_for_ticker(stockDict['Symbol'], smc)
        stockDict['RelevantNews'] = gsn.get_stocknews_byticker(stockDict['Symbol'])
    listDict['Description'] = listConfig['QuestionDescription']
    print(listDict, dt.now() - startTime)
    return json.dumps(listDict)
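# Example usage sketch; 'TopGainers' is a placeholder list name that must exist
# in lists_config.json:
if __name__ == '__main__':
    listJson = one_list_generator('TopGainers', {'Index': 'DJI', 'ListSize': 5})
    print(listJson)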
class InsightsPackaging:
    def __init__(self, root='.'):
        self.icm = icm.InsightsConfigurationManager()
        self.bqUtils = BigqueryUtils()
        self.questionsReaderQuery = open(root + '/Queries/SportQuestionsReaderQuery.sql', 'r').read()

    def two_answers_reader(self, contentConfigCode):
        configDef = self.icm.get_content_config(contentConfigCode)
        #
        # read the questions
        query = ProUtils.format_string(self.questionsReaderQuery, configDef)
        questionsDF = self.bqUtils.execute_query_to_df(query)
        #
        # find all metrics within slot
        nSlots = configDef['NumSlots']
        slotStatGroups = {}
        slotStatGroupKeys = {}
        for i in range(1, nSlots + 1):
            slotDF = questionsDF.query('slotNum == %d' % i)
            slotStatGroups[i] = slotDF.groupby(['QuestionCode', 'StatObject']).groups
            slotStatGroupKeys[i] = set(slotStatGroups[i].keys())
        return questionsDF, slotStatGroups, slotStatGroupKeys

    def two_answers_question_generator(self, questionDict, configDef):
        #print(questionDict)
        stat1 = questionDict['Stat1']
        stat2 = questionDict['Stat2']
        questionTemplate = stat1['Question2Objects']
        questionInstructions = stat1
        timeFrameTexts = configDef['TimeframeText'].split(',')
        loc = random.randint(0, len(timeFrameTexts) - 1)
        questionInstructions['Timeframe'] = timeFrameTexts[loc]
        questionText = ProUtils.format_string(questionTemplate, questionInstructions)
        templateDict = self.icm.templateDefsDict
        outQuestion = {
            'QuestionText': questionText,
            'Answer1': stat1['StatObjectName'],
            'Answer2': stat2['StatObjectName'],
            'Value1': str(eval(templateDict[stat1['Value1Template']]['Template'].replace('{value}', "stat1['StatValue']"))),
            'Value2': str(eval(templateDict[stat2['Value1Template']]['Template'].replace('{value}', "stat2['StatValue']"))),
        }
        questionKeys = [
            'ContentDefCode', 'SportCode', 'StatSource', 'slotNum', 'rankDiff', 'StatObject', 'StatTimeframe',
            'LeagueCode', 'SeasonCode', 'CompetitionStageCode', 'MatchStageCode', 'QuestionCode', 'StatCode',
            'Description', 'numRanks', 'rankItemsCount', 'valueRange', 'internalDenseRank', 'objectsCount',
            'minValue', 'maxValue'
        ]
        statKeys = [
            'StatObjectName', 'StatFunction', 'MatchCode', 'TeamCode', 'PlayerCode', 'StatValue', 'Count',
            'DenseRank', 'TeamName', 'PlayerName'
        ]
        ProUtils.update_dict(outQuestion, stat1, questionKeys)
        ProUtils.update_dict(outQuestion, questionDict, questionKeys)
        ProUtils.update_dict(outQuestion, stat1, statKeys, '1')
        ProUtils.update_dict(outQuestion, stat2, statKeys, '2')
        return outQuestion

    def two_answers_package_generator(self, contentConfigCode):
        configDef = self.icm.get_content_config(contentConfigCode)
        numPackages = configDef['NumPackages']
        numSlots = configDef['NumSlots']
        outputDF = pd.DataFrame()
        questionsDF, slotStatGroups, slotStatGroupKeys = self.two_answers_reader(contentConfigCode)
        for packageNo in range(1, numPackages + 1):
            selectedStats = set()
            package = []
            packageId = '{}-{}-{}'.format(contentConfigCode, packageNo, int(dt.timestamp(dt.now()) * 1000))
            for slotNo in range(1, numSlots + 1):
                while True:
                    try:
                        remainingStatCombinations = slotStatGroupKeys[slotNo] - selectedStats
                        statComb = random.sample(remainingStatCombinations, 1)[0]
                        break
                    except ValueError:
                        selectedStats.clear()
                        continue
                    except Exception as e:
                        print("Error selecting a new stat in slot #{}: {}, {}".format(slotNo, e, type(e)))
                selectedStats.add(statComb)
                questionGroup = slotStatGroups[slotNo][statComb]
                questionIndex = questionGroup[random.randint(0, len(questionGroup) - 1)]
                questionDict = dict(questionsDF.iloc[questionIndex])
                newQuestion = self.two_answers_question_generator(questionDict, configDef)
                newQuestion['PackageId'] = packageId
                newQuestion['Timestamp'] = dt.now()
                package.append(newQuestion)
            #print(package)
            packageDF = pd.DataFrame(package)
            #print(packageDF)
            outputDF = outputDF.append(packageDF)
        #
        # write to BigQuery
        #print(outputDF)
        tableId = 'Sportsight_Packages.two_answers_all_V3'
        outputDF.to_gbq(tableId, if_exists='append')
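# Example usage sketch; 'MLB.TwoAnswers' is a placeholder content configuration code:
if __name__ == '__main__':
    packager = InsightsPackaging(root='.')
    packager.two_answers_package_generator('MLB.TwoAnswers')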