예제 #1
0
 def connectTo(self, connections, **kwargs):
     ''' Set up allowed data sources '''
     tz = kwargs.get('timezone', self.tz)
     log = kwargs.get('logger', None)
     lvl = kwargs.get('lvl', 'debug')
     if 'database' in connections:
         db_name = kwargs.get('db_name', str(os.environ['QTRADEDB']))
         self.db = QuantSQLite(db_name, timezone=tz, logger=log, lvl=lvl)
         self._logger.info('Database location provided, Connected to %s' %
                           db_name)
         self.connected['database'] = True
     if 'remote' in connections:
         self.remote = Fetcher(timezone=tz, logger=log, lvl=lvl)
         self.connected['remote'] = True
     if 'csv' in connections:
         raise NotImplementedError()
예제 #2
0
class DataAgent(object):
    ''' Quote object - fill everything related:
            name, code, current value, bought value,
            market, actions owned,
            and date, open, close, volume, of asked period'''
    def __init__(self, sources=None, tz=pytz.utc, logger=None, lvl='debug'):
        self.tz = tz
        if logger is None:
            self._logger = LogSubsystem(DataAgent.__name__, lvl).getLog()
        else:
            self._logger = logger
        self.connected = {'remote': False, 'database': False, 'csv': False}
        if isinstance(sources, list):
            self.connectTo(sources, lvl=lvl, timezone=tz, logger=logger)

    def connectTo(self, connections, **kwargs):
        ''' Set up allowed data sources '''
        tz = kwargs.get('timezone', self.tz)
        log = kwargs.get('logger', None)
        lvl = kwargs.get('lvl', 'debug')
        if 'database' in connections:
            db_name = kwargs.get('db_name', str(os.environ['QTRADEDB']))
            self.db = QuantSQLite(db_name, timezone=tz, logger=log, lvl=lvl)
            self._logger.info('Database location provided, Connected to %s' %
                              db_name)
            self.connected['database'] = True
        if 'remote' in connections:
            self.remote = Fetcher(timezone=tz, logger=log, lvl=lvl)
            self.connected['remote'] = True
        if 'csv' in connections:
            raise NotImplementedError()

    def _makeIndex(self, args):
        ''' Take getQuotes input and produce suitable index '''
        #TODO implement elapse
        start_day = args.get('start', None)
        end_day = args.get('end', dt.datetime.now(self.tz))
        delta = args.get('delta', self._guessResolution(start_day, end_day))
        period = args.get('period', None)
        if period:
            if not start_day:
                start_day = end_day - period
        if not start_day:
            if not delta:
                start_day = dt.datetime.now(self.tz).date()
                delta = pd.datetools.Minute()
            else:
                self._logger.error('** No suitable date informations provided')
                return None
        return pd.date_range(start=start_day, end=end_day, freq=delta)

    def _inspectDB(self, ticker, request_idx, fields=Fields.QUOTES):
        ''' Check available df in db, according to the requested index '''
        self._logger.info('Inspecting database.')
        assert (isinstance(ticker, str))
        assert (isinstance(request_idx, pd.Index))
        assert (isinstance(fields, list))
        #TODO comparisons are too strics
        #TODO identificate NaN columns, that will erase everything at dropna
        if not self.connected['database']:
            self._logger.info('No database access allowed.')
            return DataFrame(), request_idx

        db_index = self.db.getDataIndex(ticker, fields, summary=False)
        if not isinstance(db_index, pd.Index):
            self._logger.info('No quotes stored in database.')
            return DataFrame(), request_idx
        elif db_index.freq > request_idx.freq and not db_index.freq == request_idx.freq:
            self._logger.info('Superior asked frequency,\
                    dropping and downloading along whole timestamp')
            return DataFrame(), request_idx
        else:
            if db_index[0] > request_idx[0] and db_index[-1] < request_idx[-1]:
                return DataFrame(), request_idx
                raise NotImplementedError()
                #TODO Two different timestamps to download !
                #to dl: start_day -> first_db_date and last_db_date -> end_day
            else:
                #NOTE Intersection and other seemed cool but don't work
                if db_index[0].hour != 0:
                    db_index = pd.date_range(
                        db_index[0] -
                        pd.datetools.relativedelta(hours=db_index[0].hour),
                        db_index[-1] -
                        pd.datetools.relativedelta(hours=db_index[0].hour),
                        freq=db_index.freq)
                intersect = db_index & request_idx
                #start_to_get = intersect[0] - request_idx.freq
                #if not start_to_get.tzinfo:
                #self.tz.localize(start_to_get)
                #idx_to_get = pd.date_range(start_to_get, intersect[-1], freq=request_idx.freq)
                idx_to_get = pd.date_range(intersect[0] - request_idx.freq,
                                           intersect[-1],
                                           freq=request_idx.freq)
                if idx_to_get is None or idx_to_get.size == 0:
                    self._logger.info('No quotes available in database.')
                    return DataFrame(), request_idx
                #NOTE try union minus intersect
                #TODO getQuotesDB and others take care of dataframe cast and fields cut
                if db_index[0] > request_idx[0]:
                    idx_to_dl = request_idx[request_idx < idx_to_get[0]]
                else:
                    idx_to_dl = request_idx[request_idx > idx_to_get[-1]]
                db_df = DataFrame(self.db.getQuotesDB(ticker, idx_to_get),
                                  columns=fields).dropna()
                if not db_df.index.tzinfo:
                    db_df.index.tz_localize(self.tz)
        return db_df, idx_to_dl

    #TODO each retriever, remote and db, takes care of dropna and columns
    def getQuotes(self, tickers, fields=Fields.QUOTES, index=None, **kwargs):
        '''
        @summary: retrieve google finance data asked while initializing
        and store it: Date, open, low, high, close, volume
        @param quotes: list of quotes to fetch
        @param fields: list of fields to store per quotes
        @param index: pandas.Index object, used for dataframes
        @param kwargs.start: date or datetime of the first values
               kwargs.end: date or datetime of the last value
               kwargs.delta: datetime.timedelta object, period of time to fill
               kwargs.save: save to database downloaded quotes
               kwargs.reverse: reverse companie name and field in panel
               kwargs.symbols
               kwargs.markets
        @return a panel/dataframe/timeserie like close = data['google']['close'][date]
        '''
        ''' ----------------------------------------------------------------------------'''
        ''' ----------------------------------  Index check and build  -----------------'''
        #FIXME reversed dataframe could be store in database ?
        df = dict()
        save = kwargs.get('save', False)
        reverse = kwargs.get('reverse', False)
        markets = kwargs.get('markets', None)
        symbols = kwargs.get('symbols', None)
        if not isinstance(index, pd.DatetimeIndex):
            index = self._makeIndex(kwargs)
            if not isinstance(index, pd.DatetimeIndex):
                return None
        if not index.tzinfo:
            index = index.tz_localize(self.tz)
        assert (index.tzinfo)

        if self.connected['database']:
            symbols, markets = self.db.getTickersCodes(tickers)
        elif not symbols or not markets:
            self._logger.error('** No database neither informations provided')
            return None

        for ticker in tickers:
            if not ticker in symbols:
                self._logger.warning(
                    'No code availablefor {}, going on'.format(ticker))
                continue
            self._logger.info('Processing {} stock'.format(ticker))
            ''' ----------------------------------------------------------------------------'''
            ''' ----------------------------------------------  Database check  ------------'''
            db_df, index = self._inspectDB(ticker, index, fields)
            assert (index.tzinfo)
            if not db_df.empty:
                assert (db_df.index.tzinfo)
                if index.size == 0:
                    save = False
                    df[ticker] = db_df
                    continue
            ''' ----------------------------------------------------------------------------'''
            ''' ----------------------------------------------  Remote retrievers  ---------'''
            self._logger.info('Downloading missing data, from {} to {}'.format(
                index[0], index[-1]))
            #FIXME No index.freq for comaprison?
            #if (index[1] - index[0]) < pd.datetools.timedelta(days=1):
            if index.freq > pd.datetools.BDay():
                self._logger.info('Fetching minutely quotes ({})'.format(
                    index.freq))
                #TODO truncate in the method
                network_df = DataFrame(
                    self.remote.getMinutelyQuotes(symbols[ticker],
                                                  markets[ticker], index),
                    columns=fields).truncate(after=index[-1])
            else:
                network_df = DataFrame(self.remote.getHistoricalQuotes(
                    symbols[ticker], index),
                                       columns=fields)
            ''' ----------------------------------------------------------------------------'''
            ''' ----------------------------------------------  Merging  -------------------'''
            if not db_df.empty:
                self._logger.debug(
                    'Checking db index ({}) vs network index ({})'.format(
                        db_df.index, network_df.index))
                if db_df.index[0] > network_df.index[0]:
                    df[ticker] = pd.concat([network_df, db_df])
                else:
                    df[ticker] = pd.concat([db_df, network_df]).sort_index()
            else:
                df[ticker] = network_df
        ''' ----------------------------------------------------------------------------'''
        ''' ----------------------------------------------  Manage final panel  --------'''
        data = Panel.from_dict(df, intersect=True)
        if save:
            #TODO: accumulation and compression of data issue, drop always true at the moment
            if self.connected['database']:
                self.db.updateStockDb(data, Fields.QUOTES, drop=True)
            else:
                self._logger.warning('! No database connection for saving.')
        if reverse:
            return Panel.from_dict(df, intersect=True, orient='minor')
        #NOTE if data used here, insert every FIELD.QUOTES columns
        #NOTE Only return Panel when one ticker and/or one field ?
        return Panel.from_dict(df, intersect=True)

    def _guessResolution(self, start, end):
        if not start or not end:
            return None
        elapse = end - start
        if abs(elapse.days) > 5 and abs(elapse.days) < 30:
            delta = pd.datetools.BDay()
        elif abs(elapse.days) <= 5 and abs(elapse.days) > 1:
            delta = pd.datetools.Hour()
        elif abs(elapse.days) >= 30:
            delta = pd.datetools.BMonthEnd(round(elapse.days / 30))
        else:
            delta = pd.datetools.Minute(10)
        self._logger.info('Automatic delta fixing: {}'.format(delta))
        return delta

    def load_from_csv(self, tickers, index, fields=Fields.QUOTES, **kwargs):
        ''' Return a quote panel '''
        #TODO Replace adj_close with actual_close
        #TODO Add reindex methods, and start, end, delta parameters
        reverse = kwargs.get('reverse', False)
        verbose = kwargs.get('verbose', False)
        if self.connected['database']:
            symbols, markets = self.db.getTickersCodes(tickers)
        elif not symbols:
            self._logger.error('** No database neither informations provided')
            return None
        timestamps = du.getNYSEdays(index[0], index[-1],
                                    dt.timedelta(hours=16))
        csv = da.DataAccess('Yahoo')
        df = csv.get_data(timestamps,
                          symbols.values(),
                          fields,
                          verbose=verbose)
        quotes_dict = dict()
        for ticker in tickers:
            j = 0
            quotes_dict[ticker] = dict()
            for field in fields:
                serie = df[j][symbols[ticker]].groupby(
                    index.freq.rollforward).aggregate(np.mean)
                #TODO add a function parameter to decide what to do about it
                clean_serie = serie.fillna(method='pad')
                quotes_dict[ticker][field] = clean_serie
                j += 1
        if reverse:
            return Panel.from_dict(quotes_dict, intersect=True, orient='minor')
        return Panel.from_dict(quotes_dict, intersect=True)

    def help(self, category):
        #TODO: stuff to know like fields and functions
        print('{} help'.format(category))
예제 #3
0

if __name__ == '__main__':
    # Configuration
    assert len(sys.argv) == 2
    if sys.argv[1] == 'twitter':
        database = 'feeds.db'
        script = 'twitterBuild.sql'
    elif sys.argv[1] == 'stocks':
        database = 'stocks.db'
        script = 'stocksBuild.sql'
    else:
        raise NotImplementedError()

    # Open or create the desired sqlite database
    db = QuantSQLite(database)
    # Test everything is alright
    print db.execute('select sqlite_version()')

    # Execute sql script
    db.queryFromScript(script)
    db.close()

    if sys.argv[1] == 'stocks':
        # getQuotes() method will fetch data that is not available in database and then store it
        timestamp = pd.date_range(pd.datetime(2005, 1, 1, tzinfo=pytz.utc),
                                  pd.datetime(2012, 11, 30, tzinfo=pytz.utc),
                                  freq=pd.datetools.BDay())
        tickers = ['starbucks', 'google', 'apple', 'altair']
        fill_quotes(tickers, timestamp)