Exemplo n.º 1
0
def load_quotes_from_hdf_store(store,
                               place_code,
                               pcode,
                               from_time=None,
                               to_time=None):
    """Load quotes from an HDF store, optionally bounded by time.

    Parameters
    ----------
    store : pandas.HDFStore (or any object with a ``select(key, where)``)
    place_code, pcode : str
        Joined with '/' to form the store key.
    from_time, to_time : datetime-like or None
        Optional inclusive lower/upper bounds on the index.

    Returns
    -------
    Whatever ``store.select`` returns for the key and conditions.
    """
    key = place_code + '/' + pcode
    cond = []
    # pd.Term was removed from the pandas public API; the modern ``where``
    # argument accepts plain condition strings instead.
    if from_time is not None:
        cond.append("index >= '%s'" % pd.Timestamp(from_time))
    if to_time is not None:
        cond.append("index <= '%s'" % pd.Timestamp(to_time))
    return store.select(key, cond)
Exemplo n.º 2
0
 def _merge_data(self):
     """Load estimation-period and event-window daily rows for this security
     from the CRSP daily stock file and join each with daily factor returns.

     Relies on module-level ``crsp`` (an open HDFStore) and
     ``DAILY_FACTORS`` (a date-indexed DataFrame) defined elsewhere.

     NOTE(review): pd.Term was removed from the pandas public API — confirm
     the pandas version pinned by this project still provides it.  A list
     passed as the '=' value historically meant set membership
     ("DATE in est_period").
     """
     # Estimation-period rows for this PERMNO.
     self.est_data = crsp.select('/crsp/dsf',
                                 where=[
                                     pd.Term('PERMNO', '=', self._id),
                                     pd.Term('DATE', '=',
                                             self.est_period.tolist())
                                 ])
     # Event-window rows for this PERMNO.
     self.evt_data = crsp.select('/crsp/dsf',
                                 where=[
                                     pd.Term('PERMNO', '=', self._id),
                                     pd.Term('DATE', '=',
                                             self.evt_window.tolist())
                                 ])
     # Drop the first index level (presumably PERMNO — TODO confirm) so the
     # remaining date index aligns with DAILY_FACTORS on join.
     self.est_data = self.est_data.reset_index(level=0).join(DAILY_FACTORS)
     self.evt_data = self.evt_data.reset_index(level=0).join(DAILY_FACTORS)
     self._has_data = True
Exemplo n.º 3
0
    def getCentroid(self, targetdict, valcol, topN=20):
        """Build a centroid vector for the rows matching ``targetdict``.

        The matching rows are pivoted, the pivot's columns are treated as
        vectors and ranked by Euclidean length, and the sum over a second
        pivot restricted to the ``topN`` columns is returned (or None when
        nothing matches).

        NOTE(review): Python 2 only — ``dict.iteritems`` and the
        ``dict.keys() + list`` concatenation both fail on Python 3.
        pd.Term was also removed from the pandas public API; confirm the
        pinned pandas version before porting.
        """
        # get the rows.
        query = [
            pd.Term(key, "=", value) for key, value in targetdict.iteritems()
        ]
        selectdf = self.store.select(self.tablename, where=query)

        # pivot the table: everything that is neither a filter key nor the
        # value column becomes a pivot column.
        colindex = [
            x for x in selectdf.columns
            if x not in targetdict.keys() + [valcol]
        ]
        pivotdf = selectdf.pivot_table(valcol, targetdict.keys(),
                                       colindex).fillna(0)

        ## unit-vectorize the pivot table.  The *columns* are treated as the vectors.
        #for col in pivotdf.columns:
        #    pivotdf[col] = nla.norm(pivotdf[col])

        # The columns are the vectors.  Return the topN columnheads by length.
        pivotdf = pivotdf.apply(lambda x: nla.norm(x))

        # FIXED: sort is deprecated
        # pivotdf.sort(ascending=False)
        pivotdf.sort_values(inplace=True, ascending=False)

        topcols = list(pivotdf[:topN].index)
        if not topcols:
            return None
        # Escape double quotes so the column names are safe inside Terms.
        topcols = [x.replace('"', "\\\"") for x in topcols]

        #print >>sys.stderr, pivotdf

        # Now use the topN to get relevant rows for the colindex.
        # A list value historically meant set membership for pd.Term.
        query = [
            pd.Term(colindex[n], "=", topcols)
            for n in range(0, len(colindex))
        ]
        #print >>sys.stderr, repr(query)

        # Now create the centroid based on the colindex.
        selectdf = self.store.select(self.tablename, where=query)
        pivotdf = selectdf.pivot_table(valcol, colindex,
                                       sorted(targetdict.keys())).fillna(0)

        return pivotdf.sum()
Exemplo n.º 4
0
    def getRow(self, rowname, rowval, valcol):
        """Return a Series for the row(s) where ``rowname == rowval``.

        The matching rows are fetched from the HDF store, pivoted so every
        remaining column becomes a pivot column, and summed down to a
        single Series.
        """
        # pd.Term was removed from pandas; a where-condition string is the
        # modern equivalent.
        query = '%s = %r' % (rowname, rowval)
        selectdf = self.store.select(self.tablename, where=query)

        # Everything that is neither the row key nor the value column
        # becomes a pivot column.
        colindex = [x for x in selectdf.columns if x not in [rowname, valcol]]
        pivotdf = selectdf.pivot_table(values=valcol,
                                       index=rowname,
                                       columns=sorted(colindex)).fillna(0)

        return pivotdf.sum()  # need this to get a Series out.
Exemplo n.º 5
0
def get_store_data(mode, chid, term):
    """Read one channel's DataFrame from the monthly-sample HDF file.

    For channel ids without 'm' (no m85 channel) the rows are filtered by
    ``term`` — a where-condition string — and only a few columns are read;
    otherwise the whole table is loaded.

    Parameters
    ----------
    mode, chid : str
        Path components of the HDF file to open.
    term : str
        Row filter, e.g. "clat > 0"; ignored when 'm' is in ``chid``.
    """
    store = pd.HDFStore('/raid1/maye/rdr20_month_samples/' + mode + '/' +
                        chid + '.h5')
    try:
        if 'm' not in chid:  # if no m85 in chid, a row filter applies
            # pd.Term was removed from pandas; a plain condition string
            # is accepted directly by ``where``.
            mine = store.select('df', where=term,
                                columns=['clat', 'clon', 'cloctime', 'tb'])
        else:
            mine = store.select('df')
    finally:
        # Close the store even if select() raises, so the file handle is
        # never leaked.
        store.close()
    return mine
Exemplo n.º 6
0
 def data(self,channels=None):
     """Efficiently get data chunks from disk by supplying a column list.

     (Note: the efficient path requires the data to be stored in table
     format.)

     Keyword Arguments:
     channels=None -- The channels (columns) to pull from the data.
     """
     if channels is not None:
         try:
             # Efficient read from disk (no need to load all data in
             # memory) if data is in table format.  The ``columns=``
             # keyword replaces the removed pd.Term('columns', '=', ...)
             # API.
             d = self.__signals.select('data', columns=channels)
         except Exception:
             # If stored in fixed (non-table) pytables format, select()
             # raises; fall back to loading everything and slicing.
             d = self.__signals['data'][channels]
     else:
         d = self.__signals['data']

     gc.collect()  # free memory wasted by intermediate copies
     return d
Exemplo n.º 7
0
def get_var_ib_null_or_nan(variable, condition):  # condition == null or condition == Nan
    """Look at which categories of the variables etat, qualite and statut the
    individuals whose ib equals 0 fall into, to understand what an ib of 0
    means and to spot possible anomalies.

    Plots yearly headcounts per category of ``variable`` (via seaborn) and
    sets a matplotlib title; returns nothing.

    NOTE(review): relies on module-level ``hdf5_file_path``,
    ``get_df_ib_condition``, ``sns`` and ``plt`` defined elsewhere in the
    file.  pd.Term was removed from the pandas public API — confirm the
    pinned pandas version; the list value historically meant set
    membership ("ident in idents").
    """
    idents_annee = get_df_ib_condition(condition)
    idents = idents_annee['ident'].tolist()
    annees = idents_annee['annee'].tolist()
    # Only rows 800000..999999 of the table are scanned — presumably a
    # sampling shortcut; verify against the data layout.
    df = pd.read_hdf(
        hdf5_file_path,
        '{}'.format(variable),
        where = [pd.Term("ident", "=", idents)],
        start = 800000,
        stop = 999999,
        )
    df = df[df['annee'].isin(annees)]
    # Yearly headcount per category of the variable.
    df_per_year = df.groupby(['annee', variable]).size().reset_index()
    df_per_year.columns = ['annee', '{}_categorie'.format(variable), '{}_compte'.format(variable)]
    sns.pointplot(x="annee", y="{}_compte".format(variable), hue="{}_categorie".format(variable), data=df_per_year)
    plt.title('Effectifs annuels par categorie de la variable {} pour ib {}'.format(variable, condition))
Exemplo n.º 8
0
def read_method(method):
    """Benchmark four ways of fetching one (tradingday, windcode) row from
    the BRPrice HDF file, printing the elapsed time of each step.

    NOTE(review): relies on module-level ``tradingday`` and ``windcode``;
    ``res`` is computed but never returned (timing-only function).
    pd.Term and ``DataFrame.to_panel`` were both removed from modern
    pandas — confirm the pinned version.
    """
    if method == 1:
        # Full load, then MultiIndex .loc lookup.
        a = pd.read_hdf(r'F:\Python_3\MyPython_3\0_Data\Pub\BRPrice.h5')
        time_list = [time.time()]
        # cost  0.30s
        res = a.loc[pd.IndexSlice[tradingday, windcode], :]
        time_list.append(time.time())
        print(
            np.array(time_list[1:len(time_list)]) -
            np.array(time_list[0:len(time_list) - 1]))
        # use idx
    elif method == 2:
        # On-disk query through HDFStore.select.
        a = pd.HDFStore(r'F:\Python_3\MyPython_3\0_Data\Pub\BRPrice.h5')
        time_list = [time.time()]
        # cost  0.05s
        res = a.select(
            key='Data',
            where=pd.Term("(TradingDay = tradingday)&(WindCode = windcode)"))
        time_list.append(time.time())
        print(
            np.array(time_list[1:len(time_list)]) -
            np.array(time_list[0:len(time_list) - 1]))
    elif method == 3:
        # Convert to a Panel first (removed in modern pandas), then slice.
        a = pd.read_hdf(r'F:\Python_3\MyPython_3\0_Data\Pub\BRPrice.h5')
        pn = a.to_panel()
        time_list = [time.time()]
        # cost  0.001s
        res = pn.loc[:, tradingday, windcode]
        time_list.append(time.time())
        print(
            np.array(time_list[1:len(time_list)]) -
            np.array(time_list[0:len(time_list) - 1]))
    elif method == 4:
        # this method is very slow!!
        a = pd.read_hdf(r'F:\Python_3\MyPython_3\0_Data\Pub\BRPrice.h5')
        dict_frm = a.to_dict(orient='index')
        res = dict_frm[tradingday]
Exemplo n.º 9
0
 def test_term(self):
     """pd.Term is deprecated: merely constructing one must emit a
     FutureWarning (stacklevel not checked because the warning originates
     inside pandas)."""
     with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
         pd.Term('index>=date')
Exemplo n.º 10
0
	def get_equal_time_term(self, t):
		"""Return a where-condition selecting rows whose major_axis equals ``t``.

		pd.Term was removed from the pandas public API; a plain condition
		string is accepted everywhere a Term used to be, so callers that
		pass the result to ``select(where=...)`` keep working.
		"""
		return "major_axis == '%s'" % pd.Timestamp(t)
Exemplo n.º 11
0
	def get_interval(self, t0, t1):
		"""Select rows of this store whose major_axis lies in [t0, t1].

		If ``t1`` is falsy it defaults to ``t0`` (a single-instant
		interval).  pd.Term was removed from pandas; plain where-condition
		strings are used instead.
		"""
		t1 = t1 or t0
		cond = ["major_axis >= '%s'" % pd.Timestamp(t0),
		        "major_axis <= '%s'" % pd.Timestamp(t1)]
		return self.select(self.name, cond)
Exemplo n.º 12
0
 def join_col(self,
              df,
              add_cols,
              join_cols=None,
              join_key=None,
              join_store=None,
              join_filter=None,
              drop_joining_duplicates=True):
     """
     This function is meant to return the input df with add_cols added.
     These columns are fetched in join_store[join_key] and are aligned to df using join_cols.
     Note: At the time of this writing, only a restricted case is handled, namely:
         join_cols has only one element that must be in the index of the store

     NOTE(review): Python 2 only (print statements, ``basestring``), and
     pd.Term was removed from the pandas public API — confirm the pinned
     interpreter/pandas versions.  ``join_filter`` is accepted but never
     used.  The two-argument pd.Term form presumably means
     "column in values" — verify before porting.
     """
     # Explicit arguments win; otherwise fall back to instance defaults.
     join_store = join_store or self.join_store
     join_key = join_key or self.join_key
     # A string add_cols may name a preconfigured join in self.add_from,
     # which can supply store/key/cols defaults of its own.
     if isinstance(add_cols, basestring):
         if add_cols in self.add_from.keys():
             if 'join_store' in self.add_from[add_cols].keys():
                 join_store = join_store or self.add_from[add_cols][
                     'join_store']
             if 'join_key' in self.add_from[add_cols].keys():
                 join_key = join_key or self.add_from[add_cols]['join_key']
             if 'join_cols' in self.add_from[add_cols].keys():
                 join_cols = join_cols or self.add_from[add_cols][
                     'join_cols']
     join_cols = util_ulist.ascertain_list(join_cols)
     add_cols = util_ulist.ascertain_list(add_cols)
     # get the df values to join (and see if they're in cols or index)
     if coll_op.contains(list(df.columns), join_cols):
         df_join_cols_in_columns = True
         df_join_col_values = np.unique(df[join_cols])
     else:
         df_join_cols_in_columns = False
         df_join_col_values = np.unique(list(df.index))
     # get necessary information from store
     store_key_info = self.store_info[join_store]
     join_key = ascertain_prefix_slash(join_key)
     store_key_info = store_key_info[join_key]
     if len(join_cols) == 1 and join_cols[0] == 'index':
         print "uploading only specific indices for join_df"
         join_df = self.store[join_store].select(
             key=join_key,
             where=[pd.Term('index', df_join_col_values)],
             columns=add_cols)
     elif join_cols in store_key_info['column_names']:
         print "uploading only specific columns for join_df"
         join_df = self.store[join_store].select(
             key=join_key,
             where=[pd.Term(join_cols[0], df_join_col_values)],
             columns=join_cols + add_cols)
         # NOTE(review): set_index returns a new frame and the result is
         # discarded here, so join_df keeps its original index — likely a
         # bug (missing inplace=True or reassignment).
         join_df.set_index(join_cols[0])
     else:
         print "uploading the whole potential join_df"
         join_df = self.store[join_store].select(key=join_key,
                                                 columns=join_cols +
                                                 add_cols)
     #print join_cols
     #print add_cols
     #print join_df.head(10)
     # drop duplicates
     if drop_joining_duplicates == True:
         join_df = join_df.drop_duplicates()
     if coll_op.contains(list(join_df.columns), join_cols):
         join_df_cols_in_cols = True
     else:
         join_df_cols_in_cols = False
     #print df_join_cols_in_columns
     #print join_df_cols_in_cols
     # join: pick merge keys/indexes depending on where the join columns
     # live on each side (columns vs index).
     if df_join_cols_in_columns:
         if join_df_cols_in_cols:
             return pd.merge(df, join_df, on=join_cols)
         else:
             return pd.merge(df,
                             join_df,
                             right_on=join_cols,
                             left_index=True)
     else:
         if join_df_cols_in_cols:
             return pd.merge(df,
                             join_df,
                             right_index=True,
                             left_on=join_cols)
         else:
             return pd.merge(df, join_df, right_index=True, left_index=True)
Exemplo n.º 13
0
 def get_table(self, selection, columns=None, key=None):
     """Select rows where ``self.selection_col`` equals ``selection``.

     ``key`` and ``columns`` default to ``self.key`` / ``self.columns``.

     BUG FIX: the original computed those defaults into locals but then
     passed ``self.key`` / ``self.columns`` anyway, so explicit arguments
     were silently ignored; the locals are now actually used.
     """
     key = key or self.key
     columns = columns or self.columns
     # pd.Term was removed from pandas; use a where-condition string.
     return self.select(key=key,
                        where='%s = %r' % (self.selection_col, selection),
                        columns=columns)
Exemplo n.º 14
0
 def or_select_single_var(self, key=None, where=None):
     """Select from ``key`` (default ``self.key``) the rows where the
     column named ``where[0]`` equals ``where[1]``.

     pd.Term was removed from pandas; an equivalent where-condition string
     is built instead.  A list-valued ``where[1]`` keeps pd.Term's
     historical "in" (set membership) meaning via its repr form.
     """
     key = key or self.key
     return self.select(key=key, where='%s = %r' % (where[0], where[1]))
Exemplo n.º 15
0
def read_method(method):
    """Benchmark several ways of fetching one (tradingday, windcode) row
    from the BRPrice HDF file, printing the elapsed time of each step.

    NOTE(review): relies on module-level ``tradingday`` and ``windcode``;
    results are only printed/assigned, never returned.  pd.Term,
    ``DataFrame.to_panel`` and ``.ix`` were all removed from modern
    pandas — confirm the pinned version.
    """
    if method == 1:
        # Full load, then MultiIndex .loc lookup.
        a = pd.read_hdf(r'D:\BRPrice_0302.h5')
        time_list = [time.time()]
        # cost  0.30s
        res = a.loc[pd.IndexSlice[tradingday, windcode], :]
        time_list.append(time.time())
        print(
            np.array(time_list[1:len(time_list)]) -
            np.array(time_list[0:len(time_list) - 1]))
        # use idx
    elif method == 2:
        # On-disk query through HDFStore.select.
        a = pd.HDFStore(r'D:\BRPrice_0302.h5')
        time_list = [time.time()]
        # cost  0.10s
        res = a.select(
            key='Data',
            where=pd.Term("(TradingDay = tradingday)&(WindCode = windcode)"))
        time_list.append(time.time())
        print(
            np.array(time_list[1:len(time_list)]) -
            np.array(time_list[0:len(time_list) - 1]))
    elif method == 3:
        # Convert to a Panel first (removed in modern pandas), then slice.
        a = pd.read_hdf('D:\\New_BRPrice_0302.h5')
        pn = a.to_panel()
        time_list = [time.time()]
        # cost  0.42s
        # print(pn)
        # if tradingday in pn.major_axis:
        #     print(tradingday)
        # if windcode in pn.minor_axis:
        #     print(windcode)
        # else:
        #     print('not exists')

        res = pn.loc[:, tradingday, windcode]
        # print(res)
        time_list.append(time.time())
        print(
            np.array(time_list[1:len(time_list)]) -
            np.array(time_list[0:len(time_list) - 1]))
    elif method == 4:
        # this method is very slow!!
        a = pd.read_hdf(r'D:\BRPrice_0302.h5')
        dict_frm = a.to_dict(orient='index')
        res = dict_frm[tradingday]

    # using a str date is slower than a Timestamp
    elif method == 5:
        a = pd.read_hdf('D:\\New_BRPrice_0302.h5')
        pn = a.to_panel()
        time_list = [time.time()]
        res = pn.loc[:, '2005-01-05', windcode]
        # print(res)
        time_list.append(time.time())
        print(
            np.array(time_list[1:len(time_list)]) -
            np.array(time_list[0:len(time_list) - 1]))

    # .ix is very slow
    elif method == 6:
        time_list = [time.time()]
        dailyQuote = pd.read_hdf(r'D:\BRPrice.h5')
        # # print(store)
        # dailyQuote = store.select('Data'
        #   # [
        #   # Term('InnerCode', '=', 3),
        #   # Term('TradingDay', '>=', startDate),
        #   # Term('TradingDay', '<=', endDate),
        #   # Term('columns', '=', 'Mom')
        #   # ]
        #   );
        time_list.append(time.time())
        # dailyQuote = dailyQuote[dailyQuote.index.get_level_values(0) == tradingday]
        time_list.append(time.time())

        # dailyQuote.sort_index(inplace= True)
        # Time 10k repeated cross-section lookups.
        for i in range(10000):
            dailyQuote2 = dailyQuote.xs(tradingday)
            # print(i)

            # dailyQuote2 = dailyQuote.loc[tradingday]
        time_list.append(time.time())
        # print(dailyQuote)
        print(
            np.array(time_list[1:len(time_list)]) -
            np.array(time_list[0:len(time_list) - 1]))

        print('-----')

        planBuyList = [
            '000001.SZ', '000002.SZ', '000004.SZ', '000007.SZ', '000010.SZ'
        ]

        # Per-symbol .ix lookups, timed individually.
        for innerCode in planBuyList:
            time_list = [time.time()]
            # entity = dailyQuote.ix[(tradingday,innerCode)]
            entity = dailyQuote2.ix[innerCode]
            # print(entity)
            time_list.append(time.time())
            print(
                np.array(time_list[1:len(time_list)]) -
                np.array(time_list[0:len(time_list) - 1]))
Exemplo n.º 16
0
 def getLengths(self, lengths):
     """Fetch the rows whose ``self.vecnamecol`` value matches ``lengths``
     and return them indexed by that column.

     pd.Term was removed from pandas; a where-condition string is built
     instead.  A list-valued ``lengths`` keeps pd.Term's historical "in"
     (set membership) meaning via its repr form.
     """
     lengthquery = '%s = %r' % (self.vecnamecol, lengths)
     return self.store.select(self.tablename,
                              where=lengthquery).set_index(self.vecnamecol)