Example #1
    def replicateLatestRecord(self, no_update_code_list, trade_calendar,
                              target_latest_trade_date):
        data_tot_result = pd.DataFrame([])

        # replicate the latest record for codes that were not updated
        today_str = datetime.now().strftime('%Y-%m-%d')
        expand_dates = trade_calendar[
            (trade_calendar > target_latest_trade_date)
            & (trade_calendar <= today_str)]
        if not expand_dates.empty:  # the target table lags behind the latest trade date
            tmp_not_updated_codes = list(
                map(lambda x: "'%s'" % x, no_update_code_list))
            tmp_not_updated_codes = ','.join(tmp_not_updated_codes)

            # get the latest record from target table
            tmp_state = "select * from `%s` where `%s` in (%s) and `%s` = '%s'" % (
                self.targetTableName, self.codeField, tmp_not_updated_codes,
                self.dateField, target_latest_trade_date)
            tmp_latest_records = readDB(tmp_state, ConfigQuant)

            # expand the latest record up to today
            for tmp_idx in tmp_latest_records.index:
                tmp_record = tmp_latest_records.loc[tmp_idx]
                tmp_expand_records = pd.DataFrame(
                    [tmp_record] * expand_dates.size)
                tmp_expand_records.loc[:,
                                       self.dateField] = expand_dates.tolist()
                data_tot_result = data_tot_result.append(tmp_expand_records)

            data_tot_result.loc[:, self.timeStampField] = datetime.now()

        return data_tot_result
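
These snippets all lean on readDB, writeDB and a ConfigQuant connection config defined elsewhere in the repository. A minimal sketch of what such helpers plausibly look like, assuming a MySQL database reached through SQLAlchemy (makeEngine and the config keys are assumptions, not the repository's actual code):

import pandas as pd
from sqlalchemy import create_engine

def makeEngine(config):
    # hypothetical: config is assumed to be a dict with standard MySQL fields
    return create_engine('mysql+pymysql://%s:%s@%s:%s/%s' % (
        config['user'], config['password'], config['host'],
        config['port'], config['db']))

def readDB(sql_statement, config):
    # run a SELECT statement and return the result set as a DataFrame
    return pd.read_sql(sql_statement, makeEngine(config))

def writeDB(table_name, data, config, if_exist='append'):
    # dump a DataFrame into the target table; if_exist is 'replace' or 'append'
    data.to_sql(table_name, makeEngine(config), if_exists=if_exist, index=False)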
Example #2
    def run(self):
        self.prepareData()
        if self.state == '':  # already the latest data
            return

        # fetch data from sql
        dataO = readDB(self.state, ConfigQuant)
        dataO = dataO.drop_duplicates([self.dateField])
        dataO = dataO.sort_values(self.dateField)

        data_result = pd.DataFrame(dataO[self.dateField])
        tmp_df = priceTechnicalIndicatorTimeSeries(dataO[self.valueField],
                                                   self.lags,
                                                   'PRICE')  # MA, MA_DIFF...
        data_result = data_result.join(tmp_df)

        # trim data for increment
        if self.last_update_date != '':
            data_result = data_result.loc[
                data_result[self.dateField] > self.last_update_date]

        # add timestamp
        data_result['time_stamp'] = datetime.now()

        # dump data into sql
        writeDB(self.targetTableName, data_result, ConfigQuant, self.if_exist)
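
A portability note: the DataFrame.append used throughout these snippets was deprecated in pandas 1.4 and removed in 2.0. On a current pandas, the accumulation loops would collect per-chunk frames in a list and concatenate once; a minimal sketch of the equivalent pattern:

import pandas as pd

chunks = []
for code in ['000001', '000002']:  # stand-in for the per-code loops in these snippets
    data_result = pd.DataFrame({'code': [code], 'value': [1.0]})
    chunks.append(data_result)  # list.append, not DataFrame.append
data_tot_result = pd.concat(chunks, ignore_index=True) if chunks else pd.DataFrame()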
    def getStockCategory(self, stock_data, index_data, date_range, stock_area):
        if self.categoryField == 'industry':
            tmp_state = 'select `%s`, `%s`, `%s` from %s where %s in (%s)' % (
                self.dateField, self.codeField, self.categoryField,
                self.sourceCategoryTableName, self.dateField, date_range)
            category = readDB(tmp_state, ConfigQuant)
            stock_data = stock_data.merge(category,
                                          on=[self.dateField, self.codeField])
            stock_data = stock_data.merge(
                index_data,
                on=[self.dateField, self.categoryField],
                suffixes=['', '_index'])
        elif self.categoryField == 'area':
            stock_data = stock_data.merge(stock_area, on=self.codeField)
            stock_data = stock_data.merge(
                index_data,
                on=[self.dateField, self.categoryField],
                suffixes=['', '_index'])
        elif self.categoryField == 'market':
            # classify by code prefix: '60' -> Shanghai main board (SH), '002' -> SME
            # board (SMEB), '300' -> growth board (GEB), otherwise Shenzhen main board (SZ)
            stock_data[self.categoryField] = stock_data[self.codeField].apply(
                lambda x: 'SH'
                if x[:2] == '60' else ('SMEB' if x[:3] == '002' else
                                       ('GEB' if x[:3] == '300' else 'SZ')))
            stock_data = stock_data.merge(
                index_data,
                on=[self.dateField, self.categoryField],
                suffixes=['', '_index'])
        elif self.categoryField == 'HS300':
            stock_data = stock_data.merge(index_data,
                                          on=self.dateField,
                                          suffixes=['', '_index'])

        return stock_data
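
The nested lambda that maps a stock code prefix to its exchange board recurs in several of these methods. A named helper (hypothetical, not part of the original code) states the same mapping more readably:

def classifyBoard(code):
    # '60'  -> Shanghai main board, '002' -> SME board,
    # '300' -> growth board; everything else is treated as SZ main board
    if code[:2] == '60':
        return 'SH'
    if code[:3] == '002':
        return 'SMEB'
    if code[:3] == '300':
        return 'GEB'
    return 'SZ'

# usage: stock_data[self.categoryField] = stock_data[self.codeField].apply(classifyBoard)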
    def getMarketRiseRatio(self, stock_data, date_range, stock_area, date_set):
        if self.categoryField == 'industry':
            tmp_state = 'select `%s`, `%s`, `%s` from %s where %s in (%s)' % (
                self.dateField, self.codeField, self.categoryField,
                self.sourceCategoryTableName, self.dateField, date_range)
            category = readDB(tmp_state, ConfigQuant)
            stock_data = stock_data.merge(category,
                                          on=[self.dateField, self.codeField])
        elif self.categoryField == 'area':
            stock_data = stock_data.merge(stock_area, on=self.codeField)
        elif self.categoryField == 'market':
            # classify by code prefix: '60' -> SH main board, '002' -> SME board,
            # '300' -> growth board, otherwise SZ main board
            stock_data[self.categoryField] = stock_data[self.codeField].apply(
                lambda x: 'SH'
                if x[:2] == '60' else ('SMEB' if x[:3] == '002' else
                                       ('GEB' if x[:3] == '300' else 'SZ')))
        elif self.categoryField == 'all':
            rise_ratio = stock_data.groupby(
                self.dateField)[self.retSeriesField].apply(self.riseRatio)
            rise_ratio = rise_ratio.reset_index()
            return rise_ratio

        rise_ratio = stock_data.groupby(
            [self.dateField,
             self.categoryField])[self.retSeriesField].apply(self.riseRatio)
        rise_ratio = rise_ratio.reset_index()
        return rise_ratio
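
self.riseRatio is not shown in this section. Given that it is applied per group to the return column(s), a plausible implementation (an assumption, not the repository's code) is the fraction of stocks whose return is positive:

def riseRatio(returns):
    # hypothetical: share of positive returns, ignoring NaNs; works for a
    # Series or for a DataFrame holding several return fields
    return (returns > 0).sum() / returns.notnull().sum()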
Example #5
    def run(self):
        self.prepareData()
        if self.state == '':  # already the latest data
            return

        # fetch data from sql
        dataO = readDB(self.state, ConfigQuant)
        dataO = dataO.drop_duplicates([self.dateField])
        dataO = dataO.sort_values(self.dateField)  # sort by date
        dataO = dataO.fillna(method='ffill')
        # for field in [self.openField, self.highField, self.lowField, self.closeField, self.volumeField]:
        #     dataO.loc[:, field] = dataO[field].astype('float')

        data_result = pd.DataFrame(dataO[self.dateField])
        # MA, MA_DIFF...
        tmp_df = priceTechnicalIndicator(dataO[self.closeField], self.lags, '')
        data_result = data_result.join(tmp_df)

        # ADX, CCI ...
        tmp_df = priceTechnicalIndicatorOHLCV(dataO[self.openField],
                                              dataO[self.highField],
                                              dataO[self.lowField],
                                              dataO[self.closeField],
                                              dataO[self.volumeField])
        data_result = data_result.join(tmp_df)

        # VMA, VMA_DIFF..., OBV, VOLUME_RATIO...
        tmp_df = volumeTechnicalIndicators(dataO[self.closeField],
                                           dataO[self.volumeField], self.lags,
                                           'V')
        data_result = data_result.join(tmp_df)

        # trim data for increment
        if self.last_update_date != '':
            data_result = data_result.loc[
                data_result[self.dateField] > self.last_update_date]

        # add timestamp
        data_result['time_stamp'] = datetime.now()

        # dump data into sql
        writeDB(self.targetTableName, data_result, ConfigQuant, self.if_exist)
    def getStockCategory(self, stock_data, uni_dates, stock_area):
        if self.categoryField == 'industry':
            date_range = list(map(lambda x: "'%s'" % x, uni_dates))
            date_range = ','.join(date_range)
            tmp_state = 'select `%s`, `%s`, `%s` from %s where %s in (%s)' % (
                self.dateField, self.codeField, self.categoryField,
                self.sourceCategoryTableName, self.dateField, date_range)
            category = readDB(tmp_state, ConfigQuant)
            stock_data = stock_data.merge(category,
                                          on=[self.dateField, self.codeField])
        elif self.categoryField == 'area':
            stock_data = stock_data.merge(stock_area, on=self.codeField)
        elif self.categoryField == 'market':
            # classify by code prefix: '60' -> SH main board, '002' -> SME board,
            # '300' -> growth board, otherwise SZ main board
            stock_data[self.categoryField] = stock_data[self.codeField].apply(
                lambda x: 'SH'
                if x[:2] == '60' else ('SMEB' if x[:3] == '002' else
                                       ('GEB' if x[:3] == '300' else 'SZ')))
        elif self.categoryField == 'all':
            pass

        return stock_data
Example #7
    def runFull(self):
        # get total code list
        tmp_state = 'select distinct %s from %s' % (self.codeField,
                                                    self.sourceTableName)
        code_list = readDB(tmp_state, ConfigQuant).values
        code_list = code_list.T[0]

        # get trade date list from calendar
        tmp_state = 'select `date` from %s' % self.calendarTableName
        tradedates = readDB(tmp_state, ConfigQuant)['date']

        # calculate the number of code chunks
        loop_num = int(code_list.size / self.chunkSize)
        if code_list.size > loop_num * self.chunkSize:
            loop_num = loop_num + 1

        if self.isMultiProcess:  # use multiprocessing
            # register pool
            pool = Pool(processes=self.processNum)
            # fetch and process data from sql by chunk
            for i in range(loop_num):
                tmp_code = code_list[i * self.chunkSize:(i + 1) *
                                     self.chunkSize]
                tmp_code_str = list(map(lambda x: "'%s'" % x, tmp_code))
                tmp_range = ','.join(tmp_code_str)
                tmp_state = self.state + "`%s` in (%s)" % (self.codeField,
                                                           tmp_range)
                dataO = readDB(tmp_state, ConfigQuant)
                dataO.loc[:, self.reportPeriodField] = dataO[
                    self.yearField] * 100 + dataO[
                        self.seasonField]  # combine report year and season
                dataO = dataO.drop_duplicates(
                    [self.codeField, self.reportPeriodField])
                dataO = dataO.sort_values(
                    self.reportPeriodField)  # sort by report period

                # process chunk data
                pool_results = []
                for code in tmp_code:
                    tmp_data = dataO.loc[dataO[self.codeField] ==
                                         code]  # dataO already sorted by report period

                    if tmp_data.empty:
                        continue

                    # multiprocessing
                    tmp_procs = pool.apply_async(self.coreComputation,
                                                 (code, tmp_data, tradedates))
                    pool_results.append(tmp_procs)

                # get result from the process pool
                data_tot_result = pd.DataFrame([])
                for tmp_procs in pool_results:
                    data_result = tmp_procs.get()
                    data_tot_result = data_tot_result.append(data_result)

                # add timestamp
                data_tot_result['time_stamp'] = datetime.now()

                if data_tot_result.empty:
                    continue

                # dump chunk data into sql
                writeDB(self.targetTableName, data_tot_result, ConfigQuant,
                        self.if_exist)
                self.if_exist = 'append'

            pool.close()
        else:  # not multiprocess
            # fetch and process data from sql by chunk
            for i in range(loop_num):
                tmp_code = code_list[i * self.chunkSize:(i + 1) *
                                     self.chunkSize]
                tmp_code_str = list(map(lambda x: "'%s'" % x, tmp_code))
                tmp_range = ','.join(tmp_code_str)
                tmp_state = self.state + "`%s` in (%s)" % (self.codeField,
                                                           tmp_range)
                dataO = readDB(tmp_state, ConfigQuant)
                dataO.loc[:, self.reportPeriodField] = dataO[
                    self.yearField] * 100 + dataO[
                        self.seasonField]  # combine report year and season
                dataO = dataO.drop_duplicates(
                    [self.codeField, self.reportPeriodField])
                # dataO = dataO.sort_values(self.reportPeriodField)  # sort by report period

                data_tot_result = pd.DataFrame([])
                for code in tmp_code:
                    tmp_data = dataO.loc[dataO[self.codeField] == code]
                    tmp_data = tmp_data.sort_values(
                        self.reportPeriodField)  # sort by report period

                    if tmp_data.empty:
                        continue

                    data_result = self.coreComputation(code, tmp_data,
                                                       tradedates)

                    data_tot_result = data_tot_result.append(data_result)

                # add timestamp
                data_tot_result['time_stamp'] = datetime.now()

                if data_tot_result.empty:
                    continue

                # dump chunk data into sql
                writeDB(self.targetTableName, data_tot_result, ConfigQuant,
                        self.if_exist)
                self.if_exist = 'append'
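
The two-step chunk-count computation repeated in these methods is a ceiling division; assuming nothing depends on the intermediate value, it can be written directly:

import math

code_size, chunk_size = 1051, 100  # example values
loop_num = math.ceil(code_size / chunk_size)  # same result as int(size / chunk) plus the correction
assert loop_num == 11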
Example #8
    def runIncrm(self):
        # fetch and process all incremental data from sql
        dataO = readDB(self.state, ConfigQuant)
        dataO.loc[:, self.reportPeriodField] = dataO[
            self.yearField] * 100 + dataO[self.seasonField]
        dataO = dataO.drop_duplicates([self.codeField, self.reportPeriodField])
        # dataO = dataO.sort_values(self.reportPeriodField) # sort by date

        # get calendar
        tmp_state = "select `date` from `%s`;" % self.calendarTableName
        trade_calendar = readDB(tmp_state, ConfigQuant)
        trade_calendar = trade_calendar['date']

        # get latest time stamp for each stock in the target table
        tmp_state = "select `%s`, max(`%s`) as %s from `%s` group by `%s`" % (
            self.codeField, self.timeStampField, self.latestTimeStamp,
            self.targetTableName, self.codeField)
        target_latest_time_stamp = readDB(tmp_state, ConfigQuant)
        target_latest_time_stamp = target_latest_time_stamp.set_index(
            self.codeField)

        # get the latest trade date of the data in target table
        tmp_state = "select max(`%s`) from `%s`" % (self.dateField,
                                                    self.targetTableName)
        target_latest_trade_date = readDB(tmp_state, ConfigQuant)
        target_latest_trade_date = target_latest_trade_date.iloc[0, 0]

        # process incremental data
        code_list = dataO[self.codeField].unique()

        data_tot_result = pd.DataFrame([])
        # use multiprocessing to speed up the computation
        if self.isMultiProcess:
            pool = Pool(processes=self.processNum)
            pool_results = []
            pool_result_codes = []  # remember which code each async result belongs to
            pool_data_first_date = []
            no_update_code_list = []
            # build pool
            for code in code_list:
                tmp_data = dataO.loc[dataO[self.codeField] == code]
                tmp_data = tmp_data.sort_values(
                    self.reportPeriodField)  #  sorted by report period

                if tmp_data.empty:
                    continue

                # compare the target table's latest time stamp with the raw data, and delete obsolete rows if any exist
                is_new_data = False
                try:
                    tmp_target_latest_time_stamp = target_latest_time_stamp.loc[
                        code, self.latestTimeStamp]
                    tmp_data_source_new = tmp_data.loc[tmp_data[
                        self.timeStampField] >= tmp_target_latest_time_stamp]
                    tmp_data_source_unexpanded = tmp_data.loc[tmp_data[
                        self.releaseDateField] > target_latest_trade_date]
                    tmp_data_source_new = tmp_data_source_new.append(
                        tmp_data_source_unexpanded)
                    tmp_data_source_new = tmp_data_source_new.drop_duplicates(
                        [self.codeField, self.yearField, self.seasonField])

                    tmp_data_source_new = tmp_data_source_new.loc[
                        ~tmp_data_source_new[self.releaseDateField].isnull()]

                    if not tmp_data_source_new.empty:  # new source data makes some target rows obsolete
                        data_new_first_data = tmp_data_source_new[
                            self.releaseDateField].min(
                            )  # earliest release date among the new updates

                        if isinstance(data_new_first_data, str):  # min() yields NaN if all release dates are missing
                            is_new_data = True
                            deleteObsoleteDataFromDB(
                                code, data_new_first_data, self.dateField,
                                self.codeField, self.targetTableName,
                                ConfigQuant
                            )  # delete obsolete rows dated on or after the earliest new release date
                except KeyError:
                    is_new_data = True
                    data_new_first_data = '2007-01-01'  # this stock code is new to the target table

                if is_new_data:  # values updated, or code entirely new
                    tmp_result = pool.apply_async(
                        self.coreComputation, (code, tmp_data, trade_calendar))
                    pool_results.append(tmp_result)
                    pool_result_codes.append(code)
                    data_new_first_data = min(
                        data_new_first_data, trade_calendar[
                            trade_calendar > target_latest_trade_date].iloc[0])
                    pool_data_first_date.append(data_new_first_data)
                else:  # no new data from source table to update target table
                    no_update_code_list.append(code)

            # get results from the pool
            for tmp_result, tmp_first_date, tmp_code in zip(
                    pool_results, pool_data_first_date, pool_result_codes):
                data_result = tmp_result.get()
                data_result = data_result.loc[
                    data_result[self.dateField] >=
                    tmp_first_date]  # slice data from the earliest new release date on
                print('%s regenerated %d rows' % (tmp_code, data_result.shape[0]))
                data_tot_result = data_tot_result.append(data_result)

            # replicate the latest records for codes that were not updated
            replicate_records = self.replicateLatestRecord(
                no_update_code_list, trade_calendar, target_latest_trade_date)
            data_tot_result = data_tot_result.append(replicate_records)

        else:  # single process
            no_update_code_list = []
            for code in code_list:
                tmp_data = dataO.loc[dataO[self.codeField] == code]
                tmp_data = tmp_data.sort_values(
                    self.reportPeriodField)  # sorted by report period

                if tmp_data.empty:
                    continue

                # compare the target table's latest time stamp with the raw data, and delete obsolete rows if any exist
                has_new_data = False
                try:
                    tmp_target_latest_time_stamp = target_latest_time_stamp.loc[
                        code, self.latestTimeStamp]
                    tmp_data_source_new = tmp_data[tmp_data[
                        self.timeStampField] >= tmp_target_latest_time_stamp]
                    tmp_data_source_unexpanded = tmp_data[tmp_data[
                        self.releaseDateField] > target_latest_trade_date]
                    tmp_data_source_new = tmp_data_source_new.append(
                        tmp_data_source_unexpanded)
                    tmp_data_source_new = tmp_data_source_new.drop_duplicates(
                        [self.codeField, self.yearField, self.seasonField])

                    if not tmp_data_source_new.empty:  # new source data makes some target rows obsolete
                        tmp_data_new_first_data = tmp_data_source_new[
                            self.releaseDateField].min(
                            )  # earliest release date among the new updates
                        if isinstance(tmp_data_new_first_data, str):  # min() yields NaN if all release dates are missing
                            has_new_data = True
                            deleteObsoleteDataFromDB(
                                code, tmp_data_new_first_data, self.dateField,
                                self.codeField, self.targetTableName,
                                ConfigQuant
                            )  # delete obsolete rows dated on or after the earliest new release date
                except KeyError:
                    has_new_data = True  # this stock code is new to the target table
                    tmp_data_new_first_data = '2007-01-01'

                if has_new_data:
                    data_result = self.coreComputation(code, tmp_data,
                                                       trade_calendar)
                    tmp_data_new_first_data = min(
                        tmp_data_new_first_data, trade_calendar[
                            trade_calendar > target_latest_trade_date].iloc[0])
                    data_result = data_result.loc[
                        data_result[self.dateField] >= tmp_data_new_first_data]
                    print('%s regenerated %d rows' %
                          (code, data_result.shape[0]))
                    data_tot_result = data_tot_result.append(data_result)
                else:  # no new data from source table to update target table
                    no_update_code_list.append(code)

            # replicate the latest records for codes that were not updated
            replicate_records = self.replicateLatestRecord(
                no_update_code_list, trade_calendar, target_latest_trade_date)
            data_tot_result = data_tot_result.append(replicate_records)

        if not data_tot_result.empty:
            # add timestamp
            data_tot_result['time_stamp'] = datetime.now()

            # dump chunk data into sql
            writeDB(self.targetTableName, data_tot_result, ConfigQuant,
                    self.if_exist)
            self.if_exist = 'append'
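
deleteObsoleteDataFromDB is called above but not defined in this section. From its call sites, it plausibly deletes one code's rows dated on or after the earliest newly released report so they can be regenerated; a hedged sketch, reusing the assumed makeEngine helper from the Example #1 sketch:

from sqlalchemy import text

def deleteObsoleteDataFromDB(code, first_date, date_field, code_field,
                             table_name, config):
    # hypothetical: remove the rows of one stock code dated on or after
    # first_date, so regenerated data can be appended without duplicates
    sql = "delete from `%s` where `%s` = '%s' and `%s` >= '%s'" % (
        table_name, code_field, code, date_field, first_date)
    with makeEngine(config).begin() as conn:
        conn.execute(text(sql))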
    def run(self, startDate='2007-01-01'):
        self.prepareData()
        if self.stockState == '':  # already the latest data
            return

        # get total date list (horizontally)
        tmp_state = 'select distinct %s from %s' % (self.dateField,
                                                    self.sourceStockTableName)
        date_list = readDB(tmp_state, ConfigQuant).values
        date_list = date_list.T[0]
        if self.last_update_date == '':
            date_list = date_list[date_list > startDate]
        else:
            date_list = date_list[date_list > self.last_update_date]

        # ****************** NOTE: area is static across all dates, i.e. it uses future data; needs to be fixed
        if self.categoryField == 'area':
            tmp_state = "SELECT `%s`, `%s`  FROM `%s`;" % (
                self.codeField, self.categoryField,
                self.sourceCategoryTableName)
            stock_area = readDB(tmp_state, ConfigQuant)
        else:
            stock_area = None

        # calculate the number of date chunks
        loop_num = int(date_list.size / self.chunkSize)
        if date_list.size > loop_num * self.chunkSize:
            loop_num = loop_num + 1

        # fetch and process data from sql by chunk
        for i in range(loop_num):
            tmp_date = date_list[i * self.chunkSize:(i + 1) * self.chunkSize]
            tmp_date_str = list(map(lambda x: "'%s'" % x, tmp_date))
            tmp_range = ','.join(tmp_date_str)
            # read stock return data
            tmp_state = self.stockState + "`%s` in (%s)" % (self.dateField,
                                                            tmp_range)
            dataStockO = readDB(tmp_state, ConfigQuant)
            # read index return data
            tmp_state = self.indexState + "`%s` in (%s)" % (self.dateField,
                                                            tmp_range)
            dataIndexO = readDB(tmp_state, ConfigQuant)

            dataStockO = dataStockO.drop_duplicates(
                [self.dateField, self.codeField])
            if self.categoryField == 'HS300':
                dataIndexO = dataIndexO.drop_duplicates(self.dateField)
            else:
                dataIndexO = dataIndexO.drop_duplicates(
                    [self.dateField, self.categoryField])
            # dataStockO = dataStockO.fillna(0)
            # dataIndexO = dataIndexO.fillna(0)

            data_result = self.getStockCategory(dataStockO, dataIndexO,
                                                tmp_range, stock_area)
            new_fields = list(
                map(lambda x: 'EXCESSIVE_%s' % x, self.retSeriesField))
            for field in zip(new_fields, self.retSeriesField):
                data_result[field[0]] = data_result[field[1]] - data_result[
                    field[1] + '_index']

            tot_fields = [self.dateField, self.codeField]
            tot_fields.extend(new_fields)
            data_result = data_result[tot_fields]

            # add timestamp
            print(self.targetTableName)
            print(data_result.shape)
            data_result.loc[:, 'time_stamp'] = datetime.now()

            # dump chunk data into sql
            writeDB(self.targetTableName, data_result, ConfigQuant,
                    self.if_exist)
            self.if_exist = 'append'
    def run(self, startDate='2007-01-01'):
        self.prepareData()
        if self.stockState == '':  # already the latest data
            return

        # get total date list (horizontally)
        tmp_state = 'select distinct %s from %s' % (
            self.dateField, self.sourceStockTechIndiTableName)
        date_list = readDB(tmp_state, ConfigQuant).values
        date_list = date_list.T[0]
        if self.last_update_date == '':
            date_list = date_list[date_list > startDate]
        else:
            date_list = date_list[date_list > self.last_update_date]

        # sort
        date_list = np.sort(date_list)

        # ****************** NOTE: area is static across all dates, i.e. it uses future data; needs to be fixed
        if self.categoryField == 'area':
            tmp_state = "SELECT `%s`, `%s`  FROM `%s`;" % (
                self.codeField, self.categoryField,
                self.sourceCategoryTableName)
            stock_area = readDB(tmp_state, ConfigQuant)
        else:
            stock_area = None

        # calculate the number of date chunks
        loop_num = int(date_list.size / self.chunkSize)
        if date_list.size > loop_num * self.chunkSize:
            loop_num = loop_num + 1

        # fetch and process data from sql by chunk
        for i in range(loop_num):
            tmp_date_set = date_list[i * self.chunkSize:(i + 1) *
                                     self.chunkSize]
            tmp_date_str = list(map(lambda x: "'%s'" % x, tmp_date_set))
            tmp_range = ','.join(tmp_date_str)
            # read stock return data
            tmp_state = self.stockState + "`%s` in (%s)" % (self.dateField,
                                                            tmp_range)
            dataStockO = readDB(tmp_state, ConfigQuant)

            dataStockO = dataStockO.drop_duplicates(
                [self.dateField, self.codeField])
            dataStockO.loc[:, self.retSeriesField] = dataStockO[
                self.retSeriesField].astype('float')
            # dataStockO = dataStockO.fillna(0)

            data_result = self.getMarketRiseRatio(dataStockO, tmp_range,
                                                  stock_area, tmp_date_set)
            new_fields = list(
                map(lambda x: 'RATIO_%s' % x, self.retSeriesField))
            rename_dict = {}
            for field in zip(new_fields, self.retSeriesField):
                rename_dict[field[1]] = field[0]
            data_result = data_result.rename(columns=rename_dict)

            # add timestamp
            data_result['time_stamp'] = datetime.now()

            # dump chunk data into sql
            writeDB(self.targetTableName, data_result, ConfigQuant,
                    self.if_exist)
            self.if_exist = 'append'
Example #11
    def runFull(self):
        # get total code list
        tmp_state = 'select distinct %s from %s' % (self.codeField,
                                                    self.sourceTableName)
        code_list = readDB(tmp_state, ConfigQuant).values
        code_list = code_list.T[0]

        # calculate the number of code chunks
        loop_num = int(code_list.size / self.chunkSize)
        if code_list.size > loop_num * self.chunkSize:
            loop_num = loop_num + 1

        if self.isMultiProcess:  # use multiprocessing
            # register pool
            pool = Pool(processes=self.processNum)
            # fetch and process data from sql by chunk
            for i in range(loop_num):
                tmp_code = code_list[i * self.chunkSize:(i + 1) *
                                     self.chunkSize]
                tmp_code_str = list(map(lambda x: "'%s'" % x, tmp_code))
                tmp_range = ','.join(tmp_code_str)
                tmp_state = self.state + "`%s` in (%s)" % (self.codeField,
                                                           tmp_range)
                dataO = readDB(tmp_state, ConfigQuant)
                dataO = dataO.drop_duplicates([self.dateField, self.codeField])
                dataO = dataO.sort_values(self.dateField)  # sort by date
                # dataO = dataO.fillna(method='ffill') # **** fill with pre-close, not simply forward fill
                for field in [
                        self.openField, self.highField, self.lowField,
                        self.closeField, self.volumeField
                ]:
                    dataO.loc[:, field] = dataO[field].astype(
                        'float')  # change data type
                if self.turnoverField == '':  # no turnover field in the source; use a NaN placeholder
                    self.turnoverField = 'turnover'
                if self.turnoverField not in dataO.columns:  # also covers later chunks, after the field name is set
                    dataO[self.turnoverField] = np.nan

                # process chunk data
                pool_results = []
                for code in tmp_code:
                    tmp_data = dataO.loc[dataO[self.codeField] ==
                                         code]  # dataO already sorted by date

                    if tmp_data.empty:
                        continue

                    tmp_procs = pool.apply_async(self.coreComputationFull,
                                                 (tmp_data, ))
                    pool_results.append(tmp_procs)
                    # data_result = self.coreComputationFull(tmp_data)

                # get result from the pool
                data_tot_result = pd.DataFrame([])
                for tmp_procs in pool_results:
                    data_result = tmp_procs.get()
                    data_tot_result = data_tot_result.append(data_result)

                # # trim data for increment
                # if self.last_update_date != '':
                #     data_tot_result = data_tot_result.loc[data_tot_result[self.dateField] > self.last_update_date]

                # add timestamp
                data_tot_result['time_stamp'] = datetime.now()

                if data_tot_result.empty:
                    continue

                # dump chunk data into sql
                writeDB(self.targetTableName, data_tot_result, ConfigQuant,
                        self.if_exist)
                self.if_exist = 'append'
        else:
            # fetch and process data from sql by chunk
            for i in range(loop_num):
                tmp_code = code_list[i * self.chunkSize:(i + 1) *
                                     self.chunkSize]
                tmp_code_str = list(map(lambda x: "'%s'" % x, tmp_code))
                tmp_range = ','.join(tmp_code_str)
                tmp_state = self.state + "`%s` in (%s)" % (self.codeField,
                                                           tmp_range)
                dataO = readDB(tmp_state, ConfigQuant)
                dataO = dataO.drop_duplicates([self.dateField, self.codeField])
                dataO = dataO.sort_values(self.dateField)  # sort by date
                # dataO = dataO.fillna(method='ffill') # **** fill with pre-close, not simply forward fill
                for field in [
                        self.openField, self.highField, self.lowField,
                        self.closeField, self.volumeField
                ]:
                    dataO.loc[:, field] = dataO[field].astype(
                        'float')  # change data type
                if self.turnoverField == '':  # no turnover field in the source; use a NaN placeholder
                    self.turnoverField = 'turnover'
                if self.turnoverField not in dataO.columns:  # also covers later chunks, after the field name is set
                    dataO[self.turnoverField] = np.nan

                # process chunk data
                data_tot_result = pd.DataFrame([])
                for code in tmp_code:
                    tmp_data = dataO.loc[dataO[self.codeField] ==
                                         code]  # dataO already sorted by date

                    if tmp_data.empty:
                        continue

                    # data_result = tmp_data[[self.dateField, self.codeField]]
                    #
                    # # MA, MA_DIFF...
                    # tmp_df = priceTechnicalIndicator(tmp_data[self.closeField], self.lags, '')
                    # data_result = data_result.join(tmp_df)
                    #
                    # # ADX, CCI ...
                    # tmp_df = priceTechnicalIndicatorOHLCV(tmp_data[self.openField], tmp_data[self.highField], tmp_data[self.lowField],
                    #                                       tmp_data[self.closeField], tmp_data[self.volumeField])
                    # data_result = data_result.join(tmp_df)
                    #
                    # # VMA, VMA_DIFF..., OBV, VOLUME_RATIO...
                    # tmp_df = volumeTechnicalIndicators(tmp_data[self.closeField], tmp_data[self.volumeField], self.lags, 'V')
                    # data_result = data_result.join(tmp_df)
                    #
                    # # turnover rolling sum, amplitude
                    # tmp_df = priceTechnicalIndicatorRollingSum(tmp_data[self.openField], tmp_data[self.highField],
                    #                                            tmp_data[self.lowField],
                    #                                            tmp_data[self.closeField], tmp_data[self.volumeField],
                    #                                            tmp_data[self.turnoverField], self.lags)
                    # data_result = data_result.join(tmp_df)
                    data_result = self.coreComputationFull(tmp_data)

                    data_tot_result = data_tot_result.append(data_result)

                # # trim data for increment
                # if self.last_update_date != '':
                #     data_tot_result = data_tot_result.loc[data_tot_result[self.dateField] > self.last_update_date]

                # add timestamp
                data_tot_result['time_stamp'] = datetime.now()

                if data_tot_result.empty:
                    continue

                # dump chunk data into sql
                writeDB(self.targetTableName, data_tot_result, ConfigQuant,
                        self.if_exist)
                self.if_exist = 'append'
Example #12
    def runIncrm(self):
        # fetch and process all incremental data from sql
        dataO = readDB(self.state, ConfigQuant)
        dataO = dataO.drop_duplicates([self.dateField, self.codeField])
        dataO = dataO.sort_values(self.dateField)  # sort by date
        # dataO = dataO.fillna(method='ffill') # **** fill with pre-close, not simply forward fill
        # for field in [self.openField, self.highField, self.lowField, self.closeField, self.volumeField]:
        #     dataO.loc[:, field] = dataO[field].astype('float') # change data type
        if self.turnoverField == '':  # no turnover field, fill with all nan
            self.turnoverField = 'turnover'
            dataO[self.turnoverField] = np.nan

        # process incremental data
        code_list = dataO[self.codeField].unique()

        data_tot_result = pd.DataFrame([])
        # use multiprocessing to speed up the computation
        if self.isMultiProcess:
            pool = Pool(processes=self.processNum)
            pool_results = []
            # build pool
            for code in code_list:
                tmp_data = dataO.loc[dataO[self.codeField] ==
                                     code]  # dataO already sorted by date

                if tmp_data.empty:
                    continue

                tmp_result = pool.apply_async(self.coreComputation,
                                              (tmp_data, ))
                pool_results.append(tmp_result)

            # get result from the pool
            for tmp_result in pool_results:
                data_result = tmp_result.get()
                data_tot_result = data_tot_result.append(data_result)
        else:
            for code in code_list:
                tmp_data = dataO.loc[dataO[self.codeField] ==
                                     code]  # dataO already sorted by date

                if tmp_data.empty:
                    continue

                data_result = self.coreComputation(tmp_data)
                data_tot_result = data_tot_result.append(data_result)

                # data_result = tmp_data[[self.dateField, self.codeField]]
                #
                # # MA, MA_DIFF...
                # tmp_df = priceTechnicalIndicator(tmp_data[self.closeField], self.lags, '')
                # data_result = data_result.join(tmp_df)
                #
                # # ADX, CCI ...
                # tmp_df = priceTechnicalIndicatorOHLCV(tmp_data[self.openField], tmp_data[self.highField], tmp_data[self.lowField],
                #                                       tmp_data[self.closeField], tmp_data[self.volumeField])
                # data_result = data_result.join(tmp_df)
                #
                # # OBV...
                # tmp_df = volumeTechnicalIndicators(tmp_data[self.closeField], tmp_data[self.volumeField], self.lags, 'V')
                # data_result = data_result.join(tmp_df)
                #
                # # turnover rolling sum, amplitude
                # tmp_df = priceTechnicalIndicatorRollingSum(tmp_data[self.openField], tmp_data[self.highField],
                #                                            tmp_data[self.lowField],
                #                                            tmp_data[self.closeField], tmp_data[self.volumeField],
                #                                            tmp_data[self.turnoverField], self.lags)
                # data_result = data_result.join(tmp_df)
                #
                # # trim data for increment
                # data_result = data_result.loc[data_result[self.dateField] > self.last_update_date]
                #
                # data_tot_result = data_tot_result.append(data_result)

        # if self.last_update_date != '':
        #     data_tot_result = data_tot_result.loc[data_tot_result[self.dateField] > self.last_update_date]

        if not data_tot_result.empty:
            # add timestamp
            data_tot_result['time_stamp'] = datetime.now()

            # dump chunk data into sql
            writeDB(self.targetTableName, data_tot_result, ConfigQuant,
                    self.if_exist)
            self.if_exist = 'append'
    def runFull(self):
        # get total code list
        tmp_state = 'select distinct %s from %s' % (self.codeField,
                                                    self.sourceTableName)
        code_list = readDB(tmp_state, ConfigQuant).values
        code_list = code_list.T[0]

        # get total trade dates
        tmp_state = "select `%s` from %s" % (self.dateField,
                                             self.calendarTableName)
        trade_calendar = readDB(tmp_state, ConfigQuant)
        trade_calendar = trade_calendar[self.dateField].values
        trade_calendar = trade_calendar[trade_calendar >= self.init_date]

        # calculate the number of code chunks for the time-series pass
        code_chunk_num = int(code_list.size / self.chunkSize)
        if code_list.size > code_chunk_num * self.chunkSize:
            code_chunk_num = code_chunk_num + 1

        # calculate the number of date chunks for the time-horizon pass
        date_chunk_num = int(trade_calendar.size / self.chunkSize)
        if trade_calendar.size > date_chunk_num * self.chunkSize:
            date_chunk_num = date_chunk_num + 1

        if self.isMultiProcess:  # use multiprocessing
            # register pool
            pool = Pool(processes=self.processNum)

            # calculate time series derivative features
            data_tot_result_time_series = pd.DataFrame([])
            for i in range(code_chunk_num):
                tmp_code = code_list[i * self.chunkSize:(i + 1) *
                                     self.chunkSize]
                tmp_code_str = list(map(lambda x: "'%s'" % x, tmp_code))
                tmp_range = ','.join(tmp_code_str)
                tmp_state = self.state + "`%s` in (%s)" % (self.codeField,
                                                           tmp_range)
                dataO = readDB(tmp_state, ConfigQuant)
                dataO = dataO.drop_duplicates([self.dateField, self.codeField])
                # dataO = dataO.sort_values(self.dateField)  # sort by date

                # process chunk data
                pool_results = []
                for code in tmp_code:
                    tmp_data = dataO.loc[dataO[self.codeField] == code]
                    tmp_data = tmp_data.sort_values(
                        self.dateField)  # sort each code's rows by date

                    if tmp_data.empty:
                        continue

                    # multiprocessing
                    tmp_procs = pool.apply_async(
                        self.coreComputationTimeSeries, (tmp_data, ))
                    pool_results.append(tmp_procs)
                    # data_result = self.coreComputationFull(tmp_data)

                # get result from the process pool
                for tmp_procs in pool_results:
                    data_result = tmp_procs.get()
                    data_tot_result_time_series = data_tot_result_time_series.append(
                        data_result)

            # calculate time horizon derivative features
            data_tot_result_time_horizon = pd.DataFrame([])
            for i in range(date_chunk_num):
                tmp_date = trade_calendar[i * self.chunkSize:(i + 1) *
                                          self.chunkSize]
                tmp_date_str = list(map(lambda x: "'%s'" % x, tmp_date))
                tmp_range = ','.join(tmp_date_str)
                tmp_state = self.state + "`%s` in (%s)" % (self.dateField,
                                                           tmp_range)
                dataO = readDB(tmp_state, ConfigQuant)
                dataO = dataO.drop_duplicates([self.dateField, self.codeField])
                # dataO = dataO.sort_values(self.dateField)  # sort by date

                # process chunk data
                pool_results = []
                for single_date in tmp_date:
                    tmp_data = dataO.loc[
                        dataO[self.dateField] ==
                        single_date]  # single-date slice; row order is irrelevant

                    if tmp_data.empty:
                        continue

                    # multiprocessing
                    tmp_procs = pool.apply_async(
                        self.coreComputationTimeHorizon, (tmp_data, ))
                    pool_results.append(tmp_procs)
                    # data_result = self.coreComputationFull(tmp_data)

                # get result from the process pool
                for tmp_procs in pool_results:
                    data_result = tmp_procs.get()
                    data_tot_result_time_horizon = data_tot_result_time_horizon.append(
                        data_result)

            # combine time series and time horizon features
            data_tot_result = data_tot_result_time_series.merge(
                data_tot_result_time_horizon,
                on=[self.dateField, self.codeField],
                how='outer')

            # add timestamp
            data_tot_result.loc[:, 'time_stamp'] = datetime.now()

            # dump all data into database
            writeDB(self.targetTableName, data_tot_result, ConfigQuant,
                    self.if_exist)
            self.if_exist = 'append'

            pool.close()
        else:
            # calculate time series derivative features
            data_tot_result_time_series = pd.DataFrame([])
            for i in range(code_chunk_num):
                tmp_code = code_list[i * self.chunkSize:(i + 1) *
                                     self.chunkSize]
                tmp_code_str = list(map(lambda x: "'%s'" % x, tmp_code))
                tmp_range = ','.join(tmp_code_str)
                tmp_state = self.state + "`%s` in (%s)" % (self.codeField,
                                                           tmp_range)
                dataO = readDB(tmp_state, ConfigQuant)
                dataO = dataO.drop_duplicates([self.dateField, self.codeField])
                # dataO = dataO.sort_values(self.dateField)  # sort by date

                # process chunk data
                for code in tmp_code:
                    tmp_data = dataO.loc[dataO[self.codeField] == code]
                    tmp_data = tmp_data.sort_values(
                        self.dateField)  # sort each code's rows by date

                    if tmp_data.empty:
                        continue

                    data_result = self.coreComputationTimeSeries(tmp_data)
                    data_tot_result_time_series = data_tot_result_time_series.append(
                        data_result)

            # calculate time horizon derivative features
            data_tot_result_time_horizon = pd.DataFrame([])
            for i in range(date_chunk_num):
                tmp_date = trade_calendar[i * self.chunkSize:(i + 1) *
                                          self.chunkSize]
                tmp_date_str = list(map(lambda x: "'%s'" % x, tmp_date))
                tmp_range = ','.join(tmp_date_str)
                tmp_state = self.state + "`%s` in (%s)" % (self.dateField,
                                                           tmp_range)
                dataO = readDB(tmp_state, ConfigQuant)
                dataO = dataO.drop_duplicates([self.dateField, self.codeField])
                # dataO = dataO.sort_values(self.dateField)  # sort by date

                # process chunk data
                for single_date in tmp_date:
                    tmp_data = dataO.loc[
                        dataO[self.dateField] ==
                        single_date]  # single-date slice; row order is irrelevant

                    if tmp_data.empty:
                        continue

                    data_result = self.coreComputationTimeHorizon(tmp_data)
                    data_tot_result_time_horizon = data_tot_result_time_horizon.append(
                        data_result)

            # combine time series and time horizon features
            data_tot_result = data_tot_result_time_series.merge(
                data_tot_result_time_horizon,
                on=[self.dateField, self.codeField],
                how='outer')

            # add timestamp
            data_tot_result.loc[:, 'time_stamp'] = datetime.now()

            # dump chunk data into database
            writeDB(self.targetTableName, data_tot_result, ConfigQuant,
                    self.if_exist)
            self.if_exist = 'append'
    def runIncrm(self):
        # fetch and process all incremental data from sql
        dataO = readDB(self.state, ConfigQuant)
        dataO = dataO.drop_duplicates([self.dateField, self.codeField])
        # dataO = dataO.sort_values(self.dateField) # sort by date

        # process incremental data
        code_list = dataO[self.codeField].unique()
        trade_calendar = dataO[self.dateField].unique()
        trade_calendar = trade_calendar[trade_calendar > self.last_update_date]

        # ==== calculate features
        if self.isMultiProcess:
            # build pool
            pool = Pool(processes=self.processNum)

            # calculate time series features
            data_tot_result_time_series = pd.DataFrame([])
            pool_results = []
            for code in code_list:
                tmp_data = dataO.loc[dataO[self.codeField] == code]
                tmp_data = tmp_data.sort_values(self.dateField)  # sort by date

                if tmp_data.empty:
                    continue

                tmp_result = pool.apply_async(self.coreComputationTimeSeries,
                                              (tmp_data, ))
                pool_results.append(tmp_result)

            # get result from the pool (time series)
            for tmp_result in pool_results:
                data_result = tmp_result.get()
                data_tot_result_time_series = data_tot_result_time_series.append(
                    data_result)

            # calculate time horizon features
            data_tot_result_time_horizon = pd.DataFrame([])
            pool_results = []
            for trade_date in trade_calendar:
                tmp_data = dataO.loc[dataO[self.dateField] ==
                                     trade_date]  # no need to sort date

                if tmp_data.empty:
                    continue

                tmp_result = pool.apply_async(self.coreComputationTimeHorizon,
                                              (tmp_data, ))
                pool_results.append(tmp_result)

            # get result from the pool (time horizon)
            for tmp_result in pool_results:
                data_result = tmp_result.get()
                data_tot_result_time_horizon = data_tot_result_time_horizon.append(
                    data_result)

            pool.close()

        # single process
        else:
            # calculate time series features
            data_tot_result_time_series = pd.DataFrame([])
            for code in code_list:
                tmp_data = dataO.loc[dataO[self.codeField] == code]
                tmp_data = tmp_data.sort_values(
                    self.dateField)  # sorted by date

                if tmp_data.empty:
                    continue

                data_result = self.coreComputationTimeSeries(tmp_data)
                data_tot_result_time_series = data_tot_result_time_series.append(
                    data_result)

            # calculate time horizon features
            data_tot_result_time_horizon = pd.DataFrame([])
            for trade_date in trade_calendar:
                tmp_data = dataO.loc[
                    dataO[self.dateField] ==
                    trade_date]  # single-date slice; row order is irrelevant

                if tmp_data.empty:
                    continue

                data_result = self.coreComputationTimeHorizon(tmp_data)
                data_tot_result_time_horizon = data_tot_result_time_horizon.append(
                    data_result)

        # write to sql (both multiprocess and single process)
        data_tot_result = data_tot_result_time_series.merge(
            data_tot_result_time_horizon,
            on=[self.dateField, self.codeField],
            how='outer')  # combine time series and time horizon features
        data_tot_result = data_tot_result.loc[data_tot_result[
            self.dateField] > self.last_update_date]  # truncate data

        if not data_tot_result.empty:
            # add timestamp
            data_tot_result['time_stamp'] = datetime.now()

            # dump chunk data into sql
            writeDB(self.targetTableName, data_tot_result, ConfigQuant,
                    self.if_exist)
    def run(self, startDate='2007-01-01'):
        self.prepareData()
        if self.basicState == '':  # already the latest data
            return

        # get total date list (horizontally)
        tmp_state = 'select distinct %s from %s' % (self.dateField,
                                                    self.sourceBasicTableName)
        date_list = readDB(tmp_state, ConfigQuant).values
        date_list = date_list.T[0]
        if self.last_update_date == '':
            date_list = date_list[date_list > startDate]
        else:
            date_list = date_list[date_list > self.last_update_date]

        # get area (if category is area) ****************** NOTE: area is static across all dates, i.e. it uses future data; needs to be fixed
        if self.categoryField == 'area':
            tmp_state = "SELECT `%s`, `%s`  FROM `%s`;" % (
                self.codeField, self.categoryField,
                self.sourceCategoryTableName)
            stock_area = readDB(tmp_state, ConfigQuant)
        else:
            stock_area = None

        # calculate the number of date chunks
        loop_num = int(date_list.size / self.chunkSize)
        if date_list.size > loop_num * self.chunkSize:
            loop_num = loop_num + 1

        # fetch and process data from sql by chunk
        for i in range(loop_num):
            tmp_date = date_list[i * self.chunkSize:(i + 1) * self.chunkSize]
            tmp_date_str = list(map(lambda x: "'%s'" % x, tmp_date))
            tmp_range = ','.join(tmp_date_str)
            # read basic data
            tmp_state = self.basicState + "`%s` in (%s)" % (self.dateField,
                                                            tmp_range)
            dataBasicO = readDB(tmp_state, ConfigQuant)
            for field in self.reverseSeriesField:
                dataBasicO[field] = 1. / dataBasicO[field]
            # read technical data
            tmp_state = self.techState + "`%s` in (%s)" % (self.dateField,
                                                           tmp_range)
            dataTechO = readDB(tmp_state, ConfigQuant)
            # note: exec cannot rebind local variables in Python 3, so the
            # deduplication and sort are applied to each frame explicitly
            dataBasicO = dataBasicO.drop_duplicates(
                [self.dateField, self.codeField])
            dataBasicO = dataBasicO.sort_values(self.dateField)  # sort by date
            dataTechO = dataTechO.drop_duplicates(
                [self.dateField, self.codeField])
            dataTechO = dataTechO.sort_values(self.dateField)  # sort by date
            # dataBasicO = dataBasicO.fillna(method='ffill')

            # for field in self.basicSeriesField:
            #     dataBasicO.loc[:, field] = dataBasicO[field].astype('float') # change data type

            # process chunk data
            data_result = pd.DataFrame([])

            # Fundamental data
            uni_dates = dataBasicO[self.dateField].unique()
            dataBasicO = self.getStockCategory(
                dataBasicO, uni_dates, stock_area)  # merge with stock category
            tmp_df = priceOtherIndicatorRanking(dataBasicO, uni_dates,
                                                self.dateField, self.codeField,
                                                self.basicSeriesField,
                                                self.categoryField)  # ranking
            data_result = pd.concat([data_result, tmp_df])

            # Technical data
            uni_dates = dataTechO[self.dateField].unique()
            dataTechO = self.getStockCategory(
                dataTechO, uni_dates, stock_area)  # merge with stock category
            tmp_df = priceOtherIndicatorRanking(dataTechO, uni_dates,
                                                self.dateField, self.codeField,
                                                self.techSeriesField,
                                                self.categoryField)  # ranking
            data_result = data_result.merge(
                tmp_df, on=[self.dateField, self.codeField], how='outer')

            # add timestamp
            data_result['time_stamp'] = datetime.now()

            # dump chunk data into sql
            writeDB(self.targetTableName, data_result, ConfigQuant,
                    self.if_exist)
            self.if_exist = 'append'
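
priceOtherIndicatorRanking is used above but not defined in this section. From its arguments and how the result is merged back on date and code, a plausible shape (the RANK_ prefix and the percentile convention are assumptions) is a per-date, per-category cross-sectional percentile rank:

import pandas as pd

def priceOtherIndicatorRanking(data, uni_dates, date_field, code_field,
                               value_fields, category_field):
    # hypothetical: rank each indicator cross-sectionally within each date
    # (and within each category, when the category column is present);
    # uni_dates is kept only for signature parity with the call sites
    group_keys = [date_field]
    if category_field in data.columns:
        group_keys.append(category_field)
    result = data[[date_field, code_field]].copy()
    for field in value_fields:
        result['RANK_%s' % field] = data.groupby(group_keys)[field].rank(pct=True)
    return result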