Example #1
def main(algo_id=None,
         train_steps=None,
         download_feature_db=None,
         do_preprocessing=None,
         upload_model=None):
    try:
        _start = time.time()

        cfg.load(
            Config(algo_id, train_steps, download_feature_db, do_preprocessing,
                   upload_model))

        cfg.cls_data_formatter = os.path.normpath(
            os.path.join(os.path.dirname(__file__), 'data_formatter.py'))
        cfg.cls_coder = os.path.normpath(
            os.path.join(os.path.dirname(__file__), 'coder.py'))

        Env.init()

        preprocessor = Preprocessor()
        preprocessor.process()

        trainer = Trainer()
        trainer.train()

        Uploader.upload_model()

        log.info("[total] use {} seconds totally".format(time.time() - _start))
    except Exception as e:
        import traceback
        log.info(traceback.format_exc())
        time.sleep(60 * 60 * 24)  # wait to check logs
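A minimal sketch of how this entry point might be invoked; every argument value below is hypothetical and depends on the surrounding project's Config, not on anything shown above.

# Hypothetical invocation; all values are illustrative.
if __name__ == '__main__':
    main(algo_id='dnn',
         train_steps=10000,
         download_feature_db=True,
         do_preprocessing=True,
         upload_model=False)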
Example #2
    def create_feature_columns(self, tf_transform_output):
        """
        Returns feature columns to be used by the model
        """
        # Define the feature columns: this is how the transformed inputs are
        # consumed by the tf model. The outputs of the feature columns are
        # stacked by `tf.feature_column.input_layer`, and the resulting tensor
        # can then be fed into the downstream tf graph (e.g., network layers).
        base_features_columns = []
        for key in self.data_formatter.vocabulary_features:
            fc = tf.feature_column.indicator_column(
                tf.feature_column.categorical_column_with_vocabulary_file(
                    key=key,
                    num_oov_buckets=1,
                    vocabulary_file=tf_transform_output.
                    vocabulary_file_by_name(vocab_filename=key)))
            base_features_columns.append(fc)

        # NUM_INT and NUM_FLOAT features: already converted to numeric values and scaled by tft.
        base_features_columns += [
            tf.feature_column.numeric_column(key, default_value=0.)
            for key in self.data_formatter.number_features
        ]

        log.info('number of feature columns: {}'.format(
            len(base_features_columns)))
        for fc in base_features_columns:
            log.info('feature column {}'.format(fc.name))
        return base_features_columns
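As a hedged sketch of how these columns could be consumed downstream (assuming the TF 1.x feature-column API; the function name, `features` dict, and layer sizes are illustrative):

import tensorflow as tf  # TF 1.x

def model_fn_sketch(features, tf_transform_output, model):
    # Stack the feature-column outputs into one dense input tensor.
    feature_columns = model.create_feature_columns(tf_transform_output)
    net = tf.feature_column.input_layer(features, feature_columns)
    hidden = tf.layers.dense(net, units=128, activation=tf.nn.relu)
    return tf.layers.dense(hidden, units=1)  # logits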
Example #3
    def split_to_shards(self, file_name):
        # print("start to split into shards")

        def split_file(fname_in, fname_out, num_shards):
            # Open one output file per shard; binary mode matches the binary input opened below.
            f_outs = list()
            for i in range(num_shards):
                f_outs.append(
                    open('{}-{:05}-of-{:05}'.format(fname_out, i, num_shards),
                         'wb'))
            f_in = open(fname_in, 'rb')
            r_c = 0  # round-robin shard index
            for line in f_in:
                f_outs[r_c].write(line)
                r_c = (r_c + 1) % num_shards
            for f_out in f_outs:
                f_out.close()
            f_in.close()

        st = time.time()
        assert isinstance(cfg.DATASET_NUM_SHARDS, int)

        splitter = mp.Process(target=split_file,
                              args=(cfg.get_shuffled_file(file_name),
                                    cfg.get_shard_file(file_name),
                                    cfg.DATASET_NUM_SHARDS))
        splitter.start()
        splitter.join()

        # After splitting, remove the original shuffled file.
        # os.remove(self.train_split_fname_out)
        # os.remove(self.eval_split_fname_out)
        log.info(
            'finished splitting the file into several shards, took {:.2f} sec.'
            .format(time.time() - st))
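For illustration, the shard filename pattern used in split_file follows the common TensorFlow sharding convention; with a hypothetical prefix and 8 shards it expands as:

# Illustrative only: shard name produced by the pattern above.
'{}-{:05}-of-{:05}'.format('train.csv', 0, 8)  # -> 'train.csv-00000-of-00008'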
Example #4
    def execute(self, all_shares, start_date, end_date):
        """
        extract all
        the main method of a extractor, which is responsible to get the data from somewhere
        :param all_shares:
        :param start_date:
        :param end_date:
        :return:
        """
        index = 0
        for share in all_shares:
            try:
                index = index + 1
                print(
                    "[SequenceExtractor] processing share #{}".format(index))
                self.extract_one_share(share, start_date, end_date)
            except Exception as e:
                log.info(
                    "[SequenceExtractor] failed to extract share_id={}, start_date={}, end_date={}"
                    .format(share, start_date, end_date))
                log.error("[error]" + traceback.format_exc())
            time.sleep(1)

        # close the sdk once extraction is finished
        self.sdk.close()

        self._validate_data()
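A hedged usage sketch; the extractor's construction is not shown in the source, and the share codes and dates below are illustrative only.

# Hypothetical call; constructor arguments of the extractor are assumed.
extractor = SequenceExtractor()
extractor.execute(all_shares=['601398.SH', '601288.SH'],
                  start_date='20180101',
                  end_date='20190101')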
Example #5
    def _extract_one_share_n_days_price_ratio(self,
                                              share_id,
                                              bar_df,
                                              field="close"):
        """
        uniform method for getting price of HLOC
        :param share_id:
        :param bar_df:
        :param field:
        :return:
        """
        price_s = bar_df[field]
        price_list = price_s.tolist()
        date_list = price_s.index.tolist()
        log.info("[extractor] share_id= {}. there are {} rows of price".format(
            share_id, len(price_list)))

        # tushare returns the xxx_price bars in descending order, so the most recent data comes first.
        keys, values = [], []
        n = feature_definition_config["hloc_seq_step"]
        keys = keys + ["time", "share_id"]
        keys = keys + [
            "{}_b".format(field) + str(i - 1) for i in range(n - 1, 0, -1)
        ]
        keys = keys + ["target_{}_price".format(field)]
        keys = keys + ["target_{}_trend".format(field)]

        for index in range(len(price_list)):
            if len(price_list[index:index + n]) == n:  # else: not enough (N) data, so drop it.

                price_segment = price_list[index:index + n][::-1]
                if self.normalized:
                    price_x = [1] + [
                        curr / price_segment[i]
                        for i, curr in enumerate(price_segment[1:-1])
                    ]
                    price_y = price_segment[-1] / price_segment[-2]
                else:
                    price_x = price_segment[:-1]
                    price_y = price_segment[-1]

                precision = 4
                price_x = [round(x, precision) for x in price_x]
                price_y = round(price_y, precision)

                if self.normalized:
                    trend = 1 if price_y > 1 else 0
                else:
                    trend = 1 if price_segment[-1] / price_segment[-2] > 1 else 0
                values.append([str(date_list[index]),
                               str(share_id)] + price_x + [price_y] + [trend])
        return pd.DataFrame(columns=keys, data=values)
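A worked example of the normalization above, assuming n = 3 and a window that has already been reversed (oldest price first); the numbers are made up.

# Illustrative check of the ratio normalization used for price_x / price_y.
price_segment = [10.0, 11.0, 12.1]  # window after reversing the DESC order
price_x = [1] + [curr / price_segment[i] for i, curr in enumerate(price_segment[1:-1])]
price_y = price_segment[-1] / price_segment[-2]
print(price_x, round(price_y, 4))  # [1, 1.1] 1.1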
Example #6
    def _extract_one_share_n_days_ror(self, bar_df, share_id):
        # reverse the original data frame so that it is easier to calculate the RoR (rate of return)
        bar_df = bar_df.iloc[::-1]
        close_s = bar_df['close']
        # print(close_s)
        close_list = close_s.tolist()
        date_list = close_s.index.tolist()

        keys, values = [], []
        # similar to _extract_one_share_n_days_close, but in reversed order.
        # here we calculate ror_1_days, ror_5_days, ..., ror_60_days, so the loop must use the maximum N.
        n = feature_definition_config["ror_n_days_after"]
        keys += ["time", "share_id"]
        keys += [
            "ror_1_days", "ror_5_days", "ror_10_days", "ror_20_days",
            "ror_40_days", "ror_60_days"
        ]
        try:
            for index in range(len(close_list)):
                if len(close_list[index:index + n]) == n:  # else: not enough (N) data, so drop it.
                    ror1 = round(
                        (close_list[index + 1] / close_list[index]) - 1, 4)
                    ror5 = round(
                        (close_list[index + 5] / close_list[index]) - 1, 4)
                    ror10 = round(
                        (close_list[index + 10] / close_list[index]) - 1, 4)
                    ror20 = round(
                        (close_list[index + 20] / close_list[index]) - 1, 4)
                    ror40 = round(
                        (close_list[index + 40] / close_list[index]) - 1, 4)
                    ror60 = round(
                        (close_list[index + 60] / close_list[index]) - 1, 4)

                    values.append([
                        str(date_list[index]),
                        str(share_id), ror1, ror5, ror10, ror20, ror40, ror60
                    ])
        except IndexError:
            log.info(
                "share_id = {}: ran out of rows while calculating RoR; "
                "remaining windows are dropped".format(share_id))

        return pd.DataFrame(columns=keys, data=values)
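A quick worked example of the RoR arithmetic above, with made-up closes.

# Illustrative RoR calculation; close_list values are invented.
close_list = [10.0, 10.2, 10.1, 10.5, 10.6, 10.8]
ror1 = round(close_list[1] / close_list[0] - 1, 4)  # 0.02
ror5 = round(close_list[5] / close_list[0] - 1, 4)  # 0.08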
Example #7
    def _extract_one_share_n_days_close(self, bar_df, share_id):
        close_s = bar_df['close']
        # print(close_s)
        close_list = close_s.tolist()
        date_list = close_s.index.tolist()
        log.info("[extractor] share_id= {}. there are {} rows of close price".
                 format(share_id, len(close_list)))

        # tushare returns close_price in descending order, so the most recent data comes first.
        keys, values = [], []
        n = feature_definition_config["close_n_days_before"]
        keys = keys + ["time", "share_id", "close_price", "target_close_price"]

        # normalize: set the first element to 1, then use the ratio to the
        # previous element for the remaining ones.
        assert n > 2
        for index in range(len(close_list)):
            if len(close_list[index:index + n]) == n:  # else: not enough (N) data, so drop it.
                close_seq = close_list[index:index + n]

                if self.normalized:
                    close_price = [1] + [
                        curr / close_seq[i]
                        for i, curr in enumerate(close_seq[1:])
                    ]
                    target_close_price = close_seq[-1] / close_seq[-2]
                else:
                    close_price = close_seq[:-2]
                    target_close_price = close_seq[-1]

                # adjust the precision for float
                precision = 4
                close_price = [round(x, precision) for x in close_price]
                target_close_price = round(target_close_price, precision)

                values.append([
                    str(date_list[index]),
                    str(share_id), close_price, target_close_price
                ])
        return pd.DataFrame(columns=keys, data=values)
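For illustration, the ratio normalization turns a raw close window into day-over-day ratios with the first element fixed at 1 (numbers are invented).

# Illustrative normalization for a window close_seq of n = 3 closes.
close_seq = [20.0, 21.0, 20.58]
close_price = [1] + [curr / close_seq[i] for i, curr in enumerate(close_seq[1:])]
target_close_price = close_seq[-1] / close_seq[-2]
print([round(x, 4) for x in close_price], round(target_close_price, 4))
# [1, 1.05, 0.98] 0.98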
Example #8
def main():
    try:
        _start = time.time()

        cfg.load(Config())

        cfg.cls_data_formatter = os.path.normpath(
            os.path.join(os.path.dirname(__file__), 'data_formatter.py'))
        cfg.cls_coder = os.path.normpath(
            os.path.join(os.path.dirname(__file__), 'coder.py'))

        preprocessor = Preprocessor()
        preprocessor.process()

        trainer = Trainer()
        trainer.train()

        log.info("[total] use {} seconds totally".format(time.time() - _start))
    except Exception as e:
        import traceback
        log.info(traceback.format_exc())
        time.sleep(60 * 60 * 24)  # wait to check logs
Example #9
    def _extract_one_share_n_days_vol(self, bar_df, share_id):
        vol_s = bar_df['vol']
        # print(vol_s)
        vol_list = vol_s.tolist()
        date_list = vol_s.index.tolist()
        log.info(
            "[extractor] share_id= {}. there are {} rows of volume".format(
                share_id, len(vol_list)))

        # this part is the same as _extract_one_share_n_days_close
        keys, values = [], []
        n = feature_definition_config["close_n_days_before"]
        keys = keys + ["time", "share_id", "volume", "target_volume"]

        assert n > 2
        for index in range(len(vol_list)):
            if len(vol_list[index:index + n]) == n:  # else: not enough (N) data, so drop it.
                seq = vol_list[index:index + n]
                if self.normalized:
                    volume = [1] + [
                        curr / seq[i] for i, curr in enumerate(seq[1:])
                    ]
                    target_volume = seq[-1] / seq[-2]
                else:
                    volume = seq[:-2]
                    target_volume = seq[-1]

                # adjust the precision for float
                precision = 4
                volume = [round(x, precision) for x in volume]
                target_volume = round(target_volume, precision)

                values.append([
                    str(date_list[index]),
                    str(share_id), volume, target_volume
                ])
        return pd.DataFrame(columns=keys, data=values)
Example #10
    def shuf(self, file_name):
        st = time.time()
        log.info('shuffle start')

        # Strip the header line, then use terashuf for a quasi-shuffle
        FileUtil.save_remove_first_line(
            cfg.get_exp_file(file_name),
            cfg.get_exp_file_without_header(file_name))

        shuf_cmd = 'MEMORY={:.1f} terashuf < {} > {}'.format(
            cfg.SHUF_MEM, cfg.get_exp_file_without_header(file_name),
            cfg.get_shuffled_file(file_name))
        log.info('Executing shuf call: \"{}\"'.format(shuf_cmd))

        ret_stat = os.system(shuf_cmd)
        if ret_stat != 0:
            log.info('`terashuf` failed, falling back to `sort -R`.')
            shuf_cmd = 'sort -R {} -o {}'.format(
                cfg.get_exp_file_without_header(file_name),
                cfg.get_shuffled_file(file_name))
            ret_stat = os.system(shuf_cmd)

        log.info('Executed shuf call: \"{}\"'.format(shuf_cmd))
        log.info("complete shuff. use time {:.2f}s".format(time.time() - st))
Example #11
def main(test=None):
    _start = time.time()
    econfig.init(test)

    extractor = FeatureExtractor()

    # select the target shares first: the constituents of the SSE 50 (shangzheng 50) index
    ingredient_df = context.tushare.index_weight(index_code='000016.SH',
                                                 start_date='20080101',
                                                 end_date='20180101')
    ingredient_share_set = set(ingredient_df['con_code'].tolist())

    target_shares = [
        "601398.SH",
        "601288.SH",
        "601988.SH",
        "601939.SH",
        "601328.SH",
    ]

    if econfig.DEBUG:
        # extractor.extract_all(start_date='20080101', end_date='20080301',
        #                       params={'normalized': True, 'output_name': TRAIN_FILE_NAME})  # as train
        # extractor.extract_all(start_date='20190101', end_date='20190301',
        #                       params={'normalized': True, 'output_name': EVAL_FILE_NAME})  # as eval

        # extractor.extract_multiple(share_ids=list(ingredient_share_set), start_date='20080101', end_date='20180101',
        #                            params={'normalized': True, 'output_name': TRAIN_FILE_NAME})  # as train
        # extractor.extract_multiple(share_ids=list(ingredient_share_set), start_date='20180102', end_date='20190901',
        #                            params={'normalized': True, 'output_name': EVAL_FILE_NAME})  # as eval

        extractor.extract_multiple(share_ids=target_shares,
                                   start_date='20080101',
                                   end_date='20180101',
                                   params={
                                       'normalized': False,
                                       'output_name': TRAIN_FILE_NAME
                                   })  # as train
        extractor.extract_multiple(share_ids=target_shares,
                                   start_date='20180102',
                                   end_date='20190901',
                                   params={
                                       'normalized': False,
                                       'output_name': EVAL_FILE_NAME
                                   })  # as eval
    else:
        # for test stage 1, we only extract the most recent 4000 days of data
        # extractor.extract_all(start_date='20050101', end_date='20181231')

        # extractor.extract_all(start_date='20080101', end_date='20180101',
        #                       params={'normalized': True, 'output_name': TRAIN_FILE_NAME})  # as train
        # extractor.extract_all(start_date='20180102', end_date='20190901',
        #                       params={'normalized': True, 'output_name': EVAL_FILE_NAME})  # as eval

        extractor.extract_multiple(share_ids=list(ingredient_share_set),
                                   start_date='20180101',
                                   end_date='20190601',
                                   params={
                                       'normalized': True,
                                       'output_name': TRAIN_FILE_NAME
                                   })  # as train
        extractor.extract_multiple(share_ids=list(ingredient_share_set),
                                   start_date='20190602',
                                   end_date='20191101',
                                   params={
                                       'normalized': True,
                                       'output_name': EVAL_FILE_NAME
                                   })  # as eval

        # extractor.extract_multiple(share_ids=target_shares, start_date='20080101', end_date='20180101',
        #                            params={'normalized': True, 'output_name': TRAIN_FILE_NAME})  # as train
        # extractor.extract_multiple(share_ids=target_shares, start_date='20180102', end_date='20190901',
        #                            params={'normalized': True, 'output_name': EVAL_FILE_NAME})  # as eval

    log.info("extracting completed, use time {}s".format(
        str(time.time() - _start)))
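As a hedged note, the 'con_code' column of the DataFrame returned by index_weight holds the constituent share codes, which is why the set above is built from it; the snippet below only illustrates inspecting that set (values shown are not from the source).

# Illustrative peek at the constituent set built above.
print(len(ingredient_share_set))         # number of distinct constituents over the period
print(sorted(ingredient_share_set)[:3])  # e.g. ['600000.SH', ...] (illustrative)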