示例#1
0
def processYahoo(raw_dir, processed_dir):
  """Condenses each raw Yahoo CSV under raw_dir into one record per month.

  Every file name in raw_dir must end with EXTENSION (asserted); the name
  without that suffix is used as the ticker and as the output file name
  under processed_dir.  The first line of each file must equal HEADER.  The
  remaining lines are visited from the last one up to the first and must
  carry strictly increasing dates in that order (asserted).

  Whenever the month of a row differs from the month of the row visited just
  before it, one tab-separated record is written:
    ymd, cl, acl, average volume, average volume
  The average covers the rows accumulated since the previous record (0.0 for
  the very first record).  It is written twice; presumably the second column
  stands in for an adjusted volume this source does not provide -- TODO
  confirm.  The volume accumulated for the final month is never written
  because no later month follows it.

  Args:
    raw_dir: directory holding the raw per-ticker files.
    processed_dir: directory that receives one output file per ticker.
  """
  for raw_file in os.listdir(raw_dir):
    assert raw_file.endswith(EXTENSION)
    ticker = raw_file[:-len(EXTENSION)]
    with open('%s/%s' % (raw_dir, raw_file), 'r') as fp:
      lines = fp.read().splitlines()
    assert len(lines) > 0
    assert lines[0] == HEADER
    with open('%s/%s' % (processed_dir, ticker), 'w') as fp:
      previous_ymd = None
      previous_ym = None
      volume_sum = 0.0
      day_count = 0
      # Skip the header (index 0) and walk the data rows bottom-up.
      for line in reversed(lines[1:]):
        # _op/_hi/_lo are unused; unpacking them still enforces 7 columns.
        ymd, _op, _hi, _lo, cl, vo, acl = line.split(',')
        if previous_ymd is not None:
          assert ymd > previous_ymd
        previous_ymd = ymd
        ym = util.ymdToYm(ymd)
        if ym != previous_ym:
          previous_ym = ym
          if day_count > 0:
            volume_sum /= day_count
          # fp.write() replaces the Python-2-only "print >> fp" statement,
          # which fails with a TypeError when run under Python 3.
          fp.write('%s\t%s\t%s\t%s\t%s\n' % (
              ymd, cl, acl, volume_sum, volume_sum))
          volume_sum = 0.0
          day_count = 0
        volume_sum += getVolume(vo)
        day_count += 1
示例#2
0
def processYahoo(raw_dir, processed_dir):
    """Condenses each raw Yahoo CSV under raw_dir into one record per month.

    Every file name in raw_dir must end with EXTENSION (asserted); the name
    without that suffix is used as the ticker and as the output file name
    under processed_dir.  The first line of each file must equal HEADER.
    The remaining lines are visited from the last one up to the first and
    must carry strictly increasing dates in that order (asserted).

    Whenever the month of a row differs from the month of the row visited
    just before it, one tab-separated record is written:
        ymd, cl, acl, average volume, average volume
    The average covers the rows accumulated since the previous record (0.0
    for the very first record).  It is written twice; presumably the second
    column stands in for an adjusted volume this source does not provide --
    TODO confirm.  The volume accumulated for the final month is never
    written because no later month follows it.

    Args:
        raw_dir: directory holding the raw per-ticker files.
        processed_dir: directory that receives one output file per ticker.
    """
    for raw_file in os.listdir(raw_dir):
        assert raw_file.endswith(EXTENSION)
        ticker = raw_file[:-len(EXTENSION)]
        with open('%s/%s' % (raw_dir, raw_file), 'r') as fp:
            lines = fp.read().splitlines()
        assert len(lines) > 0
        assert lines[0] == HEADER
        with open('%s/%s' % (processed_dir, ticker), 'w') as fp:
            previous_ymd = None
            previous_ym = None
            volume_sum = 0.0
            day_count = 0
            # Skip the header (index 0) and walk the data rows bottom-up.
            for line in reversed(lines[1:]):
                # _op/_hi/_lo are unused; unpacking them still enforces
                # that the row has exactly 7 columns.
                ymd, _op, _hi, _lo, cl, vo, acl = line.split(',')
                if previous_ymd is not None:
                    assert ymd > previous_ymd
                previous_ymd = ymd
                ym = util.ymdToYm(ymd)
                if ym != previous_ym:
                    previous_ym = ym
                    if day_count > 0:
                        volume_sum /= day_count
                    # fp.write() replaces the Python-2-only "print >> fp"
                    # statement, which raises TypeError under Python 3.
                    fp.write('%s\t%s\t%s\t%s\t%s\n' % (
                        ymd, cl, acl, volume_sum, volume_sum))
                    volume_sum = 0.0
                    day_count = 0
                volume_sum += getVolume(vo)
                day_count += 1
示例#3
0
def processEodRaw(raw_dir, ticker_file, processed_dir):
  """Condenses raw per-ticker EOD files into one record per month.

  For every ticker returned by util.readTickers(ticker_file) that has a file
  of the same name under raw_dir, the file is read in order.  Each line must
  have 14 comma-separated fields, start with the ticker itself, and carry a
  date strictly greater than the previous line's (both asserted).  Tickers
  without a raw file are skipped silently.

  Whenever the month of a row differs from the month of the previous row,
  one tab-separated record is written to processed_dir/<ticker>:
    ymd, cl, acl, average vo, average avo
  The averages cover the rows accumulated since the previous record (0.0 for
  the first record) and are formatted with %f.  Volumes accumulated for the
  final month are never written because no later month follows it.

  Args:
    raw_dir: directory holding one raw file per ticker.
    ticker_file: file passed to util.readTickers to list the tickers.
    processed_dir: directory that receives one output file per ticker.
  """
  for ticker in util.readTickers(ticker_file):
    raw_file = '%s/%s' % (raw_dir, ticker)
    if not os.path.isfile(raw_file):
      continue
    with open(raw_file, 'r') as fp:
      lines = fp.read().splitlines()
    with open('%s/%s' % (processed_dir, ticker), 'w') as fp:
      previous_ymd = None
      previous_ym = None
      vo_sum, avo_sum = 0.0, 0.0
      day_count = 0
      for line in lines:
        # Underscore-prefixed fields are unused; unpacking them still
        # enforces that the row has exactly 14 columns.
        (row_ticker, ymd, _op, _hi, _lo, cl, vo, _di, _sp,
         _aop, _ahi, _alo, acl, avo) = line.split(',')
        assert row_ticker == ticker
        if previous_ymd is not None:
          assert ymd > previous_ymd
        previous_ymd = ymd
        ym = util.ymdToYm(ymd)
        if ym != previous_ym:
          previous_ym = ym
          if day_count > 0:
            vo_sum /= day_count
            avo_sum /= day_count
          # fp.write() replaces the Python-2-only "print >> fp" statement,
          # which fails with a TypeError when run under Python 3.
          fp.write('%s\t%s\t%s\t%f\t%f\n' % (ymd, cl, acl, vo_sum, avo_sum))
          vo_sum, avo_sum = 0.0, 0.0
          day_count = 0
        vo_sum += getVolume(vo)
        avo_sum += getVolume(avo)
        day_count += 1
示例#4
0
def processEodRaw(raw_dir, ticker_file, processed_dir):
    """Condenses raw per-ticker EOD files into one record per month.

    For every ticker returned by util.readTickers(ticker_file) that has a
    file of the same name under raw_dir, the file is read in order.  Each
    line must have 14 comma-separated fields, start with the ticker itself,
    and carry a date strictly greater than the previous line's (both
    asserted).  Tickers without a raw file are skipped silently.

    Whenever the month of a row differs from the month of the previous row,
    one tab-separated record is written to processed_dir/<ticker>:
        ymd, cl, acl, average vo, average avo
    The averages cover the rows accumulated since the previous record (0.0
    for the first record) and are formatted with %f.  Volumes accumulated
    for the final month are never written because no later month follows.

    Args:
        raw_dir: directory holding one raw file per ticker.
        ticker_file: file passed to util.readTickers to list the tickers.
        processed_dir: directory that receives one output file per ticker.
    """
    for ticker in util.readTickers(ticker_file):
        raw_file = '%s/%s' % (raw_dir, ticker)
        if not os.path.isfile(raw_file):
            continue
        with open(raw_file, 'r') as fp:
            lines = fp.read().splitlines()
        with open('%s/%s' % (processed_dir, ticker), 'w') as fp:
            previous_ymd = None
            previous_ym = None
            vo_sum, avo_sum = 0.0, 0.0
            day_count = 0
            for line in lines:
                # Underscore-prefixed fields are unused; unpacking them
                # still enforces that the row has exactly 14 columns.
                (row_ticker, ymd, _op, _hi, _lo, cl, vo, _di, _sp,
                 _aop, _ahi, _alo, acl, avo) = line.split(',')
                assert row_ticker == ticker
                if previous_ymd is not None:
                    assert ymd > previous_ymd
                previous_ymd = ymd
                ym = util.ymdToYm(ymd)
                if ym != previous_ym:
                    previous_ym = ym
                    if day_count > 0:
                        vo_sum /= day_count
                        avo_sum /= day_count
                    # fp.write() replaces the Python-2-only "print >> fp"
                    # statement, which raises TypeError under Python 3.
                    fp.write('%s\t%s\t%s\t%f\t%f\n' % (
                        ymd, cl, acl, vo_sum, avo_sum))
                    vo_sum, avo_sum = 0.0, 0.0
                    day_count = 0
                vo_sum += getVolume(vo)
                avo_sum += getVolume(avo)
                day_count += 1
示例#5
0
def getGainDict(gain_file):
  """Indexes the (date, gain) pairs of gain_file by month.

  The pairs come from util.readKeyValueList(gain_file).  Each month, as
  computed by util.ymdToYm, may appear at most once (asserted).

  Returns:
    dict mapping month => [date, gain].
  """
  by_month = {}
  for ymd, gain in util.readKeyValueList(gain_file):
    month = util.ymdToYm(ymd)
    assert month not in by_month
    by_month[month] = [ymd, gain]
  return by_month
示例#6
0
def computePreviousFeature(feature_dir, k, pfeature_dir):
  """Re-dates every feature value by k months, one output file per ticker.

  For each file in feature_dir (processed in sorted name order) the
  (date, feature) pairs from util.readKeyValueList are rewritten to a file
  of the same name under pfeature_dir.  Each output line is
  '<ym>-01<TAB><feature as %f>' where ym is
  util.getNextYm(util.ymdToYm(date), k).

  Args:
    feature_dir: directory of per-ticker feature files.
    k: month offset handed to util.getNextYm.
    pfeature_dir: directory that receives the re-dated files.
  """
  for ticker in sorted(os.listdir(feature_dir)):
    dfeatures = util.readKeyValueList('%s/%s' % (feature_dir, ticker))
    with open('%s/%s' % (pfeature_dir, ticker), 'w') as fp:
      for date, feature in dfeatures:
        pdate = util.getNextYm(util.ymdToYm(date), k)
        # fp.write() replaces the Python-2-only "print >> fp" statement,
        # which fails with a TypeError when run under Python 3.
        fp.write('%s-01\t%f\n' % (pdate, feature))
示例#7
0
def computePreviousFeature(feature_dir, k, pfeature_dir):
    """Re-dates every feature value by k months, one output file per ticker.

    For each file in feature_dir (processed in sorted name order) the
    (date, feature) pairs from util.readKeyValueList are rewritten to a file
    of the same name under pfeature_dir.  Each output line is
    '<ym>-01<TAB><feature as %f>' where ym is
    util.getNextYm(util.ymdToYm(date), k).

    Args:
        feature_dir: directory of per-ticker feature files.
        k: month offset handed to util.getNextYm.
        pfeature_dir: directory that receives the re-dated files.
    """
    for ticker in sorted(os.listdir(feature_dir)):
        dfeatures = util.readKeyValueList('%s/%s' % (feature_dir, ticker))
        with open('%s/%s' % (pfeature_dir, ticker), 'w') as fp:
            for date, feature in dfeatures:
                pdate = util.getNextYm(util.ymdToYm(date), k)
                # fp.write() replaces the Python-2-only "print >> fp"
                # statement, which raises TypeError under Python 3.
                fp.write('%s-01\t%f\n' % (pdate, feature))
示例#8
0
def readPrices(price_file):
    """Loads a two-column, tab-separated price file keyed by month.

    Each line must be '<ymd><TAB><price>'.  A month, as computed by
    util.ymdToYm, may appear at most once (asserted).

    Returns:
        dict mapping ym => [ymd, price as float].
    """
    with open(price_file, 'r') as fp:
        rows = [row.split('\t') for row in fp.read().splitlines()]
    by_month = {}
    for ymd, price in rows:
        month = util.ymdToYm(ymd)
        assert month not in by_month
        by_month[month] = [ymd, float(price)]
    return by_month
示例#9
0
def readPrices(price_file):
  """Loads a two-column, tab-separated price file keyed by month.

  Each line must be '<ymd><TAB><price>'.  A month, as computed by
  util.ymdToYm, may appear at most once (asserted).

  Returns:
    dict mapping ym => [ymd, price as float].
  """
  with open(price_file, 'r') as fp:
    rows = [row.split('\t') for row in fp.read().splitlines()]
  by_month = {}
  for ymd, price in rows:
    month = util.ymdToYm(ymd)
    assert month not in by_month
    by_month[month] = [ymd, float(price)]
  return by_month
示例#10
0
def prepareData(ym, data_file, label_file, meta_file, predict_meta_file,
                tmp_data_file):
  """Copies the data rows belonging to month ym into tmp_data_file.

  data_file, label_file and meta_file are read in lockstep, one line per
  sample, and must contain the same number of lines (asserted).  Each meta
  line holds four tab-separated fields: ticker, date, an unused field, gain.

  A sample is kept when both conditions hold:
    * predict_meta_file is None, or the meta line equals the next not yet
      consumed line of predict_meta_file (which is consumed on a match);
    * util.ymdToYm(date) equals ym.

  No consistency between label and gain sign is asserted: that does not
  hold when labels are cut at other places than 0.

  Args:
    ym: month to select, compared against util.ymdToYm(date).
    data_file: one feature row per line.
    label_file: one float label per line; parsed for kept samples only.
    meta_file: one meta line per sample, see above.
    predict_meta_file: optional filter file (a subsequence of meta_file
      lines), or None to disable the filter.
    tmp_data_file: output path; receives the kept data rows.

  Returns:
    A list of [ticker, gain] (gain as float), one entry per row written,
    in file order.
  """
  handles = []  # every file opened below; all are closed in the finally

  def openTracked(path, mode):
    fp = open(path, mode)
    handles.append(fp)
    return fp

  meta = []
  # try/finally guarantees the files are closed even when an assertion or
  # a float() conversion fails midway.
  try:
    data_ifp = openTracked(data_file, 'r')
    label_ifp = openTracked(label_file, 'r')
    meta_ifp = openTracked(meta_file, 'r')
    data_ofp = openTracked(tmp_data_file, 'w')
    if predict_meta_file is None:
      predict_meta_ifp = None
      predict_meta = None
    else:
      predict_meta_ifp = openTracked(predict_meta_file, 'r')
      predict_meta = predict_meta_ifp.readline()

    while True:
      line = meta_ifp.readline()
      if line == '':
        # All three inputs must run out at the same time.
        assert data_ifp.readline() == ''
        assert label_ifp.readline() == ''
        break
      assert line[-1] == '\n'
      data_line = data_ifp.readline()
      label_line = label_ifp.readline()
      assert data_line != ''
      assert label_line != ''

      if predict_meta is not None:
        if line != predict_meta:
          continue
        # Once the filter file is exhausted readline() yields '', which
        # never equals a meta line, so all remaining samples are skipped.
        predict_meta = predict_meta_ifp.readline()

      ticker, date, _, gain = line[:-1].split('\t')
      if util.ymdToYm(date) != ym:
        continue
      assert data_line[-1] == '\n'
      assert label_line[-1] == '\n'
      # The label value itself is unused, but it must still parse.
      # TODO: --label_file is not needed; remove.
      float(label_line[:-1])
      gain = float(gain)
      # fp.write() replaces the Python-2-only "print >> fp" statement.
      data_ofp.write(data_line[:-1] + '\n')
      meta.append([ticker, gain])
  finally:
    for fp in handles:
      fp.close()
  return meta
示例#11
0
def prepareData(ym, data_file, label_file, meta_file, predict_meta_file,
                tmp_data_file):
  """Copies the data rows belonging to month ym into tmp_data_file.

  data_file, label_file and meta_file are read in lockstep, one line per
  sample, and must contain the same number of lines (asserted).  Each meta
  line holds four tab-separated fields: ticker, date, an unused field, gain.

  A sample is kept when both conditions hold:
    * predict_meta_file is None, or the meta line equals the next not yet
      consumed line of predict_meta_file (which is consumed on a match);
    * util.ymdToYm(date) equals ym.

  No consistency between label and gain sign is asserted: that does not
  hold when labels are cut at other places than 0.

  Args:
    ym: month to select, compared against util.ymdToYm(date).
    data_file: one feature row per line.
    label_file: one float label per line; parsed for kept samples only.
    meta_file: one meta line per sample, see above.
    predict_meta_file: optional filter file (a subsequence of meta_file
      lines), or None to disable the filter.
    tmp_data_file: output path; receives the kept data rows.

  Returns:
    A list of [ticker, gain] (gain as float), one entry per row written,
    in file order.
  """
  handles = []  # every file opened below; all are closed in the finally

  def openTracked(path, mode):
    fp = open(path, mode)
    handles.append(fp)
    return fp

  meta = []
  # try/finally guarantees the files are closed even when an assertion or
  # a float() conversion fails midway.
  try:
    data_ifp = openTracked(data_file, 'r')
    label_ifp = openTracked(label_file, 'r')
    meta_ifp = openTracked(meta_file, 'r')
    data_ofp = openTracked(tmp_data_file, 'w')
    if predict_meta_file is None:
      predict_meta_ifp = None
      predict_meta = None
    else:
      predict_meta_ifp = openTracked(predict_meta_file, 'r')
      predict_meta = predict_meta_ifp.readline()

    while True:
      line = meta_ifp.readline()
      if line == '':
        # All three inputs must run out at the same time.
        assert data_ifp.readline() == ''
        assert label_ifp.readline() == ''
        break
      assert line[-1] == '\n'
      data_line = data_ifp.readline()
      label_line = label_ifp.readline()
      assert data_line != ''
      assert label_line != ''

      if predict_meta is not None:
        if line != predict_meta:
          continue
        # Once the filter file is exhausted readline() yields '', which
        # never equals a meta line, so all remaining samples are skipped.
        predict_meta = predict_meta_ifp.readline()

      ticker, date, _, gain = line[:-1].split('\t')
      if util.ymdToYm(date) != ym:
        continue
      assert data_line[-1] == '\n'
      assert label_line[-1] == '\n'
      # The label value itself is unused, but it must still parse.
      # TODO: --label_file is not needed; remove.
      float(label_line[:-1])
      gain = float(gain)
      # fp.write() replaces the Python-2-only "print >> fp" statement.
      data_ofp.write(data_line[:-1] + '\n')
      meta.append([ticker, gain])
  finally:
    for fp in handles:
      fp.close()
  return meta
示例#12
0
def main():
  """Scores every month in --meta_file and writes ranked predictions.

  The distinct months of the meta file are processed in ascending order.
  For a month, the model and imputer are looked up under the month that lies
  prediction_window + delay_window months earlier (util.getPreviousYm), with
  file names built by getName from the prefix/suffix flags.  Months before
  the first available model are skipped; afterwards a missing model is an
  error (assertion) unless --allow_older_models is set, in which case the
  most recently used model and imputer are reused.

  For each scored month the result file receives a 'date: <ym>' line
  followed by one '<TAB>ticker<TAB>gain<TAB>score' line per sample, sorted
  by descending score.  The score is column 1 of predict_proba when the
  model offers it, else the output of predict.

  TMP_DATA_FILE is used as scratch space and removed on exit.
  """
  parser = argparse.ArgumentParser()
  parser.add_argument('--data_file', required=True)
  parser.add_argument('--label_file', required=True)
  parser.add_argument('--meta_file', required=True)
  # Similar to --train_meta_file in train_model.py
  parser.add_argument('--predict_meta_file')
  parser.add_argument('--model_dir', required=True)
  parser.add_argument('--model_prefix', required=True)
  parser.add_argument('--model_suffix', required=True)
  parser.add_argument('--imputer_dir', required=True)
  parser.add_argument('--imputer_prefix', required=True)
  parser.add_argument('--imputer_suffix', required=True)
  parser.add_argument('--prediction_window', type=int, required=True)
  parser.add_argument('--delay_window', type=int, required=True)
  parser.add_argument('--result_file', required=True)
  parser.add_argument('--allow_older_models', action='store_true')
  args = parser.parse_args()

  # Collect the distinct months to predict for.
  with open(args.meta_file, 'r') as fp:
    lines = fp.read().splitlines()
  dates = set()
  for line in lines:
    # Unpacking all four fields also validates the column count.
    _, date, _, _ = line.split('\t')
    dates.add(util.ymdToYm(date))
  dates = sorted(dates)

  started = False  # check no 'hole' in simulation period
  delta = args.prediction_window + args.delay_window
  previous_files = [None, None]  # model, imputer
  ofp = open(args.result_file, 'w')
  # try/finally closes the result file and removes the scratch file even
  # when an assertion or a model call fails midway.
  try:
    for date in dates:
      ym = util.getPreviousYm(date, delta)
      model_name = getName(ym, args.model_prefix, args.model_suffix)
      imputer_name = getName(ym, args.imputer_prefix, args.imputer_suffix)
      model_file = '%s/%s' % (args.model_dir, model_name)
      imputer_file = '%s/%s' % (args.imputer_dir, imputer_name)
      if not os.path.isfile(model_file):
        if args.allow_older_models and previous_files[0] is not None:
          model_file, imputer_file = previous_files
          # logging.warn is a deprecated alias that newer Python versions
          # no longer provide; logging.warning works everywhere.
          logging.warning('using previous model %s for %s', model_file, date)
        else:
          assert not started
          continue

      assert os.path.isfile(imputer_file)
      started = True
      previous_files = [model_file, imputer_file]

      meta = prepareData(date, args.data_file, args.label_file,
                         args.meta_file, args.predict_meta_file,
                         TMP_DATA_FILE)
      # ndmin=2 keeps one row per sample even when the month has a single
      # sample (or a single feature); without it loadtxt returns a 1-D
      # array and the row-count check below compares the wrong dimension.
      data = numpy.loadtxt(TMP_DATA_FILE, ndmin=2)
      assert data.shape[0] == len(meta), (
          'inconsistent data size: %d vs %d' % (data.shape[0], len(meta)))

      # NOTE: pickle.load can execute arbitrary code; the model and imputer
      # directories must only contain trusted files.
      with open(imputer_file, 'rb') as fp:
        imputer = pickle.load(fp)
      data = imputer.transform(data)

      with open(model_file, 'rb') as fp:
        model = pickle.load(fp)

      # hasattr (rather than a name lookup in dir()) also respects
      # attributes that exist on the class but are unavailable on this
      # particular instance.
      if hasattr(model, 'predict_proba'):
        prob = [row[1] for row in model.predict_proba(data)]
      else:
        prob = model.predict(data)

      assert len(prob) == len(meta)
      items = [[ticker, gain, score]
               for (ticker, gain), score in zip(meta, prob)]
      items.sort(key=lambda item: item[2], reverse=True)
      # ofp.write() replaces the Python-2-only "print >> ofp" statement.
      ofp.write('date: %s\n' % date)
      for ticker, gain, score in items:
        ofp.write('\t%s\t%f\t%f\n' % (ticker, gain, score))
  finally:
    ofp.close()
    if os.path.isfile(TMP_DATA_FILE):
      os.remove(TMP_DATA_FILE)
示例#13
0
def selectData(data_file, label_file, meta_file, weight_file, train_meta_file,
               yyyymm, months, tmp_data_file, tmp_label_file, tmp_weight_file):
  """Copies the training samples of a month window into temporary files.

  data_file, label_file, meta_file and (when given) weight_file are read in
  lockstep, one line per sample, and must have the same number of lines
  (asserted).  Each meta line holds four tab-separated fields of which the
  second is the sample date.

  The window ends at yyyymm (inclusive).  With months > 0 it starts at
  util.getPreviousYm(last_ym, months - 1); with months <= 0 it is unbounded
  at the start.  A sample is kept when both conditions hold:
    * train_meta_file is None, or the meta line equals the next not yet
      consumed line of train_meta_file (which is consumed on a match);
    * util.ymdToYm(date) lies inside the window.

  NOTE: weights are only read when weight_file is set, so passing
  tmp_weight_file without weight_file fails on the first kept sample.

  Args:
    data_file, label_file, meta_file: per-sample input files.
    weight_file: optional per-sample weight file (falsy to disable).
    train_meta_file: optional filter file, or None to disable the filter.
    yyyymm: last month of the window as a 6-character string.
    months: window length in months; <= 0 means no lower bound.
    tmp_data_file, tmp_label_file: outputs for kept data and label lines.
    tmp_weight_file: optional output for kept weight lines.
  """
  assert len(yyyymm) == 6
  last_ym = '%s-%s' % (yyyymm[:4], yyyymm[4:])
  if months <= 0:
    first_ym = '0000-00'
  else:
    first_ym = util.getPreviousYm(last_ym, months - 1)
  logging.info('training period: %s - %s', first_ym, last_ym)
  assert first_ym <= last_ym

  handles = []  # every file opened below; all are closed in the finally

  def openTracked(path, mode):
    fp = open(path, mode)
    handles.append(fp)
    return fp

  # try/finally guarantees the files are closed even when an assertion
  # fails midway.
  try:
    data_ifp = openTracked(data_file, 'r')
    data_ofp = openTracked(tmp_data_file, 'w')
    label_ifp = openTracked(label_file, 'r')
    label_ofp = openTracked(tmp_label_file, 'w')
    weight_ifp = openTracked(weight_file, 'r') if weight_file else None
    weight_ofp = (
        openTracked(tmp_weight_file, 'w') if tmp_weight_file else None)
    meta_fp = openTracked(meta_file, 'r')
    if train_meta_file is None:
      train_meta_fp = None
      train_meta = None
    else:
      train_meta_fp = openTracked(train_meta_file, 'r')
      train_meta = train_meta_fp.readline()

    count = 0
    while True:
      meta = meta_fp.readline()
      if meta == '':
        # All inputs must run out at the same time.
        assert data_ifp.readline() == ''
        assert label_ifp.readline() == ''
        if weight_ifp is not None:
          assert weight_ifp.readline() == ''
        break
      data = data_ifp.readline()
      label = label_ifp.readline()
      assert data != ''
      assert label != ''
      if weight_ifp is not None:
        weight = weight_ifp.readline()
        assert weight != ''

      if train_meta is not None:
        if meta != train_meta:
          continue
        # Once the filter file is exhausted readline() yields '', which
        # never equals a meta line, so all remaining samples are skipped.
        train_meta = train_meta_fp.readline()

      assert meta[-1] == '\n'
      _, date, _, _ = meta[:-1].split('\t')
      ym = util.ymdToYm(date)
      if ym < first_ym or ym > last_ym:
        continue
      assert data[-1] == '\n'
      assert label[-1] == '\n'
      # fp.write() replaces the Python-2-only "print >> fp" statement.
      data_ofp.write(data[:-1] + '\n')
      label_ofp.write(label[:-1] + '\n')
      if weight_ofp is not None:
        assert weight[-1] == '\n'
        weight_ofp.write(weight[:-1] + '\n')
      count += 1

    logging.info('selected %d training samples', count)
  finally:
    for fp in handles:
      fp.close()
示例#14
0
def computePercFeature(input_dir, tickers, rank, output_dir):
  """Converts a per-ticker feature into a cross-sectional score per month.

  Steps:
    1. Read util.readKeyValueList(input_dir/<ticker>) for each ticker and
       move every date to the month after it (util.getNextYm of its
       yyyy-mm).  If several values land in the same month the latest one
       wins; months must otherwise be strictly increasing (asserted).
    2. Carry each value forward month by month until the ticker's next
       value, or through the latest month seen across all tickers.
    3. Per month, replace the values by scores: 0.5 when there is a single
       ticker; with rank, position / count after sorting by value;
       otherwise (value - min) / (max - min), or 0.5 when the span is
       below EPS.
    4. Write '<ym><TAB><score as %f>' lines, sorted by month, to
       output_dir/<ticker>.

  Args:
    input_dir: directory holding one feature file per ticker.
    tickers: iterable of ticker names to process.
    rank: truthy for rank-based scores, falsy for min-max scaling.
    output_dir: directory that receives one output file per ticker.
  """
  # ticker => [[date, value] ...]
  # where date is the first yyyy-mm after data is published.
  data = dict()
  for ticker in tickers:
    udvalues = []
    for item in util.readKeyValueList('%s/%s' % (input_dir, ticker)):
      date = util.getNextYm(util.ymdToYm(item[0]))
      if udvalues and udvalues[-1][0] == date:
        udvalues[-1][1] = item[1]  # same month: the latest value wins
      else:
        if udvalues:
          assert udvalues[-1][0] < date
        udvalues.append([date, item[1]])
    data[ticker] = udvalues

  min_date = '9999-99'
  max_date = '0000-00'
  # .values()/.items() work on Python 2 and 3 alike; the itervalues() and
  # iteritems() methods used before exist on Python 2 only.
  for dvalues in data.values():
    if not dvalues:
      continue
    min_date = min(min_date, dvalues[0][0])
    max_date = max(max_date, dvalues[-1][0])

  percs = dict()  # date => [[ticker, value] ...]
  date = min_date
  while date <= max_date:
    percs[date] = []
    date = util.getNextYm(date)
  for ticker, dvalues in data.items():
    for i, (date, value) in enumerate(dvalues):
      if i < len(dvalues) - 1:
        # Populate value up to next date (not inclusive).
        end = dvalues[i + 1][0]
      else:
        # Populate value up to max date (inclusive).
        end = util.getNextYm(max_date)
      while date < end:
        percs[date].append([ticker, value])
        date = util.getNextYm(date)

  # Calculate percentiles (in place: item[1] becomes the score).
  for tvalues in percs.values():
    assert len(tvalues) > 0
    # Use 0.5 if there is a single element.
    if len(tvalues) == 1:
      tvalues[0][1] = 0.5
      continue
    if rank:
      tvalues.sort(key=lambda item: item[1])
      for i, item in enumerate(tvalues):
        item[1] = float(i) / len(tvalues)
    else:
      values = [item[1] for item in tvalues]
      minv = min(values)
      span = max(values) - minv
      for item in tvalues:
        if span < EPS:
          item[1] = 0.5
        else:
          item[1] = (item[1] - minv) / span

  # Write output.
  data = dict()  # ticker => [[date, perc] ...]
  for date, tpercs in percs.items():
    for ticker, perc in tpercs:
      data.setdefault(ticker, []).append([date, perc])
  for ticker, dpercs in data.items():
    dpercs.sort(key=lambda item: item[0])
    with open('%s/%s' % (output_dir, ticker), 'w') as fp:
      for date, perc in dpercs:
        # fp.write() replaces the Python-2-only "print >> fp" statement.
        fp.write('%s\t%f\n' % (date, perc))
示例#15
0
def selectData(data_file, label_file, meta_file, weight_file, train_meta_file,
               yyyymm, months, tmp_data_file, tmp_label_file, tmp_weight_file):
    """Copies the training samples of a month window into temporary files.

    data_file, label_file, meta_file and (when given) weight_file are read
    in lockstep, one line per sample, and must have the same number of lines
    (asserted).  Each meta line holds four tab-separated fields of which the
    second is the sample date.

    The window ends at yyyymm (inclusive).  With months > 0 it starts at
    util.getPreviousYm(last_ym, months - 1); with months <= 0 it is
    unbounded at the start.  A sample is kept when both conditions hold:
      * train_meta_file is None, or the meta line equals the next not yet
        consumed line of train_meta_file (which is consumed on a match);
      * util.ymdToYm(date) lies inside the window.

    NOTE: weights are only read when weight_file is set, so passing
    tmp_weight_file without weight_file fails on the first kept sample.

    Args:
        data_file, label_file, meta_file: per-sample input files.
        weight_file: optional per-sample weight file (falsy to disable).
        train_meta_file: optional filter file, or None to disable it.
        yyyymm: last month of the window as a 6-character string.
        months: window length in months; <= 0 means no lower bound.
        tmp_data_file, tmp_label_file: outputs for kept data/label lines.
        tmp_weight_file: optional output for kept weight lines.
    """
    assert len(yyyymm) == 6
    last_ym = '%s-%s' % (yyyymm[:4], yyyymm[4:])
    if months <= 0:
        first_ym = '0000-00'
    else:
        first_ym = util.getPreviousYm(last_ym, months - 1)
    logging.info('training period: %s - %s', first_ym, last_ym)
    assert first_ym <= last_ym

    handles = []  # every file opened below; all are closed in the finally

    def openTracked(path, mode):
        fp = open(path, mode)
        handles.append(fp)
        return fp

    # try/finally guarantees the files are closed even when an assertion
    # fails midway.
    try:
        data_ifp = openTracked(data_file, 'r')
        data_ofp = openTracked(tmp_data_file, 'w')
        label_ifp = openTracked(label_file, 'r')
        label_ofp = openTracked(tmp_label_file, 'w')
        weight_ifp = openTracked(weight_file, 'r') if weight_file else None
        weight_ofp = (
            openTracked(tmp_weight_file, 'w') if tmp_weight_file else None)
        meta_fp = openTracked(meta_file, 'r')
        if train_meta_file is None:
            train_meta_fp = None
            train_meta = None
        else:
            train_meta_fp = openTracked(train_meta_file, 'r')
            train_meta = train_meta_fp.readline()

        count = 0
        while True:
            meta = meta_fp.readline()
            if meta == '':
                # All inputs must run out at the same time.
                assert data_ifp.readline() == ''
                assert label_ifp.readline() == ''
                if weight_ifp is not None:
                    assert weight_ifp.readline() == ''
                break
            data = data_ifp.readline()
            label = label_ifp.readline()
            assert data != ''
            assert label != ''
            if weight_ifp is not None:
                weight = weight_ifp.readline()
                assert weight != ''

            if train_meta is not None:
                if meta != train_meta:
                    continue
                # Once the filter file is exhausted readline() yields '',
                # which never equals a meta line, so all remaining samples
                # are skipped.
                train_meta = train_meta_fp.readline()

            assert meta[-1] == '\n'
            _, date, _, _ = meta[:-1].split('\t')
            ym = util.ymdToYm(date)
            if ym < first_ym or ym > last_ym:
                continue
            assert data[-1] == '\n'
            assert label[-1] == '\n'
            # fp.write() replaces the Python-2-only "print >> fp" statement.
            data_ofp.write(data[:-1] + '\n')
            label_ofp.write(label[:-1] + '\n')
            if weight_ofp is not None:
                assert weight[-1] == '\n'
                weight_ofp.write(weight[:-1] + '\n')
            count += 1

        logging.info('selected %d training samples', count)
    finally:
        for fp in handles:
            fp.close()
示例#16
0
def computePercFeature(input_dir, tickers, rank, output_dir):
    """Converts a per-ticker feature into a cross-sectional score per month.

    Steps:
      1. Read util.readKeyValueList(input_dir/<ticker>) for each ticker and
         move every date to the month after it (util.getNextYm of its
         yyyy-mm).  If several values land in the same month the latest one
         wins; months must otherwise be strictly increasing (asserted).
      2. Carry each value forward month by month until the ticker's next
         value, or through the latest month seen across all tickers.
      3. Per month, replace the values by scores: 0.5 when there is a single
         ticker; with rank, position / count after sorting by value;
         otherwise (value - min) / (max - min), or 0.5 when the span is
         below EPS.
      4. Write '<ym><TAB><score as %f>' lines, sorted by month, to
         output_dir/<ticker>.

    Args:
        input_dir: directory holding one feature file per ticker.
        tickers: iterable of ticker names to process.
        rank: truthy for rank-based scores, falsy for min-max scaling.
        output_dir: directory that receives one output file per ticker.
    """
    # ticker => [[date, value] ...]
    # where date is the first yyyy-mm after data is published.
    data = dict()
    for ticker in tickers:
        udvalues = []
        for item in util.readKeyValueList('%s/%s' % (input_dir, ticker)):
            date = util.getNextYm(util.ymdToYm(item[0]))
            if udvalues and udvalues[-1][0] == date:
                udvalues[-1][1] = item[1]  # same month: latest value wins
            else:
                if udvalues:
                    assert udvalues[-1][0] < date
                udvalues.append([date, item[1]])
        data[ticker] = udvalues

    min_date = '9999-99'
    max_date = '0000-00'
    # .values()/.items() work on Python 2 and 3 alike; the itervalues() and
    # iteritems() methods used before exist on Python 2 only.
    for dvalues in data.values():
        if not dvalues:
            continue
        min_date = min(min_date, dvalues[0][0])
        max_date = max(max_date, dvalues[-1][0])

    percs = dict()  # date => [[ticker, value] ...]
    date = min_date
    while date <= max_date:
        percs[date] = []
        date = util.getNextYm(date)
    for ticker, dvalues in data.items():
        for i, (date, value) in enumerate(dvalues):
            if i < len(dvalues) - 1:
                # Populate value up to next date (not inclusive).
                end = dvalues[i + 1][0]
            else:
                # Populate value up to max date (inclusive).
                end = util.getNextYm(max_date)
            while date < end:
                percs[date].append([ticker, value])
                date = util.getNextYm(date)

    # Calculate percentiles (in place: item[1] becomes the score).
    for tvalues in percs.values():
        assert len(tvalues) > 0
        # Use 0.5 if there is a single element.
        if len(tvalues) == 1:
            tvalues[0][1] = 0.5
            continue
        if rank:
            tvalues.sort(key=lambda item: item[1])
            for i, item in enumerate(tvalues):
                item[1] = float(i) / len(tvalues)
        else:
            values = [item[1] for item in tvalues]
            minv = min(values)
            span = max(values) - minv
            for item in tvalues:
                if span < EPS:
                    item[1] = 0.5
                else:
                    item[1] = (item[1] - minv) / span

    # Write output.
    data = dict()  # ticker => [[date, perc] ...]
    for date, tpercs in percs.items():
        for ticker, perc in tpercs:
            data.setdefault(ticker, []).append([date, perc])
    for ticker, dpercs in data.items():
        dpercs.sort(key=lambda item: item[0])
        with open('%s/%s' % (output_dir, ticker), 'w') as fp:
            for date, perc in dpercs:
                # fp.write() replaces the Python-2-only "print >> fp"
                # statement.
                fp.write('%s\t%f\n' % (date, perc))
示例#17
0
def main():
  """Scores every month in --meta_file and writes ranked predictions.

  The distinct months of the meta file are processed in ascending order.
  For a month, the model and imputer are looked up under the month that lies
  prediction_window + delay_window months earlier (util.getPreviousYm), with
  file names built by getName from the prefix/suffix flags.  Months before
  the first available model are skipped; afterwards a missing model is an
  error (assertion) unless --allow_older_models is set, in which case the
  most recently used model and imputer are reused.

  For each scored month the result file receives a 'date: <ym>' line
  followed by one '<TAB>ticker<TAB>gain<TAB>score' line per sample, sorted
  by descending score.  The score is column 1 of predict_proba when the
  model offers it, else the output of predict.

  TMP_DATA_FILE is used as scratch space and removed on exit.
  """
  parser = argparse.ArgumentParser()
  parser.add_argument('--data_file', required=True)
  parser.add_argument('--label_file', required=True)
  parser.add_argument('--meta_file', required=True)
  # Similar to --train_meta_file in train_model.py
  parser.add_argument('--predict_meta_file')
  parser.add_argument('--model_dir', required=True)
  parser.add_argument('--model_prefix', required=True)
  parser.add_argument('--model_suffix', required=True)
  parser.add_argument('--imputer_dir', required=True)
  parser.add_argument('--imputer_prefix', required=True)
  parser.add_argument('--imputer_suffix', required=True)
  parser.add_argument('--prediction_window', type=int, required=True)
  parser.add_argument('--delay_window', type=int, required=True)
  parser.add_argument('--result_file', required=True)
  parser.add_argument('--allow_older_models', action='store_true')
  args = parser.parse_args()

  # Collect the distinct months to predict for.
  with open(args.meta_file, 'r') as fp:
    lines = fp.read().splitlines()
  dates = set()
  for line in lines:
    # Unpacking all four fields also validates the column count.
    _, date, _, _ = line.split('\t')
    dates.add(util.ymdToYm(date))
  dates = sorted(dates)

  started = False  # check no 'hole' in simulation period
  delta = args.prediction_window + args.delay_window
  previous_files = [None, None]  # model, imputer
  ofp = open(args.result_file, 'w')
  # try/finally closes the result file and removes the scratch file even
  # when an assertion or a model call fails midway.
  try:
    for date in dates:
      ym = util.getPreviousYm(date, delta)
      model_name = getName(ym, args.model_prefix, args.model_suffix)
      imputer_name = getName(ym, args.imputer_prefix, args.imputer_suffix)
      model_file = '%s/%s' % (args.model_dir, model_name)
      imputer_file = '%s/%s' % (args.imputer_dir, imputer_name)
      if not os.path.isfile(model_file):
        if args.allow_older_models and previous_files[0] is not None:
          model_file, imputer_file = previous_files
          # logging.warn is a deprecated alias that newer Python versions
          # no longer provide; logging.warning works everywhere.
          logging.warning('using previous model %s for %s', model_file, date)
        else:
          assert not started
          continue

      assert os.path.isfile(imputer_file)
      started = True
      previous_files = [model_file, imputer_file]

      meta = prepareData(date, args.data_file, args.label_file,
                         args.meta_file, args.predict_meta_file,
                         TMP_DATA_FILE)
      # ndmin=2 keeps one row per sample even when the month has a single
      # sample (or a single feature); without it loadtxt returns a 1-D
      # array and the row-count check below compares the wrong dimension.
      data = numpy.loadtxt(TMP_DATA_FILE, ndmin=2)
      assert data.shape[0] == len(meta), (
          'inconsistent data size: %d vs %d' % (data.shape[0], len(meta)))

      # NOTE: pickle.load can execute arbitrary code; the model and imputer
      # directories must only contain trusted files.
      with open(imputer_file, 'rb') as fp:
        imputer = pickle.load(fp)
      data = imputer.transform(data)

      with open(model_file, 'rb') as fp:
        model = pickle.load(fp)

      # hasattr (rather than a name lookup in dir()) also respects
      # attributes that exist on the class but are unavailable on this
      # particular instance.
      if hasattr(model, 'predict_proba'):
        prob = [row[1] for row in model.predict_proba(data)]
      else:
        prob = model.predict(data)

      assert len(prob) == len(meta)
      items = [[ticker, gain, score]
               for (ticker, gain), score in zip(meta, prob)]
      items.sort(key=lambda item: item[2], reverse=True)
      # ofp.write() replaces the Python-2-only "print >> ofp" statement.
      ofp.write('date: %s\n' % date)
      for ticker, gain, score in items:
        ofp.write('\t%s\t%f\t%f\n' % (ticker, gain, score))
  finally:
    ofp.close()
    if os.path.isfile(TMP_DATA_FILE):
      os.remove(TMP_DATA_FILE)