def computeVertPerc2Feature(input_dir, output_dir, feature, windows_str, ticker_file, info_dir):
  """Compute per-window vertical percentile features for every ticker.

  Writes one tab-separated (date, perc) file per (window, ticker) pair and
  records (year, perc) pairs per window for the feature-info summary.
  """
  windows = [int(token) for token in windows_str.split(',')]
  tickers = util.readTickers(ticker_file)
  feature_info = dict((w, []) for w in windows)  # window => [(yyyy, perc) ...]

  # Ensure every window's output directory exists up front.
  for w in windows:
    wdir = getTargetDir(output_dir, feature, w)
    if not os.path.isdir(wdir):
      os.mkdir(wdir)

  for ticker in tickers:
    src = '%s/%s/%s' % (input_dir, feature, ticker)
    if not os.path.isfile(src):
      continue  # no data for this ticker
    data = util.readKeyValueList(src)

    # One open output file per window for this ticker.
    out_fps = dict()
    for w in windows:
      out_fps[w] = open(
          '%s/%s' % (getTargetDir(output_dir, feature, w), ticker), 'w')

    for index in range(len(data)):
      date, percs = compute(data, index, windows)
      for w, perc in percs.iteritems():
        print >> out_fps[w], '%s\t%f' % (date, perc)
      year = util.ymdToY(date)
      for w in windows:
        feature_info[w].append((year, percs[w]))

    for fp in out_fps.itervalues():
      fp.close()

  for w in windows:
    util.writeFeatureInfo(
        [input_dir, output_dir, feature, windows_str, ticker_file],
        feature_info[w], '%s/%s' % (info_dir, getTarget(feature, w)))
示例#2
0
def computePerc(input_dir, output_dir):
    """Rank-normalize each date's cross-section of ticker values.

    For every date, all tickers' values are sorted and each is replaced by
    rank/count in [0, 1).  Output is one (date, perc) file per ticker,
    sorted by date.
    """
    dvalues = dict()  # date => [[ticker, value] ...]
    for ticker in sorted(os.listdir(input_dir)):
        for date, value in util.readKeyValueList('%s/%s' % (input_dir, ticker)):
            dvalues.setdefault(date, []).append([ticker, value])
    # Replace each raw value by its rank percentile within its date.
    for entries in dvalues.itervalues():
        entries.sort(key=lambda entry: entry[1])
        count = len(entries)
        for rank in range(count):
            entries[rank][1] = float(rank) / count
    # Regroup by ticker.
    tvalues = dict()  # ticker => [[date, perc] ...]
    for date, tpercs in dvalues.iteritems():
        for ticker, perc in tpercs:
            tvalues.setdefault(ticker, []).append([date, perc])
    # Emit each ticker's series in date order.
    for ticker, series in tvalues.iteritems():
        series.sort(key=lambda item: item[0])
        with open('%s/%s' % (output_dir, ticker), 'w') as fp:
            for date, perc in series:
                print >> fp, '%s\t%f' % (date, perc)
示例#3
0
def computePerc(input_dir, output_dir):
  """Rank-normalize each date's cross-section of ticker values into [0, 1)."""
  by_date = dict()  # date => [[ticker, value] ...]
  for ticker in sorted(os.listdir(input_dir)):
    for date, value in util.readKeyValueList('%s/%s' % (input_dir, ticker)):
      by_date.setdefault(date, []).append([ticker, value])
  # Replace raw values by rank percentiles within each date.
  for entries in by_date.itervalues():
    entries.sort(key=lambda entry: entry[1])
    n = len(entries)
    for rank in range(n):
      entries[rank][1] = float(rank) / n
  # Regroup the percentiles by ticker.
  by_ticker = dict()  # ticker => [[date, perc] ...]
  for date, entries in by_date.iteritems():
    for ticker, perc in entries:
      by_ticker.setdefault(ticker, []).append([date, perc])
  # Write each ticker's series sorted by date.
  for ticker, series in by_ticker.iteritems():
    series.sort(key=lambda item: item[0])
    with open('%s/%s' % (output_dir, ticker), 'w') as fp:
      for date, perc in series:
        print >> fp, '%s\t%f' % (date, perc)
示例#4
0
def getGainDict(gain_file):
  """Read a gain file and index it by year-month.

  Returns ym => [date, gain]; asserts each month appears at most once.
  """
  gain_dict = dict()
  for date, gain in util.readKeyValueList(gain_file):
    ym = util.ymdToYm(date)
    assert ym not in gain_dict
    gain_dict[ym] = [date, gain]
  return gain_dict
示例#5
0
def computePreviousFeature(feature_dir, k, pfeature_dir):
  """Re-date each ticker's features to the first day of the month k months ahead."""
  for ticker in sorted(os.listdir(feature_dir)):
    source = '%s/%s' % (feature_dir, ticker)
    with open('%s/%s' % (pfeature_dir, ticker), 'w') as fp:
      for date, feature in util.readKeyValueList(source):
        shifted = util.getNextYm(util.ymdToYm(date), k)
        print >> fp, '%s-01\t%f' % (shifted, feature)
示例#6
0
def computePreviousFeature(feature_dir, k, pfeature_dir):
    """Shift each ticker's feature dates forward by k months (to day 01)."""
    tickers = sorted(os.listdir(feature_dir))
    for ticker in tickers:
        dfeatures = util.readKeyValueList('%s/%s' % (feature_dir, ticker))
        out_path = '%s/%s' % (pfeature_dir, ticker)
        with open(out_path, 'w') as fp:
            for date, feature in dfeatures:
                shifted_ym = util.getNextYm(util.ymdToYm(date), k)
                print >> fp, '%s-01\t%f' % (shifted_ym, feature)
示例#7
0
def computeVolatility(price_dir, k, volatility_dir):
  """Write, per ticker, the std dev of consecutive gains over a trailing k-window."""
  assert k > 0
  for ticker in sorted(os.listdir(price_dir)):
    dprices = util.readKeyValueList('%s/%s' % (price_dir, ticker))
    with open('%s/%s' % (volatility_dir, ticker), 'w') as fp:
      for i in range(len(dprices)):
        window = [pair[1] for pair in dprices[max(0, i - k):i + 1]]
        gains = [(window[j + 1] - window[j]) / (window[j] + EPS)
                 for j in range(len(window) - 1)]
        print >> fp, '%s\t%f' % (dprices[i][0], computeStd(gains))
示例#8
0
def computeWindowFeature(args):
    """Compute lookback-window features from per-ticker value series.

    For each row i (past the largest window), gathers the values seen
    `window` rows earlier for every configured window.  With args.do_raw it
    writes the util.normalize'd lookback values, one output dir per window;
    with args.do_fd it writes normalized first differences between
    consecutive windows, one output dir per window pair.  One file per
    ticker in each dir.
    """
    assert args.do_raw or args.do_fd
    tickers = sorted(os.listdir(args.value_dir))
    windows = [int(window) for window in args.windows.split(',')]
    assert min(windows) >= 0, 'cannot look at future values'
    assert len(windows) > 0
    # First differences need at least two windows.
    assert len(windows) > 1 or not args.do_fd
    max_window = max(windows)

    for ticker in tickers:
        raw_fps = None
        fd_fps = None
        # Open one output file per window (raw) and per adjacent window
        # pair (fd), creating the directories on demand.
        if args.do_raw:
            raw_fps = []
            for window in windows:
                raw_dir = '%s/%s%d' % (args.feature_dir, args.prefix, window)
                if not os.path.isdir(raw_dir):
                    os.mkdir(raw_dir)
                raw_fps.append(open('%s/%s' % (raw_dir, ticker), 'w'))
        if args.do_fd:
            fd_fps = []
            for i in range(len(windows) - 1):
                fd_dir = '%s/%sfd-%d' % (args.feature_dir, args.prefix,
                                         windows[i])
                if not os.path.isdir(fd_dir):
                    os.mkdir(fd_dir)
                fd_fps.append(open('%s/%s' % (fd_dir, ticker), 'w'))

        dvalues = util.readKeyValueList('%s/%s' % (args.value_dir, ticker))
        # Start at max_window so every lookback index is valid.
        for i in range(max_window, len(dvalues)):
            values = [dvalues[i - window][1] for window in windows]
            if raw_fps:
                raws = util.normalize(values)
                assert len(raws) == len(raw_fps)
                for j in range(len(raws)):
                    print >> raw_fps[j], '%s\t%f' % (dvalues[i][0], raws[j])
            if fd_fps:
                # Relative change between adjacent lookback values;
                # args.bonus guards against division by zero.
                derivatives = [
                    (values[j] - values[j + 1]) / (values[j + 1] + args.bonus)
                    for j in range(len(windows) - 1)
                ]
                derivatives = util.normalize(derivatives)
                assert len(derivatives) == len(fd_fps)
                for j in range(len(derivatives)):
                    print >> fd_fps[j], '%s\t%f' % (dvalues[i][0],
                                                    derivatives[j])
        if raw_fps:
            for fp in raw_fps:
                fp.close()
        if fd_fps:
            for fp in fd_fps:
                fp.close()
示例#9
0
def computeVolatility(price_dir, k, volatility_dir):
    """Write per-ticker volatility: std dev of gains over a trailing k-window."""
    assert k > 0
    for ticker in sorted(os.listdir(price_dir)):
        dprices = util.readKeyValueList('%s/%s' % (price_dir, ticker))
        out_path = '%s/%s' % (volatility_dir, ticker)
        with open(out_path, 'w') as fp:
            for i in range(len(dprices)):
                window = [pair[1] for pair in dprices[max(0, i - k):i + 1]]
                gains = []
                for j in range(len(window) - 1):
                    gains.append(
                        (window[j + 1] - window[j]) / (window[j] + EPS))
                print >> fp, '%s\t%f' % (dprices[i][0], computeStd(gains))
def computeRollingWindowFeature(args):
    """Compute a rolling-window aggregate feature for each ticker.

    Reads each ticker's (date, value) series, verifies it is strictly
    sorted by date, then writes one (date, aggregate) line per position
    once a full window of args.window values is available.  Currently only
    args.method == 'mean' is supported.
    """
    assert args.window > 0
    # Fail fast on unsupported methods: previously an unknown method left
    # `f` unbound, causing a confusing NameError (or silently reusing a
    # stale value) inside the write loop.
    assert args.method == 'mean', 'unsupported method: %s' % args.method
    tickers = sorted(os.listdir(args.input_dir))
    for ticker in tickers:
        dvalues = util.readKeyValueList('%s/%s' % (args.input_dir, ticker))
        dates = [dvalue[0] for dvalue in dvalues]
        values = [dvalue[1] for dvalue in dvalues]
        # Input must be strictly increasing by date.
        for i in range(len(dates) - 1):
            assert dates[i] < dates[i + 1]
        with open('%s/%s' % (args.output_dir, ticker), 'w') as fp:
            for i in range(args.window - 1, len(dates)):
                wvalues = values[i - args.window + 1:i + 1]
                f = sum(wvalues) / args.window
                print >> fp, '%s\t%f' % (dates[i], f)
示例#11
0
def computeVertPercFeature(input_dir, output_dir, feature, windows_str,
                           ticker_file, info_dir):
    """Compute per-window vertical percentile features for every ticker.

    For each data row, collectData returns the windows actually available
    (ws) and their values (vs); computePerc converts the values to
    percentiles.  Writes one (date, perc) file per (window, ticker) pair,
    and records (year, perc) — or (year, None) for windows missing at that
    row — per window for the feature-info summary.
    """
    windows = [int(w) for w in windows_str.split(',')]
    tickers = util.readTickers(ticker_file)
    feature_info = {w: [] for w in windows}  # [[yyyy, feature] ...]

    # Create each window's output directory if needed.
    for window in windows:
        target_dir = getTargetDir(output_dir, feature, window)
        if not os.path.isdir(target_dir):
            os.mkdir(target_dir)

    for ticker in tickers:
        feature_file = '%s/%s/%s' % (input_dir, feature, ticker)
        if not os.path.isfile(feature_file):
            continue
        data = util.readKeyValueList(feature_file)

        # One open output file per window for this ticker.
        ofps = {
            w: open('%s/%s' % (getTargetDir(output_dir, feature, w), ticker),
                    'w')
            for w in windows
        }

        for index in range(len(data)):
            date, ws, vs = collectData(data, index, windows)
            ps = computePerc(vs)
            for i in range(len(ws)):
                print >> ofps[ws[i]], '%s\t%f' % (date, ps[i])
            year = util.ymdToY(date)
            for window in windows:
                # Windows absent from ws at this row get a None placeholder.
                try:
                    i = ws.index(window)
                    feature_info[window].append((year, ps[i]))
                except ValueError:
                    feature_info[window].append((year, None))

        for ofp in ofps.itervalues():
            ofp.close()

    for window in windows:
        target = getTarget(feature, window)
        util.writeFeatureInfo(
            [input_dir, output_dir, feature, windows_str, ticker_file],
            feature_info[window], '%s/%s' % (info_dir, target))
示例#12
0
def computeVertGainFeature(input_dir, output_dir, feature, windows_str,
                           ticker_file, info_dir):
    """Compute per-window vertical gain features for every ticker.

    Writes one tab-separated (date, gain) file per (window, ticker) pair;
    windows with no gain at a given date contribute (year, None) to the
    feature-info summary.
    """
    windows = [int(token) for token in windows_str.split(',')]
    tickers = util.readTickers(ticker_file)
    feature_info = dict((w, []) for w in windows)  # window => [(yyyy, gain) ...]

    # Ensure per-window output directories exist.
    for w in windows:
        wdir = getTargetDir(output_dir, feature, w)
        if not os.path.isdir(wdir):
            os.mkdir(wdir)

    for ticker in tickers:
        src = '%s/%s/%s' % (input_dir, feature, ticker)
        if not os.path.isfile(src):
            continue
        data = util.readKeyValueList(src)

        # One open output file per window for this ticker.
        out_fps = dict()
        for w in windows:
            out_fps[w] = open(
                '%s/%s' % (getTargetDir(output_dir, feature, w), ticker), 'w')

        for index in range(len(data)):
            date, gains = compute(data, index, windows)
            year = util.ymdToY(date)
            for w, gain in gains.iteritems():
                print >> out_fps[w], '%s\t%f' % (date, gain)
            for w in windows:
                value = gains[w] if w in gains else None
                feature_info[w].append((year, value))

        for fp in out_fps.itervalues():
            fp.close()

    for w in windows:
        util.writeFeatureInfo(
            [input_dir, output_dir, feature, windows_str, ticker_file],
            feature_info[w], '%s/%s' % (info_dir, getTarget(feature, w)))
示例#13
0
def filterMetadata(input_file, min_raw_price, raw_price_dir, max_volatility,
                   volatility_dir, min_marketcap, marketcap_dir, max_holes,
                   hole_dir, membership_file, remove_neg_labels, label_file,
                   output_file):
    """Filter metadata rows by price/volatility/marketcap/hole/membership/label.

    Reads tab-separated (ticker, date, ...) rows from input_file and copies
    to output_file only the rows passing every enabled check; a check is
    enabled when its corresponding dir/file argument is not None.  When
    remove_neg_labels is set, label_file is read in lockstep with input_file
    and rows with a negative label are dropped.  Per-check skip counts are
    logged at the end.

    Fixes over the previous version: the input, label and output file
    handles are now closed, and the assert message typo is corrected.
    """
    stats = {
        'min_raw_price': 0,
        'max_volatility': 0,
        'min_marketcap': 0,
        'max_holes': 0,
        'membership': 0,
        'neg_label': 0,
    }

    ifp = open(input_file, 'r')
    if remove_neg_labels:
        lfp = open(label_file, 'r')
    ofp = open(output_file, 'w')

    # Per-ticker caches; reloaded whenever the ticker changes (rows are
    # assumed grouped by ticker).
    prev_ticker = None
    price = None  # for prev_ticker, date => price
    volatility = None  # for prev ticker, date => volatility
    marketcap = None  # for prev ticker, [dates, values]
    hole = None  # for prev ticker, [dates, holes]
    if membership_file is None:
        membership = None
    else:
        membership = readMembership(
            membership_file)  # ticker => [[start, end] ...]

    while True:
        line = ifp.readline()
        if remove_neg_labels:
            lline = lfp.readline()
        if line == '':
            if remove_neg_labels:
                assert lline == '', 'inconsistent line count between meta and label files'
            break
        assert line.endswith('\n')
        items = line[:-1].split('\t')
        assert len(items) >= 2
        ticker, date = items[0], items[1]
        if ticker != prev_ticker:
            # New ticker: reload whichever per-ticker datasets are enabled.
            prev_ticker = ticker
            if raw_price_dir is not None:
                price = util.readKeyValueDict('%s/%s' %
                                              (raw_price_dir, ticker))
            if volatility_dir is not None:
                volatility = util.readKeyValueDict('%s/%s' %
                                                   (volatility_dir, ticker))
            if marketcap_dir is not None:
                tmp = util.readKeyValueList('%s/%s' % (marketcap_dir, ticker))
                marketcap_dates = [t[0] for t in tmp]
                marketcap_values = [t[1] for t in tmp]
                marketcap = (marketcap_dates, marketcap_values)
            if hole_dir is not None:
                tmp = util.readKeyValueList('%s/%s' % (hole_dir, ticker))
                hole_dates = [t[0] for t in tmp]
                hole_values = [t[1] for t in tmp]
                hole = (hole_dates, hole_values)
        # Maybe check price.
        if price is not None:
            assert date in price, 'missing price for %s on %s' % (ticker, date)
            if price[date] < min_raw_price:
                stats['min_raw_price'] += 1
                continue
        # Maybe check volatility.
        if volatility is not None:
            assert date in volatility, 'missing volatility for %s on %s' % (
                ticker, date)
            if volatility[date] > max_volatility:
                stats['max_volatility'] += 1
                continue
        # Maybe check marketcap (most recent value at or before date).
        if marketcap is not None:
            marketcap_dates, marketcap_values = marketcap
            index = bisect.bisect_right(marketcap_dates, date) - 1
            if index < 0 or marketcap_values[index] < min_marketcap:
                stats['min_marketcap'] += 1
                continue
        # Maybe check holes (most recent value at or before date).
        if hole is not None:
            hole_dates, hole_values = hole
            index = bisect.bisect_right(hole_dates, date) - 1
            if index < 0 or hole_values[index] > max_holes:
                stats['max_holes'] += 1
                continue
        # Maybe check membership.
        if membership is not None:
            if not isMember(membership, ticker, date):
                stats['membership'] += 1
                continue
        # Maybe check label.
        if remove_neg_labels:
            assert lline.endswith('\n')
            label = float(lline[:-1])
            if label < 0:
                stats['neg_label'] += 1
                continue
        print >> ofp, line[:-1]
    # Previously these handles were leaked.
    ifp.close()
    if remove_neg_labels:
        lfp.close()
    ofp.close()
    logging.info('skip_stats: %s' % stats)
示例#14
0
def collectData(gain_dir, date_file, max_neg, min_pos, feature_base_dir,
                feature_list_file, feature_stats_file, min_date, max_date,
                window, min_feature_perc, data_file, label_file, rlabel_file,
                meta_file, weight_power, weight_file):
    tickers = sorted(os.listdir(gain_dir))
    feature_list = readFeatureList(feature_list_file)
    min_feature_count = int(len(feature_list) * min_feature_perc)
    feature_ranges = readFeatureRanges(feature_stats_file)
    for feature in feature_list:
        if feature not in feature_ranges:
            assert (feature.find('gain') > 0 or feature.find('price') > 0
                    or feature.find('volume') > 0
                    or feature.find('volatility') > 0
                    or feature.find('_hp') > 0 or feature.startswith('sector')
                    or feature.startswith('industry')
                    or feature.startswith('window')), (
                        'no range info for feature %s' % feature)
            feature_ranges[feature] = [float('-Inf'), float('Inf')]
        lower, upper = feature_ranges[feature]

    data_fp = open(data_file, 'w')
    label_fp = open(label_file, 'w')
    rlabel_fp = open(rlabel_file, 'w')
    meta_fp = open(meta_file, 'w')
    weight_fp = None
    if weight_file:
        weight_fp = open(weight_file, 'w')

    skip_stats = {
        'feature_file': 0,
        'index': 0,
        'min_date': 0,
        'max_date': 0,
        'filter_date': 0,
        'neg_pos': 0,
        'window': 0,
        'min_perc': 0,
        '1_perc': 0,
        '99_perc': 0
    }

    dates = None
    if date_file:
        with open(date_file, 'r') as fp:
            dates = set(fp.read().splitlines())

    for ticker in tickers:
        gain_file = '%s/%s' % (gain_dir, ticker)
        gains = util.readKeyValueList(gain_file)

        feature_items = [[] for i in range(len(feature_list))]
        for i in range(len(feature_list)):
            feature_file = '%s/%s/%s' % (feature_base_dir, feature_list[i],
                                         ticker)
            if not os.path.isfile(feature_file):
                skip_stats['feature_file'] += 1
                continue
            keys, values = util.readKeyListValueList(feature_file)
            for j in range(len(keys)):
                if keys[j] == '*':
                    continue
                ymd = keys[j].split('-')
                if len(ymd) == 3:
                    continue
                # Change yyyy-mm to yyyy-mm-01
                assert len(ymd) == 2
                keys[j] += '-01'
            feature_items[i] = [keys, values]

        for gain_date, gain in gains:
            if gain_date < min_date:
                skip_stats['min_date'] += 1
                continue
            if gain_date > max_date:
                skip_stats['max_date'] += 1
                continue
            if dates is not None and gain_date not in dates:
                skip_stats['filter_date'] += 1
                continue

            if max_neg < gain and gain < min_pos:
                skip_stats['neg_pos'] += 1
                # Do not skip these they need to be part of testing data.
                # Instead output negative label and postpone filtering to filter_metadata
                # (remove negative labels for training and not for testing).
                #continue

            if DEBUG:
                print 'gain: %f (%s)' % (gain, gain_date)

            if gain <= max_neg:
                weight = max_neg - gain
                label = 0.0
            elif gain >= min_pos:
                weight = gain - min_pos
                label = 1.0
            else:
                weight = 0.0
                label = -1.0
            weight = weight**weight_power

            features = [MISSING_VALUE for i in range(len(feature_list))]
            feature_count = 0
            for i in range(len(feature_list)):
                keys, values = feature_items[i]
                if len(keys) == 1 and keys[0] == '*':
                    # undated feature, eg sector
                    index = 0
                else:
                    # dated feature, eg pgain
                    index = bisect.bisect_right(keys, gain_date) - 1
                    if index < 0:
                        skip_stats['index'] += 1
                        continue

                    gain_date_obj = datetime.datetime.strptime(
                        gain_date, '%Y-%m-%d')
                    feature_date_obj = datetime.datetime.strptime(
                        keys[index], '%Y-%m-%d')
                    delta = (gain_date_obj - feature_date_obj).days
                    if delta > window:
                        skip_stats['window'] += 1
                        continue

                feature = values[index]
                lower, upper = feature_ranges[feature_list[i]]
                if feature < lower:
                    skip_stats['1_perc'] += 1
                    continue
                if feature > upper:
                    skip_stats['99_perc'] += 1
                    continue

                if DEBUG:
                    print 'feature %s: (%s, %f)' % (feature_list[i],
                                                    keys[index], feature)

                features[i] = feature
                feature_count += 1

            if feature_count < min_feature_count:
                skip_stats['min_perc'] += 1
                continue

            print >> data_fp, ' '.join(
                ['%f' % feature for feature in features])
            print >> label_fp, '%f' % label
            print >> rlabel_fp, '%f' % gain
            print >> meta_fp, '%s\t%s\t%d\t%f' % (ticker, gain_date,
                                                  feature_count, gain)
            if weight_fp:
                print >> weight_fp, '%f' % weight

        if DEBUG: break

    data_fp.close()
    label_fp.close()
    rlabel_fp.close()
    meta_fp.close()
    if weight_fp:
        weight_fp.close()
    logging.info('skip_stats: %s' % skip_stats)
示例#15
0
def computePercFeature(input_dir, tickers, rank, output_dir):
  """Compute monthly cross-sectional percentile features for the tickers.

  Each ticker's values are re-dated to the first month after publication,
  deduped per month (latest wins), carried forward until the next
  observation, then converted per month to rank percentiles (rank=True) or
  min-max scaled values (rank=False), and written one file per ticker.
  """
  # ticker => [[date, value] ...]
  # where date is the first yyyy-mm after data is published.
  # Dates are deduped (any yyyy-mm with more than one values available,
  # the latest one wins).
  data = dict()
  for ticker in tickers:
    dvalues = util.readKeyValueList('%s/%s' % (input_dir, ticker))
    udvalues = []
    for i in range(len(dvalues)):
      date = util.getNextYm(util.ymdToYm(dvalues[i][0]))
      if len(udvalues) > 0 and udvalues[-1][0] == date:
        udvalues[-1][1] = dvalues[i][1]
      else:
        if len(udvalues) > 0:
          assert udvalues[-1][0] < date
        udvalues.append([date, dvalues[i][1]])
    data[ticker] = udvalues

  # Overall month range across all tickers.
  min_date = '9999-99'
  max_date = '0000-00'
  for dvalues in data.itervalues():
    if len(dvalues) == 0:
      continue
    min_date = min(min_date, dvalues[0][0])
    max_date = max(max_date, dvalues[-1][0])

  percs = dict()  # date => [[ticker, value] ...]
  date = min_date
  while date <= max_date:
    percs[date] = []
    date = util.getNextYm(date)
  # Carry each observation forward month by month until superseded.
  for ticker, dvalues in data.iteritems():
    for i in range(len(dvalues)):
      date, value = dvalues[i]
      if i < len(dvalues) - 1:
        # Populate value up to next date (not inclusive).
        next = dvalues[i+1][0]
      else:
        # Populate value up to max date (inclusive).
        next = util.getNextYm(max_date)
      while date < next:
        percs[date].append([ticker, value])
        date = util.getNextYm(date)

  # Calculate percentiles.
  for date, tvalues in percs.iteritems():
    assert len(tvalues) > 0
    # Use 0.5 if there is a single element.
    if len(tvalues) == 1:
      tvalues[0][1] = 0.5
      continue
    if rank:
      tvalues.sort(key=lambda item: item[1])
      for i in range(len(tvalues)):
        tvalues[i][1] = float(i)/len(tvalues)
    else:
      values = [item[1] for item in tvalues]
      maxv = max(values)
      minv = min(values)
      span = maxv - minv
      for i in range(len(tvalues)):
        if span < EPS:
          tvalues[i][1] = 0.5
        else:
          tvalues[i][1] = (tvalues[i][1] - minv) / span

  # Write output.
  data = dict()  # ticker => [[date, perc] ...]
  for date, tpercs in percs.iteritems():
    for ticker, perc in tpercs:
      if ticker not in data:
        data[ticker] = [[date, perc]]
      else:
        data[ticker].append([date, perc])
  for ticker, dpercs in data.iteritems():
    dpercs.sort(key=lambda item: item[0])
    with open('%s/%s' % (output_dir, ticker), 'w') as fp:
      for date, perc in dpercs:
        print >> fp, '%s\t%f' % (date, perc)
示例#16
0
def computePercFeature(input_dir, tickers, rank, output_dir):
    """Compute monthly cross-sectional percentile features for the tickers.

    Each ticker's values are re-dated to the first month after publication,
    deduped per month (latest wins), carried forward until the next
    observation, then converted per month to rank percentiles (rank=True)
    or min-max scaled values (rank=False), and written one file per ticker.
    """
    # ticker => [[date, value] ...]
    # where date is the first yyyy-mm after data is published.
    # Dates are deduped (any yyyy-mm with more than one values available,
    # the latest one wins).
    data = dict()
    for ticker in tickers:
        dvalues = util.readKeyValueList('%s/%s' % (input_dir, ticker))
        udvalues = []
        for i in range(len(dvalues)):
            date = util.getNextYm(util.ymdToYm(dvalues[i][0]))
            if len(udvalues) > 0 and udvalues[-1][0] == date:
                udvalues[-1][1] = dvalues[i][1]
            else:
                if len(udvalues) > 0:
                    assert udvalues[-1][0] < date
                udvalues.append([date, dvalues[i][1]])
        data[ticker] = udvalues

    # Overall month range across all tickers.
    min_date = '9999-99'
    max_date = '0000-00'
    for dvalues in data.itervalues():
        if len(dvalues) == 0:
            continue
        min_date = min(min_date, dvalues[0][0])
        max_date = max(max_date, dvalues[-1][0])

    percs = dict()  # date => [[ticker, value] ...]
    date = min_date
    while date <= max_date:
        percs[date] = []
        date = util.getNextYm(date)
    # Carry each observation forward month by month until superseded.
    for ticker, dvalues in data.iteritems():
        for i in range(len(dvalues)):
            date, value = dvalues[i]
            if i < len(dvalues) - 1:
                # Populate value up to next date (not inclusive).
                next = dvalues[i + 1][0]
            else:
                # Populate value up to max date (inclusive).
                next = util.getNextYm(max_date)
            while date < next:
                percs[date].append([ticker, value])
                date = util.getNextYm(date)

    # Calculate percentiles.
    for date, tvalues in percs.iteritems():
        assert len(tvalues) > 0
        # Use 0.5 if there is a single element.
        if len(tvalues) == 1:
            tvalues[0][1] = 0.5
            continue
        if rank:
            tvalues.sort(key=lambda item: item[1])
            for i in range(len(tvalues)):
                tvalues[i][1] = float(i) / len(tvalues)
        else:
            values = [item[1] for item in tvalues]
            maxv = max(values)
            minv = min(values)
            span = maxv - minv
            for i in range(len(tvalues)):
                if span < EPS:
                    tvalues[i][1] = 0.5
                else:
                    tvalues[i][1] = (tvalues[i][1] - minv) / span

    # Write output.
    data = dict()  # ticker => [[date, perc] ...]
    for date, tpercs in percs.iteritems():
        for ticker, perc in tpercs:
            if ticker not in data:
                data[ticker] = [[date, perc]]
            else:
                data[ticker].append([date, perc])
    for ticker, dpercs in data.iteritems():
        dpercs.sort(key=lambda item: item[0])
        with open('%s/%s' % (output_dir, ticker), 'w') as fp:
            for date, perc in dpercs:
                print >> fp, '%s\t%f' % (date, perc)
示例#17
0
def collectData(gain_dir, feature_base_dir, feature_list_file,
                feature_stats_file, min_date, max_date,
                window, min_feature_perc, data_file, meta_file):
  """Join per-ticker gains with dated features into model input files.

  For every (ticker, gain_date) within [min_date, max_date], collects the
  most recent value of each feature (no older than `window` days, within
  its configured range) and writes the feature vector to data_file and
  (ticker, date, feature count, gain) to meta_file.  Rows with fewer than
  min_feature_perc * len(feature_list) present features are dropped.
  Skip counts are logged at the end.
  """
  tickers = sorted(os.listdir(gain_dir))
  feature_list = readFeatureList(feature_list_file)
  min_feature_count = int(len(feature_list) * min_feature_perc)
  feature_ranges = readFeatureRanges(feature_stats_file)
  for feature in feature_list:
    if feature not in feature_ranges:
      # Features without precomputed ranges must be of a known kind; give
      # them an unbounded range.
      assert (feature.find('gain') > 0 or
              feature.find('price') > 0 or
              feature.find('volume') > 0 or
              feature.find('volatility') > 0 or
              feature.find('_hp') > 0 or
              feature.startswith('sector') or
              feature.startswith('industry')), (
          'no range info for feature %s' % feature)
      feature_ranges[feature] = [float('-Inf'), float('Inf')]
    # NOTE(review): this lookup appears to be dead code — lower/upper are
    # re-read per feature value in the main loop below.
    lower, upper = feature_ranges[feature]

  data_fp = open(data_file, 'w')
  meta_fp = open(meta_file, 'w')

  skip_stats = {'feature_file': 0,
                'index': 0,
                'min_date': 0,
                'max_date': 0,
                'window': 0,
                'min_perc': 0,
                '1_perc': 0,
                '99_perc': 0}

  for ticker in tickers:
    gain_file = '%s/%s' % (gain_dir, ticker)
    gains = util.readKeyValueList(gain_file)

    # Load every feature series for this ticker; [] marks a missing file.
    feature_items = [[] for i in range(len(feature_list))]
    for i in range(len(feature_list)):
      feature_file = '%s/%s/%s' % (feature_base_dir, feature_list[i], ticker)
      if not os.path.isfile(feature_file):
        skip_stats['feature_file'] += 1
        continue
      items = util.readKeyValueList(feature_file)
      for j in range(len(items)):
        if items[j][0] == '*':
          continue
        ymd = items[j][0].split('-')
        if len(ymd) == 3:
          continue
        # Change yyyy-mm to yyyy-mm-01
        assert len(ymd) == 2
        items[j][0] += '-01'
      feature_items[i] = items

    for gain_date, gain in gains:
      if gain_date < min_date:
        skip_stats['min_date'] += 1
        continue
      if gain_date > max_date:
        skip_stats['max_date'] += 1
        continue

      if DEBUG:
        print 'gain: %f (%s)' % (gain, gain_date)

      features = [MISSING_VALUE for i in range(len(feature_list))]
      feature_count = 0
      for i in range(len(feature_list)):
        if len(feature_items[i]) == 1 and feature_items[i][0][0] == '*':
          # undated feature, eg sector
          index = 0
        else:
          # dated feature, eg pgain
          feature_dates = [item[0] for item in feature_items[i]]
          index = bisect.bisect_right(feature_dates, gain_date) - 1
          if index < 0:
            skip_stats['index'] += 1
            continue

          # Skip features older than `window` days at gain_date.
          gain_date_obj = datetime.datetime.strptime(gain_date, '%Y-%m-%d')
          feature_date_obj = datetime.datetime.strptime(feature_dates[index],
                                                        '%Y-%m-%d')
          delta = (gain_date_obj - feature_date_obj).days
          if delta > window:
            skip_stats['window'] += 1
            continue

        feature = feature_items[i][index][1]
        lower, upper = feature_ranges[feature_list[i]]
        if feature < lower:
          skip_stats['1_perc'] += 1
          continue
        if feature > upper:
          skip_stats['99_perc'] += 1
          continue

        if DEBUG:
          print 'feature %s: (%s, %f)' % (
              feature_list[i], feature_items[i][index][0], feature)

        features[i] = feature
        feature_count += 1

      if feature_count < min_feature_count:
        skip_stats['min_perc'] += 1
        continue

      print >> data_fp, ' '.join(['%f' % feature for feature in features])
      print >> meta_fp, '%s\t%s\t%d\t%f' % (
          ticker, gain_date, feature_count, gain)

    if DEBUG: break

  data_fp.close()
  meta_fp.close()
  logging.info('skip_stats: %s' % skip_stats)