def processYahoo(raw_dir, processed_dir):
    """Converts raw Yahoo CSV files into per-ticker monthly price files.

    For each raw file, writes one tab-separated row per month boundary:
    date, close, adjusted close, and the previous month's average daily
    volume.  The volume is written twice — presumably because Yahoo data
    has no separate adjusted volume (TODO confirm against downstream
    readers).
    """
    for raw_file in os.listdir(raw_dir):
        assert raw_file.endswith(EXTENSION)
        ticker = raw_file[:-len(EXTENSION)]
        with open('%s/%s' % (raw_dir, raw_file), 'r') as ifp:
            rows = ifp.read().splitlines()
        assert len(rows) > 0
        assert rows[0] == HEADER
        with open('%s/%s' % (processed_dir, ticker), 'w') as ofp:
            last_ymd = None
            last_ym = None
            vol_sum = 0.0
            day_count = 0
            # Raw rows are newest-first; walk them backwards (skipping the
            # header at index 0) so dates come out in ascending order.
            for idx in range(len(rows) - 1, 0, -1):
                ymd, op, hi, lo, cl, vo, acl = rows[idx].split(',')
                if last_ymd is not None:
                    assert ymd > last_ymd
                last_ymd = ymd
                ym = util.ymdToYm(ymd)
                if ym != last_ym:
                    last_ym = ym
                    if day_count > 0:
                        # Turn the running total into an average daily volume.
                        vol_sum /= day_count
                    print >> ofp, '%s\t%s\t%s\t%s\t%s' % (
                        ymd, cl, acl, vol_sum, vol_sum)
                    vol_sum = 0.0
                    day_count = 0
                vol_sum += getVolume(vo)
                day_count += 1
def processYahoo(raw_dir, processed_dir):
    """Converts raw Yahoo CSV files into per-ticker monthly price files.

    For each raw file, writes one tab-separated row per month boundary:
    date, close, adjusted close, and the previous month's average daily
    volume.  The volume is written twice — presumably because Yahoo data
    has no separate adjusted volume (TODO confirm against downstream
    readers).
    """
    for raw_file in os.listdir(raw_dir):
        assert raw_file.endswith(EXTENSION)
        ticker = raw_file[:-len(EXTENSION)]
        with open('%s/%s' % (raw_dir, raw_file), 'r') as ifp:
            rows = ifp.read().splitlines()
        assert len(rows) > 0
        assert rows[0] == HEADER
        with open('%s/%s' % (processed_dir, ticker), 'w') as ofp:
            last_ymd = None
            last_ym = None
            vol_sum = 0.0
            day_count = 0
            # Raw rows are newest-first; walk them backwards (skipping the
            # header at index 0) so dates come out in ascending order.
            for idx in range(len(rows) - 1, 0, -1):
                ymd, op, hi, lo, cl, vo, acl = rows[idx].split(',')
                if last_ymd is not None:
                    assert ymd > last_ymd
                last_ymd = ymd
                ym = util.ymdToYm(ymd)
                if ym != last_ym:
                    last_ym = ym
                    if day_count > 0:
                        # Turn the running total into an average daily volume.
                        vol_sum /= day_count
                    print >> ofp, '%s\t%s\t%s\t%s\t%s' % (
                        ymd, cl, acl, vol_sum, vol_sum)
                    vol_sum = 0.0
                    day_count = 0
                vol_sum += getVolume(vo)
                day_count += 1
def processEodRaw(raw_dir, ticker_file, processed_dir):
    """Converts raw EOD CSV files into per-ticker monthly price files.

    Writes one tab-separated row per month boundary: date, close, adjusted
    close, previous month's average daily volume and average daily adjusted
    volume.  Tickers without a raw file are silently skipped.
    """
    for ticker in util.readTickers(ticker_file):
        raw_file = '%s/%s' % (raw_dir, ticker)
        if not os.path.isfile(raw_file):
            continue  # no raw data for this ticker
        with open(raw_file, 'r') as ifp:
            rows = ifp.read().splitlines()
        with open('%s/%s' % (processed_dir, ticker), 'w') as ofp:
            last_ymd, last_ym = None, None
            vol_sum, avol_sum = 0.0, 0.0
            day_count = 0
            for row in rows:
                (name, ymd, op, hi, lo, cl, vo, di, sp,
                 aop, ahi, alo, acl, avo) = row.split(',')
                # Each row starts with the ticker itself.
                assert name == ticker
                # Rows must be in strictly ascending date order.
                if last_ymd is not None:
                    assert ymd > last_ymd
                last_ymd = ymd
                ym = util.ymdToYm(ymd)
                if ym != last_ym:
                    last_ym = ym
                    if day_count > 0:
                        # Turn running totals into average daily volumes.
                        vol_sum /= day_count
                        avol_sum /= day_count
                    print >> ofp, '%s\t%s\t%s\t%f\t%f' % (
                        ymd, cl, acl, vol_sum, avol_sum)
                    vol_sum, avol_sum = 0.0, 0.0
                    day_count = 0
                vol_sum += getVolume(vo)
                avol_sum += getVolume(avo)
                day_count += 1
def processEodRaw(raw_dir, ticker_file, processed_dir):
    """Converts raw EOD CSV files into per-ticker monthly price files.

    Writes one tab-separated row per month boundary: date, close, adjusted
    close, previous month's average daily volume and average daily adjusted
    volume.  Tickers without a raw file are silently skipped.
    """
    for ticker in util.readTickers(ticker_file):
        raw_file = '%s/%s' % (raw_dir, ticker)
        if not os.path.isfile(raw_file):
            continue  # no raw data for this ticker
        with open(raw_file, 'r') as ifp:
            rows = ifp.read().splitlines()
        with open('%s/%s' % (processed_dir, ticker), 'w') as ofp:
            last_ymd, last_ym = None, None
            vol_sum, avol_sum = 0.0, 0.0
            day_count = 0
            for row in rows:
                (name, ymd, op, hi, lo, cl, vo, di, sp,
                 aop, ahi, alo, acl, avo) = row.split(',')
                # Each row starts with the ticker itself.
                assert name == ticker
                # Rows must be in strictly ascending date order.
                if last_ymd is not None:
                    assert ymd > last_ymd
                last_ymd = ymd
                ym = util.ymdToYm(ymd)
                if ym != last_ym:
                    last_ym = ym
                    if day_count > 0:
                        # Turn running totals into average daily volumes.
                        vol_sum /= day_count
                        avol_sum /= day_count
                    print >> ofp, '%s\t%s\t%s\t%f\t%f' % (
                        ymd, cl, acl, vol_sum, avol_sum)
                    vol_sum, avol_sum = 0.0, 0.0
                    day_count = 0
                vol_sum += getVolume(vo)
                avol_sum += getVolume(avo)
                day_count += 1
def getGainDict(gain_file):
    """Reads gain_file into a dict: yyyy-mm => [date, gain].

    Asserts that each month appears at most once in the input.
    """
    gain_dict = dict()
    for date, gain in util.readKeyValueList(gain_file):
        ym = util.ymdToYm(date)
        assert ym not in gain_dict  # one gain entry per month
        gain_dict[ym] = [date, gain]
    return gain_dict
def computePreviousFeature(feature_dir, k, pfeature_dir):
    """Shifts each ticker's features forward by k months.

    For every feature file, each (date, feature) row is re-dated to the
    month k months after the original date (day fixed to 01) and written
    to the corresponding file under pfeature_dir.
    """
    for ticker in sorted(os.listdir(feature_dir)):
        dfeatures = util.readKeyValueList('%s/%s' % (feature_dir, ticker))
        with open('%s/%s' % (pfeature_dir, ticker), 'w') as ofp:
            for date, feature in dfeatures:
                shifted = util.getNextYm(util.ymdToYm(date), k)
                print >> ofp, '%s-01\t%f' % (shifted, feature)
def computePreviousFeature(feature_dir, k, pfeature_dir):
    """Shifts each ticker's features forward by k months.

    For every feature file, each (date, feature) row is re-dated to the
    month k months after the original date (day fixed to 01) and written
    to the corresponding file under pfeature_dir.
    """
    for ticker in sorted(os.listdir(feature_dir)):
        dfeatures = util.readKeyValueList('%s/%s' % (feature_dir, ticker))
        with open('%s/%s' % (pfeature_dir, ticker), 'w') as ofp:
            for date, feature in dfeatures:
                shifted = util.getNextYm(util.ymdToYm(date), k)
                print >> ofp, '%s-01\t%f' % (shifted, feature)
def readPrices(price_file):
    """Reads price_file into a dict: yyyy-mm => [ymd, float price].

    Each input line is 'ymd<TAB>price'; asserts at most one line per month.
    """
    prices = dict()
    with open(price_file, 'r') as ifp:
        for line in ifp.read().splitlines():
            ymd, price = line.split('\t')
            ym = util.ymdToYm(ymd)
            assert ym not in prices  # one price row per month
            prices[ym] = [ymd, float(price)]
    return prices
def readPrices(price_file):
    """Reads price_file into a dict: yyyy-mm => [ymd, float price].

    Each input line is 'ymd<TAB>price'; asserts at most one line per month.
    """
    prices = dict()
    with open(price_file, 'r') as ifp:
        for line in ifp.read().splitlines():
            ymd, price = line.split('\t')
            ym = util.ymdToYm(ymd)
            assert ym not in prices  # one price row per month
            prices[ym] = [ymd, float(price)]
    return prices
def prepareData(ym, data_file, label_file, meta_file, predict_meta_file,
                tmp_data_file):
    """Selects the samples belonging to month ym for prediction.

    Walks data_file, label_file and meta_file in lockstep (one line per
    sample), optionally keeping only lines whose meta also appears in
    predict_meta_file, and keeps those whose meta date falls in month ym.
    Selected data lines are written to tmp_data_file.

    Returns: list of [ticker, gain] for the selected samples, in input
    order (parallel to the rows written to tmp_data_file).
    """
    data_ifp = open(data_file, 'r')
    label_ifp = open(label_file, 'r')
    meta_ifp = open(meta_file, 'r')
    data_ofp = open(tmp_data_file, 'w')
    if predict_meta_file is None:
        predict_meta_ifp = None
        predict_meta = None
    else:
        predict_meta_ifp = open(predict_meta_file, 'r')
        predict_meta = predict_meta_ifp.readline()
    meta = []
    # The loop asserts liberally; without try/finally any failed assert
    # would leak all five file handles (the original did exactly that).
    try:
        while True:
            line = meta_ifp.readline()
            if line == '':
                # All three files must end together.
                assert data_ifp.readline() == ''
                assert label_ifp.readline() == ''
                break
            assert line[-1] == '\n'
            data_line = data_ifp.readline()
            label_line = label_ifp.readline()
            assert data_line != ''
            assert label_line != ''
            if predict_meta is not None:
                # Keep only samples whose meta line is listed in
                # predict_meta_file (assumed to be a subsequence of meta).
                if line != predict_meta:
                    continue
                predict_meta = predict_meta_ifp.readline()
            ticker, date, tmp, gain = line[:-1].split('\t')
            if util.ymdToYm(date) != ym:
                continue
            assert data_line[-1] == '\n'
            assert label_line[-1] == '\n'
            label = float(label_line[:-1])
            gain = float(gain)
            # This is not true when labels are cut at other places than 0.
            # TODO: --label_file is not needed; remove.
            #if label > 0.5: assert gain >= 0
            #if label < 0.5: assert gain <= 0
            print >> data_ofp, data_line[:-1]
            meta.append([ticker, gain])
    finally:
        data_ifp.close()
        label_ifp.close()
        meta_ifp.close()
        data_ofp.close()
        if predict_meta_ifp is not None:
            predict_meta_ifp.close()
    return meta
def prepareData(ym, data_file, label_file, meta_file, predict_meta_file,
                tmp_data_file):
    """Selects the samples belonging to month ym for prediction.

    Walks data_file, label_file and meta_file in lockstep (one line per
    sample), optionally keeping only lines whose meta also appears in
    predict_meta_file, and keeps those whose meta date falls in month ym.
    Selected data lines are written to tmp_data_file.

    Returns: list of [ticker, gain] for the selected samples, in input
    order (parallel to the rows written to tmp_data_file).
    """
    data_ifp = open(data_file, 'r')
    label_ifp = open(label_file, 'r')
    meta_ifp = open(meta_file, 'r')
    data_ofp = open(tmp_data_file, 'w')
    if predict_meta_file is None:
        predict_meta_ifp = None
        predict_meta = None
    else:
        predict_meta_ifp = open(predict_meta_file, 'r')
        predict_meta = predict_meta_ifp.readline()
    meta = []
    # The loop asserts liberally; without try/finally any failed assert
    # would leak all five file handles (the original did exactly that).
    try:
        while True:
            line = meta_ifp.readline()
            if line == '':
                # All three files must end together.
                assert data_ifp.readline() == ''
                assert label_ifp.readline() == ''
                break
            assert line[-1] == '\n'
            data_line = data_ifp.readline()
            label_line = label_ifp.readline()
            assert data_line != ''
            assert label_line != ''
            if predict_meta is not None:
                # Keep only samples whose meta line is listed in
                # predict_meta_file (assumed to be a subsequence of meta).
                if line != predict_meta:
                    continue
                predict_meta = predict_meta_ifp.readline()
            ticker, date, tmp, gain = line[:-1].split('\t')
            if util.ymdToYm(date) != ym:
                continue
            assert data_line[-1] == '\n'
            assert label_line[-1] == '\n'
            label = float(label_line[:-1])
            gain = float(gain)
            # This is not true when labels are cut at other places than 0.
            # TODO: --label_file is not needed; remove.
            #if label > 0.5: assert gain >= 0
            #if label < 0.5: assert gain <= 0
            print >> data_ofp, data_line[:-1]
            meta.append([ticker, gain])
    finally:
        data_ifp.close()
        label_ifp.close()
        meta_ifp.close()
        data_ofp.close()
        if predict_meta_ifp is not None:
            predict_meta_ifp.close()
    return meta
def main():
    """Runs month-by-month simulation: predicts each month with the model
    trained prediction_window + delay_window months earlier, and writes
    ranked (ticker, gain, score) results per month to --result_file."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_file', required=True)
    parser.add_argument('--label_file', required=True)
    parser.add_argument('--meta_file', required=True)
    # Similar to --train_meta_file in train_model.py
    parser.add_argument('--predict_meta_file')
    parser.add_argument('--model_dir', required=True)
    parser.add_argument('--model_prefix', required=True)
    parser.add_argument('--model_suffix', required=True)
    parser.add_argument('--imputer_dir', required=True)
    parser.add_argument('--imputer_prefix', required=True)
    parser.add_argument('--imputer_suffix', required=True)
    parser.add_argument('--prediction_window', type=int, required=True)
    parser.add_argument('--delay_window', type=int, required=True)
    parser.add_argument('--result_file', required=True)
    parser.add_argument('--allow_older_models', action='store_true')
    args = parser.parse_args()
    # get dates (yyyy-mm) for prediction from the meta file
    with open(args.meta_file, 'r') as fp:
        lines = fp.read().splitlines()
    dates = set()
    for line in lines:
        tmp1, date, tmp2, tmp3 = line.split('\t')
        dates.add(util.ymdToYm(date))
    dates = sorted(dates)
    started = False  # check no 'hole' in simulation period
    delta = args.prediction_window + args.delay_window
    previous_files = [None, None]  # model, imputer
    # 'with' guarantees the result file is closed even if an assert fires
    # mid-simulation (the original leaked ofp on exceptions).
    with open(args.result_file, 'w') as ofp:
        for date in dates:
            # Use the model/imputer trained delta months before the
            # prediction month.
            ym = util.getPreviousYm(date, delta)
            model_name = getName(ym, args.model_prefix, args.model_suffix)
            imputer_name = getName(
                ym, args.imputer_prefix, args.imputer_suffix)
            model_file = '%s/%s' % (args.model_dir, model_name)
            imputer_file = '%s/%s' % (args.imputer_dir, imputer_name)
            if not os.path.isfile(model_file):
                if args.allow_older_models and previous_files[0] is not None:
                    model_file = previous_files[0]
                    imputer_file = previous_files[1]
                    # logging.warn is a deprecated alias of logging.warning.
                    logging.warning(
                        'using previous model %s for %s' % (model_file, date))
                else:
                    # Missing models are only allowed before the first
                    # available model, never in the middle of the period.
                    assert not started
                    continue
            assert os.path.isfile(imputer_file)
            started = True
            previous_files = [model_file, imputer_file]
            meta = prepareData(date, args.data_file, args.label_file,
                               args.meta_file, args.predict_meta_file,
                               TMP_DATA_FILE)
            # ndmin=2: with exactly one selected sample, loadtxt would
            # otherwise return a 1-D array and shape[0] would be the
            # feature count, tripping the sample-count assert below.
            data = numpy.loadtxt(TMP_DATA_FILE, ndmin=2)
            assert data.shape[0] == len(meta), (
                'inconsistent data size: %d vs %d' % (
                    data.shape[0], len(meta)))
            with open(imputer_file, 'rb') as fp:
                imputer = pickle.load(fp)
            data = imputer.transform(data)
            with open(model_file, 'rb') as fp:
                model = pickle.load(fp)
            if 'predict_proba' in dir(model):
                # Score = probability of the positive class.
                prob = model.predict_proba(data)
                prob = [item[1] for item in prob]
            else:
                prob = model.predict(data)
            assert len(prob) == len(meta)
            items = [[meta[i][0], meta[i][1], prob[i]]
                     for i in range(len(prob))]
            items.sort(key=lambda item: item[2], reverse=True)
            print >> ofp, 'date: %s' % date
            for item in items:
                ticker, gain, score = item
                print >> ofp, '\t%s\t%f\t%f' % (ticker, gain, score)
    if os.path.isfile(TMP_DATA_FILE):
        os.remove(TMP_DATA_FILE)
def selectData(data_file, label_file, meta_file, weight_file, train_meta_file,
               yyyymm, months, tmp_data_file, tmp_label_file,
               tmp_weight_file):
    """Copies training samples within a date window into tmp files.

    Walks data/label/(weight)/meta files in lockstep (one line per sample),
    optionally restricting to samples listed in train_meta_file, and keeps
    samples whose meta date falls within the 'months' months ending at
    yyyymm (all months when months <= 0).  Selected lines are written to
    tmp_data_file / tmp_label_file / tmp_weight_file respectively.
    """
    assert len(yyyymm) == 6
    y = yyyymm[:4]
    m = yyyymm[4:]
    last_ym = '%s-%s' % (y, m)
    if months <= 0:
        first_ym = '0000-00'  # no lower bound
    else:
        first_ym = util.getPreviousYm(last_ym, months - 1)
    logging.info('training period: %s - %s' % (first_ym, last_ym))
    assert first_ym <= last_ym
    # Writing weights requires reading them: without this check the write
    # below would hit an undefined local 'weight' (NameError) when
    # tmp_weight_file is given without weight_file.
    if tmp_weight_file:
        assert weight_file
    data_ifp = open(data_file, 'r')
    data_ofp = open(tmp_data_file, 'w')
    label_ifp = open(label_file, 'r')
    label_ofp = open(tmp_label_file, 'w')
    if weight_file:
        weight_ifp = open(weight_file, 'r')
    if tmp_weight_file:
        weight_ofp = open(tmp_weight_file, 'w')
    meta_fp = open(meta_file, 'r')
    if train_meta_file is None:
        train_meta_fp = None
        train_meta = None
    else:
        train_meta_fp = open(train_meta_file, 'r')
        train_meta = train_meta_fp.readline()
    count = 0
    while True:
        meta = meta_fp.readline()
        if meta == '':
            # All lockstep inputs must end together.
            assert data_ifp.readline() == ''
            assert label_ifp.readline() == ''
            if weight_file:
                assert weight_ifp.readline() == ''
            break
        data = data_ifp.readline()
        label = label_ifp.readline()
        assert data != ''
        assert label != ''
        if weight_file:
            weight = weight_ifp.readline()
            assert weight != ''
        if train_meta is not None:
            # Keep only samples whose meta line appears in train_meta_file
            # (assumed to be a subsequence of meta_file).
            if meta != train_meta:
                continue
            train_meta = train_meta_fp.readline()
        assert meta[-1] == '\n'
        ticker, date, tmp1, tmp2 = meta[:-1].split('\t')
        ym = util.ymdToYm(date)
        if ym < first_ym or ym > last_ym:
            continue
        assert data[-1] == '\n'
        assert label[-1] == '\n'
        print >> data_ofp, data[:-1]
        print >> label_ofp, label[:-1]
        if tmp_weight_file:
            assert weight[-1] == '\n'
            print >> weight_ofp, weight[:-1]
        count += 1
    logging.info('selected %d training samples' % count)
    data_ifp.close()
    data_ofp.close()
    label_ifp.close()
    label_ofp.close()
    if weight_file:
        weight_ifp.close()
    if tmp_weight_file:
        weight_ofp.close()
    meta_fp.close()
    if train_meta_fp is not None:
        train_meta_fp.close()
def computePercFeature(input_dir, tickers, rank, output_dir):
    """Writes a cross-sectional percentile feature per ticker per month.

    For every month, each ticker's latest reported value is either ranked
    (rank=True) or min-max scaled against all tickers with data in that
    month; the resulting value in [0, 1] is written per ticker as
    'yyyy-mm<TAB>perc' lines under output_dir.
    """
    # ticker => [[date, value] ...]
    # where date is the first yyyy-mm after data is published.
    # Dates are deduped (any yyyy-mm with more than one values available,
    # the latest one wins).
    data = dict()
    for ticker in tickers:
        raw = util.readKeyValueList('%s/%s' % (input_dir, ticker))
        deduped = []
        for i in range(len(raw)):
            date = util.getNextYm(util.ymdToYm(raw[i][0]))
            if deduped and deduped[-1][0] == date:
                deduped[-1][1] = raw[i][1]  # latest value wins
            else:
                if deduped:
                    assert deduped[-1][0] < date
                deduped.append([date, raw[i][1]])
        data[ticker] = deduped
    # Overall [min_date, max_date] span across all tickers.
    min_date = '9999-99'
    max_date = '0000-00'
    for dvalues in data.itervalues():
        if not dvalues:
            continue
        min_date = min(min_date, dvalues[0][0])
        max_date = max(max_date, dvalues[-1][0])
    # date => [[ticker, value] ...]; one bucket per month in the span.
    percs = dict()
    date = min_date
    while date <= max_date:
        percs[date] = []
        date = util.getNextYm(date)
    # Carry each value forward until the ticker's next report (exclusive);
    # the last value is carried through max_date (inclusive).
    for ticker, dvalues in data.iteritems():
        for i in range(len(dvalues)):
            date, value = dvalues[i]
            if i < len(dvalues) - 1:
                end_date = dvalues[i + 1][0]
            else:
                end_date = util.getNextYm(max_date)
            while date < end_date:
                percs[date].append([ticker, value])
                date = util.getNextYm(date)
    # Calculate percentiles.
    for date, tvalues in percs.iteritems():
        assert len(tvalues) > 0
        if len(tvalues) == 1:
            # Use 0.5 if there is a single element.
            tvalues[0][1] = 0.5
            continue
        if rank:
            tvalues.sort(key=lambda item: item[1])
            for i in range(len(tvalues)):
                tvalues[i][1] = float(i) / len(tvalues)
        else:
            values = [item[1] for item in tvalues]
            maxv, minv = max(values), min(values)
            span = maxv - minv
            for i in range(len(tvalues)):
                if span < EPS:
                    tvalues[i][1] = 0.5  # all values (nearly) equal
                else:
                    tvalues[i][1] = (tvalues[i][1] - minv) / span
    # Regroup by ticker and write output.
    data = dict()  # ticker => [[date, perc] ...]
    for date, tpercs in percs.iteritems():
        for ticker, perc in tpercs:
            if ticker not in data:
                data[ticker] = [[date, perc]]
            else:
                data[ticker].append([date, perc])
    for ticker, dpercs in data.iteritems():
        dpercs.sort(key=lambda item: item[0])
        with open('%s/%s' % (output_dir, ticker), 'w') as fp:
            for date, perc in dpercs:
                print >> fp, '%s\t%f' % (date, perc)
def selectData(data_file, label_file, meta_file, weight_file, train_meta_file,
               yyyymm, months, tmp_data_file, tmp_label_file,
               tmp_weight_file):
    """Copies training samples within a date window into tmp files.

    Walks data/label/(weight)/meta files in lockstep (one line per sample),
    optionally restricting to samples listed in train_meta_file, and keeps
    samples whose meta date falls within the 'months' months ending at
    yyyymm (all months when months <= 0).  Selected lines are written to
    tmp_data_file / tmp_label_file / tmp_weight_file respectively.
    """
    assert len(yyyymm) == 6
    y = yyyymm[:4]
    m = yyyymm[4:]
    last_ym = '%s-%s' % (y, m)
    if months <= 0:
        first_ym = '0000-00'  # no lower bound
    else:
        first_ym = util.getPreviousYm(last_ym, months - 1)
    logging.info('training period: %s - %s' % (first_ym, last_ym))
    assert first_ym <= last_ym
    # Writing weights requires reading them: without this check the write
    # below would hit an undefined local 'weight' (NameError) when
    # tmp_weight_file is given without weight_file.
    if tmp_weight_file:
        assert weight_file
    data_ifp = open(data_file, 'r')
    data_ofp = open(tmp_data_file, 'w')
    label_ifp = open(label_file, 'r')
    label_ofp = open(tmp_label_file, 'w')
    if weight_file:
        weight_ifp = open(weight_file, 'r')
    if tmp_weight_file:
        weight_ofp = open(tmp_weight_file, 'w')
    meta_fp = open(meta_file, 'r')
    if train_meta_file is None:
        train_meta_fp = None
        train_meta = None
    else:
        train_meta_fp = open(train_meta_file, 'r')
        train_meta = train_meta_fp.readline()
    count = 0
    while True:
        meta = meta_fp.readline()
        if meta == '':
            # All lockstep inputs must end together.
            assert data_ifp.readline() == ''
            assert label_ifp.readline() == ''
            if weight_file:
                assert weight_ifp.readline() == ''
            break
        data = data_ifp.readline()
        label = label_ifp.readline()
        assert data != ''
        assert label != ''
        if weight_file:
            weight = weight_ifp.readline()
            assert weight != ''
        if train_meta is not None:
            # Keep only samples whose meta line appears in train_meta_file
            # (assumed to be a subsequence of meta_file).
            if meta != train_meta:
                continue
            train_meta = train_meta_fp.readline()
        assert meta[-1] == '\n'
        ticker, date, tmp1, tmp2 = meta[:-1].split('\t')
        ym = util.ymdToYm(date)
        if ym < first_ym or ym > last_ym:
            continue
        assert data[-1] == '\n'
        assert label[-1] == '\n'
        print >> data_ofp, data[:-1]
        print >> label_ofp, label[:-1]
        if tmp_weight_file:
            assert weight[-1] == '\n'
            print >> weight_ofp, weight[:-1]
        count += 1
    logging.info('selected %d training samples' % count)
    data_ifp.close()
    data_ofp.close()
    label_ifp.close()
    label_ofp.close()
    if weight_file:
        weight_ifp.close()
    if tmp_weight_file:
        weight_ofp.close()
    meta_fp.close()
    if train_meta_fp is not None:
        train_meta_fp.close()
def computePercFeature(input_dir, tickers, rank, output_dir):
    """Writes a cross-sectional percentile feature per ticker per month.

    For every month, each ticker's latest reported value is either ranked
    (rank=True) or min-max scaled against all tickers with data in that
    month; the resulting value in [0, 1] is written per ticker as
    'yyyy-mm<TAB>perc' lines under output_dir.
    """
    # ticker => [[date, value] ...]
    # where date is the first yyyy-mm after data is published.
    # Dates are deduped (any yyyy-mm with more than one values available,
    # the latest one wins).
    data = dict()
    for ticker in tickers:
        raw = util.readKeyValueList('%s/%s' % (input_dir, ticker))
        deduped = []
        for i in range(len(raw)):
            date = util.getNextYm(util.ymdToYm(raw[i][0]))
            if deduped and deduped[-1][0] == date:
                deduped[-1][1] = raw[i][1]  # latest value wins
            else:
                if deduped:
                    assert deduped[-1][0] < date
                deduped.append([date, raw[i][1]])
        data[ticker] = deduped
    # Overall [min_date, max_date] span across all tickers.
    min_date = '9999-99'
    max_date = '0000-00'
    for dvalues in data.itervalues():
        if not dvalues:
            continue
        min_date = min(min_date, dvalues[0][0])
        max_date = max(max_date, dvalues[-1][0])
    # date => [[ticker, value] ...]; one bucket per month in the span.
    percs = dict()
    date = min_date
    while date <= max_date:
        percs[date] = []
        date = util.getNextYm(date)
    # Carry each value forward until the ticker's next report (exclusive);
    # the last value is carried through max_date (inclusive).
    for ticker, dvalues in data.iteritems():
        for i in range(len(dvalues)):
            date, value = dvalues[i]
            if i < len(dvalues) - 1:
                end_date = dvalues[i + 1][0]
            else:
                end_date = util.getNextYm(max_date)
            while date < end_date:
                percs[date].append([ticker, value])
                date = util.getNextYm(date)
    # Calculate percentiles.
    for date, tvalues in percs.iteritems():
        assert len(tvalues) > 0
        if len(tvalues) == 1:
            # Use 0.5 if there is a single element.
            tvalues[0][1] = 0.5
            continue
        if rank:
            tvalues.sort(key=lambda item: item[1])
            for i in range(len(tvalues)):
                tvalues[i][1] = float(i) / len(tvalues)
        else:
            values = [item[1] for item in tvalues]
            maxv, minv = max(values), min(values)
            span = maxv - minv
            for i in range(len(tvalues)):
                if span < EPS:
                    tvalues[i][1] = 0.5  # all values (nearly) equal
                else:
                    tvalues[i][1] = (tvalues[i][1] - minv) / span
    # Regroup by ticker and write output.
    data = dict()  # ticker => [[date, perc] ...]
    for date, tpercs in percs.iteritems():
        for ticker, perc in tpercs:
            if ticker not in data:
                data[ticker] = [[date, perc]]
            else:
                data[ticker].append([date, perc])
    for ticker, dpercs in data.iteritems():
        dpercs.sort(key=lambda item: item[0])
        with open('%s/%s' % (output_dir, ticker), 'w') as fp:
            for date, perc in dpercs:
                print >> fp, '%s\t%f' % (date, perc)
def main():
    """Runs month-by-month simulation: predicts each month with the model
    trained prediction_window + delay_window months earlier, and writes
    ranked (ticker, gain, score) results per month to --result_file."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_file', required=True)
    parser.add_argument('--label_file', required=True)
    parser.add_argument('--meta_file', required=True)
    # Similar to --train_meta_file in train_model.py
    parser.add_argument('--predict_meta_file')
    parser.add_argument('--model_dir', required=True)
    parser.add_argument('--model_prefix', required=True)
    parser.add_argument('--model_suffix', required=True)
    parser.add_argument('--imputer_dir', required=True)
    parser.add_argument('--imputer_prefix', required=True)
    parser.add_argument('--imputer_suffix', required=True)
    parser.add_argument('--prediction_window', type=int, required=True)
    parser.add_argument('--delay_window', type=int, required=True)
    parser.add_argument('--result_file', required=True)
    parser.add_argument('--allow_older_models', action='store_true')
    args = parser.parse_args()
    # get dates (yyyy-mm) for prediction from the meta file
    with open(args.meta_file, 'r') as fp:
        lines = fp.read().splitlines()
    dates = set()
    for line in lines:
        tmp1, date, tmp2, tmp3 = line.split('\t')
        dates.add(util.ymdToYm(date))
    dates = sorted(dates)
    started = False  # check no 'hole' in simulation period
    delta = args.prediction_window + args.delay_window
    previous_files = [None, None]  # model, imputer
    # 'with' guarantees the result file is closed even if an assert fires
    # mid-simulation (the original leaked ofp on exceptions).
    with open(args.result_file, 'w') as ofp:
        for date in dates:
            # Use the model/imputer trained delta months before the
            # prediction month.
            ym = util.getPreviousYm(date, delta)
            model_name = getName(ym, args.model_prefix, args.model_suffix)
            imputer_name = getName(
                ym, args.imputer_prefix, args.imputer_suffix)
            model_file = '%s/%s' % (args.model_dir, model_name)
            imputer_file = '%s/%s' % (args.imputer_dir, imputer_name)
            if not os.path.isfile(model_file):
                if args.allow_older_models and previous_files[0] is not None:
                    model_file = previous_files[0]
                    imputer_file = previous_files[1]
                    # logging.warn is a deprecated alias of logging.warning.
                    logging.warning(
                        'using previous model %s for %s' % (model_file, date))
                else:
                    # Missing models are only allowed before the first
                    # available model, never in the middle of the period.
                    assert not started
                    continue
            assert os.path.isfile(imputer_file)
            started = True
            previous_files = [model_file, imputer_file]
            meta = prepareData(date, args.data_file, args.label_file,
                               args.meta_file, args.predict_meta_file,
                               TMP_DATA_FILE)
            # ndmin=2: with exactly one selected sample, loadtxt would
            # otherwise return a 1-D array and shape[0] would be the
            # feature count, tripping the sample-count assert below.
            data = numpy.loadtxt(TMP_DATA_FILE, ndmin=2)
            assert data.shape[0] == len(meta), (
                'inconsistent data size: %d vs %d' % (
                    data.shape[0], len(meta)))
            with open(imputer_file, 'rb') as fp:
                imputer = pickle.load(fp)
            data = imputer.transform(data)
            with open(model_file, 'rb') as fp:
                model = pickle.load(fp)
            if 'predict_proba' in dir(model):
                # Score = probability of the positive class.
                prob = model.predict_proba(data)
                prob = [item[1] for item in prob]
            else:
                prob = model.predict(data)
            assert len(prob) == len(meta)
            items = [[meta[i][0], meta[i][1], prob[i]]
                     for i in range(len(prob))]
            items.sort(key=lambda item: item[2], reverse=True)
            print >> ofp, 'date: %s' % date
            for item in items:
                ticker, gain, score = item
                print >> ofp, '\t%s\t%f\t%f' % (ticker, gain, score)
    if os.path.isfile(TMP_DATA_FILE):
        os.remove(TMP_DATA_FILE)