def computeVertPerc2Feature(input_dir, output_dir, feature, windows_str,
                            ticker_file, info_dir):
    """Computes vertical-percentile features per ticker and window.

    For every ticker that has a feature file under input_dir/<feature>/,
    writes one tab-separated '<date>\t<perc>' file per window under
    getTargetDir(output_dir, feature, w), and records (year, perc) pairs
    per window for util.writeFeatureInfo.
    """
    windows = [int(w) for w in windows_str.split(',')]
    tickers = util.readTickers(ticker_file)
    feature_info = {w: [] for w in windows}  # window => [(yyyy, perc) ...]
    # Ensure one output dir exists per window.
    for window in windows:
        target_dir = getTargetDir(output_dir, feature, window)
        if not os.path.isdir(target_dir):
            os.mkdir(target_dir)
    for ticker in tickers:
        feature_file = '%s/%s/%s' % (input_dir, feature, ticker)
        if not os.path.isfile(feature_file):
            continue
        data = util.readKeyValueList(feature_file)
        # One output file per window for this ticker.
        ofps = {
            w: open('%s/%s' % (getTargetDir(output_dir, feature, w), ticker),
                    'w')
            for w in windows
        }
        for index in range(len(data)):
            date, percs = compute(data, index, windows)
            for window, perc in percs.iteritems():
                print >> ofps[window], '%s\t%f' % (date, perc)
            year = util.ymdToY(date)
            for window in windows:
                # BUGFIX: compute() may not return a perc for every window;
                # record None for missing windows (consistent with
                # computeVertGainFeature/computeVertPercFeature) instead of
                # raising KeyError.
                feature_info[window].append((year, percs.get(window)))
        for ofp in ofps.itervalues():
            ofp.close()
    for window in windows:
        target = getTarget(feature, window)
        util.writeFeatureInfo(
            [input_dir, output_dir, feature, windows_str, ticker_file],
            feature_info[window], '%s/%s' % (info_dir, target))
def computePerc(input_dir, output_dir):
    """Ranks each ticker's value cross-sectionally per date and rewrites it
    as a rank percentile in [0, 1); writes one tab-separated file per ticker
    under output_dir."""
    # Group values by date: date => [[ticker, value] ...]
    by_date = dict()
    for ticker in sorted(os.listdir(input_dir)):
        rows = util.readKeyValueList('%s/%s' % (input_dir, ticker))
        for date, value in rows:
            by_date.setdefault(date, []).append([ticker, value])
    # Replace each raw value with its rank percentile within its date.
    for entries in by_date.itervalues():
        entries.sort(key=lambda entry: entry[1])
        total = len(entries)
        for rank, entry in enumerate(entries):
            entry[1] = float(rank) / total
    # Regroup by ticker: ticker => [[date, perc] ...]
    by_ticker = dict()
    for date, entries in by_date.iteritems():
        for ticker, perc in entries:
            by_ticker.setdefault(ticker, []).append([date, perc])
    # Emit one date-sorted file per ticker.
    for ticker, dpercs in by_ticker.iteritems():
        dpercs.sort(key=lambda dperc: dperc[0])
        with open('%s/%s' % (output_dir, ticker), 'w') as fp:
            for date, perc in dpercs:
                print >> fp, '%s\t%f' % (date, perc)
def computePerc(input_dir, output_dir):
    """Converts each ticker's raw values into per-date rank percentiles in
    [0, 1) and writes one tab-separated file per ticker under output_dir.
    """
    # NOTE(review): duplicate of the computePerc definition earlier in this
    # file (identical logic); within one module the later definition shadows
    # the earlier one -- confirm both are intended to be kept.
    tickers = sorted(os.listdir(input_dir))
    dvalues = dict()  # date => [[ticker, value] ...]
    for ticker in tickers:
        data = util.readKeyValueList('%s/%s' % (input_dir, ticker))
        for date, value in data:
            if date not in dvalues:
                dvalues[date] = [[ticker, value]]
            else:
                dvalues[date].append([ticker, value])
    # Convert raw value to perc.
    for tvalues in dvalues.itervalues():
        # Sort by value; percentile is rank / count, so values fall in [0, 1).
        tvalues.sort(key=lambda item: item[1])
        for i in range(len(tvalues)):
            tvalues[i][1] = float(i) / len(tvalues)
    tvalues = dict()  # ticker => [[date, perc] ...]
    for date, tpercs in dvalues.iteritems():
        for ticker, perc in tpercs:
            if ticker not in tvalues:
                tvalues[ticker] = [[date, perc]]
            else:
                tvalues[ticker].append([date, perc])
    # Write output, one date-sorted file per ticker.
    for ticker, values in tvalues.iteritems():
        values.sort(key=lambda value: value[0])
        with open('%s/%s' % (output_dir, ticker), 'w') as fp:
            for date, perc in values:
                print >> fp, '%s\t%f' % (date, perc)
def getGainDict(gain_file):
    """Reads a per-date gain file and re-keys it by year-month.

    Returns {yyyy-mm: [date, gain]}; asserts each month appears at most once.
    """
    gain_dict = dict()
    for date, gain in util.readKeyValueList(gain_file):
        month = util.ymdToYm(date)
        assert month not in gain_dict
        gain_dict[month] = [date, gain]
    return gain_dict
def computePreviousFeature(feature_dir, k, pfeature_dir):
    """Shifts each ticker's dated features forward by k months and writes
    them (dated the 1st of the shifted month) under pfeature_dir."""
    for ticker in sorted(os.listdir(feature_dir)):
        rows = util.readKeyValueList('%s/%s' % (feature_dir, ticker))
        with open('%s/%s' % (pfeature_dir, ticker), 'w') as out:
            for date, value in rows:
                # Month of `date`, advanced by k.
                shifted = util.getNextYm(util.ymdToYm(date), k)
                print >> out, '%s-01\t%f' % (shifted, value)
def computeVolatility(price_dir, k, volatility_dir):
    """For each ticker and date, writes the standard deviation (computeStd)
    of relative day-over-day gains across a trailing window of up to k
    preceding prices; EPS guards against division by zero."""
    assert k > 0
    for ticker in sorted(os.listdir(price_dir)):
        dprices = util.readKeyValueList('%s/%s' % (price_dir, ticker))
        with open('%s/%s' % (volatility_dir, ticker), 'w') as out:
            for i, (date, _) in enumerate(dprices):
                # Trailing window: current price plus up to k preceding ones.
                window = [row[1] for row in dprices[max(0, i - k):i + 1]]
                rel_gains = [(b - a) / (a + EPS)
                             for a, b in zip(window, window[1:])]
                print >> out, '%s\t%f' % (date, computeStd(rel_gains))
def computeWindowFeature(args):
    """Computes lag-window features for each ticker under args.value_dir.

    For each ticker, and for the lags listed in args.windows, optionally
    writes:
      - raw: util.normalize of the lagged values (when args.do_raw), one
        file per window under '<feature_dir>/<prefix><window>/';
      - fd: util.normalize of first derivatives between consecutive lags
        (when args.do_fd), under '<feature_dir>/<prefix>fd-<window>/'.
    args.bonus is added to the denominator of each derivative.
    """
    assert args.do_raw or args.do_fd
    tickers = sorted(os.listdir(args.value_dir))
    windows = [int(window) for window in args.windows.split(',')]
    # BUGFIX: check non-emptiness before min(); previously min([]) raised
    # ValueError before the intended `assert len(windows) > 0` could fire.
    assert len(windows) > 0
    assert min(windows) >= 0, 'cannot look at future values'
    # Derivatives need at least two windows.
    assert len(windows) > 1 or not args.do_fd
    max_window = max(windows)
    for ticker in tickers:
        raw_fps = None
        fd_fps = None
        if args.do_raw:
            raw_fps = []
            for window in windows:
                raw_dir = '%s/%s%d' % (args.feature_dir, args.prefix, window)
                if not os.path.isdir(raw_dir):
                    os.mkdir(raw_dir)
                raw_fps.append(open('%s/%s' % (raw_dir, ticker), 'w'))
        if args.do_fd:
            fd_fps = []
            for i in range(len(windows) - 1):
                fd_dir = '%s/%sfd-%d' % (args.feature_dir, args.prefix,
                                         windows[i])
                if not os.path.isdir(fd_dir):
                    os.mkdir(fd_dir)
                fd_fps.append(open('%s/%s' % (fd_dir, ticker), 'w'))
        dvalues = util.readKeyValueList('%s/%s' % (args.value_dir, ticker))
        # Start at max_window so every lag index is valid.
        for i in range(max_window, len(dvalues)):
            values = [dvalues[i - window][1] for window in windows]
            if raw_fps:
                raws = util.normalize(values)
                assert len(raws) == len(raw_fps)
                for j in range(len(raws)):
                    print >> raw_fps[j], '%s\t%f' % (dvalues[i][0], raws[j])
            if fd_fps:
                derivatives = [
                    (values[j] - values[j + 1]) / (values[j + 1] + args.bonus)
                    for j in range(len(windows) - 1)
                ]
                derivatives = util.normalize(derivatives)
                assert len(derivatives) == len(fd_fps)
                for j in range(len(derivatives)):
                    print >> fd_fps[j], '%s\t%f' % (dvalues[i][0],
                                                    derivatives[j])
        if raw_fps:
            for fp in raw_fps:
                fp.close()
        if fd_fps:
            for fp in fd_fps:
                fp.close()
def computeVolatility(price_dir, k, volatility_dir):
    """For each ticker and date, writes the standard deviation (computeStd)
    of relative daily gains over a trailing window of up to k prices; EPS
    guards against division by zero.
    """
    # NOTE(review): duplicate of the computeVolatility definition earlier in
    # this file (identical logic); the later definition shadows the earlier
    # one -- confirm both are intended to be kept.
    assert k > 0
    tickers = sorted(os.listdir(price_dir))
    for ticker in tickers:
        price_file = '%s/%s' % (price_dir, ticker)
        dprices = util.readKeyValueList(price_file)
        with open('%s/%s' % (volatility_dir, ticker), 'w') as fp:
            for i in range(len(dprices)):
                # Trailing window includes the current price plus up to k
                # preceding ones.
                prices = [dprices[j][1] for j in range(max(0, i - k), i + 1)]
                gains = [(prices[j + 1] - prices[j]) / (prices[j] + EPS)
                         for j in range(len(prices) - 1)]
                volatility = computeStd(gains)
                print >> fp, '%s\t%f' % (dprices[i][0], volatility)
def computeRollingWindowFeature(args):
    """Writes a rolling-window aggregate of each ticker's value series.

    Reads per-ticker key-value files from args.input_dir (dates must be
    strictly increasing), aggregates each trailing window of args.window
    values using args.method, and writes '<date>\t<value>' lines to
    args.output_dir. Currently only method 'mean' is supported.

    Raises:
        ValueError: if args.method is not a supported aggregation.
    """
    assert args.window > 0
    tickers = sorted(os.listdir(args.input_dir))
    for ticker in tickers:
        dvalues = util.readKeyValueList('%s/%s' % (args.input_dir, ticker))
        dates = [dvalue[0] for dvalue in dvalues]
        values = [dvalue[1] for dvalue in dvalues]
        # Input must be sorted by date with no duplicates.
        for i in range(len(dates) - 1):
            assert dates[i] < dates[i + 1]
        with open('%s/%s' % (args.output_dir, ticker), 'w') as fp:
            for i in range(args.window - 1, len(dates)):
                wvalues = values[i - args.window + 1:i + 1]
                if args.method == 'mean':
                    f = sum(wvalues) / args.window
                else:
                    # BUGFIX: previously an unsupported method left `f`
                    # unbound (NameError on the first row) or silently
                    # reused the previous window's value; fail fast instead.
                    raise ValueError('unsupported method: %s' % args.method)
                print >> fp, '%s\t%f' % (dates[i], f)
def computeVertPercFeature(input_dir, output_dir, feature, windows_str,
                           ticker_file, info_dir):
    """Computes vertical (within-ticker, across-window) percentile features.

    For each ticker with a feature file under input_dir/<feature>/, writes
    one '<date>\t<perc>' file per window and records per-window (year, perc)
    pairs for util.writeFeatureInfo; windows with no data at a given index
    are recorded as (year, None).

    NOTE(review): this calls collectData(data, index, windows) and
    computePerc(vs), whose signatures differ from the module-level
    collectData/computePerc definitions in this file -- presumably different
    helpers from another module are intended; verify imports/shadowing.
    """
    windows = [int(w) for w in windows_str.split(',')]
    tickers = util.readTickers(ticker_file)
    feature_info = {w: [] for w in windows}  # window => [(yyyy, perc) ...]
    # Ensure one output dir per window.
    for window in windows:
        target_dir = getTargetDir(output_dir, feature, window)
        if not os.path.isdir(target_dir):
            os.mkdir(target_dir)
    for ticker in tickers:
        feature_file = '%s/%s/%s' % (input_dir, feature, ticker)
        if not os.path.isfile(feature_file):
            continue
        data = util.readKeyValueList(feature_file)
        # One output file per window for this ticker.
        ofps = {
            w: open('%s/%s' % (getTargetDir(output_dir, feature, w), ticker),
                    'w')
            for w in windows
        }
        for index in range(len(data)):
            # ws: windows actually available at this index; vs: their values.
            date, ws, vs = collectData(data, index, windows)
            ps = computePerc(vs)
            for i in range(len(ws)):
                print >> ofps[ws[i]], '%s\t%f' % (date, ps[i])
            year = util.ymdToY(date)
            for window in windows:
                try:
                    i = ws.index(window)
                    feature_info[window].append((year, ps[i]))
                except ValueError:
                    # Window missing at this index: record a placeholder.
                    feature_info[window].append((year, None))
        for ofp in ofps.itervalues():
            ofp.close()
    for window in windows:
        target = getTarget(feature, window)
        util.writeFeatureInfo(
            [input_dir, output_dir, feature, windows_str, ticker_file],
            feature_info[window], '%s/%s' % (info_dir, target))
def computeVertGainFeature(input_dir, output_dir, feature, windows_str,
                           ticker_file, info_dir):
    """Computes vertical gain features per ticker and window.

    Writes one '<date>\t<gain>' file per (window, ticker) and records
    per-window (year, gain) pairs (None when a window has no gain at a
    given index) for util.writeFeatureInfo.
    """
    windows = [int(token) for token in windows_str.split(',')]
    tickers = util.readTickers(ticker_file)
    feature_info = dict((w, []) for w in windows)  # window => [(yyyy, gain)]
    # Make sure every per-window output directory exists.
    for w in windows:
        wdir = getTargetDir(output_dir, feature, w)
        if not os.path.isdir(wdir):
            os.mkdir(wdir)
    for ticker in tickers:
        path = '%s/%s/%s' % (input_dir, feature, ticker)
        if not os.path.isfile(path):
            continue
        rows = util.readKeyValueList(path)
        # One output file per window for this ticker.
        outs = {}
        for w in windows:
            outs[w] = open(
                '%s/%s' % (getTargetDir(output_dir, feature, w), ticker), 'w')
        for index in range(len(rows)):
            date, gains = compute(rows, index, windows)
            year = util.ymdToY(date)
            for w, gain in gains.iteritems():
                print >> outs[w], '%s\t%f' % (date, gain)
            for w in windows:
                # Windows absent from `gains` are recorded as None.
                feature_info[w].append((year, gains.get(w)))
        for out in outs.itervalues():
            out.close()
    for w in windows:
        util.writeFeatureInfo(
            [input_dir, output_dir, feature, windows_str, ticker_file],
            feature_info[w],
            '%s/%s' % (info_dir, getTarget(feature, w)))
def filterMetadata(input_file, min_raw_price, raw_price_dir, max_volatility,
                   volatility_dir, min_marketcap, marketcap_dir, max_holes,
                   hole_dir, membership_file, remove_neg_labels, label_file,
                   output_file):
    """Filters metadata rows (ticker\\tdate\\t...) by per-ticker criteria.

    A row is dropped when any enabled check fails: raw price below
    min_raw_price, volatility above max_volatility, marketcap below
    min_marketcap, hole count above max_holes, index non-membership, or
    (when remove_neg_labels) a negative label read in lockstep from
    label_file. Surviving rows are copied to output_file; drop counts are
    logged per reason. Each check is enabled only when its dir/file
    argument is not None (label check via remove_neg_labels).
    """
    stats = {
        'min_raw_price': 0,
        'max_volatility': 0,
        'min_marketcap': 0,
        'max_holes': 0,
        'membership': 0,
        'neg_label': 0,
    }
    ifp = open(input_file, 'r')
    lfp = None
    if remove_neg_labels:
        lfp = open(label_file, 'r')
    ofp = open(output_file, 'w')
    try:
        prev_ticker = None
        price = None  # for prev_ticker, date => price
        volatility = None  # for prev ticker, date => volatility
        marketcap = None  # for prev ticker, (dates, values)
        hole = None  # for prev ticker, (dates, holes)
        if membership_file is None:
            membership = None
        else:
            membership = readMembership(
                membership_file)  # ticker => [[start, end] ...]
        while True:
            line = ifp.readline()
            if remove_neg_labels:
                lline = lfp.readline()
            if line == '':
                if remove_neg_labels:
                    # BUGFIX: typo 'inconsisten' -> 'inconsistent'.
                    assert lline == '', (
                        'inconsistent line count between meta and label files')
                break
            assert line.endswith('\n')
            items = line[:-1].split('\t')
            assert len(items) >= 2
            ticker, date = items[0], items[1]
            if ticker != prev_ticker:
                # New ticker: (re)load whatever per-ticker data is enabled.
                prev_ticker = ticker
                if raw_price_dir is not None:
                    price = util.readKeyValueDict(
                        '%s/%s' % (raw_price_dir, ticker))
                if volatility_dir is not None:
                    volatility = util.readKeyValueDict(
                        '%s/%s' % (volatility_dir, ticker))
                if marketcap_dir is not None:
                    tmp = util.readKeyValueList(
                        '%s/%s' % (marketcap_dir, ticker))
                    marketcap_dates = [t[0] for t in tmp]
                    marketcap_values = [t[1] for t in tmp]
                    marketcap = (marketcap_dates, marketcap_values)
                if hole_dir is not None:
                    tmp = util.readKeyValueList('%s/%s' % (hole_dir, ticker))
                    hole_dates = [t[0] for t in tmp]
                    hole_values = [t[1] for t in tmp]
                    hole = (hole_dates, hole_values)
            # Maybe check price.
            if price is not None:
                assert date in price, 'missing price for %s on %s' % (ticker,
                                                                      date)
                if price[date] < min_raw_price:
                    stats['min_raw_price'] += 1
                    continue
            # Maybe check volatility.
            if volatility is not None:
                assert date in volatility, (
                    'missing volatility for %s on %s' % (ticker, date))
                if volatility[date] > max_volatility:
                    stats['max_volatility'] += 1
                    continue
            # Maybe check marketcap (latest value at or before `date`).
            if marketcap is not None:
                marketcap_dates, marketcap_values = marketcap
                index = bisect.bisect_right(marketcap_dates, date) - 1
                if index < 0 or marketcap_values[index] < min_marketcap:
                    stats['min_marketcap'] += 1
                    continue
            # Maybe check holes (latest value at or before `date`).
            if hole is not None:
                hole_dates, hole_values = hole
                index = bisect.bisect_right(hole_dates, date) - 1
                if index < 0 or hole_values[index] > max_holes:
                    stats['max_holes'] += 1
                    continue
            # Maybe check membership.
            if membership is not None:
                if not isMember(membership, ticker, date):
                    stats['membership'] += 1
                    continue
            # Maybe check label.
            if remove_neg_labels:
                assert lline.endswith('\n')
                label = float(lline[:-1])
                if label < 0:
                    stats['neg_label'] += 1
                    continue
            print >> ofp, line[:-1]
    finally:
        # BUGFIX: previously ifp/lfp/ofp were never closed (leaked file
        # descriptors, and ofp's buffered output relied on interpreter
        # shutdown to flush).
        ifp.close()
        if lfp is not None:
            lfp.close()
        ofp.close()
    logging.info('skip_stats: %s' % stats)
def collectData(gain_dir, date_file, max_neg, min_pos, feature_base_dir, feature_list_file, feature_stats_file, min_date, max_date, window, min_feature_perc, data_file, label_file, rlabel_file, meta_file, weight_power, weight_file): tickers = sorted(os.listdir(gain_dir)) feature_list = readFeatureList(feature_list_file) min_feature_count = int(len(feature_list) * min_feature_perc) feature_ranges = readFeatureRanges(feature_stats_file) for feature in feature_list: if feature not in feature_ranges: assert (feature.find('gain') > 0 or feature.find('price') > 0 or feature.find('volume') > 0 or feature.find('volatility') > 0 or feature.find('_hp') > 0 or feature.startswith('sector') or feature.startswith('industry') or feature.startswith('window')), ( 'no range info for feature %s' % feature) feature_ranges[feature] = [float('-Inf'), float('Inf')] lower, upper = feature_ranges[feature] data_fp = open(data_file, 'w') label_fp = open(label_file, 'w') rlabel_fp = open(rlabel_file, 'w') meta_fp = open(meta_file, 'w') weight_fp = None if weight_file: weight_fp = open(weight_file, 'w') skip_stats = { 'feature_file': 0, 'index': 0, 'min_date': 0, 'max_date': 0, 'filter_date': 0, 'neg_pos': 0, 'window': 0, 'min_perc': 0, '1_perc': 0, '99_perc': 0 } dates = None if date_file: with open(date_file, 'r') as fp: dates = set(fp.read().splitlines()) for ticker in tickers: gain_file = '%s/%s' % (gain_dir, ticker) gains = util.readKeyValueList(gain_file) feature_items = [[] for i in range(len(feature_list))] for i in range(len(feature_list)): feature_file = '%s/%s/%s' % (feature_base_dir, feature_list[i], ticker) if not os.path.isfile(feature_file): skip_stats['feature_file'] += 1 continue keys, values = util.readKeyListValueList(feature_file) for j in range(len(keys)): if keys[j] == '*': continue ymd = keys[j].split('-') if len(ymd) == 3: continue # Change yyyy-mm to yyyy-mm-01 assert len(ymd) == 2 keys[j] += '-01' feature_items[i] = [keys, values] for gain_date, gain in gains: if 
gain_date < min_date: skip_stats['min_date'] += 1 continue if gain_date > max_date: skip_stats['max_date'] += 1 continue if dates is not None and gain_date not in dates: skip_stats['filter_date'] += 1 continue if max_neg < gain and gain < min_pos: skip_stats['neg_pos'] += 1 # Do not skip these they need to be part of testing data. # Instead output negative label and postpone filtering to filter_metadata # (remove negative labels for training and not for testing). #continue if DEBUG: print 'gain: %f (%s)' % (gain, gain_date) if gain <= max_neg: weight = max_neg - gain label = 0.0 elif gain >= min_pos: weight = gain - min_pos label = 1.0 else: weight = 0.0 label = -1.0 weight = weight**weight_power features = [MISSING_VALUE for i in range(len(feature_list))] feature_count = 0 for i in range(len(feature_list)): keys, values = feature_items[i] if len(keys) == 1 and keys[0] == '*': # undated feature, eg sector index = 0 else: # dated feature, eg pgain index = bisect.bisect_right(keys, gain_date) - 1 if index < 0: skip_stats['index'] += 1 continue gain_date_obj = datetime.datetime.strptime( gain_date, '%Y-%m-%d') feature_date_obj = datetime.datetime.strptime( keys[index], '%Y-%m-%d') delta = (gain_date_obj - feature_date_obj).days if delta > window: skip_stats['window'] += 1 continue feature = values[index] lower, upper = feature_ranges[feature_list[i]] if feature < lower: skip_stats['1_perc'] += 1 continue if feature > upper: skip_stats['99_perc'] += 1 continue if DEBUG: print 'feature %s: (%s, %f)' % (feature_list[i], keys[index], feature) features[i] = feature feature_count += 1 if feature_count < min_feature_count: skip_stats['min_perc'] += 1 continue print >> data_fp, ' '.join( ['%f' % feature for feature in features]) print >> label_fp, '%f' % label print >> rlabel_fp, '%f' % gain print >> meta_fp, '%s\t%s\t%d\t%f' % (ticker, gain_date, feature_count, gain) if weight_fp: print >> weight_fp, '%f' % weight if DEBUG: break data_fp.close() label_fp.close() 
rlabel_fp.close() meta_fp.close() if weight_fp: weight_fp.close() logging.info('skip_stats: %s' % skip_stats)
def computePercFeature(input_dir, tickers, rank, output_dir):
    """Converts raw per-ticker values into monthly cross-sectional
    percentiles and writes one '<yyyy-mm>\\t<perc>' file per ticker.

    When `rank` is true the percentile is rank/count within the month;
    otherwise values are min-max scaled into [0, 1] (0.5 when the spread is
    below EPS or the month has a single entry). Each value is carried
    forward month-by-month until the ticker's next value (or through the
    global max month for the last value).
    """
    # ticker => [[date, value] ...]
    # where date is the first yyyy-mm after data is published.
    # Dates are deduped (any yyyy-mm with more than one values available,
    # the latest one wins).
    data = dict()
    for ticker in tickers:
        dvalues = util.readKeyValueList('%s/%s' % (input_dir, ticker))
        udvalues = []
        for i in range(len(dvalues)):
            date = util.getNextYm(util.ymdToYm(dvalues[i][0]))
            if len(udvalues) > 0 and udvalues[-1][0] == date:
                # Same month as the previous entry: the later value wins.
                udvalues[-1][1] = dvalues[i][1]
            else:
                if len(udvalues) > 0:
                    assert udvalues[-1][0] < date
                udvalues.append([date, dvalues[i][1]])
        data[ticker] = udvalues
    # Global month span over all tickers (sentinels compare beyond any
    # real yyyy-mm string).
    min_date = '9999-99'
    max_date = '0000-00'
    for dvalues in data.itervalues():
        if len(dvalues) == 0:
            continue
        min_date = min(min_date, dvalues[0][0])
        max_date = max(max_date, dvalues[-1][0])
    percs = dict()  # date => [[ticker, value] ...]
    date = min_date
    while date <= max_date:
        percs[date] = []
        date = util.getNextYm(date)
    for ticker, dvalues in data.iteritems():
        for i in range(len(dvalues)):
            date, value = dvalues[i]
            if i < len(dvalues) - 1:
                # Populate value up to next date (not inclusive).
                # NOTE: `next` shadows the builtin of the same name.
                next = dvalues[i + 1][0]
            else:
                # Populate value up to max date (inclusive).
                next = util.getNextYm(max_date)
            while date < next:
                percs[date].append([ticker, value])
                date = util.getNextYm(date)
    # Calculate percentiles.
    for date, tvalues in percs.iteritems():
        assert len(tvalues) > 0
        # Use 0.5 if there is a single element.
        if len(tvalues) == 1:
            tvalues[0][1] = 0.5
            continue
        if rank:
            # Rank percentile: position / count after sorting by value.
            tvalues.sort(key=lambda item: item[1])
            for i in range(len(tvalues)):
                tvalues[i][1] = float(i) / len(tvalues)
        else:
            # Min-max scaling; degenerate spread maps everything to 0.5.
            values = [item[1] for item in tvalues]
            maxv = max(values)
            minv = min(values)
            span = maxv - minv
            for i in range(len(tvalues)):
                if span < EPS:
                    tvalues[i][1] = 0.5
                else:
                    tvalues[i][1] = (tvalues[i][1] - minv) / span
    # Write output.
    data = dict()  # ticker => [[date, perc] ...]
    for date, tpercs in percs.iteritems():
        for ticker, perc in tpercs:
            if ticker not in data:
                data[ticker] = [[date, perc]]
            else:
                data[ticker].append([date, perc])
    for ticker, dpercs in data.iteritems():
        dpercs.sort(key=lambda item: item[0])
        with open('%s/%s' % (output_dir, ticker), 'w') as fp:
            for date, perc in dpercs:
                print >> fp, '%s\t%f' % (date, perc)
def computePercFeature(input_dir, tickers, rank, output_dir):
    """Converts raw per-ticker values into monthly cross-sectional
    percentiles (rank-based when `rank` is true, else min-max scaled into
    [0, 1]) and writes one '<yyyy-mm>\\t<perc>' file per ticker.
    """
    # NOTE(review): duplicate of the computePercFeature definition earlier
    # in this file (identical logic); the later definition shadows the
    # earlier one -- confirm both are intended to be kept.
    # ticker => [[date, value] ...]
    # where date is the first yyyy-mm after data is published.
    # Dates are deduped (any yyyy-mm with more than one values available,
    # the latest one wins).
    data = dict()
    for ticker in tickers:
        dvalues = util.readKeyValueList('%s/%s' % (input_dir, ticker))
        udvalues = []
        for i in range(len(dvalues)):
            date = util.getNextYm(util.ymdToYm(dvalues[i][0]))
            if len(udvalues) > 0 and udvalues[-1][0] == date:
                # Same month as the previous entry: the later value wins.
                udvalues[-1][1] = dvalues[i][1]
            else:
                if len(udvalues) > 0:
                    assert udvalues[-1][0] < date
                udvalues.append([date, dvalues[i][1]])
        data[ticker] = udvalues
    # Global month span over all tickers (sentinels compare beyond any
    # real yyyy-mm string).
    min_date = '9999-99'
    max_date = '0000-00'
    for dvalues in data.itervalues():
        if len(dvalues) == 0:
            continue
        min_date = min(min_date, dvalues[0][0])
        max_date = max(max_date, dvalues[-1][0])
    percs = dict()  # date => [[ticker, value] ...]
    date = min_date
    while date <= max_date:
        percs[date] = []
        date = util.getNextYm(date)
    for ticker, dvalues in data.iteritems():
        for i in range(len(dvalues)):
            date, value = dvalues[i]
            if i < len(dvalues) - 1:
                # Populate value up to next date (not inclusive).
                # NOTE: `next` shadows the builtin of the same name.
                next = dvalues[i + 1][0]
            else:
                # Populate value up to max date (inclusive).
                next = util.getNextYm(max_date)
            while date < next:
                percs[date].append([ticker, value])
                date = util.getNextYm(date)
    # Calculate percentiles.
    for date, tvalues in percs.iteritems():
        assert len(tvalues) > 0
        # Use 0.5 if there is a single element.
        if len(tvalues) == 1:
            tvalues[0][1] = 0.5
            continue
        if rank:
            # Rank percentile: position / count after sorting by value.
            tvalues.sort(key=lambda item: item[1])
            for i in range(len(tvalues)):
                tvalues[i][1] = float(i) / len(tvalues)
        else:
            # Min-max scaling; degenerate spread maps everything to 0.5.
            values = [item[1] for item in tvalues]
            maxv = max(values)
            minv = min(values)
            span = maxv - minv
            for i in range(len(tvalues)):
                if span < EPS:
                    tvalues[i][1] = 0.5
                else:
                    tvalues[i][1] = (tvalues[i][1] - minv) / span
    # Write output.
    data = dict()  # ticker => [[date, perc] ...]
    for date, tpercs in percs.iteritems():
        for ticker, perc in tpercs:
            if ticker not in data:
                data[ticker] = [[date, perc]]
            else:
                data[ticker].append([date, perc])
    for ticker, dpercs in data.iteritems():
        dpercs.sort(key=lambda item: item[0])
        with open('%s/%s' % (output_dir, ticker), 'w') as fp:
            for date, perc in dpercs:
                print >> fp, '%s\t%f' % (date, perc)
def collectData(gain_dir, feature_base_dir, feature_list_file,
                feature_stats_file, min_date, max_date, window,
                min_feature_perc, data_file, meta_file):
    """Joins per-ticker gains with features into a data matrix + metadata.

    For every (ticker, gain_date) within [min_date, max_date], looks up the
    latest value of each listed feature at or before gain_date (within
    `window` days for dated features), range-filters it, and writes a
    space-separated feature row to data_file (MISSING_VALUE when absent)
    plus 'ticker\\tdate\\tcount\\tgain' to meta_file. Rows with fewer than
    min_feature_perc of features present are dropped. Drop counts are
    logged per reason.

    NOTE(review): this file contains another, differently-parameterized
    collectData definition; being later in the file, this one shadows it --
    confirm the two variants are meant to coexist in one module.
    """
    tickers = sorted(os.listdir(gain_dir))
    feature_list = readFeatureList(feature_list_file)
    min_feature_count = int(len(feature_list) * min_feature_perc)
    feature_ranges = readFeatureRanges(feature_stats_file)
    for feature in feature_list:
        if feature not in feature_ranges:
            # Only whitelisted feature families may lack range stats; they
            # get an unbounded range.
            assert (feature.find('gain') > 0 or feature.find('price') > 0 or
                    feature.find('volume') > 0 or
                    feature.find('volatility') > 0 or
                    feature.find('_hp') > 0 or
                    feature.startswith('sector') or
                    feature.startswith('industry')), (
                        'no range info for feature %s' % feature)
            feature_ranges[feature] = [float('-Inf'), float('Inf')]
        # Validates that every range unpacks to (lower, upper).
        lower, upper = feature_ranges[feature]
    data_fp = open(data_file, 'w')
    meta_fp = open(meta_file, 'w')
    skip_stats = {'feature_file': 0, 'index': 0, 'min_date': 0,
                  'max_date': 0, 'window': 0, 'min_perc': 0, '1_perc': 0,
                  '99_perc': 0}
    for ticker in tickers:
        gain_file = '%s/%s' % (gain_dir, ticker)
        gains = util.readKeyValueList(gain_file)
        feature_items = [[] for i in range(len(feature_list))]
        for i in range(len(feature_list)):
            feature_file = '%s/%s/%s' % (feature_base_dir, feature_list[i],
                                         ticker)
            if not os.path.isfile(feature_file):
                skip_stats['feature_file'] += 1
                continue
            items = util.readKeyValueList(feature_file)
            for j in range(len(items)):
                if items[j][0] == '*':
                    continue
                ymd = items[j][0].split('-')
                if len(ymd) == 3:
                    continue
                # Change yyyy-mm to yyyy-mm-01
                assert len(ymd) == 2
                items[j][0] += '-01'
            feature_items[i] = items
        for gain_date, gain in gains:
            if gain_date < min_date:
                skip_stats['min_date'] += 1
                continue
            if gain_date > max_date:
                skip_stats['max_date'] += 1
                continue
            if DEBUG:
                print 'gain: %f (%s)' % (gain, gain_date)
            features = [MISSING_VALUE for i in range(len(feature_list))]
            feature_count = 0
            for i in range(len(feature_list)):
                if len(feature_items[i]) == 1 and (
                        feature_items[i][0][0] == '*'):
                    # undated feature, eg sector
                    index = 0
                else:
                    # dated feature, eg pgain: latest key <= gain_date.
                    feature_dates = [item[0] for item in feature_items[i]]
                    index = bisect.bisect_right(feature_dates, gain_date) - 1
                    if index < 0:
                        skip_stats['index'] += 1
                        continue
                    gain_date_obj = datetime.datetime.strptime(gain_date,
                                                               '%Y-%m-%d')
                    feature_date_obj = datetime.datetime.strptime(
                        feature_dates[index], '%Y-%m-%d')
                    delta = (gain_date_obj - feature_date_obj).days
                    if delta > window:
                        skip_stats['window'] += 1
                        continue
                feature = feature_items[i][index][1]
                lower, upper = feature_ranges[feature_list[i]]
                if feature < lower:
                    skip_stats['1_perc'] += 1
                    continue
                if feature > upper:
                    skip_stats['99_perc'] += 1
                    continue
                if DEBUG:
                    print 'feature %s: (%s, %f)' % (
                        feature_list[i], feature_items[i][index][0], feature)
                features[i] = feature
                feature_count += 1
            if feature_count < min_feature_count:
                skip_stats['min_perc'] += 1
                continue
            print >> data_fp, ' '.join(['%f' % feature
                                        for feature in features])
            print >> meta_fp, '%s\t%s\t%d\t%f' % (
                ticker, gain_date, feature_count, gain)
            if DEBUG:
                break
    data_fp.close()
    meta_fp.close()
    logging.info('skip_stats: %s' % skip_stats)