def alpha34(df):
    """Alpha#34
    rank(((1 - rank((stddev(returns, 2) / stddev(returns, 5)))) + (1 - rank(delta(close, 1)))))

    Combines an inverted rank of short/long return-volatility ratio with an
    inverted rank of one-day close change, then ranks the sum.
    """
    vol_ratio_term = 1 - u.rank(u.stddev(df.returns, 2) / u.stddev(df.returns, 5))
    momentum_term = 1 - u.rank(u.delta(df.close, 1))
    return u.rank(vol_ratio_term + momentum_term)
def alpha21(df):
    """Alpha#21
    ((((sum(close, 8) / 8) + stddev(close, 8)) < (sum(close, 2) / 2)) ? (-1 * 1) :
     (((sum(close,2) / 2) < ((sum(close, 8) / 8) - stddev(close, 8))) ? 1 :
      (((1 < (volume / adv20)) || ((volume /adv20) == 1)) ? 1 : (-1 * 1))))

    Nested ternary translated into chained np.where calls; conditions are
    evaluated in the same order as the original expression.
    """
    mean8 = u.ts_sum(df.close, 8) / 8
    mean2 = u.ts_sum(df.close, 2) / 2
    sd8 = u.stddev(df.close, 8)
    vol_ratio = df.volume / u.adv(df, 20)
    # Short signal: 8-day mean plus one std-dev still below the 2-day mean.
    cond_short = (mean8 + sd8) < mean2
    # Long signal: 2-day mean below 8-day mean minus one std-dev.
    cond_long = mean2 < (mean8 - sd8)
    # Fallback long signal: volume at or above its 20-day average.
    cond_volume = (1 < vol_ratio) | (vol_ratio == 1)
    return np.where(cond_short, -1,
                    np.where(cond_long, 1,
                             np.where(cond_volume, 1, -1)))
def buildk(vals, ma, s=1.5):
    """Compute a CUSUM target value k.

    According to Kemp (1962), the expression for determining a target value k
    for CUSUM should be done via:

        k = mean_a + .5 delta

    (Where: delta is the mean shift we want to detect. mean_a is an
    "acceptable process mean value." mean_a is the mean of the original
    dataset.)

    Lucas et al. (1982) suggested it be close to .5 delta as well, and it
    should be chosen close to:

                  mean_d - mean_a
        k = ---------------------------
             ln (mean_d) - ln (mean_a)

    Mean_d is the "barely tolerable mean value". This is the mean that CUSUM
    should quickly detect. Mean_d is based on the declared needs of an
    experimental designer, the mean, and the std dev:

        mean_d = s * p + mean_a

    (Where: s is a value chosen by the experimental designers, p is the
    standard deviation, and mean_a is the mean of the dataset.)

    :param vals: sample values whose standard deviation defines p.
    :param ma: mean_a, the acceptable process mean.
    :param s: designer-chosen multiplier for the std-dev shift (default 1.5).
    :return: the reference value k.
    """
    md = s * utils.stddev(vals) + ma
    # math.log(x) is the natural log directly; the original math.log(x, math.e)
    # computes log(x)/log(e), an extra division that can introduce rounding error.
    return (md - ma) / (math.log(md) - math.log(ma))
def alpha40(df):
    """Alpha#40
    ((-1 * rank(stddev(high, 10))) * correlation(high, volume, 10))
    """
    ranked_vol = u.rank(u.stddev(df.high, 10))
    hv_corr = u.corr(df.high, df.volume, 10)
    return -1 * ranked_vol * hv_corr
def alpha22(df):
    """Alpha#22
    (-1 * (delta(correlation(high, volume, 5), 5) * rank(stddev(close, 20))))
    """
    corr_change = u.delta(u.corr(df.high, df.volume, 5), 5)
    ranked_sd = u.rank(u.stddev(df.close, 20))
    return -1 * (corr_change * ranked_sd)
def print_cluster_separation(self):
    """Print mean/std-dev of each cluster's similarity to its top-1/3/5
    most-similar other clusters, plus the 10 highest top-1 scores.

    Requires at least two clusters; prints a notice and returns otherwise.
    """
    print "CLUSTER SEPERATION"
    print
    print "Comparing each Cluster to it's most similar other clusters"
    if len(self.clusters) < 2:
        print "There are less than two clusters"
        return
    # Square matrix of pairwise cluster similarities.
    # NOTE(review): sorting assumes index 0 after the sort is the cluster's
    # self-similarity (the maximum) — confirm against get_cluster_sim_mat().
    cluster_sim_mat = self.confirm.get_cluster_sim_mat()
    for row in cluster_sim_mat:
        row.sort(reverse=True)  # in-place, descending
    top_1 = list()
    top_3 = list()
    top_5 = list()
    for row in cluster_sim_mat:
        for x, val in enumerate(row):
            if x == 0:
                continue  # skip self-similarity entry
            # The top-k buckets overlap: rank 1 lands in all three lists,
            # ranks 2-3 in top_3 and top_5, ranks 4-5 only in top_5.
            if x <= 1:
                top_1.append(val)
            if x <= 3:
                top_3.append(val)
            if x <= 5:
                top_5.append(val)
            else:
                # This else pairs with `if x <= 5`: once past rank 5,
                # nothing further in this row matters.
                break
    top_1.sort(reverse=True)  # needed for the "10 most similar" listing below
    top_1_mean = utils.avg(top_1)
    top_1_stddev = utils.stddev(top_1)
    top_3_mean = utils.avg(top_3)
    top_3_stddev = utils.stddev(top_3)
    top_5_mean = utils.avg(top_5)
    top_5_stddev = utils.stddev(top_5)
    print "\n Mean\t Std Dev"
    print "Top 1: %3.3f\t %3.3f" % (top_1_mean, top_1_stddev)
    print "Top 3: %3.3f\t %3.3f" % (top_3_mean, top_3_stddev)
    print "Top 5: %3.3f\t %3.3f" % (top_5_mean, top_5_stddev)
    print
    print "List of 10 most similar scores"
    print ", ".join(map(lambda x: "%4.3f" % x, top_1[:10]))
    print
    print
def alpha18(df):
    """Alpha#18
    (-1 * rank(((stddev(abs((close - open)), 5) + (close - open)) +
    correlation(close, open, 10))))
    """
    spread = df.close - df.open
    noise = u.stddev(abs(spread), 5)
    co_corr = u.corr(df.close, df.open, 10)
    return -1 * u.rank(noise + spread + co_corr)
def feature_eval_metrics(self, sim_fun):
    """Evaluate a similarity function over every (cluster, member) pair.

    :param sim_fun: callable(cluster, doc) -> numeric similarity.
    :return: 6-tuple: global mean and std-dev over all pairs, then the mean
        and std-dev of the per-cluster means, then the mean and std-dev of
        the per-cluster std-devs.
    """
    all_sims = list()
    per_cluster_means = list()
    per_cluster_devs = list()
    for cluster in self.clusters:
        sims = [sim_fun(cluster, member) for member in cluster.members]
        all_sims.extend(sims)
        per_cluster_means.append(utils.avg(sims))
        per_cluster_devs.append(utils.stddev(sims))
    return (utils.avg(all_sims), utils.stddev(all_sims),
            utils.avg(per_cluster_means), utils.stddev(per_cluster_means),
            utils.avg(per_cluster_devs), utils.stddev(per_cluster_devs))
def clock_gets(number=100):
    """Benchmark get_message() latency over repeated calls.

    :param number: how many timed calls to make (default 100).
    :return: (median, mean, stddev) of the per-call elapsed times.
    """
    times = []
    for _ in xrange(number):
        start = clock()
        # Result intentionally discarded (the original bound it to an unused
        # local); only the call latency is of interest here.
        get_message()
        times.append(clock() - start)
    return (median(times), mean(times), stddev(times))
def print_cluster_cohesion(self):
    """Print, per cluster, the mean and std-dev of each feature-set
    similarity between members and the cluster center, plus the confirm
    similarity and the cluster size.
    """
    print "CLUSTER COHESION:"
    # Column headers: one per feature set, plus the confirm score.
    sim_names = self.clusters[0].members[0].get_feature_set_names()[:]
    sim_names.append("confirm")
    print "\t\t%s SIZE" % (" ".join(sim_names))
    for x, cluster in enumerate(self.clusters):
        # list of lists
        # NOTE: Python 2 map returns a list, which the similarities[0]
        # indexing below relies on.
        similarities = map(lambda _doc: _doc.global_sim(cluster.center), cluster.members)
        to_print = list()
        # One (avg, stddev) pair per similarity component (column-wise).
        for y in xrange(len(similarities[0])):
            values = map(lambda row: row[y], similarities)
            to_print.append(utils.avg(values))
            to_print.append(utils.stddev(values))
        # Final pair: confirm's own cluster-to-doc similarity.
        values = map(lambda _doc: self.confirm.cluster_doc_similarity(cluster, _doc), cluster.members)
        to_print.append(utils.avg(values))
        to_print.append(utils.stddev(values))
        l = len(cluster.members)
        print "\t%s: %s %d" % (x, " ".join(map(lambda s: "%3.2f" % s, to_print)), l)
    print
    print
def alpha1(df):
    """Alpha#1
    (rank(Ts_ArgMax(SignedPower(((returns < 0) ? stddev(returns, 20) : close), 2.), 5)) - 0.5)

    :param df: dataframe
    :return: ranked 5-day argmax of the squared signal, centered at zero
    """
    # On negative-return days use the 20-day return volatility, otherwise close.
    signal = np.where(df.returns < 0, u.stddev(df.returns, 20), df.close)
    signal = pd.Series(signal, index=df.index)
    return u.rank(u.ts_argmax(signal ** 2, 5)) - 0.5
def print_cluster_cohesion(self):
    """Print, per cluster, the mean and std-dev of each feature-set
    similarity between members and the cluster center, plus the confirm
    similarity and the cluster size.
    """
    print "CLUSTER COHESION:"
    # Column headers: one per feature set, plus the confirm score.
    sim_names = self.clusters[0].members[0].get_feature_set_names()[:]
    sim_names.append("confirm")
    print "\t\t%s SIZE" % (" ".join(sim_names))
    for x, cluster in enumerate(self.clusters):
        # list of lists
        # NOTE: Python 2 map returns a list, which the similarities[0]
        # indexing below relies on.
        similarities = map(lambda _doc: _doc.global_sim(cluster.center), cluster.members)
        to_print = list()
        # One (avg, stddev) pair per similarity component (column-wise).
        for y in xrange(len(similarities[0])):
            values = map(lambda row: row[y], similarities)
            to_print.append(utils.avg(values))
            to_print.append(utils.stddev(values))
        # Final pair: confirm's own cluster-to-doc similarity.
        values = map(
            lambda _doc: self.confirm.cluster_doc_similarity(
                cluster, _doc), cluster.members)
        to_print.append(utils.avg(values))
        to_print.append(utils.stddev(values))
        l = len(cluster.members)
        print "\t%s: %s %d" % (x, " ".join(
            map(lambda s: "%3.2f" % s, to_print)), l)
    print
    print
def get_stat(self, max_cnt_label=None):
    """Aggregate the error log into sorted, optionally truncated statistics.

    Populates self.err_stat and self.label_stat as (means, stddevs) pairs
    ordered by ascending mean label count.

    :param max_cnt_label: if given, keep only entries whose mean label
        count is at most max_cnt_label * 2; None keeps everything.
    """
    szs = sorted(self.err_log.keys())
    # NOTE(review): each err_log entry appears to be a pair where index 0 is
    # a label count and index 1 is an error value — confirm with the logger.
    ave = [utils.mean(self.err_log[sz]) for sz in szs]
    dev = [utils.stddev(self.err_log[sz]) for sz in szs]
    # Sort ave and dev together by ascending mean label count (ave[i][0]).
    idx = [i for i in range(0, len(ave))]
    idx = sorted(idx, key=lambda i: ave[i][0])
    ave = [ave[i] for i in idx]
    dev = [dev[i] for i in idx]
    # Truncate to entries whose label count stays within the requested bound.
    # NOTE(review): the *2 doubling of max_cnt_label is unexplained here —
    # presumably labels are counted twice per item; verify upstream.
    length = len(ave) if max_cnt_label is None else len(
        [c for c in ave if c[0] <= max_cnt_label * 2])
    self.err_stat = ([c[1] for c in ave[:length]], [c[1] for c in dev[:length]])
    self.label_stat = ([c[0] for c in ave[:length]], [c[0] for c in dev[:length]])
    self.last_max_cnt_label = max_cnt_label
def get_stat(self, max_cnt_label=-1, robust=False):
    """Aggregate the error log into sorted, optionally truncated statistics.

    Populates self.err_stat and self.label_stat as (means, stddevs) pairs
    ordered by ascending mean label count.

    :param max_cnt_label: if non-negative, keep only entries whose mean
        label count is at most max_cnt_label * 2; negative keeps everything.
    :param robust: if True, use outlier-resistant mean/stddev estimators
        computed component-wise over the log entries.
    """
    szs = sorted(self.err_log.keys())
    if robust:
        # Robust path: aggregate each component (tp = 0 label count,
        # tp = 1 error value — TODO confirm) separately across the num
        # samples recorded for each size.
        num = len(self.err_log[szs[0]])
        ave = [np.array([utils.robust_mean([self.err_log[sz][i][tp] for i in range(0, num)])\
            for tp in (0,1)]) for sz in szs]
        dev = [np.array([utils.robust_stddev([self.err_log[sz][i][tp] for i in range(0, num)])\
            for tp in (0,1)]) for sz in szs]
    else:
        ave = [utils.mean(self.err_log[sz]) for sz in szs]
        dev = [utils.stddev(self.err_log[sz]) for sz in szs]
    # Sort ave and dev together by ascending mean label count (ave[i][0]).
    idx = [i for i in range(0, len(ave))]
    idx = sorted(idx, key=lambda i: ave[i][0])
    ave = [ave[i] for i in idx]
    dev = [dev[i] for i in idx]
    # Truncate to entries whose label count stays within the requested bound.
    # NOTE(review): the *2 doubling of max_cnt_label is unexplained here —
    # presumably labels are counted twice per item; verify upstream.
    length = len(ave) if max_cnt_label < 0 else len(
        [c for c in ave if c[0] <= max_cnt_label * 2])
    self.err_stat = ([c[1] for c in ave[:length]], [c[1] for c in dev[:length]])
    self.label_stat = ([c[0] for c in ave[:length]], [c[0] for c in dev[:length]])
    self.last_max_cnt_label = max_cnt_label
def esth(vals):
    """Estimate the CUSUM decision interval h.

    A reasonable estimate for h is approximately 5 * sigma, i.e. five times
    the sample standard deviation.
    """
    sigma = utils.stddev(vals)
    return sigma * 5.0