Exemplo n.º 1
0
 def __init__(self, start_date="1995-01-01", stock_list=None, freq="BQ", periods=(2016 - 1995) * 4 + 3):
     """Record the sampling configuration, then trigger the data load.

     Args:
         start_date: Stored as ``self.start``.
         stock_list: Stock names to work with; when ``None`` the names are
             taken from ``LearningData.get_stock_names()``.
         freq: Stored as ``self.freq``.
         periods: Stored as ``self.periods``.
     """
     # Plain configuration values first.
     self.start, self.freq, self.periods = start_date, freq, periods
     self.data, self.data_to_fit = {}, []

     # Collaborators that supply and persist the data.
     self._accessor = DataAccessor(DataAccessor.Names.for_clustering)
     self.ld = LearningData()

     self.stocks = self.ld.get_stock_names() if stock_list is None else stock_list
     self.__init_data()
Exemplo n.º 2
0
def run_on_dates(start_date, model, stocks=('SHW', 'MNK', 'BIO', 'KRO'), is_sc=False, is_tree=False):
    """Fit ``model`` per stock on rows before ``start_date`` and print the outcome.

    For every ticker the training rows are those whose index is earlier than
    ``start_date``; the rows returned by ``slice_by_date`` (minus the last one)
    are used for prediction. Starting from 100, the running value is multiplied
    by ``1 + change`` for every truthy prediction, and a summary line is
    printed per ticker. Nothing is returned.

    Args:
        start_date: Boundary between training rows and prediction rows.
        model: Object exposing ``fit`` and ``predict``.
        stocks: Tickers to process.
        is_sc: When truthy, per-market strengths and matching column groups
            are passed to ``model.fit`` as ``strengths`` and
            ``connection_columns``.
        is_tree: When truthy, every column is rank-binned into 22 quantile
            buckets and one-hot encoded before splitting.
    """
    learning_data = LearningData()
    strength_calc = StrengthCalc() if is_sc else None

    for ticker in stocks:
        training = TrainingData(ticker, ld=learning_data).set_threshold(0.8).add_history(10)
        data, classes = training.get()
        if is_tree:
            bins = 22
            binned = data.apply(lambda column: pandas.qcut(column.rank(method='first'), bins))
            data = pandas.get_dummies(binned)

        train_rows = data.loc[data.index < start_date]
        # Labels are taken from the tail of ``classes``, one per training row.
        train_labels = classes.iloc[-train_rows.shape[0]:]
        test_rows = training.slice_by_date(data, startdate=start_date, enddate=None).iloc[:-1]
        true_change = learning_data.get_future_change_classification(test_rows, ticker, 1)

        if is_sc:
            strengths_by_market = strength_calc.get_strength_stock(ticker, 5, 325, 2, 0.1)
            from Utilities import clean_market_name

            columns = set(train_rows.columns)
            connection_columns = []
            strengths = []
            for market in learning_data.get_market_names():
                # Columns whose name contains the cleaned market name.
                connection_columns.append([col for col in columns if clean_market_name(market) in col])
                strengths.append(strengths_by_market[market])
            model.fit(train_rows, train_labels, strengths=strengths, connection_columns=connection_columns)
        else:
            model.fit(train_rows, train_labels)

        predicted = model.predict(test_rows)
        balance = 100
        for flag, change in zip(predicted, true_change):
            if not flag:
                continue
            print(flag, change)
            balance *= (1 + change)
        print(ticker, balance, test_rows.shape[0], sum(predicted))
Exemplo n.º 3
0
class TestLearningData(TestCase):
    """Checks on the stock data returned by ``LearningData``."""

    def setUp(self):
        """Create a ``LearningData`` and fetch the 'ABC' data used by the tests."""
        self.ld = LearningData()
        self.df = self.ld.get_stock_data('ABC')
        self.start = datetime.datetime(2000, 5, 5)
        self.end = datetime.datetime(2005, 5, 5)

    def test_get_stock_data(self):
        """The fetched data exists and has more than 100 rows."""
        # Assertions are left to propagate so the test runner reports the
        # specific failure message and traceback itself.
        self.assertIsNotNone(self.df)
        self.assertGreater(self.df.shape[0], 100)
Exemplo n.º 4
0
 def setUp(self):
     """Set the root logger to DEBUG and create the ``LearningData`` fixture."""
     root_logger = logging.getLogger()
     root_logger.setLevel(logging.DEBUG)
     self.ld = LearningData()
Exemplo n.º 5
0
 def setUp(self):
     """Create the ``LearningData`` fixture, fetch 'ABC' data and set a date window."""
     learning_data = LearningData()
     self.ld = learning_data
     self.df = learning_data.get_stock_data('ABC')
     self.start = datetime.datetime(year=2000, month=5, day=5)
     self.end = datetime.datetime(year=2005, month=5, day=5)
Exemplo n.º 6
0
class StrengthCalc(object):
    """Score how strongly a stock is tied to a market using KMeans clusterings.

    On construction the 'open' and 'volume' columns of every stock are sampled
    on a business-date range and stored (through a ``DataAccessor``) as one
    feature row per stock. Strength is then derived by fitting KMeans models
    with different cluster counts and checking in how many of them the stock
    lands in the same cluster as the stocks of a market.
    """

    def __init__(self, start_date="1995-01-01", stock_list=None, freq="BQ", periods=(2016 - 1995) * 4 + 3):
        """Store the sampling configuration and load the data for every stock.

        Args:
            start_date: First date of the sampling range.
            stock_list: Stock names to load; when ``None`` the names come from
                ``LearningData.get_stock_names()``.
            freq: Frequency string handed to ``pd.bdate_range``.
            periods: Number of dates handed to ``pd.bdate_range``.
        """
        self._accessor = DataAccessor(DataAccessor.Names.for_clustering)
        self.data = {}
        self.data_to_fit = []
        self.start = start_date
        self.ld = LearningData()
        self.stocks = stock_list
        if self.stocks is None:
            self.stocks = self.ld.get_stock_names()
        self.freq = freq
        self.periods = periods
        self.__init_data()

    def __combined_stock_name(self, st):
        """Return the accessor key for ``st``: name + start + freq + periods."""
        return st + str(self.start) + str(self.freq) + str(self.periods)

    def __init_data(self):
        """Populate ``self.data`` and build the ``self.data_to_fit`` matrix.

        Stocks whose data cannot be indexed by the date range are logged and
        skipped. Values already present in the accessor are reused.
        """
        date_array = pd.bdate_range(start=self.start, periods=self.periods, freq=self.freq)
        for st in self.stocks:
            key = self.__combined_stock_name(st)
            if key not in self._accessor:
                current_stock_data = self.ld.get_stock_data(st)
                try:
                    value = current_stock_data.loc[date_array][['open', 'volume']]
                except Exception as e:
                    logging.warning("{} was skipped because of {}".format(st, str(e)))
                    continue
                value = value.reset_index()
                value.columns = ['time', 'open', 'volume']
                value = value.drop_duplicates('time').set_index('time')
                # Fill NaN with the mean of the column.
                value = value.fillna(value.mean())
                # NOTE(review): this matches the *string* 'inf'; if the intent
                # is to clear float infinities it likely has no effect - confirm.
                value = value.replace('inf', 0.0)
                self._accessor[key] = value
            # Always read back through the accessor so cached and freshly
            # computed stocks follow the same path.
            self.data[st] = self._accessor[key]
            self.data_to_fit.append(self.data[st].values)
        # Flatten each stock's (dates x 2) table into one row of the matrix.
        self.data_to_fit = np.vstack(np.dstack(self.data_to_fit)).T

    def ready_stock_to_predict(self, stocks):
        """Return a matrix with one flattened feature row per name in ``stocks``.

        Raises:
            KeyError: If a stock has no entry in ``self.data``.
        """
        data_arr = [self.data[st].values for st in stocks]
        return np.vstack(np.dstack(data_arr)).T

    # NOTE(review): lru_cache on a method keys on ``self`` and keeps instances
    # alive for the lifetime of the cache; kept as-is to preserve behaviour.
    @functools.lru_cache(maxsize=325)
    def create_clustering_obj(self, n_clusters):
        """Return a KMeans model with ``n_clusters`` fitted on ``self.data_to_fit``."""
        clr = sklearn.cluster.KMeans(n_clusters=n_clusters)
        clr.fit(self.data_to_fit, self.stocks)
        return clr

    def create_array_of_clusters(self, min_number=3, max_number=120, step=None):
        """Return fitted models for cluster counts ``min_number..max_number``.

        Args:
            min_number: Smallest cluster count (inclusive).
            max_number: Largest cluster count (inclusive).
            step: Increment between cluster counts; ``None`` means 1.
        """
        # range() rejects None as a step, so map the default to 1.
        stride = 1 if step is None else step
        return [self.create_clustering_obj(n_clusters=x)
                for x in range(min_number, max_number + 1, stride)]

    def get_strength(self, stock, market, min_number, max_number, step, threshold=0.75):
        """Return the strength between ``stock`` and the stocks of ``market``.

        ``market`` is looked up in ``market_stock_dic`` to obtain its stock
        names. Models are evaluated from the largest cluster count down.
        """
        market = market_stock_dic[market]
        arr_clr = self.create_array_of_clusters(min_number=min_number, max_number=max_number, step=step)
        arr_clr.reverse()
        stock_data = self.ready_stock_to_predict([stock])
        market_data = self.ready_stock_to_predict(market)
        return self._calc_strength(stock_data, market_data, arr_clr, threshold)

    @staticmethod
    def float_round(num, places=0, direction=ceil):
        """Round ``num`` to ``places`` decimals using ``direction`` (ceil/floor)."""
        return direction(num * (10 ** places)) / float(10 ** places)

    @staticmethod
    def _calc_strength(stock_data, market_data, arr_clr, threshold):
        """Walk ``arr_clr`` and return the strength at the first match.

        Strength starts at 1.0 and drops by ``floor(1/len(arr_clr), 3 places)``
        for every model in which the share of market stocks sharing the
        stock's cluster does not exceed ``threshold``.
        """
        strength = 1.0
        dec = StrengthCalc.float_round(num=1 / len(arr_clr), places=3, direction=floor)
        for clusterer in arr_clr:
            stock_label = clusterer.predict(stock_data)
            market_labels = clusterer.predict(market_data)
            count = sum(1 for m in market_labels if stock_label == m)
            if count / len(market_labels) > threshold:
                return strength
            strength -= dec
        return strength

    def get_strength_stock(self, stock, min_number=5, max_number=325, step=2, threshold=0.1):
        """Return a dict mapping every market name to its strength with ``stock``."""
        markets = self.ld.get_market_names()
        return {
            mk: self.get_strength(stock=stock, market=mk, min_number=min_number,
                                  max_number=max_number, step=step, threshold=threshold)
            for mk in markets
        }






# print(get_strength(stock='NHC',market='Medical Laboratories & Research (Healthcare)',min_number=5,max_number=20,step=5))
Exemplo n.º 7
0
def main():
    """Aggregate per-stock data into per-market, per-date documents and store them.

    Stocks are grouped by their ``market_name``. For each market the stock
    tables are outer-merged on their index, and for every resulting row a
    document with ``size`` (number of stocks with a volume), ``volume`` (sum
    of volumes) and ``change`` (sum of ``(close - open) * volume``) is written
    to ``ld.markets``.

    NOTE(review): ``ld.stocks`` / ``ld.markets`` are used like MongoDB
    collections (``distinct``, ``find_one``, ``insert_one``, ``update_one``);
    confirm against ``LearningData``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-d',
                        '--database',
                        default='exchange',
                        help='name of database to connect to')
    args = parser.parse_args()

    # Need to be careful, as not all data is initialised.
    ld = LearningData(database=args.database)
    market_names = ld.stocks.distinct('market_name')
    market_to_stocks = {m: [] for m in market_names}
    for stock in ld.get_stock_names():
        m = ld.stocks.find_one({'ticker': stock})['market_name']
        market_to_stocks[m].append(stock)

    for market in market_names:
        stock_names = market_to_stocks[market]
        if not stock_names:
            # Nothing to aggregate; indexing the first stock below would fail.
            continue
        stocks_data = [(s_name, ld.get_stock_data(s_name, force=False))
                       for s_name in stock_names]
        # Start from an empty frame with the plain column names so every
        # stock's columns collide with it and receive the '_<ticker>' suffix.
        market_data = pd.DataFrame(columns=stocks_data[0][1].columns)
        for s_name, s in stocks_data:
            market_data = pd.merge(market_data,
                                   s,
                                   how='outer',
                                   suffixes=('', '_' + s_name),
                                   left_index=True,
                                   right_index=True)

        docs = {}
        for date, row in market_data.iterrows():
            is_new = date not in docs
            if is_new:
                doc = {'market_name': market, 'date': date,
                       'size': 0, 'volume': 0, 'change': 0}
            else:
                doc = docs[date]
            for s_name in stock_names:
                volume = row['volume_' + s_name]
                if np.isnan(volume):
                    continue
                try:
                    weighted_change = (row['close_' + s_name] -
                                       row['open_' + s_name]) * volume
                except Exception:
                    # Best effort: skip stocks whose prices cannot be used.
                    continue
                doc['change'] += weighted_change
                doc['volume'] += volume
                doc['size'] += 1
            if is_new:
                ld.markets.insert_one(doc)
            else:
                # Filter on the market as well; a date alone can match a
                # document that belongs to a different market.
                ld.markets.update_one(
                    {'market_name': market, 'date': doc['date']},
                    {
                        '$set': {
                            'change': doc['change'],
                            'volume': doc['volume'],
                            'size': doc['size']
                        }
                    })
            docs[date] = doc