Exemplo n.º 1
0
    def __findBestModel(self, model: str) -> str:
        """Return the name whose node-averaged score is the smallest.

        Data for ``model`` is fetched through ``PrometheusDataHandler``. Each
        node's values are passed to ``Predictor.accuracy``; the scores it
        returns are summed per name, divided by the number of nodes, and the
        name with the lowest average is returned.

        Raises:
            IndexError: if no scores were collected (for example, no data).
        """
        predictor = Predictor()
        handler = PrometheusDataHandler(model)
        per_node = handler.get_data()

        totals = {}
        node_count = 0.0
        logging.debug("Running findBestModel in SimpleLoadBalance")

        # per_node is indexed by ip. Each entry produced by accuracy() carries
        # the score at position 0 and the name at position 1.
        for ip in per_node:
            for entry in predictor.accuracy(per_node[ip]):
                score, name = entry[0], entry[1]
                if name in totals:
                    totals[name] += score
                else:
                    totals[name] = score
            node_count += 1.0

        # Convert the per-name sums into averages over all nodes.
        averaged = {name: total / node_count for name, total in totals.items()}

        ranking = sorted(averaged.items(), key=operator.itemgetter(1))
        return ranking[0][0]
Exemplo n.º 2
0
 def on_batch_end(self, batch, logs=None):
     """Track the batch loss and, every ``eval_freq`` batches, log a preview.

     On an evaluation batch one sample is taken from ``self.iter``, run
     through a ``Predictor``, annotated with the returned boxes and class
     names, and written together with the running loss to ``self.log``.
     The loss metric is reset after logging.
     """
     self.train_loss.update_state(logs['loss'])
     if batch % self.eval_freq != 0:
         return

     image, bbox, labels = next(self.iter)
     image = image.numpy().astype('uint8')
     predictor = Predictor(yolov5l=yolov5l)
     detections = predictor.predict(image)

     # One random colour per class id, drawn the first time the id appears.
     palette = dict()
     for det in detections:
         class_id = det[5].numpy().astype('int32')
         if class_id not in palette:
             palette[class_id] = tuple(
                 np.random.randint(low=0, high=256, size=(3, )).tolist())
         colour = palette[class_id]
         top_left = tuple(det[0:2].numpy().astype('int32'))
         bottom_right = tuple(det[2:4].numpy().astype('int32'))
         cv2.rectangle(image, top_left, bottom_right, colour, 1)
         cv2.putText(image, predictor.getClsName(class_id), top_left,
                     cv2.FONT_HERSHEY_PLAIN, 1, colour, 2)

     batched = tf.expand_dims(image, axis=0)
     with self.log.as_default():
         tf.summary.scalar('train loss',
                           self.train_loss.result(),
                           step=optimizer.iterations)
         # The last axis is reversed before the image is written.
         tf.summary.image('detect',
                          batched[..., ::-1],
                          step=optimizer.iterations)
     self.train_loss.reset_states()
Exemplo n.º 3
0
def predict(modelPath, labelPath, wavPath):
    """Print one chord prediction per second of a 44.1 kHz WAV file.

    Args:
        modelPath: path handed to ``Predictor.load``.
        labelPath: text file with one label per line; the predicted class
            index selects the line that is printed.
        wavPath: WAV file to analyse. For multi-channel audio only channel 0
            is used.

    Returns:
        None. Results are printed. If the sample rate is not 44.1 kHz an
        error message is printed and the function returns early.
    """
    # Read the labels (newline stripped) and close the file deterministically.
    with open(labelPath, 'r') as lf:
        label = [line.strip() for line in lf]

    pred = Predictor()
    pred.load(modelPath)

    # The FFT needs enough resolution to tell 70 Hz from 74 Hz.
    FR = 44100
    N = 16384

    rate, dataAll = wavfile.read(wavPath, True)

    if rate != FR:
        print('Error: sample rate is not 44.1kHz!')
        return

    # get channel 0 (left channel in stereo)
    if dataAll.ndim != 1:
        dataAll = dataAll[:, 0]

    print('Time        Chord   Confidence')

    # Loop invariants: the window and the FFT bins covering 70 Hz .. 4 kHz.
    # The window comes from scipy.signal.windows because the top-level
    # scipy.signal.blackman alias no longer exists in recent SciPy releases.
    w = scipy.signal.windows.blackman(N)
    startIdx = int(70 * N / FR)
    endIdx = int(4000 * N / FR)

    nPred = int(len(dataAll) / FR)
    for t in range(0, nPred):
        # One N-sample frame starting at each whole second.
        data = dataAll[t * FR:t * FR + N]

        # Integer samples -> float64 (np.float was only an alias of float).
        x = data.astype(float)

        # normalize input time domain data to [-1 +1]
        xs = normalizeArray(x)

        xf = scipy.fftpack.fft(xs * w)
        xfa = np.abs(xf)
        xfs = xfa[startIdx:endIdx]

        # normalize frequency domain data to [0, +1]
        xIn = normalizePositiveArray(xfs)

        xInR = np.reshape(xIn, (1, -1))
        (ypred, conf) = pred.predict(xInR)

        ypredI = int(ypred)

        if conf.max() >= 0:
            print('%3d:%02d %10s %6.2f' %
                  (int(t / 60), t % 60, label[ypredI], conf.max()))
def main():
    """Run ``parse_args()``, predict on the input string and print the result.

    A ``Predictor`` is built from ``get_language_files()`` and asked to
    predict on ``get_input_string()``; the ``language_name`` of the returned
    guess is printed.
    """
    parse_args()

    language_files = get_language_files()
    detector = Predictor(language_files)
    text = get_input_string()
    guess = detector.predict(text)

    print("Language is: {}".format(guess.language_name))
Exemplo n.º 5
0
   def GetExpectedReturns(self,Xtot,Ytot,k,stock):
       """Return ``[stock, KNN prediction / self.horizon]`` for one rebalance step.

       The price dates of ``stock`` are clamped to the (corrected) start/end
       dates. The prediction date is the entry at
       ``k*self.rebalanceFreq + self.window`` of that range. All feature rows
       before that date are used for training and the row at that date is
       the query point.
       """
       # Prefer adjusted closes; fall back to plain closes when absent.
       record = self.db.Prices.find_one({'BBGTicker' : stock},{'Adj Close' : 1})
       if 'Adj Close' in record.keys():
           prices = record['Adj Close']
       else:
           record = self.db.Prices.find_one({'BBGTicker' : stock},{'Close' : 1})
           prices = record['Close']
       all_dates = sorted(prices.keys())

       # Replace start/end dates entered by the user that are not usable.
       handler = DataHandler
       self.startDate = handler.HandleIncorrectDate(self.startDate,'',all_dates)
       self.endDate = handler.HandleIncorrectDate('',self.endDate,all_dates)
       first = all_dates.index(self.startDate)
       last = all_dates.index(self.endDate)
       # The end date itself is excluded by the slice.
       window_dates = all_dates[first:last]

       # The prediction date is offset by the covariance-matrix window.
       prediction_date = window_dates[k*self.rebalanceFreq + self.window]
       knn = Predictor(stock,prediction_date,self.horizon,self.db)

       # Split features/returns at the prediction date.
       feature_dates = Xtot[stock]["DATES"]
       analytics = Xtot[stock]["ANALYTICS"]
       cut = feature_dates.index(prediction_date)
       returns = Ytot[stock]["RETURNS"]
       Xtrain = analytics[:cut]
       Ytrain = returns[:cut]
       Xpred = analytics[cut]

       forecast = knn.PredictKNN(Xtrain,Ytrain,Xpred)
       return [stock,forecast/self.horizon]
Exemplo n.º 6
0
def generate_model(author, steps):
    """Train, save and return a ``Predictor`` for one author.

    Reads ``../data/<author>.txt``, cleans its space-separated words against
    the vocabulary in ``../data/20k_most_common.txt``, records the cleaned
    data in the author's ``.commons`` file, trains for ``steps`` iterations
    and saves the result to the author's ``.model`` and ``.vocab`` files.
    """
    predictor = Predictor(128)

    # All outputs share the prefix "<author dir><author>".
    target_dir = get_dir_for_author(author)
    if not os.path.exists(target_dir):
        os.mkdir(target_dir)
    prefix = target_dir + author
    model_file = prefix + ".model"
    vocab_file = prefix + ".vocab"
    commons_file = prefix + ".commons"
    raw_text_file = "../data/" + author + ".txt"

    # The 'frequently used words' act as the common vocabulary.
    frequent = read_common_vocab("../data/20k_most_common.txt")

    # Read the raw text and clean it.
    with open(raw_text_file, 'r') as raw:
        raw_words = raw.read().split(' ')
        cleaned, _ = clean_input_data(raw_words, frequent)

    # Persist the words that survived cleaning.
    record_common_vocab(cleaned, commons_file)

    # Training is the slow step; the trained model is saved afterwards.
    predictor.train(cleaned, steps)
    predictor.save(model_file, vocab_file)

    return predictor
Exemplo n.º 7
0
 def derivative(self, x: np.ndarray, y: np.array, est: Predictor):
     '''Return the update vector: ``est.lr()`` times the mean gradient.

     For each coordinate ``j`` the entry is
     ``est.lr() * sum((est.predict(x[i]) - y[i]) * x[i][j]) / m`` with
     ``m = len(y)``. The result has ``self.dim`` entries.

     NOTE(review): this assumes ``est.predict`` returns the same value when
     called repeatedly on the same row; confirm against Predictor.
     '''
     m = len(y)
     # The residuals do not depend on j, so evaluate the predictor once per
     # sample instead of once per (sample, coordinate) pair.
     residuals = [est.predict(x[i]) - y[i] for i in range(m)]
     updates = np.zeros(self.dim, dtype=float)
     for j in range(len(updates)):
         updates[j] = est.lr() * sum([residuals[i] * x[i][j]
                                      for i in range(m)]) / m
     return updates
Exemplo n.º 8
0
 def __init__(self):
     """Store the audio settings and create the predictor.

     The ``p``, ``stream`` and ``chunksRead`` attributes start as ``None``.
     """
     # Audio settings: paFloat32 format, 1 channel, rate 44100, chunk 2048.
     self.FORMAT, self.CHANNELS = pyaudio.paFloat32, 1
     self.RATE, self.CHUNK = 44100, 1024 * 2

     # Not initialised here.
     self.p = self.stream = self.chunksRead = None

     self.predictor = Predictor()
Exemplo n.º 9
0
	def process(self, request):
		"""Store ``Predictor.accuracy`` of every node's values in ``self.data_``.

		The data comes from a ``PrometheusDataHandler`` built for
		``request['model']`` and is indexed by ip.
		"""
		handler = PrometheusDataHandler(request['model'])
		node_data = handler.get_data()

		scorer = Predictor()
		for ip in node_data:
			self.data_[ip] = scorer.accuracy(node_data[ip])
Exemplo n.º 10
0
 def testAll(self):
     """``getPredictions`` must return ``[1, 0, 2]`` for the matrix below.

     In this fixture the expected values are the index of the largest entry
     in each row.
     """
     logProbabilities = numpy.asarray([[0.4, 0.80, 0.50],
                                       [0.45, 0.4, 0.41],
                                       [0.4, 0.41, 0.45]])
     expected = [1,0,2]

     target = Predictor()

     # assertEqual is the supported spelling; the assertEquals alias was
     # deprecated and is gone in current Python versions.
     self.assertEqual(expected, target.getPredictions(logProbabilities))
Exemplo n.º 11
0
 def __init__(self) -> None:
     """Create four randomly initialised estimators and the house-name table.

     Each estimator receives a fresh ``np.random.normal(0, 0.5, self.dim)``
     vector plus two further positional settings.
     """
     self.dim = 3

     # Second and third positional Predictor arguments, one pair per
     # estimator. NOTE(review): their meaning is defined by Predictor.
     settings = [(100, 0.001), (100, 0.001), (80, 0.01), (50, 0.01)]
     self.est = [
         Predictor(np.random.normal(0, 0.5, self.dim), second, third)
         for second, third in settings
     ]

     # Keys 1..4 map to the house names.
     names = ('Gryffindor', 'Ravenclaw', 'Slytherin', 'Hufflepuff')
     self.houses_dict = dict(enumerate(names, start=1))
Exemplo n.º 12
0
    def __init__(self):
        """Load the QA search engine and create the predictor.

        If ``database.dat`` exists, the engine is built from it. Otherwise
        the engine loads ``question_answer.txt`` and saves ``database.dat``
        for the next start.
        """
        if not exists('database.dat'):
            # Loading from the text file is the slower path, so the result
            # is saved for later runs.
            print('load from QA database from txt format...')
            self.search_engine = SearchEngine('cppjieba/dict')
            self.search_engine.loadFromTxt('question_answer.txt')
            self.search_engine.save('database.dat')
        else:
            # Deserializing the saved database is much faster.
            print('deserialize the QA database...')
            self.search_engine = SearchEngine('cppjieba/dict', 'database.dat')
        self.predictor = Predictor()
Exemplo n.º 13
0
 def get_data(self):
     """Run a ``Predictor`` on ``self.filePath`` and copy its outputs here.

     Sets ``result``, ``data``, ``channelName``, ``eegLength``, ``bad`` and
     ``powers`` on this object.
     """
     predictor = Predictor(self.filePath)
     predictor.assess_func()

     self.result = predictor.result_data  # model output
     self.data = predictor.raw_data  # raw signal
     self.channelName = predictor.header  # electrode information for the GUI
     # Number of raw samples divided by 256 gives the length in seconds.
     # NOTE(review): 256 is presumably the sampling rate in Hz; confirm.
     self.eegLength = len(predictor.raw_data) / 256
     self.bad = predictor.bad  # bad channels in the EEG
     self.powers = predictor.powers  # power of the different waves
Exemplo n.º 14
0
	def process(self, request):
		"""Store the last predicted value for every node in ``self.data_``.

		The data comes from a ``PrometheusDataHandler`` built for
		``request['model']``. A new ``Predictor`` is created per node, and
		the final element of its prediction is kept.
		"""
		model = request['model']
		node_data = PrometheusDataHandler(model).get_data()

		# A FileHandler can be used as the data source instead, e.g.
		# FileHandler('../data/exchange.csv').get_data().

		for ip in node_data:
			# Each entry is a (time, workload) tuple.
			series = node_data[ip]
			forecaster = Predictor()
			forecast = forecaster.get_prediction(series, model)
			self.data_[ip] = forecast[-1]
Exemplo n.º 15
0
def DoWork(source_file, m, k, fltr, trade_cost, testing_start_date,
           testing_end_date):
    """Walk the test weeks, follow the predictor's signal and print capital.

    For every index in ``[start_split, end_split)`` the predictor sees the
    historical weekly returns sliced to ``[:idx - 1]`` and produces a
    signal. A change of signal costs ``trade_cost``. One line is printed per
    week: the first field of the previous week's last record, the strategy
    capital and the second capital series. Returns None.
    """
    predictor = Predictor(source_file, m, k, fltr, testing_start_date,
                          testing_end_date)
    predictor.read_file_to_daily_data_by_weeks(0)
    predictor.calc_historical_weekly_return(predictor.daily_data_by_weeks)

    # Running state; both capital series start at 100.
    strategy_trade_count = 0
    in_market_count = 0
    position = 0
    strategy_capital = 100
    bh_capital = 100
    strategy_weekly_return = []
    bh_weekly_return = []

    for week in range(predictor.start_split, predictor.end_split):

        # Signal for this week from the history visible so far.
        predictor.weekly_return_data = (
            predictor.historical_weekly_return_data[:week - 1])
        neighbours = predictor.find_k_closest_histories()
        expected = predictor.calc_next_week_return(neighbours)
        signal = predictor.signal(position, expected)

        # Realised log return from the last record of two consecutive weeks.
        weeks = predictor.daily_data_by_weeks
        this_price = weeks[week][-1][2]
        last_price = weeks[week - 1][-1][2]
        actual_return = math.log(this_price) - math.log(last_price)

        # Switching position counts as a trade and pays the cost.
        if position != signal:
            position = signal
            strategy_trade_count += 1
            strategy_capital *= (1 - trade_cost)

        # Strategy capital moves with the return only while in position.
        strategy_capital *= (1 + actual_return * position)
        strategy_weekly_return.append(actual_return)

        # The second series is only updated while the position equals 1.
        if position == 1:
            bh_capital *= (1 + actual_return)
            bh_weekly_return.append(actual_return)

        print('{0}, {1}, {2}'.format(weeks[week - 1][-1][0],
                                     strategy_capital, bh_capital))
Exemplo n.º 16
0
def build_predictor(data, settings):
    """Assemble the model: embedding, ``Predictor`` encoder, dense head.

    ``data`` is accepted but not used. The returned ``Model`` takes
    ``[data_input, bucket_size]`` and has six outputs: the softmax
    classification followed by elements 1 to 5 of the encoder result.
    """
    sys.stdout.write('Building model\n')

    tokens_in = Input(shape=(settings['max_len'], ))
    bucket_in = Input(shape=(1, ), dtype="int8")

    embedded = Embedding(input_dim=settings['max_features'] + 3,
                         output_dim=settings['word_embedding_size'],
                         mask_zero=True,
                         name="emb")(tokens_in)

    encoder_layer = Predictor(
        input_dim=settings['word_embedding_size'],
        hidden_dim=settings['sentence_embedding_size'],
        RL_dim=settings['RL_dim'],
        max_len=settings['max_len'],
        batch_size=settings['batch_size'],
        random_action_prob=settings['random_action_prob'],
        name='encoder')
    encoded = encoder_layer([embedded, bucket_in])

    # Dense -> tanh -> dropout stack on top of the first encoder output.
    hidden = encoded[0]
    for idx, width in enumerate(settings['hidden_dims']):
        hidden = Dense(width, name="dense_{}".format(idx))(hidden)
        hidden = Activation('tanh')(hidden)
        hidden = Dropout(settings['dense_dropout'])(hidden)

    softmax_out = Dense(settings['num_of_classes'],
                        activation='softmax',
                        name='output')(hidden)

    # The remaining encoder outputs are exposed as extra model outputs.
    extra_outputs = [encoded[i] for i in range(1, 6)]
    return Model(inputs=[tokens_in, bucket_in],
                 outputs=[softmax_out] + extra_outputs)
Exemplo n.º 17
0
class QASystem(object):
    """Question answering on top of a ``SearchEngine`` and a ``Predictor``."""

    def __init__(self):
        """Load the search engine and create the predictor.

        If ``database.dat`` exists, the engine is built from it. Otherwise
        the engine loads ``question_answer.txt`` and saves ``database.dat``.
        """
        if not exists('database.dat'):
            # Loading from text is the slower path; save the result.
            print('load from QA database from txt format...')
            self.search_engine = SearchEngine('cc/cppjieba/dict')
            self.search_engine.loadFromTxt('question_answer.txt')
            self.search_engine.save('database.dat')
        else:
            # Deserializing the saved database is much faster.
            print('deserialize the QA database...')
            self.search_engine = SearchEngine('cc/cppjieba/dict',
                                              'database.dat')
        self.predictor = Predictor()

    def query(self, question, count=3):
        """Return ``{answer: (score, match[1])}`` for the engine's candidates.

        The score is ``log(match[0]) * relevance``. ``match[0]`` is floored
        at the smallest positive float so the logarithm is always defined.
        ``relevance`` is the second value returned by the predictor.
        """
        floor = sys.float_info.min
        candidates = self.search_engine.query(question, count)

        scored = dict()
        for answer, match in candidates.items():
            _, relevance = self.predictor.predict(question, answer)
            score = log(max(match[0], floor)) * relevance
            scored[answer] = (score, match[1])
        return scored

    def updateDB(self, file):
        """Load ``file`` (a ``str`` path) into the engine and save the database."""
        assert type(file) is str
        engine = self.search_engine
        engine.loadFromTxt(file)
        engine.save('database.dat')
def main():
    """Serve predictions over TCP, one request per connection, forever.

    Binds to ``(host, port)``. For every accepted client it reads up to
    ``size`` bytes, and if anything was received it sends back the
    ``language_name`` of the prediction. The client socket is closed after
    each request. The listening socket is closed when the loop is left by
    an exception.
    """
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        s.bind((host,port))
        s.listen(backlog)
        predictor = Predictor(get_language_files())

        while True:
            client, address = s.accept()
            try:
                data = client.recv(size)
                print("Recieved {}".format(data))
                if data:
                    ngram = predictor.predict(data)
                    print("Sending {}".format(ngram.language_name))

                    # sendall keeps writing until the whole reply is out;
                    # plain send may stop after a partial write.
                    client.sendall(ngram.language_name)
            finally:
                # Release the connection even if handling it failed.
                client.close()
    finally:
        s.close()
Exemplo n.º 19
0
class QASystem(object):
    """Question answering on top of a ``SearchEngine`` and a ``Predictor``."""

    def __init__(self):
        """Load the search engine and create the predictor.

        If ``database.dat`` exists, the engine is built from it. Otherwise
        the engine loads ``question_answer.txt`` and saves ``database.dat``.
        """
        if exists('database.dat'):
            # deserialize database is much faster.
            print('deserialize the QA database...')
            self.search_engine = SearchEngine('cppjieba/dict', 'database.dat')
        else:
            # load database from txt is slower.
            print('load from QA database from txt format...')
            self.search_engine = SearchEngine('cppjieba/dict')
            self.search_engine.loadFromTxt('question_answer.txt')
            self.search_engine.save('database.dat')
        self.predictor = Predictor()

    def query(self, question, count=3):
        """Return ``{answer: total score}``, highest score first.

        The total score is ``exp(match) + exp(relevance)``. ``match`` comes
        from the search engine and ``relevance`` is the second value
        returned by the predictor. The result is a plain ``dict`` whose
        insertion order is the descending score order (dicts preserve
        insertion order from Python 3.7 on).
        """
        answer_scores = self.search_engine.query(question, count)
        answer_totalscores = dict()
        for answer, match in answer_scores.items():
            _, relevance = self.predictor.predict(question, answer)
            answer_totalscores[answer] = exp(match) + exp(relevance)
        # Sort the (answer, score) pairs by score and keep the result; the
        # sort has to run over items(), not over the answer strings.
        ranked = sorted(answer_totalscores.items(),
                        key=operator.itemgetter(1),
                        reverse=True)
        return dict(ranked)

    def updateDB(self, file):
        """Load ``file`` (a ``str`` path) into the engine and save the database."""
        assert type(file) is str
        self.search_engine.loadFromTxt(file)
        self.search_engine.save('database.dat')
Exemplo n.º 20
0
    def runAlg(self, dataType):
        '''
        Run the classifier and the regressor ten times, then plot the sorted
        results and write them to a result file.

        :param dataType: "gsm" or "lte"
        :return: None
        :raises Exception: if dataType is neither "gsm" nor "lte"
        '''
        if dataType != "gsm" and dataType != "lte":
            raise Exception("未知数据类型")

        # Results of the regressor and the classifier, one entry per run.
        regRes = []
        claRes = []

        predorCla = Predictor("cla", dataType)
        predorReg = Predictor("reg", dataType)
        for run in range(10):
            # Parenthesised single-argument print is valid on Python 2 and 3.
            print(str(run) + " -- begin")

            # Run both algorithms on the current split.
            claRes.append(predorCla.fit())
            regRes.append(predorReg.fit())
            print(str(run) + " -- change data")
            # Generate a new split of the data for the next run.
            predorReg.changeData()
            predorCla.changeData()

        # Sort so the two middle entries can be averaged below.
        regRes.sort()
        claRes.sort()

        folder = "gsmResult/" if "gsm" == dataType else "lteResult/"

        # Plot both result lists.
        self.draw(regRes, folder + "regImg")
        self.draw(claRes, folder + "claImg")

        # Write all results plus the middle value (mean of entries 4 and 5
        # of the ten sorted runs). The with-block closes the file.
        with open(folder + "res", 'w') as resFile:
            resFile.write("reg result:\n")
            resFile.write(str(regRes))
            resFile.write("\nreg mid:\n")
            resFile.write(str((regRes[4] + regRes[5]) / 2))

            resFile.write("\ncla result:\n")
            resFile.write(str(claRes))
            resFile.write("\ncla mid:\n")
            resFile.write(str((claRes[4] + claRes[5]) / 2))
Exemplo n.º 21
0
 def __init__(self, df, host, measurement, look_back, nb_layers, loss,
              metric, nb_features, optimizer, nb_epochs, nb_batch, form,
              freq_period):
     """Prepare the data, then build and train the three component models.

     The trend, seasonal and residual models are trained one after the
     other and stored as ``model_trend``, ``model_seasonal`` and
     ``model_residual``.
     """
     Predictor.__init__(self)
     self.df = df
     self.host = host
     self.measurement = measurement
     self.form = form
     self.freq_period = freq_period

     # prepare_data returns one (x, y) pair per component.
     (trend_x, trend_y,
      seasonal_x, seasonal_y,
      residual_x, residual_y) = self.prepare_data(df, look_back,
                                                  self.freq_period)

     # All three networks are built before any is trained. Only the trend
     # model passes True as the last make_models argument.
     trend_net = self.make_models(nb_layers, loss, metric, nb_features,
                                  optimizer, True)
     seasonal_net = self.make_models(nb_layers, loss, metric, nb_features,
                                     optimizer, False)
     residual_net = self.make_models(nb_layers, loss, metric, nb_features,
                                     optimizer, False)

     trend_net = self.train_model(trend_net, trend_x, trend_y,
                                  nb_epochs, nb_batch, "trend")
     seasonal_net = self.train_model(seasonal_net, seasonal_x, seasonal_y,
                                     nb_epochs, nb_batch, "seasonal")
     residual_net = self.train_model(residual_net, residual_x, residual_y,
                                     nb_epochs, nb_batch, "residual")

     self.model_trend = trend_net
     self.model_seasonal = seasonal_net
     self.model_residual = residual_net
Exemplo n.º 22
0
def run():
    """Run the pipeline: load, pre-process, plot, encode, predict, export.

    Writes ``test_evaluation_results.csv`` and ``test_submission.csv``.
    """
    # Load the raw CSVs and print their statistics.
    loader = DataLoader(data_dir_root, data_train_file, data_test_file)
    raw_train_df, raw_test_df = loader.load_csv_data()
    loader.print_statistics()

    # Clean both frames; 'SalePrice' is the target column.
    pre_processor = PreProcessor(raw_train_df,
                                 raw_test_df,
                                 cols_to_consider=cols_to_consider,
                                 target_feature='SalePrice')
    pre_processor.pre_process_data()

    # Diagnostics and plots on the training data.
    print_features_info(pre_processor.raw_train_df,
                        pre_processor.clean_train_df)
    plot_target_feature(pre_processor.raw_train_df,
                        pre_processor.target_feature)
    plot_features_hist(pre_processor.raw_train_df)
    plot_correlation_numeric_features(pre_processor.clean_train_df)

    def encode(frame):
        # Same encoding options for the train and the test frame.
        return prepare_data(
            frame,
            class_col=pre_processor.target_feature,
            reg_encoding_features=[],
            one_hot_encoding_features=one_hot_encod_features,
            ordinal_encoding_features=features_ordinal_mappings,
            no_enc_features=no_enc_features)

    train_X, train_y = encode(pre_processor.clean_train_df)
    test_X, test_y = encode(pre_processor.clean_test_df)

    # Fit the configured models (no grid search) and collect predictions.
    evaluator = Predictor(train_X, train_y, test_X, test_y, eval_classifiers,
                          eval_classifiers_params_grid)
    all_predictions, final_prediction = evaluator.build_models(
        grid_search=False)

    evaluation_df = evaluator.save_predictions_to_df(all_predictions,
                                                     final_prediction)
    submission_df = evaluator.save_predictions_for_submission(
        evaluation_df, id_col=pre_processor.raw_test_df['Id'])

    evaluation_df.to_csv("test_evaluation_results.csv", index=False)
    submission_df.to_csv("test_submission.csv", index=False)
Exemplo n.º 23
0
def lambda_handler(event, context):
    """Generate ``length`` words with an existing author's model.

    Args:
        event: mapping with the keys ``"author"``, ``"userText"`` and
            ``"length"``.
        context: not used.

    Returns:
        The result of ``clean_generated_data`` applied to the space-joined
        generated sample and the unique words found in the user text.
    """
    author = event["author"]
    user_text = event["userText"]
    length = event["length"]

    # Load in the predictor; all author files share one directory prefix.
    author_dir = get_dir_for_author(author)
    model_file = author_dir + author + ".model"
    vocab_file = author_dir + author + ".vocab"
    predictor = Predictor(128, model=model_file, vocab=vocab_file)

    # Clean the user data and separate out unknown words. The reader is
    # read_common_vocab; the previous spelling "read_common_cocab" appears
    # to be a typo (NOTE(review): confirm no helper exists under that name).
    common_vocab = read_common_vocab(author_dir + author + ".commons")
    _, unique_user_words = clean_input_data(user_text, common_vocab)

    generated_sample = predictor.sample(length)
    return clean_generated_data(' '.join(generated_sample), unique_user_words)
Exemplo n.º 24
0
    def __init__(self,
                 location,
                 inputFile,
                 outputDir=None,
                 cns=False,
                 reject=None,
                 angleOnly=False,
                 ppm=False,
                 progressBar=None,
                 writePgm=True):
        """Run the whole prediction: reference setup, shift search, prediction.

        The reference information is read from the directory of
        ``location``. Output goes to ``outputDir`` (or ``OUTDIR`` when that
        is empty), which is created if missing. The results are stored in
        ``self.topMatches`` and ``self.predictions``.
        """
        self.input = inputFile
        self.progressBar = progressBar

        # Parenthesised single-argument prints behave the same on Python 2.
        print('DANGLE (version 1.1)')
        print(DANGLE_CITE)

        # 1. Reference information and run options.
        self.reference = reference = Reference(os.path.dirname(location))
        reference.outDir = outputDir or OUTDIR
        if not os.path.isdir(reference.outDir):
            os.makedirs(reference.outDir)

        reference.cns = cns
        reference.ppm = ppm
        reference.angleOnly = angleOnly

        # The rejection threshold is only overridden when one was given.
        if reject is not None:
            reference.rejectThresh = reject

        # 2. Shifts of the query protein, read from the XML input.
        self.query = Protein(reference)
        self.query.readShiftsFromXml(inputFile)

        # 3. Compare with the shift database.
        print('STEP1: Shift search')
        self.topMatches = self.compareWithShiftDB()

        # 4. Predict phi/psi from the database matches.
        print('STEP2: GLE generation')
        self.predictor = Predictor(self.query, self.topMatches,
                                   self.reference, writePgm)
        self.predictions = self.predictor.predictPhiPsiFromDatabaseMatches(
            progressBar=self.progressBar)
Exemplo n.º 25
0
def update_bert(session):
    """Refresh the QA text file from MySQL, rebuild the dataset and fine-tune.

    Progress is reported to the socket room ``session`` (a ``str``):
    ``{'status': 'failure'}`` if the download or the dataset generation
    fails, ``{'status': 'success'}`` after fine-tuning.
    """
    assert type(session) is str

    def _notify(status):
        # Send the outcome to the client's room.
        response = jsonify({'status': status})
        socketio.emit('msg', namespace='/socket', room=session, data=response)

    # download qa database from database.
    try:
        logger.info('get the latest knowledge from wd_qa_knowledge...')
        db = MySQLdb.connect(host=db_host,
                             user=db_usr,
                             passwd=db_psw,
                             db=db_name,
                             charset='utf8')
        try:
            sql = "select question,answer from wd_qa_knowledge"
            cur = db.cursor()
            cur.execute(sql.encode('utf-8'))
            # One "question<TAB>answer" line per row, joined in one pass.
            qa = "".join(row[0] + "\t" + row[1] + "\n"
                         for row in cur.fetchall())
            with open("question_answer.txt", "wb") as f:
                f.write(qa.encode('utf-8'))
            db.commit()
        finally:
            # Close the connection even when the query or the write failed.
            db.close()
    except Exception as e:
        logger.error(e)
        _notify('failure')
        return
    # generate dataset.
    try:
        logger.info('generating training set...')
        from subprocess import call
        status = call(
            ["./create_dataset", "-i", "question_answer.txt", "-o", "dataset"])
        # A non-zero exit status means the dataset was not generated; report
        # it instead of fine-tuning on missing or stale data.
        if status != 0:
            raise RuntimeError(
                'create_dataset exited with status %d' % status)
    except Exception as e:
        logger.error(e)
        _notify('failure')
        return
    # finetune model
    logger.info('training...')
    from Predictor import Predictor
    predictor = Predictor()
    predictor.finetune('dataset')
    _notify('success')
Exemplo n.º 26
0
 def simulate_generations(self, num_generations, print_best):
     """Evaluate and evolve the population for ``num_generations`` rounds.

     Each generation: every network is scored by the predictor, the
     statistics are printed and appended to ``csv/ESN_Results.csv``, the
     best-scoring network is pickled to ``neural_net.txt``, and (except
     after the last round) the next generation is prepared.

     Args:
         num_generations: number of generations to simulate.
         print_best: when true, call ``show_net()`` on the best network of
             each generation.
     """
     file_path = "csv/ESN_Results.csv"
     dw = DataWriter()
     dr = DataReader()
     dw.init_table(file_path)
     p = Predictor()
     mapping = dr.get_mapping()
     images = dr.get_images(112800, 28, 28) # 112800 images in data set
     scale_factor = 10
     for i in range(num_generations):
         best_score = -100
         best_accuracy = -100
         best_net = []

         # One [network, accuracy, score] record per network; the predictor
         # fills in positions 1 and 2.
         engines = [[net, 0, 0] for net in self.networks]
         p.make_predictions(engines, mapping, images, scale_factor)

         for net, engine in zip(self.networks, engines):
             net.fitness = engine[1]
             if engine[2] > best_score:
                 best_score = engine[2]
                 best_net = net
             if engine[1] > best_accuracy:
                 best_accuracy = engine[1]
         avg_accuracy = self.avg_fitness(self.networks) # avg accuracy

         # Fitness is switched from accuracy to score before averaging again.
         for net, engine in zip(self.networks, engines):
             net.fitness = engine[2]
         avg_score = self.avg_fitness(self.networks) # avg score
         avg_size = self.avg_network_size()

         if print_best:
             best_net.show_net()
         print("-----------------------------------\t\t\t\t\t\t\n       Generation " + str(i+1) + " results\n-----------------------------------\n", end='\n')
         print("Highest accuracy: " + str(best_accuracy*100)
               + "%\nHighest score: " + str(best_score**(1.0/scale_factor))
               + "\nAverage accuracy: " + str(avg_accuracy*100)
               + "%\nAverage score: " + str(avg_score**(1.0/scale_factor))
               + "\nNum species: " + str(len(self.species))
               + "\nInnovs tried: " + str(self.networks[0].master_innov[0])
               + "\nAverage connections per network: " + str(avg_size) + "\n")

         # Persist the best network; the with-block closes the file.
         non_jit = self.construct_non_jit(best_net)
         with open("neural_net.txt", "wb") as out:
             pickle.dump(non_jit, out)
         dw.write_row(file_path, [i+1, best_accuracy*100, avg_accuracy*100, best_score**(1.0/scale_factor), avg_score**(1.0/scale_factor), avg_size])
         if i != num_generations-1:
             self.prepare_next_gen(math.ceil(self.pop_size/10))
             print("\nStarting Generation " + str(i+2) + ": Species = " + str(len(self.species)) + ", Innovs = " + str(self.networks[0].master_innov[0]), end='\n')
     print("Finished simulation!")
Exemplo n.º 27
0
 def __init__(self, df, host, measurement, look_back, nb_layers, loss,
              metric, nb_features, optimizer, nb_epochs, nb_batch, form,
              freq_period):
     """Prepare the data, then train the three component models in threads.

     The trend, seasonal and residual models are each trained by their own
     ``Thread_train_model``. After all threads have finished, the models
     are stored as ``model_trend``, ``model_seasonal`` and
     ``model_residual`` and saved with ``model_save``.

     Raises:
         queue.Empty: if a training thread did not deliver a model.
     """
     Predictor.__init__(self)
     self.df = df
     self.host = host
     self.measurement = measurement
     self.form = form
     self.freq_period = freq_period
     trend_x, trend_y, seasonal_x, seasonal_y, residual_x, residual_y = self.prepare_data(
         df, look_back, self.freq_period)
     model_trend = self.make_models(nb_layers, loss, metric, nb_features,
                                    optimizer, True)
     model_seasonal = self.make_models(nb_layers, loss, metric, nb_features,
                                       optimizer, False)
     model_residual = self.make_models(nb_layers, loss, metric, nb_features,
                                       optimizer, False)

     # One result queue per thread. With a single shared queue the models
     # arrive in completion order, so a faster thread's model could end up
     # under the wrong attribute.
     # NOTE(review): assumes each thread puts exactly one model on the
     # queue it was given; confirm against Thread_train_model.
     que_trend = queue.Queue()
     que_seasonal = queue.Queue()
     que_residual = queue.Queue()
     jobs = [
         (model_trend, que_trend, trend_x, trend_y, "trend",
          "Trend Thread"),
         (model_seasonal, que_seasonal, seasonal_x, seasonal_y, "seasonal",
          "Seasonal Thread"),
         (model_residual, que_residual, residual_x, residual_y, "residual",
          "Residual Thread"),
     ]
     threads_list = list()
     for model, que, data_x, data_y, label, thread_name in jobs:
         thread = Thread_train_model(model, que, data_x, data_y,
                                     nb_epochs, nb_batch, label,
                                     thread_name)
         thread.start()
         threads_list.append(thread)
     for t in threads_list:
         t.join()

     # Every thread has finished, so each queue can be read without blocking.
     self.model_trend = que_trend.get(block=False)
     self.model_save(self.model_trend, "trend")
     self.model_seasonal = que_seasonal.get(block=False)
     self.model_save(self.model_seasonal, "seasonal")
     self.model_residual = que_residual.get(block=False)
     self.model_save(self.model_residual, "residual")
Exemplo n.º 28
0
def GetPrediction():
    """Predict from the JSON request body and return a small JSON string.

    The request must contain every field listed in ``fields`` below; a
    missing one raises ``KeyError``. The returned string is
    ``{ "Adhered": <prediction[0][0]>}``.
    """
    # Request fields copied into the single-row frame, in column order.
    fields = (
        "PatientAge",
        "TimesPerDay",
        "DiagnosticCode",
        "CitySize",
        "PillCost",
        "NumberOfProducts",
        "KnownDoctorsVisits",
        "Income",
        "DaysSinceLastViolation",
        "Adhered",
    )
    with graph.as_default():
        payload = request.get_json()

        # "Name" is a fixed placeholder; every other column is a one-element
        # list holding the request value.
        row = {"Name": 1}
        for field in fields:
            row[field] = [payload[field]]

        frame = pd.DataFrame(data=row)
        model = Predictor()
        prediction = model.predict(frame)
        return '{ "Adhered": ' + str(prediction[0][0]) + '}'
def DoWork(source_file, m, k, fltr, trade_cost, testing_start_date, testing_end_date):
    """Walk the test weeks, follow the predictor's signal and print capital.

    For every index in ``[start_split, end_split)`` the predictor sees the
    historical weekly returns sliced to ``[:idx - 1]`` and produces a
    signal. A change of signal costs ``trade_cost``. One line is printed per
    week: the first field of the previous week's last record, the strategy
    capital and the second capital series. Returns None.
    """
    predictor = Predictor(source_file, m, k, fltr, testing_start_date, testing_end_date)
    predictor.read_file_to_daily_data_by_weeks(0)
    predictor.calc_historical_weekly_return(predictor.daily_data_by_weeks)

    # Running state; both capital series start at 100.
    strategy_trade_count = 0
    in_market_count = 0
    position = 0
    strategy_capital = 100
    bh_capital = 100
    strategy_weekly_return = []
    bh_weekly_return = []

    for week in range(predictor.start_split, predictor.end_split):

        # Signal for this week from the history visible so far.
        predictor.weekly_return_data = predictor.historical_weekly_return_data[:week-1]
        neighbours = predictor.find_k_closest_histories()
        expected = predictor.calc_next_week_return(neighbours)
        signal = predictor.signal(position, expected)

        # Realised log return from the last record of two consecutive weeks.
        weeks = predictor.daily_data_by_weeks
        this_price = weeks[week][-1][2]
        last_price = weeks[week-1][-1][2]
        actual_return = math.log(this_price) - math.log(last_price)

        # Switching position counts as a trade and pays the cost.
        if position != signal:
            position = signal
            strategy_trade_count += 1
            strategy_capital *= (1 - trade_cost)

        # Strategy capital moves with the return only while in position.
        strategy_capital *= (1 + actual_return * position)
        strategy_weekly_return.append(actual_return)

        # The second series is only updated while the position equals 1.
        if position == 1:
            bh_capital *= (1 + actual_return)
            bh_weekly_return.append(actual_return)

        print('{0}, {1}, {2}'.format(weeks[week-1][-1][0], strategy_capital, bh_capital))
Exemplo n.º 30
0
def generate_model(author, steps):
    """Train a Predictor on ``../data/<author>.txt`` and save it to disk.

    The raw text is split on single spaces, cleaned against the common
    vocabulary read from ``../data/20k_most_common.txt``, and used to train
    the predictor for ``steps`` iterations.  The model, vocab and commons
    files are written into the directory returned by ``get_dir_for_author``,
    which is created when it does not exist yet.

    :param author: author name; selects the input file and the output names.
    :param steps: number of training iterations passed to ``Predictor.train``.
    :return: the trained ``Predictor``.
    """
    model = Predictor(128)

    # Output location, created on first use.
    out_dir = get_dir_for_author(author)
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)

    # All three artefacts share the "<dir><author>" prefix.
    out_prefix = out_dir + author
    model_path = out_prefix + ".model"
    vocab_path = out_prefix + ".vocab"
    commons_path = out_prefix + ".commons"
    corpus_path = "../data/" + author + ".txt"

    # Frequently used words act as the common vocabulary.
    common_words = read_common_vocab("../data/20k_most_common.txt")

    # Tokenise on single spaces and clean the tokens.
    with open(corpus_path, 'r') as corpus:
        tokens = corpus.read().split(' ')
        cleaned, _ = clean_input_data(tokens, common_words)

    # Record which words survived cleaning.
    record_common_vocab(cleaned, commons_path)

    # Training is the slow part; persist the result straight afterwards.
    model.train(cleaned, steps)
    model.save(model_path, vocab_path)

    return model
Exemplo n.º 31
0
def main():
    print "-- Welcome to movie-recommend! --"

    # for output readability
    np.set_printoptions(formatter={'float_kind': '{:25f}'.format})

    # baseline predictor by default
    mode = BASELINE

    # read command-line argument, if provided
    if len(sys.argv) > 1:
        if sys.argv[1] == IMPROVED or sys.argv[1] == BASELINE:
            mode = sys.argv[1]
            print "\tYou chose", mode, "predictor!"
        else:
            print "\t", sys.argv[
                1], "is not a valid argument. Default:", mode, "predictor!"
    else:
        print "\tYou did not provide any arguments. Default:", mode, "predictor!"

    # read and parse text files
    parser = Parser(mode)
    print "\tParser initialized:"
    print "\t\t", len(parser.test_set), "test points and", np.count_nonzero(
        parser.training_matrix), "training points"

    # initialize predictor and calculate rmse
    predictor = Predictor(mode, parser.training_matrix, parser.test_set)
    print "\trmse on test data (baseline):", predictor.rmse_test
    if predictor.mode == BASELINE:
        print "\trmse on training data (baseline):", predictor.rmse_training
    else:
        print "\trmse on test data (improved):", predictor.rmse_test_improved

    # execute histogram plotting and get error distribution
    error_dist = predictor.calculate_absolute_errors(
        parser.test_set, predictor.improved_matrix
    ) if predictor.mode == IMPROVED else predictor.calculate_absolute_errors(
        parser.test_set, predictor.baseline_matrix)
    print "\tHistogram saved to file. Error distribution:", error_dist
Exemplo n.º 32
0
def main():
    """Fit the preprocessors, train the LSTM and plot predictions against truth.

    Runs on the "toy" split (``toy`` is hard-coded to True).  Weights are
    loaded from ``best_weights.h5`` before training continues.
    """
    toy = True

    market, news = read_data()
    train_idx, val_idx, test_idx = split_data(market, toy)

    # Fit one preprocessor per data source on the training indices, then join.
    market_prepro = MarketPrepro()
    market_prepro.fit(train_idx, market)
    news_prepro = NewsPrepro()
    news_prepro.fit(train_idx, news)
    joined_prepro = JoinedPreprocessor(market_prepro, news_prepro)

    # Batch generators for the training and validation indices.
    train_generator = JoinedGenerator(joined_prepro, train_idx, market, news)
    val_generator = JoinedGenerator(joined_prepro, val_idx, market, news)
    print('Generators created')

    # The model input width is the total number of feature columns.
    feature_count = (len(market_prepro.feature_cols)
                     + len(news_prepro.feature_cols))
    model = ModelFactory.lstm_128(feature_count)
    model.load_weights("best_weights.h5")
    print(model.summary())
    ModelFactory.train(model, toy, train_generator, val_generator)

    # Predict
    predictor = Predictor(joined_prepro, market_prepro, news_prepro, model,
                          ModelFactory.look_back, ModelFactory.look_back_step)
    y_pred, y_test = predictor.predict_idx(test_idx, market, news)

    # NOTE: this replaces the predictions returned by predict_idx above.
    y_pred = predictor.predict(market, news)

    for series in (y_pred, y_test):
        plt.plot(series)
    plt.legend(["pred", "test"])
    plt.show()

    print('The end')
Exemplo n.º 33
0
def main():
    """Train one n-gram model per (size, language) and classify the test set.

    The trained models are dumped with ``dump_grams``; for every test sentence
    the prediction is written to ``out<idx>.txt`` under
    ``Constants.OUTPUT_PATH`` through ``OutputHelper``.
    """
    training = TrainingSetHandler()
    training.load_training_set()

    # One model for every gram size / language combination.
    models = []
    for gram_size in Constants.SIZE_OF_GRAMS:
        for lang in training.language_list:
            model = NGram(gram_size, string.ascii_lowercase, 0.5)
            model.train(training.training_set[lang], lang)
            models.append(model)

    #dump copies of grams to file
    dump_grams(models)

    predictor = Predictor(models)
    tests = TestSetHandler()
    tests.load_test_sentence()
    for number, entry in enumerate(tests.test_set):
        # Only the letters of the sentence text (entry[1]) are used, lower-cased.
        letters = "".join(ch for ch in entry[1] if ch.isalpha())
        guess = predictor.predict_this_sentence(letters.lower())
        out_path = os.path.join(Constants.OUTPUT_PATH, "out{}.txt".format(number))
        with open(out_path, 'w') as out_file:
            OutputHelper(guess, entry, out_file).print_and_save_output()
Exemplo n.º 34
0
def main(argv):
  """Run the saved YOLOv3 model over COCO val2017 and print bbox metrics.

  Loads the Keras model from ``FLAGS.model``, predicts boxes for every image
  listed in ``instances_val2017.json`` and feeds all detections to
  ``COCOeval``.

  Fix over the previous version: the detection list is created once, before
  the image loop.  It used to be re-created for every image, so only the
  detections of the last image were evaluated.

  :param argv: command-line arguments (unused here).
  """
  yolov3 = tf.keras.models.load_model(FLAGS.model, compile=False)
  predictor = Predictor(yolov3=yolov3)
  anno = COCO(join(FLAGS.annotation_dir, 'instances_val2017.json'))
  img_ids = anno.getImgIds()
  total = len(img_ids)

  # Detections of ALL images, one row per box:
  # [image id, x, y, width, height, score, category].
  detections = []
  for count, imgid in enumerate(img_ids):
    print("processing (%d/%d)" % (count, total))
    # predict
    img_info = anno.loadImgs([imgid])[0]
    img = cv2.imread(join(FLAGS.coco_eval_dir, img_info['file_name']))
    boundings = predictor.predict(img).numpy()
    # collect results; the predictor yields corner coordinates, the result
    # rows carry width/height instead of the second corner.
    for bounding in boundings:
      detections.append([
          imgid,
          bounding[0],
          bounding[1],
          bounding[2] - bounding[0],
          bounding[3] - bounding[1],
          bounding[4],
          # NOTE(review): category lookup kept as in the original - confirm
          # against the definition of label_map.
          label_map.index(int(bounding[5]) + 1),
      ])

  cocoDt = anno.loadRes(np.array(detections))
  cocoEval = COCOeval(anno, cocoDt, iouType='bbox')
  cocoEval.params.imgIds = img_ids
  cocoEval.evaluate()
  cocoEval.accumulate()
  cocoEval.summarize()
Exemplo n.º 35
0
def load(positive_class, negative_class):
    """
    Build the Predictor for the optimized InceptionResNetV2 model file and a
    DirectoryManipulator, printing a banner before and after the model load.

    :param positive_class: first entry of the class pair given to Predictor.
    :param negative_class: second entry of the class pair given to Predictor.
    :return: The model and a DirectoryManipulator.
    """
    banner = "#" * 15
    model_path = r"models\inceptionResNetV2_optimized_h5\inceptionResNetV2_optimized.h5"

    for line in (banner, "loading model...", banner):
        print(line)

    predictor = Predictor(model_path, (positive_class, negative_class))

    for line in (banner, "model loaded.", banner):
        print(line)

    manipulator = DirectoryManipulator()
    return predictor, manipulator
Exemplo n.º 36
0
import sys
import os
import pickle
from Predictor import Predictor

if __name__ == '__main__':
    if len(sys.argv) < 3:
        print 'Usage:', sys.argv[0], 'spamFolder, hamFolder'
    else:
        if os.path.isdir(sys.argv[1]) and os.path.isdir(sys.argv[2]):
            print 'training...'
            predictor = Predictor(sys.argv[1], sys.argv[2])
            print predictor.predict('hw6-spamham-data/dev/dev1')
            # save to pickle
            print 'saving predictor to pickle'
            pickle.dump(predictor, open('predictor.pickle', 'w'))
        else:
            print 'training folders illegal'
Exemplo n.º 37
0
import sys
import os
import pickle
from Predictor import Predictor

if __name__ == '__main__':
    if len(sys.argv) < 3:
        print 'Usage:', sys.argv[0], 'spamFolder, hamFolder'
    else:
        if os.path.isdir(sys.argv[1]) and os.path.isdir(sys.argv[2]):
            print 'training...'
            predictor = Predictor(sys.argv[1], sys.argv[2])
            print predictor.predict('bla')
            # save to pickle
            print 'saving predictor to pickle'
            pickle.dump(predictor, open('predictor.pickle', 'w'))
        else:
            print 'training folders illegal'
Exemplo n.º 38
0
class RLDecisionMaker:
    def __init__(self, cluster):
        """Set up logging, decision state and the measurement memory.

        Reads its configuration from the module-level ``env_vars`` mapping,
        makes sure the measurements file has a header line and replays the
        training file into memory through ``add_measurement``.

        :param cluster: object exposing ``node_count()``; its result becomes
            the initial state unless ``self.debug`` is set.
        """
        #Create logger
        LOG_FILENAME = 'files/logs/Coordinator.log'
        self.log = get_logger('RLDecisionMaker', 'INFO', logfile=LOG_FILENAME)
        self.log.info("Using 'gain' : " + env_vars['gain'] +" with threshold of "+str( env_vars["decision_threshold"]*100) + "% and interval: " + str(env_vars['decision_interval']))
        self.log.info("Cluster Size from %d to %d nodes" % (env_vars['min_cluster_size'], env_vars['max_cluster_size']))

        # In debug (simulation) mode the state starts at a fixed 8 nodes.
        self.debug = False
        if self.debug:
            self.currentState = 8
        else:
            self.currentState = cluster.node_count()
        self.cluster = cluster
        self.nextState = self.currentState
        # Number of metric fetches to wait between two decisions.
        self.waitForIt = env_vars['decision_interval'] / env_vars['metric_fetch_interval']
        self.pending_action = None
        self.decision = {"action": "PASS", "count": 0}

        # The policy for getting throughput and latency when computing the reward func.
        # average, centroid
        self.measurementsPolicy = 'centroid'
        self.prediction = env_vars['use_prediction']
        self.predictor = Predictor()

        # used only in simulation!!
        self.countdown = 0

        # A dictionary that will remember rewards and metrics in states previously visited.
        # Keys are the cluster sizes as strings; 'r' holds the reward and
        # 'arrayMeas' the measurement rows (None until something is recorded).
        self.memory = {}
        for i in range(env_vars["min_cluster_size"], env_vars["max_cluster_size"] + 1):
            self.memory[str(i)] = {'r': None, 'arrayMeas': None}

        # Load any previous statics.
        self.measurementsFile = env_vars["measurements_file"]
        self.trainingFile = env_vars["training_file"]
        self.sumMetrics = {}

        # initialize measurements file; 'a+' creates it when missing
        with open(self.measurementsFile, 'a+') as meas:
            if os.stat(self.measurementsFile).st_size == 0:
                # The file is empty, set the headers for each column.
                meas.write('State\t\tLambda\t\tThroughput\t\tLatency\t\tCPU\t\tTime\n')

        # load training set; the context manager closes the file even when a
        # malformed line makes add_measurement raise
        with open(self.trainingFile, 'r+') as training:
            if os.stat(self.trainingFile).st_size != 0:
                # Read the training set measurements saved in the file.
                next(training)  # Skip the first line with the headers of the columns
                for line in training:
                    # Skip comments (used in training sets)
                    if not line.startswith('###'):
                        self.add_measurement(line.split('\t\t'))

    def add_measurement(self, metrics, write_file=False, write_mem=True):
        """
        adds the measurement to either memory or file or both
        @param metrics: array The metrics to store. An array containing [state, lamdba, throughput, latency, time]
        @param writeFile: boolean If set write the measurement in the txt file
        :return:
        """
        if self.measurementsPolicy.startswith('average'):
            if not self.sumMetrics.has_key(metrics[0]):
                    # Save the metric with the state as key metrics = [state, inlambda, throughput, latency]
                    self.sumMetrics[metrics[0]] = {'inlambda': 0.0, 'throughput': 0.0, 'latency': 0.0, 'divide_by': 0}

            self.sumMetrics[metrics[0]] = {'inlambda': self.sumMetrics[metrics[0]]['inlambda'] + float(metrics[1]),
                                           'throughput': self.sumMetrics[metrics[0]]['throughput'] + float(metrics[2]),
                                           'latency': self.sumMetrics[metrics[0]]['latency'] + float(metrics[3]),
                                           'divide_by': self.sumMetrics[metrics[0]]['divide_by'] + 1}
        if self.debug and write_file:
            self.log.debug("add_measurements: won't load measurement to memory")
        else:
            if write_mem:
                # metrics-> 0: state, 1: lambda, 2: thoughtput, 3:latency, 4:cpu, 5:time
                if not self.memory.has_key(metrics[0]):
                    self.memory[str(metrics[0])] = {}
                    #self.memory[str(metrics[0])]['V'] = None # placeholder for rewards and metrics
                    self.memory[str(metrics[0])]['r'] = None
                    self.memory[str(metrics[0])]['arrayMeas'] = np.array([float(metrics[1]), float(metrics[2]),
                                                                          float(metrics[3]), float(metrics[4])], ndmin=2)
                elif self.memory[metrics[0]]['arrayMeas'] is None:
                    self.memory[metrics[0]]['arrayMeas'] = np.array([float(metrics[1]), float(metrics[2]),
                                                                     float(metrics[3]), float(metrics[4])], ndmin=2)
                else:
                    self.memory[metrics[0]]['arrayMeas'] = np.append(self.memory[metrics[0]]['arrayMeas'],
                                                                     [[float(metrics[1]), float(metrics[2]),
                                                                       float(metrics[3]), float(metrics[4])]], axis=0)
                    # but add 1 zero measurement for each state for no load cases ??? too many 0s affect centroids?

        if write_file:
            if write_mem:
                used = "Yes"
            else:
                used = "No"
            ms = open(self.measurementsFile, 'a')
            # metrics[5] contains the time tick -- when running a simulation, it represents the current minute,
            # on actual experiments, it is the current time. Used for debugging and plotting
            ms.write(str(metrics[0]) + '\t\t' + str(metrics[1]) + '\t\t' + str(metrics[2]) + '\t\t' +
                     str(metrics[3]) + '\t\t' + str(metrics[4]) + '\t\t' + str(metrics[5]) + '\t\t'+ used+'\n')
            ms.close()

    # param state: string Get the average metrics (throughput, latency) for this state.
    # return a dictionary with the averages
    def get_averages(self, state):
        averages = {}
        if self.sumMetrics.has_key(state):
            averages['throughput'] = float(self.sumMetrics[state]['throughput'] / self.sumMetrics[state]['divide_by'])
            averages['latency'] = float(self.sumMetrics[state]['latency'] / self.sumMetrics[state]['divide_by'])

            self.log.debug("GETAVERAGES Average metrics for state: " + state + " num of measurements: " + str(
                self.sumMetrics[state]['divide_by']) +
                                 " av. throughput: " + str(averages['throughput']) + " av. latency: " +
                                 str(averages['latency']))
        return averages

    def doKmeans(self, state, from_inlambda, to_inlambda):
        # Run kmeans for the measurements of this state and return the centroid point (throughput, latency)
        ctd = {}
        label = []
        centroids = {}
        if self.memory[state]['arrayMeas'] != None:
            count_state_measurements = len(self.memory[state]['arrayMeas'])
            # self.log.debug("DOKMEANS " + str(len(self.memory[state]['arrayMeas'])) +
            #                " measurements available for state " + state)
            sliced_data = None
            for j in self.memory[state]['arrayMeas']:
                #self.my_logger.debug("DOKMEANS self.memory[state]['arrayMeas'][j]: "+ str(j))
                # If this measurement belongs in the slice we're insterested in
                if j[0] >= from_inlambda and j[0] <= to_inlambda:
                    #self.my_logger.debug("DOKMEANS adding measurement : "+ str(j))
                    # add it
                    if sliced_data == None:
                        sliced_data = np.array(j, ndmin=2)
                    else:
                        sliced_data = np.append(sliced_data, [j], axis=0)

            k = 1  # number of clusters
            # 1. No known lamdba values close to current lambda measurement
            if sliced_data == None:
                # Check if there are any known values from +-50% inlambda.
                #                original_inlambda = float(from_inlambda* (10/9))
                #                from_inlambda = 0.8 * original_inlambda
                #                to_inlambda = 1.2 * original_inlambda
                #                self.my_logger.debug("Changed lambda range to +- 20%: "+ str(from_inlambda) + " - "+ str(to_inlambda))
                #                for j in self.memory[state]['arrayMeas']:
                #                    #self.my_logger.debug("DOKMEANS self.memory[state]['arrayMeas'][j]: "+ str(j))
                #                    # If this measurement belongs in the slice we're insterested in
                #                    if j[0] >= from_inlambda and j[0] <= to_inlambda:
                #                        # add it
                #                        if sliced_data == None:
                #                            sliced_data = np.array(j, ndmin=2)
                #                        else:
                #                            sliced_data = np.append(sliced_data, [j], axis=0)
                #                #centroids, label = kmeans2(self.memory[state]['arrayMeas'], k, minit='points') # (obs, k)
                #            #else:
                #            if sliced_data == None:
                self.log.debug("No known lamdba values close to current lambda measurement. Returning zeros!")
            else:
                # self.log.debug("DOKMEANS length of sliced_data to be fed to kmeans: " + str(len(sliced_data))
                #                +  " (out of %d total)" % count_state_measurements)
                centroids, label = kmeans2(sliced_data, k, minit='points')
                pass

            # initialize dictionary
            num_of_meas = {}
            #num_of_meas = {'0': 0, '1': 0, '2': 0, '3': 0, '4': 0}
            for j in range(0, k):
                num_of_meas[str(j)] = 0
            if len(label) > 0:
                for i in label:
                    num_of_meas[str(i)] += 1

                max_meas_cluster = max(num_of_meas.iteritems(), key=operator.itemgetter(1))[0]
                #            self.my_logger.debug("DOKMEANS state: "+ state +" kmeans2 centroids: "+ str(centroids) +" label: "+
                #                       str(num_of_meas) + " cluster with max measurements: "+ str(max_meas_cluster))
                ctd['inlambda'] = centroids[int(max_meas_cluster)][0]
                ctd['throughput'] = centroids[int(max_meas_cluster)][1]
                ctd['latency'] = centroids[int(max_meas_cluster)][2]
                ctd['cpu'] = centroids[int(max_meas_cluster)][3]
            else:
                #self.log.debug("DOKMEANS one of the clusters was empty and so label is None :|. Returning zeros")
                ctd['inlambda'] = 0.0
                ctd['throughput'] = 0.0
                ctd['latency'] = 0.0
                ctd['cpu'] = 0.0
                #return None
        else:
            self.log.debug("DOKMEANS self.memory[state]['arrayMeas'] is None :|")

        return ctd

    def moving_average(self, iterable, n=3):
        """Yield the mean of every window of ``n`` consecutive items.

        Example with the default window of 3:
        moving_average([40, 30, 50, 46, 39, 44]) --> 40.0 42.0 45.0 43.0
        (see http://en.wikipedia.org/wiki/Moving_average)

        :param iterable: numbers to average.
        :param n: window length.
        :return: generator of floats, one per complete window.
        """
        source = iter(iterable)
        # Start with a zero placeholder followed by the first n-1 items, so
        # that the first loop step completes the first real window.
        window = deque([0])
        window.extend(itertools.islice(source, n - 1))
        total = sum(window)
        for value in source:
            # Slide the window: add the newcomer, drop the oldest entry.
            total += value - window.popleft()
            window.append(value)
            yield total / float(n)

    def predict_load(self):
        # Linear Regression gia na doume to slope
        stdin, stdout = os.popen2("tail -n 20 " + self.measurementsFile)
        stdin.close()
        lines = stdout.readlines();
        stdout.close()
        ten_min_l = []  # store past 10 mins lambda's
        ten_min = []  # store past 10 mins ticks
        for line in lines:
            m = line.split('\t\t')  # state, lambda, throughput, latency, cpu, time tick
            ten_min_l.append(float(m[1]))
            ten_min.append(float(m[5]))
            # run running average on the 10 mins lambda measurements
            n = 5
            run_avg_gen = self.moving_average(ten_min_l, n)
            run_avg = []
            for r in run_avg_gen:
                run_avg.append(float(r))
            ten_min_ra = ten_min[2:18]  # np.arange(i-8, i-2, 1)

        # linear regression on the running average
        #(slope, intercept, r_value, p_value, stderr) = linregress(ten_min, ten_min_l)
        (slope, intercept, r_value, p_value, stderr) = linregress(ten_min_ra, run_avg)
        # fit the running average in a polynomial
        coeff = np.polyfit(ten_min, ten_min_l, deg=2)
        self.log.debug("Slope (a): " + str(slope) + " Intercept(b): " + str(intercept))
        self.log.debug("Polynom coefficients: " + str(coeff))
        #self.my_logger.debug("next 10 min prediction "+str(float(slope * (p + 10) + intercept + stderr)))
        predicted_l = float(slope * (ten_min[19] + 10) + intercept + stderr)  # lambda in 10 mins from now
        #predicted_l = np.polyval(coeff, (ten_min[9] + 10)) # lambda in 10 mins from now

        if slope > 0:
            #if predicted_l > allmetrics['inlambda'] :
            dif = 6000 + 10 * int(slope)
            #dif = 6000 + 0.2 * int(predicted_l - allmetrics['inlambda'])
            self.log.debug("Positive slope: " + str(slope) + " dif: " + str(dif)
                                 + ", the load is increasing. Moving the lambda slice considered 3K up")
        else:
            dif = -6000 + 10 * int(slope)
            #dif = -6000 + 0.2 * int(predicted_l - allmetrics['inlambda'])
            self.log.debug("Negative slope " + str(slope) + " dif: " + str(dif)
                                 + ", the load is decreasing. Moving the lambda slice considered 3K down")
            #dif = ((predicted_l - allmetrics['inlambda'])/ allmetrics['inlambda']) * 0.1 * 6000#* allmetrics['inlambda']
            #dif = int((predicted_l / allmetrics['inlambda']) * 6000)

        return predicted_l

    def publish_to_local_ganglia(self, allmetrics):
        """
        Publishes monitoring data to local ganglia agent
        :param allmetrics:
        :return:
        """
        self.log.debug( "TAKEDECISION allmetrics: " + str(allmetrics))

        #Publish measurements to ganglia
        try:
            os.system("gmetric -n ycsb_inlambda -v " + str(
                allmetrics['inlambda']) + " -d 15 -t float -u 'reqs/sec' -S " + str(
                self.monitoring_endpoint) + ":[DEBUG] hostname")
            os.system("gmetric -n ycsb_throughput -v " + str(
                allmetrics['throughput']) + " -d 15 -t float -u 'reqs/sec' -S " + str(
                self.monitoring_endpoint) + ":[DEBUG] hostname")
            os.system(
                "gmetric -n ycsb_latency -v " + str(allmetrics['latency']) + " -d 15 -t float -u ms -S " + str(
                    self.monitoring_endpoint) + ":[DEBUG] hostname")
        except:
            pass



    def handle_metrics(self, client_metrics, server_metrics):
        # read metrics
        allmetrics = {'inlambda': 0, 'throughput': 0, 'latency': 0, 'cpu': 0}

        if not self.debug:
            ## Aggreggation of YCSB client metrics
            clients = 0
            servers = 0
            # We used to collect server cpu too, do we need it?
            #self.log.debug("TAKEDECISION state: %d, pending action: %s. Collecting metrics" % (self.currentState, str(self.pending_action)))
            for host in client_metrics.keys():
                metric = client_metrics[host]
                if isinstance(metric, dict):
                    for key in metric.keys():
                        if key.startswith('ycsb_TARGET'):
                            allmetrics['inlambda'] += float(metric[key])
                        elif key.startswith('ycsb_THROUGHPUT'):
                            allmetrics['throughput'] += float(metric[key])
                        elif key.startswith('ycsb_READ') or key.startswith('ycsb_UPDATE') or key.startswith(
                                'ycsb_RMW') or key.startswith('ycsb_INSERT'):
                            allmetrics['latency'] += float(metric[key])
                    clients += 1

            for host in server_metrics.keys():
                metric = server_metrics[host]
                if isinstance(metric, dict):
                    #check if host in active cluster hosts
                    if not host in self.cluster.get_hosts().keys():
                        continue
                    servers += 1
                    for key in metric.keys():
                        if key.startswith('cpu_idle'):
                            allmetrics['cpu'] += float(metric[key])
            try:
                allmetrics['latency'] = allmetrics['latency'] / clients
            except:
                allmetrics['latency'] = 0
            try:
                allmetrics['cpu'] = (allmetrics['cpu'] / servers) # average node cpu usage
            except:
                allmetrics['cpu'] = 0
        else:
            self.log.info("Running in DEBUG mode, no metrics retrieved!")

        return allmetrics


    # a log-related variable
    # NOTE(review): take_decision declares `global pending_action_logged`, so it
    # reads and writes a module-level name of the same spelling, not this class
    # attribute - confirm which of the two is intended.
    pending_action_logged = False

    def take_decision(self, client_metrics, server_metrics):
        '''
             this method reads allmetrics object created by Monitoring.py and decides whether a change
             of the number of participating
             virtual nodes is due.
        '''

        # update prediction current minute counter
        self.predictor.tick_tock()
        if client_metrics is None or server_metrics is None: return
        # first parse all metrics
        allmetrics = self.handle_metrics(client_metrics, server_metrics)
        #self.publish_to_local_ganglia(allmetrics)

        pending_action = not (self.pending_action is None) # true if there is no pending action

        # 1. Save the current metrics to file and in memory only if there is no pending action.
        self.add_measurement([str(self.currentState), allmetrics['inlambda'], allmetrics['throughput'],
                              allmetrics['latency'], allmetrics['cpu'],
                              datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")],
                             write_file=True, write_mem=((not pending_action) and bool(env_vars['update_metrics'])))

        # if there is a pending action, don't take a decision
        if pending_action:
            global pending_action_logged
            if not pending_action_logged:
                self.log.debug("Last action " + self.pending_action + " hasn't finished yet, see you later!")
                pending_action_logged = True
            if self.debug:
                if self.countdown == 0:
                    self.log.debug("Running a simulation, set state from " + str(self.currentState) + " to " +
                                    str(self.nextState))
                    self.currentState = self.nextState
                    self.pending_action = None
                else:
                    self.countdown -= 1
                    self.log.debug("Reducing countdown to " + str(self.countdown))

            # skip decision
            self.decision["action"] = "PASS"
            self.decision["count"] = 0
            return self.decision

        pending_action_logged = False

        # manage the interval counter (waitForIt)
        if self.waitForIt == 0:
            self.waitForIt = env_vars['decision_interval'] / env_vars['metric_fetch_interval']
        else:
            if self.waitForIt == env_vars['decision_interval'] / env_vars['metric_fetch_interval']:
                self.log.debug("New decision in " + str(float(self.waitForIt*env_vars['metric_fetch_interval'])/60) +
                               " mins, see you later!")
            self.waitForIt -= 1
            self.decision["action"] = "PASS"
            self.decision["count"] = 0
            return self.decision

        
	# Select values close to the current throughtput, define tha lambda range we're interested in -+ 5%
        slice_range=75
	from_inlambda = allmetrics['inlambda'] - slice_range
        to_inlambda = allmetrics['inlambda'] + slice_range
        if self.prediction:
            predicted_l = self.predictor.poly_regression()
            if predicted_l > 0:
                # there are enough data to make a prediction, if not use the actual lambda
                self.log.debug(
                    "Predicted: " + str(predicted_l) + " lambda :" + str(allmetrics['inlambda']))
                from_inlambda = predicted_l - slice_range
                to_inlambda = predicted_l + slice_range

        self.log.debug("TAKEDECISION state %d lambda range: %d - %d" % (self.currentState, from_inlambda, to_inlambda))
        # too low to care, the initial num of nodes can answer 1000 req/sec,
        # so consider it as 0 1000 * len(cluster.size)!!
        if 0.0 < to_inlambda < 1000:
            from_inlambda = 0.0
            self.log.debug("TAKEDECISION state %d current lambda %d changed lambda range to: %d - %d"
                           % (self.currentState, allmetrics['inlambda'], from_inlambda, to_inlambda))

        # The subgraph we are interested in. It contains only the allowed transitions from the current state.
        from_node = max(int(env_vars["min_cluster_size"]), (self.currentState - env_vars["rem_nodes"]))
        to_node = min(self.currentState + int(env_vars["add_nodes"]), int(env_vars["max_cluster_size"]))
        #self.my_logger.debug("TAKEDECISION creating graph from node: "+ str(from_node) +" to node "+ str(to_node))

        #inject the current number of nodes
        allmetrics['current_nodes'] = self.currentState

        states = fset.FuzzySet()
        # Calculate rewards using the values in memory if any, or defaults
        for i in range(from_node, to_node + 1):
            # se periptwsi pou den exeis 3anadei to state upologizei poso tha ithele na einai to throughput
            # allmetrics['max_throughput'] = float(i) * float(self.utils.serv_throughput)
            allmetrics['num_nodes'] = i

            met = {}
            if self.measurementsPolicy.startswith('average'):
                met = self.getAverages(str(i))
            elif self.measurementsPolicy.startswith('centroid'):
                met = self.doKmeans(str(i), from_inlambda, to_inlambda)
                #format met output
                out_met = {k: int(v) for k,v in met.iteritems()}
                self.log.debug("TAKEDECISION state: " + str(i) + " met: " + str(out_met))

                if met != None and len(met) > 0:
                    # Been in this state before, use the measurements
                    allmetrics['inlambda'] = met['inlambda']
                    allmetrics['throughput'] = met['throughput']
                    allmetrics['latency'] = met['latency']
                    allmetrics['cpu'] = met['cpu']
                    #self.my_logger.debug("TAKEDECISION adding visited state "+ str(i) +" with gain "+ str(self.memory[str(i)]['r']))
                    #else:
                    # No clue for this state use current measurements...
                    #self.my_logger.debug("TAKEDECISION unknown state "+ str(i) +" with gain "+ str(self.memory[str(i)]['r']))

                self.memory[str(i)]['r'] = eval(env_vars["gain"], allmetrics)
                # if self.currentState != i:
                    # self.my_logger.debug(
                    #     "TAKEDECISION adding state " + str(i) + " with gain " + str(self.memory[str(i)]['r']))
                states.add(fset.FuzzyElement(str(i), self.memory[str(i)]['r']))

            # For the current state, use current measurement
            # if self.currentState == i:
            #     if not self.debug:
            #         cur_gain = eval(env_vars["gain"], allmetrics)
            #         # for debugging purposes I compare the current reward with the one computed using the training set
            #         self.log.debug("TAKEDECISION state %d current reward: %d training set reward: %d"
            #                         % (self.currentState, cur_gain, self.memory[str(i)]['r']))
            #         self.memory[str(i)]['r'] = cur_gain
            #         #self.log.debug("TAKEDECISION adding current state " + str(i) + " with gain " + str(cur_gain))
            #     else:
            #         cur_gain = (self.memory[str(i)]['r'])
            #         self.log.debug("TAKEDECISION state %d current state training set reward: %d"
            #                        % (self.currentState, cur_gain))
            #
            #     states.add(fset.FuzzyElement(str(i), cur_gain))

        # Create the transition graph
        v = []
        for i in states.keys():
            v.append(i)
        v = set(v)
        stategraph = fgraph.FuzzyGraph(viter=v, directed=True)

        for j in range(from_node, to_node + 1):
            if j != self.currentState:
                # Connect nodes with allowed transitions from the current node.connect(tail, head, mu) head--mu-->tail
                stategraph.connect(str(j), str(self.currentState), eval(env_vars["trans_cost"], allmetrics))
                #self.my_logger.debug(
                #    "TAKEDECISION connecting state " + str(self.currentState) + " with state " + str(j))
                # Connect nodes with allowed transitions from node j.
                #for k in range(max(int(env_vars["min_cluster_size"]), j - int(env_vars["rem_nodes"])),
                #               min(j + int(env_vars["add_nodes"]), int(env_vars["max_cluster_size"])+1)):
                #    if k != j:
                #        self.my_logger.debug("TAKEDECISION connecting state "+ str(j) +" with state "+ str(k))
                #        stategraph.connect(str(k), str(j), eval(env_vars["trans_cost"], allmetrics))

        #Calculate the V matrix for available transitions
        V = {}

        for s in range(from_node, to_node + 1):
            # Get allowed transitions from this state.
            if self.memory[str(s)]['r'] != None:
                # For each state s, we need to calculate the transitions allowed.
                #allowed_transitions = stategraph.edges(head=str(s))
                #Vs = []
                #                    for t in allowed_transitions:
                # t[0] is the tail state of the edge (the next state)
                # No V from last run
                #if self.memory[t[0]]['V'] == None:
                #    self.memory[t[0]]['V'] = self.memory[t[0]]['r']

                #                    Vs.append(self.memory[t[0]]['r'])
                #                    self.my_logger.debug("TAKEDECISION tail state: "+ t[0] +" head state: "+
                #                                         t[1] +" V("+t[0]+") = "+ str(self.memory[t[0]]['V']))
                #                    self.my_logger.debug("TAKEDECISION transition cost from state:"+ str(t[1]) +" to state: "+ str(t[0]) +
                #                                         " is "+ str(stategraph.mu(t[1],t[0])))

                #                The original algo uses previous values of max reward (+ gamma * previous max), we don't
                #                if len(Vs) > 0:
                #                    V[s] = self.memory[str(s)]['r'] + float(self.utils.gamma) * max(Vs)
                #                else:
                #                    V[s] = self.memory[str(s)]['r']
                V[s] = self.memory[str(s)]['r']


        self.log.debug("TAKEDECISION Vs="+str(V))

        # Find the max V (the min state with the max value)
        max_gain = max(V.values())
        max_set = [key for key in V if V[key] == max_gain]
        self.log.debug("max set: "+str(max_set))
        self.nextState = min(max_set)
        self.log.debug("max(V): %d (GAIN=%d)" % (self.nextState, V[self.nextState]))

        #self.my_logger.debug("TAKEDECISION next state: "+ str(self.nextState))
        # Remember the V values calculated ???
        #for i in V.keys():
        #    self.memory[str(i)]['V'] = V[i]
        #    self.my_logger.debug("TAKEDECISION V("+ str(i) +") = "+ str(V[i]))

        #        vis = fuzz.visualization.VisManager.create_backend(stategraph)
        #        (vis_format, data) = vis.visualize()
        #
        #        with open("%s.%s" % ("states", vis_format), "wb") as fp:
        #            fp.write(data)
        #            fp.flush()
        #            fp.close()

        if self.nextState != self.currentState:
            self.log.debug("Decided to change state to_next: " + str(self.nextState) + " from_curr: " + str(self.currentState))
            # You've chosen to change state, that means that nextState has a greater reward, therefore d is always > 0
            current_reward = self.memory[str(self.currentState)]['r']
            d = self.memory[str(self.nextState)]['r'] - current_reward
            self.log.debug( "Difference is " + str(d) + " abs thres="+str(env_vars['decision_abs_threshold'])+" gte:"+str(float(d) < env_vars['decision_abs_threshold']))
            if (current_reward != 0 and (abs(float(d) / current_reward) < env_vars['decision_threshold']))\
                    or float(d) < env_vars['decision_abs_threshold']:
                #false alarm, stay where you are
                self.nextState = self.currentState
                # skip decision
                self.decision["action"] = "PASS"
                self.decision["count"] = 0
                self.log.debug("ups changed my mind...staying at state: " + str(self.currentState) +
                               " cause the gain difference is: " + str(abs(d)) +
                               " which is less than %d%% of the current reward, it's actually %f%%" % (int(100*env_vars['decision_threshold']) ,abs(float(d)*100) / (float(current_reward)+0.001)))
            else:
                self.log.debug("Difference "+ str(d) + " is greater than threshold ("+str(env_vars['decision_threshold'])+"). Keeping decision")
            # If the reward is the same with the state you're in, don't move
            # elif (d == 0):
            #     #false alarm, stay where you are
            #     self.nextState = self.currentState
            #     # skip decision
            #     self.decision["action"] = "PASS"
            #     self.decision["count"] = 0
            #     self.log.debug("ups changed my mind...staying at state: " + str(self.currentState) +
            #                          " cause the gain difference is: " + str(abs(d)) +
            #                          " which is less than 10% of the current reward "
            #                          + str(self.memory[str(self.currentState)]['r']))

        if self.nextState > self.currentState:
            self.decision["action"] = "ADD"
        elif self.nextState < self.currentState:
            self.decision["action"] = "REMOVE"

        self.decision["count"] = abs(int(self.currentState) - int(self.nextState))
        #self.log.debug("TAKEDECISION: action " + self.decision["action"] + " " + str(self.decision["count"]) +
        #               " nodes.")

        ## Don't perform the action if we're debugging/simulating!!!
        if self.debug:
            if self.pending_action is None and not self.decision["action"].startswith("PASS"):
                self.pending_action = self.decision['action']
                self.countdown = 2 * self.decision['count'] * 60 / env_vars['metric_fetch_interval']
                #self.currentState = str(self.nextState)
                self.log.debug("TAKEDECISION simulation, action will finish in: " + str(self.countdown) + " mins")
            else:
                self.log.debug("TAKEDECISION Waiting for action to finish: " + str(self.pending_action))

        return self.decision

    def simulate(self):
        """Feed the decision maker a synthetic sinusoidal load.

        Runs 900 iterations, sleeping one second after each. Every iteration
        fabricates a set of metrics (cpu, inlambda, throughput, latency) for
        the current cluster size and hands them to ``self.take_decision``.
        """
        self.log.debug("START SIMULATION!!")
        for i in range(900):
            cpu = max(5, 60 * abs(math.sin(0.05 * math.radians(i))) - int(self.currentState))

            # inlambda is the query arrival rate: a sine wave centred on 75000
            # with uniform noise of +/- 6000 on top.
            inlambda = 60000 * math.sin(0.04 * i) + 75000 + random.uniform(-6000, 6000)

            # What the current number of nodes can serve at most.
            maxThroughput = float(self.currentState) * float(env_vars["serv_throughput"])

            if inlambda > maxThroughput:
                # Overloaded: throughput drops below the maximum and latency
                # grows quadratically with the excess load.
                throughput = maxThroughput - 0.01 * inlambda
                latency = 0.00001 * (inlambda - maxThroughput) ** 2 + (0.0000004 * maxThroughput ** 2 + 200)  # msec
            else:
                # Everything that arrives is processed.
                throughput = inlambda
                latency = 0.0000004 * inlambda ** 2 + 200  # msec

            values = {'latency': latency, 'cpu': cpu, 'inlambda': inlambda, 'throughput': throughput,
                      'num_nodes': self.currentState}

            message = "SIMULATE i: " + str(i) + " state: " + str(self.currentState)
            message = message + " values:" + str(values)
            message = message + " maxThroughput: " + str(maxThroughput)
            self.log.debug(message)

            # No explicit addMeasurement call here: the original author's note
            # says it is probably unnecessary because take_decision also runs
            # in debug mode.
            self.take_decision(values)

            time.sleep(1)

    def simulate_training_set(self):
        """Replay the recorded training-set load through ``take_decision``.

        Switches the instance to debug mode, collects the first field of every
        measurement stored in ``self.memory`` for states 9..18, and feeds those
        values one by one (as ``inlambda``) to ``self.take_decision``.

        At most 240*12 values are replayed. States that are missing from
        ``self.memory`` or have no measurements yet (``arrayMeas`` is None)
        are skipped, and the replay stops when the collected load runs out
        instead of raising IndexError.
        """
        self.log.debug("START SIMULATION!!")
        self.debug = True

        load = []
        for k in range(9, 19):
            measurements = self.memory.get(str(k), {}).get('arrayMeas')
            if measurements is None:
                # Nothing recorded for this state.
                continue
            for j in measurements:
                load.append(j[0])

        # Never index past the measurements that were actually collected.
        steps = min(len(load), 240 * 12)
        for i in range(steps):
            values = {'inlambda': load[i], 'num_nodes': self.currentState}
            self.log.debug(
                "SIMULATE i: " + str(i) + " state: " + str(self.currentState) + " values:" + str(values))

            self.take_decision(values)
 def runWithoutWndchrm(self):
     """Run the Trainer and then the Predictor through their ``runWithoutWndchrm`` entry points.

     Both objects are created fresh with ``load=False, loadWndchrm=False``.
     """
     trainer = Trainer(load=False, loadWndchrm=False)
     trainer.runWithoutWndchrm()
     predictor = Predictor(load=False, loadWndchrm=False)
     predictor.runWithoutWndchrm()
 def run(self):
     """Run a fresh Trainer, then a fresh Predictor, and return the Predictor's result.

     Both objects are created with ``load=False, loadWndchrm=False``.
     """
     trainer = Trainer(load=False, loadWndchrm=False)
     trainer.run()
     predictor = Predictor(load=False, loadWndchrm=False)
     outcome = predictor.run()
     return outcome
    def run(self, k=3, useOnlyRF=True):
        """Pick a model by k-fold cross-validation and score it on a held-out split.

        The merged train/test data is shuffled and cut into ``k + 1`` splits.
        The last split is kept aside for the final score; the first ``k`` are
        used for cross-validation. The model with the best validation
        F-measure is evaluated on the held-out split via
        ``Predictor.finalResults``.

        k         -- number of cross-validation folds (must be at least 1).
        useOnlyRF -- when True the random forest itself is the model; when
                     False the forest is only used to select features
                     (importance > 0.0001) for a LinearSVC.

        Raises ValueError when no fold was evaluated (k < 1).
        """
        featureGetter = FeatureGetter()
        fileNameTrain = data_io.get_savez_name()
        fileNameTest = data_io.get_savez_name_test()
        print("Merging files...")
        (namesObservations, coordinates, dataset) = self.mergeFiles(fileNameTrain, fileNameTest)
        dataset = dataset[:, self.filterIndexes(len(dataset[0]))]
        print("Shuffling and splitting the data")
        indexesChanged = np.arange(len(dataset))
        np.random.shuffle(indexesChanged)
        # k folds for cross-validation plus one extra split for the final test.
        splittedNamesObs = self.getShuffledSplits(namesObservations, indexesChanged, k + 1)
        splittedCoords = self.getShuffledSplits(coordinates, indexesChanged, k + 1)
        splittedData = self.getShuffledSplits(dataset, indexesChanged, k + 1)

        # Leave the last split for testing
        testNamesObs = splittedNamesObs[k]
        testCoords = splittedCoords[k]
        testDataset = splittedData[k]

        splittedNamesObs = splittedNamesObs[:k]
        splittedCoords = splittedCoords[:k]
        splittedData = splittedData[:k]

        # Drop the unsplit copies before training to keep peak memory down.
        del dataset, coordinates, namesObservations, indexesChanged

        bestModel = None
        bestFilter = None  # feature columns the best model was fitted on (None = all)
        bestFmeasure = 0

        for i in range(k - 1, -1, -1):  # i is the index of the validation fold
            print("Doing cross-validation for i=%d" % i)
            namesObservationsValid = splittedNamesObs[i]
            coordinatesValid = splittedCoords[i]
            datasetValid = splittedData[i]
            namesObservationsValid = np.reshape(namesObservationsValid, namesObservationsValid.shape[0])
            namesObservationsTrain = self.getTrainData(splittedNamesObs, i)
            coordinatesTrain = self.getTrainData(splittedCoords, i)
            datasetTrain = self.getTrainData(splittedData, i)
            namesObservationsTrain = np.reshape(namesObservationsTrain, namesObservationsTrain.shape[0])
            print("Getting target vector")
            (indexes, target, obs) = featureGetter.getTargetVector(coordinatesTrain, namesObservationsTrain, datasetTrain)
            print("Selecting features")
            classifier = RandomForestClassifier(n_estimators=100, verbose=2, n_jobs=1, min_samples_split=1, random_state=1, compute_importances=True)
            model = Pipeline([('scaling', MinMaxScaler()), ('classifying', classifier)])
            model.fit(obs[indexes], target[indexes])

            filterImportances = None
            if not useOnlyRF:
                # Keep only the features the forest found useful and train a
                # linear SVM on them.
                importances = classifier.feature_importances_
                filterImportances = np.where(importances > 0.0001)[0]
                print(len(filterImportances))
                print("Training model")
                classifier = LinearSVC(verbose=1)
                model = Pipeline([('scaling', MinMaxScaler()), ('classifying', classifier)])
                model.fit(obs[indexes][:, filterImportances], target[indexes])

            print("Making predictions")
            if filterImportances is not None:
                predictions = model.predict(datasetValid[:, filterImportances])
            else:
                predictions = model.predict(datasetValid)
            predictions = predictions.reshape(len(predictions), 1)
            print("Calculating validation results")
            [_, _, _, _, _, fmeasure, _] = Predictor.finalResults(namesObservationsValid, predictions, coordinatesValid)

            # Always accept the first fold so a model exists even when every
            # fold scores an F-measure of 0.
            if bestModel is None or fmeasure > bestFmeasure:
                bestFmeasure = fmeasure
                bestModel = model
                bestFilter = filterImportances

            del datasetTrain, datasetValid
            del coordinatesTrain, coordinatesValid
            del namesObservationsTrain, namesObservationsValid

        if bestModel is None:
            raise ValueError("k must be at least 1 to select a model")

        print("Calculating final results")
        # The best model must see the same feature columns it was fitted on.
        if bestFilter is not None:
            predictions = bestModel.predict(testDataset[:, bestFilter])
        else:
            predictions = bestModel.predict(testDataset)
        # Same column-vector shape that is passed to finalResults during validation.
        predictions = predictions.reshape(len(predictions), 1)
        print("The final score is: ")
        testNamesObs = np.reshape(testNamesObs, testNamesObs.shape[0])
        Predictor.finalResults(testNamesObs, predictions, testCoords)
Exemplo n.º 42
0
def DoWork(source_file, m, k, fltr, trade_cost, testing_start_date, testing_end_date):
    """Back-test the nearest-neighbour weekly signal against a buy-and-hold figure.

    For every week in ``[p.start_split, p.end_split)`` the predictor produces
    a signal from the history available so far; the realised log return of
    that week is then credited to the long or short leg depending on the
    position held. A one-line summary is printed; nothing is returned.
    """
    p = Predictor(source_file, m, k, fltr, testing_start_date, testing_end_date)
    p.read_file_to_daily_data_by_weeks(0)
    p.calc_historical_weekly_return(p.daily_data_by_weeks)

    # Running totals
    current_state = 0       # position currently held: 1 long, -1 short, 0 none
    strategy_trade_count = 0
    in_market_count = 0     # number of weeks spent in a long position
    long_position_r = 0
    short_position_r = 0
    bh_return = 0
    strategy_weekly_return = []
    bh_weekly_return = []

    for week in range(p.start_split, p.end_split):

        # Signal for this week, computed from the history sliced at week - 1
        p.weekly_return_data = p.historical_weekly_return_data[:week - 1]
        neighbours = p.find_k_closest_histories()
        predicted = p.calc_next_week_return(neighbours)
        signal = p.signal(current_state, predicted)

        # Realised log return: last record of this week vs. last record of the
        # previous week (field 2 is the price, going by the original names).
        price_now = p.daily_data_by_weeks[week][-1][2]
        price_before = p.daily_data_by_weeks[week - 1][-1][2]
        actual_return = math.log(price_now) - math.log(price_before)

        # A change of position counts as one trade
        if current_state != signal:
            current_state = signal
            strategy_trade_count += 1

        if current_state == 1:
            # Long: feeds both the strategy's long leg and the buy-and-hold tally
            long_position_r += actual_return
            strategy_weekly_return.append(actual_return)
            in_market_count += 1
            bh_return += actual_return
            bh_weekly_return.append(actual_return)
        elif current_state == -1:
            # Short: the return is credited with its sign flipped
            signed_return = actual_return * current_state
            short_position_r += signed_return
            strategy_weekly_return.append(signed_return)

    # Excess return
    trade_cost_coef = math.log((1-trade_cost)/(1+trade_cost))
    strategy_return = long_position_r + short_position_r + strategy_trade_count * trade_cost_coef
    weeks_tested = float(p.end_split - p.start_split)
    bh_return = (in_market_count/weeks_tested) * bh_return + 2 * trade_cost_coef

    excess_return = strategy_return-bh_return
    sharpe = strategy_return/Stdev(strategy_weekly_return)
    print('Strategy return = {0} (long: {1} and short: {2}), bh return = {3}, excess return = {4}, sharpe = {5}'.format(strategy_return, long_position_r, short_position_r, bh_return, excess_return, sharpe))
Exemplo n.º 43
0
def defaultPredData(comicId):
    """Copy the predictor data template into the given comic's predictorInfo directory.

    A fresh Predictor is first asked to generate the template; the file
    ``predictorInfo/predictorData.txt`` under ``cacheLoc`` is then copied
    (metadata included, via ``shutil.copy2``) into
    ``predictorInfo/<comicId>/``.
    """
    predictor = Predictor()
    predictor.generatePredictorDataTemplate()
    targetDir = cacheLoc + "predictorInfo/" + str(comicId) + "/"
    templateFile = cacheLoc + "predictorInfo/predictorData.txt"
    shutil.copy2(templateFile, targetDir)
Exemplo n.º 44
0
    def __init__(self, cluster):
        """Set up the decision maker for ``cluster``.

        Creates the logger, initialises the state bookkeeping (current and
        next state, pending action, per-state memory for every cluster size
        between ``min_cluster_size`` and ``max_cluster_size``), makes sure the
        measurements file exists with its header line, and loads the training
        set through ``self.add_measurement``.

        cluster -- object exposing ``node_count()``; its value is the initial
                   state unless ``self.debug`` is switched on.

        Raises IOError/OSError if the measurements file cannot be opened for
        appending or the training file cannot be read.
        """
        # Create logger
        LOG_FILENAME = 'files/logs/Coordinator.log'
        self.log = get_logger('RLDecisionMaker', 'INFO', logfile=LOG_FILENAME)
        self.log.info("Using 'gain' : " + env_vars['gain'] +" with threshold of "+str( env_vars["decision_threshold"]*100) + "% and interval: " + str(env_vars['decision_interval']))
        self.log.info("Cluster Size from %d to %d nodes" % (env_vars['min_cluster_size'], env_vars['max_cluster_size']))

        self.debug = False
        if self.debug:
            self.currentState = 8
        else:
            self.currentState = cluster.node_count()
        self.cluster = cluster
        self.nextState = self.currentState
        self.waitForIt = env_vars['decision_interval'] / env_vars['metric_fetch_interval']
        self.pending_action = None
        self.decision = {"action": "PASS", "count": 0}

        # The policy for getting throughput and latency when computing the reward func.
        # average, centroid
        self.measurementsPolicy = 'centroid'
        self.prediction = env_vars['use_prediction']
        self.predictor = Predictor()

        # used only in simulation!!
        self.countdown = 0

        # A dictionary that will remember rewards and metrics in states previously visited
        self.memory = {}
        for i in range(env_vars["min_cluster_size"], env_vars["max_cluster_size"] + 1):
            self.memory[str(i)] = {'r': None, 'arrayMeas': None}

        # Load any previous statistics.
        self.measurementsFile = env_vars["measurements_file"]
        self.trainingFile = env_vars["training_file"]
        self.sumMetrics = {}

        # Initialize the measurements file. 'a+' creates it when missing; the
        # context manager closes it even if the write fails.
        with open(self.measurementsFile, 'a+') as meas:
            if os.stat(self.measurementsFile).st_size == 0:
                # The file is empty, set the headers for each column.
                meas.write('State\t\tLambda\t\tThroughput\t\tLatency\t\tCPU\t\tTime\n')

        # Load the training set. The file is only read, so it is opened
        # read-only; this also works when the file is not writable.
        with open(self.trainingFile, 'r') as meas:
            if os.stat(self.trainingFile).st_size != 0:
                # Read the training set measurements saved in the file.
                next(meas)  # Skip the first line with the headers of the columns
                for line in meas:
                    # Skip comments (used in training sets)
                    if not line.startswith('###'):
                        m = line.split('\t\t')
                        self.add_measurement(m)
    def run(self, k=3, patientSplit=True, useOnlyRF=True, breakin2=True):
        """Cross-validate the classifier and print overall and per-patient scores.

        k            -- number of folds when ``patientSplit`` is False. With
                        ``patientSplit`` it is overridden: 12, or 2 when
                        ``breakin2`` is also set.
        patientSplit -- split with ``self.getSplits`` instead of shuffled
                        splits.
        useOnlyRF    -- when True the random forest is the model; when False
                        it only selects features (importance > 0.0001) for a
                        LinearSVC.
        breakin2     -- with ``patientSplit``, regroup the splits through
                        ``self.getNewSplits``.

        Prints the accumulated TP/FP/FN counts with precision, recall and
        F-measure, followed by the same three scores for each of the 12
        per-patient slots. Returns nothing.
        """
        def _scores(tp, fp, fn):
            # Precision, recall and F-measure, with 0 for empty denominators.
            precision = 0 if tp + fp == 0 else (tp + 0.0) / (tp + fp + 0.0)
            recall = 0 if tp + fn == 0 else (tp + 0.0) / (tp + fn + 0.0)
            fmeasure = 0 if recall + precision == 0 else 2 * (precision * recall) / (recall + precision)
            return precision, recall, fmeasure

        featureGetter = FeatureGetter()
        overallTP = 0
        overallFP = 0
        overallFN = 0
        fileNameTrain = data_io.get_savez_name()
        fileNameTest = data_io.get_savez_name_test()
        print("Merging files...")
        (namesObservations, coordinates, dataset) = self.mergeFiles(fileNameTrain, fileNameTest)

        dataset = dataset[:, self.filterIndexes(len(dataset[0]))]
        print("Shuffling and splitting the data")
        indexesChanged = np.arange(len(dataset))
        np.random.shuffle(indexesChanged)
        if patientSplit:
            k = 12
            (splittedNamesObs, splittedCoords, splittedData) = self.getSplits(namesObservations, coordinates, dataset)
            if breakin2:
                k = 2
                (splittedNamesObs, splittedCoords, splittedData) = self.getNewSplits(splittedNamesObs, splittedCoords, splittedData)
        else:
            splittedNamesObs = self.getShuffledSplits(namesObservations, indexesChanged, k)
            splittedCoords = self.getShuffledSplits(coordinates, indexesChanged, k)
            splittedData = self.getShuffledSplits(dataset, indexesChanged, k)

        # Drop the unsplit copies before training to keep peak memory down.
        del dataset, coordinates, namesObservations, indexesChanged

        # Per-patient accumulators; finalResults is expected to return arrays
        # of this same length (12) -- TODO confirm against finalResults.
        overallArrayTP = np.zeros(12)
        overallArrayFP = np.zeros(12)
        overallArrayFN = np.zeros(12)

        for i in range(k - 1, -1, -1):  # i is the index of the validation fold
            print("Doing cross-validation for i=%d" % i)
            namesObservationsTest = splittedNamesObs[i]
            coordinatesTest = splittedCoords[i]
            datasetTest = splittedData[i]
            namesObservationsTest = np.reshape(namesObservationsTest, namesObservationsTest.shape[0])
            namesObservationsTrain = self.getTrainData(splittedNamesObs, i)
            coordinatesTrain = self.getTrainData(splittedCoords, i)
            datasetTrain = self.getTrainData(splittedData, i)
            namesObservationsTrain = np.reshape(namesObservationsTrain, namesObservationsTrain.shape[0])
            print("Getting target vector")

            (indexes, target, obs) = featureGetter.getTargetVector(coordinatesTrain, namesObservationsTrain, datasetTrain)

            print("Selecting features")
            classifier = RandomForestClassifier(n_estimators=100, verbose=2, n_jobs=1, min_samples_split=1, random_state=1, compute_importances=True)
            model = Pipeline([('scaling', MinMaxScaler()), ('classifying', classifier)])
            model.fit(obs[indexes], target[indexes])
            if not useOnlyRF:
                # Keep only the features the forest found useful and train a
                # linear SVM on them.
                importances = classifier.feature_importances_
                filterImportances = np.where(importances > 0.0001)[0]
                print(len(filterImportances))
                print("Training model")
                classifier = LinearSVC(verbose=1)
                model = Pipeline([('scaling', MinMaxScaler()), ('classifying', classifier)])
                model.fit(obs[indexes][:, filterImportances], target[indexes])
            print("Making predictions")
            if not useOnlyRF:
                predictions = model.predict(datasetTest[:, filterImportances])
            else:
                predictions = model.predict(datasetTest)
            predictions = predictions.reshape(len(predictions), 1)
            print("Calculating final results")
            [truePositives, falsePositives, falseNegatives, _, _, _, (arrayTP, arrayFP, arrayFN)] = Predictor.finalResults(namesObservationsTest, predictions, coordinatesTest)
            print(arrayTP)
            print(arrayFP)
            print(arrayFN)

            overallArrayTP += arrayTP
            overallArrayFP += arrayFP
            overallArrayFN += arrayFN
            overallTP += truePositives
            overallFP += falsePositives
            overallFN += falseNegatives

            del datasetTrain, datasetTest
            del coordinatesTrain, coordinatesTest
            del namesObservationsTrain, namesObservationsTest

        precision, recall, fmeasure = _scores(overallTP, overallFP, overallFN)

        print("Overall results for k=%d" % k)
        print(overallTP)
        print(overallFP)
        print(overallFN)
        print(precision)
        print(recall)
        print(fmeasure)

        for i in range(len(overallArrayTP)):
            # This header used to be a bare string expression and was never shown.
            print("Results for patient number %d:" % (i + 1))
            precision, recall, fmeasure = _scores(overallArrayTP[i], overallArrayFP[i], overallArrayFN[i])
            print(precision)
            print(recall)
            print(fmeasure)
Exemplo n.º 46
0
	def GetPredictions(logProbabilities):
		"""Return what a freshly built Predictor's getPredictions gives for logProbabilities."""
		return Predictor().getPredictions(logProbabilities)
Exemplo n.º 47
0
  sys.exit()
if sys.argv[1] == "load":
  usePickle = True
elif sys.argv[1] == "train":
  usePickle = False
else:
  print "Usage:", usage
  sys.exit()

#create classifier
if usePickle:
  print "Importing Classifier"
  p = pickle.load(open('predictor.pickle', 'r'))
else:
  print "Training Classifier"
  p = Predictor("spam", "ham")
  print "Saving Pickle"
  pickle.dump(p, open('predictor.pickle', 'w'))

if len(sys.argv) > 2:
  if sys.argv[2] == "test":
    testDev()
    testExternal()

  elif os.path.isdir(sys.argv[2]):
      # predict all files in folder
      for f in sorted_nicely(glob.glob(sys.argv[2]+'/*')):
          print f, ':', p.predict(f)
  elif os.path.isfile(sys.argv[2]):
      # predict this file
      print sys.argv[2], ':', p.predict(sys.argv[2])
Exemplo n.º 48
0
    elif input == 5:
        # Display machine learning results

        # Get query from user
        query = getQuery(True, True, True)

        print "\n"

        # Get results from GradCafe
        gradResults = GradCafe.getResults(query, False)

        QueryUtil.refineQuery(query)

        # Get results from GoHackers
        goResults = GoHackers.getResults(query, False)

        # Predict outcome
        doExperiment = False

        predictor = Predictor(gradResults, goResults)

        if doExperiment:
            predictor.runExperiment()
        else:
            predictor.predict()

        if not continueQuery():
            break
    else:
        break