def _get_training_data_y(self):
    data_getter = DataGetter()

    # Training targets
    config.DF_BASE_START_DATE = config.TRAINING_DATE_START
    config.DF_BASE_END_DATE = config.TRAINING_DATE_END
    df_result = data_getter.get_deltas()
    df_result = df_result[config.TRAINING_DATA_TARGET]
    y = df_result.values
    if self.X is None:
        raise Exception(
            'X needs to be defined before defining Y. Run _get_training_data_x before this method.'
        )
    y = y[0:self.X.shape[0]]
    y = self._one_hot_encode(y, 2)

    # Validation targets
    config.DF_BASE_START_DATE = config.VALIDATION_DATE_START
    config.DF_BASE_END_DATE = config.VALIDATION_DATE_END
    df_result = data_getter.get_deltas()
    df_result = df_result[config.TRAINING_DATA_TARGET]
    y_val = df_result.values
    if self.X_val is None:
        raise Exception(
            'X_val needs to be defined before defining y_val. Run _get_training_data_x before this method.'
        )
    y_val = y_val[0:self.X_val.shape[0]]
    y_val = self._one_hot_encode(y_val, 2)

    return y, y_val
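# The method above calls a self._one_hot_encode helper that is not shown in this
# excerpt. A minimal standalone sketch of what it might do, assuming integer class
# labels in the range [0, num_classes):
import numpy as np

def one_hot_encode(y, num_classes):
    # Map each integer label to a one-hot row vector, e.g. 1 -> [0, 1] when num_classes=2.
    y = np.asarray(y, dtype=int).ravel()
    encoded = np.zeros((y.shape[0], num_classes))
    encoded[np.arange(y.shape[0]), y] = 1
    return encoded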
def __init_model(self):
    self.train_manager = DataGetter(
        'D://nlp work//gwx//task//NER//dataset//CLUE-NER2020//train.json')
    '''
    In data_getter's bio_convert, labels and sentences are expanded into lists of equal
    length: sentences shorter than the longest one are padded with [0] so that all are
    aligned to the maximum sentence length. It is unclear whether this step is useful,
    but the original data_manager code includes it, and data_getter and data_manager
    should produce the same kind of output.
    '''
    # self.total_size = len(self.train_manager.sentence)
    data = {
        "batch_size": 20,
        # "input_size": self.train_manager.input_size,
        "vocab": self.train_manager.vocab,
        "tag_map": self.train_manager.get_tag2idx,
    }
    self.save_params(data)
    dev_manager = DataGetter(
        'D://nlp work//gwx//task//NER//dataset//CLUE-NER2020//train.json')
    # self.dev_batch = dev_manager.iteration()
    '''
    The iteration code has been ported into data_getter.
    '''
    self.model = BiLSTMCRF(
        tag_map=self.train_manager.tag_map,
        batch_size=self.batch_size,
        vocab_size=Config.max_len,
        dropout=self.dropout,
        embedding_dim=self.embedding_size,
        hidden_dim=self.hidden_size,
    )
    self.restore_model()
def _get_testing_data_x(self):
    data_getter = DataGetter()
    config.DF_BASE_START_DATE = config.TESTING_DATE_START
    config.DF_BASE_END_DATE = config.TESTING_DATE_END
    reshaped_data_lstm = data_getter.get_reshaped_data_for_lstm()
    self.X = reshaped_data_lstm
    return reshaped_data_lstm
def _get_testing_data_y(self):
    data_getter = DataGetter()
    df_result = data_getter.get_deltas()
    df_result = df_result[config.TRAINING_DATA_TARGET]
    y = df_result.values
    if self.X is None:
        raise Exception(
            'X needs to be defined before defining Y. Run _get_testing_data_x before this method.'
        )
    y = y[0:self.X.shape[0]]
    y = self._one_hot_encode(y, 2)
    return y
def _get_training_data_x(self):
    data_getter = DataGetter()

    config.DF_BASE_START_DATE = config.TRAINING_DATE_START
    config.DF_BASE_END_DATE = config.TRAINING_DATE_END
    X_train = data_getter.get_reshaped_data_for_lstm()
    self.X = X_train

    config.DF_BASE_START_DATE = config.VALIDATION_DATE_START
    config.DF_BASE_END_DATE = config.VALIDATION_DATE_END
    X_val = data_getter.get_reshaped_data_for_lstm()
    self.X_val = X_val

    return X_train, X_val
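# Hedged sketch of the call order implied by the guard clauses in the data methods
# above: X must be populated before y. The model object name here is hypothetical.
# model = LSTMPredictor()
# X_train, X_val = model._get_training_data_x()
# y_train, y_val = model._get_training_data_y()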
def can(roles, kwargs):
    event_id = kwargs.get('event_id', None)
    if event_id:
        for el in DataGetter.get_user_events_roles(event_id):
            if el.role in roles:
                return True
    return False
def __init_model(self):
    self.train_manager = DataGetter(
        'D://nlp work//gwx//task//NER//dataset//CLUE-NER2020//train.json')
    # self.total_size = len(self.train_manager.sentence)
    data = {
        "batch_size": 20,
        # "input_size": self.train_manager.input_size,
        "vocab": self.train_manager.vocab,
        "tag_map": self.train_manager.get_tag2idx,
    }
    self.save_params(data)
    dev_manager = DataGetter(
        'D://nlp work//gwx//task//NER//dataset//CLUE-NER2020//train.json')
    # self.dev_batch = dev_manager.iteration()
    self.model = BiLSTMCRF(
        tag_map=self.train_manager.tag_map,
        batch_size=self.batch_size,
        vocab_size=Config.max_len,
        dropout=self.dropout,
        embedding_dim=self.embedding_size,
        hidden_dim=self.hidden_size,
    )
    self.restore_model()
from data_getter import DataGetter

D = DataGetter()
D_name = D.get_name()
D_number = D.get_number()


class DuichengWindow():
    def __init__(self):
        self.L1_name = D_name[0]
        self.L2_name = D_name[1]
        self.L3_name = D_name[2]
        self.L4_name = D_name[3]
        self.L5_name = D_name[4]
        self.L = D_number[0]
        self.W = D_number[1]
        self.G = D_number[2]
        self.num1 = D_number[3]
        self.num2 = D_number[4]
        self.L_a = D_number[5]
        self.L_b = self.L - 2 * self.G - self.L_a
        self.W_a1 = (self.W - 2 * self.G) // self.num1
        self.W_a2 = (self.W - 2 * self.G) // self.num2

    def SumWindow(self):
        if self.num2 == 1:
            L1 = 2 * ((self.L - self.G * 2) + (self.W - self.G * 2))
            print('Window frame {}: {}'.format(self.L1_name, L1))
            L2 = self.num1 * (2 * (self.L_a + self.W_a1))
            print('Window sash {}: {}'.format(self.L2_name, L2))
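# A hedged worked example of the SumWindow arithmetic above, using hypothetical
# dimensions (L=2000, W=1500, G=50, num1=2, num2=1, L_a=900):
#   frame perimeter  L1 = 2 * ((2000 - 100) + (1500 - 100)) = 6600
#   sash total       L2 = 2 * (2 * (900 + (1500 - 100) // 2)) = 6400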
# Standard-library / PyTorch imports; project-level names (DataGetter, BiLSTMCRF,
# Config, CustomData, f1_score) are assumed to be provided elsewhere in the repository.
import pickle

import torch
from torch import optim


class Train(object):
    def __init__(self, entry="train"):
        self.load_config()
        self.__init_model()

    def __init_model(self):
        self.train_manager = DataGetter(
            'D://nlp work//gwx//task//NER//dataset//CLUE-NER2020//train.json')
        '''
        In data_getter's bio_convert, labels and sentences are expanded into lists of
        equal length: sentences shorter than the longest one are padded with [0] so
        that all are aligned to the maximum sentence length. It is unclear whether
        this step is useful, but the original data_manager code includes it, and
        data_getter and data_manager should produce the same kind of output.
        '''
        self.total_size = len(self.train_manager.sentence)
        self.tags = self.train_manager.label
        data = {
            "batch_size": 20,
            # "input_size": self.train_manager.input_size,
            "vocab": self.train_manager.vocab,
            "tag_map": self.train_manager.get_tag2idx,
        }
        self.save_params(data)
        dev_manager = DataGetter(
            'D://nlp work//gwx//task//NER//dataset//CLUE-NER2020//train.json')
        # self.dev_batch = dev_manager.iteration()
        '''
        The iteration code has been ported into data_getter.
        '''
        self.model = BiLSTMCRF(
            tag_map=self.train_manager.tag_map,
            batch_size=self.batch_size,
            vocab_size=Config.max_len,
            dropout=self.dropout,
            embedding_dim=self.embedding_size,
            hidden_dim=self.hidden_size,
        )
        self.restore_model()

    def load_config(self):
        # Default hyper-parameters, used when config.py cannot be read.
        config = {
            "embedding_size": 3,
            "hidden_size": 128,
            "batch_size": 20,
            "dropout": 0.5,
        }
        try:
            fopen = open("config.py", 'r', encoding='UTF-8')
            fopen.close()
        except Exception as error:
            print("Load config failed, using default config {}".format(error))
        self.embedding_size = config.get("embedding_size")
        self.hidden_size = config.get("hidden_size")
        self.batch_size = config.get("batch_size")
        self.model_path = "models/"
        self.dropout = config.get("dropout")

    def restore_model(self):
        try:
            self.model.load_state_dict(
                torch.load(self.model_path + "params.pkl"))
            print("model restore success!")
        except Exception as error:
            print("model restore failed! {}".format(error))

    def save_params(self, data):
        with open("models/data.pkl", "wb") as fopen:
            pickle.dump(data, fopen)

    def load_params(self):
        with open("models/data.pkl", "rb") as fopen:
            data_map = pickle.load(fopen)
        return data_map

    def train(self):
        optimizer = optim.Adam(self.model.parameters())
        # optimizer = optim.SGD(ner_model.parameters(), lr=0.01)
        for epoch in range(1):
            index = 0
            for batch in self.train_manager.get_batch():
                index += 1
                self.model.zero_grad()
                train_labels, train_sentences = self.train_manager.bio_converter()
                input_tensor = CustomData.__getitem__(index)
                # Convert the padded sentences and labels to tensors.
                sentences_tensor = torch.tensor(train_sentences, dtype=torch.long)
                tags_tensor = torch.tensor(train_labels, dtype=torch.long)
                length_tensor = torch.tensor(len(self.train_manager.sentence),
                                             dtype=torch.long)
                loss = self.model.neg_log_likelihood(sentences_tensor, tags_tensor,
                                                     length_tensor)
                progress = ("█" * int(index * 25 / self.total_size)).ljust(25)
                print("""epoch [{}] |{}| {}/{}\n\tloss {:.2f}""".format(
                    epoch, progress, index, self.total_size,
                    loss.cpu().tolist()[0]))
                self.evaluate(input_tensor)
                print("-" * 50)
                loss.backward()
                optimizer.step()
                torch.save(self.model.state_dict(),
                           self.model_path + 'params.pkl')

    def evaluate(self, input_tensor):
        train_labels, train_sentences = self.train_manager.bio_converter()
        length = len(self.train_manager.sentence)
        _, paths = self.model(input_tensor, train_sentences)
        print("\teval")
        for label in self.tags:
            f1_score(train_labels, paths, label, self.model.tag_map)
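# A minimal sketch of the padding step described in the docstring above (hypothetical
# helper name; assumes sentences and labels are lists of integer-index lists):
def pad_to_longest(sentences, labels, pad_value=0):
    # Pad every sentence/label pair with pad_value up to the longest sentence length,
    # so all sequences can be stacked into tensors of the same shape.
    max_len = max(len(s) for s in sentences)
    padded_sentences = [s + [pad_value] * (max_len - len(s)) for s in sentences]
    padded_labels = [l + [pad_value] * (max_len - len(l)) for l in labels]
    return padded_sentences, padded_labels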
def runBacktestOffline(self, instrument, granularity, startep, endep):
    dg = DataGetter(MSSQLGetter(OandaGetter(instrument, granularity)))
    ti = BackTestTicker(dg, startep, endep)
    self._run(ti)
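# Illustrative call of the method above; the instrument, granularity and epoch
# bounds below are hypothetical values:
# backtester.runBacktestOffline("USD_JPY", "M5", startep=1546300800, endep=1548979200)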
""" This module builds and executes the mailer system. """ import os import sys import csv import requests from mailer import Mailer from data_getter import DataGetter from parameters import (MAIL_SENDER, DATABASE_LOCATION, TARGET_DATABASE, TEMPLATE_FILE, ATTACHMENTS_FOLDER, MAIL_COLUMN_NAME) mailer = Mailer() getter = DataGetter(DATABASE_LOCATION) def get_mail_title(): """Gets the title for the mail.""" trailing = sys.argv[1:] return 'Sample Diffusion' if not trailing else " ".join(trailing) def generate_message(template, subject, attachments, **kwargs): """Generates the adequate message depending on attachments.""" text = template.format(**kwargs) if not attachments: return mailer.create_text_message(MAIL_SENDER, kwargs[MAIL_COLUMN_NAME], subject, text) return mailer.create_attachments_message(MAIL_SENDER,
def __init__(self, hostid):
    self.hostid = hostid
    self.load_items()
    self.dgetter = DataGetter()
    self.set_anom()
    self.alarm_on = False
class ZabbixAnomaly(object):
    items = {}
    anom = None
    datum = None
    hostid = ""
    dgetter = None
    alarm_on = False

    def __init__(self, hostid):
        self.hostid = hostid
        self.load_items()
        self.dgetter = DataGetter()
        self.set_anom()
        self.alarm_on = False

    def set_anom(self):
        host = confget("jubatus_server", "host")
        port = int(confget("jubatus_server", "port"))
        name = confget("jubatus_server", "name")
        self.anom = client.Anomaly(host, port, name)

    def load_items(self):
        strsql = "select itemid, valuesize from target_items where hostid = '%s' and enabled = 1;" % (self.hostid)
        data = self.exec_selsql(strsql)
        for row in data:
            itemid = str(row[0])
            valuesize = row[1]
            self.items[itemid] = Item(itemid, valuesize)

    def norm(self, itemid, value):
        return self.items[str(itemid)].norm_value(value)

    def learn(self, endclock):
        self._run_anomaly(endclock, "add")

    def detect(self, endclock):
        self._run_anomaly(endclock, "calc")

    # method="add"|"calc"
    def _run_anomaly(self, endclock, method="add"):
        # Prepare data
        if method == "add":
            g_data = self.dgetter.g_get_history(endclock)
        if method == "calc":
            g_data = self.dgetter.g_copy_history(endclock)

        datadict = {}
        for row in g_data:
            (itemid, clock, value) = row
            itemid = str(itemid)
            # A repeated itemid marks the start of the next sample: flush the current one.
            if datadict.has_key(itemid):
                self._juba_proc(clock, datadict, method)
                datadict = {}
            if not datadict.has_key("hostid"):
                (hour, weekday) = self.expand_clock(clock)
                datadict["hostid"] = self.hostid
                datadict["weekday"] = weekday * 1.0 / 7
                datadict["hour"] = hour * 1.0 / 24
            datadict[itemid] = value
        if len(datadict) > 0:
            self._juba_proc(clock, datadict, method)

        if method == "add":
            cf.log("Saving learned model")
            self.anom.save("latest")
            self.dgetter.remove_history(endclock)

    def _juba_proc(self, clock, datadict, method="add"):
        #if DEBUG:
        #    print datadict
        datum = Datum()
        for k in datadict.keys():
            #print "key:%s value:%s" % (str(k), str(datadict[k]))
            if k == "hostid":
                datum.add_number(str(k), int(datadict[k]) * 1.0 / ZBX_ITEMID_DIGITS)
            elif k == "weekday" or k == "hour":
                datum.add_number(str(k), datadict[k])
            else:
                datum.add_number(str(k), self.norm(k, datadict[k]))
        #print datum

        retry_count = JUBA_RETRY_MAX
        while True:
            try:
                if method == "add":
                    print datum
                    ret = self.anom.add(datum)
                    exit()
                if method == "calc":
                    print datum
                    score = self.anom.calc_score(datum)
                    if score == float('Inf') or score > ML_LIMIT:
                        #print datadict
                        if self.alarm_on == False:
                            self.alarm_on = True
                            cf.log("[%s] score=%f" % (cf.clock2strjst(clock), score))
                    else:
                        if self.alarm_on == True:
                            self.alarm_on = False
                            cf.log("[%s] score recovered to normal:score=%f" % (cf.clock2strjst(clock), score))
                break
            except (msgpackrpc.error.TransportError, msgpackrpc.error.TimeoutError) as e:
                retry_count -= 1
                if retry_count <= 0:
                    raise
                self.anom.get_client().close()
                self.set_anom()
                print e
                time.sleep(JUBA_RETRY_INTERVAL)
                continue

    def expand_clock(self, clock):
        d = cf.clock2datetime(clock)
        hour = d.hour
        weekday = d.weekday()
        return (hour, weekday)

    def exec_selsql(self, strsql):
        return sf.exec_selsql(MYSQL_JUBATUS, strsql)
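# A hedged usage sketch of the class above; the host id and end clock are illustrative only.
import time

detector = ZabbixAnomaly("10084")   # hypothetical Zabbix hostid
endclock = int(time.time())
detector.learn(endclock)            # feed collected history into Jubatus via anom.add
detector.detect(endclock)           # re-score the copied history via anom.calc_score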