def list_all_containers():
    port = START_PORT
    for i in range(NUM_WORKER_NODES):
        logger.info("Containers on Node co_node_{}: ".format(port))
        (out, err) = CLIUtils.run(LIST_POD_CONTAINERS_CMD.format(port))
        logger.info('list of containers: %s', out)
        port = port + 1
def check_label_distribution(self, input_y):
    _label_distribution = np.sum(np.array(input_y), 0)
    empty_class_ = [
        i for i in range(_label_distribution.shape[0])
        if _label_distribution[i] == 0
    ]  # classes with zero samples
    self.kurtosis = stats.kurtosis(_label_distribution)
    self.normal_std = np.std(_label_distribution) / np.sum(_label_distribution)
    logger.info("check input_y kurtosis {}".format(self.kurtosis))
    logger.info("check input_y class: {} and normal_std is {}".format(
        empty_class_, self.normal_std))
    if len(empty_class_) == 0:
        # No empty label, all labels covered!
        self.shuffle = False
    else:
        self.shuffle = True
    if self.normal_std > 0.3:
        # fixme: severely imbalanced classes
        self.imbalance_level = 2
    elif self.normal_std > 0.07:
        # fixme: imbalanced classes
        self.imbalance_level = 1
    else:
        # fixme: balanced classes
        self.imbalance_level = 0
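# For illustration only: a minimal standalone sketch of the same imbalance
# heuristic as the method above, using just numpy/scipy. The `label_stats`
# helper name and the tiny example labels are assumptions, not project code;
# the thresholds (0.3 / 0.07) mirror check_label_distribution.
import numpy as np
from scipy import stats

def label_stats(input_y):
    dist = np.sum(np.array(input_y), 0)           # per-class sample counts
    empty = [i for i in range(dist.shape[0]) if dist[i] == 0]
    kurtosis = stats.kurtosis(dist)
    normal_std = np.std(dist) / np.sum(dist)      # spread normalized by total count
    if normal_std > 0.3:
        level = 2                                 # severely imbalanced
    elif normal_std > 0.07:
        level = 1                                 # imbalanced
    else:
        level = 0                                 # balanced
    return empty, kurtosis, normal_std, level

# Example: 3-class one-hot labels where class 2 never appears.
y = np.array([[1, 0, 0], [1, 0, 0], [0, 1, 0], [1, 0, 0]])
print(label_stats(y))  # -> ([2], -1.5, ~0.31, 2)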
def __init__(self, metadata):
    """
    Args:
        metadata: an AutoDLMetadata object. Its definition can be found in
            AutoDL_ingestion_program/dataset.py
    """
    self.done_training = False
    self.metadata = metadata
    self.domain = infer_domain(metadata)
    train_data_dir = self.metadata.get_dataset_name() + "/*"
    test_data_dir = self.metadata.get_dataset_name().replace("train", "test") + "/*"
    test_metadata_filename = self.metadata.get_dataset_name().replace("train", "test") + "/metadata.textproto"
    warmup_train_cmd = "cat {} >/dev/null".format(train_data_dir)
    warmup_test_cmd = "cat {} >/dev/null".format(test_data_dir)
    # logger.info("Note: test_metadata_filename={}, cmd={},{} AutoDL_G_CONF: {}".format(test_metadata_filename, warmup_train_cmd, warmup_test_cmd, autodl_g_conf_repr))
    logger.info(
        "Note: test_metadata_filename={}, cmd={},{}".format(
            test_metadata_filename, warmup_train_cmd, warmup_test_cmd
        )
    )
    os.system(warmup_train_cmd)
    os.system(warmup_test_cmd)
    logger.info("Note:The inferred domain of current dataset is: {}.".format(self.domain))
    # Infer the domain and initialize the model.
    # DomainModel = DOMAIN_TO_MODEL[self.domain]
    DomainModel = meta_domain_2_model(self.domain)
    # self.domain_metadata = get_domain_metadata(metadata, self.domain)
    # logger.info("Note:The domain metadata is {}".format(self.domain_metadata))
    self.domain_model = DomainModel(self.metadata)
    self.has_exception = False
    self.y_pred_last = None
def start(self, seconds):
    if self._context.sensor.is_high():
        self._blink_handler.cancel()
        automationhat.light.power.off()
        self._context.transition_to(RunningState(self._context, seconds))
    else:
        logger.info('The water tank is empty!')
def test_logger_info(self, mock_log):
    obj = object()
    logger.info('test', request=obj)
    self.assertEqual(mock_log.call_args, (
        ('[{0}] test'.format(id(obj)), ),
        {'extra': {}},
    ))
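# A guess at the behaviour the test above asserts: a logging.LoggerAdapter that
# prefixes the message with "[id(request)]" and forwards remaining keyword
# arguments via `extra`. This sketch is an assumption, not the project's actual
# logger implementation; class and variable names here are made up.
import logging

class RequestLoggerAdapter(logging.LoggerAdapter):
    def process(self, msg, kwargs):
        request = kwargs.pop('request', None)
        if request is not None:
            msg = '[{0}] {1}'.format(id(request), msg)
        # Everything left over travels in `extra`, as the test expects.
        kwargs = {'extra': kwargs.pop('extra', {})}
        return msg, kwargs

logging.basicConfig(level=logging.INFO)
request_logger = RequestLoggerAdapter(logging.getLogger(__name__), {})
request_logger.info('test', request=object())  # logs e.g. "[1403...] test"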
def decide_first_num(self):
    snoop_data_num = min(0.01 * self.train_num, FIRST_SNOOP_DATA_NUM)  # take at most 700 samples the first time
    snoop_X, snoop_Y = self.tf_dataset_trainsformer.get_nlp_train_dataset(snoop_data_num)
    label_coverage, normal_std = self.check_label_coverage(snoop_Y)
    self.check_input_length(snoop_X[:FIRST_SNOOP_DATA_NUM])
    logger.info("label_coverage is {}".format(label_coverage))
    if normal_std > 0.3:
        dataset_read_num = min(5000, int(0.1 * self.train_num))
    else:
        if self.class_num == 2 and self.train_num <= 50000:
            if label_coverage == 1.0:
                # all classes covered
                dataset_read_num = max(int(0.01 * self.train_num), 500)
                # lower bound for small datasets
                if self.train_num <= 10000:
                    dataset_read_num = min(5000, self.domain_metadata["class_num"] * 3000)
            else:
                # some classes missing in the snoop sample; data may arrive in class order
                dataset_read_num = min(5000, int(0.1 * self.train_num))
        elif self.class_num == 2 and self.train_num > 50000:
            if label_coverage == 1.0:
                # all classes covered
                # for datasets up to 100k take 1%, beyond 100k cap at 1000
                dataset_read_num = min(int(0.01 * self.train_num), 1000)
            else:
                # some classes missing in the snoop sample; data may arrive in class order
                dataset_read_num = min(5000, int(0.1 * self.train_num))
        ########################### multi-class ######################################
        elif self.class_num > 2 and self.train_num <= 50000:
            if label_coverage == 1.0:
                # all classes covered
                dataset_read_num = min(int((2 / self.class_num) * self.train_num), 1000)
                # lower bound for small datasets
                if self.train_num <= 10000:
                    dataset_read_num = min(5000, self.domain_metadata["class_num"] * 3000)
            else:
                dataset_read_num = min(5000, int(0.1 * self.train_num))
        elif self.class_num > 2 and self.train_num > 50000:
            if label_coverage == 1.0:
                # all classes covered
                # for datasets up to 100k take 1%, beyond 100k cap at 1500
                dataset_read_num = min(int((2 / self.class_num) * self.train_num), 1500)
            else:
                # some classes missing in the snoop sample; data may arrive in class order
                dataset_read_num = min(5000, int(0.1 * self.train_num))
        ########################### multi-class ######################################
    if self.domain_metadata["language"] == "ZH" and self.check_len <= 40:
        dataset_read_num += min(2000, 0.1 * self.train_num)
    X, Y = self.tf_dataset_trainsformer.get_nlp_train_dataset(dataset_read_num)
    X = X + snoop_X
    Y = np.concatenate([Y, snoop_Y], axis=0)
    return dataset_read_num, X, Y
def make_get_request(self, router_name: str):
    url = f'{self.base_url}{router_name}'
    try:
        logger.info(f'request {url}')
        output = urllib.request.urlopen(url).read().decode()
        logger.debug(f'output: {output}')
        objs = json.loads(output)
        return objs
    except Exception as ex:
        logger.exception(f'error on {url}', exc_info=ex)
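# A minimal usage sketch of the function above. Assumptions: the function lives
# at module level next to `logger`, `urllib.request`, and `json` imports; the
# service URL and the 'routers' endpoint below are made up for illustration.
from types import SimpleNamespace

client = SimpleNamespace(base_url='http://localhost:8080/')
routers = make_get_request(client, 'routers')  # GET http://localhost:8080/routers
if routers is not None:
    logger.info(f'got {len(routers)} objects from routers')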
def initialize_model(model):
    for pr_name, p in model.named_parameters():
        if 'albert_embeddings' in pr_name:
            p.requires_grad = False
            # p.data.uniform_(-opt.param_init, opt.param_init)
        elif 'rezero_alpha' in pr_name:
            logger.info('{} is rezero param'.format(pr_name))
            nn.init.zeros_(p)
        else:
            if p.dim() == 1:
                # p.data.zero_()
                p.data.normal_(0, math.sqrt(6 / (1 + p.size(0))))
            else:
                nn.init.xavier_normal_(p, math.sqrt(3))
        logger.info("{}: requires_grad {}".format(pr_name, p.requires_grad))
def __process_instances_for_albert(instances: List[SquadInstance], albert_tokenizer: AlbertTokenizer):
    new_instances = []
    for instance in instances:
        src_limit = max_seq_len - 2
        if len(instance.src) > max_seq_len - 2:
            logger.info("src={} exceeds {}".format(len(instance.src), max_seq_len - 2))
        src = [albert_tokenizer.bos_token] + instance.src[:src_limit] + [albert_tokenizer.eos_token]
        ans = instance.ans + [albert_tokenizer.eos_token]
        tgt = [albert_tokenizer.bos_token] + instance.tgt + [albert_tokenizer.eos_token]
        bio = ['O'] + instance.bio[:src_limit] + ['O']
        ner = [PAD] + instance.ner[:src_limit] + [PAD]
        case = [PAD] + instance.case[:src_limit] + [PAD]
        pos = [PAD] + instance.pos[:src_limit] + [PAD]
        new_instances.append(SquadInstance(src=src, tgt=tgt, bio=bio, case=case, ner=ner, pos=pos, ans=ans))
    return new_instances
def train(self, dataset, remaining_time_budget=None):
    """Train method of domain-specific model."""
    # Convert training dataset to necessary format and
    # store as self.domain_dataset_train
    logger.info("Note:train_process model.py starts train")
    try:
        # Train the model
        self.domain_model.train(dataset, remaining_time_budget)
        # Update self.done_training
        self.done_training = self.domain_model.done_training
    except Exception as exp:
        self.has_exception = True
        self.done_training = True
        logger.error("Error, model_train exp={}, done_training={}".format(exp, self.done_training))
def train(self, dataset, remaining_time_budget=None):
    """Train method of domain-specific model."""
    # Convert training dataset to necessary format and
    # store as self.domain_dataset_train
    logger.info("Note: speech_train_process model.py starts train")
    as_timer("train_start")
    # load tf_train_dataset for first time.
    self.tf_dataset_trainsformer.init_train_tfds(dataset, self.train_num)
    if self.domain in ["speech"]:
        # Train the model with light model.
        if self.main_train_loop_num < speech_ms_mlp_conf.lightwei_train_end_loop:
            # fixme: need to be autotuned.
            ds_take_size = min(
                int(self.train_num * speech_ds_tds_conf.sample_ratio[self.main_train_loop_num]),
                self.class_num * 50)
            # self.domain_dataset_train = self.tf_dataset_trainsformer.get_speech_train_dataset(ds_take_size)
            # self.domain_model.train(self.domain_dataset_train, remaining_time_budget=remaining_time_budget)
            self.domain_model.train(
                self.tf_dataset_trainsformer.get_speech_train_dataset(ds_take_size),
                remaining_time_budget=remaining_time_budget)
            logger.info(
                "Note: domain={}, main_train_loop_num={}, light_model train finished.".format(
                    self.domain, self.main_train_loop_num
                )
            )
            as_timer("speech_model_basic_train")
        if self.main_train_loop_num >= speech_ms_mlp_conf.midwei_train_start_loop:
            self.speech_widsom_model.train(
                # (self.domain_dataset_train["x"], self.domain_dataset_train["y"]), remaining_time_budget
                self.tf_dataset_trainsformer.get_speech_train_dataset_full(),
                remaining_time_budget
            )
            logger.info("Note: start wisdom at np, main_train_loop_num={}".format(
                self.main_train_loop_num))
            as_timer("speech_tr34_train")
        logger.info("Note:time_train model.py domain_model train finished.")
        # Update self.done_training
        self.done_training = self.domain_model.done_training
        self.main_train_loop_num += 1
        # print(as_timer)
        as_timer("train_end")
        logger.info(as_timer)
    else:
        logger.error("Note: Domain is not Speech!")
def test(self, dataset, remaining_time_budget=None):
    """Test method of domain-specific model."""
    # Convert test dataset to necessary format and
    # store as self.domain_dataset_test
    # self.set_domain_dataset(dataset, is_training=False)
    as_timer("test_start")
    # init tf_test_dataset for the first time.
    self.tf_dataset_trainsformer.init_test_tfds(dataset)
    self.domain_dataset_test, self.X_test = self.tf_dataset_trainsformer.get_speech_test_dataset()
    # As the original metadata doesn't contain the number of test examples, we
    # need to add this information.
    if self.domain in ["text", "speech"] and (not self.domain_metadata["test_num"] >= 0):
        self.domain_metadata["test_num"] = len(self.X_test)
    logger.info("Note:test_process test domain metadata is {}".format(self.domain_metadata))
    # Make predictions
    if self.domain in ["speech"]:
        if (
            self.main_train_loop_num
            <= speech_ms_mlp_conf.midwei_train_start_loop + speech_ms_mlp_conf.midwei_predict_block_loop
        ):
            Y_pred = self.domain_model.test(self.domain_dataset_test,
                                            remaining_time_budget=remaining_time_budget)
            logger.info(
                "Note: speech pasa_model, speech_main_train_loop={}, speech_main_test_loop={}".format(
                    self.main_train_loop_num, self.main_test_loop_num
                )
            )
            # Update self.done_training
            self.done_training = self.domain_model.done_training
        else:
            Y_pred = self.speech_widsom_model.test(
                self.domain_dataset_test, remaining_time_budget=remaining_time_budget
            )
            logger.info(
                "Note: speech dw_model, train_loop={}, test_loop={}".format(
                    self.main_train_loop_num, self.main_test_loop_num)
            )
            # Update self.done_training
            self.done_training = self.speech_widsom_model.done_training
        as_timer("test_end")
        logger.info(as_timer)
    else:
        logger.error("Note: Domain is not Speech!")
    self.main_test_loop_num += 1
    return Y_pred
def test(self, dataset, remaining_time_budget=None):
    """Test method of domain-specific model."""
    # Convert test dataset to necessary format and
    # store as self.domain_dataset_test
    start = time.time()
    self.tf_dataset_trainsformer.init_test_tfds(dataset)
    end = time.time()
    self.domain_model.time_record["init_test_tfds"] = end - start
    self.set_domain_dataset(dataset, is_training=False)
    # As the original metadata doesn't contain the number of test examples, we
    # need to add this information.
    if self.domain in ['text', 'speech'] and \
            (not self.domain_metadata['test_num'] >= 0):
        self.domain_metadata['test_num'] = len(self.X_test)
    logger.info("Note:test_process test domain metadata is {}".format(
        self.domain_metadata))
    # Make predictions
    logger.info("call num is {}".format(self.call_num))
    if self.call_num == -1:
        # Y_pred = self.domain_model.test_first_svm(self.domain_dataset_test,
        #                                           remaining_time_budget=remaining_time_budget)
        Y_pred = self.domain_model.test(
            self.domain_dataset_test,
            remaining_time_budget=remaining_time_budget)
        self.call_num += 1
    else:
        Y_pred = self.domain_model.test(
            self.domain_dataset_test,
            remaining_time_budget=remaining_time_budget)
    if "test_num" not in self.domain_model.feature_dict:
        self.domain_model.feature_dict["test_num"] = self.domain_metadata['test_num']
    # Update self.done_training
    self.done_training = self.domain_model.done_training
    return Y_pred
def __init__(self, metadata):
    """
    Args:
        metadata: an AutoDLMetadata object. Its definition can be found in
            AutoDL_ingestion_program/dataset.py
    """
    self.done_training = False
    self.metadata = metadata
    self.first_round_sample_maxnum = 200
    self.call_num = -1  # 0
    self.domain_dataset_train_dict = {"x": [], "y": np.array([])}
    # self.domain = infer_domain(metadata)
    self.domain = "text"
    logger.info("Note:The AutoDL_G_CONF: {}".format(autodl_g_conf_repr))
    logger.info("Note:The inferred domain of current dataset is: {}."
                .format(self.domain))
    # Infer the domain and initialize the model.
    # DomainModel = DOMAIN_TO_MODEL[self.domain]
    DomainModel = meta_domain_2_model(self.domain)
    self.domain_metadata = get_domain_metadata(metadata, self.domain)
    self.class_num = self.domain_metadata["class_num"]
    self.train_num = self.domain_metadata["train_num"]
    logger.info("Note:The domain metadata is {}".format(self.domain_metadata))
    self.domain_model = DomainModel(self.domain_metadata)
    # init for nlp
    self.nlp_index_to_token = None
    self.nlp_sep = None
    self.init_nlp()
    self.domain_model.vocab = self.vocabulary
    self.shuffle = False
    self.check_len = 0
    self.imbalance_level = -1
    # for tf_dataset.
    self.tf_dataset_trainsformer = TfDatasetTransformer(
        if_train_shuffle=False, config=config)
    self.tf_dataset_trainsformer.init_nlp_data(self.nlp_index_to_token, self.nlp_sep)
    self.time_record = {}
    self.seq_len = []
    self.first_round_X = []
    self.first_round_Y = np.array([])
    self.X_test_raw = None
def train(self, dataset, remaining_time_budget=None):
    """Train method of domain-specific model."""
    # Convert training dataset to necessary format and
    # store as self.domain_dataset_train
    logger.info("Note:train_process model.py starts train")
    # if self.call_num == 0:
    #     dataset = dataset.shuffle(min(1000, self.train_num))
    start = time.time()
    self.tf_dataset_trainsformer.init_train_tfds(dataset, self.train_num)
    end = time.time()
    self.time_record["init_train_tfds"] = end - start
    if "train_num" not in self.domain_model.feature_dict:
        self.domain_model.feature_dict["train_num"] = self.train_num
        self.domain_model.feature_dict["class_num"] = self.class_num
        self.domain_model.feature_dict["language"] = self.domain_metadata['language']
    self.set_domain_dataset(dataset, is_training=True)
    logger.info(
        "Note:train_process model.py set domain dataset finished, domain_model train starts."
    )
    self.domain_model.time_record = self.time_record
    # Train the model
    # print("check domain_y", self.domain_dataset_train_dict["y"].shape)
    if self.call_num == -1:
        # self.domain_model.train_first_svm(self.domain_dataset_train_dict["x"],
        #                                   self.domain_dataset_train_dict["y"],
        #                                   remaining_time_budget=remaining_time_budget)
        self.domain_model.train(
            self.domain_dataset_train_dict["x"],
            self.domain_dataset_train_dict["y"],
            remaining_time_budget=remaining_time_budget)
    else:
        self.domain_model.train(
            self.domain_dataset_train_dict["x"],
            self.domain_dataset_train_dict["y"],
            remaining_time_budget=remaining_time_budget)
    self.call_num += 1
    logger.info("Note:train_process model.py domain_model train finished.")
    # Update self.done_training
    self.done_training = self.domain_model.done_training
def __init__(self, metadata):
    """
    Args:
        metadata: an AutoDLMetadata object. Its definition can be found in
            AutoDL_ingestion_program/dataset.py
    """
    self.done_training = False
    self.metadata = metadata
    # self.domain = infer_domain(metadata)
    self.domain = "speech"
    # logger.info("Note:The AutoDL_G_CONF: {}".format(autodl_g_conf_repr))
    logger.info("Note:The inferred domain of current dataset is: {}.".format(self.domain))
    # Infer the domain and initialize the model.
    # DomainModel = DOMAIN_TO_MODEL[self.domain]
    DomainModel = meta_domain_2_model(self.domain)
    self.domain_metadata = get_domain_metadata(metadata, self.domain)
    self.class_num = self.domain_metadata["class_num"]
    self.train_num = self.domain_metadata["train_num"]
    logger.info("Note:The domain metadata is {}".format(self.domain_metadata))
    self.domain_model = DomainModel(self.domain_metadata)
    # fixme: add incremental data updates.
    self.speech_widsom_model = ASpeechWidsomModel(self.domain_metadata)
    self.speech_wisdom_dataset_train = None
    logger.info("Note:Init Speech Wisdom solution, is {}".format(self.domain_metadata))
    self.main_train_loop_num = 0
    self.main_test_loop_num = 0
    # self.raw_tf_train_dataset = None
    self.dataset_sample_size = None
    self.dataset_read_num_second = None
    self.data_all_np_x_list = list()
    self.data_all_np_y_array = None
    self.ds_incr_flag = True  # dataset sampling: whether data still remains to be sampled incrementally.
    self.domain_dataset_train = None
    self.domain_dataset_test = None
    # for tf_dataset.
    self.tf_dataset_trainsformer = TfDatasetTransformer(if_train_shuffle=speech_ds_tds_conf.if_shuffle)
    as_timer("model_speech_init")
def __init__(self, event_loop=None):
    if not event_loop:
        event_loop = asyncio.get_event_loop()
    self._context = Context(event_loop)
    logger.info('Initial state: {}'.format(self._context.state))
def start(self, seconds):
    logger.info('No water!')
from apscheduler.schedulers.asyncio import AsyncIOScheduler
from hardware import Sprinkler
from log_utils import logger
import asyncio

if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    sprinkler = Sprinkler(event_loop=loop)

    def tick():
        sprinkler.start(10)

    scheduler = AsyncIOScheduler(event_loop=loop)
    scheduler.add_job(tick, 'interval', hours=1)
    scheduler.start()
    logger.info('Scheduler started')
    try:
        loop.run_forever()
    except (KeyboardInterrupt, SystemExit):
        pass
    finally:
        logger.info('Scheduler stopped')
    }
}
num_nodes = 5
num_replicas = 3
port = START_PORT
# for i in range(int(num_replicas)):
#     cont_port = START_PORT
#     image_name = "nginx"
#     # for i in range(int(num_replicas)):
#     #     cont_name = "{}_{}".format(image_name, cont_port)
#     #     (out, err) = CLIUtils.run(START_POD_CONTAINER_CMD.format(port, cont_port, image_name, cont_name))
#     #     logger.debug('started container id: {}'.format(out))
#     #     cont_port = cont_port + 1
#     cont_name = "{}_{}".format(image_name, cont_port)
#     (out, err) = CLIUtils.run(START_POD_CONTAINER_CMD.format(port, cont_port, image_name, cont_name))
#     logger.debug('started container id: {}'.format(out))
#     port = port + 1
(out, err) = CLIUtils.run(
    START_POD_CONTAINER_CMD.format(cont_spec["node"],
                                   cont_spec["container"]["port"],
                                   cont_spec["container"]["image"],
                                   cont_spec["container"]["name"]))
logger.debug('started container id: {}'.format(out))
port = START_PORT
for i in range(int(num_nodes)):
    logger.info("Containers on Node co_node_{}: ".format(port))
    (out, err) = CLIUtils.run(LIST_POD_CONTAINERS_CMD.format(port))
    logger.info('list of containers: %s', out)
    port = port + 1
#!/usr/bin/python3
import sys
from log_utils import logger
from cli_utils import CLIUtils
from constants import *
from NodeAgent import NodeAgent
import pika
import datetime

if len(sys.argv) < 2:
    logger.info("Enter the number of nodes to bring up")
    logger.info("./bringup_nodes.py <num_nodes>")
    sys.exit(1)

num_nodes = sys.argv[1]

# create exchange
connection = pika.BlockingConnection(
    pika.ConnectionParameters('coqueue', 5672, '/',
                              pika.PlainCredentials("root", "root123")))
channel = connection.channel()
channel.exchange_declare('co_topic')

port = START_PORT
for i in range(int(num_nodes)):
    node_name = "co_node_{}".format(port)
    na = NodeAgent(node_name)
    na.nodes.insert_one({
        "name": node_name,
        "description": "worker node",
        "heart_beat_time": datetime.datetime.utcnow(),
        "free_mem": 10,
def list_containers_on_node(node):
    logger.info("Containers on Node co_node_{}: ".format(node))
    (out, err) = CLIUtils.run(LIST_POD_CONTAINERS_CMD.format(node))
    logger.info('list of containers: %s', out)
def set_domain_dataset(self, dataset, is_training=True):
    """Recover the dataset in the corresponding competition format (esp. AutoNLP
    and AutoSpeech) and set the corresponding attributes:
        self.domain_dataset_train
        self.domain_dataset_test
    according to `is_training`.
    """
    # self.dataset = None
    if is_training:
        subset = 'train'
    else:
        subset = 'test'
    attr_dataset = 'domain_dataset_{}'.format(subset)
    if not hasattr(self, attr_dataset):
        logger.info(
            "Note: Begin recovering dataset format in the original " +
            "competition for the subset: {}...".format(subset))
        if self.domain == 'text':
            if DM_DS_PARAS.text.if_sample and is_training:
                # dataset_read_num = min(5000, self.domain_metadata["class_num"] * 3000)
                # if self.train_num >= 10000:
                #     dataset_read_num = min(dataset_read_num, int(0.1 * self.train_num))
                dataset_read_num, X, Y = self.decide_first_num()
                logger.info(
                    "Note: set_domain_dataset text, dataset sampling, shuffle and take starts, train_read_num = {}"
                    .format(dataset_read_num))
                # Get X, Y as lists of NumPy arrays
                start = time.time()
                self.check_label_distribution(np.array(Y))
                end = time.time()
                self.time_record["check_label_distribution"] = end - start
                self.domain_model.imbalance_level = self.imbalance_level
                feature_dict["check_len"] = float(self.check_len)
                feature_dict["kurtosis"] = float(self.kurtosis)
                feature_dict["first_detect_normal_std"] = float(self.normal_std)
                feature_dict["imbalance_level"] = self.imbalance_level
                feature_dict["is_shuffle"] = self.shuffle
                logger.info(
                    "Note: update domain model imbalance level after first detect!"
                )
                # if self.check_len <= 40 or self.normal_std >= 0.2:
                #     dataset_read_num += min(0.2 * self.train_num, 12000)
                if self.shuffle and self.domain_metadata["language"] == "ZH":
                    self.shuffle = False
                    dataset_read_num = int(0.4 * self.train_num)
                    start = time.time()
                    _X, _Y = self.tf_dataset_trainsformer.get_nlp_train_dataset(dataset_read_num)
                    X = X + _X
                    Y = np.concatenate([Y, _Y], axis=0)
                    end = time.time()
                    self.time_record["get_nlp_train_dataset_new"] = end - start
                    _label_distribution = np.sum(Y, 0)
                    occu_class_ = [
                        i for i in range(_label_distribution.shape[0])
                        if _label_distribution[i] != 0
                    ]  # label classes seen so far
                    if len(occu_class_) >= 2:
                        pass
                    else:
                        # take an extra 20%
                        dataset_read_num = int(0.2 * self.train_num)
                        logger.info("Use extra 20% sample: Class num < 2 for ZH data!")
                        _X, _Y = self.tf_dataset_trainsformer.get_nlp_train_dataset(dataset_read_num)
                        X = X + _X
                        Y = np.concatenate([Y, _Y], axis=0)
                        _label_distribution = np.sum(Y, 0)
                        occu_class_ = [
                            i for i in range(_label_distribution.shape[0])
                            if _label_distribution[i] != 0
                        ]
                        if len(occu_class_) < 2:
                            logger.info("Use extra 100% sample: Class num < 2!")
                            dataset_read_num = int(self.train_num)
                            _X, _Y = self.tf_dataset_trainsformer.get_nlp_train_dataset(dataset_read_num)
                            X = X + _X
                            Y = np.concatenate([Y, _Y], axis=0)
                ######################### original English shuffle logic ####################
                if self.shuffle:
                    logger.info(
                        "Note: start shuffle dataset due to not enough labels!"
                    )
                    # redo take
                    start = time.time()
                    del self.tf_dataset_trainsformer
                    self.tf_dataset_trainsformer = TfDatasetTransformer(
                        if_train_shuffle=True, config=config)
                    end = time.time()
                    self.time_record["del trainsformer and init"] = end - start
                    start = time.time()
                    shuffle_size = max(int(0.5 * (self.train_num)), 10000)
                    shuffle_dataset = dataset.shuffle(shuffle_size)
                    end = time.time()
                    self.time_record["shuffle dataset"] = end - start
                    start = time.time()
                    self.tf_dataset_trainsformer.init_train_tfds(
                        shuffle_dataset, self.train_num, pad_num=20)
                    end = time.time()
                    self.time_record["init_new_train_tfds"] = end - start
                    start = time.time()
                    X, Y = self.tf_dataset_trainsformer.get_nlp_train_dataset(dataset_read_num)
                    _label_distribution = np.sum(Y, 0)
                    occu_class_ = [
                        i for i in range(_label_distribution.shape[0])
                        if _label_distribution[i] != 0
                    ]  # label classes seen so far
                    if len(occu_class_) >= 2:
                        pass
                    else:
                        logger.info("Use extra 100% sample: Class num < 2 for EN data!")
                        dataset_read_num = int(1 * (self.train_num))
                        _X, _Y = self.tf_dataset_trainsformer.get_nlp_train_dataset(dataset_read_num)
                        X = X + _X
                        Y = np.concatenate([Y, _Y], axis=0)
                    end = time.time()
                    self.time_record["get_nlp_train_dataset_new"] = end - start
                    logger.info("Note: finish take after shuffle dataset")
                ###############################################################################
                logger.info(
                    "Note: set_domain_dataset text, dataset sampling, shuffle and take ends, train_read_num = {}"
                    .format(dataset_read_num))
                # self.domain_model.vocab = self.vocabulary
                self.domain_model.avg_word_per_sample = float(
                    len(self.vocabulary) / self.domain_metadata["train_num"])
                if "avg_word_per_sample" not in feature_dict:
                    feature_dict["avg_word_per_sample"] = self.domain_model.avg_word_per_sample
                self.domain_model.feature_dict = feature_dict
                logger.info(
                    "Note: vocab size is {} and avg_word_per_sample is {}".format(
                        len(self.domain_model.vocab),
                        self.domain_model.avg_word_per_sample))
            elif not is_training:
                start = time.time()
                pad_num = 20
                logger.info("pad num is {}".format(pad_num))
                X, Y = self.tf_dataset_trainsformer.get_nlp_test_dataset(pad_num=pad_num)
                # self.X_test_raw = X
                end = time.time()
                self.domain_model.time_record["get_nlp_test_dataset_numpy_test"] = end - start
            if is_training:
                self.first_round_X = X
                self.first_round_Y = Y
            # Construct the corpus
            start = time.time()
            # do not convert to a corpus
            if self.call_num == 0:
                corpus = []
                seq_len = []
                for _x in X:
                    _x = _x[_x != -1]
                    num_words = max(int(_x.shape[0] * 0.1), 301)
                    _x = _x[:num_words]
                    _x = _x.astype(str)
                    tokens = _x.tolist()
                    document = self.nlp_sep.join(tokens)
                    corpus.append(document)
                logger.info("USE id as corpus {}")
            else:
                corpus, seq_len = to_corpus(X, self.index_to_token, self.nlp_sep)
                logger.info("USE word as corpus {}")
            end = time.time()
            self.seq_len = seq_len
            if is_training:
                logger.info("to_corpus_train cost {}".format(end - start))
                self.domain_model.time_record["to_corpus_train"] = end - start
            else:
                logger.info("to_corpus_test cost {}".format(end - start))
                self.domain_model.time_record["to_corpus_test"] = end - start
            # Construct the dataset for training or test
            if is_training:
                labels = np.array(Y)
                cnt = np.sum(np.count_nonzero(labels, axis=1), axis=0)
                print("Check multi-label cnt {}".format(cnt))
                if cnt > labels.shape[0]:
                    print("Check multi-label: True")
                    self.domain_model.multi_label = True
                    # self.domain_model.fasttext_embeddings_index = None
                    self.domain_model.db_model = None
                    self.domain_model.ft_model = None
                domain_dataset = corpus, labels
                # Set the attribute
                self.domain_dataset_train_dict["x"] = corpus
                self.domain_dataset_train_dict["y"] = labels
            else:
                domain_dataset = corpus
                # Set the attribute
                self.domain_dataset_train_dict["x"] = corpus
                self.X_test = corpus
            setattr(self, attr_dataset, domain_dataset)
        elif self.domain == 'speech':
            # Set the attribute
            setattr(self, attr_dataset, dataset)
        elif self.domain in ['image', 'video', 'tabular']:
            setattr(self, attr_dataset, dataset)
        else:
            raise ValueError("The domain {} doesn't exist.".format(self.domain))
    else:
        if subset == 'test':
            if self.X_test_raw:
                self.domain_dataset_test, test_seq_len = to_corpus(
                    self.X_test_raw, self.index_to_token, self.nlp_sep)
                self.X_test_raw = None
                return
        if self.domain == 'text':
            if DM_DS_PARAS.text.if_sample and is_training:
                if self.domain_model.multi_label:
                    self.domain_model.use_multi_svm = True
                    self.domain_model.start_cnn_call_num = 2
                    dataset_read_num = self.train_num
                    if dataset_read_num > 50000:
                        dataset_read_num = 50000
                        logger.info(" Set Upper limit!")
                else:
                    if self.imbalance_level >= 1:
                        dataset_read_num = self.train_num
                        self.domain_model.use_multi_svm = False
                        self.domain_model.start_cnn_call_num = 1
                        if dataset_read_num > 50000:
                            dataset_read_num = 50000
                            logger.info(" Set Upper limit!")
                    else:
                        self.domain_model.use_multi_svm = True
                        if self.call_num <= self.domain_model.start_first_stage_call_num - 1:
                            dataset_read_num = 3000
                            if self.check_len <= 40 or self.normal_std >= 0.2:
                                dataset_read_num += min(int(0.2 * self.train_num), 12000)
                        else:
                            # dataset_read_num = int(self.domain_metadata["train_num"] * linear_sampling_func(self.call_num))
                            if self.call_num == self.domain_model.start_first_stage_call_num:
                                dataset_read_num = int(0.9 * self.domain_metadata["train_num"])
                                if dataset_read_num > 50000:
                                    dataset_read_num = 50000
                            else:
                                if self.train_num <= 55555:
                                    dataset_read_num = 4000
                                else:
                                    dataset_read_num = 5500
                logger.info(
                    "Note: set_domain_dataset text, dataset sampling, shuffle and take starts, train_read_num = {}"
                    .format(dataset_read_num))
                # Get X, Y as lists of NumPy arrays
                start = time.time()
                X, Y = self.tf_dataset_trainsformer.get_nlp_train_dataset(dataset_read_num)
                end = time.time()
                # if self.call_num == 0:
                #     logger.info("Use first round data!")
                #     X = self.first_round_X + X
                #     Y = np.concatenate([self.first_round_Y, Y], axis=0)
                if self.call_num == 1:
                    self.time_record["get_nlp_train_dataset_to_numpy call_num=1"] = end - start
                logger.info(
                    "Note: set_domain_dataset text, dataset sampling, shuffle and take ends, train_read_num = {}"
                    .format(dataset_read_num))
                # Construct the corpus
                corpus = []
                start = time.time()
                corpus, seq_len = to_corpus(X, self.index_to_token, self.nlp_sep)
                end = time.time()
                self.seq_len.extend(seq_len)
                # self.time_record["to_corpus when call_num=1"] = end - start
                if "avg_length" not in self.domain_model.feature_dict:
                    self.domain_model.feature_dict["avg_length"] = int(np.average(self.seq_len))
                    self.domain_model.feature_dict["max_length"] = int(np.max(self.seq_len))
                    self.domain_model.feature_dict["min_length"] = int(np.min(self.seq_len))
                    self.domain_model.feature_dict["seq_len_std"] = int(np.std(self.seq_len))
                if self.domain_model.max_length == 0:
                    if int(np.max(self.seq_len)) <= 301:
                        self.domain_model.max_length = int(np.max(self.seq_len))
                        self.domain_model.bert_check_length = int(np.max(self.seq_len))
                    else:
                        self.domain_model.max_length = int(np.average(self.seq_len))
                        self.domain_model.bert_check_length = int(np.average(self.seq_len))
                    self.domain_model.seq_len_std = int(np.std(self.seq_len))
                if self.seq_len:
                    logger.info("Note: set domain_model max_length = {}".format(
                        self.domain_model.max_length))
                    logger.info("Note: check domain_model max_length = {}".format(
                        int(np.max(self.seq_len))))
                    logger.info("Note: check domain_model max_length std = {}".format(
                        int(np.std(self.seq_len))))
                # Construct the dataset for training or test
                if is_training:
                    labels = np.array(Y)
                    domain_dataset = corpus, labels
                    print("\n update domain_dataset \n")
                    print("check domain_dataset_train_dict y:", labels.shape)
                    self.domain_dataset_train_dict["x"] = corpus
                    self.domain_dataset_train_dict["y"] = labels
                    # print(self.domain_dataset_train)
                    # self.domain_dataset_train = domain_dataset
                else:
                    domain_dataset = corpus
def train(config, model, optim: Optim, train_instances, dev_instances, word_vocab, bio_vocab, feat_vocab):
    model.train()
    start_time = time.time()
    batch_num = 0
    num_trial = 0
    report_start_time = start_time
    report_loss, report_words_num = 0, 0
    for epoch in range(config['epoch']):
        for batch in batch_iter(train_instances, config['batch_size'],
                                word_vocab=word_vocab, bio_vocab=bio_vocab, feat_vocab=feat_vocab):
            logger.debug("src_tokens\n{}".format(batch.src))
            logger.debug("ans_tokens\n{}".format(batch.ans))
            batch_num += 1
            model.zero_grad()
            gen_output = model(batch)  # (tgt_len-1, B, vocab)
            gen_output = gen_output.transpose(0, 1).contiguous()  # (B, tgt_len-1, vocab)
            lprobs = torch.log_softmax(gen_output, dim=-1)
            batch_size = gen_output.size(0)
            if config['max_out_cpy']:
                gold = torch.tensor([x[1:] for x in batch.tgt_extended_index],
                                    dtype=torch.long, device=model.device)
            else:
                gold = torch.tensor([x[1:] for x in batch.tgt_index],
                                    dtype=torch.long, device=model.device)  # (B, tgt_len-1)
            batch_loss = lloss(lprobs, gold, ignore_index=word_vocab.pad_idx)
            if config['ulloss']:
                batch_loss += config['ulloss_weight'] * ulloss(lprobs, gold, ignore_index=word_vocab.pad_idx)
            if config['seq_ulloss'] and torch.rand(1).item() < config['seq_ulloss_rate']:
                batch_loss += ulloss_seq(lprobs, config['seq_ulloss_ngram'],
                                         config['seq_ulloss_seq_type'],
                                         mask_p=config['seq_ulloss_mask_p'])
            report_loss += batch_loss.item()
            report_words_num += sum(batch.tgt_len) - batch_size
            batch_loss.backward()
            optim.step()
            if batch_num % config['log_per_batches'] == 0:
                logger.info('epoch {}|batch {}|avg.loss {:.4f}|ppl {:.3f}|lr {}|t {}|total t {}'.format(
                    epoch, batch_num,
                    report_loss / report_words_num,
                    math.exp(report_loss / report_words_num),
                    optim.lr,
                    user_friendly_time_since(report_start_time),
                    user_friendly_time_since(start_time)
                ))
                report_loss = report_words_num = 0
                report_start_time = time.time()
            if batch_num > config['start_validate_after_batches'] and batch_num % config['validate_per_batches'] == 0:
                ppl = evaluate_ppl(model, dev_instances,
                                   word_vocab=word_vocab, bio_vocab=bio_vocab, feat_vocab=feat_vocab)
                if optim.is_better(ppl):
                    model.save(config['model_save_path'])
                    logger.info("model saved!")
                hit_trial = optim.update_lr(ppl)
                optim.metric_history.append(ppl)
                logger.info('eval ppl {}|patience {}|current lr {}|best metric {}'.format(
                    ppl, optim.patience, optim.lr, optim.best_metric))
                if hit_trial:
                    num_trial += 1
                    logger.info("hit trial: [{}]".format(num_trial))
                    if num_trial >= config['max_num_trial']:
                        logger.info("early stop")
                        exit(0)
                    logger.info('restoring parameters')
                    state = torch.load(config['model_save_path'])
                    model.load_state_dict(state['model_state'])
                    model.to(device)
    import random
    test_instances = random.sample(train_instances, 100)
    bleus = evaluate_bleu(model, test_instances, config, word_vocab)
    logger.info("BLEU_1 {} BLEU_2 {} BLEU_3 {} BLEU_4 {} BLEU {}".format(*bleus))
            logger.info('{} is rezero param'.format(pr_name))
            nn.init.zeros_(p)
        else:
            if p.dim() == 1:
                # p.data.zero_()
                p.data.normal_(0, math.sqrt(6 / (1 + p.size(0))))
            else:
                nn.init.xavier_normal_(p, math.sqrt(3))
        logger.info("{}: requires_grad {}".format(pr_name, p.requires_grad))


if __name__ == '__main__':
    init_logger(level='info', log_file='train.log')
    config = load_config()
    device = torch.device('cpu') if config['gpu'] < 0 else torch.device('cuda:{}'.format(config['gpu']))
    logger.info("training with param:\n{}".format(config))
    logger.info("training with device: {}".format(device))
    if config['albert']:
        word_vocab = AlbertVocab(config['albert_model_name'], cache_dir=config['albert_cache_dir'])
    else:
        word_vocab = load_word_vocab('squad_out/train.txt.vocab.word', config['vocab_size'])
    logger.info(word_vocab)
    bio_vocab = load_bio_vocab('squad_out/train.txt.vocab.bio')
    logger.info(bio_vocab)
    feat_vocab = load_feat_vocab('squad_out/train.txt.vocab.feat')
    logger.info(feat_vocab)
    train_instances = load_instances('squad_out/train.ins')
    dev_instances = load_instances('squad_out/dev.ins')
    if config['model'] == 'nmt':
        model = NMT(word_vocab, bio_vocab, feat_vocab,
                    config['word_embed_size'], config['bio_embed_size'], config['feat_embed_size'],
def translate(model, instances, config, word_vocab, predict_save_path=None, predict_atten_engy_path=None):
    """
    :param model:
    :param instances:
    :param beam_size:
    :param max_decode_step:
    :param vocabs:
    :return: List[List[str]], the translated result for each instance
    """
    was_training = model.training
    model.eval()
    vocabs = {
        'word_vocab': model.word_vocab,
        'bio_vocab': model.bio_vocab,
        'feat_vocab': model.feat_vocab
    }
    max_decode_step = config['max_decode_step']
    dec_method = config['dec_method']
    beam_size = config['beam_size']
    nucleus_p = config['nucleus_p']
    logger.info("translate using method {}".format(dec_method))
    copy_hypothesis = []
    no_copy_hypothesis = []
    atten_engy = []
    total_completed = 0
    with torch.no_grad():
        for batch in tqdm(batch_iter(instances, 1, shuffle=False, **vocabs), total=len(instances)):
            if dec_method == 'beam_search':
                instance_hypothesis, has_completed = model.beam_search(
                    batch, beam_size, max_decode_step)
            elif dec_method == 'nucleus_sampling':
                instance_hypothesis, has_completed = model.nucleus_sampling(
                    batch, max_decode_step, nucleus_p=nucleus_p)
            else:
                raise Exception("decoding method {} is not supported".format(dec_method))
            total_completed += int(has_completed)
            copy_hypothesis.append(instance_hypothesis[0][0])
            no_copy_hypothesis.append(instance_hypothesis[0][1])
            atten_engy.append(instance_hypothesis[0][2])
    if was_training:
        model.train(was_training)
    if predict_save_path:
        obj = []
        for idx, instance in enumerate(instances):
            obj.append({
                'idx': idx,
                'context': " ".join(instance.src),
                'ans': " ".join(instance.ans),
                'gold': " ".join(instance.tgt),
                'no_copy_predict': " ".join(no_copy_hypothesis[idx]),
                'predict': " ".join(copy_hypothesis[idx])
            })
        json.dump(obj, open(predict_save_path, 'w'), indent=2)
    if predict_atten_engy_path:
        obj = []
        for idx, (engy, instance, hypothesis) in enumerate(
                zip(atten_engy, instances, copy_hypothesis)):
            obj.append({
                'idx': idx,
                'decode_engy': str(engy),
                'src_tokens': ' '.join(instance.src),
                'output_tokens': ' '.join(hypothesis)
            })
        json.dump(obj, open(predict_atten_engy_path, 'w'), indent=2)
    logger.info("{} of {} is completed hypothesis".format(total_completed, len(instances)))
    return copy_hypothesis
from hardware import Sprinkler
from log_utils import logger
import asyncio

loop = asyncio.get_event_loop()
s = Sprinkler(loop)
loop.call_later(1, s.start, 30)
logger.info('starting')
loop.run_forever()
def transition_to(self, state):
    logger.info('State transition: {} -> {}'.format(self.state, state))
    self.state = state
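# For illustration only: a minimal, self-contained sketch of how transition_to
# above is typically driven by the state classes seen in the other sprinkler
# snippets. Context, IdleState, and the logging setup here are assumptions,
# not the project's actual classes.
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class Context:
    """Hypothetical holder of the current state."""
    def __init__(self):
        self.state = None
        self.transition_to(IdleState(self))

    def transition_to(self, state):
        logger.info('State transition: {} -> {}'.format(self.state, state))
        self.state = state

class IdleState:
    def __init__(self, context):
        self._context = context

    def __repr__(self):
        return type(self).__name__

class RunningState(IdleState):
    def __init__(self, context, seconds):
        super().__init__(context)
        self.seconds = seconds

ctx = Context()                           # logs: State transition: None -> IdleState
ctx.transition_to(RunningState(ctx, 30))  # logs: State transition: IdleState -> RunningState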
            if len(new_src_token_list) > 0:
                final_tokens['src'].extend(new_src_token_list)
                final_tokens['bio'].extend([bio_tokens[idx]] * len(new_src_token_list))
                final_tokens['case'].extend([case_tokens[idx]] * len(new_src_token_list))
                final_tokens['ner'].extend([ner_tokens[idx]] * len(new_src_token_list))
                final_tokens['pos'].extend([pos_tokens[idx]] * len(new_src_token_list))
            else:
                print("zero: {} {}".format(src_token, new_src_token_list))
        for tgt_token in tgt_tokens:
            final_tokens['tgt'].extend(albert_tokenizer.tokenize(tgt_token))
    else:
        final_tokens = {'src': src_tokens, 'tgt': tgt_tokens, 'bio': bio_tokens,
                        'case': case_tokens, 'ner': ner_tokens, 'pos': pos_tokens}
    final_tokens['ans'] = __extract_answer_from_src_and_bio(final_tokens['src'], final_tokens['bio'])
    if len(final_tokens['src']) > max_src_len:
        logger.info("trimmed seq length {} to {}".format(len(final_tokens['src']), max_src_len))
        final_tokens['src'] = final_tokens['src'][:max_src_len]
        final_tokens['tgt'] = final_tokens['tgt'][:max_src_len]
        final_tokens['bio'] = final_tokens['bio'][:max_src_len]
        final_tokens['ner'] = final_tokens['ner'][:max_src_len]
        final_tokens['case'] = final_tokens['case'][:max_src_len]
        final_tokens['pos'] = final_tokens['pos'][:max_src_len]
        final_tokens['ans'] = final_tokens['ans'][:max_src_len]
    instance = SquadInstance(**final_tokens)
    instances.append(instance)
return instances


if __name__ == '__main__':
    init_logger(level='debug')
    config = load_config()
                zip(atten_engy, instances, copy_hypothesis)):
            obj.append({
                'idx': idx,
                'decode_engy': str(engy),
                'src_tokens': ' '.join(instance.src),
                'output_tokens': ' '.join(hypothesis)
            })
        json.dump(obj, open(predict_atten_engy_path, 'w'), indent=2)
    logger.info("{} of {} is completed hypothesis".format(total_completed, len(instances)))
    return copy_hypothesis


if __name__ == '__main__':
    config = load_config()
    init_logger(log_file='evaluate.log')
    device = torch.device('cpu') if config['gpu'] < 0 else torch.device(
        'cuda:{}'.format(config['gpu']))
    if config['model'] == 'nmt':
        model = NMT.load(config['model_save_path'])
        model.to(device)
    else:
        model = QGModel.load(config['model_save_path'], device)
    test_instances = load_instances(config['save_dir'] + '/test.ins')
    bleus = evaluate_bleu(model, test_instances, config, model.word_vocab, config['predict_save_path'])
    logger.info(
        '\nBLEU_1: {}\nBLEU_2: {}\nBLEU_3: {}\nBLEU_4: {}\nBLEU :{}'.format(*bleus))