def test_model(self, words):
    """Test the word-vector model: find the 10 most similar words for each input word.

    :param words: list of words to test
    :return:
    """
    if not os.path.exists(self.model_path):
        LOG.error('Model file does not exist, please train the model first')
        exit(1)
    if not isinstance(words, list):
        words = [words]
    LOG.info('Testing model with words %s' % str(words))
    # Load the word-vector model
    mdl = word2vec.Word2Vec.load(self.model_path)
    for word in words:
        try:
            # Find the 10 words most similar to `word`
            tops = mdl.wv.most_similar(word, topn=10)
            print('\nWords most related to "%s":' % word)
            cpprint(tops)
        # Catch and skip exceptions, e.g. the KeyError raised when `word` is not in the vocabulary
        except Exception as e:
            print('\nError:', repr(e))
def test_parser(self):
    lexer = Lexer('1 + 2 * (3 + 4)')
    parser = Parser(lexer, 10)
    ast = parser.expr()
    prettyprinter.cpprint(dataclasses.asdict(ast))
    # ast.walk()
    Visitor().visit(ast)
def handle(self, *args, **options):
    # Headless Chrome
    options = Options()
    options.add_argument('--headless')
    browser = webdriver.Chrome(options=options)  # `options=` replaces the deprecated `chrome_options` argument
    try:
        logged_in = login(browser)
        if not logged_in:
            print('Error: failed to log in to RMS')
            browser.quit()
            sys.exit(1)
        browser = logged_in
        data = get_rmstop(browser)
        get_rms_detail(browser, data)
        printdata(data)
        cpprint(data)
        if adddata(data) is False:
            print('data {0} already exists.'.format(data['date']))
            sys.exit(1)
        browser.quit()
    except Exception:
        browser.quit()
        import traceback
        print('error.')
        traceback.print_exc()
        sys.exit(1)
def test_pretty_json():
    with open('tests/sample_json.json') as f:
        data = json.load(f)

    print('native pprint')
    nativepprint(data)
    print('prettyprinter')
    cpprint(data)
def main():
    try:
        user = get_user()
        cpprint(user)
    except APIUnreachableException as my_e:
        print("The server is unreachable", str(my_e), my_e.custom_message)
    except HttpNotFound:
        print("The information does not exist")
def main():
    try:
        user = get_user()
        cpprint(user)
        # print(f"{user.titre} {user.nom_complet}")
    except APIUnreachableException:
        print("The API is unreachable.")
    except HttpNotFound:
        print("The URL does not exist")
def __init__(self):
    super(MF, self).__init__()
    self.config = ConfigX()
    cpprint(self.config.__dict__)  # print the configuration
    # self.rg = RatingGetter()  # loading rating data
    # self.init_model()
    self.iter_rmse = []
    self.iter_mae = []
def colorize(self, s) -> str:
    assert isinstance(s, str), s
    _stream = io.StringIO()
    try:
        litval = literal_eval(s)
        cpprint(litval, stream=_stream)
    except Exception:
        # not a valid Python literal; pretty-print the raw string instead
        cpprint(s, stream=_stream)
    rv = _stream.getvalue()
    _stream.close()
    return rv
def main():
    """The main function."""
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument('--config', default=str(Path.home() / '.philipshue.ini'),
                    help='the config file location')
    args = ap.parse_args()

    while True:
        try:
            cp = configparser.ConfigParser()
            cp.read(args.config)
            cf = cp['DEFAULT']
            bridge_location = cf['bridge_location']
            bridge_username = cf['bridge_username']
        except KeyError:
            setup(args.config)
            continue
        break

    print(f'Connecting to {bridge_location}...')
    try:
        bridge = qhue.Bridge(bridge_location, bridge_username)
        num_lights = len(bridge.lights())
        print(f'Connected to {bridge_location}. {num_lights} lights found.')
    except requests.ConnectionError as err:
        print(f'{sgr(1, 31)}{err.__class__.__name__}{sgr(0)}: {err}')
        sys.exit(1)

    session = PromptSession(
        '> ',
        lexer=PygmentsLexer(Python3Lexer),
        style=style_from_pygments_cls(PYGMENTS_STYLE),
        auto_suggest=AutoSuggestFromHistory(),
        input_processors=[HighlightMatchingBracketProcessor('()[]{}')],
        history=FileHistory(Path.home() / '.philipshue.hist'))

    while True:
        try:
            cmd = session.prompt()
            start = time.perf_counter()
            out = exec_cmd(cmd, bridge=bridge)
            time_taken = time.perf_counter() - start
            prettyprinter.cpprint(out)
            print(f'Time taken: {sgr(1, 34)}{time_taken*1000:.3f} ms{sgr(0)}')
        except KeyboardInterrupt:
            pass
        except EOFError:
            break
        except requests.ConnectionError as err:
            print(f'{sgr(1, 31)}{err.__class__.__name__}{sgr(0)}: {err}')
            sys.exit(1)
        except Exception as err:
            print(f'{sgr(1, 31)}{err.__class__.__name__}{sgr(0)}: {err}')
def create_from_api(cls, response):
    first_resp = response.get('results')[0]
    cpprint(response)
    gender = first_resp.get('gender')
    title = first_resp.get('name').get('title')
    firstname = first_resp.get('name').get('first')
    lastname = first_resp.get('name').get('last')
    email = first_resp.get('email')
    username = first_resp.get('login').get('username')
    return User(gender, title, firstname, lastname, email, username)
def main():
    signal(SIGINT, signal_handle)
    options_dict = args(usage)
    cpprint(options_dict)
    tc_handle(**options_dict)()
    gol._init()
    t = threading.Thread(target=exec, args=())
    t.daemon = True  # Thread.setDaemon() is deprecated in favour of the daemon attribute
    t.start()
    time.sleep(2)
    global win
    win = init_scr()
    win()
def main():
    parser = configparser.ConfigParser()
    parser.read(['config.ini'])
    urls = set(parser.sections())
    urls.remove('Planet')
    result = asyncio.run(amain(urls=list(urls)))
    prettyprinter.cpprint(result)
    for section in result:
        parser.remove_section(section)
    with open('config.modified.ini', 'w') as fp:
        parser.write(fp)
def amv_show_all_styles(data=None):
    from prettyprinter import cpprint
    from pygments import styles

    if data is None:
        data = _get_testdata()
    for style in styles.get_all_styles():
        print(f"{style}:")
        try:
            cpprint(data, style=styles.get_style_by_name(style))
        except Exception as exc:
            print(repr(exc))
        print()
def setup(config):
    resp = requests.get('https://discovery.meethue.com')
    print('Detected Philips Hue Bridges:')
    prettyprinter.cpprint(resp.json())

    session = PromptSession()
    location = session.prompt('Enter the Bridge IP address: ')
    username = qhue.create_new_username(location)

    cp = configparser.ConfigParser()
    cp.read(config)
    cf = cp['DEFAULT']
    cf['bridge_location'] = location
    cf['bridge_username'] = username
    with open(config, 'w') as configfile:
        cp.write(configfile)
def test_large_data_performance():
    data = [{'text': 'lorem ipsum dolor sit amet ' * 500}] * 200
    stream = StringIO()
    start = datetime.datetime.now()
    cpprint(data, stream=stream)
    stream.getvalue()
    end = datetime.datetime.now()
    took = end - start
    print('took', took)
    # The bottleneck is in string-to-doc conversion,
    # specifically escaping strings many times.
    # There's probably more we can do here.
    assert took < datetime.timedelta(seconds=13)
def __init__(self, fixseed=True):
    super(MF, self).__init__()
    self.config = ConfigX()
    self.configc = ConfigCUC()
    cpprint(self.config.__dict__)  # print the configuration

    # print statistics of the data files
    print_data_file_stats(self.config.rating_path)
    print_data_file_stats(self.config.trust_path)

    if fixseed:
        np.random.seed(seed=self.config.random_state)  # fix the random seed

    # self.rg = RatingGetter()  # loading rating data
    # self.init_model()
    self.iter_rmse = []
    self.iter_mae = []
def dump(*args):
    """Dump variables using prettyprinter"""
    # Detect if running in pytest
    if "pytest" in sys.modules:
        level = None
    for arg in args:
        if isinstance(arg, str):
            # Print strings plainly: cpprint adds quotes around them,
            # which can be confusing.
            # prettyprinter.cpprint(arg, width=10000, ribbon_width=10000)
            print(arg)
        else:
            width = 120
            if uvicore.config:
                if uvicore.config.app.dump.width:
                    width = uvicore.config.app.dump.width
            prettyprinter.cpprint(arg, width=width, ribbon_width=width)
def eth2json(eth):
    if eth:
        print(eth)
        eth = eth.encode().decode('unicode_escape').encode(
            'raw_unicode_escape').decode()
        # a = b"\xe8\xaf\xad\xe6\x96\x87"
        # print(str(a, "utf-8"))
        try:
            # the `encoding` argument of json.loads was removed in Python 3.9
            eth = json.loads(eth)
            return cpprint(eth)
        except Exception as e:
            print(e)
            return ''
def gen_list_of_sections_and_html_files(source_folder_path):
    toc = get_toc(source_folder_path)
    html_files_list = []  # list of dicts
    html_folder_path = os.path.join(source_folder_path, "_build", "html")

    # Add the root for the jupyterbook > 0.12 toc config
    if "root" in toc:
        html_file_path = os.path.join(html_folder_path, str(toc["root"]) + ".html")
        html_files_list.append({
            "section_name": "Introduction",
            "html_file_path": html_file_path
        })

    # TODO There are several jupyterbook _toc configurations that are possible
    if "parts" in toc:
        parts = toc["parts"]
    else:
        logger.warning(
            "Key 'parts' not present in _toc. Please convert your _toc to the new jupyterbook format with format jb-book."
        )
        exit(1)

    for item in parts:
        if "chapters" in item:  # excludes the intro file from the transfer to Zendesk
            # section = item["part"]
            section = item["caption"]
            files = item["chapters"]
            for f in files:
                filename = f["file"]
                html_file_path = os.path.join(html_folder_path, str(filename) + ".html")
                html_files_list.append({
                    "section_name": section,
                    "html_file_path": html_file_path
                })

    # logger.info(f"Final List of html files to be sent to Zendesk: \n {html_files_list}")
    # cpprint() prints to stdout and returns None, so log the list itself instead
    logger.debug(html_files_list)
    return html_files_list
def prettyprinter_displayhook(value):
    if value is None:
        return

    builtins._ = None
    stream = StringIO()
    cpprint(value,
            width=get_terminal_width(default=79),
            stream=stream,
            end='')
    output = stream.getvalue()

    try:
        sys.stdout.write(output)
    except UnicodeEncodeError:
        encoded = output.encode(sys.stdout.encoding, 'backslashreplace')
        if hasattr(sys.stdout, 'buffer'):
            sys.stdout.buffer.write(encoded)
        else:
            text = encoded.decode(sys.stdout.encoding, 'strict')
            sys.stdout.write(text)

    sys.stdout.write('\n')
    builtins._ = value
def test_all_python_values(value):
    cpprint(value)
def __init__(self):
    super(GEMF, self).__init__()
    self.rg = RatingGetter()

    ex_file = 'yp_trust'
    self.explict_trust_path = '../data/net/' + ex_file + '.txt'
    weight = 0.5
    # file = '%s_weight_%s' % (self.config.dataset_name, weight)
    file = 'yp_CUnet_weight'
    self.implict_trust_path = '../data/net/' + file + '.txt'
    # file = '%s_CUnet_weight_nnn' % self.config.dataset_name
    # file = '%s_less_CUnet_weight' % self.config.dataset_name
    # self.implict_trust_path = '../data/' + file + '.txt'
    # self.implict_trust_path = '../data/yp_30_39_rating_im_net_new.txt'
    # ft_3 & db_13 & ca_16 & yp_30_39
    # & ca_23 & db_18

    ############## 1 ################
    # ex_file = '%s_filter_trust_new' % self.config.dataset_name
    # file = '%s_CUnet_weight_new' % self.config.dataset_name
    # self.implict_trust_path = '../data/' + file + '.txt'
    # self.explict_trust_path = '../data/' + ex_file + '.txt'

    ############## 2 ################
    # file = 'ft_3_rating_im_net'
    # file = 'ft_3_rating_im_net_new'  # ft_3 & db_18 & ca_23 & yp_30_39 for new
    # self.implict_trust_path = '../data/' + file + '.txt'

    ############## 3 ################
    # weight = 0.3
    # file = '%s_two_net_with_weight_%s_rewrited' % (self.config.dataset_name, weight)
    # file = '%s_two_net_with_weight_%s_new_rewrited' % (self.config.dataset_name, weight)
    # self.implict_trust_path = '../data/%s_two_net/' % self.config.dataset_name + file + '.txt'

    ############## 4 ################
    # file = '%s_two_net_with_tanh_rewrited' % (self.config.dataset_name)
    # file = '%s_two_net_with_tanh_new_rewrited' % (self.config.dataset_name)
    # self.implict_trust_path = '../data/%s_two_net/' % self.config.dataset_name + file + '.txt'

    ############## 5 ################
    # file = '%s_inter_net' % self.config.dataset_name
    # file = '%s_union_net' % self.config.dataset_name
    # file = '%s_union_net_expanded' % self.config.dataset_name
    # file = '%s_inter_net_new' % self.config.dataset_name
    # file = '%s_union_net_new' % self.config.dataset_name
    # file = '%s_union_net_new_expanded' % self.config.dataset_name
    # self.implict_trust_path = '../data/%s_two_net/' % self.config.dataset_name + file + '.txt'

    # parameters for matrix factorization
    self.config.lr = 0.01
    self.config.lambdaP = 0.03  # 0.03
    self.config.lambdaQ = 0.01  # 0.01
    self.config.lambdaB = 0.01  # 0.01
    self.config.temp1 = 0.01
    self.config.temp2 = 0.01
    self.config.alpha = self.config.temp1
    self.config.beta = self.config.temp2
    self.config.factor = 10
    self.config.isEarlyStopping = True
    self.config.k_fold_num = 5

    # parameters for netwalker
    self.config.random_state = 0
    self.config.number_walks = 30  # the number of random walks (5)
    self.config.path_length = 20   # the length of each random walk (10)
    self.config.restart_pro = 0.1  # the probability of restarts
    self.config.undirected = True
    self.config.ex_walk_result_path = '../data/ge/' + ex_file + '_social_corpus_filter.txt'
    self.config.im_walk_result_path = '../data/ge/' + file + '_social_corpus_implict.txt'

    # parameters for graph embedding
    self.config.lambdaW = 1
    self.config.ex_table_path = '../data/ge/' + ex_file + '_table_filter.pkl'
    self.config.ex_model_out_path = '../data/ge/' + ex_file + '_result_filter.txt'
    self.config.im_table_path = '../data/ge/' + file + '_table_implict.pkl'
    self.config.im_model_out_path = '../data/ge/' + file + '_result_implict.txt'
    self.config.cbow = 0
    self.config.neg = 5
    self.config.w2v_lr = 0.01  # 0.01-0.81
    self.config.win_size = 10
    self.config.min_count = 3
    self.config.binary = 0

    self.dataSet_u = defaultdict(dict)
    self.dataSet_i = defaultdict(dict)
    self.filteredRatings = defaultdict(list)
    self.CUNet = defaultdict(list)
    self.walks = []
    self.ex_walks = []
    self.im_walks = []
    # self.visited = defaultdict(dict)
    self.ex_pos_loss_total = 0
    self.ex_neg_loss_total = 0
    self.im_pos_loss_total = 0
    self.im_neg_loss_total = 0

    # cpprint('k is %s' % self.config.near_num)
    cpprint('implict_trust_path is %s' % self.implict_trust_path)
    cpprint('explict_trust_path is %s' % self.explict_trust_path)
    cpprint('lr is %s' % self.config.lr)
    cpprint('neg is %s' % self.config.neg)
    cpprint('w2v_lr is %s' % self.config.w2v_lr)
    cpprint('win_size is %s' % self.config.win_size)
    cpprint('alpha is %s' % self.config.alpha)
    cpprint('beta is %s' % self.config.beta)
    cpprint('lambdaP is %s' % self.config.lambdaP)
    cpprint('lambdaQ is %s' % self.config.lambdaQ)
    cpprint('number_walks is %s' % self.config.number_walks)
    cpprint('path_length is %s' % self.config.path_length)
    # cpprint('factor is %s' % self.config.factor)

    self.init_model()
        pass

    """
    def only_rate_limit(self):
        self.ip.tc('add', 'tbf', self.nic, 0x100000, parent=0x10010,
                   rate=self.rate + 'kbit', burst=1024 * 2, latency='200ms')

    def only_no_rate_limit(self):
        self.ip.tc('add', 'netem', self.nic, 0x100000, parent=0x10010, loss=30)
    """

    def __call__(self):
        self.flush_instance()
        if not self.flush:
            self.ip.tc('add', 'htb', self.nic, 0x10000, default=0x200000)
            self.ip.tc('add-class', 'htb', self.nic, 0x10001, parent=0x10000,
                       rate='1000mbit', prio=4)
            # print(self.rate)
            self.ip.tc('add-class', 'htb', self.nic, 0x10010, parent=0x10001,
                       rate=self.rate + 'kbit', prio=3)
            self.ip.tc('add-class', 'htb', self.nic, 0x10020, parent=0x10001,
                       rate='700mbit', prio=2)
            if self.loss or self.delay:
                # print(self.delay)
                self.ip.tc('add', 'netem', self.nic, 0x100000, parent=0x10010,
                           loss=self.loss, delay=self.delay)
            else:
                self.ip.tc('add', 'tbf', self.nic, 0x100000, parent=0x10010,
                           rate=self.rate + 'kbit', burst=1024 * 2, latency='200ms')
            self.ip.tc('add', 'sfq', self.nic, 0x200000, parent=0x10020, perturb=10)
            # pyroute2 has a bug in parsing socket-family protocols: AF_INET should be
            # rendered as IPv4 but comes out as ax25, and AF_AX25 comes out as "all",
            # so we rely on that quirk here; `protocols` behaves the same way.
            self.ip.tc('add-filter', 'u32', self.nic, parent=0x10000, prio=1,
                       protocol=socket.AF_AX25, target=0x10010, keys=self.keys)


if __name__ == "__main__":
    options_dict = args(usage)
    cpprint(options_dict)
    tc_handle(**options_dict)()
def train(cfg):
    # hyperparameters
    global optimizer, criterion, scheduler
    SEED = cfg.values.seed
    seed_everything(SEED)

    MODEL_NAME = cfg.values.model_name
    USE_KFOLD = cfg.values.val_args.use_kfold
    TSVFILE = cfg.values.tsvfile
    log_interval = cfg.values.train_args.log_interval
    weight_decay = cfg.values.train_args.weight_decay
    tr_batch_size = cfg.values.train_args.train_batch_size
    val_batch_size = cfg.values.train_args.eval_batch_size
    epochs = cfg.values.train_args.num_epochs
    loss_type = cfg.values.train_args.loss_fn
    lr_decay_step = 1  # StepLR parameter
    steplr_gamma = cfg.values.train_args.steplr_gamma
    opti = cfg.values.train_args.optimizer
    scheduler_type = cfg.values.train_args.scheduler_type
    label_smoothing_factor = cfg.values.train_args.label_smoothing_factor

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print(f'device is "{device}"')
    print(MODEL_NAME)

    if 'koelectra' in MODEL_NAME:
        model_config = ElectraConfig.from_pretrained(MODEL_NAME)
    # elif 'roberta' in MODEL_NAME:
    #     model_config = RobertaConfig.from_pretrained(MODEL_NAME)
    #     model_config.is_decoder = True
    else:
        model_config = AutoConfig.from_pretrained(MODEL_NAME)
    model_config.num_labels = 42

    train_dataset = load_data("../input/data/train/" + TSVFILE)
    train_label = train_dataset['label'].values

    if MODEL_NAME == 'KoBertTokenizer':
        tokenizer = KoBertTokenizer.from_pretrained(MODEL_NAME)
    elif 'koelectra' in MODEL_NAME:
        tokenizer = ElectraTokenizer.from_pretrained(MODEL_NAME)
    # elif 'roberta' in MODEL_NAME:
    #     tokenizer = RobertaTokenizerFast.from_pretrained(MODEL_NAME)
    else:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    # compute the class weights
    class_weights = compute_class_weight(class_weight='balanced',
                                         classes=np.unique(train_label),
                                         y=train_label)
    weights = torch.tensor(class_weights, dtype=torch.float)
    weights = weights.to(device)

    if loss_type == 'custom':  # F1 + cross-entropy
        criterion = CustomLoss()
    elif loss_type == 'labelsmooth':
        criterion = LabelSmoothingLoss(smoothing=label_smoothing_factor)
    elif loss_type == 'CEloss':
        criterion = nn.CrossEntropyLoss(weight=weights)
    elif loss_type == 'focal':
        criterion = FocalLoss()

    if USE_KFOLD:
        kfold = StratifiedKFold(n_splits=cfg.values.val_args.num_k)
        k = 1
        save_dir = increment_output_dir(cfg.values.train_args.output_dir)

        for idx, splits in enumerate(kfold.split(train_dataset, train_label)):  # (trind, valind)
            if idx != 4:
                continue
            trind = splits[0]
            valind = splits[1]
            print('\n')
            cpprint('=' * 15 + f'{k}-Fold Cross Validation' + '=' * 15)

            tr_label = train_dataset['label'].iloc[trind].values
            val_label = train_dataset['label'].iloc[valind].values
            tr_dataset = train_dataset.iloc[trind]
            val_dataset = train_dataset.iloc[valind]

            # tokenizing dataset
            tokenized_train = tokenized_dataset(tr_dataset, tokenizer)
            tokenized_dev = tokenized_dataset(val_dataset, tokenizer)

            RE_train_dataset = RE_Dataset(tokenized_train, tr_label)
            RE_dev_dataset = RE_Dataset(tokenized_dev, val_label)

            train_loader = DataLoader(RE_train_dataset, batch_size=tr_batch_size, shuffle=True)
            val_loader = DataLoader(RE_dev_dataset, batch_size=val_batch_size, shuffle=True)

            if 'koelectra' in MODEL_NAME:
                model = ElectraForSequenceClassification.from_pretrained(MODEL_NAME, config=model_config)
            else:
                model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=model_config)
            model.to(device)
            model_dir = save_dir + f'/{k}fold'

            # OPTIMIZER
            if scheduler_type == 'CCAWR':
                opt_module = getattr(import_module("torch.optim"), opti)  # default: Adam
                optimizer = opt_module(
                    filter(lambda p: p.requires_grad, model.parameters()),
                    lr=5e-7,
                    # cfg.values.train_args.lr * 0.0001,  # 5e-6,
                    weight_decay=weight_decay
                )
                scheduler = CustomCosineAnnealingWarmUpRestarts(
                    optimizer, T_0=2, T_mult=2, eta_max=cfg.values.train_args.lr,
                    T_up=1, gamma=0.8, last_epoch=-1)
            elif scheduler_type == 'stepLr':
                opt_module = getattr(import_module("torch.optim"), opti)  # default: Adam
                optimizer = opt_module(
                    filter(lambda p: p.requires_grad, model.parameters()),
                    lr=cfg.values.train_args.lr,  # 5e-6,
                    weight_decay=weight_decay
                )
                scheduler = StepLR(optimizer, lr_decay_step, gamma=steplr_gamma)  # 794  # gamma: 20 epochs => lr x 0.01
            elif scheduler_type == 'cycleLR':
                opt_module = getattr(import_module("torch.optim"), opti)  # default: Adam
                optimizer = opt_module(
                    filter(lambda p: p.requires_grad, model.parameters()),
                    lr=cfg.values.train_args.lr,  # 5e-6,
                    weight_decay=weight_decay
                )
                scheduler = CyclicLR(optimizer, base_lr=0.000000001, max_lr=cfg.values.train_args.lr,
                                     step_size_up=1, step_size_down=4, mode='triangular',
                                     cycle_momentum=False)  # triangular2

            logger = SummaryWriter(log_dir=model_dir)

            best_val_acc = 0
            best_val_loss = np.inf
            for epoch in range(epochs):
                model.train()
                loss_value = 0
                matches = 0
                for idx, batch in enumerate(train_loader):
                    optimizer.zero_grad()
                    input_ids = batch['input_ids'].to(device)
                    # token_type_ids = batch['token_type_ids'].to(device)
                    attention_mask = batch['attention_mask'].to(device)
                    labels = batch['labels'].to(device)
                    outputs = model(input_ids,
                                    # token_type_ids=token_type_ids,
                                    attention_mask=attention_mask,
                                    labels=labels)
                    loss = criterion(outputs[1], labels)
                    loss_value += loss.item()
                    preds = torch.argmax(F.log_softmax(outputs[1], dim=1), dim=-1)

                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                    matches += (preds == labels).sum().item()

                    if (idx + 1) % log_interval == 0:
                        train_loss = loss_value / log_interval
                        train_acc = matches / tr_batch_size / log_interval
                        current_lr = get_lr(optimizer)
                        print(
                            f"Epoch[{epoch}/{epochs}]({idx + 1}/{len(train_loader)}) || "
                            f"training loss {train_loss:4.4} || training accuracy {train_acc:4.2%} || lr {current_lr}"
                        )
                        logger.add_scalar("Train/loss", train_loss, epoch * len(train_loader) + idx)
                        logger.add_scalar("Train/accuracy", train_acc, epoch * len(train_loader) + idx)
                        logger.add_scalar("Train/lr", current_lr, epoch * len(train_loader) + idx)
                        loss_value = 0
                        matches = 0

                optimizer.step()
                if scheduler is not None:
                    scheduler.step()

                model.eval()
                with torch.no_grad():
                    print("Calculating validation results...")
                    val_loss_items = []
                    val_acc_items = []
                    for idx, val_batch in enumerate(val_loader):
                        input_ids = val_batch['input_ids'].to(device)
                        # token_type_ids = val_batch['token_type_ids'].to(device)
                        attention_mask = val_batch['attention_mask'].to(device)
                        labels = val_batch['labels'].to(device)
                        outputs = model(input_ids,
                                        # token_type_ids=token_type_ids,
                                        attention_mask=attention_mask,
                                        labels=labels)
                        preds = torch.argmax(F.log_softmax(outputs[1], dim=1), dim=-1)
                        loss_item = outputs[0].item()
                        correct = preds.eq(labels)
                        acc_item = correct.sum().item()
                        val_loss_items.append(loss_item)
                        val_acc_items.append(acc_item)

                    val_loss = np.sum(val_loss_items) / len(val_loader)
                    val_acc = np.sum(val_acc_items) / len(val_label)
                    best_val_loss = min(best_val_loss, val_loss)
                    if val_acc > best_val_acc:
                        print(f"New best model for val accuracy : {val_acc:4.2%}! saving the best model..")
                        torch.save(model.state_dict(), f"./{model_dir}/best.pt")
                        best_val_acc = val_acc
                    torch.save(model.state_dict(), f"./{model_dir}/last.pt")
                    print(
                        f"[Val] acc : {val_acc:4.2%}, loss: {val_loss:4.2} || "
                        f"best acc : {best_val_acc:4.2%}, best loss: {best_val_loss:4.2}"
                    )
                    logger.add_scalar("Val/loss", val_loss, epoch)
                    logger.add_scalar("Val/accuracy", val_acc, epoch)
                    print()

            with open(f"./{model_dir}/config.yaml", 'w') as file:
                documents = yaml.dump(cfg.values, file)

            k += 1
            if cfg.values.val_args.fold_break:
                break
client_do = dospacesboto3.digital_ocean_client_init()
print('DO client initialized')

# print("Here is a list of the files contained in the spaces")
list_of_files = dospacesboto3.get_list_of_files(client_do, s_spaces_name,
                                                folder_prefix='Pictures')
# TODO: print the file list better
# print(list_of_files)
# cpprint(list_of_files)

print('Get list of files using pagination')
list_of_files_pagination = dospacesboto3.get_list_of_files_using_pagination(
    client_do, s_spaces_name, folder_prefix='new-folder')
# print(list_of_files)
cpprint(list_of_files_pagination)

print("Uploading file")
dospacesboto3.upload_file(client_do, s_local_file_path, s_spaces_name,
                          'new-folder/polarbear_1920x1080.jpeg')
print("File uploaded to DigitalOcean")

print('Downloading file')
dospacesboto3.download_file(client_do, s_local_file_name, s_spaces_name,
                            'new-folder/polarbear_1920x1080.jpeg')
print('File downloaded')
import yaml
from prettyprinter import cpprint

with open("specs/0.1/api.yaml", 'r') as stream:
    model = yaml.safe_load(stream)

cpprint(model)

version = model.get('version')
kresources = list(model.get('resources').keys())
kmodels = list(model.get('models').keys())
kapis = list(model.get('apis').keys())
kendpoints = list(model.get('endpoints').keys())

cpprint(version)
cpprint('apis: ' + str(kapis))
cpprint('endpoints: ' + str(kendpoints))
cpprint('models: ' + str(kmodels))
cpprint('resources: ' + str(kresources))
def train(cfg):
    SEED = cfg.values.seed
    MODEL_NAME = cfg.values.model_name
    USE_KFOLD = cfg.values.val_args.use_kfold
    TRAIN_ONLY = cfg.values.train_only

    seed_everything(SEED)
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # model_config_module = getattr(import_module('transformers'), cfg.values.model_arc + 'Config')
    model_config = AutoConfig.from_pretrained(MODEL_NAME)
    model_config.num_labels = 42

    whole_df = load_data("/opt/ml/input/data/train/train.tsv")
    additional_df = load_data("/opt/ml/input/data/train/additional_train.tsv")
    whole_label = whole_df['label'].values
    # additional_label = additional_df['label'].values

    if cfg.values.tokenizer_arc:
        tokenizer_module = getattr(import_module('transformers'), cfg.values.tokenizer_arc)
        tokenizer = tokenizer_module.from_pretrained(MODEL_NAME)
    else:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    early_stopping = EarlyStoppingCallback(early_stopping_patience=9999999,
                                           early_stopping_threshold=0.001)

    training_args = TrainingArguments(
        output_dir=cfg.values.train_args.output_dir,                          # output directory
        save_total_limit=cfg.values.train_args.save_total_limit,              # number of checkpoints to keep
        save_steps=cfg.values.train_args.save_steps,                          # model saving step
        num_train_epochs=cfg.values.train_args.num_epochs,                    # total number of training epochs
        learning_rate=cfg.values.train_args.lr,                               # learning rate
        per_device_train_batch_size=cfg.values.train_args.train_batch_size,   # batch size per device during training
        per_device_eval_batch_size=cfg.values.train_args.eval_batch_size,     # batch size for evaluation
        warmup_steps=cfg.values.train_args.warmup_steps,                      # number of warmup steps for learning rate scheduler
        weight_decay=cfg.values.train_args.weight_decay,                      # strength of weight decay
        max_grad_norm=cfg.values.train_args.max_grad_norm,
        logging_dir=cfg.values.train_args.logging_dir,                        # directory for storing logs
        logging_steps=cfg.values.train_args.logging_steps,                    # log saving step
        evaluation_strategy=cfg.values.train_args.evaluation_strategy,        # evaluation strategy to adopt during training
        # `no`: no evaluation during training
        # `steps`: evaluate every `eval_steps`
        # `epoch`: evaluate at the end of every epoch
        eval_steps=cfg.values.train_args.eval_steps,                          # evaluation step
        dataloader_num_workers=4,
        seed=SEED,
        label_smoothing_factor=cfg.values.train_args.label_smoothing_factor,
        load_best_model_at_end=True,
        # metric_for_best_model='accuracy'
    )

    if USE_KFOLD:
        kfold = StratifiedKFold(n_splits=cfg.values.val_args.num_k)
        k = 1
        for train_idx, val_idx in kfold.split(whole_df, whole_label):
            print('\n')
            cpprint('=' * 15 + f'{k}-Fold Cross Validation' + '=' * 15)
            train_df = whole_df.iloc[train_idx]
            # train_df = pd.concat((train_df, additional_df))
            val_df = whole_df.iloc[val_idx]

            if cfg.values.model_arc == 'Roberta':
                tokenized_train = roberta_tokenized_dataset(train_df, tokenizer)
                tokenized_val = roberta_tokenized_dataset(val_df, tokenizer)
            else:
                tokenized_train = tokenized_dataset(train_df, tokenizer)
                tokenized_val = tokenized_dataset(val_df, tokenizer)

            RE_train_dataset = RE_Dataset(tokenized_train, train_df['label'].values)
            RE_val_dataset = RE_Dataset(tokenized_val, val_df['label'].values)

            try:
                if cfg.values.model_name == 'Bert':
                    model = BertForSequenceClassification.from_pretrained(MODEL_NAME, config=model_config)
                else:
                    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=model_config)
            except Exception:
                # model_module = getattr(import_module('transformers'), cfg.values.model_arc)
                model_module = getattr(import_module('transformers'),
                                       cfg.values.model_arc + 'ForSequenceClassification')
                model = model_module.from_pretrained(MODEL_NAME, config=model_config)

            model.to(device)
            training_args.output_dir = cfg.values.train_args.output_dir + f'/{k}fold'
            training_args.logging_dir = cfg.values.train_args.output_dir + f'/{k}fold'

            optimizer = MADGRAD(model.parameters(), lr=training_args.learning_rate)
            total_step = len(RE_train_dataset) / training_args.per_device_train_batch_size \
                * training_args.num_train_epochs
            scheduler = transformers.get_linear_schedule_with_warmup(
                optimizer,
                num_warmup_steps=training_args.warmup_steps,
                num_training_steps=total_step)
            optimizers = optimizer, scheduler

            trainer = Trainer(
                model=model,                      # the instantiated 🤗 Transformers model to be trained
                args=training_args,               # training arguments, defined above
                train_dataset=RE_train_dataset,   # training dataset
                eval_dataset=RE_val_dataset,      # evaluation dataset
                compute_metrics=compute_metrics,  # define metrics function
                optimizers=optimizers,
                # callbacks=[early_stopping]
            )

            k += 1
            # train model
            trainer.train()
    else:
        cpprint('=' * 20 + 'START TRAINING' + '=' * 20)
        if not TRAIN_ONLY:
            train_df, val_df = train_test_split(whole_df,
                                                test_size=cfg.values.val_args.test_size,
                                                random_state=SEED)
            # train_df = pd.concat((train_df, additional_df))

            if cfg.values.model_arc == 'Roberta':
                tokenized_train = roberta_tokenized_dataset(train_df, tokenizer)
                tokenized_val = roberta_tokenized_dataset(val_df, tokenizer)
            else:
                tokenized_train = tokenized_dataset(train_df, tokenizer)
                tokenized_val = tokenized_dataset(val_df, tokenizer)

            RE_train_dataset = RE_Dataset(tokenized_train, train_df['label'].values)
            RE_val_dataset = RE_Dataset(tokenized_val, val_df['label'].values)

            try:
                if cfg.values.model_name == 'Bert':
                    model = BertForSequenceClassification.from_pretrained(MODEL_NAME, config=model_config)
                else:
                    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=model_config)
            except Exception:
                # model_module = getattr(import_module('transformers'), cfg.values.model_arc)
                model_module = getattr(import_module('transformers'),
                                       cfg.values.model_arc + 'ForSequenceClassification')
                model = model_module.from_pretrained(MODEL_NAME, config=model_config)

            model.to(device)

            optimizer = transformers.AdamW(model.parameters(), lr=training_args.learning_rate)
            total_step = len(RE_train_dataset) / training_args.per_device_train_batch_size \
                * training_args.num_train_epochs
            # scheduler = transformers.get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=training_args.warmup_steps, num_training_steps=total_step)
            scheduler = transformers.get_linear_schedule_with_warmup(
                optimizer,
                num_warmup_steps=training_args.warmup_steps,
                num_training_steps=total_step)
            optimizers = optimizer, scheduler

            trainer = Trainer(
                model=model,                      # the instantiated 🤗 Transformers model to be trained
                args=training_args,               # training arguments, defined above
                train_dataset=RE_train_dataset,   # training dataset
                eval_dataset=RE_val_dataset,      # evaluation dataset
                compute_metrics=compute_metrics,  # define metrics function
                optimizers=optimizers,
                callbacks=[early_stopping])

            # train model
            trainer.train()
        else:
            training_args.evaluation_strategy = 'no'

            if cfg.values.model_arc == 'Roberta':
                print('Roberta')
                tokenized_train = roberta_tokenized_dataset(whole_df, tokenizer)
            else:
                tokenized_train = tokenized_dataset(whole_df, tokenizer)

            RE_train_dataset = RE_Dataset(tokenized_train, whole_df['label'].values)

            try:
                model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=model_config)
            except Exception:
                # model_module = getattr(import_module('transformers'), cfg.values.model_arc)
                model_module = getattr(import_module('transformers'),
                                       cfg.values.model_arc + 'ForSequenceClassification')
                model = model_module.from_pretrained(MODEL_NAME, config=model_config)

            model.to(device)
            training_args.output_dir = cfg.values.train_args.output_dir + '/only_train'
            training_args.logging_dir = cfg.values.train_args.output_dir + '/only_train'

            optimizer = AdamP(model.parameters(), lr=training_args.learning_rate)
            total_step = len(RE_train_dataset) / training_args.per_device_train_batch_size \
                * training_args.num_train_epochs
            scheduler = transformers.get_linear_schedule_with_warmup(
                optimizer,
                num_warmup_steps=training_args.warmup_steps,
                num_training_steps=total_step)
            optimizers = optimizer, scheduler

            trainer = Trainer(
                model=model,                      # the instantiated 🤗 Transformers model to be trained
                args=training_args,               # training arguments, defined above
                train_dataset=RE_train_dataset,   # training dataset
                optimizers=optimizers,
                # callbacks=[early_stopping]
            )

            # train model
            trainer.train()
def main(cfg):
    train(cfg)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--config_file_path', type=str, default='./config.yml')
    parser.add_argument('--config', type=str, default='base')
    args = parser.parse_args()

    cfg = YamlConfigManager(args.config_file_path, args.config)
    cpprint(cfg.values, sort_dict_keys=False)
    print('\n')
    main(cfg)
import requests
# pretty printing
from prettyprinter import cpprint

# Xinhua dictionary API
url = 'https://www.pwxcoo.com/dictionary'

# xiehouyu (two-part allegorical saying): `riddle` is the first half of the saying
key = input('Enter a keyword: ')
params = {'type': 'xiehouyu', 'riddle': key}
# idiom
# params = {'type': 'idiom', 'riddle': '兴高采烈'}
# pinyin abbreviation
# params = {'type': 'idiom', 'riddle': 'xgcl'}
# single Chinese character
# params = {'type': 'word', 'riddle': '王'}

r = requests.get(url=url, params=params)
data = r.json()
cpprint(data)
def log(msg) -> None:
    if __debug__:
        cpprint(msg)