def fit(model, train_dataloader, test_dataloader, optimizer, num_train_steps, num_epochs=5):
    time_string = strftime("%a%d%b%Y-%H%M%S", gmtime())
    writer_name = os.path.join(ROOT, 'log', time_string)
    print(writer_name)
    writer = SummaryWriter(writer_name)
    n_gpu = 1
    global_step = 0
    output_model_file = os.path.join(ROOT, 'save', "finetuned_pytorch_model_3.bin")
    t_total = num_train_steps
    # Load a trained model that you have fine-tuned
    # model_state_dict = torch.load(output_model_file)
    # model = BertForSequenceClassification.from_pretrained(args['bert_model'], num_labels=num_labels, state_dict=model_state_dict)
    # model.to(device)
    model.train()
    for i_ in trange(int(num_epochs), desc="Epoch"):
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
            model.train()
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch
            loss = model(input_ids, segment_ids, input_mask, label_ids)
            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu
            if args['gradient_accumulation_steps'] > 1:
                loss = loss / args['gradient_accumulation_steps']
            if args['fp16']:
                optimizer.backward(loss)
            else:
                loss.backward()
            tr_loss += loss.item()
            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1
            if (step + 1) % args['gradient_accumulation_steps'] == 0:
                # scheduler.batch_step()
                # Modify learning rate with the special warm-up BERT uses:
                # lr_this_step = args['learning_rate'] * warmup_linear(global_step / t_total, args['warmup_proportion'])
                # for param_group in optimizer.param_groups:
                #     param_group['lr'] = lr_this_step
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1
            if (step + 1) % 10 == 0:
                logger.info('Epoch {} Step {} Loss {}'.format(i_, step, tr_loss / nb_tr_steps))
                writer.add_scalar('loss', torch.mean(loss).detach().cpu().numpy(), step)
            if (step + 1) % 100 == 0:
                r = set_eval(model, test_dataloader)
                writer.add_scalar('evalloss', r['eval_loss'], step)
                writer.add_scalar('evalacc', r['eval_accuracy'], step)
        # logger.info('Eval after epoch {}'.format(i_ + 1))
    # Save the trained model
    model_to_save = model.module if hasattr(model, 'module') else model  # only save the model itself
    torch.save(model_to_save.state_dict(), output_model_file)
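# Hedged sketch: fit() assumes several module-level names defined elsewhere in the
# project (ROOT, device, args, logger, SummaryWriter, tqdm/trange, set_eval). The setup
# below only illustrates what that environment likely looks like; ROOT, the logger name
# and the args values are assumptions, and args needs more keys than shown here
# (e.g. 'learning_rate', 'output_dir').
import os
import logging
import torch
from time import strftime, gmtime
from tqdm import tqdm, trange
from tensorboardX import SummaryWriter  # torch.utils.tensorboard also provides SummaryWriter

ROOT = os.path.dirname(os.path.abspath(__file__))          # assumed project root
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger = logging.getLogger(__name__)
args = {'gradient_accumulation_steps': 1, 'fp16': False}   # minimal subset of the config used by fit()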
def midis_to_sequences(folder, total_num=None, split_interval=1):
    """Create tensors from a midi folder, using music21 and torch.

    Args:
        folder: str, full or relative folder path
        total_num: int, if assigned, the first dimension of the output tensor
            will be `min(num_of_midis, total_num)`
        split_interval: int

    Returns:
        sequences: list of torch.Tensor, [total_num, 3, sequence_length, 3].
            Different sequences will have different sequence_length.
    """
    logger.info("Prepare to create tensor from midis, folder {}".format(folder))
    # Set up parameters
    filenames = os.listdir(folder)
    sequences = []
    # Enter main loop
    for count, filename in tqdm(enumerate(filenames), total=len(filenames)):
        if total_num is not None and count >= total_num:
            break
        filepath = os.path.join(folder, filename)
        sequences.append(midi_to_sequence(filepath, split_interval))
    logger.info("Creation completed.")
    return sequences
def sequences_to_midis(sequences, split_interval=1, tune='E minor', folder=None):
    """Create midis from sequences; the most compatible version.

    NOTE: the output path is currently fixed to outputs/.

    Args:
        sequences: list of torch.Tensor, [total_num, 3, sequence_length, 3]
        split_interval: scalar, indicates the frequency of notes on reconstruction
        tune: str, 'X xxxxx', e.g. 'E minor'
        folder: str, folder to save midis

    Returns:
        None
    """
    assert isinstance(sequences, list), "wrong sequences class, got {}".format(
        sequences.__class__.__name__)
    logger.info("{} midis to create".format(len(sequences)))
    # Create folder if None
    if folder is None:
        rq = time.strftime('%Y%m%d%H%M', time.localtime(time.time()))
        logger.warning("folder name not assigned, will use current time {}".format(rq))
        folder = 'outputs/' + rq
    if not os.path.exists(folder):
        os.mkdir(folder)
    # Enter main loop
    for num, sequence in tqdm(enumerate(sequences), total=len(sequences)):
        sequence_to_midi(sequence, split_interval, tune, folder, name=str(num) + '.mid')
    logger.info("Midi creation completed")
def getBatches_test():
    logger.info("getBatches_test() started")
    sequences = midis_to_sequences('/home/hades/Documents/simple_data')
    batches, targets, batch_size = getBatches(sequences)
    assert isinstance(batches, list)
    assert isinstance(batches[0], torch.Tensor)
    assert batches[0].size()[0] == 3
    assert batches[0].size()[2] == batch_size
    assert batches[0].size()[3] == 42
    assert isinstance(targets, list)
    assert isinstance(targets[0], torch.Tensor)
    assert targets[0].size()[0] == 3
    assert targets[0].size()[2] == batch_size
    assert targets[0].size()[3] == 3
    tensors = sequences_to_tensors(sequences)
    batches, targets, batch_size = getBatches(tensors)
    assert isinstance(batches, list)
    assert isinstance(batches[0], torch.Tensor)
    assert batches[0].size()[0] == 3
    assert batches[0].size()[2] == batch_size
    assert batches[0].size()[3] == 42
    assert isinstance(targets, list)
    assert isinstance(targets[0], torch.Tensor)
    assert targets[0].size()[0] == 3
    assert targets[0].size()[2] == batch_size
    assert targets[0].size()[3] == 3
    logger.info("getBatches_test() passed")
def jdPhone_spider(args, url, beginPage, endPage):
    if args.mode == 'update':
        global phone_num
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.5,zh;q=0.3',
            'Referer': 'https://www.jd.com/',
            'DNT': '1',
            # 'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'TE': 'Trailers',
        }
        with open(args.update_file, 'rb') as f:
            update_list = pickle.load(f)
        for ID in update_list:
            new_url = 'https://item.jd.com/' + str(ID) + '.html'
            row = one_phone(args, new_url, headers)
            logger.info('succeeded in crawling %s features' % len(row))
            mysql_tool.auto_save_data(row, args.table_name)
            phone_num += 1
    else:
        for page in range(beginPage, endPage + 1):
            pn = page * 2 - 1
            page_num = page
            logger.info("crawling page No. " + str(page))
            fullurl = url + "&page=" + str(pn)
            time.sleep(2)
            load_page(args, fullurl)
def midis_to_sequences_test():
    logger.info("midis_to_sequences_test() started")
    sequences = midis_to_sequences('/home/hades/Documents/simple_data')
    assert isinstance(sequences, list)
    assert isinstance(sequences[0], torch.Tensor)
    assert sequences[0].size()[0] == 3
    assert sequences[0].size()[2] == 3
    logger.info("midis_to_sequences_test() passed")
def save_test():
    logger.info("save_test() started")
    sequences = midis_to_sequences('/home/hades/Documents/simple_data')
    path = save(sequences, 'sequences.test')
    sequences = load(path)
    assert isinstance(sequences, list)
    assert isinstance(sequences[0], torch.Tensor)
    assert sequences[0].size()[0] == 3
    assert sequences[0].size()[2] == 3
    logger.info("save_test() passed")
def get_train_examples(self, data_dir, size=-1):
    filename = 'train.csv'
    logger.info("LOOKING AT {}".format(os.path.join(data_dir, filename)))
    if size == -1:
        data_df = pd.read_csv(os.path.join(data_dir, filename), engine=None)
        return self._create_examples(data_df, "train")
    else:
        data_df = pd.read_csv(os.path.join(data_dir, filename))
        return self._create_examples(data_df.sample(size), "train")
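# Hedged sketch: _create_examples() is not shown in this section. Judging from how the
# examples are consumed in convert_examples_to_features() below (fields guid, text_a,
# text_b, labels), a minimal version of the processor method probably looks like this;
# the 'text' and 'label' column names are assumptions about the CSV layout.
def _create_examples_sketch(self, data_df, set_type):
    examples = []
    for i, row in data_df.iterrows():
        examples.append(
            InputExample(guid="%s-%s" % (set_type, i),
                         text_a=str(row['text']),   # assumed text column
                         text_b=None,
                         labels=row['label']))      # assumed label column
    return examples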
def load(path):
    """Convenience wrapper around torch.load.

    Args:
        path: full or relative path of the file to be loaded

    Returns:
        obj: Any
    """
    with open(path, 'rb+') as f:
        obj = torch.load(f)
    logger.info("a {} object loaded from {}".format(obj.__class__.__name__, path))
    return obj
def save(obj, filename):
    """Convenience wrapper around torch.save.

    Args:
        obj: Any
        filename: str, obj will be saved to `current_path/saves/<time>/<filename>`

    Returns:
        path: str, full path of the file saved
    """
    rq = time.strftime('%Y%m%d%H%M', time.localtime(time.time()))
    path = 'saves/' + rq + '/' + filename
    # Make sure the timestamped directory exists before writing
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'wb+') as f:
        torch.save(obj, f)
    logger.info("a {} object saved to {}".format(obj.__class__.__name__, path))
    return path
def set_eval(model, eval_dataloader):
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args["no_cuda"] else "cpu")
    count = 0
    for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        label_ids = label_ids.to(device)
        with torch.no_grad():
            tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids)
            logits = model(input_ids, segment_ids, input_mask)
        logits = logits.detach().cpu().numpy()
        label_ids = label_ids.to('cpu').numpy()
        tmp_eval_accuracy = accuracy(logits, label_ids)
        eval_loss += tmp_eval_loss.mean().item()
        eval_accuracy += tmp_eval_accuracy
        nb_eval_examples += input_ids.size(0)
        nb_eval_steps += 1
        count += 1
        if count >= 4:
            break
    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_examples
    # loss = tr_loss / nb_tr_steps if tr_loss else None
    result = {
        'eval_loss': eval_loss,
        'eval_accuracy': eval_accuracy,
        'global_step': 0
    }  # 'loss': loss}
    output_eval_file = os.path.join(args['output_dir'], "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results *****")
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))
    return result
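# Hedged sketch: set_eval() relies on an accuracy() helper that is not shown here. A
# minimal implementation consistent with its usage (numpy logits vs. integer label ids,
# returning a per-batch count that is later divided by nb_eval_examples) would be:
import numpy as np

def accuracy(out, labels):
    """Number of correct predictions in the batch (argmax over the label dimension)."""
    outputs = np.argmax(out, axis=1)
    return np.sum(outputs == labels)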
def load_page(args, url):
    global mysql_tool
    global phone_num
    global tmp_dict
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.5,zh;q=0.3',
        'Referer': 'https://www.jd.com/',
        'DNT': '1',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'TE': 'Trailers',
    }
    params = (
        ('keyword', '手机'),  # '手机' = "mobile phone"
        ('enc', 'utf-8'),
        ('wq', '手机'),
        ('pvid', '70b2126fcf3246ce9f32710d41799ede'),
    )
    response = requests.get(url, headers=headers, params=params)
    html = response.content
    content = etree.HTML(html)
    content_list = content.xpath(
        '//div[@class="gl-i-wrap"]/div[@class="p-img"]/a/@href')
    for i in range(1, 31):
        try:
            result = re.split(r":", content_list[i - 1])[1]
            content_list[i - 1] = result
        except Exception as e:
            continue
    for j in content_list:
        new_url = "http:" + j
        logger.info('trying to crawl No. %s phone info...' % phone_num)
        phone_num += 1
        row = one_phone(args, new_url, headers)
        print(len(row))
        if args.mode == 'debug':
            tmp_dict[UNIKEY] = row
        else:
            if phone_num == 1:
                mysql_tool.auto_create_table(row, 'jd_phone_raw')
            mysql_tool.auto_save_data(row, 'jd_phone_raw')
        phone_num += 1
def convert_single(single_text, max_seq_length=None, tokenizer=None) -> InputFeatures:
    if tokenizer is None:
        tokenizer = BertTokenizer.from_pretrained(
            'bert-base-uncased', do_lower_case=args['do_lower_case'])
    if max_seq_length is None:
        max_seq_length = args['max_seq_length']
    tokens_a = tokenizer.tokenize(single_text)
    if len(tokens_a) > max_seq_length - 2:
        tokens_a = tokens_a[:(max_seq_length - 2)]
    tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
    segment_ids = [0] * len(tokens)
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)
    # Zero-pad up to the sequence length.
    padding = [0] * (max_seq_length - len(input_ids))
    input_ids += padding
    input_mask += padding
    segment_ids += padding
    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length
    logger.info("*** Example ***")
    logger.info("tokens: %s" % " ".join([str(x) for x in tokens]))
    logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
    logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
    logger.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
    return InputFeatures(input_ids=input_ids,
                         input_mask=input_mask,
                         segment_ids=segment_ids,
                         label_ids=0)
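# Hedged usage sketch for convert_single(): turn one raw string into model inputs and run
# a forward pass. `model` is assumed to be a fine-tuned BertForSequenceClassification
# already moved to `device`; the helper name and return value are illustrative only.
def predict_single(model, text):
    features = convert_single(text)
    input_ids = torch.tensor([features.input_ids], dtype=torch.long).to(device)
    input_mask = torch.tensor([features.input_mask], dtype=torch.long).to(device)
    segment_ids = torch.tensor([features.segment_ids], dtype=torch.long).to(device)
    model.eval()
    with torch.no_grad():
        logits = model(input_ids, segment_ids, input_mask)
    return int(logits.argmax(dim=-1).item())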
def train():
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=args['do_lower_case'])
    train_examples = None
    num_train_steps = None
    processors = {
        "news_cat_label": LabelTextProcessor
    }
    processor = processors[args['task_name'].lower()](args['data_dir'])
    if args['do_train']:
        train_examples = processor.get_train_examples(args['full_data_dir'], size=args['train_size'])
        num_train_steps = int(
            len(train_examples) / args['train_batch_size'] / args['gradient_accumulation_steps']
            * args['num_train_epochs'])
    eval_examples = processor.get_dev_examples(args['data_dir'], size=args['val_size'])

    train_features = convert_examples_to_features(train_examples, label_list, args['max_seq_length'], tokenizer)
    all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_ids for f in train_features], dtype=torch.long)
    train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args['train_batch_size'])

    eval_features = convert_examples_to_features(eval_examples, label_list, args['max_seq_length'], tokenizer)
    logger.info("***** Running evaluation *****")
    logger.info("  Num examples = %d", len(eval_examples))
    logger.info("  Batch size = %d", args['eval_batch_size'])
    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_ids for f in eval_features], dtype=torch.long)
    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    # Run prediction for full data
    eval_sampler = RandomSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args['eval_batch_size'])

    model = BertForSequenceClassification.from_pretrained(args['bert_model'], num_labels=len(label_list))
    model = model.to(device)
    _, optimizer = opt.get_opt(model, num_train_steps)
    fit(model, train_dataloader, eval_dataloader, optimizer, num_train_steps, args['num_train_epochs'])
def train(self, episodes=-1):
    # Hacky...
    if episodes < 0:
        episodes = self.max_episodes
    episode = 0
    all_rewards = []
    try:
        # Set this to "while True" for genuine convergence
        for e in range(episodes):
            # Start episode
            episode_reward = 0
            self.episode_count = e
            t = 0
            state = self.env.reset()
            state = np.reshape(state, [1, self.n_features])
            while True:
                # self.env.render()
                # Select action
                action = self._select_action(state)
                # Execute transition
                next_state, reward, done, info = self.env.step(action)
                episode_reward += reward
                next_state = np.reshape(next_state, [1, self.n_features])
                # Store experience tuple in memory
                self.memory.append((state, action, reward, next_state, done))
                state = next_state
                # Replay using mini batch
                self._update_Q()
                # Copy learned Q function into target network
                if t % self.net_replacement_freq == 0:
                    self.Q_ = clone_model(self.Q)
                    self.Q_.set_weights(self.Q.get_weights())
                t += 1
                if done:
                    break
            all_rewards.append(episode_reward)
            sma = np.mean(all_rewards[-SMA_WINDOW:])
            logger.info('{},{},{},{}'.format(episode, episode_reward, self.epsilon, sma))
            episode += 1
            # Episodic epsilon decay
            if not self.epsilon_special:
                if self.epsilon > self.epsilon_min:
                    self.epsilon *= self.epsilon_decay_rate
            # Special case: stepwise epsilon decay
            else:
                if episode < 150:
                    self.epsilon = 1.0
                elif episode < 250:
                    self.epsilon = 0.5
                else:
                    self.epsilon = 0.0
            # Convergence
            if sma >= 200:
                self.solved = True
                break
    except KeyboardInterrupt:
        logger.info('KeyboardInterrupt: halting training')
    finally:
        plot(all_rewards)
        self._save_model()
    return all_rewards
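# Hedged sketch: train() above calls self._update_Q(), which is not shown in this
# section. For a Keras DQN agent like this one (replay memory in self.memory, online
# network self.Q, frozen target network self.Q_), a typical mini-batch replay update
# might look like the following; self.batch_size and self.gamma are assumed attribute
# names, and `random` / `numpy as np` are assumed module-level imports.
def _update_Q_sketch(self):
    if len(self.memory) < self.batch_size:
        return
    minibatch = random.sample(self.memory, self.batch_size)
    states = np.vstack([m[0] for m in minibatch])
    next_states = np.vstack([m[3] for m in minibatch])
    q_values = self.Q.predict(states)       # current estimates for the sampled states
    q_next = self.Q_.predict(next_states)   # bootstrap targets from the target network
    for i, (state, action, reward, next_state, done) in enumerate(minibatch):
        target = reward if done else reward + self.gamma * np.max(q_next[i])
        q_values[i][action] = target
    self.Q.fit(states, q_values, epochs=1, verbose=0)  # one gradient step on the batch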
args = parser.parse_args()
init_logger(args.log_file)
beginPage = int(args.begin)
endPage = int(args.end)
url = "https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8"
global mysql_tool
mysql_tool = mysql_tool(args, 'localhost', 'root', '20192019_yhf', 3306, 'spiders', logger=logger)
global phone_num
phone_num = 1
global tmp_dict
tmp_dict = {}
global page_num
page_num = args.begin
if args.mode in ['crawl', 'sparse_table']:
    while page_num <= args.end:
        try:
            jdPhone_spider(args, url, beginPage, endPage)
        except:
            logger.info('trying to connect again!')
else:
    jdPhone_spider(args, url, beginPage, endPage)
if args.mode == 'debug':
    with open('result/debug.pk', 'wb') as f:
        pickle.dump(tmp_dict, f)
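# Hedged sketch, for reference: the `parser` consumed above is not shown in this section.
# Based on the attributes accessed here and in jdPhone_spider()/load_page() (mode, begin,
# end, log_file, update_file, table_name), it probably looks roughly like the builder
# below; flag defaults and help strings are assumptions.
import argparse

def build_parser_sketch():
    parser = argparse.ArgumentParser(description='JD phone spider')
    parser.add_argument('--mode', choices=['crawl', 'update', 'debug', 'sparse_table'], default='crawl')
    parser.add_argument('--begin', default='1', help='first search-result page to crawl')
    parser.add_argument('--end', default='1', help='last search-result page to crawl')
    parser.add_argument('--log_file', default='spider.log')
    parser.add_argument('--update_file', default='result/update.pk', help='pickled list of product IDs to refresh')
    parser.add_argument('--table_name', default='jd_phone_raw')
    return parser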
def train(self, env):
    # Track rewards
    all_rewards = []
    try:
        while True:
            state = env.reset()
            state = [
                round(s, PRECISION) for s in state[:CONTINUOUS_OBSERVATIONS]
            ]
            action = self._query_initial(
                state, env.discrete_obs_space
            )  # set the state and get first action
            episode_return = 0
            steps = 0
            total_Q_update = 0
            while True:
                new_state, reward, done, details = env.step(action)
                new_state = [
                    round(s, PRECISION) for s in new_state[:CONTINUOUS_OBSERVATIONS]
                ]
                # env.render()
                episode_return += reward
                # if steps % 10 == 0:
                #     print([x for x in new_state])
                #     print("step {} total_reward {:+0.2f}".format(steps, episode_return))
                steps += 1
                if done:
                    break
                action, delta_Q = self._query(state, action, new_state, reward,
                                              env.discrete_obs_space)
                total_Q_update += delta_Q
            all_rewards.append(episode_return)
            sma = np.mean(all_rewards[-SMA_WINDOW:])
            if self.episodes % 10 == 0:
                if self.episodes >= SMA_WINDOW:
                    logger.info('Episode {} | Reward = {} | SMA = {}'.format(
                        self.episodes, episode_return, sma))
                else:
                    logger.info('Episode {} | Reward = {}'.format(
                        self.episodes, episode_return))
            # Convergence
            if self.episodes > SMA_WINDOW and sma >= SOLUTION_THRESHOLD:
                break
            self.episodes += 1
    except KeyboardInterrupt:
        logger.warning('KeyboardInterrupt - halting training')
    plot(all_rewards, title='Rewards per episode', xlab='Episode', ylab='Reward')
    logger.info('{}% of actions were random'.format(
        round(100. * self.random_actions / self.total_actions, 2)))
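# Hedged sketch: _query() and _query_initial() above are not shown in this section. For a
# tabular Q-learner over discretized (rounded) states, _query() typically applies the
# standard Q-learning update and returns an epsilon-greedy next action together with the
# size of the update. Everything below is illustrative: self.Q as a dict keyed by
# (state, action), and self.alpha, self.gamma, self.epsilon, self.n_actions are assumed
# attribute names; the discrete_obs_space argument is accepted but unused in this
# simplified version, and `random` is an assumed module-level import.
def _query_sketch(self, state, action, new_state, reward, discrete_obs_space):
    s, s2 = tuple(state), tuple(new_state)
    # Q-learning target: r + gamma * max_a' Q(s', a')
    best_next = max(self.Q.get((s2, a), 0.0) for a in range(self.n_actions))
    old_q = self.Q.get((s, action), 0.0)
    delta = self.alpha * (reward + self.gamma * best_next - old_q)
    self.Q[(s, action)] = old_q + delta
    # Epsilon-greedy choice of the next action, tracking how often we act randomly
    self.total_actions += 1
    if random.random() < self.epsilon:
        self.random_actions += 1
        next_action = random.randrange(self.n_actions)
    else:
        next_action = max(range(self.n_actions), key=lambda a: self.Q.get((s2, a), 0.0))
    return next_action, abs(delta)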
def sequences_to_midis_test():
    logger.info("sequences_to_midis_test() started")
    sequences = midis_to_sequences('/home/hades/Documents/simple_data')
    sequences_to_midis(sequences)
    logger.info("sequences_to_midis_test() passed")
def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer):
    """Loads a data file into a list of `InputBatch`s."""
    label_map = {label: i for i, label in enumerate(label_list)}
    features = []
    for (ex_index, example) in enumerate(examples):
        tokens_a = tokenizer.tokenize(example.text_a)
        tokens_b = None
        if example.text_b:
            tokens_b = tokenizer.tokenize(example.text_b)
            # Modifies `tokens_a` and `tokens_b` in place so that the total
            # length is less than the specified length.
            # Account for [CLS], [SEP], [SEP] with "- 3"
            _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
        else:
            # Account for [CLS] and [SEP] with "- 2"
            if len(tokens_a) > max_seq_length - 2:
                tokens_a = tokens_a[:(max_seq_length - 2)]

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids: 0     0   0   0  0     0 0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambiguously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
        segment_ids = [0] * len(tokens)
        if tokens_b:
            tokens += tokens_b + ["[SEP]"]
            segment_ids += [1] * (len(tokens_b) + 1)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)
        # Zero-pad up to the sequence length.
        padding = [0] * (max_seq_length - len(input_ids))
        input_ids += padding
        input_mask += padding
        segment_ids += padding

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        labels_ids = label_map[example.labels]
        if ex_index < 0:
            logger.info("*** Example ***")
            logger.info("guid: %s" % (example.guid))
            logger.info("tokens: %s" % " ".join([str(x) for x in tokens]))
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
            logger.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
            logger.info("label: %s (id = %s)" % (example.labels, labels_ids))

        features.append(
            InputFeatures(input_ids=input_ids,
                          input_mask=input_mask,
                          segment_ids=segment_ids,
                          label_ids=labels_ids))
    return features