def _add_doi(metadata, identifier, citekey=None):
    """Add an entry from a DOI.

    Queries the DOI metadata service, concurrently asks Semantic Scholar
    for a matching arXiv ID, and (if one exists) downloads the arXiv PDF
    into the new paper directory.

    Args:
        metadata: The metadata store; also supplies ``config.library_root``
            and duplicate checks (``doi_exists``).
        identifier: The DOI to add.
        citekey: Optional explicit citekey.  Generated from the metadata
            when ``None``.  (Default added for consistency with
            ``_add_arxiv_id``; positional callers are unaffected.)

    Returns:
        ``(citekey, metadatum, info_messages)`` — the same triple as
        ``_add_arxiv_id``, so callers can treat both entry points
        uniformly.  (Previously this function implicitly returned
        ``None``.)

    Raises:
        ZoiaAddException: If the DOI is already present in the library.
    """
    info_messages = []
    with StatusMessage('Querying DOI metadata...') as message:
        if metadata.doi_exists(identifier):
            raise ZoiaAddException(f'DOI {identifier} already exists.')

        # Query Semantic Scholar for the corresponding arXiv ID (if there
        # is one) in a separate thread so it overlaps the DOI query below.
        arxiv_queue = ThreadQueue()
        arxiv_process = ThreadProcess(
            target=lambda q, x: q.put(requests.get(x)),
            args=(
                arxiv_queue,
                f'https://api.semanticscholar.org/v1/paper/{identifier}',
            ),
        )
        arxiv_process.start()

        doi_metadata = _get_doi_metadata(identifier)
        metadatum = zoia.backend.metadata.Metadatum.from_dict(doi_metadata)

        if citekey is None:
            citekey = zoia.parse.citekey.create_citekey(metadata, metadatum)
        paper_dir = os.path.join(metadata.config.library_root, citekey)
        os.mkdir(paper_dir)

        message.update(
            'Querying Semantic Scholar for corresponding arXiv ID...')
        arxiv_metadata_response = arxiv_queue.get()
        arxiv_process.join()
        arxiv_metadata = json.loads(arxiv_metadata_response.text)
        if (arxiv_id := arxiv_metadata.get('arxivId')) is not None:
            doi_metadata['arxiv_id'] = arxiv_id
            message.update('Downloading PDF from arXiv...')
            pdf_response = requests.get(
                f'https://arxiv.org/pdf/{arxiv_id}.pdf')
            if pdf_response.status_code == 200:
                with open(os.path.join(paper_dir, 'document.pdf'),
                          'wb') as fp:
                    fp.write(pdf_response.content)
                doi_metadata['pdf_md5'] = hashlib.md5(
                    pdf_response.content).hexdigest()
            else:
                info_messages.append('Was unable to fetch a PDF')

        metadata[citekey] = doi_metadata

    return citekey, metadatum, info_messages
def _add_arxiv_id(metadata, identifier, citekey=None):
    """Add an entry from an arXiv ID.

    Starts the PDF download early in a background thread, queries arXiv
    (and DOI metadata when the paper has a DOI), writes the PDF into the
    new paper directory, and records the entry in ``metadata``.

    Args:
        metadata: The metadata store; supplies duplicate checks and
            ``config.library_root``.
        identifier: The arXiv ID to add.
        citekey: Optional explicit citekey; generated when ``None``.

    Returns:
        ``(citekey, metadatum, info_messages)``.

    Raises:
        ZoiaAddException: If the arXiv ID (or the downloaded PDF's MD5
            hash) is already present in the library.
    """
    info_messages = []
    with StatusMessage('Querying arXiv...') as message:
        if metadata.arxiv_id_exists(identifier):
            raise ZoiaAddException(
                f'arXiv paper {identifier} already exists.')

        # Downloading the PDF can take a while, so start it early in a
        # separate thread.
        pdf_queue = ThreadQueue()
        pdf_process = ThreadProcess(
            target=lambda q, x: q.put(requests.get(x)),
            args=(pdf_queue, f'https://arxiv.org/pdf/{identifier}.pdf'),
        )
        pdf_process.start()

        arxiv_metadata = _get_arxiv_metadata(identifier)
        if 'doi' in arxiv_metadata:
            message.update('Querying DOI information...')
            arxiv_metadata.update(_get_doi_metadata(arxiv_metadata['doi']))

        # BUG FIX: build the Metadatum unconditionally.  Previously it was
        # only assigned inside the ``citekey is None`` branch, so supplying
        # an explicit citekey made the final return raise UnboundLocalError.
        metadatum = zoia.backend.metadata.Metadatum.from_dict(arxiv_metadata)
        if citekey is None:
            citekey = zoia.parse.citekey.create_citekey(metadata, metadatum)

        paper_dir = os.path.join(metadata.config.library_root, citekey)
        os.mkdir(paper_dir)

        message.update(text='Downloading PDF...')
        pdf = pdf_queue.get()
        pdf_process.join()

        if pdf.status_code == 200:
            md5_hash = hashlib.md5(pdf.content).hexdigest()
            # Check for a duplicate *before* writing the file so a rejected
            # add does not leave a stray document.pdf behind.
            if metadata.pdf_md5_hash_exists(md5_hash):
                raise ZoiaAddException(
                    f'arXiv paper {identifier} already exists.')
            with open(os.path.join(paper_dir, 'document.pdf'), 'wb') as fp:
                fp.write(pdf.content)
            arxiv_metadata['pdf_md5'] = md5_hash
        else:
            info_messages.append('Was unable to fetch a PDF')

        metadata[citekey] = arxiv_metadata

    return citekey, metadatum, info_messages
def train(layer, logger, shapes, args, e, data_size, trainloader):
    """Pipeline-parallel training loop for a 4-process setup.

    Each distributed rank (0-3) owns one stage (``layer``) of the model:
    activations flow forward 0 -> 1 -> 2 -> 3 and gradients flow backward
    3 -> 2 -> 1 -> 0 via ``transfer``/``dist.recv``.  Ranks 0-2 run their
    backward pass in a separate process fed through ``outputs_queue``.

    NOTE(review): ``shapes[0..2]`` are presumably the activation shapes at
    the three stage boundaries, and ``e`` a multiprocessing Event for final
    synchronization — confirm against the caller.  The semantics of
    ``transfer(tag, send, recv_shape)`` (send then optionally receive) are
    inferred from usage; verify against its definition.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(layer.parameters(), lr=0.01, momentum=0.9,
                          weight_decay=5e-4)
    optimizer.zero_grad()
    layer.train()
    batch_idx = 0

    def backward_rank2():
        # Backward worker for rank 2: receive upstream grads from rank 3,
        # backprop through cached activations, send input grads to rank 1.
        residual = None
        batch_idx = 0
        grad_recv1 = torch.zeros(shapes[2])
        dist.recv(tensor=grad_recv1, src=3)
        while True:
            print(" backward batch_idx:" + str(batch_idx))
            grad_recv1 = grad_recv1.cuda(2)
            try:
                # (input, output) pair cached by the forward loop below.
                inputs, outputs = outputs_queue.get(block=True, timeout=4)
            except Empty:
                print("empty........")
                break
            inputs.requires_grad_()
            outputs.backward(grad_recv1)
            # Step every 3rd micro-batch (gradient accumulation).
            if batch_idx % 3 == 0:
                optimizer.step()
                optimizer.zero_grad()
            batch_idx += 1
            if data_size == batch_idx:
                # Final micro-batch: send only, nothing left to receive.
                transfer(4, inputs.grad.cpu(), None)
                print("backend In send..")
                break
            grad_recv1 = transfer(4, inputs.grad.cpu(), shapes[2])
            print("backward send.......")
        print("backard end....")

    def backward_rank1():
        # Backward worker for rank 1; mirrors backward_rank2 one stage down.
        residual = None
        batch_idx = 0
        grad_recv1 = torch.zeros(shapes[1])
        dist.recv(tensor=grad_recv1, src=2)
        while True:
            print(" backward batch_idx:" + str(batch_idx))
            grad_recv1 = grad_recv1.cuda(1)
            try:
                inputs, outputs = outputs_queue.get(block=True, timeout=4)
            except Empty:
                print("empty........")
                break
            inputs.requires_grad_()
            outputs.backward(grad_recv1)
            if batch_idx % 3 == 0:
                optimizer.step()
                optimizer.zero_grad()
            batch_idx += 1
            if data_size == batch_idx:
                transfer(5, inputs.grad.cpu(), None)
                print("backend In send..")
                break
            grad_recv1 = transfer(5, inputs.grad.cpu(), shapes[1])
            print("backward send.......")
        print("backard end....")

    def backward_rank0(semaphore):
        # Backward worker for rank 0 (first stage): only consumes grads;
        # there is no further downstream stage to forward input grads to.
        batch_idx = 0
        grad_recv = torch.zeros(shapes[0])
        dist.recv(tensor=grad_recv, src=1)
        while True:
            grad_recv = grad_recv.cuda(0)
            print(" backwardbatch_idx:" + str(batch_idx))
            try:
                # "loss" here is actually the cached stage output tensor.
                loss = outputs_queue.get(block=True, timeout=4)
            except Empty:
                print("empty........")
                break
            loss.backward(grad_recv)
            if batch_idx % 3 == 0:
                optimizer.step()
                optimizer.zero_grad()
            batch_idx += 1
            if data_size == batch_idx:
                print("eq...")
                break
            grad_recv = transfer(6, None, shapes[0])
            print("backward send.....")
        print("backward end..")

    if dist.get_rank() == 0:
        # Rank 0: feed real input batches forward; backward runs in a
        # separate process.  NOTE(review): `semaphore` is created but never
        # acquired here, so it does not actually bound the queue.
        outputs_queue = ThreadQueue(args.buffer_size)
        semaphore = Semaphore(args.buffer_size)
        back_process = Process(target=backward_rank0, args=(semaphore, ))
        back_process.start()
        for batch_idx, (inputs, targets) in enumerate(trainloader):
            print("batch: " + str(batch_idx))
            inputs = inputs.cuda(0)
            outputs = layer(inputs)
            outputs_queue.put(outputs)
            #outputs = q_act(outputs, char=True)
            transfer(dist.get_rank(), outputs.cpu(), None)
            print("send........")
        print("start to end....")
        back_process.join()
        e.set()
        print("end....")
    elif dist.get_rank() == 1:
        # Rank 1: receive activations from rank 0, forward through this
        # stage, forward results to rank 2; cache (input, output) for the
        # backward worker.
        outputs_queue = ThreadQueue(args.buffer_size)
        back_process = Process(target=backward_rank1, args=())
        rec_val = torch.zeros(shapes[0])
        dist.recv(tensor=rec_val, src=0)
        #fix bug..
        back_process.start()
        for index, (_, targets) in enumerate(trainloader):
            print("batch_idx:" + str(index))
            rec_val = rec_val.cuda(1)
            rec_val.requires_grad_()
            outputs = layer(rec_val)
            outputs_queue.put([rec_val, outputs])
            if index == data_size - 1:
                transfer(dist.get_rank(), outputs.cpu(), None)
                print("the last send........")
                continue
            rec_val = transfer(dist.get_rank(), outputs.cpu(), shapes[0])
            print("send.................")
        print("start to end....")
        back_process.join()
        e.wait()
        print("end......")
    elif dist.get_rank() == 2:
        # Rank 2: same structure as rank 1, one stage further along.
        outputs_queue = ThreadQueue(args.buffer_size)
        back_process = Process(target=backward_rank2, args=())
        rec_val = torch.zeros(shapes[1])
        dist.recv(tensor=rec_val, src=1)
        back_process.start()
        for index, (_, targets) in enumerate(trainloader):
            print("batch_idx:" + str(index))
            rec_val = rec_val.cuda(2)
            rec_val.requires_grad_()
            outputs = layer(rec_val)
            outputs_queue.put([rec_val, outputs])
            if index == data_size - 1:
                transfer(dist.get_rank(), outputs.cpu(), None)
                print("the last send........")
                continue
            rec_val = transfer(dist.get_rank(), outputs.cpu(),
                               shapes[1])
            print("send.................")
        print("start to end....")
        back_process.join()
        e.wait()
        print("end......")
    elif dist.get_rank() == 3:
        # Rank 3 (last stage): computes the real loss and starts the
        # backward chain by sending input grads back to rank 2.  Runs the
        # backward pass inline (no separate process needed).
        rec_val = None
        residual = None
        train_loss = 0
        correct = 0
        total = 0
        criterion.cuda(3)
        rec_val = torch.zeros(shapes[2])
        dist.recv(tensor=rec_val, src=2)
        for batch_idx, (_, targets) in enumerate(trainloader):
            rec_val = rec_val.cuda(3)
            rec_val.requires_grad_()
            outputs = layer(rec_val)
            # start to backward....
            targets = targets.cuda(3)
            loss = criterion(outputs, targets)
            loss.backward()
            quantize_grad = rec_val.grad.cpu()
            # NOTE(review): metrics accumulate only on stepping iterations
            # (batch_idx % 3 == 0) — likely intentional sampling, confirm.
            if batch_idx % 3 == 0:
                optimizer.step()
                train_loss += loss.item()
                _, predicted = outputs.max(1)
                total += targets.size(0)
                correct += predicted.eq(targets).sum().item()
                progress_bar(
                    batch_idx, data_size,
                    'Loss: %.3f | Acc: %.3f%% (%d/%d)'
                    % (train_loss / (batch_idx + 1),
                       100. * correct / total, correct, total))
                optimizer.zero_grad()
            else:
                progress_bar(
                    batch_idx, data_size,
                    'Loss: %.3f | Acc: %.3f%% (%d/%d)'
                    % (train_loss / (batch_idx + 1),
                       100. * correct / total, correct, total))
                # error level is used so the stats survive log filtering.
                logger.error("train:" + str(train_loss / (batch_idx + 1)))
                acc_str = "tacc: %.3f" % (100. * correct / total, )
                logger.error(acc_str)
            if batch_idx == data_size - 1:
                transfer(dist.get_rank(), quantize_grad, None)
                continue
            rec_val = transfer(dist.get_rank(), quantize_grad, shapes[2])
        #print("\n start to end....")
        e.wait()
        print("end....")
def pipe_dream(layer, logger, args, backward_event, targets_queue, e,
               data_size, trainloader):
    """Two-stage PipeDream-style training loop.

    Rank 0 holds the first model stage and alternates between pushing new
    forward micro-batches (up to 2 in flight, tracked by ``output_queue``)
    and draining a backward step when rank 1 signals ``backward_event``.
    Rank 1 holds the final stage: it computes the loss, steps its own
    optimizer, and sends the input gradient back to rank 0.

    NOTE(review): the activation shape [batch_size, 128, 16, 16] is
    hard-coded on both sides and must match the stage boundary — confirm.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(layer.parameters(), lr=0.01, momentum=0.9,
                          weight_decay=5e-4)
    layer.train()
    if dist.get_rank() == 0:
        criterion.cuda(0)
        output_queue = ThreadQueue(2)
        data_iter = iter(trainloader)
        batch_idx = 0
        while True:
            try:
                if output_queue.qsize() == 2:
                    # Pipeline full: wait until rank 1 has a gradient ready,
                    # then run one backward/step for the oldest activation.
                    backward_event.wait()
                    optimizer.zero_grad()
                    grad = torch.zeros([args.batch_size, 128, 16, 16])
                    dist.recv(tensor=grad, src=1)
                    outputs = output_queue.get()
                    outputs.backward(grad.cuda(0))
                    optimizer.step()
                    backward_event.clear()
                    continue
                else:
                    # Room in the pipeline: send one more forward batch.
                    inputs, targets = next(data_iter)
                    inputs = inputs.cuda(0)
                    targets_queue.put(targets.numpy(), block=False)
                    outputs = layer(inputs)
                    send_opt = dist.isend(tensor=outputs.cpu(), dst=1)
                    send_opt.wait()
                    output_queue.put(outputs)
                    batch_idx += 1
            except StopIteration as stop_e:
                # Data exhausted: send an empty tensor as the end-of-stream
                # marker, then drain the remaining in-flight backwards.
                send_opt = dist.isend(tensor=torch.zeros(0), dst=1)
                send_opt.wait()
                while output_queue.qsize() > 0:
                    #backward_event.wait()
                    optimizer.zero_grad()
                    grad = torch.zeros([args.batch_size, 128, 16, 16])
                    dist.recv(tensor=grad, src=1)
                    outputs = output_queue.get()
                    outputs.backward(grad.cuda(0))
                    optimizer.step()
                    #backward_event.clear()
                break
    elif dist.get_rank() == 1:
        batch_idx = 0
        train_loss = 0
        correct = 0
        total = 0
        criterion.cuda(1)
        while True:
            print("while........................")
            try:
                rec_val = torch.zeros([args.batch_size, 128, 16, 16])
                dist.recv(tensor=rec_val, src=0)
                print("recv.......")
            except RuntimeError as error:
                # A size-mismatched recv (the zero-length end marker)
                # raises RuntimeError; treat it as end of stream.
                print("runtime........................")
                #e.wait()
                break
            rec_val = rec_val.cuda(1)
            rec_val.requires_grad_()
            optimizer.zero_grad()
            outputs = layer(rec_val)
            targets = targets_queue.get(block=True, timeout=2)
            targets = torch.from_numpy(targets).cuda(1)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
            progress_bar(
                batch_idx, data_size,
                'Loss: %.3f | Acc: %.3f%% (%d/%d)'
                % (train_loss / (batch_idx + 1),
                   100. * correct / total, correct, total))
            # Tell rank 0 a gradient is ready before actually sending it.
            if not backward_event.is_set():
                print("set.....")
                backward_event.set()
            send_opt = dist.isend(tensor=rec_val.grad.cpu(), dst=0)
            print("send.....")
            if batch_idx % 10 == 0:
                logger.error("train:" + str(train_loss / (batch_idx + 1)))
            batch_idx += 1
def train(layer, logger, args, grad_queue, targets_queue, e, data_size,
          trainloader, start_event):
    """Two-stage pipeline training with int8-quantized tensors.

    Rank 0 runs the first stage forward, sending quantized activations
    (``q_act``) to rank 1 and backpropagating gradients that rank 1 pushes
    into ``grad_queue`` (handled by a separate backward process).  Rank 1
    runs the final stage, computes loss/accuracy, quantizes its input
    gradient to int8 and returns it through ``grad_queue``.

    NOTE(review): shape [batch_size, 256, 4, 4] is the hard-coded stage
    boundary; ``quantize``/``dequantize``/``q_act``/``dq_act`` semantics
    are assumed symmetric — verify against their definitions.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(layer.parameters(), lr=0.01, momentum=0.9,
                          weight_decay=5e-4)
    optimizer.zero_grad()
    layer.train()

    def backward():
        # Rank-0 backward worker: waits until rank 1 has produced the first
        # gradient, then repeatedly dequantizes and backprops.
        start_event.wait()
        batch_idx = 0
        while True:
            try:
                grad = grad_queue.get(block=True, timeout=1)
                # Earlier variants (sparse, half, byte-packed) kept for
                # reference:
                #quantize_package = grad_queue.get(block=True, timeout=1)
                #grad = dequantize(quantize_package, [args.batch_size, 256, 4, 4])
                #grad = dense(grad, [args.batch_size, 256, 4, 4]).cuda(0)
                #grad = torch.from_numpy(grad).cuda(0).float()
                grad = torch.from_numpy(grad.astype(np.float32)).cuda(0)
                #grad = dequantize(grad, [args.batch_size, 256, 4, 4])
                grad = dequantize(grad)
            except Empty as empty:
                # Queue drained for >1s: assume the epoch is finished.
                print("backward empty.....")
                break
            # "loss" is the cached stage-0 output tensor for this batch.
            loss = outputs_queue.get(block=False)
            loss.backward(grad)
            if batch_idx % args.buffer_size == 0:
                optimizer.step()
                optimizer.zero_grad()
            batch_idx += 1

    if dist.get_rank() == 0:
        criterion.cuda(0)
        outputs_queue = ThreadQueue(args.buffer_size)
        back_process = Process(target=backward)
        back_process.start()
        for batch_idx, (inputs, targets) in enumerate(trainloader):
            print("batch: " + str(batch_idx))
            inputs, targets = inputs.cuda(0), targets
            outputs = layer(inputs)
            # Quantize activations to int8 before sending to halve traffic.
            send_opt = dist.isend(tensor=q_act(outputs, char=True).cpu(),
                                  dst=1)
            # if batch_idx < 30:
            send_opt.wait()
            targets_queue.put(targets.numpy())
            outputs_queue.put(outputs)
            print("send....")
        # Zero-length tensor acts as the end-of-stream marker for rank 1.
        send_opt = dist.isend(tensor=torch.zeros(0), dst=1)
        send_opt.wait()
        back_process.join()
        e.set()
    elif dist.get_rank() == 1:
        batch_idx = 0
        train_loss = 0
        correct = 0
        total = 0
        criterion.cuda(1)
        residual = None
        while True:
            try:
                rec_val = torch.zeros([args.batch_size, 256, 4, 4],
                                      dtype=torch.int8)
                dist.recv(tensor=rec_val, src=0)
            except RuntimeError as error:
                # Size-mismatched recv = end-of-stream marker from rank 0.
                e.wait()
                break
            rec_val = dq_act(rec_val)
            rec_val = rec_val.cuda(1)
            rec_val.requires_grad_()
            outputs = layer(rec_val)
            targets = targets_queue.get(block=True, timeout=2)
            targets = torch.from_numpy(targets).cuda(1)
            loss = criterion(outputs, targets)
            loss.backward()
            # Earlier gradient-compression experiments kept for reference:
            #spare_grad, residual = sparse2(rec_val.grad, 0.01, True, residual)
            #grad_queue.put(rec_val.grad.cpu().half().numpy())
            #quantize_grad = quantize(rec_val.grad, num_bits=args.bit, half=True)
            #quantize_package = quantize(rec_val.grad, num_bits=args.bit, byte=True)
            quantize_grad = quantize(rec_val.grad)
            grad_queue.put(quantize_grad.cpu().numpy().astype(np.int8))
            if batch_idx == 0:
                # Unblock rank 0's backward worker now that a grad exists.
                start_event.set()
            # NOTE(review): metrics accumulate only on stepping iterations.
            if batch_idx % args.buffer_size == 0:
                optimizer.step()
                train_loss += loss.item()
                _, predicted = outputs.max(1)
                total += targets.size(0)
                correct += predicted.eq(targets).sum().item()
                progress_bar(batch_idx, data_size,
                             'Loss: %.3f | Acc: %.3f%% (%d/%d)'
                             % (train_loss / (batch_idx + 1),
                                100. * correct / total, correct, total))
                optimizer.zero_grad()
            else:
                progress_bar(batch_idx, data_size,
                             'Loss: %.3f | Acc: %.3f%% (%d/%d)'
                             % (train_loss / (batch_idx + 1),
                                100. * correct / total, correct, total))
                #if batch_idx % 10 == 0:
                logger.error("train:" + str(train_loss / (batch_idx + 1)))
            batch_idx += 1
        acc_str = "tacc: %.3f" % (100. * correct / total,)
        logger.error(acc_str)
# --- Script setup: destination DB session + Excel word-list loading. ---
# NOTE(review): `db_engine`, `settings`, `ThreadQueue`, `load_workbook`,
# `sys`, and `re` are presumably defined/imported earlier in the file.
DstSession = sessionmaker(bind=db_engine, autoflush=False)
dstssn = DstSession()
if True:  # settings.TWEETS: (original condition hard-disabled to True)
    # Optional first CLI argument selects "location" mode.
    try:
        command = sys.argv[1]
        print(command)
    except IndexError:
        command = ''
    if command == 'location':
        ISLOCATION = True
    else:
        ISLOCATION = False
    user_queue = ThreadQueue()
    # load excel file for input
    fname = 'word_list.xlsx'
    wb = load_workbook(fname)
    ws = wb.active
    # Data starts at row 2 (row 1 is presumably a header).
    ii = i = 2
    # Walk rows until the first empty cell in column 1 (permno).
    # NOTE(review): no `i += 1` is visible in this excerpt — the loop body
    # appears to continue beyond this chunk; confirm the increment exists,
    # otherwise this loop never advances.
    while True:
        if not ws.cell(row=i, column=1).value:
            break
        # Columns 4/5 hold date-like values; drop any " 00:00:00" time part.
        t1 = str(ws.cell(row=i, column=4).value).lower().strip(' ')
        t2 = str(ws.cell(row=i, column=5).value).lower().strip(' ')
        t1 = re.sub(' 00:00:00', '', t1)
        t2 = re.sub(' 00:00:00', '', t2)
        permno = str(ws.cell(row=i, column=1).value).lower().strip(' ')
def train(layer, logger, args, grad_queue, targets_queue, e, data_size,
          trainloader):
    """Two-stage pipeline training with half-precision gradient transfer.

    Rank 0 streams forward activations to rank 1 and lazily starts a
    backward process once the first gradient appears in ``grad_queue``.
    Rank 1 computes the loss and pushes its fp16 input gradient back
    through ``grad_queue``.

    NOTE(review): the [batch_size, 128, 16, 16] boundary shape is
    hard-coded on both sides — confirm it matches the model split.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(layer.parameters(), lr=0.01, momentum=0.9,
                          weight_decay=5e-4)
    optimizer.zero_grad()
    layer.train()

    def backward():
        # Rank-0 backward worker: upcast the fp16 gradient and backprop
        # through the cached stage-0 outputs; step every 2nd batch.
        batch_idx = 0
        while True:
            try:
                grad = grad_queue.get(block=True, timeout=1)
                #grad = dense(grad, [args.batch_size, 128, 16, 16]).cuda(0)
                grad = torch.from_numpy(grad).cuda(0).float()
            except Empty as empty:
                print("backward empty.....")
                break
            loss = outputs_queue.get(block=False)
            loss.backward(grad)
            if batch_idx % 2 == 0:
                optimizer.step()
                optimizer.zero_grad()
            batch_idx += 1

    if dist.get_rank() == 0:
        criterion.cuda(0)
        start_flag = True
        outputs_queue = ThreadQueue(20)
        for batch_idx, (inputs, targets) in enumerate(trainloader):
            print("batch: " + str(batch_idx))
            inputs, targets = inputs.cuda(0), targets
            outputs = layer(inputs)
            outputs_queue.put(outputs)
            print('put......')
            targets_queue.put(targets.numpy())
            print(outputs.cpu().size())
            send_opt = dist.isend(tensor=outputs.cpu(), dst=1)
            # Only synchronize every 10th send to keep the pipe full.
            if batch_idx % 10 == 0:
                send_opt.wait()
            #send_opt.wait()
            print("send....")
            # Start the backward process lazily, once rank 1 has produced
            # its first gradient.
            if start_flag and grad_queue.qsize() > 0:
                start_flag = False
                back_process = Process(target=backward)
                back_process.start()
        # Zero-length tensor = end-of-stream marker for rank 1.
        send_opt = dist.isend(tensor=torch.zeros(0), dst=1)
        send_opt.wait()
        back_process.join()
        e.set()
    elif dist.get_rank() == 1:
        batch_idx = 0
        train_loss = 0
        correct = 0
        total = 0
        criterion.cuda(1)
        residual = None
        while True:
            try:
                rec_val = torch.zeros([args.batch_size, 128, 16, 16])
                dist.recv(tensor=rec_val, src=0)
                print("recv.......")
            except RuntimeError as error:
                # Size-mismatched recv = end marker; wait for rank 0.
                e.wait()
                break
            rec_val = rec_val.cuda(1)
            rec_val.requires_grad_()
            outputs = layer(rec_val)
            targets = targets_queue.get(block=True, timeout=2)
            targets = torch.from_numpy(targets).cuda(1)
            loss = criterion(outputs, targets)
            loss.backward()
            #spare_grad, residual = sparse2(rec_val.grad, 0.01, True, residual)
            #grad_queue.put(spare_grad.cpu().numpy())
            # Ship the input gradient back as fp16 to halve traffic.
            grad_queue.put(rec_val.grad.cpu().half().numpy())
            # NOTE(review): metrics accumulate only on stepping iterations.
            if batch_idx % 2 == 0:
                optimizer.step()
                train_loss += loss.item()
                _, predicted = outputs.max(1)
                total += targets.size(0)
                correct += predicted.eq(targets).sum().item()
                progress_bar(
                    batch_idx, data_size,
                    'Loss: %.3f | Acc: %.3f%% (%d/%d)'
                    % (train_loss / (batch_idx + 1),
                       100. * correct / total, correct, total))
                optimizer.zero_grad()
            else:
                progress_bar(
                    batch_idx, data_size,
                    'Loss: %.3f | Acc: %.3f%% (%d/%d)'
                    % (train_loss / (batch_idx + 1),
                       100. * correct / total, correct, total))
            if batch_idx % 10 == 0:
                logger.error("train:" + str(train_loss / (batch_idx + 1)))
            batch_idx += 1
def train(layer, logger, shapes, args, e, data_size, trainloader):
    """Three-rank pipeline training with piecewise-quantized gradients.

    Activations flow 0 -> 1 -> 2 as int8 (``q_act``/``dq_act``); gradients
    flow 2 -> 1 -> 0 compressed by ``piecewise_quantize`` into a flat
    buffer of ``tensor_len(shape) + 2`` elements (the +2 presumably carries
    the dequantization scale/offset — confirm against piecewise_quantize).
    Ranks 0 and 1 run backward in separate processes; rank 2 computes the
    loss and top-1/top-5 accuracy inline.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(layer.parameters(), lr=0.01, momentum=0.9,
                          weight_decay=5e-4)
    optimizer.zero_grad()
    layer.train()
    batch_idx = 0

    def backward_rank1():
        # Backward worker for rank 1: decode grads from rank 2, backprop,
        # re-encode input grads and exchange with rank 0 via transfer2.
        residual = None
        batch_idx = 0
        ten_len = tensor_len(shapes[1])
        #grad_recv1 = torch.zeros(shapes[1], dtype=torch.int8)
        #grad_recv1 = torch.HalfTensor(torch.Size(shapes[1]))
        grad_recv1 = torch.zeros(ten_len + 2)  # flat quantized buffer
        dist.recv(tensor=grad_recv1, src=2)
        while True:
            print(" backward batch_idx:" + str(batch_idx))
            #grad_recv1 = unpack(grad_recv1.cuda(), shapes[1])
            #grad_recv1 = dequantize(grad_recv1.cuda().float())
            grad_recv1 = de_piecewise_quantize(grad_recv1.cuda(), shapes[1])
            try:
                inputs, outputs = outputs_queue.get(block=True, timeout=4)
            except Empty:
                print("empty........")
                break
            inputs.requires_grad_()
            outputs.backward(grad_recv1)
            # Residual feedback carries the quantization error into the
            # next step's encoding.
            #inputs_grad = quantize(inputs.grad, char=True).cpu()
            inputs_grad, residual = piecewise_quantize(
                inputs.grad, logger=logger, residual=residual)
            if batch_idx % 2 == 0:
                optimizer.step()
                optimizer.zero_grad()
            batch_idx += 1
            if data_size == batch_idx:
                transfer2(3, inputs_grad, None)
                print("backend In send..")
                break
            grad_recv1 = transfer2(3, inputs_grad, ten_len + 2)
            print("backward send.......")
        print("backard end....")

    def backward_rank0(semaphore):
        # Backward worker for rank 0 (first stage): consumes decoded grads;
        # no downstream stage to forward input grads to.
        batch_idx = 0
        ten_len = tensor_len(shapes[0])
        grad_recv = torch.zeros(ten_len + 2)
        #grad_recv = torch.zeros(shapes[0], dtype=torch.int8)
        dist.recv(tensor=grad_recv, src=1)
        while True:
            #semaphore.release()
            grad_recv = de_piecewise_quantize(grad_recv.cuda(), shapes[0])
            print(" backwardbatch_idx:" + str(batch_idx))
            grad_recv = grad_recv.cuda()
            try:
                # "loss" is the cached stage-0 output tensor.
                loss = outputs_queue.get(block=True, timeout=4)
            except Empty:
                print("empty........")
                break
            loss.backward(grad_recv)
            if batch_idx % 2 == 0:
                optimizer.step()
                optimizer.zero_grad()
            batch_idx += 1
            if data_size == batch_idx:
                print("eq...")
                break
            grad_recv = transfer2(4, None, ten_len + 2)
            print("backward send.....")
        print("backward end..")

    if dist.get_rank() == 0:
        outputs_queue = ThreadQueue(args.buffer_size)
        semaphore = Semaphore(args.buffer_size)
        back_process = Process(target=backward_rank0, args=(semaphore, ))
        back_process.start()
        for batch_idx, (inputs, targets) in enumerate(trainloader):
            #semaphore.acquire()
            print("batch: " + str(batch_idx))
            inputs = inputs.cuda()
            outputs = layer(inputs)
            # Cache the full-precision output for backward, then quantize
            # the copy that goes over the wire.
            outputs_queue.put(outputs)
            outputs = q_act(outputs, char=True)
            transfer(dist.get_rank(), outputs.cpu(), None)
            print("send........")
        print("start to end....")
        back_process.join()
        e.set()
        print("end....")
    elif dist.get_rank() == 1:
        outputs_queue = ThreadQueue(args.buffer_size)
        back_process = Process(target=backward_rank1, args=())
        rec_val = torch.zeros(shapes[0], dtype=torch.int8)
        dist.recv(tensor=rec_val, src=0)
        #fix bug..
        back_process.start()
        for index, (_, targets) in enumerate(trainloader):
            print("batch_idx:" + str(index))
            rec_val = dq_act(rec_val)
            rec_val = rec_val.cuda()
            rec_val.requires_grad_()
            outputs = layer(rec_val)
            outputs_queue.put([rec_val, outputs])
            outputs = q_act(outputs, char=True)
            if index == data_size - 1:
                transfer(dist.get_rank(), outputs.cpu(), None)
                print("the last send........")
                continue
            rec_val = transfer(dist.get_rank(), outputs.cpu(), shapes[0])
            print("send.................")
        print("start to end....")
        back_process.join()
        e.wait()
        print("end......")
    elif dist.get_rank() == 2:
        # Final stage: computes loss + top-1/top-5 accuracy and starts the
        # backward chain with piecewise-quantized input gradients.
        rec_val = None
        residual = None
        train_loss = 0
        correct = 0
        total = 0
        correct_5 = 0
        correct_1 = 0
        criterion.cuda()
        if not torch.is_tensor(rec_val):
            rec_val = torch.zeros(shapes[1], dtype=torch.int8)
        dist.recv(tensor=rec_val, src=1)
        for batch_idx, (_, targets) in enumerate(trainloader):
            rec_val = dq_act(rec_val)
            rec_val = rec_val.cuda()
            rec_val.requires_grad_()
            outputs = layer(rec_val)
            # start to backward....
            targets = targets.cuda()
            loss = criterion(outputs, targets)
            loss.backward()
            #quantize_grad = quantize(rec_val.grad, char=True).cpu()
            # for_view = rec_val.grad.view(-1).tolist()
            # logger.error("grad: " + str(for_view))
            #quantize_grad, residual = compress(rec_val.grad, residual=residual)
            quantize_grad, residual = piecewise_quantize(
                rec_val.grad, logger=logger, residual=residual)
            # NOTE(review): metrics accumulate only on stepping iterations.
            if batch_idx % 2 == 0:
                optimizer.step()
                train_loss += loss.item()
                #_, predicted = outputs.max(1)
                # Top-5 bookkeeping: compare each target against the top-5
                # predictions, then slice for top-5 / top-1 counts.
                _, predicted = outputs.topk(5, 1, True, True)
                total += targets.size(0)
                targets = targets.view(targets.size(0), -1).expand_as(predicted)
                correct = predicted.eq(targets).float()
                correct_5 += correct[:, :5].sum()
                correct_1 += correct[:, :1].sum()
                progress_bar(
                    batch_idx, data_size,
                    'Loss: %.3f | Acc: %.3f%% (%d/%d)'
                    % (train_loss / (batch_idx + 1),
                       100. * correct_5 / total, correct_5, total))
                optimizer.zero_grad()
            else:
                progress_bar(
                    batch_idx, data_size,
                    'Loss: %.3f | Acc: %.3f%% (%d/%d)'
                    % (train_loss / (batch_idx + 1),
                       100. * correct_5 / total, correct_5, total))
                logger.error("train:" + str(train_loss / (batch_idx + 1)))
                acc_str = "tacc1: %.3f" % (100. * correct_1 / total, )
                logger.error(acc_str)
                acc_str5 = "tacc5: %.3f" % (100. * correct_5 / total, )
                logger.error(acc_str5)
            if batch_idx == data_size - 1:
                transfer(dist.get_rank(), quantize_grad, None)
                continue
            rec_val = transfer(dist.get_rank(), quantize_grad, shapes[1])
        #print("\n start to end....")
        e.wait()
        print("end....")
def train(layer, logger, args, grad_queue, grad_queue2, targets_queue, e,
          data_size, trainloader, start_event, start_event2):
    """Three-rank pipeline training with int8 activations and gradients.

    Activations flow 0 -> 1 -> 2 quantized via ``q_act``; gradients flow
    2 -> 1 -> 0 quantized via ``quantize``.  Ranks 0 and 1 run backward in
    separate processes, each gated by an Event set when the first gradient
    of the epoch is produced upstream (``start_event`` by rank 2,
    ``start_event2`` by rank 1's backward worker).

    NOTE(review): ``grad_queue``/``grad_queue2`` are accepted but unused
    here — gradients travel over dist.send/recv instead; confirm the
    parameters are kept only for call-site compatibility.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(layer.parameters(), lr=0.01, momentum=0.9,
                          weight_decay=5e-4)
    optimizer.zero_grad()
    layer.train()

    def backward_rank0(semaphore, start_event2):
        # Rank-0 backward worker: wait until rank 1 has sent its first
        # gradient, then loop: recv int8 grad, dequantize, backprop.
        start_event2.wait()
        batch_idx = 0
        while True:
            try:
                # Free one forward slot per consumed gradient.
                semaphore.release()
                print("before grad recv")
                grad_recv = torch.zeros([args.batch_size, 256, 4, 4],
                                        dtype=torch.int8)
                dist.recv(tensor=grad_recv, src=1)
                print("after grad recv...")
            except RuntimeError as error:
                # Size-mismatched recv (end marker) terminates the loop.
                print("backward runtime error")
                break
            grad_recv = dequantize(grad_recv.cuda(0).float())
            loss = outputs_queue.get(block=False)
            loss.backward(grad_recv)
            if batch_idx % args.buffer_size == 0:
                optimizer.step()
                optimizer.zero_grad()
            batch_idx += 1

    def backward_rank1(semaphore, start_event, start_event2):
        # Rank-1 backward worker: recv grads from rank 2, backprop through
        # the cached (input, output) pair, send quantized input grads to
        # rank 0.  Sets start_event2 after the first send.
        start_event.wait()
        batch_idx = 0
        while True:
            try:
                #semaphore.release()
                print("before grad recv...")
                grad_recv1 = torch.zeros([args.batch_size, 512, 2, 2],
                                         dtype=torch.int8)
                dist.recv(tensor=grad_recv1, src=2)
                print("after grad recv.....")
            except RuntimeError as error:
                # Propagate the end-of-stream marker down to rank 0.
                print("backward runtime error")
                send_opt = dist.isend(tensor=torch.zeros(0), dst=0)
                send_opt.wait()
                break
            grad_recv1 = dequantize(grad_recv1.cuda(0).float())
            inputs, outputs = outputs_queue.get(block=False)
            inputs.requires_grad_()
            outputs.backward(grad_recv1)
            if batch_idx % args.buffer_size == 0:
                optimizer.step()
                optimizer.zero_grad()
            inputs_grad = quantize(inputs.grad, char=True).cpu()
            print(inputs_grad.size())
            if batch_idx == 0:
                start_event2.set()
            #send_opt = dist.isend(tensor=inputs_grad, dst=0)
            #send_opt.wait()
            dist.send(tensor=inputs_grad, dst=0)
            batch_idx += 1

    if dist.get_rank() == 0:
        criterion.cuda(0)
        outputs_queue = ThreadQueue(args.buffer_size)
        # Semaphore bounds how far forward can run ahead of backward.
        semaphore = Semaphore(args.buffer_size)
        back_process = Process(target=backward_rank0,
                               args=(semaphore, start_event2))
        back_process.start()
        for batch_idx, (inputs, targets) in enumerate(trainloader):
            semaphore.acquire()
            print("batch: " + str(batch_idx))
            inputs, targets = inputs.cuda(0), targets
            outputs = layer(inputs)
            targets_queue.put(targets.numpy())
            outputs_queue.put(outputs)
            send_opt = dist.isend(tensor=q_act(outputs, char=True).cpu(),
                                  dst=1)
            send_opt.wait()
            print("send....")
        print("start to end..")
        # Zero-length tensor = end-of-stream marker for rank 1.
        send_opt = dist.isend(tensor=torch.zeros(0), dst=1)
        send_opt.wait()
        back_process.join()
        e.set()
    elif dist.get_rank() == 1:
        batch_idx = 0
        criterion.cuda(0)
        outputs_queue = ThreadQueue(10)
        semaphore = Semaphore(args.buffer_size - 1)
        back_process = Process(target=backward_rank1,
                               args=(semaphore, start_event, start_event2))
        back_process.start()
        while True:
            try:
                print("before semaphore......")
                #semaphore.acquire()
                rec_val = torch.zeros([args.batch_size, 256, 4, 4],
                                      dtype=torch.int8)
                dist.recv(tensor=rec_val, src=0)
                print("after recv.....")
            except RuntimeError as error:
                # End marker from rank 0: forward it to rank 2 and drain.
                print("runtime errror")
                send_opt = dist.isend(tensor=torch.zeros(0), dst=2)
                send_opt.wait()
                back_process.join()
                e.wait()
                break
            print("before dq...")
            rec_val = dq_act(rec_val)
            rec_val = rec_val.cuda(0)
            rec_val.requires_grad_()
            print("before output......")
            outputs = layer(rec_val)
            # if batch_idx % args.buffer_size == 0:
            #     optimizer.step()
            #     optimizer.zero_grad()
            print("before queue")
            outputs_queue.put([rec_val, outputs])
            print("after queue")
            #send_opt = dist.isend(tensor=q_act(outputs, char=True).cpu(), dst=2)
            #send_opt.wait()
            dist.send(tensor=q_act(outputs, char=True).cpu(), dst=2)
            batch_idx += 1
        print("send end...")
    elif dist.get_rank() == 2:
        # Final stage: loss, accuracy, and the start of the backward chain.
        batch_idx = 0
        train_loss = 0
        correct = 0
        total = 0
        criterion.cuda(0)
        while True:
            try:
                #print("before recv....")
                rec_val = torch.zeros([args.batch_size, 512, 2, 2],
                                      dtype=torch.int8)
                dist.recv(tensor=rec_val, src=1)
                #print("after recv.....")
            except RuntimeError as error:
                #traceback.format_exc(error)
                send_opt = dist.isend(tensor=torch.zeros(0), dst=1)
                send_opt.wait()
                e.wait()
                break
            rec_val = dq_act(rec_val)
            rec_val = rec_val.cuda(0)
            rec_val.requires_grad_()
            outputs = layer(rec_val)
            targets = targets_queue.get(block=True, timeout=2)
            targets = torch.from_numpy(targets).cuda(0)
            loss = criterion(outputs, targets)
            loss.backward()
            # NOTE(review): metrics accumulate only on stepping iterations.
            if batch_idx % args.buffer_size == 0:
                optimizer.step()
                train_loss += loss.item()
                _, predicted = outputs.max(1)
                total += targets.size(0)
                correct += predicted.eq(targets).sum().item()
                progress_bar(batch_idx, data_size,
                             'Loss: %.3f | Acc: %.3f%% (%d/%d)'
                             % (train_loss / (batch_idx + 1),
                                100. * correct / total, correct, total))
                optimizer.zero_grad()
            else:
                progress_bar(batch_idx, data_size,
                             'Loss: %.3f | Acc: %.3f%% (%d/%d)'
                             % (train_loss / (batch_idx + 1),
                                100. * correct / total, correct, total))
                #if batch_idx % 10 == 0:
                logger.error("train:" + str(train_loss / (batch_idx + 1)))
                acc_str = "tacc: %.3f" % (100. * correct / total,)
                logger.error(acc_str)
            if batch_idx == 0:
                # Unblock rank 1's backward worker for this epoch.
                start_event.set()
            quantize_grad = quantize(rec_val.grad, char=True)
            #send_opt = dist.isend(tensor=quantize_grad.cpu(), dst=1)
            #send_opt.wait()
            dist.send(tensor=quantize_grad.cpu(), dst=1)
            batch_idx += 1