def _lazy_import_torch(self):
    try:
        import torch
    except ImportError:
        raise ImportError('Need to install Pytorch: go to pytorch.org')
    import torchvision
    import torchvision.transforms as transforms
    import torch.nn as nn

    self.use_cuda = not self.opt.get('no_cuda', False) and torch.cuda.is_available()

    if self.use_cuda:
        logging.debug('Using CUDA')
        torch.cuda.set_device(self.opt.get('gpu', -1))

    self.torch = torch
    self.torchvision = torchvision
    self.transforms = transforms
    self.nn = nn
def __init__(self, opt: Opt):
    try:
        # tensorboard is a very expensive thing to import. Wait until the
        # last second to import it.
        from tensorboardX import SummaryWriter
    except ImportError:
        raise ImportError('Please run `pip install tensorboard tensorboardX`.')

    if opt['tensorboard_logdir'] is not None:
        tbpath = opt['tensorboard_logdir']
    else:
        tbpath = opt['model_file'] + '.tensorboard'

    logging.debug(f'Saving tensorboard logs to: {tbpath}')
    if not PathManager.exists(tbpath):
        PathManager.mkdirs(tbpath)
    self.writer = SummaryWriter(tbpath, comment=json.dumps(opt))
def checksum(self, dpath):
    """
    Checksum on a given file.

    :param dpath: path to the downloaded file.
    """
    sha256_hash = hashlib.sha256()
    with PathManager.open(os.path.join(dpath, self.file_name), "rb") as f:
        for byte_block in iter(lambda: f.read(65536), b""):
            sha256_hash.update(byte_block)
        if sha256_hash.hexdigest() != self.hashcode:
            # remove_dir(dpath)
            raise AssertionError(
                f"Checksum for {self.file_name} from \n{self.url}\n"
                "does not match the expected checksum. Please try again."
            )
        else:
            logging.debug("Checksum Successful")
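# Hedged standalone sketch of the same chunked SHA-256 pattern used in checksum()
# above, assuming a plain local file opened with the builtin open() rather than
# PathManager; 'some_download.tar.gz' is a hypothetical path for illustration only.
def _sha256_of(path: str, chunk_size: int = 65536) -> str:
    import hashlib

    digest = hashlib.sha256()
    with open(path, 'rb') as f:
        # read fixed-size blocks so large files are never fully loaded into memory
        for block in iter(lambda: f.read(chunk_size), b""):
            digest.update(block)
    return digest.hexdigest()

# Example (hypothetical file): _sha256_of('some_download.tar.gz')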
def finalize(
    self, frequencies: Dict[str, int], num_symbols: int = 30000, minfreq: int = 2
) -> bool:
    """
    Build the codecs.

    :param frequencies:
        dictionary of (token: frequency) pairs
    :param num_symbols:
        Number of BPE symbols. Recommend 30000-40000. If <= 0, default 30000 will
        be used.
    :param minfreq:
        Minimum frequency of a token before forced BPE decomposition. If <= 0 will
        use subword-nmt default of 2.

    :return did_finalize:
        return whether codecs are finalized this call.
    """
    if hasattr(self, 'bpe'):
        # we already finalized the codecs
        return False

    logging.debug(f'Saving bpe codecs to {self.codecs}')

    dictionary = ("{} {}".format(k, v) for k, v in frequencies.items())

    if num_symbols <= 0:
        num_symbols = 30000
    if minfreq <= 0:
        minfreq = 2

    codec_dir, _ = os.path.split(self.codecs)
    PathManager.mkdirs(codec_dir)
    with PathManager.open(self.codecs, 'w', encoding='utf-8') as outstream:
        learn_bpe.learn_bpe(
            dictionary,
            outstream,
            num_symbols=num_symbols,
            min_frequency=minfreq,
            is_dict=True,
        )

    self._load_from_codecs()
    return True
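# Hedged usage sketch for finalize() above: the frequencies argument is a plain
# token -> count mapping, e.g. built with collections.Counter. `bpe` below is an
# assumed instance of the helper class that owns finalize(); the call itself is
# left commented out since the codecs path must already be configured.
from collections import Counter

token_freqs = Counter("the cat sat on the mat".split())
# did_finalize = bpe.finalize(dict(token_freqs), num_symbols=30000, minfreq=2)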
def batch_act_sdm(
    self,
    observations: List[Dict[str, Message]],
    knowledge_agent_observations: List[Message],
) -> Tuple[List[Message], List[int], List[Message]]:
    """
    Search Decision batch act.

    :param observations:
        observations for batch act.
    :param knowledge_agent_observations:
        observations to modify with the decision from the search decision agent.

    :return (batch_reply, search_indices, observations):
        batch_reply: reply from the search decision agent
        search_indices: batch indices with which to use search.
        observations: modified knowledge agent observations
    """
    search_indices = []
    batch_reply_sdm = [{} for _ in range(len(knowledge_agent_observations))]
    if self.search_decision is SearchDecision.ALWAYS:
        [o.force_set('skip_retrieval', False) for o in knowledge_agent_observations]
        search_indices = list(range(len(knowledge_agent_observations)))
    elif self.search_decision is SearchDecision.NEVER:
        [o.force_set('skip_retrieval', True) for o in knowledge_agent_observations]
    else:
        assert self.search_decision is SearchDecision.COMPUTE
        assert self.search_decision_agent
        batch_reply_sdm = self.search_decision_agent.batch_act(
            [o['search_decision_agent'] for o in observations]
        )
        for i, reply in enumerate(batch_reply_sdm):
            logging.debug(f"Example {i}: {reply['text']}")
            if reply['text'] == self.opt['search_decision_do_search_reply']:
                search_indices.append(i)
                knowledge_agent_observations[i].force_set('skip_retrieval', False)
            elif reply['text'] == self.opt['search_decision_dont_search_reply']:
                knowledge_agent_observations[i].force_set('skip_retrieval', True)
            else:
                logging.error(
                    f"SDM Reply: {reply['text']}; defaulting to no search"
                )
                knowledge_agent_observations[i].force_set('skip_retrieval', True)
    return batch_reply_sdm, search_indices, knowledge_agent_observations
def _untar(path, fname, delete=True, flatten=False):
    """
    Unpack the given archive file to the same directory.

    :param str path:
        The folder containing the archive. Will contain the contents.
    :param str fname:
        The filename of the archive file.
    :param bool delete:
        If true, the archive will be deleted after extraction.
    """
    import tarfile

    logging.debug(f'unpacking {fname}')
    fullpath = os.path.join(path, fname)
    # very painfully manually extract files so that we can use PathManager.open
    # instead, lest we are using fb internal file services
    with tarfile.open(fileobj=PathManager.open(fullpath, 'rb')) as tf:
        for item in tf:
            item_name = item.name
            while item_name.startswith("./"):
                # internal file systems will actually create a literal "."
                # directory, so we gotta watch out for that
                item_name = item_name[2:]
            if flatten:
                # flatten the tar file if there are subdirectories
                fn = os.path.join(path, os.path.split(item_name)[-1])
            else:
                fn = os.path.join(path, item_name)
            logging.debug(f"Extracting to {fn}")
            if item.isdir():
                PathManager.mkdirs(fn)
            elif item.isfile():
                with PathManager.open(fn, 'wb') as wf, tf.extractfile(item.name) as rf:
                    shutil.copyfileobj(rf, wf)
            else:
                raise NotImplementedError("No support for symlinks etc. right now.")
    if delete:
        PathManager.rm(fullpath)
def untar(path, fname, deleteTar=True):
    """
    Unpack the given archive file to the same directory.

    :param str path:
        The folder containing the archive. Will contain the contents.
    :param str fname:
        The filename of the archive file.
    :param bool deleteTar:
        If true, the archive will be deleted after extraction.
    """
    logging.debug(f'unpacking {fname}')
    fullpath = os.path.join(path, fname)
    shutil.unpack_archive(fullpath, path)
    if deleteTar:
        os.remove(fullpath)
def batch_act_sqm(
    self, observations: List[Dict[str, Message]], search_indices: List[int]
) -> List[Message]:
    """
    Search Query Generator batch act.

    :param observations:
        list of observations
    :param search_indices:
        list of batch indices for which search is required.

    :return batch_reply:
        return the batch reply from the search query agent
    """
    batch_reply_sqm = [{} for _ in range(len(observations))]
    if self.search_query_agent and search_indices:
        batch_replies_with_search = self.search_query_agent.batch_act(
            [
                o
                for i, o in enumerate(
                    [o['search_query_agent'] for o in observations]
                )
                if i in search_indices
            ]
        )
        for i, reply in zip(search_indices, batch_replies_with_search):
            batch_reply_sqm[i] = reply
        search_queries = [o.get('text', '') for o in batch_reply_sqm]
        if self.inject_query_string:
            for i in range(len(search_queries)):
                if search_queries[i]:
                    search_queries[i] = ' '.join(
                        [search_queries[i], self.inject_query_string]
                    )
        logging.debug(f"Search Queries: {search_queries}")
        self.knowledge_agent.model_api.set_search_queries(search_queries)
    else:
        try:
            self.knowledge_agent.model_api.set_search_queries([])
        except AttributeError:
            # Gold Documents, most likely
            pass

    return batch_reply_sqm
def retrieve_and_score(
    self, query: torch.LongTensor
) -> Tuple[List[List[Document]], torch.Tensor]:
    """
    Retrieve and score.

    Encode the query and score all memories against it.

    :param query:
        query tokens

    :return (docs, scores):
        docs: list of (text, title) tuples for each batch example
        scores: doc scores
    """
    query_enc = self.query_encoder(query)
    scores = self.score_memories(query_enc)
    top_docs, top_doc_scores = [], []
    for i in range(query.size(0)):
        scores_i = scores[i]
        memories_i, scores_i = argsort_scores_and_docs(
            scores_i, self.memory_vec_dict[i], self.n_docs  # type: ignore
        )
        mem_docs = []
        for mem in memories_i:
            mem_doc = Document('', self._tokenizer.decode(mem), '')  # type: ignore
            mem_doc.TITLE_DELIM = self.opt['memory_doc_title_delimiter']
            mem_docs.append(mem_doc)

        if len(mem_docs) < self.n_docs:
            # add dummy docs
            num_blank = self.n_docs - len(mem_docs)
            mem_docs += [BLANK_DOC] * num_blank
            scores_i = torch.cat([scores_i, torch.zeros(num_blank).to(scores_i)])
        top_docs.append(mem_docs)
        top_doc_scores.append(scores_i)
        logging.debug(scores_i)

    return top_docs, torch.stack(top_doc_scores)
def unzip(path, fname, deleteZip=True):
    """
    Unzip the given archive file to the same directory.

    :param str path:
        The folder containing the archive. Will contain the contents.
    :param str fname:
        The filename of the archive file.
    :param bool deleteZip:
        If true, the archive will be deleted after extraction.
    """
    logging.debug(f'unzipping {fname}')
    fullpath = os.path.join(path, fname)
    with zipfile.ZipFile(fullpath, "r") as zip_ref:
        zip_ref.extractall(path)
    if deleteZip:
        os.remove(fullpath)
def dump_data(opt):
    # create repeat label agent and assign it to the specified task
    agent = RepeatLabelAgent(opt)
    world = create_task(opt, agent)
    opt.log()
    if opt['outfile'] is None:
        outfile = tempfile.mkstemp(
            prefix='{}_{}_'.format(opt['task'], opt['datatype']), suffix='.txt'
        )[1]
    else:
        outfile = opt['outfile']

    if opt['num_examples'] == -1:
        num_examples = world.num_examples()
    else:
        num_examples = opt['num_examples']
    log_timer = TimeLogger()

    logging.debug('starting to convert...')
    logging.info(f'saving output to {outfile}')
    fw = open(outfile, 'w')
    text = ''
    for _ in range(num_examples):
        world.parley()
        world.acts[0]['labels'] = world.acts[0].get(
            'labels', world.acts[0].pop('eval_labels', None)
        )

        samp = world.acts[0]
        text += samp["text"].replace("\n", " ") + " "
        fw.write("__label__%s %s\n" % (samp["labels"][0].replace(' ', '_'), text))
        if world.acts[0].get('episode_done', False):
            text = ''

        if log_timer.time() > opt['log_every_n_secs']:
            # use a separate variable so we don't clobber the accumulated episode text
            log_text, _log = log_timer.log(world.total_parleys, world.num_examples())
            logging.info(log_text)

        if world.epoch_done():
            logging.info('epoch done')
            break
    fw.close()
def _place_modulelist(self, submodule: torch.nn.Module) -> None:
    if not isinstance(submodule, torch.nn.ModuleList):
        # not a ModuleList, leave it untouched
        return
    if getattr(submodule, 'model_parallel_exempt', False):
        return

    assert isinstance(submodule, torch.nn.ModuleList)  # for typechecker
    layers = submodule

    # mark this section as MP
    layers.is_model_parallel = True  # type: ignore

    # next, let's figure out how many parameters we can assign to each GPU,
    # but not make actual assignments yet. Assignments come later because we
    # want consecutive layers to be collocated
    keyfunc = self.__device_allocations.__getitem__
    layer_assignments = {k: 0 for k in self.devices}
    for layer_no, layer in enumerate(layers):
        if layer_no == 0:
            # hard code the first layer to be 0.
            mostfree = 'cuda:0'
        else:
            # otherwise dynamic allocation
            mostfree = min(self.devices, key=keyfunc)
        # 32 is a totally arbitrary, made up number that worked in practice
        # on the large models I tested on. I believe it should be roughly
        # batch size, but this was set empirically.
        self.__device_allocations[mostfree] += trainable_parameters(layer) * 32
        # mark a layer as going to the given element
        layer_assignments[mostfree] += 1

    devices = [d for i, d in enumerate(self.devices[:]) if layer_assignments[d] > 0]
    for layer_no, layer in enumerate(layers):
        layer_gpu = devices[0]
        assert layer_assignments[layer_gpu] > 0
        logging.debug(f"Model Parallel: Assigning {layer_no} to {layer_gpu}")
        layer._mp_gpu = layer_gpu
        layers[layer_no] = layer.to(layer_gpu)
        layer_assignments[layer_gpu] -= 1
        if layer_assignments[layer_gpu] == 0:
            devices.pop(0)
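# Minimal sketch of the first pass of the greedy heuristic in _place_modulelist(),
# assuming made-up per-layer parameter counts and two hypothetical devices; it only
# illustrates how repeatedly picking the least-loaded device decides how many layers
# each GPU should receive (the second pass then assigns consecutive layers).
layer_params = [100, 100, 100, 50, 50]    # hypothetical trainable params per layer
allocations = {'cuda:0': 0, 'cuda:1': 0}  # running "cost" per device
layer_counts = {d: 0 for d in allocations}
for layer_no, params in enumerate(layer_params):
    # first layer is pinned to cuda:0; the rest go to the currently least-loaded device
    device = 'cuda:0' if layer_no == 0 else min(allocations, key=allocations.get)
    allocations[device] += params * 32    # same arbitrary empirical factor as above
    layer_counts[device] += 1
# layer_counts now says how many layers each device should host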
def setup_data(self, fold):
    domains = self.opt.get('domains', DOMAINS)
    chunks = self._load_data(fold, domains)
    domains_cnt = Counter()
    for _, row in chunks.iterrows():
        domains_cnt[row['domain']] += 1
        first = True
        utterances = row['utterances'][:]
        if (
            len(utterances) >= 3
            and utterances[0]['speaker'] == 'USER'
            and utterances[1]['speaker'] == 'ASSISTANT'
            and utterances[2]['speaker'] == 'ASSISTANT'
            and "help you?" in utterances[1]['text']
        ):
            # skip this one
            utterances.pop(1)
        if self.opt['include_ontology']:
            yield {'text': f"{ONTO_TOKEN} {row['ontology']}", 'label': ''}, True
            first = False
        while utterances:
            utt = utterances.pop(0)
            segtxt, slots = self._segments2text(utt.get('segments', []))
            if utt['speaker'] == 'USER':
                yield {
                    'text': utt['text'],
                    'label': f'{CALL_TOKEN} {segtxt}',
                    'domain': row['domain'],
                    'slots': slots,
                    'type': 'apicall',
                }, first
                first = False
            elif utt['speaker'] == 'ASSISTANT':
                yield {
                    'text': f'{RESP_TOKEN} {segtxt}',
                    'label': utt['text'],
                    'domain': row['domain'],
                    'slots': slots,
                    'type': 'apiresp',
                }, first
                first = False
    logging.debug(f"Fold {fold} domains: {domains_cnt}")
def _batch_generate(self, texts: List[str]) -> List[str]:
    """
    Batch generate items from an input list of texts.

    :param texts:
        list of texts

    :return generations:
        return agent generations for each input.
    """
    start = time.time()
    active_agents = self.agents[: len(texts)]
    for agent_i, t_i in zip(active_agents, texts):
        agent_i.observe(Message({'text': t_i, 'episode_done': True}))
    agent_replies = self.agents[0].batch_act([a.observation for a in active_agents])
    logging.debug(f'Generated: {time.time() - start:.2f}')
    for agent_i, reply_i in zip(active_agents, agent_replies):
        agent_i.self_observe(reply_i)
    self.generations = [r.get('text', 'dummy') for r in agent_replies]
    return self.generations
def dump_data(opt):
    # create repeat label agent and assign it to the specified task
    agent = RepeatLabelAgent(opt)
    world = create_task(opt, agent)
    opt.log()
    ignorefields = opt.get('ignore_fields', '')
    if opt['outfile'] is None:
        outfile = tempfile.mkstemp(
            prefix='{}_{}_'.format(opt['task'], opt['datatype']), suffix='.txt'
        )[1]
    else:
        outfile = opt['outfile']

    if opt['num_examples'] == -1:
        num_examples = world.num_examples()
    else:
        num_examples = opt['num_examples']
    log_timer = TimeLogger()

    logging.debug('starting to convert...')
    logging.info(f'saving output to {outfile}')
    fw = open(outfile, 'w')
    for _ in range(num_examples):
        world.parley()
        acts = world.get_acts()
        value = acts[0].get('labels', acts[0].pop('eval_labels', None))
        acts[0].force_set('labels', value)
        txt = msg_to_str(acts[0], ignore_fields=ignorefields)
        fw.write(txt + '\n')
        if acts[0].get('episode_done', False):
            fw.write('\n')

        if log_timer.time() > opt['log_every_n_secs']:
            text, _log = log_timer.log(world.total_parleys, world.num_examples())
            logging.info(text)

        if world.epoch_done():
            logging.info('epoch done')
            break
    fw.close()
def _load_metadata(self, datapath):
    """
    Load metadata.

    Metadata should be saved at <identifier>.metadata
    Metadata should be of the following format:
    {
        'date': <date collected>,
        'opt': <opt used to collect the data>,
        'speakers': <identity of speakers>,
        ...
        Other arguments.
    }
    """
    try:
        metadata = Metadata(datapath)
        return metadata
    except RuntimeError:
        logging.debug('Metadata does not exist. Please double check your datapath.')
        return None
def _unzip(path, fname, delete=True):
    """
    Unpack the given zip file to the same directory.

    :param str path:
        The folder containing the archive. Will contain the contents.
    :param str fname:
        The filename of the archive file.
    :param bool delete:
        If true, the archive will be deleted after extraction.
    """
    import zipfile

    logging.debug(f'unpacking {fname}')
    fullpath = os.path.join(path, fname)
    with zipfile.ZipFile(PathManager.open(fullpath, 'rb'), 'r') as zf:
        for member in zf.namelist():
            outpath = os.path.join(path, member)
            if zf.getinfo(member).is_dir():
                logging.debug(f"Making directory {outpath}")
                PathManager.mkdirs(outpath)
                continue
            logging.debug(f"Extracting to {outpath}")
            with zf.open(member, 'r') as inf, PathManager.open(outpath, 'wb') as outf:
                shutil.copyfileobj(inf, outf)
    if delete:
        try:
            PathManager.rm(fullpath)
        except PermissionError:
            logging.error(
                f"Tried to delete {fullpath} but got a permission error. This "
                "is known to happen in Windows and is probably not fatal."
            )
def _unzip(path, fname, delete=True):
    """
    Unpack the given zip file to the same directory.

    :param str path:
        The folder containing the archive. Will contain the contents.
    :param str fname:
        The filename of the archive file.
    :param bool delete:
        If true, the archive will be deleted after extraction.
    """
    import zipfile

    logging.debug(f'unpacking {fname}')
    fullpath = os.path.join(path, fname)
    with zipfile.ZipFile(PathManager.open(fullpath, 'rb'), 'r') as zf:
        for member in zf.namelist():
            outpath = os.path.join(path, member)
            if zf.getinfo(member).is_dir():
                logging.debug(f"Making directory {outpath}")
                PathManager.mkdirs(outpath)
                continue
            logging.debug(f"Extracting to {outpath}")
            with zf.open(member, 'r') as inf, PathManager.open(outpath, 'wb') as outf:
                shutil.copyfileobj(inf, outf)
    if delete:
        PathManager.rm(fullpath)
def write_memory(self, mem_dict: Dict[int, torch.LongTensor]):
    """
    Write vectors to memory.

    Assume that we clear the memory as well.

    :param mem_dict:
        mapping from memory slot to 2D-tokenized memories
    """
    self.active_memory_slots = list(mem_dict.keys())
    with torch.no_grad():
        slot_num_mems = [m.size(0) for m in mem_dict.values()]
        logging.debug(f'Writing {slot_num_mems} memories')
        mem_vecs = torch.cat(list(mem_dict.values()), dim=0)
        mem_encs = self.memory_encoder(mem_vecs)
        offset = 0
        for mem_slot, num_mems in zip(mem_dict.keys(), slot_num_mems):
            self.memory_vec_dict[mem_slot] = mem_vecs[  # type: ignore
                offset : offset + num_mems
            ]
            self.memory_enc_dict[mem_slot] = mem_encs[offset : offset + num_mems]
            offset += num_mems
def generate_memories(
    self, input: torch.LongTensor, num_inputs: torch.LongTensor
) -> List[List[str]]:
    """
    Generate memories from input.

    Each input is split into the lines of conversational context. These are
    considered independently.

    We then assign a prefix ("your/partner's persona:") dependent on whether the
    bot or its partner said the line.

    :param input:
        input to the memory decoder
    :param num_inputs:
        number of lines per batch item
    """
    assert self.agent_dict is not None
    memories = []
    offset = 0
    for idx, i in enumerate(input):
        if num_inputs[idx] == 0:
            continue
        context_lines_vec = i[offset : offset + num_inputs[idx]]
        offset += num_inputs[idx]
        context_lines = [
            self.agent_dict.vec2txt(self.clean_input(j)) for j in context_lines_vec
        ]
        raw_memories_i = list(reversed(self._batch_generate(context_lines)))
        logging.debug(f'raw memories: {raw_memories_i}')
        memories_i = self._extract_from_raw_memories(raw_memories_i)
        logging.debug(f'memories to write: {memories_i}')
        mem_string = '\n'.join(memories_i)
        logging.verbose(f'Writing memories: {mem_string}')
        memories.append(memories_i)

    self.memories_full_list = memories
    return memories
def search(
    self, query_vectors: np.array, top_docs: int
) -> List[Tuple[List[int], List[np.array]]]:
    """
    Search FAISS index.

    :param query_vectors:
        query vectors into the index
    :param top_docs:
        number of docs to return

    :return top_docs:
        returns, for each query vector:
            a list of document ids (according to db),
            a list of reconstructed document vectors
    """
    query_vectors = self.get_search_vectors(query_vectors)
    logging.debug(f'query_vectors {query_vectors.shape}')
    _scores, indexes, vectors = self.index.search_and_reconstruct(
        query_vectors, top_docs
    )
    db_ids = [
        [self.index_id_to_db_id[i] for i in query_top_idxs]
        for query_top_idxs in indexes
    ]
    result = [(db_ids[i], vectors[i]) for i in range(len(db_ids))]
    return result
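# Hedged sketch of the underlying FAISS call used in search() above, on a tiny
# in-memory IndexFlatIP with random vectors (toy data, illustration only);
# search_and_reconstruct returns scores, integer ids, and the stored vectors.
import numpy as np
import faiss  # assumed installed

d = 8
toy_index = faiss.IndexFlatIP(d)
toy_index.add(np.random.rand(100, d).astype('float32'))
queries = np.random.rand(2, d).astype('float32')
scores, ids, vectors = toy_index.search_and_reconstruct(queries, 5)
# scores: (2, 5) similarities; ids: (2, 5) row ids; vectors: (2, 5, 8) stored vectors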
def download(url, path, fname, redownload=False, num_retries=5):
    """
    Download file using `requests`.

    If ``redownload`` is set to false, then will not download tar file again if it is
    present (default ``False``).
    """
    outfile = os.path.join(path, fname)
    download = not PathManager.exists(outfile) or redownload
    logging.info(f"Downloading {url} to {outfile}")
    retry = num_retries
    exp_backoff = [2 ** r for r in reversed(range(retry))]

    pbar = tqdm.tqdm(unit='B', unit_scale=True, desc='Downloading {}'.format(fname))

    while download and retry > 0:
        response = None

        with requests.Session() as session:
            try:
                response = session.get(url, stream=True, timeout=5)

                # negative reply could be 'none' or just missing
                CHUNK_SIZE = 32768
                total_size = int(response.headers.get('Content-Length', -1))
                # server returns remaining size if resuming, so adjust total
                pbar.total = total_size
                done = 0

                with PathManager.open(outfile, 'wb') as f:
                    for chunk in response.iter_content(CHUNK_SIZE):
                        if chunk:  # filter out keep-alive new chunks
                            f.write(chunk)
                        if total_size > 0:
                            done += len(chunk)
                            if total_size < done:
                                # don't freak out if content-length was too small
                                total_size = done
                                pbar.total = total_size
                            pbar.update(len(chunk))
                    break
            except (
                requests.exceptions.ConnectionError,
                requests.exceptions.ReadTimeout,
            ):
                retry -= 1
                pbar.clear()
                if retry > 0:
                    pl = 'y' if retry == 1 else 'ies'
                    logging.debug(
                        f'Connection error, retrying. ({retry} retr{pl} left)'
                    )
                    time.sleep(exp_backoff[retry])
                else:
                    logging.error('Retried too many times, stopped retrying.')
            finally:
                if response:
                    response.close()

    if retry <= 0:
        raise RuntimeError('Connection broken too many times. Stopped retrying.')

    if download and retry > 0:
        pbar.update(done - pbar.n)
        if done < total_size:
            raise RuntimeError(
                f'Received less data than specified in Content-Length header for '
                f'{url}. There may be a download problem.'
            )

    pbar.close()
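# Small illustration of the retry/backoff schedule used in download(): the sleep
# time grows as the remaining retry budget shrinks (values below follow directly
# from the exp_backoff expression above).
num_retries = 5
exp_backoff = [2 ** r for r in reversed(range(num_retries))]  # [16, 8, 4, 2, 1]
for retry in range(num_retries - 1, 0, -1):
    # after each failed attempt, download() sleeps exp_backoff[retry] seconds
    print(f'{num_retries - retry} failure(s) so far -> sleep {exp_backoff[retry]}s')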
def custom_evaluation(
    self,
    teacher_action: Message,
    labels: Optional[Tuple[str]],
    model_response: Message,
):
    if 'metrics' in model_response and 'type' in teacher_action:
        # keep copies of metrics across both api calls/responses
        prefix = teacher_action['type']
        keys = list(model_response['metrics'].keys())
        for k in keys:
            self.metrics.add(f'{prefix}_{k}', model_response['metrics'][k])

    if 'text' not in model_response or not labels or 'type' not in teacher_action:
        return

    domain = teacher_action['domain']

    if teacher_action['type'] == 'apicall':
        # also count slot accuracy
        text = model_response['text']
        slot_guesses = set(
            text.replace(CALL_TOKEN + " ", "").split(' ; ')
        )  # prevent cheating via repeated guesses
        correct = 0
        for slot_guess in slot_guesses:
            if ' = ' not in slot_guess:
                continue
            try:
                slot, guess = slot_guess.split(' = ')
            except ValueError:
                continue
            if teacher_action['slots'].get(slot) == guess:
                self.metrics.add('slot_p', AverageMetric(1))
                self.metrics.add(f'{domain}_slot_p', AverageMetric(1))
                correct += 1
            else:
                self.metrics.add('slot_p', AverageMetric(0))
                self.metrics.add(f'{domain}_slot_p', AverageMetric(0))
                logging.debug(
                    f"Bad slot guess '{slot_guess}' != {teacher_action['slots']}"
                )
        if teacher_action['slots']:
            self.metrics.add(
                'slot_r', AverageMetric(correct, len(teacher_action['slots']))
            )
            self.metrics.add(
                f'{domain}_slot_r',
                AverageMetric(correct, len(teacher_action['slots'])),
            )
            self.metrics.add(
                'jga', AverageMetric(correct == len(teacher_action['slots']))
            )
    elif teacher_action['type'] == 'apiresp':
        # keep track of statistics by domain
        f1_metric = F1Metric.compute(model_response['text'], labels)
        bleu_metric = BleuMetric.compute(model_response['text'], labels)
        self.metrics.add(f'{domain}_lex_f1', f1_metric)
        self.metrics.add(f'{domain}_lex_bleu', bleu_metric)

        delex_text = model_response['text']
        delex_label = labels[0]
        # compute delexicalized string metrics
        for slot, value in teacher_action['slots'].items():
            delex_text = delex_text.replace(value, slot)
            delex_label = delex_label.replace(value, slot)
        f1_metric = F1Metric.compute(delex_text, (delex_label,))
        self.metrics.add('delex_f1', f1_metric)
        self.metrics.add(f'{domain}_delex_f1', f1_metric)
        bleu_metric = BleuMetric.compute(delex_text, [delex_label])
        self.metrics.add('delex_bleu', bleu_metric)
        self.metrics.add(f'{domain}_delex_bleu', bleu_metric)
def access_long_term_memory(
    self,
    query_vec: torch.LongTensor,
    memory_indices: torch.LongTensor,
    memory_vec: Optional[torch.LongTensor],
    num_memories: torch.LongTensor,
    memory_decoder_vec: Optional[torch.LongTensor],
    generated_memories: List[List[str]],
) -> Tuple[Optional[List[List[Document]]], Optional[torch.Tensor]]:
    """
    Access long term memory.

    :param query_vec:
        retrieval vector for the long-term memory
    :param memory_indices:
        indices to access memory slots
    :param memory_vec:
        extracted memories from the observation
    :param num_memories:
        bsz-length tensor corresponding to number of memories per batch item
    :param memory_decoder_vec:
        input to the memory decoder
    :param generated_memories:
        memories generated by the memory decoder

    :return memories, memory_scores:
        return memories and memory scores, if there are memories retrieved
    """
    start = time.time()
    memories = None
    memory_scores = None
    memory_dict = {}
    indices = memory_indices.tolist()

    if memory_vec is not None:
        # Only look in memory_vec for batch elements with memories
        memory_ids = [m for m in indices if num_memories[m] > 0]
        memory_dict = {
            batch_id: memory_vec[batch_id, : num_memories[mem_id]]
            for batch_id, mem_id in enumerate(memory_ids)
        }
    if memory_decoder_vec is not None:
        for batch_id in indices:
            new_mems_i = generated_memories[batch_id]
            if not new_mems_i:
                continue
            tokenized = [
                self.long_term_memory.tokenize_query(m)
                for m in generated_memories[batch_id]
            ]
            if batch_id in memory_dict:
                tokenized += memory_dict[batch_id].tolist()
            new_mems_i, _ = padded_tensor(
                tokenized, pad_idx=self.dict[self.dict.null_token]  # type: ignore
            )
            memory_dict[batch_id] = new_mems_i.to(query_vec)
    if self.knowledge_access_method in [
        KnowledgeAccessMethod.ALL,
        KnowledgeAccessMethod.MEMORY_ONLY,
    ]:
        # Add dummy memories just in case we are retrieving from memories
        if memory_vec is not None:
            seqlen = memory_vec.size(-1)
        elif memory_decoder_vec is not None:
            seqlen = memory_decoder_vec.size(-1)
        else:
            seqlen = query_vec.size(-1)
        for batch_id in indices:
            if batch_id not in memory_dict:
                memory_dict[batch_id] = torch.zeros(1, seqlen).to(query_vec)
    if memory_dict:
        # first make sure all memories are padded to the same length.
        max_length = max([m.size(-1) for m in memory_dict.values()])
        for batch_id in memory_dict:
            vec = memory_dict[batch_id]
            if vec.size(-1) < max_length:
                memory_dict[batch_id] = torch.cat(
                    [
                        vec,
                        torch.zeros((*vec.shape[:-1], max_length - vec.size(-1)))
                        .fill_(self.dict[self.dict.null_token])
                        .to(vec),
                    ],
                    dim=1,
                )
        self.long_term_memory.write_memory(memory_dict)  # type: ignore
        logging.debug(f'Write Memory Complete: {time.time() - start:.2f}')
    if self.long_term_memory.has_memory():
        memories, memory_scores = self.long_term_memory.retrieve(
            query_vec[memory_indices]  # type: ignore
        )
        logging.debug(f'Memory Retrieval Complete: {time.time() - start:.2f}')
        logging.debug(f'memories: {memories}')
        logging.verbose('Reading from Memory')

    return memories, memory_scores
def retrieve_and_concat(
    self,
    input: torch.LongTensor,
    input_lengths: torch.LongTensor,
    query_generator_vec: torch.LongTensor,
    query_vec: torch.LongTensor,
    input_turns_cnt: torch.LongTensor,
    memory_vec: torch.LongTensor,
    num_memories: torch.LongTensor,
    gold_doc_vec: torch.LongTensor,
    gold_doc_title_vec: torch.LongTensor,
    num_gold_docs: torch.LongTensor,
    memory_decoder_vec: torch.LongTensor,
    num_memory_decoder_vecs: torch.LongTensor,
    skip_search: torch.BoolTensor,
) -> Tuple[torch.LongTensor, List[List[Document]], torch.Tensor]:
    """
    Override RagModel.retrieve_and_concat to perform different retrieval, depending
    on the RetrieverType.
    """
    self.flush_previous_retriever_search_results()
    start = time.time()
    logging.debug(f'Begin encoder: {time.time() - start:.2f}')
    if input_turns_cnt is not None:
        if query_generator_vec is not None:
            query_generator_vec = query_generator_vec.repeat_interleave(
                input_turns_cnt, dim=0
            )  # type: ignore
        if memory_vec is not None:
            memory_vec = memory_vec.repeat_interleave(
                input_turns_cnt, dim=0
            )  # type: ignore
        if num_memories is not None:
            num_memories = num_memories.repeat_interleave(
                input_turns_cnt, dim=0
            )  # type: ignore
        if memory_decoder_vec is not None:
            memory_decoder_vec = memory_decoder_vec.repeat_interleave(
                input_turns_cnt, dim=0
            )  # type: ignore
        if num_memory_decoder_vecs is not None:
            num_memory_decoder_vecs = num_memory_decoder_vecs.repeat_interleave(
                input_turns_cnt, dim=0
            )  # type: ignore
    n_input = (
        input_turns_cnt.sum().item() if input_turns_cnt is not None else input.size(0)
    )

    # 0a. Classify retrieval type, if necessary
    generated_memories = [[] for _ in range(int(n_input))]
    if memory_decoder_vec is not None:
        generated_memories = self.memory_decoder.generate_memories(
            memory_decoder_vec, num_memory_decoder_vecs
        )
    if self.should_generate_query:
        assert self.has_query_generator()
        retrieval_type, search_queries = self.query_generator.classify_retrieval(
            query_generator_vec, num_memories, generated_memories, skip_search
        )
        logging.debug(f'Classify Retrieval: {time.time() - start:.2f}')
    else:
        retrieval_type = torch.LongTensor(input.size(0))
        search_queries = None

    # 1. Retrieve
    top_docs: List[List[Document]] = [[] for _ in range(int(n_input))]
    doc_scores: List[List[torch.Tensor]] = [[] for _ in range(int(n_input))]

    # 1a. retrieve from faiss or search
    search_indices = self.get_retrieval_indices(retrieval_type, RetrievalType.SEARCH)
    if search_indices.numel() > 0:
        search_docs, search_doc_scores = self.perform_search(
            search_queries, query_vec, search_indices
        )
        logging.debug(f'Search Complete: {time.time() - start:.2f}')
        logging.debug(f'search: {search_docs}')
        if gold_doc_vec is not None:
            logging.debug(f'num gold docs: {num_gold_docs}')
        self._fill_docs_and_scores(
            top_docs,
            doc_scores,
            search_indices,
            search_docs,
            search_doc_scores,
            gold_doc_vec,
            gold_doc_title_vec,
            num_gold_docs,
        )

    # 1b. memory search
    memory_indices = self.get_retrieval_indices(retrieval_type, RetrievalType.MEMORY)
    if memory_indices.numel() > 0:
        memories, memory_scores = self.access_long_term_memory(
            query_vec,
            memory_indices,
            memory_vec,
            num_memories,
            memory_decoder_vec,
            generated_memories,
        )
        logging.debug(f'Memory Access Complete: {time.time() - start:.2f}')
        if memories is not None and memory_scores is not None:
            self._fill_docs_and_scores(
                top_docs, doc_scores, memory_indices, memories, memory_scores
            )

    # 1c. no search
    no_search_indices = self.get_retrieval_indices(retrieval_type, RetrievalType.NONE)
    if no_search_indices.numel() > 0:
        dummy_docs, dummy_scores = self.dummy_retriever.retrieve(
            query_vec[no_search_indices]  # type: ignore
        )
        logging.debug('no search')
        self._fill_docs_and_scores(
            top_docs, doc_scores, no_search_indices, dummy_docs, dummy_scores
        )

    # 2. Expand the input
    if input_turns_cnt is not None:
        input = input.repeat_interleave(input_turns_cnt, dim=0)  # type: ignore
        input_lengths = input_lengths.repeat_interleave(
            input_turns_cnt, dim=0
        )  # type: ignore

    # Filtering empty doc_scores added due to dynamic batching (if used)
    doc_scores = [[s for s in ds if s is not None] for ds in doc_scores if ds]
    top_doc_scores = torch.stack(
        [torch.cat([s_i for s_i in scores_i]) for scores_i in doc_scores]
    )
    expanded_input = self.concat_docs_and_input(
        input, input_lengths, top_docs, top_doc_scores.size(1)
    )
    return expanded_input, top_docs, top_doc_scores
def download_multiprocess(
    urls, path, num_processes=32, chunk_size=100, dest_filenames=None, error_path=None
):
    """
    Download items in parallel (e.g. for an image + dialogue task).

    WARNING: may have issues with OS X.

    :param urls:
        Array of urls to download
    :param path:
        directory to save items in
    :param num_processes:
        number of processes to use
    :param chunk_size:
        chunk size to use
    :param dest_filenames:
        optional array of same length as url with filenames. Images will be saved as
        path + dest_filename
    :param error_path:
        where to save error logs

    :return:
        array of tuples of (destination filename, http status code, error message if
        any). Note that upon failure, file may not actually be created.
    """
    pbar = tqdm.tqdm(total=len(urls), position=0)

    # Resume TODO: isfile() may take too long ?? Should I try in a .tmp file
    if dest_filenames:
        if len(dest_filenames) != len(urls):
            raise Exception(
                'If specified, destination filenames must equal url array in length.'
            )
    else:

        def _naming_fn(url, url_metadata=None):
            return hashlib.md5(url.encode('utf-8')).hexdigest()

        dest_filenames = [_naming_fn(url) for url in urls]

    items = zip(urls, dest_filenames)
    remaining_items = [
        it for it in items if not PathManager.exists(os.path.join(path, it[1]))
    ]
    logging.info(
        f'Of {len(urls)} items, {len(urls) - len(remaining_items)} already existed; '
        f'only going to download {len(remaining_items)} items.'
    )
    pbar.update(len(urls) - len(remaining_items))

    pool_chunks = (
        (remaining_items[i : i + chunk_size], path, _download_multiprocess_single)
        for i in range(0, len(remaining_items), chunk_size)
    )
    remaining_chunks_count = math.ceil(float(len(remaining_items) / chunk_size))
    logging.info(
        f'Going to download {remaining_chunks_count} chunks with {chunk_size} images '
        f'per chunk using {num_processes} processes.'
    )

    pbar.desc = 'Downloading'
    all_results = []
    collected_errors = []

    with Pool(num_processes) as pool:
        for idx, chunk_result in enumerate(
            pool.imap_unordered(_download_multiprocess_map_chunk, pool_chunks, 2)
        ):
            all_results.extend(chunk_result)
            for dest_file, http_status_code, error_msg in chunk_result:
                if http_status_code != 200:
                    # msg field available as third item in the tuple
                    # not using b/c error log file would blow up
                    collected_errors.append(
                        {
                            'dest_file': dest_file,
                            'status_code': http_status_code,
                            'error': error_msg,
                        }
                    )
                    logging.error(
                        f'Bad download - chunk: {idx}, dest_file: {dest_file}, '
                        f'http status code: {http_status_code}, error_msg: {error_msg}'
                    )
            pbar.update(len(chunk_result))
    pbar.close()

    if error_path:
        now = time.strftime("%Y%m%d-%H%M%S")
        error_filename = os.path.join(
            error_path, 'parlai_download_multiprocess_errors_%s.log' % now
        )

        with PathManager.open(os.path.join(error_filename), 'w') as error_file:
            error_file.write(json.dumps(collected_errors))
            logging.error(f'Summary of errors written to {error_filename}')

    logging.info(
        f'Of {len(remaining_items)} items attempted downloading, '
        f'{len(collected_errors)} had errors.'
    )

    logging.debug('Finished downloading chunks.')
    return all_results
def retrieve_and_score(
    self, query: torch.LongTensor
) -> Tuple[List[List[Document]], torch.Tensor]:
    """
    Retrieve relevant documents for the query (the conversation context).

    This method conducts three main steps that are flagged in the main code as well.

    Step 1: generate search queries for the conversation context batch. This step
    uses the query generator model (self.query_generator).

    Step 2: use the search client to retrieve documents. This step uses the
    retrieval API agent (self.search_client).

    Step 3: generate the list of Document objects from the retrieved content. Here,
    if the documents are too long, the code splits them and chooses a chunk based on
    the selected `doc_chunks_ranker` in the opt.
    """
    # step 1
    search_queries = self.generate_search_query(query)

    # step 2
    search_results_batch = self.search_client.retrieve(search_queries, self.n_docs)

    # step 3
    top_docs = []
    top_doc_scores = []
    max_n_docs: int = self.n_docs
    for sq, search_results in zip(search_queries, search_results_batch):
        if not search_results:
            search_results = self._empty_docs(self.n_docs)
        elif len(search_results) < self.n_docs:
            remain_docs = self.n_docs - len(search_results)
            search_results.extend(self._empty_docs(remain_docs))
        docs_i = []
        scores_i = []
        # Change this debug later
        logging.debug(f'URLS:\n{self._display_urls(search_results)}')
        for i, doc in enumerate(search_results):
            url = doc['url']
            title = doc['title']
            dcontent = doc['content']
            assert type(dcontent) in (
                str,
                list,
            ), f'Unrecognized retrieved doc: {dcontent}'
            full_text = (
                dcontent if isinstance(dcontent, str) else '\n'.join(doc['content'])
            )
            doc_chunks = [dc[0] for dc in self.pick_chunk(sq, title, full_text, url)]
            for splt_id, splt_content in enumerate(doc_chunks):
                docs_i.append(
                    Document(docid=url, text=splt_content, title=f'{title}_{splt_id}')
                )
                scores_i.append(self.rank_score(i))
        max_n_docs = max(max_n_docs, len(docs_i))
        top_docs.append(docs_i)
        top_doc_scores.append(scores_i)

    # Pad with empty docs
    for i in range(len(top_docs)):
        n_empty = max_n_docs - len(top_docs[i])
        if n_empty:
            top_docs[i] = top_docs[i] + [BLANK_DOC] * n_empty
            top_doc_scores[i] = top_doc_scores[i] + [0] * n_empty

    self.top_docs = top_docs
    return top_docs, torch.Tensor(top_doc_scores).to(query.device)
def shutdown(self):
    logging.debug("Killing all the worker processes")
    for p in self._process_pool.processes:
        p.kill()
    super().shutdown()
def build_dict(opt, skip_if_built=False):
    if isinstance(opt, ParlaiParser):
        logging.error('Should be passed opt not Parser')
        opt = opt.parse_args()
    if not opt.get('dict_file'):
        logging.error(
            'Tried to build dictionary but `--dict-file` is not set. Set '
            'this param so the dictionary can be saved.'
        )
        return
    if skip_if_built and PathManager.exists(opt['dict_file']):
        # Dictionary already built, skip all loading or setup
        logging.debug("dictionary already built.")
        return None

    if opt.get('dict_class'):
        # Custom dictionary class
        dictionary = str2class(opt['dict_class'])(opt)
    else:
        # Default dictionary class
        dictionary = DictionaryAgent(opt)

    if PathManager.exists(opt['dict_file']) or (
        hasattr(dictionary, 'is_prebuilt') and dictionary.is_prebuilt()
    ):
        # Dictionary already built, return loaded dictionary agent
        logging.debug("dictionary already built.")
        return dictionary

    if is_distributed():
        raise ValueError('Dictionaries should be pre-built before distributed train.')

    ordered_opt = copy.deepcopy(opt)
    cnt = 0
    # we use train set to build dictionary
    ordered_opt['batchsize'] = 1
    # Set this to none so that image features are not calculated when Teacher is
    # instantiated while building the dict
    ordered_opt['image_mode'] = 'no_image_model'

    ordered_opt.log()

    datatypes = ['train:ordered:stream']
    if opt.get('dict_include_valid'):
        datatypes.append('valid:stream')
    if opt.get('dict_include_test'):
        datatypes.append('test:stream')
    cnt = 0
    for dt in datatypes:
        ordered_opt['datatype'] = dt
        world_dict = create_task(ordered_opt, dictionary)
        # pass examples to dictionary
        log_time = TimeLogger()
        total = world_dict.num_examples()
        if opt['dict_maxexs'] >= 0:
            total = min(total, opt['dict_maxexs'])

        log_every_n_secs = opt.get('log_every_n_secs', None)
        if log_every_n_secs:
            pbar = tqdm.tqdm(
                total=total, desc='Building dictionary', unit='ex', unit_scale=True
            )
        else:
            pbar = None
        while not world_dict.epoch_done():
            cnt += 1
            if cnt > opt['dict_maxexs'] and opt['dict_maxexs'] >= 0:
                logging.info('Processed {} exs, moving on.'.format(opt['dict_maxexs']))
                # don't wait too long...
                break
            world_dict.parley()
            if pbar:
                pbar.update(1)
        if pbar:
            pbar.close()

    dictionary.save(opt['dict_file'], sort=True)
    logging.info(
        f'dictionary built with {len(dictionary)} tokens '
        f'in {log_time.total_time():.1f}s'
    )
    return dictionary
def retrieve_and_score(
    self, query: torch.LongTensor
) -> Tuple[List[List[Document]], torch.Tensor]:
    """
    Override retrieve and score to filter out docs that contain the label string.

    Copy over the whole thing because we need to check before chunking.
    """
    # step 1
    search_queries = self.generate_search_query(query)  # type: ignore

    # step 2
    search_results_batch = self.search_client.retrieve(
        search_queries, self.n_docs
    )  # type: ignore

    # step 3
    top_docs = []
    top_doc_scores = []
    max_n_docs: int = self.n_docs  # type: ignore
    for batch_id, (sq, search_results) in enumerate(
        zip(search_queries, search_results_batch)
    ):
        if not search_results:
            search_results = self._empty_docs(self.n_docs)  # type: ignore
        elif len(search_results) < self.n_docs:  # type: ignore
            remain_docs = self.n_docs - len(search_results)  # type: ignore
            search_results.extend(self._empty_docs(remain_docs))  # type: ignore
        docs_i = []
        scores_i = []
        # Change this debug later
        logging.debug(f'URLS:\n{self._display_urls(search_results)}')  # type: ignore
        label_text = self.dict.vec2txt(self.label_vec[batch_id, :-1])  # type: ignore
        for i, doc in enumerate(search_results):
            url = doc['url']
            title = doc['title']
            dcontent = doc['content']
            assert type(dcontent) in (
                str,
                list,
            ), f'Unrecognized retrieved doc: {dcontent}'
            full_text = (
                dcontent if isinstance(dcontent, str) else '\n'.join(doc['content'])
            )
            if label_text in full_text:
                docs_i.append(BLANK_DOC)
                scores_i.append(0)
            else:
                doc_chunks = [
                    dc[0]
                    for dc in self.pick_chunk(sq, title, full_text, url)  # type: ignore
                ]
                for splt_id, splt_content in enumerate(doc_chunks):
                    docs_i.append(
                        Document(
                            docid=url, text=splt_content, title=f'{title}_{splt_id}'
                        )
                    )
                    scores_i.append(self.rank_score(i))  # type: ignore
        max_n_docs = max(max_n_docs, len(docs_i))
        top_docs.append(docs_i)
        top_doc_scores.append(scores_i)

    # Pad with empty docs
    for i in range(len(top_docs)):
        n_empty = max_n_docs - len(top_docs[i])
        if n_empty:
            top_docs[i] = top_docs[i] + [BLANK_DOC] * n_empty
            top_doc_scores[i] = top_doc_scores[i] + [0] * n_empty

    self.top_docs = top_docs
    self.search_queries = search_queries
    return top_docs, torch.Tensor(top_doc_scores).to(query.device)