def set_embeddings(self):
    # Read word embeddings.
    if not self.opt.get('embedding_file'):
        logger.warning(
            '[ WARNING: No embeddings provided. '
            'Keeping random initialization. ]'
        )
        return
    logger.info('[ Loading pre-trained embeddings ]')
    embeddings = load_embeddings(self.opt, self.word_dict)
    logger.info('[ Num embeddings = %d ]' % embeddings.size(0))

    # Sanity check dimensions
    new_size = embeddings.size()
    old_size = self.network.embedding.weight.size()
    if new_size[1] != old_size[1]:
        raise RuntimeError('Embedding dimensions do not match.')
    if new_size[0] != old_size[0]:
        logger.warning(
            '[ WARNING: Number of embeddings changed (%d->%d) ]'
            % (old_size[0], new_size[0])
        )

    # Swap weights
    self.network.embedding.weight.data = embeddings

    # If partially tuning the embeddings, keep the old values
    if self.opt['tune_partial'] > 0:
        if self.opt['tune_partial'] + 2 < embeddings.size(0):
            fixed_embedding = embeddings[self.opt['tune_partial'] + 2:]
            self.network.fixed_embedding = fixed_embedding
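A minimal sketch of the partial-tuning bookkeeping above. The `+2` offset is assumed here to skip reserved dictionary entries; the point is just that rows from `tune_partial + 2` onward are cached so they can be written back after optimizer steps touch the whole embedding table.

import torch

# Illustrative only: cache the rows we never intend to tune.
k = 3                                        # stands in for opt['tune_partial']
embeddings = torch.randn(10, 4)              # toy embedding table
fixed_embedding = embeddings[k + 2:].clone()

# ... after an optimizer step that updated every row ...
embeddings.data[k + 2:] = fixed_embedding    # restore the untuned rows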
def split_into_seen_unseen(dpath: str):
    """
    Following WoW, we have overlap in train, valid, and test seen, but none in
    test unseen.

    Do an 80:10:5:5 split between train, valid, test_seen, and test_unseen, or
    as close to it as possible. We need ~205 conversations for test_unseen to
    do this, and movies 1 and 3 have 90 and 117 conversations respectively,
    which is about that.
    """
    random.seed(42)
    cdir = os.path.join(dpath, "conversations")
    new = {"train": {}, "valid": {}, "test_seen": {}, "test_unseen": {}}
    for fold in ["test", "valid", "train"]:
        with PathManager.open(os.path.join(cdir, f"{fold}_deduped.json")) as f:
            data = json.load(f)
        for k, v in data.items():
            if v["wikiDocumentIdx"] == 1 or v["wikiDocumentIdx"] == 3:
                new["test_unseen"][k] = v
            else:
                rand = random.randint(1, 95)
                if rand <= 80:
                    new["train"][k] = v
                elif rand <= 90:
                    new["valid"][k] = v
                else:
                    new["test_seen"][k] = v

    for fold in new:
        with PathManager.open(
            os.path.join(cdir, f"{fold}_split_seen_unseen.json"), "w+"
        ) as f:
            json.dump(new[fold], f, indent=2)
        c_cnt = len(new[fold])
        logger.info(f"Seen/unseen {fold} conversation count: {c_cnt}")
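A quick sanity sketch (not part of the build script) of the thresholds used above: with rand drawn uniformly from 1..95, the non-unseen conversations land in train/valid/test_seen in roughly 80:10:5 proportions, which together with the ~5% pulled out as test_unseen approximates the 80:10:5:5 target.

import random

random.seed(42)
counts = {"train": 0, "valid": 0, "test_seen": 0}
for _ in range(10000):
    rand = random.randint(1, 95)
    if rand <= 80:
        counts["train"] += 1
    elif rand <= 90:
        counts["valid"] += 1
    else:
        counts["test_seen"] += 1
print(counts)  # roughly 84% / 10.5% / 5.3% of the non-unseen conversations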
def main():
    random.seed(42)

    # Get command line arguments
    parser = ParlaiParser(True, True)
    RemoteAgentAgent.add_cmdline_args(parser)
    opt = parser.parse_args()

    remote = RemoteAgentAgent(opt)
    if opt.get('task'):
        world = create_task(opt, [remote])
    else:
        if opt.get('model'):
            local = create_agent(opt)
        else:
            local = LocalHumanAgent(opt)
        # the remote-host goes **second**
        agents = [local, remote] if not opt['remote_host'] else [remote, local]
        world = DialogPartnerWorld(opt, agents)

    # Talk to the remote agent
    with world:
        while True:
            world.parley()
            logger.info(world.display())
def truncate(data, row, col):
    global MAX_SZ
    if len(data) > MAX_SZ:
        over = len(data) - MAX_SZ
        pct = over / len(data)
        logger.info(
            'Data size is too large for scipy to index all of it. '
            'Throwing out {} entries ({:.2%} of data).'.format(over, pct)
        )
        data = data[:MAX_SZ]
        row = row[:MAX_SZ]
        col = col[:MAX_SZ]
    return data, row, col
def __init__(self, tfidf_path=None, strict=True):
    """
    Args:
        tfidf_path: path to saved model file
        strict: fail on empty queries or continue (and return empty result)
    """
    # Load from disk
    logger.info('Loading %s' % tfidf_path)
    matrix, metadata = utils.load_sparse_csr(tfidf_path)
    self.doc_mat = matrix
    self.ngrams = metadata['ngram']
    self.hash_size = metadata['hash_size']
    self.tokenizer = tokenizers.get_class(metadata['tokenizer'])()
    self.doc_freqs = metadata['doc_freqs'].squeeze()
    self.doc_dict = metadata.get('doc_dict', None)
    self.num_docs = self.doc_mat.shape[1] - 1
    self.strict = strict
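A minimal construction sketch. The enclosing class name (written here as TfidfDocRanker) and the model path are assumptions for illustration; only the constructor arguments and attributes come from the code above.

# Hypothetical usage: load a saved tf-idf model and inspect it.
ranker = TfidfDocRanker(tfidf_path='/path/to/model.npz', strict=False)
print(ranker.num_docs)   # number of indexed documents
print(ranker.hash_size)  # size of the feature-hashing space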
def build_deduped_split(dpath: str):
    """
    The original CMU-DoG release has 110 conversation ids that appear in more
    than one of train/valid/test.

    Get rid of the duplication.
    """
    cdir = os.path.join(dpath, "conversations")
    data = {}
    for fold in ["test", "valid", "train"]:
        fpath = os.path.join(cdir, f"{fold}.json")
        with PathManager.open(fpath) as f:
            data[fold] = json.load(f)

    train_len = len(data["train"])
    valid_len = len(data["valid"])
    test_len = len(data["test"])
    logger.info(
        f"Conversation count with duplicates: train-{train_len}, valid-{valid_len}, test-{test_len}"
    )

    train_valid = set(data["train"].keys()) & set(data["valid"].keys())
    train_test = set(data["train"].keys()) & set(data["test"].keys())
    valid_test = set(data["valid"].keys()) & set(data["test"].keys())

    for key in train_valid:
        data["train"].pop(key)
    for key in train_test:
        data["train"].pop(key)
    for key in valid_test:
        data["test"].pop(key)

    train_len = len(data["train"])
    valid_len = len(data["valid"])
    test_len = len(data["test"])
    logger.info(
        f"Conversation count without duplicates: train-{train_len}, valid-{valid_len}, test-{test_len}"
    )

    for fold in ["test", "valid", "train"]:
        fpath = os.path.join(cdir, f"{fold}_deduped.json")
        with PathManager.open(fpath, "w+") as f:
            json.dump(data[fold], f, indent=2)
def get_count_matrix(args, db_opts):
    """
    Form a sparse word to document count matrix (inverted index).

    M[i, j] = # times word i appears in document j.
    """
    global MAX_SZ
    with DocDB(**db_opts) as doc_db:
        doc_ids = doc_db.get_doc_ids()

    # Setup worker pool
    tok_class = tokenizers.get_class(args.tokenizer)
    workers = ProcessPool(
        args.num_workers, initializer=init, initargs=(tok_class, db_opts)
    )

    # Compute the count matrix in steps (to keep in memory)
    logger.info('Mapping...')
    row, col, data = [], [], []
    step = max(int(len(doc_ids) / 10), 1)
    batches = [doc_ids[i:i + step] for i in range(0, len(doc_ids), step)]
    _count = partial(count, args.ngram, args.hash_size)
    for i, batch in enumerate(batches):
        logger.info('-' * 25 + 'Batch %d/%d' % (i + 1, len(batches)) + '-' * 25)
        for b_row, b_col, b_data in workers.imap_unordered(_count, batch):
            row.extend(b_row)
            col.extend(b_col)
            data.extend(b_data)
            if len(data) > MAX_SZ:
                break
        if len(data) > MAX_SZ:
            logger.info('Reached max indexable size, breaking.')
            break
    workers.close()
    workers.join()

    logger.info('Creating sparse matrix...')
    data, row, col = truncate(data, row, col)
    count_matrix = sp.csr_matrix(
        (data, (row, col)), shape=(args.hash_size, len(doc_ids) + 1)
    )
    count_matrix.sum_duplicates()
    return count_matrix
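A toy illustration of the matrix assembly at the end of get_count_matrix: repeated (hashed word, document) pairs are accumulated into counts when the COO-style triplets are converted to CSR, and sum_duplicates makes that explicit.

import scipy.sparse as sp

# Two occurrences of word 0 in document 1 collapse into a single count of 2.
row = [0, 0, 3]      # hashed word ids
col = [1, 1, 2]      # document indices
data = [1, 1, 1]     # one entry per occurrence
m = sp.csr_matrix((data, (row, col)), shape=(8, 4))
m.sum_duplicates()
print(m[0, 1])       # 2
print(m[3, 2])       # 1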
def run(args):
    # ParlAI version of run method, modified slightly
    logger.info('Counting words...')
    count_matrix = get_count_matrix(args, {'db_path': args.db_path})

    logger.info('Making tfidf vectors...')
    tfidf = get_tfidf_matrix(count_matrix)

    logger.info('Getting word-doc frequencies...')
    freqs = get_doc_freqs(count_matrix)

    filename = args.out_dir

    logger.info('Saving to %s' % filename)
    metadata = {
        'doc_freqs': freqs,
        'tokenizer': args.tokenizer,
        'hash_size': args.hash_size,
        'ngram': args.ngram,
    }
    utils.save_sparse_csr(filename, tfidf, metadata)
def get_count_matrix_t(args, db_opts):
    """
    Form a sparse word to document count matrix (inverted index, torch version).

    M[i, j] = # times word i appears in document j.
    """
    global MAX_SZ
    with DocDB(**db_opts) as doc_db:
        doc_ids = doc_db.get_doc_ids()

    # Setup worker pool
    tok_class = tokenizers.get_class(args.tokenizer)
    workers = ProcessPool(
        args.num_workers, initializer=init, initargs=(tok_class, db_opts)
    )

    # Compute the count matrix in steps (to keep in memory)
    logger.info('Mapping...')
    row, col, data = [], [], []
    step = max(int(len(doc_ids) / 10), 1)
    batches = [doc_ids[i:i + step] for i in range(0, len(doc_ids), step)]
    _count = partial(count, args.ngram, args.hash_size)
    for i, batch in enumerate(batches):
        logger.info('-' * 25 + 'Batch %d/%d' % (i + 1, len(batches)) + '-' * 25)
        for b_row, b_col, b_data in workers.imap_unordered(_count, batch):
            row.extend(b_row)
            col.extend(b_col)
            data.extend(b_data)
    workers.close()
    workers.join()

    logger.info('Creating sparse matrix...')
    count_matrix = torch.sparse.FloatTensor(
        torch.LongTensor([row, col]),
        torch.FloatTensor(data),
        torch.Size([args.hash_size, len(doc_ids) + 1]),
    ).coalesce()
    return count_matrix
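The torch variant relies on coalesce() for the same accumulation. A toy sketch using the same legacy sparse constructor as the function above:

import torch

# coalesce() sums values that share an index, turning per-occurrence 1s into counts.
indices = torch.LongTensor([[0, 0, 3],   # hashed word ids
                            [1, 1, 2]])  # document indices
values = torch.FloatTensor([1, 1, 1])
m = torch.sparse.FloatTensor(indices, values, torch.Size([8, 4])).coalesce()
print(m.to_dense()[0, 1])  # tensor(2.)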
def store_contents(opt, task, save_path, context_length=-1, include_labels=True):
    """
    Preprocess and store a corpus of documents in sqlite.

    Args:
        task: ParlAI task of text (and possibly values) to store.
        save_path: Path to output sqlite db.
        context_length: Number of past utterances to keep as context
            (-1 keeps the full history).
        include_labels: Whether to append the chosen label to the context.
    """
    if os.path.isfile(save_path):
        raise RuntimeError('%s already exists! Not overwriting.' % save_path)

    logger.info('Reading into database...')
    conn = sqlite3.connect(save_path)
    c = conn.cursor()
    c.execute('CREATE TABLE documents (id INTEGER PRIMARY KEY, text, value);')

    if not task:
        logger.info('No data to initialize table: just creating table.')
        logger.info('Add more data by passing observations to the agent.')
        logger.info('Committing...')
        conn.commit()
        conn.close()
        return

    ordered_opt = opt.copy()
    dt = opt.get('datatype', '').split(':')
    ordered_opt['datatype'] = ':'.join([dt[0], 'ordered'] + dt[1:])
    ordered_opt['batchsize'] = 1
    ordered_opt['numthreads'] = 1
    ordered_opt['task'] = task
    teacher = create_task_agent_from_taskname(ordered_opt)[0]

    episode_done = False
    current = []
    triples = []
    context_length = context_length if context_length >= 0 else None
    context = deque(maxlen=context_length)
    with tqdm(total=teacher.num_episodes()) as pbar:
        while not teacher.epoch_done():
            # collect examples in episode
            while not episode_done:
                action = teacher.act()
                current.append(action)
                episode_done = action['episode_done']

            for ex in current:
                if 'text' in ex:
                    text = ex['text']
                    context.append(text)
                    if len(context) > 1:
                        text = '\n'.join(context)

                # add labels to context
                labels = ex.get('labels', ex.get('eval_labels'))
                label = None
                if labels is not None:
                    label = random.choice(labels)
                    if include_labels:
                        context.append(label)

                # use None for ID to auto-assign doc ids--we don't need to
                # ever reverse-lookup them
                triples.append((None, text, label))

            c.executemany('INSERT OR IGNORE INTO documents VALUES (?,?,?)', triples)
            pbar.update()

            # reset flags and content
            episode_done = False
            triples.clear()
            current.clear()
            context.clear()

    logger.info(
        'Read %d examples from %d episodes.'
        % (teacher.num_examples(), teacher.num_episodes())
    )
    logger.info('Committing...')
    conn.commit()
    conn.close()
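An illustrative read-back of the resulting database (the path is hypothetical): each stored row is (auto-assigned id, context text, sampled label).

import sqlite3

conn = sqlite3.connect('/path/to/output.db')  # hypothetical path
c = conn.cursor()
c.execute('SELECT id, text, value FROM documents LIMIT 3')
for doc_id, text, value in c.fetchall():
    print(doc_id, repr(text)[:60], repr(value))
conn.close()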
def main():
    # Get command line arguments
    parser = ParlaiParser()
    parser.add_argument('-n', '--num-iters', default=10, type=int)
    parser.add_argument('-a', '--num-agents', default=1, type=int)
    opt = parser.parse_args()

    agents = []
    for _ in range(opt['num_agents']):
        agents.append(Agent(opt))

    opt['datatype'] = 'train'
    world_train = create_task(opt, agents)

    opt['datatype'] = 'valid'
    world_valid = create_task(opt, agents)

    start = time.time()
    # train / valid loop
    for _ in range(1):
        logger.info('[ training ]')
        for _ in range(opt['num_iters']):  # train for a bit
            world_train.parley()

        logger.info('[ training summary. ]')
        logger.info(world_train.report())

        logger.info('[ validating ]')
        for _ in range(1):  # check valid accuracy
            world_valid.parley()

        logger.info('[ validation summary. ]')
        logger.info(world_valid.report())

    logger.info('finished in {} s'.format(round(time.time() - start, 2)))
parser.add_argument(
    '--tokenizer',
    type=str,
    default='simple',
    help="String option specifying tokenizer type to use (e.g. 'corenlp')",
)
parser.add_argument(
    '--num-workers',
    type=int,
    default=None,
    help='Number of CPU processes (for tokenizing, etc)',
)
args = parser.parse_args()

logger.info('Counting words...')
count_matrix = get_count_matrix(args, {'db_path': args.db_path})

logger.info('Making tfidf vectors...')
tfidf = get_tfidf_matrix(count_matrix)

logger.info('Getting word-doc frequencies...')
freqs = get_doc_freqs(count_matrix)

basename = os.path.splitext(os.path.basename(args.db_path))[0]
basename += '-tfidf-ngram=%d-hash=%d-tokenizer=%s' % (
    args.ngram,
    args.hash_size,
    args.tokenizer,
)
filename = os.path.join(args.out_dir, basename)