# Assumed imports for the snippets in this section: json / logging / torch /
# tqdm are standard; count_file_lines, Query, Document, Session, Vocabulary
# and the BOS_WORD / EOS_WORD / PAD_WORD / UNK_WORD special tokens are
# project-local helpers defined elsewhere in the repository.
import json
import logging

import torch
from tqdm import tqdm

logger = logging.getLogger(__name__)


def load_data(args, filename, max_examples=-1, dataset_name='msmarco'):
    """Load examples from preprocessed file. One example per line, JSON encoded."""
    # Load JSON lines
    with open(filename) as f:
        data = [json.loads(line) for line in
                tqdm(f, total=count_file_lines(filename))]

    examples = []
    for example in tqdm(data):
        if dataset_name == 'msmarco':
            session_queries = []
            for query in example['query']:
                qObj = Query(query['id'])
                qObj.text = ' '.join(query['tokens'])
                qtokens = query['tokens']
                # skip empty or over-long queries (BOS/EOS count toward the length)
                if len(qtokens) == 0 or len(qtokens) + 2 > args.max_query_len:
                    continue
                qObj.tokens = [BOS_WORD] + qtokens + [EOS_WORD]

                # --- record the candidate documents
                candidates = []
                for candidate in query['candidates']:
                    document = Document(candidate['id'])
                    # TODO: what should we use for documents? title/content?
                    content_tokens = candidate['content'].split()
                    if len(content_tokens) == 0:
                        continue
                    content_tokens = content_tokens[:args.max_doc_len]
                    document.tokens = content_tokens
                    assert isinstance(candidate['label'], bool)
                    document.label = 1 if candidate['label'] else 0
                    candidates.append(document)

                # keep the query only if the full candidate set survived
                if len(candidates) == args.num_candidates:
                    qObj.documents = candidates
                    session_queries.append(qObj)

            # sessions must contain at least 2 queries
            if len(session_queries) < 2:
                continue

            session = Session(example['session_id'])
            session.queries = session_queries
            examples.append(session)

        # stop once max_examples examples have been collected
        if max_examples != -1 and len(examples) >= max_examples:
            break

    return examples
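# For reference: each input line for the session loader above decodes to a
# dict shaped roughly like the sketch below. The schema is inferred from the
# fields the loader reads ('session_id', 'query', 'tokens', 'candidates',
# 'content', 'label'); actual preprocessed files may carry additional fields.
#
# {
#     "session_id": "s-001",
#     "query": [
#         {
#             "id": "q-001",
#             "tokens": ["best", "pizza", "nyc"],
#             "candidates": [
#                 {"id": "d-001", "content": "A guide to ...", "label": true},
#                 {"id": "d-002", "content": "History of ...", "label": false}
#             ]
#         }
#     ]
# }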
def load_data(args, filename, max_examples=-1, dataset_name='msmarco'):
    """Load examples from preprocessed file. One example per line, JSON encoded."""
    # Load JSON lines
    with open(filename) as f:
        data = [json.loads(line) for line in
                tqdm(f, total=count_file_lines(filename))]

    examples = []
    # based on model_type, we arrange the data differently below
    model_type = args.model_type.upper()
    for example in tqdm(data):
        if dataset_name == 'msmarco':
            session_queries = []
            for query in example['query']:
                qObj = Query(query['id'])
                qObj.text = ' '.join(query['tokens'])
                qtokens = query['tokens']
                # skip empty or over-long queries (BOS/EOS count toward the length)
                if len(qtokens) == 0 or len(qtokens) + 2 > args.max_query_len:
                    continue
                qObj.tokens = [BOS_WORD] + qtokens + [EOS_WORD]
                session_queries.append(qObj)

            # sessions must contain at least 2 queries
            if len(session_queries) < 2:
                continue

            if model_type == 'SEQ2SEQ':
                # every session contains exactly 2 consecutive queries
                for i in range(len(session_queries) - 1):
                    session = Session(example['session_id'] + str(i))
                    session.queries = session_queries[i:i + 2]
                    assert len(session) == 2
                    examples.append(session)
            elif model_type == 'ACG':
                # every session contains exactly 2 queries, but the first is
                # the concatenation of all queries up to timestep i
                for i in range(len(session_queries) - 1):
                    session = Session(example['session_id'] + str(i))
                    session.add_one_query(session_queries[0:i + 1])
                    session.add_query(session_queries[i + 1])
                    assert len(session) == 2
                    examples.append(session)
            elif model_type == 'HREDQS':
                # keep the whole multi-query session intact
                session = Session(example['session_id'])
                session.queries = session_queries
                examples.append(session)

        # stop once max_examples examples have been collected
        if max_examples != -1 and len(examples) >= max_examples:
            break

    return examples
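# The loaders above exercise only a small surface of the Query / Document /
# Session containers: attribute access to .tokens / .documents / .queries,
# Session.add_query / Session.add_one_query, and len(session). The real
# classes live elsewhere in the repository; the definitions below are an
# assumed minimal sketch, written only to make that interface concrete.

class Query(object):
    def __init__(self, qid):
        self.id = qid
        self.text = None
        self.tokens = []
        self.documents = []


class Document(object):
    def __init__(self, did):
        self.id = did
        self.tokens = []
        self.label = 0


class Session(object):
    def __init__(self, sid):
        self.id = sid
        self.queries = []

    def add_query(self, query):
        # append a single Query to the session
        self.queries.append(query)

    def add_one_query(self, queries):
        # merge a list of Queries into one concatenated Query (ACG-style);
        # this merging behavior is an assumption, not the project's code
        merged = Query('+'.join(str(q.id) for q in queries))
        merged.tokens = [t for q in queries for t in q.tokens]
        merged.text = ' '.join(q.text for q in queries if q.text)
        self.queries.append(merged)

    def __len__(self):
        return len(self.queries)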
def index_embedding_words(embedding_file):
    """Put all the words in embedding_file into a set."""
    words = set()
    with open(embedding_file) as f:
        for line in tqdm(f, total=count_file_lines(embedding_file)):
            w = Vocabulary.normalize(line.rstrip().split(' ')[0])
            words.add(w)

    words.update([BOS_WORD, EOS_WORD, PAD_WORD, UNK_WORD])
    return words
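# count_file_lines is referenced throughout this section but not defined in
# it; it presumably lives in a shared utility module. A straightforward
# stand-in, assuming it does nothing more than count lines so tqdm can
# display accurate totals:

def count_file_lines(file_path):
    """Count the number of lines in a file (used only for progress bars)."""
    with open(file_path, 'rb') as f:
        return sum(1 for _ in f)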
def load_embeddings(self, words, embedding_file):
    """Load pretrained embeddings for a given list of words, if they exist.

    Args:
        words: iterable of tokens. Only those that are indexed in the
            dictionary are kept.
        embedding_file: path to text file of embeddings, space separated.
    """
    emb_layer = self.network.embedder.word_embeddings
    words = {w for w in words if w in self.src_dict}
    logger.info('Loading pre-trained embeddings for %d words from %s' %
                (len(words), embedding_file))

    # When normalized, some words are duplicated; average their embeddings.
    vec_counts, embedding = {}, {}
    with open(embedding_file, encoding='utf8') as f:
        # Skip the first line if it is a "<count> <dim>" header.
        line = f.readline().rstrip().split(' ')
        if len(line) != 2:
            f.seek(0)

        duplicates = set()
        for line in tqdm(f, total=count_file_lines(embedding_file)):
            parsed = line.rstrip().split(' ')
            assert len(parsed) == emb_layer.word_vec_size + 1
            w = self.src_dict.normalize(parsed[0])
            if w in words:
                vec = torch.Tensor([float(i) for i in parsed[1:]])
                if w not in vec_counts:
                    vec_counts[w] = 1
                    embedding[w] = vec
                else:
                    duplicates.add(w)
                    vec_counts[w] = vec_counts[w] + 1
                    embedding[w].add_(vec)

    if len(duplicates) > 0:
        logger.warning('Duplicate embedding found for %s' % ', '.join(duplicates))

    for w, c in vec_counts.items():
        embedding[w].div_(c)

    emb_layer.init_word_vectors(self.src_dict, embedding, self.args.fix_embeddings)
    logger.info('Loaded %d embeddings (%.2f%%)' %
                (len(vec_counts), 100 * len(vec_counts) / len(words)))
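# One plausible wiring of the two embedding helpers, sketched as comments
# since the owning model class is not shown in this section: build the word
# set (here from the embedding file itself via index_embedding_words), then
# let load_embeddings filter against src_dict, average duplicates, and
# initialize the embedding layer. The file path and the `model` instance are
# illustrative only.
#
#     words = index_embedding_words('glove.840B.300d.txt')
#     model.load_embeddings(words, 'glove.840B.300d.txt')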
def char_ngrams(tokens, n):
    """Replace each token with its overlapping character n-grams, after
    wrapping the token in '#' boundary markers (e.g. 'cat' -> '#cat#')."""
    grams = []
    for token in tokens:
        term = '#' + token + '#'
        for j in range(0, len(term) - n + 1):
            grams.append(term[j:j + n])
    return grams


def load_data(args, filename, max_examples=-1, dataset_name='msmarco'):
    """Load examples from preprocessed file. One example per line, JSON encoded."""
    # Load JSON lines
    with open(filename) as f:
        data = [json.loads(line) for line in
                tqdm(f, total=count_file_lines(filename))]

    examples = []
    for session in tqdm(data):
        if dataset_name == 'msmarco':
            for query in session['query']:
                qObj = Query(query['id'])
                qtokens = query['tokens']
                # skip empty or over-long queries (BOS/EOS count toward the length)
                if len(qtokens) == 0 or len(qtokens) + 2 > args.max_query_len:
                    continue
                # skip queries that do not come with a full candidate set
                if len(query['candidates']) != args.num_candidates:
                    continue
                qtokens = [BOS_WORD] + qtokens + [EOS_WORD]
                if args.use_char_ngram > 0:
                    qtokens = char_ngrams(qtokens, args.use_char_ngram)
                qObj.tokens = qtokens

                candidates = []
                for candidate in query['candidates']:
                    document = Document(candidate['id'])
                    # TODO: what should we use for documents? title/content?
                    content_tokens = candidate['content'].split()
                    if len(content_tokens) == 0:
                        continue
                    # reserve two slots for the BOS/EOS markers
                    content_tokens = content_tokens[:args.max_doc_len - 2]
                    content_tokens = [BOS_WORD] + content_tokens + [EOS_WORD]
                    if args.use_char_ngram > 0:
                        content_tokens = char_ngrams(content_tokens,
                                                     args.use_char_ngram)
                    document.tokens = content_tokens
                    assert isinstance(candidate['label'], bool)
                    document.label = 1 if candidate['label'] else 0
                    candidates.append(document)

                if len(candidates) == args.num_candidates:
                    qObj.documents = candidates
                    examples.append(qObj)

        # stop once max_examples examples have been collected
        if max_examples != -1 and len(examples) >= max_examples:
            break

    return examples
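# Worked example of the character n-gram expansion used above, with n = 3:
# each token is wrapped in '#' boundary markers and split into overlapping
# trigrams, so the model consumes sub-word units rather than whole words.
#
#     >>> char_ngrams(['where'], 3)
#     ['#wh', 'whe', 'her', 'ere', 're#']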