if __name__ == '__main__':
    # Encode every KILT topic with a DPR query encoder and pickle the result
    # as a pandas DataFrame (columns: id, text, embedding).
    parser = argparse.ArgumentParser(description='Compute embeddings for KILT topics')
    parser.add_argument('--topics', required=True)
    parser.add_argument('--output', default="embedding.pkl",
                        help="Name and path to output file.")
    parser.add_argument('--encoder',
                        metavar='path to query encoder checkpoint or encoder name',
                        required=True,
                        help="Path to query encoder pytorch checkpoint or hgf encoder model name")
    parser.add_argument('--tokenizer', metavar='name or path', required=True,
                        help="Path to a hgf tokenizer name or path")
    parser.add_argument('--device', metavar='device to run query encoder',
                        required=False, default='cpu',
                        help="Device to run query encoder, cpu or [cuda:0, cuda:1, ...]")
    args = parser.parse_args()

    query_iterator = get_query_iterator(args.topics, TopicsFormat.KILT)
    query_encoder = DprQueryEncoder(encoder_dir=args.encoder,
                                    tokenizer_name=args.tokenizer,
                                    device=args.device)

    # One row per topic. The original loop used enumerate() but never read the
    # index, and dropped topic_id entirely; the id is kept now so the output
    # can be joined back to the topics (matching the sibling encoding script).
    ids = []
    texts = []
    embeddings = []
    for topic_id, text in tqdm(query_iterator):
        ids.append(topic_id)
        texts.append(text)
        embeddings.append(query_encoder.encode(text))

    df = pd.DataFrame({
        'id': ids,
        'text': texts,
        'embedding': embeddings
    })
    df.to_pickle(args.output)
# NOTE(review): this chunk begins mid-statement — the opening of this
# add_argument call (the flag name, presumably a batch-size option) and the
# parser construction precede the visible region.
metavar='num', required=False, default=1,
                    help="Specify batch size to search the collection concurrently.")
parser.add_argument('--threads', type=int, metavar='num', required=False, default=1,
                    help="Maximum number of threads to use.")
parser.add_argument('--tokenizer', type=str,
                    help='tokenizer used to preprocess topics')
args = parser.parse_args()

# Build the topic iterator for the requested topics file and format.
query_iterator = get_query_iterator(args.topics, TopicsFormat(args.topics_format))
topics = query_iterator.topics

if os.path.exists(args.index):
    # create searcher from index directory
    searcher = SimpleSearcher(args.index)
else:
    # create searcher from prebuilt index name
    searcher = SimpleSearcher.from_prebuilt_index(args.index)

# Non-English collections need a language-specific analyzer.
if args.language != 'en':
    searcher.set_language(args.language)

# NOTE(review): a plain object is always truthy unless its class defines
# __bool__; presumably this guards a factory that returns None on failure —
# TODO confirm against SimpleSearcher.from_prebuilt_index.
if not searcher:
    exit()
# Encode each topic with a DkrrDprQueryEncoder and pickle the embeddings as a
# pandas DataFrame (columns: id, text, embedding).
# NOTE(review): the parser construction and earlier arguments precede this
# visible region; only the tail of the script is shown here.
parser.add_argument('--encoder', type=str, help='encoder name or path',
                    default='facebook/dpr-question_encoder-multiset-base',
                    required=False)
parser.add_argument('--output', type=str, help='path to store query embeddings',
                    required=True)
parser.add_argument('--device', type=str, help='device cpu or cuda [cuda:0, cuda:1...]',
                    default='cpu', required=False)
args = parser.parse_args()

# TopicsFormat.DEFAULT directly — the original re-built the enum member from
# its own value via TopicsFormat(TopicsFormat.DEFAULT.value), a no-op round trip.
query_iterator = get_query_iterator(args.topics, TopicsFormat.DEFAULT)
topics = query_iterator.topics

encoder = DkrrDprQueryEncoder(args.encoder, args.device)

# One row per topic. The original enumerate() index was never used, and
# len(topics.keys()) is just len(topics).
embeddings = {'id': [], 'text': [], 'embedding': []}
for topic_id, text in tqdm(query_iterator, total=len(topics)):
    embeddings['id'].append(topic_id)
    embeddings['text'].append(text)
    embeddings['embedding'].append(encoder.encode(text))

embeddings = pd.DataFrame(embeddings)
embeddings.to_pickle(args.output)