help=f"Format of output. Available: {[x.value for x in list(OutputFormat)]}") run_parser.add_argument('--output', type=str, metavar='path', required=False, help="Path to output file.") run_parser.add_argument('--max-passage', action='store_true', default=False, help="Select only max passage from document.") run_parser.add_argument('--max-passage-hits', type=int, metavar='num', required=False, default=100, help="Final number of hits when selecting only max passage.") run_parser.add_argument('--max-passage-delimiter', type=str, metavar='str', required=False, default='#', help="Delimiter between docid and passage id.") run_parser.add_argument('--batch-size', type=int, metavar='num', required=False, default=1, help="Specify batch size to search the collection concurrently.") run_parser.add_argument('--threads', type=int, metavar='num', required=False, default=1, help="Maximum number of threads to use.") args = parse_args(parser, commands) query_iterator = get_query_iterator(args.run.topics, TopicsFormat(args.run.topics_format)) topics = query_iterator.topics query_encoder = init_query_encoder(args.dense.encoder, args.dense.tokenizer, args.run.topics, args.dense.encoded_queries, args.dense.device) if os.path.exists(args.dense.index): # create searcher from index directory dsearcher = SimpleDenseSearcher(args.dense.index, query_encoder) else: # create searcher from prebuilt index name dsearcher = SimpleDenseSearcher.from_prebuilt_index(args.dense.index, query_encoder)
required=False, default=1, help="Specify batch size to search the collection concurrently.") parser.add_argument('--threads', type=int, metavar='num', required=False, default=1, help="Maximum number of threads to use.") parser.add_argument('--tokenizer', type=str, help='tokenizer used to preprocess topics') args = parser.parse_args() query_iterator = get_query_iterator(args.topics, TopicsFormat(args.topics_format)) topics = query_iterator.topics if os.path.exists(args.index): # create searcher from index directory searcher = SimpleSearcher(args.index) else: # create searcher from prebuilt index name searcher = SimpleSearcher.from_prebuilt_index(args.index) if args.language != 'en': searcher.set_language(args.language) if not searcher: exit()
parser.add_argument('--encoder', type=str, help='encoder name or path',
                    default='facebook/dpr-question_encoder-multiset-base', required=False)
parser.add_argument('--output', type=str, help='path to store query embeddings', required=True)
parser.add_argument('--device', type=str, help='device cpu or cuda [cuda:0, cuda:1...]',
                    default='cpu', required=False)
args = parser.parse_args()

query_iterator = get_query_iterator(args.topics, TopicsFormat(TopicsFormat.DEFAULT.value))
topics = query_iterator.topics
encoder = DkrrDprQueryEncoder(args.encoder, args.device)

# Accumulate one row per topic: id, raw query text, and its dense embedding.
# (The enumerate index in the original loop was unused; len(topics) replaces
# the equivalent len(topics.keys()).)
embeddings = {'id': [], 'text': [], 'embedding': []}
for topic_id, text in tqdm(query_iterator, total=len(topics)):
    embeddings['id'].append(topic_id)
    embeddings['text'].append(text)
    embeddings['embedding'].append(encoder.encode(text))

# Persist as a pickled DataFrame so downstream search can load it directly.
embeddings = pd.DataFrame(embeddings)
embeddings.to_pickle(args.output)