import numpy as np


def QueryPreprocessingFn(args, line, tokenizer):
    line_arr = line.split('\t')
    q_id = int(line_arr[0])

    if 'fairseq' not in args.train_model_type:
        # HuggingFace tokenizer: special tokens added, truncated to the
        # query budget (the query function should use max_query_length,
        # matching the padding below, not max_seq_length).
        passage = tokenizer.encode(
            line_arr[1].rstrip(),
            add_special_tokens=True,
            max_length=args.max_query_length,
        )
        pad_token_id = tokenizer.pad_token_id
    elif 'fast' in args.train_model_type:
        # fairseq model with a `tokenizers` fast tokenizer: encode() returns
        # an Encoding object, so the ids live in `.ids`. fairseq pads with 1.
        full_text = line_arr[1].rstrip().lower()
        passage = tokenizer.encode(
            full_text, add_special_tokens=True).ids[:args.max_query_length]
        pad_token_id = 1
    else:
        # Plain fairseq tokenizer: encode() returns a tensor of token ids.
        full_text = line_arr[1].rstrip().lower()
        passage = list(np.array(
            tokenizer.encode(full_text)[:args.max_query_length]))
        pad_token_id = 1

    passage_len = min(len(passage), args.max_query_length)
    input_id_b = pad_input_ids(
        passage, args.max_query_length, pad_token=pad_token_id)
    return q_id.to_bytes(8, 'big') + passage_len.to_bytes(4, 'big') + \
        np.array(input_id_b, np.int32).tobytes()
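# `pad_input_ids` is a repo helper whose definition is not shown in this
# file. This is a minimal sketch of its assumed behavior (truncate to
# max_length, then right-pad with the pad token); the real helper may
# differ in detail.
def pad_input_ids(input_ids, max_length, pad_token=0):
    padding_length = max_length - len(input_ids)
    if padding_length <= 0:
        # Truncate sequences that exceed the budget.
        return input_ids[:max_length]
    # Right-pad shorter sequences up to max_length.
    return input_ids + [pad_token] * padding_length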
def PassagePreprocessingFn(args, line, tokenizer):
    if args.data_type == 0:
        line_arr = line.split('\t')
        p_id = int(line_arr[0][1:])  # remove "D"

        url = line_arr[1].rstrip()
        title = line_arr[2].rstrip()
        p_text = line_arr[3].rstrip()

        if 'fast' in args.train_model_type:
            full_text = url.lower() + " [SEP] " + title.lower() + \
                " [SEP] " + p_text.lower()
        elif 'fairseq' in args.train_model_type:
            full_text = url + " </s> " + title + " </s> " + p_text
        else:
            # `sep_token` is the separator string (e.g. "[SEP]");
            # `sep_token_id` is an int and cannot be concatenated here.
            full_text = url + " " + tokenizer.sep_token + " " + title + \
                " " + tokenizer.sep_token + " " + p_text
        # keep only the first 10000 characters, which is sufficient for any
        # experiment that uses fewer than 500-1k tokens
        full_text = full_text[:args.max_doc_character]
    else:
        line = line.strip()
        line_arr = line.split('\t')
        p_id = int(line_arr[0])
        p_text = line_arr[1].rstrip()
        # keep only the first 10000 characters, which is sufficient for any
        # experiment that uses fewer than 500-1k tokens
        full_text = p_text[:args.max_doc_character]

    if 'fairseq' not in args.train_model_type:
        passage = tokenizer.encode(
            full_text,
            add_special_tokens=True,
            max_length=args.max_seq_length,
        )
        pad_token_id = tokenizer.pad_token_id
    elif 'fast' in args.train_model_type:
        if args.data_type == 1:
            # data_type == 0 text was already lowercased above.
            full_text = full_text.lower()
        passage = tokenizer.encode(
            full_text, add_special_tokens=True).ids[:args.max_seq_length]
        pad_token_id = 1
    else:
        full_text = full_text.lower()
        passage = list(np.array(
            tokenizer.encode(full_text)[:args.max_seq_length]))
        pad_token_id = 1

    passage_len = min(len(passage), args.max_seq_length)
    input_id_b = pad_input_ids(
        passage, args.max_seq_length, pad_token=pad_token_id)
    return p_id.to_bytes(8, 'big') + passage_len.to_bytes(4, 'big') + \
        np.array(input_id_b, np.int32).tobytes()
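# Each preprocessed record is a fixed-layout byte string: an 8-byte
# big-endian id, a 4-byte big-endian length, then max_seq_length int32
# token ids. A minimal sketch of reading one passage record back
# (`decode_record` is a hypothetical helper, not part of the original repo):
def decode_record(record, max_seq_length):
    p_id = int.from_bytes(record[:8], 'big')
    passage_len = int.from_bytes(record[8:12], 'big')
    input_ids = np.frombuffer(
        record[12:12 + 4 * max_seq_length], dtype=np.int32)
    return p_id, passage_len, input_ids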
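# Minimal usage sketch for both functions, assuming a HuggingFace tokenizer
# ('fairseq' not in train_model_type). The args values, model type string,
# and sample ids/text are illustrative placeholders, not values taken from
# the original repo.
if __name__ == '__main__':
    from argparse import Namespace
    from transformers import AutoTokenizer

    args = Namespace(train_model_type='rdot_nll', data_type=1,
                     max_query_length=64, max_seq_length=512,
                     max_doc_character=10000)
    tokenizer = AutoTokenizer.from_pretrained('roberta-base')

    q_record = QueryPreprocessingFn(
        args, '42\twhat is dense retrieval', tokenizer)
    p_record = PassagePreprocessingFn(
        args, '7\ta passage about dense retrieval', tokenizer)
    # 8-byte id + 4-byte length + 4 bytes per token id.
    print(len(q_record), len(p_record))  # 268 2060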