def test_flatten(self):
    dset_split = Dataset.from_dict(
        {"a": [{"b": {"c": ["text"]}}] * 10, "foo": [1] * 10},
        features=Features({"a": {"b": Sequence({"c": Value("string")})}, "foo": Value("int64")}),
    )
    dset = DatasetDict({"train": dset_split, "test": dset_split})
    dset = dset.flatten()
    self.assertDictEqual(dset.column_names, {"train": ["a.b.c", "foo"], "test": ["a.b.c", "foo"]})
    self.assertListEqual(list(dset["train"].features.keys()), ["a.b.c", "foo"])
    self.assertDictEqual(
        dset["train"].features,
        Features({"a.b.c": Sequence(Value("string")), "foo": Value("int64")}),
    )
    del dset
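# A minimal standalone sketch (mirroring the test above, not part of the test suite)
# of what Dataset.flatten() does: the nested struct column "a" is replaced by a flat
# "a.b.c" column.
from datasets import Dataset, Features, Sequence, Value

ds = Dataset.from_dict(
    {"a": [{"b": {"c": ["text"]}}], "foo": [1]},
    features=Features({"a": {"b": Sequence({"c": Value("string")})}, "foo": Value("int64")}),
)
print(ds.flatten().column_names)  # ['a.b.c', 'foo']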
def test_cast_array_to_features():
    arr = pa.array([[0, 1]])
    assert cast_array_to_feature(arr, Sequence(Value("string"))).type == pa.list_(pa.string())
    with pytest.raises(TypeError):
        cast_array_to_feature(arr, Sequence(Value("string")), allow_number_to_str=False)
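# A hedged user-level analogue of the low-level cast above: Dataset.cast_column goes
# through the same feature-casting path, so an integer list column can be cast to
# Sequence(Value("string")) (numbers are stringified by default, as the test implies).
from datasets import Dataset, Sequence, Value

ds = Dataset.from_dict({"tokens": [[0, 1]]})
ds = ds.cast_column("tokens", Sequence(Value("string")))
print(ds[0]["tokens"])  # ['0', '1']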
def test_dataset_with_image_feature_with_none():
    data = {"image": [None]}
    features = Features({"image": Image()})
    dset = Dataset.from_dict(data, features=features)
    item = dset[0]
    assert item.keys() == {"image"}
    assert item["image"] is None
    batch = dset[:1]
    assert len(batch) == 1
    assert batch.keys() == {"image"}
    assert isinstance(batch["image"], list) and all(item is None for item in batch["image"])
    column = dset["image"]
    assert len(column) == 1
    assert isinstance(column, list) and all(item is None for item in column)

    # nested tests
    data = {"images": [[None]]}
    features = Features({"images": Sequence(Image())})
    dset = Dataset.from_dict(data, features=features)
    item = dset[0]
    assert item.keys() == {"images"}
    assert all(i is None for i in item["images"])

    data = {"nested": [{"image": None}]}
    features = Features({"nested": {"image": Image()}})
    dset = Dataset.from_dict(data, features=features)
    item = dset[0]
    assert item.keys() == {"nested"}
    assert item["nested"].keys() == {"image"}
    assert item["nested"]["image"] is None
def test_image_feature_type_to_arrow():
    features = Features({"image": Image()})
    assert features.arrow_schema == pa.schema({"image": Image().pa_type})

    features = Features({"struct_containing_an_image": {"image": Image()}})
    assert features.arrow_schema == pa.schema({"struct_containing_an_image": pa.struct({"image": Image().pa_type})})

    features = Features({"sequence_of_images": Sequence(Image())})
    assert features.arrow_schema == pa.schema({"sequence_of_images": pa.list_(Image().pa_type)})
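# A hedged usage sketch (not from the test file): with an Image() feature, string
# paths are stored in the Arrow schema above and decoded to PIL images on access.
# "img/cat.png" is a hypothetical local file, not an asset of this repo.
from datasets import Dataset, Features, Image

ds = Dataset.from_dict({"image": ["img/cat.png"]}, features=Features({"image": Image()}))
print(ds.features["image"])  # the Image(...) feature
print(ds[0]["image"])        # a PIL.Image.Image, decoded lazily when the row is read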
def save_data(train_df, val_df):
    train_f = Features({
        'answers': Sequence(feature={
            'text': Value(dtype='string', id=None),
            'answer_start': Value(dtype='int32', id=None)
        }, length=-1, id=None),
        'context': Value(dtype='string', id=None),
        'id': Value(dtype='string', id=None),
        'question': Value(dtype='string', id=None),
        'question_type': Value(dtype='int32', id=None)
    })

    train_datasets = DatasetDict({
        'train': Dataset.from_pandas(train_df, features=train_f),
        'validation': Dataset.from_pandas(val_df, features=train_f)
    })

    with open("../../data/question_type.pkl", "wb") as file:
        pickle.dump(train_datasets, file)
def get_etr_dataset(args):
    etr_path = p.join(args.path.train_data_dir, "etr_qa_dataset.json")

    if not p.exists(etr_path):
        raise FileNotFoundError(
            f"Please rename the ETRI dataset file to {etr_path} and place the data there.")

    with open(etr_path, "r") as f:
        etr_dict = json.load(f)

    # print(etr_dict["data"][0])

    new_dataset = defaultdict(list)

    cnt = 0
    for datas in etr_dict["data"]:
        title = datas["title"]
        context = datas["paragraphs"][0]["context"]

        for questions in datas["paragraphs"][0]["qas"]:
            question = questions["question"]
            answers = {
                "answer_start": [questions["answers"][0]["answer_start"]],
                "text": [questions["answers"][0]["text"]],
            }

            new_dataset["id"].append(f"etr-custom-{cnt}")
            new_dataset["title"].append(title)
            new_dataset["context"].append(context)
            new_dataset["question"].append(question)
            new_dataset["answers"].append(answers)
            cnt += 1

    f = Features({
        "answers": Sequence(
            feature={
                "text": Value(dtype="string", id=None),
                "answer_start": Value(dtype="int32", id=None)
            },
            length=-1,
            id=None,
        ),
        "id": Value(dtype="string", id=None),
        "context": Value(dtype="string", id=None),
        "question": Value(dtype="string", id=None),
        "title": Value(dtype="string", id=None),
    })

    df = pd.DataFrame(new_dataset)
    etr_dataset = Dataset.from_pandas(df, features=f)

    return etr_dataset
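# A toy, self-contained illustration (sample row invented here) of the record layout
# the SQuAD-style Features used in get_etr_dataset expects before Dataset.from_pandas.
import pandas as pd
from datasets import Dataset, Features, Sequence, Value

squad_features = Features({
    "answers": Sequence({"text": Value("string"), "answer_start": Value("int32")}),
    "id": Value("string"),
    "context": Value("string"),
    "question": Value("string"),
    "title": Value("string"),
})
sample = {
    "id": "etr-custom-0",
    "title": "Example title",
    "context": "The capital of France is Paris.",
    "question": "What is the capital of France?",
    "answers": {"text": ["Paris"], "answer_start": [25]},
}
print(Dataset.from_pandas(pd.DataFrame([sample]), features=squad_features)[0])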
def main(
    rag_example_args: "RagExampleArguments",
    processing_args: "ProcessingArguments",
    index_hnsw_args: "IndexHnswArguments",
):
    ######################################
    logger.info("Step 1 - Create the dataset")
    ######################################

    # The dataset needed for RAG must have three columns:
    # - title (string): title of the document
    # - text (string): text of a passage of the document
    # - embeddings (array of dimension d): DPR representation of the passage

    # Let's say you have documents in tab-separated csv files with columns "title" and "text"
    assert os.path.isfile(rag_example_args.csv_path), "Please provide a valid path to a csv file"

    # You can load a Dataset object this way
    dataset = load_dataset(
        "csv", data_files=[rag_example_args.csv_path], split="train", delimiter="\t", column_names=["title", "text"]
    )

    # More info about loading csv files in the documentation: https://huggingface.co/docs/datasets/loading_datasets.html?highlight=csv#csv-files

    # Then split the documents into passages of 100 words
    dataset = dataset.map(split_documents, batched=True, num_proc=processing_args.num_proc)

    # And compute the embeddings
    ctx_encoder = DPRContextEncoder.from_pretrained(rag_example_args.dpr_ctx_encoder_model_name).to(device=device)
    ctx_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained(rag_example_args.dpr_ctx_encoder_model_name)
    new_features = Features(
        {"text": Value("string"), "title": Value("string"), "embeddings": Sequence(Value("float32"))}
    )  # optional, save as float32 instead of float64 to save space
    dataset = dataset.map(
        partial(embed, ctx_encoder=ctx_encoder, ctx_tokenizer=ctx_tokenizer),
        batched=True,
        batch_size=processing_args.batch_size,
        features=new_features,
    )

    # And finally save your dataset
    passages_path = os.path.join(rag_example_args.output_dir, "my_knowledge_dataset")
    dataset.save_to_disk(passages_path)
    # from datasets import load_from_disk
    # dataset = load_from_disk(passages_path)  # to reload the dataset

    ######################################
    logger.info("Step 2 - Index the dataset")
    ######################################

    # Let's use the Faiss implementation of HNSW for fast approximate nearest neighbor search
    index = faiss.IndexHNSWFlat(index_hnsw_args.d, index_hnsw_args.m, faiss.METRIC_INNER_PRODUCT)
    dataset.add_faiss_index("embeddings", custom_index=index)

    # And save the index
    index_path = os.path.join(rag_example_args.output_dir, "my_knowledge_dataset_hnsw_index.faiss")
    dataset.get_index("embeddings").save(index_path)
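# Hedged reload sketch (not part of the script above): how the saved artifacts would
# typically be loaded back, assuming the `passages_path` / `index_path` values written
# in main() and a DPR question embedding `question_embedding` (a float32 NumPy vector
# of dimension d) computed elsewhere.
from datasets import load_from_disk

reloaded = load_from_disk(passages_path)             # dataset with "title", "text", "embeddings"
reloaded.load_faiss_index("embeddings", index_path)  # reattach the saved HNSW index
scores, passages = reloaded.get_nearest_examples("embeddings", question_embedding, k=5)
print(passages["title"])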
def make_negative_dataset(args, bm25, queries, answers, contexts, name, num=16):
    total = []
    scores, indices = bm25.get_relevant_doc_bulk(queries, topk=num * 2)

    answers, indices = np.array(answers, dtype="object"), np.array(indices)
    contexts = np.array(contexts, dtype="object")

    for idx, query in enumerate(queries):
        label = idx % num
        answer = answers[idx]
        context_list = contexts[indices[idx]]

        check_in = np.argwhere(context_list == answer)

        if check_in.shape[0] == 0:
            context_list[label] = answer
            context_list = context_list[:num]
        else:
            context_list[check_in[0][0]] = context_list[num]
            context_list[label] = answer
            context_list = context_list[:num]

        if idx % 100 == 0:
            print("query: ", query)
            print("answer: ", answer)
            print("negative:", context_list)
            print("label:", label)

        tmp = {"query": query, "negative_samples": context_list, "label": label}
        total.append(tmp)

    df = pd.DataFrame(total)
    f = Features({
        "query": Value(dtype="string", id=None),
        "negative_samples": Sequence(feature=Value(dtype="string", id=None), length=-1, id=None),
        "label": Value(dtype="int32", id=None),
    })

    dataset = Dataset.from_pandas(df, features=f)
    dataset.save_to_disk(os.path.join(args.path.train_data_dir, name))
def generate_faiss_index_dataset(data, ctx_encoder_name, args, device):
    """
    Adapted from Huggingface example script at
    https://github.com/huggingface/transformers/blob/master/examples/research_projects/rag/use_own_knowledge_dataset.py
    """
    import faiss

    if isinstance(data, str):
        dataset = load_dataset("csv", data_files=data, delimiter="\t", column_names=["title", "text"])
    else:
        dataset = HFDataset.from_pandas(data)

    dataset = dataset.map(
        partial(split_documents, split_text_n=args.split_text_n, split_text_character=args.split_text_character),
        batched=True,
        num_proc=args.process_count,
    )

    ctx_encoder = DPRContextEncoder.from_pretrained(ctx_encoder_name).to(device=device)
    ctx_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained(ctx_encoder_name)

    new_features = Features({
        "text": Value("string"),
        "title": Value("string"),
        "embeddings": Sequence(Value("float32")),
    })  # optional, save as float32 instead of float64 to save space
    dataset = dataset.map(
        partial(embed, ctx_encoder=ctx_encoder, ctx_tokenizer=ctx_tokenizer, device=device),
        batched=True,
        batch_size=args.rag_embed_batch_size,
        features=new_features,
    )
    if isinstance(data, str):
        dataset = dataset["train"]

    if args.save_knowledge_dataset:
        output_dataset_directory = os.path.join(args.output_dir, "knowledge_dataset")
        os.makedirs(output_dataset_directory, exist_ok=True)
        dataset.save_to_disk(output_dataset_directory)

    index = faiss.IndexHNSWFlat(args.faiss_d, args.faiss_m, faiss.METRIC_INNER_PRODUCT)
    dataset.add_faiss_index("embeddings", custom_index=index)

    return dataset
def embed_update(ctx_encoder, total_processes, device, process_num, shard_dir, csv_path):
    kb_dataset = load_dataset(
        "csv", data_files=[csv_path], split="train", delimiter="\t", column_names=["title", "text"]
    )
    kb_dataset = kb_dataset.map(
        split_documents, batched=True, num_proc=1
    )  # if you want, you can load an already split csv.
    kb_list = [kb_dataset.shard(total_processes, i, contiguous=True) for i in range(total_processes)]
    data_shard = kb_list[process_num]

    arrow_folder = "data_" + str(process_num)
    passages_path = os.path.join(shard_dir, arrow_folder)

    context_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained("facebook/dpr-ctx_encoder-multiset-base")
    ctx_encoder = ctx_encoder.to(device=device)

    def embed(
        documents: dict, ctx_encoder: DPRContextEncoder, ctx_tokenizer: DPRContextEncoderTokenizerFast, device
    ) -> dict:
        """Compute the DPR embeddings of document passages"""
        input_ids = ctx_tokenizer(
            documents["title"], documents["text"], truncation=True, padding="longest", return_tensors="pt"
        )["input_ids"]
        embeddings = ctx_encoder(input_ids.to(device=device), return_dict=True).pooler_output
        return {"embeddings": embeddings.detach().cpu().numpy()}

    new_features = Features({
        "text": Value("string"),
        "title": Value("string"),
        "embeddings": Sequence(Value("float32")),
    })  # optional, save as float32 instead of float64 to save space

    dataset = data_shard.map(
        partial(embed, ctx_encoder=ctx_encoder, ctx_tokenizer=context_tokenizer, device=device),
        batched=True,
        batch_size=16,
        features=new_features,
    )

    dataset.save_to_disk(passages_path)
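# Hedged follow-up sketch (not part of embed_update): once every process has written
# its shard, the per-process arrow folders under shard_dir can be reassembled into a
# single dataset. The "data_<process_num>" naming mirrors the function above.
import os
from datasets import concatenate_datasets, load_from_disk

shards = [load_from_disk(os.path.join(shard_dir, f"data_{i}")) for i in range(total_processes)]
full_dataset = concatenate_datasets(shards)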
def run_sparse_retrieval(datasets, training_args):
    #### retrieval process ####
    retriever = SparseRetrieval(
        tokenize_fn=tokenize,
        data_path="./data",
        context_path="wikipedia_documents.json"
        # context_path="all_wikipedia_documents.json"
    )

    # sparse embedding retrieval
    # retriever.get_sparse_embedding()
    # df = retriever.retrieve(datasets['validation'])

    # bm25 retrieval
    # retriever.get_embedding_BM25()
    # df = retriever.retrieve_BM25(query_or_dataset=datasets['validation'], topk=10)

    # elastic search retrieval
    # retriever.get_elastic_search()
    df = retriever.retrieve_ES(query_or_dataset=datasets['validation'], topk=10)

    # faiss retrieval
    # df = retriever.retrieve_faiss(dataset['validation'])

    if training_args.do_predict:
        # Test data has no answers, so the dataset only contains id, question, and context.
        f = Features({
            'context': Value(dtype='string', id=None),
            'id': Value(dtype='string', id=None),
            'question': Value(dtype='string', id=None)
        })
    elif training_args.do_eval:
        # Train data has answers, so the dataset contains id, question, context, and answers.
        f = Features({
            'answers': Sequence(feature={
                'text': Value(dtype='string', id=None),
                'answer_start': Value(dtype='int32', id=None)
            }, length=-1, id=None),
            'context': Value(dtype='string', id=None),
            'id': Value(dtype='string', id=None),
            'question': Value(dtype='string', id=None)
        })

    datasets = DatasetDict({'validation': Dataset.from_pandas(df, features=f)})
    return datasets
def run_sparse_retrieval(datasets, training_args, inf_args):
    #### retrieval process ####
    if inf_args.retrieval is None:
        retriever = SparseRetrieval_BM25PLUS(
            tokenize_fn=tokenize,
            data_path="./data",
            context_path="wikipedia_documents.json")
    elif inf_args.retrieval.lower() == "sparse":
        retriever = SparseRetrieval(
            tokenize_fn=tokenize,
            data_path="./data",
            context_path="wikipedia_documents.json")
    # elif inf_args.retrieval.lower() == "bm25":
    #     retriever = SparseRetrieval_BM25(tokenize_fn=tokenize,
    #                                      data_path="./data",
    #                                      context_path="wikipedia_documents.json")

    retriever.get_sparse_embedding()
    df = retriever.retrieve(datasets['validation'], inf_args.k)
    # faiss retrieval

    # Test data has no answers, so the dataset only contains id, question, and context.
    if training_args.do_predict:
        f = Features({
            'contexts': Value(dtype='string', id=None),
            'id': Value(dtype='string', id=None),
            'question': Value(dtype='string', id=None)
        })

    # Train data has answers, so the dataset contains id, question, context, and answers.
    elif training_args.do_eval:
        f = Features({
            'answers': Sequence(feature={
                'text': Value(dtype='string', id=None),
                'answer_start': Value(dtype='int32', id=None)
            }, length=-1, id=None),
            'context': Value(dtype='string', id=None),
            'id': Value(dtype='string', id=None),
            'question': Value(dtype='string', id=None)
        })

    datasets = DatasetDict({'validation': Dataset.from_pandas(df, features=f)})
    return datasets
def read_dataset_from_csv(csv_path):
    """Read the prepared csv data as a Dataset object."""
    df = pd.read_csv(csv_path, converters={'token': str, 'written': str, 'spoken': str})
    feature_tag = Sequence(ClassLabel(num_classes=3, names=list(pd.factorize(df['tag'])[1])))
    df['tag'] = df['tag'].apply(feature_tag.feature.str2int)
    df_text = df.groupby(['sentence_id']).agg({'token': list, 'tag': list})
    dataset = Dataset.from_pandas(df_text)
    dataset.features["tag"] = feature_tag
    return dataset
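# A small self-contained illustration (label names assumed, not taken from any csv)
# of the ClassLabel round-trip that read_dataset_from_csv relies on: tags are stored
# as ints via str2int and can be mapped back with int2str.
from datasets import ClassLabel, Sequence

feature_tag = Sequence(ClassLabel(num_classes=3, names=["O", "B-SPOKEN", "I-SPOKEN"]))
ints = feature_tag.feature.str2int(["O", "B-SPOKEN", "I-SPOKEN"])
print(ints)                               # [0, 1, 2]
print(feature_tag.feature.int2str(ints))  # ['O', 'B-SPOKEN', 'I-SPOKEN']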
def load_dataset(self) -> None:
    logger.debug('loading rag dataset: %s', self.name)
    self.dataset = load_dataset(
        'csv',
        data_files=[self.csv_path],
        split='train',
        delimiter=',',
        column_names=['title', 'text'])
    self.dataset = self.dataset.map(
        split_documents,
        batched=False,
        num_proc=6,
        batch_size=100,
    )
    ctx_encoder = DPRContextEncoder.from_pretrained(self.context_encoder).to(device=self.device)
    ctx_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained(self.context_encoder)
    new_features = Features({
        'text': Value('string'),
        'title': Value('string'),
        'embeddings': Sequence(Value('float32'))
    })  # optional, save as float32 instead of float64 to save space
    self.dataset = self.dataset.map(
        partial(embed, ctx_encoder=ctx_encoder, ctx_tokenizer=ctx_tokenizer, device=self.device),
        batched=True,
        batch_size=16,
        features=new_features,
    )
    self.dataset.save_to_disk(self.dataset_path)
    index = faiss.IndexHNSWFlat(768, 128, faiss.METRIC_INNER_PRODUCT)
    self.dataset.add_faiss_index('embeddings', custom_index=index)
    self.dataset.get_index('embeddings').save(self.faiss_path)
def predict(self, input_path, output_path):
    key = 'tmp'
    input_df = pd.DataFrame()
    if self.pretrained:
        input_df['src_token'] = read_txt(input_path)
        input_df['src_token'] = input_df['src_token'].str.lower()
        input_df['token'] = input_df['src_token'].str.split()
        input_df['tag'] = input_df['token'].apply(lambda x: ['O'] * len(x))
        input_df['sentence_id'] = input_df.index

        trainer = Trainer(model=self.model, tokenizer=self.tokenizer, data_collator=self.data_collator)
        feature_tag = Sequence(ClassLabel(num_classes=3, names=self.label_list))
        input_df['tag'] = input_df['tag'].apply(feature_tag.feature.str2int)
        eval_dataset = Dataset.from_pandas(input_df)
        eval_dataset.features["tag"] = feature_tag

        # predict
        tokenized_datasets = DatasetDict({key: eval_dataset}).map(self.tokenize_and_align_labels, batched=True)
        _, true_predictions = self.predict_dataset(trainer, tokenized_datasets[key])
        result = save_classifier_result(eval_dataset, true_predictions, output_path)
        return result
    else:
        input_df['token'] = read_txt(input_path)
        input_df['sentence_id'] = input_df.index
        input_df['tag'] = 'B'
        input_df.to_csv(output_path, index=False)
        print("Result saved to ", output_path)
        return input_df
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir)
            and training_args.do_train and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. "
            "Use --overwrite_output_dir to overcome.")

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if is_main_process(training_args.local_rank) else logging.WARN,
    )

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name)
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        if data_args.test_file is not None:
            data_files["test"] = data_args.test_file

        table = csv.read_csv("./data/train.csv", parse_options=ParseOptions(delimiter="\t"))
        class_label_ = table.column("label").unique()
        class_label = ClassLabel(num_classes=len(class_label_), names=class_label_.tolist())
        train = main_ner.process_data(data_args.train_file, class_label)
        test = main_ner.process_data(data_args.test_file, class_label)
        val = main_ner.process_data(data_args.validation_file, class_label)
        # table = csv.read_csv(data_args.train_file)

        extension = data_args.train_file.split(".")[-1]
        datasets = load_dataset(extension, data_files=data_files, delimiter="\t", quoting=csv_lib.QUOTE_NONE)

        train_dataset = datasets["train"]
        test_dataset = datasets["test"]
        val_dataset = datasets["validation"]

        table = train_dataset.data
        label = table.column("label")
        class_label_ = label.unique()
        class_label = Sequence(feature=ClassLabel(num_classes=len(class_label_), names=class_label_.tolist()))

        train_dataset.features['ner_tags'] = class_label
        # train_ner_list: ChunkedArray = class_label.feature.str2int(train_dataset.data.column('label').to_numpy())
        # train_ner_array = pa.array(train_ner_list)
        # train_data = train_dataset.data.append_column("ner_tags", train_ner_array)
        train_dataset._data = train

        test_dataset.features['ner_tags'] = class_label
        test_dataset._data = test

        val_dataset.features['ner_tags'] = class_label
        # val_ner_list: ChunkedArray = class_label.feature.str2int(val_dataset.data.column('label').to_numpy())
        # val_ner_array = pa.array(val_ner_list)
        # val_data = val_dataset.data.append_column("ner_tags", val_ner_array)
        val_dataset._data = val

    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    if training_args.do_train:
        column_names = datasets["train"].column_names
        features = datasets["train"].features
    else:
        column_names = datasets["validation"].column_names
        features = datasets["validation"].features
    text_column_name = "tokens" if "tokens" in column_names else column_names[0]
    label_column_name = (f"{data_args.task_name}_tags"
                         if f"{data_args.task_name}_tags" in column_names else column_names[1])

    # In the event the labels are not a `Sequence[ClassLabel]`, we will need to go through the dataset to get the
    # unique labels.
    def get_label_list(labels):
        unique_labels = set()
        for label in labels:
            unique_labels = unique_labels | set(label)
        label_list = list(unique_labels)
        label_list.sort()
        return label_list

    seq: Sequence = features[label_column_name]
    # label_list = ["O", "B-GENE", "I-GENE"]
    # label_to_id = {i: i for i in range(len(label_list))}
    if isinstance(seq.feature, ClassLabel):
        label_list = features[label_column_name].feature.names
        # No need to convert the labels since they are already ints.
        label_to_id = {i: i for i in range(len(label_list))}
    else:
        label_list = get_label_list(datasets["train"][label_column_name])
        label_to_id = {l: i for i, l in enumerate(label_list)}
    num_labels = len(label_list)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=True,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    model = AutoModelForTokenClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )

    # Tokenizer check: this script requires a fast tokenizer.
    if not isinstance(tokenizer, PreTrainedTokenizerFast):
        raise ValueError(
            "This example script only works for models that have a fast tokenizer. Checkout the big table of models "
            "at https://huggingface.co/transformers/index.html#bigtable to find the model types that meet this "
            "requirement")

    # Preprocessing the dataset
    # Padding strategy
    padding = "max_length" if data_args.pad_to_max_length else False

    # Tokenize all texts and align the labels with them.
    def tokenize_and_align_labels(examples):
        tokenized_inputs = tokenizer(
            examples[text_column_name],
            padding=padding,
            truncation=True,
            # We use this argument because the texts in our dataset are lists of words (with a label for each word).
            is_split_into_words=True,
        )
        labels = []
        if len(examples) == 3:
            for i, label in enumerate(examples[label_column_name]):
                word_ids = tokenized_inputs.word_ids(batch_index=i)
                previous_word_idx = None
                label_ids = []
                for word_idx in word_ids:
                    # Special tokens have a word id that is None. We set the label to -100 so they are automatically
                    # ignored in the loss function.
                    if word_idx is None:
                        label_ids.append(-100)
                    # We set the label for the first token of each word.
                    elif word_idx != previous_word_idx:
                        label_ids.append(label_to_id[label[word_idx]])
                    # For the other tokens in a word, we set the label to either the current label or -100, depending on
                    # the label_all_tokens flag.
                    else:
                        label_ids.append(label_to_id[label[word_idx]] if data_args.label_all_tokens else -100)
                    previous_word_idx = word_idx

                labels.append(label_ids)
            tokenized_inputs["labels"] = labels
            return tokenized_inputs
        else:
            print("asdasdsa")

    tokenized_datasets = datasets.map(
        tokenize_and_align_labels,
        batched=True,
        num_proc=data_args.preprocessing_num_workers,
        load_from_cache_file=not data_args.overwrite_cache,
    )

    # Data collator
    data_collator = DataCollatorForTokenClassification(tokenizer)

    # Metrics
    metric = load_metric("seqeval")

    def compute_metrics(p):
        predictions, labels = p
        predictions = np.argmax(predictions, axis=2)

        # Remove ignored index (special tokens)
        true_predictions = [[label_list[p] for (p, l) in zip(prediction, label) if l != -100]
                            for prediction, label in zip(predictions, labels)]
        true_labels = [[label_list[l] for (p, l) in zip(prediction, label) if l != -100]
                       for prediction, label in zip(predictions, labels)]

        results = metric.compute(predictions=true_predictions, references=true_labels)
        if data_args.return_entity_level_metrics:
            # Unpack nested dictionaries
            final_results = {}
            for key, value in results.items():
                if isinstance(value, dict):
                    for n, v in value.items():
                        final_results[f"{key}_{n}"] = v
                else:
                    final_results[key] = value
            return final_results
        else:
            return {
                "precision": results["overall_precision"],
                "recall": results["overall_recall"],
                "f1": results["overall_f1"],
                "accuracy": results["overall_accuracy"],
            }

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"] if training_args.do_train else None,
        eval_dataset=tokenized_datasets["validation"] if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # Training
    if training_args.do_train:
        train_result = trainer.train(
            model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None)
        trainer.save_model()  # Saves the tokenizer too for easy upload

        output_train_file = os.path.join(training_args.output_dir, "train_results.txt")
        if trainer.is_world_process_zero():
            with open(output_train_file, "w") as writer:
                logger.info("***** Train results *****")
                for key, value in sorted(train_result.metrics.items()):
                    logger.info(f" {key} = {value}")
                    writer.write(f"{key} = {value}\n")

            # Need to save the state, since Trainer.save_model saves only the tokenizer with the model
            trainer.state.save_to_json(os.path.join(training_args.output_dir, "trainer_state.json"))

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        results = trainer.evaluate()

        output_eval_file = os.path.join(training_args.output_dir, "eval_results_ner.txt")
        if trainer.is_world_process_zero():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in results.items():
                    logger.info(f" {key} = {value}")
                    writer.write(f"{key} = {value}\n")

    # Predict
    if training_args.do_predict:
        logger.info("*** Predict ***")

        test_dataset = tokenized_datasets["test"]
        predictions, labels, metrics = trainer.predict(test_dataset)
        predictions = np.argmax(predictions, axis=2)

        # Remove ignored index (special tokens)
        true_predictions = [[label_list[p] for (p, l) in zip(prediction, label) if l != -100]
                            for prediction, label in zip(predictions, labels)]

        output_test_results_file = os.path.join(training_args.output_dir, "test_results.txt")
        if trainer.is_world_process_zero():
            with open(output_test_results_file, "w") as writer:
                for key, value in sorted(metrics.items()):
                    logger.info(f" {key} = {value}")
                    writer.write(f"{key} = {value}\n")

        # Save predictions
        output_test_predictions_file = os.path.join(training_args.output_dir, "test_predictions.txt")
        if trainer.is_world_process_zero():
            with open(output_test_predictions_file, "w") as writer:
                for prediction in true_predictions:
                    writer.write(" ".join(prediction) + "\n")

    return results
    # Pad token_boxes (the bounding boxes) up to the sequence length.
    input_ids = tokenizer(' '.join(words), truncation=True)["input_ids"]
    padding_length = max_seq_length - len(input_ids)
    token_boxes += [pad_token_box] * padding_length
    encoding['bbox'] = token_boxes

    assert len(encoding['input_ids']) == max_seq_length
    assert len(encoding['attention_mask']) == max_seq_length
    assert len(encoding['token_type_ids']) == max_seq_length
    assert len(encoding['bbox']) == max_seq_length

    return encoding


# we need to define the features ourselves, as the bounding boxes of LayoutLM are an extra feature
features = Features({
    'input_ids': Sequence(feature=Value(dtype='int64')),
    'bbox': Array2D(dtype="int64", shape=(512, 4)),
    'attention_mask': Sequence(Value(dtype='int64')),
    'token_type_ids': Sequence(Value(dtype='int64')),
    'image_path': Value(dtype='string'),
    'words': Sequence(feature=Value(dtype='string')),
})

classes = ["bill", "invoice", "others", "Purchase_Order", "remittance"]

# Model Loading
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


@st.cache(allow_output_mutation=True)
def load_model():
    url = "https://vast-ml-models.s3-ap-southeast-2.amazonaws.com/Document-Classification-5-labels-final.bin"
def make_custom_dataset(dataset_path):
    if not (os.path.isdir("../data/train_dataset") or os.path.isfile("../data/wikipedia_documents.json")):
        raise Exception("Set the original data path to '../data'")

    train_f = Features({
        'answers': Sequence(feature={
            'text': Value(dtype='string', id=None),
            'answer_start': Value(dtype='int32', id=None)
        }, length=-1, id=None),
        'context': Value(dtype='string', id=None),
        'id': Value(dtype='string', id=None),
        'question': Value(dtype='string', id=None)
    })

    if not os.path.isfile("../data/preprocess_wiki.json"):
        with open("../data/wikipedia_documents.json", "r") as f:
            wiki = json.load(f)
        new_wiki = dict()
        for ids in range(len(wiki)):
            new_wiki[str(ids)] = run_preprocess_to(wiki[str(ids)])
        with open('../data/preprocess_wiki.json', 'w', encoding='utf-8') as make_file:
            json.dump(new_wiki, make_file, indent="\t", ensure_ascii=False)

    if not os.path.isfile("/opt/ml/input/data/preprocess_train.pkl"):
        train_dataset = load_from_disk("../data/train_dataset")['train']
        val_dataset = load_from_disk("../data/train_dataset")['validation']

        new_train_data, new_val_data = [], []
        for data in train_dataset:
            new_data = run_preprocess(data)
            new_train_data.append(new_data)
        for data in val_dataset:
            new_data = run_preprocess(data)
            new_val_data.append(new_data)

        train_df = pd.DataFrame(new_train_data)
        val_df = pd.DataFrame(new_val_data)
        dataset = DatasetDict({
            'train': Dataset.from_pandas(train_df, features=train_f),
            'validation': Dataset.from_pandas(val_df, features=train_f)
        })
        save_pickle(dataset_path, dataset)

        if 'preprocess' in dataset_path:
            return dataset

    if 'squad' in dataset_path:
        train_data = get_pickle("../data/preprocess_train.pkl")["train"]
        val_data = get_pickle("../data/preprocess_train.pkl")["validation"]
        korquad_data = load_dataset("squad_kor_v1")["train"]

        df_train_data = pd.DataFrame(train_data)
        df_val_data = pd.DataFrame(val_data)
        df_korquad_data = pd.DataFrame(korquad_data, columns=['answers', 'context', 'id', 'question'])

        df_total_train = pd.concat([df_train_data, df_korquad_data])
        dataset = DatasetDict({
            'train': Dataset.from_pandas(df_total_train, features=train_f),
            'validation': Dataset.from_pandas(df_val_data, features=train_f)
        })
        save_pickle("../data/korquad_train.pkl", dataset)
        return dataset

    if 'concat' in dataset_path:
        base_dataset = get_pickle("../data/preprocess_train.pkl")
        train_dataset, val_dataset = base_dataset["train"], base_dataset["validation"]
        train_data = [{
            "id": train_dataset[i]["id"],
            "question": train_dataset[i]["question"],
            "answers": train_dataset[i]["answers"],
            "context": train_dataset[i]["context"]
        } for i in range(len(train_dataset))]
        val_data = [{
            "id": val_dataset[i]["id"],
            "question": val_dataset[i]["question"],
            "answers": val_dataset[i]["answers"],
            "context": val_dataset[i]["context"]
        } for i in range(len(val_dataset))]

        config = {'host': 'localhost', 'port': 9200}
        es = Elasticsearch([config])

        k = 5  # k: how many contexts to concatenate
        for idx, train in enumerate(train_data):
            res = search_es(es, "wiki-index", train["question"], k)
            context_list = [(hit['_source']['document_text'], hit['_score']) for hit in res['hits']['hits']]
            contexts = train["context"]
            count = 0
            for context in context_list:
                # if the same context already exists, don't concatenate it
                if train["context"] == context[0]:
                    continue
                contexts += " " + context[0]
                count += 1
                if count == (k - 1):
                    break
            train_data[idx]["context"] = contexts

        for idx, val in enumerate(val_data):
            res = search_es(es, "wiki-index", val["question"], k)
            context_list = [(hit['_source']['document_text'], hit['_score']) for hit in res['hits']['hits']]
            contexts = val["context"]
            count = 0
            for context in context_list:
                if val["context"] == context[0]:
                    continue
                contexts += " " + context[0]
                count += 1
                if count == (k - 1):
                    break
            val_data[idx]["context"] = contexts

        train_df = pd.DataFrame(train_data)
        val_df = pd.DataFrame(val_data)
        dataset = DatasetDict({
            'train': Dataset.from_pandas(train_df, features=train_f),
            'validation': Dataset.from_pandas(val_df, features=train_f)
        })
        save_pickle(dataset_path, dataset)
        return dataset

    if "split_wiki_400" in dataset_path:
        with open("/opt/ml/input/data/preprocess_wiki.json", "r") as f:
            wiki = json.load(f)

        new_wiki = dict()
        for i in tqdm(range(len(wiki))):
            if len(wiki[str(i)]["text"]) < 800:
                new_wiki[str(i)] = wiki[str(i)]
                continue
            data_list, count = passage_split_400(wiki[str(i)]["text"])
            for j in range(count):
                new_wiki[str(i) + f"_{j}"] = {
                    "text": data_list[j],
                    "corpus_source": wiki[str(i)]["corpus_source"],
                    "url": wiki[str(i)]["url"],
                    "domain": wiki[str(i)]["domain"],
                    "title": wiki[str(i)]["title"],
                    "author": wiki[str(i)]["author"],
                    "html": wiki[str(i)]["html"],
                    "document_id": wiki[str(i)]["document_id"]
                }

        save_data("../data/wiki-index-split-400.json", new_wiki)

    if "split_wiki" in dataset_path and dataset_path != "split_wiki_400":
        with open("/opt/ml/input/data/preprocess_wiki.json", "r") as f:
            wiki = json.load(f)

        limit = 0
        if "800" in dataset_path:
            limit = 800
        if "1000" in dataset_path:
            limit = 1000

        new_wiki = dict()
        for i in tqdm(range(len(wiki))):
            if len(wiki[str(i)]["text"]) < limit:
                new_wiki[str(i)] = wiki[str(i)]
                continue
            data_1, data_2 = passage_split(wiki[str(i)]["text"])
            new_wiki[str(i) + "_1"] = {
                "text": data_1,
                "corpus_source": wiki[str(i)]["corpus_source"],
                "url": wiki[str(i)]["url"],
                "domain": wiki[str(i)]["domain"],
                "title": wiki[str(i)]["title"],
                "author": wiki[str(i)]["author"],
                "html": wiki[str(i)]["html"],
                "document_id": wiki[str(i)]["document_id"]
            }
            new_wiki[str(i) + "_2"] = {
                "text": data_2,
                "corpus_source": wiki[str(i)]["corpus_source"],
                "url": wiki[str(i)]["url"],
                "domain": wiki[str(i)]["domain"],
                "title": wiki[str(i)]["title"],
                "author": wiki[str(i)]["author"],
                "html": wiki[str(i)]["html"],
                "document_id": wiki[str(i)]["document_id"]
            }

        save_data(f"../data/split_wiki_{limit}.json", new_wiki)
def test_cast_array_to_features_nested():
    arr = pa.array([[{"foo": [0]}]])
    assert cast_array_to_feature(arr, [{"foo": Sequence(Value("string"))}]).type == pa.list_(
        pa.struct({"foo": pa.list_(pa.string())}))
    None: 'violet',
    'pn': 'yellow',
    'h': 'red',
    'wh': 'purple',
    'fg': 'brown',
    'fn': 'grey',
    'tb': 'beige'
}
id2label = {v: k for k, v in label2id.items()}
LABELS = [label2id[L] for L in LABELS]

from datasets import Features, Sequence, ClassLabel, Value, Array2D, Array3D

FEATURES = Features({
    'image': Array3D(dtype="int64", shape=(3, 224, 224)),
    'input_ids': Sequence(feature=Value(dtype='int64')),
    'attention_mask': Sequence(Value(dtype='int64')),
    'token_type_ids': Sequence(Value(dtype='int64')),
    'bbox': Array2D(dtype="int64", shape=(512, 4)),
    'labels': Sequence(ClassLabel(names=LABELS + [max(LABELS) + 1])),
})

NUM_LABELS = len(LABELS)
PROCESSOR_PICKLE = f"processor_module{NUM_LABELS}.pickle"
MODEL_PICKLE = f"model_module{NUM_LABELS}.pickle"
EPOCHS_LAYOUT = 84
PDF_UPLOAD_DIR = hidden_folder + "/pdf_upload/"
ELMO_DIFFERENCE_MODEL_PATH = hidden_folder + "elmo_difference_models"
def main(
    rag_example_args: "RagExampleArguments",
    processing_args: "ProcessingArguments",
    index_hnsw_args: "IndexHnswArguments",
):
    ######################################
    logger.info("Step 1 - Create the dataset")
    ######################################

    # The dataset needed for RAG must have three columns:
    # - title (string): title of the document
    # - text (string): text of a passage of the document
    # - embeddings (array of dimension d): DPR representation of the passage

    # Let's say you have documents in tab-separated csv files with columns "title" and "text"
    assert os.path.isfile(rag_example_args.csv_path), "Please provide a valid path to a csv file"

    # You can load a Dataset object this way
    dataset = load_dataset(
        "csv",
        data_files=[rag_example_args.csv_path],
        split="train",
        delimiter="\t",
        column_names=["title", "text"])

    # More info about loading csv files in the documentation: https://huggingface.co/docs/datasets/loading_datasets.html?highlight=csv#csv-files

    # Then split the documents into passages of n words (changing the param in split_text to what you want n to be)
    dataset = dataset.map(split_documents, batched=True, num_proc=processing_args.num_proc)

    # And compute the embeddings
    if use_generated_model:
        model_path = 'ragfinetune_4_4_false_50_true/checkpoint2/'  # SET PATH TO CHECKPOINT
        config = RagConfig.from_pretrained(model_path)
        config.n_docs = 4
        config.n_docs_splits = 4
        retriever = RagRetriever.from_pretrained(model_path, config=config)
        checkpoint_model = RagSequenceForGeneration.from_pretrained(
            model_path, config=config, retriever=retriever).cuda()
        ctx_encoder = checkpoint_model.generator.get_encoder()
        ctx_tokenizer = checkpoint_model.retriever.generator_tokenizer
    else:
        ctx_encoder = DPRContextEncoder.from_pretrained(
            rag_example_args.dpr_ctx_encoder_model_name).to(device=device)
        ctx_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained(
            rag_example_args.dpr_ctx_encoder_model_name)

    new_features = Features({
        "text": Value("string"),
        "title": Value("string"),
        "embeddings": Sequence(Value("float32"))
    })  # optional, save as float32 instead of float64 to save space
    dataset = dataset.map(
        partial(embed, ctx_encoder=ctx_encoder, ctx_tokenizer=ctx_tokenizer),
        batched=True,
        batch_size=processing_args.batch_size,
        features=new_features,
    )

    # And finally save your dataset
    passages_path = os.path.join(rag_example_args.output_dir, "my_knowledge_dataset")
    dataset.save_to_disk(passages_path)
    # from datasets import load_from_disk
    # dataset = load_from_disk(passages_path)  # to reload the dataset

    ######################################
    logger.info("Step 2 - Index the dataset")
    ######################################

    # Let's use the Faiss implementation of HNSW for fast approximate nearest neighbor search
    index = faiss.IndexHNSWFlat(index_hnsw_args.d, index_hnsw_args.m, faiss.METRIC_INNER_PRODUCT)
    dataset.add_faiss_index("embeddings", custom_index=index)

    # And save the index
    index_path = os.path.join(rag_example_args.output_dir, "my_knowledge_dataset_hnsw_index.faiss")
    dataset.get_index("embeddings").save(index_path)
    # dataset.load_faiss_index("embeddings", index_path)  # to reload the index

    ######################################
    logger.info("Step 3 - Load RAG")
    ######################################

    # Easy way to load the model
    retriever = RagRetriever.from_pretrained(
        rag_example_args.rag_model_name, index_name="custom", indexed_dataset=dataset)
    model = RagSequenceForGeneration.from_pretrained(rag_example_args.rag_model_name, retriever=retriever)
    tokenizer = RagTokenizer.from_pretrained(rag_example_args.rag_model_name)

    # For distributed fine-tuning you'll need to provide the paths instead, as the dataset and the index are loaded separately.
    # retriever = RagRetriever.from_pretrained(rag_model_name, index_name="custom", passages_path=passages_path, index_path=index_path)

    ######################################
    logger.info("Step 4 - Have fun")
    ######################################

    question = rag_example_args.question or "What does Moses' rod turn into ?"
    input_ids = tokenizer.question_encoder(question, return_tensors="pt")["input_ids"]
    generated = model.generate(input_ids)
    generated_string = tokenizer.batch_decode(generated, skip_special_tokens=True)[0]
    logger.info("Q: " + question)
    logger.info("A: " + generated_string)
def retrieve(self, query_or_dataset, topk=1):
    assert self.p_embedding is not None, "Run get_embedding() first, then call retrieve()."

    total = []

    # To filter out duplicates, retrieve at least 40 + topk candidates
    # (40 being the largest number of duplicates observed), or alpha * topk, whichever is larger.
    alpha = 2
    doc_scores, doc_indices = self.get_relevant_doc_bulk(
        query_or_dataset["question"], topk=max(40 + topk, alpha * topk)
    )

    for idx, example in enumerate(tqdm(query_or_dataset, desc="Retrieval: ")):
        doc_scores_topk = [doc_scores[idx][0]]
        doc_indices_topk = [doc_indices[idx][0]]

        pointer = 1

        while len(doc_indices_topk) != topk:
            is_non_duplicate = True
            new_text_idx = doc_indices[idx][pointer]
            new_text = self.contexts[new_text_idx]

            for d_id in doc_indices_topk:
                if fuzz.ratio(self.contexts[d_id], new_text) > 65:
                    is_non_duplicate = False
                    break

            if is_non_duplicate:
                doc_scores_topk.append(doc_scores[idx][pointer])
                doc_indices_topk.append(new_text_idx)

            pointer += 1

            if pointer == max(40 + topk, alpha * topk):
                break

        assert len(doc_indices_topk) == topk, "Increase alpha to extract topk documents without duplicates."

        for doc_id in range(topk):
            doc_idx = doc_indices_topk[doc_id]
            tmp = {
                "question": example["question"],
                "id": example["id"],
                "context_id": self.context_ids[doc_idx],  # retrieved id
                "context": self.contexts[doc_idx],  # retrieved document
            }
            if "context" in example.keys() and "answers" in example.keys():
                tmp["original_context"] = example["context"]  # original document
                tmp["answers"] = example["answers"]  # original answer
            total.append(tmp)

    df = pd.DataFrame(total)

    if self.args.train.do_predict is True:
        f = Features(
            {
                "context": Value(dtype="string", id=None),
                "id": Value(dtype="string", id=None),
                "question": Value(dtype="string", id=None),
                "context_id": Value(dtype="int32", id=None),
            }
        )
    else:
        f = Features(
            {
                "answers": Sequence(
                    feature={"text": Value(dtype="string", id=None), "answer_start": Value(dtype="int32", id=None)},
                    length=-1,
                    id=None,
                ),
                "context": Value(dtype="string", id=None),
                "id": Value(dtype="string", id=None),
                "question": Value(dtype="string", id=None),
                "original_context": Value(dtype="string", id=None),
                "context_id": Value(dtype="int32", id=None),
            }
        )

    datasets = DatasetDict({"validation": Dataset.from_pandas(df, features=f)})
    return datasets
import os.path as p
from collections import defaultdict

import pandas as pd
from datasets import load_dataset
from datasets import concatenate_datasets
from datasets import Sequence, Value, Features, Dataset, DatasetDict

from utils.tools import get_args

f = Features({
    "answers": Sequence(
        feature={
            "text": Value(dtype="string", id=None),
            "answer_start": Value(dtype="int32", id=None)
        },
        length=-1,
        id=None,
    ),
    "id": Value(dtype="string", id=None),
    "context": Value(dtype="string", id=None),
    "question": Value(dtype="string", id=None),
    "title": Value(dtype="string", id=None),
})


def remove_multiple_indexes(rlist, indexes):