class QAPipeline(BaseEstimator):
    """
    A scikit-learn implementation of the whole cdQA pipeline.

    Parameters
    ----------
    reader: str (path to .joblib) or .joblib object of an instance of
        BertQA (BERT model with sklearn wrapper), optional.
        If None, a fresh BertQA reader is created from kwargs.
    kwargs: kwargs for BertQA(), BertProcessor() and TfidfRetriever()
        (e.g. bert-base-uncased, bert-large-uncased, ... for the BERT
        pre-trained model). Please check documentation for these classes.

    Examples
    --------
    >>> from cdqa.pipeline.qa_pipeline import QAPipeline
    >>> qa_pipeline = QAPipeline(reader='bert_qa_squad_vCPU-sklearn.joblib')
    >>> qa_pipeline.fit(X=df)
    >>> prediction = qa_pipeline.predict(X='When BNP Paribas was created?')

    >>> from cdqa.pipeline.qa_pipeline import QAPipeline
    >>> qa_pipeline = QAPipeline()
    >>> qa_pipeline.fit_reader('train-v1.1.json')
    >>> qa_pipeline.fit(X=df)
    >>> prediction = qa_pipeline.predict(X='When BNP Paribas was created?')
    """

    def __init__(self, reader=None, **kwargs):
        # Route each kwarg to the component whose __init__ declares it; a
        # kwarg accepted by several components is forwarded to all of them.
        kwargs_bertqa = {
            key: value for key, value in kwargs.items()
            if key in BertQA.__init__.__code__.co_varnames
        }
        kwargs_processor = {
            key: value for key, value in kwargs.items()
            if key in BertProcessor.__init__.__code__.co_varnames
        }
        kwargs_retriever = {
            key: value for key, value in kwargs.items()
            if key in TfidfRetriever.__init__.__code__.co_varnames
        }

        if not reader:
            self.reader = BertQA(**kwargs_bertqa)
        elif isinstance(reader, str):
            # Fix: was `type(reader) == str`; isinstance also accepts
            # str subclasses. A string is a path to a serialized model.
            self.reader = joblib.load(reader)
        else:
            self.reader = reader

        self.processor_train = BertProcessor(is_training=True,
                                             **kwargs_processor)
        self.processor_predict = BertProcessor(is_training=False,
                                               **kwargs_processor)
        self.retriever = TfidfRetriever(**kwargs_retriever)

    def fit(self, X=None, y=None):
        """
        Fit the QAPipeline retriever to a list of documents in a dataframe.

        Parameters
        ----------
        X: pandas.Dataframe
            Dataframe with the following columns: "title", "paragraphs"
        """
        # Copy so that adding the derived "content" column does not mutate
        # the caller's dataframe in place.
        self.metadata = X.copy()
        self.metadata['content'] = self.metadata['paragraphs'].apply(
            lambda paragraphs: ' '.join(paragraphs))
        self.retriever.fit(self.metadata['content'])
        return self

    def fit_reader(self, X=None, y=None):
        """
        Train the reader (BertQA instance) of QAPipeline object.

        Parameters
        ----------
        X: str
            path to json file in SQUAD format
        """
        train_examples, train_features = self.processor_train.fit_transform(X)
        self.reader.fit(X=(train_examples, train_features))
        return self

    def predict(self, X=None):
        """
        Compute prediction of an answer to a question.

        Parameters
        ----------
        X: str or list of strings
            Sample (question) or list of samples to perform a prediction on

        Returns
        -------
        If X is str
            prediction: tuple (answer, title, paragraph)
        If X is list of strings
            predictions: list of tuples (answer, title, paragraph)

        Raises
        ------
        TypeError
            If X is neither a string nor a list of strings.
        """
        if isinstance(X, str):
            return self._answer(X)
        if isinstance(X, list):
            # Same retriever -> processor -> reader path, once per question.
            return [self._answer(query) for query in X]
        raise TypeError("The input is not a string or a list. "
                        "Please provide a string or a list of strings as input")

    def _answer(self, question):
        """Retrieve the closest documents, convert them to SQuAD examples
        and run the reader to answer a single question string."""
        closest_docs_indices = self.retriever.predict(
            question, metadata=self.metadata)
        squad_examples = generate_squad_examples(
            question=question,
            closest_docs_indices=closest_docs_indices,
            metadata=self.metadata)
        examples, features = self.processor_predict.fit_transform(
            X=squad_examples)
        return self.reader.predict((examples, features))

    def to(self, device):
        """Send reader to CPU if device=='cpu' or to GPU if device=='cuda'."""
        if device not in ('cpu', 'cuda'):
            # Typo fix: message previously read "Attribure".
            raise ValueError("Attribute device should be 'cpu' or 'cuda'.")
        self.reader.model.to(device)
        self.reader.device = torch.device(device)
        return self

    def cpu(self):
        """Send reader to CPU."""
        self.reader.model.cpu()
        self.reader.device = torch.device('cpu')
        return self

    def cuda(self):
        """Send reader to GPU."""
        self.reader.model.cuda()
        self.reader.device = torch.device('cuda')
        return self
class QAPipeline(BaseEstimator):
    """
    A scikit-learn implementation of the whole cdQA pipeline.

    Parameters
    ----------
    reader: str (path to .joblib) or .joblib object of an instance of
        BertQA (BERT model with sklearn wrapper), optional.
        If None, a fresh BertQA reader is created from kwargs.
    retrieve_by_doc: bool (default: True). If True the retriever ranks
        whole documents; if False it ranks individual paragraphs.
    kwargs: kwargs for BertQA(), BertProcessor() and TfidfRetriever()
        Please check documentation for these classes.

    Examples
    --------
    >>> from cdqa.pipeline.qa_pipeline import QAPipeline
    >>> qa_pipeline = QAPipeline(reader='bert_qa_squad_vCPU-sklearn.joblib')
    >>> qa_pipeline.fit_retriever(X=df)
    >>> prediction = qa_pipeline.predict(X='When BNP Paribas was created?')

    >>> from cdqa.pipeline.qa_pipeline import QAPipeline
    >>> qa_pipeline = QAPipeline()
    >>> qa_pipeline.fit_reader('train-v1.1.json')
    >>> qa_pipeline.fit_retriever(X=df)
    >>> prediction = qa_pipeline.predict(X='When BNP Paribas was created?')
    """

    def __init__(self, reader=None, retrieve_by_doc=True, **kwargs):
        # Route each kwarg to the component whose __init__ declares it; a
        # kwarg accepted by several components is forwarded to all of them.
        kwargs_bertqa = {
            key: value for key, value in kwargs.items()
            if key in BertQA.__init__.__code__.co_varnames
        }
        kwargs_processor = {
            key: value for key, value in kwargs.items()
            if key in BertProcessor.__init__.__code__.co_varnames
        }
        kwargs_retriever = {
            key: value for key, value in kwargs.items()
            if key in TfidfRetriever.__init__.__code__.co_varnames
        }

        if not reader:
            self.reader = BertQA(**kwargs_bertqa)
        elif isinstance(reader, str):
            # Fix: was `type(reader) == str`; isinstance also accepts
            # str subclasses. A string is a path to a serialized model.
            self.reader = joblib.load(reader)
        else:
            self.reader = reader

        self.processor_train = BertProcessor(is_training=True,
                                             **kwargs_processor)
        self.processor_predict = BertProcessor(is_training=False,
                                               **kwargs_processor)
        self.retriever = TfidfRetriever(**kwargs_retriever)
        self.retrieve_by_doc = retrieve_by_doc

    def fit_retriever(self, X=None, y=None):
        """
        Fit the QAPipeline retriever to a list of documents in a dataframe.

        Parameters
        ----------
        X: pandas.Dataframe
            Dataframe with the following columns: "title", "paragraphs"
        """
        if self.retrieve_by_doc:
            # Copy so that adding the derived "content" column does not
            # mutate the caller's dataframe in place.
            self.metadata = X.copy()
            self.metadata["content"] = self.metadata["paragraphs"].apply(
                lambda paragraphs: " ".join(paragraphs))
        else:
            # One row per paragraph instead of one row per document.
            self.metadata = self._expand_paragraphs(X)
        self.retriever.fit(self.metadata["content"])
        return self

    def fit_reader(self, X=None, y=None):
        """
        Train the reader (BertQA instance) of QAPipeline object.

        Parameters
        ----------
        X: str
            path to json file in SQuAD format
            (docstring fixed: it previously described a dataframe,
            copy-pasted from fit_retriever)
        """
        train_examples, train_features = self.processor_train.fit_transform(X)
        self.reader.fit(X=(train_examples, train_features))
        return self

    def predict(self, X=None, return_logit=False, n_predictions=None):
        """
        Compute prediction of an answer to a question.

        Parameters
        ----------
        X: str or list of strings
            Sample (question) or list of samples to perform a prediction on
        return_logit: boolean
            Whether to return logit of best answer or not. Default: False
        n_predictions: int, optional
            Forwarded to the reader; number of predictions to produce.

        Returns
        -------
        If X is str
            prediction: tuple (answer, title, paragraph)
        If X is list of strings
            predictions: list of tuples (answer, title, paragraph)
        If return_logit is True, each prediction tuple will have the
        following structure: (answer, title, paragraph, best logit)

        Raises
        ------
        TypeError
            If X is neither a string nor a list of strings.
        """
        if isinstance(X, str):
            return self._answer(X, return_logit, n_predictions)
        if isinstance(X, list):
            # Same retriever -> processor -> reader path, once per question.
            return [self._answer(query, return_logit, n_predictions)
                    for query in X]
        raise TypeError("The input is not a string or a list. "
                        "Please provide a string or a list of strings as input")

    def _answer(self, question, return_logit, n_predictions):
        """Retrieve the closest documents, convert them to SQuAD examples
        and run the reader to answer a single question string."""
        closest_docs_indices = self.retriever.predict(
            question, metadata=self.metadata)
        squad_examples = generate_squad_examples(
            question=question,
            closest_docs_indices=closest_docs_indices,
            metadata=self.metadata,
            # Bug fix: the list branch previously omitted retrieve_by_doc,
            # so batch predictions were built inconsistently with
            # single-question predictions.
            retrieve_by_doc=self.retrieve_by_doc)
        examples, features = self.processor_predict.fit_transform(
            X=squad_examples)
        return self.reader.predict((examples, features), return_logit,
                                   n_predictions)

    def to(self, device):
        """Send reader to CPU if device=='cpu' or to GPU if device=='cuda'."""
        if device not in ("cpu", "cuda"):
            raise ValueError("Attribute device should be 'cpu' or 'cuda'.")
        self.reader.model.to(device)
        self.reader.device = torch.device(device)
        return self

    def cpu(self):
        """Send reader to CPU."""
        self.reader.model.cpu()
        self.reader.device = torch.device("cpu")
        return self

    def cuda(self):
        """Send reader to GPU."""
        self.reader.model.cuda()
        self.reader.device = torch.device("cuda")
        return self

    def dump_reader(self, filename):
        """Dump reader model to a .joblib object."""
        joblib.dump(self.reader, filename)

    @staticmethod
    def _expand_paragraphs(df):
        """Flatten the "paragraphs" list-column into one row per paragraph,
        duplicating the other columns; the paragraph text ends up in a
        "content" column and the "paragraphs" column is dropped."""
        # Snippet taken from: https://stackoverflow.com/a/48532692/11514226
        lst_col = "paragraphs"
        df = pd.DataFrame({
            col: np.repeat(df[col].values, df[lst_col].str.len())
            for col in df.columns.drop(lst_col)
        }).assign(**{lst_col: np.concatenate(df[lst_col].values)})[df.columns]
        df["content"] = df["paragraphs"]
        return df.drop("paragraphs", axis=1)
from cdqa.utils.converters import df2squad

# Export the corpus dataframe to a SQuAD v1.1-style json file so it can be
# annotated. NOTE(review): `df_X` is defined in an earlier notebook cell not
# visible here — presumably the corpus dataframe; verify against that cell.
json_data = df2squad(df=df_X, squad_version='v1.1', output_dir='.', filename='qna_tim_ferriss')

# From there we can use https://github.com/cdqa-suite/cdQA-annotator to create a supervised problem

#%% [markdown]
# ### Fine-Tuning

#%%
# Fine-tune Bert model with squad v1.1 custom data set of Tim Ferriss questions
import os
import torch
from sklearn.externals import joblib
from cdqa.reader.bertqa_sklearn import BertProcessor, BertQA

# Convert the annotated json into BERT training examples/features.
train_processor = BertProcessor(do_lower_case=True, is_training=True)
train_examples, train_features = train_processor.fit_transform(X='cdqa-v1.1-tim_qna.json')

# Train the reader on the custom dataset.
reader = BertQA(train_batch_size=12,
                learning_rate=3e-5,
                num_train_epochs=2,
                do_lower_case=True,
                output_dir='models')

reader.fit(X=(train_examples, train_features))

# Output fine-tuned model: move it to CPU before dumping so the joblib
# artifact can be loaded on machines without a GPU.
reader.model.to('cpu')
reader.device = torch.device('cpu')
joblib.dump(reader, os.path.join(reader.output_dir, 'bert_tim_qa_vCPU.joblib'))

#%% [markdown]
import os
import torch
from sklearn.externals import joblib
from cdqa.reader.bertqa_sklearn import BertProcessor, BertQA

# Turn the SQuAD v1.1 training json into BERT examples and features.
processor = BertProcessor(do_lower_case=True, is_training=True)
examples, features = processor.fit_transform(X='data/train-v1.1.json')

# Fine-tune the BertQA reader on the pre-processed data.
qa_reader = BertQA(
    output_dir='models',
    train_batch_size=12,
    learning_rate=3e-5,
    num_train_epochs=2,
    do_lower_case=True,
    fp16=False,
)
qa_reader.fit(X=(examples, features))

# Persist the model as-is (GPU weights) ...
joblib.dump(qa_reader, os.path.join(qa_reader.output_dir, 'bert_qa_vGPU.joblib'))

# ... then move it to CPU and persist a CPU-loadable copy as well.
qa_reader.model.to('cpu')
qa_reader.device = torch.device('cpu')
joblib.dump(qa_reader, os.path.join(qa_reader.output_dir, 'bert_qa_vCPU.joblib'))