Example #1
import glob

from docopt import docopt

# NOTE: this Document import path matches the old haystack releases that also ship
# haystack.database.sql; in other versions it may be `from haystack import Document`.
from haystack.database.base import Document
from haystack.reader.farm import FARMReader


def main():
    # docopt parses the CLI arguments from the module docstring (see the usage sketch below)
    args = docopt(__doc__)
    data_dir = args["--data_dir"]

    if args["train"]:
        reader = FARMReader(model_name_or_path="distilbert-base-uncased-distilled-squad", use_gpu=False)
        reader.train(data_dir=data_dir,
                     train_filename=args["--train_file_name"],
                     dev_filename=args["--dev_file_name"],
                     use_gpu=False,
                     n_epochs=1,
                     save_dir=args["--save_dir"],
                     dev_split=0.05)

    if args["test"]:
        reader = FARMReader(model_name_or_path=args["--save_dir"], use_gpu=False)
        print(reader.eval_on_file(data_dir, args["--eval_file_name"], "cpu"))

    if args["cli"]:
        reader = FARMReader(model_name_or_path=args["--save_dir"], use_gpu=False)
        query_doc_list = []
        # Load every .txt file in the data directory as one Document (with a unique id)
        doc_id = 0
        for text_file in glob.glob(data_dir + "/*.txt"):
            with open(text_file, "r") as f:
                context = f.read()
            # context = context.split(".")
            context = [context]
            for para in context:
                query_doc_list.append(Document(id=str(doc_id), text=para))
                doc_id += 1
        # Simple interactive QA loop over the loaded documents
        while True:
            question = input("CTRL C to exit >")
            prediction = reader.predict(question, query_doc_list)
            print("answer:>> ", prediction["answers"][0]["answer"])
            print("-----")
            print("context:>> ", prediction["answers"][0]["context"])
            print("-------------")
def tutorial2_finetune_a_model_on_your_data():
    # ## Create Training Data
    #
    # There are two ways to generate training data
    #
    # 1. **Annotation**: You can use the annotation tool (https://github.com/deepset-ai/haystack#labeling-tool) to label
    #                    your data, i.e. highlighting answers to your questions in a document. The tool supports structuring
    #                    your workflow with organizations, projects, and users. The labels can be exported in SQuAD format,
    #                    which is compatible with training in Haystack.
    #
    # 2. **Feedback**:   For production systems, you can collect training data from direct user feedback via Haystack's
    #                    REST API interface. This includes a customizable user feedback API for providing feedback on the
    #                    answer returned by the API. The API provides a feedback export endpoint to obtain the feedback data
    #                    for fine-tuning your model further.
    #
    #
    # ## Fine-tune your model
    #
    # Once you have collected training data, you can fine-tune your base model.
    # We initialize a reader as a base model and fine-tune it on our own custom dataset,
    # which should be in SQuAD-like format (see the sketch below).
    # We recommend using a base model that was trained on SQuAD or a similar QA dataset before to benefit from Transfer
    # Learning effects.
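    #
    # For orientation, a SQuAD-like training file is a JSON document shaped roughly
    # like the sketch below (standard SQuAD v2 field names; the values are purely
    # illustrative and not part of the original tutorial):
    #
    # {
    #   "data": [
    #     {
    #       "title": "Example document",
    #       "paragraphs": [
    #         {
    #           "context": "Berlin is the capital of Germany.",
    #           "qas": [
    #             {
    #               "id": "q1",
    #               "question": "What is the capital of Germany?",
    #               "answers": [{"text": "Berlin", "answer_start": 0}],
    #               "is_impossible": false
    #             }
    #           ]
    #         }
    #       ]
    #     }
    #   ]
    # }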

    # **Recommendation**: Run training on a GPU. To do so, change the `use_gpu` arguments below to `True`.

    reader = FARMReader(
        model_name_or_path="distilbert-base-uncased-distilled-squad",
        use_gpu=True)
    train_data = "data/squad20"
    # train_data = "PATH/TO_YOUR/TRAIN_DATA"
    reader.train(data_dir=train_data,
                 train_filename="dev-v2.0.json",
                 use_gpu=True,
                 n_epochs=1,
                 save_dir="my_model")

    # Saving the model happens automatically at the end of training into the `save_dir` you specified
    # However, you could also save a reader manually again via:
    reader.save(directory="my_model")

    # If you want to load it at a later point, just do:
    new_reader = FARMReader(model_name_or_path="my_model")
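    # Optional sanity check with the reloaded reader. This is a sketch, not part of
    # the original tutorial; the Document import path and the example text/question
    # are assumptions:
    #
    #   from haystack.database.base import Document
    #   docs = [Document(id="0", text="Berlin is the capital of Germany.")]
    #   print(new_reader.predict("What is the capital of Germany?", docs))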
from haystack.database.sql import SQLDocumentStore
from haystack.indexing.cleaning import clean_wiki_text
from haystack.indexing.io import write_documents_to_db, fetch_archive_from_http
from haystack.reader.farm import FARMReader
from haystack.retriever.tfidf import TfidfRetriever
from haystack.utils import print_answers

#### TRAINING #############
# Let's take a reader as a base model
reader = FARMReader(
    model_name_or_path="distilbert-base-uncased-distilled-squad",
    use_gpu=False)

# and fine-tune it on your own custom dataset (should be in SQuAD-like format)
train_data = "PATH/TO_YOUR/TRAIN_DATA"
reader.train(data_dir=train_data,
             train_filename="train.json",
             use_gpu=False,
             n_epochs=1)
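
# Optionally evaluate the fine-tuned reader on a held-out SQuAD-style file, as in
# Example #1 (a sketch; "dev.json" is an assumed file name and the last argument
# selects the device):
#
#   print(reader.eval_on_file(train_data, "dev.json", "cpu"))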

#### Use it (same as in Tutorial 1) #############

## Indexing & cleaning documents

# Let's get the data (Game of Thrones articles from Wikipedia)
doc_dir = "data/article_txt_got"
s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

# Init Document store & write docs to it
document_store = SQLDocumentStore(url="sqlite:///qa.db")
write_documents_to_db(document_store=document_store,
                      document_dir=doc_dir,
                      clean_func=clean_wiki_text,
                      only_empty_db=True)
Example #5
from haystack import Finder
from haystack.database.sql import SQLDocumentStore
from haystack.indexing.cleaning import clean_wiki_text
from haystack.indexing.io import write_documents_to_db, fetch_archive_from_http
from haystack.reader.farm import FARMReader
from haystack.retriever.tfidf import TfidfRetriever
from haystack.utils import print_answers

#### TRAINING #############
# Let's take a reader as a base model
reader = FARMReader(model_name_or_path="distilbert-base-uncased-distilled-squad", use_gpu=False)

# and fine-tune it on your own custom dataset (should be in SQuAD-like format)
train_data = "PATH/TO_YOUR/TRAIN_DATA"
reader.train(data_dir=train_data, train_filename="train.json", n_epochs=1)


#### Use it (same as in Tutorial 1) #############

## Indexing & cleaning documents

# Let's get the data (Game of Thrones articles from Wikipedia)
doc_dir = "data/article_txt_got"
s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
fetch_archive_from_http(url=s3_url, output_dir=doc_dir)


# Init Document store & write docs to it
document_store = SQLDocumentStore(url="sqlite:///qa.db")
write_documents_to_db(document_store=document_store, document_dir=doc_dir, clean_func=clean_wiki_text, only_empty_db=True)
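
# To actually query the fine-tuned reader (as in Tutorial 1), combine it with a
# retriever in a Finder. A minimal sketch; the question and top_k values are
# illustrative and not part of this example:
retriever = TfidfRetriever(document_store=document_store)
finder = Finder(reader=reader, retriever=retriever)
prediction = finder.get_answers(question="Who is the father of Arya Stark?", top_k_retriever=10, top_k_reader=5)
print_answers(prediction, details="minimal")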
Example #6
# 1. **Annotation**: You can use the annotation tool (https://github.com/deepset-ai/haystack#labeling-tool) to label
#                    your data, i.e. highlighting answers to your questions in a document. The tool supports structuring
#                    your workflow with organizations, projects, and users. The labels can be exported in SQuAD format,
#                    which is compatible with training in Haystack.
#
# 2. **Feedback**:   For production systems, you can collect training data from direct user feedback via Haystack's
#                    REST API interface. This includes a customizable user feedback API for providing feedback on the
#                    answer returned by the API. The API provides a feedback export endpoint to obtain the feedback data
#                    for fine-tuning your model further.
#
#
# ## Fine-tune your model
#
# Once you have collected training data, you can fine-tune your base models.
# We initialize a reader as a base model and fine-tune it on our own custom dataset (should be in SQuAD-like format).
# We recommend using a base model that was trained on SQuAD or a similar QA dataset before to benefit from Transfer
# Learning effects.
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2",
                    use_gpu=False)
# train_data = "data"
# train_data = "PATH/TO_YOUR/TRAIN_DATA"
reader.train(data_dir="/home/bulelani/Desktop/odin/odin/src_new/data/training",
             train_filename="answers.json",
             use_gpu=False,
             n_epochs=1,
             save_dir="/home/bulelani/Desktop/odin/saved_models")

# Saving the model happens automatically at the end of training into the `save_dir` you specified
# However, you could also save a reader manually again via:
reader.save(directory="/home/bulelani/Desktop/odin/saved_models")

# If you want to load it at a later point, just do:
# new_reader = FARMReader(model_name_or_path="/home/bulelani/Desktop/odin/saved_models")
Example #7
from haystack.reader.farm import FARMReader

reader = FARMReader(
    model_name_or_path="distilbert-base-uncased-distilled-squad",
    use_gpu=False)
train_data = "/home/bulelani/Desktop/odin/odin/src_new/data/training"
# train_data = "PATH/TO_YOUR/TRAIN_DATA"
reader.train(data_dir=train_data,
             train_filename="demo.json",
             use_gpu=False,
             n_epochs=100,
             save_dir="/home/bulelani/Desktop/odin/my_model")

# Saving the model happens automatically at the end of training into the `save_dir` you specified
# However, you could also save a reader manually again via:
reader.save(directory="/home/bulelani/Desktop/odin/my_model")
Example #8
from haystack.reader.farm import FARMReader

# Input directory containing the labels file (answers.json)
train_data = "/usr/src/app/data/squad20"
# Output directory for the fine-tuned model
train_model = "/usr/src/app/data/train_model"

reader = FARMReader(
    model_name_or_path="distilbert-base-uncased-distilled-squad",
    use_gpu=False)

reader.train(data_dir=train_data,
             train_filename="answers.json",
             n_epochs=20,
             dev_split=0,
             save_dir=train_model)

print('Training successfully completed')
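
# To use the fine-tuned model later, load it from the output directory in the same
# way as the other examples (a minimal sketch, not part of the original snippet):
new_reader = FARMReader(model_name_or_path=train_model, use_gpu=False)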