Example #1
    def on_click(self):
        """Indexes FAQ values for the Search Engine"""
        db_name = self.db_input.get().strip()
        faq_table_name = self.table_input.get().strip()
        if db_name and faq_table_name:
            data_storage = Database(f"{db_name}.db")
            faq_df = data_storage.get_dataframe(table=faq_table_name)
            faq_se = FAQSearchEngine()
            faq_se.create_index(
                corpus=faq_df,
                db=data_storage,
                table_name=f"{faq_table_name}_doc_term_matrix",
            )
            data_storage.close_connection()
            self.close_win()
        else:
            print("Error: But for real though, fields must not be empty.")
Example #2
def main():
    # Parse cli arguments
    parser = argparse.ArgumentParser(
        description="""This is the script responsible for fetching the Rucio documentation through GitHub"""
    )
    required = parser.add_argument_group("required arguments")
    optional = parser.add_argument_group("optional arguments")

    required.add_argument(
        "-t",
        "--token",
        help="GitHub api token to be used for the GET requests while fetching",
        required=True,
    )
    optional.add_argument(
        "-o",
        "--output_db",
        default="docs_input_data",
        help="Output .db file where the data is stored (default is docs_input_data)",
    )
    optional.add_argument(
        "--documentation_table",
        default="docs",
        help="Name of the table where we will store the documentation (default is docs)",
    )

    args = parser.parse_args()
    db_name = args.output_db
    token = args.token
    docs_table = args.documentation_table

    # Rucio documentation fetcher
    data_storage = Database(f"{db_name}.db")
    fetcher = FetcherFactory.get_fetcher("Rucio Documentation")
    docs_df = fetcher.fetch(api_token=token)
    fetcher.save(db=data_storage, docs_table_name=docs_table)
    print(f"Data saved on {db_name}.db")
    print("Sample docs:")
    print(docs_df.head())
    data_storage.close_connection()
Example #3
def fetch_faq_data():
    """Creates FAQ table and populates it with data in faq.json"""
    # create faq table
    print(f"Creating faq table in data_storage.db")
    data_storage = Database("data_storage.db")
    data_storage.create_faq_table()
    # load faq data
    with open(DATA_DIR + "faq.json") as json_file:
        data = json.load(json_file)
    # insert data to db
    print(f"Inserting data from faq.json file...")
    for faq in data:
        data_storage.insert_faq(faq)
    data_storage.close_connection()
Example #4
    def insert_faq_to_db(
        self, db_name, faq_table_name, question, answer, author, keywords
    ):
        """Inserts FAQ values to db"""
        # prepare data_storage
        data_storage = Database(f"{db_name}.db")
        # create the table if it doesn't exist
        tables_in_db = [table[0] for table in data_storage.get_tables()]
        if faq_table_name not in tables_in_db:
            print(f"Creating '{faq_table_name}' table in {db_name}.db")
            data_storage.create_faq_table(table_name=faq_table_name)
        # insert row
        faq_obj = FAQ(
            question=question, answer=answer, author=author, keywords=keywords
        )
        data_storage.insert_faq(faq_obj, table_name=faq_table_name)
        print(f"FAQ object inserted in '{faq_table_name}' table in {db_name}.db!")
        data_storage.close_connection()
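A hypothetical call with made-up values, showing how the arguments map onto the FAQ object and table handling above (`gui` stands in for whatever object owns the method):

gui.insert_faq_to_db(
    db_name="data_storage",
    faq_table_name="faq",
    question="How do I ask Donkeybot a question?",  # hypothetical FAQ entry
    answer="Run the ask_donkeybot script and type your question.",
    author="admin",
    keywords="donkeybot, usage",
)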
Example #5
    def __init__(self, model=None, db_name="data_storage", num_answers_to_predict=3):

        self.model = "distilbert-base-cased-distilled-squad"
        if model:
            check_model_availability(model)
            self.model = model
        self.db_name = db_name
        # use the GPU (device 0) if available, otherwise fall back to CPU (-1)
        gpu = 0 if torch.cuda.is_available() else -1
        self.answer_detector = AnswerDetector(
            model=self.model, device=gpu, num_answers_to_predict=num_answers_to_predict
        )
        data_storage = Database(f"{self.db_name}.db")
        faq_se, docs_se, question_se = setup_search_engines(db=data_storage)
        self.qa_interface = QAInterface(
            detector=self.answer_detector,
            question_engine=question_se,
            faq_engine=faq_se,
            docs_engine=docs_se,
        )
        # the thread that inits the Donkeybot instance won't use the db again
        data_storage.close_connection()
Example #6
    def _store_answers(self, answers):
        # a different thread runs each call, so a new connection to the db is needed
        # could use sqlite3.connect('your.db', check_same_thread=False) but then we'd need our own synchronization
        data_storage = Database(f"{self.db_name}.db")
        for answer in answers:
            data_storage.insert_answer(answer)
        data_storage.close_connection()
        return
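Per the comments, each call opens its own sqlite3 connection instead of sharing one across threads. A sketch of a hypothetical caller handing freshly predicted answers to a worker thread (`bot` and `answers` are stand-ins, not names from these snippets):

import threading

worker = threading.Thread(target=bot._store_answers, args=(answers,))
worker.start()
worker.join()  # the worker opened and closed its own Database connection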
Example #7
def main():
    # Parse cli arguments
    parser = argparse.ArgumentParser(
        description=
        """This is the script responsible for parsing GitHub issue comments""")
    required = parser.add_argument_group("required arguments")
    optional = parser.add_argument_group("optional arguments")

    required.add_argument(
        "-i",
        "--input_db",
        help="Input .db file name of the raw fetched issue comments",
        required=True,
    )
    optional.add_argument(
        "-o",
        "--output_db",
        default="data_storage",
        help="Output .db file name of the parsed issues (default is data_storage)",
    )
    optional.add_argument(
        "--issue_comments_table",
        default="issue_comments",
        help="Name of the table where the parsed issue comments are stored; the raw issue comments are read from the table of the same name (default is issue_comments)",
    )

    args = parser.parse_args()
    input_db = args.input_db
    output_db = args.output_db
    issue_comments_table = args.issue_comments_table

    # input
    raw_issue_comments_data = Database(f"{input_db}.db").get_dataframe(
        issue_comments_table)
    # output
    data_storage = Database(f"{output_db}.db")
    data_storage.create_issue_comments_table(table_name=issue_comments_table)
    # IssueCommentsParser
    print("Let's create an IssueCommentsParser.")
    parser = ParserFactory.get_parser("Issue Comment")
    parser.parse_dataframe(
        raw_issue_comments_data,
        db=data_storage,
        issue_comments_table=issue_comments_table,
    )
    print(f"Data from {input_db}.db parsed and saved on {output_db}.db")
    data_storage.close_connection()
Example #8
    def __init__(self, model=None, db_name="data_storage", num_answers_inf=1):

        self.model = "distilbert-base-cased-distilled-squad"
        if model:
            check_model_availability(model)
            self.model = model

        gpu = 0 if torch.cuda.is_available() else -1
        self.answer_detector = AnswerDetector(
            model=self.model,
            device=gpu,
            num_answers_to_predict=num_answers_inf)
        data_storage = Database(f"{db_name}.db")
        faq_se, docs_se, question_se = setup_search_engines(db=data_storage)
        self.qa_interface = QAInterface(detector=self.answer_detector,
                                        question_engine=question_se,
                                        faq_engine=faq_se,
                                        docs_engine=docs_se)
Example #9
def main():
    # Parse cli arguments
    parser = argparse.ArgumentParser(
        description="""This is the script responsible for parsing the emails"""
    )
    required = parser.add_argument_group("required arguments")
    optional = parser.add_argument_group("optional arguments")

    required.add_argument("-i",
                          "--input_db",
                          help="Input .db file name of the raw emails",
                          required=True)
    optional.add_argument(
        "-o",
        "--output_db",
        default="data_storage",
        help="Output .db file name of the parsed emails (default is data_storage)",
    )
    optional.add_argument(
        "--emails_table",
        default="emails",
        help="Name of the table where the parsed emails are stored; the raw emails are read from the table of the same name (default is emails)",
    )

    args = parser.parse_args()
    input_db = args.input_db
    output_db = args.output_db
    emails_table = args.emails_table

    # input
    raw_emails_data = Database(f"{input_db}.db").get_dataframe(emails_table)
    # output
    data_storage = Database(f"{output_db}.db")
    data_storage.create_emails_table(table_name=emails_table)
    # EmailParser
    print("Let's create an EmailParser")
    parser = ParserFactory.get_parser("Email")
    parser.parse_dataframe(raw_emails_data,
                           db=data_storage,
                           emails_table_name=emails_table)
    print(f"Data from {input_db}.db parsed and saved on {output_db}.db")
    data_storage.close_connection()
Example #10
@pytest.fixture
def test_db():
    # yields an open test db; teardown closes it and removes the file
    db = Database("test.db", "test_table")
    db.create_issue_comments_table("test_table")
    yield db
    db.close_connection()
    os.remove(config.DATA_DIR + "test.db")
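The yield/teardown pattern marks this as a pytest fixture. A hypothetical test consuming it, using get_tables() the same way Examples 4 and 12 do:

def test_table_was_created(test_db):
    # the fixture yields an open Database with test_table already created
    tables = [table[0] for table in test_db.get_tables()]
    assert "test_table" in tables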
Example #11
def main():
    # Parse cli arguments
    parser = argparse.ArgumentParser(
        description=
        """With this script you can query with the Search Engine module and get top-k results."""
    )
    required = parser.add_argument_group("required arguments")
    optional = parser.add_argument_group("optional arguments")
    required.add_argument(
        "-q",
        "--query",
        help="What you want to query.",
        required=True,
    )
    required.add_argument(
        "-k",
        "--top_k",
        type=check_positive,
        help="Number of documents that'll be retrieved.",
        required=True,
    )
    optional.add_argument(
        "-mq",
        "--match_questions",
        type=str2bool,
        nargs="?",  # 0 or 1 argument
        const=True,
        default=False,
        help="Match query to similar questions.",
    )
    optional.add_argument(
        "-md",
        "--match_docs",
        type=str2bool,
        nargs="?",
        const=True,
        default=False,
        help="Match query to similar documents.",
    )
    optional.add_argument(
        "--docs_index",
        default="rucio_doc_term_matrix",
        help="Name of the documentation index table. (default is rucio_doc_term_matrix)",
    )
    optional.add_argument(
        "--docs_original_table",
        default="docs",
        help=
        "Name of the original table for the documentation. (default is docs)",
    )
    optional.add_argument(
        "--question_index",
        default="questions_doc_term_matrix",
        help="Name of the questions index table. (default is questions_doc_term_matrix)",
    )
    optional.add_argument(
        "--questions_original_table",
        default="questions",
        help=
        "Name of the original table for the questions. (default is questions)",
    )
    optional.add_argument(
        "-db",
        "--db_name",
        default="data_storage",
        help=
        "Name of database where indexes are stored. (default is data_storage)",
    )

    args = parser.parse_args()
    db_name = args.db_name
    query = args.query
    top_k = int(args.top_k)
    if not (args.match_questions or args.match_docs):
        parser.error(
            "No index to search requested, add -mq/--match_questions or -md/--match_docs"
        )
    match_questions = args.match_questions
    match_docs = args.match_docs
    docs_idx_name = args.docs_index
    docs_original_table = args.docs_original_table
    question_original_table = args.questions_original_table
    question_idx_name = args.question_index

    data_storage = Database(f"{db_name}.db")
    # load SE's
    try:
        docs_se = SearchEngine()
        docs_se.load_index(
            db=data_storage,
            table_name=docs_idx_name,
            original_table=docs_original_table,
        )
        q_se = QuestionSearchEngine()
        q_se.load_index(
            db=data_storage,
            table_name=question_idx_name,
            original_table=question_original_table,
        )
        data_storage.close_connection()
        if match_docs:
            docs_results = docs_se.search(query, top_k)
            print(f"\nTop-{top_k} retrieved documentation:")
            print(docs_results[["doc_id", "question", "name", "context"]])
        if match_questions:
            question_results = q_se.search(query, top_k)
            print(f"\nTop-{top_k} retrieved past questions:")
            print(question_results[[
                "question_id", "query", "question", "context"
            ]])

    except Exception as e:
        print("Error: ", end="")
        sys.exit(e)
Example #12
def main():
    # Parse cli arguments
    parser = argparse.ArgumentParser(
        description="""Use this script to ask DonkeyBot!""")
    optional = parser.add_argument_group("optional arguments")

    optional.add_argument(
        "-m",
        "--model",
        default="distilbert-base-cased-distilled-squad",
        help=
        "BERT/DistilBERT model used to inference answers. (default is distilbert-base-cased-distilled-squad)",
    )
    optional.add_argument(
        "-db",
        "--db_name",
        default="data_storage",
        help=
        "Name of database where all data is stored. (default is data_storage)",
    )
    optional.add_argument(
        "-s",
        "--store_answers",
        type=str2bool,
        nargs="?",
        const=True,
        default=False,
        help=
        "Store the answers on the '--answers_table' table. (default is False)",
    )
    optional.add_argument(
        "-n",
        "--num_answers_predicted_per_document",
        default=3,
        help="Number of answers predicted per document. (default is 3)",
    )
    optional.add_argument(
        "--answers_table",
        default="answers",
        help="Name of the answers table. (default is 'answers')",
    )

    args = parser.parse_args()
    db_name = args.db_name
    model = args.model
    answers_table = args.answers_table
    store_answers = args.store_answers
    num_answers_inf = int(args.num_answers_predicted_per_document)

    check_model_availability(model)

    # prepare data_storage
    data_storage = Database(f"{db_name}.db")
    # check for the answers table
    tables_in_db = [table[0] for table in data_storage.get_tables()]
    if answers_table not in tables_in_db:
        print(f"Creating '{answers_table}' table in {db_name}.db")
        data_storage.create_answers_table(table_name=answers_table)

    # load answer detector
    print("Loading AnswerDetector...")
    gpu = 0 if torch.cuda.is_available() else -1
    answer_detector = AnswerDetector(model=model,
                                     device=gpu,
                                     num_answers_to_predict=num_answers_inf)

    # load search engines
    faq_se, docs_se, question_se = setup_search_engines(db=data_storage)

    # load interface
    qa_interface = QAInterface(
        detector=answer_detector,
        question_engine=question_se,
        faq_engine=faq_se,
        docs_engine=docs_se,
    )

    # Main Loop
    print("DonkeyBot ready to be asked!")
    try:
        while True:
            print("\nCTRL+C to exit donkeybot")
            query = str(input("ask question: "))
            top_k = int(input("how many answers: "))
            start_time = time.time()
            answers = qa_interface.get_answers(query, top_k=top_k)
            print(
                f"Total inference time: {round(time.time() - start_time, 2)} seconds"
            )
            print_answers(answers)

            if store_answers:
                for answer in answers:
                    data_storage.insert_answer(answer, table_name=answers_table)
    except KeyboardInterrupt:
        data_storage.close_connection()
        sys.exit("\nExiting...")
Example #13
@pytest.fixture
def test_db():
    # yields a shared db for tests; teardown closes the connection
    db = Database("db_for_tests.db")
    yield db
    db.close_connection()
Example #14
def main():
    # Parse cli arguments
    parser = argparse.ArgumentParser(
        description="""This is the script responsible for fetching GitHub issues and comments"""
    )
    required = parser.add_argument_group("required arguments")
    optional = parser.add_argument_group("optional arguments")

    required.add_argument(
        "-r",
        "--repo",
        help="Name of the GitHub repository we are fetching from. Format `user/repo`",
        required=True,
    )
    required.add_argument(
        "-t",
        "--token",
        help="GitHub api token to be used for the GET requests while fetching",
        required=True,
    )
    optional.add_argument(
        "-o",
        "--output_db",
        default="issues_input_data",
        help="Output .db file where the data is stored (default is issues_input_data)",
    )
    optional.add_argument(
        "--max_pages",
        default=201,
        type=int,
        help="Maximum number of pages we will request through GitHubs api (default is 201)",
        required=False,
    )
    optional.add_argument(
        "--issues_table",
        default="issues",
        help="Name of the table where we will store the issues (default is issues)",
    )
    optional.add_argument(
        "--comments_table",
        default="issue_comments",
        help="Name of the table where we will store the comments (default is issue_comments)",
    )

    args = parser.parse_args()
    db_name = args.output_db
    repository = args.repo
    token = args.token
    issues_table = args.issues_table
    comments_table = args.comments_table
    max_pages = args.max_pages

    # IssueFetcher
    data_storage = Database(f"{db_name}.db")
    fetcher = FetcherFactory.get_fetcher("Issue")
    (issues_df, comments_df) = fetcher.fetch(
        repo=repository, api_token=token, max_pages=max_pages
    )
    fetcher.save(
        db=data_storage,
        issues_table_name=issues_table,
        comments_table_name=comments_table,
    )

    print(f"Raw issues data stored on {db_name}.db")
    print("Sample:")
    print("Issues")
    print(issues_df.head())
    print("Comments")
    print(comments_df.head())
    data_storage.close_connection()
Example #15
def main():
    # Parse cli arguments
    parser = argparse.ArgumentParser(
        description="""Run this script to detect and save questions originating from GitHub issues"""
    )
    required = parser.add_argument_group("required arguments")
    optional = parser.add_argument_group("optional arguments")

    required.add_argument("-db", "--db_name", help="Database name of our storage")
    optional.add_argument(
        "--issues_table",
        default="issues",
        help="Name given to the table holding the issues. (default is issues)",
    )
    optional.add_argument(
        "--questions_table",
        default="questions",
        help="Name given to the table holding the questions. (default is questions)",
    )

    args = parser.parse_args()
    db_name = args.db_name
    issues_table = args.issues_table
    questions_table = args.questions_table

    data_storage = Database(f"{db_name}.db")
    tables_in_db = [table[0] for table in data_storage.get_tables()]
    assert issues_table in tables_in_db

    if questions_table not in tables_in_db:
        print(f"Creating '{questions_table}' table in {db_name}.db")
        data_storage.create_question_table(table_name=questions_table)

    issues_df = data_storage.get_dataframe(issues_table)

    qd = QuestionDetector("issue")
    print("Detecting questions in issues that have comments...")
    issues_with_questions = 0
    total_questions = 0
    for i in tqdm(range(len(issues_df.index))):
        text = str(issues_df.clean_body.values[i])
        issue_id = int(issues_df.issue_id.values[i])
        questions_detected = qd.detect(text)
        if not questions_detected:
            continue
        else:
            issues_with_questions += 1
            for question in questions_detected:
                total_questions += 1
                question.set_origin_id(issue_id)
                # make sure to find the context for each question
                question.find_context_from_table(data_storage, table_name=issues_table)
                if question.context == "":
                    continue
                else:
                    data_storage.insert_question(question, table_name=questions_table)

    print(f"Type of the question objects : {type(question)}")
    print(f"Total questions detected: {total_questions}")
    print(f"Number of issues with questions: {issues_with_questions}")
    data_storage.close_connection()
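The detection step in isolation, as a sketch with a made-up issue body; detect() returns a list of question objects (possibly empty), which is why the loop above skips issues without any:

qd = QuestionDetector("issue")
# hypothetical text; detect() scans it for question sentences
questions = qd.detect("The upload fails with a checksum error. Has anyone seen this before?")
for question in questions:
    print(type(question))  # the same debug print the script ends with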
Example #16
def main():
    # Parse cli arguments
    parser = argparse.ArgumentParser(
        description=
        """Run this script to detect and save questions originating from emails"""
    )
    required = parser.add_argument_group("required arguments")
    optional = parser.add_argument_group("optional arguments")

    required.add_argument("-db",
                          "--db_name",
                          help="Database name of our storage")
    optional.add_argument(
        "--emails_table",
        default="emails",
        help="Name given to the table holding the emails. (default is emails)",
    )
    optional.add_argument(
        "--questions_table",
        default="questions",
        help=
        "Name given to the table holding the questions. (default is questions)",
    )

    args = parser.parse_args()
    db_name = args.db_name
    emails_table = args.emails_table
    questions_table = args.questions_table

    data_storage = Database(f"{db_name}.db")
    tables_in_db = [table[0] for table in data_storage.get_tables()]
    assert emails_table in tables_in_db

    if questions_table not in tables_in_db:
        print(f"Creating '{questions_table}' table in {db_name}.db")
        data_storage.create_question_table(table_name=questions_table)

    emails_df = data_storage.get_dataframe(emails_table)
    # only keep emails that are part of a conversation to search for Questions in them
    conv_df = (emails_df[emails_df["conversation_id"].notnull()].sort_values(
        by=["conversation_id", "email_date"]).reset_index(drop=True))

    qd = QuestionDetector("email")
    print("Detecting questions in emails that are part of conversations...")
    emails_with_questions = 0
    total_questions = 0
    for i in tqdm(range(len(conv_df.index))):
        text = str(conv_df.clean_body.values[i])
        email_id = int(conv_df.email_id.values[i])
        questions_detected = qd.detect(text)
        if not questions_detected:
            continue
        else:
            emails_with_questions += 1
            for question in questions_detected:
                total_questions += 1
                question.set_origin_id(email_id)
                # make sure to find the context for each question
                question.find_context_from_table(data_storage,
                                                 table_name=emails_table)
                if question.context == "":
                    continue
                else:
                    data_storage.insert_question(question,
                                                 table_name=questions_table)

    print(f"Type of the question objects : {type(question)}")
    print(f"Total questions detected: {total_questions}")
    print(f"Number of emails with questions: {emails_with_questions}")
    data_storage.close_connection()
Example #17
def main():
    # Parse cli arguments
    parser = argparse.ArgumentParser(
        description="""This script indexes our data for SearchEngine and QuestionSearchEngine."""
    )
    optional = parser.add_argument_group("optional arguments")

    optional.add_argument(
        "-db",
        "--db_name",
        default="data_storage",
        help="Database name of our storage. (default is data_storage)",
    )
    optional.add_argument(
        "--documentation_table",
        default="docs",
        help="Name of the table where the documentation is stored (default is docs)",
    )
    optional.add_argument(
        "--questions_table",
        default="questions",
        help="Name given to the table holding the questions. (default is questions)",
    )
    optional.add_argument(
        "--faq_table",
        default="faq",
        help="Name given to the table holding the FAQ. (default is faq)",
    )

    args = parser.parse_args()
    db_name = args.db_name
    docs_table = args.documentation_table
    questions_table = args.questions_table
    faq_table = args.faq_table

    data_storage = Database(f"{db_name}.db")

    # Documentation SearchEngine
    docs_se = SearchEngine()
    docs_df = data_storage.get_dataframe(docs_table)
    # let's not index the release-notes in this version of the bot
    # this code also exists in load_index() for the Rucio documents
    docs_df = docs_df[docs_df["doc_type"] != "release_notes"]
    print("Indexing Rucio documentation for the SearchEngine...")
    docs_se.create_index(
        corpus=docs_df, db=data_storage, table_name="rucio_doc_term_matrix"
    )

    # QuestionSearchEngine
    questions_se = QuestionSearchEngine()
    questions_df = data_storage.get_dataframe(questions_table)
    print("Indexing Questions for the QuestionSearchEngine...")
    questions_se.create_index(
        corpus=questions_df,
        db=data_storage,
        table_name=f"{questions_table}_doc_term_matrix",
    )

    # FAQSearchEngine
    faq_se = FAQSearchEngine()
    faq_df = data_storage.get_dataframe(faq_table)
    print("Indexing FAQ for the FAQSearchEngine...")
    faq_se.create_index(
        corpus=faq_df, db=data_storage, table_name=f"{faq_table}_doc_term_matrix"
    )
    data_storage.close_connection()
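The doc-term matrix tables created here are what load_index() consumes later (see Example #11). A minimal round-trip sketch on a fresh connection, assuming the default table names above:

data_storage = Database("data_storage.db")  # re-open; the script above closed its handle
docs_se = SearchEngine()
docs_se.load_index(
    db=data_storage,
    table_name="rucio_doc_term_matrix",  # index table created above
    original_table="docs",
)
results = docs_se.search("How do I create a replication rule?", 3)  # hypothetical query, top-3
data_storage.close_connection()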
Example #18
    def update_label(self, answer_id, label):
        # validate before opening the connection
        assert label in (0, 1)
        data_storage = Database(f"{self.db_name}.db")
        data_storage.update_label(answer_id, label)
        data_storage.close_connection()
        return
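A hypothetical call; the assert restricts labels to a binary 0/1:

# `bot` and `answer_id` are stand-ins for the owning instance and a stored answer's id
bot.update_label(answer_id, label=1)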