示例#1
0
def main():
    """Detect questions in stored GitHub issues and save them to the database.

    Command-line arguments:
        -db / --db_name: database name; ``.db`` is appended to build the name
            handed to ``Database``. Mandatory.
        --issues_table: table the issues are read from (default ``issues``).
        --questions_table: table the questions are written to (default
            ``questions``); it is created when it is not in the database yet.

    For every row of the issues table, ``clean_body`` is passed to
    ``QuestionDetector("issue").detect``. Each detected question gets the
    row's ``issue_id`` as origin id and is inserted into the questions table,
    unless its context is still an empty string after
    ``find_context_from_table``. A short summary is printed at the end.

    Exits through ``parser.error`` (usage message, exit status 2) when the
    database name is missing or the issues table is not in the database.
    """
    # Parse cli arguments
    parser = argparse.ArgumentParser(
        description="""Run this script to detect and save questions originating from GitHub issues"""
    )
    required = parser.add_argument_group("required arguments")
    optional = parser.add_argument_group("optional arguments")

    # An argument group only changes the help layout; the option itself must
    # be flagged as required, otherwise a missing -db ends up as "None.db".
    required.add_argument(
        "-db",
        "--db_name",
        required=True,
        help="Database name of our storage",
    )
    optional.add_argument(
        "--issues_table",
        default="issues",
        help="Name given to the table holding the issues. (default is issues)",
    )
    optional.add_argument(
        "--questions_table",
        default="questions",
        help="Name given to the table holding the questions. (default is questions)",
    )

    args = parser.parse_args()
    db_name = args.db_name
    issues_table = args.issues_table
    questions_table = args.questions_table

    data_storage = Database(f"{db_name}.db")
    try:
        tables_in_db = {table[0] for table in data_storage.get_tables()}
        # Explicit check rather than `assert`, which disappears under -O.
        if issues_table not in tables_in_db:
            parser.error(
                f"table '{issues_table}' does not exist in {db_name}.db"
            )

        if questions_table not in tables_in_db:
            print(f"Creating '{questions_table}' table in {db_name}.db")
            data_storage.create_question_table(table_name=questions_table)

        issues_df = data_storage.get_dataframe(issues_table)

        qd = QuestionDetector("issue")
        print("Detecting questions in issues that have comments...")
        issues_with_questions = 0
        total_questions = 0
        # Remember the last question seen so the summary below does not touch
        # an unbound loop variable when nothing was detected.
        last_question = None
        rows = zip(issues_df.clean_body.values, issues_df.issue_id.values)
        for body, raw_id in tqdm(rows, total=len(issues_df.index)):
            text = str(body)
            issue_id = int(raw_id)
            questions_detected = qd.detect(text)
            if not questions_detected:
                continue

            issues_with_questions += 1
            for question in questions_detected:
                total_questions += 1
                last_question = question
                question.set_origin_id(issue_id)
                # make sure to find the context for each question
                question.find_context_from_table(
                    data_storage, table_name=issues_table
                )
                # Questions without any context are counted but not stored.
                if question.context != "":
                    data_storage.insert_question(
                        question, table_name=questions_table
                    )

        if last_question is not None:
            print(f"Type of the question objects : {type(last_question)}")
        print(f"Total questions detected: {total_questions}")
        print(f"Number of issues with questions: {issues_with_questions}")
    finally:
        # Release the connection even if detection fails part-way through.
        data_storage.close_connection()
def main():
    """Detect questions in stored emails and save them to the database.

    Command-line arguments:
        -db / --db_name: database name; ``.db`` is appended to build the name
            handed to ``Database``. Mandatory.
        --emails_table: table the emails are read from (default ``emails``).
        --questions_table: table the questions are written to (default
            ``questions``); it is created when it is not in the database yet.

    Only emails with a non-null ``conversation_id`` are examined, ordered by
    ``conversation_id`` and ``email_date``. For each of them, ``clean_body``
    is passed to ``QuestionDetector("email").detect``. Each detected question
    gets the row's ``email_id`` as origin id and is inserted into the
    questions table, unless its context is still an empty string after
    ``find_context_from_table``. A short summary is printed at the end.

    Exits through ``parser.error`` (usage message, exit status 2) when the
    database name is missing or the emails table is not in the database.
    """
    # Parse cli arguments
    parser = argparse.ArgumentParser(
        description=
        """Run this script to detect and save questions originating from emails"""
    )
    required = parser.add_argument_group("required arguments")
    optional = parser.add_argument_group("optional arguments")

    # An argument group only changes the help layout; the option itself must
    # be flagged as required, otherwise a missing -db ends up as "None.db".
    required.add_argument(
        "-db",
        "--db_name",
        required=True,
        help="Database name of our storage",
    )
    optional.add_argument(
        "--emails_table",
        default="emails",
        help="Name given to the table holding the emails. (default is emails)",
    )
    optional.add_argument(
        "--questions_table",
        default="questions",
        help=
        "Name given to the table holding the questions. (default is questions)",
    )

    args = parser.parse_args()
    db_name = args.db_name
    emails_table = args.emails_table
    questions_table = args.questions_table

    data_storage = Database(f"{db_name}.db")
    try:
        tables_in_db = {table[0] for table in data_storage.get_tables()}
        # Explicit check rather than `assert`, which disappears under -O.
        if emails_table not in tables_in_db:
            parser.error(
                f"table '{emails_table}' does not exist in {db_name}.db"
            )

        if questions_table not in tables_in_db:
            print(f"Creating '{questions_table}' table in {db_name}.db")
            data_storage.create_question_table(table_name=questions_table)

        emails_df = data_storage.get_dataframe(emails_table)
        # only keep emails that are part of a conversation to search for
        # Questions in them
        in_conversation = emails_df[emails_df["conversation_id"].notnull()]
        conv_df = in_conversation.sort_values(
            by=["conversation_id", "email_date"]
        ).reset_index(drop=True)

        qd = QuestionDetector("email")
        print("Detecting questions in emails that are part of conversations...")
        emails_with_questions = 0
        total_questions = 0
        # Remember the last question seen so the summary below does not touch
        # an unbound loop variable when nothing was detected.
        last_question = None
        rows = zip(conv_df.clean_body.values, conv_df.email_id.values)
        for body, raw_id in tqdm(rows, total=len(conv_df.index)):
            text = str(body)
            email_id = int(raw_id)
            questions_detected = qd.detect(text)
            if not questions_detected:
                continue

            emails_with_questions += 1
            for question in questions_detected:
                total_questions += 1
                last_question = question
                question.set_origin_id(email_id)
                # make sure to find the context for each question
                question.find_context_from_table(
                    data_storage, table_name=emails_table
                )
                # Questions without any context are counted but not stored.
                if question.context != "":
                    data_storage.insert_question(
                        question, table_name=questions_table
                    )

        if last_question is not None:
            print(f"Type of the question objects : {type(last_question)}")
        print(f"Total questions detected: {total_questions}")
        print(f"Number of emails with questions: {emails_with_questions}")
    finally:
        # Release the connection even if detection fails part-way through.
        data_storage.close_connection()