Example #1
def test_remove_url_with_pre_process_text():
    # pre_process_text strips trailing/extra whitespace,
    # so correct_text has no trailing space
    text = "text with url : https://g\nithub.com/ru\ncio/rucio"
    text_2 = "text with url : https://github.com/rucio/rucio"
    correct_text = "text with url :"
    assert utils.pre_process_text(text, remove_url=True) == correct_text
Example #2
    def parse(
        self,
        issue_id,
        comment_id,
        creator,
        created_at,
        body,
        db=Database,
        issue_comments_table="issue_comments",
    ):
        """
        Parses a single issue's comment.

        :param [issue_id,...,body]   : all the raw issue comment's attributes
        :param db                    : <bot Database object> where we store the parsed issue comments
        :param issue_comments_table  : in case we need to use a different table name (default 'issue_comments')
        :returns issue_comment       : IssueComment object
        """
        # The date format returned from the GitHub API is in the ISO 8601 format: "%Y-%m-%dT%H:%M:%SZ"
        issue_comment_created_at = utils.convert_to_utc(
            created_at, "%Y-%m-%dT%H:%M:%SZ")
        issue_comment_clean_body = utils.pre_process_text(body,
                                                          fix_url=True,
                                                          remove_newline=True)
        issue_comment = IssueComment(
            issue_id=issue_id,
            comment_id=comment_id,
            creator=creator,
            created_at=issue_comment_created_at,
            body=body,
            clean_body=issue_comment_clean_body,
        )

        db.insert_issue_comment(issue_comment, table_name=issue_comments_table)
        return issue_comment
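For context, a call to this parser might look like the following sketch; the variable names and field values below are illustrative assumptions, not taken from the repository:

parser = IssueCommentParser()  # hypothetical name for the enclosing class
comment = parser.parse(
    issue_id=1234,
    comment_id=987654321,
    creator="octocat",
    created_at="2021-05-01T12:30:45Z",  # matches the "%Y-%m-%dT%H:%M:%SZ" format
    body="Looks good to me!",
    db=db,  # an already-initialised bot Database object
)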
Example #3
def test_fix_urls_with_pre_process_text():
    text = (
        "text with url : https://g\nithub.com/ru\ncio/rucio that has line newline char"
    )
    correct_text = (
        "text with url : https://github.com/rucio/rucio that has line newline char"
    )
    assert utils.pre_process_text(text, fix_url=True) == correct_text
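Together with Example #1, this test pins down the URL handling: fix_url re-joins URLs that were split by newline characters, while remove_url also strips the repaired URL from the text. A minimal sketch of that behaviour, assuming a simple regex approach (the repository's actual patterns may differ):

import re

BROKEN_URL_RE = re.compile(r"(https?://\S*)\n(\S*)")
URL_RE = re.compile(r"https?://\S+")

def fix_urls(text):
    # re-join URLs split by newlines, removing one newline per pass
    while BROKEN_URL_RE.search(text):
        text = BROKEN_URL_RE.sub(r"\1\2", text)
    return text

def remove_urls(text):
    # repair broken URLs first so the full URL is matched, then strip it
    return URL_RE.sub("", fix_urls(text))

print(fix_urls("https://g\nithub.com/ru\ncio/rucio"))  # https://github.com/rucio/rucio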
Example #4
def test_lemmatizer_with_pre_process_text():
    # nltk's word tokenizer may split tokens oddly, e.g. "->" into "- >"
    test_words = {
        "rocks": "rock",
        "corpora": "corpus",
        "developers": "developer"
    }
    for word in test_words.keys():
        assert utils.pre_process_text(word, lemmatize=True) == test_words[word]
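The expected lemmas are consistent with nltk's WordNetLemmatizer (the test comment already references nltk); a minimal standalone sketch, assuming that is the lemmatizer in use:

import nltk
from nltk.stem import WordNetLemmatizer

# one-time setup: nltk.download("punkt"); nltk.download("wordnet")
lemmatizer = WordNetLemmatizer()
for word in ["rocks", "corpora", "developers"]:
    tokens = nltk.word_tokenize(word)
    print(" ".join(lemmatizer.lemmatize(t) for t in tokens))
# -> rock, corpus, developer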
Example #5
    def parse(
        self,
        issue_id,
        title,
        state,
        creator,
        created_at,
        comments,
        body,
        db=Database,
        issues_table_name="issues",
    ):
        """
        Parses a single issue.

        <!> Note  : The parse() method is only expected to be used after an issues table
        has been created in the db. To create said table use the Database object's
        .create_issues_table() method before attempting to parse.

        :param [issue_id,...,body]  : all the raw issue attributes
        :param db                 : <bot Database object> where we store the parsed issues
        :param issues_table_name  : in case we need to use a different table name (default 'issues')
        :returns issue            : an <Issue object> created by the IssueParser
        """
        # The date format returned from the GitHub API is in the ISO 8601 format: "%Y-%m-%dT%H:%M:%SZ"
        issue_created_at = utils.convert_to_utc(created_at,
                                                "%Y-%m-%dT%H:%M:%SZ")
        issue_clean_body = utils.pre_process_text(self.clean_issue_body(body),
                                                  fix_url=True)
        issue = Issue(
            issue_id=issue_id,
            title=title,
            state=state,
            creator=creator,
            created_at=issue_created_at,
            comments=comments,
            body=body,
            clean_body=issue_clean_body,
        )

        # no comments -> no context; only insert issues that have comments into the db
        if issue.comments > 0:
            db.insert_issue(issue, table_name=issues_table_name)
        return issue
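Per the docstring's note, the issues table must exist before parse() is called. A hypothetical setup sequence (constructor arguments and field values are assumptions):

db = Database("bot.db")  # hypothetical constructor arguments
db.create_issues_table()  # documented prerequisite for parse()
issue = issue_parser.parse(
    issue_id=42,
    title="Crash on startup",
    state="open",
    creator="octocat",
    created_at="2021-05-01T12:30:45Z",
    comments=3,  # > 0, so the issue is inserted into the db
    body="Steps to reproduce ...",
    db=db,
)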
Example #6
    def clean_body(body):
        """
        Cleans the email's body.

        Applies the following:
        1) Remove newline characters from inside urls
        2) Replace newline characters with ' ' space
        3) Remove extra whitespaces
        4) Decontract words
        5) Try to find matches based on the regex patterns.
           If any matches exist, only keep the text up to the
           earliest match. These patterns appear in emails right
           before text from previous emails is pasted/quoted,
           e.g. a reply email:
                "Dear Nick
                        ...
                 Thanks, George.
                 On <DATE> Nick wrote:
                    >> Previous email body
                    >> Previous email body "

            from which we only keep:
                "Dear Nick
                        ...
                 Thanks, George."

        :param  body        : body of an email
        :returns clean_email_body : cleaned body of an email
        """
        # steps 1-4 done with utils.pre_process_text function
        clean_email_body = utils.pre_process_text(
            body, fix_url=True, remove_newline=True
        )
        # match the 4 regex patterns; re.search already returns the
        # earliest match, so a single search per pattern is enough
        try:
            start_1 = start_2 = start_3 = start_4 = None
            on_hdr_match = config.ON_HDR_REGEX.search(clean_email_body)
            if on_hdr_match is not None:
                start_1 = on_hdr_match.start()
            or_msg_match = config.ORIGINAL_MSG_REGEX.search(clean_email_body)
            if or_msg_match is not None:
                start_2 = or_msg_match.start()
            quote_match = config.QUOTED_REGEX.search(clean_email_body)
            if quote_match is not None:
                start_3 = quote_match.start()
            hdr_match = config.HEADER_REGEX.search(clean_email_body)
            if hdr_match is not None:
                start_4 = hdr_match.start()

            # if there were any matches, keep only the text before the earliest one
            if all(start is None for start in [start_1, start_2, start_3, start_4]):
                return clean_email_body
            else:
                min_start = min(
                    start
                    for start in [start_1, start_2, start_3, start_4]
                    if start is not None
                )
                clean_email_body = clean_email_body[:min_start]
                return clean_email_body
        except Exception as _e:
            # log and fall through; note the method implicitly returns None here
            print(_e)
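The four config regexes are not shown in this snippet. Based only on the docstring's description of reply headers and quoted text, plausible stand-ins (assumptions, not the repository's actual patterns) could look like:

import re

ON_HDR_REGEX = re.compile(r"On .{0,80}? wrote:")  # e.g. "On <DATE> Nick wrote:"
ORIGINAL_MSG_REGEX = re.compile(r"-{2,}\s*Original Message\s*-{2,}", re.IGNORECASE)
QUOTED_REGEX = re.compile(r"(^|\s)>{1,}\s?")  # ">>"-style quoted reply lines
HEADER_REGEX = re.compile(r"From\s*:\s.+?Sent\s*:", re.IGNORECASE | re.DOTALL)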
Example #7
def test_lower_with_pre_process_text():
    text = "A SAMPLE TEXT with upper case"
    assert utils.pre_process_text(text, lower_text=True) == text.lower()
Example #8
def test_pre_process_text_extra_whitespace():
    text = "   text    wont    contain    extra    spaces   "
    correct_text = "text wont contain extra spaces"
    assert utils.pre_process_text(text) == correct_text
Example #9
def test_stemmer_with_pre_process_text():
    # nltk's word tokenizer may split tokens oddly, e.g. "->" into "- >"
    words = ["program", "programs", "programer", "programing", "programers"]
    root = "program"
    for word in words:
        assert utils.pre_process_text(word, stem=True) == root
Example #10
def test_remove_stopwords_with_pre_process_text():
    # nltk's word tokenizer may split tokens oddly, e.g. "->" into "- >"
    text = "random stopwords -> our she when or too from how am re most while will"
    correct_text = "random stopwords - >"
    assert utils.pre_process_text(text, remove_stop_words=True) == correct_text
Example #11
def test_replace_numbers_with_pre_process_text():
    text = "Here are all the numbers : 1234567890 ;)"
    correct_text = "Here are all the numbers : hhhhhhhhhh ;)"
    assert (utils.pre_process_text(text,
                                   remove_numbers=True,
                                   numbers_replacement="h") == correct_text)
Example #12
def test_remove_numbers_with_pre_process_text():
    text = "Here are all the numbers : 1234567890 ;)"
    correct_text = "Here are all the numbers : ;)"
    assert utils.pre_process_text(text, remove_numbers=True) == correct_text
Example #13
def test_replace_punctuation_with_pre_process_text():
    text = """Here is all the punctuation : !"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"""
    correct_text = "Here is all the punctuation h hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh"
    assert (utils.pre_process_text(
        text, remove_punctuation=True,
        punctuation_replacement="h") == correct_text)
Example #14
def test_remove_punctuation_with_pre_process_text():
    # remember, pre_process_text removes extra whitespace
    text = """Here is all the punctuation : !"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"""
    correct_text = """Here is all the punctuation"""
    assert utils.pre_process_text(text,
                                  remove_punctuation=True) == correct_text
Example #15
def test_decontract_with_pre_process_text():
    text = "won't, can't, shouldn't, we're, that's, I'd, we'll, aren't, they've, I'm"
    correct_text = "will not, can not, should not, we are, that is, I would, we will, are not, they have, I am"
    assert utils.pre_process_text(text, decontract_words=True) == correct_text
Example #16
def test_remove_newline_with_pre_process_text():
    # should not concatenate words that were split by the newline char
    text = "text\n with multi\nple newline\n chars"
    correct_text = "text with multi ple newline chars"
    assert utils.pre_process_text(text, remove_newline=True) == correct_text
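Taken together, the tests above pin down most of pre_process_text's observable contract. Below is a minimal sketch that satisfies the behaviours they exercise, reconstructed from the tests alone; the repository's real implementation may differ in details such as flag ordering or the exact regexes:

import re
import string

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

BROKEN_URL_RE = re.compile(r"(https?://\S*)\n(\S*)")
URL_RE = re.compile(r"https?://\S+")


def pre_process_text(text,
                     fix_url=False, remove_url=False, remove_newline=False,
                     decontract_words=False, lower_text=False,
                     remove_numbers=False, numbers_replacement="",
                     remove_punctuation=False, punctuation_replacement="",
                     remove_stop_words=False, stem=False, lemmatize=False):
    if fix_url or remove_url:
        # re-join URLs split by newline characters (Examples #1 and #3)
        while BROKEN_URL_RE.search(text):
            text = BROKEN_URL_RE.sub(r"\1\2", text)
    if remove_url:
        text = URL_RE.sub("", text)
    if remove_newline:
        # newlines become spaces; words split by a newline are NOT re-joined
        text = text.replace("\n", " ")
    if decontract_words:
        # special cases first, then the generic suffix rules (Example #15)
        text = re.sub(r"won't", "will not", text)
        text = re.sub(r"can't", "can not", text)
        text = re.sub(r"n't", " not", text)
        text = re.sub(r"'re", " are", text)
        text = re.sub(r"'s", " is", text)
        text = re.sub(r"'d", " would", text)
        text = re.sub(r"'ll", " will", text)
        text = re.sub(r"'ve", " have", text)
        text = re.sub(r"'m", " am", text)
    if lower_text:
        text = text.lower()
    if remove_numbers:
        text = re.sub(r"\d", numbers_replacement, text)
    if remove_punctuation:
        text = re.sub(f"[{re.escape(string.punctuation)}]",
                      punctuation_replacement, text)
    if remove_stop_words:
        stop = set(stopwords.words("english"))
        text = " ".join(t for t in nltk.word_tokenize(text)
                        if t.lower() not in stop)
    if stem:
        stemmer = PorterStemmer()
        text = " ".join(stemmer.stem(t) for t in nltk.word_tokenize(text))
    if lemmatize:
        lemmatizer = WordNetLemmatizer()
        text = " ".join(lemmatizer.lemmatize(t)
                        for t in nltk.word_tokenize(text))
    # collapse runs of spaces/tabs and trim the ends (always applied)
    return re.sub(r"[ \t]+", " ", text).strip()

Note that the generic "'s" -> " is" rule would also rewrite possessives; the tests never exercise that case, so the real implementation may handle it more carefully.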