Example #1
def test_gibberish_classification():
    assert classify_gibberish("This is code: <pre>code</pre>", "stackoverflow.com") \
        == classify_gibberish("This is code:", "superuser.com")
    assert classify_gibberish("", "stackoverflow.com") == (False, 1)
    assert classify_gibberish("asaaasaadsapgoeaaaaafallppppp", "stackoverflow.com")[0] is True
    assert classify_gibberish("Try this\n<pre><code>some code here</code></pre>", "stackoverflow.com") \
        == (False, 1)
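The tests above pin down the interface: classify_gibberish(text, site) returns a (classified, score) tuple, code blocks are ignored before scoring, and empty or code-only bodies come back as (False, 1). The stub below is a minimal sketch of a function with that contract; the 20-character cutoff and the run-length heuristic are illustrative assumptions only, not the project's actual scoring logic.

import re

def classify_gibberish(text, site):
    # site is accepted only to match the call signature used in the examples.
    # Code blocks are not classified: drop <pre>/<code> sections, then any
    # remaining HTML tags.
    text = re.sub(r"<pre>.*?</pre>|<code>.*?</code>", "", text, flags=re.DOTALL)
    text = re.sub(r"<[^>]+>", "", text).strip()
    # Assumption: empty or very short bodies are not scored at all.
    if len(text) < 20:
        return False, 1
    # Placeholder heuristic: long unbroken character runs score high.
    longest_word = max(len(word) for word in text.split())
    score = min(99, longest_word * 4)
    return score >= 65, score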
Example #2
    def make_api_call_for_site(self, site):
        posts = self.queue.pop(site)
        if site == "stackoverflow.com":
            # Not all SO questions are shown in the realtime feed. We now
            # fetch all recently modified SO questions to work around that.
            min_query = ""
            if self.last_activity_date != 0:
                min_query = "&min=" + str(self.last_activity_date)
                pagesize = "50"
            else:
                pagesize = "25"
            url = "http://api.stackexchange.com/2.2/questions?site=stackoverflow&filter=!4y_-sca-)pfAwlmP_1FxC6e5yzutRIcQvonAiP&key=IAkbitmze4B8KpacUfLqkw((&pagesize=" + pagesize + min_query
        else:
            url = "http://api.stackexchange.com/2.2/questions/" + ";".join(
                str(x) for x in posts
            ) + "?site=" + site + "&filter=!4y_-sca-)pfAwlmP_1FxC6e5yzutRIcQvonAiP&key=IAkbitmze4B8KpacUfLqkw(("
        # wait to make sure API has/updates post data
        time.sleep(60)
        try:
            response = requests.get(url, timeout=20).json()
        except requests.exceptions.Timeout:
            return  # could add some retrying logic here, but eh.

        if "quota_remaining" in response:
            GlobalVars.apiquota = response["quota_remaining"]
        else:
            GlobalVars.apiquota = 0
            return

        if site == "stackoverflow.com":
            if len(response["items"]) > 0 and "last_activity_date" in response["items"][0]:
                self.last_activity_date = response["items"][0]["last_activity_date"]

        for post in response["items"]:
            if "title" not in post or "body" not in post:
                continue
            title = GlobalVars.parser.unescape(post["title"])
            body = GlobalVars.parser.unescape(post["body"])
            link = post["link"]
            try:
                owner_name = GlobalVars.parser.unescape(
                    post["owner"]["display_name"])
                owner_link = post["owner"]["link"]
                owner_rep = post["owner"]["reputation"]
            except:
                owner_name = ""
                owner_link = ""
                owner_rep = 0
            q_id = str(post["question_id"])

            if owner_rep <= 50:
                is_spam, reason, why = check_if_spam(title, body, owner_name,
                                                     owner_link, site, q_id,
                                                     False, False)
                if is_spam:
                    try:
                        handle_spam(title, body, owner_name, site, link,
                                    owner_link, q_id, reason, False, why)
                    except:
                        print "NOP"

            classified, gibberish_score = classify_gibberish(body, site)
            if classified and gibberish_score >= 65:
                GlobalVars.bayesian_testroom.send_message(
                    "[ SmokeDetector | GibberishClassifierBeta ] "
                    u"Potential gibberish body ({}%): [{}]({}) on `{}`".format(
                        gibberish_score, title, link, site))
            try:
                for answer in post["answers"]:
                    answer_title = ""
                    body = answer["body"]
                    print "got answer from owner with name " + owner_name
                    link = answer["link"]
                    a_id = str(answer["answer_id"])
                    try:
                        owner_name = GlobalVars.parser.unescape(
                            answer["owner"]["display_name"])
                        owner_link = answer["owner"]["link"]
                        owner_rep = answer["owner"]["reputation"]
                    except:
                        owner_name = ""
                        owner_link = ""
                        owner_rep = 0

                    if owner_rep <= 50:
                        is_spam, reason, why = check_if_spam(
                            answer_title, body, owner_name, owner_link, site,
                            a_id, True, False)
                        if is_spam:
                            try:
                                handle_spam(title, body, owner_name, site,
                                            link, owner_link, a_id, reason,
                                            True, why)
                            except:
                                print "NOP"

                    classified, gibberish_score = classify_gibberish(
                        body, site)
                    if classified and gibberish_score >= 65:
                        GlobalVars.bayesian_testroom.send_message(
                            "[ SmokeDetector | GibberishClassifierBeta ] "
                            u"Potential gibberish answer ({}%): [{}]({}) on `{}`"
                            .format(gibberish_score, title, link, site))
            except:
                print "no answers"
        return
Example #3
    def make_api_call_for_site(self, site):
        posts = self.queue.pop(site)
        if site == "stackoverflow.com":
            # Not all SO questions are shown in the realtime feed. We now
            # fetch all recently modified SO questions to work around that.
            min_query = ""
            if self.last_activity_date != 0:
                min_query = "&min=" + str(self.last_activity_date)
                pagesize = "50"
            else:
                pagesize = "25"
            url = (
                "http://api.stackexchange.com/2.2/questions?site=stackoverflow&filter=!4y_-sca-)pfAwlmP_1FxC6e5yzutRIcQvonAiP&key=IAkbitmze4B8KpacUfLqkw((&pagesize="
                + pagesize
                + min_query
            )
        else:
            url = (
                "http://api.stackexchange.com/2.2/questions/"
                + ";".join(str(x) for x in posts)
                + "?site="
                + site
                + "&filter=!4y_-sca-)pfAwlmP_1FxC6e5yzutRIcQvonAiP&key=IAkbitmze4B8KpacUfLqkw(("
            )
        # wait to make sure API has/updates post data
        time.sleep(60)
        try:
            response = requests.get(url, timeout=20).json()
        except requests.exceptions.Timeout:
            return  # could add some retrying logic here, but eh.

        if "quota_remaining" in response:
            GlobalVars.apiquota = response["quota_remaining"]
        else:
            GlobalVars.apiquota = 0
            return

        if site == "stackoverflow.com":
            if len(response["items"]) > 0 and "last_activity_date" in response["items"][0]:
                self.last_activity_date = response["items"][0]["last_activity_date"]

        for post in response["items"]:
            if "title" not in post or "body" not in post:
                continue
            title = GlobalVars.parser.unescape(post["title"])
            body = GlobalVars.parser.unescape(post["body"])
            link = post["link"]
            try:
                owner_name = GlobalVars.parser.unescape(post["owner"]["display_name"])
                owner_link = post["owner"]["link"]
                owner_rep = post["owner"]["reputation"]
            except:
                owner_name = ""
                owner_link = ""
                owner_rep = 0
            q_id = str(post["question_id"])

            if owner_rep <= 50:
                is_spam, reason, why = check_if_spam(title, body, owner_name, owner_link, site, q_id, False, False)
                if is_spam:
                    try:
                        handle_spam(title, body, owner_name, site, link, owner_link, q_id, reason, False, why)
                    except:
                        print "NOP"

            classified, gibberish_score = classify_gibberish(body, site)
            if classified and gibberish_score >= 65:
                GlobalVars.bayesian_testroom.send_message(
                    "[ SmokeDetector | GibberishClassifierBeta ] "
                    u"Potential gibberish body ({}%): [{}]({}) on `{}`".format(gibberish_score, title, link, site)
                )
            try:
                for answer in post["answers"]:
                    answer_title = ""
                    body = answer["body"]
                    print "got answer from owner with name " + owner_name
                    link = answer["link"]
                    a_id = str(answer["answer_id"])
                    try:
                        owner_name = GlobalVars.parser.unescape(answer["owner"]["display_name"])
                        owner_link = answer["owner"]["link"]
                        owner_rep = answer["owner"]["reputation"]
                    except:
                        owner_name = ""
                        owner_link = ""
                        owner_rep = 0

                    if owner_rep <= 50:
                        is_spam, reason, why = check_if_spam(
                            answer_title, body, owner_name, owner_link, site, a_id, True, False
                        )
                        if is_spam:
                            try:
                                handle_spam(title, body, owner_name, site, link, owner_link, a_id, reason, True, why)
                            except:
                                print "NOP"

                    classified, gibberish_score = classify_gibberish(body, site)
                    if classified and gibberish_score >= 65:
                        GlobalVars.bayesian_testroom.send_message(
                            "[ SmokeDetector | GibberishClassifierBeta ] "
                            u"Potential gibberish answer ({}%): [{}]({}) on `{}`".format(
                                gibberish_score, title, link, site
                            )
                        )
            except:
                print "no answers"
        return
Example #4
    def make_api_call_for_site(self, site):
        posts = self.queue.pop(site)
        url = "http://api.stackexchange.com/2.2/questions/" + ";".join(str(x) for x in posts) + "?site=" + site + "&filter=!4y_-sca-)pfAwlmP_1FxC6e5yzutRIcQvonAiP&key=IAkbitmze4B8KpacUfLqkw(("
        # wait to make sure API has/updates post data
        time.sleep(60)
        try:
            response = requests.get(url, timeout=20).json()
        except requests.exceptions.Timeout:
            return  # could add some retrying logic here, but eh.

        if "quota_remaining" in response:
            GlobalVars.apiquota = response["quota_remaining"]
        else:
            GlobalVars.apiquota = 0
            return

        for post in response["items"]:
            if "title" not in post or "body" not in post:
                continue
            title = GlobalVars.parser.unescape(post["title"])
            body = GlobalVars.parser.unescape(post["body"])
            link = post["link"]
            try:
                owner_name = GlobalVars.parser.unescape(post["owner"]["display_name"])
                owner_link = post["owner"]["link"]
                owner_rep = post["owner"]["reputation"]
            except:
                owner_name = ""
                owner_link = ""
                owner_rep = 0
            q_id = str(post["question_id"])

            is_spam, reason = check_if_spam(title, body, owner_name, owner_link, site, q_id, False, False)
            if owner_rep <= 50 and is_spam:
                try:
                    handle_spam(title, owner_name, site, link, owner_link, q_id, reason, False)
                except:
                    print "NOP"

            classified, gibberish_score = classify_gibberish(body, site)
            if classified and gibberish_score >= 65:
                GlobalVars.bayesian_testroom.send_message(
                    "[ SmokeDetector | GibberishClassifierBeta ] "
                    "Potential gibberish body (%s%%): [%s](%s) on `%s`"
                    % (gibberish_score, title, link, site)
                )
            try:
                for answer in post["answers"]:
                    answer_title = ""
                    body = answer["body"]
                    print "got answer from owner with name " + owner_name
                    link = answer["link"]
                    a_id = str(answer["answer_id"])
                    try:
                        owner_name = GlobalVars.parser.unescape(answer["owner"]["display_name"])
                        owner_link = answer["owner"]["link"]
                        owner_rep = answer["owner"]["reputation"]
                    except:
                        owner_name = ""
                        owner_link = ""
                        owner_rep = 0

                    is_spam, reason = check_if_spam(answer_title, body, owner_name, owner_link, site, a_id, True, False)
                    if owner_rep <= 50 and is_spam:
                        try:
                            handle_spam(title, owner_name, site, link, owner_link, a_id, reason, True)
                        except:
                            print "NOP"

                    classified, gibberish_score = classify_gibberish(body, site)
                    if classified and gibberish_score >= 65:
                        GlobalVars.bayesian_testroom.send_message(
                            "[ SmokeDetector | GibberishClassifierBeta ] "
                            "Potential gibberish answer (%s%%): [%s](%s) on `%s`"
                            % (gibberish_score, title, link, site)
                        )
            except:
                print "no answers"
        return
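For reference, the make_api_call_for_site examples all iterate over a Stack Exchange /2.2/questions response of roughly the shape sketched below. Only the key names are taken from the code above; every concrete value here is invented for illustration.

# Minimal shape of the JSON the loops above consume (values are made up).
response = {
    "quota_remaining": 9500,
    "items": [
        {
            "question_id": 12345678,
            "title": "Example question title",
            "body": "<p>Example question body</p>",
            "link": "https://stackoverflow.com/q/12345678",
            "last_activity_date": 1400000000,
            "owner": {
                "display_name": "example user",
                "link": "https://stackoverflow.com/users/1",
                "reputation": 1,
            },
            "answers": [
                {
                    "answer_id": 87654321,
                    "body": "<p>Example answer body</p>",
                    "link": "https://stackoverflow.com/a/87654321",
                    "owner": {
                        "display_name": "example user",
                        "link": "https://stackoverflow.com/users/2",
                        "reputation": 1,
                    },
                },
            ],
        },
    ],
}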