def add_to_db(adder, comments, verified, verified_user_id, is_rude):
    """Queue a ``SiteComment`` on ``adder`` for every parsed row not stored yet.

    Each item of ``comments`` is turned into constructor parameters by
    ``CSVDataUploader.make_site_comment_params`` together with the shared
    ``verified``, ``verified_user_id`` and ``is_rude`` values. Rows whose
    ``comment_id`` is reported by ``SiteComment.is_exist`` are skipped.
    """
    for row in comments:
        comment_params = CSVDataUploader.make_site_comment_params(
            row, verified, verified_user_id, is_rude)
        already_stored = SiteComment.is_exist(
            adder, comment_params.get('comment_id'))
        if not already_stored:
            adder.add(SiteComment(comment_params))
Пример #2
0
def _count_comments_with_words(comments, vocabulary):
    """Count comments containing at least one token from ``vocabulary``.

    Returns a ``(matched, last_comment)`` tuple: ``matched`` is the number of
    comments whose space-separated ``processed_body`` has a token found in
    ``vocabulary``; ``last_comment`` is the final comment iterated, or
    ``None`` when ``comments`` is empty.
    """
    matched = 0
    last_comment = None
    for last_comment in comments:
        tokens = last_comment.processed_body.split(u' ')
        if any(token in vocabulary for token in tokens):
            matched += 1
    return matched, last_comment


def play():
    """Print how many rude and normal comments contain a Wiktionary word.

    A comment is counted when at least one space-separated token of its
    ``processed_body`` is found in ``WiktionaryOrg.humiliation_words()``.
    The summary line also shows the body of the last comment iterated
    (the last normal comment, else the last rude one), or ``None`` when
    there are no comments at all.
    """
    rude_comments = SiteComment.rude_comments()
    normal_comments = SiteComment.normal_comments()
    rude_words_wiki = WiktionaryOrg.humiliation_words()

    total_rude, last_rude = _count_comments_with_words(
        rude_comments, rude_words_wiki)
    total_normal, last_normal = _count_comments_with_words(
        normal_comments, rude_words_wiki)

    # The summary used to read the leaked loop variable, which is unbound
    # (UnboundLocalError) when both comment lists are empty.
    last_comment = last_normal if last_normal is not None else last_rude
    last_body = last_comment.body if last_comment is not None else None

    print("Total rude %s, total normal %s, comment [%s]" %
          (str(total_rude), str(total_normal), str(last_body)))
Пример #3
0
def api_features():
    """Return JSON with one manual feature's value for every labelled comment.

    The feature is selected by the ``x`` query parameter. The request is
    answered with 404 when ``g.user`` is None, when ``x`` is missing,
    negative or not an integer, or when the current model is not logistic
    regression.

    The response carries ``x_name`` (the feature description) plus
    ``positive`` and ``negative`` lists of ``{"x": value, "label": class}``
    items built from the rude and normal comments respectively.
    """
    if g.user is None:
        abort(404)

    try:
        x = int(request.args.get("x", -1))
    except (TypeError, ValueError):
        # A non-numeric value is client error, not a server failure: fall
        # through to the same 404 used for a missing/negative index.
        x = -1
    if x < 0:
        abort(404)

    if CURRENT_MODEL != MODEL_LOGISITIC_REGRESSION:
        abort(404)

    rude_comments = SiteComment.rude_comments()
    normal_comments = SiteComment.normal_comments()

    def get_data(comments, feature, label):
        """Build the ``{"x", "label"}`` points for ``comments``."""
        return [
            {
                "x": SiteCommentFeatures.manual_feature_value(
                    comment, feature),
                "label": label,
            }
            for comment in comments
        ]

    positive_data = get_data(rude_comments, x, SiteCommentFeatures.RUDE_CLASS)
    negative_data = get_data(normal_comments, x,
                             SiteCommentFeatures.NORMAL_CLASS)

    return jsonify(
        x_name=SiteCommentFeatures.feature_desc(x),
        positive=positive_data,
        negative=negative_data)
Пример #4
0
def analyse_with_bayes_classifier():
    """Train a ``BinaryNaiveBayesClassifier`` on the labelled comments.

    The classifier is trained on the rude and normal comments returned by
    ``SiteComment``, prints its parameters, and is then returned.
    """
    rude_set, normal_set = (SiteComment.rude_comments(),
                            SiteComment.normal_comments())

    bayes = BinaryNaiveBayesClassifier(True)
    bayes.train(rude_set, normal_set)
    bayes.print_params()
    return bayes
Пример #5
0
def actions_verify(comment_id):
    """Record a moderator's verdict for the comment ``comment_id``.

    The verdict comes from the ``is_rude`` query parameter, which is
    lower-cased and parsed as JSON (so ``True``/``true`` and
    ``False``/``false`` are accepted).

    Answers 404 when the current user is not a moderator, when ``is_rude``
    is missing or is not valid JSON, or when the comment does not exist.
    On success the comment is marked verified now by the current user, its
    ``skipped`` mark is cleared, and a JSON status object is returned.
    """
    if g.user is None or g.user.role != "moderator":
        abort(404)

    raw_is_rude = request.args.get("is_rude", None)
    if raw_is_rude is None:
        abort(404)

    try:
        is_rude = json.loads(raw_is_rude.lower())
    except ValueError:
        # json.JSONDecodeError subclasses ValueError; malformed input is a
        # bad request, not a server error.
        abort(404)

    comment = SiteComment.by_comment_id(comment_id)
    if comment is None:
        abort(404)

    adder = DBModelAdder()
    adder.start()

    comment.verified = datetime.datetime.now()
    comment.is_rude = is_rude
    comment.verified_user_id = g.user.user_id
    comment.skipped = None

    adder.add(comment)
    adder.done()

    resp = {
        "status": True,
        "msg": "OK",
        "comment_id": comment_id,
        "is_rude": is_rude,
        "verified_user_id": g.user.user_id
    }

    return jsonify(**resp)
Пример #6
0
def analyse_comments(analysed_at=None):
    """Classify pending comments with the current model and store the result.

    ``analysed_at`` is passed straight to
    ``SiteComment.comments_for_analysis`` to select the comments. Each one
    gets ``analysed`` set to the current time and ``looks_rude`` set to the
    classifier's answer. When no classifier is available for
    ``CURRENT_MODEL`` a message is printed and nothing is changed.
    """
    model = (analyse_with_logistic_regression()
             if CURRENT_MODEL == MODEL_LOGISITIC_REGRESSION else None)

    if model is None:
        print("Classifier is not set up. Set up classifier first.")
        return

    print("Model is ready. Starting analysis...")
    writer = DBModelAdder()
    writer.start()

    pending = SiteComment.comments_for_analysis(analysed_at)
    rude_looking = 0
    for item in pending:
        item.analysed = datetime.datetime.now()
        item.looks_rude = model.classify_rude(item)
        writer.add(item)
        rude_looking += 1 if item.looks_rude else 0

    writer.done()

    summary = "Analysis was done for %s comments, %s suspected to be rude."
    print(summary % (len(pending), rude_looking))
Пример #7
0
def load_comments_from_se_to_db():
    """Fetch comments newer than the last stored one and add them to the DB.

    Post details are requested in pages of 20 post ids and merged into one
    mapping keyed by post id. Comments already present in the DB, and
    comments whose post details were not returned, are skipped.
    """
    def make_site_comment_params(comment, info):
        """Merge a comment tuple and its post-info tuple into model params."""
        comment_id, post_id, body, creation_date, author_id, author_name = comment
        question_id, answer_id, post_author_id, post_author_name, score, title, post_creation_date = info

        return {
            "comment_id": comment_id,
            "question_id": question_id,
            "answer_id": answer_id,
            "post_author_id": post_author_id,
            "post_score": score,
            "title": title,
            "body": body,
            "processed_body": process_text(body),
            "creation_date": creation_date,
            "author_id": author_id,
            "author_name": author_name,
            "verified": None,
            "is_rude": False,
            "diff_with_post":
            (creation_date - post_creation_date).total_seconds()
        }

    # NOTE(review): if last_comment() can return None (e.g. empty table)
    # the attribute access below raises - confirm against the model.
    last_one = SiteComment.last_comment()
    comments = get_recent_comments(last_one.creation_date)

    # Several comments may belong to the same post; request each post once.
    post_ids = list(dict.fromkeys(comment[1] for comment in comments))
    page_size = 20
    infos = dict()
    for start in range(0, len(post_ids), page_size):
        infos.update(get_post_infos(post_ids[start:start + page_size]))

    adder = DBModelAdder()
    adder.start()

    for comment in comments:
        if SiteComment.is_exist(adder, comment[0]):
            continue
        info = infos.get(comment[1])
        if info is None:
            # No details came back for this post; unpacking None would raise
            # TypeError and abort the whole batch before adder.done().
            continue
        adder.add(SiteComment(make_site_comment_params(comment, info)))

    adder.done()
Пример #8
0
def create_model():
    """Train, evaluate and persist a logistic-regression classifier.

    Only runs when ``CURRENT_MODEL`` is the logistic-regression model;
    otherwise a hint is printed and nothing is stored. The feature maker's
    data and the classifier's data (with its evaluation figures) are saved
    as two ``JSONObjectData`` rows.

    NOTE(review): the rate computations divide by ``rude_total`` and
    ``normal_total`` and raise ZeroDivisionError when either is zero.
    """
    if CURRENT_MODEL != MODEL_LOGISITIC_REGRESSION:
        print("Please specify a model to create first.")
        return

    features = SiteCommentFeatureList(SiteComment.rude_comments(),
                                      SiteComment.normal_comments())
    maker = features.maker()

    model = LogisticRegaression(features, maker, True)
    model.train()
    rude_total, rude_right, normal_total, normal_right = model.test(True)

    # Per-class hit rates, each weighted by that class's share of correct
    # answers among all tested objects.
    tpr = float(rude_right) / float(rude_total)
    tnr = float(normal_right) / float(normal_total)
    total_objects = float(rude_total + normal_total)
    rude_weight = rude_right / total_objects
    normal_weight = normal_right / total_objects
    acc = rude_weight * tpr + normal_weight * tnr

    report = "Accuracy: %s, rude: %s (%s), normal: %s (%s) "
    print(report % (acc, rude_right, rude_total, normal_right, normal_total))

    storage = DBModelAdder()
    storage.start()

    stored_features = JSONObjectData(JSONObjectData.FEATURE_TYPE_ID,
                                     json.dumps(maker.store()))
    storage.add(stored_features)

    model_payload = model.store()
    evaluation = {
        "acc": acc,
        "rude_right": rude_right,
        "rude_total": rude_total,
        "normal_right": normal_right,
        "normal_total": normal_total
    }
    stored_model = JSONObjectData(JSONObjectData.LOGREG_TYPE_ID,
                                  json.dumps(model_payload),
                                  json.dumps(evaluation))
    storage.add(stored_model)

    storage.done()
    print("A new logistic regression classifier was added to the DB.")
Пример #9
0
def verifying():
    """Render the page of unverified comments for moderators.

    Users who are not moderators are redirected to ``index``. The ``page``
    query parameter selects the page; missing, non-numeric or values below
    1 all fall back to page 1.
    """
    if g.user is None or g.user.role != "moderator":
        return redirect(url_for('index'))

    try:
        requested_page = int(request.args.get("page", 1))
    except (TypeError, ValueError):
        # A garbage page value should show the first page, not a 500.
        requested_page = 1
    page = max(requested_page, 1)

    paginator = SiteComment.paginate_unverified(page)
    return render_template('index.html',
                           paginator=paginator,
                           base_url=url_for("verifying"),
                           so_url=SO_URL,
                           active_tab="verifying")
Пример #10
0
def analyse_with_cosine():
    """Print the biggest cosine-similarity cluster among rude comments.

    Rude comments and the comments awaiting analysis are both wrapped in
    ``Document`` objects created with one shared ``DocsStats``; IDFs are
    calculated and documents vectorised before the rude documents are
    clustered.
    """
    stats = DocsStats()

    def as_document(comment):
        """Wrap ``comment`` in a ``Document`` tied to the shared stats."""
        return Document(stats, comment.id, comment.body,
                        comment.processed_body)

    rude_docs = [as_document(c) for c in SiteComment.rude_comments()]

    # These documents are not used directly below; they are still built
    # because each one is constructed with the shared ``stats`` object.
    unverified_docs = [
        as_document(c) for c in SiteComment.comments_for_analysis()
    ]

    stats.calculate_idfs()
    stats.vectorise_documents()

    similarity = CosineSimilarity(rude_docs)
    for member in similarity.biggest_cluster():
        print("- ", member.body, "\r\n")
Пример #11
0
def comment_feed():
    """Serve the Atom feed of comments analysed as rude.

    The number of entries comes from the session's ``limit`` value
    (default 30) and is capped at 1000. ``last_update`` is the ``analysed``
    time of the first entry, or the current time when there are none.
    """
    requested = int(session.get("limit", 30))
    limit = min(requested, 1000)
    entries = SiteComment.analysed_as_rude(limit)

    if len(entries) > 0:
        last_update = entries[0].analysed
    else:
        last_update = datetime.datetime.now()

    feed_xml = render_template('feed_proto.xml',
                               app_url=APP_URL,
                               app_title=FEED_APP_TITLE,
                               so_url=SO_URL,
                               last_update=last_update,
                               entries=entries)
    response = make_response(feed_xml)
    response.headers['Content-type'] = 'application/atom+xml; charset=utf-8'
    return response
Пример #12
0
def dump_verified_comments():
    """Dump rude, normal and skipped comments to CSV files under ``./dump``.

    The directory is removed first if it exists and then recreated, so
    every run starts from an empty dump. Three files are written:
    ``rude_comments.csv``, ``normal_comments.csv`` and
    ``skipped_comments.csv``.
    """
    directory = "./dump"

    if os.path.exists(directory):
        shutil.rmtree(directory)
    os.makedirs(directory)

    def date_or_none(field):
        """Format a datetime as ``YYYY-MM-DD HH:MM:SS``; pass None through."""
        return field.strftime(
            "%Y-%m-%d %H:%M:%S") if field is not None else None

    def dump_to(filename, comments):
        """Write one CSV row per comment to ``filename``."""
        # newline='' lets the csv writer control line endings itself, so the
        # explicit '\n' terminator and any newlines embedded in comment
        # bodies are not rewritten by the text layer.
        with open(filename, 'w', encoding="utf8", newline='') as csvfile:
            writer = csv.writer(csvfile, lineterminator='\n', delimiter=',')
            for comment in comments:
                writer.writerow([
                    comment.comment_id, comment.question_id, comment.answer_id,
                    comment.post_author_id, comment.post_score, comment.title,
                    comment.body,
                    date_or_none(comment.creation_date), comment.author_id,
                    comment.author_name, comment.diff_with_post,
                    date_or_none(comment.verified), comment.is_rude,
                    comment.verified_user_id,
                    date_or_none(comment.added),
                    date_or_none(comment.analysed), comment.looks_rude,
                    date_or_none(comment.skipped)
                ])

    rude_comments = SiteComment.rude_comments()
    dump_to(directory + "/rude_comments.csv", rude_comments)

    normal_comments = SiteComment.normal_comments()
    dump_to(directory + "/normal_comments.csv", normal_comments)

    skipped_comments = SiteComment.skipped_comments()
    dump_to(directory + "/skipped_comments.csv", skipped_comments)
Пример #13
0
def actions_skipp(comment_id):
    """Mark the comment ``comment_id`` as skipped by a moderator.

    Answers 404 when the current user is not a moderator or the comment
    does not exist. Otherwise ``skipped`` is set to the current time,
    ``verified`` is cleared, ``verified_user_id`` is set to -1, and a JSON
    status object is returned.
    """
    is_moderator = g.user is not None and g.user.role == "moderator"
    if not is_moderator:
        abort(404)

    target = SiteComment.by_comment_id(comment_id)
    if target is None:
        abort(404)

    writer = DBModelAdder()
    writer.start()

    target.skipped = datetime.datetime.now()
    target.verified = None
    target.verified_user_id = -1

    writer.add(target)
    writer.done()

    return jsonify(status=True, msg="OK", comment_id=comment_id)
Пример #14
0
def check_to_rebuild():
    """Rebuild the model when none is stored or enough new labels exist.

    A rebuild (``create_model`` followed by ``analyse_comments`` with the
    current time) happens when either stored JSON object is missing, or
    when ``SiteComment.verified_after`` for the stored model's ``added``
    time reaches ``REBUILD_MODEL_THRESHOLD``. Otherwise only a status
    message is printed.
    """
    def rebuild(reason):
        """Announce ``reason``, then recreate the model and re-analyse."""
        print(reason)
        create_model()
        print("Now, do analysis for previous comments with the new model...")
        analyse_comments(datetime.datetime.now())

    model_record = JSONObjectData.last(JSONObjectData.LOGREG_TYPE_ID)
    feature_record = JSONObjectData.last(JSONObjectData.FEATURE_TYPE_ID)
    if model_record is None or feature_record is None:
        rebuild("There are no saved data. Starting rebuilding...")
        return

    unseen = SiteComment.verified_after(model_record.added)
    print(
        "There are currently %s comments which the model has not seen. The threshold is %s"
        % (unseen, REBUILD_MODEL_THRESHOLD))

    if unseen < REBUILD_MODEL_THRESHOLD:
        print("No reason to rebuild. We will wait a bit more.")
        return

    rebuild("We are above the threshold. Starting rebuilding...")
 def add(adder, comments):
     """Queue a ``SiteComment`` on ``adder`` for each new params mapping.

     Items whose ``comment_id`` is reported by ``SiteComment.is_exist``
     are left out.
     """
     for params in comments:
         known = SiteComment.is_exist(adder, params.get("comment_id"))
         if not known:
             adder.add(SiteComment(params))