import csv
import json
import os
import shutil

from flask import abort, g, jsonify, request

# Project-level names (SiteComment, SiteCommentFeatures, SiteCommentFeatureList,
# BinaryNaiveBayesClassifier, LogisticRegaression, DBModelAdder, JSONObjectData,
# DocsStats, Document, CosineSimilarity, WiktionaryOrg, CURRENT_MODEL,
# MODEL_LOGISITIC_REGRESSION) are assumed to be imported from the app's own modules.


def api_features():
    # Only authenticated users may query feature data.
    if g.user is None:
        abort(404)

    # Reject missing or non-numeric feature indices.
    try:
        x = int(request.args.get("x", -1))
    except ValueError:
        abort(404)
    if x < 0:
        abort(404)

    if CURRENT_MODEL != MODEL_LOGISITIC_REGRESSION:
        abort(404)

    rude_comments = SiteComment.rude_comments()
    normal_comments = SiteComment.normal_comments()

    def get_data(comments, feature, label):
        data = list()
        for comment in comments:
            feature_value = SiteCommentFeatures.manual_feature_value(comment, feature)
            data.append({"x": feature_value, "label": label})
        return data

    positive_data = get_data(rude_comments, x, SiteCommentFeatures.RUDE_CLASS)
    negative_data = get_data(normal_comments, x, SiteCommentFeatures.NORMAL_CLASS)

    return jsonify(**{
        "x_name": SiteCommentFeatures.feature_desc(x),
        "positive": positive_data,
        "negative": negative_data
    })
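# Hypothetical wiring for the view above, assuming a Flask app object named
# `app` exists elsewhere in the project:
#
#     app.add_url_rule("/api/features", "api_features", api_features)
#
# A request such as GET /api/features?x=2 would then return JSON of the form
# {"x_name": ..., "positive": [{"x": ..., "label": ...}, ...], "negative": [...]}.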
def play():
    rude_comments = SiteComment.rude_comments()
    normal_comments = SiteComment.normal_comments()
    rude_words_wiki = WiktionaryOrg.humiliation_words()

    total_rude = 0
    total_normal = 0

    # Count rude comments containing at least one known humiliation word.
    for comment in rude_comments:
        if not comment.processed_body:
            continue
        words = [word for word in comment.processed_body.split(u' ')
                 if word in rude_words_wiki]
        if len(words) > 0:
            total_rude += 1
            # print("[Words: %s] ([%s]) || %s\r\n" % (
            #     str(u' '.join(words)),
            #     str(u' '.join(comment.processed_body.split(u' '))),
            #     str(comment.body)))

    # Do the same for normal comments, to gauge the false-positive rate.
    for comment in normal_comments:
        if not comment.processed_body:
            continue
        words = [word for word in comment.processed_body.split(u' ')
                 if word in rude_words_wiki]
        if len(words) > 0:
            total_normal += 1
            # print("[Words: %s] || %s\r\n" % (str(words), str(comment.body)))

    print("Total rude %s, total normal %s" % (str(total_rude), str(total_normal)))
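# A minimal standalone sketch of the word-overlap test used in play(), assuming
# rude_words is a set of lowercase tokens; the helper name is hypothetical.
def _contains_rude_word(processed_body, rude_words):
    """Return True if any whitespace-separated token appears in rude_words."""
    return any(word in rude_words for word in processed_body.split(u' '))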
def analyse_with_bayes_classifier():
    rude_comments = SiteComment.rude_comments()
    normal_comments = SiteComment.normal_comments()

    classifier = BinaryNaiveBayesClassifier(True)
    classifier.train(rude_comments, normal_comments)
    classifier.print_params()
    return classifier
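# Illustrative sketch only: this is NOT the project's BinaryNaiveBayesClassifier,
# just the log-space decision rule such a binary classifier typically applies.
# All names below (naive_bayes_decide, log_prior_*, log_likelihood_*) are
# hypothetical.
def naive_bayes_decide(tokens, log_prior_rude, log_prior_normal,
                       log_likelihood_rude, log_likelihood_normal):
    """Return "rude" or "normal", whichever has the higher posterior log-score.

    log_likelihood_* map token -> log P(token | class); tokens unseen during
    training are skipped here, though smoothing is typical in practice.
    """
    score_rude = log_prior_rude + sum(
        log_likelihood_rude[t] for t in tokens if t in log_likelihood_rude)
    score_normal = log_prior_normal + sum(
        log_likelihood_normal[t] for t in tokens if t in log_likelihood_normal)
    return "rude" if score_rude > score_normal else "normal"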
def create_model():
    if CURRENT_MODEL == MODEL_LOGISITIC_REGRESSION:
        feature_list = SiteCommentFeatureList(SiteComment.rude_comments(),
                                              SiteComment.normal_comments())
        feature_maker = feature_list.maker()

        classifier = LogisticRegaression(feature_list, feature_maker, True)
        classifier.train()
        rude_total, rude_right, normal_total, normal_right = classifier.test(True)

        # Per-class recall: true positive rate (rude) and true negative rate (normal).
        tpr = float(rude_right) / float(rude_total)
        tnr = float(normal_right) / float(normal_total)
        total_objects = float(rude_total + normal_total)
        # Recall-weighted accuracy blend used by this project.
        acc = (rude_right / total_objects) * tpr + \
              (normal_right / total_objects) * tnr

        print("Accuracy: %s, rude: %s (%s), normal: %s (%s)" % (
            str(acc), str(rude_right), str(rude_total),
            str(normal_right), str(normal_total)))

        adder = DBModelAdder()
        adder.start()

        # Persist the fitted feature maker and the classifier as JSON blobs.
        feature_data = feature_maker.store()
        json_fd = JSONObjectData(JSONObjectData.FEATURE_TYPE_ID,
                                 json.dumps(feature_data))
        adder.add(json_fd)

        classifier_data = classifier.store()
        classifier_extra = {
            "acc": acc,
            "rude_right": rude_right,
            "rude_total": rude_total,
            "normal_right": normal_right,
            "normal_total": normal_total
        }
        json_cd = JSONObjectData(JSONObjectData.LOGREG_TYPE_ID,
                                 json.dumps(classifier_data),
                                 json.dumps(classifier_extra))
        adder.add(json_cd)

        adder.done()
        print("A new logistic regression classifier was added to the DB.")
    else:
        print("Please specify a model to create first.")
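# For comparison, a standalone sketch of the conventional metric definitions.
# Note that create_model() above uses a recall-weighted accuracy blend rather
# than plain accuracy; the function name below is hypothetical and for
# illustration only.
def classification_metrics(rude_right, rude_total, normal_right, normal_total):
    """Return (accuracy, tpr, tnr) from raw binary-classification counts."""
    tpr = rude_right / float(rude_total)        # recall on the rude class
    tnr = normal_right / float(normal_total)    # recall on the normal class
    accuracy = (rude_right + normal_right) / float(rude_total + normal_total)
    return accuracy, tpr, tnr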
def analyse_with_cosine():
    stats = DocsStats()

    rude_comments = SiteComment.rude_comments()
    rude_docs = list()
    for comment in rude_comments:
        rude_docs.append(
            Document(stats, comment.id, comment.body, comment.processed_body))

    # Unverified documents also register with stats, so they contribute to the
    # IDF statistics even though only the rude docs are clustered below.
    unverified_comments = SiteComment.comments_for_analysis()
    unverified_docs = list()
    for comment in unverified_comments:
        unverified_docs.append(
            Document(stats, comment.id, comment.body, comment.processed_body))

    stats.calculate_idfs()
    stats.vectorise_documents()

    cosine = CosineSimilarity(rude_docs)
    rude_cluster = cosine.biggest_cluster()
    for item in rude_cluster:
        print("- ", item.body, "\r\n")
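# Minimal standalone sketch of the cosine measure that a class like
# CosineSimilarity relies on, assuming documents are represented as sparse
# term -> tf-idf weight dicts; the function name is hypothetical.
import math

def cosine_sim(vec_a, vec_b):
    """Cosine of the angle between two dicts mapping term -> tf-idf weight."""
    dot = sum(weight * vec_b.get(term, 0.0) for term, weight in vec_a.items())
    norm_a = math.sqrt(sum(w * w for w in vec_a.values()))
    norm_b = math.sqrt(sum(w * w for w in vec_b.values()))
    if norm_a == 0.0 or norm_b == 0.0:
        return 0.0
    return dot / (norm_a * norm_b)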
def dump_verified_comments():
    # Recreate the dump directory from scratch on every run.
    directory = "./dump"
    if os.path.exists(directory):
        shutil.rmtree(directory)
    os.makedirs(directory)

    def dump_to(filename, comments):
        def date_or_none(field):
            return field.strftime("%Y-%m-%d %H:%M:%S") if field is not None else None

        with open(filename, 'w', encoding="utf8") as csvfile:
            writer = csv.writer(csvfile, lineterminator='\n', delimiter=',')
            for comment in comments:
                writer.writerow([
                    comment.comment_id, comment.question_id, comment.answer_id,
                    comment.post_author_id, comment.post_score,
                    comment.title, comment.body,
                    date_or_none(comment.creation_date),
                    comment.author_id, comment.author_name,
                    comment.diff_with_post,
                    date_or_none(comment.verified), comment.is_rude,
                    comment.verified_user_id,
                    date_or_none(comment.added), date_or_none(comment.analysed),
                    comment.looks_rude, date_or_none(comment.skipped)
                ])

    rude_comments = SiteComment.rude_comments()
    dump_to(directory + "/rude_comments.csv", rude_comments)

    normal_comments = SiteComment.normal_comments()
    dump_to(directory + "/normal_comments.csv", normal_comments)

    skipped_comments = SiteComment.skipped_comments()
    dump_to(directory + "/skipped_comments.csv", skipped_comments)
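# Hypothetical round-trip check for the dump above: read one CSV back into rows.
# The helper name and usage line are illustrative, not part of the project.
def read_dump(filename):
    """Return the rows of one dumped CSV as lists of strings."""
    with open(filename, newline='', encoding="utf8") as csvfile:
        return [row for row in csv.reader(csvfile, delimiter=',')]

# Example: rows = read_dump("./dump/rude_comments.csv")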