def toxicScore(s):
    result = Detoxify('original').predict(s)
    scores = []
    # Copy the dict so the integer rescaling below does not overwrite
    # the raw float scores returned in `result`.
    metric = dict(result)
    for key in result.keys():
        scores.append(result[key])
        metric[key] = int(round(metric[key] * 1000))
    return max(scores), metric, result
import pytest


def test_packaged_model():
    model = Detoxify("original")
    results = model.predict(toxicity_texts)
    expected = {
        'toxicity': [0.12640126049518585, 0.0008546802564524114],
        'severe_toxicity': [0.00022532008006237447, 0.00011462702241260558],
        'obscene': [0.0018298450158908963, 0.00016588227299507707],
        'threat': [0.0005070280167274177, 0.00013761487207375467],
        'insult': [0.009287197142839432, 0.0001857876923168078],
        'identity_hate': [0.0018323149997740984, 0.00015746793360449374],
    }
    # Model outputs are floats, so compare approximately; exact equality
    # is brittle across platforms and library versions.
    for label, values in expected.items():
        assert results[label] == pytest.approx(values, abs=1e-5)
def get_toxicity(self):
    toxic_count = {}
    # Instantiate the model once; constructing Detoxify inside the loop
    # would reload the weights on every iteration.
    model = Detoxify('original')
    for i in range(30):
        post = self.submissions.loc[i]
        text = str(post['title'] + post['body'])
        label = model.predict(text)
        toxic_count = {
            k: label.get(k, 0) + toxic_count.get(k, 0)
            for k in set(label) | set(toxic_count)
        }
    return toxic_count
def run(model_name, input_obj, dest_file, from_ckpt):
    """Loads the model from a checkpoint or from a model name and runs
    inference on input_obj. Displays results as a pandas DataFrame.
    If dest_file is given, saves the results to a CSV file.
    """
    text = load_input_text(input_obj)
    if model_name is not None:
        res = Detoxify(model_name).predict(text)
    else:
        res = Detoxify(checkpoint=from_ckpt).predict(text)
    res_df = pd.DataFrame(res, index=[text] if isinstance(text, str) else text).round(5)
    print(res_df)
    if dest_file is not None:
        res_df.index.name = "input_text"
        res_df.to_csv(dest_file)
    return res
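# A hypothetical invocation of run() above, assuming load_input_text()
# accepts a raw string; "unbiased" is one of the published Detoxify model
# names, and dest_file=None skips the CSV export.
if __name__ == "__main__":
    run("unbiased", "example input text", dest_file=None, from_ckpt=None)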
def predict_bert(model_name, user_input):
    """Loads the named Detoxify model and runs inference on user_input.
    Prints and returns the results.
    """
    text = [user_input]
    if model_name is not None:
        res = Detoxify(model_name).predict(text)
    # else:
    #     res = Detoxify(checkpoint=from_ckpt).predict(text)
    # `text` is always a list here, so it can be used directly as the index.
    res_df = pd.DataFrame(res, index=text).round(5)
    print(res_df)
    return res
def why_hate():
    text = None
    # Read the raw request body as plain text
    if flask.request.content_type == 'text/plain':
        text = flask.request.data.decode('utf-8')
    else:
        return flask.Response(response='This predictor only supports text data',
                              status=415, mimetype='application/json')
    print('Invoked with: {}'.format(text))

    class_names = ['not hate', 'hate']
    model = Detoxify("original")
    model.model.cpu()

    def predictor(texts):
        # Take the first output column (the toxicity logit) and convert it
        # into [P(not hate), P(hate)] pairs, as LIME expects.
        logits = model.model(**model.tokenizer(texts,
                                               return_tensors="pt",
                                               truncation=True,
                                               padding=True))[0][:, 0]
        score = torch.sigmoid(logits).detach().numpy()
        if isinstance(texts, list) and len(texts) > 1:
            score = score.reshape(-1, 1)
            return np.concatenate([1 - score, score], 1)
        return np.expand_dims(np.array([1 - score, score]), 0).reshape(-1, 2)

    explainer = LimeTextExplainer(class_names=class_names)
    exp = explainer.explain_instance(text, predictor, num_features=20, num_samples=10)
    output = dict(exp.as_list())
    print(output)
    output = flask.jsonify(output)
    del model
    return output
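# A hypothetical client call for the handler above; the route path and port
# are assumptions, since the @app.route decorator is not shown here.
import requests

resp = requests.post(
    "http://127.0.0.1:8080/invocations",
    data="some text to explain",
    headers={"Content-Type": "text/plain"},
)
print(resp.json())  # word-level LIME feature weights as a JSON object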
import datetime
import calendar
import textstat
from detoxify import Detoxify
from sklearn.preprocessing import RobustScaler
import ssl

ssl._create_default_https_context = ssl._create_unverified_context

# each model takes in either a string or a list of strings
week_days = [
    'day_Friday', 'day_Monday', 'day_Saturday', 'day_Sunday',
    'day_Thursday', 'day_Tuesday', 'day_Wednesday'
]

textstat.set_lang("en")
toxicity_model = Detoxify('original')


def subjective(text):
    words = ["i", "my"]
    count = 0
    text = text.lower().split()
    for word in words:
        count += text.count(word)
    return count


def weekday_from_date(date):
    year, month, day = date.split("-")
    day_number = datetime.date(day=int(day), month=int(month),
                               year=int(year)).weekday()
    # Assumed completion of the truncated original: map the weekday index
    # to its name via the `calendar` module imported above.
    return calendar.day_name[day_number]
from flask import Flask, request
from detoxify import Detoxify
import logging

logging.basicConfig(level=logging.INFO)

model_original = 'original'
model_unbiased = 'unbiased'
model_multilingual = 'multilingual'

"""
Set model_name to model_original, model_unbiased or model_multilingual
"""
model_name = model_original

logging.info("Loading " + model_name + " model from Detoxify")
model = Detoxify(model_name)
logging.info(model_name + " model loaded")

app = Flask(__name__)


@app.route('/analyzeRequest', methods=['POST'])
def analyzeRequest():
    body = request.get_json()
    results = model.predict(body["input"])
    results = {k.upper(): str(v) for (k, v) in results.items()}
    return results, 200
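# Example client call for the /analyzeRequest route above, assuming the
# default Flask development server on port 5000.
import requests

resp = requests.post(
    "http://127.0.0.1:5000/analyzeRequest",
    json={"input": "you are a wonderful person"},
)
print(resp.json())  # uppercased label scores, e.g. {"TOXICITY": "...", ...}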
from pprint import pprint
from detoxify import Detoxify
from pandas import DataFrame

if __name__ == '__main__':
    texts = [
        "RT @realDonaldTrump: I was very surprised & disappointed that Senator Joe Manchin of West Virginia voted against me on the Democrat’s total…",
        "RT @realDonaldTrump: Crazy Nancy Pelosi should spend more time in her decaying city and less time on the Impeachment Hoax! https://t.co/eno…",
        "RT @SpeakerPelosi: The House cannot choose our impeachment managers until we know what sort of trial the Senate will conduct. President Tr…",
        "RT @RepAdamSchiff: Lt. Col. Vindman did his job. As a soldier in Iraq, he received a Purple Heart. Then he displayed another rare form o…",
    ]

    # original: bert-base-uncased / Toxic Comment Classification Challenge
    original = Detoxify("original")
    # unbiased: roberta-base / Unintended Bias in Toxicity Classification
    unbiased = Detoxify("unbiased")

    for text in texts:
        print("----------------")
        print(f"TEXT: '{text}'")
        original_results = original.predict(text)
        # original_results["text"] = text
        original_results["model"] = "original"
        unbiased_results = unbiased.predict(text)
        # unbiased_results["text"] = text
df_account = pd.melt(df_account)

# ----------------------- Data cleansing ---------------------------------------

# Characters to remove
spec_chars = ['\n', '\t', '\r']

# Replace the defined characters with a whitespace inside each tweet.
# Note: str.replace substitutes substrings; Series.replace (as in the
# original) would only match values equal to the whole character.
df['tweet'] = df['tweet'].str.strip()
for char in spec_chars:
    df['tweet'] = df['tweet'].str.replace(char, ' ', regex=False)

# Split and re-join each tweet
df['tweet'] = df['tweet'].str.split().str.join(" ")

# ----------------------- Hate speech level prediction -------------------------

# Instance the model
results = Detoxify('multilingual').predict(list(df['tweet']))

# Add the new info to the previous DataFrame
df['toxicity'] = results['toxicity']

# Define a class for each tweet (toxic or non-toxic)
df['class'] = df['toxicity'].apply(lambda toxicity: 'toxic' if toxicity >= 0.5 else 'non-toxic')

racist_words = [
    'africana', 'africano', 'china', 'chino', 'extranjera', 'extranjero',
    'gitana', 'gitano', 'india', 'indigena', 'indio', 'inmigrante',
    'latina', 'latino', 'mantera', 'mantero', 'mena', 'mora', 'moro',
    'negra', 'negrata', 'negro', 'paki', 'panchita', 'panchito',
    'sudaca', 'tiraflecha'
]
import sys
import time
import json

from detoxify import Detoxify

model_name = 'unbiased'
debug = False

if "multilingual" in sys.argv:
    model_name = 'multilingual'
elif "small" in sys.argv:
    model_name += '-small'
if "debug" in sys.argv:
    debug = True

model = Detoxify(model_name)


def now():
    return int(time.time() * 1000)


logfile = open("logs/artemis.log", "a")


def log(message):
    if debug:
        logfile.write(f"{str(now())}:{str.rstrip(message)}\n")
        logfile.flush()
def model(self):
    # Note: this constructs a new Detoxify instance (reloading the weights)
    # on every call; repeated callers may want to cache the result.
    return Detoxify(self.model_name)
from detoxify import Detoxify
from fastapi import FastAPI
from fastapi.responses import JSONResponse
from pydantic import BaseModel


class TextRequest(BaseModel):
    text: str


# Start API
app = FastAPI(
    title="Cricket",
    description="Your personal Jiminy Cricket when posting online."
)

model = Detoxify("original", checkpoint="model.ckpt")


@app.post("/check/")
async def check(r: TextRequest):
    response = model.predict(r.text)
    response_json = {
        "toxicity": float(response["toxicity"]),
        "severe_toxicity": float(response["severe_toxicity"]),
        "obscene": float(response["obscene"]),
        "threat": float(response["threat"]),
        "insult": float(response["insult"]),
        "identity_hate": float(response["identity_hate"]),
    }
    return JSONResponse(content=response_json)
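# A hypothetical client call for the /check/ route above; host and port
# assume a local uvicorn server with default settings.
import requests

resp = requests.post(
    "http://127.0.0.1:8000/check/",
    json={"text": "you are a wonderful person"},
)
print(resp.json())  # {"toxicity": ..., "severe_toxicity": ..., ...}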
def get_model(cls):
    """Get the model object for this instance, loading it if it's not
    already loaded."""
    if cls.model is None:
        cls.model = Detoxify('original')
        cls.model.model.eval()
    return cls.model
from detoxify import Detoxify
import ssl

# Fall back to an unverified HTTPS context so the model weights can be
# downloaded in environments with strict or missing certificates.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

results = Detoxify('original')