Example #1
from detoxify import Detoxify

def toxicScore(s):
    result = Detoxify('original').predict(s)
    scores = []
    # Copy the raw output so the rounding below does not mutate `result` in place
    metric = dict(result)
    for key in result:
        scores.append(result[key])
        metric[key] = int(round(metric[key] * 1000))

    return max(scores), metric, result
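
# A minimal usage sketch of the helper above (the input string is illustrative):
top, metric, raw = toxicScore("example text")
print(top)     # highest raw probability across all categories
print(metric)  # the same scores scaled to integers in [0, 1000]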
Example #2
import pytest
from detoxify import Detoxify

def test_packaged_model():
    # toxicity_texts: a two-element list of sample strings defined elsewhere
    # in the original test module
    model = Detoxify("original")
    results = model.predict(toxicity_texts)
    expected = {
        'toxicity': [0.12640126049518585, 0.0008546802564524114],
        'severe_toxicity': [0.00022532008006237447, 0.00011462702241260558],
        'obscene': [0.0018298450158908963, 0.00016588227299507707],
        'threat': [0.0005070280167274177, 0.00013761487207375467],
        'insult': [0.009287197142839432, 0.0001857876923168078],
        'identity_hate': [0.0018323149997740984, 0.00015746793360449374]
    }
    # Exact float equality is brittle across hardware and library versions;
    # compare with a tolerance instead
    for key, scores in expected.items():
        assert results[key] == pytest.approx(scores, abs=1e-6)
Example #3
    def get_toxicity(self):
        toxic_count = {}
        # Load the model once instead of re-instantiating it on every iteration
        model = Detoxify('original')

        for i in range(30):
            post = self.submissions.loc[i]
            text = str(post['title'] + post['body'])
            label = model.predict(text)
            # Accumulate per-category scores across all posts
            toxic_count = {
                k: label.get(k, 0) + toxic_count.get(k, 0)
                for k in set(label) | set(toxic_count)
            }

        return toxic_count
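
# A hedged usage sketch: averaging the accumulated totals per category.
# `scanner` is a hypothetical instance of the class the method above belongs to.
totals = scanner.get_toxicity()
averages = {k: v / 30 for k, v in totals.items()}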
Example #4
def run(model_name, input_obj, dest_file, from_ckpt):
    """Loads model from checkpoint or from model name and runs inference on the input_obj.
    Displays results as a pandas DataFrame object.
    If a dest_file is given, it saves the results to a txt file.
    """
    text = load_input_text(input_obj)
    if model_name is not None:
        res = Detoxify(model_name).predict(text)
    else:
        res = Detoxify(checkpoint=from_ckpt).predict(text)

    res_df = pd.DataFrame(res, index=[text] if isinstance(text, str) else text).round(5)
    print(res_df)
    if dest_file is not None:
        res_df.index.name = "input_text"
        res_df.to_csv(dest_file)

    return res
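
# A hedged invocation sketch (argument values are illustrative; load_input_text
# is assumed to accept a raw string as well as a file path):
scores = run("original", "example input text", dest_file=None, from_ckpt=None)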
Example #5
def predict_bert(model_name, user_input):
    """Loads the named model and runs inference on user_input.
    Displays the results as a pandas DataFrame and returns the raw scores.
    """
    if model_name is None:
        raise ValueError("model_name must be provided")

    text = [user_input]
    res = Detoxify(model_name).predict(text)

    res_df = pd.DataFrame(res, index=text).round(5)
    print(res_df)
    return res
Example #6
def why_hate():
    text = None

    # Accept only a plain-text request body
    if flask.request.content_type == 'text/plain':
        text = flask.request.data.decode('utf-8')
    else:
        return flask.Response(response='This predictor only supports text data',
                              status=415, mimetype='application/json')

    print('Invoked with: {}'.format(text))


    class_names = ['not hate','hate']
    model = Detoxify("original")
    model.model.cpu()

    def predictor(texts):
        # Score the `toxicity` head only: take the first output column
        logits = model.model(**model.tokenizer(texts, return_tensors="pt",
                                               truncation=True, padding=True))[0][:, 0]
        score = torch.sigmoid(logits).detach().numpy()
        # LIME expects an (n_samples, n_classes) array of probabilities
        if isinstance(texts, list) and len(texts) > 1:
            score = score.reshape(-1, 1)
            scores = np.concatenate([1 - score, score], 1)
            return scores
        else:
            scores = np.expand_dims(np.array([1 - score, score]), 0).reshape(-1, 2)
            return scores

    explainer = LimeTextExplainer(class_names=class_names)

    # num_samples=10 keeps latency low but yields a noisy explanation;
    # LIME's default is 5000 perturbed samples
    exp = explainer.explain_instance(text, predictor, num_features=20, num_samples=10)
    output = dict(exp.as_list())
    print(output)
    output = flask.jsonify(output)
    del model

    return output
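
# A sanity-check sketch of the LIME predictor contract: it must return an
# (n_samples, 2) array of [P(not hate), P(hate)] rows. Assumes `predictor`
# is lifted out of why_hate() for testing.
probs = predictor(["first sample", "second sample"])
assert probs.shape == (2, 2)
assert np.allclose(probs.sum(axis=1), 1.0)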
Example #7
import datetime
import calendar
import ssl

import textstat
from detoxify import Detoxify
from sklearn.preprocessing import RobustScaler

# Work around certificate errors when the model weights are downloaded
ssl._create_default_https_context = ssl._create_unverified_context

# each model takes in either a string or a list of strings
week_days = [
    'day_Friday', 'day_Monday', 'day_Saturday', 'day_Sunday', 'day_Thursday',
    'day_Tuesday', 'day_Wednesday'
]

textstat.set_lang("en")
toxicity_model = Detoxify('original')
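
# A quick sketch of both call forms noted above (sample texts are illustrative):
# a single string yields a dict of floats, a list yields a dict of score lists
single_scores = toxicity_model.predict("example text")
batch_scores = toxicity_model.predict(["first example", "second example"])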


def subjective(text):
    words = ["i", "my"]
    count = 0
    text = text.lower().split()
    for word in words:
        count += text.count(word)

    return count


def weekday_from_date(date):
    year, month, day = date.split("-")
    day_number = datetime.date(day=int(day), month=int(month),
                               year=int(year)).weekday()
    # The source is truncated here; presumably day_number selects an entry of
    # week_days, e.g. via calendar.day_name (an assumption):
    return 'day_' + calendar.day_name[day_number]
Example #8
from flask import Flask, request
from detoxify import Detoxify
import logging

logging.basicConfig(level=logging.INFO)

model_original = 'original'
model_unbiased = 'unbiased'
model_multilingual = 'multilingual'

"""
Set model_name to model_original, model_unbiased or model_multilingual
"""
model_name = model_original

logging.info("Loading " + model_name + " model from Detoxify")
model = Detoxify(model_name)
logging.info(model_name + " model loaded")

app = Flask(__name__)

@app.route('/analyzeRequest', methods=['POST'])
def analyzeRequest():
    body = request.get_json()
    results = model.predict(body["input"])
    # Upper-case the category names and stringify scores for the JSON response
    results = {k.upper(): str(v) for (k, v) in results.items()}
    return results, 200
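
A hedged client sketch for the route above (host and port assume Flask's development defaults):

import requests

resp = requests.post(
    "http://127.0.0.1:5000/analyzeRequest",
    json={"input": "example text"},
)
print(resp.json())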
Example #9
from pprint import pprint

from detoxify import Detoxify
from pandas import DataFrame

if __name__ == '__main__':

    texts = [
        "RT @realDonaldTrump: I was very surprised & disappointed that Senator Joe Manchin of West Virginia voted against me on the Democrat’s total…",
        "RT @realDonaldTrump: Crazy Nancy Pelosi should spend more time in her decaying city and less time on the Impeachment Hoax! https://t.co/eno…",
        "RT @SpeakerPelosi: The House cannot choose our impeachment managers until we know what sort of trial the Senate will conduct. President Tr…",
        "RT @RepAdamSchiff: Lt. Col. Vindman did his job. As a soldier in Iraq, he received a Purple Heart. Then he displayed another rare form o…"
    ]

    # original: bert-base-uncased / Toxic Comment Classification Challenge
    original = Detoxify("original")

    # unbiased: roberta-base / Unintended Bias in Toxicity Classification
    unbiased = Detoxify("unbiased")

    for text in texts:

        print("----------------")
        print(f"TEXT: '{text}'")

        original_results = original.predict(text)
        #original_results["text"] = text
        original_results["model"] = "original"

        unbiased_results = unbiased.predict(text)
        #unbiased_results["text"] = text
        unbiased_results["model"] = "unbiased"

        # The source is truncated here; presumably both result dicts are
        # collected and displayed via the pprint/DataFrame imports above
Example #10
    df_account = pd.melt(df_account)

    # ----------------------- Data cleansing ---------------------------------------

    # Characters to replace
    spec_chars = ['\n', '\t', '\r']
    # Replace each character with a whitespace; Series.str.replace is needed
    # here because Series.replace only matches whole cell values, not substrings
    for char in spec_chars:
        df['tweet'] = df['tweet'].str.replace(char, ' ', regex=False)
    # Split and re-join each tweet to collapse repeated whitespace
    df['tweet'] = df['tweet'].str.split().str.join(" ")

    # ----------------------- Hate speech level prediction -------------------------

    # Instantiate the multilingual model and score every tweet in one batch
    results = Detoxify('multilingual').predict(list(df['tweet']))

    # Add the new info to the previous DataFrame
    df['toxicity'] = results['toxicity']

    # Define a class for each tweet (toxic or non-toxic)
    df['class'] = df['toxicity'].apply(lambda toxicity: 'toxic'
                                       if toxicity >= 0.5 else 'non-toxic')

    racist_words = [
        'africana', 'africano', 'china', 'chino', 'extranjera', 'extranjero',
        'gitana', 'gitano', 'india', 'indigena', 'indio', 'inmigrante',
        'latina', 'latino', 'mantera', 'mantero', 'mena', 'mora', 'moro',
        'negra', 'negrata', 'negro', 'paki', 'panchita', 'panchito', 'sudaca',
        'tiraflecha'
    ]
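
The excerpt stops after defining racist_words; a plausible continuation (an assumption, not shown in the source) flags tweets that contain any of those terms:

import re

# Build a word-boundary pattern from the keyword list above
pattern = r'\b(' + '|'.join(map(re.escape, racist_words)) + r')\b'
df['racist_term'] = df['tweet'].str.lower().str.contains(pattern, regex=True)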
Example #11
import sys
import time
import json

from detoxify import Detoxify

model_name = 'unbiased'
debug = False

if "multilingual" in sys.argv:
    model_name = 'multilingual'
elif "small" in sys.argv:
    model_name += '-small'

if "debug" in sys.argv:
    debug = True

model = Detoxify(model_name)


def now():
    return int(time.time() * 1000)


logfile = open("logs/artemis.log", "a")


def log(message):
    if debug:
        logfile.write(f"{now()}:{message.rstrip()}\n")
        logfile.flush()

Example #12
def model(self):
    # As written, every call reloads the model weights; in the original class
    # this accessor presumably sits under a property-style decorator
    return Detoxify(self.model_name)
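
Since the accessor above reloads the weights on every call, caching is a common fix; a minimal sketch with functools.cached_property (the owning class is hypothetical):

from functools import cached_property

from detoxify import Detoxify


class ToxicityService:  # hypothetical owning class
    def __init__(self, model_name):
        self.model_name = model_name

    @cached_property
    def model(self):
        # Loaded once on first access, then cached on the instance
        return Detoxify(self.model_name)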
Example #13
from detoxify import Detoxify
from fastapi import FastAPI
from fastapi.responses import JSONResponse
from pydantic import BaseModel


class TextRequest(BaseModel):
    text: str


# Start API
app = FastAPI(
    title="Cricket", description="Your personal Jiminy Cricket when posting online."
)

# Load the `original` architecture with a local fine-tuned checkpoint
model = Detoxify("original", checkpoint="model.ckpt")


@app.post("/check/")
async def check(r: TextRequest):
    response = model.predict(r.text)

    # Cast numpy floats to Python floats so the payload is JSON-serializable
    response_json = {
        "toxicity": float(response["toxicity"]),
        "severe_toxicity": float(response["severe_toxicity"]),
        "obscene": float(response["obscene"]),
        "threat": float(response["threat"]),
        "insult": float(response["insult"]),
        "identity_hate": float(response["identity_hate"]),
    }
    return JSONResponse(content=response_json)
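
A hedged client sketch against the /check/ route (the port assumes uvicorn's default):

import requests

resp = requests.post(
    "http://127.0.0.1:8000/check/",
    json={"text": "example text"},
)
print(resp.json())  # six per-category probabilities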
Example #14
def get_model(cls):
    """Get the model object for this instance, loading it if it's not already loaded."""
    if cls.model is None:
        cls.model = Detoxify('original')
        # Put the underlying torch module in eval mode for inference
        cls.model.model.eval()
    return cls.model
Example #15
from detoxify import Detoxify
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # Older Python builds don't verify HTTPS certificates by default
    pass
else:
    # Disable certificate verification so the model weights can be downloaded
    ssl._create_default_https_context = _create_unverified_https_context

# This only loads the model; predictions come from its predict() method
model = Detoxify('original')
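
A follow-up sketch scoring a sample string (the text is illustrative):

scores = model.predict("example text")
print(scores)  # dict mapping each toxicity category to a probability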