Пример #1
0
def main(components=None):
    """Scan every generated gibberish word and report the profane ones.

    components: optional 5-tuple (initials, vowels, finals, repeat_cnt,
        total_cnt); when omitted, gibberish_components() supplies it.
    """
    initials, vowels, finals, repeat_cnt, total_cnt = (
        components or gibberish_components())
    checker = ProfanityFilter()
    seen = 0
    bad = 0
    with alive_bar(total_cnt) as bar:
        for lead in initials:
            for mid in vowels:
                for tail in finals:
                    stem = lead + mid + tail
                    if checker.is_profane(stem):
                        # The whole prefix family is profane -- count and
                        # skip all repeat_cnt words at once.
                        print(seen, 'All %s words beginning with "%s..."' %
                              (repeat_cnt, stem))
                        seen += repeat_cnt
                        bad += repeat_cnt
                        bar(incr=repeat_cnt)
                        continue
                    for mid2 in vowels:
                        for tail2 in finals:
                            seen += 1
                            candidate = stem + mid2 + tail2
                            if checker.is_profane(candidate):
                                bad += 1
                                print(seen, candidate)
                            bar()
    print('Done! Found %s profane words in %s total' % (bad, seen))
Пример #2
0
    def process(self, message, **kwargs):
        """Mark the message with a profanity entity when a swear word is found."""
        # This is where we define our custom processing logic.
        detector = ProfanityFilter()
        text = message.text
        value = "na"
        confidence = 0
        # If the whole sentence trips the filter, find the offending token
        # and report it with a confidence score of 100.
        if detector.is_profane(text):
            for token in text.split(" "):
                if detector.is_profane(token):
                    value = token
                    confidence = 100

        if value != 'na':
            entity = self.convert_to_rasa(value, confidence)
            message.set("entities", [entity], add_to_output=True)
    def process(self, message, **kwargs):
        """Attach the last profane token of the message as a rasa entity."""
        profanity = ProfanityFilter()
        text = message.text
        value, confidence = 'na', 0
        if profanity.is_profane(text):
            # Without a break, the last profane word in the sentence wins.
            for word in text.split(" "):
                if profanity.is_profane(word):
                    value, confidence = word, 100
        if value == 'na':
            return
        message.set("entities",
                    [self.convert_to_rasa(value, confidence)],
                    add_to_output=True)
def Predict(texts):
    """Label each text: 0 when profane or strongly negative, 1 otherwise."""
    profanity = ProfanityFilter()
    sentiment = SentimentIntensityAnalyzer()
    labels = []
    for text in texts:
        if profanity.is_profane(text):
            labels.append(0)
            continue
        # Fall back to VADER sentiment; a compound score at or below
        # -0.05 counts as negative.
        compound = sentiment.polarity_scores(text)['compound']
        labels.append(0 if compound <= -0.05 else 1)
    return labels
Пример #5
0
def is_profane(url):
    """Heuristically decide whether any fragment of *url* is profane."""
    # Anything shorter than 3 characters cannot contain a profane word.
    if len(url) < 3:
        return False

    if getattr(settings, "ENABLE_FAST_PROFANITY_CHECKING", True):
        parts = urlparse(get_decodedurl(url))
        if not (parts.path or parts.netloc):
            raise InvalidURLError(
                "Badly formatted URL passed to is_url_profane")
        splitters = r"\.|\/|\_|\-|\~|\$|\+|\!|\*|\(|\)|\,"  # all the URL-safe characters, escaped
        partslist = []
        for component in (parts.netloc, parts.path, parts.query):
            if component:
                partslist.extend(re.split(splitters, component))

        # Speed optimization: only consult the three-letter blocklist when
        # every URL fragment is short (<= 5 chars).
        check4btlw = True
        substrings = []
        for item in partslist:
            if not item:
                continue
            if len(item) > 5:
                check4btlw = False
            for sub in get_all_substrings(item, 2):
                if sub:
                    substrings.append(sub)
        partslist = list(dict.fromkeys(substrings))  # removes dupes

        if check4btlw:
            for part in partslist:
                if part in BAD_THREE_LETTER_WORDS:
                    return True

        score = PredictProfanity(partslist)
        if score.any() == 1:
            return True

        if getattr(settings, "ENABLE_DEEP_PROFANITY_CHECKING", True):
            # Deep check: run every substring through the full filter.
            pf = ProfanityFilter()
            for part in partslist:
                if pf.is_profane(part):
                    return True

    return False
Пример #6
0
class ProfanityDetector:
    """Transcribes audio files via Google Speech Recognition and flags
    profane phrases in Russian and English."""

    def __init__(self, profane_words_filepath: str):
        """Load the profane-word list and set up the recognizer.

        profane_words_filepath: UTF-8 text file, one profane word per line.
        """
        words = []
        with open(profane_words_filepath, encoding='utf8') as f:
            for line in f:
                word = line.strip()
                words.append(word)
                # Bug fix: the original appended every word twice. Only the
                # 'ё'->'е' normalized spelling should be added as an extra
                # entry, and only for words that actually contain 'ё'.
                if 'ё' in word:
                    words.append(word.replace('ё', 'е'))
        self._ru_words = words
        self._ru_pf = ProfanityFilter()
        # NOTE(review): the Russian word list is registered under the 'en'
        # key -- presumably the filter's default/active language slot;
        # confirm against profanity_filter's API.
        self._ru_pf.custom_profane_word_dictionaries = {'en': words}
        self._r = sr.Recognizer()

    def get_profanity(self, voice_path: str) -> list:
        """Walk *voice_path*, transcribe every audio file, and return a list
        of row dicts: filename, lang, is_profane, best transcript, the
        offending text (if any), and a probability score.
        """
        data = []
        phrases = 0
        profane_phrases = 0
        for root, dirs, files in os.walk(voice_path):
            for file in files:
                phrases += 1
                with sr.AudioFile(os.path.join(root, file)) as source:
                    audio = self._r.record(source)
                    try:
                        # Transcribe twice: default (English) and Russian.
                        res = self._r.recognize_google(audio, show_all=True)
                        res_ru = self._r.recognize_google(audio,
                                                          language="ru",
                                                          show_all=True)
                        if res_ru:
                            is_profane = False
                            text = ""
                            for text_alt in res_ru['alternative']:
                                text = text_alt['transcript']
                                # Google masks recognized profanity with '*',
                                # so a starred transcript is already profane.
                                if '*' in text:
                                    is_profane = True
                                    break
                                text = " ".join(word.lower()
                                                for word in text.split())
                                text = re.sub(r'-\s\r\n\s+|-\s\r\n|\r\n', '',
                                              text)
                                text = re.sub(
                                    r'[.,:;%©?*!@#$^&()\d]|[+=]|[\[]|[\]]|[/]|"|\s{2,}|-',
                                    ' ', text)
                                # Lemmatize each word so dictionary lookups
                                # match inflected forms.
                                text = " ".join(
                                    pymorphy2.MorphAnalyzer().parse(str(
                                        word))[0].normal_form
                                    for word in text.split())
                                is_profane = self._ru_pf.is_profane(text)
                                if is_profane:
                                    break
                            if is_profane:
                                profane_phrases += 1
                                print(file, "RU PROFANE", text, "best:",
                                      res_ru['alternative'][0]['transcript'])
                            else:
                                print(file, "RU NOT PROFANE",
                                      res_ru['alternative'][0]['transcript'])
                            row = {
                                "filename":
                                file,
                                "lang":
                                "ru",
                                "is_profane":
                                is_profane,
                                "text_best_recogn":
                                res_ru['alternative'][0]['transcript'],
                                "text_profane":
                                text if is_profane else "",
                                "prob":
                                1 if is_profane else 0,
                            }
                            data.append(row)
                        if res:
                            is_profane = False
                            prob = 0
                            text = ""
                            for text_alt in res['alternative']:
                                text = text_alt['transcript']
                                if '*' in text:
                                    is_profane = True
                                    break
                                text = " ".join(word.lower()
                                                for word in text.split())
                                # English path uses a probabilistic model
                                # with a 0.5 decision threshold.
                                prob = predict_prob([text])[0]
                                is_profane = prob > 0.5
                                if is_profane:
                                    break
                            if is_profane:
                                profane_phrases += 1
                                print(file, "ENG PROFANE", text, "best:",
                                      res['alternative'][0]['transcript'])
                            else:
                                print(file, "ENG NOT PROFANE",
                                      res['alternative'][0]['transcript'])
                            row = {
                                "filename":
                                file,
                                "lang":
                                "eng",
                                "is_profane":
                                is_profane,
                                "text_best_recogn":
                                res['alternative'][0]['transcript'],
                                "text_profane":
                                text if is_profane else "",
                                "prob":
                                prob,
                            }
                            data.append(row)
                    except sr.UnknownValueError:
                        print(
                            "Google Speech Recognition could not understand audio"
                        )
                    except sr.RequestError as e:
                        print(
                            "Could not request results from Google Speech Recognition service; {0}"
                            .format(e))

        return data
from profanity_filter import ProfanityFilter

pf = ProfanityFilter()

# Extend the filter with an extra list of negative five-letter words.
extra_negative_words = {'en': {}}
with open("five-letter-words-extra-negative.txt", "r") as extra_negative_file:
    words = extra_negative_file.read().splitlines()
    extra_negative_words['en'] = dict.fromkeys(words)

pf.extra_profane_word_dictionaries = extra_negative_words

# Split the source word list into a clean file and a profane file.
with open("five-letter-words.txt", "r") as source_file, open(
        "five-letter-words-clean.txt",
        "w") as clean_file, open("five-letter-words-profane.txt",
                                 "w") as profane_file:
    words = source_file.read().splitlines()
    for candidate in words:
        if not pf.is_profane(candidate):
            clean_file.write(f"{candidate}\n")
        else:
            profane_file.write(f"{candidate}\n")
            print(
                f"Word \'{candidate}\' is profane or negative. Omitting from the cleaned file."
            )
Пример #8
0
#final = list(data)

# Prints the value stored under the array: final
#print(final[0])

#length = len(final)
#print(length)

#Fetches the string: "cd" in the array: final and returns its index value
#print(final.index("cd"))

#We are reading the columns and storing them. The default file is mydata.csv - change it as needed, but don't change the column names.
# Read the tweet text and username columns (default file is mydata.csv --
# change it as needed, but keep the column names).
# .squeeze("columns") replaces the read_csv squeeze=True keyword, which was
# removed in pandas 2.0; both collapse a one-column frame to a Series.
text = pd.read_csv("mydata.csv", sep=",",
                   usecols=["tweet_text"]).squeeze("columns")
user = pd.read_csv("mydata.csv", sep=",",
                   usecols=["username"]).squeeze("columns")

# Converting the stored data into lists
text_list = list(text)
user_list = list(user)

# Iterate tweets and authors in lockstep instead of a manual index loop.
for tweet, username in zip(text_list, user_list):
    if pf.is_profane(tweet):
        print("Bullying Detected. User" + " " + username)
    else:
        print("No bullying detected")
Пример #9
0
def organizeFile(filename: str,
                 sort_sections: bool = False,
                 skip_profanity_check: bool = None) -> None:
    """Rewrite *filename* as '<filename>.sorted' with phrases grouped by section.

    filename: phrase-list file parsed with ParsePhraseListFrom().
    sort_sections: when True, emit sections in alphabetical order.
    skip_profanity_check: when None, falls back to the global
        PROFANITY_CHECK setting; otherwise overrides it.
    """
    global PF
    if skip_profanity_check is None:
        skip_profanity_check = not PROFANITY_CHECK
    if PF is None:
        # Lazily construct the (expensive) filter once, module-wide.
        PF = ProfanityFilter()

    phrases: Dict[str, List[Phrase]] = collections.OrderedDict({
        #EPhraseFlags.OLD_VOX.name: [],
        #EPhraseFlags.NOT_VOX: [],
        #EPhraseFlags.SFX.name:     [],
    })
    phrasesByID = {}
    for p in ParsePhraseListFrom(filename):
        if p.id.lower() in phrasesByID:
            log.warning('Skipping duplicate %s...', p.id)
            continue
        # SFX and OLD_VOX phrases go to dedicated sections; everything else
        # keeps its declared category.
        if p.hasFlag(EPhraseFlags.SFX):
            assignTo = EPhraseFlags.SFX.name
        elif p.hasFlag(EPhraseFlags.OLD_VOX):
            assignTo = EPhraseFlags.OLD_VOX.name
        else:
            assignTo = p.category
        phrasesByID[p.id.lower()] = p
        phrases.setdefault(assignTo, []).append(p)

    if sort_sections:
        phrases = collections.OrderedDict(
            (k, phrases[k]) for k in sorted(phrases.keys()))
    with open(filename + '.sorted', 'w') as w:
        # default=0 guards against an empty phrase file, where max() over
        # an empty sequence would raise ValueError.
        divider_len = max((len(x) for x in phrases.keys()), default=0) + 4
        divider = '#' * divider_len
        for section, sectionPhrases in phrases.items():
            if section != '':
                w.write(f'\n{divider}\n## {section}\n{divider}\n\n')
            for phrase in sorted(sectionPhrases, key=lambda x: x.id):
                for comm in phrase.comments_before:
                    comm = comm.rstrip()
                    w.write(f'#{comm}\n')
                # Keys without a '/' are normalized to lowercase.
                key = newkey = phrase.id
                if '/' not in key:
                    newkey = key.lower()
                value = phrase.phrase
                if phrase.hasFlag(EPhraseFlags.SFX):
                    # Sound-effect phrases are written as @-references.
                    w.write(f'{newkey} = @{value}\n')
                else:
                    if not skip_profanity_check and PF.is_profane(value):
                        log.warning(
                            f'(unknown): Phrase {phrase.id} contains profanity.'
                        )
                    # Shorthand form when the key and value coincide.
                    if key != value:
                        w.write(f'{newkey} = {value}\n')
                    else:
                        w.write(f'{newkey}\n')