Example #1
import json

import hazm
import parsivar


def normalizing_validation_set():
    with open('data/valid.json', 'r', encoding='utf-8') as json_file:
        validation_data = json.load(json_file)

    with open('data/most_frequent_words.json', 'r',
              encoding='utf-8') as json_file:
        # A set gives O(1) membership checks in the loop below.
        most_frequent_words = set(json.load(json_file))

    parsivar_normalizer = parsivar.Normalizer()
    hazm_normalizer = hazm.Normalizer()
    sentence_tokenizer = hazm.SentenceTokenizer()
    word_tokenizer = hazm.WordTokenizer(join_verb_parts=False)

    all_sentence_tokens = []
    for text in validation_data:
        text = parsivar_normalizer.sub_alphabets(text)
        text = hazm_normalizer.normalize(text)
        text = remove_english_characters(text)
        text = mask_numbers(text)
        text = remove_punctuations(text)
        text = remove_diacritics(text)
        text = remove_emojis(text)

        text = text.replace('\n', ' ')
        text = text.replace('?', '؟')
        text = text.replace('؟', ' ؟ ')
        text = text.replace('.', ' . ')
        text = text.replace('  ', ' ')
        sentences = sentence_tokenizer.tokenize(text)

        for sentence in sentences:
            words = word_tokenizer.tokenize(sentence)
            if not words:
                continue

            # Drop the trailing sentence delimiter kept by the tokenizer.
            if words[-1] == '.' or words[-1] == '؟':
                words = words[:-1]

            if not words:
                continue

            final_sentence_tokens = []
            for word in words:
                if word == 'NUM':
                    # Collapse consecutive number masks into a single 'NUM'.
                    if (not final_sentence_tokens
                            or final_sentence_tokens[-1] != 'NUM'):
                        final_sentence_tokens.append(word)
                elif word not in most_frequent_words:
                    # Out-of-vocabulary words are replaced with 'UNK', and
                    # consecutive 'UNK' tokens are collapsed into one.
                    if (not final_sentence_tokens
                            or final_sentence_tokens[-1] != 'UNK'):
                        final_sentence_tokens.append('UNK')
                else:
                    final_sentence_tokens.append(word)

            all_sentence_tokens.append(final_sentence_tokens)

    with open('data/validation_sentences.json', 'w',
              encoding='utf-8') as json_file:
        json.dump(all_sentence_tokens, json_file, ensure_ascii=False)
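
Examples #1 and #6 call helper functions (remove_english_characters, mask_numbers, remove_punctuations, remove_diacritics, remove_emojis) that are defined elsewhere in their project and are not part of these snippets. Below is a minimal sketch of what they might look like, assembled from the regular expressions shown in Examples #2 and #5; every pattern and name here is an assumption, not the project's actual code.

import re

# Patterns mirroring the ones defined in Examples #2 and #5 (assumed, not
# taken from the original module).
_LATIN_PATTERN = re.compile("[\u0041-\u007a\u00c0-\u036f]")
_NUMBERS_PATTERN = re.compile("[0-9]+")
_DIACRITICS_PATTERN = re.compile(
    "[\u064B-\u065e\u0670\u0674\u06c3\u06d4-\u06ed]")
_EMOJIS_PATTERN = re.compile(
    "[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF"
    "\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF]+")
_PUNCTUATIONS = ['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',',
                 '-', '/', ':', ';', '<', '=', '>', '@', '[', '\\', ']', '^',
                 '_', '`', '{', '|', '}', '~', '«', '»', '…']


def remove_english_characters(text):
    return _LATIN_PATTERN.sub('', text)


def mask_numbers(text):
    # Pad with spaces so the word tokenizer sees 'NUM' as a separate token.
    return _NUMBERS_PATTERN.sub(' NUM ', text)


def remove_punctuations(text):
    for punctuation in _PUNCTUATIONS:
        text = text.replace(punctuation, ' ')
    return text


def remove_diacritics(text):
    return _DIACRITICS_PATTERN.sub('', text)


def remove_emojis(text):
    return _EMOJIS_PATTERN.sub(' ', text)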
Example #2
    def __init__(self):
        self.punctuations = [
            '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-',
            '/', ':', ';', '<', '=', '>', '@', '[', '\\', ']', '^', '_', '`',
            '{', '|', '}', '~', '£', '¤', '§', '©', '«', '®', '°', '±', '²',
            '´', '¸', '»', '¼', '½', '¾', '×', '÷', 'ˈ', '˜', '˝', '٪', '٫',
            '٬', '‐', '–', '—', '‘', '’', '“', '”', '„', '…', '″', '‹', '›',
            '™', '↑', '→', '↓', '⋅', '⌘', '▪', '◄', '○', '♫', '✓', '❤', '《',
            '》', '爆', '者', '被', '\uf020', '\uf04f', '\uf05f', '\uf076',
            '\uf0a7', '\uf0fc', '﴾', '﴿', ':', '�', '?', '؟', '.', '،', '؛',
            '•', '●'
        ]
        self.diacritics_pattern = re.compile(
            "[\u064B-\u065e\u0670\u0674\u06c3\u06d4-\u06ed]")
        self.emojis_pattern = re.compile(
            "["
            u"\U0001F600-\U0001F64F"
            u"\U0001F300-\U0001F5FF"
            u"\U0001F680-\U0001F6FF"
            u"\U0001F1E0-\U0001F1FF"
            "]+",
            flags=re.UNICODE)
        self.latin_characters_pattern = re.compile("["
                                                   "\u0041-\u007a"
                                                   "\u00c0-\u036f"
                                                   "\u0400-\u050f"
                                                   "\u0342-\u03ff"
                                                   "]")
        self.numbers_pattern = re.compile("[0-9]")
        self.space_patterns = [
            (re.compile("[\u202c\u2005\u2009\u2029\u2066\u3000\ufe0f]"), ' '),
            (re.compile("[\f\r\t\n]"), ' '),
            (re.compile("[\u001f\u009d\u200a\u200e\u200f\u206d\xa0\xad]"),
             '\u200c'),
            (re.compile(
                "[\u007f\u0085\u061c\u200b\u200d\u202a\u202b\u206f\u2003"
                "\u2028\u2060\u2063\u2067\u2069\ufeff\ufffc\x18]"), ''),
        ]
        self.stopwords = hazm.stopwords_list()[:200] + [
            'ام', 'م', 'ات', 'ای', 'ی', 'ت', 'اش', 'ش', 'مان', 'یم', 'ایم',
            'تان', 'ید', 'اید', 'شان', 'ند', 'اند', 'است', 'هست', 'بود', 'شد',
            'شو', 'باش', 'خواه', 'ها', 'های', 'ان', 'یک', 'دو', 'سه', 'چهار',
            'پنج', 'شش', 'هفت', 'هشت', 'نه', 'ده', 'هستم', 'هستم', 'هست',
            'هستید', 'هستیم', 'نیستم', 'نیستی', 'نیست', 'نیستیم', 'نیستید',
            'نیستند'
        ]

        self.normalizer = parsivar.Normalizer()
        self.stemmer = parsivar.FindStems()
        self.lemmatizer = hazm.Lemmatizer()
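
The snippet above only shows the constructor; how the compiled patterns are applied is not part of this example. A hypothetical helper, assuming the attribute names above, could look like this (the function name and the processing order are illustrative):

def preprocess(pre, text):
    """Apply the preprocessor's patterns to one raw string (illustrative)."""
    text = pre.normalizer.normalize(text)         # parsivar normalization
    for pattern, replacement in pre.space_patterns:
        text = pattern.sub(replacement, text)     # unify spaces / control chars
    text = pre.emojis_pattern.sub(' ', text)
    text = pre.diacritics_pattern.sub('', text)
    text = pre.latin_characters_pattern.sub('', text)
    text = pre.numbers_pattern.sub(' ', text)
    for punctuation in pre.punctuations:
        text = text.replace(punctuation, ' ')
    tokens = [w for w in text.split() if w not in pre.stopwords]
    return [pre.lemmatizer.lemmatize(w) for w in tokens]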
Example #3
    def __init__(self,
                 mask=None,
                 size=900,
                 stop_words_addr=default_stop_words_path,
                 mask_addr=None):
        self.hazm_normalizer = hazm.Normalizer()
        self.parsivar_normalizer = parsivar.Normalizer()
        self.stemmer = hazm.Stemmer()
        self.lemmatizer = hazm.Lemmatizer()
        self.stop_words = set(hazm.stopwords_list(stop_words_addr))
        # The mask argument is effectively ignored: the mask image is loaded
        # from mask_addr when one is given, otherwise no mask is used.
        mask = np.array(
            Image.open(mask_addr)) if mask_addr is not None else None
        self.generator = WordCloud(width=size,
                                   height=size,
                                   include_numbers=False,
                                   persian_normalize=False,
                                   collocations=True,
                                   mask=mask,
                                   background_color='white')
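
The constructor above only configures the generator. Assuming the usual wordcloud generate/to_file API, rendering a cloud for a piece of text could look like this (builder stands for an instance of the class, and the function and output names are illustrative):

def render_word_cloud(builder, text, out_path='cloud.png'):
    # Normalize, tokenize and drop stop words before feeding the generator.
    normalized = builder.hazm_normalizer.normalize(text)
    tokens = [w for w in hazm.word_tokenize(normalized)
              if w not in builder.stop_words]
    builder.generator.generate(' '.join(tokens))
    builder.generator.to_file(out_path)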
Example #4
import re

import parsivar


def clean(path):

    # (pattern, replacement) pairs applied in order: strip Latin letters and
    # ASCII digits, collapse runs of three or more repeated Persian letters,
    # normalize alef madda to plain alef, drop tashdid, keep only Arabic-block
    # characters plus ZWNJ, whitespace, '_' and '#', turn '#' and '_' into
    # spaces, drop Persian digits, and squeeze repeated spaces.
    character_refinement_patterns = [
        (r'[a-zA-Z0-9]', ''),
        (r'(\u0631){3,}', r'\1'),
        (r'(\u0648){3,}', r'\1'),
        (r'(\u0633){3,}', r'\1'),
        (r'(\u06a9){3,}', r'\1'),
        (r'(\u0627){3,}', r'\1'),
        (r'(\u06cc){3,}', r'\1'),
        (r'(\u062e){3,}', r'\1'),
        (r'(\u0645){3,}', r'\1'),
        (r'(\u0646){3,}', r'\1'),
        (r'(\u0647){3,}', r'\1'),
        (r'(\u0639){3,}', r'\1'),
        (r'(\u0634){3,}', r'\1'),
        (r'\u0622', '\u0627'),
        (r'\u0651', ''),
        (r'[^\u0600-\u06FF\u200c \s_#]', ''),
        (r'[#_]', ' '),
        (r'[\u06f0-\u06f9]', ''),
        (r'  +', ' '),
    ]
    character_refinement_patterns = [(re.compile(pattern), repl)
                                     for pattern, repl in
                                     character_refinement_patterns]

    with open(path, "r", encoding="utf_8") as f:
        lines = f.readlines()

    normalizer = parsivar.Normalizer()
    normalized_lines = [normalizer.normalize(line) for line in lines]
    cleaned = []

    for line in normalized_lines:

        for pattern, repl in character_refinement_patterns:
            line = pattern.sub(repl, line)

        cleaned.append(line)

    return cleaned
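
A minimal way to run clean on a text file and keep the result; both file paths here are placeholders, not paths from the original project.

if __name__ == '__main__':
    cleaned_lines = clean('data/comments.txt')
    with open('data/comments_cleaned.txt', 'w', encoding='utf-8') as out_file:
        out_file.write('\n'.join(line.strip() for line in cleaned_lines))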
Example #5
    def __init__(self):
        self.number_replacement = 'N'

        self.avg_sentence_length = 165

        self.punctuations = [
            '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-',
            '/', ':', ';', '<', '=', '>', '@', '[', '\\', ']', '^', '_', '`',
            '{', '|', '}', '~', '£', '¤', '§', '©', '«', '®', '°', '±', '²',
            '´', '¸', '»', '¼', '½', '¾', '×', '÷', 'ˈ', '˜', '˝', '٪', '٫',
            '٬', '‐', '–', '—', '‘', '’', '“', '”', '„', '…', '″', '‹', '›',
            '™', '↑', '→', '↓', '⋅', '⌘', '▪', '◄', '○', '♫', '✓', '❤', '《',
            '》', '爆', '者', '被', '\uf020', '\uf04f', '\uf05f', '\uf076',
            '\uf0a7', '\uf0fc', '﴾', '﴿', ':', '�'
        ]
        self.diacritics_pattern = re.compile(
            "[\u064B-\u065e\u0670\u0674\u06c3\u06d4-\u06ed]")
        self.emojis_pattern = re.compile(
            "["
            u"\U0001F600-\U0001F64F"
            u"\U0001F300-\U0001F5FF"
            u"\U0001F680-\U0001F6FF"
            u"\U0001F1E0-\U0001F1FF"
            "]+",
            flags=re.UNICODE)
        self.latin_characters_pattern = re.compile("["
                                                   "\u0041-\u007a"
                                                   "\u00c0-\u036f"
                                                   "\u0400-\u050f"
                                                   "\u0342-\u03ff"
                                                   "]")
        self.numbers_pattern = re.compile("[0-9]+")
        self.space_patterns = [
            (re.compile("[\u202c\u2005\u2009\u2029\u2066\u3000\ufe0f]"), ' '),
            (re.compile("[\f\r\t\n]"), ' '),
            (re.compile("[\u001f\u009d\u200a\u200e\u200f\u206d\xa0\xad]"),
             '\u200c'),
            (re.compile(
                "[\u007f\u0085\u061c\u200b\u200d\u202a\u202b\u206f\u2003"
                "\u2028\u2060\u2063\u2067\u2069\ufeff\ufffc\x18]"), ''),
        ]

        self.normalizer = parsivar.Normalizer()
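
Example #5 differs from Example #2 mainly in the number_replacement and avg_sentence_length attributes. The first one pairs naturally with numbers_pattern; a one-line sketch follows (the function name is illustrative, and what avg_sentence_length is used for is not shown in this snippet).

def mask_numbers(pre, text):
    # Replace every digit run with the 'N' placeholder configured above.
    return pre.numbers_pattern.sub(pre.number_replacement, text)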
Example #6
import json

import hazm
import parsivar


def normalizing_training_set():
    with open('data/train.json', 'r', encoding='utf-8') as json_file:
        training_data = json.load(json_file)

    parsivar_normalizer = parsivar.Normalizer()
    hazm_normalizer = hazm.Normalizer()
    sentence_tokenizer = hazm.SentenceTokenizer()
    word_tokenizer = hazm.WordTokenizer(join_verb_parts=False)

    word_frequency = {}
    all_sentence_tokens = []
    for text in training_data:
        text = parsivar_normalizer.sub_alphabets(text)
        text = hazm_normalizer.normalize(text)
        text = remove_english_characters(text)
        text = mask_numbers(text)
        text = remove_punctuations(text)
        text = remove_diacritics(text)
        text = remove_emojis(text)

        text = text.replace('\n', ' ')
        text = text.replace('?', '؟')
        text = text.replace('؟', ' ؟ ')
        text = text.replace('.', ' . ')
        text = text.replace('  ', ' ')
        sentences = sentence_tokenizer.tokenize(text)

        for sentence in sentences:
            words = word_tokenizer.tokenize(sentence)
            if not words:
                continue

            # Drop the trailing sentence delimiter kept by the tokenizer.
            if words[-1] == '.' or words[-1] == '؟':
                words = words[:-1]

            if not words:
                continue

            for word in words:
                if word not in word_frequency:
                    word_frequency[word] = 0
                word_frequency[word] += 1

            all_sentence_tokens.append(words)

    with open('data/words_frequency.json', 'w',
              encoding='utf-8') as json_file:
        json.dump(word_frequency, json_file, ensure_ascii=False)

    frequency_rank_threshold = 10000
    most_frequent_words = sorted(word_frequency,
                                 key=word_frequency.get,
                                 reverse=True)[:frequency_rank_threshold]
    # A set copy gives O(1) membership checks in the replacement loop below.
    most_frequent_words_set = set(most_frequent_words)

    final_all_sentence_tokens = []
    for sentence_tokens in all_sentence_tokens:
        final_sentence_tokens = []
        for token in sentence_tokens:
            if token == 'NUM':
                # Collapse consecutive number masks into a single 'NUM'.
                if (not final_sentence_tokens
                        or final_sentence_tokens[-1] != 'NUM'):
                    final_sentence_tokens.append(token)
            elif token not in most_frequent_words_set:
                # Out-of-vocabulary tokens are replaced with 'UNK', and
                # consecutive 'UNK' tokens are collapsed into one.
                if (not final_sentence_tokens
                        or final_sentence_tokens[-1] != 'UNK'):
                    final_sentence_tokens.append('UNK')
            else:
                final_sentence_tokens.append(token)
        final_all_sentence_tokens.append(final_sentence_tokens)

    with open('data/training_sentences.json', 'w',
              encoding='utf-8') as json_file:
        json.dump(final_all_sentence_tokens, json_file, ensure_ascii=False)
    with open('data/most_frequent_words.json', 'w',
              encoding='utf-8') as json_file:
        json.dump(most_frequent_words, json_file, ensure_ascii=False)
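
Example #6 writes data/most_frequent_words.json, which Example #1 reads, so the training pass has to run before the validation pass. Assuming both functions live in the same module, a minimal driver is:

if __name__ == '__main__':
    normalizing_training_set()    # writes most_frequent_words.json and training_sentences.json
    normalizing_validation_set()  # reads most_frequent_words.json produced above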
Example #7
    def __init__(self):
        self.normalizer = parsivar.Normalizer()
Example #8
import codecs
import os

import parsivar


def flatten(t):
    return [item for sublist in t for item in sublist]


normalizer = parsivar.Normalizer()
# verbal_filename = 'persian-stopwords\\verbal'
# nonverbal_filename = 'persian-stopwords\\nonverbal'
persian_filename = 'persian-stopwords\\persian'
# verbal_stop = [normalizer.normalize(w) for w in codecs.open(os.path.join(Directory_Path,verbal_filename), encoding='utf-8').read().split('\n') if w]
# non_verbal_stop = [normalizer.normalize(w) for w in codecs.open(os.path.join(Directory_Path,nonverbal_filename), encoding='utf-8').read().split('\n') if w]
# Directory_Path is defined elsewhere in the original project.
persian_stop = sorted({
    normalizer.normalize(w)
    for w in codecs.open(os.path.join(Directory_Path, persian_filename),
                         encoding='utf-8').read().split('\n') if w
})
stopwords = persian_stop

offensive_lexicon = []
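
The stopwords list built above is not used inside this snippet; one hedged example of how it might be combined with the output of clean from Example #4 (the function name is illustrative):

def remove_stopwords(lines):
    # Keep only tokens that are not in the collected Persian stop word list.
    stop_set = set(stopwords)
    filtered_lines = []
    for line in lines:
        tokens = [word for word in line.split() if word not in stop_set]
        filtered_lines.append(' '.join(tokens))
    return filtered_lines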