import json
import re

import hazm
import parsivar

# Helper functions such as remove_english_characters, mask_numbers,
# remove_punctuations, remove_diacritics and remove_emojis are defined
# elsewhere in this module.


def normalizing_validation_set():
    with open('data/valid.json', 'r', encoding='utf-8') as json_file:
        validation_data = json.load(json_file)
    with open('data/most_frequent_words.json', 'r', encoding='utf-8') as json_file:
        # A set makes the frequent-word membership checks below O(1).
        most_frequent_words = set(json.load(json_file))

    parsivar_normalizer = parsivar.Normalizer()
    hazm_normalizer = hazm.Normalizer()
    sentence_tokenizer = hazm.SentenceTokenizer()
    word_tokenizer = hazm.WordTokenizer(join_verb_parts=False)

    all_sentence_tokens = []
    for text in validation_data:
        # Character-level normalization and cleanup.
        text = parsivar_normalizer.sub_alphabets(text)
        text = hazm_normalizer.normalize(text)
        text = remove_english_characters(text)
        text = mask_numbers(text)
        text = remove_punctuations(text)
        text = remove_diacritics(text)
        text = remove_emojis(text)

        # Unify sentence delimiters so the sentence tokenizer can split on them.
        text = text.replace('\n', ' ')
        text = text.replace('?', '؟')
        text = text.replace('؟', ' ؟ ')
        text = text.replace('.', ' . ')
        text = re.sub(' +', ' ', text)  # collapse runs of spaces introduced above

        sentences = sentence_tokenizer.tokenize(text)
        for sentence in sentences:
            words = word_tokenizer.tokenize(sentence)
            if not words:
                continue
            # Drop the trailing sentence delimiter, if any.
            if words[-1] == '.' or words[-1] == '؟':
                words = words[:-1]
            if len(words) == 0:
                continue

            final_sentence_tokens = []
            for word in words:
                if word == 'NUM':
                    # Collapse runs of consecutive number tokens.
                    if len(final_sentence_tokens) == 0 or final_sentence_tokens[-1] != 'NUM':
                        final_sentence_tokens.append(word)
                elif word not in most_frequent_words:
                    # Replace rare words with UNK and collapse consecutive UNKs.
                    if len(final_sentence_tokens) == 0 or final_sentence_tokens[-1] != 'UNK':
                        final_sentence_tokens.append('UNK')
                else:
                    final_sentence_tokens.append(word)
            all_sentence_tokens.append(final_sentence_tokens)

    with open('data/validation_sentences.json', 'w', encoding='utf-8') as json_file:
        json.dump(all_sentence_tokens, json_file, ensure_ascii=False)
def __init__(self):
    # Method of the preprocessing class (class definition not shown here).
    # Punctuation and symbol characters to strip from the input text.
    self.punctuations = [
        '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-',
        '/', ':', ';', '<', '=', '>', '@', '[', '\\', ']', '^', '_', '`',
        '{', '|', '}', '~', '£', '¤', '§', '©', '«', '®', '°', '±', '²',
        '´', '¸', '»', '¼', '½', '¾', '×', '÷', 'ˈ', '˜', '˝', '٪', '٫',
        '٬', '‐', '–', '—', '‘', '’', '“', '”', '„', '…', '″', '‹', '›',
        '™', '↑', '→', '↓', '⋅', '⌘', '▪', '◄', '○', '♫', '✓', '❤', '《',
        '》', '爆', '者', '被', '\uf020', '\uf04f', '\uf05f', '\uf076',
        '\uf0a7', '\uf0fc', '﴾', '﴿', ':', '�', '?', '؟', '.', '،', '؛',
        '•', '●'
    ]
    # Arabic diacritics (fatha, kasra, shadda, etc.).
    self.diacritics_pattern = re.compile(
        "[\u064B-\u065e\u0670\u0674\u06c3\u06d4-\u06ed]")
    # Common emoji blocks (emoticons, symbols, transport, flags).
    self.emojis_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"
        u"\U0001F300-\U0001F5FF"
        u"\U0001F680-\U0001F6FF"
        u"\U0001F1E0-\U0001F1FF"
        "]+",
        flags=re.UNICODE)
    # Latin, Latin-extended, Cyrillic and Greek letters.
    self.latin_characters_pattern = re.compile("["
                                               "\u0041-\u007a"
                                               "\u00c0-\u036f"
                                               "\u0400-\u050f"
                                               "\u0342-\u03ff"
                                               "]")
    self.numbers_pattern = re.compile("[0-9]")
    # Map assorted Unicode space/control characters to a plain space, a
    # zero-width non-joiner, or nothing.
    self.space_patterns = [
        (re.compile("[\u202c\u2005\u2009\u2029\u2066\u3000\ufe0f]"), ' '),
        (re.compile("[\f\r\t\n]"), ' '),
        (re.compile("[\u001f\u009d\u200a\u200e\u200f\u206d\xa0\xad]"),
         '\u200c'),
        (re.compile(
            "[\u007f\u0085\u061c\u200b\u200d\u202a\u202b\u206f\u2003"
            "\u2028\u2060\u2063\u2067\u2069\ufeff\ufffc\x18]"), ''),
    ]
    # The 200 most common hazm stopwords, extended with pronoun clitics,
    # copulas, auxiliaries, plural suffixes and number words.
    self.stopwords = hazm.stopwords_list()[:200] + [
        'ام', 'م', 'ات', 'ای', 'ی', 'ت', 'اش', 'ش', 'مان', 'یم', 'ایم',
        'تان', 'ید', 'اید', 'شان', 'ند', 'اند', 'است', 'هست', 'بود', 'شد',
        'شو', 'باش', 'خواه', 'ها', 'های', 'ان', 'یک', 'دو', 'سه', 'چهار',
        'پنج', 'شش', 'هفت', 'هشت', 'نه', 'ده', 'هستم', 'هستم', 'هست',
        'هستید', 'هستیم', 'نیستم', 'نیستی', 'نیست', 'نیستیم', 'نیستید',
        'نیستند'
    ]
    self.normalizer = parsivar.Normalizer()
    self.stemmer = parsivar.FindStems()
    self.lemmatizer = hazm.Lemmatizer()
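# Hypothetical usage sketch (not part of the original class): it shows how the
# compiled patterns and stopword list above would typically be chained into a
# single cleaning pass. The method name `clean_text` is an assumption.
def clean_text(self, text):
    # Normalize whitespace and control characters first.
    for pattern, repl in self.space_patterns:
        text = pattern.sub(repl, text)
    # Strip diacritics, emojis, non-Persian letters and digits.
    text = self.diacritics_pattern.sub('', text)
    text = self.emojis_pattern.sub('', text)
    text = self.latin_characters_pattern.sub('', text)
    text = self.numbers_pattern.sub('', text)
    # Replace punctuation with spaces, then drop stopwords.
    for punctuation in self.punctuations:
        text = text.replace(punctuation, ' ')
    tokens = [t for t in text.split() if t not in self.stopwords]
    return ' '.join(tokens)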
def __init__(self,
             mask=None,
             size=900,
             stop_words_addr=default_stop_words_path,
             mask_addr=None):
    # Method of the word-cloud builder class (class definition not shown);
    # default_stop_words_path is defined elsewhere in the module.
    self.hazm_normalizer = hazm.Normalizer()
    self.parsivar_normalizer = parsivar.Normalizer()
    self.stemmer = hazm.Stemmer()
    self.lemmatizer = hazm.Lemmatizer()
    self.stop_words = set(hazm.stopwords_list(stop_words_addr))
    # An optional image mask constrains the shape of the rendered cloud.
    # Note: the `mask` parameter is overridden here; only `mask_addr` is used.
    mask = np.array(Image.open(mask_addr)) if mask_addr is not None else None
    self.generator = WordCloud(width=size,
                               height=size,
                               include_numbers=False,
                               persian_normalize=False,
                               collocations=True,
                               mask=mask,
                               background_color='white')
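# Hypothetical usage sketch (not part of the original class): assuming the
# WordCloud object above follows the standard wordcloud API (generate / to_file),
# rendering could look like this. The method name `build` and the output path
# are assumptions.
def build(self, text, output_path='wordcloud.png'):
    # Normalize, drop stopwords, then hand the remaining tokens to the generator.
    text = self.hazm_normalizer.normalize(self.parsivar_normalizer.normalize(text))
    tokens = [w for w in text.split() if w not in self.stop_words]
    self.generator.generate(' '.join(tokens))
    self.generator.to_file(output_path)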
def clean(path):
    # Regex replacements applied to every line, in order:
    #  - drop Latin letters and ASCII digits
    #  - collapse letters repeated three or more times (ر و س ک ا ی خ م ن ه ع ش)
    #  - map آ to ا and drop shadda
    #  - drop everything outside the Arabic block except ZWNJ, whitespace, '_' and '#'
    #  - turn '#' and '_' into spaces, drop Persian digits, squeeze spaces
    character_refinement_patterns = [
        (r'[a-zA-Z0-9]', ''),
        (r'(\u0631){3,}', r'\1'),
        (r'(\u0648){3,}', r'\1'),
        (r'(\u0633){3,}', r'\1'),
        (r'(\u06a9){3,}', r'\1'),
        (r'(\u0627){3,}', r'\1'),
        (r'(\u06cc){3,}', r'\1'),
        (r'(\u062e){3,}', r'\1'),
        (r'(\u0645){3,}', r'\1'),
        (r'(\u0646){3,}', r'\1'),
        (r'(\u0647){3,}', r'\1'),
        (r'(\u0639){3,}', r'\1'),
        (r'(\u0634){3,}', r'\1'),
        (r'\u0622', '\u0627'),
        (r'\u0651', ''),
        (r'[^\u0600-\u06FF\u200c \s_#]', ''),
        (r'[#_]', ' '),
        (r'[\u06f0-\u06f9]', ''),
        (r' +', ' '),
    ]
    compile_patterns = lambda patterns: [(re.compile(pattern), repl)
                                         for pattern, repl in patterns]
    character_refinement_patterns = compile_patterns(
        character_refinement_patterns)

    with open(path, "r", encoding="utf_8") as f:
        lines = f.readlines()

    normalizer = parsivar.Normalizer()
    normalized_lines = [normalizer.normalize(line) for line in lines]

    cleaned = []
    for line in normalized_lines:
        for pattern, repl in character_refinement_patterns:
            line = pattern.sub(repl, line)
        cleaned.append(line)
    return cleaned
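# Hypothetical usage sketch: running clean() on a raw corpus file and writing
# the cleaned lines back out. The file paths are placeholders, not from the
# original code.
cleaned_lines = clean('data/raw_corpus.txt')
with open('data/cleaned_corpus.txt', 'w', encoding='utf-8') as out_file:
    out_file.writelines(cleaned_lines)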
def __init__(self):
    # Method of the text-normalization class (class definition not shown here).
    self.number_replacement = 'N'  # placeholder token for masked digits
    self.avg_sentence_length = 165
    # Punctuation and symbol characters to strip from the input text.
    self.punctuations = [
        '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-',
        '/', ':', ';', '<', '=', '>', '@', '[', '\\', ']', '^', '_', '`',
        '{', '|', '}', '~', '£', '¤', '§', '©', '«', '®', '°', '±', '²',
        '´', '¸', '»', '¼', '½', '¾', '×', '÷', 'ˈ', '˜', '˝', '٪', '٫',
        '٬', '‐', '–', '—', '‘', '’', '“', '”', '„', '…', '″', '‹', '›',
        '™', '↑', '→', '↓', '⋅', '⌘', '▪', '◄', '○', '♫', '✓', '❤', '《',
        '》', '爆', '者', '被', '\uf020', '\uf04f', '\uf05f', '\uf076',
        '\uf0a7', '\uf0fc', '﴾', '﴿', ':', '�'
    ]
    # Arabic diacritics (fatha, kasra, shadda, etc.).
    self.diacritics_pattern = re.compile(
        "[\u064B-\u065e\u0670\u0674\u06c3\u06d4-\u06ed]")
    # Common emoji blocks (emoticons, symbols, transport, flags).
    self.emojis_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"
        u"\U0001F300-\U0001F5FF"
        u"\U0001F680-\U0001F6FF"
        u"\U0001F1E0-\U0001F1FF"
        "]+",
        flags=re.UNICODE)
    # Latin, Latin-extended, Cyrillic and Greek letters.
    self.latin_characters_pattern = re.compile("["
                                               "\u0041-\u007a"
                                               "\u00c0-\u036f"
                                               "\u0400-\u050f"
                                               "\u0342-\u03ff"
                                               "]")
    # Runs of ASCII digits (matched as a whole so each run is masked once).
    self.numbers_pattern = re.compile("[0-9]+")
    # Map assorted Unicode space/control characters to a plain space, a
    # zero-width non-joiner, or nothing.
    self.space_patterns = [
        (re.compile("[\u202c\u2005\u2009\u2029\u2066\u3000\ufe0f]"), ' '),
        (re.compile("[\f\r\t\n]"), ' '),
        (re.compile("[\u001f\u009d\u200a\u200e\u200f\u206d\xa0\xad]"),
         '\u200c'),
        (re.compile(
            "[\u007f\u0085\u061c\u200b\u200d\u202a\u202b\u206f\u2003"
            "\u2028\u2060\u2063\u2067\u2069\ufeff\ufffc\x18]"), ''),
    ]
    self.normalizer = parsivar.Normalizer()
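# Hypothetical usage sketch (not part of the original class): masking digit
# runs with the configured replacement token, in the spirit of the
# mask_numbers helper referenced elsewhere in this module. The method name is
# an assumption.
def mask_numbers(self, text):
    # Each run of ASCII digits becomes a single placeholder token.
    return self.numbers_pattern.sub(self.number_replacement, text)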
def normalizing_training_set():
    with open('data/train.json', 'r', encoding='utf-8') as json_file:
        training_data = json.load(json_file)

    parsivar_normalizer = parsivar.Normalizer()
    hazm_normalizer = hazm.Normalizer()
    sentence_tokenizer = hazm.SentenceTokenizer()
    word_tokenizer = hazm.WordTokenizer(join_verb_parts=False)

    word_frequency = {}
    all_sentence_tokens = []
    for text in training_data:
        # Character-level normalization and cleanup (same pipeline as the
        # validation set).
        text = parsivar_normalizer.sub_alphabets(text)
        text = hazm_normalizer.normalize(text)
        text = remove_english_characters(text)
        text = mask_numbers(text)
        text = remove_punctuations(text)
        text = remove_diacritics(text)
        text = remove_emojis(text)

        # Unify sentence delimiters so the sentence tokenizer can split on them.
        text = text.replace('\n', ' ')
        text = text.replace('?', '؟')
        text = text.replace('؟', ' ؟ ')
        text = text.replace('.', ' . ')
        text = re.sub(' +', ' ', text)  # collapse runs of spaces introduced above

        sentences = sentence_tokenizer.tokenize(text)
        for sentence in sentences:
            words = word_tokenizer.tokenize(sentence)
            if not words:
                continue
            # Drop the trailing sentence delimiter, if any.
            if words[-1] == '.' or words[-1] == '؟':
                words = words[:-1]
            if len(words) == 0:
                continue
            for word in words:
                if word not in word_frequency:
                    word_frequency[word] = 0
                word_frequency[word] += 1
            all_sentence_tokens.append(words)

    with open('data/words_frequency.json', 'w', encoding='utf-8') as json_file:
        json.dump(word_frequency, json_file, ensure_ascii=False)

    # Keep only the 10,000 most frequent words; everything else becomes UNK.
    frequency_rank_threshold = 10000
    most_frequent_words = sorted(word_frequency,
                                 key=word_frequency.get,
                                 reverse=True)[:frequency_rank_threshold]
    most_frequent_words_set = set(most_frequent_words)  # O(1) membership checks

    final_all_sentence_tokens = []
    for sentence_tokens in all_sentence_tokens:
        final_sentence_tokens = []
        for token in sentence_tokens:
            if token == 'NUM':
                # Collapse runs of consecutive number tokens.
                if len(final_sentence_tokens) == 0 or final_sentence_tokens[-1] != 'NUM':
                    final_sentence_tokens.append(token)
            elif token not in most_frequent_words_set:
                # Replace rare words with UNK and collapse consecutive UNKs.
                if len(final_sentence_tokens) == 0 or final_sentence_tokens[-1] != 'UNK':
                    final_sentence_tokens.append('UNK')
            else:
                final_sentence_tokens.append(token)
        final_all_sentence_tokens.append(final_sentence_tokens)

    with open('data/training_sentences.json', 'w', encoding='utf-8') as json_file:
        json.dump(final_all_sentence_tokens, json_file, ensure_ascii=False)
    with open('data/most_frequent_words.json', 'w', encoding='utf-8') as json_file:
        json.dump(most_frequent_words, json_file, ensure_ascii=False)
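# Hypothetical usage sketch: the training set must be normalized first, because
# it produces data/most_frequent_words.json, which normalizing_validation_set()
# reads to decide which tokens become UNK.
if __name__ == '__main__':
    normalizing_training_set()
    normalizing_validation_set()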
def __init__(self):
    self.normalizer = parsivar.Normalizer()
# Flatten a list of lists into a single list of tokens.
flatten = lambda t: [item for sublist in t for item in sublist]

normalizer = parsivar.Normalizer()

# verbal_filename = 'persian-stopwords\\verbal'
# nonverbal_filename = 'persian-stopwords\\nonverbal'
persian_filename = 'persian-stopwords\\persian'

# verbal_stop = [normalizer.normalize(w) for w in codecs.open(os.path.join(Directory_Path, verbal_filename), encoding='utf-8').read().split('\n') if w]
# non_verbal_stop = [normalizer.normalize(w) for w in codecs.open(os.path.join(Directory_Path, nonverbal_filename), encoding='utf-8').read().split('\n') if w]

# Load the Persian stopword list, normalize each entry and de-duplicate it.
persian_stop = sorted(
    list(
        set([
            normalizer.normalize(w) for w in codecs.open(
                os.path.join(Directory_Path, persian_filename),
                encoding='utf-8').read().split('\n') if w
        ])))
stopwords = persian_stop

offensive_lexicon = []
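# Hypothetical usage sketch: filtering the loaded stopword list out of an
# already tokenized text. The function name and the use of a set for fast
# lookup are assumptions, not part of the original module.
stopword_set = set(stopwords)

def remove_stopwords(tokens):
    # Stopwords were parsivar-normalized above, so normalize tokens before lookup.
    return [t for t in tokens if normalizer.normalize(t) not in stopword_set]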