def clean_data():
    correct_base_path = "data/benchmark/fixed/correct/"
    wrong1_base_path = "data/benchmark/fixed/wrong/"
    dist_base_path = "data/benchmark/test/"
    files_name = os.listdir(correct_base_path)
    c = 0
    for file_name in files_name:
        correct_file_lines = open(correct_base_path + file_name, 'r').readlines()
        wrong_file_lines = open(wrong1_base_path + file_name, 'r').readlines()
        if len(correct_file_lines) != len(wrong_file_lines):
            print("this file does not match: " + file_name)
            continue
        correct_name = dist_base_path + "c_" + file_name
        wrong_name = dist_base_path + "w_" + file_name
        correct_file = open(correct_name, 'w')
        wrong_file = open(wrong_name, 'w')
        for i in range(len(correct_file_lines)):
            correct_line = word_tokenize(normalize(correct_file_lines[i]))
            wrong_line = word_tokenize(normalize(wrong_file_lines[i]))
            correct_match, wrong_match = lcs(correct_line, wrong_line)
            correct_str = " ".join(correct_match).replace("_", "")
            wrong_str = " ".join(wrong_match).replace("_", "")
            correct_file.write(correct_str + "\n")
            wrong_file.write(wrong_str + "\n")
def reader(self):
    self.data = []
    words = {}
    for indx, record in enumerate(open(self.path_dataset, 'r')):
        record = json.loads(record)
        record["category"] = record["category"].split("-")[0].strip()
        record = self.pre_processor(record)
        self.data.append(record)
        for word in word_tokenize(record['body']):
            if word in words:
                words[word] += 1
            else:
                words[word] = 1
        if indx != 0 and indx % 100 == 0:
            print(indx)
    for indx, record in enumerate(self.data):
        if len(word_tokenize(record['body'])) <= 512:
            continue
        count = {}
        for word in word_tokenize(record['body']):
            count[word] = words[word]
        valid_word = [
            item[0] for item in sorted(
                words.items(), key=lambda kv: kv[1], reverse=True)[:512]
        ]
        record['body'] = " ".join(
            word for indx, word in enumerate(word_tokenize(record['body']))
            if word in valid_word and indx <= 512).strip()
        if indx % 100 == 0:
            print("*", indx)
    # plt.hist(x=words.values(), bins=40)
    # plt.show()
    return self.data
def export_time(question, tokens, labels):
    labels = np.array(labels)
    b_time = np.where(labels == "B_TIM")[0]
    i_time = np.where(labels == "I_TIM")[0]
    url = None
    n = len(b_time)
    if n == 0:
        st_arr = word_tokenize(question)
        t_ = export_time_single(st_arr, question)
        if t_ is None:
            res, url, adhan_names = adhan_handler(None, tokens, labels, question)
            if res is not None:
                return res, True, url, adhan_names
        return [t_], False, None, None
    elif n >= 2:
        t_ = []
        time_texts = []
        for i in range(n):
            st_arr = []
            if i < n - 1:
                ida = i_time[np.where((i_time > b_time[i]) & (i_time < b_time[i + 1]))]
            else:
                ida = i_time[np.where(i_time > b_time[i])]
            for t in np.r_[b_time[i], ida]:
                st_arr.append(tokens[int(t)])
            time_texts.append(" ".join(st_arr))
            t_.append(export_time_single(st_arr, force_return=True))
        is_adhan_needed = False
        new_t = copy(t_)
        for i, t in enumerate(t_):
            if t_[i] is None:
                new_t[i] = time_texts[i]
                is_adhan_needed = True
        if is_adhan_needed:
            res, url, adhan_names = adhan_handler(new_t, tokens, labels, question)
            if res is not None and None not in res:
                return res, True, url, adhan_names
        return t_, False, None, None
    else:
        st_arr = []
        for t in np.r_[b_time, i_time]:
            st_arr.append(tokens[int(t)])
        t_ = export_time_single(st_arr, force_return=True)
        if t_ is None:
            t_ = export_time_single(word_tokenize(question), question)
        if t_ is None:
            res, url, adhan_names = adhan_handler(None, tokens, labels, question)
            if res is not None:
                return res, True, url, adhan_names
        return [t_], False, None, None
def find_max(folder_path):
    files_name = glob.glob(folder_path + '*.csv')
    max_len = 0
    for file in files_name:
        dataset = pandas.read_csv(file)
        for i, item in enumerate(list(dataset.tweet)):
            tokens = hazm.word_tokenize(item)
            if len(tokens) >= max_len:
                max_len = len(tokens)
    print(max_len)
def load_data(file_name='./dataset/fa_2.xlsx'):
    data, labels = prepare_data(pd.read_excel(file_name))
    unique_labels = np.unique(labels)
    data_new = [bag_of_words(word_tokenize(d)) for d in data]
    lfeats = dict()
    for label in unique_labels:
        idx = np.where(labels == label)[0]
        data_c = data[idx]
        lfeats[label] = [bag_of_words(word_tokenize(d)) for d in data_c]
    return lfeats
def hazmtoalpheios(word, uri):
    wordslist = etree.Element("words")
    normalizer = Normalizer()
    data = normalizer.normalize(word)
    sentences = sent_tokenize(data)
    words = []
    for sentence in sentences:
        words.extend(word_tokenize(sentence))
    analyses = []
    for item in words:
        stemmer = Stemmer()
        wordstem = stemmer.stem(item)
        lemmatizer = Lemmatizer()
        wordlema = lemmatizer.lemmatize(item)
        if '#' in wordlema:
            wordlema, _ = wordlema.split("#")
        tagger = POSTagger(model=os.path.join(model_path, "postagger.model"))
        wordtagged = tagger.tag([item])  # tag() expects a list of tokens
        wordpofs = wordtagged[0][1]
        wordpofs = maptohazm(wordpofs)
        # a better way to do this would be to create a Python class
        # to formalize the abstraction
        analysis = {}
        analysis['engine'] = 'hazm'
        analysis['uri'] = uri
        analysis['form'] = {}
        analysis['form']['text'] = item
        analysis['form']['lang'] = 'per'
        analysis['entries'] = []
        entry = {}
        entry['dict'] = {}
        entry['dict']['hdwd'] = {}
        entry['dict']['hdwd']['lang'] = 'per'
        entry['dict']['hdwd']['text'] = wordstem
        entry['infls'] = []
        infl = {}
        infl['stem'] = {}
        infl['stem']['text'] = wordstem
        infl['stem']['lang'] = 'per'
        infl['pofs'] = {}
        if wordpofs:
            infl['pofs']['order'] = str(wordpofs[1])
            infl['pofs']['text'] = wordpofs[0]
        entry['infls'].append(infl)
        analysis['entries'].append(entry)
        analyses.append(analysis)
    return analyses
def stem_data(dat):
    normalizer = hazm.Normalizer()
    dat = normalizer.normalize(dat)
    sent = hazm.sent_tokenize(dat)
    words = []
    for s in sent:
        tagged = list(tagger.tag(hazm.word_tokenize(s)))
        new_tag = list(tagged)
        for token in tagged:
            if token[0] in stop_words:
                new_tag.remove(token)
        lemmatizer = hazm.Lemmatizer()
        for token in new_tag:
            stemmed = lemmatizer.lemmatize(token[0], pos=token[1])
            stemmer = hazm.Stemmer()
            stemmed = stemmer.stem(stemmed)
            if len(stemmed) > 0 and ('#' not in stemmed):
                words.append(stemmed)
    return words
def countTextWords(text):
    normalizer = hazm.Normalizer()
    text = normalizer.normalize(text)
    tokens = hazm.word_tokenize(text)
    stemmer = hazm.Stemmer()
    words = [stemmer.stem(token) for token in tokens]
    return len(words)
def stremme(val):
    Log.logger.info('Data stemmed by hazm package')
    # words = [[stemmer.stem(word) for word in word_tokenize(sentence)] for sentence in sent_tokenize(val)]
    words = [[ps.run(word) for word in word_tokenize(sentence)]
             for sentence in sent_tokenize(val)]
    words = words[0]
    val = ' '.join(words)
    return val
def draw_cloud(cleantweets, image_path, show_image=False):
    text = " ".join(str(tweet) for tweet in cleantweets)
    text = get_display(arabic_reshaper.reshape(text))
    tokens = word_tokenize(text)
    dic = Counter(tokens)
    if verbose:
        print(dic.most_common(max_words))
    twitter_mask = np.array(Image.open(f'twitter-logo-q{export_quality}.png'))
    font_path = select_a_font()
    wordcloud = WordCloud(font_path=font_path, max_words=max_words, margin=0,
                          width=5000, height=5000, min_font_size=4,
                          max_font_size=700, background_color="white",
                          mask=twitter_mask)
    wordcloud.generate_from_frequencies(dic)
    image = wordcloud.to_image()
    wordcloud.to_file(image_path)
    if show_image:
        image.show()
    print(f"Generated image {image_path}")
def query_process(query):
    text = normalizer.normalize(query)
    # map punctuation to spaces and delete zero-width/control characters
    punct = '_|ẖ–;،"…=$&@*-/:<>!+.()«»٪؟'
    text = text.translate(str.maketrans(
        punct, ' ' * len(punct),
        '\u200c\u202c\u200f\u200e\u2069\u2067\u200b\u200d'))
    words = word_tokenize(text)
    words = list(dict.fromkeys(words))
    i = 0
    while i < len(words):
        # keep removing stop words at position i
        while True:
            if i >= len(words):
                break
            repeat = False
            word = words[i]
            lem_word = lemmatizer.lemmatize(word).split('#')[0]
            if lem_word == '':
                lem_word = 'است'
            if word in stopwords or lem_word in stopwords:
                words.remove(word)
                repeat = True
            if not repeat:
                break
        if i >= len(words):
            break
        # map synonyms to their canonical form
        for t in range(len(samewords)):
            if lem_word in samewords[t]:
                lem_word = samewords[t][0]
                break
        words[i] = lem_word
        i = i + 1
    return words
def process_text(self, text: str) -> Dict[str, int]:
    """
    Splits a long text into words.
    If `persian_normalize` attribute has been set to True, normalizes `text`
    with Hazm Normalizer.
    If `include_numbers` attribute has been set to False, removes all Persian,
    English and Arabic numbers from `text`.

    :param text: The text we want to process
    :return: a dictionary. keys are words and values are the frequencies.
    """
    flags = (
        re.UNICODE if version < '3' and type(text) is unicode  # noqa: F821
        else 0)
    if self.persian_normalize:
        normalizer = Normalizer()
        text = normalizer.normalize(text)
    if not self.include_numbers:
        text = re.sub(r"[0-9\u06F0-\u06F9\u0660-\u0669]", "", text)
    if self.regexp:
        words = re.findall(self.regexp, text, flags)
    else:
        words = word_tokenize(text)
    if self.collocations:
        word_counts = unigrams_and_bigrams(words, self.normalize_plurals)
    else:
        word_counts, _ = process_tokens(words, self.normalize_plurals)
    return word_counts
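# Hedged usage sketch for process_text above. The enclosing class is not shown
# in this snippet, so the wordcloud_fa import and constructor arguments below
# are assumptions about a WordCloud-style wrapper, not part of the original
# source. The point is that the returned {word: frequency} dict can be passed
# straight to generate_from_frequencies, as the draw_cloud snippet elsewhere in
# this collection also does.
from wordcloud_fa import WordCloudFa  # assumed package/class name

wc = WordCloudFa(persian_normalize=True, include_numbers=False)
frequencies = wc.process_text("سلام دنیا سلام")  # word -> frequency dict
wc.generate_from_frequencies(frequencies)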
def most_freq_words():
    title, text, ids = extract_data_as_string()
    listToStr = ' '.join(map(str, title + text))
    listToStr = word_tokenize(listToStr)
    word_count = nltk.FreqDist(listToStr)
    return word_count.most_common(30)
def pre_processor(self, record):
    # remove urls
    record['body'] = re.sub(r'^https?:\/\/.*[\r\n]*', '', record['body'], flags=re.MULTILINE)
    record['body'] = re.sub(r'^www?:\/\/.*[\r\n]*', '', record['body'], flags=re.MULTILINE)
    record['body'] = " ".join(
        word for word in record['body'].split()
        if not word.endswith(".ir") and not word.endswith(".com")
        and not word.endswith(".org") and not word.startswith("www.")).strip()
    record['body'] = re.sub(r'\s+\d+\s+', ' ', record['body'])
    # remove stop words
    record['body'] = " ".join(
        word for word in word_tokenize(self.normalizer.normalize(record['body']))
        if word not in self.stopwords).strip()
    record['body'] = re.sub('\u00a0', ' ', record['body'])
    record['body'] = re.sub(r'\([^)]*\)', '', record['body']).\
        strip().strip("انتهای پیام")
    return record
def location_handler(question, tokens, labels, check_validation=True):
    loc = location_(question, tokens, labels)
    if loc:
        if check_validation:
            problem_list = []
            for i, l in enumerate(loc):
                l_inf = get_city_info(l)
                problem = False
                if not l_inf:
                    if l in ["تهرون", "ترون"]:
                        loc[i] = "تهران"
                    elif l in ["گم"]:
                        loc[i] = "قم"
                    elif l in ["اصفان", "اصفون"]:
                        loc[i] = "اصفهان"
                    else:
                        problem = True
                problem_list.append([i, problem])
            w_t = np.array(hazm.word_tokenize(question))
            bloc = np.where(labels == "B_LOC")[0] - 1
            iloc = np.where(labels == "I_LOC")[0] - 1
            if len(bloc) >= len(problem_list):
                for i in range(len(problem_list)):
                    if problem_list[i][1]:
                        if i != len(problem_list) - 1:
                            il = iloc[(iloc > bloc[i]) & (iloc < bloc[i + 1])]
                        else:
                            il = iloc[iloc > bloc[i]]
                        loc[problem_list[i][0]] = location_fix(
                            question, [" ".join(w_t[np.r_[bloc[i], il]])])[0]
    else:
        loc = [USER_CITY]
    return loc
def tokenize(corpus, lemma=True, punctuation=True, space_to_space=False):
    if not punctuation:
        # table = str.maketrans({key: None for key in string.punctuation})
        # corpus = corpus.translate(table)
        corpus = corpus.replace(',', ' ')
        corpus = corpus.replace("\u220c", "")
        corpus = corpus.replace('(', ' ')
        corpus = corpus.replace(')', ' ')
        corpus = corpus.replace('.', ' ')
        corpus = corpus.replace("،", " ")
        corpus = corpus.replace("«", " ")
        corpus = corpus.replace("»", " ")
    if space_to_space:
        tokenized = corpus.split(' ')
    else:
        tokenized = word_tokenize(corpus)
    if lemma:
        lemmatizer = Lemmatizer()
        for i in range(len(tokenized)):
            tokenized[i] = lemmatizer.lemmatize(tokenized[i]).split('#')[0]
    return tokenized
def test(train_results, classes, test_case=False):
    if test_case:
        file = open("../TestCase/test.txt", 'r', encoding='utf-8')
        output = open("../TestCase/my-output.txt", 'w+', encoding='utf-8')
        output_2 = open("../../ClsModel/NaiveBayes/TestCase.output.txt", 'w+', encoding='utf-8')
        classes = ["c1", "c2"]
    else:
        file = open("../test.txt", 'r', encoding='utf-8')
        output = open("../output.txt", 'w+', encoding='utf-8')
        output_2 = open("../../ClsModel/NaiveBayes/Test.output.txt", 'w+', encoding='utf-8')
    for line in file.readlines():
        tag_sentence = hazm.word_tokenize(line)
        sentence = tag_sentence[1:]
        for c in classes:
            output.write(c + " ")
            output_2.write(c + " ")
            p = 0
            for word in sentence:
                if word in train_results[c][1]:
                    p += math.log10(train_results[c][1][word])
                else:
                    p += math.log10(train_results[c][1]['<Unk>'])
            p += math.log10(train_results[c][0])
            output.write(str(p) + " ")
            output_2.write(str(p) + " ")
        output.write("\n")
        output_2.write("\n")
def create_user_files(self):
    evaluation_users = pandas.read_csv(self.evaluation_user_path)
    for user_index, user in enumerate(evaluation_users.user):
        user_tweets = []
        user_csv_path = self.crawled_data_path + user + '.csv'
        user_csv = pandas.read_csv(user_csv_path)
        user_csv["length"] = user_csv["reply_to"].apply(
            lambda x: len(ast.literal_eval(x)) == 0)
        user_csv_noreply = user_csv.loc[user_csv["length"], :].drop(["length"], axis=1)
        for i, item in enumerate(user_csv_noreply.tweet):
            item = str(item)
            url = re.findall(r"http\S+", item)
            if not url:
                item = self.new_normalizer.Normalizer_text(item)
                if len(hazm.word_tokenize(item)) >= 5:
                    user_tweets.append(item)
            if i % 1000 == 0:
                print(f'{(i / len(user_csv_noreply)) * 100 :.2f} done {user} {user_index}')
        user_csv_noreply_normed = pandas.DataFrame({'tweet': user_tweets})
        user_csv_noreply_normed.to_csv(
            user_csv_path.replace('crawled_data', 'evaluation_user_data_big'),
            index=False)
def score(self, sentences):
    # Predict
    pos, neg, neu = 0, 0, 0
    stemmer = Stemmer()
    classifier = self.__get_model()
    normalizer = Normalizer()
    sentences = sent_tokenize(sentences)
    for sentence in sentences:
        sentence = normalizer.normalize(sentence)
        words = word_tokenize(sentence)
        for word in words:
            word = stemmer.stem(word)
            class_result = classifier.classify(self.__word_feats(word))
            if class_result == 'neg':
                neg = neg + 1
            if class_result == 'pos':
                pos = pos + 1
            if class_result == 'neu':
                neu = neu + 1
    positive_sentiment = str(float(pos) / len(words))
    # print('Positive: ' + positive_sentiment)
    neutral_sentiment = str(float(neu) / len(words))
    # print('Neutral: ' + neutral_sentiment)
    negative_sentiment = str(-float(neg) / len(words))
    # print('Negative: ' + negative_sentiment)
    total_sentiment = (float(positive_sentiment) + float(negative_sentiment)) / 2
    # print('Total (Avg): ' + str(total_sentiment))
    return total_sentiment
def compute_test_perplexity(n, data_path, tokenized):
    with open(data_path, "r", encoding="utf-8") as test:
        test = test.read()
    test_tokenized = word_tokenize(test)
    ngrams = generate_n_gram(tokenized, n)
    ngrams_minus_1 = generate_n_gram(tokenized, n - 1)
    sum_log_probs = 0
    for i in range(len(test_tokenized)):
        found = False
        for ngram in ngrams:
            if ngram[0] == test_tokenized[i - n:i]:
                count_ngram = ngram[1]
                found = True
                for ngram_1 in ngrams_minus_1:
                    if ngram_1[0] == test_tokenized[i - n:i - 1]:
                        prob = count_ngram / ngram_1[1]
                        sum_log_probs = sum_log_probs + math.log(prob, 2)
        if not found:
            prob = 1
            sum_log_probs = sum_log_probs + math.log(prob, 2)
    perplexity_by_log = 2 ** (-1.0 * sum_log_probs / len(test_tokenized))
    # print(perplexity_by_log)
    return perplexity_by_log
def home():
    if request.method == 'POST':
        inputText = request.form['text']
        nltk_stopwords = get('stopwords')
        # stemmer = Stemmer()
        title_body_tokenized = word_tokenize(inputText)
        title_body_tokenized_filtered = [
            w for w in title_body_tokenized if w not in nltk_stopwords
        ]
        # title_body_tokenized_filtered_stemming = [stemmer.stem(w) for w in title_body_tokenized_filtered]
        # print(title_body_tokenized_filtered_stemming)
        vectorizer = get('vectorizer')
        title_body_tokenized_filtered_stemming_vectorized = vectorizer.transform(
            title_body_tokenized_filtered)
        model = get('model')
        predict = model.predict(title_body_tokenized_filtered_stemming_vectorized)
        lables = get('lables')
        lable = lables[predict]
        # the stemming step is commented out above, so pass the filtered tokens
        return render_template('index.html', lable=lable,
                               stemer=title_body_tokenized_filtered)
    else:
        return render_template('index.html')
def extract_metadata(self, tweet):
    important_words = []
    syms = []
    hashtags = []
    content_len = 0
    content = self.normalizer.normalize(tweet['content'])
    if 'های وب' in content:
        syms.append('های_وب')
    sentences = sent_tokenize(content)
    for sentence in sentences:
        sentence = sentence.translate(str.maketrans('', '', self.punctuations))
        words = word_tokenize(sentence)
        content_len += len(words)
        sent_syms, sent_hashs = self.get_symbols(words)
        syms += sent_syms
        hashtags += sent_hashs
        tags = self.tagger.tag(words)
        verbs = [word for (word, role) in tags if role == 'V']
        filtered_words = [word.replace('#', '') for word in words
                          if word.replace('#', '') not in self.stop_words
                          and word.replace('#', '') not in verbs
                          and set(word.replace('#', '')).intersection(self.persian_alphabet)
                          and len(word.replace('#', '')) > 1]
        important_words += filtered_words
    syms = list(set(syms))
    hashtags = list(set(hashtags))
    bigrams = self.get_ngrams(important_words, 2)
    trigrams = self.get_ngrams(important_words, 3)
    candidate_words = hashtags + syms + important_words + bigrams + trigrams
    keywords = self.get_keywords(candidate_words, content_len)
    return keywords, syms, hashtags
def __call__(self, text):
    # preprocessing
    text = unicode(text)
    text = normalize_numbers(text)
    # text = ''.join(char for char in unicodedata.normalize('NFD', text)
    #                if unicodedata.category(char) != 'Mn')  # Strip accents
    # text = re.sub("[^ a-z'.,?!\-]", "", text)
    normalizer = hazm.Normalizer()
    text = normalizer.normalize(text)

    # tokenization
    words = hazm.word_tokenize(text)
    # tokens = pos_tag(words)  # tuples of (word, tag)

    # steps
    prons = []
    for word in words:
        if not any(letter in word for letter in self.graphemes):
            pron = [word]
        # elif word in self.homograph2features:  # Check homograph
        #     pron1, pron2, pos1 = self.homograph2features[word]
        #     if pos.startswith(pos1):
        #         pron = pron1
        #     else:
        #         pron = pron2
        elif word in self.tihu:  # lookup tihu dict
            pron = self.tihu[word]
        else:  # predict for oov
            pron = self.predict(word)
        prons.extend(pron)
        prons.extend([" "])
    return prons[:-1]
def prepareText(text):
    normalizer = hazm.Normalizer()
    text = normalizer.normalize(text)
    tokens = hazm.word_tokenize(text)
    stemmer = hazm.Stemmer()
    words = [stemmer.stem(token) for token in tokens]
    return words
def data_aug(dataset, w2c_model):
    train_aug_text = []
    train_aug_label = []
    start_time = time.time()
    for i, item in enumerate(dataset.text):
        item = str(item)
        item_label = dataset.label[i]
        train_aug_text.append(item)
        train_aug_label.append(item_label)
        item_tokenized = hazm.word_tokenize(item)
        for num_aug in range(2):
            result = data_w2v_aug(num_aug, item_tokenized, w2c_model)
            train_aug_text.append(result)
            train_aug_label.append(item_label)
        if i % 1000 == 0:
            print(f'{i} ta rafte {time.time() - start_time}')
            start_time = time.time()
    auged_dataframe = pandas.DataFrame({
        'text': train_aug_text,
        'label': train_aug_label
    })
    return auged_dataframe
def preProcessing(self, doc, level=0):
    """
    This function removes punctuation and some useless prepositions and
    returns a list of words.
    """
    junkList = [
        ".", "-", "]", "[", "،", "؛", ":", ")", "(", "!", "؟", "»", "«", "ْ"
    ]
    junkWords = [
        "که", "از", "با", "برای", "با", "به", "را", "هم", "و", "در", "تا",
        "یا", "هر", "می", "بر"
    ]
    pronouns = [
        "من", "تو", "او", "ما", "شما", "ایشان", "آنها", "اینها", "آن", "این",
        "اونجا", "آنجا", "انجا", "اینها", "آنها", "اینکه"
    ]
    for char in junkList:
        doc = doc.replace(char, " ")
    result = []
    doc = hazm.Normalizer().normalize(doc)
    doc = hazm.word_tokenize(doc)
    for word in doc:
        word = word.strip()
        if word not in junkWords and word not in pronouns:
            result.append(word)
    return result
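# A minimal, self-contained sketch of the same normalize -> word_tokenize ->
# filter pipeline that preProcessing above implements, runnable without the
# enclosing class. The junk-word list here is a shortened stand-in for
# illustration only, not the full list used above.
import hazm

def preprocessing_demo(doc):
    junk_words = {"که", "از", "با", "به", "را", "و", "در"}
    doc = hazm.Normalizer().normalize(doc)
    return [w for w in hazm.word_tokenize(doc) if w.strip() not in junk_words]

print(preprocessing_demo("او با دوستان به مدرسه رفت"))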
def remove_stop_words(val):
    Log.logger.info('Stop words removed')
    stops = Constant.STOP_WORDS
    words = [[word for word in word_tokenize(sentence) if word not in stops]
             for sentence in sent_tokenize(val)]
    words = words[0]
    val = ' '.join(words)
    return val
def get_names(text):
    tagged_words = tagger.tag(word_tokenize(text))
    words = set(filter(
        lambda word: is_word_ok(word),
        [tagged_word[0] for tagged_word in
         filter(lambda tagged_word: tagged_word[1] == 'N', tagged_words)]
    ))
    return words.union(get_hash_tags(text))
def tokenize_text(text):
    text = text.replace('.', ' ')
    text = re.sub(r'\s+', ' ', text).strip()
    text = text.replace('\u200c', ' ').replace('\n', '').replace('\r', '') \
               .replace('ي', 'ی').replace('ك', 'ک')
    normalized_text = normalizer.normalize(text)
    tokens = word_tokenize(normalized_text)
    return tokens
def tokenize(self, text):
    text = self.remove_symbols(text)
    text = re.sub(r'\s+', ' ', text).strip()
    text = text.lower()
    text = text.replace('\u200c', ' ').replace('\n', '').replace('\r', '') \
               .replace('ي', 'ی').replace('ك', 'ک')
    normalized_text = normalizer.normalize(text)
    return word_tokenize(normalized_text)
def statement_pre_processing(input_statement):
    normalizer = Normalizer()
    lemmatizer = Lemmatizer()
    input_statement = normalizer.normalize(input_statement)
    input_statement = [
        lemmatizer.lemmatize(word) for word in word_tokenize(input_statement)
        if word not in stops
    ]
    return input_statement
def bigram_cleaner(text):
    text = re.sub(Text_cleaner.persian_regex, ' ', text)
    text = re.sub('[ ]+', ' ', text)
    normalizer = Normalizer()
    text = normalizer.normalize(text)
    tokenized = word_tokenize(text)
    return tokenized
def topics(self, model, document, dictionary=None):
    if dictionary is not None:
        self.dictionary = dictionary
    text = [w for w in word_tokenize(document)
            if w not in self.stopwords and len(w) > 1]
    corpus = self.dictionary.doc2bow(text)
    print('Which LDA topic maximally describes a document?\n')
    print('Original document: ' + document)
    print('Topic probability mixture: ' + str(model[corpus]))
    print('Maximally probable topic: topic #' +
          str(max(model[corpus], key=itemgetter(1))[0]))
    return model[corpus]
def hazmtoalpheiosfile(data, uri):
    root = etree.Element("{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF")
    oaannotation = etree.SubElement(
        root, '{http://www.w3.org/ns/oa#}Annotation',
        {'{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about':
         'http://services.projectbamboo.org/morphology' + uri})
    oahasbody = etree.SubElement(oaannotation, '{http://www.w3.org/ns/oa#}hasBody')
    oahastarget = etree.SubElement(oaannotation, '{http://www.w3.org/ns/oa#}hasTarget')
    hasbodydesc = etree.SubElement(
        oahastarget, '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}Description',
        {'{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about': uri})
    ispartof = etree.SubElement(
        hasbodydesc, '{http://purl.org/dc/terms/}isPartOf',
        {'{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about': uri})
    source = etree.SubElement(
        hasbodydesc, '{http://purl.org/dc/terms/}source',
        {'{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource': uri})
    title = etree.SubElement(
        oaannotation, '{http://purl.org/dc/elements/1.1/}title',
        {'{http://www.w3.org/XML/1998/namespace}lang': 'eng'})
    title.text = "Morphology of " + uri
    # SubElement needs a parent node; the original call was missing it, so the
    # word list is attached to the hasBody element here.
    wordslist = etree.SubElement(oahasbody, "words")
    normalizer = Normalizer()
    data = normalizer.normalize(data)
    sentences = sent_tokenize(data)
    words = []
    for sentence in sentences:
        words.extend(word_tokenize(sentence))
    for item in words:
        stemmer = Stemmer()
        wordstem = stemmer.stem(item)
        lemmatizer = Lemmatizer()
        wordlema = lemmatizer.lemmatize(item)
        if '#' in wordlema:
            wordlema, _ = wordlema.split("#")
        tagger = POSTagger(model=os.path.join(model_path, "postagger.model"))
        wordtagged = tagger.tag([item])  # tag() expects a list of tokens
        wordpofs = wordtagged[0][1]
        word = etree.SubElement(wordslist, 'word')
        form = etree.SubElement(word, 'form', {'{http://www.w3.org/XML/1998/namespace}lang': 'per'})
        form.text = item
        entry = etree.SubElement(word, 'entry')
        infl = etree.SubElement(entry, 'infl')
        term = etree.SubElement(infl, 'term', {'{http://www.w3.org/XML/1998/namespace}lang': 'per'})
        stem = etree.SubElement(term, 'stem')
        stem.text = wordstem
        pofs = etree.SubElement(infl, 'pofs')
        pofs.text = wordpofs
    return root
def document(filepath):
    f = open(filepath, 'r', encoding='utf-8', errors='ignore')
    txt = f.read()
    f.close()
    txt = remove_punctuation(txt)
    normalizer = Normalizer()
    txt = normalizer.normalize(txt)
    document = word_tokenize(txt)
    document = [word for word in document
                if word not in stop_words and not word.isdigit()]
    return document
def texts(self, categories={'Politics'}, limit=None):
    docs = self.hamshahri.docs()
    print('start reading corpus...')
    count = 0
    texts = []
    for doc in docs:
        if limit is not None and count == limit:
            break
        if len(categories.intersection(set(doc["categories_en"]))) > 0:
            count += 1
            for sent in sent_tokenize(doc['text']):
                if len(sent) <= 1:
                    continue
                texts.append([word for word in word_tokenize(sent)
                              if word not in self.stopwords and len(word) > 1])
    return texts
def construct_language_model_from_tabnak_collection(dictionary):
    file = open("data/tabnakNewsCollection.json", 'r')
    normalizer = SCNormalizer()
    language_model = LanguageModel(dictionary)
    i = 0
    for line in file:
        try:
            data = json.loads(line)
            content = data['title'] + " " + data['content']
            normalized_content = normalizer.normalize(content)
            word_tokenized = word_tokenize(normalized_content)
            for word in word_tokenized:
                word = word.replace("_", PersianTools().HalfSpace)
                language_model.add(word)
            i += 1
            if i % 1000 == 0:
                print(i)
        except:
            print("error occurred reading json file")
    language_model.export_to_file()
    return language_model
def StringError_avg(c):
    temp_1 = []
    temp_2 = []  # the distance matrix that contains average distances of clusters
    for i in range(len(c)):
        temp_1.append(word_tokenize(c[i]))
        temp_2.append([])
    m = 0
    while m < len(temp_1):  # generation of the distance matrix (temp_2)
        for i in range(len(temp_1)):
            count = 0
            for word_1 in temp_1[m]:
                for word_2 in temp_1[i]:
                    count = count + StringError(word_1, word_2)
            count = count / (len(temp_1[m]) * len(temp_1[i]))
            temp_2[m].append(count)
        m += 1
    print(temp_2)
from __future__ import unicode_literals
import os
import sys
import codecs
from hazm import Normalizer, sent_tokenize, word_tokenize

reader = codecs.open(os.path.abspath(sys.argv[1]), 'r', encoding='utf-8')
writer = codecs.open(os.path.abspath(sys.argv[2]), 'w', encoding='utf-8')
count = 1
line = reader.readline()
normalizer = Normalizer()
while line:
    if count % 1000 == 0:
        sys.stdout.write(str(count) + '...')
    if line.strip():
        n = normalizer.normalize(line.strip())
        tok = word_tokenize(n)
        # join tokens and collapse repeated spaces
        sen = u' '.join(tok).replace('_', ' ').replace('  ', ' ').replace('  ', ' ')
        l = sen + u'\n'
        writer.write(l)
    else:
        writer.write(u'\n')
    count += 1
    line = reader.readline()
sys.stdout.write('\n')
writer.flush()
writer.close()
def k_means(doc, number_of_clusters, numbers_of_iterations):
    literals = ["به", "با", "از", "در", "بی", "برای", "چون", "اندر", "زیر", "بر", "الی",
                "جز", "الا", "مگر", "نزد", "نزدیک", "پیش", "روی", "میان", "پی", "جلوی",
                "مانند", "چون", "درون", "فراز", "درباره ", "محص", "خاطر", "نظر", "راه",
                "مثل", "توسط", "خلاف", "دنبال", "زعم", "سبب", "خلال", "راه", "سر", "عین",
                "وقت", "هنگام", "بجز", "همچون", "همچون", "زیبا", "قشنگ", "روشن", "مشخص",
                "بزرگ", "فوقالعاده", "خوب", "ناراحت", "کوچک", "مهربان", "محبوب", "معتقد",
                "خوشگل", "ممنون", "سبک", "موقت", "احمق", "شلوغ", "مهم", "جدید", "بد",
                "دور", "کامل", "موافق", "مقارن", "اجتماعی", "معین", "صادق", "مسخره",
                "غمگین", "سرغ", "خوشحال", "مناسب", "کند", "زشت", "پارسا", "قدیمی",
                "سخت", "خوش", "غریبه"]
    doc_list = doc_normalizer(doc)
    l_1 = []
    for i in range(len(doc_list)):
        l_1.append(word_tokenize(doc_list[i]))
    # drop the literal/stop words from every tokenized document
    # (the original `del word` inside the loop did not actually remove items)
    for i in range(len(l_1)):
        l_1[i] = [word for word in l_1[i] if word not in literals]
    l_2 = doc_stemmer(l_1)
    l_vectors = []
    for i in range(len(doc_list)):
        l_vectors.append([])
    for i in range(len(l_2)):  # converting documents to the equivalent vector model
        for j in range(len(l_2[i])):
            l_vectors[i].append(w(l_2, l_2[i], l_2[i][j]))
    l_centeroids = []
    for i in range(number_of_clusters):  # generating random initial centroids
        l_centeroids.append([])
    for i in range(len(l_centeroids)):
        for j in range(len(doc_list)):
            l_centeroids[i].append(random.random())
    l_cosines = []
    for i in range(len(l_vectors)):
        l_cosines.append([])
    for i in range(len(l_vectors)):
        for j in range(len(l_centeroids)):
            l_cosines[i].append(cosine(l_vectors[i], l_centeroids[j]))
    # print(l_cosines)
    l_index = []
    for i in range(len(l_cosines)):
        l_index.append([])
    for i in range(len(l_cosines)):
        for j in range(len(l_cosines[i])):
            if l_cosines[i][j] == min(l_cosines[i]):
                l_index[i].append(j)
    # print(l_index)
    l_clusters_1 = []
    l_clusters_2 = []
    l_clusters_vectors = []
    for i in range(number_of_clusters):
        l_clusters_1.append([])
        l_clusters_2.append([])
        l_clusters_vectors.append([])
    for i in range(len(l_index)):
        for j in range(len(l_index)):
            if l_index[i][0] == l_index[j][0] and j not in l_clusters_1[l_index[i][0]]:
                l_clusters_1[l_index[i][0]].append(j)
                l_clusters_2[l_index[i][0]].append(l_2[j])
                l_clusters_vectors[l_index[i][0]].append(l_vectors[j])
    # print(l_clusters_1)
    # print(l_clusters_2)
    # print(l_clusters_vectors)
    # print(centeroid_generator(l_clusters_vectors))
    iteration = 0
    while iteration < numbers_of_iterations:
        l_centeroids_2 = centeroid_generator(l_clusters_vectors)
        l_cosines_2 = []
        for i in range(len(l_vectors)):
            l_cosines_2.append([])
        for i in range(len(l_vectors)):
            for j in range(len(l_centeroids_2)):
                if l_centeroids_2[j]:
                    l_cosines_2[i].append(cosine(l_vectors[i], l_centeroids_2[j]))
        # print(l_cosines_2)
        l_index_2 = []
        for i in range(len(l_cosines_2)):
            l_index_2.append([])
        for i in range(len(l_cosines_2)):
            for j in range(len(l_cosines_2[i])):
                if l_cosines_2[i][j] == min(l_cosines_2[i]):
                    l_index_2[i].append(j)
        l_clusters_1 = []
        l_clusters_2 = []
        l_clusters_vectors = []
        for i in range(number_of_clusters):
            l_clusters_1.append([])
            l_clusters_2.append([])
            l_clusters_vectors.append([])
        for i in range(len(l_index_2)):
            for j in range(len(l_index_2)):
                if l_index_2[i][0] == l_index_2[j][0] and j not in l_clusters_1[l_index_2[i][0]]:
                    l_clusters_1[l_index_2[i][0]].append(j)
                    l_clusters_2[l_index_2[i][0]].append(l_2[j])
                    l_clusters_vectors[l_index_2[i][0]].append(l_vectors[j])
        iteration += 1
    print(l_clusters_vectors)
    print(l_clusters_1)
    print(l_clusters_2)
    tmp = line.split('\n')
    mylabel.append(int(tmp[0]))
file_to_read.close()

file_to_read = open(sentence_path, 'r')
file_content = file_to_read.readlines()
file_to_read.close()
index = 0
for line in file_content:
    tmp = line.split('\n')
    tmp = tmp[0]
    tmp = normalizer.normalize(tmp)
    # print(tmp)
    # print(sent_tokenize(tmp))
    word_tokenized = word_tokenize(tmp)
    # print(word_tokenized)
    labeledSent = TaggedDocument(words=word_tokenized, tags=[index])
    sentences.append(labeledSent)
    index += 1

num_features = 100
min_word_count = 5
context = 8
num_workers = 4

print("Training model...")
model = Doc2Vec(sentences, workers=num_workers, size=num_features,
                min_count=min_word_count, window=context)
print("model Trained.")
for epoch in range(num_Of_epoch):
    model.train(sentences)
def sentences(file="simple_text"):
    normalizer = Normalizer()
    for line in open(file, "r", encoding="utf-8").readlines():
        # normalize each line, then yield the tokens of each sentence in it
        for sent in sent_tokenize(normalizer.normalize(line)):
            yield word_tokenize(sent)
hamshahri = HamshahriReader()
normalizer = Normalizer()
tagger = POSTagger()
parser = DependencyParser(tagger=tagger)
extractor = InformationExtractor()
texts = []
output = open('informations.txt', 'w')

for text in Bar(max=310000).iter(hamshahri.texts()):
    texts.append(normalizer.normalize(text))
    if len(texts) <= 1000:
        continue

    sentences = []
    for text in texts:
        for sentence in sent_tokenize(text):
            words = word_tokenize(sentence)
            if len(words) >= 3:
                sentences.append(words)
    texts = []

    tagged = tagger.batch_tag(sentences)
    parsed = parser.tagged_batch_parse(tagged)

    for sentence in parsed:
        # print('*', *[node['word'] for node in sentence.nodelist if node['word']], file=output)
        for information in extractor.extract(sentence):
            print(*information, sep=' - ', file=output)
        print(file=output)
    # row_sums[row_sums.nonzero()]).sum() / len(row_sums[row_sums.nonzero()])
    # print labels
    # print confusion_matrix
    return precision


if __name__ == '__main__':
    rd = HamshahriReader(config.corpora_root)
    counter = Counter()
    docs = []
    normalizer = Normalizer()
    stemmer = Stemmer()
    for doc in rd.docs(count=config.documents_count):
        doc['text'] = normalizer.normalize(doc['text'])
        doc['words'] = [stemmer.stem(word) for word in word_tokenize(doc['text'])]
        counter.update([doc['cat']])
        docs.append(doc)

    print(counter)

    all_words = []
    for doc in docs:
        all_words.extend(doc['words'])

    dist = nltk.FreqDist(word for word in all_words)
    word_features = dimension_reduction(all_words, dist)
    print(len(word_features) / float(len(all_words)) * 100.0)

    features_set = [(doc_features(doc, word_features), doc['cat']) for doc in docs]
    # train_set, test_set = features_set[:len(docs)/2], features_set[len(docs)/2:len(docs)]