def score(self, sentences):
    # Predict
    pos, neg, neu = 0, 0, 0
    stemmer = Stemmer()
    classifier = self.__get_model()
    normalizer = Normalizer()
    sentences = sent_tokenize(sentences)
    for sentence in sentences:
        sentence = normalizer.normalize(sentence)
        words = word_tokenize(sentence)
        for word in words:
            word = stemmer.stem(word)  # classify the stemmed form
            class_result = classifier.classify(self.__word_feats(word))
            if class_result == 'neg':
                neg = neg + 1
            if class_result == 'pos':
                pos = pos + 1
            if class_result == 'neu':
                neu = neu + 1
    positive_sentiment = str(float(pos) / len(words))
    # print('Positive: ' + positive_sentiment)
    neutral_sentiment = str(float(neu) / len(words))
    # print('Neutral: ' + neutral_sentiment)
    negative_sentiment = str(-float(neg) / len(words))
    # print('Negative: ' + negative_sentiment)
    total_sentiment = (float(positive_sentiment) + float(negative_sentiment)) / 2
    # print('Total (Avg): ' + str(total_sentiment))
    return total_sentiment
def stem_data(dat):
    normalizer = hazm.Normalizer()
    dat = normalizer.normalize(dat)
    sent = hazm.sent_tokenize(dat)
    words = []
    for s in sent:
        tagged = list(tagger.tag(hazm.word_tokenize(s)))
        new_tag = list(tagged)
        for token in tagged:
            if token[0] in stop_words:
                new_tag.remove(token)
        lemmatizer = hazm.Lemmatizer()
        for token in new_tag:
            stemmed = lemmatizer.lemmatize(token[0], pos=token[1])
            stemmer = hazm.Stemmer()
            stemmed = stemmer.stem(stemmed)
            if len(stemmed) > 0 and ('#' not in stemmed):
                words.append(stemmed)
    return words
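stem_data relies on a module-level tagger and stop_words that are not shown in the snippet; a minimal sketch of that setup, with a placeholder model path and stop-word set:

# Hypothetical setup assumed by stem_data(); the model path and stop-word set are placeholders.
import hazm

tagger = hazm.POSTagger(model='resources/postagger.model')
stop_words = {'و', 'به', 'از', 'که', 'را'}

print(stem_data('کتاب‌های زیادی را خوانده بودند.'))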
def __iter__(self):
    wiki = WikipediaReader(fawiki_dump=self.dump_file)
    for doc in wiki.docs():
        sentences = sent_tokenize(doc['text'])
        for sentence in sentences:
            # Apply any preprocessing here, before yielding
            yield tokenizer.tokenize(sentence)
def stremme(val):
    Log.logger.info('Data stemmed by hazm package')
    # words = [[stemmer.stem(word) for word in word_tokenize(sentence)]
    #          for sentence in sent_tokenize(val)]
    words = [[ps.run(word) for word in word_tokenize(sentence)]
             for sentence in sent_tokenize(val)]
    words = words[0]  # only the first sentence's tokens are kept
    val = ' '.join(words)
    return val
def _get_summarize(self, num_sentences):
    words = [
        word for word in self.base_words
        if word not in stopwords.words('persian')
    ]
    word_frequencies = FreqDist(words)
    # items() is not sliceable in Python 3; most_common() keeps the original intent
    most_frequent_words = [
        pair[0] for pair in word_frequencies.most_common(100)
    ]
    actual_sentences = sent_tokenize(self.input)
    output_sentences = []
    for word in most_frequent_words:
        for i in range(0, len(self.working_sentences)):
            if (word in self.working_sentences[i]
                    and actual_sentences[i] not in output_sentences):
                output_sentences.append(actual_sentences[i])
                break
            if len(output_sentences) >= num_sentences:
                break
        if len(output_sentences) >= num_sentences:
            break
    return self._reorder_sentences(output_sentences)
def extract_metadata(self, tweet):
    important_words = []
    syms = []
    hashtags = []
    content_len = 0
    content = self.normalizer.normalize(tweet['content'])
    if 'های وب' in content:
        syms.append('های_وب')
    sentences = sent_tokenize(content)
    for sentence in sentences:
        sentence = sentence.translate(str.maketrans('', '', self.punctuations))
        words = word_tokenize(sentence)
        content_len += len(words)
        sent_syms, sent_hashs = self.get_symbols(words)
        syms += sent_syms
        hashtags += sent_hashs
        tags = self.tagger.tag(words)
        verbs = [word for (word, role) in tags if role == 'V']
        filtered_words = [word.replace('#', '') for word in words
                          if word.replace('#', '') not in self.stop_words
                          and word.replace('#', '') not in verbs
                          and set(word.replace('#', '')).intersection(self.persian_alphabet)
                          and len(word.replace('#', '')) > 1]
        important_words += filtered_words
    syms = list(set(syms))
    hashtags = list(set(hashtags))
    bigrams = self.get_ngrams(important_words, 2)
    trigrams = self.get_ngrams(important_words, 3)
    candidate_words = hashtags + syms + important_words + bigrams + trigrams
    keywords = self.get_keywords(candidate_words, content_len)
    return keywords, syms, hashtags
def evaluate_summarizer(clf, dataset, used_features, remove_stopwords=False):
    rouge = Rouge()
    empty_score = {
        'rouge-1': {'p': 0, 'f': 0, 'r': 0},
        'rouge-2': {'p': 0, 'f': 0, 'r': 0},
        'rouge-l': {'p': 0, 'f': 0, 'r': 0}
    }
    total_scores = {
        'rouge-1': {'p': 0, 'f': 0, 'r': 0},
        'rouge-2': {'p': 0, 'f': 0, 'r': 0},
        'rouge-l': {'p': 0, 'f': 0, 'r': 0}
    }
    avg_scores = empty_score
    total_summaries = 0
    #diff_summs = 0
    for key in dataset:
        total_summaries += 1
        text = dataset[key]['text']
        gold_summaries = dataset[key]['summaries']
        best_score = empty_score
        for ref_key in gold_summaries:
            ref = gold_summaries[ref_key]
            ref_len = len(hazm.sent_tokenize(ref))
            if remove_stopwords:
                ref = farsi.remove_stop_words_and_puncs(ref)
            summary = summ(text, clf, key[4:6], used_features, ref_len)
            lines = [s + "\n\n" for s in summary]
            summary = " ".join(summary)
            if remove_stopwords:
                summary = farsi.remove_stop_words_and_puncs(summary)
            #if len(summary) != len(ref):
            #    diff_summs += 1
            if len(summary) == 0:
                continue
            try:
                scores = rouge.get_scores(ref, summary)[0]
            except:
                print(ref)
                print(summary)
                o = 1
                o += 1
            """f_file = open('/tmp/summaries/' + ref_key + str(scores["rouge-1"]["f"]) + '.txt', '+w')
            f_file.writelines(lines)
            f_file.close()"""
            best_score = best_rouge_f(best_score, scores)
        for test_type in best_score:
            for param in best_score[test_type]:
                total_scores[test_type][param] += best_score[test_type][param]
    total_docs = len(dataset)
    for test_type in total_scores:
        for param in total_scores[test_type]:
            avg_scores[test_type][param] = total_scores[test_type][param] / total_summaries
    return avg_scores
def s_normal(val):
    words = []
    for sentence in sent_tokenize(val):
        for word in word_tokenize(sentence):
            end = word.find('#')
            if end == -1:
                end = len(word)
            words.append(word[:end])
    val = ' '.join(words)
    return val
def tokenize(paragraph, wanted_list):
    normal = Normalizer(remove_extra_spaces=True,
                        punctuation_spacing=True,
                        persian_style=False,
                        persian_numbers=False,
                        remove_diacritics=False,
                        affix_spacing=False,
                        token_based=False)
    for sentence in sent_tokenize(normal.normalize(paragraph)):
        wanted_list.append(sentence)
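An illustrative call of the tokenize helper above; the paragraph text and result list are made up:

# Example use; collects one normalized sentence string per list entry.
collected = []
tokenize('این جمله اول است. این جمله دوم است.', collected)
print(collected)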
def do_tokenize(text: str, **kwargs) -> typing.List[typing.List[Token]]:
    """Normalize, tokenize, and recognize part of speech"""
    sentences_tokens = []
    sentences = hazm.sent_tokenize(normalizer.normalize(text))
    for sentence in sentences:
        sentence_tokens = []
        for word, pos in tagger.tag(hazm.word_tokenize(sentence)):
            sentence_tokens.append(Token(text=word, pos=pos))
        sentences_tokens.append(sentence_tokens)
    return sentences_tokens
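do_tokenize assumes a module-level normalizer, tagger, and a Token type that are not part of the snippet; one possible stand-in setup (the dataclass and model path are assumptions):

# Hypothetical globals for do_tokenize(); Token is a stand-in dataclass
# and the POS tagger model path is a placeholder.
import typing
from dataclasses import dataclass
import hazm

@dataclass
class Token:
    text: str
    pos: str

normalizer = hazm.Normalizer()
tagger = hazm.POSTagger(model='resources/postagger.model')

print(do_tokenize('او به خانه رفت. هوا خوب بود.'))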
def hazmtoalpheios(word, uri):
    wordslist = etree.Element("words")
    normalizer = Normalizer()
    data = normalizer.normalize(word)
    sentences = sent_tokenize(data)
    words = []
    for sentence in sentences:
        words.extend(word_tokenize(sentence))
    analyses = []
    for item in words:
        stemmer = Stemmer()
        wordstem = stemmer.stem(item)
        lemmatizer = Lemmatizer()
        wordlema = lemmatizer.lemmatize(item)
        if '#' in wordlema:
            wordlema, garbage = wordlema.split("#")
        tagger = POSTagger(model=os.path.join(model_path, "postagger.model"))
        wordtagged = tagger.tag([item])  # tag() expects a list of tokens
        wordpofs = wordtagged[0][1]
        wordpofs = maptohazm(wordpofs)
        # a better way to do this would be to create a Python class
        # to formalize the abstraction
        analysis = {}
        analysis['engine'] = 'hazm'
        analysis['uri'] = uri
        analysis['form'] = {}
        analysis['form']['text'] = item
        analysis['form']['lang'] = 'per'
        analysis['entries'] = []
        entry = {}
        entry['dict'] = {}
        entry['dict']['hdwd'] = {}
        entry['dict']['hdwd']['lang'] = 'per'
        entry['dict']['hdwd']['text'] = wordstem
        entry['infls'] = []
        infl = {}
        infl['stem'] = {}
        infl['stem']['text'] = wordstem
        infl['stem']['lang'] = 'per'
        infl['pofs'] = {}
        if wordpofs:
            infl['pofs']['order'] = str(wordpofs[1])
            infl['pofs']['text'] = wordpofs[0]
        entry['infls'].append(infl)
        analysis['entries'].append(entry)
        analyses.append(analysis)
    return analyses
def texts(self, categories={'Politics'}, limit=None):
    docs = self.hamshahri.docs()
    print('start reading corpus...')
    count = 0
    texts = []
    for doc in docs:
        if limit is not None and count == limit:
            break
        if len(categories.intersection(set(doc["categories_en"]))) > 0:
            count += 1
            for sent in sent_tokenize(doc['text']):
                if len(sent) <= 1:
                    continue
                texts.append([word for word in word_tokenize(sent)
                              if word not in self.stopwords and len(word) > 1])
    return texts
def readTrainTestFiles(self):
    """Load all train and test data into lists and preprocess them"""
    self.firstClassTrainList = []
    self.secondClassTrainList = []
    self.firstClassTestList = []
    self.secondClassTestList = []
    for fileName in self.firstClassTrainFiles:
        self.firstClassTrainList.append(
            self.preProcessing(open(fileName, 'r').read()))
    for fileName in self.secondClassTrainFiles:
        self.secondClassTrainList.append(
            self.preProcessing(open(fileName, 'r').read()))
    for fileName in self.firstClassTestFiles:
        sentences = hazm.sent_tokenize(open(fileName, 'r').read())
        for sentence in sentences:
            self.firstClassTestList.append(self.preProcessing(sentence))
    for fileName in self.secondClassTestFiles:
        sentences = hazm.sent_tokenize(open(fileName, 'r').read())
        for sentence in sentences:
            self.secondClassTestList.append(self.preProcessing(sentence))
def calculate_embeding(datatype):
    c = 0
    X_Word_embeding = []
    X_LSTM_1 = []
    X_LSTM_2 = []
    X_avg = []
    X_texts = []
    hd5_capacity = 1000
    with open('testdata/{0}/cleaned_captions.txt'.format(datatype)) as f:
        # with open('testdata/cluster_2.txt') as f:
        # with open('testdata/cleaned_captions.txt') as f:
        while True:
            c = c + 1
            print(c)
            # if c == 50:
            #     break
            sample = f.readline()
            if not sample:  # stop at end of file instead of looping forever
                break
            sents = sent_tokenize(sample)
            sents_tokens = [word_tokenize(sent) for sent in sents]
            try:
                word_encoder = e.sents2elmo(sents_tokens, 0)
                LSTM_hidden_1 = e.sents2elmo(sents_tokens, 1)
                LSTM_hidden_2 = e.sents2elmo(sents_tokens, 2)
                average_layers = e.sents2elmo(sents_tokens, -1)
                X_Word_embeding.append(_calculate_caption_embeding(word_encoder))
                X_LSTM_1.append(_calculate_caption_embeding(LSTM_hidden_1))
                X_LSTM_2.append(_calculate_caption_embeding(LSTM_hidden_2))
                X_avg.append(_calculate_caption_embeding(average_layers))
                # X_texts.append(sample)
                if c % hd5_capacity == 0:
                    postfix = str(int(c / hd5_capacity))
                    _save_embeddings_parts(X_Word_embeding, X_LSTM_1, X_LSTM_2,
                                           X_avg, postfix)
                    X_Word_embeding = []
                    X_LSTM_1 = []
                    X_LSTM_2 = []
                    X_avg = []
            except ZeroDivisionError:
                print(sents_tokens)
                continue
def write_to_string(input_text, label6, label41):
    #output_string6 = ""
    #output_string41 = ""
    output_string = ""
    wo_tag_text = re.sub('<[^<]+?>', '', input_text)
    wo_tag_text = re.sub(' ', '', wo_tag_text)
    sent_list = hazm.sent_tokenize(wo_tag_text)
    for sent in sent_list:
        word_list = hazm.word_tokenize(sent)
        word_tokenize_sent = ""
        for word in word_list:
            word_tokenize_sent += word + ' '
        output_string += word_tokenize_sent + "\t"
    output_string6 = output_string + "__label__" + label6 + '\n'
    output_string41 = output_string + "__label__" + label41 + '\n'
    return output_string6, output_string41
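A possible call of write_to_string with made-up labels; the HTML input is stripped before tokenization:

# Illustrative call; the labels and the HTML snippet are arbitrary.
line6, line41 = write_to_string('<p>این یک متن نمونه است.</p>', 'sport', 'football')
print(line6.strip())   # tokenized text followed by "__label__sport"
print(line41.strip())  # same text followed by "__label__football"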
def worker(identifier, skip, count):
    tagger = POSTagger()
    done = 0
    start = time.time()
    stopwords = load_stopwords()
    documents_collection = MongoClient(Settings.MONGO_CONNECTION_STRING)[
        Settings.HAMSHAHRI_DATABASE][Settings.HAMSHAHRI_COLLECTION]
    tags_collection = MongoClient(Settings.MONGO_CONNECTION_STRING)[
        Settings.TAGS_DATABASE][Settings.HAMSHAHRI_COLLECTION]
    batch_size = 50
    for batch in range(0, count, batch_size):
        hamshahri_cursor = documents_collection.find().skip(
            skip + batch).limit(batch_size)
        for doc in hamshahri_cursor:
            words = []
            sentences = sent_tokenize(doc['text'])
            sents = []
            for sentence in sentences:
                tokens = word_tokenize(sentence)
                text = [word for word in tokens if word not in stopwords]
                sents.append(text)
            tags = tagger.tag_sents(sents)
            for sent in tags:
                for word, tag in sent:
                    words.append({'word': word, "pos": tag})
            tags_collection.insert({
                "id": doc["id"],
                "categories_fa": doc["categories_fa"],
                "text": doc["text"],
                "words": words
            })
            done += 1
            #if done % 100 == 0:
            end = time.time()
            print('Worker' + str(identifier) + ': Done ' + str(done) +
                  ' out of ' + str(count) + ' in ' + ("%.2f" % (end - start)) +
                  ' sec ~ ' + ("%.2f" % (done / (end - start))) + '/sec')
            sys.stdout.flush()
def process(self, message: Message, **kwargs: Any) -> None:
    text = message.text
    for sentence_str in sent_tokenize(text):
        sentence = Sentence(sentence_str)
        tokens = word_tokenize(sentence_str)
        pos_tags = []
        if self.component_config.pos:
            pos_tags = self._pos_tagger.tag(tokens)
        for idx, token_str in enumerate(tokens):
            token = Token(text=token_str)
            if self.component_config.stemmer:
                token[TOKEN_ATTRIBUTE_STEM] = self._stemmer.stem(token_str)
            if self.component_config.lemmatizer:
                token[TOKEN_ATTRIBUTE_LEMM] = self._lemmatizer.lemmatize(token_str)
            if self.component_config.pos:
                token[TOKEN_ATTRIBUTE_POS] = pos_tags[idx][1]
            sentence.add_token(token)
        message.add_sentence(sentence)
def bow(text):
    global normalizer
    global tagger
    global stemmer
    global lemmatizer
    text = hz.sent_tokenize(normalizer.normalize(text))
    tagged = [tagger.tag(hz.word_tokenize(sent)) for sent in text]
    bag_of_words = defaultdict(int)
    for sentence in tagged:
        words = [
            lemmatizer.lemmatize(w[0]).split('#')[0] if w[1] == 'V'  # '==', not 'is', for string comparison
            else stemmer.stem(str(w[0]))
            for w in sentence
        ]
        for w in words:
            bag_of_words[w] += 1
    return bag_of_words
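bow() reads four module-level hazm objects through global statements; a minimal sketch of that setup and a call (the tagger model path is a placeholder):

# Hypothetical module-level setup for bow(); the model path is an assumption.
from collections import defaultdict
import hazm as hz

normalizer = hz.Normalizer()
tagger = hz.POSTagger(model='resources/postagger.model')
stemmer = hz.Stemmer()
lemmatizer = hz.Lemmatizer()

print(dict(bow('او به مدرسه رفت و کتاب خواند.')))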
def hazmtoalpheiosfile(data, uri):
    root = etree.Element("{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF")
    oaannotation = etree.SubElement(
        root, '{http://www.w3.org/ns/oa#}Annotation',
        {'{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about':
         'http://services.projectbamboo.org/morphology' + uri})
    oahasbody = etree.SubElement(oaannotation, '{http://www.w3.org/ns/oa#}hasBody')
    oahastarget = etree.SubElement(oaannotation, '{http://www.w3.org/ns/oa#}hasTarget')
    hasbodydesc = etree.SubElement(
        oahastarget, '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}Description',
        {'{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about': uri})
    ispartof = etree.SubElement(
        hasbodydesc, '{http://purl.org/dc/terms/}isPartOf',
        {'{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about': uri})
    source = etree.SubElement(
        hasbodydesc, '{http://purl.org/dc/terms/}source',
        {'{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource': uri})
    title = etree.SubElement(
        oaannotation, '{http://purl.org/dc/elements/1.1/}title',
        {'{http://www.w3.org/XML/1998/namespace}lang': 'eng'})
    title.text = "Morphology of " + uri
    wordslist = etree.SubElement(oahasbody, "words")  # SubElement needs a parent element
    normalizer = Normalizer()
    data = normalizer.normalize(data)
    sentences = sent_tokenize(data)
    words = []
    for sentence in sentences:
        words.extend(word_tokenize(sentence))
    for item in words:
        stemmer = Stemmer()
        wordstem = stemmer.stem(item)
        lemmatizer = Lemmatizer()
        wordlema = lemmatizer.lemmatize(item)
        if '#' in wordlema:
            wordlema, garbage = wordlema.split("#")
        tagger = POSTagger(model=os.path.join(model_path, "postagger.model"))
        wordtagged = tagger.tag([item])  # tag() expects a list of tokens
        wordpofs = wordtagged[0][1]
        word = etree.SubElement(wordslist, 'word')
        form = etree.SubElement(
            word, 'form', {'{http://www.w3.org/XML/1998/namespace}lang': 'per'})
        form.text = item
        entry = etree.SubElement(word, 'entry')
        infl = etree.SubElement(entry, 'infl')
        term = etree.SubElement(
            infl, 'term', {'{http://www.w3.org/XML/1998/namespace}lang': 'per'})
        stem = etree.SubElement(term, 'stem')
        stem.text = wordstem
        pofs = etree.SubElement(infl, 'pofs')
        pofs.text = wordpofs
    return root
def text_to_tokens(
    self, text: str
) -> typing.Iterable[typing.Tuple[typing.List[str], typing.List[Token]]]:
    """
    Process text into words and sentence tokens using hazm.

    Returns: (original_words, sentence_tokens) for each sentence
    """
    try:
        import hazm
    except ImportError:
        _LOGGER.warning("hazm is highly recommended for language 'fa'")
        _LOGGER.warning("pip install 'hazm>=0.7.0'")

        # Fall back to parent implementation
        yield from super().text_to_tokens(text)
        return

    # Load normalizer
    if not hasattr(self, "normalizer"):
        setattr(self, "normalizer", hazm.Normalizer())
    normalizer = getattr(self, "normalizer")

    # Load part of speech tagger
    if not hasattr(self, "tagger"):
        model_path = self.lang_dir / "postagger.model"
        setattr(self, "tagger", hazm.POSTagger(model=str(model_path)))
    tagger = getattr(self, "tagger")

    sentences = hazm.sent_tokenize(normalizer.normalize(text))
    for sentence in sentences:
        original_words = []
        sentence_tokens = []
        for word, pos in tagger.tag(hazm.word_tokenize(sentence)):
            original_words.append(word)
            sentence_tokens.append(
                Token(text=word,
                      features={TokenFeatures.PART_OF_SPEECH: pos}))

        yield original_words, sentence_tokens
def tok(dataTok):
    normalizer = Normalizer()
    tokenizer = WordTokenizer(join_verb_parts=False,
                              replace_links=True,
                              replace_IDs=True,
                              replace_numbers=True,
                              replace_hashtags=True)
    s = time.time()
    ij = 0
    #dataTok.apply(lambda x: dataTok1.append(sent_tokenize(x)))
    for row in dataTok:
        _sents = sent_tokenize(row)
        _sents = stop_word(_sents)
        for _sent in _sents:
            _temp = _sent.replace(".", "").replace(",", "").replace(
                "،", "").replace("؛", "").strip()
            _wrds = []
            _wrds = normalizer.normalize(_temp)
            dataTok1.append(tokenizer.tokenize(_wrds))
    print("Data: ", len(dataTok1))
    e = time.time()
    print("Tokenize Done, Time: ", e - s, " !\n")
from wordfreq import zipf_frequency

if len(sys.argv) < 2:
    print('error')
    sys.exit()

raw_text = str(sys.argv[1])
normalizer_instance = Hazm.Normalizer()
lemmatizer_instance = Hazm.Lemmatizer()
stem_finder_instance = Hazm.Stemmer()
remove_non_persian_regex = re.compile('[^آ-ی]')
raw_text = remove_non_persian_regex.sub(' ', raw_text)  # replace all non-Persian characters
normalized_text = normalizer_instance.normalize(raw_text)
sentences = Hazm.sent_tokenize(normalized_text)
result_tokens = list()
less_accurate_tokens = list()

def add_to_tokens_if_not_exists(parsed_token):
    exists = False
    for result_token in result_tokens:
        if parsed_token == result_token:
            exists = True
            break
    if not exists:
        # Part four: choose the token based on frequency, or search and score it by frequency
        freq = zipf_frequency(parsed_token, 'fa')
        if freq < 6:
            result_tokens.append(parsed_token)
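An illustrative call of the helper above; wordfreq's zipf_frequency returns scores on a roughly 0-8 scale, so the < 6 check keeps comparatively infrequent tokens:

# Example call; the token is arbitrary and only appended if its Zipf score is below 6.
add_to_tokens_if_not_exists('کتابخانه')
print(result_tokens)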
likes = int(unidecode(likes.replace(',', '')))
disLike = int(unidecode(disLike.replace(',', '')))
temp = {
    'comment': comment,
    'prod id': name,
    'price': price,
    'like': likes,
    'dislike': disLike,
    'date': date,
    'person': person,
    'buyer': buyer
}
for sent in hazm.sent_tokenize(CleanPersianText(comment)):
    comment = hazm.word_tokenize(sent)
    while len(comment) >= maxLenSent:
        temp['comment'] = ' '.join(comment[:maxLenSent])
        comment = comment[(maxLenSent - 2):]
        out = out.append(temp, ignore_index=True)
    if len(comment) > 3:
        temp['comment'] = ' '.join(comment)
        out = out.append(temp, ignore_index=True)
out.to_excel('cleaned1.xlsx')
nameDF.to_excel('Prod_spec1.xlsx')
def get_answer(self, question, tokens, labels):
    answer = {
        'type': ['4'],
        'city': [],
        'date': [],
        'time': [],
        'religious_time': [],
        'calendar_type': [],
        'event': [],
        'api_url': [''],
        'result': []
    }
    generated_sentence = ""
    is_time_asked = False
    for t in time_asked:
        if t in question:
            is_time_asked = True
    if is_time_asked:
        return self.time.get_answer(question, tokens, labels)
    date_list = []
    date_list_jalali = []
    exportdate = export_date(question, tokens, labels, True)
    events = []
    which_date_is_event = []
    for i, d in enumerate(exportdate):
        if d[0]:
            date_list.append(d[0])
        if (not d[1][0]) and (not d[1][1]) and (type(d[1][2]) != bool):
            events.append(d[1][2])
            which_date_is_event.append(i)
    d_n = len(date_list)
    today = datetime.datetime.today()
    no_date = False
    if d_n == 0:
        date_list = [today]
        d_n = 1
        no_date = True
    date_list = unique_without_sort(date_list)
    d_n = len(date_list)
    date_list_jalali = []
    for d in date_list:
        j = gregorian_to_jalali(d.year, d.month, d.day)
        date_list_jalali.append(format_jalali_date(j))
    answer["date"] = date_list_jalali
    event_list = events
    answer["event"] = list(event_list)
    self.bii = concatenate_bi(tokens, labels, "B_DAT", "I_DAT")
    if no_date:
        answer["result"] = date_list_jalali
        generated_sentence = "امروز، {} است".format(
            tr_single_date(date_list[0], force_date=True))
    else:
        if d_n == 1:
            asingle, generated_sentence = self.get_single_answer(
                question, answer, date_list, events)
            if asingle != None:
                answer = asingle
            else:
                answer["result"] = date_list_jalali
                trsd = tr_single_date(date_list[0], True)
                if self.bii:
                    if date_list[0].date() >= today.date():
                        generated_sentence = "{}، {} میباشد".format(
                            " ".join(self.bii), trsd)
                    else:
                        generated_sentence = "{}، {} بوده است".format(
                            " ".join(self.bii), trsd)
                else:
                    if date_list[0].date() >= today.date():
                        generated_sentence = "تاریخ داده شده {} است".format(trsd)
                    else:
                        generated_sentence = "تاریخ داده شده {} بوده".format(trsd)
        else:
            answer["result"] = []
            tokenize_questions = hazm.sent_tokenize(question)
            if len(tokenize_questions) == 1:
                tokenize_questions = question.split(" و ")
            if d_n == len(tokenize_questions):
                generated_sentence = ""
                if d_n != len(events):
                    s = 0
                    for i, (d, tk) in enumerate(zip(date_list, tokenize_questions)):
                        if i in which_date_is_event:
                            n_answer, n_generated_sentence = self.get_single_answer(
                                tk, answer, [d],
                                [events[which_date_is_event[s]]],
                                self.bii[i] if len(self.bii) == d_n else None)
                            s += 1
                        else:
                            n_answer, n_generated_sentence = self.get_single_answer(
                                tk, answer, [d], None,
                                self.bii[i] if len(self.bii) == d_n else None)
                        if n_answer != None:
                            answer = n_answer
                            if generated_sentence:
                                generated_sentence = generated_sentence + " و " + n_generated_sentence
                            else:
                                generated_sentence = n_generated_sentence
                        else:
                            n_answer, n_generated_sentence = self.get_single_answer(
                                question, answer, [d], events,
                                self.bii[i] if len(self.bii) == d_n else None)
                            if n_answer != None:
                                answer = n_answer
                                if generated_sentence:
                                    generated_sentence = generated_sentence + " و " + n_generated_sentence
                                else:
                                    generated_sentence = n_generated_sentence
                            else:
                                n_generated_sentence = "تاریخ داده شده {} میباشد".format(
                                    tr_single_date(d))
                                j = gregorian_to_jalali(d.year, d.month, d.day)
                                answer["result"].append(format_jalali_date(j))
                                if generated_sentence:
                                    generated_sentence = generated_sentence + " و " + n_generated_sentence
                                else:
                                    generated_sentence = n_generated_sentence
                else:
                    for i in range(d_n):
                        n_answer, n_generated_sentence = self.get_single_answer(
                            tokenize_questions[i], answer, [date_list[i]],
                            [events[i]],
                            self.bii[i] if len(self.bii) == d_n else None)
                        if n_answer != None:
                            answer = n_answer
                            if generated_sentence:
                                generated_sentence = generated_sentence + " و " + n_generated_sentence
                            else:
                                generated_sentence = n_generated_sentence
                        else:
                            n_answer, n_generated_sentence = self.get_single_answer(
                                question, answer, [date_list[i]], [events[i]],
                                self.bii[i] if len(self.bii) == d_n else None)
                            if n_answer != None:
                                answer = n_answer
                                if generated_sentence:
                                    generated_sentence = generated_sentence + " و " + n_generated_sentence
                                else:
                                    generated_sentence = n_generated_sentence
                            else:
                                j = gregorian_to_jalali(
                                    date_list[i].year, date_list[i].month, date_list[i].day)
                                answer["result"].append(format_jalali_date(j))
                                n_generated_sentence = "تاریخ داده شده {} است".format(
                                    tr_single_date(date_list[i]))
                                if generated_sentence:
                                    generated_sentence = generated_sentence + " و " + n_generated_sentence
                                else:
                                    generated_sentence = n_generated_sentence
            else:
                # enumerate so that self.bii[i] refers to the current date in this branch
                for i, d in enumerate(date_list):
                    n_answer, n_generated_sentence = self.get_single_answer(
                        question, answer, [d], events,
                        self.bii[i] if len(self.bii) == d_n else None)
                    if n_answer != None:
                        answer = n_answer
                        if generated_sentence:
                            generated_sentence = generated_sentence + " و " + n_generated_sentence
                        else:
                            generated_sentence = n_generated_sentence
    return answer, cleaning(generated_sentence)
def convertVWDataFormat(self, min, max):
    firstCounter = 0
    secondCounter = 0
    firstClassTrainSentences = []
    secondClassTrainSentences = []
    firstClassTestSentences = []
    secondClassTestSentences = []
    fileTrain = open("Train.txt", "w")
    fileTest = open("Test.txt", "w")
    for fileName in self.firstClassTrainFiles:
        sentences = hazm.sent_tokenize(open(fileName, 'r').read())
        for s in sentences:
            firstClassTrainSentences.append(s)
    for fileName in self.secondClassTrainFiles:
        sentences = hazm.sent_tokenize(open(fileName, 'r').read())
        for s in sentences:
            secondClassTrainSentences.append(s)
    for fileName in self.firstClassTestFiles:
        sentences = hazm.sent_tokenize(open(fileName, 'r').read())
        for s in sentences:
            firstClassTestSentences.append(s)
    for fileName in self.secondClassTestFiles:
        sentences = hazm.sent_tokenize(open(fileName, 'r').read())
        for s in sentences:
            secondClassTestSentences.append(s)
    while firstCounter < len(firstClassTrainSentences) and secondCounter < len(secondClassTrainSentences):
        firstClassTrainSentences[firstCounter] = self.preProcessingVW(
            firstClassTrainSentences[firstCounter])
        secondClassTrainSentences[secondCounter] = self.preProcessingVW(
            secondClassTrainSentences[secondCounter])
        if len(firstClassTrainSentences[firstCounter]) >= 0 and len(
                secondClassTrainSentences[secondCounter]) >= 0:
            fileTrain.write(
                str(max) + " |" + firstClassTrainSentences[firstCounter] + "\n")
            fileTrain.write(
                str(min) + " |" + secondClassTrainSentences[secondCounter] + "\n")
        firstCounter += 1
        secondCounter += 1
    firstCounter = 0
    secondCounter = 0
    while firstCounter < len(firstClassTestSentences) and secondCounter < len(secondClassTestSentences):
        firstClassTestSentences[firstCounter] = self.preProcessingVW(
            firstClassTestSentences[firstCounter])
        secondClassTestSentences[secondCounter] = self.preProcessingVW(
            secondClassTestSentences[secondCounter])
        if len(firstClassTestSentences[firstCounter]) >= 0 and len(
                secondClassTestSentences[secondCounter]) >= 0:
            fileTest.write(
                str(max) + " |" + firstClassTestSentences[firstCounter] + "\n")
            fileTest.write(
                str(min) + " |" + secondClassTestSentences[secondCounter] + "\n")
        firstCounter += 1
        secondCounter += 1
def normalize_user_text(text):
    pe = PersianEditor()
    text = pe.cleanup(text)
    my_text = put_space_punc(text)
    textSplited = sent_tokenize(my_text)
    return textSplited
def mallet(x):
    num_features_mallet = int(len(x.firstClassTrainDictionary) / 20)
    mallet_features = []
    first_class_sentences = []
    second_class_sentences = []
    for i in x.firstClassAllFiles:
        with open(i, "r") as fileName:
            sent = hazm.sent_tokenize(fileName.read())
            for i in sent:
                first_class_sentences.append(i)
    for i in x.secondClassAllFiles:
        with open(i, "r") as fileName:
            sent = hazm.sent_tokenize(fileName.read())
            for i in sent:
                second_class_sentences.append(i)
    proportionClasses = len(first_class_sentences) / len(second_class_sentences)
    print("nesbat : ", len(first_class_sentences) / len(second_class_sentences))
    for i in x.effectiveFeatures1stClass[0:int(num_features_mallet * proportionClasses / (1 + proportionClasses))]:
        mallet_features.append(i[0])
    for i in x.effectiveFeatures2stClass[0:int(num_features_mallet / (1 + proportionClasses))]:
        mallet_features.append(i[0])
    # print(int(num_features_mallet / (1 + proportionClasses)),
    #       int(num_features_mallet * proportionClasses / (1 + proportionClasses)))
    emam = open("emam.txt", "w")
    shah = open("shah.txt", "w")
    # create mallet format file and add features to them
    with open("mallet-2.0.8/mallet.txt", "w") as f:
        first_counter = 0
        second_counter = 0
        while first_counter < len(first_class_sentences) or second_counter < len(second_class_sentences):
            if first_counter < len(first_class_sentences):
                f.write(str(first_counter) + " ")
                f.write("emam ")
                emam.write(str(first_counter) + " " +
                           first_class_sentences[first_counter] + "\n")
                for j in mallet_features:
                    if j in first_class_sentences[first_counter]:
                        f.write(j)
                        f.write(" ")
                f.write("len:" + str(len(first_class_sentences[first_counter])) + " ")
                f.write("hasNum:" + str(hasNumbers(first_class_sentences[first_counter])) + " ")
                f.write("\n")
                first_counter += 1
            if second_counter < len(second_class_sentences):
                f.write(str(second_counter) + " ")
                f.write("shah ")
                shah.write(str(second_counter) + " " +
                           second_class_sentences[second_counter] + "\n")
                for j in mallet_features:
                    if j in second_class_sentences[second_counter]:
                        f.write(j)
                        f.write(" ")
                f.write("len:" + str(len(second_class_sentences[second_counter])) + " ")
                f.write("hasNum:" + str(hasNumbers(second_class_sentences[second_counter])) + " ")
                f.write("\n")
                second_counter += 1
def singel_char(val):
    words = [[word for word in word_tokenize(sentence) if len(word) > 1]
             for sentence in sent_tokenize(val)]
    words = words[0]
    val = ' '.join(words)
    return val
def remove_stop_words(val):
    Log.logger.info('Stop words removed')
    stops = Constant.STOP_WORDS
    words = [[word for word in word_tokenize(sentence) if word not in stops]
             for sentence in sent_tokenize(val)]
    words = words[0]
    val = ' '.join(words)
    return val
def lemma(val):
    words = [[lemmatizer.lemmatize(word) for word in word_tokenize(sentence)]
             for sentence in sent_tokenize(val)]
    words = words[0]
    val = ' '.join(words)
    return val
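The helpers lemma, s_normal, and singel_char above appear to operate on single sentences; a hedged sketch of chaining them, assuming lemmatizer is a module-level hazm.Lemmatizer:

# Illustrative chaining of the helpers defined above; the input sentence is made up.
import hazm
from hazm import sent_tokenize, word_tokenize

lemmatizer = hazm.Lemmatizer()

text = 'کتاب‌های زیادی خوانده بودند'
text = lemma(text)        # lemmatize each token (verbs may come back as 'past#present')
text = s_normal(text)     # keep only the part of each token before '#'
text = singel_char(text)  # drop one-character tokens
print(text)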
def sentences(file="simple_text"): normalizer = Normalizer() for line in open(file, "r", encoding="utf-8").readlines(): for sent in sent_tokenize(line): yield word_tokenize(line)
hamshahri = HamshahriReader()
normalizer = Normalizer()
tagger = POSTagger()
parser = DependencyParser(tagger=tagger)
extractor = InformationExtractor()
texts = []

output = open('informations.txt', 'w')
for text in Bar(max=310000).iter(hamshahri.texts()):
    texts.append(normalizer.normalize(text))
    if len(texts) <= 1000:
        continue

    sentences = []
    for text in texts:
        for sentence in sent_tokenize(text):
            words = word_tokenize(sentence)
            if len(words) >= 3:
                sentences.append(words)
    texts = []

    tagged = tagger.batch_tag(sentences)
    parsed = parser.tagged_batch_parse(tagged)

    for sentence in parsed:
        # print('*', *[node['word'] for node in sentence.nodelist if node['word']], file=output)
        for information in extractor.extract(sentence):
            print(*information, sep=' - ', file=output)
        print(file=output)
sample_set2 = readData(file2)

# Normalizing data (reassigning the loop variable would not update the list,
# so the normalized values are collected explicitly)
n = Normalizer()
sample_set1 = [n.normalize(sample) for sample in sample_set1]
n = Normalizer()
sample_set2 = [n.normalize(sample2) for sample2 in sample_set2]

# SENTENCE TOKENIZATION
all_sentences1 = []
for sample in sample_set1:
    sentences1 = sent_tokenize(sample)
    all_sentences1.extend(sentences1)
# print(all_sentences)

all_sentences2 = []
for sample in sample_set2:
    sentences2 = sent_tokenize(sample)
    all_sentences2.extend(sentences2)

size2 = len(all_sentences2)
size1 = len(all_sentences1)
for k in range(0, size1):
    fileo1.write(all_sentences1[k] + "\n")
for i in range(0, size2):