import os
import pickle

from tqdm import tqdm

# NOTE: data_path, save_path, set_threshold(), parse_ip_port(), separate() and the
# encrypt module are assumed to be provided by the surrounding project files
# (is_encrypt.py, separator.py); they are not defined in this excerpt.


def object_separate(stat_dict, file_name):
    """Determine encryption and separate objects (see is_encrypt.py, separator.py)."""
    # count_result: count_result[obj] counts the object classes returned by separate(),
    # [6] counts ICMP/unparsed packets, [7] encrypted payloads, [8] plain payloads.
    count_result = [0, 0, 0, 0, 0, 0, 0, 0, 0]
    set_threshold(stat_dict)
    for fname in tqdm(file_name, total=len(file_name)):
        with open(rf"{data_path}{os.sep}{fname}", "rb") as file:
            pk = pickle.load(file)

        # Separation from separate.py
        parsed = parse_ip_port(pk)
        if parsed != -1:
            obj = separate(parsed, stat_dict)
            # Can separate
            if obj <= 3:
                if encrypt.encryption_determinate(pk[-1]):
                    place = "encrypt"
                    count_result[7] += 1
                else:
                    place = "plain"
                    count_result[8] += 1
                with open(rf"{save_path}{os.sep}{place}{os.sep}{fname}", "wb") as p_file:
                    pickle.dump(pk + [obj], p_file)
            count_result[obj] += 1
        # ICMP
        else:
            count_result[6] += 1
    return count_result
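# Usage sketch (illustrative only, not part of the project): how object_separate()
# might be driven. build_statistics() and the directory listing are hypothetical
# stand-ins for however stat_dict and the file list are produced upstream.
if __name__ == "__main__":
    stat_dict = build_statistics()        # hypothetical helper producing the threshold statistics
    file_names = os.listdir(data_path)    # pickled packet objects to classify
    counts = object_separate(stat_dict, file_names)
    print(f"encrypted: {counts[7]}, plain: {counts[8]}, ICMP/unparsed: {counts[6]}")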
import re

# NOTE: separator, czech_stemmer, tokenize(), remove_stop_words(), pos_tag() and
# textrank() are assumed to come from the surrounding summarizer modules; they are
# not defined in this excerpt.


def summarize(text):
    """TextRank-based variant of summarize()."""
    # SPLIT TO PARAGRAPHS
    pre_paragraphs = text.split('\n')
    paragraphs = []
    for i, p in enumerate(pre_paragraphs):
        if not re.match(r'^\s*$', p) and (i == len(pre_paragraphs) - 1
                                          or re.match(r'^\s*$', pre_paragraphs[i + 1])):
            paragraphs.append(p)

    # SPLIT TO SENTENCES
    sentences = separator.separate(text)
    print(f'Num of sentences: {len(sentences)}')
    for i, s in enumerate(sentences):
        print(f'#{i + 1}: {s}')

    # TOKENIZE
    stem = False
    if stem:
        tokenized_sentences = [[czech_stemmer.cz_stem(word, aggressive=True)
                                for word in sentence]
                               for sentence in tokenize(sentences)]
    else:
        tokenized_sentences = tokenize(sentences)

    # REMOVE STOPWORDS
    tokenized_sentences_without_stopwords = remove_stop_words(
        tokenized_sentences, keep_case=False)
    sentences_without_stopwords_case = remove_stop_words(
        sentences, keep_case=True, is_tokenized=False, return_tokenized=False)
    print('===Sentences without stopwords===')
    for i, s in enumerate(tokenized_sentences_without_stopwords):
        print(f'''#{i + 1}: {' '.join(s)}''')
    print('===Sentences without stopwords CASE===')
    for i, s in enumerate(sentences_without_stopwords_case):
        print(f'''#{i + 1}: {s}''')

    # POS-TAG
    tagged_sentences = pos_tag(sentences_without_stopwords_case)
    print('=====Tagged_sentences=====')
    for i, s in enumerate(tagged_sentences):
        print(f'''#{i + 1}: {s}''')

    summary = ''
    counter = 0
    summary_length = max(min(round(len(sentences) / 4), 15), 3)  # length between 3-15 sentences
    ranked_sentence_indexes = textrank(tokenized_sentences_without_stopwords,
                                       stopwords=[], top_n=summary_length)
    print(f'ranked_sentence_indexes: {ranked_sentence_indexes}')

    # always add the 1st sentence
    summary += f'{sentences[0]}\n'
    counter += 1
    if 0 in ranked_sentence_indexes:  # guard: sentence 0 may not be among the ranked indexes
        ranked_sentence_indexes.remove(0)

    # also add the 2nd sentence if it ranks in the top 50 %
    if 1 in ranked_sentence_indexes[:len(ranked_sentence_indexes) // 2]:
        summary += f'{sentences[1]}\n'
        counter += 1
        ranked_sentence_indexes.remove(1)

    for sentence_index in sorted(ranked_sentence_indexes[:summary_length - counter]):
        if counter == summary_length:
            break
        summary += f'{sentences[sentence_index]}\n'
        counter += 1

    return summary
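# Usage sketch (illustrative only): the TextRank-based summarize() above expects
# plain text with newline-separated paragraphs; the input file name is a placeholder.
if __name__ == "__main__":
    with open("article.txt", encoding="utf-8") as f:   # placeholder article file
        print(summarize(f.read()))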
import re

import numpy as np

# NOTE: separator, czech_stemmer, tokenize(), remove_stop_words(), pos_tag(), the
# individual *_feature() scorers, rbm and textrank are assumed to come from the
# surrounding summarizer modules; they are not defined in this excerpt.


def summarize(text):
    """RBM feature-based variant of summarize()."""
    # SPLIT TO PARAGRAPHS
    pre_paragraphs = text.split('\n')
    paragraphs = []
    for i, p in enumerate(pre_paragraphs):
        if not re.match(r'^\s*$', p) and (i == len(pre_paragraphs) - 1
                                          or re.match(r'^\s*$', pre_paragraphs[i + 1])):
            paragraphs.append(p)
    # print(f'Num of paragraphs: {len(paragraphs)}')
    # for i, p in enumerate(paragraphs):
    #     print(f'par#{i+1}: {p}')

    # SPLIT TO SENTENCES
    sentences = separator.separate(text)
    print(f'Num of sentences: {len(sentences)}')
    for i, s in enumerate(sentences):
        print(f'#{i + 1}: {s}')

    # TOKENIZE
    stem = False
    if stem:
        tokenized_sentences = [[czech_stemmer.cz_stem(word, aggressive=False)
                                for word in sentence]
                               for sentence in tokenize(sentences)]
    else:
        tokenized_sentences = tokenize(sentences)

    # REMOVE STOPWORDS
    tokenized_sentences_without_stopwords = remove_stop_words(
        tokenized_sentences, keep_case=False)
    sentences_without_stopwords_case = remove_stop_words(
        sentences, keep_case=True, is_tokenized=False, return_tokenized=False)
    print('===Sentences without stopwords===')
    for i, s in enumerate(tokenized_sentences_without_stopwords):
        print(f'''#{i + 1}: {' '.join(s)}''')
    print('===Sentences without stopwords CASE===')
    for i, s in enumerate(sentences_without_stopwords_case):
        print(f'''#{i + 1}: {s}''')

    # POS-TAG
    tagged_sentences = pos_tag(sentences_without_stopwords_case)
    print('=====Tagged_sentences=====')
    for i, s in enumerate(tagged_sentences):
        print(f'''#{i + 1}: {s}''')

    # 1. THEMATICITY FEATURE
    thematicity_feature_scores = thematicity_feature(tokenized_sentences_without_stopwords)
    # 2. SENTENCE POSITION FEATURE - NOTE: performs poorly
    sentence_position_scores = sentence_position_feature(len(sentences))
    # 3. SENTENCE LENGTH FEATURE
    sentence_length_scores = sentence_length_feature(tokenized_sentences)
    # 4. SENTENCE PARAGRAPH POSITION FEATURE
    # 5. PROPER_NOUN FEATURE
    proper_noun_scores = proper_noun_feature(tagged_sentences)
    # 6. NUMERALS FEATURE
    numerals_scores = numerals_feature(tokenized_sentences)
    # 7. NAMED ENTITIES FEATURE - very similar to the PROPER_NOUN FEATURE
    # 8. TF_ISF FEATURE - NOTE: consider TextRank instead of TF_ISF; tf_isf_orig is mediocre
    tf_isf_scores = tf_isf_orig_feature(tokenized_sentences_without_stopwords)
    # 9. CENTROID SIMILARITY FEATURE
    centroid_similarity_scores = centroid_similarity_feature(sentences, tf_isf_scores)
    # 10. UPPER-CASE FEATURE (not in the paper)
    upper_case_scores = upper_case_feature(tokenized_sentences)
    # 11. QUOTES FEATURE (not in the paper)
    quotes_scores = quotes_feature(sentences)
    # 12. REFERENCES FEATURE (not in the paper)
    references_scores = references_feature(tokenized_sentences)
    # 13. TEXTRANK FEATURE (not in the paper)
    textrank_scores = textrank.textrank(tokenized_sentences, True, '4-1-0.0001')

    feature_matrix = []
    feature_matrix.append(thematicity_feature_scores)
    feature_matrix.append(sentence_position_scores)
    feature_matrix.append(sentence_length_scores)
    feature_matrix.append(proper_noun_scores)
    feature_matrix.append(numerals_scores)
    feature_matrix.append(tf_isf_scores)
    feature_matrix.append(centroid_similarity_scores)
    feature_matrix.append(upper_case_scores)

    features = [' thema', 'sen_pos', 'sen_len', ' propn', ' num',
                ' tf_isf', 'cen_sim', ' upper']

    # transpose: rows = sentences, columns = features
    feature_matrix_2 = np.zeros((len(sentences), len(features)))
    for i in range(len(features)):
        for j in range(len(sentences)):
            feature_matrix_2[j][i] = feature_matrix[i][j]

    feature_sum = []
    for i in range(len(np.sum(feature_matrix_2, axis=1))):
        feature_sum.append(np.sum(feature_matrix_2, axis=1)[i])

    print('=====Scores=====')
    print(35 * ' ', end='|')
    for f in features:
        print(f, end='|')
    print()
    for i, s in enumerate(sentences):
        print(f'#{"{:2d}".format(i + 1)}: {s[:30]}', end='|')
        for f_s in feature_matrix:
            print('{: .4f}'.format(round(f_s[i], 4)), end='|')
        print('{: .4f}'.format(round(feature_sum[i], 4)))

    print('Training rbm...')
    rbm_trained = rbm.test_rbm(dataset=feature_matrix_2,
                               learning_rate=0.1,
                               training_epochs=14,
                               batch_size=5,
                               n_chains=5,
                               n_hidden=len(features))
    # another implementation of rbm, from sklearn:
    # rbm2 = BernoulliRBM(n_components=len(features), n_iter=14, batch_size=5, learning_rate=0.1)
    # rbm_trained = rbm2.fit_transform(feature_matrix_2)
    # print(rbm_trained)
    rbm_trained_sums = np.sum(rbm_trained, axis=1)

    print('=====RBM Enhanced Scores=====')
    print(35 * ' ', end='|')
    for f in features:
        print(f, end='|')
    print()
    for i, s in enumerate(sentences):
        print(f'#{"{:2d}".format(i + 1)}: {s[:30]}', end='|')
        for f_s in rbm_trained[i]:
            print('{: .4f}'.format(round(f_s, 4)), end='|')
        print('{: .4f}'.format(round(rbm_trained_sums[i], 4)))

    # pair every per-sentence score with its sentence index
    enhanced_feature_sum = []
    feature_sum = []
    for i in range(len(np.sum(rbm_trained, axis=1))):
        enhanced_feature_sum.append([np.sum(rbm_trained, axis=1)[i], i])
        feature_sum.append([np.sum(feature_matrix_2, axis=1)[i], i])
    print(f'enhanced_feature_sum: {enhanced_feature_sum}')
    print(f'feature_sum: {feature_sum}')

    enhanced_feature_sum.sort(key=lambda x: x[0])
    feature_sum.sort(key=lambda x: -1 * x[0])
    print('=====Sorted=====')
    print(f'enhanced_feature_sum: {enhanced_feature_sum}')
    print(f'feature_sum: {feature_sum}')

    # print('=====The text=====')
    # for x in range(len(sentences)):
    #     print(sentences[x])

    # always keep the 1st sentence, then add the top-ranked ones
    extracted_sentences_rbm = [[sentences[0], 0]]
    extracted_sentences_simple = [[sentences[0], 0]]

    summary_length = max(min(round(len(sentences) / 4), 12), 3)  # length between 3-12 sentences
    for x in range(summary_length):
        if enhanced_feature_sum[x][1] != 0:
            extracted_sentences_rbm.append(
                [sentences[enhanced_feature_sum[x][1]], enhanced_feature_sum[x][1]])
        if feature_sum[x][1] != 0:
            extracted_sentences_simple.append(
                [sentences[feature_sum[x][1]], feature_sum[x][1]])

    # restore original sentence order
    extracted_sentences_rbm.sort(key=lambda x: x[1])
    extracted_sentences_simple.sort(key=lambda x: x[1])

    final_text_rbm = ''
    for i in range(len(extracted_sentences_rbm)):
        final_text_rbm += extracted_sentences_rbm[i][0] + '\n'
    final_text_simple = ''
    for i in range(len(extracted_sentences_simple)):
        final_text_simple += extracted_sentences_simple[i][0] + '\n'

    print('=====Extracted Final Text RBM=====')
    print(final_text_rbm)
    print()
    print('=====Extracted Final Text simple=====')
    print(final_text_simple)

    return final_text_rbm
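# Design note (standalone sketch, not the project's code): the transpose loops and
# the repeated np.sum() calls in summarize() can be collapsed with numpy; the toy
# values below stand in for the real per-feature score lists.
import numpy as np

feature_matrix = [[0.2, 0.5, 0.1],          # feature 1 scores for 3 sentences (toy values)
                  [0.7, 0.3, 0.9]]          # feature 2 scores
feature_matrix_2 = np.array(feature_matrix).T               # shape: (n_sentences, n_features)
feature_sum = feature_matrix_2.sum(axis=1)                  # one combined score per sentence
ranking = sorted(enumerate(feature_sum), key=lambda p: -p[1])  # best-scoring sentence first
print(feature_matrix_2.shape, feature_sum, ranking)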
import os
import xml.etree.ElementTree as ET

# NOTE: article_files, my_dir and the separator module are assumed to be defined
# earlier in this script; they are not part of this excerpt.

article_count = 0
articles_in_sentences = {}
for filename in article_files:
    file_name, file_extension = os.path.splitext(filename)
    tree = ET.parse(f'{my_dir}/articles/{filename}')
    root = tree.getroot()
    articles = list(root)
    article_number = 0
    for article in articles:
        article_count += 1
        title = article.find('nadpis').text.strip()   # 'nadpis' = article title element (Czech)
        content = article.find('text').text.strip()

        # SPLIT TO SENTENCES
        sentences = separator.separate(content)
        for s in range(len(sentences)):
            sentences[s] = sentences[s].strip('" \n')

        articles_in_sentences[f'{filename.split(".")[0]}-{article_number}'] = sentences
        article_number += 1

print(articles_in_sentences)
print(f'Articles total: {article_count}')

dirr = os.path.dirname(os.path.realpath(__file__))
golden_filenames = os.listdir(f'{dirr}/rouge_2_0/summarizer/reference')
sentence_number_dict = {}
avg_words_list = []
avg_lines_list = []
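# Sketch of one way the average-length statistics declared above could be filled in
# from articles_in_sentences; the exact aggregation used later in this script is not
# shown in this excerpt, so treat this as an assumption.
for sents in articles_in_sentences.values():
    avg_lines_list.append(len(sents))                        # sentences per article
    avg_words_list.append(sum(len(s.split()) for s in sents) / max(len(sents), 1))  # words per sentence
if avg_lines_list:
    print(f'Avg sentences per article: {sum(avg_lines_list) / len(avg_lines_list):.2f}')
    print(f'Avg words per sentence:    {sum(avg_words_list) / len(avg_words_list):.2f}')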