def make_tag(data):
    stopwords = {'기자'}  # '기자' ("reporter") should not become a tag
    # Preprocess the raw text
    text = preprocessing(data)
    # Split into sentences
    texts = text.split('.')
    tag = ''
    # Run KR-WordRank and turn the top keywords into hashtags
    try:
        keywords, sents = summarize_with_sentences(texts, stopwords=stopwords,
                                                   num_keywords=5, num_keysents=3)
        for word, r in sorted(keywords.items(), key=lambda x: x[1], reverse=True)[:5]:
            tag += '#' + word + ' '
    except ValueError:
        # Raised when no keyword could be extracted; fall back to an empty tag
        tag = '# '
    return tag
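# --- Usage sketch (assumptions labeled): `preprocessing` is this project's own
# cleaner and is stubbed below as plain whitespace normalization; very short
# inputs may trigger the ValueError fallback and return '# '.
from krwordrank.sentence import summarize_with_sentences

def preprocessing(data):
    return ' '.join(data.split())  # hypothetical stand-in for the real cleaner

with open('article.txt', encoding='utf-8') as f:  # hypothetical input file
    print(make_tag(f.read()))  # e.g. '#키워드 #키워드 ...' or the fallback '# '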
def summary(texts):
    try:
        # Zero penalty for sentences between 5 and 80 characters, 1 otherwise
        penalty = lambda x: 0 if (5 <= len(x) <= 80) else 1
        # stopwords = {'이러한', '일단', '제가', '이거', '아니라', '때문에',
        #              '동영상', '도움말', '미지원으로', '드래그', '지원되지않습니다.도움말',
        #              'ㅎㅎ', '중입니다.5분', '퍼가기', 'Object', '마우스를', '인코딩',
        #              '음소', '음소거', 'Flash', '영상의', '소요'}
        keywords, sents = summarize_with_sentences(
            texts,
            penalty=penalty,
            # stopwords=stopwords,
            diversity=0.7,
            num_keywords=100,
            num_keysents=10,
            scaling=lambda x: 1,
            beta=0.85,     # PageRank decaying factor
            max_iter=10,
            verbose=True,
        )
        print(keywords)
        return sents
    except ValueError:
        # Raised when no keyword could be extracted
        print('No keywords found.')
def summarizer(self, text, option='krwordrank'):  # options: 'gensim', 'textrank', 'krwordrank'
    sent_lst = text.split(". ")
    if option == 'krwordrank':
        # summarize_with_sentences returns (keywords, sentences); keep the sentences
        result = summarize_with_sentences(sent_lst, num_keysents=3)[1]
    elif option == 'gensim':
        # Choose a ratio that yields roughly three sentences
        result = summarize(text, ratio=3 / len(sent_lst)).split("\n")
    # else:
    #     ks_summarizer = KeysentenceSummarizer(tokenize=self.okt.morphs)
    #     result = list(zip(*ks_summarizer.summarize(sents=sent_lst, topk=3)))[2]
    return result
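# --- Quick check outside a class: `self` is only referenced by the commented-out
# textrank branch, so passing None is safe for the 'krwordrank' and 'gensim'
# options. Note that gensim.summarization exists only in gensim < 4.0.
from krwordrank.sentence import summarize_with_sentences

doc = open('doc.txt', encoding='utf-8').read()  # hypothetical input file
for sent in summarizer(None, doc, option='krwordrank'):
    print(sent)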
def test_keysentence(test_config):
    data_path = test_config['data_path']
    with open(data_path, encoding='utf-8') as f:
        texts = [line.rsplit('\t')[0].strip() for line in f]
    keywords, sents = summarize_with_sentences(texts, num_keywords=100, num_keysents=10)
    for word in ['영화', '너무', '정말', '음악', '마지막']:
        assert word in keywords
    assert len(sents) == 10
    print('\nKR-WordRank key-sentence extraction: 10 key sentences from the La La Land movie reviews')
    for sent in sents:
        print(' - {}'.format(sent))
def summary(texts):
    # Zero penalty for sentences between 25 and 80 characters
    penalty = lambda x: 0 if (25 <= len(x) <= 80) else 1
    # Common Korean fillers that should not rank as keywords
    stopwords = {'이러한', '일단', '제가', '이거', '아니라', '때문에'}
    keywords, sents = summarize_with_sentences(
        texts,
        penalty=penalty,
        stopwords=stopwords,
        diversity=0.7,
        num_keywords=100,
        num_keysents=10,
        scaling=lambda x: 1,
        verbose=False,
    )
    return sents
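# --- Usage sketch: summarize_with_sentences expects a list of sentence strings
# and raises ValueError on corpora that are too small, so 'reviews.txt'
# (hypothetical, one sentence per line) should hold a reasonable number of lines.
with open('reviews.txt', encoding='utf-8') as f:
    sentences = [line.strip() for line in f if line.strip()]
for sent in summary(sentences):
    print(sent)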
def summary_text(texts):
    # Zero penalty for sentences between 25 and 50 characters
    penalty = lambda x: 0 if (25 <= len(x) <= 50) else 1
    # Korean particles and fillers to exclude from keywords
    stopwords = {'은', '는', '이', '가', '오늘'}
    keywords, sents = summarize_with_sentences(
        texts,
        penalty=penalty,
        stopwords=stopwords,
        diversity=0.7,
        num_keywords=10,
        num_keysents=1,
        scaling=lambda x: 1,
        verbose=False,
    )
    for sent in sents:
        print(sent)
    keyword = list(keywords)
    print(keyword)
    return keyword
# Based on this comparison, Okt (formerly Twitter) seems closest to what we want
from collections import Counter

kkma_candidates = kkma.sentences(content)
nouns = okt.nouns(content)

from krwordrank.sentence import summarize_with_sentences

keywords, sents = summarize_with_sentences(
    nouns,
    num_keywords=100,
    num_keysents=1
)

import re

# None of the widely used Korean morphological analyzers offer sentence
# splitting that is fast, accurate, and preserves the original text, so we
# split on explicit punctuation delimiters ourselves.
def xplit(*delimiters):
    return lambda value: re.split(
        '|'.join(re.escape(delimiter) for delimiter in delimiters), value)

xplit('. ', '? ', '! ', '\n', '.\n')("This is a sentence. Here is another sentence.\nHello, world!")

class Sentence:
    stopwords = custom_stopwords + default_stopwords
    return set(stopwords)

with open('../data/test4_punct.txt', 'r') as f:
    text = f.read().split('\n')
text = ' '.join(text)
text = text.split('. ')

stopwords = get_stopwords()
print(stopwords)
print('====================')

keywords, sents = summarize_with_sentences(
    text,
    stopwords=stopwords,
    num_keywords=100,
    diversity=0.7,
    num_keysents=5,
    scaling=lambda x: 1,
    verbose=True)
print(list(keywords.items())[:10])
print('====================')
for i, s in enumerate(sents):
    print(i, s)
print('====================')

wordrank_extractor = KRWordRank(
    min_count=3,    # minimum word frequency when building the graph
    max_length=20,  # maximum word length
    verbose=True)
beta = 0.85    # PageRank decaying factor
max_iter = 10
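# --- Continuation sketch: the extractor above is configured but never run. In
# the KR-WordRank API the run itself is a call to extract(), which returns the
# keyword scores, the full rank vector, and the subword graph.
keywords, rank, graph = wordrank_extractor.extract(text, beta, max_iter)
for word, score in sorted(keywords.items(), key=lambda x: x[1], reverse=True)[:10]:
    print('%8s:\t%.4f' % (word, score))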
from krwordrank.sentence import summarize_with_sentences

stop = 0
test = "국어 질문있습니다."
st = ''
for r in range(len(test)):
    texts = preprocessing(test[r], b_idx)
    st += texts
    texts = st.split('. ')
    try:
        # Use the first two tokens of b_idx as stopwords
        stopwords = {b_idx.split(' ')[0], b_idx.split(' ')[1]}
        keywords, sents = summarize_with_sentences(texts, stopwords=stopwords,
                                                   num_keywords=100, num_keysents=10)
    except ValueError:
        # Raised when no keyword could be extracted
        print('No keywords found.')
        print()
        continue
    # Print the top 7 keywords as hashtags
    for word, score in sorted(keywords.items(), key=lambda x: x[1], reverse=True)[:7]:
        print('#%s' % word)
    print()
    if stop == 50:
        break
    diversity=0.3,
    topk=10)
for sent in sents:
    print(sent)

from krwordrank.sentence import summarize_with_sentences
import json

# Zero penalty for sentences between 10 and 50 characters
penalty = lambda x: 0 if (10 <= len(x) <= 50) else 1
stopwords = {'잘 부탁드립니다', '부탁드립니다', '잘', '정말', '진짜'}
keywords, sents = summarize_with_sentences(
    texts,
    penalty=penalty,
    stopwords=stopwords,
    diversity=0.7,
    num_keywords=50,
    num_keysents=10,
    scaling=lambda x: 1,
    verbose=False,
)
for sent in sents:
    print(sent)

# Convert to JSON
# (decoding did not work in Jupyter, but decoding should probably fix it)
print(json.dumps(sents))
print('')
print(json.dumps(sents, indent=4))
print(type(json.dumps(sents)))

with open('words.json', 'w', encoding="utf-8") as make_file:
    json.dump(sents, make_file, ensure_ascii=False, indent="\t")
                # Guard against quoted endings such as '.”' (the sentence must be
                # copied verbatim from the body of the court decision)
                if content[i][j + 1] == '”':
                    continue
                final_content.append(content[i][sentence_start:j + 1])
                sentence_start = j + 1

print('Article', k)
# print(final_content)

# from krwordrank.sentence import summarize_with_sentences
# Zero penalty for sentences between 15 and 90 characters
penalty = lambda x: 0 if (15 <= len(x) <= 90) else 1
keywords, sents = summarize_with_sentences(final_content, penalty=penalty,
                                           diversity=0.5, num_keywords=100,
                                           num_keysents=2, verbose=False)
print(keywords)
print(sents)

driver.close()
driver.switch_to.window(old_tab)
print('----------------------------------')
driver.quit()
'''
from krwordrank.word import KRWordRank
text = text.split('.')
# Drop whitespace-only and empty entries
if ' ' in text:
    text = list(filter(lambda a: a != ' ', text))
if '' in text:
    text = list(filter(lambda a: a != '', text))
print(len(text), text)

if len(text) <= num_keysents:
    # Too few sentences to summarize; fall back to the first sentence
    keysents = text[0]
else:
    keywords, keysents = summarize_with_sentences(
        text,
        diversity=0.7,
        num_keysents=1,
        scaling=lambda x: 1,
        verbose=False,
    )
    keysents = '. '.join(keysents)
basic_loader._save_text(keysents, idx)
# score = rouge1(keywords, keysents, mecab_tokenizer)
# print(idx, score)
idx += 1
# lexrank.summarize(text)
def summary():
    cur = mysql.connection.cursor()
    user_email = get_jwt_identity()['user_email']
    pre_data = request.get_json()['paragraph']
    emotion = request.get_json()['strength_of_feeling']
    created_data_time = datetime.datetime.utcnow()

    # Split the paragraph into sentences on newlines and ./?/! boundaries
    data_list = []
    for chunk in pre_data.split('\n'):
        pieces = chunk.replace('. ', '. ...').replace(
            '? ', '? ...').replace('! ', '! ...').split(' ...')
        for lines in pieces:
            data_list.append(lines.strip())
    # Drop empty entries
    texts = [s for s in data_list if s != '']

    # Zero penalty for sentences between 10 and 120 characters
    penalty = lambda x: 0 if (10 <= len(x) <= 120) else 1
    stopwords = {'오늘', '오늘은'}
    keywords, sents = summarize_with_sentences(texts,
                                               penalty=penalty,
                                               stopwords=stopwords,
                                               diversity=0.5,
                                               num_keywords=7,
                                               num_keysents=3,
                                               scaling=lambda x: 1,
                                               verbose=False,
                                               min_count=1)
    before_sentiment = list(sents)
    print(before_sentiment)

    def text_input(a):
        # Predict an emotion label (0-4) for a single sentence
        global graph
        with graph.as_default():
            temp_X = okt.morphs(a, stem=True)               # tokenize
            temp_X = [word for word in temp_X
                      if word not in stopwords]             # remove stopwords
            seq = tokenizer.texts_to_sequences([temp_X])
            padded = pad_sequences(seq, maxlen=max_len)
            pred = model.predict(padded)
            labels = [0, 1, 2, 3, 4]
            return labels[np.argmax(pred)]

    sentiment = [text_input(a=before_sentiment[i]) for i in range(3)]
    print(sentiment)

    def find_nearest(array, value):
        # Index of the element closest to value
        n = [abs(i - value) for i in array]
        return n.index(min(n))

    # Pick the key sentence whose predicted emotion is closest to the requested one
    a = find_nearest(sentiment, emotion)
    summary_text = before_sentiment[a]

    # Parameterized INSERT instead of string concatenation (avoids SQL injection)
    cur.execute(
        "INSERT INTO user_summary (user_email, summary_text, created_data_time) "
        "VALUES (%s, %s, %s)",
        (str(user_email), str(summary_text), str(created_data_time)))
    mysql.connection.commit()

    result = {
        'user_email': user_email,
        'summary_text': summary_text,
        'created_data_time': created_data_time
    }
    return jsonify({'result': result})
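# --- Client-side sketch for this endpoint. The '/summary' path and the Bearer
# token are assumptions: the Flask route and JWT decorators are not shown above.
import requests

resp = requests.post(
    'http://localhost:5000/summary',
    headers={'Authorization': 'Bearer <JWT>'},
    json={'paragraph': '오늘은 날씨가 좋았다. 산책을 했다. 기분이 좋아졌다.',
          'strength_of_feeling': 3})
print(resp.json()['result']['summary_text'])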