def get_sentences(corpus, save=False, save_path="sentences.txt"):
    sentences = kss.split_sentences(corpus)
    if not save:
        return sentences
    save_iter_data(save_path, sentences)
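A minimal usage sketch of get_sentences above, assuming kss is imported at module level and that save_iter_data writes an iterable to disk line by line (both names come from the snippet; the sample corpus string is made up):

corpus = "딥러닝 기반 요약 모델을 학습한다. 전처리가 성능을 좌우한다."
sents = get_sentences(corpus)                             # returns the list of split sentences
get_sentences(corpus, save=True, save_path="sents.txt")   # writes them to disk and returns None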
def preprocessing(script):
    lines = script.split('\n')
    filtered_lines = list()
    for line in lines:
        condition_1 = (line != '')
        condition_2 = not line.startswith(('✔', '▶', '📌', '#', '📡', 'http', '-', '['))
        # condition_3 = line.startswith('○브리핑 전문○')  # check actual briefing part
        # if condition_3:
        #     is_briefing = True
        if condition_1 and condition_2:
            sents = kss.split_sentences(line)
            for sent in sents:
                words = sent.split(' ')
                word_list = list()
                for word in words:
                    if word.find('○') == -1 and word.find('●') == -1:
                        word_list.append(word)
                sent = ' '.join(word_list)
                sent = sent.strip()
                if sent != '' and sent != '브리핑 전문':
                    filtered_lines.append(sent)
    return filtered_lines
def _tokenize_sentences_worker(input_file: str, output_file: str,
                               min_len: int, max_len: int,
                               split_sent: bool = True):
    with open(input_file, 'r', encoding='utf-8') as src, \
            open(output_file, 'w', encoding='utf-8') as dst:
        total_lines = ''
        for line in src:
            if not line.strip():
                # Flush the sentences accumulated so far.
                if not split_sent and len(total_lines.strip()) > min_len:
                    dst.write(total_lines.strip() + '\n')
                    total_lines = ''
                continue
            for s in kss.split_sentences(line):
                s = s.strip()
                if split_sent:
                    if min_len < len(s) < max_len:
                        dst.write(s + '\n')
                else:
                    if len(total_lines) + len(s) > max_len:
                        dst.write(total_lines.strip() + '\n')
                        total_lines = ''
                    total_lines += s + ' '
def guess_summary(self, aspect, document, rel_words):
    neighbors = [{
        'entity': aspect,
        'reasoning': f'aspect [[{aspect}]] is in the text',
        'relation_weight': float('inf')
    }]
    for rel_word in rel_words:
        neighbors.append({
            'entity': rel_word,
            'reasoning': f'Related word [[{rel_word}]] is in the text',
            'relation_weight': float('inf')
        })
    picked_sents, reasonings = [], []
    for sent in kss.split_sentences(document):
        sent_morphs = okt.morphs(sent)
        for neighbor in neighbors:
            if neighbor['entity'] in sent_morphs:
                if sent not in picked_sents:
                    picked_sents.append(sent)
                if neighbor['reasoning'] not in reasonings:
                    reasonings.append(neighbor['reasoning'])
    if picked_sents:
        return {
            'aspect_summary': ' '.join(picked_sents),
            'reasonings': reasonings
        }
    return None
def text2sentences(self, text):
    # Merge sentences of 10 characters or fewer into the preceding sentence.
    self.sentences = kss.split_sentences(text)
    for idx in range(len(self.sentences)):
        if idx > 0 and len(self.sentences[idx]) <= 10:
            self.sentences[idx - 1] += ' ' + self.sentences[idx]
            self.sentences[idx] = ''
    return self.sentences
def run_kss(in_path, out_dir):
    nlen = get_nlen(in_path)
    pbar = tqdm(total=nlen)
    out_idx = 0
    cnt_sent = 0
    with open(in_path) as f_in:
        out_path = os.path.join(out_dir, f"kss_{out_idx}.txt")
        f_out = open(out_path, 'w')
        for _ in range(nlen):
            line = f_in.readline()
            pbar.update(1)
            sentences = kss.split_sentences(line)
            if is_bad_paragraph(sentences):
                continue
            for sent in sentences:
                if is_other_sentence(sent):
                    continue
                if is_end_sentence(sent):
                    break
                sent = remove_sentence(sent)
                f_out.write(f"{sent}\n")
                cnt_sent += 1
            f_out.write("\n")
            # Roll over to a new output file after roughly one million sentences.
            if cnt_sent > 1000000:
                f_out.close()
                out_idx += 1
                out_path = os.path.join(out_dir, f"kss_{out_idx}.txt")
                f_out = open(out_path, 'w')
                cnt_sent = 0
    pbar.close()
    f_out.close()
def load_data(self):
    question_file = open(self.file_path, 'r', encoding='utf-8')
    question_lines = question_file.readlines()
    question_file.close()
    for line in question_lines:
        line = line.strip()
        context = line.split('\t')[0]
        answer = line.split('\t')[1]
        question = line.split('\t')[2]
        total_tokens = []
        qa_tokens = (['<answer>'] + tokenizer.tokenize(answer) + ['</answer>'] +
                     ['<question>'] + tokenizer.tokenize(question) + ['</question>'])
        for single_line in kss.split_sentences(context):
            tokenized_single_line = ['<s>'] + tokenizer.tokenize(single_line) + ['</s>']
            if len(total_tokens) + len(qa_tokens) + len(tokenized_single_line) < 1024:
                total_tokens += tokenized_single_line
            else:
                break
        total_tokens += qa_tokens
        padded_total_tokens = total_tokens + ['<pad>'] * (1024 - len(total_tokens))
        self.data.append(
            torch.tensor(tokenizer.convert_tokens_to_ids(padded_total_tokens)).unsqueeze(0))
def get_df(self, *colnames, by_sentence_textColname=None):
    '''
    :param colnames: column names (str)
    :param by_sentence_textColname: name of the text column to split into sentences
    :return: DataFrame
    '''
    df_documents = self.df.loc[:, list(colnames)]
    if by_sentence_textColname:
        df_sentences = pd.DataFrame()
        nrows = df_documents.shape[0]
        for i in tqdm(range(nrows), "loader : Getting Sentences "):
            row = df_documents.iloc[i]
            text = row[by_sentence_textColname]
            if len(text) > 0:
                text = cleanse_text(text)
                sentences = kss.split_sentences(text)  # texts over 300 characters are common; needs checking
                for s in sentences:
                    s = cleanse_sentence(s)
                    if len(s) > 0:
                        row_temp = row.copy()
                        row_temp[by_sentence_textColname] = s
                        df_sentences = df_sentences.append(row_temp)
            else:
                continue
        print(f"loader : Getting DataFrame Done {nrows} Documents to {df_sentences.shape[0]} Sentences")
        return df_sentences
    else:
        return df_documents
def load_data(self):
    answer_file = open(self.file_path, 'r', encoding='utf-8')
    answer_lines = answer_file.readlines()
    answer_file.close()
    for line in answer_lines:
        line = line.strip()
        context = line.split('\t')[0]
        answers = line.split('\t')[1].split('|')
        total_tokens = []
        answer_tokens = ['<answer>']
        for i in range(0, len(answers) - 2):
            answer_tokens += tokenizer.tokenize(answers[i]) + ['<sep>']
        answer_tokens += tokenizer.tokenize(answers[-2]) + ['</answer>']
        for single_line in kss.split_sentences(context):
            tokenized_single_line = ['<s>'] + tokenizer.tokenize(single_line) + ['</s>']
            if len(total_tokens) + len(answer_tokens) + len(tokenized_single_line) < 1024:
                total_tokens += tokenized_single_line
            else:
                break
        total_tokens += answer_tokens
        padded_total_tokens = total_tokens + ['<pad>'] * (1024 - len(total_tokens))
        self.data.append(
            torch.tensor(tokenizer.convert_tokens_to_ids(padded_total_tokens)).unsqueeze(0))
def write_subtitle(aligned_vtt_dict):
    for idx in aligned_vtt_dict:
        subtitle = SubtitleWrapper()
        vtts = aligned_vtt_dict[idx]
        for vtt in vtts:
            subtitle.write_caption(vtt['start'], vtt['end'],
                                   kss.split_sentences(vtt['text']))
        subtitle.save_caption(os.path.join(args.vtt_path, idx))
    print('[INFO] aligned subtitles saved.')
def sentAnalyze(user_input):
    try:
        adamsURL = "http://api.adams.ai/datamixiApi/tms"
        accessKey = "5071738647222560661"
        text = ' '.join(str(s) for s in kss.split_sentences(user_input[:800]))  # AdamsAI can only handle up to 877 characters
        analysisCode = 'om'
        language = 'kor'
        params = dict(key=accessKey, query=text, analysis=analysisCode, lang=language)
        resp = requests.get(url=adamsURL, params=params)
        data = resp.json()
        sentiword = []
        for i in range(len(data['return_object']['sentence'])):
            if 'sa' in data['return_object']['sentence'][i] and \
                    data['return_object']['sentence'][i]['sa']['sentiword'] and \
                    data['return_object']['sentence'][i]['sa']['polarity'] < 0:
                print(data['return_object']['sentence'][i]['text'], '\n', '\t',
                      data['return_object']['sentence'][i]['sa'])
                sentiword.append(
                    (data['return_object']['sentence'][i]['sa']['score'],
                     list(filter(lambda w: '/' not in w,
                                 data['return_object']['sentence'][i]['sa']['sentiword']))))
        dup = set()
        output = []
        for x, y in sorted(sentiword):
            for w in y:
                if w not in dup and 'ㄹ' not in w and '다' in w:
                    dup.add(w)
                    output.append((x, w))
        print(output)
        if not output:
            print("empty")
            return None
        else:
            sent2print = []
            for i in range(len(output)):
                if len(output) >= 5:
                    sent2print = [output[0][1], output[1][1], output[2][1], output[3][1]]
                else:
                    sent2print.append(output[i][1])
            return ', '.join(str(s) for s in sent2print)
    except KeyError:
        return None
def context_tokenizer(text, tokenizer):
    sent_list = kss.split_sentences(text)
    tokens = []
    for sent in sent_list:
        tokenized_sentence = tokenizer.tokenize(sent)
        if len(tokens) + len(tokenized_sentence) < 912:
            tokens += ['<s>'] + tokenized_sentence + ['</s>']
        else:
            break
    return tokens
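A hedged usage sketch for context_tokenizer above; the Hugging Face model name is only an illustrative assumption (any object exposing a .tokenize() method works), and the sample text is made up:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")  # illustrative choice only
tokens = context_tokenizer("첫 번째 문장입니다. 두 번째 문장입니다.", tokenizer)
print(len(tokens), tokens[:8])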
def parse_json_recursively(json_object, target_key):
    if type(json_object) is dict and json_object:
        for key in json_object:
            if key in target_key:
                for sent in kss.split_sentences(json_object[key]):
                    output_set.add(str(sent))
            parse_json_recursively(json_object[key], target_key)
    elif type(json_object) is list and json_object:
        for item in json_object:
            parse_json_recursively(item, target_key)
def mecabFreqToSentenceList(text):
    # Initialize the lists before populating them.
    sentence_token = []
    allnoun = []
    # Split the document into sentences and store them in sentence_token.
    sentence_token = kss.split_sentences(text)
    # For each sentence, extract only the nouns and store the re-joined result.
    for i in range(0, len(sentence_token)):
        # Noun extraction
        allnoun.append(" ".join(mecab.nouns(sentence_token[i])))
    return allnoun
def _tokenize_sentences_worker(input_file: str, output_file: str, min_len: int):
    with open(input_file, 'r', encoding='utf-8') as src, \
            open(output_file, 'w', encoding='utf-8') as dst:
        for line in src:
            for s in kss.split_sentences(line):
                s = s.strip()
                if len(s) < min_len:
                    continue
                dst.write(s + '\n')
def test_quote_misalignment(self):
    # testcase from https://github.com/likejazz/korean-sentence-splitter/issues/8
    text = """부부 싸움 규칙 가운데 ‘돈 히트 언더 더 벨트’(Don’t hit under the belt)가 있다. 권투할 때 벨트 아래를 치면 반칙이듯이, 상대가 너무 아파할 만한 것을 건드리면 회복하기 어렵다. 그 부분은 사람마다 다르다."""
    splitted = kss.split_sentences(text)
    self.assertEqual(len(splitted), 3)

    text = """안녕하십니까? 삼성전자가 11월 13일 삼성전자 서초사옥 다목적홀 5층에서 진행되는 2013 S'데이 멤버십 블루 강연회 "Challenge BLUE, 박찬호&이동우의 삶과 도전" 멤버십 블루 고객 480명을 초청한다.강연회는 삼성전자 멤버십 블루 회원들을 위해 마련된 고객 혜택 행사로 한국인 최초의 메이저리거 박찬호와 시각장애 개그맨 이동우를 초청, 그들의 삶 속에서 펼쳐진 다양한 도전기를 들을 수 있도록 마련했다."""
    splitted = kss.split_sentences(text)
    self.assertEqual(len(splitted), 3)

    text = """삼성전자가 11월 13일 삼성전자 서초사옥 다목적홀 5층에서 진행되는 2013 S'데이 멤버십 블루 강연회 "Challenge BLUE, 박찬호&이동우의 삶과 도전" 멤버십 블루 고객 480명을 초청한다.강연회는 삼성전자 멤버십 블루 회원들을 위해 마련된 고객 혜택 행사로 한국인 최초의 메이저리거 박찬호와 시각장애 개그맨 이동우를 초청, 그들의 삶 속에서 펼쳐진 다양한 도전기를 들을 수 있도록 마련했다."""
    splitted = kss.split_sentences(text)
    self.assertEqual(len(splitted), 2)

    text = """삼성전자가 11월 13일 삼성전자 서초사옥 다목적홀 5층에서 진행되는 2013 S"데이 멤버십 블루 강연회 "Challenge BLUE, 박찬호&이동우의 삶과 도전" 멤버십 블루 고객 480명을 초청한다.강연회는 삼성전자 멤버십 블루 회원들을 위해 마련된 고객 혜택 행사로 한국인 최초의 메이저리거 박찬호와 시각장애 개그맨 이동우를 초청, 그들의 삶 속에서 펼쳐진 다양한 도전기를 들을 수 있도록 마련했다."""
    splitted = kss.split_sentences(text)
    self.assertEqual(len(splitted), 2)

    text = """삼성전자가 11월 13일 삼성전자 서초사옥 다목적홀 5층에서 진행되는 2013 S"데'이 멤버십 블루 강연회 "Challenge BLUE, 박찬호&이동우의 삶과 도전" 멤버십 블루 고객 480명을 초청한다.강연회는 삼성전자 멤버십 블루 회원들을 위해 마련된 고객 혜택 행사로 한국인 최초의 메이저리거 박찬호와 시각장애 개그맨 이동우를 초청, 그들의 삶 속에서 펼쳐진 다양한 도전기를 들을 수 있도록 마련했'다."""
    splitted = kss.split_sentences(text)
    self.assertEqual(len(splitted), 2)
def extract_nouns_v2(news: str) -> dict:
    """Extract nouns from news.

    :param news: contents of news.
    :return: dict(). Extracted keyword and its count. {keyword: count, }
    """
    mecab = Mecab()
    news_lines = kss.split_sentences(news)
    nouns = []
    for line in news_lines:
        nn = 0
        pos = 0
        for token in mecab.pos(line):
            pos = pos + line[pos:].find(token[0])
            if token[1] == 'NNG':  # common noun
                if nn > 0:
                    if line[pos - 1] == ' ':
                        nouns.append((f'{nouns[-1][0]} {token[0]}', nouns[-1][1]))
                        nouns.append(token[0])
                    else:
                        nouns[-1] = (f'{nouns[-1][0]}{token[0]}', nouns[-1][1])
                    nn += 1
                else:
                    nn = 1
                    nouns.append(token)
            elif token[1] == 'NNP':  # proper noun
                if nn > 0:
                    if line[pos - 1] == ' ':
                        nouns.append((f'{nouns[-1][0]} {token[0]}', 'NNP'))
                        nouns.append(token[0])
                    else:
                        nouns[-1] = (f'{nouns[-1][0]}{token[0]}', 'NNP')
                    nn += 2
                else:
                    nn = 2
                    nouns.append(token)
            else:
                nn = 0
            pos += len(token[0])
    return dict(Counter(nouns))
def encoding(category, text):
    sent_list = kss.split_sentences(text)
    tokens = []
    for sent in sent_list:
        tokenized_sentence = tokenizer.tokenize(sent)
        if len(tokens) + len(tokenized_sentence) < 912:
            tokens += ['<s>'] + tokenized_sentence + ['</s>']
        else:
            break
    tokens += [category]
    tokens += ['<title>']
    return torch.tensor(tokenizer.convert_tokens_to_ids(tokens)).unsqueeze(0)
def check_duplicate(data):
    temp = list()
    for raw in data:
        r = kss.split_sentences(raw)
        temp.append(r)
    print(temp)
    print(type(temp))
    print(len(temp))
    print(type(temp[0]))
    print(len(temp[0]))
    return set(sum(temp, []))
def makeSentence(self, originaltext):
    # Split the received text into sentences.
    textList = []
    text = ''
    for sent in kss.split_sentences(originaltext):
        textList.append(sent)
    for i in textList:
        text = text + i + '. '  # append '. ' so the sentences stay distinguishable
    return text
def ensembleSummarize(text):
    sentences = kss.split_sentences(text)
    # Compute each candidate summary once, then let the three summarizers vote per sentence.
    gensim_summary = summarize(text)
    lexrank_summary = summarizeLexRank(text)
    textrank_summary = summarizeTextRank(text).split("\n")
    s = [0] * len(sentences)
    for idx in range(len(sentences)):
        if sentences[idx] in gensim_summary:
            s[idx] += 1
        if sentences[idx] in lexrank_summary:
            s[idx] += 1
        if sentences[idx] in textrank_summary:
            s[idx] += 1
    i = s.index(max(s))
    return sentences[i]
def makeLabel(question, label):
    ques, lab = [], []
    for q in question:  # type(q) -> str
        tmp = clean_text('\n'.join(kss.split_sentences(q)))
        sents = clean_punc(tmp, punct, punct_mapping)
        if sents is not None:
            ques.append(sents)
            lab.append(label)
    ques_col = pd.DataFrame(ques, columns=['sentences'])
    lab_col = pd.DataFrame(lab, columns=['label'])
    result = pd.concat([ques_col, lab_col], axis=1)
    return result
def predict(paragraph):
    with session.as_default():
        with session.graph.as_default():
            emotions = [0, 0, 0, 0, 0, 0]
            sentences = kss.split_sentences(paragraph)
            from NLP import Model
            for sentence in sentences:
                output = Model.put(token, sentence, mc)  # tokenize the sentence
                emotion = Model.out(output, model)  # map the tokenized sentence to an emotion
                emotions[emotion[0]] += 1  # increment the count of the predicted emotion
            return emotions
def replace_str(data):
    data = data.replace('\n', ' ')
    data = data.replace('//', '')
    data = data.replace('ㅠ', '')
    data = data.replace('ㅋ', '')
    data = kss.split_sentences(data)
    # print(f'!! data : {data}')
    data = ' '.join(data[:3])
    # Collapse runs of spaces left over from the replacements above.
    data = data.replace('  ', ' ')
    data = data.replace('  ', ' ')
    return data
def chat(self, sent='0'):
    tok = SentencepieceTokenizer(self.tok_path, num_best=0, alpha=0)
    sent_tokens = tok(sent)
    with torch.no_grad():
        while 1:
            q = input('user > ').strip()
            if q == 'quit':
                break
            q_tok = tok(q)
            a = ''
            a_tok = []
            timeout = time.time() + 60
            while 1:
                input_ids = torch.LongTensor(
                    [self.vocab[U_TKN]] + self.vocab[q_tok] +
                    self.vocab[EOS, SENT] + self.vocab[sent_tokens] +
                    self.vocab[EOS, S_TKN] + self.vocab[a_tok]).unsqueeze(dim=0)
                pred = self(input_ids)
                gen = self.vocab.to_tokens(
                    torch.argmax(pred, dim=-1).squeeze().numpy().tolist())[-1]
                if gen == EOS:
                    break
                a += gen.replace('▁', ' ')
                a_tok = tok(a)
                if time.time() > timeout:
                    break
            answer_list = kss.split_sentences(a)[1:-2]
            Simsimi_answer = "".join(answer_list)
            sentence_list = Simsimi_answer.split('.')
            sentences = []
            for s in sentence_list:
                word_list = s.split()
                for word in word_list:
                    # Replace usernames ending in '*님이' with '상담자님이'.
                    if word.endswith('*님이'):
                        word_list[word_list.index(word)] = "상담자님이"
                sentence = " ".join(word_list)
                sentences.append(sentence)
            print("Simsimi > ", ".".join(sentences))
def process_reviews(reviews):
    processed_reviews = []
    for review in reviews:
        review = repeat_normalize(review, num_repeats=2)  # collapse repeated characters to at most two
        review = spacing(review)  # fix word spacing
        review = '.'.join(split_sentences(review))  # split into sentences, then rejoin with '.'
        try:
            review = spell_checker.check(review).as_dict()['checked']
        except Exception:
            print('pass')
        print(review)
        processed_reviews.append(review)
        time.sleep(0.5)
    return processed_reviews
def _tokenize_for_summarize(text):
    # Split into sentences.
    splitted_array = kss.split_sentences(text)
    # Tokenize, keeping only nouns.
    tokenized_table = _tokenizer_kor(texts=splitted_array, pos_extraction=['Noun'])
    len_doc = len(tokenized_table)
    # Post-process: join the tokens of each sentence back into a string.
    tokenized_sentence = [' '.join(tokenized_table[i]) for i in range(len_doc)]
    return tokenized_sentence, splitted_array, len_doc
def summarization(txt):
    segment = []
    for sent in kss.split_sentences(txt):  # split into sentences
        if "." in sent[0:-1]:  # drop unnecessary periods inside a sentence
            sent = sent.replace(".", "")
        if sent[-1] not in [".", "?", "!"]:  # restore the period if the sentence lost its ending punctuation
            sent = sent + "."
        segment.append(sent)
    if len(segment) <= 10:
        result = " ".join(segment).replace("\n", "")  # strip newlines inside the text
    else:
        # Adjust the number of sentences in the summary.
        seg_str = " ".join(segment).replace("\n", "")
        result = summarize(seg_str, ratio=0.15, word_count=80)
    return result
def read_token(file_name):
    okt = Okt()  # POS analyzer
    result = []
    with open(file_name, encoding='UTF8') as fread:
        while True:
            line = fread.readline()  # read one line at a time
            if not line:
                break  # stop once the whole file has been read
            # line = okt.morphs(line)  # morphological analysis
            for sent in kss.split_sentences(line):  # paragraph -> sentences
                tokenlist = okt.pos(sent, stem=True, norm=True)  # morphological analysis and POS tagging
                tmp = []
                for word in tokenlist:
                    if word[1] in ["Noun", "Verb", "Adjective"]:
                        tmp.append(word[0])  # keep this word
                result.append(tmp)
    return '\n'.join([' '.join(r) for r in result])
def result():  # route handler
    # 0. Initialize
    now = str(time.time())
    dloads_src = '/static/img/after_' + now + '.jpg'
    time_info = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    sentence = request.form['sentence']
    sentence = re.sub('\n', ' ', sentence)
    sentence = re.sub('\r', ' ', sentence)

    ###############################
    # 1. Get input text from user #
    ###############################
    input_text = kss.split_sentences(sentence)
    print(input_text)

    ######################
    # 2. Extract keyword #
    ######################
    keyword = get_keyword(input_text)
    print("추출된 키워드 > ", keyword)

    ######################
    # 3. Image crawling  #
    ######################
    if len(keyword) < 3:
        pass  # exception handling
    else:
        image_link = get_crawlingImage(keyword, "KRTSOhiLDjFo8VpVkekS", "PnJAftBpaI", time_info, now)
        # image_link = get_crawlingImage(keyword, "KRTSOhiLDjFo8VpVkekS", "PnJAftBpaI", time_info)

    ##############################
    # 4. Predict sentiment label #
    ##############################
    sentiment_label = get_sentimentLabel(input_text, time_info)

    #####################
    # 5. Style transfer #
    #####################
    get_finalImage(image_link, sentiment_label, filename='after_' + now)

    return render_template('result.html', sentence=sentence,
                           after_img='img/after_' + now + '.jpg',
                           dloads_src=dloads_src)