def test_title_special_word(self):
    # "충격" (shock) titles: the trigger phrase must match exactly
    self.assertEqual(textcount.is_title_shock_ohmy("충격,이럴수가"), True)
    self.assertEqual(textcount.is_title_shock_ohmy("충격,이 럴수가"), False)
    self.assertEqual(textcount.is_title_shock_ohmy("충격, 이럴 수가"), True)
    self.assertEqual(textcount.is_title_shock_ohmy("충격이럴수가"), False)
    # "단독" (exclusive) counts only as a title prefix, optionally bracketed
    self.assertEqual(textcount.is_title_exclusive_news("단독"), True)
    self.assertEqual(textcount.is_title_exclusive_news("[단독"), True)
    self.assertEqual(textcount.is_title_exclusive_news("[단독보도"), True)
    self.assertEqual(textcount.is_title_exclusive_news("제목에 그냥 단독이 쓰임"), False)
    # "속보" (breaking) counts near the front of the title, not later on
    self.assertEqual(textcount.is_title_breaking_news("속보"), True)
    self.assertEqual(textcount.is_title_breaking_news("[속보]"), True)
    self.assertEqual(textcount.is_title_breaking_news("[긴급속보]"), True)
    self.assertEqual(textcount.is_title_breaking_news("뒤쪽에 속보가 오면 패스"), False)
    # "기획" (planned coverage) counts as a prefix; verb usage does not
    self.assertEqual(textcount.is_title_plan_news("기획"), True)
    self.assertEqual(textcount.is_title_plan_news("[기획"), True)
    self.assertEqual(textcount.is_title_plan_news("기획기사"), True)
    self.assertEqual(textcount.is_title_plan_news("신년기획특집"), True)
    self.assertEqual(textcount.is_title_plan_news("제목을 기획하다"), False)
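    # A minimal sketch of the anchored-prefix check these assertions imply,
    # assuming the textcount matchers look for the keyword (optionally behind
    # "[" and a short modifier such as "긴급") at the start of the title.
    # The regex is illustrative only, not the actual textcount implementation:
    #
    #   re.match(r"^\[?(긴급)?속보", title) is not None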
def _lab_process(param, **kwargs):
    coll_apilab = helper.mongo.get_collection_nt(COLLNAME_APILAB)
    obj_id = ObjectId(param["id"])
    doc = coll_apilab.find_one({"_id": obj_id})
    if doc is None:
        helper.eventlog.error("ApiLab item not found: %s" % obj_id)
        return

    title = doc['title']
    content = doc['content']
    update_set = dict()
    num_sentence = 0  # stays 0 unless the "split" or "sanitize" step runs

    if "split" in kwargs:
        lines = list(ntrust.sentences.split_sentence(content))
        num_sentence = len(lines)
        content = update_set["result"] = "\n".join(lines)

    if "sanitize" in kwargs:
        san = ntrust.content.Sanitizer("labtest")
        san.process(content)
        content = update_set["result"] = san.get_contents()
        num_sentence = len(san.get_contents_lines())
        bylines = san.get_bylines()
        if bylines:
            update_set[jnscore.DOCKEY_BYLINES] = bylines

    if "postag" in kwargs:
        lines = content.split("\n")

        # Mecab
        mecab_tags = []
        mecab_start_time = timeit.default_timer()
        for line in lines:
            tagResults = tagger.merge(line, deep=False)
            for tres in tagResults:
                # print(tres)
                if tres[0] == "":
                    continue
                if '-' not in tres[2]:
                    pos = ','.join(tres[1])
                    mecab_tags.append({"word": tres[0], "pos": pos})
        mecab_end_time = timeit.default_timer()
        update_set["mecab_time"] = mecab_end_time - mecab_start_time
        update_set["mecab_postag"] = mecab_tags
        # print("Mecab: %f seconds: %s" % (update_set["mecab_time"], update_set["mecab_postag"]))

        # Hannanum
        hannn_tags = []
        hannn_start_time = timeit.default_timer()
        for line in lines:
            hpos = hannanum.pos(line)
            for pos in hpos:
                hannn_tags.append({"word": pos[0], "pos": pos[1]})
        hannn_end_time = timeit.default_timer()
        update_set["hannanum_time"] = hannn_end_time - hannn_start_time
        update_set["hannanum_postag"] = hannn_tags
        # print("Hannanum: %f seconds: %s" % (update_set["hannanum_time"], update_set["hannanum_postag"]))

        # Kkma
        kkma_tags = []
        kkma_start_time = timeit.default_timer()
        for line in lines:
            kpos = kkma.pos(line)
            for pos in kpos:
                kkma_tags.append({"word": pos[0], "pos": pos[1]})
        kkma_end_time = timeit.default_timer()
        update_set["kkma_time"] = kkma_end_time - kkma_start_time
        update_set["kkma_postag"] = kkma_tags
        # print("Kkma: %f seconds: %s" % (update_set["kkma_time"], update_set["kkma_postag"]))

        # Twitter
        twit_tags = []
        twit_start_time = timeit.default_timer()
        for line in lines:
            tpos = twitter.pos(line)
            for pos in tpos:
                twit_tags.append({"word": pos[0], "pos": pos[1]})
        twit_end_time = timeit.default_timer()
        update_set["twitter_time"] = twit_end_time - twit_start_time
        update_set["twitter_postag"] = twit_tags
        # print("Twitter: %f seconds: %s" % (update_set["twitter_time"], update_set["twitter_postag"]))

    if "metric" in kwargs:
        content_length = textcount.length(content)
        update_set[jnscore.DOCKEY_CONTENT_LENGTH] = content_length
        update_set["title_length"] = textcount.length(title)
        update_set[jnscore.DOCKEY_TITLE_NUM_EXCLAMATION] = \
            textcount.number_of_exclamation_marks(title)
        update_set[jnscore.DOCKEY_TITLE_NUM_QUESTION] = \
            textcount.number_of_question_marks(title)
        update_set["title_numSingleQuote"] = \
            textcount.number_of_singlequote_marks(title)
        update_set["title_numDoubleQuote"] = \
            textcount.number_of_doublequote_marks(title)
        update_set["title_hasShock"] = textcount.is_title_shock_ohmy(title)
        update_set["title_hasExclusive"] = textcount.is_title_exclusive_news(title)
        update_set["title_hasBreaking"] = textcount.is_title_breaking_news(title)
        update_set["title_hasPlan"] = textcount.is_title_plan_news(title)

        quotes = sentences.extract_quoted(content)
        qarr = list()
        total_quotes_len = 0
        if quotes:
            for qs in quotes:
                if textcount.number_of_white_spaces(qs) >= 2:
                    qlen = textcount.length(qs)
                    total_quotes_len += qlen
                    qarr.append({"sentence": qs, "length": qlen})
        update_set[jnscore.DOCKEY_QUOTES] = qarr

        # Ratio of quoted text to body length
        update_set[jnscore.DOCKEY_CONTENT_QUOTE_PERCENT] = (
            total_quotes_len / content_length) if content_length > 0 else 0

        # Number of sentences
        update_set["content_numSentence"] = num_sentence

        # Average sentence length
        avgSentenceLength = 0
        if num_sentence > 0:
            sumSentenceLength = 0
            for line in content.split("\n"):
                sumSentenceLength += len(line)
            avgSentenceLength = float(sumSentenceLength) / num_sentence
        update_set[jnscore.DOCKEY_CONTENT_AVG_SENTENCE_LENGTH] = avgSentenceLength

        # Numeral mentions in the body and average adverbs per sentence,
        # counted from the Mecab tags (empty if the "postag" step was skipped)
        num_numbers = 0
        num_adverbs = 0
        num_adjectives = 0
        num_conjunctions = 0
        postags = update_set.get("mecab_postag", [])
        for item in postags:
            for pos in item["pos"].split(","):
                if pos.startswith('SN'):
                    num_numbers += 1
                if pos.startswith("MAG"):
                    num_adverbs += 1
                elif pos.startswith("MAJ"):
                    num_conjunctions += 1
                elif pos.startswith("VA"):
                    num_adjectives += 1
        update_set["content_numAdverb"] = num_adverbs
        update_set["content_numAdjective"] = num_adjectives
        update_set["content_numConjunction"] = num_conjunctions
        update_set[jnscore.DOCKEY_CONTENT_AVG_ADVERBS_PER_SENTENCE] = (
            num_adverbs / num_sentence) if num_sentence > 0 else 0
        update_set[jnscore.DOCKEY_CONTENT_NUM_NUMBER] = num_numbers

        # Adverbs in the title
        title_adverbs = []
        titlePos = kkma.pos(title)
        for pos in titlePos:
            if pos[1] == "MAG":
                title_adverbs.append(pos[0])
        update_set[jnscore.DOCKEY_TITLE_ADVERBS] = title_adverbs

    if "trust" in kwargs:
        # Score the updated document; lab items carry no images
        doc.update(update_set)
        doc[jnscore.DOCKEY_IMAGE_COUNT] = 0
        jsco = jnscore.evaluate(doc["category"], doc["mediaType"], doc)
        update_set["journal"] = jsco.journal
        update_set["journal_totalSum"] = jsco.journalSum
        update_set["vanilla"] = jsco.vanilla
        update_set["vanilla_totalSum"] = jsco.vanillaSum
        update_set["score"] = jsco.scores

    update_set["completedAt"] = datetime.datetime.utcnow()
    coll_apilab.update_one({"_id": obj_id}, {"$set": update_set}, upsert=False)
    print("\nLabItem done:", obj_id)
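# The Hannanum/Kkma/Twitter blocks above (and in get_metric_update_dicts
# below) repeat the same timed tag-collection loop. A sketch of a shared
# helper that could replace them, assuming each KoNLPy tagger exposes
# pos(text) returning (word, tag) pairs; the name _timed_pos_tags is
# hypothetical:
def _timed_pos_tags(pos_tagger, lines):
    start_time = timeit.default_timer()
    tags = []
    for line in lines:
        for word, tag in pos_tagger.pos(line):
            tags.append({"word": word, "pos": tag})
    return tags, timeit.default_timer() - start_time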
def get_metric_update_dicts(params):
    import helper.entity_coll

    title = params["title"]
    content = params["content"]
    media_id = params.get("mediaId")
    image_count = params.get("imageCount", 0)

    main_sets = dict()
    main_unsets = dict()

    # title
    #main_sets["title"] = title
    main_sets["title_length"] = textcount.length(title)
    main_sets[jnscore.DOCKEY_TITLE_NUM_EXCLAMATION] = \
        textcount.number_of_exclamation_marks(title)
    main_sets[jnscore.DOCKEY_TITLE_NUM_QUESTION] = \
        textcount.number_of_question_marks(title)
    main_sets["title_numPuncs"] = (main_sets["title_numExclamation"] +
                                   main_sets["title_numQuestion"])
    main_sets["title_numSingleQuote"] = textcount.number_of_singlequote_marks(title)
    main_sets["title_numDoubleQuote"] = textcount.number_of_doublequote_marks(title)
    main_sets["title_hasShock"] = textcount.is_title_shock_ohmy(title)
    main_sets["title_hasExclusive"] = textcount.is_title_exclusive_news(title)
    main_sets["title_hasBreaking"] = textcount.is_title_breaking_news(title)
    main_sets["title_hasPlan"] = textcount.is_title_plan_news(title)

    # image
    main_sets[jnscore.DOCKEY_IMAGE_COUNT] = image_count

    # content
    san = ntrust.content.Sanitizer(media_id)
    san.process(content)
    sancon = san.get_contents()
    main_sets["content"] = sancon
    content_length = textcount.length(sancon)
    main_sets[jnscore.DOCKEY_CONTENT_LENGTH] = content_length

    # Extract bylines from the body
    bylines = san.get_bylines()
    if bylines:
        main_sets[jnscore.DOCKEY_BYLINES] = bylines
    else:
        main_unsets[jnscore.DOCKEY_BYLINES] = 1

    # Extract quoted sentences
    quotes = sentences.extract_quoted(sancon)
    qarr = list()
    total_quotes_len = 0
    if quotes:
        for qs in quotes:
            if textcount.number_of_white_spaces(qs) >= 2:
                qlen = textcount.length(qs)
                total_quotes_len += qlen
                qarr.append({"sentence": qs, "length": qlen})
    main_sets[jnscore.DOCKEY_QUOTES] = qarr  # an empty qarr is stored on purpose
    # Ratio of quoted text to body length
    main_sets[jnscore.DOCKEY_CONTENT_QUOTE_PERCENT] = (
        total_quotes_len / content_length) if content_length > 0 else 0

    # Used for POS tagging and for extracting the text in front of quotes
    lines = sancon.split("\n")

    # Number of sentences
    num_sentence = len(lines)
    main_sets["content_numSentence"] = num_sentence

    # Average sentence length
    avgSentenceLength = 0
    if num_sentence > 0:
        sumSentenceLength = 0
        for line in lines:
            sumSentenceLength += len(line)
        avgSentenceLength = float(sumSentenceLength) / num_sentence
    main_sets[jnscore.DOCKEY_CONTENT_AVG_SENTENCE_LENGTH] = avgSentenceLength

    # Extract the text in front of each quote
    if quotes:
        for line in lines:
            for qdic in main_sets[jnscore.DOCKEY_QUOTES]:
                if "frontText" not in qdic and line.find(qdic["sentence"]) >= 0:
                    qdic["frontText"] = sentences.get_text_front_quote(
                        line, qdic["sentence"])

    # Morphological (POS) analysis
    news_entity = dict()
    num_adverbs = 0
    num_adjectives = 0
    num_conjunctions = 0

    # Mecab
    mecab_tags = []
    mecab_start_time = timeit.default_timer()
    for line in lines:
        tagResults = tagger.merge(line, deep=False)
        for tres in tagResults:
            # print(tres)
            if tres[0] == "":
                continue
            if '-' not in tres[2]:
                pos = ','.join(tres[1])
                mecab_tags.append({"word": tres[0], "pos": pos})
    mecab_end_time = timeit.default_timer()
    news_entity["mecab_time"] = mecab_end_time - mecab_start_time
    news_entity["mecab_postag"] = mecab_tags

    # Collect noun entities and count adverbs/conjunctions/adjectives
    ec_mecab = helper.entity_coll.EntityCollector()
    for item in mecab_tags:
        for pos in item["pos"].split(","):
            if pos.startswith('N'):
                ec_mecab.feed(item["word"])
            if pos.startswith("MAG"):
                num_adverbs += 1
            elif pos.startswith("MAJ"):
                num_conjunctions += 1
            elif pos.startswith("VA"):
                num_adjectives += 1
    ec_mecab.get_result("mecab_", news_entity)

    # Hannanum
    hannn_tags = []
    hannn_start_time = timeit.default_timer()
    for line in lines:
        hpos = hannanum.pos(line)
        for pos in hpos:
            hannn_tags.append({"word": pos[0], "pos": pos[1]})
    hannn_end_time = timeit.default_timer()
    news_entity["hannanum_time"] = hannn_end_time - hannn_start_time
    news_entity["hannanum_postag"] = hannn_tags

    ec_hannanum = helper.entity_coll.EntityCollector()
    for item in hannn_tags:
        if item["pos"].startswith('N'):
            ec_hannanum.feed(item["word"])
    ec_hannanum.get_result("hannanum_", news_entity)

    # Kkma
    kkma_tags = []
    kkma_start_time = timeit.default_timer()
    for line in lines:
        kpos = kkma.pos(line)
        for pos in kpos:
            kkma_tags.append({"word": pos[0], "pos": pos[1]})
    kkma_end_time = timeit.default_timer()
    news_entity["kkma_time"] = kkma_end_time - kkma_start_time
    news_entity["kkma_postag"] = kkma_tags

    ec_kkma = helper.entity_coll.EntityCollector()
    for item in kkma_tags:
        if item["pos"].startswith('N'):
            ec_kkma.feed(item["word"])
    ec_kkma.get_result("kkma_", news_entity)

    # Twitter
    twit_tags = []
    twit_start_time = timeit.default_timer()
    for line in lines:
        tpos = twitter.pos(line)
        for pos in tpos:
            twit_tags.append({"word": pos[0], "pos": pos[1]})
    twit_end_time = timeit.default_timer()
    news_entity["twitter_time"] = twit_end_time - twit_start_time
    news_entity["twitter_postag"] = twit_tags

    ec_twit = helper.entity_coll.EntityCollector()
    for item in twit_tags:
        if item["pos"].startswith('N'):
            ec_twit.feed(item["word"])
    ec_twit.get_result("twitter_", news_entity)

    # Adverb counts and average adverbs per sentence
    main_sets["content_numAdverb"] = num_adverbs
main_sets["content_numAdjective"] = num_adjectives main_sets["content_numConjunction"] = num_conjunctions main_sets["content_avgAdverbsPerSentence"] = ( num_adverbs / num_sentence) if num_sentence > 0 else 0 # 본문에 수치 인용 개수 num_numbers = 0 postags = news_entity["mecab_postag"] for item in postags: if item["pos"].startswith('SN'): num_numbers += 1 main_sets[jnscore.DOCKEY_CONTENT_NUM_NUMBER] = num_numbers main_sets["content_anonPredicates"] = anonpred.find_anonymous_predicates( sancon) # TODO #main_sets["content_numForeignWord"] = 0 main_sets["informant_real"] = [] main_sets["quotes_ratioRealAnon"] = 0 # 제목에 부사 수 title_adverbs = [] titlePos = kkma.pos(title) for pos in titlePos: if pos[1] == "MAG": title_adverbs.append(pos[0]) main_sets["title_adverbs"] = title_adverbs return ({"$set": main_sets, "$unset": main_unsets}, news_entity)