Example #1
def test_title_special_word(self):
    self.assertEqual(textcount.is_title_shock_ohmy("충격,이럴수가"), True)
    self.assertEqual(textcount.is_title_shock_ohmy("충격,이 럴수가"), False)
    self.assertEqual(textcount.is_title_shock_ohmy("충격, 이럴 수가"), True)
    self.assertEqual(textcount.is_title_shock_ohmy("충격이럴수가"), False)
    self.assertEqual(textcount.is_title_exclusive_news("단독"), True)
    self.assertEqual(textcount.is_title_exclusive_news("[단독"), True)
    self.assertEqual(textcount.is_title_exclusive_news("[단독보도"), True)
    self.assertEqual(textcount.is_title_exclusive_news("제목에 그냥 단독이 쓰임"),
                     False)
    self.assertEqual(textcount.is_title_breaking_news("속보"), True)
    self.assertEqual(textcount.is_title_breaking_news("[속보]"), True)
    self.assertEqual(textcount.is_title_breaking_news("[긴급속보]"), True)
    self.assertEqual(textcount.is_title_breaking_news("뒤쪽에 속보가 오면 패스"),
                     False)
    self.assertEqual(textcount.is_title_plan_news("기획"), True)
    self.assertEqual(textcount.is_title_plan_news("[기획"), True)
    self.assertEqual(textcount.is_title_plan_news("기획기사"), True)
    self.assertEqual(textcount.is_title_plan_news("신년기획특집"), True)
    self.assertEqual(textcount.is_title_plan_news("제목을 기획하다"), False)
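A minimal sketch of the four title predicates these tests exercise, inferred only from the expected values above; the real textcount implementations may differ.

import re

def is_title_shock_ohmy(title):
    # "충격" + comma + "이럴수가", tolerating spaces after the comma and
    # between "이럴" and "수가" (per the test cases above).
    return re.search(r"충격\s*,\s*이럴\s*수가", title) is not None

def is_title_exclusive_news(title):
    # "단독" at the head of the title, optionally after an opening bracket.
    return re.match(r"\[?단독", title) is not None

def is_title_breaking_news(title):
    # "속보" within the leading token (e.g. "[긴급속보]"); "속보" appearing
    # later in the title does not count.
    return re.match(r"\[?\w*속보", title) is not None

def is_title_plan_news(title):
    # "기획" within the leading token (e.g. "신년기획특집").
    return re.match(r"\[?\S*기획", title) is not None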
Example #2
def _lab_process(param, **kwargs):
    """Run the analysis stages selected via kwargs ("split", "sanitize",
    "postag", "metric", "trust") on one ApiLab document and save the
    results back to the collection."""
    coll_apilab = helper.mongo.get_collection_nt(COLLNAME_APILAB)

    obj_id = ObjectId(param["id"])
    doc = coll_apilab.find_one({"_id": obj_id})
    if doc is None:
        helper.eventlog.error("ApiLab item not found: %s" % obj_id)
        return

    title = doc['title']
    content = doc['content']

    update_set = dict()
    num_sentence = 0  # refined by the "split" or "sanitize" stage below

    if "split" in kwargs:
        lines = list(ntrust.sentences.split_sentence(content))
        num_sentence = len(lines)
        content = update_set["result"] = "\n".join(lines)

    if "sanitize" in kwargs:
        san = ntrust.content.Sanitizer("labtest")
        san.process(content)
        content = update_set["result"] = san.get_contents()
        num_sentence = len(san.get_contents_lines())
        bylines = san.get_bylines()
        if bylines:
            update_set[jnscore.DOCKEY_BYLINES] = bylines

    if "postag" in kwargs:
        lines = content.split("\n")
        # Mecab
        mecab_tags = []
        mecab_start_time = timeit.default_timer()
        for line in lines:
            tagResults = tagger.merge(line, deep=False)
            for tres in tagResults:
                # print(tres)
                if tres[0] == "":
                    continue

                if '-' not in tres[2]:
                    pos = ','.join(tres[1])
                    mecab_tags.append({"word": tres[0], "pos": pos})

        mecab_end_time = timeit.default_timer()
        update_set["mecab_time"] = mecab_end_time - mecab_start_time
        update_set["mecab_postag"] = mecab_tags
        # print("Mecab: %f seconds: %s" % (update_set["mecab_time"], update_set["mecab_postag"]))

        # Hannanum
        hannn_tags = []
        hannn_start_time = timeit.default_timer()
        for line in lines:
            hpos = hannanum.pos(line)
            for pos in hpos:
                hannn_tags.append({"word": pos[0], "pos": pos[1]})

        hannn_end_time = timeit.default_timer()
        update_set["hannanum_time"] = hannn_end_time - hannn_start_time
        update_set["hannanum_postag"] = hannn_tags
        # print("Hannanum: %f seconds: %s" % (update_set["hannanum_time"], update_set["hannanum_postag"]))

        # Kkma
        kkma_tags = []
        kkma_start_time = timeit.default_timer()
        for line in lines:
            kpos = kkma.pos(line)
            for pos in kpos:
                kkma_tags.append({"word": pos[0], "pos": pos[1]})

        kkma_end_time = timeit.default_timer()
        update_set["kkma_time"] = kkma_end_time - kkma_start_time
        update_set["kkma_postag"] = kkma_tags
        # print("Kkma: %f seconds: %s" % (update_set["kkma_time"], update_set["kkma_postag"]))

        # Twitter
        twit_tags = []
        twit_start_time = timeit.default_timer()
        for line in lines:
            tpos = twitter.pos(line)
            for pos in tpos:
                twit_tags.append({"word": pos[0], "pos": pos[1]})

        twit_end_time = timeit.default_timer()
        update_set["twitter_time"] = twit_end_time - twit_start_time
        update_set["twitter_postag"] = twit_tags
        # print("Twitter: %f seconds: %s" % (update_set["twitter_time"], update_set["twitter_postag"]))

    if "metric" in kwargs:
        content_length = textcount.length(content)
        update_set[jnscore.DOCKEY_CONTENT_LENGTH] = content_length
        update_set["title_length"] = textcount.length(title)
        update_set[jnscore.DOCKEY_TITLE_NUM_EXCLAMATION] = (
            textcount.number_of_exclamation_marks(title))
        update_set[jnscore.DOCKEY_TITLE_NUM_QUESTION] = (
            textcount.number_of_question_marks(title))
        update_set["title_numSingleQuote"] = (
            textcount.number_of_singlequote_marks(title))
        update_set["title_numDoubleQuote"] = (
            textcount.number_of_doublequote_marks(title))
        update_set["title_hasShock"] = textcount.is_title_shock_ohmy(title)
        update_set["title_hasExclusive"] = textcount.is_title_exclusive_news(
            title)
        update_set["title_hasBreaking"] = textcount.is_title_breaking_news(
            title)
        update_set["title_hasPlan"] = textcount.is_title_plan_news(title)

        quotes = sentences.extract_quoted(content)
        qarr = list()
        total_quotes_len = 0
        if quotes:
            for qs in quotes:
                if textcount.number_of_white_spaces(qs) >= 2:
                    qlen = textcount.length(qs)
                    total_quotes_len += qlen
                    qarr.append({"sentence": qs, "length": qlen})
        update_set[jnscore.DOCKEY_QUOTES] = qarr

        # ratio of quoted text to body length
        update_set[jnscore.DOCKEY_CONTENT_QUOTE_PERCENT] = (
            total_quotes_len / content_length) if content_length > 0 else 0

        # number of sentences
        update_set["content_numSentence"] = num_sentence

        # average sentence length
        avgSentenceLength = 0
        if num_sentence > 0:
            sumSentenceLength = 0
            for line in content.split("\n"):
                sumSentenceLength += len(line)
            avgSentenceLength = float(sumSentenceLength) / num_sentence

        update_set[
            jnscore.DOCKEY_CONTENT_AVG_SENTENCE_LENGTH] = avgSentenceLength

        # numeric mentions in the body; average adverbs per sentence
        num_numbers = 0
        num_adverbs = 0
        num_adjectives = 0
        num_conjunctions = 0

        postags = update_set.get("mecab_postag", [])  # filled by the "postag" stage
        for item in postags:
            # count numeric tokens (SN), adverbs (MAG), conjunctions (MAJ)
            # and adjectives (VA) once per tag
            for pos in item["pos"].split(","):
                if pos.startswith("SN"):
                    num_numbers += 1
                if pos.startswith("MAG"):
                    num_adverbs += 1
                elif pos.startswith("MAJ"):
                    num_conjunctions += 1
                elif pos.startswith("VA"):
                    num_adjectives += 1

        update_set["content_numAdverb"] = num_adverbs
        update_set["content_numAdjective"] = num_adjectives
        update_set["content_numConjunction"] = num_conjunctions
        update_set[jnscore.DOCKEY_CONTENT_AVG_ADVERBS_PER_SENTENCE] = (
            num_adverbs / num_sentence) if num_sentence > 0 else 0
        update_set[jnscore.DOCKEY_CONTENT_NUM_NUMBER] = num_numbers

        # adverbs in the title
        title_adverbs = []
        titlePos = kkma.pos(title)
        for pos in titlePos:
            if pos[1] == "MAG":
                title_adverbs.append(pos[0])

        update_set[jnscore.DOCKEY_TITLE_ADVERBS] = title_adverbs

    if "trust" in kwargs:
        doc.update(update_set)
        doc[jnscore.DOCKEY_IMAGE_COUNT] = 0
        jsco = jnscore.evaluate(doc["category"], doc["mediaType"], doc)
        update_set["journal"] = jsco.journal
        update_set["journal_totalSum"] = jsco.journalSum
        update_set["vanilla"] = jsco.vanilla
        update_set["vanilla_totalSum"] = jsco.vanillaSum
        update_set["score"] = jsco.scores

    update_set["completedAt"] = datetime.datetime.utcnow()
    coll_apilab.update_one({"_id": obj_id}, {"$set": update_set}, upsert=False)
    print("\nLabItem done:", obj_id)
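A hypothetical invocation of the worker above (the ObjectId string is made up). Only the presence of a keyword argument matters, not its value, and the "metric" stage relies on the sentence count and mecab postags produced by the earlier stages.

_lab_process({"id": "5a0b1c2d3e4f5a6b7c8d9e0f"},
             sanitize=True, postag=True, metric=True, trust=True)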
Example #3
def get_metric_update_dicts(params):
    """Build MongoDB $set/$unset dicts of title/content metrics for a news
    document, plus a separate dict of tagger timings, postags and entities."""
    import helper.entity_coll

    title = params["title"]
    content = params["content"]

    media_id = params.get("mediaId")
    image_count = params.get("imageCount", 0)

    main_sets = dict()
    main_unsets = dict()

    # title
    #main_sets["title"] = title
    main_sets["title_length"] = textcount.length(title)
    main_sets[jnscore.DOCKEY_TITLE_NUM_EXCLAMATION] = (
        textcount.number_of_exclamation_marks(title))
    main_sets[jnscore.DOCKEY_TITLE_NUM_QUESTION] = (
        textcount.number_of_question_marks(title))
    main_sets["title_numPuncs"] = (main_sets["title_numExclamation"] +
                                   main_sets["title_numQuestion"])
    main_sets["title_numSingleQuote"] = (
        textcount.number_of_singlequote_marks(title))
    main_sets["title_numDoubleQuote"] = (
        textcount.number_of_doublequote_marks(title))
    main_sets["title_hasShock"] = textcount.is_title_shock_ohmy(title)
    main_sets["title_hasExclusive"] = textcount.is_title_exclusive_news(title)
    main_sets["title_hasBreaking"] = textcount.is_title_breaking_news(title)
    main_sets["title_hasPlan"] = textcount.is_title_plan_news(title)

    # image
    main_sets[jnscore.DOCKEY_IMAGE_COUNT] = image_count

    # content
    san = ntrust.content.Sanitizer(media_id)
    san.process(content)
    sancon = san.get_contents()
    main_sets["content"] = sancon
    content_length = textcount.length(sancon)
    main_sets[jnscore.DOCKEY_CONTENT_LENGTH] = content_length

    # extract bylines from the body
    bylines = san.get_bylines()
    if bylines:
        main_sets[jnscore.DOCKEY_BYLINES] = bylines
    else:
        main_unsets[jnscore.DOCKEY_BYLINES] = 1

    # extract quoted sentences
    quotes = sentences.extract_quoted(sancon)
    qarr = list()
    total_quotes_len = 0
    if quotes:
        for qs in quotes:
            if textcount.number_of_white_spaces(qs) >= 2:
                qlen = textcount.length(qs)
                total_quotes_len += qlen
                qarr.append({"sentence": qs, "length": qlen})
    main_sets[jnscore.DOCKEY_QUOTES] = qarr  # store qarr even when empty, as requested
    main_sets[jnscore.DOCKEY_CONTENT_QUOTE_PERCENT] = (
        total_quotes_len / content_length) if content_length > 0 else 0

    # used for morphological analysis and for extracting the text before quotes
    lines = sancon.split("\n")

    # number of sentences
    num_sentence = len(lines)
    main_sets["content_numSentence"] = num_sentence

    # average sentence length
    avgSentenceLength = 0
    if num_sentence > 0:
        sumSentenceLength = 0
        for line in lines:
            sumSentenceLength += len(line)
        avgSentenceLength = float(sumSentenceLength) / num_sentence

    main_sets[jnscore.DOCKEY_CONTENT_AVG_SENTENCE_LENGTH] = avgSentenceLength

    # extract the text preceding each quote
    if quotes:
        for line in lines:
            for qdic in main_sets[jnscore.DOCKEY_QUOTES]:
                if "frontText" not in qdic and qdic["sentence"] in line:
                    qdic["frontText"] = sentences.get_text_front_quote(
                        line, qdic["sentence"])

    # morphological analysis
    news_entity = dict()
    num_adverbs = 0
    num_adjectives = 0
    num_conjunctions = 0

    # Mecab
    mecab_tags = []
    mecab_start_time = timeit.default_timer()
    for line in lines:
        tagResults = tagger.merge(line, deep=False)
        for tres in tagResults:
            # print(tres)
            if tres[0] == "":
                continue

            if '-' not in tres[2]:
                pos = ','.join(tres[1])
                mecab_tags.append({"word": tres[0], "pos": pos})

    mecab_end_time = timeit.default_timer()
    news_entity["mecab_time"] = mecab_end_time - mecab_start_time
    news_entity["mecab_postag"] = mecab_tags
    # print("Mecab: %f seconds: %s" % (news_entity["mecab_time"], news_entity["mecab_postag"]))

    ec_mecab = helper.entity_coll.EntityCollector()
    for item in mecab_tags:
        for pos in item["pos"].split(","):
            if pos.startswith('N'):
                ec_mecab.feed(item["word"])

            if pos.startswith("MAG"):
                num_adverbs += 1
            elif pos.startswith("MAJ"):
                num_conjunctions += 1
            elif pos.startswith("VA"):
                num_adjectives += 1

    ec_mecab.get_result("mecab_", news_entity)

    # Hannanum
    hannn_tags = []
    hannn_start_time = timeit.default_timer()
    for line in lines:
        hpos = hannanum.pos(line)
        for pos in hpos:
            hannn_tags.append({"word": pos[0], "pos": pos[1]})

    hannn_end_time = timeit.default_timer()
    news_entity["hannanum_time"] = hannn_end_time - hannn_start_time
    news_entity["hannanum_postag"] = hannn_tags
    # print("Hannanum: %f seconds: %s" % (news_entity["hannanum_time"], news_entity["hannanum_postag"]))

    ec_hannanum = helper.entity_coll.EntityCollector()
    for item in hannn_tags:
        if item["pos"].startswith('N'):
            ec_hannanum.feed(item["word"])
    ec_hannanum.get_result("hannanum_", news_entity)

    # Kkma
    kkma_tags = []
    kkma_start_time = timeit.default_timer()
    for line in lines:
        kpos = kkma.pos(line)
        for pos in kpos:
            kkma_tags.append({"word": pos[0], "pos": pos[1]})

    kkma_end_time = timeit.default_timer()
    news_entity["kkma_time"] = kkma_end_time - kkma_start_time
    news_entity["kkma_postag"] = kkma_tags
    # print("Kkma: %f seconds: %s" % (news_entity["kkma_time"], news_entity["kkma_postag"]))

    ec_kkma = helper.entity_coll.EntityCollector()
    for item in kkma_tags:
        if item["pos"].startswith('N'):
            ec_kkma.feed(item["word"])
    ec_kkma.get_result("kkma_", news_entity)

    # Twitter
    twit_tags = []
    twit_start_time = timeit.default_timer()
    for line in lines:
        tpos = twitter.pos(line)
        for pos in tpos:
            twit_tags.append({"word": pos[0], "pos": pos[1]})

    twit_end_time = timeit.default_timer()
    news_entity["twitter_time"] = twit_end_time - twit_start_time
    news_entity["twitter_postag"] = twit_tags
    # print("Twitter: %f seconds: %s" % (news_entity["twitter_time"], news_entity["twitter_postag"]))

    ec_twit = helper.entity_coll.EntityCollector()
    for item in twit_tags:
        if item["pos"].startswith('N'):
            ec_twit.feed(item["word"])
    ec_twit.get_result("twitter_", news_entity)

    # average adverbs per sentence
    main_sets["content_numAdverb"] = num_adverbs
    main_sets["content_numAdjective"] = num_adjectives
    main_sets["content_numConjunction"] = num_conjunctions
    main_sets["content_avgAdverbsPerSentence"] = (
        num_adverbs / num_sentence) if num_sentence > 0 else 0

    # count of numeric mentions in the body
    num_numbers = 0
    postags = news_entity["mecab_postag"]
    for item in postags:
        if item["pos"].startswith('SN'):
            num_numbers += 1

    main_sets[jnscore.DOCKEY_CONTENT_NUM_NUMBER] = num_numbers

    main_sets["content_anonPredicates"] = anonpred.find_anonymous_predicates(
        sancon)

    # TODO
    #main_sets["content_numForeignWord"] = 0
    main_sets["informant_real"] = []
    main_sets["quotes_ratioRealAnon"] = 0

    # adverbs in the title
    title_adverbs = []
    titlePos = kkma.pos(title)
    for pos in titlePos:
        if pos[1] == "MAG":
            title_adverbs.append(pos[0])

    main_sets["title_adverbs"] = title_adverbs

    return ({"$set": main_sets, "$unset": main_unsets}, news_entity)
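A hypothetical caller, showing how the returned pair is intended to be used: the first element is a ready-made MongoDB update document, the second a payload of tagger timings, postags and collected entities to be stored separately. The collection and variable names here are assumptions.

update_ops, news_entity = get_metric_update_dicts({
    "title": title, "content": content,
    "mediaId": media_id, "imageCount": 2,
})
coll_news.update_one({"_id": doc_id}, update_ops, upsert=False)  # assumed collection
news_entity["newsId"] = doc_id
coll_entity.insert_one(news_entity)  # assumed collection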