Example #1
import os  # os.linesep is used below; nltkSentTokenizer, expect_nng_text and util come from the surrounding module


def extract_file_noun(input, output):
    input_file = open(input, mode='r', encoding='utf-8')
    # Truncate any previous output, then reopen the file in append mode.
    open(output, mode='w', encoding='utf-8').close()
    output_file = open(output, mode='a', encoding='utf-8')
    line_number = 1
    while True:
        line = input_file.readline()
        if not line:
            break

        line = line.strip()

        for line_array in line.split("\n"):
            sentences = nltkSentTokenizer(line_array)

            sentence_words = []
            for sent in sentences:
                word_list = expect_nng_text(sent)

                for word in word_list:
                    # Skip e-mail addresses, plain integers and pure-alphabetic tokens.
                    if util.check_email(word) or util.is_int(word) or util.is_alpha(word):
                        continue
                    output_file.write(word + os.linesep)
                    sentence_words.append(word)
        print(line_number, sentence_words)
        line_number += 1

    output_file.close()
    input_file.close()
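A minimal usage sketch (the file names are hypothetical): the function streams the input line by line and appends one noun per output line, so it is safe on large corpora.

extract_file_noun('corpus.txt', 'nouns.txt')  # hypothetical paths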
Example #2
import os
from datetime import date


def extract_file_noun(input, output):
    input_file = open(input, mode='r', encoding='utf-8')
    # Truncate any previous output, then reopen the file in append mode.
    open(output, mode='w', encoding='utf-8').close()
    output_file = open(output, mode='a', encoding='utf-8')
    # Characters that disqualify a word: punctuation plus bare Korean jamo.
    banned_chars = ("‘`,'\"|!@#$%^&*()-_=+<>.;:"
                    "ㄱㄴㄲㅂㅃㅈㅉㄷㄸㅁㅇㄹㅎㅅㅆㅍㅊㅌㅋㅛㅕㅑㅐㅔㅗㅓㅏㅣㅠㅜㅡ")
    line_number = 1
    while True:
        line = input_file.readline()
        if not line:
            break

        line = line.strip()

        for line_array in line.split("\n"):
            sentences = nltkSentTokenizer(line_array)

            sentence_words = []
            for sent in sentences:
                word_list = expect_noun_text(sent)

                for word in word_list:
                    if util.check_email(word):
                        continue
                    # Drop words containing any banned character, the literal
                    # word '기자' ("reporter"), or today's day-of-month + '일'.
                    if any(char in banned_chars for char in word):
                        continue
                    if word == '기자' or word == str(date.today().day) + '일':
                        continue
                    output_file.write(word + os.linesep)
                    sentence_words.append(word)
        print(line_number, sentence_words)
        line_number += 1

    output_file.close()
    input_file.close()
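The character filter above reduces to a single any() test; a self-contained check of that condition, with made-up sample words and a truncated copy of the blacklist:

banned_chars = "‘`,'\"|!@#$%^&*()-_=+<>.;:ㄱㄴㄲ"  # truncated sample of the real list
for word in ('블록체인', 'ㄱ나다', '가격(원)'):
    print(word, any(char in banned_chars for char in word))
# only 블록체인 prints False, so only it would be kept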
Example #3
import itertools  # nltkSentTokenizer and the noun helpers come from the surrounding module


def extract_mecab_multi_noun(text, item_counter=0):
    text = text.strip()

    multi_noun = []
    multi_noun_score = {}
    krword_rank_noun = []
    krword_rank_noun_score = {}
    krword_rank_once_noun = []
    krword_rank_once_noun_score = {}

    if text:
        sentence_list = nltkSentTokenizer(text)

        for sentence in sentence_list:
            sentence = sentence.strip()
            if sentence:
                # Merge multi-word and single-word noun candidates, then clean
                # the combined list.
                first_multi_noun_list, _ = expect_multi_noun_text_ko(sentence)
                first_single_noun_list, _ = expect_single_noun_text_ko(
                    sentence)
                first_multi_noun_list.extend(first_single_noun_list)

                second_multi_noun_list, second_multi_noun_list_score = cleaning_multi_noun(
                    first_multi_noun_list, cleaning_count=2)

                multi_noun.extend(second_multi_noun_list)
                multi_noun_score.update(second_multi_noun_list_score)

        # KR-WordRank candidates at two frequency thresholds.
        krword_rank_noun, krword_rank_noun_score = krwordrank_noun(
            sentence_list=sentence_list, min_count=5)
        krword_rank_once_noun, krword_rank_once_noun_score = krwordrank_noun(
            sentence_list=sentence_list, min_count=2)

    multi_noun.extend(krword_rank_noun)
    multi_noun_score.update(krword_rank_noun_score)

    # Filter stopwords, strip single trailing characters, then filter again.
    multi_noun, multi_noun_score = check_stopword(multi_noun, multi_noun_score)
    krword_rank_once_noun, krword_rank_once_noun_score = check_stopword(
        krword_rank_once_noun, krword_rank_once_noun_score)

    multi_noun, multi_noun_score = remove_last_one_char(
        multi_noun, multi_noun_score)

    multi_noun, multi_noun_score = check_stopword(multi_noun, multi_noun_score)

    # Merge the low-threshold KR-WordRank scores into the main score table.
    multi_noun, multi_noun_score = multi_noun_score_add(
        multi_noun_score, krword_rank_once_noun_score)

    multi_noun, multi_noun_score = remove_stopword(multi_noun,
                                                   multi_noun_score)

    return_multi_noun, return_multi_noun_score = text_in_mult_noun_finder(
        multi_noun, multi_noun_score, text)

    # item_counter == 0 means "return everything"; otherwise truncate both the
    # noun list and the score dict to the first item_counter entries.
    if item_counter == 0:
        return return_multi_noun, return_multi_noun_score
    return return_multi_noun[:item_counter], dict(
        itertools.islice(return_multi_noun_score.items(), item_counter))
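The final truncation relies on dicts preserving insertion order (guaranteed since Python 3.7); the itertools.islice pattern in isolation, with made-up scores:

import itertools

scores = {'비트코인': 0.91, '거래소': 0.74, '시세': 0.52, '전망': 0.31}
print(dict(itertools.islice(scores.items(), 2)))  # {'비트코인': 0.91, '거래소': 0.74}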
Example #4
import gc
import os
import time

# util and kakao_postagger_nn_finder come from the surrounding module.


def extract_file_noun(input, output, time_interval=0):
    # Truncate any previous output; the file is reopened per word below.
    output_file = open(output, mode='w', encoding='utf-8')
    output_file.close()
    line_number = 1
    input_file = open(input, mode='r', encoding='utf-8')
    while True:
        line = input_file.readline()
        if len(line) < 2:
            break
        line = line.strip()
        line = util.remove_naver_news(line)
        line = util.remove_http_tag(line)
        line = util.normalize(line)

        for line_array in line.split("\n"):
            sentence_words = []
            # Collapse double spaces and skip effectively empty lines.
            sent = line_array.replace('  ', ' ')
            if len(sent.strip()) == 0:
                continue
            word_list = kakao_postagger_nn_finder(sent)

            for word in word_list:
                if util.check_email(word):
                    continue
                # Trim one leading and one trailing punctuation mark.
                if word.startswith((".", ",", "!", "?")):
                    word = word[1:]
                if word.endswith((".", ",", "!", "?")):
                    word = word[:-1]

                # Skip words containing a bare Korean jamo.
                one_korea_char = "ㅂㅈㄷㄱㅅㅛㅕㅑㅐㅔㅃㅉㄸㄲㅆㅒㅖㅁㄴㅇㄹㅎㅗㅓㅏㅣㅋㅌㅊㅍㅠㅜㅡ"
                if any(char in word for char in one_korea_char):
                    continue

                # Keep only words that still occur verbatim in the sentence.
                if sent.find(word) < 0:
                    continue
                # Reopen per word so partial results survive an interruption.
                output_file = open(output, mode='a', encoding='utf-8')
                output_file.write(word + os.linesep)
                output_file.close()

                sentence_words.append(word)
            del word_list

        time.sleep(time_interval)
        print(line_number, sentence_words)

        gc.enable()
        gc.collect()
        line_number += 1

    input_file.close()
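Reopening the output file for every word keeps partial results on disk at the cost of an open/close pair per noun. A sketch of an alternative (not the author's code; the path and word list are stand-ins) that holds one handle and flushes instead:

import os

with open('nouns.txt', mode='a', encoding='utf-8') as output_file:
    for word in ['노드', '블록체인', '채굴']:  # stands in for the real word stream
        output_file.write(word + os.linesep)
        output_file.flush()  # comparable crash-safety, far fewer opens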
Example #5
import os
from datetime import datetime, timedelta

# nltkSentTokenizer and extractnoun come from the surrounding module.


def extract_file_noun(input, output):
    input_file = open(input, mode='r', encoding='utf-8')
    # Truncate any previous output; the file is reopened per line below.
    open(output, mode='w', encoding='utf-8').close()
    line_number = 1
    yesterday_day = int((datetime.now() - timedelta(days=1)).strftime('%d'))

    # Characters that disqualify a word: punctuation plus bare Korean jamo.
    banned_chars = ("‘`,'\"\\|!@#$%^&*()※~-_=+<>.;:"
                    "ㄱㄴㄲㅂㅃㅈㅉㄷㄸㅁㅇㄹㅎㅅㅆㅍㅊㅌㅋㅛㅕㅑㅐㅔㅗㅓㅏㅣㅠㅜㅡ")
    # Suffixes that mark conjugated endings and other non-noun forms.
    banned_suffixes = ('니다', '그후로', '가요', '고요', '구요', '나요', '다요',
                       '마요', '바요', '사요', '어요', '자요', '차요', '타요',
                       '해요', '세요', '네요', '케요', '군요', '하', '텐데',
                       '건데', '을려', '을껄', '습니', '씁니', '좀', '처럼',
                       '된', '나', '넣', '먹', '있', '볼라', '…', '비트코',
                       '기자', '할', '위안삼')

    while True:
        line = input_file.readline()
        if not line:
            break

        line = line.strip()

        for line_array in line.split("\n"):
            sentences = nltkSentTokenizer(line_array)

            sentence_words = []
            for sent in sentences:
                # findKoNoun returns a tuple; index 0 holds the noun candidates.
                word_list = extractnoun.findKoNoun(sent)

                for word in word_list[0]:
                    word = word.strip()
                    if any(char in banned_chars for char in word):
                        continue
                    if len(word) < 4 \
                            and not word.endswith(banned_suffixes) \
                            and word != '기자' \
                            and word != str(yesterday_day) + '일':
                        sentence_words.append(word)

        output_file = open(output, mode='a', encoding='utf-8')
        for word in sentence_words:
            output_file.write(word + os.linesep)
        output_file.close()

        print(line_number, sentence_words)

        line_number += 1

    input_file.close()
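The suffix filter above works because str.endswith accepts a tuple of candidate suffixes; a quick self-contained check with made-up words:

banned_suffixes = ('니다', '세요', '처럼')
for word in ('좋습니다', '하세요', '바람처럼', '블록체인'):
    print(word, word.endswith(banned_suffixes))
# only 블록체인 prints False, so only it would be kept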
Example #6
    # Fragment: depends on re, nltkSentTokenizer, ut and the cleanup helpers
    # from the surrounding module.
    line_number = 1
    while True:
        text = input_file.readline()
        if not text:
            break

        text = remove_keyboard_out_chractor(text)
        text = remove_naver_news(text)

        # Skip lines that look like embedded JavaScript rather than article text.
        if len(re.findall('function', text)) > 1 or len(
                re.findall('var currentDateParam', text)) > 0:
            continue

        line_sentence = []
        for text_item in text.split("\n"):
            sentences = nltkSentTokenizer(text_item)

            for sent in sentences:
                # r'\n' splits on a literal backslash-n sequence embedded in
                # the text, not on a newline character.
                for line in sent.split(r'\n'):
                    if line.strip():
                        line = ut.normalizeText(line)
                        line_sentence.append(line)

        # Require at least three sentences per document.
        if len(line_sentence) < 3:
            continue

        print(line_number, line_sentence[:-2])
        line_number += 1

        contents = (" ".join(line_sentence[1:-2]).strip())