Example #1
from ckonlpy.tag import Twitter
from ckonlpy.utils import load_wordset


def tokenize_okt(df):
    # ckonlpy Twitter tagger extended with a project-specific user dictionary;
    # call_userword() and text_tokenize() are helpers defined elsewhere.
    okt = Twitter()
    okt.add_dictionary(call_userword(), 'Noun')
    # Merge the general Korean stopword list with an extra screening list.
    stopwords = load_wordset('./tokenizer/korean_stopword.txt')
    stopwords = stopwords | load_wordset('./tokenizer/korean_screen.txt')
    stopwords = list(stopwords)
    # progress_apply requires tqdm's pandas integration (see note after Example #2).
    df['content_token'] = df.progress_apply(
        lambda x: text_tokenize(x['content'], okt, stopwords), axis=1)
    df['title_token'] = df.progress_apply(
        lambda x: text_tokenize(x['title'], okt, stopwords), axis=1)
    return df
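Example #1 delegates the per-text work to a project-specific helper `text_tokenize` that is not shown in this excerpt. A hypothetical reconstruction, consistent with the logic that Example #2 inlines:

def text_tokenize(text, tagger, stopwords):
    # Assumed behavior: keep stemmed nouns, verbs and adjectives that are
    # not stopwords and are longer than one character.
    return [word for word, tag in tagger.pos(text, stem=True)
            if tag in ('Noun', 'Verb', 'Adjective')
            and word not in stopwords and len(word) != 1]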
Example #2
from ckonlpy.tag import Twitter
from ckonlpy.utils import load_wordset


def tokenize_okt(df):
    okt = Twitter()
    okt.add_dictionary(call_userword(), 'Noun')
    stopwords = load_wordset('./tokenizer/korean_stopword.txt')
    stopwords = stopwords | load_wordset('./tokenizer/korean_screen.txt')
    # Left as a set: it is only used for membership tests below.

    def tokenize(text):
        # Keep stemmed nouns, verbs and adjectives that are not stopwords
        # and are longer than a single character.
        return [t[0] for t in okt.pos(text, stem=True)
                if t[1] in ('Noun', 'Verb', 'Adjective')
                and t[0] not in stopwords and len(t[0]) != 1]

    df['content_token'] = df.progress_apply(lambda x: tokenize(x['content']), axis=1)
    df['title_token'] = df.progress_apply(lambda x: tokenize(x['title']), axis=1)
    return df
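Both variants call `DataFrame.progress_apply`, which only exists after tqdm's pandas integration has been registered. A one-time setup like this is assumed somewhere in the project:

from tqdm import tqdm
tqdm.pandas()  # patches pandas with progress_apply / progress_map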
Example #3
import re
import requests
from bs4 import BeautifulSoup
from fileIO import openJsonFile, closeJsonFile, saveError
from dbIO import readDB, insertDB

import nltk
from nltk.corpus import stopwords
from konlpy.tag import Okt
from ckonlpy.tag import Twitter, Postprocessor
from ckonlpy.utils import load_wordset, load_ngram

# nltk.download('punkt')
# nltk.download('stopwords')
okt = Okt()
twitter = Twitter()
# Note: 'ANSI' is not a registered Python codec name; if these calls raise a
# LookupError, 'cp949' (Korean Windows) is the usual substitute.
stopwordsKR = load_wordset('cleansing_data/korean_stopwords.txt',
                           encoding='ANSI')
customStopwordsEN = load_wordset('cleansing_data/english_stopwords.txt',
                                 encoding='ANSI')
stopwordsEN = customStopwordsEN.union(set(stopwords.words('english')))
ngrams = load_ngram('cleansing_data/korean_ngram.txt')
userdicts = load_wordset('cleansing_data/korean_user_dict.txt')
twitter.add_dictionary(list(userdicts), 'Noun', force=True)


def getJobGroups():
    res = requests.get(
        'https://www.wanted.co.kr/wdlist/518?country=kr&job_sort=job.latest_order&years=-1&locations=all'
    )
    html = res.text
    soup = BeautifulSoup(html, "html.parser")
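Example #3 loads stopwords and n-grams but the excerpt breaks off before they are wired in. A minimal sketch, reusing the names defined above, of how ckonlpy's Postprocessor typically combines a base tagger with these resources:

postprocessor = Postprocessor(twitter, stopwords=stopwordsKR, ngrams=ngrams)
tokens = postprocessor.pos('입력 문장')  # [(word, tag), ...] with stopwords dropped and n-grams merged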
Example #4
import re
from itertools import chain
import time, csv
import json
from dbIO import readDB, insertDB
from selenium import webdriver

import nltk
from nltk.corpus import stopwords
from konlpy.tag import Okt
from ckonlpy.tag import Twitter, Postprocessor
from ckonlpy.utils import load_wordset, load_ngram

okt = Okt()
# twitter = Twitter()
# stopwordsKR = load_wordset('cleansing_data/korean_stopwords.txt', encoding='ANSI')
customStopwordsEN = load_wordset('cleansing_data/english_stopwords.txt',
                                 encoding='ANSI')
stopwordsEN = customStopwordsEN.union(set(stopwords.words('english')))
# ngrams = load_ngram('cleansing_data/korean_ngram.txt')
# userdicts = load_wordset('cleansing_data/korean_user_dict.txt')
# twitter.add_dictionary(list(userdicts), 'Noun', force=True)


def connectWebDriver(web):
    options = webdriver.ChromeOptions()
    options.add_argument("disable-gpu")
    options.add_argument("headless")
    options.add_argument("lang=ko_KR")

    # The element structure can vary with the browser window size (media
    # queries, etc.), so pin the window size before starting; note that in
    # headless mode an explicit --window-size argument is often needed instead.
    options.add_argument('--start-maximized')
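The excerpt stops before the driver is created; a plausible continuation, under the assumption that the `web` argument is the URL to open:

    driver = webdriver.Chrome(options=options)
    driver.get(web)
    return driver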
Example #5
from konlpy.tag import Okt
from konlpy.utils import pprint
from collections import Counter
from ckonlpy.tag import Postprocessor
from ckonlpy.tag import Twitter
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from gensim import corpora, models
import pyLDAvis
import pyLDAvis.gensim as gensimvis  # renamed to pyLDAvis.gensim_models in newer pyLDAvis releases
import codecs

from ckonlpy.utils import load_wordset
passwords = load_wordset('postprocess/passwords.txt')
stopwords = load_wordset('postprocess/stopwords.txt')

from ckonlpy.utils import load_replace_wordpair
replace = load_replace_wordpair('postprocess/replace.txt')

from ckonlpy.utils import load_ngram
ngrams = load_ngram('postprocess/ngrams.txt')

okt = Okt()  # lowercase name so the Okt class itself is not shadowed
twitter = Twitter()

new_nouns = []

with open('preprocess/dictionary.txt', encoding='utf8') as fd:
    for line in fd:
        new_nouns.append(line.strip('\n'))
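Example #5 loads every Postprocessor resource and imports the gensim/pyLDAvis stack, but the excerpt stops after reading the custom noun dictionary. A minimal sketch of how these pieces typically fit together, where `docs` (a list of token lists) is an assumed name:

twitter.add_dictionary(new_nouns, 'Noun')  # register the nouns read above
postprocessor = Postprocessor(twitter, stopwords=stopwords,
                              passwords=passwords, replace=replace, ngrams=ngrams)

dictionary = corpora.Dictionary(docs)  # docs: tokenized documents (assumed)
corpus = [dictionary.doc2bow(doc) for doc in docs]
lda = models.LdaModel(corpus, num_topics=10, id2word=dictionary)
vis = gensimvis.prepare(lda, corpus, dictionary)
pyLDAvis.save_html(vis, 'lda_vis.html')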