Example #1
def classify(document, label_dict, tfidf_train_vect):

    # Morphological analyzer (Mecab)
    mecab = Mecab(dicpath='C:\\mecab\\mecab-ko-dic')
    # Rebuild the input document from nouns (NNG/NNP) of length 2 or more;
    # group the tag test so the length check applies to both NNG and NNP.
    document = [
        i[0] for i in mecab.pos(document)
        if i[1] in ("NNG", "NNP") and len(i[0]) > 1
    ]
    document = " ".join(document)

    clf = load_clf()  # load the classification model
    X = tfidf_train_vect.transform([document])  # vectorize the input with the training TF-IDF vectorizer
    y = clf.predict(X)[0]  # predicted class of the input document

    proba = clf.predict_proba(X)  # class probabilities for the input document
    proba_max = np.max(proba)  # highest predicted class probability

    return label_dict[y], proba_max, y
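For context, a hedged usage sketch of classify(); load_clf() and the project's real training corpus live elsewhere, and every name and value below is purely illustrative.

# Illustrative usage only; assumes load_clf() and the project's training data exist.
from sklearn.feature_extraction.text import TfidfVectorizer

train_docs = ["경제 기사 예시 문장입니다", "스포츠 기사 예시 문장입니다"]  # hypothetical training texts
tfidf_train_vect = TfidfVectorizer()
tfidf_train_vect.fit(train_docs)  # normally fitted on the full training corpus

label_dict = {0: "economy", 1: "sports"}  # hypothetical index-to-label mapping
label, proba_max, class_idx = classify("코스피 지수가 올랐다", label_dict, tfidf_train_vect)
print(label, proba_max)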
Example #2
def read_doc(file_name):
    with open(file_name) as f:
        doc = f.read()

    tagger = Mecab()

    sentences = []  # [n_sentences, n_words]
    tags = []
    start_sentence = 0
    for sep in rLINE_SEP.finditer(doc):
        sentence = doc[start_sentence:sep.start(0)]
        sentence = clear_str(sentence)
        start_sentence = sep.end(0)
        if len(sentence) < 10:
            continue
        
        poss = tagger.pos(sentence)
        sentences.append([word for word, _ in poss])
        tags.append([tag for _, tag in poss])
    return sentences, tags
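read_doc() relies on a module-level sentence separator (rLINE_SEP) and a clear_str() helper that are not shown in this snippet. A minimal sketch of what they might look like, as an assumption only:

import re

# Hypothetical definitions; the original module's versions may differ.
rLINE_SEP = re.compile(r"(?<=[.!?])\s+")  # sentence boundary: end punctuation followed by whitespace

def clear_str(s):
    return re.sub(r"\s+", " ", s).strip()  # collapse runs of whitespace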
Example #3
def gen_summary(text, max_length):
    """Clean sentence"""
    global counter_konlpy
    global total_dataset
    mecab = Mecab()
    text = re.sub('[0-9]', '', text)

    text = mecab.nouns(text)
    text = ['GO'] + text
    text = empty_remover(text)
    if len(text) >= max_length:
        text = text[0:max_length]
    else:
        text = text + ["PAD"] * (max_length - len(text))
        text = text[0:max_length]

    counter_konlpy += 1
    sys.stdout.write("\rParsed: %d / %d" % (counter_konlpy, total_dataset))
    sys.stdout.flush()
    return ' '.join(text)
Example #4
    def __init__(self,
                 vocab_path="./transformer/vocbulary.voc",
                 tagger=Mecab()):
        # tagger: morphological analyzer -> Mecab() (default)
        self.tagger = tagger

        # Store the vocabulary in each needed form
        self.vocab, self.word2idx, \
        self.idx2word, self.vocab_len \
        = self._load_vocab_file(vocab_path)

        # start, end, unk tokens
        self.STD = "<START>"
        self.END = "<END>"
        self.UNK = "<UNK>"

        # indices of the start, end, unk tokens
        self.STD_IDX = self.word2idx[self.STD]
        self.END_IDX = self.word2idx[self.END]
        self.UNK_IDX = self.word2idx[self.UNK]
Example #5
def clean_str(s):
    """Clean sentence"""
    global counter_konlpy
    global total_dataset
    #global stopwords
    s = re.sub('[0-9]', '', s)

    mecab = Mecab()
    result = []
    try:
        result = mecab.nouns(s)
    except ValueError:
        result = []

    if len(result) > 1000:
        result = result[0:1000]
    counter_konlpy += 1
    sys.stdout.write("\rParsed: %d / %d" % (counter_konlpy, total_dataset))
    sys.stdout.flush()
    return ' '.join(result)
Example #6
    def find_common_topic(self):
        """
        Finds common topic of overall conversation, and stores into csv file.
        :return: void. csv file will be generated
        """
        # Get conversation
        self._rewind()
        all_conversations = self._preprocess(self.get_all_conversations())

        # perform nlp on all words of conversation
        mecab = Mecab()
        category = ['NNP', 'NNG']
        keywords = [
            classification[0]
            for classification in mecab.pos(str(all_conversations))
            if classification[1] in category
        ]

        freq = Counter(keywords).most_common(300)
        return freq
Example #7
def extract_key_phrases(text):
    """Return a set of key phrases.
    :param text: A string.
    """
    t = Mecab()
    tags_ko = t.pos(text)    
    
    textlist = [x[0] for x in tags_ko]
    tags_ko = filter_for_tags(tags_ko)
    
    tags_ko = normalize(tags_ko)
    word_set_list = list(tags_ko)
    
    graph = build_graph(word_set_list)
    calculated_page_rank = nx.pagerank(graph, weight='weight')

    # most important words in descending order of importance
    keyphrases = sorted(calculated_page_rank, key=calculated_page_rank.get, reverse=True)
    
    return keyphrases[0:3]
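extract_key_phrases() depends on helpers (filter_for_tags, normalize, build_graph) defined elsewhere in the project. As an assumption, build_graph returns an undirected networkx graph over the candidate words; a minimal sketch compatible with the pagerank call above:

import itertools
import networkx as nx

def build_graph(nodes):
    # Hypothetical sketch: connect every pair of candidate words with a unit-weight edge.
    # The project's real build_graph may weight edges by co-occurrence or string similarity.
    graph = nx.Graph()
    graph.add_nodes_from(nodes)
    for a, b in itertools.combinations(set(nodes), 2):
        graph.add_edge(a, b, weight=1.0)
    return graph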
Example #8
    def __init__(self,
                 input_path=None,
                 max_len=None,
                 min_len=None,
                 use_min_cnt=None,
                 word_min_cnt=None,
                 load_preprocessed=False,
                 dir_path=None):
        if not load_preprocessed:
            """
            This must be changed
            """
            self.df = pd.read_table(input_path)
            self.max_len, self.min_len = max_len, min_len
            self.use_min_cnt = use_min_cnt
            self.word_min_cnt = word_min_cnt

            self.tagger = Mecab()
        else:
            self.load(dir_path)
Example #9
def generate_worldloud(text):
    from wordcloud import WordCloud
    from konlpy.tag import Mecab

    phrases = ' '.join(text)

    mecab = Mecab()
    nouns = mecab.nouns(phrases)
    words = ' '.join(nouns)

    wordcloud = WordCloud(font_path='/Library/fonts/AppleGothic.ttf',
                          background_color='white',
                          width=600,
                          height=400)
    wordcloud.generate_from_text(words)
    wordcloud.to_file('gachi/static/gachi/images/wordcloud.png')
Example #10
class NLPParser():
    nlplib = Mecab()

    @staticmethod
    def parse(text):
        corpus = NLPParser.nlplib.pos(text)
        return corpus

    @staticmethod
    def get_sentence(corpus):
        result = []
        sentence = []
        for word in corpus:
            sentence.append(word)
            if word[1] not in ['EF', 'SF']:
                continue
            if len(sentence) > 1:
                result.append(sentence)
                sentence = []
        return result
Example #11
    def __init__(self, args):
        super().__init__(args)

        save_dir = p.join(args.path.embed, self.name)
        self.encoder_path = p.join(save_dir, f"{self.name}.bin")
        self.idf_encoder_path = p.join(save_dir, f"{self.name}_idf.bin")
        self.idf_path = p.join(save_dir, "idf.bin")

        if self.args.model.tokenizer_name == "":
            print("Using Mecab tokenizer")
            mecab = Mecab()
            self.tokenizer = mecab.morphs
        elif self.args.model.tokenizer_name in [
                "monologg/kobert", "monologg/distilkobert"
        ]:
            print("Using KoBert tokenizer")
            self.tokenizer = KoBertTokenizer.from_pretrained(
                args.model.tokenizer_name).tokenize
        else:
            print("Using AutoTokenizer: ", args.model.tokenizer_name)
            self.tokenizer = AutoTokenizer.from_pretrained(
                args.model.tokenizer_name, use_fast=True).tokenize

        self.b = self.args.retriever.b
        self.k1 = self.args.retriever.k1
        self.encoder = TfidfVectorizer(tokenizer=self.tokenizer,
                                       ngram_range=(1, 2),
                                       use_idf=False,
                                       norm=None)
        self.idf_encoder = TfidfVectorizer(tokenizer=self.tokenizer,
                                           ngram_range=(1, 2),
                                           norm=None,
                                           smooth_idf=False)
        self.dls = np.zeros(len(self.contexts))

        for idx, context in enumerate(self.contexts):
            self.dls[idx] = len(context)

        self.avdl = np.mean(self.dls)
        self.p_embedding = None
        self.idf = None
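The statistics stored here (b, k1, the document lengths dls, their mean avdl, and the idf encoder) are the standard BM25 ingredients. As an assumption about how they are combined later, the per-term contribution usually follows the classic BM25 formula; a hedged sketch, not this class's actual scoring method:

def bm25_term_score(tf, idf, dl, avdl, k1=1.6, b=0.75):
    # Classic BM25 contribution of one query term to one document's score:
    # idf * tf * (k1 + 1) / (tf + k1 * (1 - b + b * dl / avdl))
    return idf * tf * (k1 + 1) / (tf + k1 * (1 - b + b * dl / avdl))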
Example #12
def remove_particle(training_args):
    """
    remove particle

    Args:
        training_args
    """
    # load tokenizer
    mecab = Mecab()
    kkma = Kkma()
    hannanum = Hannanum()
    # load prediction file
    with open(os.path.join(training_args.output_dir, "predictions.json"),
              "r") as f:
        prediction_json = json.load(f)

    prediction_dict = dict()
    for mrc_id in prediction_json.keys():
        final_predictions = prediction_json[mrc_id]
        pos_tag = mecab.pos(final_predictions)

        # Drop the trailing particle if one is present
        if final_predictions[-1] == "의":
            min_len = min(len(kkma.pos(final_predictions)[-1][0]),
                          len(mecab.pos(final_predictions)[-1][0]),
                          len(hannanum.pos(final_predictions)[-1][0]))
            if min_len == 1:
                final_predictions = final_predictions[:-1]
        elif pos_tag[-1][-1] in {
                "JX", "JKB", "JKO", "JKS", "ETM", "VCP", "JC"
        }:
            final_predictions = final_predictions[:-len(pos_tag[-1][0])]

        prediction_dict[str(mrc_id)] = final_predictions

    # save final results
    with open(os.path.join(training_args.output_dir, "final_predictions.json"),
              'w',
              encoding='utf-8') as make_file:
        json.dump(prediction_dict, make_file, indent="\t", ensure_ascii=False)
    print(prediction_dict)
Example #13
    def __init__(self):
        self.twit = Okt()
        self.mecab = Mecab()

        # List of regular expressions
        self.regex_ls = [
            '[\t\n\r\f\v]', '\(.+?\)', '\[.+?\]', '\<.+?\>', '◀.+?▶',
            '(?<=▶).+', '(?<=▷).+', '(?<=※).+', '(?<=Copyrights).+',
            '[\w]+@[a-zA-Z]+\.[a-zA-Z]+[\.]?[a-z]*', '[가-힣]+기자', '[가-힣]+ 기자',
            '[가-힣]+ 선임기자', '[가-힣]+ 동아닷컴 기자',
            '[\{\}\[\]\/?,;·:“‘|\)*~`!^\-_+<>@○▲▶■◆\#$┌─┐&\\\=\(\'\"├┼┤│┬└┴┘|ⓒ]',
            '[0-9]+[년월분일시]*', '사진=[가-힣]*', '사진제공=[가-힣]*'
        ]

        # Lists of removal targets and stopwords
        with open('../preprocessing_data/stopword_list.json',
                  'r',
                  encoding='UTF-8') as f:
            load_file = json.load(f)
            self.word_to_be_cleaned_ls = load_file['clean']
            self.stopword_ls = load_file['stopword']
Example #14
def extract_docs(csv_path):
    mecab = Mecab()

    sts, labels, tags = [], [], []
    with open(csv_path, encoding='utf8') as f:
        reader = csv.reader(f,
                            delimiter='|',
                            escapechar=':',
                            quoting=csv.QUOTE_NONE,
                            skipinitialspace=True)

        for row in reader:
            doc = clear_str(row[1])
            for st in to_sentences(doc):
                morps = parse(st, mecab)
                if not morps: continue

                sts.append(morps)
                labels.append(row[0])
                tags.append(row[1][:50])
    return sts, labels, tags
Example #15
def read_text(fin):
    # Read the preprocessed Wikipedia file.
    corpus_li = []
    mecab = Mecab(dicpath='/opt/local/lib/mecab/dic/mecab-ko-dic')
    for line in open(fin):
        # Convert to NFKC with unicodedata.normalize to handle broken characters.
        line = unicodedata.normalize('NFKC', line)
        try:
            # Add sentences whose first character is a digit to the corpus.
            _ = int(line[0])
            corpus_li.append(' '.join(mecab.nouns(line)) + '\n')

        except ValueError:
            # Add sentences whose first character is Hangul to the corpus.
            if ord(line[0]) >= ord('가') and ord(line[0]) <= ord('힇'):
                corpus_li.append(' '.join(mecab.nouns(line)) + '\n')
            else:
                pass
    print('# of lines in corpus', len(corpus_li))
    return corpus_li
Example #16
def preprocess(
    data_path: str, word_index: dict = None, num_words: int = 10000,
):
    tokenizer = Mecab()

    # 0. data load
    with open(data_path, "rb") as f:
        data = pickle.load(f)

    # 1. bag-of-words
    vocab, docs = [], []
    for doc in tqdm(data):
        if doc:
            # skip NaN values in the nsmc data
            try:
                nouns = tokenizer.nouns(doc)
                vocab.extend(nouns)
                docs.append(nouns)
            except:
                continue

    # 2. build vocab
    if not word_index:
        vocab = Counter(vocab)
        vocab = vocab.most_common(num_words)

        # 3. add unknown token
        word_index = {"<UNK>": 0}
        for idx, (word, _) in enumerate(vocab, 1):
            word_index[word] = idx

    index_word = {idx: word for word, idx in word_index.items()}

    # 4. create corpus
    corpus = []
    for doc in docs:
        if doc:
            corpus.append([word_index.get(word, 0) for word in doc])

    return corpus, word_index, index_word
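A hedged usage sketch of preprocess(); the pickle path and its contents (a list of raw Korean sentences) are assumptions:

# Illustrative only; assumes data.pkl holds a pickled list of raw sentences.
corpus, word_index, index_word = preprocess("data.pkl", num_words=10000)
print(len(corpus), len(word_index))  # number of documents, vocabulary size (including <UNK>)
print(corpus[0][:10])                # first document as word indices; 0 maps to <UNK>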
Example #17
    def __init__(self, train=None, model=None, recom_raw=None):

        # Preprocess the tokenized training file
        train_frame = pd.read_csv(train, header=None)
        token_train = []
        for i in range(len(train_frame)):
            token = train_frame.loc[i, :].values.tolist()
            # Drop NaN cells while keeping the string tokens.
            token = [t for t in token if isinstance(t, str) or not math.isnan(t)]
            token_train.append(token)

        threshold = 3
        rare_cnt = 0  # count of words that appear fewer than `threshold` times

        # Integer encoding
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(token_train)
        total_cnt = len(tokenizer.word_index)  # number of words

        for key, value in tokenizer.word_counts.items():
            if (value < threshold):
                rare_cnt = rare_cnt + 1

        vocab_size = total_cnt - rare_cnt + 2
        self.tokenizer = Tokenizer(vocab_size, oov_token='OOV')
        self.tokenizer.fit_on_texts(token_train)

        self.mecab = Mecab()

        # Deep learning model and variables needed by the classification model
        self.deeprunning = load_model(model)
        self.category = 0

        # Data needed by the recommendation model
        self.recom_data = pd.read_csv(recom_raw)
Example #18
    def __init__(self, config):
        self.config = config
        self.device = self.config['device']
        self.examples = list()
        self.iterator = None
        self.tokenizer = Mecab()
        self.SRC = data.Field(tokenize=lambda x: x.split(' '),
                              eos_token='<eos>',
                              pad_token='<pad>',
                              lower=True,
                              batch_first=True,
                              include_lengths=True)
        self.rSRC = data.Field(tokenize=lambda x: x.split(' '),
                               eos_token='<eos>',
                               pad_token='<pad>',
                               lower=True,
                               batch_first=True,
                               include_lengths=True,
                               preprocessing=lambda x: x[::-1])

        self.SRC.vocab = pickle_reader(self.config['src_field_path'])
        self.rSRC.vocab = self.SRC.vocab
Example #19
class MecabTokenizer(object):
    tokenizer = Mecab()

    @classmethod
    def make_vocab(cls, data_path, save_path):
        texts = read_text(data_path)
        words = [word for text in tqdm(texts) for word in cls.tokenizer.morphs(preprocess_text(text))]
        word_counter = Counter(words)

        vocab = {"[PAD]": 0, "[UNK]": 1}
        idx = 2
        for word, count in word_counter.most_common():
            vocab[word] = idx
            idx += 1
        save_json(save_path, vocab)

    def __init__(self, vocab_path, vocab_size):
        vocab = read_json(vocab_path)
        self.vocab = {key: value for key, value in vocab.items() if value < vocab_size}
        self.vocab_size = len(self.vocab)
        self.pad_token_id = self.vocab.get("[PAD]", None)
        self.unk_token_id = self.vocab.get("[UNK]", None)

    def tokenize(self, text):
        text = preprocess_text(text)
        return self.tokenizer.morphs(text)

    def encode(self, text):
        text = preprocess_text(text)
        tokens = self.tokenize(text)
        ids = [self.vocab.get(token.strip(), self.unk_token_id) for token in tokens]
        return ids

    def encode_plus(self, text):
        preprocessed_text = preprocess_text(text)
        tokens = self.tokenize(preprocessed_text)
        ids = [self.vocab.get(token.strip(), self.unk_token_id) for token in tokens]
        offset = get_offset_mapping(text, tokens)
        return {"w_ids": ids, "offset_mapping": offset}
Example #20
    def __init__(self):
        self.config = {
            'min_count': 5,  # ignore words that appear fewer than 5 times
            'size': 200,  # embed into a 200-dimensional vector space
            'sg': 1,  # 0 uses CBOW, 1 uses skip-gram
            'batch_words': 10000,  # number of words to read at once when building the vocabulary
            'iter': 50,  # number of passes over the corpus, similar to epochs in deep learning
            'window': 5,  # window size
            'workers': multiprocessing.cpu_count(),
        }
        #self.projectdir = os.path.dirname(os.path.dirname(__file__))

        #self.tagger = Mecab(dicpath=self.projectdir+"/install/mecab-ko-dic/dic")
        self.tagger = Mecab()
        #self.tagger = Mecab("/home/mini/work/chatbot2017/install/mecab-ko-dic/dic")
        self.twitter_tagger = Twitter()
        self.ps = PorterStemmer()
        self.title_dict = {}
        self.docs = []
        self.texts_ko = []
        self.sentences = []
        self.doc_file_names = []
Example #21
    def __init__(self, data_handler):
        super(NaverNewsCrawler, self).__init__(NaverNewsCrawler)

        self.mecab = Mecab()
        self.data_handler = data_handler
        self.pattern_publisher = r"\s?[가-힣\s]{3,}기자"
        self.pattern_email = r"([\w-]+)@([\w\.-]+)(\.[\w\.]+)"
        driver_path = CONFIG['chromedriver_path']

        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        options.add_argument('window-size=1920x1080')
        options.add_argument("disable-gpu")
        options.add_argument(
            "user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) "
            "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36"
        )

        self.driver = webdriver.Chrome(driver_path, chrome_options=options)
        self.driver.implicitly_wait(3)
Example #22
    def find_common(filename):
        """
        Finds common keyword in given file(name)
        :param filename: name of file to analyze
        :return: Common keyword of file(returned as list of tuples)
                 Returned result is sorted
        """
        file = sample._open_file(filename)
        keywords = list()

        while True:
            line = file.readline().split(',')
            keywords += line[:-1]
            if line == ['']:
                break

        mecab = Mecab()
        category = ['NNP', 'NNG', 'SL', 'VV', 'VA', 'XR', 'VA+ETM', 'NP+VCP+EC']
        keywords = [classification[0] for classification in mecab.pos(str(keywords)) if classification[1] in category]

        cnt = sorted(Counter(keywords).items(), key=lambda x: x[1], reverse=True)
        return cnt
Example #23
def make_noun_voca_list(raw_list, konlpy_opt):
    noun_voca_list = []
    assert konlpy_opt in ['Hannanum', 'Kkma', 'Komoran', 'Mecab', 'Okt']

    if konlpy_opt == 'Hannanum':
        nlp = Hannanum()
    elif konlpy_opt == 'Kkma':
        nlp = Kkma()
    elif konlpy_opt == 'Komoran':
        nlp = Komoran()
    elif konlpy_opt == 'Mecab':
        nlp = Mecab()
    elif konlpy_opt == 'Okt':
        nlp = Okt()

    for line in raw_list:
        if type(line).__name__ == 'float' or type(line).__name__ == 'int':
            noun_voca_list.append('')
        else:
            noun_voca_list.append(
                nlp.nouns(' '.join(re.compile('[가-힣0-9]+').findall(line))))

    return noun_voca_list
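A hedged usage sketch of make_noun_voca_list(); the input list is illustrative:

# Illustrative only.
raw_list = ["서울에서 열린 회의 참석자 명단", float("nan"), "두 번째 문서 내용"]
noun_lists = make_noun_voca_list(raw_list, "Mecab")
# Each entry is the list of nouns extracted from that row; non-string rows become ''.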
Example #24
    def make_tag(self, contents):
        mecab = Mecab()
        tmp_tag = []
        count = Counter(' ')

        for col in self.collection_review.find({"ISBN": contents}, {
                "_id": 0,
                "review_text": 1
        }):
            pos = mecab.pos(col['review_text'])

            for i in range(1, len(pos)):
                if pos[i][1] in ("NNG", "NNP"):
                    print(pos[i][0])
                    tmp_tag.append(pos[i][0])

            count = count + Counter(tmp_tag)
            print(type(count))

        tag = count
        # self.collection_TitleTag.insert({"ISBN": contents, "tag": tag})

        return tag
Example #25
def get_weight(url, f_l):
    mecab = Mecab()
    d_l = dict()
    for f in f_l:
        exec("d=" + open(os.getcwd() + "/static/dt/" + f + "2.txt").read(),
             globals())
        d_l[f] = d
    soup = ""
    try:
        a = Article(url, language='ko')
        a.download()
        a.parse()
        soup = a.text
    except:
        return -1
    print(soup)
    me = list()
    for i in mecab.pos(soup):
        if i[1] == "NNG" or i[1] == "NNP":
            me.append(i[0])
    print(me)
    W_l = dict()
    for f in f_l:
        W = 0.0
        for i in d_l[f]:
            if i in me:
                W = W + math.log(d_l[f][i])
        print(W)
        print(len(me), len(d_l[f]))
        if len(me) == 0 or len(d_l[f]) == 0:
            W = 0
        else:
            W = float(
                int((W / (float(
                    (len(me)**0.7) * (len(d_l[f])**0.5))) * (10**6))) / 10.0)
        W_l[f] = W
    return W_l
Example #26
def expect_single_noun_text_ko(sentence):
    # Define a chunk grammar, or chunking rules, then chunk

    grammar = """
    명사1: {<SL>}
    명사1: {<SN>}

    명사1: {<NNG>}
    명사2: {<NN.*>}


    동사구: {<NP\+VCP\+EF>}
    동사구: {<NP><VCP\+EF>}
    형용사: {<MA.*>*}
    """
    mecab = Mecab()

    postagged_sentence = mecab.pos(sentence)
    nltk_rexp_parser = nltk.RegexpParser(grammar)
    chunks_sentence = nltk_rexp_parser.parse(postagged_sentence)

    extract_noun = []
    extract_noun_score = {}
    for subtree in chunks_sentence.subtrees():
        if subtree.label().startswith('명사'):
            noun = ' '.join(e[0] for e in subtree)
            if len(noun) > 1:
                if re.search(r"\s", noun):
                    extract_noun.append(noun)
                    # extract_noun_score[noun] = 0.75
                    if not in_dict(extract_noun_score, noun):
                        extract_noun_score[noun] = 0.75
                    else:
                        extract_noun_score[noun] += 0.75

    return sorted_dict(extract_noun_score)
Example #27
    def get_tokenizer(self, tokenizer):
        tokenizer = tokenizer.lower()

        if tokenizer == "mecab":
            tokenizer = Mecab()

        elif tokenizer == "hannanum":
            tokenizer = Hannanum()

        elif tokenizer == "kkma":
            tokenizer = Kkma()

        elif tokenizer == "komoran":
            tokenizer = Komoran()

        elif tokenizer == "Okt":
            tokenizer = Okt()

        else:
            raise RuntimeError(
                "Tokenizer must be the one of Mecab, Hannanum, Kkma, Komoran, Okt."
            )

        return tokenizer
Example #28
def annotate_example_tootouch(example, table):
    """
    Apr. 2021: Jaehyuk
    Annotate only the information that will be used in our model.
    """

    # tokenizer
    tokenizer = Mecab()

    ann = {'table_id': example['table_id'], 'phase': example['phase']}
    ann['question'] = example['question']
    ann['question_tok'] = [
        str(q).lower() for q in tokenizer.morphs(example['question'])
    ]
    # ann['table'] = {
    #     'header': [annotate(h) for h in table['header']],
    # }
    ann['sql'] = example['sql']
    ann['query'] = copy.deepcopy(example['sql'])

    conds1 = ann['sql']['conds']
    wv_ann1 = []
    for conds11 in conds1:
        wv_ann11 = tokenizer.morphs(str(conds11[2]))
        wv_ann1.append(wv_ann11)

        # Check whether wv_ann exists inside question_tok

    try:
        wvi1_corenlp = check_wv_tok_in_nlu_tok(wv_ann1, ann['question_tok'])
        ann['wvi_corenlp'] = wvi1_corenlp
    except:
        ann['wvi_corenlp'] = None
        ann['tok_error'] = 'SQuAD style st, ed are not found under CoreNLP.'

    return ann
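check_wv_tok_in_nlu_tok() is defined elsewhere in the project. As an assumption, it locates each tokenized condition value as a contiguous span inside the tokenized question, roughly like this sketch:

def check_wv_tok_in_nlu_tok(wv_toks, nlu_toks):
    # Hypothetical sketch: return (start, end) token indices for each value span; raise if a span is absent.
    spans = []
    for wv in wv_toks:
        for st in range(len(nlu_toks) - len(wv) + 1):
            if nlu_toks[st:st + len(wv)] == wv:
                spans.append((st, st + len(wv) - 1))
                break
        else:
            raise ValueError("value tokens not found in the question tokens")
    return spans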
Example #29
def get_nouns_from_csv(data, stopword, synonym):
    '''
    requirement : pandas, mecab
    data : dataframe type, content of article
    stopword : set type, stopword
    synonym : dict type, use in function preprocess
    return : word_list

    Reads the csv file from the given path into a dataframe, then runs Mecab
    morphological analysis on each row's 'content' column and appends only the nouns to word_list.
    '''
    mecab = Mecab()  # Mecab morphological analyzer (with a user-defined dictionary added)
    word_list = []

    for idx in tqdm(range(len(data))):
        try:
            nouns = mecab.nouns(data.loc[idx, 'content'])
            nouns = preprocess(nouns=nouns, stopword=stopword, dic=synonym)
            word_list.append(nouns)
        except Exception as e:
            continue
    print("\nNoun Extraction Complete")

    return word_list
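A hedged usage sketch of get_nouns_from_csv(); the DataFrame, stopword set, and synonym map are illustrative, and preprocess() is assumed to come from the surrounding project.

# Illustrative only.
import pandas as pd

articles = pd.DataFrame({"content": ["정부가 새로운 경제 정책을 발표했다"]})
stopword = {"정부"}            # hypothetical stopword set
synonym = {"정책": "정책안"}    # hypothetical synonym map consumed by preprocess()
word_list = get_nouns_from_csv(articles, stopword, synonym)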
Example #30
    def __init__(
        self,
        job_id,
        vocab_file,
        output_dir,
        max_seq_length,
        num_jobs,
        blanks_separate_docs,
        do_lower_case,
        tokenizer_type,
        num_out_files=500,
    ):
        self._blanks_separate_docs = blanks_separate_docs

        if tokenizer_type == "mecab_wordpiece":
            tokenizer = KoNLPyBertTokenizer(
                konlpy_wordpiece=KoNLPyWordPieceTokenizer(Mecab(),
                                                          use_tag=False),
                vocab_file=vocab_file,
                do_lower_case=do_lower_case,
            )
        elif tokenizer_type == "wordpiece":
            tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                                   do_lower_case=do_lower_case)
        self._example_builder = ExampleBuilder(tokenizer, max_seq_length,
                                               tokenizer_type)
        self._writers = []
        for i in range(num_out_files):
            if i % num_jobs == job_id:
                output_fname = os.path.join(
                    output_dir,
                    "pretrain_data.tfrecord-{:}-of-{:}".format(
                        i, num_out_files),
                )
                self._writers.append(tf.io.TFRecordWriter(output_fname))
        self.n_written = 0