Example #1
    def __init__(self, command='knp', option='-tab', rcfile='',
                 server=None, port=31000, timeout=30,
                 pattern=r'(?:^|\n)EOS($|\n)',
                 jumanrcfile='', juman_option='-e2 -B', juman_port=32000,
                 juman_command='juman', jumanpp=False):

        self.use_jumanpp = (juman_command == "jumanpp") or jumanpp
        assert 'EOS' in pattern
        self.pattern = pattern
        self.EOS = 'EOS'
        # only the tab format is parsed
        assert '-tab' in option

        if rcfile and not os.path.isfile(os.path.expanduser(rcfile)):
            sys.stderr.write("Can't read rcfile (%s)!\n" % rcfile)
            quit(1)

        # Setup Juman(++)
        assert port != juman_port
        juman_args = {'option': juman_option, 'rcfile': jumanrcfile,
                      'server':server, 'port':juman_port}
        if self.use_jumanpp:
            self.juman = Jumanpp(**juman_args)
        else:
            self.juman = Juman(**juman_args)
        # Setup KNP
        if server is not None:
            self.socket = Socket(server, port, option=option, timeout=timeout)
            self.query = partial(self.socket.query, pattern=pattern)
        else:
            if rcfile:
                option += " -r {}".format(rcfile)
            self.subprocess = Subprocess(command, option=option)
            self.query = partial(self.subprocess.query, pattern=pattern)
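A minimal usage sketch for a wrapper built around a constructor like the one above (a sketch, assuming pyknp is installed and the knp and jumanpp binaries are on PATH; KNP, parse and bnst_list are pyknp's documented entry points):

from pyknp import KNP

# parse one sentence and print the surface form of each bunsetsu
knp = KNP(option='-tab', jumanpp=True)  # '-tab' output is required by the parser
result = knp.parse("望遠鏡で泳ぐ少女を見た。")
for bnst in result.bnst_list():
    print("".join(mrph.midasi for mrph in bnst.mrph_list()))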
Example #2
def analyzer():
    bc = BertClient(ip='bertserving', output_fmt='list')
    client = Elasticsearch('elasticsearch:9200')
    texts = []
    list_text = []
    jumanpp = Juman()
    query = request.args.get('q')
    result = jumanpp.analysis(query)
    for mrph in result.mrph_list():
        texts.append(mrph.midasi)
    list_text.append(" ".join(texts))
    query_vector = bc.encode(list_text, is_tokenized=False)[0]
    script_query = {
        "script_score": {
            "query": {
                "match": {
                    "source": "tb"
                }
            },
            "script": {
                "source":
                "cosineSimilarity(params.query_vector, doc['question_vector']) + 1.0",
                "params": {
                    "query_vector": query_vector
                }
            }
        }
    }

    response = client.search(index=INDEX_NAME,
                             body={
                                 "size": SEARCH_SIZE,
                                 "query": script_query
                             })
    return jsonify(response)
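For reference, a hedged sketch of the index mapping such a script_score query presupposes: question_vector has to be a dense_vector field whose dimension matches the encoder (dims=768 is an assumption about the BERT model; INDEX_NAME is the constant used above):

# hypothetical mapping; adjust "dims" to your encoder's output size
mapping = {
    "mappings": {
        "properties": {
            "question": {"type": "text"},
            "source": {"type": "keyword"},
            "question_vector": {"type": "dense_vector", "dims": 768},
        }
    }
}
client.indices.create(index=INDEX_NAME, body=mapping)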
Example #3
    def __init__(self):
        """Constructs a Tokenizer for Juman++.
    """
        from pyknp import Juman

        self.do_lower_case = False
        self._jumanpp = Juman()
Example #4
    def __init__(self,
                 command='knp',
                 server=None,
                 port=31000,
                 timeout=60,
                 option='-tab',
                 rcfile='',
                 pattern=r'EOS',
                 jumancommand='juman',
                 jumanrcfile='',
                 jumanpp=False):
        self.command = command
        self.server = server
        self.port = port
        self.timeout = timeout
        self.option = option
        self.rcfile = rcfile
        self.pattern = pattern
        self.socket = None
        self.subprocess = None
        self.jumanpp = (jumancommand == "jumanpp") or jumanpp

        if self.rcfile and not os.path.isfile(os.path.expanduser(self.rcfile)):
            sys.stderr.write("Can't read rcfile (%s)!\n" % self.rcfile)
            quit(1)

        if (self.jumanpp):
            self.juman = Jumanpp()
        else:
            self.juman = Juman(command=jumancommand, rcfile=jumanrcfile)
Example #5
    def _apply_jumanpp(self, inp: str) -> Tuple[str, str]:
        jumanpp = Juman(command=self.juman, option=self.juman_option)
        jumanpp_result = jumanpp.analysis(inp)
        jumanpp_out = jumanpp_result.spec() + 'EOS\n'
        jumanpp_conll_out = self._jumanpp2conll_one_sentence(jumanpp_out) + 'EOS\n'
        return jumanpp_out, jumanpp_conll_out
Example #6
    def __init__(self,
                 command='knp',
                 server=None,
                 port=31000,
                 timeout=60,
                 option='-tab',
                 rcfile='',
                 pattern=r'EOS',
                 jumancommand='jumanpp',
                 jumanrcfile='',
                 jumanoption='',
                 jumanpp=True):
        self.command = command
        self.server = server
        self.port = port
        self.timeout = timeout
        self.options = option.split()
        self.rcfile = rcfile
        self.pattern = pattern
        self.socket = None
        self.subprocess = None
        self.jumanpp = jumanpp

        if self.rcfile and not os.path.isfile(os.path.expanduser(self.rcfile)):
            raise Exception("Can't read rcfile (%s)!" % self.rcfile)
        if distutils.spawn.find_executable(self.command) is None:
            raise Exception("Can't find KNP command: %s" % self.command)

        self.juman = Juman(command=jumancommand,
                           rcfile=jumanrcfile,
                           option=jumanoption,
                           jumanpp=self.jumanpp)
def main(DATA_ROOT):
    text_files = Path(DATA_ROOT).glob('**/*.txt')
    for text_file in text_files:
        with open(text_file) as f:
            content = f.read()

        content = re.sub(r"=+(.*?)=+", "\g<1>", content)
        content = re.sub(r"^\n", "", content, flags=re.MULTILINE)
        content = content.replace('<block>', '')
        content = content.replace('<math-element>', '')
        # In this case, 。 can be removed safely
        sentences = re.split(r"[。\n]", content)
        sentences = [line for line in sentences if len(line) != 0]
        sentences = [''.join(line.split()) for line in sentences]

        # Remove sentence which is not properly parsed
        val_sentences = []
        offsets = []

        juman = Juman()

        for sentence in tqdm(sentences):
            # Try to parse
            try:
                result = juman.analysis(sentence)

            except ValueError:
                # skip sentences that Juman fails to parse
                print(sentence)
                continue

            current = 0
            offset = [0 for _ in range(len(sentence))]

            for mrph in result.mrph_list():
                current = current + len(mrph.midasi)
                try:
                    offset[current - 1] = 1

                except IndexError as e:
                    print(sentence)
                    print(current)
                    for _mrph in result.mrph_list():
                        print(_mrph.midasi)
                    raise e

                except Exception as e:
                    raise e

            val_sentences.append(sentence)
            offsets.append(offset)

        results = (val_sentences, offsets)

        file_name = text_file.name[:-4] + '.pickle'
        dic = text_file.parent

        with open(Path(dic, file_name), 'wb') as f:
            pickle.dump(results, f)
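To make the offset encoding above concrete: offset marks the last character of every morpheme with 1, so morpheme boundaries can be recovered from character positions alone. A small hypothetical illustration (the actual segmentation depends on the Juman dictionary):

# sentence: 私は本を読んだ -> morphemes: 私 / は / 本 / を / 読んだ
# chars:    私 は 本 を 読 ん だ
# offset:   1  1  1  1  0  0  1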
Example #8
    def run(self):
        data = self.load()
        jumanpp = Juman()
        output = []
        for _, row in data.iterrows():
            zenkaku = jaconv.h2z(row["sentence"], ascii=True, digit=True)
            splited = [
                mrph.midasi for mrph in jumanpp.analysis(zenkaku).mrph_list()
            ]
            if self.task_name == 'QA_B':
                qa_zenkaku = jaconv.h2z(
                    f"{row['target']}の{row['aspect']}は{row['sentiment']}",
                    ascii=True,
                    digit=True,
                )
            else:
                qa_zenkaku = " "
            qa_splited = [
                mrph.midasi
                for mrph in jumanpp.analysis(qa_zenkaku).mrph_list()
            ]
            output.append({
                "context": " ".join(splited),
                "qa": " ".join(qa_splited),
                "label": 1
            })
        self.dump(pd.DataFrame(output))
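The h2z call matters because Juman++ expects full-width input; a quick demonstration of the conversion (jaconv.h2z turns half-width ASCII letters and digits into their full-width forms):

import jaconv

print(jaconv.h2z("ABC 123", ascii=True, digit=True))
# -> 'ＡＢＣ　１２３'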
Example #9
def initialize(fword, tword, modelfn, start, debug):
    juman = Juman()
    # parse and check from_word
    ms_f = juman.analysis(fword).mrph_list()
    if len(ms_f) > 1:
        print(u'{} is parsed multiple words'.format(fword))
        exit(1)
    wm_f = ms_f[0]
    if not wm_f.repname:
        print(u'no repname with {}'.format(fword))
        exit(1)
    fword = wm_f.repname
    # parse and check to_word
    ms_t = juman.analysis(tword).mrph_list()
    if len(ms_t) > 1:
        print(u'{} is parsed multiple words'.format(tword))
        exit(1)
    wm_t = ms_t[0]
    if not wm_t.repname:
        print(u'no repname with {}'.format(tword))
        exit(1)
    tword = wm_t.repname
    # load and check model
    print(u'loading model...')
    if modelfn.split('.')[-1] == 'model':
        model = Word2Vec.load(modelfn)
    elif modelfn.split('.')[-1] == 'bin':
        model = Word2Vec.load_word2vec_format(modelfn, binary=True, unicode_errors='ignore')
    if fword not in model.vocab:
        raise KeyError(u'{} is not found in the model'.format(fword))
    elif tword not in model.vocab:
        raise KeyError(u'{} is not found in the model'.format(tword))
    model.save('hs0.100m.500.5.18mgt100.model')

    t1 = time.clock() - start
    if debug:
        printtime(t1)

    print(u'constructing id2vocab map...')
    id2vocab = {}
    for i, v in enumerate(model.vocab):
        id2vocab[i] = v

    t2 = time.clock() - t1
    if debug:
        printtime(t2)

    print(u'constructing V...')
    V = []
    for v in model.vocab:
        V.append(model[v])
    V = np.vstack(V)

    t3 = time.clock() - t2
    if debug:
        printtime(t3)
    return fword, tword, model, V, id2vocab, t3
Example #10
    def string_word_point(self, df):
        jumanpp = Juman(jumanpp=False)
        tmp_word = []
        df_time_word = pd.DataFrame(index=[], columns=['time', 'word'])            # words and their timestamps
        df_word_point = pd.DataFrame(index=[], columns=['word', 'point'])          # words and their occurrence counts
        df_time_point = pd.DataFrame(index=[], columns=['time', 'point'])          # comment count per timestamp
        df_time_www_point = pd.DataFrame(index=[], columns=['time', 'point'])      # "www" (laughter) count per timestamp
        df_time_hakusyu_point = pd.DataFrame(index=[], columns=['time', 'point'])  # applause count per timestamp
        df_URL_point = pd.DataFrame(index=[], columns=['URL', 'point'])            # URL summary

        for i in range(len(df)):
            # if the comment is a URL, record it
            url = URL_hanbetu(df['comment'][i])
            if url != False:
                tmp = self.my_index(df_URL_point['URL'], url)
                df_URL_point = self.make_df_append(df_URL_point, tmp, url)

            # strip symbols from the comment
            print(df['comment'][i])
            df['comment'][i] = self.my_delete(df['comment'][i])
            # convert h:m:s to hms
            tmp_time = self.strtime_to_inttime(df['time'][i])

            # count comments per timestamp
            tmp = self.my_index(df_time_point['time'], tmp_time)
            df_time_point = self.make_df_append(df_time_point, tmp, tmp_time)
            # append 1 if the comment contains "www", otherwise 0
            print(url)
            if False != self.www_hanbetu(df['comment'][i]) and url == False:
                df_time_www_point = self.make_df_append(df_time_www_point, tmp, tmp_time)
            else:
                if False == tmp:
                    df_time_www_point = df_time_www_point.append({'time': tmp_time, 'point': 0}, ignore_index=True)
            # append 1 if the comment contains applause, otherwise 0
            if False != self.hakusyu_hanbetu(df['comment'][i]):
                df_time_hakusyu_point = self.make_df_append(df_time_hakusyu_point, tmp, tmp_time)
            else:
                if False == tmp:
                    df_time_hakusyu_point = df_time_hakusyu_point.append({'time': tmp_time, 'point': 0}, ignore_index=True)

            # morphological analysis
            result = jumanpp.analysis(df['comment'][i])
            # build the word DataFrames from the analysis result
            for token in result.mrph_list():
                tmp_word = token.midasi
                # count noun occurrences
                if 0 != self.word_Classification(token.hinsi):
                    if self.word_Classification(token.hinsi) == '名詞':
                        tmp = self.my_index(df_word_point['word'], tmp_word)
                        df_word_point = self.make_df_append(df_word_point, tmp, tmp_word)
                        # record the noun together with its timestamp
                        df_time_word = df_time_word.append({'time': tmp_time, 'word': tmp_word}, ignore_index=True)

        return df_time_word, df_word_point, df_time_point, df_time_www_point, df_time_hakusyu_point, df_URL_point
Example #11
    def __init__(self, bert_model: str, fine_tuned_model: str, jumanpp_command: str):
        self.jumanpp = Juman(command=jumanpp_command)

        self.tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=False)
        self.model = BertPosNegClassifier(bert_model)
        state_dict = torch.load(fine_tuned_model, map_location=torch.device('cpu'))
        self.model.load_state_dict({k.replace('module.', ''): v for k, v in state_dict.items()})
        self.model.eval()
Example #12
    def __init__(
        self,
        preprocessor=None,
        stopwords=[],
    ):
        self.jumanpp = Juman()
        self.preprocessor = preprocessor
        self.stopwords = stopwords
Example #13
def counter(text, d):
    jumanapp = Juman()
    result = jumanapp.analysis(text)
    for mrph in result.mrph_list():
        if mrph.genkei in d:
            d[mrph.genkei] = d[mrph.genkei] + 1
        else:
            d[mrph.genkei] = 1
Example #14
    def morphological_analysis(self, text):
        jumanpp = Juman()
        ret = []
        text = self.remove_special_character(text)
        result = jumanpp.analysis(text)  # segments the text into space-delimited words
        for mrph in result.mrph_list():
            ret += self.modification(mrph.midasi)
        return ret
Example #15
def juman_list(text):
    jumanpp = Juman()
    result = jumanpp.analysis(text)
    # replace every alphabetic token with the string "En"
    wakati = [
        mrph.genkei if mrph.bunrui != "アルファベット" else "En"
        for mrph in result.mrph_list()
    ]
    return ",".join(wakati)
Example #16
def juman_test():
    juman = Juman()
    print( dir(juman) )
    text = "テストテキスト"
    utext = unicode("".join(text.split()))
    print( u'"'+utext+u'"' )
    juman_result = juman.analysis( utext )
    for mrph in juman_result.mrph_list():
        print( '> ' + mrph.midasi + ' : ' + mrph.yomi + ' : ' + mrph.genkei )
Example #18
    def test_juman_wrapper(self):
        juman = Juman()
        result = juman.analysis(u"これはペンです。")
        print(','.join(mrph.midasi for mrph in result))

        for mrph in result.mrph_list():
            assert isinstance(mrph, pyknp.Morpheme)
            print(u"見出し:%s, 読み:%s, 原形:%s, 品詞:%s, 品詞細分類:%s, 活用型:%s, 活用形:%s, 意味情報:%s, 代表表記:%s" \
                  % (mrph.midasi, mrph.yomi, mrph.genkei, mrph.hinsi, mrph.bunrui, mrph.katuyou1, mrph.katuyou2, mrph.imis, mrph.repname))
Example #19
    def __init__(self):
        """
        日本語専用トークナイザの構築。
        JUMAN++ を使用する。
   """
        from pyknp import Juman

        self.do_lower_case = False
        self._jumanpp = Juman()
Example #20
def get_repname_using_jumanpp(genkei: str, pos: str) -> str:
    if pos == '助詞':
        return f'{genkei}/{genkei}'

    juman = Juman(option='-s 1')
    mrphs = juman.analysis(genkei, juman_format=JUMAN_FORMAT.LATTICE_TOP_ONE)
    # check that the segmentation didn't go wrong (i.e. the input is a single morpheme)
    if len(mrphs) == 1:
        return mrphs[0].repname

    return f'{genkei}/{genkei}'
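repname is Juman's 代表表記 ("representative notation"), a 表記/読み pair; a hedged usage sketch (outputs are illustrative):

print(get_repname_using_jumanpp("走る", "動詞"))  # e.g. '走る/はしる'
print(get_repname_using_jumanpp("が", "助詞"))    # particles fall back to 'が/が'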
Example #21
    def __init__(self):
        '''initialize

        Examples
        --------
        >>> nlp = JNLP()

        '''

        self.juman = Juman()
        self.KNP = KNP(option='-tab -anaphora')
Example #22
    def test_juman_wrapper(self):
        try:
            juman = Juman(command=self.path_to_juman_command)
            result = juman.analysis("これはペンです。")
            logger.debug(','.join(mrph.midasi for mrph in result))

            for mrph in result.mrph_list():
                assert isinstance(mrph, pyknp.Morpheme)
                logger.debug("見出し:%s, 読み:%s, 原形:%s, 品詞:%s, 品詞細分類:%s, 活用型:%s, 活用形:%s, 意味情報:%s, 代表表記:%s" \
                      % (mrph.midasi, mrph.yomi, mrph.genkei, mrph.hinsi, mrph.bunrui, mrph.katuyou1, mrph.katuyou2, mrph.imis, mrph.repname))
        except ImportError:
            print('skip test_juman_wrapper')
Example #23
def bulk_predict(docs, batch_size=256):
    """Predict BERT embeddings."""
    jumanpp = Juman(jumanpp=False)
    for i in range(0, len(docs), batch_size):
        batch_docs = docs[i: i + batch_size]
        pre_embedding_docs = []
        for doc in batch_docs:
            # split long questions into MAX_TXT_LENGTH-sized chunks
            for k in range(0, len(doc['question']), MAX_TXT_LENGTH):
                result = jumanpp.analysis(doc['question'][k:k + MAX_TXT_LENGTH])
                texts = [mrph.midasi for mrph in result.mrph_list()]
                pre_embedding_docs.append(" ".join(texts))
        embeddings = bc.encode(pre_embedding_docs, is_tokenized=True)
        for emb in embeddings:
            yield emb
Example #24
def seg2word(seg):
    len_split = 1000
    # seg = seg_in.replace(' ', '\u3000')
    # seg = seg_in.replace(' ', ' ')
    len_seg = len(seg)
    seg_splits = [seg[i:i + len_split] for i in range(0, len_seg, len_split)]

    juman_def = Juman(command="/mnt/gold/users/s18153/bin/jumanpp")
    return ' '.join([
        " ".join(
            [mrph.midasi for mrph in juman_def.analysis(seg_part).mrph_list()])
        for seg_part in seg_splits
    ])
Example #25
def juman_wakati(text, hinshi=(), DEBUG=False, STEM_FLAG=False):
    juman = Juman()
    output = ""
    # wakati
    result = juman.analysis(text)
    for mrph in result.mrph_list():
        if STEM_FLAG and mrph.hinsi in hinshi:
            output += mrph.repname.split("/")[0] + " "
        if DEBUG:
            print("stem:", mrph.repname)
            print("midashi:", mrph.repname)
            print("hinsi:", mrph.hinsi)
            print("yomi:", mrph.yomi)
    return output.strip()
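A hedged usage sketch: with STEM_FLAG set, the function keeps only the surface head of repname for the selected POS classes (the output is illustrative):

print(juman_wakati("猫が走る", hinshi=("名詞", "動詞"), STEM_FLAG=True))
# -> '猫 走る'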
Example #26
    def __init__(
        self,
        command='knp',
        server=None,
        port=31000,
        timeout=60,
        option='-tab',
        rcfile='',
        pattern=r'EOS',
        jumancommand='jumanpp',
        jumanrcfile='',
        jumanoption='',
        jumanpp=True,
        multithreading=False,
    ):
        self.command = command
        self.server = server
        self.port = port
        self.timeout = timeout
        self.options = option.split()
        self.rcfile = rcfile
        self.pattern = pattern
        if server is not None:
            self.analyzer = Analyzer(backend='socket',
                                     timeout=timeout,
                                     server=server,
                                     port=port,
                                     socket_option='RUN -tab -normal\n')
        else:
            cmds = [self.command] + self.options
            if self.rcfile:
                cmds += ['-r', self.rcfile]
            self.analyzer = Analyzer(backend='subprocess',
                                     multithreading=multithreading,
                                     timeout=timeout,
                                     command=cmds)
        self.jumanpp = jumanpp

        if self.rcfile and not os.path.isfile(os.path.expanduser(self.rcfile)):
            raise Exception("Can't read rcfile (%s)!" % self.rcfile)
        if distutils.spawn.find_executable(self.command) is None:
            raise Exception("Can't find KNP command: %s" % self.command)

        self.juman = Juman(command=jumancommand,
                           rcfile=jumanrcfile,
                           option=jumanoption,
                           jumanpp=self.jumanpp,
                           multithreading=multithreading)
Example #27
def read_home_timeline( session ):
    print( '[kazuha] - read timeline.' )
    juman = Juman()
    req = session.get( twitter.API_home_timeline, params = {} )
    if req.status_code == 200:
        timeline = json.loads( req.text )
        for tweet in timeline:
            u_tweet_text = unicode( "".join(tweet["text"].split()) )
            print( u'[kazuha] - read timeline: '+ u_tweet_text )
            juman_result = juman.analysis( u_tweet_text )
            for mrph in juman_result.mrph_list():
                print u"%s - (%s, %s)" % (mrph.genkei, mrph.hinsi, mrph.bunrui)
            #end for
        #end for
    else:
        print( '[kazuha] - read timeline: failure.' )
Example #28
class JumanTokenizer:
    def __init__(self, command, options):
        self.juman = Juman(command=command, option=options)

    def tokenize(self, text):
        result = self.juman.analysis(text)
        return [mrph.midasi for mrph in result.mrph_list()]
Example #29
def main(bert_vocab_filepath, example_filepath, context_filepath,
         cache_save_dir):
    #Juman++
    juman = Juman(jumanpp=True)

    logger.info("Cache files will be saved in {}.".format(cache_save_dir))

    #Tokenizer
    logger.info("Create a tokenizer from {}.".format(bert_vocab_filepath))
    tokenizer = BertTokenizer.from_pretrained(bert_vocab_filepath,
                                              do_lower_case=False)

    logger.info("Start loading examples from {}.".format(example_filepath))
    examples = load_examples(example_filepath)
    logger.info("Finished loading examples.")
    logger.info("Number of examples: {}".format(len(examples)))

    logger.info("Start loading contexts from {}.".format(context_filepath))
    contexts = load_contexts(context_filepath)
    logger.info("Finished loading contexts.")

    logger.info("Start encoding examples.")
    encoding = encode_examples(juman, tokenizer, examples, contexts, 512)
    logger.info("Finished encoding examples.")

    os.makedirs(cache_save_dir, exist_ok=True)
    torch.save(encoding["input_ids"],
               os.path.join(cache_save_dir, "input_ids.pt"))
    torch.save(encoding["attention_mask"],
               os.path.join(cache_save_dir, "attention_mask.pt"))
    torch.save(encoding["token_type_ids"],
               os.path.join(cache_save_dir, "token_type_ids.pt"))
    torch.save(encoding["labels"], os.path.join(cache_save_dir, "labels.pt"))
    logger.info("Saved cache files in {}.".format(cache_save_dir))
def title_clean(title_ls):
    tmp_ls = copy.deepcopy([title_ls])
    # drop None entries
    tmp_ls = [tmp for tmp in tmp_ls if tmp is not None]
    for i in range(len(tmp_ls)):
        tmp_ls[i] = normalize('NFKC', tmp_ls[i])
        tmp_ls[i] = tmp_ls[i].replace(' ', '')
        tmp_ls[i] = re.sub(r'−.+?$', '', tmp_ls[i])
        tmp_ls[i] = re.sub(r'ーY.+?$', '', tmp_ls[i])
        tmp_ls[i] = re.sub(r'\|.+?$', '', tmp_ls[i])
    jumanpp = Juman()
    sep_ls = []
    for tmp in tmp_ls: 
        sep_ls.append(' '.join([mrph.midasi for mrph in jumanpp.analysis(tmp)]))
    return sep_ls[0]
Example #31
class JumanTokenizer():
    def __init__(self):
        self.juman = Juman(jumanpp=True)

    def tokenize(self, text):
        result = self.juman.analysis(text)
        return [mrph.midasi for mrph in result.mrph_list()]
Example #32
class JumanService(object):
    def __init__(self):
        self.__juman = Juman()

    def analysis(self, string):
        formattedString = JumanKnpUtil.format_input_string(string)
        return self.__juman.analysis(formattedString)
Example #33
class JumanTokenizer():
    _trans_tables = str.maketrans({"\"": "", "@": "@", "#": "#"})

    def __init__(self, ):
        self.juman = Juman()

    def _preprocess(self, sentences):
        return sentences.replace(" ",
                                 "").replace("\n",
                                             "").translate(self._trans_tables)

    def tokenize(self, text):
        result = self.juman.analysis(text)
        return [mrph.midasi for mrph in result.mrph_list()]

    def _preprocess_list(self, datas):
        return [[x, self._preprocess(x)] for x in datas]

    def _tokenize_for_multi(self, datas):
        try:
            return [datas[0], self.tokenize(datas[1])]
        except:
            return []

    def tokenize_multi(self, datas, thread=cpu_count()):
        datas = self._preprocess_list(datas)
        num_of_datas = len(datas)

        with Pool(thread) as pool:
            imap = pool.imap_unordered(self._tokenize_for_multi, datas)
            result = list(tqdm(imap, total=num_of_datas))
        return result
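A hedged usage sketch for the parallel tokenizer above; imap_unordered does not preserve input order, which is why every result carries the original text in position 0:

tokenizer = JumanTokenizer()
pairs = tokenizer.tokenize_multi(["今日は晴れです。", "明日は雨らしい。"])
# each entry is [original_text, [token, ...]]; failed analyses come back as []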
Example #34
def word_distance(s1, s2):
    juman = Juman()
    r = len(s1 + s2) - len(re.sub("[a-zA-Z0-9]", "", s1 + s2))
    if r > len((s1 + s2).replace(" ", "")) // 2:
        return word_distance_en(s1, s2)
    sss = [
            set(
                [item.midasi for item in juman.analysis(ss).mrph_list() \
                        if item.hinsi in {'名詞', '動詞', '形容詞', '指示詞'}\
                        or '内容語' in item.imis
                        ]
            ) for ss in [s1, s2]
            ]
    if min(len(sss[0]), len(sss[1])) == 0:
        return 0
    return float(len(sss[0] & sss[1])) / min(len(sss[0]), len(sss[1]))
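A worked example of the overlap score above (the content-word sets are whatever Juman reports; the values are illustrative):

# s1 content words ~ {猫, 走る}; s2 ~ {猫, 歩く}; overlap = 1 / min(2, 2) = 0.5
print(word_distance("猫が走る", "猫が歩く"))  # -> 0.5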
Example #35
    def __init__(self, command='knp', server=None, port=31000, timeout=60,
                 option='-tab', rcfile='', pattern=r'EOS',
                 jumancommand='jumanpp', jumanrcfile='', jumanpp=True):
        self.command = command
        self.server = server
        self.port = port
        self.timeout = timeout
        self.option = option.split()
        self.rcfile = rcfile
        self.pattern = pattern
        self.socket = None
        self.subprocess = None
        self.jumanpp = jumanpp

        if self.rcfile and not os.path.isfile(os.path.expanduser(self.rcfile)):
            raise Exception("Can't read rcfile (%s)!" % self.rcfile)
        if distutils.spawn.find_executable(self.command) is None:
            raise Exception("Can't find KNP command: %s" % self.command)

        self.juman = Juman(command=jumancommand, rcfile=jumanrcfile, jumanpp=self.jumanpp)
Example #36
    def __init__(self):
        self.juman = Juman()
        self.knp = KNP()
Example #37
# coding: utf-8

from pyknp import Juman
import sys
import codecs
 
juman = Juman()

input_file = "../data/sample.txt"
f = codecs.open(input_file, 'r', 'utf-8')
f_out = codecs.open(input_file + '_juman_result.txt','w', 'utf-8')
for line in f:
    result = juman.analysis(line[:-1].replace(" ", ""))
    #print ' '.join(mrph.midasi for mrph in result)
    f_out.write(' '.join(mrph.midasi for mrph in result) + '\n')
Example #38
class Solver(object):
    def __init__(self):
        self.juman = Juman()
        self.knp = KNP()

    def Q61(self):
        u"""61. 文を標準入力から読み込み、それを単語単位に分かち書きせよ (形態素間にスペースを挿入)
        """

        input_sentence = raw_input()
        result = self.juman.analysis(input_sentence.decode("utf8"))
        for mrph in result.mrph_list():
            sys.stdout.write("{} ".format(mrph.midasi.encode("utf8")))
        sys.stdout.write("\n")
        return

    def Q62(self):
        u"""62. 形態素解析結果を読み込み、名詞だけを抽出してプリントせよ

        ヒント: mrph.hinsi が u"名詞" という文字列と一致するかどうかを判定
        """
        data = u""
        for line in iter(sys.stdin.readline, ""):  # 入力文を1行ずつ読む
            data += line.decode("utf8")
            if line.strip() == "EOS":  # 1文が終わったら解析
                result = self.juman.result(data)
                s = ",".join(mrph.midasi for mrph in result.mrph_list() if mrph.hinsi == u"名詞")  # 名詞だけ表示
                if len(s) > 0:
                    print(s)
                data = u""

    def Q63(self):
        u"""62. 形態素解析結果を読み込み、名詞だけを抽出してプリントせよ

        ヒント: mrph.hinsi が u"名詞" という文字列と一致するかどうかを判定
        """
        data = u""
        for line in iter(sys.stdin.readline, ""):  # 入力文を1行ずつ読む
            data += line.decode("utf8")
            if line.strip() == "EOS":  # 1文が終わったら解析
                result = self.juman.result(data)
                s = ",".join(mrph.genkei for mrph in result.mrph_list() if mrph.hinsi == u"動詞")  # 動詞だけ表示
                if len(s) > 0:
                    print(s)
                data = u""

    def Q64(self):
        u"""64. 形態素解析結果を読み込み、形態素の原形を頻度順に並べよ

        ヒント: ディクショナリ、sorted 関数を使う
        """
        data = u""
        hist = {}
        for line in iter(sys.stdin.readline, ""):  # 入力文を1行ずつ読む
            data += line.decode("utf8")
            if line.strip() == "EOS":  # 1文が終わったら解析
                result = self.juman.result(data)
                for mrph in result.mrph_list():
                    try:
                        hist[mrph.genkei] += 1
                    except KeyError:
                        hist[mrph.genkei] = 1
                data = u""
        for key, val in sorted(hist.items(), key=lambda t: t[1], reverse=True):
            print("{},{}".format(key.encode("utf8"), val))

    def Q65(self):
        u"""65. 形態素解析結果を読み込み、全形態素数 (総数) に対する述語の割合を計算せよ

        ここで、述語とは、動詞、イ形容詞 (形容詞)、ナ形容詞 (形容動詞) とする
        """

        data = u""
        num = 0
        denom = 0
        for line in iter(sys.stdin.readline, ""):  # 入力文を1行ずつ読む
            data += line.decode("utf8")
            if line.strip() == "EOS":  # 1文が終わったら解析
                result = self.juman.result(data)
                if verbose:
                    logger.info("denom: {}".format(denom))
                for mrph in result.mrph_list():
                    denom += 1
                    if mrph.hinsi == u"動詞":
                        num += 1
                        continue
                    if mrph.hinsi == u"形容詞" and mrph.bunrui.startswith(u"イ形容詞"):
                        num += 1
                        continue
                    if mrph.hinsi == u"形容動詞" and mrph.bunrui.startswith(u"ナ形容詞"):
                        num += 1
                        continue
                data = u""

        print("{}/{}={}".format(num, denom, float(num) / denom))

    def Q66(self):
        u"""66. 形態素解析結果を読み込み、「サ変名詞+する/できる」というパターンを抽出しプリントせよ
        """

        data = u""
        extract = set()
        for line in iter(sys.stdin.readline, ""):  # 入力文を1行ずつ読む
            data += line.decode("utf8")
            if line.strip() == "EOS":  # 1文が終わったら解析
                result = self.juman.result(data)
                buff = None
                for mrph in result.mrph_list():
                    if mrph.genkei == u"できる" or mrph.genkei == u"する":
                        if buff is not None:
                            extract.add((buff.genkei.encode("utf8"), mrph.genkei.encode("utf8")))

                    if mrph.bunrui == u"サ変名詞":
                        buff = mrph
                    else:
                        buff = None
                data = u""
        for t in extract:
            print("{}+{}".format(t[0], t[1]))

    def Q67(self):
        u"""67. 形態素解析結果を読み込み、「AのB」という表現 (A と B は名詞の1形態素) をすべてプリントせよ
        """

        data = u""
        extract = set()
        for line in iter(sys.stdin.readline, ""):  # 入力文を1行ずつ読む
            data += line.decode("utf8")
            if line.strip() == "EOS":  # 1文が終わったら解析
                result = self.juman.result(data)
                buff = []
                for mrph in result.mrph_list():
                    if mrph.genkei == u"の" and len(buff) == 1:
                        buff.append(u"の")
                        continue
                    if mrph.hinsi == u"名詞":
                        if len(buff) == 0:
                            buff.append(mrph.genkei)
                            continue
                        if len(buff) == 2:
                            extract.add((buff[0], mrph.genkei))
                    buff = []
                data = u""
        for t in extract:
            print("{}の{}".format(t[0].encode("utf8"), t[1].encode("utf8")))

    def Q68(self):
        u"""68. 文を標準入力から読み込み、それを文節単位に分かち書きせよ (文節間にスペースを挿入)
        """

        input_sentence = raw_input()
        result = self.knp.parse(input_sentence.decode("utf8"))
        for bnst in result.bnst_list():
            sys.stdout.write("{} ".format("".join(mrph.midasi.encode("utf8") for mrph in bnst.mrph_list())))
        sys.stdout.write("\n")
        return

    def Q69(self):
        u"""69. 構文解析結果を読み込み、接頭辞を含む文節をプリントせよ
        """

        data = u""
        extract = set()
        for line in iter(sys.stdin.readline, ""):
            data += line.decode("utf8")
            if line.strip() == "EOS":
                result = self.knp.result(data)
                for bnst in result.bnst_list():
                    if len(filter(lambda x: x.hinsi == u"接頭辞", bnst.mrph_list())) < 1:
                        continue
                    extract.add("{} ".format("".join(mrph.midasi.encode("utf8") for mrph in bnst.mrph_list())))
                data = u""
        for bnst in extract:
            if len(bnst) > 0:
                print(bnst)
        return

    def Q70(self):
        u"""70. 構文解析結果を読み込み、名詞を2つ以上含む文節をプリントせよ
        """

        data = u""
        extract = set()
        for line in iter(sys.stdin.readline, ""):
            data += line.decode("utf8")
            if line.strip() == "EOS":
                result = self.knp.result(data)
                for bnst in result.bnst_list():
                    if len(filter(lambda x: x.hinsi == u"名詞", bnst.mrph_list())) < 2:
                        continue
                    extract.add("{} ".format("".join(mrph.midasi.encode("utf8") for mrph in bnst.mrph_list())))
                data = u""
        for bnst in extract:
            if len(bnst) > 0:
                print(bnst)

        return
Example #39
class KNP(object):
    """ KNPを用いて構文解析を行う/KNPの解析結果を読み取るモジュール

    Args:
        command (str): KNPコマンド
        option (str): KNP解析オプション 
                        (詳細解析結果を出力する-tabは必須。 
                        省略・照応解析を行う -anaphora, 格解析を行わず構文解析のみを行う -dpnd など)
        rcfile (str): KNP設定ファイルへのパス
        pattern (str): KNP出力の終端記号
        jumancommand (str): JUMANコマンド
        jumanrcfile (str): JUMAN設定ファイルへのパス
        jumanpp (bool): JUMAN++を用いるかJUMANを用いるか
    """

    def __init__(self, command='knp', server=None, port=31000, timeout=60,
                 option='-tab', rcfile='', pattern=r'EOS',
                 jumancommand='jumanpp', jumanrcfile='', jumanpp=True):
        self.command = command
        self.server = server
        self.port = port
        self.timeout = timeout
        self.option = option.split()
        self.rcfile = rcfile
        self.pattern = pattern
        self.socket = None
        self.subprocess = None
        self.jumanpp = jumanpp

        if self.rcfile and not os.path.isfile(os.path.expanduser(self.rcfile)):
            raise Exception("Can't read rcfile (%s)!" % self.rcfile)
        if distutils.spawn.find_executable(self.command) is None:
            raise Exception("Can't find KNP command: %s" % self.command)

        self.juman = Juman(command=jumancommand, rcfile=jumanrcfile, jumanpp=self.jumanpp)

    def knp(self, sentence):
        """ Same as the parse function """
        return self.parse(sentence)

    def parse(self, sentence, juman_format=JUMAN_FORMAT.DEFAULT):
        """
        文字列を入力として構文解析を行い、文節列オブジェクトを返す

        Args:
            sentence (str): 文を表す文字列
            juman_format (JUMAN_FORMAT): Jumanのlattice出力形式

        Returns:
            BList: 文節列オブジェクト
        """
        assert(isinstance(sentence, six.text_type))
        juman_lines = self.juman.juman_lines(sentence)
        juman_str = "%s%s" % (juman_lines, self.pattern)
        if not self.socket and not self.subprocess:
            if self.server is not None:
                self.socket = Socket(
                    self.server, self.port, "RUN -tab -normal\n")
            else:
                command = [self.command] + self.option
                if self.rcfile:
                    command.extend(['-r', self.rcfile])
                self.subprocess = Subprocess(command)

        if self.socket:
            knp_lines = self.socket.query(juman_str, pattern=r'^%s$'%(self.pattern))
        else:
            knp_lines = self.subprocess.query(juman_str, pattern=r'^%s$'%(self.pattern))
        return BList(knp_lines, self.pattern, juman_format)

    def result(self, input_str, juman_format=JUMAN_FORMAT.DEFAULT):
        """
        ある文に関するKNP解析結果を文節列オブジェクトに変換する

        Args:
            input_str (str): ある文に関するKNPの出力結果
            juman_format (JUMAN_FORMAT): Jumanのlattice出力形式

        Returns:
            BList: 文節列オブジェクト
        """
        return BList(input_str, self.pattern, juman_format)
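A minimal usage sketch of the class above (a sketch, assuming the knp and jumanpp binaries are installed; bnst_id and parent_id are pyknp bunsetsu attributes, with parent_id == -1 for the root):

knp = KNP(jumanpp=True)
blist = knp.parse("彼は本を読んだ。")
for bnst in blist.bnst_list():
    print(bnst.bnst_id, bnst.parent_id,
          "".join(mrph.midasi for mrph in bnst.mrph_list()))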