class WordEmbeddingAvgVectorConstructor:
    def __init__(self):
        self.data_accessor = CorpusAccessor()
        # Ensure the attribute always exists so convert_avg_vector can raise a
        # clear error instead of an AttributeError when the model file is missing.
        self.word_embedding_model = None
        if os.path.isfile(WORD_EMBEDDING_MODEL_PATH):
            print('[INFO] loading word embedding model...')
            self.word_embedding_model = word2vec.Word2Vec.load(
                WORD_EMBEDDING_MODEL_PATH)

    def convert_avg_vector(self, line):
        """
        文を文中の各単語の平均ベクトルに変換
        """
        if self.word_embedding_model is None:
            raise ValueError("there is not word embedding model")
        wakati_line = text_processor.wakati(line).split()
        word_vectors = np.array([
            self.word_embedding_model.__dict__['wv'][word]
            for word in wakati_line
        ])
        return np.average(word_vectors, axis=0)

    def sentence_to_word_embedding_avg_vector(self, ncode):
        """
        小説本文とあらすじ文の各文を、文中における各単語の分散表現の平均ベクトルに変換する
        データは文番号をkey、文ベクトルをvalueとする辞書で保存される
        [1: 文ベクトル, 2: 文ベクトル, ... , n: 文ベクトル]
        """
        print('[PROCESS NCODE]: {}'.format(ncode))
        contents_file_path = os.path.join(
            WORD_EMBEDDING_AVG_VECTOR_CONTENTS_PATH, ncode + '.txt')
        if os.path.isfile(contents_file_path):
            return

        contents_lines = self.data_accessor.get_contents_lines(ncode)
        synopsis_lines = self.data_accessor.get_synopsis_lines(ncode)
        if not contents_lines or not synopsis_lines:
            return

        # Vectorize each sentence of the body
        contents_line_vectors = dict()
        for line_idx, line in enumerate(contents_lines):
            if line_idx % 50 == 0:
                print('contents progress: {:.1f}%'.format(
                    line_idx / len(contents_lines) * 100))
            vector = self.convert_avg_vector(line)
            contents_line_vectors[line_idx] = vector

        # Save the data
        print('[INFO] saving data: {}'.format(ncode))
        with open(contents_file_path, 'wb') as cf:
            joblib.dump(contents_line_vectors, cf, compress=3)

    def construct(self):
        for i, ncode in enumerate(self.data_accessor.ncodes):
            print('[INFO] num of constructed data: {}'.format(i))
            self.sentence_to_word_embedding_avg_vector(ncode)
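
# Minimal usage sketch for the class above. It assumes CorpusAccessor,
# text_processor, WORD_EMBEDDING_MODEL_PATH and
# WORD_EMBEDDING_AVG_VECTOR_CONTENTS_PATH are provided elsewhere in the
# original repository, exactly as the class itself assumes.
if __name__ == '__main__':
    constructor = WordEmbeddingAvgVectorConstructor()
    constructor.construct()  # writes one joblib file of sentence vectors per ncode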
Example no. 2
class WordEmbeddingVectorConstructor:
    def __init__(self):
        self.data_accessor = CorpusAccessor()
        self.word_embedding_model = None  # default when the model file is missing
        if os.path.isfile(WORD_EMBEDDING_MODEL_PATH):
            print('[INFO] loading word embedding model...')
            self.word_embedding_model = word2vec.Word2Vec.load(
                WORD_EMBEDDING_MODEL_PATH)

    def convert_word_embedding_vectors(self, sentence):
        """
        文を文中の単語の文さんベクトルのリストに変換
        """
        wakati_line = text_processor.wakati(sentence).split()
        return [
            self.word_embedding_model.__dict__['wv'][word]
            for word in wakati_line
        ]

    def sentence_to_word_embedding_vectors(self, ncode):
        """
        小説本文を、文中における各単語の分散表現のベクトルのリストに変換する
        データは文番号をkey、文ベクトルをvalueとする辞書で保存される
        [1: tensor, 2: tensor, ... , n: tensor]
        """
        print('[PROCESS NCODE]: {}'.format(ncode))
        contents_lines = self.data_accessor.get_contents_lines(ncode)
        synopsis_lines = self.data_accessor.get_synopsis_lines(ncode)
        if not contents_lines or not synopsis_lines:
            return

        # Vectorize each sentence of the body
        contents_line_vectors = dict()
        for line_idx, line in enumerate(contents_lines):
            if line_idx % 50 == 0:
                print('contents progress: {:.1f}%'.format(
                    line_idx / len(contents_lines) * 100))
            tensor = self.convert_word_embedding_vectors(line)
            contents_line_vectors[line_idx] = tensor

        # Save the data
        contents_file_path = os.path.join(WORD_EMBEDDING_VECTORS_CONTENTS_PATH,
                                          ncode + '.txt')
        print('[INFO] saving data: {}'.format(ncode))
        with open(contents_file_path, 'wb') as cf:
            joblib.dump(contents_line_vectors, cf, compress=3)

    def construct(self):
        """
        全小説のデータを構築する
        """
        for i, ncode in enumerate(self.data_accessor.ncodes):
            print('[INFO] num of constructed data: {}'.format(i))
            self.sentence_to_word_embedding_vectors(ncode)
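
# Minimal usage sketch, under the same assumptions as above (corpus files,
# trained word2vec model, and WORD_EMBEDDING_VECTORS_CONTENTS_PATH in place).
if __name__ == '__main__':
    constructor = WordEmbeddingVectorConstructor()
    constructor.construct()  # one joblib file of per-word vector lists per ncode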
Example no. 3
class WordIndexesConstructor:

    def __init__(self):
        self.data_accessor = CorpusAccessor()
        self.word_embedding_model = None  # default when the model file is missing
        if os.path.isfile(WORD_EMBEDDING_MODEL_PATH):
            self.word_embedding_model = word2vec.Word2Vec.load(WORD_EMBEDDING_MODEL_PATH)

    def convert_index_list(self, line):
        if self.word_embedding_model is None:
            raise ValueError('word embedding model is not loaded')
        words = text_processor.wakati(line).split()
        # Shift every vocabulary index by one (index 0 is presumably reserved, e.g. for padding).
        index_list = [self.word_embedding_model.wv.vocab[word].index + 1 for word in words]
        return index_list

    def sentence_to_word_indexes(self, ncode):
        """
        小説本文各文を、文中の各単語のインデックスのリストに変換する
        データは文番号をkey、インデックスのリストをvalueとする辞書で保存される
        [1: list, 2: list, ... , n: list]
        """
        print('[PROCESS NCODE]: {}'.format(ncode))
        contents_file_path = os.path.join(WORD_INDEXES_CONTENTS_PATH, ncode + '.txt')
        if os.path.isfile(contents_file_path):
            return

        contents_lines = self.data_accessor.get_contents_lines(ncode)
        synopsis_lines = self.data_accessor.get_synopsis_lines(ncode)
        if not contents_lines or not synopsis_lines:
            return

        index_data = dict()
        for line_idx, line in enumerate(contents_lines):
            if line_idx % 50 == 0:
                print('contents progress: {:.1f}%'.format(line_idx / len(contents_lines) * 100))
            index_list = self.convert_index_list(line)
            index_data[line_idx] = index_list

        # Save the data
        print('[INFO] saving data: {}'.format(ncode))
        with open(contents_file_path, 'wb') as cf:
            joblib.dump(index_data, cf, compress=3)

    def construct(self):
        """
        全小説のデータを構築する
        """
        for i, ncode in enumerate(self.data_accessor.ncodes):
            print('[INFO] num of constructed data: {}'.format(i))
            self.sentence_to_word_indexes(ncode)
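
# Minimal usage sketch. The stored lists contain word2vec vocabulary indexes
# shifted by one, which suggests index 0 is reserved (e.g. for padding) by the
# downstream model; that interpretation is an assumption here.
if __name__ == '__main__':
    constructor = WordIndexesConstructor()
    constructor.construct()  # one joblib file of word-index lists per ncode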
Example no. 4
def multi_generate(importance, start, end):
    """
    複数作品まとめて確認したいとき
    """
    corpus_accessor = CorpusAccessor()
    output_file_path = 'result_start_' + str(start) + '_end_' + str(
        end) + '.txt'
    file = open(output_file_path, 'w')

    # Build one trained summarizer per supported genre.
    summarizers = dict()
    for genre_name in ('love_story', 'fantasy', 'literature', 'sf'):
        summarizer = LSTMSummarizer()
        supplier = LSTMVectorSupplier(
            genre_name,
            importance,
            use_data_of_position_of_sentence=True,
            use_data_of_is_serif=True,
            use_data_of_is_include_person=True,
            use_data_of_sentence_length=True)
        summarizer.set_supplier(supplier)
        summarizer.set_trained_model()
        summarizers[genre_name] = summarizer

    # sys.setrecursionlimit(20000)
    rouge = Rouge()

    for i, ncode in enumerate(corpus_accessor.exist_ncodes[start:end]):
        print('processed ncode count: ', i)

        genre = corpus_accessor.get_genre(ncode)
        if len(genre) == 0:
            print('no genre')
            continue
        ref = ''.join(corpus_accessor.get_synopsis_lines(ncode))

        # Pick the summarizer matching the work's genre; unknown genres yield an empty synopsis.
        summarizer = summarizers.get(genre)
        synopsis = summarizer.generate(ncode) if summarizer is not None else ''

        score = rouge.get_scores(wakati(synopsis), wakati(ref),
                                 False)[0]['rouge-1']['r']

        file.write(ncode + '\n')
        file.write(genre + '\n')
        file.write('score: ' + str(score) + '\n')
        file.write(ref + '\n\n')
        file.write(synopsis + '\n\n\n')
    file.close()
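
# Minimal usage sketch. 'cos_sim' and the ncode range are illustrative values
# only; the importance label must match whatever LSTMVectorSupplier expects in
# the original repository.
if __name__ == '__main__':
    multi_generate(importance='cos_sim', start=0, end=10)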