예제 #1
0
    def sample_chapter_cid(self, rid, align_id):
        """
        Collect the candidate chapters for one (rid, align_id) pair and append
        each of them to the sample file of the site it belongs to.

        :param rid: book identifier; converted with int() before use.
        :param align_id: chapter alignment identifier, handed to the chapter
            module unchanged.
        :return: None. Nothing is written when no candidate chapter is generated.
        """
        rid = int(rid)
        optimizer = ChapterOptimizeModule()

        all_candidates = optimizer.candidate_chapter_collecion(rid, align_id)
        self.logger.info('rid: {0} align_id: {1} total_candidate_chapter_length: {2}'.format(
            rid, align_id, len(all_candidates)))

        selected = optimizer.candidate_chapter_generate(rid, align_id, all_candidates)
        selected_count = len(selected)
        if selected_count == 0:
            return
        # The basic filter is only applied once there are at least three candidates.
        if selected_count >= 3:
            selected = optimizer.basic_chapter_filter(selected)

        self.logger.info('rid: {0} align_id: {1} selected_candidate_chapter_length: {2}'.format(
            rid, align_id, len(selected)))

        for chapter in selected:
            # One append-mode file per site; characters that gbk cannot encode are dropped.
            sample_path = 'data/sample/' + str(chapter.site_id)
            with codecs.open(sample_path, 'a', encoding='gbk', errors='ignore') as sample_file:
                record_fields = [
                    str(chapter.rid),
                    str(chapter.align_id),
                    str(chapter.chapter_id),
                    str(chapter.site_id),
                    str(chapter.site_status),
                    chapter.chapter_content,
                ]
                sample_file.write(cur_delimiter.join(record_fields) + cur_linesep)
예제 #2
0
    def sample_chapter_rid(self, rid):
        """
        Sample chapters from one book and append their candidate chapters to
        the per-site sample files.

        The aggregated directory of the book is fetched, a subset of its
        entries is drawn with SampleChapter.sample_list (sample_num=20,
        head_num=10), and for every drawn entry the candidate chapters are
        collected, generated and, when there are at least three of them,
        passed through the basic filter. Each entry that still has candidates
        is recorded in 'data/sample_cid', and every candidate is appended to
        'data/sample/<site_id>'.

        :param rid: book identifier; converted with int() before use.
        :return: None
        """
        rid = int(rid)

        # Fetch the aggregated directory for this rid; chapters are sampled from it.
        chapter_db = ChapterDBModule()
        agg_dir_list = chapter_db.get_novelaggregationdir_list(rid)

        # Split point for sampling: the smaller of "length - 100" and 80% of the
        # length (original intent: pick from the last 100 chapters or the last 20%).
        # Floor division keeps this an integer on Python 3 as well; on Python 2
        # it yields the same value as the previous `/`.
        # NOTE(review): for directories shorter than 100 entries this value is
        # negative - confirm SampleChapter.sample_list handles that as intended.
        dir_length = len(agg_dir_list)
        sample_seq_num = min(dir_length - 100, dir_length * 4 // 5)
        sample_agg_dir_list = SampleChapter.sample_list(agg_dir_list, sample_num=20,
                                                        sep_num=sample_seq_num, head_num=10
        )

        chapter_module = ChapterOptimizeModule()
        for (align_id, chapter_index, chapter_url, chapter_status) in sample_agg_dir_list:
            self.logger.info('rid: {0}, sample_index: {1}/{2}, align_id: {3}, chapter_status: {4}'.format(
                rid, chapter_index, len(sample_agg_dir_list), align_id, chapter_status))

            total_candidate_chapter_list = chapter_module.candidate_chapter_collecion(rid, align_id)
            self.logger.info('total_candidate_chapter_length: {0}'.format(len(total_candidate_chapter_list)))

            candidate_chapter_list = chapter_module.candidate_chapter_generate(rid, align_id, total_candidate_chapter_list)
            # Entries without any candidate are skipped and not recorded in sample_cid.
            if len(candidate_chapter_list) == 0:
                continue

            # The basic filter is only applied once there are at least three candidates.
            if len(candidate_chapter_list) >= 3:
                candidate_chapter_list = chapter_module.basic_chapter_filter(candidate_chapter_list)

            self.logger.info('selected_candidate_chapter_length: {0}'.format(len(candidate_chapter_list)))

            # Record which (rid, align_id) pairs were sampled.
            with codecs.open('data/sample_cid', 'a', encoding='gbk') as sample_cid_file:
                sample_cid_file.write(str(rid) + cur_delimiter + str(align_id) + cur_linesep)

            for candidate_chapter in candidate_chapter_list:
                # One append-mode file per site; characters that gbk cannot encode are dropped.
                with codecs.open('data/sample/' + str(candidate_chapter.site_id), 'a',
                                 encoding='gbk', errors='ignore') as sample_file:
                    sample_file.write(str(candidate_chapter.rid) + cur_delimiter
                                      + str(candidate_chapter.align_id) + cur_delimiter
                                      + str(candidate_chapter.chapter_id) + cur_delimiter
                                      + str(candidate_chapter.site_id) + cur_delimiter
                                      + str(candidate_chapter.site_status) + cur_delimiter
                                      + candidate_chapter.chapter_content + cur_linesep)