def sample_chapter_cid(self, rid, align_id):
    """
    Collect candidate chapters for one (rid, align_id) pair and append them to
    the per-site sample files.
    """
    rid = int(rid)
    chapter_module = ChapterOptimizeModule()
    # Collect every candidate chapter for this aligned chapter id.
    total_candidate_chapter_list = chapter_module.candidate_chapter_collecion(rid, align_id)
    self.logger.info('rid: {0} align_id: {1} total_candidate_chapter_length: {2}'.format(
        rid, align_id, len(total_candidate_chapter_list)))
    candidate_chapter_list = chapter_module.candidate_chapter_generate(rid, align_id,
                                                                       total_candidate_chapter_list)
    if len(candidate_chapter_list) == 0:
        return
    if len(candidate_chapter_list) >= 3:
        candidate_chapter_list = chapter_module.basic_chapter_filter(candidate_chapter_list)
    self.logger.info('rid: {0} align_id: {1} selected_candidate_chapter_length: {2}'.format(
        rid, align_id, len(candidate_chapter_list)))
    # Append each candidate chapter to the sample file of its source site.
    for candidate_chapter in candidate_chapter_list:
        with codecs.open('data/sample/' + str(candidate_chapter.site_id), 'a',
                         encoding='gbk', errors='ignore') as sample_file:
            sample_file.write(str(candidate_chapter.rid) + cur_delimiter +
                              str(candidate_chapter.align_id) + cur_delimiter +
                              str(candidate_chapter.chapter_id) + cur_delimiter +
                              str(candidate_chapter.site_id) + cur_delimiter +
                              str(candidate_chapter.site_status) + cur_delimiter +
                              candidate_chapter.chapter_content + cur_linesep)
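
# The records written above are plain delimiter-separated lines. The helper
# below is an illustrative sketch (not part of the original module) of how one
# such per-site sample record could be read back; the actual values of
# cur_delimiter and cur_linesep are defined elsewhere in the module, and the
# helper name parse_sample_line is hypothetical.
def parse_sample_line(self, line):
    """Split one sample-file record into a dict (illustrative sketch)."""
    fields = line.rstrip(cur_linesep).split(cur_delimiter)
    # Field order mirrors the write order in sample_chapter_cid().
    rid, align_id, chapter_id, site_id, site_status = fields[:5]
    # chapter_content may itself contain the delimiter, so re-join the rest.
    chapter_content = cur_delimiter.join(fields[5:])
    return {'rid': int(rid), 'align_id': align_id, 'chapter_id': chapter_id,
            'site_id': site_id, 'site_status': site_status,
            'chapter_content': chapter_content}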
def sample_chapter_rid(self, rid):
    """
    Randomly pick 20 chapters from one book, following the earlier
    chapter-selection approach, and append the candidate chapters to the
    sample file of each corresponding site.
    :return:
    """
    rid = int(rid)
    # Get the authoritative chapter directory for this rid, then randomly sample chapters from it.
    chapter_db = ChapterDBModule()
    agg_dir_list = chapter_db.get_novelaggregationdir_list(rid)
    # Sample from within the last 100 chapters or the last 20% of chapters, whichever range is larger.
    sample_seq_num = min(len(agg_dir_list) - 100, len(agg_dir_list) * 4 // 5)
    sample_agg_dir_list = SampleChapter.sample_list(agg_dir_list, sample_num=20,
                                                    sep_num=sample_seq_num, head_num=10)
    chapter_module = ChapterOptimizeModule()
    for (align_id, chapter_index, chapter_url, chapter_status) in sample_agg_dir_list:
        self.logger.info('rid: {0}, sample_index: {1}/{2}, align_id: {3}, chapter_status: {4}'.format(
            rid, chapter_index, len(sample_agg_dir_list), align_id, chapter_status))
        total_candidate_chapter_list = chapter_module.candidate_chapter_collecion(rid, align_id)
        self.logger.info('total_candidate_chapter_length: {0}'.format(len(total_candidate_chapter_list)))
        candidate_chapter_list = chapter_module.candidate_chapter_generate(rid, align_id,
                                                                           total_candidate_chapter_list)
        if len(candidate_chapter_list) == 0:
            continue
        if len(candidate_chapter_list) >= 3:
            candidate_chapter_list = chapter_module.basic_chapter_filter(candidate_chapter_list)
        self.logger.info('selected_candidate_chapter_length: {0}'.format(len(candidate_chapter_list)))
        # Record which (rid, align_id) pairs were sampled.
        with codecs.open('data/sample_cid', 'a', encoding='gbk') as sample_cid_file:
            sample_cid_file.write(str(rid) + cur_delimiter + str(align_id) + cur_linesep)
        # Append each candidate chapter to the sample file of its source site.
        for candidate_chapter in candidate_chapter_list:
            with codecs.open('data/sample/' + str(candidate_chapter.site_id), 'a',
                             encoding='gbk', errors='ignore') as sample_file:
                sample_file.write(str(candidate_chapter.rid) + cur_delimiter +
                                  str(candidate_chapter.align_id) + cur_delimiter +
                                  str(candidate_chapter.chapter_id) + cur_delimiter +
                                  str(candidate_chapter.site_id) + cur_delimiter +
                                  str(candidate_chapter.site_status) + cur_delimiter +
                                  candidate_chapter.chapter_content + cur_linesep)
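
# SampleChapter.sample_list() is implemented elsewhere; the sketch below is
# only a guess at its contract based on how it is called above, not the
# project's actual code. One plausible reading of the parameters: draw
# head_num items from the part of the list before sep_num and the remaining
# sample_num - head_num items from the part at or after sep_num, i.e. the
# tail region selected by sample_seq_num.
def _sample_list_sketch(self, item_list, sample_num=20, sep_num=0, head_num=10):
    """Illustrative sketch of the assumed SampleChapter.sample_list contract."""
    import random  # local import keeps the sketch self-contained
    sep_num = max(0, min(sep_num, len(item_list)))
    head_part, tail_part = item_list[:sep_num], item_list[sep_num:]
    # Guard with min() so short directories do not raise in random.sample().
    picked = random.sample(head_part, min(head_num, len(head_part)))
    picked += random.sample(tail_part, min(sample_num - head_num, len(tail_part)))
    return picked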