Example #1
0
 def savetexts(self, filepath, prepath):
     """
     Save the pre-processed text of every HTML file found in *filepath*.

     :param filepath: directory containing the saved html files
     :param prepath:  directory the pre-processed texts are written to
     :return: None
     """
     self.logger.info('init pretreatment directory:"{0}"'.format(prepath))
     FileUtil.init_path(prepath)
     try:
         # os.listdir yields the names of all entries (files and dirs)
         # in the directory; non-files are skipped below.
         for filename in os.listdir(filepath):
             file = os.path.join(filepath, filename)
             if not os.path.isfile(file):
                 continue
             # 1. Extract the url and the raw text from the stored page.
             url, text = FileUtil.get_url_text(file)
             # 2. Segment the text into keywords.
             kws = PreDeal.seg(text)
             self.logger.info(
                 "Store pretreatment texts content:{0}".format(
                     filename))
             FileUtil.writefile(url + '\t'.join(kws),
                                os.path.join(prepath, filename))
         self.logger.info('Text pretreatment End!')
     except Exception:
         # BUG FIX: the original printed the exception to stdout, losing
         # the traceback and bypassing the logger used everywhere else.
         self.logger.exception('Text pretreatment failed')
Example #2
0
 def init_path(self):
     """
     Build the save/keyword directories for the current key set.

     The save directory is created when missing, or re-initialised via
     ``FileUtil.init_path`` when it already exists.  Note the keyword
     path is only computed here, not created.

     :return: tuple ``(savepath, kwpath)``
     """
     suffix = '_'.join(self.keys)
     savepath = os.path.join(config.hidepath, suffix)
     kwpath = os.path.join(config.hidekwpath, suffix)
     if os.path.exists(savepath):
         # Already present: let FileUtil reset it.
         FileUtil.init_path(savepath)
     else:
         os.makedirs(savepath)
     return savepath, kwpath
Example #3
0
 def crawl(self):
     """
     Download the target pages, then parse and store their contents.

     Reads the downloaded html files from the spider html directory,
     parses each into ``(url, content)`` and writes them as ``.txt``
     files under the spider text directory.

     :return: None
     """
     self.download.download()
     readpath = os.path.join(config.spiderhtml, self.filename)
     savepath = os.path.join(config.spidertext, self.filename)
     FileUtil.init_path(savepath)
     for filename in os.listdir(readpath):
         file = os.path.join(readpath, filename)
         url, content = self.parse.parse(file)
         # BUG FIX: rstrip('.html') strips any trailing run of the
         # characters '.', 'h', 't', 'm', 'l' — e.g. 'math.html' -> 'ma'.
         # Replace only the extension instead.
         filename = os.path.splitext(filename)[0] + '.txt'
         self.logger.info("Save spider url and content:{0}".format(url))
         FileUtil.writefile(url + content, os.path.join(savepath, filename))
     print('crawl web contents end!')
Example #4
0
 def init_config(self):
     """Prepare the output directory before anything is written to it."""
     FileUtil.init_path(self.savepath)