Example #1
def savetexts(self, filepath, prepath):
    """
    Save the preprocessed texts.
    :param filepath: directory containing the HTML files
    :param prepath:  directory to save the results to
    :return:
    """
    self.logger.info('init pretreatment directory:"{0}"'.format(prepath))
    FileUtil.init_path(prepath)
    try:
        # List every file and subdirectory name (as strings) under filepath.
        file_lists = os.listdir(filepath)
        for filename in file_lists:
            file = os.path.join(filepath, filename)
            if os.path.isfile(file):
                # 1. Extract the URL and the raw text from the file.
                url, text = FileUtil.get_url_text(file)
                # 2. Segment the text into keywords.
                kws = PreDeal.seg(text)
                self.logger.info(
                    "Store pretreatment texts content:{0}".format(filename))
                # Write the URL followed by the tab-separated keywords
                # (assumes url carries its own trailing separator).
                FileUtil.writefile(url + '\t'.join(kws),
                                   os.path.join(prepath, filename))
        self.logger.info('Text pretreatment End!')
    except Exception as e:
        # Log the failure instead of silently printing it.
        self.logger.error(e)
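
These examples lean on a FileUtil helper whose behavior is only implied by the calls above. A minimal sketch of the three methods used here, under the assumption that each saved file stores its source URL on the first line, might look like this (init_path, get_url_text, and writefile are reconstructions, not the project's actual code):

import os


class FileUtil:
    """Minimal sketch of the helper assumed by these examples."""

    @staticmethod
    def init_path(path):
        # Create the directory if it does not exist yet.
        os.makedirs(path, exist_ok=True)

    @staticmethod
    def get_url_text(file):
        # Assumption: line 1 of each saved file is the source URL,
        # everything after it is the page text.
        with open(file, encoding='utf-8') as f:
            url = f.readline()  # keeps its trailing newline as a separator
            text = f.read()
        return url, text

    @staticmethod
    def writefile(content, path):
        with open(path, 'w', encoding='utf-8') as f:
            f.write(content)

Under this reading, url + '\t'.join(kws) in savetexts produces a two-line record: the URL, then the tab-separated keywords.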
Example #2
def query(self, keywords, kwpath=''):
    path = []       # article lists found so far
    num = []        # number of keywords matched per article group
    unmatch = 0     # count of keywords that matched nothing
    maxh = 0        # keywords matched in the current group
    q = ''          # the combined query built so far
    flag = True     # set to False when the current keyword fails to match
    hidekey = []    # the keywords (or their substitutes) actually used
    while keywords:
        kw = keywords[0]
        paper = Index.search(self.pindexp, q + ' ' + kw, limit=None)
        if paper:
            keywords.pop(0)
            hidekey.append(kw)
            q = q + ' ' + kw
            maxh += 1
        else:
            # The combined search failed, so fall back to similar keywords.
            simikeys = WV.similarwords(kw)
            t_paper = []
            if not simikeys:
                print(".................Failed to find similar words................")
                flag = False
            else:
                for skw, similarity in simikeys:
                    sq = q + ' ' + skw
                    t_paper = Index.search(self.pindexp, sq, limit=None)
                    if t_paper:
                        hidekey.append(skw)
                        keywords.pop(0)
                        q = sq
                        maxh += 1
                        break
                if not t_paper:
                    # Similar keywords exist, but the combined search still failed.
                    flag = False
            # Handle the mismatch.
            if not flag:
                doc = Index.search(self.pindexp, q, limit=None)
                if not doc:
                    # The query without kw matches nothing either: drop kw.
                    print("The keyword '%s' does not match!" % kw)
                    unmatch += 1
                    hidekey.append('0')
                    keywords.pop(0)
                    path.append(None)
                else:
                    # Close the current group and retry kw with a fresh query.
                    path.append(doc)
                    num.append(maxh)
                    maxh = 0
                    q = ''
                flag = True
        if not keywords:
            path.append(paper)
    hide_string = ' '.join(hidekey)
    FileUtil.writefile(hide_string, kwpath)
    return path
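
One thing worth noting: query pops matched words off the keywords list in place, so callers should pass a copy if they still need the original. A hedged usage sketch, where searcher stands for whatever object defines query() and the values are placeholders:

# Hypothetical usage; searcher, the index, and the keyword list are assumed.
keywords = ['machine', 'learning', 'retrieval']
groups = searcher.query(list(keywords), kwpath='hidekey.txt')  # pass a copy
for i, docs in enumerate(groups):
    if docs is None:
        print('group %d: keyword matched nothing' % i)
    else:
        print('group %d: %d documents' % (i, len(docs)))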
Example #3
def crawl(self):
    self.download.download()
    readpath = os.path.join(config.spiderhtml, self.filename)
    savepath = os.path.join(config.spidertext, self.filename)
    FileUtil.init_path(savepath)
    for filename in os.listdir(readpath):
        file = os.path.join(readpath, filename)
        url, content = self.parse.parse(file)
        # Swap the '.html' extension for '.txt'. Note that
        # str.rstrip('.html') strips *characters*, not the suffix,
        # so use os.path.splitext instead.
        filename = os.path.splitext(filename)[0] + '.txt'
        self.logger.info("Save spider url and content:{0}".format(url))
        FileUtil.writefile(url + content, os.path.join(savepath, filename))
    print('crawl web contents end!')
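
Taken together, the three examples form a small pipeline: crawl downloads pages and writes url + content text files, savetexts segments those files into keywords, and query searches the resulting index. The parse step that crawl relies on is not shown; a minimal sketch of what it might look like, assuming the same URL-on-first-line convention as FileUtil.get_url_text above (the Parser class and its crude tag stripping are assumptions, not the project's code):

import re


class Parser:
    """Assumed shape of self.parse: extract the source URL and the
    visible text from a downloaded HTML file."""

    def parse(self, file):
        with open(file, encoding='utf-8') as f:
            url = f.readline()  # assumption: line 1 is the source URL
            html = f.read()
        # Crude tag stripping for illustration; a real implementation
        # would use a proper HTML parser.
        content = re.sub(r'<[^>]+>', ' ', html)
        content = re.sub(r'\s+', ' ', content).strip()
        return url, content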