def info(self, info='', col_bits=5, pagenum=100):
    """Run the full pipeline for a raw query text.

    Extracts keywords, crawls related web pages via the search engine,
    preprocesses the pages, builds an index, retrieves matching pages,
    and returns location descriptions for the best-matching ones.

    :param info: raw query text
    :param col_bits: number of bits used for the column/position encoding
    :param pagenum: how many result pages the link spider should crawl
    :return: list of location descriptions produced by ``Location.describe``
    """
    # 1. Keyword extraction: segmented query terms plus top-ranked keys.
    keywords = PreDeal.seg(info)
    keys = jieba.analyse.textrank(
        info, topK=10, withWeight=False,
        allowPOS=('ns', 'n', 'vn', 'v'))

    # All intermediate artifacts are named after the joined key list.
    key_tag = '_'.join(keys)

    # 2. Crawl: first the search-result links, then the page contents.
    link_spider = SpiderLink(keys, self.root)
    link_spider.crawl(pagenum)
    content_spider = SpiderTo(key_tag + '.html')
    content_spider.crawl()

    # 3. Preprocess: deduplicate, drop stopwords, segment; keep each
    #    page's URL together with its keyword set.
    preprocessor = PreDeal()
    raw_dir = os.path.join(config.spidertext, key_tag)
    prepared_dir = os.path.join(config.prepapath, key_tag)
    preprocessor.savetexts(filepath=raw_dir, prepath=prepared_dir)

    # 4. Index the prepared texts, then retrieve pages containing the
    #    query keywords (retrieve persists its own results).
    index_dir = os.path.join(config.indexpath, key_tag)
    Index.build(datapath=prepared_dir, indexpath=index_dir)
    searcher = Search(keys=keys, pindexp=index_dir)
    searcher.retrieve(keywords=keywords)

    # 5. Pick the best pages and describe/encode their positions.
    locator = Location(keywords=keywords[:], col_bits=col_bits)
    return locator.describe(key_tag)
def info(self, fi='', pagenum=100):
    """Run the pipeline for a query stored in a file.

    Reads the query text from ``fi``, extracts keywords, crawls related
    web pages, preprocesses them, builds an index, and retrieves the
    pages matching the query file via ``Search1``.

    :param fi: path of the file holding the query text
    :param pagenum: how many result pages the link spider should crawl
    :return: tuple ``(keywords, num)`` — the segmented query keywords
        and the count reported by ``Search1.retrieve``
    """
    # 1. Keyword extraction from the file's contents.
    text = FileUtil.readfile(fi)
    keywords = PreDeal.seg(text)
    keys = jieba.analyse.textrank(
        text, topK=10, withWeight=False,
        allowPOS=('ns', 'n', 'vn', 'v'))

    # All intermediate artifacts are named after the joined key list.
    key_tag = '_'.join(keys)

    # 2. Crawl: search-result links first, then the page contents.
    SpiderLink(keys, self.root).crawl(pagenum)
    SpiderTo(key_tag + '.html').crawl()

    # 3. Preprocess: deduplicate, drop stopwords, segment; keep each
    #    page's URL together with its keyword set.
    preprocessor = PreDeal()
    raw_dir = os.path.join(config.spidertext, key_tag)
    prepared_dir = os.path.join(config.prepapath, key_tag)
    preprocessor.savetexts(filepath=raw_dir, prepath=prepared_dir)

    # 4. Index the prepared texts, then retrieve; Search1 is keyed on
    #    the query file rather than on the extracted keys.
    index_dir = os.path.join(config.indexpath, key_tag)
    Index().build(datapath=prepared_dir, indexpath=index_dir)
    searcher = Search1(filename=fi, pindexp=index_dir)
    num = searcher.retrieve(keywords=keywords[:])
    return keywords, num