Пример #1
0
    def remove(self, key):
        """
        Delete the given key on the configured OPS server.
        @param key: key to delete
        @return: None
        """

        logcm.print_info("Remove ops key (key : %s)" % key)

        # Build the delete-action URL, then fire the request at the
        # OPS server (response body is ignored).
        target_url = self.get_url('del', key)
        webcm.read_url(target_url, self.cfg['encoding'])
Пример #2
0
    def update(self, key, val):
        """
        Update the value stored under *key* on the configured OPS server.
        @param key: key to update
        @param val: new value to store
        @return: None
        """

        logcm.print_info("Update ops value (key : %s, val : %s )" % (key, val))

        # Compose the update-action URL and send it to the OPS server;
        # the response body is ignored.
        request_url = self.get_url('update', key, val)
        webcm.read_url(request_url, self.cfg['encoding'])
Пример #3
0
    def add(self, key, val):
        """
        Add a new key/value pair on the configured OPS server.
        @param key: key to create
        @param val: value to store under the key
        @return: None
        """

        logcm.print_info("Add ops value (key : %s, val : %s )" % (key, val))

        # Build the add-action URL and issue the request against the
        # OPS server (response body is ignored).
        request_url = self.get_url('add', key, val)
        webcm.read_url(request_url, self.cfg['encoding'])
Пример #4
0
def crawl_from_url(page_url,
                   next_page_select,
                   img_select,
                   tag_select,
                   local_path,
                   encoding='utf-8',
                   page_no=1):
    """
    Starting from *page_url*, download every image matched by the
    selectors, then keep following the "next page" link until no
    further page is found.
    @param page_url: URL of the first page to crawl
    @param next_page_select: CSS select expression locating the next-page link
    @param img_select: CSS select expression locating images
    @param tag_select: CSS select expression locating tags
    @param local_path: local directory where images are saved
    @param encoding: page character encoding
    @param page_no: number of the current page (used for logging/naming)
    @return: None
    """
    current_url = page_url
    current_no = page_no
    # Iterate pages instead of recursing: same log/download sequence,
    # one page per loop turn.
    while True:
        logcm.print_info("crawl_from_url Page.%d start..." % current_no)
        markup = webcm.read_url(current_url, encoding)
        document = htmlcm.to_soup(markup)
        # Download all images found on the current page.
        webcm.down_img(document, current_url, img_select, tag_select,
                       local_path, current_no)
        # Resolve the URL of the following page, if any.
        following_url = htmlcm.next_page(document, current_url,
                                         next_page_select)
        if following_url is None:
            logcm.print_info("End\n")
            return
        logcm.print_info("NextPageUrl is " + following_url)
        current_url = following_url
        current_no += 1
Пример #5
0
    def load(self):
        """
        Fetch all configured key/value pairs from the OPS server.
        @return: dict mapping keys to values, or None when the settings
                 page could not be retrieved
        """

        logcm.print_info("Loading all ops values ...")
        # URL of the OPS settings page.
        page_url = self.get_url('load')

        # Fetch the settings page; bail out if nothing came back.
        page_html = webcm.read_url(page_url, self.cfg['encoding'])
        if page_html is None:
            return None

        document = htmlcm.to_soup(page_html)
        result = {}

        # Parse the settings table. The first row is skipped (header);
        # each 4-cell row carries the key in the 3rd cell and the value
        # in the 4th.
        for row in document.select("body table tr")[1:]:
            cells = row.select("td")
            if len(cells) == 4:
                result[cells[2].string] = cells[3].string

        return result
Пример #6
0
def crawl_with_format(page_url,
                      next_page_format,
                      img_select,
                      tag_select,
                      local_path,
                      encoding='utf-8',
                      page_no=1):
    """
    Starting from *page_url*, download every image matched by the
    selectors, advancing through pages by substituting the page number
    into *next_page_format*. Stops when a page yields no new images.
    @param page_url: URL of the first page to crawl
    @param next_page_format: next-page URL template containing '[page_no]'
    @param img_select: CSS select expression locating images
    @param tag_select: CSS select expression locating tags
    @param local_path: local directory where images are saved
    @param encoding: page character encoding
    @param page_no: number of the current page
    @return: None
    """
    current_url = page_url
    current_no = page_no
    # Loop over pages rather than recursing; the observable sequence of
    # log lines and downloads is unchanged.
    while True:
        logcm.print_info("......crawl_with_format Page." + str(current_no) +
                         "......")
        markup = webcm.read_url(current_url, encoding)
        document = htmlcm.to_soup(markup)
        # Download all images on the current page; down_img reports how
        # many were actually fetched.
        fetched = webcm.down_img(document, current_url, img_select,
                                 tag_select, local_path, current_no)
        if fetched == 0:
            # Nothing new on this page: stop crawling.
            logcm.print_info("Not found image End\n")
            return

        # Derive the next page's URL from the template and continue.
        current_no += 1
        current_url = next_page_format.replace('[page_no]', str(current_no))
        logcm.print_info("NextPageUrl is " + current_url)
Пример #7
0
"""

import jieba.analyse
import matplotlib.pyplot as plt

from common import htmlcm
from common import logcm
from common import webcm
from common import wordscm
from scipy.misc import imread
from wordcloud import WordCloud

# Fetch the report page and strip the HTML down to plain text content.
url = 'http://www.leadbankmap.com/baogao/detail_4800.html'
encoding = 'utf-8'
html = webcm.read_url(url, encoding)
content = htmlcm.clean_html(html)

# Load the stop-word list. NOTE(review): `stopwords` is not referenced
# in the lines visible here — presumably used further down; confirm.
stopwords = wordscm.load_stopwords('./data', 'words_stop1.txt', encoding)

# Register the same stop-word file with jieba's analyser.
jieba.analyse.set_stop_words('./data/words_stop1.txt')

# Rank terms with TextRank, keeping the top 50 words whose POS tag is
# one of the allowed set.
seg = jieba.analyse.textrank(content,
                             topK=50,
                             withWeight=False,
                             allowPOS=('nt', 'n', 'nv'))
cut_text = " ".join(seg)
logcm.print_obj(cut_text, 'cut_text')