Exemplo n.º 1
0
    def mysave(self, tocatid):
        """Store the scraped article under ``tocatid`` unless it already exists.

        Opens a MySQL connection, looks for a ``v9_news`` row with the same
        title and ``catid`` and, when none is found, hands the insert over to
        ``self.addnews()``.

        Args:
            tocatid: Target ``catid`` for the article; kept on ``self.tocatid``.

        Returns:
            None.
        """
        # NOTE(review): host and password are hard-coded; they belong in
        # configuration or the environment, not in source control.
        self.database = Mysql(host="121.199.48.196",
                              user="******",
                              pwd="rajltool321123",
                              db="m_wxhs120_com")
        self.tocatid = tocatid
        raw_title = self.title
        print('正在采集--' + raw_title + '--文章')

        # Drop every character that cannot be represented in GBK.
        title = raw_title.encode('gbk', 'ignore').decode('gbk')
        if not title.strip():
            print("标题,不采集!")
            return

        # The title is scraped text, so it is quoted by the driver instead of
        # being pasted between hand-written quotes (a single apostrophe used
        # to break the duplicate check).
        # NOTE(review): assumes the Mysql wrapper exposes the driver connection
        # as ``conn`` and that ``conn.escape`` returns a quoted literal - confirm.
        escape = self.database.conn.escape

        isexist1 = ""
        try:
            sql = "select id from v9_news where title=%s and catid=%s order by title desc" % (
                escape(title), escape(str(self.tocatid)))
            isexist1 = self.database.ExecQuery(sql)
        except Exception as e:
            # Best effort: a failed lookup is reported and treated as "not found".
            print("查询信息出错,错误信息:%s" % (e))
        if isexist1:
            print(title + '-----> 有重复不提交!')
        else:  # no matching row: store the article
            self.addnews()
Exemplo n.º 2
0
    def stock_telescope(self):
        """Collect the time series of every configured app and render the chart.

        Database settings are read from ``<Config_path>/<name>.ini`` (section
        ``db``). ``self.configs['cid']`` holds one app id or several separated
        by ``|``. For each app the ``h_data`` rows newer than the look-back
        window are loaded and keyed by the app's title from ``h_appmap`` (the
        app id itself when no title is stored).

        ``self.configs['attr']['rangetime']`` may look like ``"<x>|<n><unit>"``
        with unit ``h``, ``d`` or ``w``; the part after ``|`` sets the window.
        A value that cannot be parsed keeps the default of 1209600 seconds.

        Returns:
            The result of ``self.highstock(result, id, attr)``.
        """
        cf = ConfigParser.ConfigParser()
        cf.read("%s/%s.ini" % (self.Config_path, self.configs['name']))
        conn = Mysql(
            host=cf.get("db", "host"),
            user=cf.get("db", "user"),
            password=cf.get("db", "pass"),
            database=cf.get("db", "database"),
        )

        cid = [part.strip() for part in self.configs['cid'].split('|')]
        print(cid)

        now = int(time.time())

        # The look-back window does not depend on the app, so it is parsed
        # once. An unknown unit or malformed value keeps the default instead
        # of raising (the old code multiplied an int by the unit string).
        rangetime = 1209600
        unit_seconds = {'h': 3600, 'd': 86400, 'w': 604800}
        attr = self.configs['attr']
        spec = attr['rangetime'] if 'rangetime' in attr else None
        if spec and '|' in spec:
            parsed = re.search(r"^([0-9]+)([a-zA-Z]+)", spec.split('|')[1])
            if parsed and parsed.group(2) in unit_seconds:
                rangetime = int(parsed.group(1)) * unit_seconds[parsed.group(2)]

        result = {}
        for appid in cid:
            # NOTE(review): app ids come from local configuration; switch to
            # bound parameters if the Mysql wrapper supports them.
            sql_title = "select title from h_appmap where appid = '%s' limit 1" % appid
            appmap_row = conn.fetch(sql_title)
            # Fall back to the app id when the row or its title is missing.
            line = appmap_row['title'] if appmap_row and appmap_row['title'] else appid
            print(line)
            print(rangetime)

            sql_data = "select timestamp, value from h_data where appid = '%s' and timestamp > %d order by timestamp" % (appid, now - rangetime)
            data_rows = conn.fetchall(sql_data) or []
            # Timestamps are multiplied by 1000 before being handed to the chart.
            result[line] = [[int(row['timestamp'] * 1000), int(row['value'])]
                            for row in data_rows]

        return self.highstock(result, self.configs['id'], self.configs['attr'])
Exemplo n.º 3
0
 def process_item(self, item, spider):
     """Let an item through only if its ``pixiv_id`` is not in ``illusts`` yet.

     Raises ``DropItem`` carrying the ``pixiv_id`` when the lookup reports a
     positive count; otherwise returns the item unchanged.
     """
     query = 'select * from illusts where pixiv_id = %(pixiv_id)s'
     params = {'pixiv_id': item['pixiv_id']}
     # NOTE(review): select() is compared with 0, so it presumably returns the
     # number of matching rows - confirm against the Mysql helper.
     hits = Mysql().select(query, params)
     if hits > 0:
         raise DropItem(item['pixiv_id'])
     return item
Exemplo n.º 4
0
    def save(self, tocatid):
        """Store the article and, if it is new, its WeChat account.

        The account is looked up in ``v9_weixinhao`` by its WeChat id and
        inserted through ``self.addwx()`` when missing. The article is then
        inserted through ``self.addnews()`` unless a ``v9_news`` row with the
        same title exists.

        Args:
            tocatid: Target ``catid``; kept on ``self.tocatid``.

        Returns:
            None.
        """
        # NOTE(review): host and password are hard-coded; move them to
        # configuration or the environment.
        self.database = Mysql(host="121.41.40.189",
                              user="******",
                              pwd="nMAf6wBCdRstaaabbb",
                              db="najiaoluoabab")
        self.tocatid = tocatid
        self.sDir = "d:/uploadfile/"  # local image directory
        self.picurl = "http://imgs.najiaoluo.com/"  # public image host
        # Fixed: the bare names ``sDir`` / ``database`` used before do not exist.
        if not os.path.exists(self.sDir):
            os.mkdir(self.sDir)
        print('正在采集--' + self.title + '--文章')

        # Scraped values are quoted by the driver rather than concatenated.
        # NOTE(review): assumes ``self.database.conn.escape`` returns a quoted
        # literal for strings - confirm against the Mysql wrapper.
        escape = self.database.conn.escape

        # --- account ---
        isexist = ""
        self.wxid = 0
        try:
            isexist = self.database.ExecQuery(
                "select id from v9_weixinhao where weixinID=" + escape(self.wxh))
        except Exception as e:
            # Best effort: a failed lookup is reported and treated as "not found".
            print(e)
        if isexist:
            print("公众号-----> 有重复不提交!")
            self.wxid = isexist[0][0]
        else:  # insert the account and keep its id
            self.wxid = self.addwx()

        # --- article ---
        title = self.title
        if title.strip() == '' or self.wxid == 0:
            print("标题或微信ID为空,不采集!")
            return
        isexist1 = ""
        try:
            isexist1 = self.database.ExecQuery(
                "select * from v9_news where title=" + escape(title))
        except Exception as e:
            print(e)
        if isexist1:
            print(title + '-----> 有重复不提交!')
        else:  # no matching row: store the article
            self.addnews()
Exemplo n.º 5
0
 def process_item(self, item, spider):
     """Insert the illustration described by ``item`` into ``illusts``.

     ``mode`` is stored as 1 for ``'daily'`` and 0 for anything else, ``date``
     is today's date, and ``path`` is the path of the first entry in
     ``item['images']``.

     Returns:
         The item, so that later pipeline stages still receive it.
     """
     illust_mode = 1 if item['mode'] == 'daily' else 0
     value = {
         'date': time.strftime('%Y-%m-%d'),
         'mode': illust_mode,
         'path': item['images'][0]['path'],
         'pixiv_id': item['pixiv_id'],
         'title': item['title'],
         'total_score': item['total_score'],
         'author': item['author'],
         'image_urls': item['image_urls'],
         'rank': item['rank'],
         'author_id': item['author_id'],
     }
     # `rank` is a reserved word from MySQL 8.0.2 on, so the column name is
     # back-quoted; older servers accept the quoted form as well.
     sql = ("insert into illusts(pixiv_id, title, total_score, author, image_urls, date, `rank`, author_id, path, mode)"
            "  VALUES (%(pixiv_id)s, %(title)s, %(total_score)s, %(author)s, %(image_urls)s, %(date)s, %(rank)s, %(author_id)s, %(path)s, %(mode)s)")
     db = Mysql()
     db.action(sql, value)
     # A pipeline stage must hand the item on; returning None would break
     # every stage that runs after this one.
     return item
Exemplo n.º 6
0
    def mysave(self,scatid):
        """Save the scraped article to MySQL and download its images.

        Nothing is stored when a ``v9_news`` row with the same title exists.
        Otherwise the article goes into ``v9_news`` / ``v9_news_data`` with
        its image references rewritten to ``./img/<file name>``, and every
        image referenced by the original content is downloaded into
        ``d:/test/img``.

        Args:
            scatid: ``catid`` of the column the article is saved to.

        Returns:
            None.
        """
        # NOTE(review): host and password are hard-coded; move them to
        # configuration or the environment.
        database = Mysql(host="121.199.48.196", user="******", pwd="rajltool321123", db="test")
        sDir = 'd:/test/'
        img_dir = 'img'  # sub-directory of sDir that receives the images
        if not os.path.exists(sDir):
            os.mkdir(sDir)
        print('正在采集--'+self.title+'--文章')
        title = self.clearInput(self.title)
        m = self.clearInput(self.content)
        # Point every image reference at its local copy.
        new_m = re.sub(r'src=".*?/(\w+\.\w+)"', r'src="./%s/\1"' % img_dir, m)

        # Scraped text is quoted by the driver instead of being pasted between
        # hand-written quotes, which broke on any apostrophe in the page.
        # NOTE(review): assumes the Mysql wrapper exposes the driver connection
        # as ``conn`` and that ``conn.escape`` returns a quoted literal - confirm.
        escape = database.conn.escape

        isexist1 = ""
        try:
            isexist1 = database.ExecQuery("select * from v9_news where title=" + escape(title))
        except Exception as e:
            # Best effort: a failed lookup is reported and treated as "not found".
            print(e)
        if isexist1:
            print(title+'-----> 有重复不提交!')
            return

        # --- article rows ---
        keywords = ",".join(jieba.analyse.extract_tags(title, 6))
        description = self.dom('.art_content').text()[0:200]
        inputtime = updatetime = int(time.time())
        insert1 = (
            "insert into v9_news (title,catid,typeid,keywords,description,url,listorder,status,username,inputtime,updatetime) VALUES ("
            "{title}, {catid}, 0, {keywords}, {description}, '', 0, 99, {username}, '{inputtime}', '{updatetime}')"
        ).format(title=escape(title), catid=escape(scatid), keywords=escape(keywords),
                 description=escape(description), username=escape('******'),
                 inputtime=inputtime, updatetime=updatetime)
        print(insert1)
        try:
            database.ExecNonQuery(insert1)
            lastid = database.cur.lastrowid
            insert2 = (
                "insert into v9_news_data (id,content,paginationtype,groupids_view,maxcharperpage,template) "
                "VALUES ({lastid}, {content}, 2,'',0,'')"
            ).format(lastid=lastid, content=escape(new_m))
            print(insert2)
            database.ExecNonQuery(insert2)
        except Exception as e:
            print("文章数据库保存出错,错误信息:%s" % (e) )

        # --- image download ---
        img_urls = re.findall(r'src="(.*?)"', m)
        img_root = sDir + img_dir
        if img_urls and not os.path.exists(img_root):
            os.mkdir(img_root)
        for img in img_urls:
            imgpath = os.path.join(sDir, img_dir, img.split('/')[-1])
            try:
                data = requests.get(img, stream=True).content
                with open(imgpath, 'wb') as f:
                    f.write(data)
            except (requests.RequestException, IOError) as e:
                # One broken image must not abort the remaining downloads.
                print("图片下载出错", img, e)
Exemplo n.º 7
0

# Script entry: exactly six command-line arguments are unpacked, in this order.
# NOTE(review): `id` shadows the builtin of the same name.
id, run_id, project, group, percent, stat = sys.argv[1:]              #stat: u (update) or r (rollback)

# A percent value that parses as an integer is passed to Log with a trailing
# "%"; any other value is passed through unchanged.
# NOTE(review): the bare `except:` also catches KeyboardInterrupt/SystemExit.
try:
    int(percent)
except:
    l = Log(project, group, percent)
else:
    l = Log(project, group, "%s%%" % percent)



# MySQL connection to the `autorelease` database on the local host.
mysql_conn = Mysql(
    host = '127.0.0.1',
    user = '******',
    password = '******',
    database = 'autorelease',
)


# MongoDB handle for this task's log; the table name is "log_" + task id.
mongo_conn = Mongo(
    host = '127.0.0.1',
    database = 'log',
    table = 'log_'+id,
)

# Start from an empty log table for this task.
mongo_conn.drop()

mongo_conn.insert(
    id = id,
    run_id = run_id,
Exemplo n.º 8
0
class Getshow(object):
    """Scrape one vccoo.com article and store it, with its WeChat account, in MySQL.

    The article page is downloaded on first access of ``dom`` and cached.
    The properties extract single fields from that page. ``save`` is the
    entry point that writes the account (``addwx``) and the article
    (``addnews``) to the database.
    """

    def __init__(self, show_id):
        """Build the article URL from ``show_id``, the article's id on vccoo."""
        self.url = 'http://www.vccoo.com/v/{0}'.format(show_id)
        # Parsed page cache, so the article is downloaded only once.
        self._dom = None

    @property
    def dom(self):
        """Parsed article page; downloaded on first access."""
        # Compare with None: an empty parse result is falsy and would
        # otherwise trigger a new download on every access.
        if self._dom is None:
            document = requests.get(self.url)
            document.encoding = 'utf-8'
            self._dom = Pq(document.text)
        return self._dom

    @property
    def title(self):
        """Article title (text of ``h1.article-title``)."""
        return self.dom('h1.article-title').text()

    @property
    def content(self):
        """Article body HTML with the promo and recommendation blocks removed."""
        d = Pq(self.dom('.article-content').html())
        d('.main-tg-area').remove()
        d('.articleRecommend').remove()
        return self.clearInput(d.html())

    @property
    def wxlogo(self):
        """URL of the account logo (``src`` of the image in ``.author-name``)."""
        return self.dom('.author-name img').attr('src')

    @property
    def wxh(self):
        """WeChat account id, read from the account's column page on vccoo.

        Every access performs a new HTTP request for that column page.
        """
        wxlmurl = self.dom('.author-name a').attr('href')
        document = requests.get(wxlmurl)
        document.encoding = 'utf-8'
        return Pq(document.text)('.publicAccountID').text()

    @property
    def wxname(self):
        """Display name of the account."""
        return self.dom('.author-name strong').text()

    @property
    def wxurl(self):
        """Original article URL, taken from the ``var s = "..."`` script line.

        Raises:
            IndexError: if the page contains no such line.
        """
        found = re.findall(r'var s = "(.*?)"', self.dom('body').html())
        # The serialised HTML carries "&amp;"; turn it back into "&" so the
        # query string can be split on "&mid=" in ``wxer``.
        return found[0].replace("&amp;", "&")

    @property
    def wxer(self):
        """QR-code URL of the account, built from the ``biz`` value in ``wxurl``."""
        biz = self.wxurl.split("biz=")[1].split("&mid=")[0]
        return "http://mp.weixin.qq.com/mp/qrcode?scene=10000004&size=100&__biz=" + biz

    @property
    def thumb(self):
        """Thumbnail URL from the ``og:image`` meta tag.

        Raises:
            IndexError: if the page has no such tag.
        """
        return re.findall(r'<meta property="og:image" content="(.*?)"',
                          self.dom('head').html())[0]

    @property
    def addtime(self):
        """Publication date: the last 10 characters of the ``.author-name`` text."""
        return self.dom('.author-name').text()[-10:]

    def clearInput(self, txt):
        """Strip vccoo marker comments and image-proxy prefixes from ``txt``."""
        txt = txt.replace('<!--main-tg-area-->', '')
        txt = txt.replace('<!-- articleRecommend/ -->', '')
        # Remove the referrer proxy in front of image URLs.
        txt = re.sub(r'http:\/\/img\d+\.vccoo\.com\/refer\.php\?url=', '', txt)
        return txt

    def save(self, tocatid):
        """Store the article and, if it is new, its WeChat account.

        The account is looked up in ``v9_weixinhao`` by its WeChat id and
        inserted through ``addwx`` when missing. The article is then inserted
        through ``addnews`` unless a ``v9_news`` row with the same title exists.

        Args:
            tocatid: Target ``catid``; kept on ``self.tocatid``.
        """
        # NOTE(review): host and password are hard-coded; move them to
        # configuration or the environment.
        self.database = Mysql(host="121.41.40.189",
                              user="******",
                              pwd="nMAf6wBCdRstaaabbb",
                              db="najiaoluoabab")
        self.tocatid = tocatid
        self.sDir = "d:/uploadfile/"  # local image directory
        self.picurl = "http://imgs.najiaoluo.com/"  # public image host
        # Fixed: the bare names ``sDir`` / ``database`` used before do not exist.
        if not os.path.exists(self.sDir):
            os.mkdir(self.sDir)
        print('正在采集--' + self.title + '--文章')

        # Scraped values are quoted by the driver (escape() adds the quotes).
        escape = self.database.conn.escape

        # --- account ---
        isexist = ""
        self.wxid = 0
        try:
            isexist = self.database.ExecQuery(
                "select id from v9_weixinhao where weixinID=" + escape(self.wxh))
        except Exception as e:
            # Best effort: a failed lookup is reported and treated as "not found".
            print(e)
        if isexist:
            print("公众号-----> 有重复不提交!")
            self.wxid = isexist[0][0]
        else:  # insert the account and keep its id
            self.wxid = self.addwx()

        # --- article ---
        title = self.title
        if title.strip() == '' or self.wxid == 0:
            print("标题或微信ID为空,不采集!")
            return
        isexist1 = ""
        try:
            isexist1 = self.database.ExecQuery(
                "select * from v9_news where title=" + escape(title))
        except Exception as e:
            print(e)
        if isexist1:
            print(title + '-----> 有重复不提交!')
        else:  # no matching row: store the article
            self.addnews()

    def addwx(self):
        """Insert the account into ``v9_weixinhao`` and ``v9_weixinhao_data``.

        Both inserts run in one transaction; logo and QR code are downloaded
        through ``getimg`` in between.

        Returns:
            The id of the new ``v9_weixinhao`` row, or 0 when anything failed
            (the transaction is rolled back in that case).
        """
        name = self.wxname
        keywords = ",".join(jieba.analyse.extract_tags(name, 3))
        now = int(time.time())
        cur = self.database.cur
        try:
            main_values = (name, 10, 0, keywords, '', '', 0, 99, '******', now, now)
            cur.execute(
                "insert into v9_weixinhao (title,catid,typeid,keywords,description,url,listorder,status,username,inputtime,updatetime) VALUES ("
                + ", ".join(["%s"] * len(main_values)) + ")",
                main_values)
            # Id of the main row; the data row below reuses it.
            lastid = cur.lastrowid
            weixinID = self.wxh
            ndir = time.strftime("%Y/%m%d/")
            # getimg returns None on a failed download; store '' instead.
            wxlogo = self.getimg(self.wxlogo, weixinID + "_logo.png",
                                 self.sDir + ndir, self.picurl + ndir) or ''
            wxepic = self.getimg(self.wxer, weixinID + ".png",
                                 self.sDir + ndir, self.picurl + ndir) or ''
            data_values = (lastid, self.tocatid, weixinID, '', '', wxlogo,
                           wxepic, '', 2, '', 0, '')
            cur.execute(
                "insert into v9_weixinhao_data (id,fenliid,weixinID,gnjs,wxrz,wxlogo,wxepic,content,paginationtype,groupids_view,maxcharperpage,template) VALUES ("
                + ", ".join(["%s"] * len(data_values)) + ")",
                data_values)
            self.database.conn.commit()
            print('公众号入库成功!')
            # Return the main row's id, not the cursor state left by the
            # second insert.
            return lastid
        except Exception as e:
            print("公众号数据库保存出错,错误信息:%s" % (e))
            self.database.conn.rollback()
            return 0

    def addnews(self):
        """Insert the article into ``v9_news`` and ``v9_news_data`` in one transaction.

        The thumbnail is downloaded first. On any database error the
        transaction is rolled back and the error is printed.
        """
        title = self.title
        content = self.content
        ndir = time.strftime("%Y/%m%d/")
        # getimg returns None on a failed download; store '' instead.
        thumb = self.getimg(self.thumb,
                            self.random_str(6) + ".jpg",
                            self.sDir + "thumb/" + ndir,
                            self.picurl + "thumb/" + ndir) or ''
        keywords = ",".join(jieba.analyse.extract_tags(title, 6))
        description = Pq(content).text()[0:200]
        now = int(time.time())
        cur = self.database.cur
        try:
            news_values = (title, self.tocatid, self.wxid, thumb, 0, keywords,
                           description, '', 0, 99, '******', now, now)
            cur.execute(
                "insert into v9_news (title,catid,wxid,thumb,typeid,keywords,description,url,listorder,status,username,inputtime,updatetime) VALUES ("
                + ", ".join(["%s"] * len(news_values)) + ")",
                news_values)
            data_values = (cur.lastrowid, content, 2, '', 0, '')
            cur.execute(
                "insert into v9_news_data (id,content,paginationtype,groupids_view,maxcharperpage,template) VALUES ("
                + ", ".join(["%s"] * len(data_values)) + ")",
                data_values)
            self.database.conn.commit()
            print('文章入库成功!')
        except Exception as e:
            print("文章数据库保存出错,错误信息:%s" % (e))
            self.database.conn.rollback()

    def getimg(self, imgUrl, filename, tourl, neturl):
        """Download ``imgUrl`` into ``tourl`` and return its public URL.

        Args:
            imgUrl: Remote image to fetch.
            filename: File name to store it under; when empty, the last path
                segment of ``imgUrl`` is used.
            tourl: Local target directory (created if missing); it is joined
                with the file name by plain concatenation.
            neturl: Public URL prefix for the stored file.

        Returns:
            ``neturl + file name``, or None when the download failed.
        """
        local_filename = filename if filename else imgUrl.split('/')[-1]
        print("Download Image File=", local_filename)
        if not os.path.exists(tourl):
            os.makedirs(tourl)
        try:
            # stream=True so the body is read chunk by chunk below.
            r = requests.get(imgUrl, stream=True)
            with open(tourl + local_filename, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024):
                    if chunk:  # skip keep-alive chunks
                        f.write(chunk)
        except Exception as e:
            print("图片下载出错", e)
            return None
        return neturl + local_filename

    def random_str(self, num):
        """Return a random string of ``num`` characters (A-Z and digits).

        Position ``i`` becomes a digit only when a random draw from 0-4 equals
        ``i``, so digits can only appear among the first five characters.
        """
        chars = []
        for i in range(int(num)):
            if i == random.randrange(0, 5):
                chars.append(str(random.randrange(0, 10)))
            else:
                chars.append(chr(random.randrange(65, 91)))
        return "".join(chars)
Exemplo n.º 9
0
def main(id, run_id, project, group, percent, stat):
    """Mirror a task's log into MongoDB until the task leaves the ctl queue.

    Every 10 seconds the log is reloaded and written to the ``log_<id>``
    table. Once ``run_id`` no longer appears in the output of ``ctl-queue``,
    the task is marked finished in MySQL, the history records are written,
    and the process exits.

    Args:
        id: Task id; used for the Mongo table name and the MySQL updates.
        run_id: Id searched for in the ``ctl-queue`` output.
        project: Project name, passed to ``Log`` and ``send``.
        group: Passed to ``Log``.
        percent: Passed to ``Log``; a value that parses as an integer gets a
            trailing ``%``.
        stat: ``'u'`` for an update, ``'r'`` for a rollback.

    Raises:
        SystemExit: once the task has left the queue.
    """
    import time

    mysql_conn = Mysql(
        host='127.0.0.1',
        user='******',
        password='******',
        database='autorelease',
    )

    mongo_conn = Mongo(
        host='127.0.0.1',
        database='log',
        table='log_' + id,
    )

    try:
        int(percent)
    except (TypeError, ValueError):
        l = Log(project, group, percent)
    else:
        l = Log(project, group, "%s%%" % percent)

    # NOTE(review): result unused; the call is kept in case load() is stateful.
    l.load()

    mongo_conn.drop()

    mongo_conn.insert(
        id=id,
        run_id=run_id,
        content="",
        update_time=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(int(time.time())))
    )

    while True:
        content = l.load()
        with os.popen("/usr/local/ctier/pkgs/ctl-3.6.1/bin/ctl-queue") as pipe:
            result = pipe.read()

        mongo_conn.update(condition={'id': id}, data={"content": content})

        if run_id in result:
            # Still queued: wait before polling again. The old `continue`
            # skipped the sleep, so the loop spun without pause.
            time.sleep(10)
            continue

        if stat == 'u':
            mysql_conn.save(
                "update task_content set finish_time = '%s', status = '102' where id = '%s'" % (int(time.time()), id))
            time.sleep(10)
            send("", id, project, 102)

            # Copy the finished task into the history table.
            # NOTE(review): [0] raises IndexError when no row matches.
            content_result = Content.objects.filter(id=id, project=project, env=percent).order_by('-finish_time').values()[0]

            h = History(
                task_id=content_result['id'],
                type=content_result['type'],
                project=content_result['project'],
                env=content_result['env'],
                run_id=content_result['run_id'],
                version=content_result['version'],
                status=content_result['status'],
                deploy_time=content_result['deploy_time'],
                finish_time=content_result['finish_time'],
                create_user=content_result['create_user'],
                deploy_user=content_result['deploy_user'],
            )
            h.save()
        elif stat == 'r':
            mysql_conn.save(
                "update task_content set finish_time = '%s', status = '105' where id = '%s'" % (int(time.time()), id))
            time.sleep(10)

            # Stamp the finish time on the most recent rollback record.
            rollback_history_id = Rollback.objects.filter(task_id=id).order_by('-start_time').values()[0]['id']
            Rollback.objects.filter(id=rollback_history_id, task_id=id).update(finish_time=int(time.time()))
            send("", id, project, 105)
        # Same effect as exit(), without relying on the site-provided helper.
        raise SystemExit
0
#encoding=utf-8
__author__ = 'jophyyao'

import sys, os, time, re

sys.path.insert(0, '../')
from controltier.node import Node
from tools.mysql import Mysql

# MySQL connection to the `autorelease` database on the local host.
mysql_conn = Mysql(
    host = '127.0.0.1',
    user = '******',
    password = '******',
    database = 'autorelease',
    )

# Current local time formatted as "YYYY-MM-DD HH:MM:SS".
now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(int(time.time())))

for project in os.listdir('/usr/local/ctier/ctl/projects'):
     n = Node(project)
     for k, v in n.analysis().iteritems():
         result = mysql_conn.fetch("select count(id) from project_node where project = '%s' and hostname = '%s'" % (project, v['hostname']))
         if result['count(id)']:
             mysql_conn.save("""
             update project_node set
             name = '{name}',
             description = '{description}',
             tags = '{tags}',
             ctlusername = '******',
             osfamily = '{osfamily}',
             osname = '{osname}',
Exemplo n.º 11
0
class Getshow(object):
    def __init__(self, url):  # 参数为在id
        self.url = url
        self._dom = None  # 弄个这个来缓存获取到的html内容,一个蜘蛛应该之访问一次

    @property
    def dom(self):  # 获取html内容
        if not self._dom:
            document = requests.get(self.url)
            document.encoding = 'utf-8'
            self._dom = Pq(document.text)
        return self._dom

    # 标题
    @property
    def title(self):  # 让方法可以通过s.title的方式访问 可以少打对括号
        return self.clearInput(self.dom('title').text(
        ))  # 关于选择器可以参考css selector或者jquery selector, 它们在pyquery下几乎都可以使用

    # 内容
    @property
    def content(self):
        return self.clearInput(self.dom('.nr').html())  # 直接获取html 胆子就是大 以后再来过滤

    def mysave(self, tocatid):
        """Store the scraped article in category ``tocatid`` unless it already exists.

        Opens a ``Mysql`` connection (kept on ``self.database``), looks for a
        ``v9_news`` row with the same title in the same category and calls
        ``addnews`` only when that lookup succeeds and returns nothing.

        tocatid: id of the target category; saved on ``self.tocatid`` because
            ``addnews`` reads it from there.

        Returns None. Nothing is inserted when the title is blank, when a
        duplicate is found, or when the duplicate lookup itself fails.
        """
        # NOTE(review): connection settings are hard-coded in source; consider
        # loading them from configuration instead.
        self.database = Mysql(host="121.199.48.196",
                              user="******",
                              pwd="rajltool321123",
                              db="m_wxhs120_com")
        self.tocatid = tocatid

        # The ``title`` property re-runs clearInput on every access, so read it once.
        page_title = self.title
        print('正在采集--' + page_title + '--文章')

        # Drop characters that GBK cannot encode (presumably because the
        # target tables are GBK - confirm against the database).
        title = page_title.encode('gbk', 'ignore').decode('gbk')
        if not title.strip():
            print("标题,不采集!")
            return

        # NOTE(review): the title comes from a scraped page and is interpolated
        # straight into the SQL text; a quote in it breaks the query. Switch to
        # a parameterized call once the Mysql wrapper's API for that is confirmed.
        sql = "select id from v9_news where title='%s' and catid='%s' order by title desc" % (
            title, self.tocatid)
        try:
            duplicates = self.database.ExecQuery(sql)
        except Exception as e:
            print("查询信息出错,错误信息:%s" % (e))
            # A failed lookup says nothing about whether the article exists,
            # so do not fall through to the insert.
            return

        if duplicates:
            print(title + '-----> 有重复不提交!')
        else:
            # No matching row: go ahead and insert the article.
            self.addnews()

# 文章入库

    def addnews(self):
        """Insert the current article into the database as a single transaction.

        Writes one row to ``v9_news``, the article body to ``v9_news_data`` and
        a row to ``v9_hits``, then builds the article URL from the category's
        ``url`` value and stores it on the ``v9_news`` row. The transaction is
        committed when every statement succeeds; on any exception the error is
        printed and the transaction is rolled back.

        Relies on ``self.database`` and ``self.tocatid`` having been set by
        ``mysave``. Scraped values are handed to ``cursor.execute`` as
        parameters, so quotes inside them cannot break or alter the statements.
        """

        def gbk_safe(text):
            # Drop characters that GBK cannot encode (presumably the tables
            # are GBK - confirm against the database).
            return text.encode('gbk', 'ignore').decode('gbk')

        # ``content`` is a property that re-runs clearInput on each access; read it once.
        raw_content = self.content

        title = gbk_safe(self.title)
        content = gbk_safe(raw_content)
        catid = self.tocatid  # category the article is saved under
        keywords = ",".join(jieba.analyse.extract_tags(title, 6))
        # Plain-text excerpt: first 180 characters of the body without markup.
        description = gbk_safe(Pq(raw_content).text()[0:180])
        thumb = ""
        typeid = 0
        url = ''
        listorder = 0
        status = 99
        username = '******'
        inputtime = updatetime = int(time.time())

        news_sql = (
            "insert into v9_news (title,catid,thumb,typeid,keywords,description,"
            "url,listorder,status,username,inputtime,updatetime) "
            "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        )
        news_params = (title, catid, thumb, typeid, keywords, description,
                       url, listorder, status, username, inputtime, updatetime)

        try:  # everything below runs as one transaction
            cur = self.database.cur
            cur.execute(news_sql, news_params)
            lastid = cur.lastrowid

            data_sql = (
                "insert into v9_news_data (id,content,paginationtype,"
                "groupids_view,maxcharperpage,template) "
                "VALUES (%s,%s,%s,%s,%s,%s)"
            )
            # paginationtype=2, empty groupids_view, maxcharperpage=0, empty template
            cur.execute(data_sql, (lastid, content, 2, "", 0, ""))

            # Hits row; per the original note the article model id is 1,
            # which is where the "c-1-" prefix comes from.
            hitsid = "c-1-" + str(lastid)
            cur.execute(
                "INSERT INTO `v9_hits`(`hitsid`,`catid`,`updatetime`) VALUES (%s,%s,%s) ",
                (hitsid, catid, updatetime))

            # The category's url value is used as the prefix of the article URL.
            category_sql = "select url from v9_category where catid=" + str(
                catid) + " order by catid desc"
            category_rows = self.database.ExecQuery(category_sql)

            # Store the final URL on the main article row.
            url = str(category_rows[0][0]) + str(lastid) + ".html"
            cur.execute(
                "update  `v9_news` set url=%s where id = %s order by id desc",
                (url, lastid))

            self.database.conn.commit()
            print('文章%s入库成功!' % title)
        except Exception as e:
            print("文章%s数据库保存出错,错误信息:%s" % (title, e))
            self.database.conn.rollback()

    def clearInput(self, txt):
        """Rewrite scraped text: swap source-site wording for this site's wording.

        Applies an ordered list of literal substitutions (hospital name,
        address, phone numbers, doctor names, prices, promotional phrases) and
        finally replaces every ``<img ...>`` tag with a fixed linked image.

        txt: the text or HTML fragment to rewrite; must be a ``str``.

        Returns the rewritten string.

        The rules are applied strictly in order and later rules see the output
        of earlier ones, so longer phrases are listed before the shorter
        phrases they contain. A few rules appear more than once on purpose:
        a deletion in between can join fragments into a new match.
        """
        rules_head = (
            ('白求恩医学基金定点:', ''),
            ('连续10年荣获国家A级医院:', ''),
            ('被评为国家示范妇科科研基地、国家妇科疾病重点诊疗基地,更是连续10年被评为&quot;A级妇科医院',
             ''),
            ('丽水市囿山路568号(民政局旁)', '无锡市锡山区东亭二泉东路195号'),
            ('丽水慈爱医院', '无锡华山医院'),
            ('丽水', '无锡'),
            ('慈爱', '华山'),

            ('0578-2292111', '0510-88200585'),
            ('05782292111', '051088200585'),
            ('0578-23292111', '0510-88200585'),
            ('057823292111', '051088200585'),

            ('972963352', '493709817'),
            ('预约68元妇科检查套餐', '预约0元妇科检查套餐'),
            ('专家', '医生'),
            ('24年', ''),
            ('非营利性', '专业'),
            # This biography sentence contains '着名', so it has to be handled
            # before the '着名' rule below rewrites that word; placed after
            # it, the sentence could never match.
            ('汪爱云,女,1949年生,从事妇科临床、教学工作四十余年,并多次在国内着名的三甲医院研究深造。',
             '从事妇科临床、教学工作二十余年'),
            ('着名', '专业'),
            ('白求恩医学基金无锡唯一定点医院', '瑞安专业医院'),

            ('世界', ''),
            ('白求恩医学基金无锡唯一定点医院', '瑞安专业医院'),
            ('德国蓝氧净疗杀菌技术', '华山妇科炎症治疗技术'),
            ('德国O3蓝氧净疗技术', '华山妇科炎症治疗技术'),
            ('权威', '专业'),
            ('临床经验超过40年', '临床经验超过20年'),
            ('汪爱云主任', '李医生'),
            ('王爱云主任', '李医生'),
            ('汪爱云', '李医生'),
            ('王爱云', '李医生'),
            ('陈汉娇', '李医生'),
            ('陈向宇', '李医生'),
            ('楼美丽', '李医生'),
            ('68元妇科六项套餐 关爱健康从体检开始', '0元妇科检查套餐 关爱健康从检查开始'),
            ('68元六大项妇科检查', '0元妇科检查套餐'),
            ('68元', '0元'),
            ('熊国伟', '曹医生'),
            ('董广胜', '曹医生'),
            ('李涛', '曹医生'),
            ('王益鑫', '曹医生'),
            ('包皮环切术只需580元是吗', '华山包皮环切术有优惠哦'),
        )
        rules_tail = (
            ('副主任医师/博士后', ''),
            ('男,泌尿外科副主任医师,医学博士后。在国内较早开展前列腺癌表观遗传、微小RNA的研究,现为上海泌尿男科学会青年会员。',
             ''),
            ('副主任医师/博士后', ''),
            ('上海同济医院泌尿外科', '泌尿外科'),
            ('副教授', ''),
            ('博士后', ''),
            ('公立甲等', ''),
            ('沪浙', ''),
            ('著名', ''),
            ('沪浙', ''),
            ('沪浙', ''),
            ('李医生、李医生、李医生主任', '李医生'),
            ('白求恩基金会携手', ''),
            ('白求恩基金会', ''),

            ('40年', '20年'),
            (',被评为国家示范妇科科研基地、国家科学技术进步奖二等奖,不孕不育重点诊疗基地、全国十佳妇科医院,更是连续10年被评为国家A级妇科医院',
             ''),
            ('阴茎背神经选择性切断术', '华山早泄治疗术'),
            ('阴茎助勃器植入术', '华山阳痿治疗术'),
            ('检查价格仅需30元', '常规检查价格0元'),
            ('30元', '0元'),
        )

        for old, new in rules_head:
            txt = txt.replace(old, new)

        # Prefix the procedure name with the brand only where it is not already
        # there; a plain replace would turn '华山包皮环切术' (including the
        # output of the rule just above) into '华山华山包皮环切术'.
        txt = re.sub(r'(?<!华山)包皮环切术', '华山包皮环切术', txt)

        for old, new in rules_tail:
            txt = txt.replace(old, new)

        # Replace every <img> tag that has a quoted src attribute with one
        # fixed image wrapped in a nofollow link to /swt.
        txt = re.sub(
            r"<img[^>]+src\s*=(\s*)['\"]([^'\"]+)['\"][^>]*>",
            "<a href=\"/swt\" rel=\"nofollow\"><img src=\"http://m.wxhs120.com/uploadfile/2015/1010/20151010085553617.gif\" /></a>",
            txt)
        return txt