Exemplo n.º 1
0
 def __init__(self, from_addr, to_addr, cc_addr, smtp_server,
              password):
     """Store the mail account settings used by the send methods.

     Args:
         from_addr: sender account (also used as the SMTP login name).
         to_addr: comma-separated recipient addresses.
         cc_addr: comma-separated CC addresses.
         smtp_server: SMTP server host name.
         password: sender password / auth token.
     """
     # Plain attribute assignments cannot raise, so the original
     # try/except wrapper around them was dead code and is removed.
     self.from_addr = from_addr
     self.to_addr = to_addr
     self.cc_addr = cc_addr
     self.smtp_server = smtp_server
     self.password = password
Exemplo n.º 2
0
 def init_mail_content(self, mail_content, mail_title):
     """Build a plain-text MIME message and store it on ``self.msg``.

     Any failure is logged through ``crawler.log`` instead of being
     propagated to the caller.
     """
     try:
         self.msg = MIMEText(mail_content, 'plain', 'utf-8')
         headers = self.msg
         headers['From'] = formataddr(["招标自动抓取系统", self.from_addr])
         headers['To'] = self.to_addr
         headers['Cc'] = self.cc_addr
         headers['Subject'] = Header(mail_title, 'utf-8').encode()
         # Local timestamp with an explicit +0800 (CST) timezone suffix.
         stamp = time.strftime("%Y-%m-%d %H:%M:%S")
         headers['Date'] = stamp + ' +0800'
     except Exception as err:
         crawler.log('邮件内容初始化异常', 0)
         crawler.log(err, 0)
Exemplo n.º 3
0
 def send_mail(self, mail_content, mail_title):
     """Send ``mail_content`` as a plain-text mail titled ``mail_title``.

     Connects to the configured SMTP server over SSL (port 465), logs in
     as the sender, and delivers to every To/Cc address. Errors are
     logged via ``crawler.log`` rather than re-raised.
     """
     try:
         self.init_mail_content(mail_content, mail_title)
         server = smtplib.SMTP_SSL(self.smtp_server, 465)
         try:
             server.login(self.from_addr, self.password)
             # sendmail() needs the full recipient list; the To/Cc
             # fields hold comma-separated address strings.
             server.sendmail(self.from_addr,
                             self.to_addr.split(',') + self.cc_addr.split(','),
                             self.msg.as_string())
         finally:
             # Always close the connection — the original leaked the
             # socket whenever login/sendmail raised.
             server.quit()
     except Exception as err:
         crawler.log('邮件发送动作异常', 0)
         crawler.log(err, 0)
Exemplo n.º 4
0
    def init_mail_content_with_file(self, mail_content, mail_title):
        """Build a multipart message with the four daily .xls reports attached.

        Note: ``mail_content`` is accepted for interface symmetry but the
        body text is the fixed string below (matches original behavior).
        """
        try:
            self.msg = MIMEMultipart()
            self.msg['Subject'] = Header(mail_title, 'utf-8').encode()
            self.msg['From'] = formataddr(["招标自动抓取系统", self.from_addr])
            self.msg['To'] = self.to_addr
            self.msg['Cc'] = self.cc_addr
            # Local timestamp with an explicit +0800 (CST) timezone suffix.
            self.msg['Date'] = time.strftime(
                "%Y-%m-%d %H:%M:%S") + ' +0800'
            # Plain-text body part.
            puretext = MIMEText('运营商抓取信息,')
            self.msg.attach(puretext)

            # One .xls attachment per carrier, named <carrier><yymmdd>.xls,
            # attached in the same order as before (联通/电信/移动/铁塔).
            date_tag = str(time.strftime("%y%m%d"))
            for carrier in ('联通', '电信', '移动', '铁塔'):
                xls_name = carrier + date_tag + '.xls'
                # with-statement closes each report file — the original
                # leaked four open file handles.
                with open(xls_name, 'rb') as xls_file:
                    xlsxpart = MIMEApplication(xls_file.read())
                xlsxpart.add_header('Content-Disposition',
                                    'attachment',
                                    filename=xls_name)
                self.msg.attach(xlsxpart)
        except Exception as err:
            crawler.log('邮件内容初始化异常', 0)
            crawler.log(err, 0)
Exemplo n.º 5
0
def send_all(mail_content, mail_title):
    """Send the same mail through every configured mailer in ``mails``.

    A failure for one recipient is logged and does not stop delivery
    to the remaining recipients.
    """
    for mailer in mails:
        try:
            mailer.send_mail(mail_content, mail_title)
            # Log only the first quarter of the body as a preview.
            preview = mail_content[:int(len(mail_content) / 4)]
            crawler.log('发送邮件给 ' + mailer.to_addr + ':' + preview + '...')
        except Exception as err:
            crawler.log('发送给' + mailer.to_addr + '的邮件发送出现问题')
            crawler.log(err, 0)
Exemplo n.º 6
0
 def send_mail_with_file(self, mail_content, mail_title):
     """Send the attachment mail built by ``init_mail_content_with_file``.

     Connects over SMTP-SSL (port 465), delivers to every To/Cc address,
     and logs the outcome; errors are logged, not re-raised.
     """
     try:
         self.init_mail_content_with_file(mail_content, mail_title)
         server = smtplib.SMTP_SSL(self.smtp_server, 465)
         try:
             server.login(self.from_addr, self.password)
             server.sendmail(self.from_addr,
                             self.to_addr.split(',') + self.cc_addr.split(','),
                             self.msg.as_string())
         finally:
             # Always close the connection — the original leaked the
             # socket whenever login/sendmail raised.
             server.quit()
         crawler.log('附件邮件已发送', 0)
     except Exception as err:
         crawler.log('带附件邮件发送动作异常', 0)
         crawler.log(err, 0)
Exemplo n.º 7
0
def seach_telecom_project():
    """Scrape the China Telecom procurement site for new tender notices.

    New titles are recorded in ``telecom_project_list``, appended to the
    daily xls report, and key-business hits trigger a notification mail
    (with a stronger subject for key-province matches).
    """
    # China Telecom "sunshine procurement" announcement list.
    url = "https://caigou.chinatelecom.com.cn/MSS-PORTAL/announcementjoin/list.do?paging.pageSize=40"
    # Fetch the raw page. Bail out on failure — the original fell
    # through and hit a NameError on the unbound ``original_data``.
    try:
        res = requests.get(url, verify=False)
        original_data = res.text
    except Exception as err:
        crawler.log(err, 0)
        return

    # Whole <a ...>...</a> link fragments.
    link_rule = r'<a (.*?)/a>'
    # (id, category) pair quoted inside the onclick handler.
    url_rule = r"'(.*?)','(.*?)'"
    # Link title (tender project name).
    title_rule = '">(.*?)<'

    original_links = re.findall(link_rule, original_data, re.S | re.M)
    try:
        for value in original_links:
            url = re.findall(url_rule, value, re.S | re.M)
            title = re.findall(title_rule, value, re.S | re.M)
            if len(title) and len(url):
                if not (title[0] in telecom_project_list):
                    url = list(url[0])
                    telecom_project_list[title[0]] = url[0]
                    temp_text = "https://caigou.chinatelecom.com.cn/MSS-PORTAL/account/viewad.do?category=" + url[
                        1] + "&id=" + url[0]
                    txt = r'招标信息:' + title[0] + ' ,\n链接:' + temp_text
                    logtxt = r'招标信息:' + title[0]
                    crawler.log(logtxt, 0)
                    this_province = crawler.finderX(all_province_keywords, txt)
                    this_genre = crawler.finderX(business_keywords, txt)
                    # Every new notice goes to sheet 0 of the daily report.
                    crawler.save_xls('电信' + str(time.strftime("%y%m%d")),
                                     0,
                                     province=this_province,
                                     genre=this_genre,
                                     business=title[0],
                                     url=temp_text,
                                     time=str(time.strftime("%Y-%m-%d")))
                    # Key-business notices also go to sheet 1 and are
                    # mailed out; key-province hits get a stronger subject
                    # (dedupes the two identical save_xls branches).
                    if crawler.finderX(business_keywords, txt):
                        crawler.save_xls(
                            '电信' + str(time.strftime("%y%m%d")),
                            1,
                            genre=this_genre,
                            province=this_province,
                            business=title[0],
                            url=temp_text,
                            time=str(time.strftime("%Y-%m-%d")))
                        if crawler.finderX(province_keywords, txt):
                            send_all(txt, '可能是重点省分重点项目 请注意')
                        else:
                            send_all(txt, '可能是重点项目 请注意')
    except Exception as err:
        crawler.log(err, 0)
    crawler.log("完成抓取,目前【电信】共抓取:" + str(len(telecom_project_list)) + "条记录")
Exemplo n.º 8
0
def seach_chinatower_project():
    """Scrape the China Tower procurement JSON endpoint for new notices.

    New titles are recorded in ``chinatower_project_list``, appended to
    the daily xls report, and key-business hits trigger a mail.
    """
    # China Tower procurement notice-query endpoint (JSON POST).
    tower_url = 'http://www.tower.com.cn/default/main/index/cn.chinatowercom.obp.main.index.obphomepage.queryNoticeDetails.biz.ext'
    tower_headers = {
        'Host': 'www.tower.com.cn',
        'Connection': 'keep-alive',
        'Content-Length': '229',
        'Origin': 'http://www.tower.com.cn',
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
        'Content-Type': 'application/json; charset=UTF-8',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'X-Requested-With': 'XMLHttpRequest',
        'Referer':
        'http://www.tower.com.cn/default/main/index/noticedetail.jsp?_operation=notice&_purchaseNoticeType=2&_resultsNoticeType=2',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8',
    }
    tower_payload = '{"noticeTitle":"","effectTime":"","failureTime":"","noticeType":"null","purchaseNoticeType":"2","resultsNoticeType":"2","level":"","pageIndex":0,"pageSize":50,"sortField":"failure_time","sortOrder":"desc","page":{"begin":0,"length":50}}'

    # Fetch the notice list. Bail out on failure — the original fell
    # through and hit a NameError on the unbound ``original_links``.
    try:
        original_links = get_tower(tower_url, tower_headers, tower_payload)
    except Exception as err:
        crawler.log(err, 0)
        return

    try:
        for value in original_links:
            # Wrap id/title in lists so the loop body matches the other
            # scrapers' shape.
            url = [value['id']]
            title = [value['notice_title']]
            if len(title) and len(url):
                if not (title[0] in chinatower_project_list):
                    chinatower_project_list[title[0]] = url[0]
                    temp_text = "http://www.tower.com.cn/default/main/index/noticedetail.jsp?_operation=notice&_notice=6&_id="
                    temp_text = temp_text + str(
                        chinatower_project_list[title[0]])
                    txt = r'招标信息:' + title[0] + ' ,\n链接:' + temp_text
                    logtxt = r'招标信息:' + title[0]
                    crawler.log(logtxt, 0)
                    this_province = crawler.finderX(all_province_keywords, txt)
                    this_genre = crawler.finderX(business_keywords, txt)
                    # Every new notice goes to sheet 0 of the daily report.
                    crawler.save_xls('铁塔' + str(time.strftime("%y%m%d")),
                                     0,
                                     province=this_province,
                                     genre=this_genre,
                                     business=title[0],
                                     url=temp_text,
                                     time=str(time.strftime("%Y-%m-%d")))
                    # Key-business notices also go to sheet 1 and are
                    # mailed; key-province hits get a stronger subject
                    # (dedupes the two identical save_xls branches).
                    if crawler.finderX(business_keywords, txt):
                        crawler.save_xls(
                            '铁塔' + str(time.strftime("%y%m%d")),
                            1,
                            genre=this_genre,
                            province=this_province,
                            business=title[0],
                            url=temp_text,
                            time=str(time.strftime("%Y-%m-%d")))
                        if crawler.finderX(province_keywords, txt):
                            send_all(txt, '可能是重点省分重点项目 请注意')
                        else:
                            send_all(txt, '可能是重点项目 请注意')
    except Exception as err:
        crawler.log(err, 0)
    crawler.log("完成抓取,目前【铁塔】共抓取:" + str(len(chinatower_project_list)) + "条记录")
Exemplo n.º 9
0
def seach_chinamobile_project():
    """Scrape the China Mobile B2B site (via PhantomJS) for new notices.

    New titles are recorded in ``chinamobile_project_list``, appended to
    the daily xls report, and key-business hits trigger a mail.
    """
    # China Mobile vendor-notice listing (JS-rendered, needs a browser).
    url = "https://b2b.10086.cn/b2b/main/listVendorNotice.html?noticeType=2"
    # Render the page with PhantomJS. Bail out on failure — the original
    # fell through and hit a NameError on the unbound ``original_data``.
    try:
        if platform.platform().find('Linux') >= 0:
            drive = webdriver.PhantomJS(executable_path=r"phantomjs")
        elif platform.platform().find('Windows') >= 0:
            drive = webdriver.PhantomJS(
                executable_path=r"phantomjs-2.1.1-windows/bin/phantomjs.exe")
        else:
            drive = webdriver.PhantomJS()
        try:
            drive.get(url)
            # Give the page's JavaScript time to populate the table.
            time.sleep(5)
            html = drive.page_source.encode('utf-8')
        finally:
            # Always terminate the PhantomJS process — the original
            # leaked the browser when get()/page_source raised.
            drive.quit()
        original_data = html.decode('utf-8')
    except Exception as err:
        crawler.log(err, 0)
        return

    # Whole <tr class=...>...</tr> row fragments.
    link_rule = r'<tr class=(.*?)</tr>'
    # Notice id inside the selectResult(...) onclick handler.
    url_rule = r'selectResult(.*?)">'
    # Anchor text after "#this" (tender project name).
    title_rule = r'"#this"(.*?)</a>'

    original_links = re.findall(link_rule, original_data, re.S | re.M)
    try:
        for value in original_links:
            url = re.findall(url_rule, value, re.S | re.M)
            title = re.findall(title_rule, value, re.S | re.M)
            if len(title) and len(url):
                if not (title[0] in chinamobile_project_list):
                    chinamobile_project_list[title[0]] = url[0]
                    temp_text = "https://b2b.10086.cn/b2b/main/viewNoticeContent.html?noticeBean.id="
                    # Strip the "('" / "')" wrapper around the captured id.
                    temp_text = temp_text + chinamobile_project_list[
                        title[0]][2:-2]
                    # The captured title may still carry a title="..."
                    # attribute; extract the clean text either way.
                    title_text = title[0]
                    str1 = 'title='
                    if title_text.find(str1) > 0:
                        title_text = title_text[title_text.find(str1) +
                                                7:title_text.find('">')]
                    else:
                        title_text = title_text[1:]
                    txt = r'招标信息:' + title_text + ' ,\n链接:' + temp_text
                    logtxt = r'招标信息:' + title[0]
                    crawler.log(logtxt, 0)
                    this_province = crawler.finderX(all_province_keywords, txt)
                    this_genre = crawler.finderX(business_keywords, txt)
                    # Every new notice goes to sheet 0 of the daily report.
                    crawler.save_xls('移动' + str(time.strftime("%y%m%d")),
                                     0,
                                     province=this_province,
                                     genre=this_genre,
                                     business=title_text,
                                     url=temp_text,
                                     time=str(time.strftime("%Y-%m-%d")))
                    # Key-business notices also go to sheet 1 and are
                    # mailed; key-province hits get a stronger subject
                    # (dedupes the two identical save_xls branches).
                    if crawler.finderX(business_keywords, txt):
                        crawler.save_xls(
                            '移动' + str(time.strftime("%y%m%d")),
                            1,
                            genre=this_genre,
                            province=this_province,
                            business=title_text,
                            url=temp_text,
                            time=str(time.strftime("%Y-%m-%d")))
                        if crawler.finderX(province_keywords, txt):
                            send_all(txt, '可能是重点省分重点项目 请注意')
                        else:
                            send_all(txt, '可能是重点项目 请注意')
    except Exception as err:
        crawler.log(err, 0)
    crawler.log("完成抓取,目前【移动】共抓取:" + str(len(chinamobile_project_list)) + "条记录")
Exemplo n.º 10
0
def seach_chinaunicom_project():
    """Scrape the China Unicom bidding site for new tender notices.

    New titles are recorded in ``chinaunicom_project_list``, appended to
    the daily xls report, and key-business hits trigger a mail.
    """
    # China Unicom bidding info list (GBK-encoded page).
    url = "http://www.chinaunicombidding.cn/jsp/cnceb/web/info1/infoList.jsp?"

    # Fetch the raw page. Bail out on failure — the original fell
    # through and hit a NameError on the unbound ``original_data``.
    try:
        original_data = urllib.request.urlopen(url).read()
        original_data = original_data.decode('GBK')
    except Exception as err:
        crawler.log(err, 0)
        return

    # Whole <span ...>...</span> fragments.
    link_rule = r'<span (.*?)</span>'
    # Relative URL inside the window.open(...) handler.
    url_rule = r'window.open(.*?)","","height=600,width=900.*?'
    # title attribute (tender project name).
    title_rule = r"title='(.*?)'>.*?"

    original_links = re.findall(link_rule, original_data, re.S | re.M)
    try:
        for value in original_links:
            url = re.findall(url_rule, value, re.S | re.M)
            title = re.findall(title_rule, value, re.S | re.M)
            if len(title) and len(url):
                if not (title[0] in chinaunicom_project_list):
                    chinaunicom_project_list[title[0]] = url[0]
                    temp_text = "http://www.chinaunicombidding.cn/"
                    # Drop the leading "('/" of the captured handler arg.
                    temp_text = temp_text + str(
                        chinaunicom_project_list[title[0]])[3:]
                    txt = r'招标信息:' + title[0] + ' ,\n链接:' + temp_text
                    logtxt = r'招标信息:' + title[0]
                    crawler.log(logtxt, 0)
                    this_province = crawler.finderX(all_province_keywords, txt)
                    this_genre = crawler.finderX(business_keywords, txt)
                    # Every new notice goes to sheet 0 of the daily report.
                    crawler.save_xls('联通' + str(time.strftime("%y%m%d")),
                                     0,
                                     province=this_province,
                                     genre=this_genre,
                                     business=title[0],
                                     url=temp_text,
                                     time=str(time.strftime("%Y-%m-%d")))
                    # Key-business notices also go to sheet 1 and are
                    # mailed; key-province hits get a stronger subject
                    # (dedupes the two identical save_xls branches).
                    if crawler.finderX(business_keywords, txt):
                        crawler.save_xls(
                            '联通' + str(time.strftime("%y%m%d")),
                            1,
                            genre=this_genre,
                            province=this_province,
                            business=title[0],
                            url=temp_text,
                            time=str(time.strftime("%Y-%m-%d")))
                        if crawler.finderX(province_keywords, txt):
                            send_all(txt, '可能是重点省分重点项目 请注意')
                        else:
                            send_all(txt, '可能是重点项目 请注意')
    except Exception as err:
        crawler.log(err, 0)
    crawler.log("完成抓取,目前【联通】共抓取:" + str(len(chinaunicom_project_list)) + "条记录")
Exemplo n.º 11
0
                                1,
                                genre=this_genre,
                                province=this_province,
                                business=title[0],
                                url=temp_text,
                                time=str(time.strftime("%Y-%m-%d")))
                            send_all(txt, '可能是重点项目 请注意')  # 判断是否是重点类型项目
    except Exception as err:
        crawler.log(err, 0)
    crawler.log("完成抓取,目前【铁塔】共抓取:" + str(len(chinatower_project_list)) + "条记录")


if __name__ == "__main__":
    second = sleeptime(0, 3, 0)
    tmp = sleeptime(0, 0, 10)
    crawler.log('系统启动', 0)
    flag = 0
    while True:
        try:
            seach_telecom_project()  # 电信
        except Exception as err:
            crawler.log('抓取电信项目出现问题,请检查网络连接')
            crawler.log(err, 0)

        try:
            seach_chinaunicom_project()  # 联通
        except Exception as err:
            crawler.log('抓取联通项目出现问题,请检查网络连接')
            crawler.log(err, 0)

        try:
Exemplo n.º 12
0
def seach_telecom_project():
    """Scrape the (legacy) China Telecom procurement site for new notices.

    New titles are recorded in ``telecom_project_list``, appended to the
    daily xls report, and key-business hits trigger a notification mail.
    """
    # Legacy China Telecom "sunshine procurement" listing (GBK-encoded).
    url = "http://caigou.chinatelecom.com.cn:8001/ESCM/biddoc/getListProvince.do?paging.pageSize=40"

    # Fetch the raw page. Bail out on failure — the original fell
    # through and hit a NameError on the unbound ``original_data``.
    try:
        original_data = urllib.request.urlopen(url).read()
        original_data = original_data.decode('GBK')
    except Exception as err:
        crawler.log(err, 0)
        return

    # Whole <a ...>...</a> link fragments.
    link_rule = r'<a (.*?)</a>'
    # href of links that open in a new tab.
    url_rule = r'href="(.*?)" target="_blank".*?'
    # title attribute (tender project name).
    title_rule = r'.*?target="_blank".*?title="(.*?)">.*?'

    original_links = re.findall(link_rule, original_data, re.S | re.M)
    try:
        for value in original_links:
            url = re.findall(url_rule, value, re.S | re.M)
            title = re.findall(title_rule, value, re.S | re.M)
            if len(title) and len(url):
                if not (title[0] in telecom_project_list):
                    telecom_project_list[title[0]] = url[0]
                    temp_text = "http://caigou.chinatelecom.com.cn:8001"
                    temp_text = temp_text + telecom_project_list[title[0]]
                    txt = r'招标信息:' + title[0] + ' ,\n链接:' + temp_text
                    logtxt = r'招标信息:' + title[0]
                    crawler.log(logtxt, 0)
                    this_province = crawler.finderX(all_province_keywords, txt)
                    this_genre = crawler.finderX(business_keywords, txt)
                    # Every new notice goes to sheet 0 of the daily report.
                    crawler.save_xls('电信' + str(time.strftime("%y%m%d")),
                                     0,
                                     province=this_province,
                                     genre=this_genre,
                                     business=title[0],
                                     url=temp_text,
                                     time=str(time.strftime("%Y-%m-%d")))
                    # Key-business notices also go to sheet 1 and are
                    # mailed; key-province hits get a stronger subject
                    # (dedupes the two identical save_xls branches).
                    if crawler.finderX(business_keywords, txt):
                        crawler.save_xls(
                            '电信' + str(time.strftime("%y%m%d")),
                            1,
                            genre=this_genre,
                            province=this_province,
                            business=title[0],
                            url=temp_text,
                            time=str(time.strftime("%Y-%m-%d")))
                        if crawler.finderX(province_keywords, txt):
                            send_all(txt, '可能是重点省分重点项目 请注意')
                        else:
                            send_all(txt, '可能是重点项目 请注意')
    except Exception as err:
        crawler.log(err, 0)
    crawler.log("完成抓取,目前【电信】共抓取:" + str(len(telecom_project_list)) + "条记录")