Exemplo n.º 1
0
 def parse_one_page(self, *params):
     """Yield CCF price records for a fixed list of monitored indicators.

     Args:
         *params: only ``params[0]`` is used. Its items 0 and 1 are the
             start and end dates (compared on their first 7 characters,
             presumably 'YYYY-MM' of a 'YYYY-MM-DD' string -- confirm with
             the caller) and item 3 is the string form of a cookie dict,
             parsed with ``ast.literal_eval``.

     Yields:
         dict: ``{'index': str, 'erdat': 'YYYYMMDD', 'sl': int}``.

     The whole scrape is attempted at most twice. If the second attempt
     also fails, a message is printed and a WeChat alert is sent.
     NOTE(review): a retry restarts from the first indicator, so records
     yielded before the failure are yielded again.
     """
     # Monitor ids: 3 (domestic PTA), 4 (overseas PTA), 5 (domestic MEG spot),
     # 6 (overseas MEG), 9 (polyester DTY 150D/48F), 10 (direct-spun semi-dull
     # POY 150D/48F), 12 (direct-spun semi-dull FDY 150D/96F), 13 (1.4D
     # direct-spun polyester staple), 14 (semi-dull polyester chip),
     # 15 (East China polyester chip), 39 (bright polyester chip).
     monitor_ids = (3, 4, 5, 6, 9, 10, 12, 13, 14, 15, 39)
     url = 'http://www.ccf.com.cn/dynamic_graph/getPrice.php'
     # Filled with (table row, table column).
     cell_xpath = '//div[@class="box_products_txt"]/table/tr[%s]/td[%s]/text()'

     def compact_date(text):
         # Keep characters 0-3, 5-6 and 8-9, i.e. drop the two separators.
         text = text.strip()
         return str(text[0:4] + text[5:7] + text[8:10])

     counter = 1
     while counter <= 2:
         try:
             for monitor_id in monitor_ids:
                 data = {
                     "Monitor_IDs": "a_210000_3,a_210000_4,a_220000_5,a_220000_6",
                     "monitorId": monitor_id,
                     "startdate": params[0][0],
                     "enddate": params[0][1],
                     "type": "dd"
                 }
                 cookies = ast.literal_eval(params[0][3])
                 response = self.post_one_page(url, self._headers, data, cookies)
                 html2 = etree.HTML(response)
                 # Random pause between requests.
                 sleeptime = random.randint(15, 30)
                 print('\033[37;40m-------------------获取指标%s价格数据休眠%s秒------------------\033[0m' % (monitor_id, sleeptime))
                 time.sleep(sleeptime)
                 if str(params[0][0])[:7] == str(params[0][1])[:7]:
                     # Same leading 7 characters: read table rows 2 and 3,
                     # stopping at the first row that is missing.
                     for row in (2, 3):
                         index = html2.xpath(cell_xpath % (row, 1))
                         erdat = html2.xpath(cell_xpath % (row, 2))
                         sl = html2.xpath(cell_xpath % (row, 3))
                         if not index:
                             break
                         yield {
                             'index': str(index[0]),
                             'erdat': compact_date(erdat[0]),
                             'sl': int(sl[0])
                         }
                 else:
                     # Otherwise emit every text node found in row 2.
                     index = html2.xpath(cell_xpath % (2, 1))
                     erdat = html2.xpath(cell_xpath % (2, 2))
                     sl = html2.xpath(cell_xpath % (2, 3))
                     for pos, name in enumerate(index):
                         yield {
                             'index': str(name),
                             'erdat': compact_date(erdat[pos]),
                             'sl': int(sl[pos])
                         }
             break
         except Exception:
             # Not a bare ``except``: GeneratorExit (raised at a ``yield`` when
             # the consumer closes the generator) and KeyboardInterrupt must
             # propagate instead of triggering a retry or an alert.
             if counter == 2:
                 print('\033[1;37;41m----------------------未取到CCF价格数据-----------------------\033[0m')
                 wechat_auto().send_mesg(0, 'person', 6)
         counter = counter + 1
Exemplo n.º 2
0
def question_search(date):
    """Check the first page of the daily production-indicator PDF for '#'.

    Args:
        date: value interpolated into the PDF file name
            (``<dir>\\<date> 各装置主要生产日指标汇总表.pdf``).

    Returns:
        str: '出现特殊字符' if any table row (from the sixth row on) contains
        a '#' character, in which case a WeChat message is also sent;
        otherwise '未出现特殊字符'.

    Raises:
        Whatever ``pdfplumber.open`` raises when the file cannot be opened.
    """
    file_dir = 'D:\\各装置主要生产日指标汇总'
    # Backslash escaped explicitly; the original '\%' was an invalid escape.
    path = file_dir + '\\%s 各装置主要生产日指标汇总表.pdf' % date
    # Read the tables on the first page of the PDF.
    with pdfplumber.open(path) as pdf:
        first_page = pdf.pages[0]
        for table in first_page.extract_tables():  # this PDF currently has one table
            df = pd.DataFrame(table[5:])
            for row in df.values:
                # Plain substring test. The original called
                # re.search(row_text, '#') with the arguments swapped, which
                # compiled the row text as a regular expression.
                if '#' in str(row):
                    wechat_auto().send_mesg(0, 'person', 0)
                    return '出现特殊字符'
        return '未出现特殊字符'
Exemplo n.º 3
0
 def parse_one_page(self, *params):
     """Yield CCF index records (inventory, load and downstream series).

     Args:
         *params: only ``params[0]`` is used. Item 1 is the end date sent to
             the server and item 3 is the string form of a cookie dict,
             parsed with ``ast.literal_eval``. The start date is fixed at
             "2019-06-01".

     Yields:
         dict: ``{'index': str, 'erdat': str, 'sl': str}`` built from the
         first three cells of every matched table row.

     The whole scrape is attempted at most twice. If the second attempt
     also fails, a message is printed and a WeChat alert is sent.
     """
     # kczs: CCF inventory index  292000 POY / 290000 FDY / 291000 DTY /
     #       280000 polyester staple fibre inventory
     # fhzs: load index  210000 PTA / 222000 MEG (total) / 223000 coal-based
     #       MEG / 220000 polyester / 230000 direct-spun filament /
     #       240000 direct-spun staple / 274000 polyester bottle chip
     # xyzs: CCF downstream index  110000 Jiangsu-Zhejiang loom operating
     #       rate / 150000 Jiangsu-Zhejiang texturing operating rate
     series = (
         ('kczs', 292000), ('kczs', 290000), ('kczs', 291000), ('kczs', 280000),
         ('fhzs', 210000), ('fhzs', 222000), ('fhzs', 223000), ('fhzs', 220000),
         ('fhzs', 230000), ('fhzs', 240000), ('fhzs', 274000),
         ('xyzs', 110000), ('xyzs', 150000),
     )
     url = 'http://www.ccf.com.cn/dynamic_graph/index.php'
     # One table row with five centred cells; only the first three are used.
     row_pattern = re.compile(
         '<tr .*?><td align=center>(.*?)</td>.*?<td align=center>(.*?)</td>.*?<td align=center>(.*?)</td>.*?<td align=center>(.*?)</td>.*?<td align=center>(.*?)</td></tr>',
         re.S)
     counter = 1
     while counter <= 2:
         try:
             for prod_class, prod_id in series:
                 data = {
                     "ProdClass": prod_class,
                     "ProdID": prod_id,
                     "startDate": "2019-06-01",
                     "endDate": params[0][1]
                 }
                 cookies = ast.literal_eval(params[0][3])
                 response = self.post_one_page(url, {}, data, cookies)
                 # Random pause between requests.
                 sleeptime = random.randint(10, 15)
                 print('\033[37;40m-------------------获取库存数据休眠', sleeptime, '秒------------------\033[0m')
                 time.sleep(sleeptime)
                 items = row_pattern.findall(response)
                 if not items:
                     # The original compared each match tuple with [], which
                     # can never be true; the diagnostic is reported here when
                     # the page yields no rows at all.
                     print('\033[1;37;41m----------------------总库存逻辑异常-----------------------\033[0m')
                 for item in items:
                     day = item[1].strip()
                     yield {
                         'index': item[0],
                         'erdat': day[0:4] + day[5:7] + day[8:10],
                         'sl': item[2]
                     }
             break
         except Exception:
             # Not a bare ``except``: GeneratorExit and KeyboardInterrupt must
             # propagate instead of triggering a retry or an alert.
             if counter == 2:
                 print('\033[1;37;41m----------------------未取到库存指数数据-----------------------\033[0m')
                 wechat_auto().send_mesg(0, 'person', 10)
         counter = counter + 1
Exemplo n.º 4
0
 def parse_one_page(self, *params):
     """Yield the ICIS daily average price series labelled '冰醋酸中间价'.

     Args:
         *params: only ``params[0]`` is used. Items 0 and 1 are the start and
             end dates ('-' is replaced by '/' before sending). Item 5 is a
             pair: ``[0]`` is passed as the request cookies and ``[1]`` as the
             ``__RequestVerificationToken`` form field.

     Yields:
         dict: ``{'index': '冰醋酸中间价', 'erdat': str, 'sl': price}``, one
         per entry of ``chartLines[0].pointList`` in the JSON response.

     The request is attempted at most twice. If the second attempt also
     fails, a message is printed and a WeChat alert is sent.
     """
     data = {
         '__RequestVerificationToken': params[0][5][1],
         'StartDate': str(params[0][0]).replace('-', '/'),
         'EndDate': str(params[0][1]).replace('-', '/'),
         "FrequencyCode": "Daily",
         "IncludePrePublishedPrices": "false",
         "SelectedQuotes[0][Id]": "{i}petchem/8602792",
         "SelectedQuotes[0][PriceOption]": "Average",
         "SelectedQuotes[0][UsePrimaryYAxis]": "true",
         "PrimaryYAxisUnitCode": "ZZZ",
         "PrimaryYAxisCurrencyCode": "ZZZ",
         "SecondaryYAxisUnitCode": "ZZZ",
         "SecondaryYAxisCurrencyCode": 'ZZZ',
         "isFormulaRequest": "false",
         "preEntitledWorkspaceName": '',
     }
     headers = {
         "Accept": "application/json, text/javascript, */*; q=0.01",
         "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
         "User-Agent": "Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14",
         "X-Requested-With": "XMLHttpRequest",
     }
     url = 'https://www.icis.com/Dashboard/PurchasedPriceHistory/DisplayChartDualYAxis'
     counter = 1
     while counter <= 2:
         try:
             response3 = requests.post(url, data=data, headers=headers, cookies=params[0][5][0], timeout=120)
             points = response3.json()['chartLines'][0]['pointList']
             for point in points:
                 price = point['pointPrice']
                 day = point['pointDateString'].replace('-', '')
                 yield {
                     'index': '冰醋酸中间价',
                     'erdat': str(day),
                     'sl': price
                 }
             break
         except Exception:
             # Not a bare ``except``: GeneratorExit and KeyboardInterrupt must
             # propagate instead of triggering a retry or an alert.
             if counter == 2:
                 print('\033[1;37;41m----------------------未取到冰醋酸中间价-----------------------\033[0m')
                 wechat_auto().send_mesg(0, 'person', 12)
         counter = counter + 1
Exemplo n.º 5
0
 def parse_one_page(self, *params):
     """Yield MEG shipment volumes scraped from the CCF news list.

     Args:
         *params: only ``params[0][3]`` is used: the string form of a cookie
             dict, parsed with ``ast.literal_eval``.

     Yields:
         dict: ``{'index': 'MEG发货量', 'erdat': 'YYYYMMDD', 'sl': str}``,
         where ``erdat`` is the date taken from the article URL minus one day.

     The first three matching articles of list page 1 are fetched. The whole
     scrape is attempted at most twice. If the second attempt also fails
     (including when fewer than three articles are listed), a message is
     printed and a WeChat alert is sent.
     """
     # Links whose title looks like "<m>月<d>日MEG发货统计". re.S is
     # deliberately not set, so '.' does not match newlines.
     link_pattern = re.compile(
         r'<li(?: | class=articlebreak )><span>.*?href="(.*?)" class="h1a2" target="_blank" onmouseover="return overlib.*?>\d+月\d+日MEG发货统计</a></li>')
     volume_pattern = re.compile('<div id=newscontent>.*?MEG发货量在(.*?)吨.*?</div>', re.S)
     counter = 1
     while counter <= 2:
         try:
             for j in ('1',):  # list pages to scrape (currently only page 1)
                 url = 'http://www.ccf.com.cn/newscenter/index.php?cur_row_pos=0&cur_pg_num=' + j + "&Class_ID=1C0000"
                 cookies = ast.literal_eval(params[0][3])
                 response = self.get_one_page(url, {}, [], cookies)
                 sleeptime = random.randint(10, 15)
                 print('\033[37;40m-------------------获取MEG发货统计url休眠', sleeptime, '秒------------------\033[0m')
                 time.sleep(sleeptime)
                 items = re.findall(link_pattern, response)
                 for i in range(0, 3):
                     # Indexing (not slicing) on purpose: fewer than three
                     # links raises IndexError and goes through retry/alert.
                     link = items[i]
                     detail_url = 'http://www.ccf.com.cn' + link
                     # First 8 characters of the article id are the date.
                     stamp = re.findall('/newscenter/detail-1C0000-(.*?).shtml', link)[0][:8]
                     # Move the date back by one day and keep it as YYYYMMDD.
                     day = datetime.datetime.strptime(stamp, '%Y%m%d') - datetime.timedelta(days=1)
                     data = day.strftime('%Y%m%d')
                     page = requests.get(detail_url, headers={}, cookies=cookies)
                     sleeptime = random.randint(30, 45)
                     print('\033[37;40m-------------------获取新一条MEG发货统计数据休眠', sleeptime, '秒------------------\033[0m')
                     time.sleep(sleeptime)
                     page.encoding = 'gbk'
                     items2 = re.findall(volume_pattern, page.text)[:2]
                     if not items2:
                         # The original compared each matched string with [],
                         # which can never be true; report the anomaly when
                         # the article contains no volume figure.
                         print('\033[1;37;41m----------------------MEG发货量逻辑异常-----------------------\033[0m')
                     for item in items2:
                         yield {
                             'index': 'MEG发货量',
                             'erdat': data,
                             'sl': item
                         }
             break
         except Exception:
             # Not a bare ``except``: GeneratorExit and KeyboardInterrupt must
             # propagate instead of triggering a retry or an alert.
             if counter == 2:
                 print('\033[1;37;41m----------------------未取到库存/港口数据-----------------------\033[0m')
                 wechat_auto().send_mesg(0, 'person', 8)
         counter = counter + 1
Exemplo n.º 6
0
 def parse_one_page(self, *params):
     """Yield the MEG main-port inventory figure from CCF news articles.

     Args:
         *params: only ``params[0][3]`` is used: the string form of a cookie
             dict, parsed with ``ast.literal_eval``.

     Yields:
         dict: ``{'index': 'MEG主港库存', 'erdat': 'YYYYMMDD', 'sl': str}``,
         where ``erdat`` is the first 8 characters of the article id and
         ``sl`` is the figure preceding "万吨" in the article text.

     The first three matching articles of list page 1 are fetched. The whole
     scrape is attempted at most twice. If the second attempt also fails
     (fewer than three articles, or an article without the expected
     sentence), a message is printed and a WeChat alert is sent.
     """
     # Links whose title contains "MEG港口库存". re.S is deliberately not set,
     # so '.' does not match newlines.
     link_pattern = re.compile(
         '<li(?: | class=articlebreak )><span>.*?href="(.*?)" class="h1a2" target="_blank" onmouseover="return overlib.*?MEG港口库存.*?</a></li>')
     stock_pattern = re.compile('<p(?:| style="text-indent: 2em;")>(?:CCF讯|受封航影响).*?今日华东主港地区(.*?)约(.*?)万吨.*?</p>')
     counter = 1
     while counter <= 2:
         try:
             for j in ('1',):  # list pages to scrape (currently only page 1)
                 url = 'http://www.ccf.com.cn/newscenter/index.php?cur_row_pos=0&cur_pg_num=' + j + "&Class_ID=1C0000"
                 cookies = ast.literal_eval(params[0][3])
                 response = self.get_one_page(url, {}, [], cookies)
                 sleeptime = random.randint(10, 15)
                 print('\033[37;40m-------------------获取MEG港口库存url休眠', sleeptime, '秒------------------\033[0m')
                 time.sleep(sleeptime)
                 items = re.findall(link_pattern, response)
                 for i in range(0, 3):
                     # Indexing (not slicing) on purpose: fewer than three
                     # links raises IndexError and goes through retry/alert.
                     link = items[i]
                     detail_url = 'http://www.ccf.com.cn' + link
                     # First 8 characters of the article id are the date.
                     data = re.findall('/newscenter/detail-1C0000-(.*?).shtml', link)[0][:8]
                     page = requests.get(detail_url, cookies=cookies)
                     sleeptime = random.randint(30, 45)
                     print('\033[37;40m-------------------获取新一条MEG港口库存数据休眠', sleeptime, '秒------------------\033[0m')
                     time.sleep(sleeptime)
                     page.encoding = 'gbk'
                     matches = re.findall(stock_pattern, page.text)
                     if not matches:
                         # The original indexed [0][1] first, so its "== []"
                         # check could never fire. Print the diagnostic, then
                         # fail the attempt exactly as the IndexError did.
                         print('\033[1;37;41m----------------------MEG主港库存逻辑异常-----------------------\033[0m')
                         raise LookupError('MEG port inventory sentence not found')
                     yield {
                         'index': 'MEG主港库存',
                         'erdat': data,
                         'sl': matches[0][1]
                     }
             break
         except Exception:
             # Not a bare ``except``: GeneratorExit and KeyboardInterrupt must
             # propagate instead of triggering a retry or an alert.
             if counter == 2:
                 print('\033[1;37;41m----------------------未取到MEG港口库存数据-----------------------\033[0m')
                 wechat_auto().send_mesg(0, 'person', 9)
         counter = counter + 1
Exemplo n.º 7
0
def main():
    """Collect market values and insert them into "COMMON"."XFM_MARKT".

    Builds the parameter list ``[start date, today, year-month, ccf cookie,
    ccfei cookie, icis cookie]``, passes it to
    ``get_market_values().parse_concurrent`` and hands the resulting rows to
    ``sql().sql_req`` together with the insert statement. Any exception is
    printed and reported through a WeChat message; nothing is re-raised.
    """
    try:
        # Read the clock once so all three strings describe the same day.
        current = datetime.date.today()
        today = current.strftime('%Y-%m-%d')
        ny = current.strftime('%Y-%m')
        # Start of the query window: four days before today.
        yeday = (current - datetime.timedelta(days=4)).strftime('%Y-%m-%d')
        # Cookies for each source; the first two are passed on as strings.
        ccf_cookie = str(get_market_values().ccf_cookie())
        ccfei_cookie = str(get_market_values().ccfei_cookie())
        icis_cookie = get_market_values().icis_cookie()

        param = [yeday, today, ny, ccf_cookie, ccfei_cookie, icis_cookie]
        sqltuple = get_market_values().parse_concurrent(param)
        # Open the connection and insert the rows.
        conn = get_connection().hana_connection()
        try:
            # NOTE(review): the statement uses '%s' placeholders; how sql_req
            # substitutes them is not visible here -- confirm values are
            # bound or escaped rather than string-formatted.
            ins_sql = '''insert into"COMMON"."XFM_MARKT" (ZB , ERDAT ,SPJ) values ('%s','%s',%s)'''
            sql().sql_req(conn, tuple(sqltuple), sql=ins_sql)
        finally:
            # Close the connection even when the insert fails.
            get_connection().close_connection(conn)
    except Exception as e:
        print(e)
        wechat_auto().send_mesg(0, 'person', 13)
Exemplo n.º 8
0
def ie_autoload():
    """Drive Internet Explorer to log in and trigger the report computation.

    Opens the CAS login page, clicks the login input, opens the dispatcher
    page for ``yesterday2`` (a module-level value defined outside this
    function), clicks ``btnCompute`` inside the ``left`` frame and waits for
    that button to be present again. On any error a WeChat message is sent.
    The browser is always closed.
    """
    ie_option = IOptions()
    # Browser start-up switches.
    ie_option.add_argument('--disable-gpu')
    ie_option.add_argument('--no-sandbox')
    ie_option.add_argument('--disable-dev-shm-usage')
    ie_driver = webdriver.Ie(options=ie_option)
    try:
        # Log in so the session cookie is stored.
        url1 = 'https://auth.xfmgroup.com/cas/login?' \
               'service=http%3a%2f%2fmes.xfmgroup.com%2fWebUI%2fIPWeb%2f'
        ie_driver.get(url1)
        time.sleep(5)
        ie_driver.find_element_by_xpath('//*[@id="fm1"]/div/div[3]/input[4]').click()
        time.sleep(5)
        # Open the page on which the computation is started.
        url2 = 'http://fbbb.xfmgroup.com/FBBB/Integrates/DispatcherIn.aspx?' \
               'funcid=00010013&DATE=%s' % yesterday2
        ie_driver.get(url2)

        # Wait for the "left" frame to load, then switch into it.
        WebDriverWait(ie_driver, 30).until(EC.presence_of_element_located(
            (By.XPATH, '//frame[@name="left"]')))
        ie_driver.switch_to.frame('left')
        time.sleep(1)
        ie_driver.find_element_by_id('btnCompute').click()

        # Wait until the compute button is present again.
        WebDriverWait(ie_driver, 30).until(EC.presence_of_element_located(
            (By.XPATH, '//input[@id="btnCompute"]')))
    except Exception:
        # Report the failure. KeyboardInterrupt/SystemExit are no longer
        # swallowed as they were by the original bare ``except``.
        wechat_auto().send_mesg(0, 'person', 5)
    finally:
        # Single exit point for closing the browser on success and failure.
        ie_driver.quit()
0
def chrome_autoload():
    """Download yesterday's production-indicator report as PDF via headless Chrome.

    Logs in to the BOE OpenDocument page for ``yesterday`` (a module-level
    value defined outside this function), waits for the report to finish
    loading, exports it and renames the downloaded file to
    ``<yesterday> 各装置主要生产日指标汇总表.pdf``.

    Returns:
        str: '下载完成' on success; '未成功计算' when the alert dialog is
        present or the export/rename step fails; '异常退出' on any other
        error (a WeChat message is sent in that case).

    The browser is always closed before returning.
    """
    chrome_option = COptions()
    file_dir = 'D:\\各装置主要生产日指标汇总'
    # 0 disables pop-up windows; downloads go to file_dir.
    prefs = {'profile.default_content_settings.popups': 0,
             'download.default_directory': file_dir}
    chrome_option.add_experimental_option('prefs', prefs)
    chrome_option.add_argument('--headless')
    chrome_driver = webdriver.Chrome(options=chrome_option)
    try:
        # Open the report and log in.
        url2 = 'http://192.168.2.81:8080/BOE/OpenDocument/opendoc/openDocument.jsp?sIDType=CUID' \
               '&iDocID=AcoYQKP5WQlNng0bCsVeAcw&lsSERDAT=%s' % yesterday
        chrome_driver.get(url2)
        time.sleep(1)
        # NOTE(review): credentials are hard-coded in source; consider moving
        # them to configuration or a secret store.
        chrome_driver.find_element_by_id('_id0:logon:USERNAME').send_keys('BI_USER')
        chrome_driver.find_element_by_id('_id0:logon:PASSWORD').send_keys('Xfm@2019')
        chrome_driver.find_element_by_id('_id0:logon:logonButton').click()

        # Wait for and enter the opendoc frame, then the nested webi frame.
        WebDriverWait(chrome_driver, 30).until(EC.presence_of_element_located(
            (By.XPATH, '//iframe[@id="openDocChildFrame"]')))
        chrome_driver.switch_to.frame('openDocChildFrame')

        WebDriverWait(chrome_driver, 30).until(EC.presence_of_element_located(
            (By.XPATH, '//iframe[@id="webiViewFrame"]')))
        chrome_driver.switch_to.frame('webiViewFrame')

        # Loading is finished once the wait dialog's style contains
        # 'visibility: hidden'. A timeout here is only logged.
        try:
            WebDriverWait(chrome_driver, 60).until(EC.presence_of_element_located(
                (By.XPATH, "//div[@id='modal_waitDlg' and contains(@style,'visibility: hidden')]")))
        except Exception as e:
            print(e, '运行等待超时')

        # If the alert dialog element can be found, give up on this attempt.
        try:
            chrome_driver.find_element_by_id('ariaLabelledBy_alertDlg')
        except Exception:
            alert_present = False
        else:
            alert_present = True
        if alert_present:
            return '未成功计算'

        # Export the report and rename the downloaded file.
        try:
            # Wait for the export control to exist, then click it.
            WebDriverWait(chrome_driver, 30).until(EC.presence_of_element_located(
                (By.XPATH, '//*[@id="_dhtmlLib_270"]')))
            chrome_driver.find_element_by_id('_dhtmlLib_270').click()

            # Wait for the export dialog, then click through it.
            WebDriverWait(chrome_driver, 30).until(EC.presence_of_element_located(
                (By.XPATH, '//*[@id="check_SelectAllReport"]')))
            chrome_driver.find_element_by_id('check_SelectAllReport').click()
            time.sleep(3)

            chrome_driver.find_element_by_id('check_1783').click()
            time.sleep(3)
            chrome_driver.find_element_by_id('BtnCImg_OK_BTN_idExportDlg').click()
            # Fixed wait for the download to finish.
            time.sleep(20)
            old_dir = file_dir + '\\各装置主要生产日指标汇总.pdf'
            new_dir = file_dir + '\\%s 各装置主要生产日指标汇总表.pdf' % yesterday
            # os.replace overwrites an existing target, replacing the
            # original remove-then-rename sequence.
            os.replace(old_dir, new_dir)
        except Exception:
            return '未成功计算'
        return '下载完成'
    except Exception:
        # Unexpected failure: notify and report it to the caller.
        wechat_auto().send_mesg(0, 'person', 4)
        return '异常退出'
    finally:
        # Single exit point for closing the browser on every path.
        chrome_driver.quit()
Exemplo n.º 10
0
        for table in first_page.extract_tables():  # 这个pdf暂时只有一张表
            df = pd.DataFrame(table[5:])
            values = df.values  # 获取每行值
            for i in range(0, len(values)):
                if re.search(str(values[i]), '#'):  # 查找是否出现#字符
                    wechat_auto().send_mesg(0, 'person', 0)
                    return '出现特殊字符'
        return '未出现特殊字符'

if __name__ == '__main__':
    # Run the IE computation step and the Chrome download step, repeating
    # the pair while Chrome reports '未成功计算', for at most five runs.
    ie_autoload()
    time.sleep(1)
    response = chrome_autoload()
    attempt = 1
    while response == '未成功计算' and attempt <= 4:
        print('\033[37;40m---------------这是第%s次调用chrome-----------------\033[0m' % (attempt + 1))
        ie_autoload()
        time.sleep(1)
        response = chrome_autoload()
        attempt = attempt + 1
    # Alert only when the last run still failed. The original tested the
    # counter (i == 5), which is also 5 when the fifth run succeeds.
    if response == '未成功计算':
        wechat_auto().send_mesg(0, 'person', 3)
    # Run the '#' check on the downloaded document.
    try:
        question = question_search(yesterday)
        print(question)
    except Exception:
        print('未发现需检查文档')

Exemplo n.º 11
0
import os
from 接口.setting import wechat_auto,fixed_params
# Send yesterday's report PDF over WeChat if it exists; otherwise send a
# reminder message.
yesterday = fixed_params().yesterday
filename = ' %s.pdf' % yesterday
# Backslashes escaped explicitly; the original literal relied on the invalid
# escape sequence '\各', which newer Python versions warn about.
chfilename = 'D:\\各装置主要生产日指标汇总\\各装置主要生产日指标汇总表'
if os.path.exists(chfilename + filename):
    # Arguments in order: recipient (self), path, the Chinese part of the
    # file name, the remaining part of the file name.
    wechat_auto().send_file(namenum=0, filenum=0, filenum2=1, filename=filename)
else:
    # Remind someone to handle it; arguments: recipient (self), text no. 1.
    # NOTE(review): other call sites pass three arguments
    # (0, 'person', n) -- confirm this two-argument form is supported.
    wechat_auto().send_mesg(0, 1)

Exemplo n.º 12
0
    def parse_one_page(self, *params):
        """Yield 1# cobalt and 1# electrolytic manganese prices from ometal.com.

        Fetches search-result page 1, follows the first five article links
        and extracts one price per metal from each article that mentions
        '钴', '电解锰' and '均价'. ``params`` is not used.

        Yields:
            dict: ``{'index': '1#钴' | '1#电解锰', 'erdat': 'YYYYMMDD',
            'sl': int}``.

        The whole scrape is attempted at most twice. If the second attempt
        also fails (including when fewer than five links are found or a
        price cannot be extracted), a message is printed and a WeChat alert
        is sent.
        """
        # (table row, indicator name, spaced name, bare name); the last two
        # are the spellings used by the older page layouts.
        quote_rows = (
            (11, '1#钴', '1# 钴', '钴'),
            (15, '1#电解锰', '1# 电解锰', '电解锰'),
        )
        # XPath unions covering the known layouts of the name cell (td[1])
        # and the price cell (td[5]); each takes the row number six times.
        name_xpath = (
            '//div[@id="fontzoom"]/table/tbody/tr[%s]/td[1]/p/text() | //*[@id="fontzoom"]/div/div/table/tbody/tr[%s]/td[1]/p/text() |'
            '//div[@id="fontzoom"]/table/tbody/tr[%s]/td[1]/p/span[2]/text() | //div[@id="fontzoom"]/table/tbody/tr[%s]/td[1]/text() |'
            '//div[@id="fontzoom"]/div/table/tbody/tr[%s]/td[1]/p/text() | //*[@id="fontzoom"]/table/tbody/tr[%s]/td[1]/p/strong/span/text()'
        )
        price_xpath = (
            '//div[@id="fontzoom"]/table/tbody/tr[%s]/td[5]/p/text() | //*[@id="fontzoom"]/div/div/table/tbody/tr[%s]/td[5]/p/text() |'
            '//div[@id="fontzoom"]/table/tbody/tr[%s]/td[5]/p/span/text() | //div[@id="fontzoom"]/table/tbody/tr[%s]/td[5]/text() |'
            '//div[@id="fontzoom"]/div/table/tbody/tr[%s]/td[5]/p/text() | //*[@id="fontzoom"]/table/tbody/tr[%s]/td[5]/p/strong/span/text()'
        )
        # [\s\S]*? matches any characters including newlines.
        link_pattern = re.compile(
            r'<tr>[\s\S]*?<td align="left" class="s105">·[\s\S]*?<a href="(.*?)" target="_blank">'
            r'<span style="color:black;background-color:yellow">长江有色金属现货</span>.*?',
            re.S)

        def to_int(raw):
            # Drop thousands separators before converting.
            return int(str(raw).replace(',', ''))

        counter = 1
        while counter <= 2:
            try:
                for pagnum in (1,):
                    page = '&pos=%s&act=next&page=%s' % ((pagnum - 1) * 60, pagnum)
                    url = 'http://www.ometal.com/bin0/new/searchkey_cj.asp?type=%B3%A4%BD%AD%D3%D0%C9%AB%BD%F0%CA%F4%CF%D6%BB%F5&searchtype=&newsort=7' + page
                    response = self.get_one_page(url, self._headers, [], [])
                    items = re.findall(link_pattern, response)

                    for i in range(0, 5):  # only the first five articles
                        time.sleep(0.5)
                        # Indexing (not slicing) on purpose: fewer than five
                        # links raises IndexError and goes through retry/alert.
                        link = items[i]
                        url2 = 'http://www.ometal.com' + link
                        response2 = self.get_one_page(url2, self._headers, [], [])
                        # Skip articles that lack any of the three keywords.
                        if '钴' not in response2 or '电解锰' not in response2 or '均价' not in response2:
                            continue
                        html = etree.HTML(response2)
                        # Normalise non-breaking spaces for the regex fallbacks.
                        response2 = response2.replace('&nbsp;', ' ')
                        # The date is the path segment between offset 9 and
                        # the character before 'marketnew'.
                        day_ori = link[9:link.find('marketnew') - 1]
                        day = datetime.datetime.strptime(day_ori, '%Y/%m/%d').strftime('%Y%m%d')
                        has_ton = '吨' in response2

                        for j, name, spaced_name, bare_name in quote_rows:
                            if day > '20190110' and has_ton:
                                # After 2019-01-10 with the unit '吨' present:
                                # XPath first, inline-style regex as fallback.
                                try:
                                    # The name cell must exist; a missing cell
                                    # raises and selects the fallback.
                                    _ = html.xpath(name_xpath % ((j,) * 6))[0]
                                    price = to_int(html.xpath(price_xpath % ((j,) * 6))[0])
                                except Exception:
                                    pattern = re.compile(
                                        r'''%s[\s\S]*?(?:.*?(?:mso-spacerun: 'yes'; mso-font-kerning: 0.0000pt"|mso-font-kerning: 0.0000pt; mso-spacerun: 'yes';")>){3}'''
                                        r'''(.*?)[</span>]{1}''' % name,
                                        re.S)
                                    price = to_int(re.findall(pattern, response2)[0])
                            elif day > '20190110':
                                # After 2019-01-10 without '吨': fourth column.
                                sl_str = '//div[@id="fontzoom"]/table/tbody/tr[%s]/td[4]/text()' % j
                                price = to_int(html.xpath(sl_str)[0])
                            else:
                                # Older layouts: try each pattern in turn. The
                                # second tuple item is the capture group to use
                                # when the pattern has several groups.
                                legacy = (
                                    ('''%s</td><td>(.*?)</td><td>(.*?)</td><td>(.*?)</td>''' % name, 2),
                                    ('%s(?:</p></td><td.*?><p.*?>(.*?)){4}</p>' % spaced_name, None),
                                    ('%s(?:</td><td.*?>(.*?)){4}</td>' % spaced_name, None),
                                    ('%s</span></p>(?:</td><td.*?><p>(.*?)</p>){4}' % bare_name, None),
                                )
                                last = len(legacy) - 1
                                for attempt, (regex, group) in enumerate(legacy):
                                    try:
                                        found = re.findall(re.compile(regex, re.S), response2)[0]
                                        if group is not None:
                                            found = found[group]
                                        price = to_int(found)
                                        break
                                    except Exception:
                                        # The last pattern has no fallback: let
                                        # the failure reach the retry logic.
                                        if attempt == last:
                                            raise
                            # Yield outside every ``try`` so closing the
                            # generator here cannot be mistaken for a parse
                            # failure and trigger a second yield.
                            yield {
                                'index': name,
                                'erdat': day,
                                'sl': price
                            }
                break
            except Exception:
                # Not a bare ``except``: GeneratorExit and KeyboardInterrupt
                # must propagate instead of triggering a retry or an alert.
                if counter == 2:
                    print('\033[1;37;41m----------------------未取到钴与电解锰价格数据-----------------------\033[0m')
                    wechat_auto().send_mesg(0, 'person', 11)
            counter = counter + 1
Exemplo n.º 13
0
    def parse_one_page(self, *params):
        """Yield textile-city figures scraped from CCF news articles (class 1B0000).

        Args:
            *params: only ``params[0][3]`` is used: the string form of a
                cookie dict, parsed with ``ast.literal_eval``.

        Yields:
            dict: ``{'index': str, 'erdat': 'YYYYMMDD', 'sl': int}``, taken
            from the first two cells of selected table rows. Which rows are
            read depends on the article date (first 8 characters of the
            article id).

        The first three articles of list page 1 are fetched. The whole
        scrape is attempted at most twice. If the second attempt also fails
        (including when fewer than three articles are listed), a message is
        printed and a WeChat alert is sent.
        """
        link_pattern = re.compile(
            '<li(?: | class=articlebreak )><span>.*?href="(.*?)" class="h1a2" target="_blank" onmouseover="return overlib.*?</a></li>',
            re.S)
        counter = 1
        while counter <= 2:
            try:
                for page_no in ('1',):  # list pages to scrape (currently only page 1)
                    url = 'http://www.ccf.com.cn/newscenter/index.php?cur_row_pos=0&cur_pg_num=' + page_no + "&Class_ID=1B0000"
                    cookies = ast.literal_eval(params[0][3])
                    response = self.get_one_page(url, self._headers, [], cookies)
                    sleeptime = random.randint(5, 10)
                    print('\033[37;40m-------------------获取轻纺城网页url休眠', sleeptime, '秒------------------\033[0m')
                    time.sleep(sleeptime)
                    items = re.findall(link_pattern, response)
                    # Follow the article links.
                    for i in range(0, 3):
                        # Indexing (not slicing) on purpose: fewer than three
                        # links raises IndexError and goes through retry/alert.
                        link = items[i]
                        detail_url = 'http://www.ccf.com.cn' + link
                        # First 8 characters of the article id are the date.
                        data = re.findall('/newscenter/detail-1B0000-(.*?).shtml', link)[0][:8]
                        page = requests.get(detail_url, params=[], cookies=cookies)
                        sleeptime = random.randint(15, 30)
                        print('\033[37;40m-------------------获取新一条轻纺城数据休眠', sleeptime, '秒------------------\033[0m')
                        time.sleep(sleeptime)
                        page.encoding = 'gbk'
                        html2 = etree.HTML(page.text)
                        if data >= '20170117':
                            # Two consecutive rows: 2-3 from 2017-10-20 on,
                            # 3-4 for articles from 2017-01-17 up to then.
                            first_row = 2 if data >= '20171020' else 3
                            for row in (first_row, first_row + 1):
                                zxl = html2.xpath('//tbody/tr[%s]/td[1]/text()' % row)[0]
                                sl = html2.xpath('//tbody/tr[%s]/td[2]/text()' % row)[0]
                                yield {
                                    'index': str(zxl),
                                    'erdat': str(data),
                                    'sl': int(sl)
                                }
                        else:
                            # Oldest layout: the row-3 label sits inside <p>;
                            # only its last three characters are kept.
                            zxl = html2.xpath('normalize-space(//tbody/tr[3]/td[1]/p/text())')
                            sl = html2.xpath('//tbody/tr[3]/td[2]/text()')[0]
                            yield {
                                'index': str(zxl)[-3:],
                                'erdat': str(data),
                                'sl': int(sl)
                            }
                            # NOTE(review): row 4 is emitted twice, exactly as
                            # in the original code. This looks like a
                            # copy-paste slip (possibly row 5 was intended) --
                            # confirm before changing the output.
                            for _ in range(2):
                                zxl = html2.xpath('//tbody/tr[4]/td[1]/text()')[0]
                                sl = html2.xpath('//tbody/tr[4]/td[2]/text()')[0]
                                yield {
                                    'index': str(zxl),
                                    'erdat': str(data),
                                    'sl': int(sl)
                                }
                break
            except Exception:
                # Not a bare ``except``: GeneratorExit and KeyboardInterrupt
                # must propagate instead of triggering a retry or an alert.
                if counter == 2:
                    print('\033[1;37;41m----------------------未取到轻纺城数据-----------------------\033[0m')
                    wechat_auto().send_mesg(0, 'person', 7)
            counter = counter + 1