def main(url, checkRange=30):
    header.processBegin(url=url)
    header.clearFolder()  # [2019.02.11]
    try:
        soup = request2soup(url)
        df_1 = parsingTitle(soup, checkRange)
        if len(df_1) != 0:
            # outputCsv(df_1, "第一層結果", FinalPath)
            header.outputCsv(df_1, "第一層結果")  # first-layer (title list) result
            df_2 = parsingDetail(df_1)
            # outputCsv(df_2, "第二層結果", FinalPath)
            header.outputCsv(df_2, "第二層結果")  # second-layer (detail) result
            header.RESULT_COUNT = len(df_2)
            header.zipFile()
            header.createInfoFile()
            header.createOKFile()
            header.outputLastResult(df_1, header.lastResult, checkRange)  # [2019.02.11] added lastResult output step
    except Exception:
        logging.error("crawl job failed")
        traceback.print_exc()
        header.createInfoFile()
    header.processEnd()
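# `request2soup` is called throughout this section but defined elsewhere. A
# minimal sketch, assuming it wraps requests + BeautifulSoup; the `params`
# name used for the page number is a hypothetical mapping, not the real one:
import requests
from bs4 import BeautifulSoup

def request2soup(url, page=None):
    # fetch the page (optionally a specific page number) and parse it into soup
    params = {"page": page} if page is not None else None  # hypothetical parameter name
    resp = requests.get(url, params=params, timeout=30)
    resp.encoding = resp.apparent_encoding  # many .gov.tw pages are not served as UTF-8
    return BeautifulSoup(resp.text, "html.parser")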
def main(url, tabNumber, checkRange=15):
    header.processBegin()
    header.clearFolder()

    DownloadTool = SeleniumUtil.ChromeDownload()
    DownloadTool.setDownLoadTempPath(header.TEMP_PATH)
    DownloadTool.setDownLoadFinalPath(FinalPath)
    chrome_options = DownloadTool.getChromeOptions()
    driver = webdriver.Chrome(chrome_options=chrome_options)  # open Chrome with the configured options
    try:
        # check the type before the range comparison, so a non-int argument
        # raises our ValueError instead of a TypeError
        if isinstance(tabNumber, int) and 19 <= tabNumber <= 22:
            url = url + str(tabNumber)
        else:
            raise ValueError("tabNumber must be an integer between 19 and 22")
        driver.get(url)
        df_1 = parsingTitle(url, driver, checkRange)
        if len(df_1) != 0:
            header.outputCsv(df_1, "第一層結果", FinalPath)
            df_2 = parsingDetail(df_1, tabNumber, FinalPath)
            header.outputCsv(df_2, "第二層結果", FinalPath)
            header.RESULT_COUNT = len(df_1)
            header.zipFile()
            header.createInfoFile()
            header.createOKFile()
            header.outputLastResult(df_1, header.lastResult, checkRange)  # [2019.02.01] added lastResult output
    except Exception:
        logging.error("crawl job failed")
        header.EXIT_CODE = -1
        traceback.print_exc()
    finally:
        driver.quit()  # release the browser session
    header.processEnd()
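# `SeleniumUtil.ChromeDownload` is only visible here through its call sites.
# A minimal sketch of the interface those calls assume (the real module may
# differ): it collects the download paths and bakes them into Chrome options.
from selenium.webdriver.chrome.options import Options

class ChromeDownload:
    def setDownLoadTempPath(self, path):
        self.temp_path = path

    def setDownLoadFinalPath(self, path):
        self.final_path = path

    def getChromeOptions(self):
        options = Options()
        options.add_experimental_option("prefs", {
            "download.default_directory": self.temp_path,  # downloads land in the temp folder first
            "download.prompt_for_download": False,
        })
        return options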
def main(url, checkRange=30):
    header.processBegin(url=url)
    header.clearFolder()
    try:
        soup = request2soup(url, 1)
        df_1 = parsingTitle(soup, checkRange)
        if len(df_1) != 0:
            header.outputCsv(df_1, "第一層結果")
            df_2 = parsingDetail(df_1)
            header.outputCsv(df_2, "Result")
            header.RESULT_COUNT = len(df_2)
            header.zipFile()
            header.createInfoFile()
            header.createOKFile()
    except Exception:
        print("crawl job failed")
        logging.error("crawl job failed")
        header.EXIT_CODE = -1
        traceback.print_exc()
        header.createInfoFile()
    header.processEnd()
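# How these entry points are typically driven, assuming each script is run
# standalone (the URL below is illustrative, not the real target):
if __name__ == "__main__":
    main("https://www.example.gov.tw/news/list", checkRange=30)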
def parsingTitle(soup, checkRange):
    # create df up front so the except path can still return it
    df = pd.DataFrame(columns=["WEB_ADDR", "CRL_DATE", "ISS_DATE", "TITL", "LNK_URL"])
    try:
        # load the previous crawl result
        lastResultPath = header.LAST_RESULT_PATH  # + "/lastResult.csv"  [2019.02.11]
        if os.path.isfile(lastResultPath):
            lastResult = pd.read_csv(lastResultPath)
        else:
            lastResult = pd.DataFrame()
        header.lastResult = lastResult  # [2019.02.11] added module-level variable

        # crawl window: the last `checkRange` days
        endDate = datetime.date.today()
        strDate = (endDate - datetime.timedelta(days=checkRange)).isoformat()

        soup = request2soup(url)

        # extract and normalise the title data
        result = dataProcess_Title(soup, strDate)
        d = {
            'WEB_ADDR': url,
            'CRL_DATE': result['crawl_date'],
            'ISS_DATE': '',
            'TITL': result['titles_result'],
            'LNK_URL': result['links']
        }
        df = pd.concat([df, pd.DataFrame(data=d)])

        # skip rows whose title matches the previous crawl
        if not lastResult.empty:
            for index, row in df.iterrows():
                if row['TITL'] in list(lastResult['TITL']):
                    df.drop(index, inplace=True)

        if len(df) == 0:
            logging.critical("no data updated between %s and %s" % (strDate, endDate))
        else:
            df.index = range(df.shape[0])  # reset index
            lastResult = pd.concat([lastResult, df])
            lastResult.index = range(lastResult.shape[0])  # reset index
            lastResult = lastResult[pd.to_datetime(lastResult['CRL_DATE']) >=
                                    (datetime.date.today() - datetime.timedelta(days=checkRange))]
            header.outputCsv(lastResult, "lastResult", header.CRAWL_LIST_PATH)
    except Exception:
        header.EXIT_CODE = -1
        logging.error("failed to parse the title list")
        traceback.print_exc()
    return df
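# `dataProcess_Title` is referenced above but defined elsewhere. From its call
# site it takes the soup and the window start date and returns a dict with
# 'crawl_date', 'titles_result', and 'links'. A sketch under those assumptions,
# reusing the module's datetime import; the CSS selector is a placeholder:
def dataProcess_Title(soup, strDate):
    anchors = soup.select("a.title")  # hypothetical selector, not the real one
    return {
        "crawl_date": datetime.date.today().isoformat(),
        "titles_result": [a.text.strip() for a in anchors],
        "links": [a.get("href") for a in anchors],
    }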
def main(url, checkRange=30):
    header.processBegin()
    header.clearFolder()
    try:
        df_1 = parsingTitle(url, checkRange)
        if len(df_1) == 0:
            return
        header.outputCsv(df_1, "第一層結果", FinalPath)
        df_2 = parsingDetail(df_1, FinalPath)
        header.outputCsv(df_2, "第二層結果", FinalPath)
        header.RESULT_COUNT = len(df_1)
        header.zipFile()
        header.createInfoFile()
        header.createOKFile()
        header.outputLastResult(df_1, header.lastResult, checkRange)  # [2019.02.01] added lastResult output
    except Exception:
        logging.error("crawl job failed")
        header.EXIT_CODE = -1
        traceback.print_exc()
    finally:
        header.processEnd()  # also runs on the early return above
def parsingTitle(soup, checkRange):
    try:
        # load the previous crawl result
        lastResultPath = header.LAST_RESULT_PATH + "/lastResult.csv"
        if os.path.isfile(lastResultPath):
            lastResult = pd.read_csv(lastResultPath)
        else:
            lastResult = pd.DataFrame()

        # crawl window: the last `checkRange` days
        endDate = datetime.date.today()
        strDate = (endDate - datetime.timedelta(days=checkRange)).isoformat()

        totalPage = soup.select(".page")[0].text.split("/")[1]  # total number of pages
        ending = False
        df = pd.DataFrame(columns=["WEB_ADDR", "CRL_DATE", "ISS_DATE", "TITL", "LNK_URL"])
        for i in range(int(totalPage)):
            if i != 0:
                soup = request2soup(url, i + 1)
            try:
                sorts = [x.text.strip() for x in soup.select(".sort1")]
                dates = [x.text.strip() for x in soup.select(".pdate1")]
                titles = [x.text.strip() for x in soup.select(".ptitle1")]
                links = ["https://www.ib.gov.tw/ch/" + x.get("href")
                         for x in soup.select(".ptitle1 a")]

                idx = pd.Series([False] * len(dates))
                for j in range(len(dates)):
                    if dates[j] < strDate:  # issue date before the window start: stop collecting titles
                        ending = True
                        break
                    idx[j] = True

                d = {
                    "WEB_ADDR": url,
                    "CRL_DATE": endDate,
                    "ISS_DATE": dates,
                    "TITL": titles,
                    "LNK_URL": links
                }
                df = pd.concat([df, pd.DataFrame(data=d)[idx]])  # append this page

                if ending:  # stop crawling the remaining pages
                    break
            except Exception:
                logging.error("error while parsing titles on page %s" % str(i + 1))
                traceback.print_exc()

        df.index = range(df.shape[0])  # reset index
        header.outputCsv(df, "lastResult", header.LAST_RESULT_PATH)

        # skip rows whose issue date and title both match the previous crawl
        if not lastResult.empty:
            for i in range(len(df)):
                for j in range(len(lastResult)):
                    if (df.ISS_DATE[i] == lastResult.ISS_DATE[j]) and (df.TITL[i] == lastResult.TITL[j]):
                        df.drop(i, inplace=True)
                        break

        if len(df) == 0:
            logging.critical("no data updated between %s and %s" % (strDate, endDate))
        else:
            df.index = range(df.shape[0])  # reset index
        return df
    except Exception:
        logging.error("failed to parse the title list")
        traceback.print_exc()
        return pd.DataFrame(
            columns=["WEB_ADDR", "CRL_DATE", "ISS_DATE", "TITL", "LNK_URL"])
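# The nested dedup loop above is O(len(df) * len(lastResult)). An equivalent
# vectorized form on the same keys, shown as an optional alternative (reuses
# the module's pandas import):
def drop_already_seen(df, lastResult):
    if lastResult.empty:
        return df
    merged = df.merge(lastResult[["ISS_DATE", "TITL"]].drop_duplicates(),
                      on=["ISS_DATE", "TITL"], how="left", indicator=True)
    # keep only rows that did not appear in the previous crawl
    return merged[merged["_merge"] == "left_only"].drop(columns="_merge").reset_index(drop=True)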
def main():
    header.processBegin(url=WEB_URL)
    header.clearFolder()
    try:
        reqMap = {
            # [20190402] cities/counties included in the query
            "REGION_IDs": {
                "1": "台北市", "2": "基隆市", "3": "新北市", "4": "新竹市",
                "5": "新竹縣", "6": "桃園市", "7": "苗栗縣", "8": "台中市",
                "10": "彰化縣", "11": "南投縣", "12": "嘉義市", "13": "嘉義縣",
                "14": "雲林縣", "15": "台南市", "17": "高雄市", "19": "屏東縣",
                "21": "宜蘭縣", "22": "台東縣", "23": "花蓮縣", "24": "澎湖縣",
                "25": "金門縣", "26": "連江縣"
            },
            "QUERY_TYPEs": {
                "1": "住宅用地",  # residential land
                "2": "商業用地",  # commercial land
                "3": "工業用地"   # industrial land
            },
            "CONDITIONS": {
                "is_new_list": "1",
                "type": "2",
                "searchtype": "1",
                "firstRow": "0",
                "kind": "11",   # factory and land for sale
                "area": "300,"  # minimum 300 ping
            }
        }
        standbyDataFrame, historyDataFrame = parsingTitle(reqMap)
        if len(standbyDataFrame) < 1:  # no new data
            logMsg = "no data updated; crawl date: " + TODAY
            print(logMsg)
            logging.critical(logMsg)
        else:
            finishDataFrame, detailDataFrame = parsingDetail(standbyDataFrame)
            header.outputCsv(detailDataFrame, header.PROJECT)
            header.RESULT_COUNT = len(detailDataFrame)

            # update the crawlHistory file
            updateHistoryDataFrame = pd.concat(
                [historyDataFrame, finishDataFrame], ignore_index=True)
            header.outputCsv(updateHistoryDataFrame, "crawlHistory", header.LAST_RESULT_PATH)

            header.zipFile()
            header.createInfoFile()
            header.createOKFile()
    except Exception:
        setErrorMessage("crawl job failed")
        header.createInfoFile()
        header.zipFile(zipFolder=header.LOG_PATH, zipResultWithLog=False)
    header.processEnd()
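# `setErrorMessage` is not defined in this section. Judging from the sibling
# mains above, it likely bundles the logging / exit-code / traceback steps;
# a sketch under that assumption (reuses the module's logging and traceback):
def setErrorMessage(msg):
    logging.error(msg)
    header.EXIT_CODE = -1  # mark the run as failed for the wrapper process
    traceback.print_exc()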