def parsingDetail(df):
    """Crawl the detail page for every row of the first-layer result.

    df: first-layer DataFrame; must provide 'ISS_DATE' and 'LNK_URL' columns.
    Returns a DataFrame with one row per successfully parsed detail page.
    On any per-row failure, sets header.EXIT_CODE = -1 so the caller re-crawls.
    """
    # [2019.01.24] added FOLDER_NM / FILES_NM columns (truncated folder name and file names)
    df_detail = pd.DataFrame(columns=["ISS_DATE", "TITL", "ISS_CTNT", "ISS_NO",
                                      "RLT_RGL", "FILES", "FOLDER_NM", "FILES_NM"])
    for index, row in df.iterrows():
        link = ''  # pre-bind so the error log never hits an unbound name
        try:
            first_layer_date = row['ISS_DATE']
            link = row['LNK_URL']
            soup = request2soup(link)
            result = dataProcess_Detail(soup)
            # [2019.02.11] use the detail-page title; first-layer titles may be truncated with "..."
            title = result['title']
            FILES = result['FILES']
            FILES_NM = result['FILES_NM']
            FOLDER_NM = ''
            if len(FILES_NM) != 0:
                # normalize date separators so the folder name is filesystem-safe
                first_layer_date = re.sub(r'(/|-|\.)', '-', first_layer_date)
                # folder name exists only when there are attachments
                FOLDER_NM = first_layer_date + '_' + title[:30].strip() + '_' + str(index)
                # [2019.02.11] extracted into the shared helper
                header.downloadFile(FOLDER_NM, header.FINAL_PATH, result['fileUrls'], FILES_NM)
            d = {'ISS_DATE': result['issue_date'], 'TITL': title,
                 'ISS_CTNT': result['content'], 'ISS_NO': result['serno'],
                 'RLT_RGL': result['RLT_RGL'], 'FILES': ','.join(FILES),
                 'FOLDER_NM': FOLDER_NM, 'FILES_NM': ','.join(FILES_NM)}
            # DataFrame.append was removed in pandas 2.0; concat is the supported equivalent
            df_detail = pd.concat([df_detail, pd.DataFrame([d])], ignore_index=True)
        except Exception:  # was a bare except: let SystemExit/KeyboardInterrupt propagate
            header.EXIT_CODE = -1  # [2019.02.01] a detail-page failure triggers a re-crawl
            logging.error("爬取內文失敗")
            logging.error("失敗連結:" + link)
            traceback.print_exc()
    return df_detail
def parsingDetail(df, finalPath):
    """Crawl the detail page for every row of the first-layer result.

    df: first-layer DataFrame; must provide 'TITL', 'LNK_URL' and 'ISS_DATE'.
    finalPath: destination directory for downloaded attachments.
    Returns a DataFrame with one row per successfully parsed detail page;
    rows whose dataProcess_Detail result is empty are skipped.
    """
    df2 = pd.DataFrame(columns=["ISS_DATE", "TITL", "ISS_CTNT", "ISS_NO",
                                "RLT_RGL", "FILES", "FOLDER_NM", "FILES_NM"])
    for index, row in df.iterrows():
        link = ''  # pre-bind so the error log never hits an unbound name
        try:
            title = row['TITL']
            link = row['LNK_URL']
            logging.info(title)
            soup = request2soup(link)
            result = dataProcess_Detail(soup, row)
            if not bool(result):
                # empty result means the page carried nothing usable; skip it
                continue
            first_layer_date = row['ISS_DATE']
            FILES = result['FILES']
            FILES_NM = result['FILES_NM']
            FOLDER_NM = ''
            if len(FILES_NM) != 0:
                # normalize date separators so the folder name is filesystem-safe
                first_layer_date = re.sub(r'(/|-|\.)', '-', first_layer_date)
                # folder name exists only when there are attachments
                FOLDER_NM = first_layer_date + '_' + title[:30].strip() + '_' + str(index)
                header.downloadFile(FOLDER_NM, finalPath, result['fileUrls'], FILES_NM)
            d = {'ISS_DATE': result['issue_date'], 'TITL': title,
                 'ISS_CTNT': result['content'], 'ISS_NO': result['serno'],
                 'RLT_RGL': '', 'FILES': ','.join(FILES),
                 'FOLDER_NM': FOLDER_NM, 'FILES_NM': ','.join(FILES_NM)}
            # DataFrame.append was removed in pandas 2.0; concat is the supported equivalent
            df2 = pd.concat([df2, pd.DataFrame([d])], ignore_index=True)
        except Exception:  # was a bare except: let SystemExit/KeyboardInterrupt propagate
            header.EXIT_CODE = -1  # signal the caller to re-crawl on failure
            logging.error("爬取內文失敗")
            logging.error("失敗連結:" + link + "\n")
            traceback.print_exc()
    return df2
def parsingDetail(df, finalPath=FINAL_PATH):
    """Download each first-layer link directly as a PDF attachment.

    df: first-layer DataFrame; must provide 'TITL' and 'LNK_URL'.
    finalPath: destination directory for downloaded files.
    Returns a DataFrame (issue date, serial no, title, body, related rules,
    files, folder name, file names); only TITL/FILES/FOLDER_NM/FILES_NM are
    filled because this source exposes the document itself, not a detail page.
    """
    df2 = pd.DataFrame(columns=["ISS_DATE", "ISS_NO", "TITL", "ISS_CTNT",
                                "RLT_RGL", "FILES", "FOLDER_NM", "FILES_NM"])
    # first-layer columns: site url, crawl date, issue date, title, detail link, title type
    for index, row in df.iterrows():
        link = ''  # pre-bind so the error log never hits an unbound name
        try:
            title = row['TITL']
            link = row['LNK_URL']
            # download the linked document itself as the attachment
            FOLDER_NM = title[:30].strip()  # folder name exists only when attachments exist
            file_name = FOLDER_NM + '.pdf'
            header.downloadFile(FOLDER_NM, finalPath, [link], [file_name])
            d = {
                'ISS_DATE':  '',
                'TITL':      title,
                'ISS_CTNT':  '',
                'ISS_NO':    '',
                'RLT_RGL':   '',
                'FILES':     file_name,
                'FOLDER_NM': FOLDER_NM,
                'FILES_NM':  file_name
            }
            # DataFrame.append was removed in pandas 2.0; concat is the supported equivalent
            df2 = pd.concat([df2, pd.DataFrame([d])], ignore_index=True)
        except Exception:  # was a bare except: let SystemExit/KeyboardInterrupt propagate
            header.EXIT_CODE = -1  # signal the caller to re-crawl on failure
            logging.error("爬取內文失敗")
            logging.error("失敗連結:" + str(link))  # str() avoids unicode encode errors
            logging.error(str(traceback.format_exc()))
    return df2
def parsingDetail(df, finalPath=FINAL_PATH):
    """Crawl the detail page for every row and download its attachments.

    df: first-layer DataFrame; must provide 'ISS_DATE', 'TITL' and 'LNK_URL'.
    finalPath: destination directory for downloaded attachments.
    Returns a DataFrame (issue date, serial no, title, body, related rules,
    files, folder name, file names); ISS_CTNT/ISS_NO/RLT_RGL are left empty
    because this source only supplies attachments.
    """
    df2 = pd.DataFrame(columns=["ISS_DATE", "ISS_NO", "TITL", "ISS_CTNT",
                                "RLT_RGL", "FILES", "FOLDER_NM", "FILES_NM"])
    # first-layer columns: site url, crawl date, issue date, title, detail link, title type
    for index, row in df.iterrows():
        link = ''  # pre-bind so the error log never hits an unbound name
        try:
            iss_date = row['ISS_DATE']
            title = row['TITL']
            link = row['LNK_URL']
            # fetch and parse the detail page
            soup = request2soup(link)
            result = dataProcess_Detail(soup)
            # download the attachments found on the detail page
            FILES_NM = result['fileNames']
            FOLDER_NM = ''
            if len(FILES_NM) != 0:
                # normalize date separators so the folder name is filesystem-safe
                iss_date = re.sub(r'(/|-|\.)', '-', iss_date)
                # folder name exists only when there are attachments
                FOLDER_NM = iss_date + '_' + title[:30].strip() + '_' + str(index)
                header.downloadFile(FOLDER_NM, finalPath, result['fileUrls'], FILES_NM)
            folder_nm = FOLDER_NM
            files_nm = ','.join(FILES_NM)
            d = {
                'ISS_DATE':  iss_date,
                'TITL':      title,
                'ISS_CTNT':  '',
                'ISS_NO':    '',
                'RLT_RGL':   '',
                'FILES':     files_nm,
                'FOLDER_NM': folder_nm,
                'FILES_NM':  files_nm
            }
            # DataFrame.append was removed in pandas 2.0; concat is the supported equivalent
            df2 = pd.concat([df2, pd.DataFrame([d])], ignore_index=True)
        except Exception:  # was a bare except: let SystemExit/KeyboardInterrupt propagate
            header.EXIT_CODE = -1  # signal the caller to re-crawl on failure
            logging.error("爬取內文失敗")
            logging.error("失敗連結:" + str(link))  # str() avoids unicode encode errors
            logging.error(str(traceback.format_exc()))
    return df2
def parsingDetail(df, finalPath=FINAL_PATH):
    """Crawl each detail page with Selenium and download PDF/Word attachments.

    df: first-layer DataFrame; must provide 'ISS_DATE', 'TITL' and 'LNK_URL'.
    finalPath: destination directory for downloaded attachments.
    Returns a DataFrame (issue date, serial no, title, body, related rules,
    files, folder name, file names).
    """
    df2 = pd.DataFrame(columns=["ISS_DATE", "ISS_NO", "TITL", "ISS_CTNT",
                                "RLT_RGL", "FILES", "FOLDER_NM", "FILES_NM"])
    # first-layer columns: site url, crawl date, issue date, title, detail link
    driver = webdriver.Chrome(ChromeDriverManager().install())
    try:
        for index, row in df.iterrows():
            link = ''  # pre-bind so the error log never hits an unbound name
            try:
                iss_date = row['ISS_DATE']
                title = row['TITL']
                link = row['LNK_URL']
                # render the detail page and grab the visible body text
                driver.get(link)
                contents = driver.find_elements_by_xpath(
                    '//section[@class="wrapper btm-110"]')[0].text
                file_urls = []
                file_names = []
                # PDF attachments
                pdf_file_urls, pdf_file_names = getFileUrl(driver, "pdf")
                file_urls.extend(pdf_file_urls)
                file_names.extend(pdf_file_names)
                # Word attachments
                word_file_urls, word_file_names = getFileUrl(driver, "word")
                file_urls.extend(word_file_urls)
                file_names.extend(word_file_names)
                # download the attachments found on the detail page
                FOLDER_NM = ''
                if len(file_urls) > 0:
                    # normalize date separators so the folder name is filesystem-safe
                    iss_date = re.sub(r'(/|-|\.)', '-', iss_date)
                    # folder name exists only when there are attachments
                    FOLDER_NM = iss_date + '_' + title[:30].strip() + '_' + str(index)
                    header.downloadFile(FOLDER_NM, finalPath, file_urls, file_names)
                files_nm = ','.join(file_names)
                d = {
                    'ISS_DATE':  iss_date,
                    'TITL':      title,
                    'ISS_CTNT':  contents,
                    'ISS_NO':    '',
                    'RLT_RGL':   '',
                    'FILES':     files_nm,
                    'FOLDER_NM': FOLDER_NM,
                    'FILES_NM':  files_nm
                }
                # DataFrame.append was removed in pandas 2.0; concat is the supported equivalent
                df2 = pd.concat([df2, pd.DataFrame([d])], ignore_index=True)
            except Exception:  # was a bare except: let SystemExit/KeyboardInterrupt propagate
                header.EXIT_CODE = -1  # signal the caller to re-crawl on failure
                logging.error("爬取內文失敗")
                logging.error("失敗連結:" + str(link))  # str() avoids unicode encode errors
                logging.error(str(traceback.format_exc()))
    finally:
        # the original leaked the browser process; always shut the driver down
        driver.quit()
    return df2
def parsingDetail(df, finalPath=FINAL_PATH):
    """Crawl each first-layer entry, dispatching on its 'TYPE' column.

    df: first-layer DataFrame; must provide 'ISS_DATE', 'TITL', 'LNK_URL'
        and 'TYPE' ("file" = direct PDF link, "article" = detail page).
    finalPath: destination directory for downloaded attachments.
    Returns a DataFrame (issue date, serial no, title, body, related rules,
    files, folder name, file names); rows with any other TYPE produce a
    record with empty content fields.
    """
    df2 = pd.DataFrame(columns=["ISS_DATE", "ISS_NO", "TITL", "ISS_CTNT",
                                "RLT_RGL", "FILES", "FOLDER_NM", "FILES_NM"])
    # first-layer columns: site url, crawl date, issue date, title, detail link, title type
    for index, row in df.iterrows():
        link = ''  # pre-bind so the error log never hits an unbound name
        try:
            iss_date = row['ISS_DATE']
            title = row['TITL']
            link = row['LNK_URL']
            iss_no = ''
            iss_ctnt = ''
            folder_nm = ''
            files = ''
            files_nm = ''
            # dispatch on the entry type
            lnk_type = row['TYPE']
            if lnk_type == "file":
                # the link itself is the document: pull the file name out of the URL
                folder_name = iss_date + '_' + title[:30]
                file_name = re.findall(r'filedisplay=\w+\.pdf', link)[0][len('filedisplay='):]
                header.downloadFile(folder_name, finalPath, [link], [file_name])
                folder_nm = folder_name
                files = file_name
                files_nm = file_name
            elif lnk_type == "article":
                # fetch and parse the detail page
                soup = request2soup(link)
                result = dataProcess_Detail(soup, FIX_URL)
                # download the attachments found on the detail page
                FILES = result['FILES']
                FILES_NM = result['FILES_NM']
                FOLDER_NM = ''
                if len(FILES_NM) != 0:
                    # normalize date separators so the folder name is filesystem-safe
                    iss_date = re.sub(r'(/|-|\.)', '-', iss_date)
                    # folder name exists only when there are attachments
                    FOLDER_NM = iss_date + '_' + title[:30].strip() + '_' + str(index)
                    header.downloadFile(FOLDER_NM, finalPath, result['fileUrls'], FILES_NM)
                iss_no = result['serno']
                iss_ctnt = result['content']
                folder_nm = FOLDER_NM
                files = ','.join(FILES)
                files_nm = ','.join(FILES_NM)
            d = {
                'ISS_DATE':  iss_date,
                'TITL':      title,
                'ISS_CTNT':  iss_ctnt,
                'ISS_NO':    iss_no,
                'RLT_RGL':   '',
                'FILES':     files,
                'FOLDER_NM': folder_nm,
                'FILES_NM':  files_nm
            }
            # DataFrame.append was removed in pandas 2.0; concat is the supported equivalent
            df2 = pd.concat([df2, pd.DataFrame([d])], ignore_index=True)
        except Exception:  # was a bare except: let SystemExit/KeyboardInterrupt propagate
            header.EXIT_CODE = -1  # signal the caller to re-crawl on failure
            logging.error("爬取內文失敗")
            logging.error("失敗連結:" + str(link))  # str() avoids unicode encode errors
            logging.error(str(traceback.format_exc()))
    return df2
def parsingDetail(df, finalPath=FINAL_PATH):
    """Crawl each detail page and extract interpretation number, body,
    related regulations, and PDF attachments directly from the HTML.

    df: first-layer DataFrame; must provide 'ISS_DATE', 'TITL' and 'LNK_URL'.
    finalPath: destination directory for downloaded attachments.
    Returns a DataFrame (issue date, serial no, title, body, related rules,
    files, folder name, file names); the issue date doubles as folder name.
    """
    df2 = pd.DataFrame(columns=["ISS_DATE", "ISS_NO", "TITL", "ISS_CTNT",
                                "RLT_RGL", "FILES", "FOLDER_NM", "FILES_NM"])
    # first-layer columns: site url, crawl date, issue date, title, detail link, title type
    for index, row in df.iterrows():
        link = ''  # pre-bind so the error log never hits an unbound name
        try:
            iss_date = row['ISS_DATE']
            title = row['TITL']
            link = row['LNK_URL']
            # fetch and parse the detail page
            soup = request2soup(link)
            # interpretation number
            iss_no = soup.find('pre', {'id': 'preExpTitle'}).string
            # main content: one <pre> block per paragraph
            # (the original iterated with `row`, shadowing the iterrows() variable)
            pre_blocks = soup.findAll('pre')
            content_rows = []
            for pre_block in pre_blocks:
                content_rows.append(pre_block.string)
            # body text
            content = '\n'.join(str(e) for e in content_rows)
            # related regulations
            attach_links = soup.findAll('a', {'class': 'attach_link underline'})
            rtl_list = []
            for attach_link in attach_links:
                rtl_list.append(str(attach_link.string))
            # PDF attachments
            files = soup.findAll('a', {'class': 'attach_pdf underline'})
            file_names = []
            file_links = []
            for file in files:
                file_names.append(str(file.string) + '.pdf')
                file_links.append(str(file['href']).strip())
            # download the attachments found on the detail page
            if len(file_names) != 0:
                header.downloadFile(str(iss_date), finalPath, file_links, file_names)
            file_names = ','.join(file_names)
            d = {
                'ISS_DATE':  iss_date,
                'TITL':      title,
                'ISS_CTNT':  content,
                'ISS_NO':    iss_no,
                'RLT_RGL':   '\n'.join(rtl_list),
                'FILES':     file_names,
                'FOLDER_NM': iss_date,
                'FILES_NM':  file_names
            }
            # DataFrame.append was removed in pandas 2.0; concat is the supported equivalent
            df2 = pd.concat([df2, pd.DataFrame([d])], ignore_index=True)
        except Exception:  # was a bare except: let SystemExit/KeyboardInterrupt propagate
            header.EXIT_CODE = -1  # signal the caller to re-crawl on failure
            logging.error("爬取內文失敗")
            logging.error("失敗連結:" + str(link))  # str() avoids unicode encode errors
            logging.error(str(traceback.format_exc()))
    return df2