Example #1
def parsingDetail(df):

    df_detail = pd.DataFrame(columns=["ISS_DATE", "TITL", "ISS_CTNT", "ISS_NO", "RLT_RGL", "FILES",
                                      "FOLDER_NM", "FILES_NM"])  # [2019.01.24] Added columns (truncated folder name and file names)
    for index, row in df.iterrows():
        try:
            first_layer_date = row['ISS_DATE']
            link = row['LNK_URL']
            soup = request2soup(link)
            
            result = dataProcess_Detail(soup)
            title = result['title']  # [2019.02.11] Do not use the first-layer title; it may be truncated with "..."
            FILES = result['FILES'] 
            FILES_NM = result['FILES_NM']
            FOLDER_NM = ''
            if len(FILES_NM) != 0:
                first_layer_date = re.sub(r'(/|-|\.)', '-', first_layer_date)
                FOLDER_NM = first_layer_date + '_' + title[:30].strip() + '_' + str(index)  # Folder name exists only when there are attachments
                header.downloadFile(FOLDER_NM, header.FINAL_PATH, result['fileUrls'], FILES_NM)  # [2019.02.11] Extracted into a shared helper
            
            d = {'ISS_DATE':result['issue_date'], 'TITL': title, 'ISS_CTNT': result['content'], 
                 'ISS_NO':result['serno'], 'RLT_RGL':result['RLT_RGL'], 'FILES':','.join(FILES), 
                 'FOLDER_NM': FOLDER_NM, 'FILES_NM':','.join(FILES_NM)}
            
            df_detail = df_detail.append(d, ignore_index=True)
    
        except Exception:
            header.EXIT_CODE = -1  # [2019.02.01] Flag the run for a re-crawl when parsing a detail page fails
            logging.error("Failed to crawl detail page")
            logging.error("Failed link: " + link)
            traceback.print_exc()
    
    return df_detail
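
Note: every variant in this listing builds its result frame with DataFrame.append, which was deprecated in pandas 1.4 and removed in pandas 2.0. A minimal sketch of the modern equivalent, using the column list from the example above:

import pandas as pd

COLUMNS = ["ISS_DATE", "TITL", "ISS_CTNT", "ISS_NO", "RLT_RGL",
           "FILES", "FOLDER_NM", "FILES_NM"]

rows = []           # collect one dict per article inside the loop
# rows.append(d)    # instead of df_detail = df_detail.append(d, ...)
df_detail = pd.DataFrame(rows, columns=COLUMNS)  # build the frame once

# Or, as a drop-in replacement for a single append:
# df_detail = pd.concat([df_detail, pd.DataFrame([d])], ignore_index=True)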
Example #2
def parsingDetail(df, finalPath):  
    df2 = pd.DataFrame(columns=["ISS_DATE", "TITL", "ISS_CTNT", "ISS_NO", "RLT_RGL", "FILES", "FOLDER_NM", "FILES_NM"])
    for index, row in df.iterrows():
        try:
            title = row['TITL']
            link = row['LNK_URL']
            logging.info(title)
            soup = request2soup(link)
            result = dataProcess_Detail(soup, row)
            if not result:
                continue
                
            first_layer_date = row['ISS_DATE']
            FILES = result['FILES'] 
            FILES_NM = result['FILES_NM']
            FOLDER_NM = ''
            if len(FILES_NM) != 0:
                first_layer_date = re.sub(r'(/|-|\.)', '-', first_layer_date)
                FOLDER_NM = first_layer_date + '_' + title[:30].strip() + '_' + str(index)  # Folder name exists only when there are attachments
                header.downloadFile(FOLDER_NM, finalPath, result['fileUrls'], FILES_NM)
            
            d = {'ISS_DATE': result['issue_date'], 'TITL': title, 'ISS_CTNT': result['content'], 'ISS_NO': result['serno'], 'RLT_RGL': '',
                 'FILES': ','.join(FILES), 'FOLDER_NM': FOLDER_NM, 'FILES_NM': ','.join(FILES_NM)}
            df2 = df2.append(d, ignore_index=True)
        except Exception:
            header.EXIT_CODE = -1
            logging.error("Failed to crawl detail page")
            logging.error("Failed link: " + link + "\n")
            traceback.print_exc()
    return df2
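
Every variant fetches pages through a request2soup helper that is not shown in this listing. A plausible minimal implementation, assuming it wraps requests and BeautifulSoup (a sketch inferred from the call sites, not the original module):

import requests
from bs4 import BeautifulSoup

def request2soup(url, timeout=30):
    # Hypothetical stand-in: only the signature (URL in, soup out)
    # is visible in the examples above.
    resp = requests.get(url, timeout=timeout)
    resp.raise_for_status()
    resp.encoding = resp.apparent_encoding  # cope with legacy encodings such as Big5
    return BeautifulSoup(resp.text, "html.parser")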
Example #3
def parsingDetail(df, finalPath=FINAL_PATH):

    # Issue date, issue number, title, body text, related regulations, attachments, attachment folder, attachment file names
    df2 = pd.DataFrame(columns=["ISS_DATE", "ISS_NO", "TITL", "ISS_CTNT", "RLT_RGL", "FILES", "FOLDER_NM", "FILES_NM"])

    # First-layer results: site URL, crawl date, issue date, title, detail link, title type
    for index, row in df.iterrows():
        try:
            title = row['TITL']
            link = row['LNK_URL']

            # Download the detail page as an attachment
            FOLDER_NM = title[:30].strip()  # Folder name exists only when there are attachments
            file_name = FOLDER_NM + '.pdf'
            header.downloadFile(FOLDER_NM, finalPath, [link], [file_name])

            d = {
                'ISS_DATE' : '',
                'TITL' : title,
                'ISS_CTNT' : '',
                'ISS_NO' : '',
                'RLT_RGL' : '',
                'FILES' : file_name,
                'FOLDER_NM' : FOLDER_NM,
                'FILES_NM' : file_name
            }

            df2 = df2.append(d, ignore_index=True)

        except Exception:
            header.EXIT_CODE = -1
            logging.error("Failed to crawl detail page")
            logging.error("Failed link: " + str(link))  # Cast link to str to avoid Unicode encoding errors
            logging.error(str(traceback.format_exc()))
    return df2
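
The shared header.downloadFile helper is also external. Judging from its call sites (folder name, base path, list of URLs, list of file names), a minimal sketch could look like the following; this is an assumption, not the real header module:

import os
import requests

def downloadFile(folder_name, final_path, file_urls, file_names):
    # Hypothetical reconstruction based solely on the call sites above.
    folder = os.path.join(final_path, folder_name)
    os.makedirs(folder, exist_ok=True)
    for url, name in zip(file_urls, file_names):
        resp = requests.get(url, timeout=60)
        resp.raise_for_status()
        with open(os.path.join(folder, name), "wb") as f:
            f.write(resp.content)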
Example #4
def parsingDetail(df, finalPath=FINAL_PATH):

    # Issue date, issue number, title, body text, related regulations, attachments, attachment folder, attachment file names
    df2 = pd.DataFrame(columns=[
        "ISS_DATE", "ISS_NO", "TITL", "ISS_CTNT", "RLT_RGL", "FILES",
        "FOLDER_NM", "FILES_NM"
    ])

    # First-layer results: site URL, crawl date, issue date, title, detail link, title type
    for index, row in df.iterrows():
        try:
            iss_date = row['ISS_DATE']
            title = row['TITL']
            link = row['LNK_URL']

            # Fetch the detail page
            soup = request2soup(link)
            result = dataProcess_Detail(soup)

            # Download detail-page attachments
            FILES_NM = result['fileNames']
            FOLDER_NM = ''
            if len(FILES_NM) != 0:
                iss_date = re.sub(r'(/|-|\.)', '-', iss_date)
                FOLDER_NM = iss_date + '_' + title[:30].strip() + '_' + str(
                    index)  # Folder name exists only when there are attachments
                header.downloadFile(FOLDER_NM, finalPath, result['fileUrls'],
                                    FILES_NM)

            folder_nm = FOLDER_NM
            files_nm = ','.join(FILES_NM)

            d = {
                'ISS_DATE': iss_date,
                'TITL': title,
                'ISS_CTNT': '',
                'ISS_NO': '',
                'RLT_RGL': '',
                'FILES': files_nm,
                'FOLDER_NM': folder_nm,
                'FILES_NM': files_nm
            }

            df2 = df2.append(d, ignore_index=True)

        except Exception:
            header.EXIT_CODE = -1
            logging.error("Failed to crawl detail page")
            logging.error("Failed link: " + str(link))  # Cast link to str to avoid Unicode encoding errors
            logging.error(str(traceback.format_exc()))
    return df2
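
For clarity, the folder-name convention shared by most variants: date separators are normalized to '-', the title is cut to 30 characters, and the row index is appended. A self-contained demo of the expression used above:

import re

iss_date = "2019.02.11"
title = "A very long announcement title that exceeds thirty characters"
index = 0

iss_date = re.sub(r'(/|-|\.)', '-', iss_date)  # '2019-02-11'
folder_nm = iss_date + '_' + title[:30].strip() + '_' + str(index)
print(folder_nm)  # 2019-02-11_A very long announcement title_0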
Example #5
def parsingDetail(df, finalPath=FINAL_PATH):

    # Issue date, issue number, title, body text, related regulations, attachments, attachment folder, attachment file names
    df2 = pd.DataFrame(columns=[
        "ISS_DATE", "ISS_NO", "TITL", "ISS_CTNT", "RLT_RGL", "FILES",
        "FOLDER_NM", "FILES_NM"
    ])

    # First-layer results: site URL, crawl date, issue date, title, detail link
    driver = webdriver.Chrome(ChromeDriverManager().install())
    for index, row in df.iterrows():
        try:
            iss_date = row['ISS_DATE']
            title = row['TITL']
            link = row['LNK_URL']

            # Fetch the detail page
            driver.get(link)
            contents = driver.find_elements_by_xpath(
                '//section[@class="wrapper btm-110"]')[0].text

            file_urls = []
            file_names = []

            # PDF attachments
            pdf_file_urls, pdf_file_names = getFileUrl(driver, "pdf")
            file_urls.extend(pdf_file_urls)
            file_names.extend(pdf_file_names)

            # Word attachments
            word_file_urls, word_file_names = getFileUrl(driver, "word")
            file_urls.extend(word_file_urls)
            file_names.extend(word_file_names)

            # Download detail-page attachments
            FOLDER_NM = ''
            if len(file_urls) > 0:
                iss_date = re.sub(r'(/|-|\.)', '-', iss_date)
                FOLDER_NM = iss_date + '_' + title[:30].strip() + '_' + str(
                    index)  # Folder name exists only when there are attachments
                header.downloadFile(FOLDER_NM, finalPath, file_urls,
                                    file_names)

            files_nm = ','.join(file_names)

            d = {
                'ISS_DATE': iss_date,
                'TITL': title,
                'ISS_CTNT': contents,
                'ISS_NO': '',
                'RLT_RGL': '',
                'FILES': files_nm,
                'FOLDER_NM': FOLDER_NM,
                'FILES_NM': files_nm
            }

            df2 = df2.append(d, ignore_index=True)

        except Exception:
            header.EXIT_CODE = -1
            logging.error("Failed to crawl detail page")
            logging.error("Failed link: " + str(link))  # Cast link to str to avoid Unicode encoding errors
            logging.error(str(traceback.format_exc()))
    driver.quit()  # release the browser once all rows are processed
    return df2
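
Note: Example #5 targets the Selenium 3 API. In Selenium 4 the find_elements_by_xpath family was removed (gone as of 4.3), and the driver path is passed via a Service object. A sketch of the same steps against the current API, with a placeholder URL:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

link = "https://example.org/news/123"  # placeholder detail-page URL

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
try:
    driver.get(link)
    contents = driver.find_elements(
        By.XPATH, '//section[@class="wrapper btm-110"]')[0].text
finally:
    driver.quit()  # always release the browser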
Example #6
def parsingDetail(df, finalPath=FINAL_PATH):

    # Issue date, issue number, title, body text, related regulations, attachments, attachment folder, attachment file names
    df2 = pd.DataFrame(columns=["ISS_DATE", "ISS_NO", "TITL", "ISS_CTNT", "RLT_RGL", "FILES", "FOLDER_NM", "FILES_NM"])

    # First-layer results: site URL, crawl date, issue date, title, detail link, title type
    for index, row in df.iterrows():
        try:
            iss_date = row['ISS_DATE']
            title = row['TITL']
            link = row['LNK_URL']

            iss_no = ''
            iss_ctnt = ''
            folder_nm = ''
            files = ''
            files_nm = ''

            # Check the article type
            lnk_type = row['TYPE']
            if lnk_type == "file":
                folder_name = iss_date + '_' + title[:30]
                file_name = re.findall(r'filedisplay=\w+\.pdf', link)[0][len('filedisplay='):]
                header.downloadFile(folder_name, finalPath, [link], [file_name])

                folder_nm = folder_name
                files = file_name
                files_nm = file_name
            
            elif lnk_type == "article":

                # Fetch the detail page
                soup = request2soup(link)
                result = dataProcess_Detail(soup, FIX_URL)

                # Download detail-page attachments
                FILES = result['FILES']
                FILES_NM = result['FILES_NM']
                FOLDER_NM = ''
                if len(FILES_NM) != 0:
                    iss_date = re.sub(r'(/|-|\.)', '-', iss_date)
                    FOLDER_NM = iss_date + '_' + title[:30].strip() + '_' + str(index)  # Folder name exists only when there are attachments
                    header.downloadFile(FOLDER_NM, finalPath, result['fileUrls'], FILES_NM)

                iss_no = result['serno']
                iss_ctnt = result['content']
                folder_nm = FOLDER_NM
                files = ','.join(FILES)
                files_nm = ','.join(FILES_NM)

            d = {
                'ISS_DATE' : iss_date,
                'TITL' : title,
                'ISS_CTNT' : iss_ctnt,
                'ISS_NO' : iss_no,
                'RLT_RGL' : '',
                'FILES' : files,
                'FOLDER_NM' : folder_nm,
                'FILES_NM' : files_nm
            }

            df2 = df2.append(d, ignore_index=True)

        except Exception:
            header.EXIT_CODE = -1
            logging.error("Failed to crawl detail page")
            logging.error("Failed link: " + str(link))  # Cast link to str to avoid Unicode encoding errors
            logging.error(str(traceback.format_exc()))
    return df2
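
The regex slice in Example #6 raises an IndexError when filedisplay= is missing from the link. The same file name can be pulled more defensively with the standard library; the URL shape below is inferred from the regex and is hypothetical:

from urllib.parse import urlparse, parse_qs

link = "https://example.org/download?filedisplay=notice_123.pdf"  # hypothetical URL shape
file_name = parse_qs(urlparse(link).query).get("filedisplay", [""])[0]
print(file_name)  # notice_123.pdf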
Example #7
def parsingDetail(df, finalPath=FINAL_PATH):

    # Issue date, issue number, title, body text, related regulations, attachments, attachment folder, attachment file names
    df2 = pd.DataFrame(columns=[
        "ISS_DATE", "ISS_NO", "TITL", "ISS_CTNT", "RLT_RGL", "FILES",
        "FOLDER_NM", "FILES_NM"
    ])

    # First-layer results: site URL, crawl date, issue date, title, detail link, title type
    for index, row in df.iterrows():
        try:
            iss_date = row['ISS_DATE']
            title = row['TITL']
            link = row['LNK_URL']

            # Fetch the detail page
            soup = request2soup(link)

            # Interpretation number
            iss_no = soup.find('pre', {'id': 'preExpTitle'}).string

            # Main content
            pre_tags = soup.findAll('pre')
            content_rows = []
            for pre_tag in pre_tags:  # not `row`, to avoid shadowing the iterrows() variable
                content_rows.append(pre_tag.string)  # body text of each <pre> block
            content = '\n'.join(str(e) for e in content_rows)

            # Related regulations
            attach_links = soup.findAll('a',
                                        {'class': 'attach_link underline'})
            rtl_list = []
            for attach_link in attach_links:
                rtl_list.append(str(attach_link.string))

            # Attachments
            files = soup.findAll('a', {'class': 'attach_pdf underline'})
            file_names = []
            file_links = []
            for file in files:
                file_names.append(str(file.string) + '.pdf')
                file_links.append(str(file['href']).strip())

            # Download detail-page attachments
            if len(file_names) != 0:
                header.downloadFile(str(iss_date), finalPath, file_links,
                                    file_names)

            file_names = ','.join(file_names)

            d = {
                'ISS_DATE': iss_date,
                'TITL': title,
                'ISS_CTNT': content,
                'ISS_NO': iss_no,
                'RLT_RGL': '\n'.join(rtl_list),
                'FILES': file_names,
                'FOLDER_NM': iss_date,
                'FILES_NM': file_names
            }

            df2 = df2.append(d, ignore_index=True)

        except Exception:
            header.EXIT_CODE = -1
            logging.error("Failed to crawl detail page")
            logging.error("Failed link: " + str(link))  # Cast link to str to avoid Unicode encoding errors
            logging.error(str(traceback.format_exc()))
    return df2
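
For orientation, a minimal hypothetical driver for most of these variants. The input frame needs the first-layer columns every implementation reads (ISS_DATE, TITL, LNK_URL; Example #6 additionally expects TYPE):

import pandas as pd

# Hypothetical first-layer result; the column names come from the examples above.
df = pd.DataFrame([{
    "ISS_DATE": "2019/02/11",
    "TITL": "Sample announcement",
    "LNK_URL": "https://example.org/news/123",  # placeholder URL
}])

df_detail = parsingDetail(df, finalPath="./downloads")
print(df_detail[["ISS_DATE", "TITL", "FILES", "FOLDER_NM"]])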