Example #1
import os
import urllib2
from bs4 import BeautifulSoup
import re
import time
import urlparse
from initialremove import remove_files
from errorlogging import logerror
import path_define

#Global variables
LINK_ANCHOR={}
LINK_FILENAME=[]
ERROR_FILE_PATH= path_define.get_ERROR_FILE_PATH()
CRAWLEDLISTPATH = path_define.get_CRAWLEDLISTPATH()
CRAWLED_HTML_PATH = path_define.get_CRAWLED_HTML_PATH()

def get_html_content(url):
    try:
        counter = 1
        html = urllib2.urlopen(url)
        content = BeautifulSoup(html, "html.parser")
        pretty_html = content.prettify()
        # Derive a flat file name from the article id by stripping hyphens and underscores.
        article_id = url.split("/wiki/")[1]
        file_name = re.sub(r'[-_]+', '', article_id)
        if file_name not in LINK_FILENAME:
            LINK_FILENAME.append(file_name)
        else:
            # Suffix an increasing counter onto the base name until it is unique,
            # rather than growing the already-suffixed name on every pass.
            base_name = file_name
            while base_name + str(counter) in LINK_FILENAME:
                counter = counter + 1
            file_name = base_name + str(counter)
            LINK_FILENAME.append(file_name)
        # The source is truncated here; presumably the prettified HTML is written
        # under CRAWLED_HTML_PATH, as Example #2's cleanup of *.html files suggests.
        with open(os.path.join(CRAWLED_HTML_PATH, file_name + ".html"), "w") as out:
            out.write(pretty_html.encode("utf-8"))
    except Exception as e:
        # Assumption: logerror (imported above) records the failing URL and exception;
        # its exact signature is not shown in the truncated source.
        logerror(url, e)
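
A minimal usage sketch for the snippet above; the second URL is hypothetical and only demonstrates the collision handling:

if __name__ == "__main__":
    # "Web_crawler" and "Web-crawler" both flatten to "Webcrawler",
    # so the second call registers the file name "Webcrawler1".
    get_html_content("https://en.wikipedia.org/wiki/Web_crawler")
    get_html_content("https://en.wikipedia.org/wiki/Web-crawler")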
Example #2
import os
from os.path import exists
from glob import glob
import path_define

CRAWLEDLISTPATH = os.path.join(path_define.get_CRAWLEDLISTPATH(), "crawled_list.txt")
ERROR_FILE_PATH = path_define.get_ERROR_FILE_PATH()
CRAWLED_HTML_PATH = path_define.get_CRAWLED_HTML_PATH()


def remove_files():
    # Delete every crawled HTML page, then the crawl list and the error log,
    # so the next crawl starts from a clean state.
    files = glob(os.path.join(CRAWLED_HTML_PATH, '*.html'))
    for filename in files:
        os.remove(filename)
    if exists(CRAWLEDLISTPATH):
        os.remove(CRAWLEDLISTPATH)
    if exists(ERROR_FILE_PATH):
        os.remove(ERROR_FILE_PATH)
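
Both examples import a path_define module that is not shown. A plausible sketch, with placeholder Windows paths standing in for whatever the real module returns:

# path_define.py -- hypothetical sketch; the real paths are not in the source
def get_CRAWLEDLISTPATH():
    return "C:\\crawler\\list"

def get_ERROR_FILE_PATH():
    return "C:\\crawler\\error_log.txt"

def get_CRAWLED_HTML_PATH():
    return "C:\\crawler\\html"

With something like this in place, calling remove_files() before a crawl clears the previous run's HTML files, crawl list, and error log.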