Example #1
import os
import urllib2
from bs4 import BeautifulSoup
import re
import time
import urlparse
from initialremove import remove_files
from errorlogging import logerror
import path_define

#Global variables
LINK_ANCHOR={}
LINK_FILENAME=[]
ERROR_FILE_PATH= path_define.get_ERROR_FILE_PATH()
CRAWLEDLISTPATH = path_define.get_CRAWLEDLISTPATH()
CRAWLED_HTML_PATH = path_define.get_CRAWLED_HTML_PATH()

def get_html_content(url):
    try:
        counter = 1
        html = urllib2.urlopen(url)
        content = BeautifulSoup(html, "html.parser")
        pretty_html = content.prettify()
        # Derive a flat file name from the article id by stripping hyphens and underscores.
        article_id = url.split("/wiki/")[1]
        file_name = re.sub(r'[-_]+', '', article_id)
        if file_name not in LINK_FILENAME:
            LINK_FILENAME.append(file_name)
        else:
            # Suffix an increasing counter onto the base name until it is unique,
            # rather than growing the already-suffixed name on every pass.
            base_name = file_name
            while base_name + str(counter) in LINK_FILENAME:
                counter = counter + 1
            file_name = base_name + str(counter)
            LINK_FILENAME.append(file_name)
        # The source is truncated here; presumably the prettified HTML is written
        # under CRAWLED_HTML_PATH, as Example #2's cleanup of *.html files suggests.
        with open(os.path.join(CRAWLED_HTML_PATH, file_name + ".html"), "w") as out:
            out.write(pretty_html.encode("utf-8"))
    except Exception as e:
        # Assumption: logerror (imported above) records the failing URL and exception;
        # its exact signature is not shown in the truncated source.
        logerror(url, e)
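
A minimal usage sketch for the snippet above; the second URL is hypothetical and only demonstrates the collision handling:

if __name__ == "__main__":
    # "Web_crawler" and "Web-crawler" both flatten to "Webcrawler",
    # so the second call registers the file name "Webcrawler1".
    get_html_content("https://en.wikipedia.org/wiki/Web_crawler")
    get_html_content("https://en.wikipedia.org/wiki/Web-crawler")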
Example #2
import os
from os.path import exists
from glob import glob
import path_define

CRAWLEDLISTPATH = os.path.join(path_define.get_CRAWLEDLISTPATH(), "crawled_list.txt")
ERROR_FILE_PATH = path_define.get_ERROR_FILE_PATH()
CRAWLED_HTML_PATH = path_define.get_CRAWLED_HTML_PATH()


def remove_files():
    # Delete every crawled HTML page, then the crawl list and the error log,
    # so the next crawl starts from a clean state.
    files = glob(os.path.join(CRAWLED_HTML_PATH, '*.html'))
    for filename in files:
        os.remove(filename)
    if exists(CRAWLEDLISTPATH):
        os.remove(CRAWLEDLISTPATH)
    if exists(ERROR_FILE_PATH):
        os.remove(ERROR_FILE_PATH)
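
Both examples import a path_define module that is not shown. A plausible sketch, with placeholder Windows paths standing in for whatever the real module returns:

# path_define.py -- hypothetical sketch; the real paths are not in the source
def get_CRAWLEDLISTPATH():
    return "C:\\crawler\\list"

def get_ERROR_FILE_PATH():
    return "C:\\crawler\\error_log.txt"

def get_CRAWLED_HTML_PATH():
    return "C:\\crawler\\html"

With something like this in place, calling remove_files() before a crawl clears the previous run's HTML files, crawl list, and error log.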