Example #1
import sys
import time
import socket
import urllib2

from BeautifulSoup import BeautifulSoup

def crawlSingleURL(link, idx, total_links):
    try:
        opener = buildOpener()
        start = time.time()
        data = opener.open(link).read()
        soup = BeautifulSoup(data)
        meta_data_keywords = soup.findAll('meta', {'name': 'keywords'})
        meta_data_descr = soup.findAll('meta', {'name': 'description'})
        keywords = get_meta_content(meta_data_keywords)
        descr = get_meta_content(meta_data_descr)
        # Extract the title tag; fall back to an empty string
        # when the document has no <title>.
        try:
            titleTag = str(soup.html.head.title.string)
        except AttributeError:
            titleTag = ""
        end = time.time()

        # Build the basic URL data structure
        field = URLField(link, titleTag, descr, keywords)
        field.populate()
        if idx % LINK_SET_INDICATOR == 0:
            sys.stdout.write("[%s/%s] " % (idx, total_links))
        # Exit crawl single URL with url field.
        # @return URLField
        return field
    except socket.timeout:
        print "ERR: timeout [%s/%s] " % (idx, total_links)
    except urllib2.URLError:
        print "ERR: URL error [%s/%s] " % (idx, total_links)
    except Exception, e:
        # Log unexpected errors instead of silently swallowing them.
        print "ERR<crawlSingleURL>: %s" % e
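
The example above relies on helpers defined elsewhere in the module (buildOpener, get_meta_content, URLField, LINK_SET_INDICATOR). As a reading aid, here is a minimal sketch of what the first two might look like; the User-Agent string and the joining behavior are assumptions, not the module's actual implementation:

import urllib2

def buildOpener():
    # Hypothetical: a urllib2 opener that sends a browser-like
    # User-Agent so simple bot filters do not reject the crawl.
    opener = urllib2.build_opener()
    opener.addheaders = [('User-Agent',
                          'Mozilla/5.0 (compatible; crawler)')]
    return opener

def get_meta_content(tags):
    # Hypothetical: join the content attribute of every matched
    # <meta> tag into a single string.
    return " ".join(tag.get('content', '') for tag in tags
                    if tag.get('content'))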
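
URLField is also not shown on this page. A hypothetical reconstruction of its shape, covering only the attributes and methods the two examples touch (tokenizeTags, populate, and the content/stats fields used in Example #4 below):

class URLField(object):
    # Hypothetical reconstruction: only what the examples use.
    def __init__(self, link, title, descr, keywords):
        self.link = link
        self.title = title
        self.descr = descr
        self.keywords = keywords
        self.full_content = None
        self.extract_content = None
        self.info_stats = None

    def tokenizeTags(self, text):
        # Assumed behavior: split a raw meta string into
        # lowercase tokens.
        return [tok.lower() for tok in text.split()]

    def populate(self):
        # Assumed hook: derive any remaining fields before the
        # record is handed back to the caller.
        pass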
Example #4
def crawlSingleURLForContent(link, idx, total_links):
    """ Crawl this URL, but extract only the content for content
    analysis.  A more extensive variant of crawlSingleURL. """
    try:
        opener = buildOpener()
        start = time.time()
        data = opener.open(link).read()
        istats = build_page_info(link, data)
        data = clean_content(data)
        soup = BeautifulSoup(data)
        meta_data_keywords = soup.findAll('meta', {'name': 'keywords'})
        meta_data_descr = soup.findAll('meta', {'name': 'description'})
        keywords = get_meta_content(meta_data_keywords)
        descr = get_meta_content(meta_data_descr)

        # Extract the title tag; fall back to an empty string
        # when the document has no <title>.
        try:
            titleTag = str(soup.html.head.title.string)
        except AttributeError:
            titleTag = ""
        # Ignore content we aren't concerned with
        partial_content = doc_ignore_content(soup)

        end = time.time()
        # Return the basic URL data structure
        field = URLField(link, titleTag, descr, keywords)

        field.descr = field.tokenizeTags(field.descr)
        field.keywords = field.tokenizeTags(field.keywords)

        field.full_content = data
        field.extract_content = partial_content
        field.info_stats = istats
        field.populate()
        if idx % LINK_SET_INDICATOR == 0:
            sys.stdout.write("[%s/%s] " % (idx, total_links))

        # Exit crawl single URL with url field.
        # @return URLField
        return field
    except urllib2.URLError:
        print "ERR: URL error [%s/%s] " % (idx, total_links)
    except Exception, e:
        # NOTE: log the error; a bare pass here would silently
        # hide any failure in the code above.
        print "ERR<crawlSingleURLForContent>: %s" % e
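
Neither example sets a network timeout itself, yet the first one catches socket.timeout, so the module presumably installs a default timeout at startup. A minimal driver along those lines; the URL list and the 10-second value are placeholders:

import socket

socket.setdefaulttimeout(10)

links = ["http://example.com/", "http://example.org/"]
fields = []
for idx, link in enumerate(links):
    field = crawlSingleURLForContent(link, idx, len(links))
    # Failed crawls fall through the except blocks and return None.
    if field is not None:
        fields.append(field)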