Example #1
# The examples below target Python 2 with BeautifulSoup 3 and rely on the
# imports listed here; project helpers such as buildOpener, URLField,
# get_meta_content and LINK_SET_INDICATOR are defined elsewhere in the
# botlist codebase.
import sys
import time
import socket
import urllib2

from BeautifulSoup import BeautifulSoup


def crawlSingleURL(link, idx, total_links):
    """ Crawl a single URL and collect its title, description and
    keywords meta data into a URLField record. """
    try:
        opener = buildOpener()
        start = time.time()
        data = opener.open(link).read()
        soup = BeautifulSoup(data)
        meta_data_keywords = soup.findAll('meta', {'name': 'keywords'})
        meta_data_descr = soup.findAll('meta', {'name': 'description'})
        keywords = get_meta_content(meta_data_keywords)
        descr = get_meta_content(meta_data_descr)
        # Extract the title tag; fall back to an empty string if the
        # document has no <title>.
        try:
            titleTag = str(soup.html.head.title.string)
        except AttributeError:
            titleTag = ""
        end = time.time()  # elapsed time is measured but unused in this excerpt

        # Build the basic URL data structure
        field = URLField(link, titleTag, descr, keywords)
        field.populate()
        # Print a progress marker every LINK_SET_INDICATOR links.
        if (idx % LINK_SET_INDICATOR) == 0:
            sys.stdout.write("[%s/%s] " % (idx, total_links))
        # @return URLField
        return field
    except socket.timeout:
        print "ERR: timeout [%s/%s] " % (idx, total_links)
    except urllib2.URLError:
        print "ERR: url error [%s/%s] " % (idx, total_links)
    except Exception, e:
        # NOTE: any other error is swallowed silently and None is returned.
        pass
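The helper buildOpener() is used in every example but is not part of this listing. A minimal sketch of what it might do, assuming it only returns a urllib2 opener that presents the browser user-agent string (FF_USER_AGENT) seen in Example #2:

def buildOpener():
    # Hypothetical stand-in for the botlist helper: a plain urllib2 opener
    # that identifies itself with the project's user-agent constant.
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', FF_USER_AGENT)]
    return opener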
Example #2
def connectLinkService(requrl):
    """ First, connect to the botlist URL service and extract
    the most recent list of links.  This will seed the
    botlist spider crawler."""
    opener = buildOpener()
    req = urllib2.Request(requrl)
    req.add_header('user-agent', FF_USER_AGENT)
    link_data = opener.open(req).read()
    link_data = [line.strip() for line in link_data.split('\n')]
    # Keep only non-empty rows that split into the expected number of
    # '::|'-delimited columns (NO_COLS_SERVICE).
    link_data = filter(
        lambda line: len(line) > 0 and len(line.split('::|')) == NO_COLS_SERVICE,
        link_data)
    content = [col.split('::|') for col in link_data]
    return content
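Putting the two pieces together, the rows returned by connectLinkService() can seed the single-URL crawler. The sketch below is only illustrative: the service URL name and the assumption that the link sits in the first '::|' column are not taken from the original code.

SERVICE_URL = "http://example.org/botlist/links"  # hypothetical service endpoint

rows = connectLinkService(SERVICE_URL)
seed_links = [row[0] for row in rows]  # assumes the URL is the first column
results = []
for idx, link in enumerate(seed_links):
    field = crawlSingleURL(link, idx, len(seed_links))
    if field is not None:  # crawl errors fall through and return None
        results.append(field)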
Example #3
def crawlSingleURLForContent(link, idx, total_links):
    """ Crawl this URL but only extract the content for content
    analysis.  A more extensive model than crawlSingleURL"""
    try:
        opener = buildOpener()
        start = time.time()
        data = opener.open(link).read()
        istats = build_page_info(link, data)
        data = clean_content(data)
        soup = BeautifulSoup(data)
        meta_data_keywords = soup.findAll('meta', {'name': 'keywords'})
        meta_data_descr = soup.findAll('meta', {'name': 'description'})
        keywords = get_meta_content(meta_data_keywords)
        descr = get_meta_content(meta_data_descr)

        # Extract the title tag; fall back to an empty string if the
        # document has no <title>.
        try:
            titleTag = str(soup.html.head.title.string)
        except AttributeError:
            titleTag = ""
        # Ignore content we aren't concerned with
        partial_content = doc_ignore_content(soup)

        end = time.time()
        # Build the basic URL data structure
        field = URLField(link, titleTag, descr, keywords)

        field.descr = field.tokenizeTags(field.descr)
        field.keywords = field.tokenizeTags(field.keywords)

        field.full_content = data
        field.extract_content = partial_content
        field.info_stats = istats
        field.populate()
        # Print a progress marker every LINK_SET_INDICATOR links.
        if (idx % LINK_SET_INDICATOR) == 0:
            sys.stdout.write("[%s/%s] " % (idx, total_links))

        # @return URLField
        return field
    except urllib2.URLError:
        print "ERR: url error [%s/%s] " % (idx, total_links)
    except Exception, e:
        # NOTE: a bare pass here would silently ignore errors, so report
        # them instead.
        print "ERR<crawlSingleURLForContent>: %s" % e
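get_meta_content() is another botlist helper that is not shown. Judging only from how it is called on the result of soup.findAll('meta', ...), a plausible minimal version (an assumption, not the project's actual code) would pull the content attribute from the first matching tag:

def get_meta_content(meta_tags):
    # Hypothetical sketch: return the 'content' attribute of the first
    # matching <meta> tag, or an empty string when nothing usable is found.
    for tag in meta_tags:
        content = tag.get('content')
        if content:
            return content.strip()
    return ""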
Example #4
def crawlBuildLinks(link_list):
    """ Iterate through the list of links and collect links found
    on each page through the use of the beautiful soup lib."""
    opener = buildOpener()
    total_links = 0
    total_links_tag = 0
    sub_links = None
    for link in link_list:
        try:
            data = opener.open(link).read()
            soup = BeautifulSoup(data)
            sub_links_tag = soup.findAll('a')
            total_links_tag = total_links_tag + len(sub_links_tag)
            sub_links = [
                processSubLink(el) for el in sub_links_tag
                if validateSubLink(el)
            ]
            # Filter out duplicates with set
            sub_links = set(sub_links)
            total_links = total_links + len(sub_links)
        except Exception, e:
            print "ERR <crawlBuildLinks>: %s" % e
            print "    <crawlBuildLinks>: url=[%s]" % link