Пример #1
0
def crawlSingleURL(link, idx, total_links):
    """Fetch one URL and build a URLField from its title and meta tags.

    The page is downloaded through the opener returned by ``buildOpener()``
    and parsed with BeautifulSoup. The ``keywords`` and ``description`` meta
    tags are passed through ``get_meta_content`` before being handed to
    ``URLField``.

    Args:
        link: URL to open.
        idx: Position of this link in the crawl; only used for the progress
            marker and for error output.
        total_links: Total number of links in the crawl; only used for the
            progress marker and for error output.

    Returns:
        The populated URLField, or None when fetching or parsing failed.
        Failures are reported on stdout instead of being raised.
    """
    try:
        response = buildOpener().open(link)
        try:
            data = response.read()
        finally:
            # Close the response even if the read fails part-way.
            response.close()

        soup = BeautifulSoup(data)
        keywords = get_meta_content(
            soup.findAll('meta', {'name': 'keywords'}))
        descr = get_meta_content(
            soup.findAll('meta', {'name': 'description'}))

        # Extract the title tag; any failure falls back to an empty title.
        try:
            title_text = soup.html.head.title.string
            # `.string` is None when <title> has no single text child;
            # str(None) would otherwise produce the literal title "None".
            if title_text is None:
                titleTag = ""
            else:
                titleTag = str(title_text)
        except Exception:
            titleTag = ""

        # Build the basic URL data structure.
        field = URLField(link, titleTag, descr, keywords)
        field.populate()

        # Emit a progress marker every LINK_SET_INDICATOR-th link.
        if idx % LINK_SET_INDICATOR == 0:
            sys.stdout.write("[%s/%s] " % (idx, total_links))
        return field
    except socket.timeout:
        print("ERR: timeout [%s/%s] " % (idx, total_links))
    except urllib2.URLError as err:
        # Not every URLError is a timeout, so report the actual reason.
        print("ERR: URL error [%s/%s] %s" % (idx, total_links, err))
    except Exception as err:
        # Report instead of silently passing, so programming errors in the
        # body stay visible. repr() is used so that formatting the message
        # cannot itself raise.
        print("ERR<crawlSingleURL>: %r" % (err,))
    # Every failure path falls through to here.
    return None
Пример #2
0
def crawlSingleURLForContent(link, idx, total_links):
    """Fetch one URL and build a URLField that also carries page content.

    A more extensive variant of crawlSingleURL, intended for content
    analysis. Besides the title and the ``keywords`` / ``description`` meta
    tags, the returned field gets:

    * ``info_stats``: result of ``build_page_info(link, data)`` on the raw
      download,
    * ``full_content``: the download after ``clean_content``,
    * ``extract_content``: result of ``doc_ignore_content(soup)``,
    * ``descr`` / ``keywords``: re-assigned through ``field.tokenizeTags``.

    Args:
        link: URL to open.
        idx: Position of this link in the crawl; only used for the progress
            marker and for error output.
        total_links: Total number of links in the crawl; only used for the
            progress marker and for error output.

    Returns:
        The populated URLField, or None when fetching or parsing failed.
        Failures are reported on stdout instead of being raised.
    """
    try:
        response = buildOpener().open(link)
        try:
            raw_data = response.read()
        finally:
            # Close the response even if the read fails part-way.
            response.close()

        # Page statistics are computed on the raw download, before cleaning.
        istats = build_page_info(link, raw_data)
        data = clean_content(raw_data)
        soup = BeautifulSoup(data)
        keywords = get_meta_content(
            soup.findAll('meta', {'name': 'keywords'}))
        descr = get_meta_content(
            soup.findAll('meta', {'name': 'description'}))

        # Extract the title tag; any failure falls back to an empty title.
        try:
            title_text = soup.html.head.title.string
            # `.string` is None when <title> has no single text child;
            # str(None) would otherwise produce the literal title "None".
            if title_text is None:
                titleTag = ""
            else:
                titleTag = str(title_text)
        except Exception:
            titleTag = ""

        # Ignore content we aren't concerned with.
        partial_content = doc_ignore_content(soup)

        # Build the basic URL data structure.
        field = URLField(link, titleTag, descr, keywords)
        field.descr = field.tokenizeTags(field.descr)
        field.keywords = field.tokenizeTags(field.keywords)
        field.full_content = data
        field.extract_content = partial_content
        field.info_stats = istats
        field.populate()

        # Emit a progress marker every LINK_SET_INDICATOR-th link.
        if idx % LINK_SET_INDICATOR == 0:
            sys.stdout.write("[%s/%s] " % (idx, total_links))
        return field
    except urllib2.URLError as err:
        # Not every URLError is a timeout, so report the actual reason.
        print("ERR: URL error [%s/%s] %s" % (idx, total_links, err))
    except Exception as e:
        # NOTE: report rather than pass; a silent pass here would also hide
        # programming errors raised in the body above.
        print("ERR<crawlSingleURLForContent>: %s" % e)
    # Every failure path falls through to here.
    return None