# Imports required by this module (Python 2 / BeautifulSoup 3).
import sys
import time
import socket
import urllib2

from BeautifulSoup import BeautifulSoup


def crawlSingleURL(link, idx, total_links):
    """ Crawl a single URL and extract its title and meta tags
    into a basic URLField. """
    try:
        opener = buildOpener()
        start = time.time()
        data = opener.open(link).read()
        soup = BeautifulSoup(data)
        meta_data_keywords = soup.findAll('meta', {'name': 'keywords'})
        meta_data_descr = soup.findAll('meta', {'name': 'description'})
        keywords = get_meta_content(meta_data_keywords)
        descr = get_meta_content(meta_data_descr)
        # Extract the title tag; pages without one get an empty title.
        try:
            titleTag = str(soup.html.head.title.string)
        except AttributeError:
            titleTag = ""
        # Timing endpoint (the elapsed time is not reported here).
        end = time.time()
        # Build the basic URL data structure.
        field = URLField(link, titleTag, descr, keywords)
        field.populate()
        # Print a progress indicator every LINK_SET_INDICATOR links.
        if idx % LINK_SET_INDICATOR == 0:
            sys.stdout.write("[%s/%s] " % (idx, total_links))
        # Exit crawl single URL with the URL field.
        # @return URLField
        return field
    except socket.timeout:
        print "ERR: timeout [%s/%s] " % (idx, total_links)
    except urllib2.URLError:
        print "ERR: URL error [%s/%s] " % (idx, total_links)
    except Exception, e:
        pass
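# A minimal usage sketch, not part of the original module: drive
# crawlSingleURL sequentially over a seed list and keep the URLField
# results, skipping links whose crawl failed and returned None. The
# helper name crawlURLList is hypothetical.
def crawlURLList(links):
    fields = []
    total = len(links)
    for idx, link in enumerate(links):
        field = crawlSingleURL(link, idx, total)
        if field is not None:
            fields.append(field)
    return fields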
def crawlSingleURLForContent(link, idx, total_links):
    """ Crawl this URL and also extract the page content for
    content analysis; a more extensive version of crawlSingleURL. """
    try:
        opener = buildOpener()
        start = time.time()
        data = opener.open(link).read()
        istats = build_page_info(link, data)
        data = clean_content(data)
        soup = BeautifulSoup(data)
        meta_data_keywords = soup.findAll('meta', {'name': 'keywords'})
        meta_data_descr = soup.findAll('meta', {'name': 'description'})
        keywords = get_meta_content(meta_data_keywords)
        descr = get_meta_content(meta_data_descr)
        # Extract the title tag; pages without one get an empty title.
        try:
            titleTag = str(soup.html.head.title.string)
        except AttributeError:
            titleTag = ""
        # Ignore content we aren't concerned with.
        partial_content = doc_ignore_content(soup)
        # Timing endpoint (the elapsed time is not reported here).
        end = time.time()
        # Build the extended URL data structure.
        field = URLField(link, titleTag, descr, keywords)
        field.descr = field.tokenizeTags(field.descr)
        field.keywords = field.tokenizeTags(field.keywords)
        field.full_content = data
        field.extract_content = partial_content
        field.info_stats = istats
        field.populate()
        # Print a progress indicator every LINK_SET_INDICATOR links.
        if idx % LINK_SET_INDICATOR == 0:
            sys.stdout.write("[%s/%s] " % (idx, total_links))
        # Exit crawl single URL with the URL field.
        # @return URLField
        return field
    except urllib2.URLError:
        print "ERR: URL error [%s/%s] " % (idx, total_links)
    except Exception, e:
        # NOTE: report the error rather than passing silently; a bare
        # pass here would also mask coding errors inside the try block.
        print "ERR<crawlSingleURLForContent>: %s" % e
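# A hedged setup sketch, assumed rather than taken from the original
# source: the socket.timeout handler in crawlSingleURL only fires if a
# default socket timeout is set, since urllib2 otherwise blocks on the
# OS default. The 10-second value and the seed list are illustrative.
if __name__ == '__main__':
    socket.setdefaulttimeout(10)
    seeds = ["http://www.example.com/", "http://www.example.org/"]
    for i, url in enumerate(seeds):
        field = crawlSingleURLForContent(url, i, len(seeds))
        if field is not None:
            print "crawled: %s" % url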