# Python 2 imports needed by url_fix. norm_tuple and MalformedURLException are
# assumed to be defined alongside it in the project's urlnorm helpers.
from re import match
from urllib import quote, quote_plus
from urlparse import urlparse, urlunparse, parse_qs


def url_fix(url, charset='utf-8'):
    if isinstance(url, unicode):
        url = url.encode(charset, 'ignore')
    scheme, netloc, path, params, query, fragment = urlparse(url)
    if not netloc:
        raise MalformedURLException(url)
    if netloc.endswith('youtube.com'):
        # Rewrite every YouTube variant to the canonical /watch?v=<video_id> form.
        params = parse_qs(query + fragment)
        try:
            # Of the form http://www.youtube.com/v/<video_id>(?|&)foo=bar
            matched = match('^/(v|embed)/([^?&]+)', path)
            if matched:
                video_id = matched.group(2)
            else:
                video_id = params['v'][0]
            scheme = 'http'
            netloc = 'www.youtube.com'
            path = '/watch'
            query = 'v=%s' % video_id
            fragment = params = ''
        except KeyError:
            raise MalformedURLException(url)
    elif netloc.endswith('vimeo.com'):
        # Keep only the trailing video-id segment of the path.
        try:
            scheme = 'http'
            netloc = 'www.vimeo.com'
            path = path[path.rindex('/') + 1:]
            query = params = fragment = ''
        except ValueError:
            raise MalformedURLException(url)
    elif netloc in ['facebook.com', 'www.facebook.com']:
        # Rewrite Facebook video pages to their Graph API URL when a video_id
        # parameter is present; otherwise accept the /v/<id> path form.
        params = parse_qs(query + fragment)
        try:
            video_id = params['video_id'][0]
            scheme = 'https'
            netloc = 'graph.facebook.com'
            path = '/%s' % video_id
            query = params = fragment = ''
        except KeyError:
            vid_match = match('/v/([0-9]+)$', path)
            if not vid_match:
                raise MalformedURLException(url)
            scheme = 'http'
            netloc = 'www.facebook.com'
            query = params = fragment = ''
    else:
        # Generic URLs: percent-encode the path and query without touching
        # their separators, then normalize.
        path = quote(path, '/%')
        query = quote_plus(query, ':&=')
    return urlunparse(norm_tuple(scheme, netloc, path, params, query, fragment))
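# Illustrative usage of url_fix (a sketch, not project code: the module name,
# the example URLs and the expected outputs are assumptions, and the outputs
# hold only if norm_tuple leaves these already-normalized components unchanged).
from urlnorm import url_fix, MalformedURLException

# YouTube /v/, /embed/ and ?v= forms all collapse to a single watch URL.
print url_fix('https://youtube.com/embed/dQw4w9WgXcQ?rel=0')
# expected: http://www.youtube.com/watch?v=dQw4w9WgXcQ

# A Facebook video_id query parameter becomes a Graph API lookup.
print url_fix('https://www.facebook.com/video.php?video_id=101000000000')
# expected: https://graph.facebook.com/101000000000

# Anything urlparse cannot extract a netloc from is rejected.
try:
    url_fix('not-a-url')
except MalformedURLException:
    print 'rejected malformed url'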
def download(self):
    '''
    modified from https://github.com/codelucas/newspaper/blob/master/newspaper/network.py
    '''
    if self.is_downloaded:
        return True
    FAIL_ENCODING = 'ISO-8859-1'
    useragent = self.newspaper_article.config.browser_user_agent
    timeout = self.newspaper_article.config.request_timeout
    try:
        html = None
        # TODO: add back the get_request_kwargs functionality present in the
        # newspaper implementation (useragent/timeout above are currently unused).
        response = requests.get(url=self.url, timeout=60)
        if response.status_code >= 400:
            logging.warn(u"encountered status code {0} while getting {1}".format(
                response.status_code, self.url))
            return False
        if not re.search(r"(text/html|application/xhtml\+xml) *(; .*)?",
                         response.headers["content-type"]):
            logging.debug(u"not an HTML page: {0}".format(response.headers["content-type"]))
            return False
        try:
            # Canonical URL: the final URL after redirects, with the fragment
            # dropped and the remaining components normalized.
            parsed_url = urlparse(response.url)
            parsed_as_list = list(parsed_url)
            parsed_as_list[5] = ''
            self.canonical_url = urlunparse(urlnorm.norm_tuple(*parsed_as_list))
        except Exception as e:
            logging.info(u"skipping malformed url {0}. Error: {1}".format(response.url, str(e)))
            return False
        # requests falls back to ISO-8859-1 when the server sends no charset;
        # in that case hand the raw bytes to UnicodeDammit instead of trusting
        # response.text.
        if response.encoding != FAIL_ENCODING:
            html = response.text
        else:
            html = response.content
        if not html:
            return False
        converted = UnicodeDammit(html, is_html=True)
        if not converted.unicode_markup:
            logging.warn("Failed to detect encoding of downloaded article, tried: " +
                         ", ".join(converted.tried_encodings))
            return False
        self.html = converted.unicode_markup
        self.is_downloaded = True
    except Exception as e:
        logging.warn('%s on %s' % (e, self.url))
        return False
    return True
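# Minimal usage sketch (not project code): assumes ExplorerArticle, used by the
# crawler below, is constructed with a URL and exposes the attributes this
# download() sets.
article = ExplorerArticle('http://www.example.com/some-story')
if article.download():
    print article.canonical_url   # final URL, fragment stripped, normalized via urlnorm
    print len(article.html)       # unicode markup as decoded by UnicodeDammit
else:
    print 'download failed; see the log for the reason'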
def next(self):
    '''
    (Crawler) -> newspaper.Article
    returns the next article in the sequence
    '''
    # standard non-recursive tree iteration
    while True:
        if len(self.visit_queue) <= 0:
            raise StopIteration
        current_url = self.visit_queue.pop()
        if self._should_skip():
            logging.info(u"skipping {0} randomly".format(current_url))
            continue
        logging.info(u"visiting {0}".format(current_url))
        # use newspaper to download and parse the article
        article = ExplorerArticle(current_url)
        article.download()
        # get urls from the article
        for link in article.get_links():
            url = urljoin(current_url, link.href, False)
            if self.url_in_filter(url, self.filters):
                logging.info(u"skipping {0} because it matches a filter".format(url))
                continue
            try:
                parsed_url = urlparse(url)
                parsed_as_list = list(parsed_url)
                if parsed_url.scheme != u"http" and parsed_url.scheme != u"https":
                    logging.info(u"skipping url with invalid scheme: {0}".format(url))
                    continue
                parsed_as_list[5] = ''  # drop the fragment before normalizing
                url = urlunparse(urlnorm.norm_tuple(*parsed_as_list))
            except Exception as e:
                logging.info(u"skipping malformed url {0}. Error: {1}".format(url, str(e)))
                continue
            if not parsed_url.netloc.endswith(self.domain):
                continue
            if url in self.visited_urls:
                continue
            self.visit_queue.appendleft(url)
            self.visited_urls.add(url)
            logging.info(u"added {0} to the visit queue".format(url))
        self.pages_visited += 1
        return article
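# Illustrative driver loop (a sketch, not project code): the Crawler class name
# comes from the docstring above, but its constructor arguments are assumptions
# based on the attributes the method uses (visit_queue, visited_urls, domain,
# filters).
crawler = Crawler('http://www.example.com/', domain='example.com', filters=[])
try:
    while True:
        # each call pops one page off the queue, downloads it and returns the
        # parsed ExplorerArticle
        article = crawler.next()
        print article.url
except StopIteration:
    print 'visit queue exhausted'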
def download(self):
    '''
    modified from https://github.com/codelucas/newspaper/blob/master/newspaper/network.py
    '''
    if self.is_downloaded:
        return True
    FAIL_ENCODING = 'ISO-8859-1'
    useragent = self.newspaper_article.config.browser_user_agent
    timeout = self.newspaper_article.config.request_timeout
    try:
        html = None
        # TODO: add back the get_request_kwargs functionality present in the
        # newspaper implementation (useragent/timeout above are currently unused).
        # eventlet.Timeout puts a wall-clock cap on the whole request; requests'
        # own timeout only applies per connect/read.
        with eventlet.Timeout(15):
            response = requests.get(url=self.url, timeout=15)
        if response.status_code >= 400:
            logging.warn(u"encountered status code {0} while getting {1}".format(
                response.status_code, self.url))
            return False
        if not re.search(r"(text/html|application/xhtml\+xml) *(; .*)?",
                         response.headers["content-type"]):
            logging.debug(u"not an HTML page: {0}".format(response.headers["content-type"]))
            return False
        try:
            # Canonical URL: the final URL after redirects, with the fragment
            # dropped and the remaining components normalized.
            parsed_url = urlparse(response.url)
            parsed_as_list = list(parsed_url)
            parsed_as_list[5] = ''
            self.canonical_url = urlunparse(urlnorm.norm_tuple(*parsed_as_list))
        except Exception as e:
            logging.info(u"skipping malformed url {0}. Error: {1}".format(response.url, str(e)))
            return False
        # requests falls back to ISO-8859-1 when the server sends no charset;
        # in that case hand the raw bytes to UnicodeDammit instead of trusting
        # response.text.
        if response.encoding != FAIL_ENCODING:
            html = response.text
        else:
            html = response.content
        if not html:
            return False
        converted = UnicodeDammit(html, is_html=True)
        if not converted.unicode_markup:
            logging.warn("Failed to detect encoding of downloaded article, tried: " +
                         ", ".join(converted.tried_encodings))
            return False
        self.html = converted.unicode_markup
        self.is_downloaded = True
    except Exception as e:
        logging.warn('%s on %s' % (e, self.url))
        return False
    except eventlet.Timeout:
        logging.warn('Timeout on %s' % self.url)
        return False
    return True
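# Sketch of the environment this variant assumes (not project code):
# eventlet.Timeout only interrupts cooperative (green) I/O, so the standard
# library must be monkey-patched before requests is imported for the timeout
# to fire on a stalled socket. The URL and values are illustrative only.
import eventlet
eventlet.monkey_patch()  # patch socket/ssl so blocking I/O yields to the eventlet hub

import requests

try:
    with eventlet.Timeout(15):
        response = requests.get('http://www.example.com/', timeout=15)
except eventlet.Timeout:
    response = None  # the whole request exceeded the 15s wall-clock budget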
def next(self):
    '''
    (Crawler) -> newspaper.Article
    returns the next article in the sequence
    '''
    # standard non-recursive tree iteration
    with open('../ignore_filter/' + self.site.name + '_ignore_file.txt', 'a') as ignore_filter_file:
        try:
            current_level = 0
            while True:
                if self.limit > 0 and self.visited_count > self.limit:
                    raise StopIteration('Limit reached: {:d}'.format(self.limit))
                # if(self.pages_visited > self.probabilistic_n):
                #     raise StopIteration
                # self.cursor.execute("SELECT * FROM " + self.tovisit_table + " ORDER BY id LIMIT 1")
                # row = self.cursor.fetchone()
                # if(row):
                #     row_id = row[0]
                #     current_url = row[1]
                #     self.cursor.execute("DELETE FROM " + self.tovisit_table + " WHERE id=%s", (row_id,))
                # else:
                #     raise StopIteration
                # if(self._should_skip()):
                #     logging.info(u"skipping {0} randomly".format(current_url))
                #     continue
                try:
                    if self.site.is_shallow:
                        current = self.to_visit.get_nowait()
                        current_url = current[0]
                        current_level = current[1]
                        logging.info(u"Shallow on level {0} {1}".format(current_level, current_url))
                    else:
                        current_url = self.to_visit.get_nowait()
                except Empty:
                    # Queue exhausted: switch to shallow crawling, re-seed from the
                    # site root, reset the bloom filter and its backing file, then
                    # signal the caller by raising ZeroDivisionError.
                    self.site.is_shallow = True  # On line 26 the site gets set TO DELETE
                    self.to_visit.put((self.site.url, str(0)))
                    self.ignore_filter = ScalableBloomFilter(
                        initial_capacity=10000000, error_rate=0.00001)
                    ignore_filter_file.close()
                    os.remove('../ignore_filter/' + self.site.name + '_ignore_file.txt')
                    logging.info("stopped iteration")
                    logging.info(u"{0}".format(self.site.url))
                    raise ZeroDivisionError
                logging.info(u"visiting {0}".format(current_url))
                self.visited_count += 1
                # use newspaper to download and parse the article
                article = ExplorerArticle(current_url)
                article.download()
                if self.site.is_shallow:
                    if int(current_level) > self.level:
                        continue
                # get urls from the article
                for link in article.get_links():
                    url = urljoin(current_url, link.href, False)
                    if self.url_in_filter(url, self.filters):
                        logging.info(u"skipping url \"{0}\" because it matches filter".format(url))
                        continue
                    try:
                        parsed_url = urlparse(url)
                        parsed_as_list = list(parsed_url)
                        if parsed_url.scheme != u"http" and parsed_url.scheme != u"https":
                            logging.info(u"skipping url with invalid scheme: {0}".format(url))
                            continue
                        parsed_as_list[5] = ''  # drop the fragment before normalizing
                        url = urlunparse(urlnorm.norm_tuple(*parsed_as_list))
                    except Exception as e:
                        logging.info(u"skipping malformed url {0}. Error: {1}".format(url, str(e)))
                        continue
                    if not parsed_url.netloc.endswith(self.domain):
                        continue
                    # If the url has already been added to the ignore list, skip it
                    if url in self.ignore_filter:
                        continue
                    # Ignore subscribe links, which appear on many domains, but keep
                    # urls where "subscribe" is only part of a hyphenated slug
                    if u"subscribe" in url and not (u"-subscribe" in url or u"subscribe-" in url):
                        continue
                    # Append the url to the to_visit queue
                    if self.site.is_shallow:
                        self.to_visit.put((url, str(int(current_level) + 1)))
                        logging.info(u"added {0} to to_visit at level {1}".format(
                            url, str(int(current_level) + 1)))
                    else:
                        self.to_visit.put(url)
                        logging.info(u"added {0} to to_visit".format(url))
                    # Record the url so duplicates are skipped on later iterations
                    self.ignore_filter.add(url)
                    ignore_filter_file.write(url.encode('utf8') + "\n")
                # Update the queue
                self.to_visit.task_done()
                return article
        except StopIteration:
            raise
        except Exception:
            # re-raise unchanged; the with statement still closes the ignore file
            raise
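# A sketch of the crawler state this method relies on, written as a hypothetical
# __init__ (attribute names come from the method above; the bloom-filter package,
# argument names and default values are assumptions for illustration).
from Queue import Queue
from pybloom import ScalableBloomFilter   # pybloom_live exposes the same class


def __init__(self, site, domain, filters, level=2, limit=0):
    self.site = site                # provides .name, .url and .is_shallow
    self.domain = domain            # only urls under this domain are queued
    self.filters = filters
    self.level = level              # maximum depth once is_shallow is set
    self.limit = limit              # 0 disables the visited-page limit
    self.visited_count = 0
    self.to_visit = Queue()
    self.to_visit.put(site.url)     # seed the crawl with the site root
    self.ignore_filter = ScalableBloomFilter(
        initial_capacity=10000000, error_rate=0.00001)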