def prepare_html_tree(self, url, domain, html=None, store_script=False,
                      store_style=False, store_urls=True):
    if url.endswith('/'):
        url = url[:-1]
    filetype = utils.get_filetype_from_url(url)
    if filetype:
        # strip the ".<filetype>" suffix from the URL
        url = url[:-(len(filetype) + 1)]
    self._url = url
    self.domain = domain
    self.scheme = utils.get_scheme(url)
    self._html_source = html
    self.clean_html_source()
    self._backup_html_tree = lxml.html.fromstring(self._html_source,
                                                  parser=self._parser)
    # registering all available functions in default namespace
    # xpath2_functions.register_functions(self._backup_html_tree)
    # the <body> node is extracted from the DOM
    self.body_node = self._tree_explorer.get_elements_by_tags(
        self._backup_html_tree, ['body'])[0]
    self.language = self.extract_content_language()
    self._is_news = self._check_webnews_from_meta_tag()
    # all calls that modify the DOM must start from here, not earlier
    self._fix_relative_urls()
    if store_urls:
        self._retrieve_urls()
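# A minimal standalone sketch of the URL normalization that prepare_html_tree()
# performs before parsing: drop a trailing slash, then strip a trailing
# ".<filetype>" suffix. get_filetype_from_url_sketch is a hypothetical stand-in
# for utils.get_filetype_from_url, assumed to return the bare extension for a
# known file type and None otherwise.
def get_filetype_from_url_sketch(url, known=("html", "htm", "php", "asp", "aspx")):
    ext = url.rsplit('.', 1)[-1].lower()
    return ext if ext in known else None

def normalize_url_sketch(url):
    if url.endswith('/'):
        url = url[:-1]                    # drop the trailing slash
    filetype = get_filetype_from_url_sketch(url)
    if filetype:
        url = url[:-(len(filetype) + 1)]  # strip the ".<filetype>" suffix
    return url

# normalize_url_sketch("http://example.com/news/story.html")
#   -> "http://example.com/news/story"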
def get_songs_by_uri(self, uri):
    if not uri:
        return
    songs = []
    uri_scheme = utils.get_scheme(uri)
    if uri_scheme == "file":
        path = utils.get_path_from_uri(uri)
        prefix = os.path.splitext(path)[0]
        cue_file = "%s.%s" % (prefix, "cue")
        if os.path.exists(cue_file):
            try:
                cuesheet = read_cuesheet(path, cue_file)
            except CueException as e:
                # the cuesheet is unreadable: fall back to a plain local song
                print(e)
                song = self.get_or_create_song({"uri": uri}, "local",
                                               read_from_file=True)
                if song:
                    return [song]
                else:
                    return []
            else:
                # one song per track listed in the cuesheet
                for tag in cuesheet.get_tags():
                    s = self.get_or_create_song(tag, "cue",
                                                read_from_file=False)
                    songs.append(s)
                return songs
    song = self.get_or_create_song({"uri": uri}, "local", read_from_file=True)
    if song:
        return [song]
    else:
        return []
def set_song(self, song, play=False, crossfade=None, seek=None):
    uri = song.get("uri")
    mime_type = get_mime_type(uri)
    if mime_type in ["audio/x-scpls", "audio/x-mpegurl",
                     "video/x-ms-asf", "application/xspf+xml"]:
        if get_scheme(song.get("uri")) != "file":
            # remote playlist: fetch it asynchronously before playing
            self.fetch_song = song
            self.emit("fetch-start", song)
            ThreadRun(self.async_fetch, self.play_radio,
                      (song,), (play, crossfade, seek)).start()
        else:
            self.fetch_song = None
            self.__set_song(song, play, crossfade, seek)
    else:
        self.fetch_song = None
        self.__set_song(song, play, crossfade, seek)
def get_tasks(self, job):
    if job not in self.jobs:
        return 0
    new_urls = self.database.get_urls(job)
    self.print_queue.put("got tasks for %s (%d)" % (job, len(new_urls)))
    new_urls = [[https, task.decode("utf8")] for https, task in new_urls]
    new_allowed_urls = []
    for new_https, new_url in new_urls:
        task_scheme = "%s%s" % (utils.get_scheme(new_https), new_url)
        if not self.jobs[job]["robots"] or \
                self.jobs[job]["robots"].allowed(task_scheme, self.info["n"]):
            new_allowed_urls.append([new_https, new_url])
        else:
            # disallowed by robots.txt: just refresh its timestamp
            self.database.timestamp(new_url)
    # only queue the URLs that passed the robots.txt check
    self.jobs[job]["tasks"] += new_allowed_urls
    return len(new_allowed_urls)
def yield_tasks(self):
    while self.working:
        tasks = []
        for job in list(self.jobs):
            if not self.jobs[job]["tasks"]:
                self.fill_queue.put(job)
                continue
            after_delay = self.jobs[job]["timestamp"] + self.jobs[job]["sleep"]
            time_since = after_delay - time.time()
            # emit a task only if the per-job delay has elapsed
            # (or the job has never been scheduled)
            if not time_since > 0 or self.jobs[job]["timestamp"] == 0.0:
                https, task = self.jobs[job]["tasks"].pop(0)
                if not len(self.jobs[job]["tasks"]) > 0:
                    # this job's queue is empty: ask for a refill
                    self.fill_queue.put(job)
                task_scheme = "%s%s" % (utils.get_scheme(https), task)
                tasks.append(task_scheme)
                self.jobs[job]["timestamp"] = time.time()
        if tasks:
            yield tasks
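# A small sketch of the per-job politeness check used in yield_tasks(): a job is
# ready to emit a task once its configured sleep interval has elapsed since its
# last emission, or if it has never emitted (timestamp == 0.0). The dict shape
# mirrors the fields read above; the function name and values are illustrative.
import time

def job_is_ready(job_state):
    ready_at = job_state["timestamp"] + job_state["sleep"]
    return job_state["timestamp"] == 0.0 or time.time() >= ready_at

# job_state = {"timestamp": 0.0, "sleep": 2.0}
# job_is_ready(job_state)              -> True (never scheduled yet)
# job_state["timestamp"] = time.time()
# job_is_ready(job_state)              -> False until ~2 seconds have passed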
def get_scheme(self):
    return utils.get_scheme(self.get("uri"))
def get_category_urls(self, source_url, doc):
    """Takes the source lxml root and the source url, extracts the
    domain, and finds all of the top-level urls; we assume that these
    are the category urls.
    cnn.com --> [cnn.com/latest, world.cnn.com, cnn.com/asia]
    """
    page_urls = self.get_urls(doc)
    valid_categories = []
    for p_url in page_urls:
        scheme = utils.get_scheme(p_url, allow_fragments=False)
        domain = utils.get_domain(p_url, allow_fragments=False)
        path = utils.get_path(p_url, allow_fragments=False)

        if domain:
            child_tld = tldextract.extract(p_url)
            domain_tld = tldextract.extract(source_url)
            child_subdomain_parts = child_tld.subdomain.split('.')
            subdomain_contains = False
            for part in child_subdomain_parts:
                if part == domain_tld.domain:
                    subdomain_contains = True
                    break
            else:
                valid_categories.append(scheme + '://' + domain)
                # TODO account for case where category is in form
                # http://subdomain.domain.tld/category/ <-- still legal!
        else:
            # we want a path with just one subdir
            # cnn.com/world and cnn.com/world/ are both valid_categories
            path_chunks = [x for x in path.split('/') if len(x) > 0]
            if 'index.html' in path_chunks:
                path_chunks.remove('index.html')

            if len(path_chunks) == 1 and len(path_chunks[0]) < 14:
                valid_categories.append(domain + path)

    stopwords = [
        'about', 'help', 'privacy', 'legal', 'feedback', 'sitemap',
        'profile', 'account', 'mobile', 'sitemap', 'facebook', 'myspace',
        'twitter', 'linkedin', 'bebo', 'friendster', 'stumbleupon',
        'youtube', 'vimeo', 'store', 'mail', 'preferences', 'maps',
        'password', 'imgur', 'flickr', 'search', 'subscription', 'itunes',
        'siteindex', 'events', 'stop', 'jobs', 'careers', 'newsletter',
        'subscribe', 'academy', 'shopping', 'purchase', 'site-map',
        'shop', 'donate', 'newsletter', 'product', 'advert', 'info',
        'tickets', 'coupons', 'forum', 'board', 'archive', 'browse',
        'howto', 'how to', 'faq', 'terms', 'charts', 'services',
        'contact', 'plus', 'admin', 'login', 'signup', 'register',
        'developer', 'proxy']

    _valid_categories = []

    # TODO Stop spamming urlparse and tldextract calls...

    for p_url in valid_categories:
        path = utils.get_path(p_url)
        subdomain = tldextract.extract(p_url).subdomain
        conjunction = path + ' ' + subdomain
        bad = False
        for badword in stopwords:
            if badword.lower() in conjunction.lower():
                bad = True
                break
        if not bad:
            _valid_categories.append(p_url)

    _valid_categories.append('/')  # add the root

    for i, p_url in enumerate(_valid_categories):
        if p_url.startswith('://'):
            p_url = 'http' + p_url
            _valid_categories[i] = p_url
        elif p_url.startswith('//'):
            p_url = 'http:' + p_url
            _valid_categories[i] = p_url
        if p_url.endswith('/'):
            p_url = p_url[:-1]
            _valid_categories[i] = p_url

    _valid_categories = list(set(_valid_categories))

    category_urls = [utils.prepare_url(p_url, source_url)
                     for p_url in _valid_categories]
    category_urls = [c for c in category_urls if c is not None]
    return category_urls
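# A brief illustration of the tldextract comparison in get_category_urls(): each
# part of the candidate URL's subdomain is compared against the registered
# domain of the source URL (e.g. espn.go.com vs espn.com). The helper name is
# illustrative, not part of the original code.
import tldextract

def subdomain_part_matches_source(candidate_url, source_url):
    child_tld = tldextract.extract(candidate_url)
    domain_tld = tldextract.extract(source_url)
    return domain_tld.domain in child_tld.subdomain.split('.')

# subdomain_part_matches_source('http://espn.go.com', 'http://espn.com')   -> True
# subdomain_part_matches_source('http://world.cnn.com', 'http://cnn.com')  -> False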