def prepareURLsForNextCrawl(urls):
    new_urls = []
    for url in urls:
        url = utils.sanitize_url(url)
        # Reduce file URLs (.js/.exe) to their bare domain before re-crawling
        if url.endswith(".js") or url.endswith(".exe"):
            url = utils.domainOf(url)
            url = utils.sanitize_url(url)
        new_urls.append(url)
    # Drop whitelisted URLs, then deduplicate
    new_urls = [url for url in new_urls if not isWhiteListedURL(url)]
    return list(set(new_urls))
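# `isWhiteListedURL` is called above but not defined in this excerpt. A
# minimal sketch of such a check, assuming the whitelist is a set of
# domains (the names and storage here are illustrative, not the project's
# actual implementation):
from urllib.parse import urlparse

WHITELISTED_DOMAINS = {"example.com", "trusted.org"}  # hypothetical list

def isWhiteListedURL(url):
    # True when the URL's host is (or is a subdomain of) a whitelisted domain
    host = urlparse(url).hostname or ""
    return any(host == d or host.endswith("." + d) for d in WHITELISTED_DOMAINS)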
def __init__(self, url, status_code=200):
    self.url = utils.sanitize_url(url)
    print("Creating WebPage object for url:", self.url)
    self.url_hash = self._url_hash()
    self.status_code = status_code  # self._get_status_code()
    self.domain = self._domain()
    self.host = "0.0.0.0"  # self._host()
    self.geo_location = "None"  # utils.getGeoLocation(str(self.host))
    self.incoming_links = set()
    self.a_links = set()
    self.js_links = set()
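# Neither `_url_hash` nor `utils.get_url_hash` (used by the crawler below)
# appears in this excerpt. A plausible sketch: hash the sanitized URL into a
# stable hex digest to use as a lookup key. The digest choice is an
# assumption; the project's real helper may differ.
import hashlib

def get_url_hash(url):
    return hashlib.md5(url.encode("utf-8")).hexdigest()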
import requests
from bs4 import BeautifulSoup

def generate_links(link, base_link):
    """Fetch `link` and return the sanitized, valid anchor URLs it contains."""
    a_url_list = []
    try:
        response = requests.get(link)
        # BeautifulSoup builds a DOM tree of the HTML document
        # for searching and manipulating its elements
        dom_tree = BeautifulSoup(response.text, "html.parser")
        a_element_list = dom_tree.find_all('a')  # <a> elements in the document
        for a_element in a_element_list:
            a_url = a_element.get('href')
            if utils.is_valid_url(a_url, base_link):
                a_url_list.append(utils.sanitize_url(a_url, base_link))
    except requests.RequestException:
        return a_url_list
    return a_url_list
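# Example usage (requires network access); the second argument anchors
# relative hrefs found on the page:
if __name__ == "__main__":
    for found_url in generate_links("https://example.com/", "https://example.com"):
        print(found_url)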
from pyvirtualdisplay import Display
from selenium import webdriver

# `utils`, `db_utils`, `WebPage`, and the Chrome `capabilities` dict are
# defined elsewhere in the project.
def crawl_url(url, headless=True, save_into_db=True):
    print("Crawling URL:", url)
    iurl_hash = utils.get_url_hash(url)
    update = {iurl_hash: url}
    db = db_utils.getDBInstance()
    # Skip (and dequeue) URLs whose domain matches the blocklist
    if regex_domain_match(db, url):
        print("Skipping:", url)
        db.crawl_queue.remove(update)
        return 0
    url = utils.sanitize_url(url)
    url_hash = utils.get_url_hash(url)
    db_query = {'url_hash': url_hash}
    if headless:
        # Run the browser inside a virtual framebuffer (no visible window)
        display = Display(visible=0, size=(800, 600))
        display.start()
    obj_in_db = db.webpages.find_one(db_query)
    webpage = None
    if not obj_in_db:
        webpage = WebPage(url)
    browser = webdriver.Chrome(desired_capabilities=capabilities)
    browser.set_page_load_timeout(30)
    # browser.implicitly_wait(5)
    try:
        print("Visiting page:", url)
        if not url.startswith("http"):
            raise Exception("URL must start with http(s)")
        browser.get(url)
        # time.sleep(1)
    except Exception as e:
        print("Error occurred")
        browser.quit()
        print(e)
        return -1
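# `regex_domain_match` is referenced above but not shown. A hedged sketch,
# assuming blocklist patterns live in a MongoDB-style collection (the
# `blocked_domains` collection and `pattern` field are assumptions):
import re

def regex_domain_match(db, url):
    domain = utils.domainOf(url)
    for entry in db.blocked_domains.find():
        if re.search(entry["pattern"], domain):
            return True  # domain matches a blocklist pattern; skip it
    return False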
def download_audio(self, path_to_save=None) -> None:
    """
    Downloads only the audio from the video. Format: .mp3
    (Useful when downloading songs from YouTube)
    """
    # Build the soup and JSON dict if they don't exist yet
    if not self._src_page_soup:
        self._create_soup()
        self._create_json_dict()
        self._video_streams, self._audio_streams = self._extract_streams()
    audio_src_url: str = ""
    for audio_stream in self._audio_streams:
        # apparently YT serves medium quality audio as its highest quality
        if audio_stream["audio_quality"] == "AUDIO_QUALITY_MEDIUM":
            audio_src_url = audio_stream["src_url"]
            break
    # clean the url first
    audio_src_url = utils.sanitize_url(audio_src_url)
    print("::-> Downloading the audio file...")
    # request the audio source, streaming the body in chunks
    try:
        audio_resp: requests.Response = requests.get(
            audio_src_url, headers=utils.request_headers(), stream=True)
        audio_resp.raise_for_status()
    except requests.RequestException:
        print("::-> An error occurred while requesting the file")
        raise
    # save to disk with is_video=False so the file is written as audio
    utils.save_to_disk(audio_resp, self.get_video_title(),
                       path_to_save, is_video=False)
    print("Done!\n")
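# `utils.save_to_disk` is not shown in this excerpt. A sketch of what it
# likely does, streaming the response body to disk in chunks so large files
# never sit fully in memory (the file-naming scheme here is an assumption):
import os

def save_to_disk(resp, title, path_to_save=None, is_video=True):
    ext = ".mp4" if is_video else ".mp3"
    filepath = os.path.join(path_to_save or ".", title + ext)
    with open(filepath, "wb") as f:
        for chunk in resp.iter_content(chunk_size=64 * 1024):
            if chunk:  # skip keep-alive chunks
                f.write(chunk)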
def download(self, vid_format: str, path_to_save=None) -> None:
    """
    Downloads the video. Current resolutions supported: all
    """
    if not vid_format:
        print("\n::-> Error: quality/resolution must not be None\n")
        exit(1)
    # Build the soup and JSON dict if they aren't created yet
    if not self._src_page_soup:
        self._create_soup()
        self._create_json_dict()
        self._video_streams, self._audio_streams = self._extract_streams()
    vid_src_url: str = ""
    vid_wa_url: str = ""  # video-without-audio url
    for stream in self._video_streams:
        if stream["quality_label"] == vid_format:
            # A comma in the mime type means the stream carries both the
            # video and audio codecs; otherwise it is video only
            if re.search(",", stream["mime_type"]):
                vid_src_url = stream["src_url"]
            else:
                vid_wa_url = stream["src_url"]
            break
    if vid_src_url:
        # got a combined (video + audio) source url
        vid_src_url = utils.sanitize_url(vid_src_url)
        print("::-> Download in progress...")
        # get the response from the src url in chunks (stream=True)
        try:
            response: requests.Response = requests.get(
                vid_src_url, headers=utils.request_headers(), stream=True)
            response.raise_for_status()
        except requests.RequestException:
            print("::-> An error occurred while requesting the file.")
            raise
        utils.save_to_disk(response, self.get_video_title(),
                           path_to_save, is_video=True)
    elif vid_wa_url:
        # the video and audio streams are served separately
        vid_wa_url = utils.sanitize_url(vid_wa_url)
        # download the audio and video files to be combined
        self.download_audio(path_to_save)
        print("::-> Downloading the video file...")
        self._download_video(vid_wa_url, path_to_save)
        # find the most recent video and audio files to combine
        if not path_to_save.endswith("/"):
            path_to_save += "/"
        vid_filelist: list = glob.glob(path_to_save + "*.mp4")
        last_vid_file: str = max(vid_filelist, key=os.path.getctime)
        audio_filelist: list = glob.glob(path_to_save + "*.mp3")
        last_audio_file: str = max(audio_filelist, key=os.path.getctime)
        # use ffmpeg to combine the audio and video; -v quiet keeps the
        # console clean
        print("::-> Combining the audio and video files into one video file...")
        cmd: str = (f'ffmpeg -v quiet -i "{last_vid_file}" -i "{last_audio_file}" '
                    f'-map 0:v:0 -map 1:a:0 "{self.get_video_title()}_final.mp4"')
        ffmpeg_exitcode = os.system(cmd)
        # delete the intermediate files so only the combined file remains
        try:
            os.remove(last_vid_file)
            os.remove(last_audio_file)
        except OSError:
            pass
    print("Successfully downloaded the video/audio titled:")
    print(self.get_video_title())
    print("\nDownload is complete.\n")
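# A note on the muxing step: `-map 0:v:0 -map 1:a:0` picks the first video
# stream of the first input and the first audio stream of the second. When
# adapting this, `subprocess.run` with an argument list avoids the shell
# quoting pitfalls `os.system` has with titles containing quotes. A sketch
# using the same ffmpeg flags (the function name is hypothetical):
import subprocess

def mux_av(video_path: str, audio_path: str, out_path: str) -> int:
    cmd = ["ffmpeg", "-v", "quiet", "-i", video_path, "-i", audio_path,
           "-map", "0:v:0", "-map", "1:a:0", out_path]
    return subprocess.run(cmd).returncode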