Пример #1
0
def prepareURLsForNextCrawl(urls):
    """Sanitize crawl candidates and return them deduplicated.

    URLs that point at script/executable resources are collapsed to
    their domain before being re-queued; whitelisted URLs are dropped.
    """
    prepared = []
    for raw_url in urls:
        candidate = utils.sanitize_url(raw_url)
        # NOTE(review): "exe" has no leading dot, unlike ".js" — confirm
        # this is intentional and not a typo for ".exe".
        if candidate.endswith((".js", "exe")):
            candidate = utils.sanitize_url(utils.domainOf(candidate))
        prepared.append(candidate)
    kept = [u for u in prepared if not isWhiteListedURL(u)]
    return list(set(kept))
Пример #2
0
 def __init__(self, url, status_code=200):
     self.url = utils.sanitize_url(url)
     print"Creating Webpage object for url: ", self.url
     self.url_hash = self._url_hash()
     self.status_code = 200 #self._get_status_code()
     self.domain = self._domain()
     self.host = "0.0.0.0" #self._host()
     self.geo_location = "None"#utils.getGeoLocation(str(self.host))
     self.incoming_links = set([])
     self.a_links = set([])
     self.js_links = set([])
Пример #3
0
def generate_links(link, base_link):
    """Fetch *link* and return the sanitized, valid <a href> URLs found.

    Link collection is best-effort by design: on any request/parsing
    failure the URLs gathered so far (possibly none) are returned.
    """
    a_url_list = []
    try:
        response = requests.get(link)
        # BeautifulSoup builds a DOM tree of the HTML document for
        # searching and manipulation.
        dom_tree = BeautifulSoup.BeautifulSoup(response.text)
        for a_element in dom_tree.fetch('a'):  # all <a> elements
            a_url = a_element.get('href')
            if utils.is_valid_url(a_url, base_link):
                a_url_list.append(utils.sanitize_url(a_url, base_link))
    except Exception:
        # Bug fix: was a bare `except:`, which also intercepted
        # KeyboardInterrupt/SystemExit; narrowed while keeping the
        # best-effort contract.
        pass
    return a_url_list
Пример #4
0
def crawl_url(url, headless=True, save_into_db=True):
    print "Crawling URL",url
    iurl_hash = utils.get_url_hash(url)
    update = {iurl_hash: url}

    db = db_utils.getDBInstance()  
    if regex_domain_match(db, url):
        print "Skipping: ",url
        
    db.crawl_queue.remove(update)
    url = utils.sanitize_url(url)
    url_hash = utils.get_url_hash(url)
    db_query = {'url_hash': url_hash}

    if headless:
        display = Display(visible=0, size=(800, 600))
        display.start()

    
    obj_in_db = db.webpages.find_one(db_query)
    
    webpage = None
    if not obj_in_db:
        webpage = WebPage(url)


    browser = webdriver.Chrome(desired_capabilities=capabilities)
    browser.set_page_load_timeout(30)
    #browser.implicitly_wait(5)
    try:
        print "Visiting page: ",url
        if not url.startswith("http"):
            raise Exception

        browser.get(url)
        #time.sleep(1)
    except Exception, e:
        print "Error Occured"
        browser.quit()
        print e
        return -1 
Пример #5
0
    def download_audio(self, path_to_save=None) -> None:
        """
        Downloads only the audio from the video.
        Format: .mp3

        (Useful when downloading songs from YouTube)

        Raises ValueError when no medium-quality audio stream exists,
        and re-raises any request error after logging it.
        """
        # Lazily build the soup / json dict / stream tables on first use.
        if not self._src_page_soup:
            self._create_soup()
            self._create_json_dict()
            self._video_streams, self._audio_streams = self._extract_streams()

        # apparently YT serves medium quality audio as its highest quality
        audio_src_url: str = next(
            (stream["src_url"] for stream in self._audio_streams
             if stream["audio_quality"] == "AUDIO_QUALITY_MEDIUM"),
            "")

        # Bug fix: previously an empty URL slipped through to requests.get,
        # which failed with an obscure MissingSchema error.
        if not audio_src_url:
            raise ValueError("no AUDIO_QUALITY_MEDIUM stream available")

        # clean the url first
        audio_src_url = utils.sanitize_url(audio_src_url)

        print("::-> Downloading the audio file...")
        # request the audio source; narrowed from a bare `except:` so that
        # KeyboardInterrupt / SystemExit are not intercepted.
        try:
            audio_resp: requests.Response = requests.get(
                audio_src_url, headers=utils.request_headers(), stream=True)
            audio_resp.raise_for_status()
        except requests.exceptions.RequestException:
            print("::-> An error occurred while requesting the file")
            raise

        # save to disk with is_video not set
        utils.save_to_disk(audio_resp,
                           self.get_video_title(),
                           path_to_save,
                           is_video=False)
        print("Done!\n")
Пример #6
0
    def download(self, vid_format: str, path_to_save=None) -> None:
        """
        Downloads the video.
        Current resolutions supported: all

        vid_format -- quality label (e.g. "720p") matched against the
                      available streams.
        path_to_save -- target directory; defaults to the current
                        directory when combining separate streams.
        """
        if not vid_format:
            print("\n::-> Error: quality/resolution must not be None\n")
            # NOTE(review): exit(1) in a library method kills the host
            # process — consider raising ValueError instead.
            exit(1)

        # Lazily build the soup / json dict / stream tables on first use.
        if not self._src_page_soup:
            self._create_soup()
            self._create_json_dict()
            self._video_streams, self._audio_streams = self._extract_streams()

        vid_src_url = None  # muxed (video + audio) source
        vid_wa_url = None   # video-without-audio source
        for stream in self._video_streams:
            if stream["quality_label"] == vid_format:
                # A comma in the mime type means two codecs, i.e. the
                # stream already carries both video and audio.
                if "," in stream["mime_type"]:
                    vid_src_url = stream["src_url"]
                else:
                    vid_wa_url = stream["src_url"]
                break

        if vid_src_url:
            # got the muxed source url
            vid_src_url = utils.sanitize_url(vid_src_url)

            print("::-> Download in progress...")
            # ? get the response from the src url in chunks (stream=True)
            try:
                response: requests.Response = requests.get(
                    vid_src_url, headers=utils.request_headers(), stream=True)
                response.raise_for_status()
            except requests.exceptions.RequestException:
                # narrowed from a bare `except:`
                print("::-> An error occurred while requesting the file.")
                raise

            utils.save_to_disk(response,
                               self.get_video_title(),
                               path_to_save,
                               is_video=True)

            # endif

        # ? When the video and audio urls are different
        elif vid_wa_url:
            # clean the url
            vid_wa_url = utils.sanitize_url(vid_wa_url)

            # download audio and video files to be combined
            self.download_audio(path_to_save)
            print("::-> Downloading the video file...")
            self._download_video(vid_wa_url, path_to_save)

            # Bug fix: path_to_save defaults to None, which used to crash
            # on the trailing-slash check below; fall back to the
            # current directory.
            if not path_to_save:
                path_to_save = "./"
            elif not path_to_save.endswith("/"):
                path_to_save += "/"

            # get to know which video and audio files need to be combined
            vid_filelist: list = glob.glob(path_to_save + "*.mp4")
            last_vid_file: str = max(vid_filelist, key=os.path.getctime)
            audio_filelist: list = glob.glob(path_to_save + "*.mp3")
            last_audio_file: str = max(audio_filelist, key=os.path.getctime)

            # use ffmpeg to combine both, audio and video
            print(
                "::-> Combining the audio and video files into one video file..."
            )

            # keep the console clean
            cmd: str = f'ffmpeg -v quiet -i "{last_vid_file}" -i "{last_audio_file}" -map 0:v:0 -map 1:a:0 "{self.get_video_title()}_final.mp4"'
            # finally execute the command
            ffmpeg_exitcode = os.system(cmd)
            if ffmpeg_exitcode != 0:
                # Bug fix: the exit code was silently discarded before;
                # surface a failed mux instead of claiming success only.
                print("::-> Warning: ffmpeg exited with code", ffmpeg_exitcode)

            # delete the downloaded parts so only the combined file remains
            try:
                os.remove(last_vid_file)
                os.remove(last_audio_file)
            except OSError:
                pass

        # endif
        print("Successfully downloaded the video/audio titled: ")
        print(self.get_video_title())
        print("\nDownload is complete.\n")