def dl(self):
    """Download the highest-quality picture available.

    Prefers the original-resolution file, then the HQ URL, then the LQ
    URL. Returns True on success, False if the download fails.
    """
    if self.orig_url == "":
        # No original-resolution page known: grab HQ if present, else LQ.
        fallback_url = self.lq_url if self.hq_url == "" else self.hq_url
        fetch = Download(fallback_url, self.config.get_image_folder())
        if fetch.perform():
            return True
        return False
    # An original-resolution page exists: fetch it, scrape the real file
    # link out of the HTML, then download the file itself.
    page = Download(self.orig_url, as_var=True)
    if not page.perform():
        return False
    soup = BeautifulSoup(page.get_result().getvalue())
    download_link = soup.find("a", text="this link")
    orig_url = self.dl_url_base + download_link["href"]
    # Fixed wait before fetching the file — presumably gives the server
    # time to prepare the original, or throttles requests; TODO confirm.
    time.sleep(120)
    file_fetch = Download(orig_url, self.config.get_image_folder())
    if file_fetch.perform():
        self.file_name = file_fetch.get_output_name()
        return True
    return False
def file_exists(self, file_path):
    """Check whether a file with the same SHA1 already exists on Wikimedia Commons.

    Hashes the local file and queries the Commons `allimages` API by
    `aisha1`. Returns True when at least one matching image is found,
    False when none match or the API request fails.
    """
    hash_local = self.hash_file(file_path)
    download = Download(
        ("https://commons.wikimedia.org/w/api.php?action=query&list"
         "=allimages&format=json&aisha1=") + hash_local, as_var=True)
    if download.perform():
        content = download.get_result().getvalue()
        json_data = json.loads(content)
        # Non-empty result list means an identical file is already uploaded.
        return len(json_data["query"]["allimages"]) > 0
    # Bug fix: the original fell off the end and returned None when the
    # API request failed; make the falsy result an explicit bool.
    return False
def parse_web(self):
    # Scrape the paginated query-result listing and insert each image row
    # into the local DB, remembering the newest image id per mission.
    # NOTE(review): formatting reconstructed from a whitespace-mangled
    # source; the indentation level of the trailing
    # update_mission_image_id(..., str(0)) call is assumed — verify.
    down = Download(self.url, as_var=True, post_dict=self.post_dict)
    found_start = False  # becomes True once self.new_start's row is seen
    can_add = False      # rows are only inserted after the start point
    if(down.perform()):
        web_string_etree = etree.fromstring(down.get_result().getvalue())
        for element in web_string_etree.iter("script"):
            # The POST response presumably embeds a redirect target inside
            # a <script> tag; the second double-quoted token is taken as
            # the results-page URL — TODO confirm against a live response.
            redirect_url = element.text
            redirect_url_array = redirect_url.split("\"")
            down = Download(self.base_url + redirect_url_array[1],
                            as_var=True)
            if(down.perform()):
                string_etree = html.fromstring(
                    down.get_result().getvalue())
                table = string_etree.xpath("//table[@id='QueryResults']")
                for element in table[0].iter("tr"):
                    list_of_elements = list(element.iter("td"))
                    # NOTE(review): guard requires only 6 <td> cells but
                    # indexes [6] and [7] below need 8 — possible
                    # IndexError on short rows; confirm expected layout.
                    if(len(list_of_elements) > 5):
                        a = list(list_of_elements[0].iter("a"))
                        # Start inserting once the previously recorded
                        # start image was passed (or when no_need is set).
                        if(found_start or self.no_need):
                            can_add = True
                        if(self.new_start):
                            if(self.new_start == a[0].text
                                    and not found_start):
                                found_start = True
                        if(can_add):
                            self.db.insert_image(
                                a[0].attrib["href"],
                                a[0].text,
                                self.parse_date(
                                    list_of_elements[1].text),
                                list_of_elements[2].text,
                                list_of_elements[3].text,
                                list_of_elements[4].text,
                                list_of_elements[5].text,
                                list_of_elements[6].text,
                                list_of_elements[7].text,
                                self.mission_id, False, False)
                            # Track the most recently inserted image id
                            # so the next run can resume after it.
                            self.db.update_mission_image_id(
                                self.mission_id, a[0].text)
        # Presumably resets the resume marker after a full pass —
        # TODO confirm intended placement and semantics.
        self.db.update_mission_image_id(
            self.mission_id, str(0))
def find_urls(self):
    """Find the download URLs with different qualities and save them.

    Populates self.lq_url / self.hq_url from the page's "DownloadLink"
    anchors and, when present, self.orig_url from the "Other options
    available:" script block. Silently returns if the page download
    fails or the expected elements are missing.
    """
    down = Download(self.url, as_var=True)
    if not down.perform():
        return
    soup = BeautifulSoup(down.get_result().getvalue())
    download_links = soup.find_all("a", {"class": "DownloadLink"})
    if download_links:
        self.lq_url = download_links[0]["href"]
        # Bug fix: the original indexed download_links[1] unconditionally,
        # raising IndexError on pages exposing a single quality link.
        if len(download_links) > 1:
            self.hq_url = download_links[1]["href"]
        marker = soup.find(text="Other options available:")
        # Bug fix: guard against pages without the "other options" block,
        # where find() returns None and .find_next() would raise.
        if marker is None:
            return
        raw_link = marker.find_next("script").text
        # Extract the file path from the href inside the script text.
        m = re.search(r"href=..(.*\.\b[a-zA-Z0-9]+\b)", raw_link)
        if m:
            self.orig_url = self.url_base + "/" + m.group(1)
def parse_web(self):
    """Scrape the mission overview table and store each mission in the DB.

    Does nothing if the page download fails. Rows with fewer than six
    cells are skipped.
    """
    page = Download(self.url, as_var=True)
    if not page.perform():
        return
    soup = BeautifulSoup(page.get_result().getvalue())
    # The mission table directly follows its caption text on the page.
    mission_table = soup.find(
        text="Missions used in the Database").find_next("table")
    for row in mission_table.find("tbody").find_all("tr"):
        cells = list(row.children)
        if len(cells) > 5:
            self.db.insert_mission(cells[0].text,
                                   cells[1].text,
                                   cells[2].text,
                                   self.parse_date(cells[3].text),
                                   self.parse_date(cells[4].text),
                                   cells[5].text)
def find_online_category(self, term):
    """Query the category API for *term*.

    Returns the raw download result on success, otherwise None.
    """
    down = Download(self.base_api + urllib.quote(term), as_var=True)
    if not down.perform():
        return None
    return down.get_result()