def read_file_content(self, file_url=None):
    """Return name of temp file in which remote file is saved."""
    if not file_url:
        file_url = self.url
        pywikibot.warning("file_url is not given. "
                          "Set to self.url by default.")
    pywikibot.output(u'Reading file %s' % file_url)
    resume = False
    rlen = 0
    _contents = None
    dt = 15
    uo = URLopener()
    retrieved = False
    while not retrieved:
        if resume:
            pywikibot.output(u"Resume download...")
            uo.addheader('Range', 'bytes=%s-' % rlen)

        infile = uo.open(file_url)

        if 'text/html' in infile.info().getheader('Content-Type'):
            pywikibot.output(u"Couldn't download the image: "
                             "the requested URL was not found on server.")
            return

        content_len = infile.info().getheader('Content-Length')
        accept_ranges = infile.info().getheader('Accept-Ranges') == 'bytes'

        if resume:
            _contents += infile.read()
        else:
            _contents = infile.read()

        infile.close()
        retrieved = True

        if content_len:
            rlen = len(_contents)
            content_len = int(content_len)
            if rlen < content_len:
                retrieved = False
                pywikibot.output(
                    u"Connection closed at byte %s (%s left)"
                    % (rlen, content_len))
                if accept_ranges and rlen > 0:
                    resume = True
                pywikibot.output(u"Sleeping for %d seconds..." % dt)
                time.sleep(dt)
                if dt <= 60:
                    dt += 15
                elif dt < 360:
                    dt += 60
        else:
            pywikibot.log(
                u"WARNING: length check of retrieved data not possible.")

    handle, tempname = tempfile.mkstemp()
    with os.fdopen(handle, "wb") as t:
        t.write(_contents)
    return tempname

class RemoteFile(object):

    def __init__(self, url):
        self.opener = URLopener()
        self.url = url
        self.filename = url.rstrip('/').rsplit('/', 1)[-1]
        self.offset = 0

    def seek(self, offset, whence=0):
        assert whence == 0
        self.offset = offset

    def read(self, size):
        start = self.offset
        end = start + size - 1
        assert end > start
        h = 'Range', 'bytes={}-{}'.format(start, end)
        stderr.write('Fetching {} {}\n'.format(self.filename, h[1]))
        self.opener.addheaders.append(h)
        data = self.opener.open(self.url).read()
        return data

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

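# Usage sketch for the RemoteFile class above. This is an illustrative example,
# not part of the original code: the URL is a placeholder and the server is
# assumed to honour HTTP Range requests. Using the class as a context manager
# would additionally require a close() method, which __exit__ calls but the
# snippet above does not define.
remote = RemoteFile('http://example.com/big-archive.zip')
remote.seek(1024)           # the next read starts 1024 bytes into the file
chunk = remote.read(4096)   # sends 'Range: bytes=1024-5119' and returns data
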
def get_imagelinks(url):
    """Given a URL, get all images linked to by the page at that URL."""
    # Check if BeautifulSoup is imported.
    if isinstance(BeautifulSoup, ImportError):
        raise BeautifulSoup

    links = []
    uo = URLopener()
    with uo.open(url) as f:
        soup = BeautifulSoup(f.read())

    if not shown:
        tagname = "a"
    elif shown == "just":
        tagname = "img"
    else:
        tagname = ["a", "img"]

    for tag in soup.findAll(tagname):
        link = tag.get("src", tag.get("href", None))
        if link:
            ext = os.path.splitext(link)[1].lower().strip('.')
            if ext in fileformats:
                links.append(urllib.basejoin(url, link))
    return links

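# Usage sketch for get_imagelinks() above. The function relies on module-level
# globals; the values below are illustrative assumptions, not from the source:
# 'shown' selects whether to follow <a> links, <img> tags, or both, and
# 'fileformats' lists the file extensions to keep.
shown = False
fileformats = ('jpg', 'jpeg', 'png', 'gif')
for image_url in get_imagelinks('http://example.com/gallery.html'):
    print(image_url)
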
def call_api(url, wait=1):
    time.sleep(wait)
    req = URLopener()
    req.addheader('Authorization', 'token ' + TOKEN)
    fp = req.open(url)
    data = json.load(fp)
    fp.close()
    return data

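# Usage sketch for call_api() above, e.g. against a GitHub-style JSON API.
# Assumptions, not from the source: TOKEN is defined at module level and the
# endpoint below is a placeholder.
repo = call_api('https://api.github.com/repos/example-org/example-repo')
print(repo.get('full_name'))
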
class HttpFetcherBasic(HttpFetcher):

    def __init__(self, url):
        super().__init__(url)
        self.urlop = URLopener()
        for hdr, val in (tuple(x.split("=", 1)) if "=" in x else (x, "")
                         for x in url.fragment.split("&") if x):
            self.urlop.addheader(hdr, val)

    def open(self, url):
        return self.urlop.open(url)

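# Sketch of how HttpFetcherBasic above might be constructed. Assumptions, not
# from the source: 'url' is a parsed URL object (e.g. a urllib.parse
# SplitResult), so extra HTTP headers are packed into the fragment as
# 'Name=Value' pairs joined by '&'; HttpFetcher is the (unshown) base class,
# which is why this is only a commented-out sketch.
#
#     from urllib.parse import urlsplit
#     parts = urlsplit('http://example.com/data.bin#User-Agent=my-tool&X-Debug=1')
#     fetcher = HttpFetcherBasic(parts)   # registers User-Agent and X-Debug headers
#     response = fetcher.open('http://example.com/data.bin')
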
class RemoteFile(object):

    def __init__(self, url):
        self.opener = URLopener()
        self.url = url
        self.filename = url.rstrip('/').rsplit('/', 1)[-1]
        self.offset = 0

    def seek(self, offset, whence=0):
        assert whence == 0
        self.offset = offset

    def read(self, size):
        start = self.offset
        end = start + size - 1
        assert end > start
        h = 'Range', 'bytes={}-{}'.format(start, end)
        stderr.write('Fetching {} {}\n'.format(self.filename, h[1]))
        self.opener.addheaders.append(h)
        data = self.opener.open(self.url).read()
        return data

def get_imagelinks(url):
    """Given a URL, get all images linked to by the page at that URL."""
    links = []
    uo = URLopener()
    file = uo.open(url)
    soup = BeautifulSoup.BeautifulSoup(file.read())
    file.close()

    if not shown:
        tagname = "a"
    elif shown == "just":
        tagname = "img"
    else:
        tagname = ["a", "img"]

    for tag in soup.findAll(tagname):
        link = tag.get("src", tag.get("href", None))
        if link:
            ext = os.path.splitext(link)[1].lower().strip('.')
            if ext in fileformats:
                links.append(urllib.basejoin(url, link))
    return links

def test6(url = "http://example.com"):
    od = URLopener()
    # ruleid: insecure-urlopener-open
    od.open(url)

def read_file_content(self, file_url=None):
    """Return name of temp file in which remote file is saved."""
    if not file_url:
        file_url = self.url
        pywikibot.warning('file_url is not given. '
                          'Set to self.url by default.')
    pywikibot.output('Reading file {}'.format(file_url))
    resume = False
    rlen = 0
    _contents = None
    dt = 15
    uo = URLopener()
    retrieved = False
    while not retrieved:
        if resume:
            pywikibot.output('Resume download...')
            uo.addheader('Range', 'bytes={}-'.format(rlen))

        with closing(uo.open(file_url)) as infile:
            info = infile.info()
            info_get = info.get
            content_type = info_get('Content-Type')
            content_len = info_get('Content-Length')
            accept_ranges = info_get('Accept-Ranges')

            if 'text/html' in content_type:
                pywikibot.output(
                    "Couldn't download the image: "
                    'the requested URL was not found on server.')
                return

            valid_ranges = accept_ranges == 'bytes'

            if resume:
                _contents += infile.read()
            else:
                _contents = infile.read()

        retrieved = True
        if content_len:
            rlen = len(_contents)
            content_len = int(content_len)
            if rlen < content_len:
                retrieved = False
                pywikibot.output(
                    'Connection closed at byte {} ({} left)'.format(
                        rlen, content_len))
                if valid_ranges and rlen > 0:
                    resume = True
                pywikibot.output('Sleeping for {} seconds...'.format(dt))
                pywikibot.sleep(dt)
                if dt <= 60:
                    dt += 15
                elif dt < 360:
                    dt += 60
        else:
            pywikibot.log(
                'WARNING: length check of retrieved data not possible.')

    handle, tempname = tempfile.mkstemp()
    with os.fdopen(handle, 'wb') as t:
        t.write(_contents)
    return tempname

os.mkdir(vol_dir)
url_1 = "ftp://s220ftp.tipo.gov.tw/PatentPubXML_" + str(num) + "/"
files = get_ftp_ls(url_1)
files = [f.decode() for f in files]
dirs = [d for d in files if "." not in d]
print("num of dirs in " + str(num) + " folder: " + str(len(dirs)))

for d in dirs[:]:
    url_2 = url_1 + d + "/"
    files = get_ftp_ls(url_2)
    filename = [f.decode() for f in files if f.lower().endswith(b'.xml')]
    if len(filename) < 1:
        print("fail url:", url_2)
    else:
        filename = filename[0]
        if filename not in os.listdir(vol_dir):
            url_3 = url_2 + filename
            store_path = os.path.join(vol_dir, filename)
            opener = URLopener()
            with opener.open(url_3) as remote_file, open(
                    store_path, 'wb') as local_file:
                shutil.copyfileobj(remote_file, local_file)
            document_count += 1
            if document_count % 1000 == 0:
                print("num of docs downloaded:", document_count)

start_num += 1

def test2():
    od = URLopener()
    # ruleid: insecure-urlopener-open-ftp
    url = "ftp://example.com"
    od.open(url)

def test6_ok(url = "https://example.com"):
    od = URLopener()
    # ok: insecure-urlopener-open
    od.open(url)

# easy manipulation.
soup = BeautifulSoup(result.content, "html.parser")
print('Page loaded successfully.')

# Find all the images with a width of '160'; this makes it possible to select
# the needed images, because they are the only ones with this size.
imgs = soup.findAll('img', width='160')
names = []  # Used to store the paths of the images on the page.

# Loop over the iterable result of the search to get the path from the 'src'
# attribute.
for x in imgs:
    names.append(x['src'])

# Loop over the paths of the needed images to read each one, open it, turn it
# into a PIL Image object and save it under a proper name.
for x in names:
    img = opener.open(MAIN_PATH + x)
    img = img.read()
    img = Image.open(BytesIO(img))
    IMG_NAME = '_'.join((x[15:25], x[78:80], '00')) + '.gif'
    img.save(os.path.join(SAVE_FOLDER, IMG_NAME))
    img.close()

print('Images saved correctly in ' + SAVE_FOLDER)
new_imgs = os.listdir(SAVE_FOLDER)
for img in new_imgs:
    print("---> " + img)

# Keep the prompt from closing immediately.
input('Press ENTER to exit')

def test1():
    od = URLopener()
    # ruleid: insecure-urlopener-open-ftp
    od.open("ftp://example.com")

def test2_ok():
    od = URLopener()
    # ok: insecure-urlopener-open-ftp
    url = "ftps://example.com"
    od.open(url)

def test1_ok():
    od = URLopener()
    # ok: insecure-urlopener-open
    od.open("https://example.com")

def getImage(url):
    MAX_TITLE_DESC = 100
    MAX_TITLE = 255

    uo = URLopener()
    file = uo.open(url)
    soup = BeautifulSoup.BeautifulSoup(file.read())
    file.close()

    outImage = Image()

    imgTag = soup.find("img", { "class" : "imageWithCaption" })
    link = imgTag.get("src", imgTag.get("href", None))
    if link:
        outImage.url = urllib.basejoin(url, link)

    caption = soup.find("div", { "id" : "caption" })
    captionTxt = caption.string
    # Kuressaare linnus, vaade põhjast (SM F 3761:473 F); Saaremaa Muuseum; Faili nimi:smf_3761_473.jpg
    (capPart1, museumName, capPart3) = captionTxt.split(';')
    museumName = museumName.strip()

    matchItemRef = re.search("^(.+)\((.+?)\)$", capPart1)
    if (matchItemRef and matchItemRef.group(2)):
        outImage.source = u'[%s %s, %s]' % (url, museumName, matchItemRef.group(2))
        outImage.source.strip()

    mainTable = soup.find("table", {"class" : "data highlighted"})
    outDesc = u"<table>\n"
    outDesc += getWikiTable(mainTable, outImage)
    mainTable = soup.find("table", {"class" : "data"})
    outDesc += getWikiTable(mainTable, outImage)
    mainTable = soup.find("table", {"class" : "data full_length"})
    outDesc += getWikiTable(mainTable, outImage)
    outDesc += u"</table>\n"

    titleStart = matchItemRef.group(1).strip()
    if (len(titleStart) > MAX_TITLE_DESC):  # shorten title beginning
        titleStart = titleStart[:MAX_TITLE_DESC]
    outImage.name = titleStart + u', ' + outImage.accession_number + u'.jpg'
    outImage.name = cleanUpTitle(outImage.name)
    if (len(outImage.name) > MAX_TITLE):  # shorten title
        outImage.name = outImage.name[:MAX_TITLE]

    outImage.description = '{{et|1=' + outDesc + '}}'
    outImage.license = '{{PD-old}}'

    ## add categories
    museumName = museumName.encode('utf_8')
    if museumData.get(museumName) and museumData.get(museumName).get('enName'):
        museumEnName = museumData.get(museumName).get('enName')
        outImage.institution = u'{{Institution:' + museumEnName + u'}}'
        museumCat = u'Images from the ' + museumEnName
        outImage.categories.append(museumCat)
    else:
        print "Museum enName not found for %s ! \n" % url
        return None

    return outImage