def handle_starttag(self, tag, attrs):
    #tmpoutput = ""
    count = 0
    global bDoWork
    #self.output = ""
    # Only parse the 'anchor' tag.
    if tag == "a":
        # Check the list of defined attributes.
        for name, value in attrs:
            # If href is defined, print it.
            if name == "href":
                if value[len(value) - 3:len(value)] == "jpg":
                    #print value
                    if not "http://" in value and bDoWork == True:
                        bDoWork = False
                        tmpoutput = value
                        #print "Val: " + value
                        imgurl = 'http://apod.nasa.gov/apod/' + tmpoutput
                        #print "IMGURL: " + imgurl
                        filename = imgurl.split('/')[-1]
                        #print "FileName: " + filename
                        if (not os.path.isfile(filename)) and ('apod.nasa.gov' in imgurl):
                            print "Downloading: " + filename
                            image = URLopener()
                            image.retrieve(imgurl, filename)
                            sleep(lWaitTime)
                        elif (os.path.isfile(filename)):
                            print "Verified: " + filename
                break
def download_package(pkg_name, pkg_version):
    '''Download the required package. Sometimes the download can be flaky, so we use the
    retry decorator.'''
    pkg_type = 'sdist'  # Don't download wheel archives for now
    # This JSON endpoint is not provided by PyPI mirrors so we always need to get this
    # from pypi.python.org.
    pkg_info = json.loads(urlopen('https://pypi.python.org/pypi/%s/json' % pkg_name).read())

    downloader = URLopener()
    for pkg in pkg_info['releases'][pkg_version]:
        if pkg['packagetype'] == pkg_type:
            filename = pkg['filename']
            expected_md5 = pkg['md5_digest']
            if os.path.isfile(filename) and check_md5sum(filename, expected_md5):
                print "File with matching md5sum already exists, skipping %s" % filename
                return True
            pkg_url = "{0}/packages/{1}".format(PYPI_MIRROR, pkg['path'])
            print "Downloading %s from %s" % (filename, pkg_url)
            downloader.retrieve(pkg_url, filename)
            actual_md5 = md5(open(filename).read()).hexdigest()
            if check_md5sum(filename, expected_md5):
                return True
            else:
                print "MD5 mismatch in file %s." % filename
                return False
    print "Could not find archive to download for %s %s %s" % (
        pkg_name, pkg_version, pkg_type)
    sys.exit(1)
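The example above relies on a check_md5sum helper defined elsewhere in its project. A minimal sketch of what such a helper could look like (an assumption for illustration, not the project's actual code):

from hashlib import md5

def check_md5sum(filename, expected_md5):
    # Hypothetical helper: compare the archive's md5 digest against the
    # digest reported by PyPI. The real project may hash in chunks instead.
    with open(filename, 'rb') as f:
        return md5(f.read()).hexdigest() == expected_md5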
def command(self):
    args = list(self.args)
    method, url = args[0:2]

    if not url.startswith('http'):
        url = 'http://%s:%s%s' % (self.session.config.sys.http_host,
                                  self.session.config.sys.http_port,
                                  ('/' + url).replace('//', '/'))

    # FIXME: The python URLopener doesn't seem to support other verbs,
    #        which is really quite lame.
    method = method.upper()
    assert (method in ('GET', 'POST'))

    qv, pv = [], []
    if method == 'POST':
        which = pv
    else:
        which = qv
    for arg in args[2:]:
        if '=' in arg:
            which.append(tuple(arg.split('=', 1)))
        elif arg.upper()[0] == 'P':
            which = pv
        elif arg.upper()[0] == 'Q':
            which = qv

    if qv:
        qv = urlencode(qv)
        url += ('?' in url and '&' or '?') + qv

    # Log us in automagically!
    httpd = self.session.config.http_worker.httpd
    global HACKS_SESSION_ID
    if HACKS_SESSION_ID is None:
        HACKS_SESSION_ID = httpd.make_session_id(None)
    mailpile.auth.SetLoggedIn(None,
                              user='******',
                              session_id=HACKS_SESSION_ID)
    cookie = httpd.session_cookie

    try:
        uo = URLopener()
        uo.addheader('Cookie', '%s=%s' % (cookie, HACKS_SESSION_ID))
        if method == 'POST':
            (fn, hdrs) = uo.retrieve(url, data=urlencode(pv))
        else:
            (fn, hdrs) = uo.retrieve(url)
        hdrs = unicode(hdrs)
        data = open(fn, 'rb').read().strip()
        if data.startswith('{') and 'application/json' in hdrs:
            data = json.loads(data)
        return self._success('%s %s' % (method, url), result={
            'headers': hdrs.splitlines(),
            'data': data
        })
    except:
        self._ignore_exception()
        return self._error('%s %s' % (method, url))
def command(self):
    args = list(self.args)
    method, url = args[0:2]

    if not url.startswith('http'):
        url = 'http://%s:%s%s' % (self.session.config.sys.http_host,
                                  self.session.config.sys.http_port,
                                  ('/' + url).replace('//', '/'))

    # FIXME: The python URLopener doesn't seem to support other verbs,
    #        which is really quite lame.
    method = method.upper()
    assert(method in ('GET', 'POST'))

    qv, pv = [], []
    if method == 'POST':
        which = pv
    else:
        which = qv
    for arg in args[2:]:
        if '=' in arg:
            which.append(tuple(arg.split('=', 1)))
        elif arg.upper()[0] == 'P':
            which = pv
        elif arg.upper()[0] == 'Q':
            which = qv

    if qv:
        qv = urlencode(qv)
        url += ('?' in url and '&' or '?') + qv

    # Log us in automagically!
    httpd = self.session.config.http_worker.httpd
    global HACKS_SESSION_ID
    if HACKS_SESSION_ID is None:
        HACKS_SESSION_ID = httpd.make_session_id(None)
    mailpile.auth.SetLoggedIn(None,
                              user='******',
                              session_id=HACKS_SESSION_ID)
    cookie = httpd.session_cookie

    try:
        uo = URLopener()
        uo.addheader('Cookie', '%s=%s' % (cookie, HACKS_SESSION_ID))
        with TcpConnBroker().context(need=[TcpConnBroker.OUTGOING_HTTP]):
            if method == 'POST':
                (fn, hdrs) = uo.retrieve(url, data=urlencode(pv))
            else:
                (fn, hdrs) = uo.retrieve(url)
        hdrs = unicode(hdrs)
        data = open(fn, 'rb').read().strip()
        if data.startswith('{') and 'application/json' in hdrs:
            data = json.loads(data)
        return self._success('%s %s' % (method, url), result={
            'headers': hdrs.splitlines(),
            'data': data
        })
    except:
        self._ignore_exception()
        return self._error('%s %s' % (method, url))
def getcif(target):
    """
    Get all ICSD cif files listed in target file.
    The target file should contain tag like '# BCC'.
    """
    matgenIDs = getMatgenIDs()

    if not os.path.isdir('./ciffiles'):
        os.makedirs('./ciffiles')

    with open(target, 'r') as f:
        st = f.readline()
        t1 = time.time()
        while st:
            if st[0] == '#':
                tg = st.split()[-1]
                st = f.readline()
                t2 = time.time()
                print "time for the %s = %2.2f sec" % (tg, t2 - t1)
                t1 = time.time()
                continue
            st = st.strip()
            ind = getID(st)
            if ind in matgenIDs:
                continue  # skip matgen compounds
            URL = prefix + tg + '/' + st + '/' + st + '.cif'
            testfile = URLopener()
            try:
                testfile.retrieve(URL, 'ciffiles/' + st)
            except:
                print "Error: ", URL
            st = f.readline()
def command(self):
    args = list(self.args)
    method, url = args[0:2]

    if not url.startswith("http"):
        url = "http://%s:%s%s" % (
            self.session.config.sys.http_host,
            self.session.config.sys.http_port,
            ("/" + url).replace("//", "/"),
        )

    # FIXME: The python URLopener doesn't seem to support other verbs,
    #        which is really quite lame.
    method = method.upper()
    assert method in ("GET", "POST")

    qv, pv = [], []
    if method == "POST":
        which = pv
    else:
        which = qv
    for arg in args[2:]:
        if "=" in arg:
            which.append(tuple(arg.split("=", 1)))
        elif arg.upper()[0] == "P":
            which = pv
        elif arg.upper()[0] == "Q":
            which = qv

    if qv:
        qv = urlencode(qv)
        url += ("?" in url and "&" or "?") + qv

    # Log us in automagically!
    httpd = self.session.config.http_worker.httpd
    global HACKS_SESSION_ID
    if HACKS_SESSION_ID is None:
        HACKS_SESSION_ID = httpd.make_session_id(None)
    mailpile.auth.SetLoggedIn(None, user="******", session_id=HACKS_SESSION_ID)
    cookie = httpd.session_cookie

    try:
        uo = URLopener()
        uo.addheader("Cookie", "%s=%s" % (cookie, HACKS_SESSION_ID))
        with TcpConnBroker().context(need=[TcpConnBroker.OUTGOING_HTTP], oneshot=True):
            if method == "POST":
                (fn, hdrs) = uo.retrieve(url, data=urlencode(pv))
            else:
                (fn, hdrs) = uo.retrieve(url)
        hdrs = unicode(hdrs)
        data = open(fn, "rb").read().strip()
        if data.startswith("{") and "application/json" in hdrs:
            data = json.loads(data)
        return self._success("%s %s" % (method, url),
                             result={"headers": hdrs.splitlines(), "data": data})
    except:
        self._ignore_exception()
        return self._error("%s %s" % (method, url))
class SlippyCache(object):
    """This is a basic map tile cache used by the SlippyPanel class
    to retrieve and store locally the images that form the map"""

    def __init__(self, source, proxy = ""):
        self.source = source
        if len(proxy) > 0:
            self._opener = URLopener({"http": proxy})
        else:
            self._opener = URLopener()
        self._fetchQueue = Queue(0)
        self._fetchThread = Thread(target = self._FetchTile)
        self._fetchThread.setDaemon(True)
        self._fetchThread.start()

    def _FetchTile(self):
        task = ""
        while task is not None:
            task = self._fetchQueue.get()
            url, fname = task
            if not os.path.isfile(fname):
                print "Getting", fname
                try:
                    self._opener.retrieve(url, "tmp.png")
                    shutil.move("tmp.png", fname)
                except IOError:
                    pass
            self._fetchQueue.task_done()

    def StartNewFetchBatch(self):
        try:
            while True:
                item = self._fetchQueue.get(False)
                self._fetchQueue.task_done()
        except Empty:
            pass

    def GetTileFilename(self, xtile, ytile, zoom):
        numTiles = 2 ** zoom
        while xtile >= numTiles:
            xtile -= numTiles
        if xtile < 0 or ytile < 0 or ytile >= numTiles:
            # Indicate that this is not a valid tile
            return None
        else:
            fname = "/".join([self.source.get_full_name(),
                              str(zoom),
                              str(xtile),
                              str(ytile) + ".png"])
            if not os.path.isfile(fname):
                url = self.source.get_tile_url(xtile, ytile, zoom)
                # Ensure that the directory exists
                dname = os.path.dirname(fname)
                if not os.path.isdir(dname):
                    os.makedirs(dname)
                self._fetchQueue.put((url, fname))
            # Valid tile, though may not be present in the cache
            return fname
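A hypothetical usage sketch for SlippyCache: the class only needs a "source" object exposing get_full_name() and get_tile_url(), so the OSMSource class and tile URL below are illustrative assumptions, not part of the original code.

class OSMSource(object):
    # Illustrative tile source satisfying the interface SlippyCache expects.
    def get_full_name(self):
        return "cache/osm"

    def get_tile_url(self, xtile, ytile, zoom):
        return "http://tile.openstreetmap.org/%d/%d/%d.png" % (zoom, xtile, ytile)

cache = SlippyCache(OSMSource(), proxy="")
print cache.GetTileFilename(0, 0, 0)  # queues a background fetch, returns the local path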
def command(self):
    args = list(self.args)
    method, url = args[0:2]

    if not url.startswith('http'):
        url = 'http://%s:%s%s' % (self.session.config.sys.http_host,
                                  self.session.config.sys.http_port,
                                  ('/' + url).replace('//', '/'))

    # FIXME: The python URLopener doesn't seem to support other verbs,
    #        which is really quite lame.
    method = method.upper()
    assert (method in ('GET', 'POST'))

    qv, pv = [], []
    if method == 'POST':
        which = pv
    else:
        which = qv
    for arg in args[2:]:
        if '=' in arg:
            which.append(tuple(arg.split('=', 1)))
        elif arg.upper()[0] == 'P':
            which = pv
        elif arg.upper()[0] == 'Q':
            which = qv

    if qv:
        qv = urlencode(qv)
        url += ('?' in url and '&' or '?') + qv

    try:
        uo = URLopener()
        if method == 'POST':
            (fn, hdrs) = uo.retrieve(url, data=urlencode(pv))
        else:
            (fn, hdrs) = uo.retrieve(url)
        hdrs = unicode(hdrs)
        data = open(fn, 'rb').read().strip()
        if data.startswith('{') and 'application/json' in hdrs:
            data = json.loads(data)
        return self._success('%s %s' % (method, url), result={
            'headers': hdrs.splitlines(),
            'data': data
        })
    except:
        self._ignore_exception()
        return self._error('%s %s' % (method, url))
def try_download(_path, _file, _url, _stale,):
    now = time()
    url = URLopener()
    file_exists = isfile(_path + _file) == True
    if file_exists:
        file_old = (getmtime(_path + _file) + _stale) < now
    if not file_exists or (file_exists and file_old):
        try:
            url.retrieve(_url, _path + _file)
            result = 'ID ALIAS MAPPER: \'{}\' successfully downloaded'.format(_file)
        except IOError:
            result = 'ID ALIAS MAPPER: \'{}\' could not be downloaded'.format(_file)
    else:
        result = 'ID ALIAS MAPPER: \'{}\' is current, not downloaded'.format(_file)
    url.close()
    return result
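A possible call site for try_download, with placeholder path and URL (both assumptions); the helper re-downloads the file only when the local copy is missing or older than _stale seconds.

# Hypothetical example: refresh the alias file if it is older than one week.
print try_download('/tmp/', 'peer_ids.json',
                   'http://example.com/peer_ids.json', 7 * 86400)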
def download(self, sysctl, code):
    try:
        logging.info('Begin download files.')
        if not isdir(self.p_dwld):
            mkdir(self.p_dwld)
        obj = URLopener()
        for f in self.files:
            logging.info('Start download {}.'.format(f))
            obj.retrieve(self.url + f, self.p_dwld + f)
            logging.info('Download {} done.'.format(f))
        return True
    except BaseException as down:
        logging.error('Download {}.'.format(down))
        self._rolback(sysctl, code)
def download_package(pkg_name, pkg_version):
    file_name, path, expected_md5 = get_package_info(pkg_name, pkg_version)
    if not file_name:
        return False
    if os.path.isfile(file_name) and check_md5sum(file_name, expected_md5):
        print 'File with matching md5sum already exists, skipping {0}'.format(file_name)
        return True
    downloader = URLopener()
    pkg_url = '{0}/packages/{1}'.format(PYPI_MIRROR, path)
    print 'Downloading {0} from {1}'.format(file_name, pkg_url)
    downloader.retrieve(pkg_url, file_name)
    if check_md5sum(file_name, expected_md5):
        return True
    else:
        print 'MD5 mismatch in file {0}.'.format(file_name)
        return False
def startplayback_images(args):
    """Shows an image
    """
    # cache path
    sDir = xbmc.translatePath(args._addon.getAddonInfo("profile"))
    if args.PY2:
        sPath = join(sDir.decode("utf-8"), u"image.jpg")
    else:
        sPath = join(sDir, "image.jpg")

    # download image
    file = URLopener()
    file.retrieve(args.url, sPath)

    # display image
    item = xbmcgui.ListItem(getattr(args, "title", "Title not provided"), path=sPath)
    xbmcplugin.setResolvedUrl(int(args._argv[1]), True, item)
    xbmc.executebuiltin("SlideShow(" + sDir + ")")
def download_package(pkg_name, pkg_version):
    file_name, path, hash_algorithm, expected_digest = get_package_info(pkg_name, pkg_version)
    if not file_name:
        return False
    if os.path.isfile(file_name) and check_digest(file_name, hash_algorithm, expected_digest):
        print 'File with matching digest already exists, skipping {0}'.format(file_name)
        return True
    downloader = URLopener()
    pkg_url = '{0}/packages/{1}'.format(PYPI_MIRROR, path)
    print 'Downloading {0} from {1}'.format(file_name, pkg_url)
    downloader.retrieve(pkg_url, file_name)
    if check_digest(file_name, hash_algorithm, expected_digest):
        return True
    else:
        print 'Hash digest check failed in file {0}.'.format(file_name)
        return False
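This variant defers hashing to a check_digest helper defined elsewhere. One plausible shape for it, using hashlib's named constructors, is sketched below; this is an assumption about the original implementation, not a copy of it.

import hashlib

def check_digest(file_name, hash_algorithm, expected_digest):
    # Hypothetical helper: hash_algorithm is a name hashlib understands,
    # e.g. 'md5' or 'sha256'.
    digest = hashlib.new(hash_algorithm)
    with open(file_name, 'rb') as f:
        digest.update(f.read())
    return digest.hexdigest() == expected_digest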
def download_reports(years=_years, weeks=_weeks):
    '''Crawls through IMoH website and download all excel files in the given weeks and years'''
    # Create paths for logging files and download loaction
    prefix = datetime.now().strftime('./log/weeklies/%y%m%d_%H%M%S_')
    log_d = prefix + "downloads.log"
    log_f = prefix + "FAILED.log"
    base_loc = 'http://www.health.gov.il/PublicationsFiles/IWER'
    # URL object
    my_file = URLopener()
    for year in years:
        print "\n", year,
        for week in weeks:
            f = open(log_d, 'a')
            f.write('\n{year}_{week}: '.format(week=week, year=year))
            # There are many different options of paths
            options = ['{base}{week:02d}_{year}.xls'.format(base=base_loc, week=week, year=year),
                       '{base}{week}_{year}.xls'.format(base=base_loc, week=week, year=year),
                       '{base}{week:02d}_{year}.xlsx'.format(base=base_loc, week=week, year=year),
                       '{base}{week}_{year}.xlsx'.format(base=base_loc, week=week, year=year)]
            for i, o in enumerate(options):
                filetype = o.split(".")[-1]
                try:
                    # Try different paths on remote, but always save on same path locally
                    my_file.retrieve(o, './data/weeklies/{year}_{week:02d}.{ft}'.format(week=week, year=year, ft=filetype))
                    # If succeeds write which filetype (xls/x) was saved
                    f.write('{ft}'.format(ft=filetype), )
                    # If downloads succeeds move close the log file and break the loop
                    f.close()
                    break
                except:
                    # When option excepted, write try number to the log
                    f.write("{} ".format(i + 1))
                    # If all options were exhausted, it has failed.
                    if i == len(options) - 1 and week != 53:
                        print "== {year}_{week:02d} FAILED ==".format(week=week, year=year),
                        with open(log_f, 'a') as failed:
                            failed.write("{year}_{week:02d} FAILED\n".format(week=week, year=year))
                        f.write("FAILED")
                        f.close()
            f.close()
raw_input("Press <enter> to exit.") sys.exit(1) urlget = URLopener({}) errors = [] for line in f.readlines(): try: url, fname = [s.strip() for s in line.split(' ')] except ValueError: print("Could not parse this input: " + line) continue if osp.isfile(fname): print('Skipping existing file %s' % fname) else: print('Downloading %s to %s' % (url, fname)) try: urlget.retrieve(url, fname, report_progress) except IOError: print(' (!) Download failed, adding to plan B list') errors.append(url) if errors: print( "\nAn error(s) was detected; would you like to retry using the " + "system browser?") raw_input("Press Ctrl+C to exit or <enter> to continue.") for url in errors: openurl(url) raw_input("Press <enter> to exit.")
date_delivered = None
try:
    party = flyer_dict['publisher_party']['party_name']
except:
    party = None
try:
    constituency = flyer_dict['constituency']['slug']
except:
    constituency = "Not Coded"
images = flyer_dict['images']
this_flyer_storage = storage + "/" + constituency + "/" + this_flyer
if os.path.isdir(this_flyer_storage) == False:
    check_path(this_flyer_storage)
with open(storage + "/" + "flyerinfo_" + str(newfolder_date.date()) + ".csv", "a") as outfile:
    f = csv.writer(outfile)
    f.writerow([party, int(this_flyer), constituency, date_uploaded, date_delivered])
    outfile.close()
for image in range(len(images)):
    print images[image]['image']
    imgfile = URLopener()
    imgfile.retrieve(images[image]['image'],
                     this_flyer_storage + "/" + str(image) + ".jpg")
index += 1
print "Finished: " + str(index) + " on page " + str(pageno)
def DownloadImageFromAPODPage(url):
    if "ap140302" in url:
        print "stop here"
    # Copy all of the content from the provided web page
    webpage = urlopen(url).read()
    #print "-"
    #print "URL: " + url
    global bDoWork
    global bCleanExtras
    global bVerified
    global strAPODPicturesFolder
    strAPODFileName = ""
    # Here I retrieve and print to screen the titles and links with just Beautiful Soup
    #print "Loading Soup"
    soup = BeautifulSoup(webpage)
    for url in soup.findAll("a"):
        imgurl = url.get('href')
        #print imgurl
        if not ('http://' in imgurl):
            imgurl = 'http://apod.nasa.gov/' + url.get('href')
        #sleep(lWaitTime)
        if imgurl[len(imgurl) - 3:len(imgurl)] == "jpg":
            #print "IMG: " + imgurl
            strAPODFileName = imgurl.strip().split('/')[-1]
            #print "strAPODFileName = " + strAPODFileName
            filename = strAPODPicturesFolder + strAPODFileName
            if bDoWork:
                bDoWork = False
                #filename = url.strip().split('/')[-1]
                #print filename
                if (not os.path.isfile(filename)) and ('apod.nasa.gov' in imgurl):
                    #print "Downloading: " + filename
                    image = URLopener()
                    image.retrieve(imgurl, filename)
                    sleep(lWaitTime)
                elif (os.path.isfile(filename)):
                    #print "Verified: " + filename
                    bVerified = True
                if not bCleanExtras:
                    #if we are not cleaning extras we can break here
                    #print "Not Seeking Extras"
                    break
            else:
                if (os.path.isfile(filename)):
                    #this is the logic to clean extra downloads/duplicates
                    #print "Deleting " + filename
                    os.remove(filename)

    #print "Seeking Title"
    txtName = ""
    bForce = False
    for bTag in soup.findAll("title"):
        if (txtName == ""):
            #bForce = True
            txtName = bTag.text
            txtName = txtName.replace("APOD:", "").strip()
            if "\r" in txtName or "\n" in txtName:
                txtName = txtName.strip().replace("\r", ' ').replace("\n", " ").replace("  ", " ").replace("  ", " ")
                bForce = True
            #print txtName

    for bTag in soup.findAll("b"):
        if (txtName == ""):
            txtName = bTag.text
            txtName = txtName.strip()
            if "\r" in txtName or "\n" in txtName:
                txtName = txtName.strip().replace("\r", ' ').replace("\n", " ").replace("  ", " ").replace("  ", " ")
                bForce = True
            #print txtName

    #print "Loading Info"
    txtPName = ""
    for pTag in soup.findAll("p"):
        txtPName = pTag.text
        txtPName = txtPName.strip()
        if "Explanation:" in txtPName:
            iLoc = txtPName.find("Tomorrow's picture:")
            iLoc = iLoc - 1
            iLoc2 = txtPName.find("digg_url")
            if iLoc2 > 0:
                #txtPName = txtPName
                txtPName = txtPName[:iLoc2]
            iLoc2 = txtPName.find("APOD presents:")
            if iLoc2 > 0:
                #txtPName = txtPName
                txtPName = txtPName[:iLoc2]
            #The Amateur Astronomers Association of New York Presents:
            iLoc2 = txtPName.find("The Amateur Astronomers Association of New York Presents:")
            if iLoc2 > 0:
                #txtPName = txtPName
                txtPName = txtPName[:iLoc2]
            iLoc2 = txtPName.find("Presents:")
            if iLoc2 > 0:
                #txtPName = txtPName
                txtPName = txtPName[:iLoc2]
            iLoc2 = txtPName.find("What was that?:")
            if iLoc2 > 0:
                #txtPName = txtPName
                txtPName = txtPName[:iLoc2]
            iLoc2 = txtPName.find("Follow APOD on:")
            if iLoc2 > 0:
                #txtPName = txtPName
                txtPName = txtPName[:iLoc2]
                bForce = True
            if iLoc > 0 and (strAPODFileName <> ""):
                txtPName = txtPName[0:iLoc].strip().replace('\n', ' ').replace('  ', ' ').replace('  ', ' ').replace('  ', ' ').replace('Explanation: ', '')
                if bForce or not (os.path.isfile(strAPODDataFolder + strAPODFileName.replace('.jpg', '_Title.txt'))):
                    #print "Title: " + txtName
                    #print "FN: " + strAPODFileName.replace('.jpg', '_Title.txt')
                    f = open(strAPODDataFolder + strAPODFileName.replace('.jpg', '_Title.txt'), 'w')
                    f.write(txtName.encode('utf8'))
                    f.close
                if bForce or (txtPName.strip() <> "") or (iLoc2 > 0) or (not (os.path.isfile(strAPODDataFolder + strAPODFileName.replace('.jpg', '_Info.txt')))):
                    #print "Info Paragraph: " + txtPName.encode('utf8')
                    #print "FN: " + strAPODFileName.replace('.jpg', '_Info.txt')
                    with open(strAPODDataFolder + strAPODFileName.replace('.jpg', '_Info.txt'), 'w') as f:
                        #f = open(strAPODDataFolder + strAPODFileName.replace('.jpg', '_Info.txt'), 'w')
                        f.write(txtPName.encode('utf8'))
                        #f.close
                        #f.flush

    #print "Checking for Title File"
    if (not strAPODFileName == "") and (not (os.path.isfile(strAPODDataFolder + strAPODFileName.replace('.jpg', '_Title.txt')))):
        #print "Title not found"
        txtAllPageText = soup.get_text().replace("\r", ' ').replace("\n", " ").replace("  ", " ").replace("  ", " ")
        if "-" in txtAllPageText:
            iLoc1 = txtAllPageText.find("-")
            txtAllPageText = txtAllPageText[iLoc1 + 1:].strip()
            iLoc2 = txtAllPageText.find("Astronomy Picture of the Day")
            txtAllPageText = txtAllPageText[:iLoc2].strip()
            #print "Title: " + txtAllPageText
            #print "FN: " + strAPODFileName.replace('.jpg', '_Title.txt')
            f = open(strAPODDataFolder + strAPODFileName.replace('.jpg', '_Title.txt'), 'w')
            f.write(txtAllPageText.encode('utf8'))
            f.close

    #print "Checking for Info File"
    if (not strAPODFileName == "") and (not (os.path.isfile(strAPODDataFolder + strAPODFileName.replace('.jpg', '_Info.txt')))):
        #print "Info file found"
        txtAllPageText = soup.get_text().replace("\r", ' ').replace("\n", " ").replace("  ", " ").replace("  ", " ")
        if "Explanation:" in txtAllPageText:
            iLoc1 = txtAllPageText.find("Explanation:")
            txtAllPageText = txtAllPageText[iLoc1 + 12:].strip()
            iLoc2 = txtAllPageText.find("Tomorrow's picture:")
            txtAllPageText = txtAllPageText[:iLoc2].strip()
            iLoc2 = txtAllPageText.find("digg_url")
            if iLoc2 > 0:
                #txtPName = txtPName
                txtAllPageText = txtAllPageText[:iLoc2]
            iLoc2 = txtAllPageText.find("APOD Presents:")
            if iLoc2 > 0:
                #txtPName = txtPName
                txtAllPageText = txtAllPageText[:iLoc2]
            iLoc2 = txtAllPageText.find("Presents:")
            if iLoc2 > 0:
                #txtPName = txtPName
                txtAllPageText = txtAllPageText[:iLoc2]
            iLoc2 = txtAllPageText.find("What was that?:")
            if iLoc2 > 0:
                #txtPName = txtPName
                txtAllPageText = txtAllPageText[:iLoc2]
            iLoc2 = txtAllPageText.find("The Amateur Astronomers Association of New York Presents:")
            if iLoc2 > 0:
                #txtPName = txtPName
                txtAllPageText = txtAllPageText[:iLoc2]
            iLoc2 = txtAllPageText.find("Follow APOD on:")
            if iLoc2 > 0:
                #txtPName = txtPName
                txtAllPageText = txtAllPageText[:iLoc2]
            #print "Info Paragraph: " + txtAllPageText
            #print "FN: " + strAPODFileName.replace('.jpg', '_Info.txt')
            with open(strAPODDataFolder + strAPODFileName.replace('.jpg', '_Info.txt'), 'w') as f:
                #f = open(strAPODDataFolder + strAPODFileName.replace('.jpg', '_Info.txt'), 'w')
                f.write(txtAllPageText.encode('utf8'))
                #f.close
    os.fsync

    if (not strAPODFileName == "") and (os.path.isfile(strAPODPicturesFolder + strAPODFileName)):
        #print "APOD Image File Found"
        #This is True on Windows, should be set to false otherwise
        if False:
            strAPODDestCache = "G:\\apod\\cache_NookHD+\\"
            SaveCacheImage(strAPODPicturesFolder + strAPODFileName,
                           strAPODDestCache + strAPODFileName, 1920.0, 1280.0)
            strAPODDestCache = "G:\\apod\\cache_Nexus7_2013\\"
            SaveCacheImage(strAPODPicturesFolder + strAPODFileName,
                           strAPODDestCache + strAPODFileName, 1920.0, 1104.0)
        #save a PIL image containing the Title and Info
        if strAPODPicsWithText <> "":
            SavePILText(strAPODFileName)
        if strAPODCache <> "":
            SaveCacheImage(strAPODPicturesFolder + strAPODFileName,
                           strAPODCache + strAPODFileName, 1920.0, 1080.0)
def download_edgent():
    file = URLopener()
    file.retrieve(EDGENT_TAR_URL, TMP_DIR + "/" + EDGENT_TAR)
    exec_cmd("cd %s ; tar zxvf %s/%s" % (TMP_DIR, TMP_DIR, EDGENT_TAR))
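download_edgent depends on three module-level constants and an exec_cmd helper defined elsewhere in its script; the values below are placeholders shown only to make the snippet self-contained, not the project's real settings.

# Placeholder values (assumptions, for experimentation only).
TMP_DIR = "/tmp"
EDGENT_TAR = "apache-edgent-src.tar.gz"
EDGENT_TAR_URL = "https://example.org/downloads/" + EDGENT_TAR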
def DownloadImageFromAPODPage(url):
    # Copy all of the content from the provided web page
    webpage = urlopen(url).read()
    print "-"
    print "URL: " + url
    global bDoWork
    global bCleanExtras
    global bVerified
    global strAPODPicturesFolder
    strAPODFileName = ""
    # Here I retrieve and print to screen the titles and links with just Beautiful Soup
    soup = BeautifulSoup(webpage)
    for url in soup.findAll("a"):
        imgurl = url.get('href')
        #print imgurl
        if not ('http://' in imgurl):
            imgurl = 'http://apod.nasa.gov/' + url.get('href')
        #sleep(lWaitTime)
        if imgurl[len(imgurl) - 3:len(imgurl)] == "jpg":
            print "IMG: " + imgurl
            strAPODFileName = imgurl.strip().split('/')[-1]
            print "strAPODFileName = " + strAPODFileName
            filename = strAPODPicturesFolder + strAPODFileName
            if bDoWork:
                bDoWork = False
                #filename = url.strip().split('/')[-1]
                #print filename
                if (not os.path.isfile(filename)) and ('apod.nasa.gov' in imgurl):
                    print "Downloading: " + filename
                    image = URLopener()
                    image.retrieve(imgurl, filename)
                    sleep(lWaitTime)
                elif (os.path.isfile(filename)):
                    print "Verified: " + filename
                    bVerified = True
                if not bCleanExtras:
                    #if we are not cleaning extras we can break here
                    print "Not Seeking Extras"
                    break
            else:
                if (os.path.isfile(filename)):
                    #this is the logic to clean extra downloads/duplicates
                    print "Deleting " + filename
                    os.remove(filename)

    txtName = ""
    for bTag in soup.findAll("b"):
        if (txtName == ""):
            txtName = bTag.text
            txtName = txtName.strip()
            print txtName

    txtPName = ""
    for pTag in soup.findAll("p"):
        txtPName = pTag.text
        txtPName = txtPName.strip()
        if "Explanation:" in txtPName:
            iLoc = txtPName.find("Tomorrow's picture:")
            iLoc = iLoc - 1
            if iLoc > 0 and (strAPODFileName <> ""):
                txtPName = txtPName[0:iLoc].strip().replace('\n', ' ').replace('  ', ' ').replace('  ', ' ').replace('  ', ' ').replace('Explanation: ', '')
                if not (os.path.isfile(strAPODDataFolder + strAPODFileName.replace('.jpg', '_Title.txt'))):
                    print "Title: " + txtName
                    print "FN: " + strAPODFileName.replace('.jpg', '_Title.txt')
                    f = open(strAPODDataFolder + strAPODFileName.replace('.jpg', '_Title.txt'), 'w')
                    f.write(txtName.encode('utf8'))
                    f.close
                if not (os.path.isfile(strAPODDataFolder + strAPODFileName.replace('.jpg', '_Info.txt'))):
                    print "Info Paragraph: " + txtPName
                    print "FN: " + strAPODFileName.replace('.jpg', '_Info.txt')
                    f = open(strAPODDataFolder + strAPODFileName.replace('.jpg', '_Info.txt'), 'w')
                    f.write(txtPName.encode('utf8'))
                    f.close
from urllib import urlretrieve, URLopener
import ssl
from subprocess import call
import os
import zipfile

# Unfortunately NESP SSL certificate doesn't play nice with Python 2.7 on Windows.
# Tried hard to fix this the proper way but gave up.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

opener = URLopener(context=ctx)

print("Downloading sample data, please wait")

filename = 'sample.zip'
opener.retrieve('https://tsx.org.au/sample.zip', filename)

print("Extracting")

zip_ref = zipfile.ZipFile(filename, 'r')
zip_ref.extractall('.')
zip_ref.close()

# Clean up
print("Clean up")
os.remove(filename)

print("Done")
def install_firmware(self, new_version):
    logging.info('Update firmware request')
    logging.info('Current firmware version: {}'.format(self.firmware_version))
    logging.info('Firmware version to install: {}'.format(new_version))
    fw_fname_prefix = 'sensa-%s' % new_version
    fw_check_url = '%sstatic/firmware/%s.chk' % (self.api_url, fw_fname_prefix)
    fw_filename = fw_fname_prefix + '.zip'
    fw_url = '%sstatic/firmware/%s' % (self.api_url, fw_filename)
    # Firmware install shell script
    deploy_script = 'deploy.sh'

    # Download firmware
    fw_file = URLopener()
    try:
        fw_file.retrieve(fw_url, fw_filename)
    except IOError:
        logging.error('Error during firmware download')
        return 1
    fw_file.close()

    # Check downloaded firmware integrity
    try:
        fw_checksum_req = requests.get(fw_check_url)
    except requests.exceptions.RequestException:
        logging.error('Error during firmware download')
        return 1
    expected_check = fw_checksum_req.text.split()
    fw_checksum = md5(open(fw_filename, 'rb').read()).hexdigest()
    if(fw_checksum != expected_check[0] and fw_filename != expected_check[1]):
        logging.error('Error checking firmware integrity')
        return
    logging.info('Files checked. Updating')

    # Unzip
    try:
        fw_file = ZipFile(fw_filename, 'r')
    except IOError:
        logging.error('Error reading local firmware file')
        return
    fw_file.extractall()
    fw_file.close()

    # Run firmware script
    call(['sh', deploy_script])
    # Remove firmware file
    call(['rm', fw_filename])
    # Remove firmware script
    call(['rm', deploy_script])

    config = SafeConfigParser()
    config.read(self.config_file)
    # Update firmware version on config file
    config.set('device', 'firmware_version', new_version)
    try:
        conf_file = open(self.config, 'wb')
        try:
            parser.write(conf_file)
        finally:
            conf_file.close()
    except IOError:
        logging.error('Error updating version on config file')
def parse(self, response):
    global excel_file
    # look up species name in our dictionary by using observation number as key
    species_name = observation_name_dict[str(response.url).strip().split("/")[-1]]
    self.driver.get(response.url)  # load an observation page in selenium
    try:  # try to load webpage
        # scrape images of species
        image_elements = WebDriverWait(self.driver, 5).until(
            expected_conditions.presence_of_all_elements_located(
                (By.XPATH, "//div[@class='image-gallery-image']/img")))
        image = URLopener()  # create a blank URLopener object to later download image(s)
        image_urls = []  # list of urls, scraped from image elements
        for image_element in image_elements:  # build image_urls
            # extract src attribtue from image element (the image url)
            image_url = image_element.get_attribute("src")
            image_urls.append(image_url)  # add extracted url to image_urls
        # calculate total number of images for an observation
        number_of_images = len(image_urls)
        if number_of_images > 1:  # if there are multiple images for one observation page
            for index in range(number_of_images):  # loop over all images
                # create string that looks like "species_name-0x" where x is >= 1
                indexed_species_name = species_name + '-0' + str(index + 1)
                if DOWNLOAD_IMAGES:
                    # download the image, save it as "species_name-0x.jpeg"
                    image.retrieve(image_urls[index].replace("large", "original"),
                                   "images/" + indexed_species_name + ".jpg")
                if UPDATE_CSV:
                    species_row = DataFrame()  # initialize new row to hold "species-0x" data
                    # fetch row corresponding to this species
                    species_row = excel_file.loc[excel_file['Name'] == species_name]
                    # change Name field to "species-0x"
                    species_row.iloc[0, 0] = indexed_species_name
                    # add the new row to original csv file
                    species_row.to_csv(EXCEL_FILENAME, header=None, mode='a',
                                       index=False, sep=',', encoding='utf-8')
            if UPDATE_CSV:
                # refresh excel_file by reading in newly added rows "species-0x"
                excel_file = read_csv(EXCEL_FILENAME)
                # get index of row to eliminate "species"
                species_index = excel_file.index[excel_file['Name'] == species_name][0]
                # remove original species row
                excel_file.drop(species_index).to_csv(EXCEL_FILENAME, index=False)
        if DOWNLOAD_IMAGES:
            if number_of_images == 1:  # if there is only one image for one observation
                # download the image, save it as "species_name.jpeg"
                image.retrieve(str(image_urls[0]).replace("large", "original"),
                               "images/" + species_name + ".jpg")
    except TimeoutException:  # in the case of a timeout
        # add species url to exception list
        with open("exception-url-list.txt", 'a') as url_file:
            url_file.write(str(response.url) + '\n')
        if UPDATE_CSV and DELETE_FAILURES:  # delete a species row
            excel_file = excel_file.read_csv(EXCEL_FILENAME)
            # get index of row to eliminate "species"
            species_index = excel_file.index[excel_file['Name'] == species_name][0]
            # remove original species row
            excel_file.drop(species_index).to_csv(EXCEL_FILENAME, index=False)
    yield
def retrieve_schemas_in_local_directory_zip(zip_schemas_uri, proxy_dict, schema_zipfilename, dest_directory):
    # crack open a connection via the proxy...
    try:
        schema_zip_file = URLopener(proxies=proxy_dict)
    except:
        print("urllib.URLopener error: %s" % (proxy_dict))
        return None

    # if the local schema directory does not exist - error
    if not (os.path.exists(dest_directory)):
        try:
            os.makedirs(dest_directory)
        except ValueError as err:
            print("Unable to create the schema files directory %s" % dest_directory)
            print(err)
            return None

    # full pathname of the target
    zip_file_path = dest_directory + schema_zipfilename

    # retrieve the zip file from the url
    try:
        zip_url_path = zip_schemas_uri + schema_zipfilename
        schema_zip_file.retrieve(zip_url_path, zip_file_path)
    except:
        print("Error retrieving %s to %s with proxy=%s" % (zip_url_path, zip_file_path, proxy_dict))
        print("...this could be due to %s not being available at %s or an invalid proxy setting."
              % (schema_zipfilename, zip_schemas_uri))
        return None

    # unzip it
    with zipfile.ZipFile(zip_file_path, "r") as zfile:
        try:
            zfile.extractall(dest_directory)
        except:
            print("Error unzipping %s" % zip_file_path)
            return None

    # locate the "ServiceRoot" (either json or xml) in the zip file... this will yeild
    # the subpath below "dest_directory" where the schema files were loaded into (and unloaded from)
    # the source zipfile. This subpath is needed for the tool to have a full pathname to the schema files
    unzip_metadata_subpath = None
    try:
        with zipfile.ZipFile(zip_file_path, "r") as zfile:
            for fname in zfile.namelist():
                # find the path within the zip file for the metadata files
                if "metadata" in fname and "ServiceRoot" in fname and fname.__str__().endswith(".xml") and ("MAC" not in fname):
                    str_idx = fname.find("ServiceRoot")
                    unzip_metadata_subpath = fname[0:str_idx]
                    break
    except:
        print("Error processing the zip file %s while searching for a \'ServiceRoot\' metadata file" % zip_file_path)
        return None

    if (unzip_metadata_subpath == None):
        print("Error: %s does not appear to be a valid DMTF/SPMF metadata zip file..." % zip_file_path)
        print("       Unable to locate the \'ServiceRoot\' xml file below the \'metadata\' in the zipfile %s" % zip_file_path)
        return False

    # return the full path to the unzipped metadata files
    return dest_directory + unzip_metadata_subpath
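A hypothetical invocation of the function above; the schema URI, proxy address, and bundle name are placeholders, not values taken from the original tool. Note that dest_directory must end with a slash, since the function concatenates paths with plain string addition.

# All arguments below are illustrative assumptions.
metadata_path = retrieve_schemas_in_local_directory_zip(
    "https://example.org/redfish-schemas/",
    {"http": "http://proxy.example.com:8080"},
    "schema-bundle.zip",
    "./schemas/")
print(metadata_path)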
for poemLink in poemLinks:
    try:
        pl = db_session.query(Poem).filter_by(poem=poemLink).one()
    except NoResultFound:
        p_obj = Poem(poem=poemLink)
        db_session.add(p_obj)
        print 'added poem %s' % poemLink
        db_session.commit()

    #download and save file to temp file
    #make sure its not massive
    d = urllib.urlopen(poemLink)
    if int(d.info()['Content-Length']) > 25000000:  #arbitrary length, could be better
        continue
    filename = urlparse(poemLink).path.split('/')[-1]
    try:
        opener.retrieve(poemLink, temp + filename)
    except:
        continue

    #open and convert file to mono, 8000Hz
    poem = AudioSegment.from_mp3(temp + filename)
    poem = poem.set_channels(1)
    poem = poem.set_frame_rate(8000)
    #erase temp file
    os.remove(temp + filename)

    #cut the poem into lines based on silence
    lines = cutbySilence(poem)

    #number the lines
    line_num = 0
    for line in lines:
        if line.duration_seconds > 30:
            continue