Example #1
    def handle_starttag(self, tag, attrs):
        #tmpoutput = ""
        count = 0
        global bDoWork
        #self.output = ""
        # Only parse the 'anchor' tag.
        if tag == "a":
            # Check the list of defined attributes.
            for name, value in attrs:
                # If href is defined, print it.
                if name == "href":
                    if value[len(value) - 3:len(value)] == "jpg":
                        #print value
                        if not "http://" in value and bDoWork == True:
                            bDoWork = False
                            tmpoutput = value
                            #print "Val: " + value
                            imgurl = 'http://apod.nasa.gov/apod/' + tmpoutput
                            #print "IMGURL: " + imgurl
                            filename = imgurl.split('/')[-1]
                            #print "FileName: " + filename

                            if (not os.path.isfile(filename)) and (
                                    'apod.nasa.gov' in imgurl):
                                print "Downloading: " + filename
                                image = URLopener()
                                image.retrieve(imgurl, filename)
                                sleep(lWaitTime)
                            elif (os.path.isfile(filename)):
                                print "Verified: " + filename
                            break
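Example #1 shows a single handle_starttag method. A minimal sketch of the scaffolding it appears to assume follows, using Python 2's HTMLParser; the class name, the bDoWork/lWaitTime globals, and the feed() call are illustrative assumptions, not part of the original snippet.

# Sketch only: wiring the handler above into an HTMLParser subclass (Python 2).
# APODImageParser, bDoWork and lWaitTime are hypothetical names.
import os
from time import sleep
from urllib import urlopen, URLopener
from HTMLParser import HTMLParser

bDoWork = True   # download only the first matching image per page
lWaitTime = 1    # seconds to pause after each download

class APODImageParser(HTMLParser):
    def handle_starttag(self, tag, attrs):
        pass  # body as in Example #1

parser = APODImageParser()
parser.feed(urlopen('http://apod.nasa.gov/apod/').read())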
Example #2
def download_package(pkg_name, pkg_version):
  '''Download the required package. Sometimes the download can be flaky, so we use the
  retry decorator.'''
  pkg_type = 'sdist' # Don't download wheel archives for now
  # This JSON endpoint is not provided by PyPI mirrors so we always need to get this
  # from pypi.python.org.
  pkg_info = json.loads(urlopen('https://pypi.python.org/pypi/%s/json' % pkg_name).read())

  downloader = URLopener()
  for pkg in pkg_info['releases'][pkg_version]:
    if pkg['packagetype'] == pkg_type:
      filename = pkg['filename']
      expected_md5 = pkg['md5_digest']
      if os.path.isfile(filename) and check_md5sum(filename, expected_md5):
        print "File with matching md5sum already exists, skipping %s" % filename
        return True
      pkg_url = "{0}/packages/{1}".format(PYPI_MIRROR, pkg['path'])
      print "Downloading %s from %s" % (filename, pkg_url)
      downloader.retrieve(pkg_url, filename)
      actual_md5 = md5(open(filename).read()).hexdigest()
      if check_md5sum(filename, expected_md5):
        return True
      else:
        print "MD5 mismatch in file %s." % filename
        return False
  print "Could not find archive to download for %s %s %s" % (
      pkg_name, pkg_version, pkg_type)
  sys.exit(1)
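The docstring above mentions a retry decorator that is not included in the snippet. A rough sketch of what such a decorator could look like follows; the name, parameters, and retry policy are assumptions for illustration, not the project's actual implementation.

# Hypothetical retry decorator (not from the original source).
import time
import functools

def retry(attempts=3, delay=2):
    """Call the wrapped function again after transient IOErrors."""
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(attempts):
                try:
                    return func(*args, **kwargs)
                except IOError:
                    if attempt == attempts - 1:
                        raise
                    time.sleep(delay)
        return wrapper
    return decorator

# Applied to the function above, it would read:
# @retry(attempts=3, delay=5)
# def download_package(pkg_name, pkg_version):
#     ...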
Example #3
def download_package(pkg_name, pkg_version):
    '''Download the required package. Sometimes the download can be flaky, so we use the
  retry decorator.'''
    pkg_type = 'sdist'  # Don't download wheel archives for now
    # This JSON endpoint is not provided by PyPI mirrors so we always need to get this
    # from pypi.python.org.
    pkg_info = json.loads(
        urlopen('https://pypi.python.org/pypi/%s/json' % pkg_name).read())

    downloader = URLopener()
    for pkg in pkg_info['releases'][pkg_version]:
        if pkg['packagetype'] == pkg_type:
            filename = pkg['filename']
            expected_md5 = pkg['md5_digest']
            if os.path.isfile(filename) and check_md5sum(
                    filename, expected_md5):
                print "File with matching md5sum already exists, skipping %s" % filename
                return True
            pkg_url = "{0}/packages/{1}".format(PYPI_MIRROR, pkg['path'])
            print "Downloading %s from %s" % (filename, pkg_url)
            downloader.retrieve(pkg_url, filename)
            actual_md5 = md5(open(filename).read()).hexdigest()
            if check_md5sum(filename, expected_md5):
                return True
            else:
                print "MD5 mismatch in file %s." % filename
                return False
    print "Could not find archive to download for %s %s %s" % (
        pkg_name, pkg_version, pkg_type)
    sys.exit(1)
Example #4
    def command(self):
        args = list(self.args)
        method, url = args[0:2]

        if not url.startswith('http'):
            url = 'http://%s:%s%s' % (self.session.config.sys.http_host,
                                      self.session.config.sys.http_port,
                                      ('/' + url).replace('//', '/'))

        # FIXME: The python URLopener doesn't seem to support other verbs,
        #        which is really quite lame.
        method = method.upper()
        assert (method in ('GET', 'POST'))

        qv, pv = [], []
        if method == 'POST':
            which = pv
        else:
            which = qv
        for arg in args[2:]:
            if '=' in arg:
                which.append(tuple(arg.split('=', 1)))
            elif arg.upper()[0] == 'P':
                which = pv
            elif arg.upper()[0] == 'Q':
                which = qv

        if qv:
            qv = urlencode(qv)
            url += ('?' in url and '&' or '?') + qv

        # Log us in automagically!
        httpd = self.session.config.http_worker.httpd
        global HACKS_SESSION_ID
        if HACKS_SESSION_ID is None:
            HACKS_SESSION_ID = httpd.make_session_id(None)
        mailpile.auth.SetLoggedIn(None,
                                  user='******',
                                  session_id=HACKS_SESSION_ID)
        cookie = httpd.session_cookie

        try:
            uo = URLopener()
            uo.addheader('Cookie', '%s=%s' % (cookie, HACKS_SESSION_ID))
            if method == 'POST':
                (fn, hdrs) = uo.retrieve(url, data=urlencode(pv))
            else:
                (fn, hdrs) = uo.retrieve(url)
            hdrs = unicode(hdrs)
            data = open(fn, 'rb').read().strip()
            if data.startswith('{') and 'application/json' in hdrs:
                data = json.loads(data)
            return self._success('%s %s' % (method, url),
                                 result={
                                     'headers': hdrs.splitlines(),
                                     'data': data
                                 })
        except:
            self._ignore_exception()
            return self._error('%s %s' % (method, url))
Example #5
    def command(self):
        args = list(self.args)
        method, url = args[0:2]

        if not url.startswith('http'):
            url = 'http://%s:%s%s' % (self.session.config.sys.http_host,
                                      self.session.config.sys.http_port,
                                      ('/' + url).replace('//', '/'))

        # FIXME: The python URLopener doesn't seem to support other verbs,
        #        which is really quite lame.
        method = method.upper()
        assert(method in ('GET', 'POST'))

        qv, pv = [], []
        if method == 'POST':
            which = pv
        else:
            which = qv
        for arg in args[2:]:
            if '=' in arg:
                which.append(tuple(arg.split('=', 1)))
            elif arg.upper()[0] == 'P':
                which = pv
            elif arg.upper()[0] == 'Q':
                which = qv

        if qv:
            qv = urlencode(qv)
            url += ('?' in url and '&' or '?') + qv

        # Log us in automagically!
        httpd = self.session.config.http_worker.httpd
        global HACKS_SESSION_ID
        if HACKS_SESSION_ID is None:
            HACKS_SESSION_ID = httpd.make_session_id(None)
        mailpile.auth.SetLoggedIn(None,
                                  user='******',
                                  session_id=HACKS_SESSION_ID)
        cookie = httpd.session_cookie

        try:
            uo = URLopener()
            uo.addheader('Cookie', '%s=%s' % (cookie, HACKS_SESSION_ID))
            with TcpConnBroker().context(need=[TcpConnBroker.OUTGOING_HTTP]):
                if method == 'POST':
                    (fn, hdrs) = uo.retrieve(url, data=urlencode(pv))
                else:
                    (fn, hdrs) = uo.retrieve(url)
            hdrs = unicode(hdrs)
            data = open(fn, 'rb').read().strip()
            if data.startswith('{') and 'application/json' in hdrs:
                data = json.loads(data)
            return self._success('%s %s' % (method, url), result={
                'headers': hdrs.splitlines(),
                'data': data
            })
        except:
            self._ignore_exception()
            return self._error('%s %s' % (method, url))
Example #6
def getcif(target):
    """
    Get all ICSD cif files listed in target file.
    The target file should contain tag like '# BCC'.
    """
    matgenIDs=getMatgenIDs()

    if not os.path.isdir('./ciffiles'):
        os.makedirs('./ciffiles')

    with open(target,'r') as f:
        st=f.readline()
        t1=time.time()
        while st:
            if st[0]=='#':
                tg=st.split()[-1]
                st=f.readline()
                t2=time.time()
                print "time for the %s = %2.2f sec" %(tg,t2-t1) 
                t1=time.time()
                continue
            st=st.strip()
            ind=getID(st)
            if ind in matgenIDs:
                continue #skip matgen compounds
            URL=prefix+tg+'/'+st+'/'+st+'.cif' 
            testfile=URLopener()
            try:
                testfile.retrieve(URL,'ciffiles/'+st)
            except:
                print "Error: ",URL

            st=f.readline()
Example #7
    def handle_starttag(self, tag, attrs):
        #tmpoutput = ""
        count = 0
        global bDoWork
        #self.output = ""
        # Only parse the 'anchor' tag.
        if tag == "a":
            # Check the list of defined attributes.
            for name, value in attrs:
                # If href is defined, print it.
                if name == "href":
                    if value[len(value) - 3:len(value)] == "jpg":
                        #print value
                        if not "http://" in value and bDoWork == True: 
                            bDoWork = False
                            tmpoutput = value
                            #print "Val: " + value
                            imgurl = 'http://apod.nasa.gov/apod/' + tmpoutput
                            #print "IMGURL: " + imgurl
                            filename = imgurl.split('/')[-1]
                            #print "FileName: " + filename

                            if (not os.path.isfile(filename)) and ('apod.nasa.gov' in imgurl):
                                #print "Downloading: " + filename
                                image = URLopener()
                                image.retrieve(imgurl,filename) 
                                sleep(lWaitTime)
                            elif (os.path.isfile(filename)):
                                print "Verified: " + filename
                            break
Example #8
    def command(self):
        args = list(self.args)
        method, url = args[0:2]

        if not url.startswith("http"):
            url = "http://%s:%s%s" % (
                self.session.config.sys.http_host,
                self.session.config.sys.http_port,
                ("/" + url).replace("//", "/"),
            )

        # FIXME: The python URLopener doesn't seem to support other verbs,
        #        which is really quite lame.
        method = method.upper()
        assert method in ("GET", "POST")

        qv, pv = [], []
        if method == "POST":
            which = pv
        else:
            which = qv
        for arg in args[2:]:
            if "=" in arg:
                which.append(tuple(arg.split("=", 1)))
            elif arg.upper()[0] == "P":
                which = pv
            elif arg.upper()[0] == "Q":
                which = qv

        if qv:
            qv = urlencode(qv)
            url += ("?" in url and "&" or "?") + qv

        # Log us in automagically!
        httpd = self.session.config.http_worker.httpd
        global HACKS_SESSION_ID
        if HACKS_SESSION_ID is None:
            HACKS_SESSION_ID = httpd.make_session_id(None)
        mailpile.auth.SetLoggedIn(None, user="******", session_id=HACKS_SESSION_ID)
        cookie = httpd.session_cookie

        try:
            uo = URLopener()
            uo.addheader("Cookie", "%s=%s" % (cookie, HACKS_SESSION_ID))
            with TcpConnBroker().context(need=[TcpConnBroker.OUTGOING_HTTP], oneshot=True):
                if method == "POST":
                    (fn, hdrs) = uo.retrieve(url, data=urlencode(pv))
                else:
                    (fn, hdrs) = uo.retrieve(url)
            hdrs = unicode(hdrs)
            data = open(fn, "rb").read().strip()
            if data.startswith("{") and "application/json" in hdrs:
                data = json.loads(data)
            return self._success("%s %s" % (method, url), result={"headers": hdrs.splitlines(), "data": data})
        except:
            self._ignore_exception()
            return self._error("%s %s" % (method, url))
Example #9
class SlippyCache(object):
    """This is a basic map tile cache used by the SlippyPanel class
    to retrieve and store locally the images that form the map"""
    def __init__(self, source, proxy = ""):
        self.source = source
        if len(proxy) > 0:
            self._opener = URLopener({"http": proxy})
        else:
            self._opener = URLopener()
        self._fetchQueue = Queue(0)
        self._fetchThread = Thread(target = self._FetchTile)
        self._fetchThread.setDaemon(True)
        self._fetchThread.start()

    def _FetchTile(self):
        task = ""
        while task is not None:
            task = self._fetchQueue.get()
            url, fname = task
            if not os.path.isfile(fname):
                print "Getting", fname
                try:
                    self._opener.retrieve(url, "tmp.png")
                    shutil.move("tmp.png", fname)
                except IOError:
                    pass
            self._fetchQueue.task_done()

    def StartNewFetchBatch(self):
        try:
            while True:
                item = self._fetchQueue.get(False)
                self._fetchQueue.task_done()
        except Empty:
            pass

    def GetTileFilename(self, xtile, ytile, zoom):
        numTiles = 2 ** zoom
        while xtile >= numTiles:
            xtile -= numTiles
        if xtile < 0 or ytile < 0 or ytile >= numTiles:
            # Indicate that this is not a valid tile
            return None
        else:
            fname = "/".join([self.source.get_full_name(), str(zoom), str(xtile), str(ytile) + ".png"])
            if not os.path.isfile(fname):
                url = self.source.get_tile_url(xtile, ytile, zoom)
                # Ensure that the directory exists
                dname = os.path.dirname(fname)
                if not os.path.isdir(dname):
                    os.makedirs(dname)
                self._fetchQueue.put((url, fname))
            # Valid tile, though may not be present in the cache
            return fname
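Example #9 expects a tile source object exposing get_full_name() and get_tile_url(). A small usage sketch follows; the OSMSource class and its tile URL pattern are illustrative assumptions.

# Hypothetical tile source and driver code for SlippyCache.
class OSMSource(object):
    def get_full_name(self):
        return "cache/osm"

    def get_tile_url(self, xtile, ytile, zoom):
        return "http://tile.openstreetmap.org/%d/%d/%d.png" % (zoom, xtile, ytile)

cache = SlippyCache(OSMSource())
fname = cache.GetTileFilename(0, 0, 1)  # queues a background download if the tile is missing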
Example #10
    def command(self):
        args = list(self.args)
        method, url = args[0:2]

        if not url.startswith('http'):
            url = 'http://%s:%s%s' % (self.session.config.sys.http_host,
                                      self.session.config.sys.http_port,
                                      ('/' + url).replace('//', '/'))

        # FIXME: The python URLopener doesn't seem to support other verbs,
        #        which is really quite lame.
        method = method.upper()
        assert (method in ('GET', 'POST'))

        qv, pv = [], []
        if method == 'POST':
            which = pv
        else:
            which = qv
        for arg in args[2:]:
            if '=' in arg:
                which.append(tuple(arg.split('=', 1)))
            elif arg.upper()[0] == 'P':
                which = pv
            elif arg.upper()[0] == 'Q':
                which = qv

        if qv:
            qv = urlencode(qv)
            url += ('?' in url and '&' or '?') + qv

        try:
            uo = URLopener()
            if method == 'POST':
                (fn, hdrs) = uo.retrieve(url, data=urlencode(pv))
            else:
                (fn, hdrs) = uo.retrieve(url)
            hdrs = unicode(hdrs)
            data = open(fn, 'rb').read().strip()
            if data.startswith('{') and 'application/json' in hdrs:
                data = json.loads(data)
            return self._success('%s %s' % (method, url),
                                 result={
                                     'headers': hdrs.splitlines(),
                                     'data': data
                                 })
        except:
            self._ignore_exception()
            return self._error('%s %s' % (method, url))
Example #11
    def command(self):
        args = list(self.args)
        method, url = args[0:2]

        if not url.startswith('http'):
            url = 'http://%s:%s%s' % (self.session.config.sys.http_host,
                                      self.session.config.sys.http_port,
                                      ('/' + url).replace('//', '/'))

        # FIXME: The python URLopener doesn't seem to support other verbs,
        #        which is really quite lame.
        method = method.upper()
        assert(method in ('GET', 'POST'))

        qv, pv = [], []
        if method == 'POST':
            which = pv
        else:
            which = qv
        for arg in args[2:]:
            if '=' in arg:
                which.append(tuple(arg.split('=', 1)))
            elif arg.upper()[0] == 'P':
                which = pv
            elif arg.upper()[0] == 'Q':
                which = qv

        if qv:
            qv = urlencode(qv)
            url += ('?' in url and '&' or '?') + qv

        try:
            uo = URLopener()
            if method == 'POST':
                (fn, hdrs) = uo.retrieve(url, data=urlencode(pv))
            else:
                (fn, hdrs) = uo.retrieve(url)
            hdrs = unicode(hdrs)
            data = open(fn, 'rb').read().strip()
            if data.startswith('{') and 'application/json' in hdrs:
                data = json.loads(data)
            return self._success('%s %s' % (method, url), result={
                'headers': hdrs.splitlines(),
                'data': data
            })
        except:
            self._ignore_exception()
            return self._error('%s %s' % (method, url))
Example #12
def try_download(_path, _file, _url, _stale,):
    now = time()
    url = URLopener()
    file_exists = isfile(_path+_file) == True
    if file_exists:
        file_old = (getmtime(_path+_file) + _stale) < now
    if not file_exists or (file_exists and file_old):
        try:
            url.retrieve(_url, _path+_file)
            result = 'ID ALIAS MAPPER: \'{}\' successfully downloaded'.format(_file)
        except IOError:
            result = 'ID ALIAS MAPPER: \'{}\' could not be downloaded'.format(_file)
    else:
        result = 'ID ALIAS MAPPER: \'{}\' is current, not downloaded'.format(_file)
    url.close()
    return result
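The snippet above relies on names imported elsewhere (time, isfile, getmtime, URLopener). A possible invocation, with a hypothetical file name and URL, might look like this:

# Imports assumed by try_download(), plus an illustrative call.
from time import time
from os.path import isfile, getmtime
from urllib import URLopener

# Re-download alias_map.json only if the local copy is older than one day (86400 s).
print try_download('./', 'alias_map.json', 'http://example.com/alias_map.json', 86400)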
Example #13
    def download(self, sysctl, code):
        try:
            logging.info('Begin download files.')

            if not isdir(self.p_dwld):
                mkdir(self.p_dwld)

            obj = URLopener()
            for f in self.files:
                logging.info('Start download {}.'.format(f))
                obj.retrieve(self.url + f, self.p_dwld + f)
                logging.info('Download {} done.'.format(f))
            return True

        except BaseException as down:
            logging.error('Download {}.'.format(down))
            self._rolback(sysctl, code)
Example #14
def download_package(pkg_name, pkg_version):
    file_name, path, expected_md5 = get_package_info(pkg_name, pkg_version)
    if not file_name:
        return False
    if os.path.isfile(file_name) and check_md5sum(file_name, expected_md5):
        print 'File with matching md5sum already exists, skipping {0}'.format(
            file_name)
        return True
    downloader = URLopener()
    pkg_url = '{0}/packages/{1}'.format(PYPI_MIRROR, path)
    print 'Downloading {0} from {1}'.format(file_name, pkg_url)
    downloader.retrieve(pkg_url, file_name)
    if check_md5sum(file_name, expected_md5):
        return True
    else:
        print 'MD5 mismatch in file {0}.'.format(file_name)
        return False
Example #15
def startplayback_images(args):
    """Shows an image
    """
    # cache path
    sDir = xbmc.translatePath(args._addon.getAddonInfo("profile"))
    if args.PY2:
        sPath = join(sDir.decode("utf-8"), u"image.jpg")
    else:
        sPath = join(sDir, "image.jpg")

    # download image
    file = URLopener()
    file.retrieve(args.url, sPath)

    # display image
    item = xbmcgui.ListItem(getattr(args, "title", "Title not provided"), path=sPath)
    xbmcplugin.setResolvedUrl(int(args._argv[1]), True, item)
    xbmc.executebuiltin("SlideShow(" + sDir + ")")
Example #16
def download_package(pkg_name, pkg_version):
  file_name, path, hash_algorithm, expected_digest = get_package_info(pkg_name,
      pkg_version)
  if not file_name:
    return False
  if os.path.isfile(file_name) and check_digest(file_name, hash_algorithm,
      expected_digest):
    print 'File with matching digest already exists, skipping {0}'.format(file_name)
    return True
  downloader = URLopener()
  pkg_url = '{0}/packages/{1}'.format(PYPI_MIRROR, path)
  print 'Downloading {0} from {1}'.format(file_name, pkg_url)
  downloader.retrieve(pkg_url, file_name)
  if check_digest(file_name, hash_algorithm, expected_digest):
    return True
  else:
    print 'Hash digest check failed in file {0}.'.format(file_name)
    return False
Example #17
def download_reports(years=_years, weeks=_weeks):
    '''Crawls through IMoH website and download all excel files in the given weeks and years'''
    # Create paths for logging files and download loaction
    prefix = datetime.now().strftime('./log/weeklies/%y%m%d_%H%M%S_')
    log_d = prefix + "downloads.log"
    log_f = prefix + "FAILED.log"
    base_loc = 'http://www.health.gov.il/PublicationsFiles/IWER'
    # URL object
    my_file = URLopener()

    for year in years:
        print "\n", year,
        for week in weeks:
            f = open(log_d, 'a')
            f.write('\n{year}_{week}: '.format(week=week, year=year))
            # There are many different options of paths
            options = ['{base}{week:02d}_{year}.xls'.format(base=base_loc, week=week, year=year),
                       '{base}{week}_{year}.xls'.format(base=base_loc, week=week, year=year),
                       '{base}{week:02d}_{year}.xlsx'.format(base=base_loc, week=week, year=year),
                       '{base}{week}_{year}.xlsx'.format(base=base_loc, week=week, year=year)]
            for i, o in enumerate(options):
                filetype = o.split(".")[-1]
                try:
                    # Try different paths on remote, but always save on same path locally
                    my_file.retrieve(o,
                                     './data/weeklies/{year}_{week:02d}.{ft}'.format(week=week, year=year, ft=filetype))
                    # If succeeds write which filetype (xls/x) was saved
                    f.write('{ft}'.format(ft=filetype), )
                    # If downloads succeeds move close the log file and break the loop
                    f.close()
                    break
                except:
                    # When option excepted, write try number to the log
                    f.write("{} ".format(i + 1))
                    # If all options were exhausted, it has failed.
                    if i == len(options) - 1 and week != 53:
                        print "== {year}_{week:02d} FAILED ==".format(week=week, year=year),
                        with open(log_f, 'a') as failed:
                            failed.write("{year}_{week:02d} FAILED\n".format(week=week, year=year))
                        f.write("FAILED")
                        f.close()
        f.close()
Example #18
        raw_input("Press <enter> to exit.")
        sys.exit(1)

    urlget = URLopener({})
    errors = []
    for line in f.readlines():
        try:
            url, fname = [s.strip() for s in line.split('    ')]
        except ValueError:
            print("Could not parse this input: " + line)
            continue
        if osp.isfile(fname):
            print('Skipping existing file %s' % fname)
        else:
            print('Downloading %s to %s' % (url, fname))
            try:
                urlget.retrieve(url, fname, report_progress)
            except IOError:
                print(' (!) Download failed, adding to plan B list')
                errors.append(url)

    if errors:
        print(
            "\nAn error(s) was detected; would you like to retry using the " +
            "system browser?")
        raw_input("Press Ctrl+C to exit or <enter> to continue.")
        for url in errors:
            openurl(url)

    raw_input("Press <enter> to exit.")
Example #19
         date_delivered = None
     try:
         party = flyer_dict['publisher_party']['party_name']
     except:
         party = None
     try:
         constituency = flyer_dict['constituency']['slug']
     except:
         constituency = "Not Coded"
     images = flyer_dict['images']
     this_flyer_storage = storage + "/" + constituency + "/" + this_flyer
     if os.path.isdir(this_flyer_storage) == False:
         check_path(this_flyer_storage)
         with open(
                 storage + "/" + "flyerinfo_" + str(newfolder_date.date()) +
                 ".csv", "a") as outfile:
             f = csv.writer(outfile)
             f.writerow([
                 party,
                 int(this_flyer), constituency, date_uploaded,
                 date_delivered
             ])
             outfile.close()
         for image in range(len(images)):
             print images[image]['image']
             imgfile = URLopener()
             imgfile.retrieve(
                 images[image]['image'],
                 this_flyer_storage + "/" + str(image) + ".jpg")
 index += 1
 print "Finished: " + str(index) + " on page " + str(pageno)
Example #20
def DownloadImageFromAPODPage(url):

    if "ap140302" in url:
        print "stop here"

    # Copy all of the content from the provided web page
    webpage = urlopen(url).read()
    #print "-"
    #print "URL: " + url
    global bDoWork
    global bCleanExtras
    global bVerified
    global strAPODPicturesFolder
    strAPODFileName = ""
    # Here I retrieve and print to screen the titles and links with just Beautiful Soup
    #print "Loading Soup"
    soup = BeautifulSoup(webpage)
    for url in soup.findAll("a"):
        imgurl = url.get('href')
        #print imgurl
        if not ('http://' in imgurl):
            imgurl = 'http://apod.nasa.gov/' + url.get('href')
            #sleep(lWaitTime)
            if imgurl[len(imgurl) - 3:len(imgurl)] == "jpg":
                #print "IMG: " + imgurl
                strAPODFileName = imgurl.strip().split('/')[-1]
                #print "strAPODFileName = " + strAPODFileName
                filename = strAPODPicturesFolder + strAPODFileName
                if bDoWork:
                    bDoWork = False
                    #filename = url.strip().split('/')[-1]
                    #print filename
                    if (not os.path.isfile(filename)) and ('apod.nasa.gov'
                                                           in imgurl):
                        #print "Downloading: " + filename
                        image = URLopener()
                        image.retrieve(imgurl, filename)
                        sleep(lWaitTime)
                    elif (os.path.isfile(filename)):
                        #print "Verified: " + filename
                        bVerified = True
                    if not bCleanExtras:
                        #if we are not cleaning extras we can break here
                        #print "Not Seeking Extras"
                        break
                else:
                    if (os.path.isfile(filename)):
                        #this is the logic to clean extra downloads/duplicates
                        #print "Deleting " + filename
                        os.remove(filename)

    #print "Seeking Title"
    txtName = ""
    bForce = False

    for bTag in soup.findAll("title"):
        if (txtName == ""):
            #bForce = True
            txtName = bTag.text
            txtName = txtName.replace("APOD:", "").strip()
            if "\r" in txtName or "\n" in txtName:
                txtName = txtName.strip().replace("\r", ' ').replace(
                    "\n", " ").replace("  ", " ").replace("  ", " ")
                bForce = True
            #print txtName

    for bTag in soup.findAll("b"):
        if (txtName == ""):
            txtName = bTag.text
            txtName = txtName.strip()
            if "\r" in txtName or "\n" in txtName:
                txtName = txtName.strip().replace("\r", ' ').replace(
                    "\n", " ").replace("  ", " ").replace("  ", " ")
                bForce = True
            #print txtName

    #print "Loading Info"
    txtPName = ""
    for pTag in soup.findAll("p"):
        txtPName = pTag.text
        txtPName = txtPName.strip()
        if "Explanation:" in txtPName:
            iLoc = txtPName.find("Tomorrow's picture:")
            iLoc = iLoc - 1
            iLoc2 = txtPName.find("digg_url")
            if iLoc2 > 0:
                #txtPName = txtPName
                txtPName = txtPName[:iLoc2]
            iLoc2 = txtPName.find("APOD presents:")
            if iLoc2 > 0:
                #txtPName = txtPName
                txtPName = txtPName[:iLoc2]
            #The Amateur Astronomers Association of New York Presents:
            iLoc2 = txtPName.find(
                "The Amateur Astronomers Association of New York Presents:")
            if iLoc2 > 0:
                #txtPName = txtPName
                txtPName = txtPName[:iLoc2]
            iLoc2 = txtPName.find("Presents:")
            if iLoc2 > 0:
                #txtPName = txtPName
                txtPName = txtPName[:iLoc2]
            iLoc2 = txtPName.find("What was that?:")
            if iLoc2 > 0:
                #txtPName = txtPName
                txtPName = txtPName[:iLoc2]
            iLoc2 = txtPName.find("Follow APOD on:")
            if iLoc2 > 0:
                #txtPName = txtPName
                txtPName = txtPName[:iLoc2]
                bForce = True
            if iLoc > 0 and (strAPODFileName <> ""):
                txtPName = txtPName[0:iLoc].strip().replace('\n', ' ').replace(
                    '  ', ' ').replace('  ', ' ').replace('  ', ' ').replace(
                        'Explanation: ', '')
                if bForce or not (os.path.isfile(
                        strAPODDataFolder +
                        strAPODFileName.replace('.jpg', '_Title.txt'))):
                    #print "Title: " + txtName
                    #print "FN: " + strAPODFileName.replace('.jpg', '_Title.txt')
                    f = open(
                        strAPODDataFolder +
                        strAPODFileName.replace('.jpg', '_Title.txt'), 'w')
                    f.write(txtName.encode('utf8'))
                    f.close()
                if bForce or (txtPName.strip() <> "") or (iLoc2 > 0) or (
                        not (os.path.isfile(
                            strAPODDataFolder +
                            strAPODFileName.replace('.jpg', '_Info.txt')))):
                    #print "Info Paragraph: " + txtPName.encode('utf8')
                    #print "FN: " + strAPODFileName.replace('.jpg', '_Info.txt')
                    with open(
                            strAPODDataFolder +
                            strAPODFileName.replace('.jpg', '_Info.txt'),
                            'w') as f:
                        #f = open(strAPODDataFolder + strAPODFileName.replace('.jpg', '_Info.txt'), 'w')
                        f.write(txtPName.encode('utf8'))
                        #f.close
                        #f.flush

    #print "Checking for Title File"
    if (not strAPODFileName == "") and (not (
            os.path.isfile(strAPODDataFolder +
                           strAPODFileName.replace('.jpg', '_Title.txt')))):
        #print "Title not found"
        txtAllPageText = soup.get_text().replace("\r", ' ').replace(
            "\n", " ").replace("  ", " ").replace("  ", " ")
        if "-" in txtAllPageText:
            iLoc1 = txtAllPageText.find("-")
            txtAllPageText = txtAllPageText[iLoc1 + 1:].strip()
            iLoc2 = txtAllPageText.find("Astronomy Picture of the Day")
            txtAllPageText = txtAllPageText[:iLoc2].strip()
            #print "Title: " + txtAllPageText
            #print "FN: " + strAPODFileName.replace('.jpg', '_Title.txt')
            f = open(
                strAPODDataFolder +
                strAPODFileName.replace('.jpg', '_Title.txt'), 'w')
            f.write(txtAllPageText.encode('utf8'))
            f.close()

    #print "Checking for Info File"
    if (not strAPODFileName == "") and (not (
            os.path.isfile(strAPODDataFolder +
                           strAPODFileName.replace('.jpg', '_Info.txt')))):
        #print "Info file found"
        txtAllPageText = soup.get_text().replace("\r", ' ').replace(
            "\n", " ").replace("  ", " ").replace("  ", " ")
        if "Explanation:" in txtAllPageText:
            iLoc1 = txtAllPageText.find("Explanation:")
            txtAllPageText = txtAllPageText[iLoc1 + 12:].strip()
            iLoc2 = txtAllPageText.find("Tomorrow's picture:")
            txtAllPageText = txtAllPageText[:iLoc2].strip()
            iLoc2 = txtAllPageText.find("digg_url")
            if iLoc2 > 0:
                #txtPName = txtPName
                txtAllPageText = txtAllPageText[:iLoc2]
            iLoc2 = txtAllPageText.find("APOD Presents:")
            if iLoc2 > 0:
                #txtPName = txtPName
                txtAllPageText = txtAllPageText[:iLoc2]
            iLoc2 = txtAllPageText.find("Presents:")
            if iLoc2 > 0:
                #txtPName = txtPName
                txtAllPageText = txtAllPageText[:iLoc2]
            iLoc2 = txtAllPageText.find("What was that?:")
            if iLoc2 > 0:
                #txtPName = txtPName
                txtAllPageText = txtAllPageText[:iLoc2]
            iLoc2 = txtAllPageText.find(
                "The Amateur Astronomers Association of New York Presents:")
            if iLoc2 > 0:
                #txtPName = txtPName
                txtAllPageText = txtAllPageText[:iLoc2]
            iLoc2 = txtAllPageText.find("Follow APOD on:")
            if iLoc2 > 0:
                #txtPName = txtPName
                txtAllPageText = txtAllPageText[:iLoc2]
            #print "Info Paragraph: " + txtAllPageText
            #print "FN: " + strAPODFileName.replace('.jpg', '_Info.txt')
            with open(
                    strAPODDataFolder +
                    strAPODFileName.replace('.jpg', '_Info.txt'), 'w') as f:
                #f = open(strAPODDataFolder + strAPODFileName.replace('.jpg', '_Info.txt'), 'w')
                f.write(txtAllPageText.encode('utf8'))
                #f.close

    os.fsync
    if (not strAPODFileName == "") and (os.path.isfile(strAPODPicturesFolder +
                                                       strAPODFileName)):
        #print "APOD Image File Found"
        #This is True on Windows, should be set to false otherwise
        if False:
            strAPODDestCache = "G:\\apod\\cache_NookHD+\\"
            SaveCacheImage(strAPODPicturesFolder + strAPODFileName,
                           strAPODDestCache + strAPODFileName, 1920.0, 1280.0)
            strAPODDestCache = "G:\\apod\\cache_Nexus7_2013\\"
            SaveCacheImage(strAPODPicturesFolder + strAPODFileName,
                           strAPODDestCache + strAPODFileName, 1920.0, 1104.0)
        #save a PIL image containing the Title and Info
        if strAPODPicsWithText <> "":
            SavePILText(strAPODFileName)
        if strAPODCache <> "":
            SaveCacheImage(strAPODPicturesFolder + strAPODFileName,
                           strAPODCache + strAPODFileName, 1920.0, 1080.0)
Example #21
def download_edgent():
    file = URLopener()
    file.retrieve(EDGENT_TAR_URL, TMP_DIR + "/" + EDGENT_TAR)
    exec_cmd("cd %s ; tar zxvf %s/%s" % (TMP_DIR, TMP_DIR, EDGENT_TAR))
Example #22
def DownloadImageFromAPODPage(url):

    if "ap140302" in url:
        print "stop here"

    # Copy all of the content from the provided web page
    webpage = urlopen(url).read()
    #print "-"
    #print "URL: " + url
    global bDoWork
    global bCleanExtras
    global bVerified
    global strAPODPicturesFolder
    strAPODFileName = ""
    # Here I retrieve and print to screen the titles and links with just Beautiful Soup
    #print "Loading Soup"
    soup = BeautifulSoup(webpage)
    for url in soup.findAll("a"):
        imgurl = url.get('href')
        #print imgurl
        if not ('http://' in imgurl):
            imgurl = 'http://apod.nasa.gov/' + url.get('href')
            #sleep(lWaitTime)
            if imgurl[len(imgurl) - 3:len(imgurl)] == "jpg":
                #print "IMG: " + imgurl
                strAPODFileName = imgurl.strip().split('/')[-1]
                #print "strAPODFileName = " + strAPODFileName
                filename = strAPODPicturesFolder + strAPODFileName
                if bDoWork:
                    bDoWork = False
                    #filename = url.strip().split('/')[-1]
                    #print filename
                    if (not os.path.isfile(filename)) and ('apod.nasa.gov' in imgurl):
                        #print "Downloading: " + filename
                        image = URLopener()
                        image.retrieve(imgurl,filename) 
                        sleep(lWaitTime)
                    elif (os.path.isfile(filename)):
                        #print "Verified: " + filename
                        bVerified = True
                    if not bCleanExtras:
                        #if we are not cleaning extras we can break here
                        #print "Not Seeking Extras"
                        break
                else:
                    if (os.path.isfile(filename)):
                        #this is the logic to clean extra downloads/duplicates                
                        #print "Deleting " + filename
                        os.remove(filename)
    
    #print "Seeking Title"
    txtName = ""
    bForce = False

    for bTag in soup.findAll("title"):
        if (txtName == ""):
            #bForce = True
            txtName = bTag.text
            txtName = txtName.replace("APOD:", "").strip()
            if "\r" in txtName or "\n" in txtName:
                txtName = txtName.strip().replace("\r", ' ').replace("\n", " ").replace("  ", " ").replace("  ", " ")
                bForce = True
            #print txtName

    for bTag in soup.findAll("b"):
        if (txtName == ""):
            txtName = bTag.text
            txtName = txtName.strip()
            if "\r" in txtName or "\n" in txtName:
                txtName = txtName.strip().replace("\r", ' ').replace("\n", " ").replace("  ", " ").replace("  ", " ")
                bForce = True
            #print txtName

    #print "Loading Info"
    txtPName = ""
    for pTag in soup.findAll("p"):
        txtPName = pTag.text
        txtPName = txtPName.strip()
        if "Explanation:" in txtPName:
            iLoc = txtPName.find("Tomorrow's picture:")
            iLoc = iLoc - 1
            iLoc2 = txtPName.find("digg_url")
            if iLoc2 > 0:
                #txtPName = txtPName
                txtPName = txtPName[:iLoc2]
            iLoc2 = txtPName.find("APOD presents:")
            if iLoc2 > 0:
                #txtPName = txtPName
                txtPName = txtPName[:iLoc2]
            #The Amateur Astronomers Association of New York Presents:
            iLoc2 = txtPName.find("The Amateur Astronomers Association of New York Presents:")
            if iLoc2 > 0:
                #txtPName = txtPName
                txtPName = txtPName[:iLoc2]
            iLoc2 = txtPName.find("Presents:")
            if iLoc2 > 0:
                #txtPName = txtPName
                txtPName = txtPName[:iLoc2]
            iLoc2 = txtPName.find("What was that?:")
            if iLoc2 > 0:
                #txtPName = txtPName
                txtPName = txtPName[:iLoc2]
            iLoc2 = txtPName.find("Follow APOD on:")
            if iLoc2 > 0:
                #txtPName = txtPName
                txtPName = txtPName[:iLoc2]
                bForce = True
            if iLoc > 0 and (strAPODFileName <> ""):
                txtPName = txtPName[0:iLoc].strip().replace('\n', ' ').replace('  ', ' ').replace('  ', ' ').replace('  ', ' ').replace('Explanation: ', '')
                if bForce or not (os.path.isfile(strAPODDataFolder + strAPODFileName.replace('.jpg', '_Title.txt'))):
                    #print "Title: " + txtName
                    #print "FN: " + strAPODFileName.replace('.jpg', '_Title.txt')
                    f = open(strAPODDataFolder + strAPODFileName.replace('.jpg', '_Title.txt'), 'w')
                    f.write(txtName.encode('utf8'))
                    f.close()
                if bForce or (txtPName.strip() <> "") or (iLoc2 > 0) or (not (os.path.isfile(strAPODDataFolder + strAPODFileName.replace('.jpg', '_Info.txt')))):
                    #print "Info Paragraph: " + txtPName.encode('utf8')
                    #print "FN: " + strAPODFileName.replace('.jpg', '_Info.txt')
                    with open(strAPODDataFolder + strAPODFileName.replace('.jpg', '_Info.txt'), 'w') as f:
                        #f = open(strAPODDataFolder + strAPODFileName.replace('.jpg', '_Info.txt'), 'w')
                        f.write(txtPName.encode('utf8'))
                        #f.close
                        #f.flush

    #print "Checking for Title File"
    if (not strAPODFileName == "") and (not (os.path.isfile(strAPODDataFolder + strAPODFileName.replace('.jpg', '_Title.txt')))):
        #print "Title not found"
        txtAllPageText = soup.get_text().replace("\r", ' ').replace("\n", " ").replace("  ", " ").replace("  ", " ")
        if "-" in txtAllPageText:
            iLoc1 = txtAllPageText.find("-")
            txtAllPageText = txtAllPageText[iLoc1 + 1:].strip()
            iLoc2 = txtAllPageText.find("Astronomy Picture of the Day")
            txtAllPageText = txtAllPageText[:iLoc2].strip()
            #print "Title: " + txtAllPageText
            #print "FN: " + strAPODFileName.replace('.jpg', '_Title.txt')
            f = open(strAPODDataFolder + strAPODFileName.replace('.jpg', '_Title.txt'), 'w')
            f.write(txtAllPageText.encode('utf8'))
            f.close()

    #print "Checking for Info File"
    if (not strAPODFileName == "") and (not (os.path.isfile(strAPODDataFolder + strAPODFileName.replace('.jpg', '_Info.txt')))):
        #print "Info file found"
        txtAllPageText = soup.get_text().replace("\r", ' ').replace("\n", " ").replace("  ", " ").replace("  ", " ")
        if "Explanation:" in txtAllPageText:
            iLoc1 = txtAllPageText.find("Explanation:")
            txtAllPageText = txtAllPageText[iLoc1 + 12:].strip()
            iLoc2 = txtAllPageText.find("Tomorrow's picture:")
            txtAllPageText = txtAllPageText[:iLoc2].strip()
            iLoc2 = txtAllPageText.find("digg_url")
            if iLoc2 > 0:
                #txtPName = txtPName
                txtAllPageText = txtAllPageText[:iLoc2]
            iLoc2 = txtAllPageText.find("APOD Presents:")
            if iLoc2 > 0:
                #txtPName = txtPName
                txtAllPageText = txtAllPageText[:iLoc2]
            iLoc2 = txtAllPageText.find("Presents:")
            if iLoc2 > 0:
                #txtPName = txtPName
                txtAllPageText = txtAllPageText[:iLoc2]
            iLoc2 = txtAllPageText.find("What was that?:")
            if iLoc2 > 0:
                #txtPName = txtPName
                txtAllPageText = txtAllPageText[:iLoc2]
            iLoc2 = txtAllPageText.find("The Amateur Astronomers Association of New York Presents:")
            if iLoc2 > 0:
                #txtPName = txtPName
                txtAllPageText = txtAllPageText[:iLoc2]
            iLoc2 = txtAllPageText.find("Follow APOD on:")
            if iLoc2 > 0:
                #txtPName = txtPName
                txtAllPageText = txtAllPageText[:iLoc2]
            #print "Info Paragraph: " + txtAllPageText
            #print "FN: " + strAPODFileName.replace('.jpg', '_Info.txt')
            with open(strAPODDataFolder + strAPODFileName.replace('.jpg', '_Info.txt'), 'w') as f:
                #f = open(strAPODDataFolder + strAPODFileName.replace('.jpg', '_Info.txt'), 'w')
                f.write(txtAllPageText.encode('utf8'))
                #f.close

    os.fsync
    if (not strAPODFileName == "") and (os.path.isfile(strAPODPicturesFolder + strAPODFileName)):
        #print "APOD Image File Found"
        #This is True on Windows, should be set to false otherwise
        if False:
            strAPODDestCache = "G:\\apod\\cache_NookHD+\\"
            SaveCacheImage(strAPODPicturesFolder + strAPODFileName, strAPODDestCache + strAPODFileName, 1920.0, 1280.0) 
            strAPODDestCache = "G:\\apod\\cache_Nexus7_2013\\"
            SaveCacheImage(strAPODPicturesFolder + strAPODFileName, strAPODDestCache + strAPODFileName, 1920.0, 1104.0) 
        #save a PIL image containing the Title and Info
        if strAPODPicsWithText <> "":
            SavePILText(strAPODFileName)
        if strAPODCache <> "":
            SaveCacheImage(strAPODPicturesFolder + strAPODFileName, strAPODCache + strAPODFileName, 1920.0, 1080.0) 
Example #23
def DownloadImageFromAPODPage(url):
    # Copy all of the content from the provided web page
    webpage = urlopen(url).read()
    print "-"
    print "URL: " + url
    global bDoWork
    global bCleanExtras
    global bVerified
    global strAPODPicturesFolder
    strAPODFileName = ""
    # Here I retrieve and print to screen the titles and links with just Beautiful Soup
    soup = BeautifulSoup(webpage)
    for url in soup.findAll("a"):
        imgurl = url.get('href')
        #print imgurl
        if not ('http://' in imgurl):
            imgurl = 'http://apod.nasa.gov/' + url.get('href')
            #sleep(lWaitTime)
            if imgurl[len(imgurl) - 3:len(imgurl)] == "jpg":
                print "IMG: " + imgurl
                strAPODFileName = imgurl.strip().split('/')[-1]
                print "strAPODFileName = " + strAPODFileName
                filename = strAPODPicturesFolder + strAPODFileName
                if bDoWork:
                    bDoWork = False
                    #filename = url.strip().split('/')[-1]
                    #print filename
                    if (not os.path.isfile(filename)) and ('apod.nasa.gov'
                                                           in imgurl):
                        print "Downloading: " + filename
                        image = URLopener()
                        image.retrieve(imgurl, filename)
                        sleep(lWaitTime)
                    elif (os.path.isfile(filename)):
                        print "Verified: " + filename
                        bVerified = True
                    if not bCleanExtras:
                        #if we are not cleaning extras we can break here
                        print "Not Seeking Extras"
                        break
                else:
                    if (os.path.isfile(filename)):
                        #this is the logic to clean extra downloads/duplicates
                        print "Deleting " + filename
                        os.remove(filename)
    txtName = ""
    for bTag in soup.findAll("b"):
        if (txtName == ""):
            txtName = bTag.text
            txtName = txtName.strip()
            print txtName

    txtPName = ""
    for pTag in soup.findAll("p"):
        txtPName = pTag.text
        txtPName = txtPName.strip()
        if "Explanation:" in txtPName:
            iLoc = txtPName.find("Tomorrow's picture:")
            iLoc = iLoc - 1
            if iLoc > 0 and (strAPODFileName <> ""):
                txtPName = txtPName[0:iLoc].strip().replace('\n', ' ').replace(
                    '  ', ' ').replace('  ', ' ').replace('  ', ' ').replace(
                        'Explanation: ', '')
                if not (os.path.isfile(
                        strAPODDataFolder +
                        strAPODFileName.replace('.jpg', '_Title.txt'))):
                    print "Title: " + txtName
                    print "FN: " + strAPODFileName.replace(
                        '.jpg', '_Title.txt')
                    f = open(
                        strAPODDataFolder +
                        strAPODFileName.replace('.jpg', '_Title.txt'), 'w')
                    f.write(txtName.encode('utf8'))
                    f.close()
                if not (os.path.isfile(
                        strAPODDataFolder +
                        strAPODFileName.replace('.jpg', '_Info.txt'))):
                    print "Info Paragraph: " + txtPName
                    print "FN: " + strAPODFileName.replace('.jpg', '_Info.txt')
                    f = open(
                        strAPODDataFolder +
                        strAPODFileName.replace('.jpg', '_Info.txt'), 'w')
                    f.write(txtPName.encode('utf8'))
                    f.close()
Example #24
from urllib import urlretrieve, URLopener
import ssl
from subprocess import call
import os
import zipfile

# Unfortunately NESP SSL certificate doesn't play nice with Python 2.7 on Windows. Tried hard to fix this the proper way but gave up.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
opener = URLopener(context=ctx)

print("Downloading sample data, please wait")
filename = 'sample.zip'
opener.retrieve('https://tsx.org.au/sample.zip', filename)

print("Extracting")
zip_ref = zipfile.ZipFile(filename, 'r')
zip_ref.extractall('.')
zip_ref.close()

# Clean up
print("Clean up")
os.remove(filename)

print("Done")
Example #25
    def install_firmware(self, new_version):
        logging.info('Update firmware request')
        logging.info('Current firmware version: {}'.format(
            self.firmware_version))
        logging.info('Firmware version to install: {}'.format(new_version))
        fw_fname_prefix = 'sensa-%s' % new_version
        fw_check_url = '%sstatic/firmware/%s.chk' % (
            self.api_url, fw_fname_prefix)
        fw_filename = fw_fname_prefix + '.zip'
        fw_url = '%sstatic/firmware/%s' % (self.api_url, fw_filename)
        # Firmware install shell script
        deploy_script = 'deploy.sh'

        # Download firmware
        fw_file = URLopener()
        try:
            fw_file.retrieve(fw_url, fw_filename)
        except IOError:
            logging.error('Error during firmware download')
            return 1
        fw_file.close()

        # Check downloaded firmware integrity
        try:
            fw_checksum_req = requests.get(fw_check_url)
        except requests.exceptions.RequestException:
            logging.error('Error during firmware download')
            return 1
        expected_check = fw_checksum_req.text.split()

        fw_checksum = md5(open(fw_filename, 'rb').read()).hexdigest()
        if(fw_checksum != expected_check[0] or
           fw_filename != expected_check[1]):
            logging.error('Error checking firmware integrity')
            return

        logging.info('Files checked. Updating')
        # Unzip
        try:
            fw_file = ZipFile(fw_filename, 'r')
        except IOError:
            logging.error('Error reading local firmware file')
            return
        fw_file.extractall()
        fw_file.close()

        # Run firmware script
        call(['sh', deploy_script])
        # Remove firmware file
        call(['rm', fw_filename])
        # Remove firmware script
        call(['rm', deploy_script])
        config = SafeConfigParser()
        config.read(self.config_file)
        # Update firmware version on config file
        config.set('device', 'firmware_version', new_version)
        try:
            conf_file = open(self.config_file, 'wb')
            try:
                config.write(conf_file)
            finally:
                conf_file.close()
        except IOError:
            logging.error('Error updating version on config file')

Example #26
    def parse(self, response):
        global excel_file
        species_name = observation_name_dict[str(response.url).strip().split(
            "/"
        )[-1]]  # look up species name in our dictionary by using observation number as key

        self.driver.get(response.url)  # load an observation page in selenium

        try:  # try to load webpage
            image_elements = WebDriverWait(self.driver, 5).until(
                expected_conditions.presence_of_all_elements_located(
                    (By.XPATH, "//div[@class='image-gallery-image']/img"
                     )))  # scrape images of species

            image = URLopener(
            )  # create a blank URLopener object to later download image(s)

            image_urls = []  # list of urls, scraped from image elements
            for image_element in image_elements:  # build image_urls
                image_url = image_element.get_attribute(
                    "src"
                )  # extract src attribtue from image element (the image url)
                image_urls.append(image_url)  # add extracted url to image_urls

            number_of_images = len(
                image_urls
            )  # calculate total number of images for an observation

            if number_of_images > 1:  # if there are multiple images for one observation page
                for index in range(number_of_images):  # loop over all images
                    indexed_species_name = species_name + '-0' + str(
                        index + 1
                    )  # create string that looks like "species_name-0x" where x is >= 1

                    if DOWNLOAD_IMAGES:
                        image.retrieve(
                            image_urls[index].replace("large", "original"),
                            "images/" + indexed_species_name + ".jpg"
                        )  # download the image, save it as "species_name-0x.jpeg"

                    if UPDATE_CSV:
                        species_row = DataFrame(
                        )  # initialize new row to hold "species-0x" data
                        species_row = excel_file.loc[
                            excel_file['Name'] ==
                            species_name]  # fetch row corresponding to this species
                        species_row.iloc[
                            0,
                            0] = indexed_species_name  # change Name field to "species-0x"

                        species_row.to_csv(
                            EXCEL_FILENAME,
                            header=None,
                            mode='a',
                            index=False,
                            sep=',',
                            encoding='utf-8'
                        )  # add the new row to original csv file

                if UPDATE_CSV:
                    excel_file = read_csv(
                        EXCEL_FILENAME
                    )  # refresh excel_file by reading in newly added rows "species-0x"
                    species_index = excel_file.index[
                        excel_file['Name'] == species_name][
                            0]  # get index of row to eliminate "species"
                    excel_file.drop(species_index).to_csv(
                        EXCEL_FILENAME,
                        index=False)  # remove original species row

            if DOWNLOAD_IMAGES:
                if number_of_images == 1:  # if there is only one image for one observation
                    image.retrieve(
                        str(image_urls[0]).replace("large", "original"),
                        "images/" + species_name + ".jpg"
                    )  # download the image, save it as "species_name.jpeg"

        except TimeoutException:  # in the case of a timeout
            # add species url to exception list
            with open("exception-url-list.txt", 'a') as url_file:
                url_file.write(str(response.url) + '\n')

            if UPDATE_CSV and DELETE_FAILURES:
                # delete a species row
                excel_file = read_csv(EXCEL_FILENAME)  # re-read the csv (read_csv is the pandas function, not a DataFrame method)
                species_index = excel_file.index[
                    excel_file['Name'] == species_name][
                        0]  # get index of row to eliminate "species"
                excel_file.drop(species_index).to_csv(
                    EXCEL_FILENAME, index=False)  # remove original species row

        yield
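For reference, a minimal Python 3 sketch (not part of the original spider) of the same per-observation download step, using urllib.request.urlretrieve instead of the deprecated URLopener; the image_urls, species_name and "images/" names mirror the example above, while the helper name is made up for illustration.

import os
from urllib.request import urlretrieve

def save_observation_images(image_urls, species_name, out_dir="images"):
    """Illustrative sketch: save one image as species_name.jpg, or several as species_name-0x.jpg."""
    os.makedirs(out_dir, exist_ok=True)
    for index, url in enumerate(image_urls, start=1):
        suffix = "" if len(image_urls) == 1 else "-0%d" % index
        target = os.path.join(out_dir, species_name + suffix + ".jpg")
        urlretrieve(url.replace("large", "original"), target)  # request the full-size image, as above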
Example #27
0
def DownloadImageFromAPODPage(url):
    # Copy all of the content from the provided web page
    webpage = urlopen(url).read()
    print "-"
    print "URL: " + url
    global bDoWork
    global bCleanExtras
    global bVerified
    global strAPODPicturesFolder
    strAPODFileName = ""
    # Here I retrieve and print to screen the titles and links with just Beautiful Soup
    soup = BeautifulSoup(webpage)
    for link in soup.findAll("a"):  # renamed from "url" to avoid shadowing the url parameter
        imgurl = link.get('href')
        if not imgurl:
            continue  # skip anchors without an href
        #print imgurl
        if not ('http://' in imgurl):
            imgurl = 'http://apod.nasa.gov/' + link.get('href')
            #sleep(lWaitTime)
            if imgurl[len(imgurl) - 3:len(imgurl)] == "jpg":
                print "IMG: " + imgurl
                strAPODFileName = imgurl.strip().split('/')[-1]
                print "strAPODFileName = " + strAPODFileName
                filename = strAPODPicturesFolder + strAPODFileName
                if bDoWork:
                    bDoWork = False
                    #filename = url.strip().split('/')[-1]
                    #print filename
                    if (not os.path.isfile(filename)) and ('apod.nasa.gov' in imgurl):
                        print "Downloading: " + filename
                        image = URLopener()
                        image.retrieve(imgurl,filename) 
                        sleep(lWaitTime)
                    elif (os.path.isfile(filename)):
                        print "Verified: " + filename
                        bVerified = True
                    if not bCleanExtras:
                        #if we are not cleaning extras we can break here
                        print "Not Seeking Extras"
                        break
                else:
                    if (os.path.isfile(filename)):
                        #this is the logic to clean extra downloads/duplicates                
                        print "Deleting " + filename
                        os.remove(filename)
    txtName = ""
    for bTag in soup.findAll("b"):
        if (txtName == ""):
            txtName = bTag.text
            txtName = txtName.strip()
            print txtName

    txtPName = ""
    for pTag in soup.findAll("p"):
        txtPName = pTag.text
        txtPName = txtPName.strip()
        if "Explanation:" in txtPName:
            iLoc = txtPName.find("Tomorrow's picture:")
            iLoc = iLoc - 1
            if iLoc > 0 and (strAPODFileName != ""):
                txtPName = txtPName[0:iLoc].strip().replace('\n', ' ').replace('  ', ' ').replace('  ', ' ').replace('  ', ' ').replace('Explanation: ', '')
                if not (os.path.isfile(strAPODDataFolder + strAPODFileName.replace('.jpg', '_Title.txt'))):
                    print "Title: " + txtName
                    print "FN: " + strAPODFileName.replace('.jpg', '_Title.txt')
                    f = open(strAPODDataFolder + strAPODFileName.replace('.jpg', '_Title.txt'), 'w')
                    f.write(txtName.encode('utf8'))
                    f.close()  # actually close the file (f.close without parentheses is a no-op)
                if not (os.path.isfile(strAPODDataFolder + strAPODFileName.replace('.jpg', '_Info.txt'))):
                    print "Info Paragraph: " + txtPName
                    print "FN: " + strAPODFileName.replace('.jpg', '_Info.txt')
                    f = open(strAPODDataFolder + strAPODFileName.replace('.jpg', '_Info.txt'), 'w')
                    f.write(txtPName.encode('utf8'))
                    f.close()
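For comparison, a minimal Python 3 sketch (not taken from the original example) of the same idea: find the first relative .jpg link on an APOD page with Beautiful Soup and download it with urllib.request; BASE_URL, the output directory and the function name are illustrative assumptions.

import os
from urllib.request import urlopen, urlretrieve
from bs4 import BeautifulSoup

BASE_URL = "http://apod.nasa.gov/"  # assumed base for relative links, as in the example above

def download_first_apod_jpg(page_url, out_dir="apod_pictures"):
    soup = BeautifulSoup(urlopen(page_url).read(), "html.parser")
    for anchor in soup.find_all("a"):
        href = anchor.get("href") or ""
        if href.endswith(".jpg") and not href.startswith("http"):
            img_url = BASE_URL + href
            filename = os.path.join(out_dir, img_url.split("/")[-1])
            if not os.path.isfile(filename):  # only download files we do not already have
                os.makedirs(out_dir, exist_ok=True)
                urlretrieve(img_url, filename)
            return filename
    return None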
Example #28
0
def retrieve_schemas_in_local_directory_zip(zip_schemas_uri, proxy_dict,
                                            schema_zipfilename,
                                            dest_directory):
    # crack open a connection via the proxy...
    try:
        schema_zip_file = URLopener(proxies=proxy_dict)
    except:
        print("urllib.URLopener error: %s" % (proxy_dict))
        return None

    # if the local schema directory does not exist, try to create it
    if not (os.path.exists(dest_directory)):
        try:
            os.makedirs(dest_directory)

        except OSError as err:  # os.makedirs raises OSError on failure
            print("Unable to create the schema files directory %s" %
                  dest_directory)
            print(err)
            return None

    # full pathname of the target
    zip_file_path = dest_directory + schema_zipfilename

    # retrieve the zip file from the url
    try:
        zip_url_path = zip_schemas_uri + schema_zipfilename
        schema_zip_file.retrieve(zip_url_path, zip_file_path)
    except:
        print("Error retrieving %s to %s with proxy=%s" %
              (zip_url_path, zip_file_path, proxy_dict))
        print(
            "...this could be due to %s not being available at %s or an invalid proxy setting."
            % (schema_zipfilename, zip_schemas_uri))
        return None

    # unzip it
    with zipfile.ZipFile(zip_file_path, "r") as zfile:
        try:
            zfile.extractall(dest_directory)
        except:
            print("Error unzipping %s" % zip_file_path)
            return None

    # locate the "ServiceRoot" (either json or xml) in the zip file... this will yeild
    # the subpath below "dest_directory" where the schema files were loaded into (and unloaded from)
    # the source zipfile.  This subpath is needed for the tool to have a full pathname to the schema files
    #
    unzip_metadata_subpath = None
    try:
        with zipfile.ZipFile(zip_file_path, "r") as zfile:
            for fname in zfile.namelist():
                # find the path within the zip file for the metadata files
                if "metadata" in fname and "ServiceRoot" in fname and fname.__str__(
                ).endswith(".xml") and ("MAC" not in fname):
                    str_idx = fname.find("ServiceRoot")
                    unzip_metadata_subpath = fname[0:str_idx]
                    break
    except:
        print(
            "Error processing the zip file %s while searching for a \'ServiceRoot\' metadata file"
            % zip_file_path)
        return None

    if unzip_metadata_subpath is None:
        print(
            "Error: %s does not appear to be a valid DMTF/SPMF metadata zip file..."
            % zip_file_path)
        print(
            "  Unable to locate the 'ServiceRoot' xml file below the 'metadata' folder in the zipfile %s"
            % zip_file_path)
        return None  # return None like the other error paths, rather than False

    # return the full path to the unzipped metadata files
    return dest_directory + unzip_metadata_subpath
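A minimal sketch (with assumed names, not the original helper) of the same retrieve-and-unzip flow using urllib.request, whose ProxyHandler accepts the same kind of proxy mapping as the proxies argument above.

import io
import os
import zipfile
from urllib.request import ProxyHandler, build_opener

def fetch_and_unzip(zip_url, dest_directory, proxy_dict=None):
    """Illustrative sketch: download a schema zip (optionally via a proxy) and extract it."""
    opener = build_opener(ProxyHandler(proxy_dict or {}))
    data = opener.open(zip_url).read()  # read the whole archive into memory
    os.makedirs(dest_directory, exist_ok=True)
    with zipfile.ZipFile(io.BytesIO(data)) as zfile:
        zfile.extractall(dest_directory)
    return dest_directory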
Example #29
0
	for poemLink in poemLinks: 
		try:
			pl = db_session.query(Poem).filter_by(poem=poemLink).one()
		except NoResultFound:
			p_obj = Poem(poem=poemLink)
			db_session.add(p_obj)
			print 'added poem %s' %poemLink
			db_session.commit()
			#download and save file to temp file
			#make sure its not massive
			d = urllib.urlopen(poemLink)
			if int(d.info()['Content-Length']) > 25000000: #arbitrary length, could be better
				continue
			filename = urlparse(poemLink).path.split('/')[-1]
			try:
				opener.retrieve(poemLink, temp + filename)
			except:
				continue
			#open and convert file to mono, 8000Hz
			poem = AudioSegment.from_mp3(temp + filename)
			poem = poem.set_channels(1)
			poem = poem.set_frame_rate(8000)
			#erase temp file
			os.remove(temp + filename)
			#cut the poem into lines based on silence
			lines = cutbySilence(poem)
			#number the lines
			line_num = 0
			for line in lines:
				if line.duration_seconds > 30:
					continue