def rebuild(self, db_path, log, skip_if_present=False):
    """
    For every float in the wmo file, download every data file related
    to that float that starts with 'MR'. Then create a XML file with
    the data read from the wmo file.

    Args:
        - *db_path*: the path of the directory set in the download
          program.
        - *log*: a logger object from the class Log to print
          informations on the standard output.
        - *skip_if_present*: a boolean value that sets whether files
          already saved in the local directory should not be
          downloaded again. By default is False.

    Returns:
        - *downloaded*: a list of all the downloaded filenames.
    """
    print("REBUILD")
    # Names of the files that will be downloaded or updated
    downloaded = []

    # Read the wmo file line by line (the reader excludes the first
    # line because it does not contain data)
    A = self.wmo_file_reader()

    # Delete, if present, the XML file with all the floats...
    xml_file = join(xml_path, self.__class__.__name__ + '.xml')
    if exists(xml_file):
        remove(xml_file)

    # ...and create a new one (in memory)
    root = xml_tree.Element("BioFloats")
    root.set('Updated', now_as_string())

    # Check if the directory for this harvester is present
    # in the database
    path = join(db_path, relative_path)
    ensure_dir(path, log, expected=False)

    # Download data for every float known by this harvester
    for l in range(len(A)):
        f = A[l]['wmo']
        floatname = A[l]['nome_fs'].replace(' ', '')
        if not self.is_a_lov_float(f, floatname):
            continue

        # Update the xml with the current status of the float
        f_in_xml = root.findall('wmo_' + str(f))
        if len(f_in_xml) == 0:
            f_node = xml_tree.SubElement(root, 'wmo_' + str(f))
        else:
            f_node = [fn for fn in f_in_xml
                      if fn.tag == 'wmo_' + str(f)][0]
        f_node.set('status', A[l]['status'])

        # Retrieve the list of the remote files for this float
        try:
            urlfilelist = http_url + floatname + "/liste_all"
            print(urlfilelist)
            response = urllib2.urlopen(urlfilelist)
        # BUGFIX: was a bare "except:", which also swallowed
        # KeyboardInterrupt and SystemExit
        except Exception:
            log.info('Cannot download file ' + urlfilelist +
                     '. This file will be skipped!')
            continue

        # The listing is one remote path per line; the trailing
        # newline produces an empty last element, hence the [:-1]
        remotepathlist = response.read().rsplit("\n")[:-1]
        filelist = [os.path.basename(fn) for fn in remotepathlist]

        if len(filelist) > 0:
            download_for_f = []
            # Copy all files in a local dir with the same name,
            # skipping the ones that we already have
            float_local_dir = join(path, f)
            print(float_local_dir)
            ensure_dir(float_local_dir, log, expected=False)
            for ff in filelist:
                url = http_url + floatname + "/" + ff
                # NOTE(review): the hard-coded True looks like it
                # should be the skip_if_present argument (which is
                # otherwise unused); kept as True to preserve the
                # current behaviour -- TODO confirm
                d = download_file(url, ff, float_local_dir, log,
                                  None, True)
                # If the file was downloaded without any problem,
                # add it to the list of downloaded files
                if d:
                    downloaded.append(ff)
                    download_for_f.append(ff)
            if len(download_for_f) == 0:
                log.info('No updates found for float ' + str(f))
            else:
                # BUGFIX: this branch logged "No updates found"
                # even when files had just been downloaded
                log.info('Downloaded ' + str(len(download_for_f)) +
                         ' files for float ' + str(f))
        else:
            log.info('No updates found for float ' + str(f))

    # Save the XML file (pretty-printed, dropping blank lines)
    xml_as_string = xml_tree.tostring(root)
    xml_rebuild = parseString(xml_as_string)
    pretty_xml = xml_rebuild.toprettyxml(indent=' ')
    pretty_xml_lines = pretty_xml.split('\n')
    pretty_xml = "\n".join([ln for ln in pretty_xml_lines
                            if ln.strip()])
    ensure_dir(xml_path, log, expected=False)
    with open(xml_file, 'w') as xml_f:
        xml_f.write(pretty_xml)

    # Return the list of downloaded files
    return downloaded
def _download_float_files(self, path, f, floatname, log, downloaded):
    """
    Download every remote data file of float *f* (remote name
    *floatname*) into its local directory under *path*, appending the
    names of the newly downloaded files to the *downloaded* list.

    Files that are already present locally are skipped by
    download_file. Logs a summary line for the float either way.
    """
    # Retrieve the list of the remote files for this float
    try:
        urlfilelist = http_url + floatname + "/liste_all"
        print(urlfilelist)
        response = urllib2.urlopen(urlfilelist)
    # BUGFIX: was a bare "except:", which also swallowed
    # KeyboardInterrupt and SystemExit
    except Exception:
        log.info('No directory associated with file ' + str(f) +
                 '. This file will be skipped!')
        return

    # One remote path per line; the trailing newline produces an
    # empty last element, hence the [:-1]
    remotepathlist = response.read().rsplit("\n")[:-1]
    filelist = [os.path.basename(fn) for fn in remotepathlist]

    if len(filelist) > 0:
        download_for_f = []
        # Copy all files in a local dir with the same name,
        # skipping the ones that we already have
        float_local_dir = join(path, f)
        ensure_dir(float_local_dir, log, expected=False)
        for ff in filelist:
            url = http_url + floatname + "/" + ff
            d = download_file(url, ff, float_local_dir, log,
                              None, True)
            # If the file was downloaded without any problem,
            # add it to the list of downloaded files
            if d:
                downloaded.append(ff)
                # BUGFIX: the dead-floats loop never appended here,
                # so its per-float summary was always "No updates"
                download_for_f.append(ff)
        if len(download_for_f) == 0:
            log.info('No updates found for float ' + str(f))
        else:
            # BUGFIX: this branch logged "No updates found" even
            # when files had just been downloaded
            log.info('Downloaded ' + str(len(download_for_f)) +
                     ' files for float ' + str(f))
    else:
        log.info('No updates found for float ' + str(f))

def harvest(self, db_path, log):
    """
    For every float in the file wmo, check the status in the wmo_file
    and in the xml one. If in at least one file the float is reported
    as active, then check the last file downloaded for that wmo and
    download every file on the server that is more recent than the one
    already downloaded. Then update the xml file with the status
    reported in the wmo file.

    Args:
        - *db_path*: the path of the directory set in the download
          program.
        - *log*: a logger object from the class Log to print
          informations on the standard output.

    Returns:
        - *downloaded*: a list of all the downloaded filenames.
    """
    print("HARVEST")
    # Names of the files that will be downloaded or updated
    downloaded = []

    # Read the wmo file and split the floats by status
    A = self.wmo_file_reader()
    lines_active_floats = np.where(A['status'] == 'A')[0]
    lines_dead__floats = np.where(A['status'] == 'D')[0]

    # Now we need the xml file that keeps what we did on the
    # last updates; if it is missing or unreadable, fall back to a
    # rebuild that skips files already on disk
    xml_file = join(xml_path, self.__class__.__name__ + '.xml')
    try:
        tree = xml_tree.parse(xml_file)
    # BUGFIX: was a bare "except:"
    except Exception:
        log.info('XML file not found or not readable. '
                 'This script will update every file '
                 'from the remote archive. This is '
                 'almost the same than run in reset '
                 'mode, but the files that exist will '
                 'not be downloaded again. Moreover, '
                 'the XML file will be rewritten.')
        return self.rebuild(db_path, log, skip_if_present=True)
    root = tree.getroot()

    # Check if the directory for this harvester is present
    # in the database
    path = join(db_path, relative_path)
    ensure_dir(path, log, expected=True)

    # Download data for every active float
    for l in lines_active_floats:
        f = A[l]['wmo']
        floatname = A[l]['nome_fs'].replace(' ', '')
        if not self.is_a_lov_float(f, floatname):
            continue

        # Update the xml with the current status of the float
        wmo_in_xml = 'wmo_' + str(f)
        f_in_xml = root.findall(wmo_in_xml)
        if len(f_in_xml) == 0:
            f_node = xml_tree.SubElement(root, wmo_in_xml)
        else:
            f_node = [fn for fn in f_in_xml
                      if fn.tag == wmo_in_xml][0]
        f_node.set('status', 'A')

        self._download_float_files(path, f, floatname, log,
                                   downloaded)

    print("DIED FLOATS")
    for l in lines_dead__floats:
        f = A[l]['wmo']
        floatname = A[l]['nome_fs'].replace(' ', '')
        if not self.is_a_lov_float(f, floatname):
            continue

        # A dead float is updated only if it is new to the XML
        # archive or was not already recorded as dead
        to_be_updated = False
        f_in_xml = root.findall('wmo_' + str(f))
        if len(f_in_xml) == 0:
            # If this float is new, then add it to the archive
            # and it will be updated
            to_be_updated = True
            f_node = xml_tree.SubElement(root, 'wmo_' + str(f))
        else:
            f_node = [fn for fn in f_in_xml
                      if fn.tag == 'wmo_' + str(f)][0]
            # If I already know this float, but the last time it
            # was not dead, update it
            if f_node.get('status') != 'D':
                to_be_updated = True
        f_node.set('status', 'D')

        if not to_be_updated:
            log.debug("Wmo " + str(f) +
                      " is dead and will not be updated")
            continue
        log.debug("Wmo " + str(f) + " now is dead but was active on "
                  "the last run and will be updated anyway")

        self._download_float_files(path, f, floatname, log,
                                   downloaded)

    # Save the XML file (pretty-printed, dropping blank lines)
    root.set('Updated', now_as_string())
    xml_as_string = xml_tree.tostring(root)
    xml_rebuild = parseString(xml_as_string)
    pretty_xml = xml_rebuild.toprettyxml(indent=' ')
    pretty_xml_lines = pretty_xml.split('\n')
    pretty_xml = "\n".join([ln for ln in pretty_xml_lines
                            if ln.strip()])
    with open(xml_file, 'w') as xml_f:
        xml_f.write(pretty_xml)

    # Return the list of downloaded files
    return downloaded