def collect_year(year, data_source='NCDC'):
    """ Collect the GSOD data file for all locations for the specified year.

    Look locally for the tar file first. If it is not there, and its gzip
    version is not either, use the ftp connection to retrieve it from the
    data source.

    Parameters
    ----------
    year : int
        Year whose archive should be collected.
    data_source : str
        Remote source identifier; only 'NCDC' is handled here.

    Returns
    -------
    The pandas object built from the year's data folder, or None when the
    year's data is too large to load into memory.
    """
    filename = info2filepath(year)
    local_folderpath = os.path.join("Data", "GSOD", "gsod_" + str(year))
    local_filepath = os.path.join(local_folderpath, filename)
    if not os.path.isdir(local_folderpath):
        # Folder not already present.
        # FIX: use makedirs so the parent "Data/GSOD" path is created too;
        # os.mkdir raises OSError when intermediate directories are missing.
        os.makedirs(local_folderpath)
    if count_op_files(local_folderpath) < 10:
        # Probably not all the data files are present.
        if not os.path.exists(local_filepath):
            # Tar file not present either: download it!
            if data_source == 'NCDC':
                remote_location = str(year)
            print("Retrieving archive %s... This may take several minutes."
                  % local_filepath)
            remote_target = os.path.join(remote_location, filename)
            retrieve_file(data_source, remote_target, local_filepath)
        untar(local_filepath)
    try:
        panda = datafolder2pandas(local_folderpath)
    except MemoryError:
        # For years where there is a large amount of data, it is not possible
        # to load everything in memory.
        # FIXME: load the data in a memory mapped/pytable stored pandas in
        # this case? Clarify because the memory error is thrown by mmap. It
        # may be doing this already, but be running into mmap limitations?
        # FIX: the original message had a %s placeholder but never applied
        # the `% year` argument, so a literal "%s" was shown to the user.
        warnings.warn("The year %s contains too much data to be loaded into a "
                      "single object in memory" % year)
        panda = None
    return panda
def collect_year_at_loc(year, location_WMO, location_WBAN, data_source = 'NCDC', internet_connected = True): """ Collect the data GSOD data file for specified location and specified year. Look locally for the file first. If it is not there, and its gzip version is not either, untar the file if it is present and has not been untared, or use the ftp connection to retrieve it from data source. """ filename = info2filepath(year, location_WMO, location_WBAN) folder_location = os.path.join("Data", "GSOD", "gsod_"+str(year)) filepath = os.path.join(folder_location, filename) print "Attempting to collect %s..." % filepath filepath_found = True if not os.path.exists(filepath): zipped_filepath = filepath+".gz" if os.path.exists(zipped_filepath): unzip(zipped_filepath) elif os.path.exists(os.path.join(folder_location, "gsod_"+str(year)+".tar")): # Possible not to rely on outside servers: untar the file if there # are no op.gz or op files. If not it means that the file is # missing. there_are_op_files = False for filename in os.listdir(folder_location): if os.path.splitext(filename)[1] in [".op", ".op.gz"]: there_are_op_files = True break if not there_are_op_files: untar(os.path.join(folder_location, "gsod_"+str(year)+".tar")) if os.path.isfile(zipped_filepath): unzip(zipped_filepath) else: warnings.warn("File %s is missing from the dataset: skipping " "this location." % zipped_filepath) filepath_found = False elif internet_connected: target_folder = "Data/GSOD/gsod_"+str(year) if not os.path.exists(target_folder): print "Creating locally the folder %s." % target_folder os.mkdir(target_folder) # Download the file from NCDC if data_source == 'NCDC': remote_location = str(year) remote_target = os.path.join(remote_location, filename+".gz") retrieve_file(data_source, remote_target, zipped_filepath) if os.path.isfile(zipped_filepath): unzip(zipped_filepath) else: filepath_found = False else: filepath_found = False if filepath_found: return datafile2pandas(filepath)
def collect_year_at_loc(year, location_WMO, location_WBAN, data_source='NCDC',
                        internet_connected=True):
    """ Collect the data GSOD data file for specified location and specified
    year. Look locally for the file first. If it is not there, and its gzip
    version is not either, untar the file if it is present and has not been
    untared, or use the ftp connection to retrieve it from data source.
    """
    # NOTE(review): this module contains several definitions named
    # collect_year_at_loc; in a single file the last one wins — confirm
    # which revision is intended to be live.
    filename = info2filepath(year, location_WMO, location_WBAN)
    # The per-year subfolder component is commented out: data files are
    # looked up directly under Data/GSOD in this revision.
    folder_location = os.path.join("Data", "GSOD")  #, "gsod_"+str(year))
    filepath = os.path.join(folder_location, filename)
    log.info("Attempting to collect %s..." % filepath)
    filepath_found = False
    if not os.path.exists(filepath):
        zipped_filepath = filepath + ".gz"
        tar_filepath = os.path.join(folder_location,
                                    "gsod_" + str(year) + ".tar")
        if os.path.exists(zipped_filepath):
            # Read from the zip
            # NOTE(review): `filepath` is rebound from a path string to an
            # open file object here; downstream consumers presumably accept
            # either a path or a file-like object — confirm with the caller.
            filepath = gzip.open(zipped_filepath)
            filepath_found = True
        elif os.path.exists(tar_filepath):
            # Possible not to rely on outside servers: load the file from the
            # tarfile
            archive = tarfile.TarFile(tar_filepath)
            try:
                gzf = archive.extractfile(filename + '.gz')
                filepath = gzip.GzipFile(fileobj=gzf)
                filepath_found = True
            except KeyError, e:
                # Some archives have a './' at the beginning
                try:
                    gzf = archive.extractfile(
                        os.path.join('.', filename + '.gz'))
                    filepath = gzip.GzipFile(fileobj=gzf)
                    filepath_found = True
                except KeyError, e:
                    log.warn("File %s is missing from the dataset: skipping "
                             "this location." % zipped_filepath)
    # NOTE(review): no return statement is visible in this definition; the
    # sibling revision ends with `return datafile2pandas(filepath)`. This
    # body may be truncated — confirm the remainder against the full file.
def collect_year_at_loc(year, location_WMO, location_WBAN, data_source='NCDC',
                        internet_connected=True):
    """ Collect the data GSOD data file for specified location and specified
    year. Look locally for the file first. If it is not there, and its gzip
    version is not either, untar the file if it is present and has not been
    untared, or use the ftp connection to retrieve it from data source.
    """
    # NOTE(review): this is a near-identical duplicate (whitespace only) of
    # another collect_year_at_loc definition in this module; in a single file
    # the last definition wins — one of the copies is likely dead code.
    filename = info2filepath(year, location_WMO, location_WBAN)
    # The per-year subfolder component is commented out: data files are
    # looked up directly under Data/GSOD in this revision.
    folder_location = os.path.join("Data", "GSOD") #, "gsod_"+str(year))
    filepath = os.path.join(folder_location, filename)
    log.info("Attempting to collect %s..." % filepath)
    filepath_found = False
    if not os.path.exists(filepath):
        zipped_filepath = filepath+".gz"
        tar_filepath = os.path.join(folder_location,"gsod_"+str(year)+".tar")
        if os.path.exists(zipped_filepath):
            # Read from the zip
            # NOTE(review): `filepath` is rebound from a path string to an
            # open file object here — confirm downstream consumers accept a
            # file-like object as well as a path.
            filepath = gzip.open(zipped_filepath)
            filepath_found = True
        elif os.path.exists(tar_filepath):
            # Possible not to rely on outside servers: load the file from the
            # tarfile
            archive = tarfile.TarFile(tar_filepath)
            try:
                gzf = archive.extractfile(filename+'.gz')
                filepath = gzip.GzipFile(fileobj = gzf)
                filepath_found = True
            except KeyError, e:
                # Some archives have a './' at the beginning
                try:
                    gzf = archive.extractfile(os.path.join('.',
                                                           filename+'.gz'))
                    filepath = gzip.GzipFile(fileobj = gzf)
                    filepath_found = True
                except KeyError, e:
                    log.warn("File %s is missing from the dataset: skipping "
                             "this location." % zipped_filepath)
    # NOTE(review): no return statement is visible in this definition; the
    # sibling revision ends with `return datafile2pandas(filepath)`. This
    # body may be truncated — confirm the remainder against the full file.