def download(self, engine=None, debug=False):
    """Download the BAAD archive and load its two CSV files.

    Builds the ``data`` table from baad_data.csv and the ``methods``
    table from baad_methods.csv, treating "NA" as a null marker.
    """
    Script.download(self, engine, debug)
    engine = self.engine

    # Files are nested inside a "baad_data" folder within the archive.
    # Additional relevant files can be appended to this list.
    archived_files = ["baad_data/baad_data.csv",
                      "baad_data/baad_methods.csv"]
    engine.download_files_from_archive(self.urls["BAAD"], archived_files)

    # Load each extracted CSV into its own table.
    for table_name, csv_name in (("data", "baad_data.csv"),
                                 ("methods", "baad_methods.csv")):
        cleanup = Cleanup(correct_invalid_value, nulls=['NA'])
        engine.auto_create_table(Table(table_name, cleanup=cleanup),
                                 filename=csv_name)
        engine.insert_data_from_file(engine.format_filename(csv_name))
def download(self, engine=None, debug=False):
    """Download and install the Mount St. Helens vegetation tables.

    Loads structure_plot_year, plots, species, and the cross-tab
    species_plot_year tables.
    """
    Script.download(self, engine, debug)

    # structure_plot_year table
    self.engine.auto_create_table(Table("structure_plot_year"),
                                  url=self.urls["structure_plot_year"])
    self.engine.insert_data_from_url(self.urls["structure_plot_year"])

    # plots table (the original comment wrongly said structure_plot_year)
    self.engine.auto_create_table(Table("plots"), url=self.urls["plots"])
    self.engine.insert_data_from_url(self.urls["plots"])

    # species table: copy the raw download into a UTF-8 text file before
    # loading it.
    self.engine.download_file(self.urls["species"],
                              "original_MSH_SPECIES_DESCRIPTORS.csv")
    data_path = self.engine.format_filename("MSH_SPECIES_DESCRIPTORS.csv")
    old_data = os.path.normpath(
        self.engine.find_file("original_MSH_SPECIES_DESCRIPTORS.csv"))
    # BUG FIX: the previous code encoded each line to bytes and wrote the
    # bytes to a file opened in text mode, which raises TypeError on
    # Python 3 (it also used the deprecated 'rU' mode and redundantly
    # closed files already managed by the ``with`` block).  Copy the file
    # as text, writing UTF-8 output.
    with open(old_data, 'r') as infile, \
            open(data_path, 'w', encoding='utf-8') as new_data:
        for line in infile:
            new_data.write(line)
    self.engine.auto_create_table(Table("species"),
                                  filename="MSH_SPECIES_DESCRIPTORS.csv")
    self.engine.insert_data_from_file(data_path)

    # species_plot_year table: one cross-tab column per species code.
    table = Table("species_plot_year")
    table.delimiter = ','
    table.columns = [
        ('record_id', ('pk-auto',)),
        ('plot_id_year', ('char',)),
        ('plot_name', ('char',)),
        ('plot_number', ('int',)),
        ('year', ('int',)),
        ('species', ('ct_column',)),
        ('count', ('ct-double',)),
    ]
    table.ct_column = 'species'
    table.ct_names = [
        'Abilas', 'Abipro', 'Achmil', 'Achocc', 'Agoaur', 'Agrexa',
        'Agrpal', 'Agrsca', 'Alnvir', 'Anamar', 'Antmic', 'Antros',
        'Aqifor', 'Arcnev', 'Arnlat', 'Astled', 'Athdis', 'Blespi',
        'Brocar', 'Brosit', 'Carmer', 'Carmic', 'Carpac', 'Carpay',
        'Carpha', 'Carros', 'Carspe', 'Casmin', 'Chaang', 'Cirarv',
        'Cisumb', 'Crycas', 'Danint', 'Descae', 'Elyely', 'Epiana',
        'Eriova', 'Eripyr', 'Fesocc', 'Fravir', 'Gencal', 'Hiealb',
        'Hiegra', 'Hyprad', 'Junmer', 'Junpar', 'Juncom', 'Leppun',
        'Lommar', 'Luepec', 'Luihyp', 'Luplat', 'Luplep', 'Luzpar',
        'Maiste', 'Pencar', 'Pencon', 'Penser', 'Phahas', 'Phlalp',
        'Phldif', 'Phyemp', 'Pincon', 'Poasec', 'Poldav', 'Polmin',
        'Pollon', 'Poljun', 'Popbal', 'Potarg', 'Psemen', 'Raccan',
        'Rumace', 'Salsit', 'Saxfer', 'Senspp', 'Sibpro', 'Sorsit',
        'Spiden', 'Trispi', 'Tsumer', 'Vacmem', 'Vervir', 'Vioadu',
        'Xerten']
    self.engine.table = table
    self.engine.create_table()
    self.engine.insert_data_from_url(self.urls["species_plot_year"])
def download(self, engine=None, debug=False):
    """Fetch the aquatic animal excretion archive and load its data table."""
    Script.download(self, engine, debug)
    engine = self.engine
    filename = 'Aquatic_animal_excretion_data.csv'
    tablename = 'aquatic_animals'

    # (column name, column type) pairs for the data file, in file order.
    column_specs = [
        ("index", "pk-int"), ("sourcenumber", "int"),
        ("sourcename", "char"), ("speciesname", "char"),
        ("speciescode", "char"), ("invert/vert", "char"),
        ("phylum", "char"), ("class", "char"),
        ("order", "char"), ("family", "char"),
        ("trophicgild", "char"), ("drymass", "double"),
        ("logdrymass", "double"), ("ecosystemtype", "char"),
        ("energysource", "char"), ("habitat", "char"),
        ("residentecosystem", "char"), ("temperature", "double"),
        ("nexcretionrate", "double"), ("pexcretionrate", "double"),
        ("lognexcretionrate", "double"), ("logpexcretionrate", "double"),
        ("incubationtime", "double"), ("nform", "char"),
        ("pform", "char"), ("bodyc", "double"),
        ("bodyn", "double"), ("bodyp", "double"),
        ("bodyc:n", "double"), ("bodyc:p", "double"),
        ("bodyn:p", "double"), ("bodydatasource", "char"),
        ("datasource", "char"), ("dataproviders", "char"),
    ]
    table = Table(str(tablename), delimiter=',')
    table.columns = [(name, (kind,)) for name, kind in column_specs]
    engine.table = table
    # Only hit the network when the extracted file is not already cached.
    if not os.path.isfile(engine.format_filename(filename)):
        engine.download_files_from_archive(self.urls[tablename],
                                           [filename], filetype="zip")
    engine.create_table()
    engine.insert_data_from_file(engine.format_filename(str(filename)))
def download(self, engine=None, debug=False):
    """Download the eBird observation dataset and load the main table.

    Returns the engine used for loading.
    """
    data_file_name = "eBird_Observation_Dataset_2013.csv"
    Script.download(self, engine, debug)
    # BUG FIX: the previous version used the ``engine`` argument directly;
    # that argument is typically None.  Use the engine that
    # Script.download attached to self, as the sibling scripts do.
    engine = self.engine
    engine.download_files_from_archive(self.urls["main"],
                                       [data_file_name],
                                       filetype='gz')
    table = Table("main", delimiter=",")
    # (column name, column type) pairs, in file order.
    column_specs = [
        ("BASISOFRECORD", "char"), ("INSTITUTIONCODE", "char"),
        ("COLLECTIONCODE", "char"), ("CATALOGNUMBER", "char"),
        ("OCCURRENCEID", "char"), ("RECORDEDBY", "char"),
        ("YEAR", "int"), ("MONTH", "int"), ("DAY", "int"),
        ("COUNTRY", "char"), ("STATEPROVINCE", "char"),
        ("COUNTY", "char"), ("DECIMALLATITUDE", "double"),
        ("DECIMALLONGITUDE", "double"), ("LOCALITY", "char"),
        ("KINGDOM", "char"), ("PHYLUM", "char"), ("CLASS", "char"),
        ("SPORDER", "char"), ("FAMILY", "char"), ("GENUS", "char"),
        ("SPECIFICEPITHET", "char"), ("SCIENTIFICNAME", "char"),
        ("VERNACULARNAME", "char"), ("INDIVIDUALCOUNT", "int"),
    ]
    table.columns = [(name, (kind,)) for name, kind in column_specs]
    engine.table = table
    engine.create_table()
    engine.insert_data_from_file(engine.format_filename(data_file_name))
    return engine
def __init__(self, **kwargs):
    """Define metadata for the Gulf of Maine intertidal dataset."""
    Script.__init__(self, **kwargs)
    self.name = "intertidal-abund-me"
    self.title = "Gulf of Maine intertidal density/cover (Petraitis et al. 2008)"
    self.citation = ("Peter S. Petraitis, Harrison Liu, and "
                     "Erika C. Rhile. 2008. Densities and cover "
                     "data for intertidal organisms in the Gulf of "
                     "Maine, USA, from 2003 to 2007. Ecology 89:588.")
    self.ref = "https://figshare.com/collections/DENSITIES_AND_COVER_DATA_FOR_INTERTIDAL_ORGANISMS_IN_THE_GULF_OF_MAINE_USA_FROM_2003_TO_2007/3300200"
    self.description = ("The data on densities and percent cover in the "
                        "60 experimental plots from 2003 to 2007 and to "
                        "update data from 1996 to 2002 that are already "
                        "published in Ecological Archives."
                        "Includes densities of mussels, "
                        "herbivorous limpet, herbivorous snails, "
                        "predatory snail, barnacle , fucoid algae and "
                        "percent cover by mussels, barnacles, fucoids, "
                        "and other sessile organisms.")
    self.retriever_minimum_version = '2.0.dev'
    self.version = '1.5.3'
    self.urls = {"main": "https://ndownloader.figshare.com/files/5600831"}
    self.cleanup_func_table = Cleanup(correct_invalid_value,
                                      missing_values=[-999.9])
    # Retriever <= 2.0.0 uses the legacy name/shortname fields and the
    # ``nulls`` keyword for Cleanup.
    if parse_version(VERSION) <= parse_version("2.0.0"):
        self.shortname = self.name
        self.name = self.title
        self.cleanup_func_table = Cleanup(correct_invalid_value,
                                          nulls=[-999.9])
    self.tables = {"main": Table("main", cleanup=self.cleanup_func_table)}
def download(self, engine=None, debug=False):
    """Download each source file, strip its metadata header, and load it."""
    Script.download(self, engine, debug)
    for key in self.urls:
        remote_name = self.urls[key].rpartition('/')[-1]
        self.engine.download_file(self.urls[key], remote_name)
        new_file_path = self.engine.format_filename("new" + key)
        old_data = open_fr(self.engine.find_file(remote_name))
        new_data = open_fw(new_file_path)
        with old_data as file_block:
            # Lines before the first real record are metadata: they
            # either contain no ";" at all or are ";;;;" filler rows.
            # Once a data row is seen, everything after it is kept.
            data = False
            for lines in file_block.readlines():
                if data or (";" in lines and ";;;;" not in lines):
                    data = True
                    new_data.write(lines)
        new_data.close()
        self.engine.auto_create_table(
            Table(key, cleanup=self.cleanup_func_table),
            filename=str("new" + key))
        self.engine.insert_data_from_file(new_file_path)
def download(self, engine=None, debug=False):
    """Download the trade product files and load them with "|" delimiters."""
    Script.download(self, engine, debug)
    engine = self.engine
    for key in self.urls:
        txt_name = "trade_prdct_{}.txt".format(key)
        csv_name = "trade_prdct_{}.csv".format(key)
        engine.download_file(self.urls[key], txt_name)
        source_path = self.engine.format_filename(txt_name)
        target_path = self.engine.format_filename(csv_name)

        # Re-write the file so it uses a single "|" delimiter throughout.
        source = open_fr(source_path)
        target = open_fw(target_path)
        # The header row is comma separated; convert "," to "|".
        header = source.readline().strip().replace(",", "|")
        target.write(header + "\n")
        for row in source:
            # Data rows carry a spurious leading "|"; drop it.
            target.write(row.strip("|"))
        target.close()
        source.close()

        engine.auto_create_table(Table(key, delimiter="|"),
                                 filename=csv_name)
        engine.insert_data_from_file(target_path)
def __init__(self):
    """Register the Clark 2006 La Selva tree dataset metadata."""
    dataset_kwargs = {
        'name': "Tree growth, mortality, physical condition - Clark, 2006",
        'shortname': "Clark2006",
        'tags': ['Taxon > Plants'],
        'urls': {'trees': 'http://esapubs.org/archive/ecol/E087/132/LS_trees_1983_2000.txt'},
        'tables': {'trees': Table('trees',
                                  cleanup=Cleanup(correct_invalid_value,
                                                  nulls=[-999]))},
        'description': "David B. Clark and Deborah A. Clark. 2006. Tree growth, mortality, physical condition, and microsite in an old-growth lowland tropical rain forest. Ecology 87:2132.",
    }
    Script.__init__(self, **dataset_kwargs)
def __init__(self, **kwargs):
    """Define metadata for the CRC avian body mass compilation."""
    Script.__init__(self, **kwargs)
    self.name = "CRC Avian Body Masses"
    self.shortname = "AvianBodyMass"
    # Data is not redistributable, so no download URL is provided.
    self.public = False
    self.urls = {"mass": ""}
    self.tables = {"mass": Table("mass", delimiter="~")}
    self.ref = "http://www.crcpress.com/ecommerce_product/product_detail.jsf?isbn=1420064444"
    self.tags = ["Taxon > Birds", "Data Type > Compilation"]
def __init__(self):
    """Register the Clark 2006 La Selva tree dataset metadata."""
    dataset_kwargs = {
        'name': "Tree growth, mortality, physical condition - Clark, 2006",
        'shortname': "Clark2006",
        'tags': ['Taxon > Plants'],
        'urls': {'trees': 'http://esapubs.org/archive/ecol/E087/132/LS_trees_1983_2000.txt'},
        'tables': {'trees': Table('trees',
                                  cleanup=Cleanup(correct_invalid_value,
                                                  nulls=[-999]))},
        'description': "The data set helps to examine the post-establishment ecology of 10 species of tropical wet forest trees selected to span a range of predicted life history patterns at the La Selva Biological Station in Costa Rica.",
        'ref': "http://esapubs.org/archive/ecol/E087/132/",
        'citation': "David B. Clark and Deborah A. Clark. 2006. Tree growth, mortality, physical condition, and microsite in an old-growth lowland tropical rain forest. Ecology 87:2132.",
    }
    Script.__init__(self, **dataset_kwargs)
def download(self, engine=None, debug=False):
    """Load USA-NPN plant and animal phenology observations, 2009-2012.

    Returns the engine used for loading.
    """
    Script.download(self, engine, debug)
    engine = self.engine
    for tax in ('Plant', 'Animal'):
        table = Table(tax.lower() + 's', delimiter=',', header_rows=3,
                      pk='record_id', contains_pk=True)
        table.columns = [
            ("record_id", ("pk-int",)),
            ("station_id", ("int",)),
            ("obs_date", ("char",)),
            ("ind_id", ("int",)),
            ("sci_name", ("char",)),
            ("com_name", ("char",)),
            ("kingdom", ("char",)),
            ("pheno_cat", ("char",)),
            ("pheno_name", ("char",)),
            ("pheno_status", ("char",)),
            ("lat", ("double",)),
            ("lon", ("double",)),
            ("elevation", ("int",)),
            ("network_name", ("char",)),
        ]
        engine.table = table
        engine.create_table()
        base_url = 'http://www.usanpn.org/getObs/observations/'
        for year in range(2009, 2013):
            # There is no animal data file for 2009.
            if tax == 'Animal' and year == 2009:
                continue
            url = base_url + 'get%s%sDataNoDefinitions' % (year, tax)
            filename = '%s_%s.csv' % (tax, year)
            engine.download_file(url, filename)
            engine.insert_data_from_file(engine.find_file(filename))
    return engine
def download(self, engine=None, debug=False):
    """Download the excretion archive (if needed) and load the data table."""
    Script.download(self, engine, debug)
    engine = self.engine
    data_file = 'Aquatic_animal_excretion_data.csv'
    data_table = 'aquatic_animals'

    # Columns of the data file as (name, type) pairs, in file order.
    field_layout = [
        ("index", "pk-int"), ("sourcenumber", "int"),
        ("sourcename", "char"), ("speciesname", "char"),
        ("speciescode", "char"), ("invert/vert", "char"),
        ("phylum", "char"), ("class", "char"),
        ("order", "char"), ("family", "char"),
        ("trophicgild", "char"), ("drymass", "double"),
        ("logdrymass", "double"), ("ecosystemtype", "char"),
        ("energysource", "char"), ("habitat", "char"),
        ("residentecosystem", "char"), ("temperature", "double"),
        ("nexcretionrate", "double"), ("pexcretionrate", "double"),
        ("lognexcretionrate", "double"), ("logpexcretionrate", "double"),
        ("incubationtime", "double"), ("nform", "char"),
        ("pform", "char"), ("bodyc", "double"),
        ("bodyn", "double"), ("bodyp", "double"),
        ("bodyc:n", "double"), ("bodyc:p", "double"),
        ("bodyn:p", "double"), ("bodydatasource", "char"),
        ("datasource", "char"), ("dataproviders", "char"),
    ]
    table = Table(str(data_table), delimiter=',')
    table.columns = [(field, (field_type,))
                     for field, field_type in field_layout]
    engine.table = table
    # Skip the download when the extracted file already exists locally.
    if not os.path.isfile(engine.format_filename(data_file)):
        engine.download_files_from_archive(self.urls[data_table],
                                           [data_file], filetype="zip")
    engine.create_table()
    engine.insert_data_from_file(engine.format_filename(str(data_file)))
def download(self, engine=None, debug=False):
    """Download the UPSP archive and load the species and demo-data tables."""
    Script.download(self, engine, debug)
    engine = self.engine
    engine.download_files_from_archive(
        self.urls["data"],
        ["UPSP_Demo_data.txt", "UPSP_Species_list2.txt"],
        archive_type="zip")

    # Each extracted text file becomes one table.
    for table_name, file_name in (("sp_list", "UPSP_Species_list2.txt"),
                                  ("ind_loc_girth", "UPSP_Demo_data.txt")):
        engine.auto_create_table(
            Table(table_name, cleanup=self.cleanup_func_table),
            filename=file_name)
        engine.insert_data_from_file(engine.format_filename(file_name))
def __init__(self, **kwargs):
    """Define metadata for the CRC avian body mass compilation."""
    Script.__init__(self, **kwargs)
    self.name = "CRC Avian Body Masses"
    self.shortname = "AvianBodyMass"
    # Data is not redistributable, so no download URL is provided.
    self.public = False
    self.urls = {"mass": ""}
    self.tables = {"mass": Table("mass", delimiter="~")}
    self.ref = "http://www.crcnetbase.com/isbn/9781420064452"
    self.citation = "Robert B. Payne, CRC Handbook of Avian Body Masses. Second Edition. The Wilson Journal of Ornithology Sep 2009 : Vol. 121, Issue 3, pg(s) 661-662 doi: 10.1676/1559-4491-121.3.661."
    self.description = "Body masses of birds of the world."
    self.tags = ["Taxon > Birds", "Data Type > Compilation"]
def download(self, engine=None, debug=False):
    """Fetch the PanTHERIA archive and load the species table."""
    Script.download(self, engine, debug)
    engine = self.engine
    data_file = "PanTHERIA_1-0_WR05_Aug2008.txt"
    engine.download_files_from_archive(self.urls["data"], [data_file], "zip")

    # Create and populate table species
    species_table = Table('species', cleanup=self.cleanup_func_table)
    engine.auto_create_table(species_table, filename=data_file)
    engine.insert_data_from_file(engine.format_filename(data_file))
def __init__(self):
    """Register the la-selva-trees dataset (figshare version)."""
    dataset_kwargs = {
        'name': "Tree growth, mortality, physical condition - Clark, 2006",
        'shortname': "la-selva-trees",
        'tags': ['plants', 'time-series'],
        'urls': {'trees': 'https://ndownloader.figshare.com/files/5597693'},
        'tables': {'trees': Table('trees',
                                  cleanup=Cleanup(correct_invalid_value,
                                                  nulls=[-999]))},
        'description': "The data set helps to examine the post-establishment ecology of 10 species of tropical wet forest trees selected to span a range of predicted life history patterns at the La Selva Biological Station in Costa Rica.",
        'ref': "https://doi.org/10.6084/m9.figshare.c.3299324.v1",
        'retriever_minimum_version': "2.0.dev",
        'version': '1.3.0',
        'citation': "David B. Clark and Deborah A. Clark. 2006. Tree growth, mortality, physical condition, and microsite in an old-growth lowland tropical rain forest. Ecology 87:2132.",
    }
    Script.__init__(self, **dataset_kwargs)
def download(self, engine=None, debug=False):
    """Download the BAAD archive and load the data and methods tables.

    Retriever >= 2.1.dev keeps the nested ``baad_data/`` directory when
    extracting the archive, so filenames handed to the engine must keep
    that prefix; older versions flatten the paths.
    """
    Script.download(self, engine, debug)
    engine = self.engine

    # Files are nested in another baad_data folder inside the archive.
    # Relevant files can be added to this list in the same manner.
    file_names = ["baad_data/baad_data.csv", "baad_data/baad_methods.csv"]
    engine.download_files_from_archive(self.urls["BAAD"], file_names)

    # BUG FIX: the old code compared str(parse_version(...)) values,
    # which sorts lexicographically (e.g. "2.10..." < "2.2..."), so the
    # version gate broke for multi-digit components.  Compare the parsed
    # version objects directly.
    nested = parse_version(VERSION) >= parse_version("2.1.dev")

    # Load baad_data.csv as "data" and baad_methods.csv as "methods".
    for table_name, base_name in (("data", "baad_data.csv"),
                                  ("methods", "baad_methods.csv")):
        filename = "baad_data/" + base_name if nested else base_name
        engine.auto_create_table(
            Table(table_name, cleanup=self.cleanup_func_table),
            filename=filename)
        engine.insert_data_from_file(engine.format_filename(filename))
def download(self, engine=None, debug=False):
    """Download and load the excretion data and its variable descriptions."""
    Script.download(self, engine, debug)
    engine = self.engine
    filenames = [
        'Aquatic_animal_excretion_data.csv',
        'Aquatic_animal_excretion_variable_descriptions.csv'
    ]
    # PERF FIX: the old loop called download_files_from_archive once per
    # missing file, re-fetching and re-extracting the entire archive when
    # both files were absent.  A single call extracts both files.
    if any(not os.path.isfile(engine.format_filename(name))
           for name in filenames):
        engine.download_files_from_archive(self.urls["aquatic_animals"],
                                           filenames, "zip")

    # aquatic_animals: the measurement data.
    filename = 'Aquatic_animal_excretion_data.csv'
    data_specs = [
        ("index", "pk-int"), ("sourcenumber", "int"),
        ("sourcename", "char"), ("speciesname", "char"),
        ("speciescode", "char"), ("invert/vert", "char"),
        ("phylum", "char"), ("class", "char"),
        ("order", "char"), ("family", "char"),
        ("trophicgild", "char"), ("drymass", "double"),
        ("logdrymass", "double"), ("ecosystemtype", "char"),
        ("energysource", "char"), ("habitat", "char"),
        ("residentecosystem", "char"), ("temperature", "double"),
        ("nexcretionrate", "double"), ("pexcretionrate", "double"),
        ("lognexcretionrate", "double"), ("logpexcretionrate", "double"),
        ("incubationtime", "double"), ("nform", "char"),
        ("pform", "char"), ("bodyc", "double"),
        ("bodyn", "double"), ("bodyp", "double"),
        ("bodyc:n", "double"), ("bodyc:p", "double"),
        ("bodyn:p", "double"), ("bodydatasource", "char"),
        ("datasource", "char"), ("dataproviders", "char"),
    ]
    table = Table('aquatic_animals', delimiter=',')
    table.columns = [(name, (kind,)) for name, kind in data_specs]
    engine.table = table
    engine.create_table()
    engine.insert_data_from_file(engine.format_filename(str(filename)))

    # variable_descriptions: metadata describing each data column.
    filename = 'Aquatic_animal_excretion_variable_descriptions.csv'
    table = Table('variable_descriptions', delimiter=',')
    table.columns = [("Column", ("char",)),
                     ("Variable", ("char",)),
                     ("Description", ("char",)),
                     ("Data Class", ("char",)),
                     ("Units", ("char",)),
                     ("Minimum_value", ("char",)),
                     ("Maximum_value", ("char",)),
                     ("Possible_values", ("char",)),
                     ("Missing_data_symbol", ("char",)),
                     ("Notes", ("char",))]
    engine.table = table
    engine.create_table()
    engine.insert_data_from_file(engine.format_filename(str(filename)))
def download(self, engine=None, debug=False):
    """Download the marine predator-prey archive and load the main table."""
    Script.download(self, engine, debug)
    engine = self.engine
    data_file = "Predator_and_prey_body_sizes_in_marine_food_webs_vsn4.txt"
    engine.download_files_from_archive(self.urls["data"], [data_file],
                                       filetype="zip")

    # Create and populate the single "main" table.
    main_table = Table('main', cleanup=self.cleanup_func_table)
    engine.auto_create_table(main_table, filename=data_file)
    engine.insert_data_from_file(engine.format_filename(data_file))
def download(self, engine=None, debug=False):
    """Download the ECOL_92_97 archive and load its three CSV tables."""
    Script.download(self, engine, debug)
    engine = self.engine
    members = [('isotopes.csv', 'isotopes'),
               ('sources.csv', 'sources'),
               ('diet.csv', 'diet')]
    engine.download_files_from_archive(self.urls["zip"],
                                       [name for name, _ in members],
                                       filetype="zip",
                                       archivename="ECOL_92_97")
    # One table per extracted CSV file.
    for csv_name, table_name in members:
        data_path = self.engine.format_filename(csv_name)
        self.engine.auto_create_table(
            Table(str(table_name), cleanup=self.cleanup_func_table),
            filename=csv_name)
        self.engine.insert_data_from_file(data_path)
def download(self, engine=None, debug=False):
    """Download the PanTHERIA archive and load the species table."""
    Script.download(self, engine, debug)
    engine = self.engine
    data_file = "PanTHERIA_1-0_WR05_Aug2008.txt"
    engine.download_files_from_archive(self.urls["data"], [data_file],
                                       filetype="zip")

    # Create and populate table species, treating "NA" as null.
    cleanup = Cleanup(correct_invalid_value, nulls=['NA'])
    engine.auto_create_table(Table('species', cleanup=cleanup),
                             filename=data_file)
    engine.insert_data_from_file(engine.format_filename(data_file))
def __init__(self, **kwargs):
    """Define metadata for the Fritz et al. 2009 mammal super tree.

    This is a download-only dataset: the .tre file is fetched but not
    parsed into tables.
    """
    Script.__init__(self, **kwargs)
    self.title = "Mammal Super Tree"
    self.name = 'mammal-super-tree'
    self.ref = 'http://doi.org/10.1111/j.1461-0248.2009.01307.x'
    self.citation = "Fritz, S. A., Bininda-Emonds, O. R. P. and Purvis, A. (2009), Geographical variation in predictors of mammalian extinction risk: big is bad, but only in the tropics. Ecology Letters, 12: 538-549. doi:10.1111/j.1461-0248.2009.01307.x"
    self.description = "Mammal Super Tree from Fritz, S.A., O.R.P Bininda-Emonds, and A. Purvis. 2009. Geographical variation in predictors of mammalian extinction risk: big is bad, but only in the tropics. Ecology Letters 12:538-549"
    self.retriever_minimum_version = '2.0.dev'
    self.version = "2.0.0"
    self.urls = {
        'mammal_super_tree_fritz2009.tre': 'http://onlinelibrary.wiley.com/store/10.1111/j.1461-0248.2009.01307.x/asset/supinfo/ELE_1307_sm_SA1.tre?v=1&s=366b28651a9b5d1a3148ef9a8620f8aa31a7df44'
    }
    # BUG FIX: attribute was misspelled "downlaod_only", so the
    # download-only flag was never visible to the retriever.
    self.download_only = True
    self.tables = {'trees': Table("trees")}
def __init__(self):
    """Register the Petraitis 2008 intertidal dataset (legacy version)."""
    Script.__init__(
        self,
        shortname="Petraitis2008",
        name="Gulf of Maine intertidal density/cover (Ecological Archives 2008)",
        ref="http://www.esapubs.org/archive/ecol/E089/032/",
        description="Peter S. Petraitis, Harrison Liu, and Erika C. Rhile. 2008. Densities and cover data for intertidal organisms in the Gulf of Maine, USA, from 2003 to 2007. Ecology 89:588.",
        urls={"main": "http://www.esapubs.org/archive/ecol/E089/032/Succession_sampling_03-07_data.txt"},
        tables={"main": Table("main",
                              cleanup=Cleanup(correct_invalid_value,
                                              nulls=[-999.9]))},
    )
def download(self, engine=None, debug=False):
    """Download the Flensburg food-web archive and load links and nodes."""
    Script.download(self, engine, debug)
    engine = self.engine
    members = [('Flensburg_Data_Links.csv', 'links'),
               ('Flensburg_Data_Nodes.csv', 'nodes')]
    engine.download_files_from_archive(self.urls["zip"],
                                       [name for name, _ in members],
                                       "zip", False, "ECOL_92_174")
    # One table per extracted CSV file.
    for csv_name, table_name in members:
        data_path = self.engine.format_filename(csv_name)
        self.engine.auto_create_table(
            Table(str(table_name), cleanup=self.cleanup_func_table),
            filename=csv_name)
        self.engine.insert_data_from_file(data_path)
def __init__(self):
    """Register the Petraitis 2008 intertidal dataset metadata."""
    Script.__init__(
        self,
        shortname="Petraitis2008",
        name="Gulf of Maine intertidal density/cover (Ecological Archives 2008)",
        ref="http://www.esapubs.org/archive/ecol/E089/032/",
        citation="Peter S. Petraitis, Harrison Liu, and Erika C. Rhile. 2008. Densities and cover data for intertidal organisms in the Gulf of Maine, USA, from 2003 to 2007. Ecology 89:588.",
        description="The data set provides access to data on densities and percent cover in the 60 experimental plots from 2003 to 2007 and to update data from 1996 to 2002 that are already published in Ecological Archives.It includes densities of mussels, an herbivorous limpet, herbivorous snails, a predatory snail, a barnacle , and fucoid algae and percent cover by mussels, barnacles, fucoids, and other sessile organisms.",
        urls={"main": "http://www.esapubs.org/archive/ecol/E089/032/Succession_sampling_03-07_data.txt"},
        tables={"main": Table("main",
                              cleanup=Cleanup(correct_invalid_value,
                                              nulls=[-999.9]))},
    )
def download(self, engine=None, debug=False):
    """Download the archive and load species, sites, micro- and macroplots."""
    Script.download(self, engine, debug)
    engine = self.engine
    archive_files = ["Macroplot_data_Rev.txt", "Microplot_data.txt",
                     "Site_variables.txt", "Species_list.txt"]
    engine.download_files_from_archive(self.urls["data"], archive_files,
                                       filetype="zip")

    # Flat tables: species list and site variables.
    for table_name, file_name in (("species", "Species_list.txt"),
                                  ("sites", "Site_variables.txt")):
        engine.auto_create_table(
            Table(table_name, cleanup=self.cleanup_func_table),
            filename=file_name)
        engine.insert_data_from_file(engine.format_filename(file_name))

    # Microplot counts: one cross-tab column per plot ID.
    microplots = Table('microplots')
    microplots.columns = [('record_id', ('pk-auto',)),
                          ('SpCode', ('char', '30')),
                          ('Count', ('ct-int',))]
    microplots.ct_names = [
        'BSP1', 'BSP2', 'BSP3', 'BSP4', 'BSP5', 'BSP6', 'BSP7', 'BSP8',
        'BSP9', 'BSP10', 'BSP11', 'BSP12', 'BSP13', 'BSP14', 'BSP15',
        'BSP16', 'BSP17', 'BSP18', 'BSP20', 'BSP21', 'BSP22', 'BSP23',
        'BSP24', 'BSP25', 'BSP26', 'BSP27', 'BSP28', 'BSP29', 'BSP30',
        'BSP31', 'BSP33', 'BSP34', 'BSP35', 'BSP36', 'BSP37', 'BSP41',
        'BSP42', 'BSP43', 'BSP44', 'BSP45', 'BSP46', 'BSP47', 'BSP48',
        'BSP49', 'BSP50', 'BSP51', 'BSP52', 'BSP53', 'BSP54', 'BSP55',
        'BSP56', 'BSP57', 'BSP58', 'BSP59', 'BSP60', 'BSP61', 'BSP62',
        'BSP63', 'BSP64', 'BSP65', 'BSP66', 'BSP67', 'BSP68', 'BSP69',
        'BSP70', 'BSP71', 'BSP72', 'BSP73', 'BSP74', 'BSP75', 'BSP76',
        'BSP78', 'BSP79', 'BSP80', 'BSP82', 'BSP83', 'BSP84', 'BSP85',
        'BSP86', 'BSP87', 'BSP88', 'BSP89', 'BSP90', 'BSP91', 'BSP92',
        'BSP93', 'BSP94', 'BSP95', 'BSP96', 'BSP97', 'BSP98', 'BSP99',
        'BSP100', 'BSP101', 'BSP102', 'BSP104']
    microplots.ct_column = 'PlotID'
    engine.auto_create_table(microplots, filename="Microplot_data.txt")
    engine.insert_data_from_file(
        engine.format_filename("Microplot_data.txt"))

    # Macroplot girths: one cross-tab column per repeated measurement.
    macroplots = Table('macroplots')
    macroplots.ct_names = ['TreeGirth1', 'TreeGirth2', 'TreeGirth3',
                           'TreeGirth4', 'TreeGirth5']
    macroplots.ct_column = 'Tree'
    macroplots.columns = [('record_id', ('pk-auto',)),
                          ('PlotID', ('char', '20')),
                          ('SpCode', ('char', '30')),
                          ('Girth', ('ct-int',))]
    engine.auto_create_table(macroplots, filename="Macroplot_data_Rev.txt")
    engine.insert_data_from_file(
        engine.format_filename("Macroplot_data_Rev.txt"))
def __init__(self):
    """Register the intertidal-abund-me dataset (figshare version)."""
    Script.__init__(
        self,
        shortname="intertidal-abund-me",
        name="Gulf of Maine intertidal density/cover (Petraitis et al. 2008)",
        ref="https://figshare.com/collections/DENSITIES_AND_COVER_DATA_FOR_INTERTIDAL_ORGANISMS_IN_THE_GULF_OF_MAINE_USA_FROM_2003_TO_2007/3300200",
        citation="Peter S. Petraitis, Harrison Liu, and Erika C. Rhile. 2008. Densities and cover data for intertidal organisms in the Gulf of Maine, USA, from 2003 to 2007. Ecology 89:588.",
        description="The data set provides access to data on densities and percent cover in the 60 experimental plots from 2003 to 2007 and to update data from 1996 to 2002 that are already published in Ecological Archives.It includes densities of mussels, an herbivorous limpet, herbivorous snails, a predatory snail, a barnacle , and fucoid algae and percent cover by mussels, barnacles, fucoids, and other sessile organisms.",
        retriever_minimum_version='2.0.dev',
        version='1.4.0',
        urls={"main": "https://ndownloader.figshare.com/files/5600831"},
        tables={"main": Table("main",
                              cleanup=Cleanup(correct_invalid_value,
                                              nulls=[-999.9]))})
def create_resources(file, skip_lines):
    """Creates resources for the script or errors out if not possible"""
    engine = Engine()
    # Let the engine sniff the table layout without creating anything.
    inferred = engine.auto_create_table(
        Table(str(file), header_rows=skip_lines), filename=file, make=False)
    meta = inferred.__dict__
    base_name = os.path.basename(meta["name"])
    fields = [{"name": cname, "type": ctuple[0]}
              for cname, ctuple in meta["columns"]]
    return {
        "name": os.path.splitext(base_name)[0],
        "schema": {"fields": fields},
        "dialect": {},
        "url": "FILL",
    }
def create_tabular_resources(self, file, skip_lines, encoding):
    """Create resources for tabular scripts"""
    engine = Engine()
    self.encoding = encoding
    engine.encoding = encoding
    # Let the engine sniff the table layout without creating anything.
    inferred = engine.auto_create_table(
        Table(str(file), header_rows=skip_lines), filename=file, make=False)
    meta = inferred.__dict__
    base_name = os.path.basename(meta["name"])
    print("Processing... {file_name}".format(file_name=base_name))
    fields = []
    for cname, ctuple in meta["columns"]:
        spec = {"name": cname, "type": ctuple[0]}
        if len(ctuple) >= 2:
            if ctuple[0] == "char":
                # char sizes need quotes
                spec["size"] = "{a}".format(a=ctuple[1])
            else:
                spec["size"] = ctuple[1]
        fields.append(spec)
    return {
        "name": clean_table_name(os.path.splitext(base_name)[0].lower()),
        "path": base_name,
        "schema": {"fields": fields},
        "dialect": {"delimiter": ","},
        "url": "fill",
    }
def create_resources(file, skip_lines):
    """Creates resources for the script or errors out if not possible"""
    engine = Engine()
    # Let the engine sniff the table layout without creating anything.
    inferred = engine.auto_create_table(
        Table(str(file), header_rows=skip_lines), filename=file, make=False)
    meta = inferred.__dict__
    base_name = os.path.basename(meta["name"])
    print("Processing... {file_name}".format(file_name=base_name))
    fields = []
    for cname, ctuple in meta["columns"]:
        spec = {"name": cname, "type": ctuple[0]}
        if len(ctuple) >= 2:
            if ctuple[0] == 'char':
                # char sizes need quotes
                spec["size"] = "{a}".format(a=ctuple[1])
            else:
                spec["size"] = ctuple[1]
        fields.append(spec)
    return {
        "name": os.path.splitext(base_name)[0].lower(),
        "path": base_name,
        "schema": {"fields": fields},
        "dialect": {"delimiter": ","},
        "url": "FILL",
    }
def __init__(self, **kwargs):
    """Define metadata for the Clark 2006 La Selva tree dataset."""
    Script.__init__(self, **kwargs)
    self.name = "la-selva-trees"
    self.title = "Tree growth, mortality, physical condition - Clark, 2006"
    self.keywords = ['plants', 'time-series']
    self.urls = {'trees': 'https://ndownloader.figshare.com/files/5597693'}
    self.description = "The data set helps to examine the post-establishment ecology of 10 species of tropical wet forest trees selected to span a range of predicted life history patterns at the La Selva Biological Station in Costa Rica."
    self.ref = "https://doi.org/10.6084/m9.figshare.c.3299324.v1"
    self.retriever_minimum_version = "2.0.dev"
    self.version = '1.4.1'
    self.citation = "David B. Clark and Deborah A. Clark. 2006. Tree growth, mortality, physical condition, and microsite in an old-growth lowland tropical rain forest. Ecology 87:2132."
    self.cleanup_func_table = Cleanup(correct_invalid_value,
                                      missing_values=[-999])
    # Retriever <= 2.0.0 uses the legacy name/shortname/tags fields and
    # the ``nulls`` keyword for Cleanup.
    if parse_version(VERSION) <= parse_version("2.0.0"):
        self.shortname = self.name
        self.name = self.title
        self.tags = self.keywords
        self.cleanup_func_table = Cleanup(correct_invalid_value,
                                          nulls=[-999])
    self.tables = {
        'trees': Table('trees', cleanup=self.cleanup_func_table)
    }
def download(self, engine=None, debug=False):
    """Download the latest VertNet mammals dump and load it as `mammals`.

    The archive is only downloaded if the extracted CSV is not already
    present. Every field in the dump is loaded as text except the handful
    of derived flag/measurement fields listed in `type_overrides`.
    """
    Script.download(self, engine, debug)
    engine = self.engine
    filename = 'vertnet_latest_mammals.csv'
    tablename = 'mammals'

    table = Table(str(tablename), delimiter=',')
    # Column order must match the CSV header exactly; almost everything is
    # free text, so columns are generated from the name list with a small
    # override map for the non-char fields at the end of the record.
    column_names = [
        "beginrecord", "icode", "title", "citation", "contact",
        "email", "emlrights", "gbifdatasetid", "gbifpublisherid", "doi",
        "migrator", "networks", "orgcountry", "orgname", "orgstateprovince",
        "pubdate", "source_url", "iptrecordid", "associatedmedia",
        "associatedoccurrences", "associatedorganisms", "associatedreferences",
        "associatedsequences", "associatedtaxa", "bed", "behavior",
        "catalognumber", "continent", "coordinateprecision",
        "coordinateuncertaintyinmeters", "country", "countrycode", "county",
        "dateidentified", "day", "decimallatitude", "decimallongitude",
        "disposition", "earliestageorloweststage",
        "earliesteonorlowesteonothem", "earliestepochorlowestseries",
        "earliesteraorlowesterathem", "earliestperiodorlowestsystem",
        "enddayofyear", "establishmentmeans", "eventdate", "eventid",
        "eventremarks", "eventtime", "fieldnotes", "fieldnumber",
        "footprintspatialfit", "footprintsrs", "footprintwkt", "formation",
        "geodeticdatum", "geologicalcontextid", "georeferencedby",
        "georeferenceddate", "georeferenceprotocol", "georeferenceremarks",
        "georeferencesources", "georeferenceverificationstatus", "group",
        "habitat", "highergeography", "highergeographyid",
        "highestbiostratigraphiczone", "identificationid",
        "identificationqualifier", "identificationreferences",
        "identificationremarks", "identificationverificationstatus",
        "identifiedby", "individualcount", "island", "islandgroup",
        "latestageorhigheststage", "latesteonorhighesteonothem",
        "latestepochorhighestseries", "latesteraorhighesterathem",
        "latestperiodorhighestsystem", "lifestage", "lithostratigraphicterms",
        "locality", "locationaccordingto", "locationid", "locationremarks",
        "lowestbiostratigraphiczone", "materialsampleid",
        "maximumdepthinmeters", "maximumdistanceabovesurfaceinmeters",
        "maximumelevationinmeters", "member", "minimumdepthinmeters",
        "minimumdistanceabovesurfaceinmeters", "minimumelevationinmeters",
        "month", "municipality", "occurrenceid", "occurrenceremarks",
        "occurrencestatus", "organismid", "organismname", "organismremarks",
        "organismscope", "othercatalognumbers", "pointradiusspatialfit",
        "preparations", "previousidentifications", "recordedby",
        "recordnumber", "reproductivecondition", "samplingeffort",
        "samplingprotocol", "sex", "startdayofyear", "stateprovince",
        "typestatus", "verbatimcoordinates", "verbatimcoordinatesystem",
        "verbatimdepth", "verbatimelevation", "verbatimeventdate",
        "verbatimlatitude", "verbatimlocality", "verbatimlongitude",
        "verbatimsrs", "waterbody", "year", "dctype", "modified",
        "language", "license", "rightsholder", "accessrights",
        "bibliographiccitation", "dc_references", "institutionid",
        "collectionid", "datasetid", "institutioncode", "collectioncode",
        "datasetname", "ownerinstitutioncode", "basisofrecord",
        "informationwithheld", "datageneralizations", "dynamicproperties",
        "scientificnameid", "namepublishedinid", "scientificname",
        "acceptednameusage", "originalnameusage", "namepublishedin",
        "namepublishedinyear", "higherclassification", "kingdom", "phylum",
        "class", "order", "family", "genus", "subgenus", "specificepithet",
        "infraspecificepithet", "taxonrank", "verbatimtaxonrank",
        "scientificnameauthorship", "vernacularname", "nomenclaturalcode",
        "taxonomicstatus", "keyname", "haslicense", "vntype", "rank",
        "mappable", "hashid", "hastypestatus", "wascaptive", "wasinvasive",
        "hastissue", "hasmedia", "isfossil", "haslength", "haslifestage",
        "hasmass", "hassex", "lengthinmm", "massing", "lengthunitsinferred",
        "massunitsinferred", "underivedlifestage", "underivedsex",
    ]
    # Fields that are not plain text in the dump
    type_overrides = {
        "haslicense": "int", "rank": "int", "mappable": "int",
        "hastypestatus": "int", "wascaptive": "int", "wasinvasive": "int",
        "hastissue": "int", "hasmedia": "int", "isfossil": "int",
        "haslength": "int", "haslifestage": "int", "hasmass": "int",
        "hassex": "int",
        "lengthinmm": "double", "massing": "double",
    }
    table.columns = [("record_id", ("pk-auto",))] + [
        (name, (type_overrides.get(name, "char"),)) for name in column_names]

    engine.table = table
    # Only download/extract the archive if the CSV isn't already on disk
    if not os.path.isfile(engine.format_filename(filename)):
        engine.download_files_from_archive(self.urls[tablename],
                                           [filename],
                                           filetype="zip",
                                           archivename="vertnet_latest_" + str(tablename))
    engine.create_table()
    engine.insert_data_from_file(engine.format_filename(str(filename)))
def download(self, engine=None, debug=False): try: Script.download(self, engine, debug) engine = self.engine # Routes table if not os.path.isfile(engine.format_filename("routes_new.csv")): engine.download_files_from_archive(self.urls["routes"], ["routes.csv"]) read = open(engine.format_filename("routes.csv"), "rb") write = open(engine.format_filename("routes_new.csv"), "wb") print "Cleaning routes data..." write.write(read.readline()) for line in read: values = line.split(',') v = Decimal(values[5]) if v > 0: values[5] = str(v * Decimal("-1")) write.write(','.join(str(value) for value in values)) write.close() read.close() engine.auto_create_table(Table("routes", cleanup=Cleanup()), filename="routes_new.csv") engine.insert_data_from_file(engine.format_filename("routes_new.csv")) # Weather table if not os.path.isfile(engine.format_filename("weather_new.csv")): engine.download_files_from_archive(self.urls["weather"], ["weather.csv"]) read = open(engine.format_filename("weather.csv"), "rb") write = open(engine.format_filename("weather_new.csv"), "wb") print "Cleaning weather data..." 
for line in read: values = line.split(',') newvalues = [] for value in values: if ':' in value: newvalues.append(value.replace(':', '')) elif value == "N": newvalues.append(None) else: newvalues.append(value) write.write(','.join(str(value) for value in newvalues)) write.close() read.close() engine.auto_create_table(Table("weather", pk="RouteDataId", cleanup=Cleanup()), filename="weather_new.csv") engine.insert_data_from_file(engine.format_filename("weather_new.csv")) # Species table table = Table("species", pk=False, delimiter=',') table.columns=[("species_id" , ("pk-auto",) ), ("AOU" , ("int",) ), ("genus" , ("char",30) ), ("species" , ("char",50) ), ("subspecies" , ("char",30) ), ("id_to_species" , ("bool",) )] engine.table = table engine.create_table() engine.download_file(self.urls["species"], "SpeciesList.txt") species_list = open(engine.format_filename("SpeciesList.txt"), "rb") for n in range(8): species_list.readline() rows = [] for line in species_list: if line and len(line) > 273: latin_name = line[273:].split() if len(latin_name) < 2: # If there's no species given, add "None" value latin_name.append("None") subspecies = ' '.join(latin_name[2:]) if len(latin_name) > 2 else "None" id_to_species = "1" if latin_name[1] != "None" else "0" if latin_name[1] == "sp.": latin_name[1] = "None" id_to_species = "0" if ("x" in latin_name or "/" in latin_name or "/" in subspecies or "or" in latin_name): # Hybrid species or only identified to a group of species latin_name[1] = ' '.join(latin_name[1:]) subspecies = "None" id_to_species = "0" rows.append(','.join([ line.split()[1], latin_name[0], latin_name[1], subspecies, id_to_species ])) engine.add_to_table(rows) species_list.close() # Region_codes table table = Table("region_codes", pk=False, header_rows=11, fixed_width=[11, 11, 30]) def regioncodes_cleanup(value, engine): replace = {chr(225):"a", chr(233):"e", chr(237):"i", chr(243):"o"} newvalue = str(value) for key in replace.keys(): if key in newvalue: newvalue = 
newvalue.replace(key, replace[key]) return newvalue table.cleanup = Cleanup(regioncodes_cleanup) table.columns=[("countrynum" , ("int",) ), ("regioncode" , ("int",) ), ("regionname" , ("char",30) )] engine.table = table engine.create_table() engine.insert_data_from_url(self.urls["region_codes"]) # Counts table table = Table("counts", delimiter=',') table.columns=[("countrynum" , ("int",) ), ("statenum" , ("int",) ), ("Route" , ("int",) ), ("RPID" , ("int",) ), ("year" , ("int",) ), ("AOU" , ("int",) ), ("Stop1" , ("int",) ), ("Stop2" , ("int",) ), ("Stop3" , ("int",) ), ("Stop4" , ("int",) ), ("Stop5" , ("int",) ), ("Stop6" , ("int",) ), ("Stop7" , ("int",) ), ("Stop8" , ("int",) ), ("Stop9" , ("int",) ), ("Stop10" , ("int",) ), ("Stop11" , ("int",) ), ("Stop12" , ("int",) ), ("Stop13" , ("int",) ), ("Stop14" , ("int",) ), ("Stop15" , ("int",) ), ("Stop16" , ("int",) ), ("Stop17" , ("int",) ), ("Stop18" , ("int",) ), ("Stop19" , ("int",) ), ("Stop20" , ("int",) ), ("Stop21" , ("int",) ), ("Stop22" , ("int",) ), ("Stop23" , ("int",) ), ("Stop24" , ("int",) ), ("Stop25" , ("int",) ), ("Stop26" , ("int",) ), ("Stop27" , ("int",) ), ("Stop28" , ("int",) ), ("Stop29" , ("int",) ), ("Stop30" , ("int",) ), ("Stop31" , ("int",) ), ("Stop32" , ("int",) ), ("Stop33" , ("int",) ), ("Stop34" , ("int",) ), ("Stop35" , ("int",) ), ("Stop36" , ("int",) ), ("Stop37" , ("int",) ), ("Stop38" , ("int",) ), ("Stop39" , ("int",) ), ("Stop40" , ("int",) ), ("Stop41" , ("int",) ), ("Stop42" , ("int",) ), ("Stop43" , ("int",) ), ("Stop44" , ("int",) ), ("Stop45" , ("int",) ), ("Stop46" , ("int",) ), ("Stop47" , ("int",) ), ("Stop48" , ("int",) ), ("Stop49" , ("int",) ), ("Stop50" , ("int",) )] part = "" engine.table = table engine.create_table() for part in range(1,11): part = str(part) try: print "Inserting data from part " + part + "..." 
try: engine.table.cleanup = Cleanup() engine.insert_data_from_archive(self.urls["counts"] + "Fifty" + part + ".exe", ["fifty" + part + ".csv"]) except: print "Failed bulk insert on " + part + ", inserting manually." engine.connection.rollback() engine.table.cleanup = Cleanup(correct_invalid_value, nulls=['*']) engine.insert_data_from_archive(self.urls["counts"] + "Fifty" + part + ".exe", ["fifty" + part + ".csv"]) except: print "There was an error in part " + part + "." raise except zipfile.BadZipfile: print "There was an unexpected error in the Breeding Bird Survey archives." raise return engine
def download(self, engine=None, debug=False):
    """Download and load the BBS state-level dataset.

    Builds species (fixed width), routes, weather (cleaned copy), region
    codes, and per-state/province counts tables. State archives on the
    server are named by the first seven characters of the state name,
    except for the multi-word regions listed explicitly below.
    """
    try:
        Script.download(self, engine, debug)
        engine = self.engine

        # Species table (fixed-width source file, 9 header rows)
        table = Table("species", cleanup=Cleanup(), contains_pk=True,
                      header_rows=9)
        table.columns=[("species_id", ("pk-int",)),
                       ("AOU", ("int",)),
                       ("english_common_name", ("char",50)),
                       ("french_common_name", ("char",50)),
                       ("spanish_common_name", ("char",50)),
                       ("sporder", ("char",30)),
                       ("family", ("char",30)),
                       ("genus", ("char",30)),
                       ("species", ("char",50)),
                       ]
        table.fixed_width = [7,6,51,51,51,51,51,51,50]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_url(self.urls["species"])

        # Routes table
        engine.download_files_from_archive(self.urls["routes"],
                                           ["routes.csv"])
        engine.auto_create_table(Table("routes", cleanup=Cleanup()),
                                 filename="routes.csv")
        engine.insert_data_from_file(engine.format_filename("routes.csv"))

        # Weather table: strip colons from time values and map the literal
        # "N" to None; the cleaned copy is only rebuilt if missing.
        if not os.path.isfile(engine.format_filename("weather_new.csv")):
            engine.download_files_from_archive(self.urls["weather"],
                                               ["weather.csv"])
            read = open_fr(engine.format_filename("weather.csv"))
            write = open_fw(engine.format_filename("weather_new.csv"))
            print("Cleaning weather data...")
            for line in read:
                values = line.split(',')
                newvalues = []
                for value in values:
                    if ':' in value:
                        newvalues.append(value.replace(':', ''))
                    elif value == "N":
                        newvalues.append(None)
                    else:
                        newvalues.append(value)
                write.write(','.join(str(value) for value in newvalues))
            write.close()
            read.close()
        engine.auto_create_table(Table("weather", pk="RouteDataId",
                                       cleanup=Cleanup(correct_invalid_value,
                                                       nulls=['NULL'])),
                                 filename="weather_new.csv")
        engine.insert_data_from_file(engine.format_filename("weather_new.csv"))

        # Region_codes table (fixed width); transliterate accented
        # characters (á é í ó) to plain ASCII equivalents
        table = Table("region_codes", pk=False, header_rows=11,
                      fixed_width=[11, 11, 30])
        def regioncodes_cleanup(value, engine):
            replace = {chr(225):"a", chr(233):"e", chr(237):"i", chr(243):"o"}
            newvalue = str(value)
            for key in list(replace.keys()):
                if key in newvalue:
                    newvalue = newvalue.replace(key, replace[key])
            return newvalue
        table.cleanup = Cleanup(regioncodes_cleanup)
        table.columns=[("countrynum", ("int",)),
                       ("regioncode", ("int",)),
                       ("regionname", ("char",30))]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_url(self.urls["region_codes"])

        # Counts table
        table = Table("counts", delimiter=',')
        table.columns=[("record_id", ("pk-auto",)),
                       ("countrynum", ("int",)),
                       ("statenum", ("int",)),
                       ("Route", ("int",)),
                       ("RPID", ("int",)),
                       ("Year", ("int",)),
                       ("Aou", ("int",)),
                       ("Count10", ("int",)),
                       ("Count20", ("int",)),
                       ("Count30", ("int",)),
                       ("Count40", ("int",)),
                       ("Count50", ("int",)),
                       ("StopTotal", ("int",)),
                       ("SpeciesTotal", ("int",))]

        # Plain strings use the first seven characters as the archive name;
        # two-element lists are [full name, archive name] for regions whose
        # archive name is not a simple prefix.
        stateslist = ["Alabama", "Alaska", "Arizona", "Arkansas",
                      "California", "Colorado", "Connecticut", "Delaware",
                      "Florida", "Georgia", "Idaho", "Illinois", "Indiana",
                      "Iowa", "Kansas", "Kentucky", "Louisiana", "Maine",
                      "Maryland", "Massachusetts", "Michigan", "Minnesota",
                      "Mississippi", "Missouri", "Montana", "Nebraska",
                      "Nevada",
                      ["New Hampshire", "NHampsh"],
                      ["New Jersey", "NJersey"],
                      ["New Mexico", "NMexico"],
                      ["New York", "NYork"],
                      ["North Carolina", "NCaroli"],
                      ["North Dakota", "NDakota"],
                      "Ohio", "Oklahoma", "Oregon", "Pennsylvania",
                      ["Rhode Island", "RhodeIs"],
                      ["South Carolina", "SCaroli"],
                      ["South Dakota", "SDakota"],
                      "Tennessee", "Texas", "Utah", "Vermont", "Virginia",
                      "Washington",
                      ["West Virginia", "W_Virgi"],
                      "Wisconsin", "Wyoming", "Alberta",
                      ["British Columbia", "BritCol"],
                      "Manitoba",
                      ["New Brunswick", "NBrunsw"],
                      ["Northwest Territories", "NWTerri"],
                      "Newfoundland",
                      ["Nova Scotia", "NovaSco"],
                      "Nunavut", "Ontario",
                      ["Prince Edward Island", "PEI"],
                      "Quebec", "Saskatchewan", "Yukon"]

        state = ""
        shortstate = ""

        engine.table = table
        engine.create_table()

        for state in stateslist:
            try:
                # len > 2 means a plain string (state names are all longer
                # than 2); the two-element lists fall through to the else.
                if len(state) > 2:
                    shortstate = state[0:7]
                else:
                    state, shortstate = state[0], state[1]

                print("Inserting data from " + state + "...")
                try:
                    # first try a fast bulk insert with no value cleanup
                    engine.table.cleanup = Cleanup()
                    engine.insert_data_from_archive(self.urls["counts"] + shortstate + ".zip",
                                                    [shortstate + ".csv"])
                except:
                    # fall back to row-by-row insert, treating '*' as NULL
                    print("Failed bulk insert on " + state + ", inserting manually.")
                    engine.connection.rollback()
                    engine.table.cleanup = Cleanup(correct_invalid_value,
                                                   nulls=['*'])
                    engine.insert_data_from_archive(self.urls["counts"] + shortstate + ".zip",
                                                    [shortstate + ".csv"])
            except:
                print("There was an error in " + state + ".")
                raise
    except zipfile.BadZipfile:
        print("There was an unexpected error in the Breeding Bird Survey archives.")
        raise
    return engine
def download(self, engine=None, debug=False):
    """Download USA-NPN phenology observations and load them into one table.

    The NPN web service is queried in 90-day windows from 2009-01-01 up to
    today; each XML response is converted to a CSV with the columns ordered
    as in `header_values`, and all CSVs are then inserted into one table.
    """
    Script.download(self, engine, debug)
    engine = self.engine
    csv_files = []
    request_src = "http://www.data-retriever.org/"
    base_url = "http://www.usanpn.org/npn_portal/observations/getObservations.xml?start_date={startYear}&end_date={endYear_date}&request_src={request_src}"
    header_values = ["observation_id",
                     "update_datetime",
                     "site_id",
                     "latitude",
                     "longitude",
                     "elevation_in_meters",
                     "state",
                     "species_id",
                     "genus",
                     "species",
                     "common_name",
                     "kingdom",
                     "individual_id",
                     "phenophase_id",
                     "phenophase_description",
                     "observation_date",
                     "day_of_year",
                     "phenophase_status",
                     "intensity_category_id",
                     "intensity_value",
                     "abundance_value"
                     ]
    columns = [("record_id", ("pk-auto",)),
               ("observation_id", ("int",)),  # subsequently refered to as "status record"
               ("update_datetime", ("char",)),
               ("site_id", ("int",)),
               ("latitude", ("double",)),
               ("longitude", ("double",)),
               ("elevation_in_meters", ("char",)),
               ("state", ("char",)),
               ("species_id", ("int",)),
               ("genus", ("char",)),
               ("species", ("char",)),
               ("common_name", ("char",)),
               ("kingdom", ("char",)),  # skip kingdom
               ("individual_id", ("char",)),
               ("phenophase_id", ("int",)),
               ("phenophase_description", ("char",)),
               ("observation_date", ("char",)),
               ("day_of_year", ("char",)),
               ("phenophase_status", ("char",)),
               ("intensity_category_id", ("char",)),
               ("intensity_value", ("char",)),
               ("abundance_value", ("char",))
               ]
    # attribute name -> position in header_values; hoisted out of the loops
    # because it never changes (it was previously rebuilt per XML element)
    index_map = {val: i for i, val in enumerate(header_values)}

    start_date = datetime.date(2009, 1, 1)
    end_date = datetime.date.today()
    while start_date < end_date:
        # query at most one 90-day window per request
        to_date = start_date + datetime.timedelta(90)
        if to_date >= end_date:
            data_url = base_url.format(startYear=str(start_date),
                                       endYear_date=str(end_date),
                                       request_src=request_src)
        else:
            data_url = base_url.format(startYear=str(start_date),
                                       endYear_date=str(to_date),
                                       request_src=request_src)
        xml_file_name = '{}'.format(start_date) + ".xml"
        engine.download_file(data_url, xml_file_name)

        # Create csv files for 3 months
        csv_observation = '{}'.format(start_date) + ".csv"
        csv_files.append(csv_observation)
        csv_buff = open_fw(engine.format_filename(csv_observation))
        csv_writer = open_csvw(csv_buff)
        csv_writer.writerow(header_values)

        # Parse xml to read data.
        # NOTE(review): str.strip('{dataset}') strips a *character set*, not
        # the literal suffix; it appears to work for the current value of
        # DATA_WRITE_PATH but should be verified if that constant changes.
        fname = DATA_WRITE_PATH.strip('{dataset}') + 'NPN/' + xml_file_name
        with open(fname, 'r') as fp1:
            file_read = fp1.read()
        root = ET.fromstring(file_read)
        for elements in root:
            # order each element's attributes to match the CSV header
            diction = sorted(elements.attrib.items(),
                             key=lambda pair: index_map[pair[0]])
            csv_writer.writerow([x[1] for x in diction])
        csv_buff.close()
        start_date = to_date + datetime.timedelta(1)

    # Create table
    # NOTE(review): 'obsercations' is a typo for 'observations'; the name is
    # preserved because downstream consumers may already query this table.
    table = Table('obsercations', delimiter=',', pk='record_id',
                  contains_pk=True)
    table.columns = columns
    engine.table = table
    engine.create_table()
    for data_file in csv_files:
        engine.insert_data_from_file(engine.find_file(data_file))
    return engine
def download(self, engine=None, debug=False):
    """Download the amniote life-history database and load its three tables.

    `main` and `references` share the same taxonomy columns and cross-tab
    trait names; `range` uses classx/orderx/familyx headers and expands
    each trait into a min/max/count triple.
    """
    Script.download(self, engine, debug)
    engine = self.engine
    engine.download_files_from_archive(
        self.urls["data"],
        ["Data_Files/Amniote_Database_Aug_2015.csv",
         "Data_Files/Amniote_Database_References_Aug_2015.csv",
         "Data_Files/Amniote_Range_Count_Aug_2015.csv"],
        filetype="zip")

    ct_column = 'trait'  # all tables use the same ct_column name

    # Trait headers shared by the main and references tables
    ct_names = ['female_maturity_d', 'litter_or_clutch_size_n',
                'litters_or_clutches_per_y', 'adult_body_mass_g',
                'maximum_longevity_y', 'gestation_d', 'weaning_d',
                'birth_or_hatching_weight_g', 'weaning_weight_g',
                'egg_mass_g', 'incubation_d', 'fledging_age_d',
                'longevity_y', 'male_maturity_d',
                'inter_litter_or_interbirth_interval_y',
                'female_body_mass_g', 'male_body_mass_g',
                'no_sex_body_mass_g', 'egg_width_mm', 'egg_length_mm',
                'fledging_mass_g', 'adult_svl_cm', 'male_svl_cm',
                'female_svl_cm', 'birth_or_hatching_svl_cm',
                'female_svl_at_maturity_cm',
                'female_body_mass_at_maturity_g', 'no_sex_svl_cm',
                'no_sex_maturity_d']

    # Taxonomy columns shared by the main and references tables
    taxonomy_columns = [
        ('record_id', ('pk-auto',)),
        ('class', ('char', '20')),
        ('order', ('char', '20')),
        ('family', ('char', '20')),
        ('genus', ('char', '20')),
        ('species', ('char', '50')),
        ('subspecies', ('char', '20')),
        ('common_name', ('char', '400'))]

    # Create table main from Amniote_Database_Aug_2015.csv
    table_main = Table('main', delimiter=',', cleanup=self.cleanup_func_table)
    table_main.ct_column = ct_column
    table_main.ct_names = ct_names
    table_main.columns = taxonomy_columns + [('trait_value', ('ct-double',))]
    engine.auto_create_table(table_main,
                             filename="Amniote_Database_Aug_2015.csv")
    engine.insert_data_from_file(
        engine.format_filename("Amniote_Database_Aug_2015.csv"))

    # Create table reference from Amniote_Database_References_Aug_2015.csv
    # (same headers as main, but trait values are citation strings)
    table_references = Table('references', delimiter=',',
                             cleanup=self.cleanup_func_table)
    table_references.ct_column = ct_column
    table_references.ct_names = ct_names
    table_references.columns = taxonomy_columns + [('reference', ('ct-char',))]
    engine.auto_create_table(
        table_references, filename="Amniote_Database_References_Aug_2015.csv")
    engine.insert_data_from_file(
        engine.format_filename("Amniote_Database_References_Aug_2015.csv"))

    # Create table range from Amniote_Range_Count_Aug_2015.csv.
    # Its headers are a mechanical min_/max_/count_ expansion of each trait,
    # so generate them instead of listing all 84 names by hand.
    range_traits = ["female_maturity", "litter_clutch_size",
                    "litters_clutches", "adult_body_mass",
                    "maximum_longevity", "gestation", "weaning",
                    "birth_hatching_weight", "weaning_weight", "egg_mass",
                    "incubation", "fledging_age", "male_maturity",
                    "inter_litter_interbirth_interval", "female_body_mass",
                    "male_body_mass", "no_sex_body_mass", "egg_width",
                    "egg_length", "fledging_mass", "adult_svl", "male_svl",
                    "female_svl", "hatching_svl", "female_svl_at_maturity",
                    "female_body_mass_at_maturity", "no_sex_svl",
                    "no_sex_maturity"]
    range_ct_names = ["{}_{}".format(prefix, trait)
                      for trait in range_traits
                      for prefix in ("min", "max", "count")]

    # range uses different taxonomy headers (classx/orderx/familyx)
    range_columns = [
        ('record_id', ('pk-auto',)),
        ('classx', ('char', '20')),
        ('orderx', ('char', '20')),
        ('familyx', ('char', '20')),
        ('genus', ('char', '20')),
        ('species', ('char', '50')),
        ('subspecies', ('char', '20')),
        ('common_name', ('char', '400')),
        ('trait_value', ('ct-double',))]
    table_range = Table('range', delimiter=',',
                        cleanup=self.cleanup_func_table)
    table_range.ct_column = ct_column
    table_range.ct_names = range_ct_names
    table_range.columns = range_columns
    engine.auto_create_table(table_range,
                             filename="Amniote_Range_Count_Aug_2015.csv")
    engine.insert_data_from_file(
        engine.format_filename("Amniote_Range_Count_Aug_2015.csv"))
def download(self, engine=None, debug=False):
    """Download the Gentry forest transect Excel files and build four tables.

    Extracts every .xls workbook from all_Excel.zip (plus CURUYUQU.xls,
    which the archive is missing), heuristically maps each workbook's
    columns to line/taxonomy/liana/count/stem fields, then loads sites,
    species, stems, and counts tables.
    """
    Script.download(self, engine, debug)
    self.engine.auto_create_table(Table("sites"), url=self.urls["sites"],
                                  filename='gentry_sites.csv')
    self.engine.insert_data_from_url(self.urls["sites"])
    self.engine.download_file(self.urls["stems"], "all_Excel.zip")
    local_zip = zipfile.ZipFile(self.engine.format_filename("all_Excel.zip"))
    filelist = local_zip.namelist()
    local_zip.close()
    self.engine.download_files_from_archive(self.urls["stems"], filelist)
    filelist = [os.path.basename(filename) for filename in filelist]
    # Currently all_Excel.zip is missing CURUYUQU.xls
    # Download it separately and add it to the file list
    if not self.engine.find_file('CURUYUQU.xls'):
        self.engine.download_file("http://www.mobot.org/mobot/gentry/123/samerica/CURUYUQU.xls", "CURUYUQU.xls")
        filelist.append('CURUYUQU.xls')
    lines = []
    tax = []
    for filename in filelist:
        print("Extracting data from " + filename + "...")
        book = xlrd.open_workbook(self.engine.format_filename(filename))
        sh = book.sheet_by_index(0)
        rows = sh.nrows
        # cn maps a normalized column name to its index; "stems" collects
        # the indices of all stem-diameter columns
        cn = {'stems': []}
        n = 0
        for colnum, c in enumerate(sh.row(0)):
            if not Excel.empty_cell(c):
                cid = c.value.lower().strip()
                # line number column is sometimes named differently
                if cid in ["sub", "number"]:
                    cid = "line"
                # the "number of individuals" column is named in various
                # different ways; they always at least contain "nd"
                if "nd" in cid:
                    cid = "count"
                # in QUIAPACA.xls the "number of individuals" column is
                # misnamed "STEMDBH" just like the stems columns, so weep
                # for the state of scientific data and then fix manually
                if filename == "QUIAPACA.xls" and colnum == 13:
                    cid = "count"
                # if column is a stem, add it to the list of stems;
                # otherwise, make note of the column name/number
                if "stem" in cid or "dbh" in cid:
                    cn["stems"].append(n)
                else:
                    cn[cid] = n
            n += 1
        # sometimes, a data file does not contain a liana or count column;
        # -1 marks "column absent"
        if not "liana" in list(cn.keys()):
            cn["liana"] = -1
        if not "count" in list(cn.keys()):
            cn["count"] = -1
        for i in range(1, rows):
            row = sh.row(i)
            cellcount = len(row)  # NOTE(review): unused
            # make sure the row is real, not just empty cells
            if not all(Excel.empty_cell(cell) for cell in row):
                try:
                    this_line = {}
                    # get the following information from the appropriate
                    # columns (note: this inner `i` shadows the row index)
                    for i in ["line", "family", "genus", "species",
                              "liana", "count"]:
                        if cn[i] > -1:
                            if row[cn[i]].ctype != 2:
                                # if the cell type(ctype) is not a number,
                                # normalize the text value
                                this_line[i] = row[cn[i]].value.lower().strip().replace("\\", "/").replace('"', '')
                            else:
                                this_line[i] = row[cn[i]].value
                            if this_line[i] == '`':
                                this_line[i] = 1
                    this_line["stems"] = [row[c]
                                          for c in cn["stems"]
                                          if not Excel.empty_cell(row[c])]
                    this_line["site"] = filename[0:-4]
                    # Manually correct CEDRAL data, which has a single line
                    # that is shifted by one to the left starting at Liana
                    if this_line["site"] == "CEDRAL" and type(this_line["liana"]) == float:
                        this_line["liana"] = ""
                        this_line["count"] = 3
                        this_line["stems"] = [2.5, 2.5, 30, 18, 25]
                    lines.append(this_line)
                    # Check how far the species is identified
                    full_id = 0
                    if len(this_line["species"]) < 3:
                        if len(this_line["genus"]) < 3:
                            id_level = "family"
                        else:
                            id_level = "genus"
                    else:
                        id_level = "species"
                        full_id = 1
                    tax.append((this_line["family"],
                                this_line["genus"],
                                this_line["species"],
                                id_level,
                                str(full_id)))
                except:
                    # re-raise immediately; the `pass` below is unreachable
                    raise
                    pass

    tax = sorted(tax, key=lambda group: group[0] + " " + group[1] + " " + group[2])
    unique_tax = []
    tax_dict = {}
    tax_count = 0

    # Get all unique families/genera/species and assign sequential ids
    print("\n")
    for group in tax:
        if not (group in unique_tax):
            unique_tax.append(group)
            tax_count += 1
            tax_dict[group[0:3]] = tax_count
            if tax_count % 10 == 0:
                # progress indicator, rewritten in place with backspaces
                msg = "Generating taxonomic groups: " + str(tax_count) + " / " + str(TAX_GROUPS)
                sys.stdout.flush()
                sys.stdout.write(msg + "\b" * len(msg))
    print("\n")

    # Create species table
    table = Table("species", delimiter=",")
    table.columns=[("species_id", ("pk-int",)),
                   ("family", ("char", )),
                   ("genus", ("char", )),
                   ("species", ("char", )),
                   ("id_level", ("char", 10)),
                   ("full_id", ("int",))]
    data = [[str(tax_dict[group[:3]])] + ['"%s"' % g for g in group]
            for group in unique_tax]
    table.pk = 'species_id'
    table.contains_pk = True
    self.engine.table = table
    self.engine.create_table()
    self.engine.add_to_table(data)

    # Create stems table
    table = Table("stems", delimiter=",")
    table.columns=[("stem_id", ("pk-auto",)),
                   ("line", ("int",)),
                   ("species_id", ("int",)),
                   ("site_code", ("char", 12)),
                   ("liana", ("char", 10)),
                   ("stem", ("double",))]
    stems = []
    counts = []
    for line in lines:
        try:
            liana = line["liana"]
        except KeyError:
            liana = ""
        species_info = [line["line"],
                        tax_dict[(line["family"],
                                  line["genus"],
                                  line["species"])],
                        line["site"],
                        liana
                        ]
        try:
            counts.append([value for value in species_info + [line["count"]]])
        except KeyError:
            # rows without a count column contribute no counts record
            pass
        for i in line["stems"]:
            stem = species_info + [str(i)]
            stems.append(stem)
    self.engine.table = table
    self.engine.create_table()
    self.engine.add_to_table(stems)

    # Create counts table
    table = Table("counts", delimiter=",", contains_pk=False)
    table.columns=[("count_id", ("pk-auto",)),
                   ("line", ("int",)),
                   ("species_id", ("int",)),
                   ("site_code", ("char", 12)),
                   ("liana", ("char", 10)),
                   ("count", ("double",))]
    self.engine.table = table
    self.engine.create_table()
    self.engine.add_to_table(counts)

    return self.engine
def download(self, engine=None, debug=False): try: Script.download(self, engine, debug) engine = self.engine # Species table table = Table("species", cleanup=Cleanup(), contains_pk=True, header_rows=6) table.columns=[("species_id", ("pk-int",) ), ("AOU", ("int",) ), ("english_common_name", ("char",50) ), ("french_common_name", ("char",50) ), ("spanish_common_name", ("char",50) ), ("sporder", ("char",30) ), ("family", ("char",30) ), ("genus", ("char",30) ), ("species", ("char",50) ), ] table.fixed_width = [7,6,51,51,51,51,51,51,50] engine.table = table engine.create_table() engine.insert_data_from_url(self.urls["species"]) # Routes table if not os.path.isfile(engine.format_filename("routes_new.csv")): engine.download_files_from_archive(self.urls["routes"], ["routes.csv"]) read = open(engine.format_filename("routes.csv"), "rb") write = open(engine.format_filename("routes_new.csv"), "wb") print "Cleaning routes data..." write.write(read.readline()) for line in read: values = line.split(',') v = Decimal(values[5]) if v > 0: values[5] = str(v * Decimal("-1")) write.write(','.join(str(value) for value in values)) write.close() read.close() engine.auto_create_table(Table("routes", cleanup=Cleanup()), filename="routes_new.csv") engine.insert_data_from_file(engine.format_filename("routes_new.csv")) # Weather table if not os.path.isfile(engine.format_filename("weather_new.csv")): engine.download_files_from_archive(self.urls["weather"], ["weather.csv"]) read = open(engine.format_filename("weather.csv"), "rb") write = open(engine.format_filename("weather_new.csv"), "wb") print "Cleaning weather data..." 
for line in read: values = line.split(',') newvalues = [] for value in values: if ':' in value: newvalues.append(value.replace(':', '')) elif value == "N": newvalues.append(None) else: newvalues.append(value) write.write(','.join(str(value) for value in newvalues)) write.close() read.close() engine.auto_create_table(Table("weather", pk="RouteDataId", cleanup=Cleanup()), filename="weather_new.csv") engine.insert_data_from_file(engine.format_filename("weather_new.csv")) # Region_codes table table = Table("region_codes", pk=False, header_rows=11, fixed_width=[11, 11, 30]) def regioncodes_cleanup(value, engine): replace = {chr(225):"a", chr(233):"e", chr(237):"i", chr(243):"o"} newvalue = str(value) for key in replace.keys(): if key in newvalue: newvalue = newvalue.replace(key, replace[key]) return newvalue table.cleanup = Cleanup(regioncodes_cleanup) table.columns=[("countrynum" , ("int",) ), ("regioncode" , ("int",) ), ("regionname" , ("char",30) )] engine.table = table engine.create_table() engine.insert_data_from_url(self.urls["region_codes"]) # Counts table table = Table("counts", pk=False, delimiter=',') table.columns=[("RouteDataID" , ("int",) ), ("countrynum" , ("int",) ), ("statenum" , ("int",) ), ("Route" , ("int",) ), ("RPID" , ("int",) ), ("year" , ("int",) ), ("AOU" , ("int",) ), ("Stop1" , ("int",) ), ("Stop2" , ("int",) ), ("Stop3" , ("int",) ), ("Stop4" , ("int",) ), ("Stop5" , ("int",) ), ("Stop6" , ("int",) ), ("Stop7" , ("int",) ), ("Stop8" , ("int",) ), ("Stop9" , ("int",) ), ("Stop10" , ("int",) ), ("Stop11" , ("int",) ), ("Stop12" , ("int",) ), ("Stop13" , ("int",) ), ("Stop14" , ("int",) ), ("Stop15" , ("int",) ), ("Stop16" , ("int",) ), ("Stop17" , ("int",) ), ("Stop18" , ("int",) ), ("Stop19" , ("int",) ), ("Stop20" , ("int",) ), ("Stop21" , ("int",) ), ("Stop22" , ("int",) ), ("Stop23" , ("int",) ), ("Stop24" , ("int",) ), ("Stop25" , ("int",) ), ("Stop26" , ("int",) ), ("Stop27" , ("int",) ), ("Stop28" , ("int",) ), ("Stop29" , ("int",) ), 
("Stop30" , ("int",) ), ("Stop31" , ("int",) ), ("Stop32" , ("int",) ), ("Stop33" , ("int",) ), ("Stop34" , ("int",) ), ("Stop35" , ("int",) ), ("Stop36" , ("int",) ), ("Stop37" , ("int",) ), ("Stop38" , ("int",) ), ("Stop39" , ("int",) ), ("Stop40" , ("int",) ), ("Stop41" , ("int",) ), ("Stop42" , ("int",) ), ("Stop43" , ("int",) ), ("Stop44" , ("int",) ), ("Stop45" , ("int",) ), ("Stop46" , ("int",) ), ("Stop47" , ("int",) ), ("Stop48" , ("int",) ), ("Stop49" , ("int",) ), ("Stop50" , ("int",) )] part = "" engine.table = table engine.create_table() for part in range(1,11): part = str(part) try: print "Inserting data from part " + part + "..." try: engine.table.cleanup = Cleanup() engine.insert_data_from_archive(self.urls["counts"] + "Fifty" + part + ".zip", ["fifty" + part + ".csv"]) except: print "Failed bulk insert on " + part + ", inserting manually." engine.connection.rollback() engine.table.cleanup = Cleanup(correct_invalid_value, nulls=['*']) engine.insert_data_from_archive(self.urls["counts"] + "Fifty" + part + ".zip", ["fifty" + part + ".csv"]) except: print "There was an error in part " + part + "." raise except zipfile.BadZipfile: print "There was an unexpected error in the Breeding Bird Survey archives." raise return engine
def download(self, engine=None, debug=False):
    """Download the Global Wood Density Database workbook and load its
    measurement sheet into ``data`` and its literature sheet into
    ``reference``.

    The xls sheets are first converted to CSV files on disk, then bulk
    inserted through the engine.
    """
    Script.download(self, engine, debug)
    # Python 2 compatibility shim: force a UTF-8 default encoding so
    # non-ASCII reference strings survive the xls -> csv conversion.
    # (On Python 3 `setdefaultencoding` does not exist; the hasattr
    # guard makes this a no-op there.)
    reload(sys)
    if hasattr(sys, 'setdefaultencoding'):
        sys.setdefaultencoding("utf-8")
    self.engine.download_file(self.urls["GWDD"],
                              "GlobalWoodDensityDatabase.xls")
    filename = os.path.basename("GlobalWoodDensityDatabase.xls")
    book = xlrd.open_workbook(self.engine.format_filename(filename))

    # Sheet index 1 holds the wood-density data rows; row 0 is a header.
    sh = book.sheet_by_index(1)
    rows = sh.nrows

    # Convert the data sheet to gwdd_data.csv.
    file_path = self.engine.format_filename("gwdd_data.csv")
    gwdd_data = open_fw(file_path)
    csv_writer = open_csvw(gwdd_data)
    csv_writer.writerow(["Number", "Family", "Binomial", "Wood_Density",
                         "Region", "Reference_Number"])
    for index in range(1, rows):
        # Get each row and format the cell values.
        row = sh.row(index)
        row_as_list = [to_str(column_value.value) for column_value in row]
        csv_writer.writerow(row_as_list)
    gwdd_data.close()

    table = Table("data", delimiter=",")
    table.columns = [("Number", ("pk-int",)),
                     ("Family", ("char",)),
                     ("Binomial", ("char",)),
                     ("Wood_Density", ("double",)),
                     ("Region", ("char",)),
                     ("Reference_Number", ("int",))]
    table.pk = 'Number'
    table.contains_pk = True
    self.engine.table = table
    self.engine.create_table()
    # BUGFIX: `file_path` is already a fully formatted path. The original
    # passed it through `engine.format_filename(...)` a second time — and
    # did so on the bare `engine` parameter (default None) instead of
    # `self.engine`, which crashes when no engine argument is supplied.
    self.engine.insert_data_from_file(file_path)

    # Sheet index 2 holds the literature references; convert to gwdd_ref.csv.
    file_path = self.engine.format_filename("gwdd_ref.csv")
    ref_file = open_fw(file_path)
    csv_writerd = open_csvw(ref_file)
    csv_writerd.writerow(["Reference_Number", "Reference"])
    sh = book.sheet_by_index(2)
    rows = sh.nrows
    for index in range(1, rows):
        # Get each row and format the cell values.
        row = sh.row(index)
        row_as_list = [to_str(column_value.value, object_encoding=sys.stdout)
                       for column_value in row]
        csv_writerd.writerow(row_as_list)
    ref_file.close()

    table = Table("reference", delimiter=",")
    table.columns = [("Reference_Number", ("pk-int",)),
                     ("Reference", ("char",))]
    table.pk = 'Reference_Number'
    table.contains_pk = True
    self.engine.table = table
    self.engine.create_table()
    # Same double-format/None-engine fix as for the data table above.
    self.engine.insert_data_from_file(file_path)
    return self.engine
def download(self, engine=None, debug=False):
    """Download the site, stem, and count data and load the ``sites``,
    ``species``, ``stems``, and ``counts`` tables.

    Each site is one Excel workbook inside a single zip archive. The
    sheets are scanned to build (a) a per-row record of line/taxon/stem
    values and (b) a deduplicated taxonomy used to assign species ids.
    """
    Script.download(self, engine, debug)
    self.engine.auto_create_table(Table("sites"), url=self.urls["sites"])
    self.engine.insert_data_from_url(self.urls["sites"])

    # Fetch the archive once just to enumerate its members, then let the
    # engine extract each member file.
    self.engine.download_file(self.urls["stems"], "all_Excel.zip")
    local_zip = zipfile.ZipFile(self.engine.format_filename("all_Excel.zip"))
    filelist = local_zip.namelist()
    local_zip.close()
    self.engine.download_files_from_archive(self.urls["stems"], filelist)

    filelist = [os.path.basename(filename) for filename in filelist]
    lines = []   # one dict per usable data row across all workbooks
    tax = []     # (family, genus, species, id_level, full_id) tuples
    for filename in filelist:
        print "Extracting data from " + filename + "..."
        book = xlrd.open_workbook(self.engine.format_filename(filename))
        sh = book.sheet_by_index(0)
        rows = sh.nrows
        # cn maps a normalized column name -> column index;
        # "stems" collects the indices of ALL stem columns.
        cn = {'stems': []}
        n = 0
        for c in sh.row(0):
            if not Excel.empty_cell(c):
                cid = Excel.cell_value(c).lower()
                # line number column is sometimes named differently
                if cid in ["sub", "number"]:
                    cid = "line"
                # the "number of individuals" column is named in various
                # different ways; they always at least contain "nd"
                if "nd" in cid:
                    cid = "count"
                # if column is a stem, add it to the list of stems;
                # otherwise, make note of the column name/number
                if "stem" in cid:
                    cn["stems"].append(n)
                else:
                    cn[cid] = n
            n += 1
        # sometimes, a data file does not contain a liana or count column;
        # -1 marks "column absent" for the extraction loop below.
        if not "liana" in cn.keys():
            cn["liana"] = -1
        if not "count" in cn.keys():
            cn["count"] = -1
        for i in range(1, rows):
            row = sh.row(i)
            cellcount = len(row)
            # make sure the row is real, not just empty cells
            if cellcount > 4 and not Excel.empty_cell(row[0]):
                try:
                    this_line = {}

                    def format_value(s):
                        # Title-case the cell and strip characters that
                        # would break the quoted CSV emitted later.
                        s = Excel.cell_value(s)
                        return str(s).title().replace("\\", "/").replace('"', '')

                    # get the following information from the appropriate columns
                    for i in ["line", "family", "genus", "species",
                              "liana", "count"]:
                        if cn[i] > -1:
                            this_line[i] = format_value(row[cn[i]])
                            # '`' appears to stand in for a count of 1
                            # in the raw sheets — TODO confirm.
                            if this_line[i] == '`':
                                this_line[i] = 1
                    this_line["stems"] = [Excel.cell_value(row[c])
                                          for c in cn["stems"]
                                          if not Excel.empty_cell(row[c])]
                    # site code is the workbook name minus its extension
                    this_line["site"] = filename[0:-4]
                    lines.append(this_line)

                    # Check how far the species is identified
                    full_id = 0
                    if len(this_line["species"]) < 3:
                        if len(this_line["genus"]) < 3:
                            id_level = "family"
                        else:
                            id_level = "genus"
                    else:
                        id_level = "species"
                        full_id = 1
                    tax.append((this_line["family"],
                                this_line["genus"],
                                this_line["species"].lower().replace('\\', '').replace('"', ''),
                                id_level,
                                str(full_id)))
                except:
                    raise
                    pass  # NOTE(review): unreachable after raise

    tax = sorted(tax, key=lambda group: group[0] + " " + group[1] + " " + group[2])
    unique_tax = []
    tax_dict = dict()  # (family, genus, species) -> 1-based species_id
    tax_count = 0
    # Get all unique families/genera/species
    for group in tax:
        if not (group in unique_tax):
            unique_tax.append(group)
            tax_count += 1
            tax_dict[group[0:3]] = tax_count
            if tax_count % 10 == 0:
                # in-place progress counter (backspaces rewrite the line)
                msg = "Generating taxonomic groups: " + str(tax_count) + " / " + str(TAX_GROUPS)
                sys.stdout.write(msg + "\b" * len(msg))
    print "Generating taxonomic groups: " + str(TAX_GROUPS) + " / " + str(TAX_GROUPS)

    # Create species table
    table = Table("species", delimiter=",")
    table.columns = [("species_id", ("pk-int",)),
                     ("family", ("char",)),
                     ("genus", ("char",)),
                     ("species", ("char",)),
                     ("id_level", ("char", 10)),
                     ("full_id", ("bool",))]
    data = [','.join([str(tax_dict[group[:3]])] + ['"%s"' % g for g in group])
            for group in unique_tax]
    table.pk = 'species_id'
    table.contains_pk = True
    self.engine.table = table
    self.engine.create_table()
    self.engine.add_to_table(data)

    # Create stems table
    table = Table("stems", delimiter=",", contains_pk=False)
    table.columns = [("stem_id", ("pk-auto",)),
                     ("line", ("int",)),
                     ("species_id", ("int",)),
                     ("site_code", ("char", 12)),
                     ("liana", ("char", 10)),
                     ("stem", ("double",))]
    stems = []
    counts = []
    for line in lines:
        try:
            liana = line["liana"]
        except KeyError:
            liana = ""
        species_info = [line["line"],
                        tax_dict[(line["family"], line["genus"],
                                  line["species"].lower())],
                        line["site"],
                        liana]
        try:
            # rows without a count column raise KeyError and are skipped
            counts.append([str(value) for value in species_info + [line["count"]]])
        except KeyError:
            pass
        # one stems-table row per non-empty stem measurement
        for i in line["stems"]:
            stem = species_info + [i]
            stems.append([str(value) for value in stem])
    data = [','.join(stem) for stem in stems]
    self.engine.table = table
    self.engine.create_table()
    self.engine.add_to_table(data)

    # Create counts table
    table = Table("counts", delimiter=",", contains_pk=False)
    table.columns = [("count_id", ("pk-auto",)),
                     ("line", ("int",)),
                     ("species_id", ("int",)),
                     ("site_code", ("char", 12)),
                     ("liana", ("char", 10)),
                     ("count", ("double",))]
    data = [','.join(count) for count in counts]
    self.engine.table = table
    self.engine.create_table()
    self.engine.add_to_table(data)
    return self.engine
def download(self, engine=None, debug=False):
    """Download and load the aquatic animal excretion dataset.

    Loads two tables: ``aquatic_animals`` (the measurements) and
    ``variable_descriptions`` (the data dictionary for those columns).
    """
    Script.download(self, engine, debug)
    engine = self.engine
    filenames = ['Aquatic_animal_excretion_data.csv',
                 'Aquatic_animal_excretion_variable_descriptions.csv']
    # Download the archive at most once, and only when one of the
    # expected files is missing. (The original looped per file and could
    # trigger the same archive download multiple times.)
    if any(not os.path.isfile(engine.format_filename(name))
           for name in filenames):
        engine.download_files_from_archive(self.urls["aquatic_animals"],
                                           filenames, "zip")

    # processing Aquatic_animal_excretion_data.csv
    filename = 'Aquatic_animal_excretion_data.csv'
    table = Table('aquatic_animals', delimiter=',')
    table.columns = [
        ("index", ("pk-int",)),
        ("sourcenumber", ("int",)),
        ("sourcename", ("char",)),
        ("speciesname", ("char",)),
        ("speciescode", ("char",)),
        ("invert/vert", ("char",)),
        ("phylum", ("char",)),
        ("class", ("char",)),
        ("order", ("char",)),
        ("family", ("char",)),
        ("trophicgild", ("char",)),
        ("drymass", ("double",)),
        ("logdrymass", ("double",)),
        ("ecosystemtype", ("char",)),
        ("energysource", ("char",)),
        ("habitat", ("char",)),
        ("residentecosystem", ("char",)),
        ("temperature", ("double",)),
        ("nexcretionrate", ("double",)),
        ("pexcretionrate", ("double",)),
        ("lognexcretionrate", ("double",)),
        ("logpexcretionrate", ("double",)),
        ("incubationtime", ("double",)),
        ("nform", ("char",)),
        ("pform", ("char",)),
        ("bodyc", ("double",)),
        ("bodyn", ("double",)),
        ("bodyp", ("double",)),
        ("bodyc:n", ("double",)),
        ("bodyc:p", ("double",)),
        ("bodyn:p", ("double",)),
        ("bodydatasource", ("char",)),
        ("datasource", ("char",)),
        ("dataproviders", ("char",))]
    engine.table = table
    engine.create_table()
    engine.insert_data_from_file(engine.format_filename(filename))

    # processing Aquatic_animal_excretion_variable_descriptions.csv
    filename = 'Aquatic_animal_excretion_variable_descriptions.csv'
    table = Table('variable_descriptions', delimiter=',')
    table.columns = [
        ("Column", ("char",)),
        ("Variable", ("char",)),
        ("Description", ("char",)),
        ("Data Class", ("char",)),
        ("Units", ("char",)),
        ("Minimum_value", ("char",)),
        ("Maximum_value", ("char",)),
        ("Possible_values", ("char",)),
        ("Missing_data_symbol", ("char",)),
        ("Notes", ("char",))]
    engine.table = table
    engine.create_table()
    engine.insert_data_from_file(engine.format_filename(filename))
def download(self, engine=None, debug=False):
    """Download the Global Wood Density Database workbook and load its
    measurement sheet into ``data`` and its literature sheet into
    ``reference`` via in-memory tab-delimited rows.
    """
    Script.download(self, engine, debug)
    self.engine.download_file(self.urls["GWDD"],
                              "GlobalWoodDensityDatabase.xls")
    filename = os.path.basename("GlobalWoodDensityDatabase.xls")
    book = xlrd.open_workbook(self.engine.format_filename(filename))

    def clean_cell(cell):
        # Title-case the cell value and strip characters that would
        # break the tab-delimited rows built below.
        raw = Excel.cell_value(cell)
        return str(raw).title().replace("\\", "/").replace('"', '')

    def sheet_rows(sheet, column_count):
        # Yield each non-empty data row (row 0 is the header) as a list
        # of cleaned cell strings, one per expected column.
        for row_num in range(1, sheet.nrows):
            row = sheet.row(row_num)
            if all(Excel.empty_cell(cell) for cell in row):
                continue
            yield [clean_cell(row[col]) for col in range(column_count)]

    # Creating data table (sheet index 1)
    data_sheet = book.sheet_by_index(1)
    data_records = list(sheet_rows(data_sheet, 6))

    table = Table("data", delimiter="\t")
    table.columns = [("Number", ("pk-int",)),
                     ("Family", ("char",)),
                     ("Binomial", ("char",)),
                     ("Wood_Density", ("double",)),
                     ("Region", ("char",)),
                     ("Reference_Number", ("int",))]
    table.pk = 'Number'
    table.contains_pk = True
    self.engine.table = table
    self.engine.create_table()
    self.engine.add_to_table(['\t'.join(record) for record in data_records])

    # Creating reference table (sheet index 2)
    ref_sheet = book.sheet_by_index(2)
    ref_records = list(sheet_rows(ref_sheet, 2))

    table = Table("reference", delimiter="\t")
    table.columns = [("Reference_Number", ("pk-int",)),
                     ("Reference", ("char",))]
    table.pk = 'Reference_Number'
    table.contains_pk = True
    self.engine.table = table
    self.engine.create_table()
    self.engine.add_to_table(['\t'.join(record) for record in ref_records])
    return self.engine
def download(self, engine=None, debug=False):
    """Download the PREDICTS database archive (if not already present)
    and load it into the ``predicts_main`` table.
    """
    Script.download(self, engine, debug)
    engine = self.engine
    filename = "database.csv"
    tablename = "predicts_main"
    table = Table(str(tablename), delimiter=',')
    # Explicit schema: the source CSV is too wide/heterogeneous for
    # automatic type inference.
    table.columns = [
        # study / source metadata
        ("Source_ID", ("char",)),
        ("Reference", ("char",)),
        ("Study_number", ("int",)),
        ("Study_name", ("char",)),
        ("SS", ("char",)),
        ("Diversity_metric", ("char",)),
        ("Diversity_metric_unit", ("char",)),
        ("Diversity_metric_type", ("char",)),
        ("Diversity_metric_is_effort_sensitive", ("char",)),
        ("Diversity_metric_is_suitable_for_Chao", ("char",)),
        ("Sampling_method", ("char",)),
        ("Sampling_effort_unit", ("char",)),
        ("Study_common_taxon", ("char",)),
        ("Rank_of_study_common_taxon", ("char",)),
        # site / sampling metadata
        ("Site_number", ("int",)),
        ("Site_name", ("char",)),
        ("Block", ("char",)),
        ("SSS", ("char",)),
        ("SSB", ("char",)),
        ("SSBS", ("char",)),
        ("Sample_start_earliest", ("char",)),
        ("Sample_end_latest", ("char",)),
        ("Sample_midpoint", ("char",)),
        ("Sample_date_resolution", ("char",)),
        ("Max_linear_extent_metres", ("double",)),
        ("Habitat_patch_area_square_metres", ("double",)),
        ("Sampling_effort", ("double",)),
        ("Rescaled_sampling_effort", ("double",)),
        ("Habitat_as_described", ("char",)),
        ("Predominant_land_use", ("char",)),
        ("Source_for_predominant_land_use", ("char",)),
        ("Use_intensity", ("char",)),
        ("Km_to_nearest_edge_of_habitat", ("double",)),
        ("Years_since_fragmentation_or_conversion", ("double",)),
        ("Transect_details", ("char",)),
        # location
        ("Coordinates_method", ("char",)),
        ("Longitude", ("double",)),
        ("Latitude", ("double",)),
        ("Country_distance_metres", ("double",)),
        # BUGFIX: was ("Country", ("char")) — missing comma made the type
        # a bare string instead of a tuple, unlike every other column.
        ("Country", ("char",)),
        ("UN_subregion", ("char",)),
        ("UN_region", ("char",)),
        ("Ecoregion_distance_metres", ("double",)),
        ("Ecoregion", ("char",)),
        ("Biome", ("char",)),
        ("Realm", ("char",)),
        ("Hotspot", ("char",)),
        ("Wilderness_area", ("char",)),
        # taxon / measurement
        ("N_samples", ("double",)),
        ("Taxon_number", ("double",)),
        ("Taxon_name_entered", ("char",)),
        ("Indication", ("char",)),
        ("Parsed_name", ("char",)),
        ("Taxon", ("char",)),
        ("COL_ID", ("double",)),
        ("Name_status", ("char",)),
        ("Rank", ("char",)),
        ("Kingdom", ("char",)),
        ("Phylum", ("char",)),
        ("Class", ("char",)),
        ("Order", ("char",)),
        ("Family", ("char",)),
        ("Genus", ("char",)),
        ("Species", ("char",)),
        ("Best_guess_binomial", ("char",)),
        ("Higher_taxa", ("char",)),
        ("Higher_taxon", ("char",)),
        ("Measurement", ("double",)),
        ("Effort_corrected_measurement", ("double",))]
    engine.table = table
    # Skip the (large) download when the CSV is already on disk.
    if not os.path.isfile(engine.format_filename(filename)):
        engine.download_files_from_archive(self.urls["PREDICTS"],
                                           [filename], "zip", False,
                                           "download.zip")
    engine.create_table()
    engine.insert_data_from_file(engine.format_filename(str(filename)))
def download(self, engine=None, debug=False):
    """Download the VertNet reptiles archive (if not already present)
    and load it into the ``reptiles`` table.

    The column list is an explicit, fixed schema for the VertNet CSV;
    nearly all fields are stored as raw text ("char"), with a few
    derived flags/measurements typed as int/double at the end.
    """
    Script.download(self, engine, debug)
    engine = self.engine
    filename = 'vertnet_latest_reptiles.csv'
    tablename = 'reptiles'
    table = Table(str(tablename), delimiter=',')
    table.columns = [
        ("record_id", ("pk-auto",)),
        # dataset / publisher metadata
        ("beginrecord", ("char",)), ("icode", ("char",)), ("title", ("char",)),
        ("citation", ("char",)), ("contact", ("char",)), ("email", ("char",)),
        ("emlrights", ("char",)), ("gbifdatasetid", ("char",)),
        ("gbifpublisherid", ("char",)), ("doi", ("char",)),
        ("migrator", ("char",)), ("networks", ("char",)),
        ("orgcountry", ("char",)), ("orgname", ("char",)),
        ("orgstateprovince", ("char",)), ("pubdate", ("char",)),
        ("source_url", ("char",)), ("iptrecordid", ("char",)),
        # occurrence, event, and location fields
        ("associatedmedia", ("char",)), ("associatedoccurrences", ("char",)),
        ("associatedorganisms", ("char",)), ("associatedreferences", ("char",)),
        ("associatedsequences", ("char",)), ("associatedtaxa", ("char",)),
        ("bed", ("char",)), ("behavior", ("char",)),
        ("catalognumber", ("char",)), ("continent", ("char",)),
        ("coordinateprecision", ("char",)),
        ("coordinateuncertaintyinmeters", ("char",)),
        ("country", ("char",)), ("countrycode", ("char",)),
        ("county", ("char",)), ("dateidentified", ("char",)),
        ("day", ("char",)), ("decimallatitude", ("char",)),
        ("decimallongitude", ("char",)), ("disposition", ("char",)),
        ("earliestageorloweststage", ("char",)),
        ("earliesteonorlowesteonothem", ("char",)),
        ("earliestepochorlowestseries", ("char",)),
        ("earliesteraorlowesterathem", ("char",)),
        ("earliestperiodorlowestsystem", ("char",)),
        ("enddayofyear", ("char",)), ("establishmentmeans", ("char",)),
        ("eventdate", ("char",)), ("eventid", ("char",)),
        ("eventremarks", ("char",)), ("eventtime", ("char",)),
        ("fieldnotes", ("char",)), ("fieldnumber", ("char",)),
        ("footprintspatialfit", ("char",)), ("footprintsrs", ("char",)),
        ("footprintwkt", ("char",)), ("formation", ("char",)),
        ("geodeticdatum", ("char",)), ("geologicalcontextid", ("char",)),
        ("georeferencedby", ("char",)), ("georeferenceddate", ("char",)),
        ("georeferenceprotocol", ("char",)),
        ("georeferenceremarks", ("char",)),
        ("georeferencesources", ("char",)),
        ("georeferenceverificationstatus", ("char",)),
        ("group", ("char",)), ("habitat", ("char",)),
        ("highergeography", ("char",)), ("highergeographyid", ("char",)),
        ("highestbiostratigraphiczone", ("char",)),
        ("identificationid", ("char",)),
        ("identificationqualifier", ("char",)),
        ("identificationreferences", ("char",)),
        ("identificationremarks", ("char",)),
        ("identificationverificationstatus", ("char",)),
        ("identifiedby", ("char",)), ("individualcount", ("char",)),
        ("island", ("char",)), ("islandgroup", ("char",)),
        ("latestageorhigheststage", ("char",)),
        ("latesteonorhighesteonothem", ("char",)),
        ("latestepochorhighestseries", ("char",)),
        ("latesteraorhighesterathem", ("char",)),
        ("latestperiodorhighestsystem", ("char",)),
        ("lifestage", ("char",)), ("lithostratigraphicterms", ("char",)),
        ("locality", ("char",)), ("locationaccordingto", ("char",)),
        ("locationid", ("char",)), ("locationremarks", ("char",)),
        ("lowestbiostratigraphiczone", ("char",)),
        ("materialsampleid", ("char",)),
        ("maximumdepthinmeters", ("char",)),
        ("maximumdistanceabovesurfaceinmeters", ("char",)),
        ("maximumelevationinmeters", ("char",)),
        ("member", ("char",)), ("minimumdepthinmeters", ("char",)),
        ("minimumdistanceabovesurfaceinmeters", ("char",)),
        ("minimumelevationinmeters", ("char",)),
        ("month", ("char",)), ("municipality", ("char",)),
        ("occurrenceid", ("char",)), ("occurrenceremarks", ("char",)),
        ("occurrencestatus", ("char",)), ("organismid", ("char",)),
        ("organismname", ("char",)), ("organismremarks", ("char",)),
        ("organismscope", ("char",)), ("othercatalognumbers", ("char",)),
        ("pointradiusspatialfit", ("char",)), ("preparations", ("char",)),
        ("previousidentifications", ("char",)), ("recordedby", ("char",)),
        ("recordnumber", ("char",)), ("reproductivecondition", ("char",)),
        ("samplingeffort", ("char",)), ("samplingprotocol", ("char",)),
        ("sex", ("char",)), ("startdayofyear", ("char",)),
        ("stateprovince", ("char",)), ("typestatus", ("char",)),
        ("verbatimcoordinates", ("char",)),
        ("verbatimcoordinatesystem", ("char",)),
        ("verbatimdepth", ("char",)), ("verbatimelevation", ("char",)),
        ("verbatimeventdate", ("char",)), ("verbatimlatitude", ("char",)),
        ("verbatimlocality", ("char",)), ("verbatimlongitude", ("char",)),
        ("verbatimsrs", ("char",)), ("waterbody", ("char",)),
        ("year", ("char",)),
        # record-level / rights metadata
        ("dctype", ("char",)), ("modified", ("char",)),
        ("language", ("char",)), ("license", ("char",)),
        ("rightsholder", ("char",)), ("accessrights", ("char",)),
        ("bibliographiccitation", ("char",)), ("dc_references", ("char",)),
        ("institutionid", ("char",)), ("collectionid", ("char",)),
        ("datasetid", ("char",)), ("institutioncode", ("char",)),
        ("collectioncode", ("char",)), ("datasetname", ("char",)),
        ("ownerinstitutioncode", ("char",)), ("basisofrecord", ("char",)),
        ("informationwithheld", ("char",)),
        ("datageneralizations", ("char",)),
        ("dynamicproperties", ("char",)),
        # taxonomy
        ("scientificnameid", ("char",)), ("namepublishedinid", ("char",)),
        ("scientificname", ("char",)), ("acceptednameusage", ("char",)),
        ("originalnameusage", ("char",)), ("namepublishedin", ("char",)),
        ("namepublishedinyear", ("char",)),
        ("higherclassification", ("char",)),
        ("kingdom", ("char",)), ("phylum", ("char",)),
        ("class", ("char",)), ("order", ("char",)),
        ("family", ("char",)), ("genus", ("char",)),
        ("subgenus", ("char",)), ("specificepithet", ("char",)),
        ("infraspecificepithet", ("char",)), ("taxonrank", ("char",)),
        ("verbatimtaxonrank", ("char",)),
        ("scientificnameauthorship", ("char",)),
        ("vernacularname", ("char",)), ("nomenclaturalcode", ("char",)),
        ("taxonomicstatus", ("char",)),
        # derived flags and measurements (typed)
        ("keyname", ("char",)), ("haslicense", ("int",)),
        ("vntype", ("char",)), ("rank", ("int",)),
        ("mappable", ("int",)), ("hashid", ("char",)),
        ("hastypestatus", ("int",)), ("wascaptive", ("int",)),
        ("wasinvasive", ("int",)), ("hastissue", ("int",)),
        ("hasmedia", ("int",)), ("isfossil", ("int",)),
        ("haslength", ("int",)), ("haslifestage", ("int",)),
        ("hasmass", ("int",)), ("hassex", ("int",)),
        ("lengthinmm", ("double",)), ("massing", ("double",)),
        ("lengthunitsinferred", ("char",)),
        ("massunitsinferred", ("char",)),
        ("underivedlifestage", ("char",)), ("underivedsex", ("char",))]
    engine.table = table
    # Skip the (large) download when the CSV is already on disk.
    if not os.path.isfile(engine.format_filename(filename)):
        engine.download_files_from_archive(self.urls[tablename],
                                           [filename], "zip", False,
                                           "vertnet_latest_" + str(tablename))
    engine.create_table()
    engine.insert_data_from_file(engine.format_filename(str(filename)))
def download(self, engine=None, debug=False):
    """Fetch the FAO capture archive, convert its DSD workbook to CSV,
    and load all component tables.
    """
    Script.download(self, engine, debug)
    engine = self.engine
    engine.download_files_from_archive(self.urls["capture"],
                                       archive_type="zip")

    # Convert DSD_FI_CAPTURE.xlsx (sheet 0) into DSD_CAPTURE.csv.
    workbook = xlrd.open_workbook(
        self.engine.format_filename("DSD_FI_CAPTURE.xlsx"))
    sheet = workbook.sheet_by_index(0)
    csv_path = self.engine.format_filename("DSD_CAPTURE.csv")
    out_file = open_fw(csv_path)
    writer = open_csvw(out_file)
    writer.writerow(["Order", "Concept_id", "Role_Type", "Codelist_id",
                     "Codelist_Code_id", "Description"])
    # Data rows start at index 2; each cell value is stringified.
    for row_index in range(2, sheet.nrows):
        writer.writerow([to_str(cell.value)
                         for cell in sheet.row(row_index)])
    out_file.close()

    # Tables whose schemas the engine can infer automatically.
    auto_tables = [
        ('CL_FI_UNIT.csv', 'unit_data'),
        ('CL_FI_WATERAREA_GROUPS.csv', 'waterarea_groups'),
        ('DSD_CAPTURE.csv', 'dsd_capture_data'),
        ('CL_FI_SPECIES_GROUPS.csv', 'species_group'),
    ]
    for csv_name, table_name in auto_tables:
        source_path = self.engine.format_filename(csv_name)
        auto_table = Table(table_name, delimiter=',',
                           cleanup=self.cleanup_func_table)
        self.engine.auto_create_table(auto_table, filename=csv_name)
        self.engine.insert_data_from_file(source_path)

    # File CL_FI_COUNTRY_GROUPS.csv has multi encoding, so its schema is
    # declared explicitly instead of inferred.
    country_csv = 'CL_FI_COUNTRY_GROUPS.csv'
    country_table = Table('country_groups', delimiter=',',
                          cleanup=self.cleanup_func_table)
    country_table.columns = [
        ('UN_Code', ('int', )),
        ('Identifier', ('int', )),
        ('ISO2_Code', ('char', '5')),
        ('ISO3_Code', ('char', '5')),
        ('Name_En', ('char', '50')),
        ('Name_Fr', ('char', '50')),
        ('Name_Es', ('char', '50')),
        ('Name_Ar', ('char', '120')),
        ('Name_Cn', ('char', '90')),
        ('Name_Ru', ('char', '150')),
        ('Official_Name_En', ('char', '70')),
        ('Official_Name_Fr', ('char', '70')),
        ('Official_Name_Es', ('char', '70')),
        ('Official_Name_Ar', ('char', '1100')),
        ('Official_Name_Cn', ('char', '70')),
        ('Official_Name_Ru', ('char', '130')),
        ('Continent_Group', ('char', '15')),
        ('EcoClass_Group', ('char', '50')),
        ('GeoRegion_Group', ('char', '30'))]
    self.engine.auto_create_table(country_table, filename=country_csv)
    self.engine.insert_data_from_file(
        self.engine.format_filename(country_csv))

    # TS_FI_CAPTURE.csv also gets an explicit schema.
    ts_csv = 'TS_FI_CAPTURE.csv'
    ts_table = Table('ts_capture_data', delimiter=',',
                     cleanup=self.cleanup_func_table)
    ts_table.columns = [
        ('COUNTRY', ('int', )),
        ('FISHING_AREA', ('int', )),
        ('SPECIES', ('char', '10')),
        ('YEAR', ('int', )),
        ('UNIT', ('char', '5')),
        ('QUANTITY', ('double', )),
        ('SYMBOL', ('char', '4'))]
    self.engine.auto_create_table(ts_table, filename=ts_csv)
    self.engine.insert_data_from_file(self.engine.format_filename(ts_csv))