def download(self, engine=None, debug=False):
    Script.download(self, engine, debug)
    engine = self.engine
    filenames = [
        'Aquatic_animal_excretion_data.csv',
        'Aquatic_animal_excretion_variable_descriptions.csv',
    ]
    for file_path in filenames:
        if not os.path.isfile(engine.format_filename(file_path)):
            url = self.urls["aquatic_animals"]
            engine.download_files_from_archive(url, filenames, "zip")

    # Processing Aquatic_animal_excretion_data.csv
    filename = 'Aquatic_animal_excretion_data.csv'
    tablename = 'aquatic_animals'
    table = Table(tablename, delimiter=',')
    table.columns = [
        ("index", ("pk-int",)), ("sourcenumber", ("int",)),
        ("sourcename", ("char",)), ("speciesname", ("char",)),
        ("speciescode", ("char",)), ("invert/vert", ("char",)),
        ("phylum", ("char",)), ("class", ("char",)),
        ("order", ("char",)), ("family", ("char",)),
        ("trophicgild", ("char",)), ("drymass", ("double",)),
        ("logdrymass", ("double",)), ("ecosystemtype", ("char",)),
        ("energysource", ("char",)), ("habitat", ("char",)),
        ("residentecosystem", ("char",)), ("temperature", ("double",)),
        ("nexcretionrate", ("double",)), ("pexcretionrate", ("double",)),
        ("lognexcretionrate", ("double",)), ("logpexcretionrate", ("double",)),
        ("incubationtime", ("double",)), ("nform", ("char",)),
        ("pform", ("char",)), ("bodyc", ("double",)),
        ("bodyn", ("double",)), ("bodyp", ("double",)),
        ("bodyc:n", ("double",)), ("bodyc:p", ("double",)),
        ("bodyn:p", ("double",)), ("bodydatasource", ("char",)),
        ("datasource", ("char",)), ("dataproviders", ("char",)),
    ]
    engine.table = table
    engine.create_table()
    engine.insert_data_from_file(engine.format_filename(filename))

    # Processing Aquatic_animal_excretion_variable_descriptions.csv
    filename = 'Aquatic_animal_excretion_variable_descriptions.csv'
    tablename = 'variable_descriptions'
    table = Table(tablename, delimiter=',')
    table.columns = [
        ("Column", ("char",)), ("Variable", ("char",)),
        ("Description", ("char",)), ("Data Class", ("char",)),
        ("Units", ("char",)), ("Minimum_value", ("char",)),
        ("Maximum_value", ("char",)), ("Possible_values", ("char",)),
        ("Missing_data_symbol", ("char",)), ("Notes", ("char",)),
    ]
    engine.table = table
    engine.create_table()
    engine.insert_data_from_file(engine.format_filename(filename))
def download(self, engine=None, debug=False):
    Script.download(self, engine, debug)

    # structure_plot_year table
    self.engine.auto_create_table(Table("structure_plot_year"),
                                  url=self.urls["structure_plot_year"])
    self.engine.insert_data_from_url(self.urls["structure_plot_year"])

    # plots table
    self.engine.auto_create_table(Table("plots"), url=self.urls["plots"])
    self.engine.insert_data_from_url(self.urls["plots"])

    # species table
    self.engine.download_file(self.urls["species"],
                              "original_MSH_SPECIES_DESCRIPTORS.csv")
    data_path = self.engine.format_filename("MSH_SPECIES_DESCRIPTORS.csv")
    old_data = os.path.normpath(
        self.engine.find_file("original_MSH_SPECIES_DESCRIPTORS.csv"))
    # Copy the file line by line; the `with` block closes both handles, and
    # lines are written as text (writing encoded bytes to a text-mode file
    # fails on Python 3)
    with open(old_data, 'r') as infile, open(data_path, 'w') as new_data:
        for line in infile:
            new_data.write(line)
    self.engine.auto_create_table(Table("species"),
                                  filename="MSH_SPECIES_DESCRIPTORS.csv")
    self.engine.insert_data_from_file(data_path)

    # species_plot_year table
    table = Table("species_plot_year")
    table.delimiter = ','
    table.columns = [
        ('record_id', ('pk-auto',)),
        ('plot_id_year', ('char',)),
        ('plot_name', ('char',)),
        ('plot_number', ('int',)),
        ('year', ('int',)),
        ('species', ('ct_column',)),
        ('count', ('ct-double',)),
    ]
    table.ct_column = 'species'
    table.ct_names = [
        'Abilas', 'Abipro', 'Achmil', 'Achocc', 'Agoaur', 'Agrexa', 'Agrpal',
        'Agrsca', 'Alnvir', 'Anamar', 'Antmic', 'Antros', 'Aqifor', 'Arcnev',
        'Arnlat', 'Astled', 'Athdis', 'Blespi', 'Brocar', 'Brosit', 'Carmer',
        'Carmic', 'Carpac', 'Carpay', 'Carpha', 'Carros', 'Carspe', 'Casmin',
        'Chaang', 'Cirarv', 'Cisumb', 'Crycas', 'Danint', 'Descae', 'Elyely',
        'Epiana', 'Eriova', 'Eripyr', 'Fesocc', 'Fravir', 'Gencal', 'Hiealb',
        'Hiegra', 'Hyprad', 'Junmer', 'Junpar', 'Juncom', 'Leppun', 'Lommar',
        'Luepec', 'Luihyp', 'Luplat', 'Luplep', 'Luzpar', 'Maiste', 'Pencar',
        'Pencon', 'Penser', 'Phahas', 'Phlalp', 'Phldif', 'Phyemp', 'Pincon',
        'Poasec', 'Poldav', 'Polmin', 'Pollon', 'Poljun', 'Popbal', 'Potarg',
        'Psemen', 'Raccan', 'Rumace', 'Salsit', 'Saxfer', 'Senspp', 'Sibpro',
        'Sorsit', 'Spiden', 'Trispi', 'Tsumer', 'Vacmem', 'Vervir', 'Vioadu',
        'Xerten']
    self.engine.table = table
    self.engine.create_table()
    self.engine.insert_data_from_url(self.urls["species_plot_year"])
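# Aside: a minimal, self-contained sketch (hypothetical helper, not part of
# the retriever API) of what the ct_column/ct_names cross-tab declaration
# above asks the engine to do: unfold one wide row with per-species count
# columns into long (keys..., species, count) rows.
def _unfold_crosstab(row, key_fields, ct_names):
    """Yield one (keys..., name, value) tuple per populated ct column."""
    keys = [row[field] for field in key_fields]
    for name in ct_names:
        if row.get(name) not in (None, ''):
            yield tuple(keys + [name, row[name]])

# Hypothetical wide record, in the shape of species_plot_year above:
wide = {'plot_name': 'ABPL', 'year': 2010, 'Abilas': 3, 'Abipro': 0.5}
for long_row in _unfold_crosstab(wide, ['plot_name', 'year'],
                                 ['Abilas', 'Abipro']):
    print(long_row)  # ('ABPL', 2010, 'Abilas', 3), ('ABPL', 2010, 'Abipro', 0.5)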
def download(self, engine=None, debug=False): Script.download(self, engine, debug) engine = self.engine files = ["Macroplot_data_Rev.txt", "Microplot_data.txt", "Site_variables.txt", "Species_list.txt"] engine.download_files_from_archive(self.urls["data"], files, filetype="zip") # Create table species engine.auto_create_table(Table('species', cleanup=Cleanup(correct_invalid_value, nulls=['NA'])), filename="Species_list.txt") engine.insert_data_from_file(engine.format_filename("Species_list.txt")) # Create table sites engine.auto_create_table(Table('sites', cleanup=Cleanup(correct_invalid_value, nulls=['NA'])), filename="Site_variables.txt") engine.insert_data_from_file(engine.format_filename("Site_variables.txt")) # Create table microplots table = Table('microplots') table.columns = [('record_id', ('pk-auto',)), ('SpCode', ('char', '30')), ('Count', ('ct-int',))] table.ct_names = ['BSP1', 'BSP2', 'BSP3', 'BSP4', 'BSP5', 'BSP6', 'BSP7', 'BSP8', 'BSP9', 'BSP10', 'BSP11', 'BSP12', 'BSP13', 'BSP14', 'BSP15', 'BSP16', 'BSP17', 'BSP18', 'BSP20', 'BSP21', 'BSP22', 'BSP23', 'BSP24', 'BSP25', 'BSP26', 'BSP27', 'BSP28', 'BSP29', 'BSP30', 'BSP31', 'BSP33', 'BSP34', 'BSP35', 'BSP36', 'BSP37', 'BSP41', 'BSP42', 'BSP43', 'BSP44', 'BSP45', 'BSP46', 'BSP47', 'BSP48', 'BSP49', 'BSP50', 'BSP51', 'BSP52', 'BSP53', 'BSP54', 'BSP55', 'BSP56', 'BSP57', 'BSP58', 'BSP59', 'BSP60', 'BSP61', 'BSP62', 'BSP63', 'BSP64', 'BSP65', 'BSP66', 'BSP67', 'BSP68', 'BSP69', 'BSP70', 'BSP71', 'BSP72', 'BSP73', 'BSP74', 'BSP75', 'BSP76', 'BSP78', 'BSP79', 'BSP80', 'BSP82', 'BSP83', 'BSP84', 'BSP85', 'BSP86', 'BSP87', 'BSP88', 'BSP89', 'BSP90', 'BSP91', 'BSP92', 'BSP93', 'BSP94', 'BSP95', 'BSP96', 'BSP97', 'BSP98', 'BSP99', 'BSP100', 'BSP101', 'BSP102', 'BSP104'] table.ct_column = 'PlotID' engine.auto_create_table(table, filename="Microplot_data.txt") engine.insert_data_from_file(engine.format_filename("Microplot_data.txt")) # Create table microplots table = Table('macroplots') table.ct_names = ['TreeGirth1', 'TreeGirth2', 'TreeGirth3', 'TreeGirth4', 'TreeGirth5'] table.ct_column = 'Tree' table.columns = [('record_id', ('pk-auto',)), ('PlotID', ('char', '20')), ('SpCode', ('char', '30')), ('Girth', ('ct-int',))] engine.auto_create_table(table, filename="Macroplot_data_Rev.txt") engine.insert_data_from_file(engine.format_filename("Macroplot_data_Rev.txt"))
def download(self, engine=None, debug=False): Script.download(self, engine, debug) engine = self.engine files = ["Macroplot_data_Rev.txt", "Microplot_data.txt", "Site_variables.txt", "Species_list.txt"] engine.download_files_from_archive(self.urls["data"], files, filetype="zip") # Create table species engine.auto_create_table(Table('species', cleanup=self.cleanup_func_table), filename="Species_list.txt") engine.insert_data_from_file(engine.format_filename("Species_list.txt")) # Create table sites engine.auto_create_table(Table('sites', cleanup=self.cleanup_func_table), filename="Site_variables.txt") engine.insert_data_from_file(engine.format_filename("Site_variables.txt")) # Create table microplots table = Table('microplots') table.columns = [('record_id', ('pk-auto',)), ('SpCode', ('char', '30')), ('Count', ('ct-int',))] table.ct_names = ['BSP1', 'BSP2', 'BSP3', 'BSP4', 'BSP5', 'BSP6', 'BSP7', 'BSP8', 'BSP9', 'BSP10', 'BSP11', 'BSP12', 'BSP13', 'BSP14', 'BSP15', 'BSP16', 'BSP17', 'BSP18', 'BSP20', 'BSP21', 'BSP22', 'BSP23', 'BSP24', 'BSP25', 'BSP26', 'BSP27', 'BSP28', 'BSP29', 'BSP30', 'BSP31', 'BSP33', 'BSP34', 'BSP35', 'BSP36', 'BSP37', 'BSP41', 'BSP42', 'BSP43', 'BSP44', 'BSP45', 'BSP46', 'BSP47', 'BSP48', 'BSP49', 'BSP50', 'BSP51', 'BSP52', 'BSP53', 'BSP54', 'BSP55', 'BSP56', 'BSP57', 'BSP58', 'BSP59', 'BSP60', 'BSP61', 'BSP62', 'BSP63', 'BSP64', 'BSP65', 'BSP66', 'BSP67', 'BSP68', 'BSP69', 'BSP70', 'BSP71', 'BSP72', 'BSP73', 'BSP74', 'BSP75', 'BSP76', 'BSP78', 'BSP79', 'BSP80', 'BSP82', 'BSP83', 'BSP84', 'BSP85', 'BSP86', 'BSP87', 'BSP88', 'BSP89', 'BSP90', 'BSP91', 'BSP92', 'BSP93', 'BSP94', 'BSP95', 'BSP96', 'BSP97', 'BSP98', 'BSP99', 'BSP100', 'BSP101', 'BSP102', 'BSP104'] table.ct_column = 'PlotID' engine.auto_create_table(table, filename="Microplot_data.txt") engine.insert_data_from_file(engine.format_filename("Microplot_data.txt")) # Create table microplots table = Table('macroplots') table.ct_names = ['TreeGirth1', 'TreeGirth2', 'TreeGirth3', 'TreeGirth4', 'TreeGirth5'] table.ct_column = 'Tree' table.columns = [('record_id', ('pk-auto',)), ('PlotID', ('char', '20')), ('SpCode', ('char', '30')), ('Girth', ('ct-int',))] engine.auto_create_table(table, filename="Macroplot_data_Rev.txt") engine.insert_data_from_file(engine.format_filename("Macroplot_data_Rev.txt"))
def download(self, engine=None, debug=False):
    Script.download(self, engine, debug)
    engine = self.engine
    filename = 'Aquatic_animal_excretion_data.csv'
    tablename = 'aquatic_animals'
    table = Table(tablename, delimiter=',')
    table.columns = [
        ("index", ("pk-int",)), ("sourcenumber", ("int",)),
        ("sourcename", ("char",)), ("speciesname", ("char",)),
        ("speciescode", ("char",)), ("invert/vert", ("char",)),
        ("phylum", ("char",)), ("class", ("char",)),
        ("order", ("char",)), ("family", ("char",)),
        ("trophicgild", ("char",)), ("drymass", ("double",)),
        ("logdrymass", ("double",)), ("ecosystemtype", ("char",)),
        ("energysource", ("char",)), ("habitat", ("char",)),
        ("residentecosystem", ("char",)), ("temperature", ("double",)),
        ("nexcretionrate", ("double",)), ("pexcretionrate", ("double",)),
        ("lognexcretionrate", ("double",)), ("logpexcretionrate", ("double",)),
        ("incubationtime", ("double",)), ("nform", ("char",)),
        ("pform", ("char",)), ("bodyc", ("double",)),
        ("bodyn", ("double",)), ("bodyp", ("double",)),
        ("bodyc:n", ("double",)), ("bodyc:p", ("double",)),
        ("bodyn:p", ("double",)), ("bodydatasource", ("char",)),
        ("datasource", ("char",)), ("dataproviders", ("char",)),
    ]
    engine.table = table
    if not os.path.isfile(engine.format_filename(filename)):
        engine.download_files_from_archive(self.urls[tablename],
                                           [filename], filetype="zip")
    engine.create_table()
    engine.insert_data_from_file(engine.format_filename(filename))
def download(self, engine=None, debug=False):
    Script.download(self, engine, debug)
    engine = self.engine
    taxa = ('Plant', 'Animal')
    for tax in taxa:
        table = Table(tax.lower() + 's', delimiter=',', header_rows=3,
                      pk='record_id', contains_pk=True)
        table.columns = [
            ("record_id", ("pk-int",)),
            ("station_id", ("int",)),
            ("obs_date", ("char",)),
            ("ind_id", ("int",)),
            ("sci_name", ("char",)),
            ("com_name", ("char",)),
            ("kingdom", ("char",)),
            ("pheno_cat", ("char",)),
            ("pheno_name", ("char",)),
            ("pheno_status", ("char",)),
            ("lat", ("double",)),
            ("lon", ("double",)),
            ("elevation", ("int",)),
            ("network_name", ("char",)),
        ]
        engine.table = table
        engine.create_table()

        base_url = 'http://www.usanpn.org/getObs/observations/'
        years = range(2009, 2013)
        for year in years:
            if year == 2009 and tax == 'Animal':
                continue
            url = base_url + 'get%s%sDataNoDefinitions' % (year, tax)
            filename = '%s_%s.csv' % (tax, year)
            engine.download_file(url, filename)
            engine.insert_data_from_file(engine.find_file(filename))
    return engine
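# For reference, the request URLs and local filenames the loop above
# constructs (derived directly from the code; the 2009 Animal file is
# skipped by the guard in the loop):
for tax in ('Plant', 'Animal'):
    for year in range(2009, 2013):
        if year == 2009 and tax == 'Animal':
            continue
        url = ('http://www.usanpn.org/getObs/observations/'
               'get%s%sDataNoDefinitions' % (year, tax))
        print(url, '->', '%s_%s.csv' % (tax, year))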
def download(self, engine=None, debug=False):
    try:
        Script.download(self, engine, debug)
        engine = self.engine

        # Species table
        table = Table("species", cleanup=Cleanup(), contains_pk=True,
                      header_rows=11)
        table.columns = [("species_id", ("pk-int",)), ("AOU", ("int",)),
                         ("english_common_name", ("char", 50)),
                         ("french_common_name", ("char", 50)),
                         ("spanish_common_name", ("char", 50)),
                         ("sporder", ("char", 30)), ("family", ("char", 30)),
                         ("genus", ("char", 30)), ("species", ("char", 50))]
        table.fixed_width = [7, 6, 51, 51, 51, 51, 51, 51, 50]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_url(self.urls["species"])

        # Routes table
        engine.download_files_from_archive(self.urls["routes"],
                                           ["routes.csv"],
                                           archive_name="routes.zip")
        engine.auto_create_table(Table("routes", cleanup=Cleanup()),
                                 filename="routes.csv")
        engine.insert_data_from_file(engine.format_filename("routes.csv"))

        # Weather table
        engine.download_files_from_archive(self.urls["weather"],
                                           ["weather.csv"],
                                           archive_name="weather.zip")
        engine.auto_create_table(Table("weather", pk="RouteDataId",
                                       cleanup=self.cleanup_func_table),
                                 filename="weather.csv")
        engine.insert_data_from_file(engine.format_filename("weather.csv"))

        # Migrations data
        engine.download_files_from_archive(
            self.urls["migrants"], archive_name="MigrantNonBreeder.zip")
        engine.extract_zip(
            engine.format_filename("MigrantNonBreeder/Migrants.zip"),
            engine.format_filename("Migrant"),
        )
        engine.extract_zip(
            engine.format_filename("MigrantNonBreeder/MigrantSummary.zip"),
            engine.format_filename("MigrantSummary"),
        )

        table = Table("migrants", cleanup=Cleanup())
        table.columns = ([('routedataid', ('int',)), ('countrynum', ('int',)),
                          ('statenum', ('int',)), ('route', ('int',)),
                          ('rpid', ('int',)), ('year', ('int',)),
                          ('aou', ('int',))]
                         # stop1 through stop50
                         + [('stop%s' % i, ('int',)) for i in range(1, 51)])
        engine.table = table
        engine.create_table()
        engine.insert_data_from_file(
            engine.format_filename("Migrant/Migrants.csv"))

        table = Table("migrantsummary", cleanup=Cleanup())
        table.columns = [('routedataid', ('int',)), ('countrynum', ('int',)),
                         ('statenum', ('int',)), ('route', ('int',)),
                         ('rpid', ('int',)), ('year', ('int',)),
                         ('aou', ('int',)), ('count10', ('int',)),
                         ('count20', ('int',)), ('count30', ('int',)),
                         ('count40', ('int',)), ('count50', ('int',)),
                         ('stoptotal', ('int',)), ('speciestotal', ('int',))]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_file(
            engine.format_filename("MigrantSummary/MigrantSummary.csv"))

        # Vehicle data
        table = Table("vehicledata", cleanup=Cleanup())
        table.columns = ([('routedataid', ('int',)), ('countrynum', ('int',)),
                          ('statenum', ('int',)), ('route', ('int',)),
                          ('rpid', ('int',)), ('year', ('int',)),
                          ('recordedcar', ('char',))]
                         # car1 through car50, then noise1 through noise50
                         + [('car%s' % i, ('int',)) for i in range(1, 51)]
                         + [('noise%s' % i, ('int',)) for i in range(1, 51)])
        engine.table = table
        engine.create_table()
        engine.download_files_from_archive(self.urls["Vehicledata"],
                                           archive_name="VehicleData.zip")
        engine.extract_zip(
            engine.format_filename("VehicleData/VehicleData.zip"),
            engine.format_filename("VehicleData"),
        )
        engine.insert_data_from_file(
            engine.format_filename("VehicleData/VehicleData.csv"))

        # Counts table
        table = Table("counts", delimiter=",")
        engine.download_files_from_archive(self.urls["counts"],
                                           archive_name="States.zip")
        table.columns = [("record_id", ("pk-auto",)),
                         ("RouteDataID", ("int",)),
                         ("countrynum", ("int",)), ("statenum", ("int",)),
                         ("Route", ("int",)), ("RPID", ("int",)),
                         ("Year", ("int",)), ("Aou", ("int",)),
                         ("Count10", ("int",)), ("Count20", ("int",)),
                         ("Count30", ("int",)), ("Count40", ("int",)),
                         ("Count50", ("int",)), ("StopTotal", ("int",)),
                         ("SpeciesTotal", ("int",))]
        # Two-element entries pair a region's full name with the short
        # name used in the data file names
        stateslist = [
            "Alabama", "Alaska", "Arizona", "Arkansas", "California",
            "Colorado", "Connecticut", "Delaware", "Florida", "Georgia",
            "Idaho", "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky",
            "Louisiana", "Maine", "Maryland", "Massachusetts", "Michigan",
            "Minnesota", "Mississippi", "Missouri", "Montana", "Nebraska",
            "Nevada", ["New Hampshire", "NHampsh"],
            ["New Jersey", "NJersey"], ["New Mexico", "NMexico"],
            ["New York", "NYork"], ["North Carolina", "NCaroli"],
            ["North Dakota", "NDakota"], "Ohio", "Oklahoma", "Oregon",
            "Pennsylvania", ["Rhode Island", "RhodeIs"],
            ["South Carolina", "SCaroli"], ["South Dakota", "SDakota"],
            "Tennessee", "Texas", "Utah", "Vermont", "Virginia",
            "Washington", ["West Virginia", "W_Virgi"], "Wisconsin",
            "Wyoming", "Alberta", ["British Columbia", "BritCol"],
            "Manitoba", ["New Brunswick", "NBrunsw"],
            ["Northwest Territories", "NWTerri"], "Newfoundland",
            ["Nova Scotia", "NovaSco"], "Nunavut", "Ontario",
            ["Prince Edward Island", "PEI"], "Quebec", "Saskatchewan",
            "Yukon"]
        engine.table = table
        engine.create_table()
        for state in stateslist:
            try:
                if isinstance(state, list):
                    state, shortstate = state
                else:
                    shortstate = state[0:7]
                print("Inserting data from " + state + "...")
                # extract the zip and build the file path before attempting
                # the insert, so both are available to the fallback below
                engine.extract_zip(
                    engine.format_filename("States/" + shortstate + ".zip"),
                    engine.format_filename("States/" + shortstate),
                )
                file_path = "{states}/{shortstate}/{shortstate}.csv".format(
                    states="States", shortstate=shortstate)
                try:
                    # fast path: bulk insert without value cleaning
                    engine.table.cleanup = Cleanup()
                    engine.insert_data_from_file(
                        engine.format_filename(file_path))
                except:
                    # slow path: roll back and re-insert with value cleaning
                    print(state + ": failed bulk insert, inserting manually.")
                    engine.connection.rollback()
                    engine.table.cleanup = self.cleanup_func_clean
                    engine.insert_data_from_file(
                        engine.format_filename(file_path))
            except:
                print("There was an error in " + state + ".")
                raise
    except zipfile.BadZipfile:
        print("There was an unexpected error in the Breeding Bird Survey "
              "archives.")
        raise
    return engine
def download(self, engine=None, debug=False):
    try:
        Script.download(self, engine, debug)
        engine = self.engine

        # Species table
        table = Table("species", cleanup=Cleanup(), contains_pk=True,
                      header_rows=6)
        table.columns = [("species_id", ("pk-int",)), ("AOU", ("int",)),
                         ("english_common_name", ("char", 50)),
                         ("french_common_name", ("char", 50)),
                         ("spanish_common_name", ("char", 50)),
                         ("sporder", ("char", 30)), ("family", ("char", 30)),
                         ("genus", ("char", 30)), ("species", ("char", 50))]
        table.fixed_width = [7, 6, 51, 51, 51, 51, 51, 51, 50]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_url(self.urls["species"])

        # Routes table
        if not os.path.isfile(engine.format_filename("routes_new.csv")):
            engine.download_files_from_archive(self.urls["routes"],
                                               ["routes.csv"])
            read = open(engine.format_filename("routes.csv"), "r")
            write = open(engine.format_filename("routes_new.csv"), "w")
            print("Cleaning routes data...")
            write.write(read.readline())  # copy the header line unchanged
            for line in read:
                values = line.split(',')
                # values in column 6 should be negative; flip positive ones
                v = Decimal(values[5])
                if v > 0:
                    values[5] = str(v * Decimal("-1"))
                write.write(','.join(str(value) for value in values))
            write.close()
            read.close()
        engine.auto_create_table(Table("routes", cleanup=Cleanup()),
                                 filename="routes_new.csv")
        engine.insert_data_from_file(
            engine.format_filename("routes_new.csv"))

        # Weather table
        if not os.path.isfile(engine.format_filename("weather_new.csv")):
            engine.download_files_from_archive(self.urls["weather"],
                                               ["weather.csv"])
            read = open(engine.format_filename("weather.csv"), "r")
            write = open(engine.format_filename("weather_new.csv"), "w")
            print("Cleaning weather data...")
            for line in read:
                values = line.split(',')
                newvalues = []
                for value in values:
                    if ':' in value:
                        newvalues.append(value.replace(':', ''))
                    elif value == "N":
                        newvalues.append(None)
                    else:
                        newvalues.append(value)
                write.write(','.join(str(value) for value in newvalues))
            write.close()
            read.close()
        engine.auto_create_table(
            Table("weather", pk="RouteDataId",
                  cleanup=Cleanup(correct_invalid_value, nulls=['NULL'])),
            filename="weather_new.csv")
        engine.insert_data_from_file(
            engine.format_filename("weather_new.csv"))

        # Region_codes table
        table = Table("region_codes", pk=False, header_rows=11,
                      fixed_width=[11, 11, 30])

        def regioncodes_cleanup(value, engine):
            # transliterate accented vowels so the values load cleanly
            replace = {chr(225): "a", chr(233): "e",
                       chr(237): "i", chr(243): "o"}
            newvalue = str(value)
            for key in replace.keys():
                if key in newvalue:
                    newvalue = newvalue.replace(key, replace[key])
            return newvalue

        table.cleanup = Cleanup(regioncodes_cleanup)
        table.columns = [("countrynum", ("int",)),
                         ("regioncode", ("int",)),
                         ("regionname", ("char", 30))]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_url(self.urls["region_codes"])

        # Counts table
        table = Table("counts", pk=False, delimiter=',')
        table.columns = ([("RouteDataID", ("int",)),
                          ("countrynum", ("int",)),
                          ("statenum", ("int",)),
                          ("Route", ("int",)),
                          ("RPID", ("int",)),
                          ("year", ("int",)),
                          ("AOU", ("int",))]
                         # Stop1 through Stop50
                         + [("Stop%s" % i, ("int",)) for i in range(1, 51)])
        engine.table = table
        engine.create_table()
        for part in range(1, 11):
            part = str(part)
            try:
                print("Inserting data from part " + part + "...")
                try:
                    # fast path: bulk insert without value cleaning
                    engine.table.cleanup = Cleanup()
                    engine.insert_data_from_archive(
                        self.urls["counts"] + "Fifty" + part + ".zip",
                        ["fifty" + part + ".csv"])
                except:
                    # slow path: roll back and re-insert with null cleaning
                    print("Failed bulk insert on " + part +
                          ", inserting manually.")
                    engine.connection.rollback()
                    engine.table.cleanup = Cleanup(correct_invalid_value,
                                                   nulls=['*'])
                    engine.insert_data_from_archive(
                        self.urls["counts"] + "Fifty" + part + ".zip",
                        ["fifty" + part + ".csv"])
            except:
                print("There was an error in part " + part + ".")
                raise
    except zipfile.BadZipfile:
        print("There was an unexpected error in the Breeding Bird Survey "
              "archives.")
        raise
    return engine
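# Aside: the counts loop above relies on a bulk-insert-with-fallback pattern.
# Here is a standalone sketch with hypothetical callables (not the retriever
# API): try the fast engine-native load first; only on failure roll back the
# partial transaction and re-insert with per-value cleaning, which is slower
# but tolerant of malformed rows.
def load_with_fallback(connection, bulk_load, careful_load, source):
    try:
        bulk_load(source)        # fast path: no value cleaning
    except Exception:
        connection.rollback()    # discard the partially applied transaction
        careful_load(source)     # slow path: clean each value while inserting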
def download(self, engine=None, debug=False): Script.download(self, engine, debug) self.engine.download_file(self.urls["GWDD"], "GlobalWoodDensityDatabase.xls") filename = os.path.basename("GlobalWoodDensityDatabase.xls") book = xlrd.open_workbook(self.engine.format_filename(filename)) sh = book.sheet_by_index(1) rows = sh.nrows #Creating data table lines = [] for i in range(1, rows): row = sh.row(i) if not all(Excel.empty_cell(cell) for cell in row): this_line = {} def format_value(s): s = Excel.cell_value(s) return str(s).title().replace("\\", "/").replace('"', '') for num, label in enumerate([ "Number", "Family", "Binomial", "Wood_Density", "Region", "Reference_Number" ]): this_line[label] = format_value(row[num]) lines.append(this_line) table = Table("data", delimiter="\t") table.columns = [("Number", ("pk-int", )), ("Family", ("char", )), ("Binomial", ("char", )), ("Wood_Density", ("double", )), ("Region", ("char", )), ("Reference_Number", ("int", ))] table.pk = 'Number' table.contains_pk = True gwdd = [] for line in lines: gwdd_data = [ line["Number"], line["Family"], line["Binomial"], line["Wood_Density"], line["Region"], line["Reference_Number"] ] gwdd.append(gwdd_data) data = ['\t'.join(gwdd_line) for gwdd_line in gwdd] self.engine.table = table self.engine.create_table() self.engine.add_to_table(data) #Creating reference table lines = [] sh = book.sheet_by_index(2) rows = sh.nrows for i in range(1, rows): row = sh.row(i) if not all(Excel.empty_cell(cell) for cell in row): this_line = {} def format_value(s): s = Excel.cell_value(s) return str(s).title().replace("\\", "/").replace('"', '') for num, label in enumerate(["Reference_Number", "Reference"]): this_line[label] = format_value(row[num]) lines.append(this_line) table = Table("reference", delimiter="\t") table.columns = [("Reference_Number", ("pk-int", )), ("Reference", ("char", ))] table.pk = 'Reference_Number' table.contains_pk = True gwdd = [] for line in lines: gwdd_ref = [line["Reference_Number"], line["Reference"]] gwdd.append(gwdd_ref) data = ['\t'.join(gwdd_line) for gwdd_line in gwdd] self.engine.table = table self.engine.create_table() self.engine.add_to_table(data) return self.engine
def download(self, engine=None, debug=False): Script.download(self, engine, debug) self.engine.auto_create_table(Table("sites"), url=self.urls["sites"], filename='gentry_sites.csv') self.engine.insert_data_from_url(self.urls["sites"]) self.engine.download_file(self.urls["stems"], "all_Excel.zip") local_zip = zipfile.ZipFile(self.engine.format_filename("all_Excel.zip")) filelist = local_zip.namelist() local_zip.close() self.engine.download_files_from_archive(self.urls["stems"], filelist) filelist = [os.path.basename(filename) for filename in filelist] # Currently all_Excel.zip is missing CURUYUQU.xls # Download it separately and add it to the file list if not self.engine.find_file('CURUYUQU.xls'): self.engine.download_file("http://www.mobot.org/mobot/gentry/123/samerica/CURUYUQU.xls", "CURUYUQU.xls") filelist.append('CURUYUQU.xls') lines = [] tax = [] for filename in filelist: print("Extracting data from " + filename + "...") book = xlrd.open_workbook(self.engine.format_filename(filename)) sh = book.sheet_by_index(0) rows = sh.nrows cn = {'stems': []} n = 0 for colnum, c in enumerate(sh.row(0)): if not Excel.empty_cell(c): cid = c.value.lower().strip() # line number column is sometimes named differently if cid in ["sub", "number"]: cid = "line" # the "number of individuals" column is named in various # different ways; they always at least contain "nd" if "nd" in cid: cid = "count" # in QUIAPACA.xls the "number of individuals" column is # misnamed "STEMDBH" just like the stems columns, so weep # for the state of scientific data and then fix manually if filename == "QUIAPACA.xls" and colnum == 13: cid = "count" # if column is a stem, add it to the list of stems; # otherwise, make note of the column name/number if "stem" in cid or "dbh" in cid: cn["stems"].append(n) else: cn[cid] = n n += 1 # sometimes, a data file does not contain a liana or count column if not "liana" in list(cn.keys()): cn["liana"] = -1 if not "count" in list(cn.keys()): cn["count"] = -1 for i in range(1, rows): row = sh.row(i) cellcount = len(row) # make sure the row is real, not just empty cells if not all(Excel.empty_cell(cell) for cell in row): try: this_line = {} # get the following information from the appropriate columns for i in ["line", "family", "genus", "species", "liana", "count"]: if cn[i] > -1: if row[cn[i]].ctype != 2: # if the cell type(ctype) is not a number this_line[i] = row[cn[i]].value.lower().strip().replace("\\", "/").replace('"', '') else: this_line[i] = row[cn[i]].value if this_line[i] == '`': this_line[i] = 1 this_line["stems"] = [row[c] for c in cn["stems"] if not Excel.empty_cell(row[c])] this_line["site"] = filename[0:-4] # Manually correct CEDRAL data, which has a single line # that is shifted by one to the left starting at Liana if this_line["site"] == "CEDRAL" and type(this_line["liana"]) == float: this_line["liana"] = "" this_line["count"] = 3 this_line["stems"] = [2.5, 2.5, 30, 18, 25] lines.append(this_line) # Check how far the species is identified full_id = 0 if len(this_line["species"]) < 3: if len(this_line["genus"]) < 3: id_level = "family" else: id_level = "genus" else: id_level = "species" full_id = 1 tax.append((this_line["family"], this_line["genus"], this_line["species"], id_level, str(full_id))) except: raise pass tax = sorted(tax, key=lambda group: group[0] + " " + group[1] + " " + group[2]) unique_tax = [] tax_dict = {} tax_count = 0 # Get all unique families/genera/species print("\n") for group in tax: if not (group in unique_tax): unique_tax.append(group) tax_count += 1 
tax_dict[group[0:3]] = tax_count if tax_count % 10 == 0: msg = "Generating taxonomic groups: " + str(tax_count) + " / " + str(TAX_GROUPS) sys.stdout.flush() sys.stdout.write(msg + "\b" * len(msg)) print("\n") # Create species table table = Table("species", delimiter=",") table.columns=[("species_id" , ("pk-int",) ), ("family" , ("char", ) ), ("genus" , ("char", ) ), ("species" , ("char", ) ), ("id_level" , ("char", 10) ), ("full_id" , ("int",) )] data = [[str(tax_dict[group[:3]])] + ['"%s"' % g for g in group] for group in unique_tax] table.pk = 'species_id' table.contains_pk = True self.engine.table = table self.engine.create_table() self.engine.add_to_table(data) # Create stems table table = Table("stems", delimiter=",") table.columns=[("stem_id" , ("pk-auto",) ), ("line" , ("int",) ), ("species_id" , ("int",) ), ("site_code" , ("char", 12) ), ("liana" , ("char", 10) ), ("stem" , ("double",) )] stems = [] counts = [] for line in lines: try: liana = line["liana"] except KeyError: liana = "" species_info = [line["line"], tax_dict[(line["family"], line["genus"], line["species"])], line["site"], liana ] try: counts.append([value for value in species_info + [line["count"]]]) except KeyError: pass for i in line["stems"]: stem = species_info + [str(i)] stems.append(stem) self.engine.table = table self.engine.create_table() self.engine.add_to_table(stems) # Create counts table table = Table("counts", delimiter=",", contains_pk=False) table.columns=[("count_id" , ("pk-auto",) ), ("line" , ("int",) ), ("species_id" , ("int",) ), ("site_code" , ("char", 12) ), ("liana" , ("char", 10) ), ("count" , ("double",) )] self.engine.table = table self.engine.create_table() self.engine.add_to_table(counts) return self.engine
def download(self, engine=None, debug=False):
    Script.download(self, engine, debug)
    engine = self.engine
    original_sql_file = "BioTIMESQL02_04_2018.sql"
    engine.download_file(self.urls["sql_file"], original_sql_file)
    sql_data = open_fr(self.engine.format_filename(original_sql_file))

    set_open = False
    csv_writer = None
    csv_file = None
    table_name = None
    # NULL appears unquoted in the dump's VALUES lists; binding it to None
    # lets the eval() below translate it to a Python null
    NULL = None

    # Split the MySQL dump into one csv file per table
    for line in sql_data:
        table_indicator = "-- Table structure for table "
        if line.startswith(table_indicator):
            st = line[len(table_indicator):].replace("`", "")
            table_name = st.strip()
            # close the csv file of the previous table, if any,
            # then start a new csv file for this table
            if set_open:
                csv_file.close()
            out_file = "{name}.csv".format(name=table_name)
            csv_file = open_fw(engine.format_filename(out_file))
            csv_writer = csv.writer(csv_file, quoting=csv.QUOTE_ALL)
            set_open = True
        if line.startswith(
                "INSERT INTO `{table_name}`".format(table_name=table_name)):
            row_val = line[line.index("VALUES (") + 8:-3]
            table_rows = row_val.replace("\r\n", "").split("),(")
            for i_row in table_rows:
                v = eval('[' + str(i_row) + ']')
                csv_writer.writerows([v])
    if csv_file:
        csv_file.close()

    # Create abundance table
    table = Table("ID_ABUNDANCE", delimiter=",", header_rows=0,
                  contains_pk=False)
    table.columns = [("ID_ABUNDANCE", ("int",)),
                     ("ABUNDANCE_TYPE", ("char", "100"))]
    engine.table = table
    engine.create_table()
    engine.insert_data_from_file(engine.format_filename("abundance.csv"))

    # Create allrawdata table
    table = Table("allrawdata", delimiter=",", header_rows=0,
                  contains_pk=False)
    table.columns = [("ID_ALL_RAW_DATA", ("int",)),
                     ("ABUNDANCE", ("double",)),
                     ("BIOMASS", ("double",)),
                     ("ID_SPECIES", ("int",)),
                     ("SAMPLE_DESC", ("char", 200)),
                     ("PLOT", ("char", 150)),
                     ("LATITUDE", ("double",)),
                     ("LONGITUDE", ("double",)),
                     ("DEPTH", ("double",)),
                     ("DAY", ("int",)),
                     ("MONTH", ("int",)),
                     ("YEAR", ("int",)),
                     ("STUDY_ID", ("int",))]
    engine.table = table
    engine.create_table()
    engine.insert_data_from_file(engine.format_filename("allrawdata.csv"))

    # Create biomass table
    table = Table("biomass", delimiter=",", header_rows=0, contains_pk=False)
    table.columns = [("ID_BIOMASS", ("int",)),
                     ("BIOMASS_TYPE", ("char", "100"))]
    engine.table = table
    engine.create_table()
    engine.insert_data_from_file(engine.format_filename("biomass.csv"))

    # Create citation1 table
    table = Table("citation1", delimiter=",", header_rows=0,
                  contains_pk=False)
    table.columns = [("ID_CITATION1", ("int",)),
                     ("STUDY_ID", ("int",)),
                     ("CITATION_LINE", ("char",))]
    engine.table = table
    engine.create_table()
    engine.insert_data_from_file(engine.format_filename("citation1.csv"))

    # Create contacts table
    table = Table("contacts", delimiter=",", header_rows=0,
                  contains_pk=False)
    table.columns = [("ID_CONTACTS", ("int",)),
                     ("STUDY_ID", ("int",)),
                     ("CONTACT_1", ("char", 500)),
                     ("CONTACT_2", ("char", 500)),
                     ("CONT_1_MAIL", ("char", 60)),
                     ("CONT_2_MAIL", ("char", 60)),
                     ("LICENSE", ("char", 200)),
                     ("WEB_LINK", ("char", 200)),
                     ("DATA_SOURCE", ("char", 250))]
    engine.table = table
    engine.create_table()
    engine.insert_data_from_file(engine.format_filename("contacts.csv"))

    # Create countries table
    table = Table("countries", delimiter=",", header_rows=0,
                  contains_pk=False)
    table.columns = [("COUNT_ID", ("int",)),
                     ("COUNTRY_NAME", ("char", 200))]
    engine.table = table
    engine.create_table()
    engine.insert_data_from_file(engine.format_filename("countries.csv"))

    # Create curation table
    table = Table("curation", delimiter=",", header_rows=0,
                  contains_pk=False)
    table.columns = [("ID_CURATION", ("int",)),
                     ("STUDY_ID", ("int",)),
                     ("LINK_ID", ("int",)),
                     ("COMMENTS", ("char",)),
                     ("DATE_STUDY_ADDED", ("char", 50))]
    engine.table = table
    engine.create_table()
    engine.insert_data_from_file(engine.format_filename("curation.csv"))

    # Create datasets table
    table = Table("datasets", delimiter=",", header_rows=0,
                  contains_pk=False)
    table.columns = [("ID_DATASETS", ("int",)),
                     ("STUDY_ID", ("int",)),
                     ("TAXA", ("char", 50)),
                     ("ORGANISMS", ("char", 200)),
                     ("TITLE", ("char", 800)),
                     ("AB_BIO", ("char", 2)),
                     ("HAS_PLOT", ("char", 10)),
                     ("DATA_POINTS", ("char",)),
                     ("START_YEAR", ("char",)),
                     ("END_YEAR", ("char",)),
                     ("CENT_LAT", ("double",)),
                     ("CENT_LONG", ("double",)),
                     ("NUMBER_OF_SPECIES", ("char",)),
                     ("NUMBER_OF_SAMPLES", ("char",)),
                     ("NUMBER_LAT_LONG", ("char",)),
                     ("TOTAL", ("char",)),
                     ("GRAIN_SIZE_TEXT", ("char",)),
                     ("GRAIN_SQ_KM", ("double",)),
                     ("AREA_SQ_KM", ("double",)),
                     ("AB_TYPE", ("char",)),
                     ("BIO_TYPE", ("char",)),
                     ("SAMPLE_TYPE", ("char",))]
    engine.table = table
    engine.create_table()
    engine.insert_data_from_file(engine.format_filename("datasets.csv"))

    # Create downloads table
    table = Table("downloads", delimiter=",", header_rows=0,
                  contains_pk=False)
    table.columns = [("D_ID", ("int",)),
                     ("STUDY", ("char", 25)),
                     ("NAME", ("char", 150)),
                     ("EMAIL", ("char", 150)),
                     ("COUNTRY", ("char", 200)),
                     ("ROLE", ("char", 150)),
                     ("PURPOSE", ("char", 500)),
                     ("LOCATION", ("char", 250)),
                     ("DATE_STAMP", ("char",))]
    engine.table = table
    engine.create_table()
    engine.insert_data_from_file(engine.format_filename("downloads.csv"))

    # Create methods table
    table = Table("methods", delimiter=",", header_rows=0, contains_pk=False)
    table.columns = [("ID_METHODS", ("int",)),
                     ("STUDY_ID", ("int",)),
                     ("METHODS", ("char",)),
                     ("SUMMARY_METHODS", ("char", 500))]
    engine.table = table
    engine.create_table()
    engine.insert_data_from_file(engine.format_filename("methods.csv"))

    # Create sample table
    table = Table("sample", delimiter=",", header_rows=0, contains_pk=False)
    table.columns = [("ID_SAMPLE", ("int",)),
                     ("ID_TREAT", ("int",)),
                     ("SAMPLE_DESC_NAME", ("char", 200))]
    engine.table = table
    engine.create_table()
    engine.insert_data_from_file(engine.format_filename("sample.csv"))

    # Create site table
    table = Table("site", delimiter=",", header_rows=0, contains_pk=False)
    table.columns = [("ID_SITE", ("int",)),
                     ("STUDY_ID", ("int",)),
                     ("REALM", ("char", 11)),
                     ("CLIMATE", ("char", 20)),
                     ("GENERAL_TREAT", ("char", 200)),
                     ("TREATMENT", ("char", 200)),
                     ("TREAT_COMMENTS", ("char", 250)),
                     ("TREAT_DATE", ("char", 100)),
                     ("CEN_LATITUDE", ("double",)),
                     ("CEN_LONGITUDE", ("double",)),
                     ("HABITAT", ("char", 100)),
                     ("PROTECTED_AREA", ("char", 50)),
                     ("AREA", ("double",)),
                     ("BIOME_MAP", ("char", 500))]
    engine.table = table
    engine.create_table()
    engine.insert_data_from_file(engine.format_filename("site.csv"))

    # Create species table
    table = Table("species", delimiter=",", header_rows=0, contains_pk=False)
    table.columns = [("ID_SPECIES", ("int",)),
                     ("GENUS", ("char", 100)),
                     ("SPECIES", ("char", 100)),
                     ("GENUS_SPECIES", ("char", 100))]
    engine.table = table
    engine.create_table()
    engine.insert_data_from_file(engine.format_filename("species.csv"))
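# Aside: a hedged alternative (a sketch, not the original implementation) to
# the bare eval() used above when parsing the dump's VALUES tuples. It
# assumes the literal substring "NULL" never occurs inside quoted strings;
# under that assumption ast.literal_eval accepts the rewritten row and never
# executes arbitrary code.
import ast

def parse_values_row(i_row):
    """Turn "1,'Count',NULL" into [1, 'Count', None]."""
    return ast.literal_eval('[' + i_row.replace('NULL', 'None') + ']')

assert parse_values_row("1,'Count',NULL") == [1, 'Count', None]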
def download(self, engine=None, debug=False):
    try:
        Script.download(self, engine, debug)
        engine = self.engine

        # Species table
        table = Table("species", cleanup=Cleanup(), contains_pk=True,
                      header_rows=9)
        table.columns = [("species_id", ("pk-int",)), ("AOU", ("int",)),
                         ("english_common_name", ("char", 50)),
                         ("french_common_name", ("char", 50)),
                         ("spanish_common_name", ("char", 50)),
                         ("sporder", ("char", 30)), ("family", ("char", 30)),
                         ("genus", ("char", 30)), ("species", ("char", 50))]
        table.fixed_width = [7, 6, 51, 51, 51, 51, 51, 51, 50]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_url(self.urls["species"])

        # Routes table
        engine.download_files_from_archive(self.urls["routes"],
                                           ["routes.csv"])
        engine.auto_create_table(Table("routes", cleanup=Cleanup()),
                                 filename="routes.csv")
        engine.insert_data_from_file(engine.format_filename("routes.csv"))

        # Weather table
        if not os.path.isfile(engine.format_filename("weather_new.csv")):
            engine.download_files_from_archive(self.urls["weather"],
                                               ["weather.csv"])
            read = open_fr(engine.format_filename("weather.csv"))
            write = open_fw(engine.format_filename("weather_new.csv"))
            print("Cleaning weather data...")
            for line in read:
                values = line.split(',')
                newvalues = []
                for value in values:
                    if ':' in value:
                        newvalues.append(value.replace(':', ''))
                    elif value == "N":
                        newvalues.append(None)
                    else:
                        newvalues.append(value)
                write.write(','.join(str(value) for value in newvalues))
            write.close()
            read.close()
        engine.auto_create_table(
            Table("weather", pk="RouteDataId",
                  cleanup=Cleanup(correct_invalid_value, nulls=['NULL'])),
            filename="weather_new.csv")
        engine.insert_data_from_file(
            engine.format_filename("weather_new.csv"))

        # Region_codes table
        table = Table("region_codes", pk=False, header_rows=11,
                      fixed_width=[11, 11, 30])

        def regioncodes_cleanup(value, engine):
            # transliterate accented vowels so the values load cleanly
            replace = {chr(225): "a", chr(233): "e",
                       chr(237): "i", chr(243): "o"}
            newvalue = str(value)
            for key in list(replace.keys()):
                if key in newvalue:
                    newvalue = newvalue.replace(key, replace[key])
            return newvalue

        table.cleanup = Cleanup(regioncodes_cleanup)
        table.columns = [("countrynum", ("int",)),
                         ("regioncode", ("int",)),
                         ("regionname", ("char", 30))]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_url(self.urls["region_codes"])

        # Counts table
        table = Table("counts", delimiter=',')
        table.columns = [("record_id", ("pk-auto",)),
                         ("countrynum", ("int",)), ("statenum", ("int",)),
                         ("Route", ("int",)), ("RPID", ("int",)),
                         ("Year", ("int",)), ("Aou", ("int",)),
                         ("Count10", ("int",)), ("Count20", ("int",)),
                         ("Count30", ("int",)), ("Count40", ("int",)),
                         ("Count50", ("int",)), ("StopTotal", ("int",)),
                         ("SpeciesTotal", ("int",))]
        # Two-element entries pair a region's full name with the short
        # name used in the data file names
        stateslist = [
            "Alabama", "Alaska", "Arizona", "Arkansas", "California",
            "Colorado", "Connecticut", "Delaware", "Florida", "Georgia",
            "Idaho", "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky",
            "Louisiana", "Maine", "Maryland", "Massachusetts", "Michigan",
            "Minnesota", "Mississippi", "Missouri", "Montana", "Nebraska",
            "Nevada", ["New Hampshire", "NHampsh"],
            ["New Jersey", "NJersey"], ["New Mexico", "NMexico"],
            ["New York", "NYork"], ["North Carolina", "NCaroli"],
            ["North Dakota", "NDakota"], "Ohio", "Oklahoma", "Oregon",
            "Pennsylvania", ["Rhode Island", "RhodeIs"],
            ["South Carolina", "SCaroli"], ["South Dakota", "SDakota"],
            "Tennessee", "Texas", "Utah", "Vermont", "Virginia",
            "Washington", ["West Virginia", "W_Virgi"], "Wisconsin",
            "Wyoming", "Alberta", ["British Columbia", "BritCol"],
            "Manitoba", ["New Brunswick", "NBrunsw"],
            ["Northwest Territories", "NWTerri"], "Newfoundland",
            ["Nova Scotia", "NovaSco"], "Nunavut", "Ontario",
            ["Prince Edward Island", "PEI"], "Quebec", "Saskatchewan",
            "Yukon"]
        engine.table = table
        engine.create_table()
        for state in stateslist:
            try:
                if isinstance(state, list):
                    state, shortstate = state
                else:
                    shortstate = state[0:7]
                print("Inserting data from " + state + "...")
                try:
                    # fast path: bulk insert without value cleaning
                    engine.table.cleanup = Cleanup()
                    engine.insert_data_from_archive(
                        self.urls["counts"] + shortstate + ".zip",
                        [shortstate + ".csv"])
                except:
                    # slow path: roll back and re-insert with null cleaning
                    # (see the sketch after this function)
                    print("Failed bulk insert on " + state +
                          ", inserting manually.")
                    engine.connection.rollback()
                    engine.table.cleanup = Cleanup(correct_invalid_value,
                                                   nulls=['*'])
                    engine.insert_data_from_archive(
                        self.urls["counts"] + shortstate + ".zip",
                        [shortstate + ".csv"])
            except:
                print("There was an error in " + state + ".")
                raise
    except zipfile.BadZipfile:
        print("There was an unexpected error in the Breeding Bird Survey "
              "archives.")
        raise
    return engine
def download(self, engine=None, debug=False):
    try:
        Script.download(self, engine, debug)
        engine = self.engine

        # Species table
        table = Table("species", cleanup=Cleanup(), contains_pk=True,
                      header_rows=9)
        table.columns = [("species_id", ("pk-int",)), ("AOU", ("int",)),
                         ("english_common_name", ("char", 50)),
                         ("french_common_name", ("char", 50)),
                         ("spanish_common_name", ("char", 50)),
                         ("sporder", ("char", 30)), ("family", ("char", 30)),
                         ("genus", ("char", 30)), ("species", ("char", 50))]
        table.fixed_width = [7, 6, 51, 51, 51, 51, 51, 51, 50]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_url(self.urls["species"])

        # Routes table
        engine.download_files_from_archive(self.urls["routes"],
                                           ["routes.csv"])
        engine.auto_create_table(Table("routes", cleanup=Cleanup()),
                                 filename="routes.csv")
        engine.insert_data_from_file(engine.format_filename("routes.csv"))

        # Weather table
        if not os.path.isfile(engine.format_filename("weather_new.csv")):
            engine.download_files_from_archive(self.urls["weather"],
                                               ["weather.csv"])
            read = open_fr(engine.format_filename("weather.csv"))
            write = open_fw(engine.format_filename("weather_new.csv"))
            print("Cleaning weather data...")
            for line in read:
                values = line.split(',')
                newvalues = []
                for value in values:
                    if ':' in value:
                        newvalues.append(value.replace(':', ''))
                    elif value == "N":
                        newvalues.append(None)
                    else:
                        newvalues.append(value)
                write.write(','.join(str(value) for value in newvalues))
            write.close()
            read.close()
        engine.auto_create_table(Table("weather", pk="RouteDataId",
                                       cleanup=self.cleanup_func_table),
                                 filename="weather_new.csv")
        engine.insert_data_from_file(
            engine.format_filename("weather_new.csv"))

        # Region_codes table
        table = Table("region_codes", pk=False, header_rows=11,
                      fixed_width=[11, 11, 30])

        def regioncodes_cleanup(value, engine):
            # transliterate accented vowels so the values load cleanly
            replace = {
                chr(225): "a",
                chr(233): "e",
                chr(237): "i",
                chr(243): "o"
            }
            newvalue = str(value)
            for key in list(replace.keys()):
                if key in newvalue:
                    newvalue = newvalue.replace(key, replace[key])
            return newvalue

        table.cleanup = Cleanup(regioncodes_cleanup)
        table.columns = [("countrynum", ("int",)),
                         ("regioncode", ("int",)),
                         ("regionname", ("char", 30))]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_url(self.urls["region_codes"])

        # Counts table
        table = Table("counts", delimiter=',')
        table.columns = [("record_id", ("pk-auto",)),
                         ("countrynum", ("int",)), ("statenum", ("int",)),
                         ("Route", ("int",)), ("RPID", ("int",)),
                         ("Year", ("int",)), ("Aou", ("int",)),
                         ("Count10", ("int",)), ("Count20", ("int",)),
                         ("Count30", ("int",)), ("Count40", ("int",)),
                         ("Count50", ("int",)), ("StopTotal", ("int",)),
                         ("SpeciesTotal", ("int",))]
        # Two-element entries pair a region's full name with the short
        # name used in the data file names
        stateslist = [
            "Alabama", "Alaska", "Arizona", "Arkansas", "California",
            "Colorado", "Connecticut", "Delaware", "Florida", "Georgia",
            "Idaho", "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky",
            "Louisiana", "Maine", "Maryland", "Massachusetts", "Michigan",
            "Minnesota", "Mississippi", "Missouri", "Montana", "Nebraska",
            "Nevada", ["New Hampshire", "NHampsh"],
            ["New Jersey", "NJersey"], ["New Mexico", "NMexico"],
            ["New York", "NYork"], ["North Carolina", "NCaroli"],
            ["North Dakota", "NDakota"], "Ohio", "Oklahoma", "Oregon",
            "Pennsylvania", ["Rhode Island", "RhodeIs"],
            ["South Carolina", "SCaroli"], ["South Dakota", "SDakota"],
            "Tennessee", "Texas", "Utah", "Vermont", "Virginia",
            "Washington", ["West Virginia", "W_Virgi"], "Wisconsin",
            "Wyoming", "Alberta", ["British Columbia", "BritCol"],
            "Manitoba", ["New Brunswick", "NBrunsw"],
            ["Northwest Territories", "NWTerri"], "Newfoundland",
            ["Nova Scotia", "NovaSco"], "Nunavut", "Ontario",
            ["Prince Edward Island", "PEI"], "Quebec", "Saskatchewan",
            "Yukon"]
        engine.table = table
        engine.create_table()
        for state in stateslist:
            try:
                if isinstance(state, list):
                    state, shortstate = state
                else:
                    shortstate = state[0:7]
                print("Inserting data from " + state + "...")
                try:
                    # fast path: bulk insert without value cleaning
                    engine.table.cleanup = Cleanup()
                    engine.insert_data_from_archive(
                        self.urls["counts"] + shortstate + ".zip",
                        [shortstate + ".csv"])
                except:
                    # slow path: roll back and re-insert with value cleaning
                    print("Failed bulk insert on " + state +
                          ", inserting manually.")
                    engine.connection.rollback()
                    engine.table.cleanup = self.cleanup_func_clean
                    engine.insert_data_from_archive(
                        self.urls["counts"] + shortstate + ".zip",
                        [shortstate + ".csv"])
            except:
                print("There was an error in " + state + ".")
                raise
    except zipfile.BadZipfile:
        print("There was an unexpected error in the Breeding Bird Survey "
              "archives.")
        raise
    return engine
def download(self, engine=None, debug=False): Script.download(self, engine, debug) engine = self.engine filename = "database.csv" tablename = "predicts_main" table = Table(str(tablename), delimiter=',') table.columns = [ ("Source_ID", ("char", )), ("Reference", ("char", )), ("Study_number", ("int", )), ("Study_name", ("char", )), ("SS", ("char", )), ("Diversity_metric", ("char", )), ("Diversity_metric_unit", ("char", )), ("Diversity_metric_type", ("char", )), ("Diversity_metric_is_effort_sensitive", ("char", )), ("Diversity_metric_is_suitable_for_Chao", ("char", )), ("Sampling_method", ("char", )), ("Sampling_effort_unit", ("char", )), ("Study_common_taxon", ("char", )), ("Rank_of_study_common_taxon", ("char", )), ("Site_number", ("int", )), ("Site_name", ("char", )), ("Block", ("char", )), ("SSS", ("char", )), ("SSB", ("char", )), ("SSBS", ("char", )), ("Sample_start_earliest", ("char", )), ("Sample_end_latest", ("char", )), ("Sample_midpoint", ("char", )), ("Sample_date_resolution", ("char", )), ("Max_linear_extent_metres", ("double", )), ("Habitat_patch_area_square_metres", ("double", )), ("Sampling_effort", ("double", )), ("Rescaled_sampling_effort", ("double", )), ("Habitat_as_described", ("char", )), ("Predominant_land_use", ("char", )), ("Source_for_predominant_land_use", ("char", )), ("Use_intensity", ("char", )), ("Km_to_nearest_edge_of_habitat", ("double", )), ("Years_since_fragmentation_or_conversion", ("double", )), ("Transect_details", ("char", )), ("Coordinates_method", ("char", )), ("Longitude", ("double", )), ("Latitude", ("double", )), ("Country_distance_metres", ("double", )), ("Country", ("char", )), ("UN_subregion", ("char", )), ("UN_region", ("char", )), ("Ecoregion_distance_metres", ("double", )), ("Ecoregion", ("char", )), ("Biome", ("char", )), ("Realm", ("char", )), ("Hotspot", ("char", )), ("Wilderness_area", ("char", )), ("N_samples", ("double", )), ("Taxon_number", ("double", )), ("Taxon_name_entered", ("char", )), ("Indication", ("char", )), ("Parsed_name", ("char", )), ("Taxon", ("char", )), ("COL_ID", ("double", )), ("Name_status", ("char", )), ("Rank", ("char", )), ("Kingdom", ("char", )), ("Phylum", ("char", )), ("Class", ("char", )), ("Order", ("char", )), ("Family", ("char", )), ("Genus", ("char", )), ("Species", ("char", )), ("Best_guess_binomial", ("char", )), ("Higher_taxa", ("char", )), ("Higher_taxon", ("char", )), ("Measurement", ("double", )), ("Effort_corrected_measurement", ("double", )) ] engine.table = table if not os.path.isfile(engine.format_filename(filename)): engine.download_files_from_archive(self.urls["PREDICTS"], [filename], "zip", False, "download.zip") engine.create_table() engine.insert_data_from_file(engine.format_filename(str(filename)))
def download(self, engine=None, debug=False):
    try:
        Script.download(self, engine, debug)
        engine = self.engine

        # Species table
        table = Table("species", cleanup=Cleanup(), contains_pk=True,
                      header_rows=6)
        table.columns = [("species_id", ("pk-int",)), ("AOU", ("int",)),
                         ("english_common_name", ("char", 50)),
                         ("french_common_name", ("char", 50)),
                         ("spanish_common_name", ("char", 50)),
                         ("sporder", ("char", 30)), ("family", ("char", 30)),
                         ("genus", ("char", 30)), ("species", ("char", 50))]
        table.fixed_width = [7, 6, 51, 51, 51, 51, 51, 51, 50]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_url(self.urls["species"])

        # Routes table
        if not os.path.isfile(engine.format_filename("routes_new.csv")):
            engine.download_files_from_archive(self.urls["routes"],
                                               ["routes.csv"])
            read = open(engine.format_filename("routes.csv"), "r")
            write = open(engine.format_filename("routes_new.csv"), "w")
            print("Cleaning routes data...")
            write.write(read.readline())  # copy the header line unchanged
            for line in read:
                values = line.split(',')
                # values in column 6 should be negative; flip positive ones
                v = Decimal(values[5])
                if v > 0:
                    values[5] = str(v * Decimal("-1"))
                write.write(','.join(str(value) for value in values))
            write.close()
            read.close()
        engine.auto_create_table(Table("routes", cleanup=Cleanup()),
                                 filename="routes_new.csv")
        engine.insert_data_from_file(
            engine.format_filename("routes_new.csv"))

        # Weather table
        if not os.path.isfile(engine.format_filename("weather_new.csv")):
            engine.download_files_from_archive(self.urls["weather"],
                                               ["weather.csv"])
            read = open(engine.format_filename("weather.csv"), "r")
            write = open(engine.format_filename("weather_new.csv"), "w")
            print("Cleaning weather data...")
            for line in read:
                values = line.split(',')
                newvalues = []
                for value in values:
                    if ':' in value:
                        newvalues.append(value.replace(':', ''))
                    elif value == "N":
                        newvalues.append(None)
                    else:
                        newvalues.append(value)
                write.write(','.join(str(value) for value in newvalues))
            write.close()
            read.close()
        engine.auto_create_table(Table("weather", pk="RouteDataId",
                                       cleanup=Cleanup()),
                                 filename="weather_new.csv")
        engine.insert_data_from_file(
            engine.format_filename("weather_new.csv"))

        # Region_codes table
        table = Table("region_codes", pk=False, header_rows=11,
                      fixed_width=[11, 11, 30])

        def regioncodes_cleanup(value, engine):
            # transliterate accented vowels so the values load cleanly
            replace = {chr(225): "a", chr(233): "e",
                       chr(237): "i", chr(243): "o"}
            newvalue = str(value)
            for key in replace.keys():
                if key in newvalue:
                    newvalue = newvalue.replace(key, replace[key])
            return newvalue

        table.cleanup = Cleanup(regioncodes_cleanup)
        table.columns = [("countrynum", ("int",)),
                         ("regioncode", ("int",)),
                         ("regionname", ("char", 30))]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_url(self.urls["region_codes"])

        # Counts table
        table = Table("counts", pk=False, delimiter=',')
        table.columns = ([("RouteDataID", ("int",)),
                          ("countrynum", ("int",)),
                          ("statenum", ("int",)),
                          ("Route", ("int",)),
                          ("RPID", ("int",)),
                          ("year", ("int",)),
                          ("AOU", ("int",))]
                         # Stop1 through Stop50
                         + [("Stop%s" % i, ("int",)) for i in range(1, 51)])
        engine.table = table
        engine.create_table()
        for part in range(1, 11):
            part = str(part)
            try:
                print("Inserting data from part " + part + "...")
                try:
                    # fast path: bulk insert without value cleaning
                    engine.table.cleanup = Cleanup()
                    engine.insert_data_from_archive(
                        self.urls["counts"] + "Fifty" + part + ".zip",
                        ["fifty" + part + ".csv"])
                except:
                    # slow path: roll back and re-insert with null cleaning
                    print("Failed bulk insert on " + part +
                          ", inserting manually.")
                    engine.connection.rollback()
                    engine.table.cleanup = Cleanup(correct_invalid_value,
                                                   nulls=['*'])
                    engine.insert_data_from_archive(
                        self.urls["counts"] + "Fifty" + part + ".zip",
                        ["fifty" + part + ".csv"])
            except:
                print("There was an error in part " + part + ".")
                raise
    except zipfile.BadZipfile:
        print("There was an unexpected error in the Breeding Bird Survey "
              "archives.")
        raise
    return engine
def download(self, engine=None, debug=False):
    Script.download(self, engine, debug)
    # Python 2 removes sys.setdefaultencoding at startup; reload(sys)
    # restores it there. reload is not a builtin on Python 3, where the
    # default encoding is already UTF-8.
    try:
        reload(sys)
    except NameError:
        pass
    if hasattr(sys, 'setdefaultencoding'):
        sys.setdefaultencoding("utf-8")
    self.engine.download_file(self.urls["GWDD"], "GlobalWoodDensityDatabase.xls")
    filename = "GlobalWoodDensityDatabase.xls"
    book = xlrd.open_workbook(self.engine.format_filename(filename))
    sh = book.sheet_by_index(1)
    rows = sh.nrows

    # Creating data files
    file_path = self.engine.format_filename("gwdd_data.csv")
    gwdd_data = open_fw(file_path)
    csv_writer = open_csvw(gwdd_data)
    csv_writer.writerow(["Number", "Family", "Binomial", "Wood_Density",
                         "Region", "Reference_Number"])
    for index in range(1, rows):
        row = sh.row(index)
        # Get each row and format the cell value.
        row_as_list = [to_str(column_value.value) for column_value in row]
        csv_writer.writerow(row_as_list)
    gwdd_data.close()

    table = Table("data", delimiter=",")
    table.columns = [("Number", ("pk-int",)),
                     ("Family", ("char",)),
                     ("Binomial", ("char",)),
                     ("Wood_Density", ("double",)),
                     ("Region", ("char",)),
                     ("Reference_Number", ("int",))]
    table.pk = 'Number'
    table.contains_pk = True
    self.engine.table = table
    self.engine.create_table()
    # file_path is already a full path, so it must not be passed through
    # format_filename() a second time.
    self.engine.insert_data_from_file(file_path)

    # Creating the reference table file
    file_path = self.engine.format_filename("gwdd_ref.csv")
    ref_file = open_fw(file_path)
    csv_writerd = open_csvw(ref_file)
    csv_writerd.writerow(["Reference_Number", "Reference"])
    sh = book.sheet_by_index(2)
    rows = sh.nrows
    for index in range(1, rows):
        row = sh.row(index)
        # Get each row and format the cell value.
        row_as_list = [to_str(column_value.value, object_encoding=sys.stdout)
                       for column_value in row]
        csv_writerd.writerow(row_as_list)
    ref_file.close()

    table = Table("reference", delimiter=",")
    table.columns = [("Reference_Number", ("pk-int",)), ("Reference", ("char",))]
    table.pk = 'Reference_Number'
    table.contains_pk = True
    self.engine.table = table
    self.engine.create_table()
    self.engine.insert_data_from_file(file_path)
    return self.engine
def download(self, engine=None, debug=False):
    Script.download(self, engine, debug)
    self.engine.download_file(self.urls["GWDD"], "GlobalWoodDensityDatabase.xls")
    filename = "GlobalWoodDensityDatabase.xls"
    book = xlrd.open_workbook(self.engine.format_filename(filename))
    sh = book.sheet_by_index(1)
    rows = sh.nrows

    def format_value(s):
        s = Excel.cell_value(s)
        return str(s).title().replace("\\", "/").replace('"', '')

    # Creating data table
    lines = []
    for i in range(1, rows):
        row = sh.row(i)
        if not all(Excel.empty_cell(cell) for cell in row):
            this_line = {}
            for num, label in enumerate(["Number", "Family", "Binomial",
                                         "Wood_Density", "Region",
                                         "Reference_Number"]):
                this_line[label] = format_value(row[num])
            lines.append(this_line)

    table = Table("data", delimiter="\t")
    table.columns = [("Number", ("pk-int",)),
                     ("Family", ("char",)),
                     ("Binomial", ("char",)),
                     ("Wood_Density", ("double",)),
                     ("Region", ("char",)),
                     ("Reference_Number", ("int",))]
    table.pk = 'Number'
    table.contains_pk = True

    gwdd = []
    for line in lines:
        gwdd_data = [line["Number"], line["Family"], line["Binomial"],
                     line["Wood_Density"], line["Region"],
                     line["Reference_Number"]]
        gwdd.append(gwdd_data)
    data = ['\t'.join(gwdd_line) for gwdd_line in gwdd]
    self.engine.table = table
    self.engine.create_table()
    self.engine.add_to_table(data)

    # Creating reference table
    lines = []
    sh = book.sheet_by_index(2)
    rows = sh.nrows
    for i in range(1, rows):
        row = sh.row(i)
        if not all(Excel.empty_cell(cell) for cell in row):
            this_line = {}
            for num, label in enumerate(["Reference_Number", "Reference"]):
                this_line[label] = format_value(row[num])
            lines.append(this_line)

    table = Table("reference", delimiter="\t")
    table.columns = [("Reference_Number", ("pk-int",)), ("Reference", ("char",))]
    table.pk = 'Reference_Number'
    table.contains_pk = True

    gwdd = []
    for line in lines:
        gwdd_ref = [line["Reference_Number"], line["Reference"]]
        gwdd.append(gwdd_ref)
    data = ['\t'.join(gwdd_line) for gwdd_line in gwdd]
    self.engine.table = table
    self.engine.create_table()
    self.engine.add_to_table(data)
    return self.engine
def download(self, engine=None, debug=False):
    try:
        Script.download(self, engine, debug)
        engine = self.engine

        # Species table
        table = Table("species", cleanup=Cleanup(), contains_pk=True, header_rows=11)
        table.columns = [("species_id", ("pk-int",)),
                         ("AOU", ("int",)),
                         ("english_common_name", ("char", 50)),
                         ("french_common_name", ("char", 50)),
                         ("spanish_common_name", ("char", 50)),
                         ("sporder", ("char", 30)),
                         ("family", ("char", 30)),
                         ("genus", ("char", 30)),
                         ("species", ("char", 50))]
        table.fixed_width = [7, 6, 51, 51, 51, 51, 51, 51, 50]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_url(self.urls["species"])

        # Routes table
        engine.download_files_from_archive(self.urls["routes"], ["routes.csv"],
                                           archive_name="routes.zip")
        engine.auto_create_table(Table("routes", cleanup=Cleanup()),
                                 filename="routes.csv")
        engine.insert_data_from_file(engine.format_filename("routes.csv"))

        # Weather table
        engine.download_files_from_archive(self.urls["weather"], ["weather.csv"],
                                           archive_name="weather.zip")
        engine.auto_create_table(Table("weather", pk="RouteDataId",
                                       cleanup=self.cleanup_func_table),
                                 filename="weather.csv")
        engine.insert_data_from_file(engine.format_filename("weather.csv"))

        # Migrations data
        engine.download_files_from_archive(self.urls["migrants"],
                                           archive_name="MigrantNonBreeder.zip")
        engine.extract_zip(
            engine.format_filename("MigrantNonBreeder/Migrants.zip"),
            engine.format_filename("Migrant"),
        )
        engine.extract_zip(
            engine.format_filename("MigrantNonBreeder/MigrantSummary.zip"),
            engine.format_filename("MigrantSummary"),
        )

        table = Table("migrants", cleanup=Cleanup())
        table.columns = [('routedataid', ('int',)),
                         ('countrynum', ('int',)),
                         ('statenum', ('int',)),
                         ('route', ('int',)),
                         ('rpid', ('int',)),
                         ('year', ('int',)),
                         ('aou', ('int',))]
        # The remaining 50 columns are the per-stop counts.
        table.columns += [('stop%d' % i, ('int',)) for i in range(1, 51)]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_file(
            engine.format_filename("Migrant/Migrants.csv"))

        table = Table("migrantsummary", cleanup=Cleanup())
        table.columns = [('routedataid', ('int',)),
                         ('countrynum', ('int',)),
                         ('statenum', ('int',)),
                         ('route', ('int',)),
                         ('rpid', ('int',)),
                         ('year', ('int',)),
                         ('aou', ('int',)),
                         ('count10', ('int',)),
                         ('count20', ('int',)),
                         ('count30', ('int',)),
                         ('count40', ('int',)),
                         ('count50', ('int',)),
                         ('stoptotal', ('int',)),
                         ('speciestotal', ('int',))]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_file(
            engine.format_filename("MigrantSummary/MigrantSummary.csv"))

        table = Table("vehicledata", cleanup=Cleanup())
        table.columns = [('routedataid', ('int',)),
                         ('countrynum', ('int',)),
                         ('statenum', ('int',)),
                         ('route', ('int',)),
                         ('rpid', ('int',)),
                         ('year', ('int',)),
                         ('recordedcar', ('char',))]
        # 50 per-stop car counts followed by 50 per-stop noise counts.
        table.columns += [('car%d' % i, ('int',)) for i in range(1, 51)]
        table.columns += [('noise%d' % i, ('int',)) for i in range(1, 51)]
        engine.table = table
        engine.create_table()
        engine.download_files_from_archive(self.urls["Vehicledata"],
                                           archive_name="VehicleData.zip")
        engine.extract_zip(
            engine.format_filename("VehicleData/VehicleData.zip"),
            engine.format_filename("VehicleData"),
        )
        engine.insert_data_from_file(
            engine.format_filename("VehicleData/VehicleData.csv"))

        # Counts table
        table = Table("counts", pk=False, delimiter=',')
        engine.download_files_from_archive(self.urls["counts"],
                                           archive_name="50-StopData.zip")
        table.columns = [("RouteDataID", ("int",)),
                         ("countrynum", ("int",)),
                         ("statenum", ("int",)),
                         ("Route", ("int",)),
                         ("RPID", ("int",)),
                         ("year", ("int",)),
                         ("AOU", ("int",))]
        table.columns += [("Stop%d" % i, ("int",)) for i in range(1, 51)]
        engine.table = table
        engine.create_table()
        for part in range(1, 11):
            part = str(part)
            try:
                print("Inserting data from part " + part + "...")
                try:
                    # The fifty-stop files live under
                    # 50-StopData/1997ToPresent_SurveyWide/ in the archive.
                    engine.table.cleanup = Cleanup()
                    engine.extract_zip(
                        engine.format_filename(
                            "50-StopData/1997ToPresent_SurveyWide/Fifty" + part + ".zip"),
                        engine.format_filename("fifty" + part + ".csv"),
                    )
                except Exception:
                    print("fifty{}: failed bulk insert, inserting manually."
                          .format(part))
                    engine.connection.rollback()
                    engine.table.cleanup = self.cleanup_func_clean
                    engine.insert_data_from_archive(
                        self.urls["counts"] + "Fifty" + part + ".zip",
                        ["fifty" + part + ".csv"])
            except Exception:
                print("There was an error in part " + part + ".")
                raise
    except zipfile.BadZipfile:
        print("There was an unexpected error in the Breeding Bird Survey archives.")
        raise
    return engine
def download(self, engine=None, debug=False): Script.download(self, engine, debug) engine = self.engine # Complete Plants Checklist file_name = "complete_plant_checklist.csv" table_name = "complete_plant_checklist" complete_plant_url = "https://plants.sc.egov.usda.gov/java/downloadData?fileName=plantlst.txt&static=true" self.engine.download_file(complete_plant_url, filename=file_name) data_path = self.engine.format_filename(file_name) table = Table(table_name, delimiter=",") table.columns = [ ("symbol", ("char", "7")), ("synonym_symbol", ("char", "7")), ("scientific_name_with_author", ("char", "183")), ("common_name", ("char", "42")), ("family", ("char", "30")), ] self.engine.auto_create_table(table, filename=file_name) self.engine.insert_data_from_file(data_path) # Symbols for Unknown Plants file_name = "symbols_unknown_plants.csv" table_name = "unknown_plants" unknown_plants_url = "https://plants.sc.egov.usda.gov/Data/unknown_plants.txt" self.engine.download_file(unknown_plants_url, filename=file_name) data_path = self.engine.format_filename(file_name) table = Table(table_name, delimiter=",") table.columns = [("symbol", ("char", "7")), ("common_name", ("char", "56"))] self.engine.auto_create_table(table, filename=file_name) self.engine.insert_data_from_file(data_path) # State PLANTS Checklist base_url = "https://plants.sc.egov.usda.gov/" state_plant_checklist_base_url = "{base}java/stateDownload?statefips={id}" state_plant_checklist_file = "all_state_plant_checklist.csv" table_name = "state_plant_checklist" state_plant_checklist = [ ("US01", "Alabama", "US"), ("US02", "Alaska", "US"), ("US05", "Arkansas", "US"), ("US04", "Arizona", "US"), ("US06", "California", "US"), ("US08", "Colorado", "US"), ("US09", "Connecticut", "US"), ("US10", "Delaware", "US"), ("US11", "District of Columbia", "US"), ("US12", "Florida", "US"), ("US13", "Georgia", "US"), ("US15", "Hawaii", "US"), ("US16", "Idaho", "US"), ("US17", "Illinois", "US"), ("US18", "Indiana", "US"), ("US19", "Iowa", "US"), ("US20", "Kansas", "US"), ("US21", "Kentucky", "US"), ("US22", "Louisiana", "US"), ("US23", "Maine", "US"), ("US24", "Maryland", "US"), ("US25", "Massachusetts", "US"), ("US26", "Michigan", "US"), ("US27", "Minnesota", "US"), ("US28", "Mississippi", "US"), ("US29", "Missouri", "US"), ("US30", "Montana", "US"), ("US31", "Nebraska", "US"), ("US32", "Nevada", "US"), ("US33", "New Hampshire", "US"), ("US34", "New Jersey", "US"), ("US35", "New Mexico", "US"), ("US36", "New York", "US"), ("US37", "North Carolina", "US"), ("US38", "North Dakota", "US"), ("US39", "Ohio", "US"), ("US40", "Oklahoma", "US"), ("US41", "Oregon", "US"), ("US42", "Pennsylvania", "US"), ("US44", "Rhode Island", "US"), ("US45", "South Carolina", "US"), ("US46", "South Dakota", "US"), ("US47", "Tennessee", "US"), ("US48", "Texas", "US"), ("US49", "Utah", "US"), ("US50", "Vermont", "US"), ("US51", "Virginia", "US"), ("US53", "Washington", "US"), ("US54", "West Virginia", "US"), ("US55", "Wisconsin", "US"), ("US56", "Wyoming", "US"), ("US72", "Puerto Rico", "US"), ("US78", "Virgin Islands", "US"), ("CA01", "Alberta", "Canada"), ("CA02", "British Columbia", "Canada"), ("CA03", "Manitoba", "Canada"), ("CA04", "New Brunswick", "Canada"), ("CALB", "Labrador", "Canada"), ("CANF", "Newfoundland", "Canada"), ("CA13", "Northwest Territories", "Canada"), ("CA07", "Nova Scotia", "Canada"), ("CA14", "Nunavut", "Canada"), ("CA08", "Ontario", "Canada"), ("CA09", "Prince Edward Island", "Canada"), ("CA10", "Québec", "Canada"), ("CA11", "Saskatchewan", "Canada"), 
("CA12", "Yukon", "Canada"), ("GL", "Greenland", "Denmark"), ("SB", "St. Pierre and Miquelon", "France"), ] with open_fw(engine.format_filename( state_plant_checklist_file)) as write_object: csv_writer = open_csvw(write_object) for state_info in state_plant_checklist: file_name = state_info[1].replace(".", "").replace( " ", "_").lower() + ".csv" file_name = "old_state_plant_checklist_" + file_name state_url = state_plant_checklist_base_url.format( base=base_url, id=state_info[0]) self.engine.download_file(state_url, filename=file_name) with open_fr(engine.format_filename(file_name)) as read_object: # Read state file and only write the data minus header next(read_object) for row in csv.reader(read_object, delimiter=","): csv_writer.writerow([state_info[2]] + [state_info[1]] + row) data_path = self.engine.format_filename(state_plant_checklist_file) table = Table(table_name, delimiter=",", header_rows=0) table.columns = [ ("country", ("char", "7")), ("state", ("char", "23")), ("symbol", ("char", "7")), ("synonym_symbol", ("char", "7")), ("scientific_name_with_author", ("char", "183")), ("national_common_name", ("char", "42")), ("family", ("char", "17")), ] self.engine.auto_create_table(table, filename=state_plant_checklist_file) self.engine.insert_data_from_file(data_path) # NRCS State GSAT Lists base_url = "https://www.plants.usda.gov/" nrcs_state_gsat_base_url = "{base}java/gsatDownload?gsatid={id}" nrcs_state_gsat_file = "all_nrcs_state_gsat.csv" table_name = "nrcs_state_gsat" nrcs_state_gsat = [ ("Alabama", "2"), ("Alaska", ""), ("Arkansas", ""), ("Arizona", "2"), ("California", ""), ("Colorado", ""), ("Connecticut", ""), ("Delaware", ""), ("Florida", ""), ("Georgia", ""), ("Hawaii", ""), ("Idaho", "9"), ("Illinois", ""), ("Indiana", ""), ("Iowa ", ""), ("Kansas", "6"), ("Kentucky", ""), ("Louisiana", "16"), ("Maine", ""), ("Maryland", ""), ("Massachusetts", ""), ("Michigan", ""), ("Minnesota", "11"), ("Mississippi", ""), ("Missouri", "14"), ("Montana", ""), ("Nebraska", "17"), ("Nevada", "4"), ("New Hampshire", ""), ("New Jersey ", ""), ("New Mexico", "1"), ("New York", ""), ("Noth Carolina", ""), ("North Dakota", "5"), ("Ohio", ""), ("Oklahoma", "12"), ("Oregon", "3"), ("Pennsylvania", "15"), ("Rhode Island", ""), ("South Carolina", ""), ("South Dakota", "7"), ("Tennessee", ""), ("Texas", "13"), ("Utah", ""), ("Vermont ", ""), ("Virginia", ""), ("Washington", "8"), ("West Virginia", ""), ("Wisconsin", ""), ("Wyoming", "10"), ] with open_fw( engine.format_filename(nrcs_state_gsat_file)) as write_object: for state_info in nrcs_state_gsat: if state_info[1]: # skip states with no data ("state", ""), file_name = state_info[0].replace(" ", "_").replace( ".", "").lower() + ".csv" file_name = "old_nrcs_state_gsat_" + file_name state_url = nrcs_state_gsat_base_url.format( base=base_url, id=state_info[1]) self.engine.download_file(state_url, filename=file_name) with open_fr( engine.format_filename(file_name)) as read_object: # Read state file and only write the data minus header next(read_object) state_quoted = '"{state}",'.format(state=state_info[0]) for line in read_object: write_object.write(state_quoted + line) data_path = self.engine.format_filename(nrcs_state_gsat_file) table = Table(table_name, delimiter=",", header_rows=0) table.columns = [ ("state", ("char", "12")), ("symbol", ("char", "7")), ("scientific_name_with_author", ("char", "183")), ("gsat_common_name", ("char", "93")), ] self.engine.auto_create_table(table, filename=nrcs_state_gsat_file) 
self.engine.insert_data_from_file(data_path) base_url = "https://plants.sc.egov.usda.gov/" nrcs_state_plant_lists_url = "{base}java/nrcsStateDownload?statefips={id}" nrcs_state_plant_file = "all_nrcs_state_plant.csv" table_name = "nrcs_state_plant" nrcs_state_plant_lists = [ ("01", "Alabama"), ("02", "Alaska"), ("05", "Arkansas"), ("04", "Arizona"), ("06", "California"), ("08", "Colorado"), ("09", "Connecticut"), ("10", "Delaware"), ("12", "Florida"), ("13", "Georgia"), ("15", "Hawaii"), ("16", "Idaho"), ("17", "Illinois"), ("18", "Indiana"), ("19", "Iowa"), ("20", "Kansas"), ("21", "Kentucky"), ("22", "Louisiana"), ("23", "Maine"), ("24", "Maryland"), ("25", "Massachusetts"), ("26", "Michigan"), ("27", "Minnesota"), ("28", "Mississippi"), ("29", "Missouri"), ("30", "Montana"), ("31", "Nebraska"), ("32", "Nevada"), ("33", "New Hampshire"), ("34", "New Jersey"), ("35", "New Mexico"), ("36", "New York"), ("37", "North Carolina"), ("38", "North Dakota"), ("39", "Ohio"), ("40", "Oklahoma"), ("41", "Oregon"), ("42", "Pennsylvania"), ("44", "Rhode Island"), ("45", "South Carolina"), ("46", "South Dakota"), ("47", "Tennessee"), ("48", "Texas"), ("49", "Utah"), ("50", "Vermont"), ("51", "Virginia"), ("53", "Washington"), ("54", "West Virginia"), ("55", "Wisconsin"), ("56", "Wyoming"), ("72", "Puerto Rico"), ("78", "Virgin Islands"), ] with open_fw( engine.format_filename(nrcs_state_plant_file)) as write_object: for state_info in nrcs_state_plant_lists: file_name = state_info[1].replace(" ", "_").replace( ".", "").lower() + ".csv" file_name = "old_nrcs_state_plant_" + file_name state_url = nrcs_state_plant_lists_url.format(base=base_url, id=state_info[0]) self.engine.download_file(state_url, filename=file_name) with open_fr(engine.format_filename(file_name)) as read_object: # Read state file and only write the data minus header next(read_object) state_quoted = '"{state}",'.format(state=state_info[1]) for line in read_object: write_object.write(state_quoted + line) data_path = self.engine.format_filename(nrcs_state_plant_file) table = Table(table_name, delimiter=",", header_rows=0) table.columns = [ ("state", ("char", "17")), ("symbol", ("char", "7")), ("synonym_symbol", ("char", "7")), ("scientific_name_with_author", ("char", "183")), ("state_common_name", ("char", "42")), ("family", ("char", "17")), ] self.engine.auto_create_table(table, filename=nrcs_state_plant_file) self.engine.insert_data_from_file(data_path)
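# Each per-state download above is folded into one national CSV by writing
# the state (and, for the checklist, the country) in front of every data
# row. The same step in isolation, with illustrative file names:
import csv

def append_state_rows(state, src_path, writer):
    # Copy src_path into writer, skipping the per-file header and
    # prefixing each row with the state name.
    with open(src_path, newline="") as src:
        reader = csv.reader(src)
        next(reader)  # drop the header
        for row in reader:
            writer.writerow([state] + row)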
def download(self, engine=None, debug=False):
    Script.download(self, engine, debug)
    engine = self.engine
    engine.download_files_from_archive(self.urls["capture"], archive_type="zip")

    # Convert xlsx to csv.
    xlsx_file = self.engine.format_filename("DSD_FI_CAPTURE.xlsx")
    file_path = self.engine.format_filename("DSD_CAPTURE.csv")
    book = xlrd.open_workbook(xlsx_file)
    sh = book.sheet_by_index(0)
    rows = sh.nrows

    # Creating data files
    new_data = open_fw(file_path)
    csv_writer = open_csvw(new_data)
    csv_writer.writerow(["Order", "Concept_id", "Role_Type",
                         "Codelist_id", "Codelist_Code_id", "Description"])
    # Data starts at index 2; get each row and format the cell value.
    for index in range(2, rows):
        row = sh.row(index)
        row_as_list = [to_str(column_value.value) for column_value in row]
        csv_writer.writerow(row_as_list)
    new_data.close()

    file_names = [
        ('CL_FI_UNIT.csv', 'unit_data'),
        ('CL_FI_WATERAREA_GROUPS.csv', 'waterarea_groups'),
        ('DSD_CAPTURE.csv', 'dsd_capture_data'),
        ('CL_FI_SPECIES_GROUPS.csv', 'species_group')
    ]
    for (filename, tablename) in file_names:
        data_path = self.engine.format_filename(filename)
        table = Table(tablename, delimiter=',', cleanup=self.cleanup_func_table)
        self.engine.auto_create_table(table, filename=filename)
        self.engine.insert_data_from_file(data_path)

    # CL_FI_COUNTRY_GROUPS.csv mixes encodings, so its columns are
    # declared explicitly rather than inferred.
    file_names_encoded = [
        ('CL_FI_COUNTRY_GROUPS.csv', 'country_groups'),
    ]
    for (filename, tablename) in file_names_encoded:
        data_path = self.engine.format_filename(filename)
        table = Table(tablename, delimiter=',', cleanup=self.cleanup_func_table)
        table.columns = [('UN_Code', ('int',)),
                         ('Identifier', ('int',)),
                         ('ISO2_Code', ('char', '5')),
                         ('ISO3_Code', ('char', '5')),
                         ('Name_En', ('char', '50')),
                         ('Name_Fr', ('char', '50')),
                         ('Name_Es', ('char', '50')),
                         ('Name_Ar', ('char', '120')),
                         ('Name_Cn', ('char', '90')),
                         ('Name_Ru', ('char', '150')),
                         ('Official_Name_En', ('char', '70')),
                         ('Official_Name_Fr', ('char', '70')),
                         ('Official_Name_Es', ('char', '70')),
                         ('Official_Name_Ar', ('char', '1100')),
                         ('Official_Name_Cn', ('char', '70')),
                         ('Official_Name_Ru', ('char', '130')),
                         ('Continent_Group', ('char', '15')),
                         ('EcoClass_Group', ('char', '50')),
                         ('GeoRegion_Group', ('char', '30'))]
        self.engine.auto_create_table(table, filename=filename)
        self.engine.insert_data_from_file(data_path)

    # TS_FI_CAPTURE.csv also needs explicit column definitions.
    file_names_encoded = [
        ('TS_FI_CAPTURE.csv', 'ts_capture_data'),
    ]
    for (filename, tablename) in file_names_encoded:
        data_path = self.engine.format_filename(filename)
        table = Table(tablename, delimiter=',', cleanup=self.cleanup_func_table)
        table.columns = [('COUNTRY', ('int',)),
                         ('FISHING_AREA', ('int',)),
                         ('SPECIES', ('char', '10')),
                         ('YEAR', ('int',)),
                         ('UNIT', ('char', '5')),
                         ('QUANTITY', ('double',)),
                         ('SYMBOL', ('char', '4'))]
        self.engine.auto_create_table(table, filename=filename)
        self.engine.insert_data_from_file(data_path)
def download(self, engine=None, debug=False): Script.download(self, engine, debug) engine = self.engine engine.download_files_from_archive(self.urls["data"], ["Data_Files/Amniote_Database_Aug_2015.csv", "Data_Files/Amniote_Database_References_Aug_2015.csv", "Data_Files/Amniote_Range_Count_Aug_2015.csv"], filetype="zip") ct_column = 'trait' # all tables use the same ct_column name # Create tables from Amniote_Database_Aug.csv and Amniote_Database_References_Aug_2015.csv # Both reference and main have the same headers ct_names = ['female_maturity_d', 'litter_or_clutch_size_n', 'litters_or_clutches_per_y', 'adult_body_mass_g', 'maximum_longevity_y', 'gestation_d', 'weaning_d', 'birth_or_hatching_weight_g', 'weaning_weight_g', 'egg_mass_g', 'incubation_d', 'fledging_age_d', 'longevity_y', 'male_maturity_d', 'inter_litter_or_interbirth_interval_y', 'female_body_mass_g', 'male_body_mass_g', 'no_sex_body_mass_g', 'egg_width_mm', 'egg_length_mm', 'fledging_mass_g', 'adult_svl_cm', 'male_svl_cm', 'female_svl_cm', 'birth_or_hatching_svl_cm', 'female_svl_at_maturity_cm', 'female_body_mass_at_maturity_g', 'no_sex_svl_cm', 'no_sex_maturity_d'] # Create table main from Amniote_Database_Aug_2015.csv columns = [ ('record_id', ('pk-auto',)), ('class', ('char', '20')), ('order', ('char', '20')), ('family', ('char', '20')), ('genus', ('char', '20')), ('species', ('char', '50')), ('subspecies', ('char', '20')), ('common_name', ('char', '400')), ('trait_value', ('ct-double',))] table_main = Table('main', delimiter=',', cleanup=self.cleanup_func_table) table_main.ct_column = ct_column table_main.ct_names = ct_names table_main.columns = columns engine.auto_create_table(table_main, filename="Amniote_Database_Aug_2015.csv") engine.insert_data_from_file(engine.format_filename("Amniote_Database_Aug_2015.csv")) # Create table reference from Amniote_Database_References_Aug_2015.csv reference_columns = [ ('record_id', ('pk-auto',)), ('class', ('char', '20')), ('order', ('char', '20')), ('family', ('char', '20')), ('genus', ('char', '20')), ('species', ('char', '50')), ('subspecies', ('char', '20')), ('common_name', ('char', '400')), ('reference', ('ct-char',))] table_references = Table('references', delimiter=',', cleanup=self.cleanup_func_table) table_references.ct_column = ct_column table_references.ct_names = ct_names table_references.columns = reference_columns engine.auto_create_table(table_references, filename="Amniote_Database_References_Aug_2015.csv") engine.insert_data_from_file(engine.format_filename("Amniote_Database_References_Aug_2015.csv")) # Create table Range # This table has different values for headers from the above tables. 
range_ct_names = ["min_female_maturity", "max_female_maturity", "count_female_maturity", "min_litter_clutch_size", "max_litter_clutch_size", "count_litter_clutch_size", "min_litters_clutches", "max_litters_clutches", "count_litters_clutches", "min_adult_body_mass", "max_adult_body_mass", "count_adult_body_mass", "min_maximum_longevity", "max_maximum_longevity", "count_maximum_longevity", "min_gestation", "max_gestation", "count_gestation", "min_weaning", "max_weaning", "count_weaning", "min_birth_hatching_weight", "max_birth_hatching_weight", "count_birth_hatching_weight", "min_weaning_weight", "max_weaning_weight", "count_weaning_weight", "min_egg_mass", "max_egg_mass", "count_egg_mass", "min_incubation", "max_incubation", "count_incubation", "min_fledging_age", "max_fledging_age", "count_fledging_age", "min_male_maturity", "max_male_maturity", "count_male_maturity", "min_inter_litter_interbirth_interval", "max_inter_litter_interbirth_interval", "count_inter_litter_interbirth_interval", "min_female_body_mass", "max_female_body_mass", "count_female_body_mass", "min_male_body_mass", "max_male_body_mass", "count_male_body_mass", "min_no_sex_body_mass", "max_no_sex_body_mass", "count_no_sex_body_mass", "min_egg_width", "max_egg_width", "count_egg_width", "min_egg_length", "max_egg_length", "count_egg_length", "min_fledging_mass", "max_fledging_mass", "count_fledging_mass", "min_adult_svl", "max_adult_svl", "count_adult_svl", "min_male_svl", "max_male_svl", "count_male_svl", "min_female_svl", "max_female_svl", "count_female_svl", "min_hatching_svl", "max_hatching_svl", "count_hatching_svl", "min_female_svl_at_maturity", "max_female_svl_at_maturity", "count_female_svl_at_maturity", "min_female_body_mass_at_maturity", "max_female_body_mass_at_maturity", "count_female_body_mass_at_maturity", "min_no_sex_svl", "max_no_sex_svl", "count_no_sex_svl", "min_no_sex_maturity", "max_no_sex_maturity", "count_no_sex_maturity"] range_columns = [ ('record_id', ('pk-auto',)), ('classx', ('char', '20')), ('orderx', ('char', '20')), ('familyx', ('char', '20')), ('genus', ('char', '20')), ('species', ('char', '50')), ('subspecies', ('char', '20')), ('common_name', ('char', '400')), ('trait_value', ('ct-double',))] table_range = Table('range', delimiter=',', cleanup=self.cleanup_func_table) table_range.ct_column = ct_column table_range.ct_names = range_ct_names table_range.columns = range_columns engine.auto_create_table(table_range, filename="Amniote_Range_Count_Aug_2015.csv") engine.insert_data_from_file(engine.format_filename("Amniote_Range_Count_Aug_2015.csv"))
def download(self, engine=None, debug=False):
    Script.download(self, engine, debug)
    engine = self.engine
    csv_files = []
    request_src = "http://www.data-retriever.org/"
    base_url = "http://www.usanpn.org/npn_portal/observations/getObservations.xml?start_date={startYear}&end_date={endYear_date}&request_src={request_src}"
    header_values = ["observation_id", "update_datetime", "site_id",
                     "latitude", "longitude", "elevation_in_meters",
                     "state", "species_id", "genus", "species",
                     "common_name", "kingdom", "individual_id",
                     "phenophase_id", "phenophase_description",
                     "observation_date", "day_of_year", "phenophase_status",
                     "intensity_category_id", "intensity_value",
                     "abundance_value"]
    columns = [("record_id", ("pk-auto",)),
               ("observation_id", ("int",)),  # subsequently referred to as "status record"
               ("update_datetime", ("char",)),
               ("site_id", ("int",)),
               ("latitude", ("double",)),
               ("longitude", ("double",)),
               ("elevation_in_meters", ("char",)),
               ("state", ("char",)),
               ("species_id", ("int",)),
               ("genus", ("char",)),
               ("species", ("char",)),
               ("common_name", ("char",)),
               ("kingdom", ("char",)),  # skip kingdom
               ("individual_id", ("char",)),
               ("phenophase_id", ("int",)),
               ("phenophase_description", ("char",)),
               ("observation_date", ("char",)),
               ("day_of_year", ("char",)),
               ("phenophase_status", ("char",)),
               ("intensity_category_id", ("char",)),
               ("intensity_value", ("char",)),
               ("abundance_value", ("char",))]
    start_date = datetime.date(2009, 1, 1)
    end_date = datetime.date.today()
    while start_date < end_date:
        to_date = start_date + datetime.timedelta(90)
        if to_date >= end_date:
            data_url = base_url.format(startYear=str(start_date),
                                       endYear_date=str(end_date),
                                       request_src=request_src)
        else:
            data_url = base_url.format(startYear=str(start_date),
                                       endYear_date=str(to_date),
                                       request_src=request_src)
        xml_file_name = '{}'.format(start_date) + ".xml"
        engine.download_file(data_url, xml_file_name)

        # Create csv files for 3 months
        csv_observation = '{}'.format(start_date) + ".csv"
        csv_files.append(csv_observation)
        csv_buff = open_fw(engine.format_filename(csv_observation))
        csv_writer = open_csvw(csv_buff)
        csv_writer.writerow(header_values)

        # Parse xml to read data.
        # str.strip() removes a set of characters, not a substring, so use
        # replace() to drop the '{dataset}' placeholder from the template path.
        fname = DATA_WRITE_PATH.replace('{dataset}', '') + 'NPN/' + xml_file_name
        with open(fname, 'r') as fp1:
            file_read = fp1.read()
        root = ET.fromstring(file_read)
        for elements in root:
            index_map = {val: i for i, val in enumerate(header_values)}
            diction = sorted(elements.attrib.items(),
                             key=lambda pair: index_map[pair[0]])
            csv_writer.writerow([x[1] for x in diction])
        csv_buff.close()
        start_date = to_date + datetime.timedelta(1)

    # Create table
    table = Table('observations', delimiter=',', pk='record_id', contains_pk=True)
    table.columns = columns
    engine.table = table
    engine.create_table()
    for data_file in csv_files:
        engine.insert_data_from_file(engine.find_file(data_file))
    return engine
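# The request loop above pages through the NPN API in 90-day windows so no
# single response gets too large. That windowing logic, isolated as a
# standalone generator:
import datetime

def date_windows(start, end, days=90):
    # Yield (start, stop) date pairs covering [start, end] in fixed chunks.
    while start < end:
        stop = min(start + datetime.timedelta(days), end)
        yield start, stop
        start = stop + datetime.timedelta(1)

# list(date_windows(datetime.date(2009, 1, 1), datetime.date(2009, 7, 1)))
# -> [(2009-01-01, 2009-04-01), (2009-04-02, 2009-07-01)]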
def download(self, engine=None, debug=False): Script.download(self, engine, debug) self.engine.auto_create_table(Table("sites"), url=self.urls["sites"], filename='gentry_sites.csv') self.engine.insert_data_from_url(self.urls["sites"]) self.engine.download_file(self.urls["stems"], "all_Excel.zip") local_zip = zipfile.ZipFile( self.engine.format_filename("all_Excel.zip")) filelist = local_zip.namelist() local_zip.close() self.engine.download_files_from_archive(self.urls["stems"], filelist) filelist = [os.path.basename(filename) for filename in filelist] # Currently all_Excel.zip is missing CURUYUQU.xls # Download it separately and add it to the file list if not self.engine.find_file('CURUYUQU.xls'): self.engine.download_file( "http://www.mobot.org/mobot/gentry/123/samerica/CURUYUQU.xls", "CURUYUQU.xls") filelist.append('CURUYUQU.xls') lines = [] tax = [] for filename in filelist: print("Extracting data from " + filename + "...") book = xlrd.open_workbook(self.engine.format_filename(filename)) sh = book.sheet_by_index(0) rows = sh.nrows cn = {'stems': []} n = 0 for colnum, c in enumerate(sh.row(0)): if not Excel.empty_cell(c): cid = c.value.lower().strip() # line number column is sometimes named differently if cid in ["sub", "number"]: cid = "line" # the "number of individuals" column is named in various # different ways; they always at least contain "nd" if "nd" in cid: cid = "count" # in QUIAPACA.xls the "number of individuals" column is # misnamed "STEMDBH" just like the stems columns, so weep # for the state of scientific data and then fix manually if filename == "QUIAPACA.xls" and colnum == 13: cid = "count" # if column is a stem, add it to the list of stems; # otherwise, make note of the column name/number if "stem" in cid or "dbh" in cid: cn["stems"].append(n) else: cn[cid] = n n += 1 # sometimes, a data file does not contain a liana or count column if not "liana" in list(cn.keys()): cn["liana"] = -1 if not "count" in list(cn.keys()): cn["count"] = -1 for i in range(1, rows): row = sh.row(i) cellcount = len(row) # make sure the row is real, not just empty cells if not all(Excel.empty_cell(cell) for cell in row): try: this_line = {} # get the following information from the appropriate columns for i in [ "line", "family", "genus", "species", "liana", "count" ]: if cn[i] > -1: if row[cn[i]].ctype != 2: # if the cell type(ctype) is not a number this_line[i] = row[ cn[i]].value.lower().strip().replace( "\\", "/").replace('"', '') else: this_line[i] = row[cn[i]].value if this_line[i] == '`': this_line[i] = 1 this_line["stems"] = [ row[c] for c in cn["stems"] if not Excel.empty_cell(row[c]) ] this_line["site"] = filename[0:-4] # Manually correct CEDRAL data, which has a single line # that is shifted by one to the left starting at Liana if this_line["site"] == "CEDRAL" and type( this_line["liana"]) == float: this_line["liana"] = "" this_line["count"] = 3 this_line["stems"] = [2.5, 2.5, 30, 18, 25] lines.append(this_line) # Check how far the species is identified full_id = 0 if len(this_line["species"]) < 3: if len(this_line["genus"]) < 3: id_level = "family" else: id_level = "genus" else: id_level = "species" full_id = 1 tax.append( (this_line["family"], this_line["genus"], this_line["species"], id_level, str(full_id))) except: raise pass tax = sorted( tax, key=lambda group: group[0] + " " + group[1] + " " + group[2]) unique_tax = [] tax_dict = {} tax_count = 0 # Get all unique families/genera/species print("\n") for group in tax: if not (group in unique_tax): unique_tax.append(group) 
tax_count += 1 tax_dict[group[0:3]] = tax_count if tax_count % 10 == 0: msg = "Generating taxonomic groups: " + str( tax_count) + " / " + str(TAX_GROUPS) sys.stdout.flush() sys.stdout.write(msg + "\b" * len(msg)) print("\n") # Create species table table = Table("species", delimiter=",") table.columns = [("species_id", ("pk-int", )), ("family", ("char", )), ("genus", ("char", )), ("species", ("char", )), ("id_level", ("char", 10)), ("full_id", ("int", ))] data = [[str(tax_dict[group[:3]])] + ['"%s"' % g for g in group] for group in unique_tax] table.pk = 'species_id' table.contains_pk = True self.engine.table = table self.engine.create_table() self.engine.add_to_table(data) # Create stems table table = Table("stems", delimiter=",") table.columns = [("stem_id", ("pk-auto", )), ("line", ("int", )), ("species_id", ("int", )), ("site_code", ("char", 12)), ("liana", ("char", 10)), ("stem", ("double", ))] stems = [] counts = [] for line in lines: try: liana = line["liana"] except KeyError: liana = "" species_info = [ line["line"], tax_dict[(line["family"], line["genus"], line["species"])], line["site"], liana ] try: counts.append( [value for value in species_info + [line["count"]]]) except KeyError: pass for i in line["stems"]: stem = species_info + [str(i)] stems.append(stem) self.engine.table = table self.engine.create_table() self.engine.add_to_table(stems) # Create counts table table = Table("counts", delimiter=",", contains_pk=False) table.columns = [("count_id", ("pk-auto", )), ("line", ("int", )), ("species_id", ("int", )), ("site_code", ("char", 12)), ("liana", ("char", 10)), ("count", ("double", ))] self.engine.table = table self.engine.create_table() self.engine.add_to_table(counts) return self.engine
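# Species in the Gentry data are numbered by sorting the unique
# (family, genus, species) triples and assigning ids in order, as above.
# The bookkeeping reduced to its core:
def assign_ids(groups):
    # Map each unique group to a 1-based id in sorted order.
    return {group: i + 1 for i, group in enumerate(sorted(set(groups)))}

assert assign_ids([("A", "b", "c"), ("A", "a", "x"), ("A", "b", "c")]) == {
    ("A", "a", "x"): 1, ("A", "b", "c"): 2}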
def download(self, engine=None, debug=False): Script.download(self, engine, debug) engine = self.engine # Download both full and abbreviated versions and extract the data files abbrev_version = ["ABBREV.txt"] full_version = [ "DERIV_CD.txt", "FOOTNOTE.txt", "NUTR_DEF.txt", "WEIGHT.txt", "DATA_SRC.txt", "FD_GROUP.txt", "LANGDESC.txt", "NUT_DATA.txt", "DATSRCLN.txt", "FOOD_DES.txt", "LANGUAL.txt", "SRC_CD.txt" ] self.engine.download_files_from_archive(self.urls["full_version"], archive_type="zip", file_names=full_version) self.engine.download_files_from_archive( self.urls["abbreviated_version"], archive_type="zip", file_names=abbrev_version, ) # Convert original txt to csv convert_to_csv(self.engine.format_data_dir()) # FOOD_DES table new_file_name = "food_des.csv" table = Table("food_des", delimiter=",", header_rows=0) table.columns = [ ("ndb_no", ("int", )), ("fdgrp_cd", ("int", )), ("long_desc", ("char", "205")), ("shrt_desc", ("char", "65")), ("comname", ("char", "105")), ("manufacname", ("char", "70")), ("survey", ("char", "1")), ("ref_desc", ("char", "140")), ("refuse", ("double", )), ("sciname", ("char", "67")), ("n_factor", ("double", )), ("pro_factor", ("double", )), ("fat_factor", ("double", )), ("cho_factor", ("double", )), ] self.create_and_install(new_file_name, table) # FdGrp_Cd table new_file_name = "fd_group.csv" table = Table("fd_group", delimiter=",", header_rows=0) table.columns = [("fdgrp_cd", ("int", )), ("fdgrp_desc", ("char", "65"))] self.create_and_install(new_file_name, table) # LANGUAL table new_file_name = "langual.csv" table = Table("langual", delimiter=",", header_rows=0) table.columns = [("ndb_no", ("int", )), ("factor_code", ("char", "5"))] self.create_and_install(new_file_name, table) # LANGDESC Table new_file_name = "langdesc.csv" table = Table("langdesc", delimiter=",", header_rows=0) table.columns = [ ("factor_code", ("char", "5")), ("description", ("char", "145")), ] self.create_and_install(new_file_name, table) # NUT_DATA table new_file_name = "nut_data.csv" missingValues = [ "Unnamed: 6", "Unnamed: 7", "Unnamed: 8", "Unnamed: 9", "Unnamed: 10", "Unnamed: 11", "Unnamed: 12", "Unnamed: 13", "Unnamed: 14", "Unnamed: 15", "Unnamed: 17" ] table = Table( "nut_data", delimiter=",", header_rows=0, missingValues=missingValues, do_not_bulk_insert=True, ) table.columns = [ ("ndb_no", ("int", )), ("nutr_no", ("int", )), ("nutr_val", ("double", )), ("num_data_pts", ("int", )), ("std_error", ("double", )), ("src_cd", ("int", )), ("deriv_cd", ("char", "12")), ("ref_ndb_no", ("double", )), ("add_nutr_mark", ("char", "12")), ("num_studies", ("double", )), ("min", ("double", )), ("max", ("double", )), ("df", ("double", )), ("low_eb", ("double", )), ("up_eb", ("double", )), ("stat_cmt", ("char", "12")), ("addmod_date", ("char", "12")), ("cc", ("char", "12")), ] self.create_and_install(new_file_name, table) # NUTR_DEF table new_file_name = "nutr_def.csv" table = Table("nutr_def", delimiter=",", header_rows=0) table.columns = [ ("nutr_no", ("int", )), ("units", ("char", "10")), ("tagname", ("char", "25")), ("nutrdesc", ("char", "60")), ("num_dec", ("int", )), ("sr_order", ("int", )), ] self.create_and_install(new_file_name, table) # SRC_CD table new_file_name = "src_cd.csv" table = Table("src_cd", delimiter=",", header_rows=0) table.columns = [("src_cd", ("int", )), ("srccd_desc", ("char", "65"))] self.create_and_install(new_file_name, table) # DERIV_CD table new_file_name = "deriv_cd.csv" table = Table("deriv_cd", delimiter=",", header_rows=0) table.columns = [("deriv_cd", 
("char", "5")), ("deriv_desc", ("char", "130"))] self.create_and_install(new_file_name, table) # WEIGHT table new_file_name = "weight.csv" table = Table( "weight", delimiter=",", header_rows=0, missingValues=["Unnamed: 5", "Unnamed: 6"], ) table.columns = [ ("ndb_no", ("int", )), ("seq", ("int", )), ("amount", ("double", )), ("msre_desc", ("char", "130")), ("gm_wgt", ("double", )), ("num_data_pts", ("double", )), ("std_dev", ("double", )), ] self.create_and_install(new_file_name, table) # FOOTNOTE table new_file_name = "footnote.csv" table = Table("footnote", delimiter=",", header_rows=0, missingValues=["Unnamed: 3"]) table.columns = [ ("ndb_no", ("int", )), ("footnt_no", ("int", )), ("footnt_typ", ("char", "2")), ("nutr_no", ("double", )), ("footnt_txt", ("char", "200")), ] self.create_and_install(new_file_name, table) # DATSRCLN table new_file_name = "datsrcln.csv" table = Table("datsrcln", delimiter=",", header_rows=0) table.columns = [ ("ndb_no", ("int", )), ("nutr_no", ("int", )), ("datasrc_id", ("char", "7")), ] self.create_and_install(new_file_name, table) # DATA_SRC table new_file_name = "data_src.csv" table = Table("data_src", delimiter=",", header_rows=0) table.columns = [ ("datasrc_id", ("char", "7")), ("authors", ("char", "257")), ("title", ("char", "257")), ("year", ("char", "5")), ("journal", ("char", "137")), ("vol_city", ("char", "17")), ("issue_state", ("char", "5")), ("start_page", ("char", "5")), ("end_page", ("char", "5")), ] self.create_and_install(new_file_name, table) # ABBREV table new_file_name = "abbrev.csv" table = Table("abbrev", delimiter=",", header_rows=0) table.columns = [ ("ndb_no", ("char", "7")), ("shrt_desc", ("char", "60")), ("water", ("double", )), ("energ_kcal", ("int", )), ("protein", ("double", )), ("lipid_tot", ("double", )), ("ash", ("double", )), ("carbohydrt", ("double", )), ("fiber_td", ("double", )), ("sugar_tot", ("char", "6")), ("calcium", ("int", )), ("iron", ("double", )), ("magnesium", ("int", )), ("phosphorus", ("int", )), ("potassium", ("int", )), ("sodium", ("int", )), ("zinc", ("double", )), ("copper", ("double", )), ("manganese", ("double", )), ("selenium", ("double", )), ("vit_c", ("double", )), ("thiamin", ("double", )), ("riboflavin", ("double", )), ("niacin", ("double", )), ("panto_acid", ("double", )), ("vit_b6", ("double", )), ("folate_tot", ("int", )), ("folic_acid", ("int", )), ("food_folate", ("int", )), ("folate_dfe", ("int", )), ("choline_tot", ("double", )), ("vit_b12", ("double", )), ("vit_a_iu", ("int", )), ("vit_a_rae", ("int", )), ("retinol", ("int", )), ("alpha_carot", ("int", )), ("beta_carot", ("int", )), ("beta_crypt", ("int", )), ("lycopene", ("int", )), ("lut_zea", ("int", )), ("vit_e", ("double", )), ("vit_d_mcg", ("double", )), ("vit_d_iu", ("int", )), ("vit_k", ("double", )), ("fa_sat", ("double", )), ("fa_mono", ("double", )), ("fa_poly", ("double", )), ("cholestrl", ("int", )), ("gmwt_1", ("double", )), ("gmwt_desc1", ("char", "80")), ("gmwt_2", ("double", )), ("gmwt_desc2", ("char", "80")), ("refuse_pct", ("int", )), ] self.create_and_install(new_file_name, table)
def download(self, engine=None, debug=False):
    Script.download(self, engine, debug)
    engine = self.engine
    filename = "database.csv"
    tablename = "predicts_main"
    table = Table(str(tablename), delimiter=',')
    table.columns = [
        ("Source_ID", ("char",)), ("Reference", ("char",)),
        ("Study_number", ("int",)), ("Study_name", ("char",)),
        ("SS", ("char",)), ("Diversity_metric", ("char",)),
        ("Diversity_metric_unit", ("char",)), ("Diversity_metric_type", ("char",)),
        ("Diversity_metric_is_effort_sensitive", ("char",)),
        ("Diversity_metric_is_suitable_for_Chao", ("char",)),
        ("Sampling_method", ("char",)), ("Sampling_effort_unit", ("char",)),
        ("Study_common_taxon", ("char",)), ("Rank_of_study_common_taxon", ("char",)),
        ("Site_number", ("int",)), ("Site_name", ("char",)),
        ("Block", ("char",)), ("SSS", ("char",)),
        ("SSB", ("char",)), ("SSBS", ("char",)),
        ("Sample_start_earliest", ("char",)), ("Sample_end_latest", ("char",)),
        ("Sample_midpoint", ("char",)), ("Sample_date_resolution", ("char",)),
        ("Max_linear_extent_metres", ("double",)),
        ("Habitat_patch_area_square_metres", ("double",)),
        ("Sampling_effort", ("double",)), ("Rescaled_sampling_effort", ("double",)),
        ("Habitat_as_described", ("char",)), ("Predominant_land_use", ("char",)),
        ("Source_for_predominant_land_use", ("char",)), ("Use_intensity", ("char",)),
        ("Km_to_nearest_edge_of_habitat", ("double",)),
        ("Years_since_fragmentation_or_conversion", ("double",)),
        ("Transect_details", ("char",)), ("Coordinates_method", ("char",)),
        ("Longitude", ("double",)), ("Latitude", ("double",)),
        ("Country_distance_metres", ("double",)), ("Country", ("char",)),
        ("UN_subregion", ("char",)), ("UN_region", ("char",)),
        ("Ecoregion_distance_metres", ("double",)), ("Ecoregion", ("char",)),
        ("Biome", ("char",)), ("Realm", ("char",)),
        ("Hotspot", ("char",)), ("Wilderness_area", ("char",)),
        ("N_samples", ("double",)), ("Taxon_number", ("double",)),
        ("Taxon_name_entered", ("char",)), ("Indication", ("char",)),
        ("Parsed_name", ("char",)), ("Taxon", ("char",)),
        ("COL_ID", ("double",)), ("Name_status", ("char",)),
        ("Rank", ("char",)), ("Kingdom", ("char",)),
        ("Phylum", ("char",)), ("Class", ("char",)),
        ("Order", ("char",)), ("Family", ("char",)),
        ("Genus", ("char",)), ("Species", ("char",)),
        ("Best_guess_binomial", ("char",)), ("Higher_taxa", ("char",)),
        ("Higher_taxon", ("char",)), ("Measurement", ("double",)),
        ("Effort_corrected_measurement", ("double",))]
    engine.table = table
    if not os.path.isfile(engine.format_filename(filename)):
        engine.download_files_from_archive(self.urls["PREDICTS"], [filename],
                                           "zip", False, "download.zip")
    engine.create_table()
    engine.insert_data_from_file(engine.format_filename(str(filename)))
def download(self, engine=None, debug=False):
    Script.download(self, engine, debug)
    engine = self.engine
    filename = 'vertnet_latest_mammals.csv'
    tablename = 'mammals'
    table = Table(str(tablename), delimiter=',')
    table.columns = [
        ("record_id", ("pk-auto",)),
        ("beginrecord", ("char",)), ("icode", ("char",)),
        ("title", ("char",)), ("citation", ("char",)),
        ("contact", ("char",)), ("email", ("char",)),
        ("emlrights", ("char",)), ("gbifdatasetid", ("char",)),
        ("gbifpublisherid", ("char",)), ("doi", ("char",)),
        ("migrator", ("char",)), ("networks", ("char",)),
        ("orgcountry", ("char",)), ("orgname", ("char",)),
        ("orgstateprovince", ("char",)), ("pubdate", ("char",)),
        ("source_url", ("char",)), ("iptrecordid", ("char",)),
        ("associatedmedia", ("char",)), ("associatedoccurrences", ("char",)),
        ("associatedorganisms", ("char",)), ("associatedreferences", ("char",)),
        ("associatedsequences", ("char",)), ("associatedtaxa", ("char",)),
        ("bed", ("char",)), ("behavior", ("char",)),
        ("catalognumber", ("char",)), ("continent", ("char",)),
        ("coordinateprecision", ("char",)),
        ("coordinateuncertaintyinmeters", ("char",)),
        ("country", ("char",)), ("countrycode", ("char",)),
        ("county", ("char",)), ("dateidentified", ("char",)),
        ("day", ("char",)), ("decimallatitude", ("char",)),
        ("decimallongitude", ("char",)), ("disposition", ("char",)),
        ("earliestageorloweststage", ("char",)),
        ("earliesteonorlowesteonothem", ("char",)),
        ("earliestepochorlowestseries", ("char",)),
        ("earliesteraorlowesterathem", ("char",)),
        ("earliestperiodorlowestsystem", ("char",)),
        ("enddayofyear", ("char",)), ("establishmentmeans", ("char",)),
        ("eventdate", ("char",)), ("eventid", ("char",)),
        ("eventremarks", ("char",)), ("eventtime", ("char",)),
        ("fieldnotes", ("char",)), ("fieldnumber", ("char",)),
        ("footprintspatialfit", ("char",)), ("footprintsrs", ("char",)),
        ("footprintwkt", ("char",)), ("formation", ("char",)),
        ("geodeticdatum", ("char",)), ("geologicalcontextid", ("char",)),
        ("georeferencedby", ("char",)), ("georeferenceddate", ("char",)),
        ("georeferenceprotocol", ("char",)), ("georeferenceremarks", ("char",)),
        ("georeferencesources", ("char",)),
        ("georeferenceverificationstatus", ("char",)),
        ("group", ("char",)), ("habitat", ("char",)),
        ("highergeography", ("char",)), ("highergeographyid", ("char",)),
        ("highestbiostratigraphiczone", ("char",)),
        ("identificationid", ("char",)), ("identificationqualifier", ("char",)),
        ("identificationreferences", ("char",)),
        ("identificationremarks", ("char",)),
        ("identificationverificationstatus", ("char",)),
        ("identifiedby", ("char",)), ("individualcount", ("char",)),
        ("island", ("char",)), ("islandgroup", ("char",)),
        ("latestageorhigheststage", ("char",)),
        ("latesteonorhighesteonothem", ("char",)),
        ("latestepochorhighestseries", ("char",)),
        ("latesteraorhighesterathem", ("char",)),
        ("latestperiodorhighestsystem", ("char",)),
        ("lifestage", ("char",)), ("lithostratigraphicterms", ("char",)),
        ("locality", ("char",)), ("locationaccordingto", ("char",)),
        ("locationid", ("char",)), ("locationremarks", ("char",)),
        ("lowestbiostratigraphiczone", ("char",)),
        ("materialsampleid", ("char",)), ("maximumdepthinmeters", ("char",)),
        ("maximumdistanceabovesurfaceinmeters", ("char",)),
        ("maximumelevationinmeters", ("char",)),
        ("member", ("char",)), ("minimumdepthinmeters", ("char",)),
        ("minimumdistanceabovesurfaceinmeters", ("char",)),
        ("minimumelevationinmeters", ("char",)),
        ("month", ("char",)), ("municipality", ("char",)),
        ("occurrenceid", ("char",)), ("occurrenceremarks", ("char",)),
        ("occurrencestatus", ("char",)), ("organismid", ("char",)),
        ("organismname", ("char",)), ("organismremarks", ("char",)),
        ("organismscope", ("char",)), ("othercatalognumbers", ("char",)),
        ("pointradiusspatialfit", ("char",)), ("preparations", ("char",)),
        ("previousidentifications", ("char",)), ("recordedby", ("char",)),
        ("recordnumber", ("char",)), ("reproductivecondition", ("char",)),
        ("samplingeffort", ("char",)), ("samplingprotocol", ("char",)),
        ("sex", ("char",)), ("startdayofyear", ("char",)),
        ("stateprovince", ("char",)), ("typestatus", ("char",)),
        ("verbatimcoordinates", ("char",)),
        ("verbatimcoordinatesystem", ("char",)),
        ("verbatimdepth", ("char",)), ("verbatimelevation", ("char",)),
        ("verbatimeventdate", ("char",)), ("verbatimlatitude", ("char",)),
        ("verbatimlocality", ("char",)), ("verbatimlongitude", ("char",)),
        ("verbatimsrs", ("char",)), ("waterbody", ("char",)),
        ("year", ("char",)), ("dctype", ("char",)),
        ("modified", ("char",)), ("language", ("char",)),
        ("license", ("char",)), ("rightsholder", ("char",)),
        ("accessrights", ("char",)), ("bibliographiccitation", ("char",)),
        ("dc_references", ("char",)), ("institutionid", ("char",)),
        ("collectionid", ("char",)), ("datasetid", ("char",)),
        ("institutioncode", ("char",)), ("collectioncode", ("char",)),
        ("datasetname", ("char",)), ("ownerinstitutioncode", ("char",)),
        ("basisofrecord", ("char",)), ("informationwithheld", ("char",)),
        ("datageneralizations", ("char",)), ("dynamicproperties", ("char",)),
        ("scientificnameid", ("char",)), ("namepublishedinid", ("char",)),
        ("scientificname", ("char",)), ("acceptednameusage", ("char",)),
        ("originalnameusage", ("char",)), ("namepublishedin", ("char",)),
        ("namepublishedinyear", ("char",)),
        ("higherclassification", ("char",)),
        ("kingdom", ("char",)), ("phylum", ("char",)),
        ("class", ("char",)), ("order", ("char",)),
        ("family", ("char",)), ("genus", ("char",)),
        ("subgenus", ("char",)), ("specificepithet", ("char",)),
        ("infraspecificepithet", ("char",)), ("taxonrank", ("char",)),
        ("verbatimtaxonrank", ("char",)),
        ("scientificnameauthorship", ("char",)),
        ("vernacularname", ("char",)), ("nomenclaturalcode", ("char",)),
        ("taxonomicstatus", ("char",)), ("keyname", ("char",)),
        ("haslicense", ("int",)), ("vntype", ("char",)),
        ("rank", ("int",)), ("mappable", ("int",)),
        ("hashid", ("char",)), ("hastypestatus", ("int",)),
        ("wascaptive", ("int",)), ("wasinvasive", ("int",)),
        ("hastissue", ("int",)), ("hasmedia", ("int",)),
        ("isfossil", ("int",)), ("haslength", ("int",)),
        ("haslifestage", ("int",)), ("hasmass", ("int",)),
        ("hassex", ("int",)), ("lengthinmm", ("double",)),
        ("massing", ("double",)), ("lengthunitsinferred", ("char",)),
        ("massunitsinferred", ("char",)), ("underivedlifestage", ("char",)),
        ("underivedsex", ("char",)),
    ]
    engine.table = table
    # Only download the archive if the extracted CSV is not already cached
    if not os.path.isfile(engine.format_filename(filename)):
        engine.download_files_from_archive(self.urls[tablename], [filename],
                                           filetype="zip",
                                           archivename="vertnet_latest_" + str(tablename))
    engine.create_table()
    engine.insert_data_from_file(engine.format_filename(str(filename)))
def download(self, engine=None, debug=False):
    Script.download(self, engine, debug)

    self.engine.auto_create_table(Table("sites"), url=self.urls["sites"])
    self.engine.insert_data_from_url(self.urls["sites"])

    self.engine.download_file(self.urls["stems"], "all_Excel.zip")
    local_zip = zipfile.ZipFile(self.engine.format_filename("all_Excel.zip"))
    filelist = local_zip.namelist()
    local_zip.close()
    self.engine.download_files_from_archive(self.urls["stems"], filelist)

    filelist = [os.path.basename(filename) for filename in filelist]

    lines = []
    tax = []
    for filename in filelist:
        print "Extracting data from " + filename + "..."
        book = xlrd.open_workbook(self.engine.format_filename(filename))
        sh = book.sheet_by_index(0)
        rows = sh.nrows

        # map header names to column numbers; stem columns are collected in
        # a list because one row can hold several stem measurements
        cn = {'stems': []}
        n = 0
        for c in sh.row(0):
            if not Excel.empty_cell(c):
                cid = Excel.cell_value(c).lower()
                # the line number column is sometimes named differently
                if cid in ["sub", "number"]:
                    cid = "line"
                # the "number of individuals" column is named in various
                # different ways; they always at least contain "nd"
                if "nd" in cid:
                    cid = "count"
                # if the column is a stem, add it to the list of stems;
                # otherwise, make note of the column name/number
                if "stem" in cid:
                    cn["stems"].append(n)
                else:
                    cn[cid] = n
            n += 1
        # sometimes a data file does not contain a liana or count column
        if "liana" not in cn:
            cn["liana"] = -1
        if "count" not in cn:
            cn["count"] = -1

        def format_value(s):
            s = Excel.cell_value(s)
            return str(s).title().replace("\\", "/").replace('"', '')

        for i in range(1, rows):
            row = sh.row(i)
            cellcount = len(row)
            # make sure the row is real, not just empty cells
            if cellcount > 4 and not Excel.empty_cell(row[0]):
                try:
                    this_line = {}
                    # get the following information from the appropriate columns
                    for col in ["line", "family", "genus", "species",
                                "liana", "count"]:
                        if cn[col] > -1:
                            this_line[col] = format_value(row[cn[col]])
                            if this_line[col] == '`':
                                this_line[col] = 1
                    this_line["stems"] = [Excel.cell_value(row[c])
                                          for c in cn["stems"]
                                          if not Excel.empty_cell(row[c])]
                    this_line["site"] = filename[0:-4]
                    lines.append(this_line)

                    # check how far the species is identified
                    full_id = 0
                    if len(this_line["species"]) < 3:
                        if len(this_line["genus"]) < 3:
                            id_level = "family"
                        else:
                            id_level = "genus"
                    else:
                        id_level = "species"
                        full_id = 1
                    tax.append((this_line["family"],
                                this_line["genus"],
                                this_line["species"].lower().replace('\\', '').replace('"', ''),
                                id_level,
                                str(full_id)))
                except:
                    raise

    tax = sorted(tax, key=lambda group: group[0] + " " + group[1] + " " + group[2])
    unique_tax = []
    tax_dict = dict()
    tax_count = 0

    # get all unique families/genera/species and number them consecutively
    for group in tax:
        if group not in unique_tax:
            unique_tax.append(group)
            tax_count += 1
            tax_dict[group[0:3]] = tax_count
            if tax_count % 10 == 0:
                msg = "Generating taxonomic groups: " + str(tax_count) + " / " + str(TAX_GROUPS)
                sys.stdout.write(msg + "\b" * len(msg))
    print "Generating taxonomic groups: " + str(TAX_GROUPS) + " / " + str(TAX_GROUPS)

    # create species table
    table = Table("species", delimiter=",")
    table.columns = [("species_id", ("pk-int",)),
                     ("family", ("char",)),
                     ("genus", ("char",)),
                     ("species", ("char",)),
                     ("id_level", ("char", 10)),
                     ("full_id", ("bool",))]
    data = [','.join([str(tax_dict[group[:3]])] + ['"%s"' % g for g in group])
            for group in unique_tax]
    table.pk = 'species_id'
    table.contains_pk = True
    self.engine.table = table
    self.engine.create_table()
    self.engine.add_to_table(data)

    # create stems table
    table = Table("stems", delimiter=",", contains_pk=False)
    table.columns = [("stem_id", ("pk-auto",)),
                     ("line", ("int",)),
                     ("species_id", ("int",)),
                     ("site_code", ("char", 12)),
                     ("liana", ("char", 10)),
                     ("stem", ("double",))]
    stems = []
    counts = []
    for line in lines:
        try:
            liana = line["liana"]
        except KeyError:
            liana = ""
        species_info = [line["line"],
                        tax_dict[(line["family"], line["genus"],
                                  line["species"].lower())],
                        line["site"],
                        liana]
        try:
            counts.append([str(value) for value in species_info + [line["count"]]])
        except KeyError:
            pass
        for i in line["stems"]:
            stem = species_info + [i]
            stems.append([str(value) for value in stem])
    data = [','.join(stem) for stem in stems]
    self.engine.table = table
    self.engine.create_table()
    self.engine.add_to_table(data)

    # create counts table
    table = Table("counts", delimiter=",", contains_pk=False)
    table.columns = [("count_id", ("pk-auto",)),
                     ("line", ("int",)),
                     ("species_id", ("int",)),
                     ("site_code", ("char", 12)),
                     ("liana", ("char", 10)),
                     ("count", ("double",))]
    data = [','.join(count) for count in counts]
    self.engine.table = table
    self.engine.create_table()
    self.engine.add_to_table(data)

    return self.engine
def download(self, engine=None, debug=False):
    Script.download(self, engine, debug)
    engine = self.engine
    filename = 'vertnet_latest_reptiles.csv'
    tablename = 'reptiles'
    table = Table(str(tablename), delimiter=',')
    table.columns = [
        ("record_id", ("pk-auto",)),
        ("beginrecord", ("char",)), ("icode", ("char",)),
        ("title", ("char",)), ("citation", ("char",)),
        ("contact", ("char",)), ("email", ("char",)),
        ("emlrights", ("char",)), ("gbifdatasetid", ("char",)),
        ("gbifpublisherid", ("char",)), ("doi", ("char",)),
        ("migrator", ("char",)), ("networks", ("char",)),
        ("orgcountry", ("char",)), ("orgname", ("char",)),
        ("orgstateprovince", ("char",)), ("pubdate", ("char",)),
        ("source_url", ("char",)), ("iptrecordid", ("char",)),
        ("associatedmedia", ("char",)), ("associatedoccurrences", ("char",)),
        ("associatedorganisms", ("char",)), ("associatedreferences", ("char",)),
        ("associatedsequences", ("char",)), ("associatedtaxa", ("char",)),
        ("bed", ("char",)), ("behavior", ("char",)),
        ("catalognumber", ("char",)), ("continent", ("char",)),
        ("coordinateprecision", ("char",)),
        ("coordinateuncertaintyinmeters", ("char",)),
        ("country", ("char",)), ("countrycode", ("char",)),
        ("county", ("char",)), ("dateidentified", ("char",)),
        ("day", ("char",)), ("decimallatitude", ("char",)),
        ("decimallongitude", ("char",)), ("disposition", ("char",)),
        ("earliestageorloweststage", ("char",)),
        ("earliesteonorlowesteonothem", ("char",)),
        ("earliestepochorlowestseries", ("char",)),
        ("earliesteraorlowesterathem", ("char",)),
        ("earliestperiodorlowestsystem", ("char",)),
        ("enddayofyear", ("char",)), ("establishmentmeans", ("char",)),
        ("eventdate", ("char",)), ("eventid", ("char",)),
        ("eventremarks", ("char",)), ("eventtime", ("char",)),
        ("fieldnotes", ("char",)), ("fieldnumber", ("char",)),
        ("footprintspatialfit", ("char",)), ("footprintsrs", ("char",)),
        ("footprintwkt", ("char",)), ("formation", ("char",)),
        ("geodeticdatum", ("char",)), ("geologicalcontextid", ("char",)),
        ("georeferencedby", ("char",)), ("georeferenceddate", ("char",)),
        ("georeferenceprotocol", ("char",)), ("georeferenceremarks", ("char",)),
        ("georeferencesources", ("char",)),
        ("georeferenceverificationstatus", ("char",)),
        ("group", ("char",)), ("habitat", ("char",)),
        ("highergeography", ("char",)), ("highergeographyid", ("char",)),
        ("highestbiostratigraphiczone", ("char",)),
        ("identificationid", ("char",)), ("identificationqualifier", ("char",)),
        ("identificationreferences", ("char",)),
        ("identificationremarks", ("char",)),
        ("identificationverificationstatus", ("char",)),
        ("identifiedby", ("char",)), ("individualcount", ("char",)),
        ("island", ("char",)), ("islandgroup", ("char",)),
        ("latestageorhigheststage", ("char",)),
        ("latesteonorhighesteonothem", ("char",)),
        ("latestepochorhighestseries", ("char",)),
        ("latesteraorhighesterathem", ("char",)),
        ("latestperiodorhighestsystem", ("char",)),
        ("lifestage", ("char",)), ("lithostratigraphicterms", ("char",)),
        ("locality", ("char",)), ("locationaccordingto", ("char",)),
        ("locationid", ("char",)), ("locationremarks", ("char",)),
        ("lowestbiostratigraphiczone", ("char",)),
        ("materialsampleid", ("char",)), ("maximumdepthinmeters", ("char",)),
        ("maximumdistanceabovesurfaceinmeters", ("char",)),
        ("maximumelevationinmeters", ("char",)),
        ("member", ("char",)), ("minimumdepthinmeters", ("char",)),
        ("minimumdistanceabovesurfaceinmeters", ("char",)),
        ("minimumelevationinmeters", ("char",)),
        ("month", ("char",)), ("municipality", ("char",)),
        ("occurrenceid", ("char",)), ("occurrenceremarks", ("char",)),
        ("occurrencestatus", ("char",)), ("organismid", ("char",)),
        ("organismname", ("char",)), ("organismremarks", ("char",)),
        ("organismscope", ("char",)), ("othercatalognumbers", ("char",)),
        ("pointradiusspatialfit", ("char",)), ("preparations", ("char",)),
        ("previousidentifications", ("char",)), ("recordedby", ("char",)),
        ("recordnumber", ("char",)), ("reproductivecondition", ("char",)),
        ("samplingeffort", ("char",)), ("samplingprotocol", ("char",)),
        ("sex", ("char",)), ("startdayofyear", ("char",)),
        ("stateprovince", ("char",)), ("typestatus", ("char",)),
        ("verbatimcoordinates", ("char",)),
        ("verbatimcoordinatesystem", ("char",)),
        ("verbatimdepth", ("char",)), ("verbatimelevation", ("char",)),
        ("verbatimeventdate", ("char",)), ("verbatimlatitude", ("char",)),
        ("verbatimlocality", ("char",)), ("verbatimlongitude", ("char",)),
        ("verbatimsrs", ("char",)), ("waterbody", ("char",)),
        ("year", ("char",)), ("dctype", ("char",)),
        ("modified", ("char",)), ("language", ("char",)),
        ("license", ("char",)), ("rightsholder", ("char",)),
        ("accessrights", ("char",)), ("bibliographiccitation", ("char",)),
        ("dc_references", ("char",)), ("institutionid", ("char",)),
        ("collectionid", ("char",)), ("datasetid", ("char",)),
        ("institutioncode", ("char",)), ("collectioncode", ("char",)),
        ("datasetname", ("char",)), ("ownerinstitutioncode", ("char",)),
        ("basisofrecord", ("char",)), ("informationwithheld", ("char",)),
        ("datageneralizations", ("char",)), ("dynamicproperties", ("char",)),
        ("scientificnameid", ("char",)), ("namepublishedinid", ("char",)),
        ("scientificname", ("char",)), ("acceptednameusage", ("char",)),
        ("originalnameusage", ("char",)), ("namepublishedin", ("char",)),
        ("namepublishedinyear", ("char",)),
        ("higherclassification", ("char",)),
        ("kingdom", ("char",)), ("phylum", ("char",)),
        ("class", ("char",)), ("order", ("char",)),
        ("family", ("char",)), ("genus", ("char",)),
        ("subgenus", ("char",)), ("specificepithet", ("char",)),
        ("infraspecificepithet", ("char",)), ("taxonrank", ("char",)),
        ("verbatimtaxonrank", ("char",)),
        ("scientificnameauthorship", ("char",)),
        ("vernacularname", ("char",)), ("nomenclaturalcode", ("char",)),
        ("taxonomicstatus", ("char",)), ("keyname", ("char",)),
        ("haslicense", ("int",)), ("vntype", ("char",)),
        ("rank", ("int",)), ("mappable", ("int",)),
        ("hashid", ("char",)), ("hastypestatus", ("int",)),
        ("wascaptive", ("int",)), ("wasinvasive", ("int",)),
        ("hastissue", ("int",)), ("hasmedia", ("int",)),
        ("isfossil", ("int",)), ("haslength", ("int",)),
        ("haslifestage", ("int",)), ("hasmass", ("int",)),
        ("hassex", ("int",)), ("lengthinmm", ("double",)),
        ("massing", ("double",)), ("lengthunitsinferred", ("char",)),
        ("massunitsinferred", ("char",)), ("underivedlifestage", ("char",)),
        ("underivedsex", ("char",)),
    ]
    engine.table = table
    # Only download the archive if the extracted CSV is not already cached
    if not os.path.isfile(engine.format_filename(filename)):
        engine.download_files_from_archive(self.urls[tablename], [filename],
                                           "zip", False,
                                           "vertnet_latest_" + str(tablename))
    engine.create_table()
    engine.insert_data_from_file(engine.format_filename(str(filename)))
def download(self, engine=None, debug=False):
    try:
        Script.download(self, engine, debug)
        engine = self.engine

        # Routes table
        if not os.path.isfile(engine.format_filename("routes_new.csv")):
            engine.download_files_from_archive(self.urls["routes"],
                                               ["routes.csv"])
            read = open(engine.format_filename("routes.csv"), "rb")
            write = open(engine.format_filename("routes_new.csv"), "wb")
            print "Cleaning routes data..."
            write.write(read.readline())
            for line in read:
                values = line.split(',')
                # longitudes are in the western hemisphere and should be
                # negative; flip any positive values
                v = Decimal(values[5])
                if v > 0:
                    values[5] = str(v * Decimal("-1"))
                write.write(','.join(str(value) for value in values))
            write.close()
            read.close()

        engine.auto_create_table(Table("routes", cleanup=Cleanup()),
                                 filename="routes_new.csv")
        engine.insert_data_from_file(engine.format_filename("routes_new.csv"))

        # Weather table
        if not os.path.isfile(engine.format_filename("weather_new.csv")):
            engine.download_files_from_archive(self.urls["weather"],
                                               ["weather.csv"])
            read = open(engine.format_filename("weather.csv"), "rb")
            write = open(engine.format_filename("weather_new.csv"), "wb")
            print "Cleaning weather data..."
            for line in read:
                values = line.split(',')
                newvalues = []
                for value in values:
                    if ':' in value:
                        # strip colons from time values
                        newvalues.append(value.replace(':', ''))
                    elif value == "N":
                        # str() below renders this as the literal "None"
                        newvalues.append(None)
                    else:
                        newvalues.append(value)
                write.write(','.join(str(value) for value in newvalues))
            write.close()
            read.close()

        engine.auto_create_table(Table("weather", pk="RouteDataId",
                                       cleanup=Cleanup()),
                                 filename="weather_new.csv")
        engine.insert_data_from_file(engine.format_filename("weather_new.csv"))

        # Species table
        table = Table("species", pk=False, delimiter=',')
        table.columns = [("species_id", ("pk-auto",)),
                         ("AOU", ("int",)),
                         ("genus", ("char", 30)),
                         ("species", ("char", 50)),
                         ("subspecies", ("char", 30)),
                         ("id_to_species", ("bool",))]
        engine.table = table
        engine.create_table()

        engine.download_file(self.urls["species"], "SpeciesList.txt")
        species_list = open(engine.format_filename("SpeciesList.txt"), "rb")
        # skip the eight header lines
        for n in range(8):
            species_list.readline()

        rows = []
        for line in species_list:
            if line and len(line) > 273:
                latin_name = line[273:].split()
                if len(latin_name) < 2:
                    # if there's no species given, add a "None" value
                    latin_name.append("None")
                subspecies = ' '.join(latin_name[2:]) if len(latin_name) > 2 else "None"
                id_to_species = "1" if latin_name[1] != "None" else "0"
                if latin_name[1] == "sp.":
                    latin_name[1] = "None"
                    id_to_species = "0"
                if ("x" in latin_name or "/" in latin_name
                        or "/" in subspecies or "or" in latin_name):
                    # hybrid species or only identified to a group of species
                    latin_name[1] = ' '.join(latin_name[1:])
                    subspecies = "None"
                    id_to_species = "0"
                rows.append(','.join([line.split()[1], latin_name[0],
                                      latin_name[1], subspecies,
                                      id_to_species]))
        engine.add_to_table(rows)
        species_list.close()

        # Region_codes table
        table = Table("region_codes", pk=False, header_rows=11,
                      fixed_width=[11, 11, 30])

        def regioncodes_cleanup(value, engine):
            # map Latin-1 accented vowels to plain ASCII
            replace = {chr(225): "a", chr(233): "e",
                       chr(237): "i", chr(243): "o"}
            newvalue = str(value)
            for key in replace.keys():
                if key in newvalue:
                    newvalue = newvalue.replace(key, replace[key])
            return newvalue

        table.cleanup = Cleanup(regioncodes_cleanup)
        table.columns = [("countrynum", ("int",)),
                         ("regioncode", ("int",)),
                         ("regionname", ("char", 30))]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_url(self.urls["region_codes"])

        # Counts table
        table = Table("counts", delimiter=',')
        table.columns = [("countrynum", ("int",)),
                         ("statenum", ("int",)),
                         ("Route", ("int",)),
                         ("RPID", ("int",)),
                         ("year", ("int",)),
                         ("AOU", ("int",))]
        # the remaining fifty columns are the per-stop counts, Stop1-Stop50
        table.columns += [("Stop" + str(i), ("int",)) for i in range(1, 51)]
        engine.table = table
        engine.create_table()

        for part in range(1, 11):
            part = str(part)
            try:
                print "Inserting data from part " + part + "..."
                try:
                    engine.table.cleanup = Cleanup()
                    engine.insert_data_from_archive(
                        self.urls["counts"] + "Fifty" + part + ".exe",
                        ["fifty" + part + ".csv"])
                except:
                    print "Failed bulk insert on " + part + ", inserting manually."
                    engine.connection.rollback()
                    engine.table.cleanup = Cleanup(correct_invalid_value,
                                                   nulls=['*'])
                    engine.insert_data_from_archive(
                        self.urls["counts"] + "Fifty" + part + ".exe",
                        ["fifty" + part + ".csv"])
            except:
                print "There was an error in part " + part + "."
                raise

    except zipfile.BadZipfile:
        print "There was an unexpected error in the Breeding Bird Survey archives."
        raise

    return engine