def download(self, engine=None, debug=False):
    """Download the site and stem workbooks and build the derived tables.

    The "sites" table is created and filled from ``self.urls["sites"]``.
    The archive at ``self.urls["stems"]`` is then downloaded and the first
    sheet of every workbook in it is read row by row.  From the collected
    rows the "species", "stems" and "counts" tables are created and filled.

    ``engine`` and ``debug`` are passed straight to ``Script.download``.

    Returns ``self.engine``.
    """
    Script.download(self, engine, debug)

    self.engine.auto_create_table(Table("sites"), url=self.urls["sites"])
    self.engine.insert_data_from_url(self.urls["sites"])

    self.engine.download_file(self.urls["stems"], "all_Excel.zip")
    local_zip = zipfile.ZipFile(self.engine.format_filename("all_Excel.zip"))
    try:
        filelist = local_zip.namelist()
    finally:
        # Release the archive handle even if listing its members fails.
        local_zip.close()
    self.engine.download_files_from_archive(self.urls["stems"], filelist)

    filelist = [os.path.basename(filename) for filename in filelist]

    # Currently all_Excel.zip is missing CURUYUQU.xls
    # Download it separately and add it to the file list
    # NOTE(review): the file is only appended when it had to be downloaded;
    # confirm that a copy found by find_file() is meant to be skipped.
    if not self.engine.find_file('CURUYUQU.xls'):
        self.engine.download_file(
            "http://www.mobot.org/mobot/gentry/123/samerica/CURUYUQU.xls",
            "CURUYUQU.xls",
            clean_line_endings=False)
        filelist.append('CURUYUQU.xls')

    def format_value(s):
        # Title-case the cell text, turn backslashes into forward slashes
        # and drop double quotes.
        s = Excel.cell_value(s)
        return str(s).title().replace("\\", "/").replace('"', '')

    # Columns copied from every data row when the sheet provides them.
    fields = ("line", "family", "genus", "species", "liana", "count")

    lines = []
    tax = []
    for filename in filelist:
        print("Extracting data from " + filename + "...")
        book = xlrd.open_workbook(self.engine.format_filename(filename))
        sh = book.sheet_by_index(0)

        # Map column names found in the header row to their column index;
        # stem columns are gathered into a list because there are several.
        cn = {'stems': []}
        for colnum, c in enumerate(sh.row(0)):
            if Excel.empty_cell(c):
                continue
            cid = Excel.cell_value(c).lower()
            # line number column is sometimes named differently
            if cid in ("sub", "number"):
                cid = "line"
            # the "number of individuals" column is named in various
            # different ways; they always at least contain "nd"
            if "nd" in cid:
                cid = "count"
            # in QUIAPACA.xls the "number of individuals" column is
            # misnamed "STEMDBH" just like the stems columns, so it has
            # to be fixed manually
            if filename == "QUIAPACA.xls" and colnum == 13:
                cid = "count"
            # if column is a stem, add it to the list of stems;
            # otherwise, make note of the column name/number
            if "stem" in cid or "dbh" in cid:
                cn["stems"].append(colnum)
            else:
                cn[cid] = colnum
        # sometimes, a data file does not contain a liana or count column
        cn.setdefault("liana", -1)
        cn.setdefault("count", -1)

        for rownum in range(1, sh.nrows):
            row = sh.row(rownum)
            # make sure the row is real, not just empty cells
            if all(Excel.empty_cell(cell) for cell in row):
                continue

            this_line = {}
            # get the following information from the appropriate columns
            for field in fields:
                if cn[field] > -1:
                    this_line[field] = format_value(row[cn[field]])
                    if this_line[field] == '`':
                        this_line[field] = 1

            this_line["stems"] = [
                Excel.cell_value(row[c]) for c in cn["stems"]
                if not Excel.empty_cell(row[c])
            ]
            # The site code is the file name without its 4-character suffix.
            this_line["site"] = filename[0:-4]

            lines.append(this_line)

            # Check how far the species is identified
            if len(this_line["species"]) < 3:
                full_id = 0
                if len(this_line["genus"]) < 3:
                    id_level = "family"
                else:
                    id_level = "genus"
            else:
                id_level = "species"
                full_id = 1
            tax.append((this_line["family"],
                        this_line["genus"],
                        this_line["species"].lower().replace(
                            '\\', '').replace('"', ''),
                        id_level,
                        str(full_id)))

    tax = sorted(
        tax, key=lambda group: group[0] + " " + group[1] + " " + group[2])
    unique_tax = []
    seen = set()  # constant-time duplicate check alongside the ordered list
    tax_dict = dict()
    tax_count = 0

    # Get all unique families/genera/species
    for group in tax:
        if group in seen:
            continue
        seen.add(group)
        unique_tax.append(group)
        tax_count += 1
        tax_dict[group[0:3]] = tax_count
        if tax_count % 10 == 0:
            msg = "Generating taxonomic groups: " + str(
                tax_count) + " / " + str(TAX_GROUPS)
            # Backspaces let the next progress message overwrite this one.
            sys.stdout.write(msg + "\b" * len(msg))
    print("Generating taxonomic groups: " + str(TAX_GROUPS) + " / " +
          str(TAX_GROUPS))

    # Create species table
    table = Table("species", delimiter=",")
    table.columns = [("species_id", ("pk-int", )),
                     ("family", ("char", )),
                     ("genus", ("char", )),
                     ("species", ("char", )),
                     ("id_level", ("char", 10)),
                     ("full_id", ("bool", ))]

    data = [
        ','.join([str(tax_dict[group[:3]])] + ['"%s"' % g for g in group])
        for group in unique_tax
    ]
    table.pk = 'species_id'
    table.contains_pk = True

    self.engine.table = table
    self.engine.create_table()
    self.engine.add_to_table(data)

    # Create stems table
    table = Table("stems", delimiter=",", contains_pk=False)
    table.columns = [("stem_id", ("pk-auto", )),
                     ("line", ("int", )),
                     ("species_id", ("int", )),
                     ("site_code", ("char", 12)),
                     ("liana", ("char", 10)),
                     ("stem", ("double", ))]
    stems = []
    counts = []
    for line in lines:
        liana = line.get("liana", "")
        species_info = [
            line["line"],
            tax_dict[(line["family"], line["genus"],
                      line["species"].lower())],
            line["site"],
            liana,
        ]
        # Rows from sheets without a count column contribute no count record.
        if "count" in line:
            counts.append(
                [str(value) for value in species_info + [line["count"]]])

        for stem_value in line["stems"]:
            stems.append(
                [str(value) for value in species_info + [stem_value]])

    data = [','.join(stem) for stem in stems]
    self.engine.table = table
    self.engine.create_table()
    self.engine.add_to_table(data)

    # Create counts table
    table = Table("counts", delimiter=",", contains_pk=False)
    table.columns = [("count_id", ("pk-auto", )),
                     ("line", ("int", )),
                     ("species_id", ("int", )),
                     ("site_code", ("char", 12)),
                     ("liana", ("char", 10)),
                     ("count", ("double", ))]
    data = [','.join(count) for count in counts]
    self.engine.table = table
    self.engine.create_table()
    self.engine.add_to_table(data)

    return self.engine
def format_value(s):
    """Return the cell's value as title-cased text.

    Backslashes in the text become forward slashes and double quotes are
    removed.
    """
    text = str(Excel.cell_value(s))
    titled = text.title()
    slashed = titled.replace("\\", "/")
    return slashed.replace('"', '')
def download(self, engine=None, debug=False):
    """Read the body-mass workbooks and insert their rows into the "mass" table.

    Each workbook named in the file list must already exist at the path
    given by ``engine.format_filename``; an ``Exception`` is raised
    otherwise.  The first sheet of every workbook is walked top to bottom.
    Rows are buffered one behind, because a later row can still add a
    subspecies or a scientific-name part to the previous record.  Each
    record is written as a ``~``-joined string.

    ``engine`` and ``debug`` are passed straight to ``Script.download``.

    Returns the engine.
    """
    Script.download(self, engine, debug)
    engine = self.engine

    table = self.tables["mass"]

    # Database column names and their data types. Use data type "skip" to skip the value, and
    # "combine" to merge a string value into the previous column
    table.columns = [("record_id", ("pk-auto", )),
                     ("family", ("char", 20)),
                     ("genus", ("char", 20)),
                     ("species", ("char", 20)),
                     ("subspecies", ("char", 20)),
                     ("common_name", ("char", 50)),
                     ("sex", ("char", 20)),
                     ("N", ("double", )),
                     ("mean", ("double", )),
                     ("std_dev", ("double", )),
                     ("min", ("double", )),
                     ("max", ("double", )),
                     ("season", ("char", 2)),
                     ("location", ("char", 50)),
                     ("source_num", ("char", 50))]
    engine.table = table
    engine.create_table()

    file_list = [
        "broadbills - tapaculos", "cotingas - NZ wrens",
        "HA honeycreepers - icterids", "honeyeaters - corvids",
        "jacanas - doves", "larks - accentors", "muscicapids - babblers",
        "ostrich - waterfowl", "parrotbills - sugarbirds",
        "parrots - nightjars", "starlings - finches",
        "swifts - woodpeckers", "thrushes - gnatcatchers",
        "vultures - bustards"
    ]

    # One-letter sex codes used in the sheets and their spelled-out form;
    # any other cell value is stored unchanged.
    sex_names = {"M": "Male", "F": "Female", "B": "Both", "U": "Unknown"}
    cols = 11

    def family_name(header):
        # Drop the leading "Family" label.  str.lstrip("Family ") would
        # treat its argument as a set of characters and also eat the start
        # of names such as "Fringillidae".
        name = header.lstrip()
        if name.startswith("Family"):
            name = name[len("Family"):]
        return name.lstrip().title()

    for base_name in file_list:
        filename = base_name + ".xls"
        full_filename = engine.format_filename(filename)

        # Make sure file exists
        if not os.path.isfile(full_filename):
            raise Exception("Missing raw data file: " + full_filename)

        # Open excel file with xlrd
        book = xlrd.open_workbook(full_filename)
        sh = book.sheet_by_index(0)

        print("Inserting data from " + filename + " . . .")
        lines = []
        lastrow = None
        lastvalues = None
        family = ""
        for n in range(sh.nrows):
            row = sh.row(n)
            if len(row) == 0:
                continue

            empty_cols = sum(
                1 for cell in row[0:cols] if Excel.empty_cell(cell))

            # Skip this row if all cells are empty or if it's the legend row
            if empty_cols == cols:
                continue
            first_cell = Excel.cell_value(row[0])
            if (first_cell == "Scientific Name"
                    or first_cell[0:7] == "Species"):
                continue

            # A row with a single filled cell is either a family heading or
            # a subspecies name belonging to the previous record.
            if empty_cols == cols - 1:
                if "Family" in first_cell:
                    family = family_name(first_cell)
                elif not Excel.empty_cell(row[0]):
                    lastvalues[3] = first_cell
                continue

            # Values: 0=Family 1=Genus 2=Species 3=Subspecies 4=common name 5=sex
            # 6=N 7=Mean 8=std_dev 9=min 10=max 11=season 12=location 13=source_num
            values = [family]
            # If the first two columns are empty, but not all of them are,
            # use the first two columns from the previous row
            if Excel.empty_cell(row[0]) and Excel.empty_cell(row[1]):
                values.extend(sci_name(Excel.cell_value(lastrow[0])))
                values.append(Excel.cell_value(lastrow[1]))
            else:
                if len(first_cell.split()) == 1:
                    # If the scientific name is missing genus/species, fill it
                    # in from the previous row
                    values.extend(lastvalues[1:4])
                    # Put the single word into the last name slot that is
                    # still empty (subspecies, then species, then genus).
                    for offset in range(0, 3):
                        if not values[3 - offset]:
                            values[3 - offset] = first_cell
                            break
                    # Add new information to the previous scientific name
                    if lastvalues:
                        lastvalues[1:4] = values[1:4]
                else:
                    values.extend(sci_name(first_cell))
                values.append(Excel.cell_value(row[1]))

            sex_code = Excel.cell_value(row[2])
            values.append(sex_names.get(sex_code, sex_code))

            # Enter remaining values from cells
            values.extend(Excel.cell_value(row[i]) for i in range(3, cols))

            # If there isn't a common name or location, get it from
            # the previous row
            if not values[4]:
                values[4] = lastvalues[4]

            # The location is only inherited by a female record that
            # directly follows a male record.
            if (not values[12] and lastvalues
                    and lastvalues[5] == "Male" and values[5] == "Female"):
                values[12] = lastvalues[12]

            # Insert the previous row into the database
            if lastvalues:
                lines.append('~'.join(lastvalues))

            lastrow = row
            lastvalues = values

        # The final record is still pending; flush it even when it is the
        # only record of the sheet.
        if lastvalues:
            lines.append('~'.join(lastvalues))
        if lines:
            engine.add_to_table(lines)

    return engine
def format_value(s):
    """Return the cell's value as title-cased text.

    Backslashes in the text become forward slashes and double quotes are
    removed.
    """
    text = str(Excel.cell_value(s))
    titled = text.title()
    slashed = titled.replace("\\", "/")
    return slashed.replace('"', '')
def download(self, engine=None, debug=False):
    """Download the site and stem workbooks and build the derived tables.

    The "sites" table is created and filled from ``self.urls["sites"]``.
    The archive at ``self.urls["stems"]`` is then downloaded and the first
    sheet of every workbook in it is read row by row.  From the collected
    rows the "species", "stems" and "counts" tables are created and filled.

    ``engine`` and ``debug`` are passed straight to ``Script.download``.

    Returns ``self.engine``.
    """
    Script.download(self, engine, debug)

    self.engine.auto_create_table(Table("sites"), url=self.urls["sites"])
    self.engine.insert_data_from_url(self.urls["sites"])

    self.engine.download_file(self.urls["stems"], "all_Excel.zip")
    local_zip = zipfile.ZipFile(self.engine.format_filename("all_Excel.zip"))
    try:
        filelist = local_zip.namelist()
    finally:
        # Release the archive handle even if listing its members fails.
        local_zip.close()
    self.engine.download_files_from_archive(self.urls["stems"], filelist)

    filelist = [os.path.basename(filename) for filename in filelist]

    def format_value(s):
        # Title-case the cell text, turn backslashes into forward slashes
        # and drop double quotes.
        s = Excel.cell_value(s)
        return str(s).title().replace("\\", "/").replace('"', '')

    # Columns copied from every data row when the sheet provides them.
    fields = ("line", "family", "genus", "species", "liana", "count")

    lines = []
    tax = []
    for filename in filelist:
        # Parenthesised single-argument form works as a statement on
        # Python 2 and as a function call on Python 3.
        print("Extracting data from " + filename + "...")
        book = xlrd.open_workbook(self.engine.format_filename(filename))
        sh = book.sheet_by_index(0)

        # Map column names found in the header row to their column index;
        # stem columns are gathered into a list because there are several.
        cn = {'stems': []}
        for colnum, c in enumerate(sh.row(0)):
            if Excel.empty_cell(c):
                continue
            cid = Excel.cell_value(c).lower()
            # line number column is sometimes named differently
            if cid in ("sub", "number"):
                cid = "line"
            # the "number of individuals" column is named in various
            # different ways; they always at least contain "nd"
            if "nd" in cid:
                cid = "count"
            # if column is a stem, add it to the list of stems;
            # otherwise, make note of the column name/number
            if "stem" in cid:
                cn["stems"].append(colnum)
            else:
                cn[cid] = colnum
        # sometimes, a data file does not contain a liana or count column
        cn.setdefault("liana", -1)
        cn.setdefault("count", -1)

        for rownum in range(1, sh.nrows):
            row = sh.row(rownum)
            # make sure the row is real, not just empty cells
            if not (len(row) > 4 and not Excel.empty_cell(row[0])):
                continue

            this_line = {}
            # get the following information from the appropriate columns
            for field in fields:
                if cn[field] > -1:
                    this_line[field] = format_value(row[cn[field]])
                    if this_line[field] == '`':
                        this_line[field] = 1

            this_line["stems"] = [Excel.cell_value(row[c])
                                  for c in cn["stems"]
                                  if not Excel.empty_cell(row[c])]
            # The site code is the file name without its 4-character suffix.
            this_line["site"] = filename[0:-4]

            lines.append(this_line)

            # Check how far the species is identified
            if len(this_line["species"]) < 3:
                full_id = 0
                if len(this_line["genus"]) < 3:
                    id_level = "family"
                else:
                    id_level = "genus"
            else:
                id_level = "species"
                full_id = 1
            tax.append((this_line["family"],
                        this_line["genus"],
                        this_line["species"].lower().replace('\\', '').replace('"', ''),
                        id_level,
                        str(full_id)))

    tax = sorted(tax, key=lambda group: group[0] + " " + group[1] + " " + group[2])
    unique_tax = []
    seen = set()  # constant-time duplicate check alongside the ordered list
    tax_dict = dict()
    tax_count = 0

    # Get all unique families/genera/species
    for group in tax:
        if group in seen:
            continue
        seen.add(group)
        unique_tax.append(group)
        tax_count += 1
        tax_dict[group[0:3]] = tax_count
        if tax_count % 10 == 0:
            msg = "Generating taxonomic groups: " + str(tax_count) + " / " + str(TAX_GROUPS)
            # Backspaces let the next progress message overwrite this one.
            sys.stdout.write(msg + "\b" * len(msg))
    print("Generating taxonomic groups: " + str(TAX_GROUPS) + " / " + str(TAX_GROUPS))

    # Create species table
    table = Table("species", delimiter=",")
    table.columns = [("species_id", ("pk-int",)),
                     ("family", ("char",)),
                     ("genus", ("char",)),
                     ("species", ("char",)),
                     ("id_level", ("char", 10)),
                     ("full_id", ("bool",))]

    data = [','.join([str(tax_dict[group[:3]])] + ['"%s"' % g for g in group])
            for group in unique_tax]
    table.pk = 'species_id'
    table.contains_pk = True

    self.engine.table = table
    self.engine.create_table()
    self.engine.add_to_table(data)

    # Create stems table
    table = Table("stems", delimiter=",", contains_pk=False)
    table.columns = [("stem_id", ("pk-auto",)),
                     ("line", ("int",)),
                     ("species_id", ("int",)),
                     ("site_code", ("char", 12)),
                     ("liana", ("char", 10)),
                     ("stem", ("double",))]
    stems = []
    counts = []
    for line in lines:
        liana = line.get("liana", "")
        species_info = [line["line"],
                        tax_dict[(line["family"], line["genus"], line["species"].lower())],
                        line["site"],
                        liana]
        # Rows from sheets without a count column contribute no count record.
        if "count" in line:
            counts.append([str(value) for value in species_info + [line["count"]]])

        for stem_value in line["stems"]:
            stems.append([str(value) for value in species_info + [stem_value]])

    data = [','.join(stem) for stem in stems]
    self.engine.table = table
    self.engine.create_table()
    self.engine.add_to_table(data)

    # Create counts table
    table = Table("counts", delimiter=",", contains_pk=False)
    table.columns = [("count_id", ("pk-auto",)),
                     ("line", ("int",)),
                     ("species_id", ("int",)),
                     ("site_code", ("char", 12)),
                     ("liana", ("char", 10)),
                     ("count", ("double",))]
    data = [','.join(count) for count in counts]
    self.engine.table = table
    self.engine.create_table()
    self.engine.add_to_table(data)

    return self.engine
def download(self, engine=None, debug=False):
    """Read the body-mass workbooks and insert their rows into the "mass" table.

    Each workbook named in the file list must already exist at the path
    given by ``engine.format_filename``; an ``Exception`` is raised
    otherwise.  The first sheet of every workbook is walked top to bottom.
    Rows are buffered one behind, because a later row can still add a
    subspecies or a scientific-name part to the previous record.  Each
    record is written as a ``~``-joined string.

    ``engine`` and ``debug`` are passed straight to ``Script.download``.

    Returns the engine.
    """
    Script.download(self, engine, debug)
    engine = self.engine

    table = self.tables["mass"]

    # Database column names and their data types. Use data type "skip" to skip the value, and
    # "combine" to merge a string value into the previous column
    table.columns = [("record_id", ("pk-auto",)),
                     ("family", ("char", 20)),
                     ("genus", ("char", 20)),
                     ("species", ("char", 20)),
                     ("subspecies", ("char", 20)),
                     ("common_name", ("char", 50)),
                     ("sex", ("char", 20)),
                     ("N", ("double",)),
                     ("mean", ("double",)),
                     ("std_dev", ("double",)),
                     ("min", ("double",)),
                     ("max", ("double",)),
                     ("season", ("char", 2)),
                     ("location", ("char", 50)),
                     ("source_num", ("char", 50))]
    engine.table = table
    engine.create_table()

    file_list = ["broadbills - tapaculos", "cotingas - NZ wrens",
                 "HA honeycreepers - icterids", "honeyeaters - corvids",
                 "jacanas - doves", "larks - accentors",
                 "muscicapids - babblers", "ostrich - waterfowl",
                 "parrotbills - sugarbirds", "parrots - nightjars",
                 "starlings - finches", "swifts - woodpeckers",
                 "thrushes - gnatcatchers", "vultures - bustards"]

    # One-letter sex codes used in the sheets and their spelled-out form;
    # any other cell value is stored unchanged.
    sex_names = {"M": "Male", "F": "Female", "B": "Both", "U": "Unknown"}
    cols = 11

    def family_name(header):
        # Drop the leading "Family" label.  str.lstrip("Family ") would
        # treat its argument as a set of characters and also eat the start
        # of names such as "Fringillidae".
        name = header.lstrip()
        if name.startswith("Family"):
            name = name[len("Family"):]
        return name.lstrip().title()

    for base_name in file_list:
        filename = base_name + ".xls"
        full_filename = engine.format_filename(filename)

        # Make sure file exists
        if not os.path.isfile(full_filename):
            raise Exception("Missing raw data file: " + full_filename)

        # Open excel file with xlrd
        book = xlrd.open_workbook(full_filename)
        sh = book.sheet_by_index(0)

        # Parenthesised single-argument form works as a statement on
        # Python 2 and as a function call on Python 3.
        print("Inserting data from " + filename + " . . .")
        lines = []
        lastrow = None
        lastvalues = None
        family = ""
        for n in range(sh.nrows):
            row = sh.row(n)
            if len(row) == 0:
                continue

            empty_cols = sum(1 for cell in row[0:cols] if Excel.empty_cell(cell))

            # Skip this row if all cells are empty or if it's the legend row
            if empty_cols == cols:
                continue
            first_cell = Excel.cell_value(row[0])
            if first_cell == "Scientific Name" or first_cell[0:7] == "Species":
                continue

            # A row with a single filled cell is either a family heading or
            # a subspecies name belonging to the previous record.
            if empty_cols == cols - 1:
                if "Family" in first_cell:
                    family = family_name(first_cell)
                elif not Excel.empty_cell(row[0]):
                    lastvalues[3] = first_cell
                continue

            # Values: 0=Family 1=Genus 2=Species 3=Subspecies 4=common name 5=sex
            # 6=N 7=Mean 8=std_dev 9=min 10=max 11=season 12=location 13=source_num
            values = [family]
            # If the first two columns are empty, but not all of them are,
            # use the first two columns from the previous row
            if Excel.empty_cell(row[0]) and Excel.empty_cell(row[1]):
                values.extend(sci_name(Excel.cell_value(lastrow[0])))
                values.append(Excel.cell_value(lastrow[1]))
            else:
                if len(first_cell.split()) == 1:
                    # If the scientific name is missing genus/species, fill it
                    # in from the previous row
                    values.extend(lastvalues[1:4])
                    # Put the single word into the last name slot that is
                    # still empty (subspecies, then species, then genus).
                    for offset in range(0, 3):
                        if not values[3 - offset]:
                            values[3 - offset] = first_cell
                            break
                    # Add new information to the previous scientific name
                    if lastvalues:
                        lastvalues[1:4] = values[1:4]
                else:
                    values.extend(sci_name(first_cell))
                values.append(Excel.cell_value(row[1]))

            sex_code = Excel.cell_value(row[2])
            values.append(sex_names.get(sex_code, sex_code))

            # Enter remaining values from cells
            values.extend(Excel.cell_value(row[i]) for i in range(3, cols))

            # If there isn't a common name or location, get it from
            # the previous row
            if not values[4]:
                values[4] = lastvalues[4]

            # The location is only inherited by a female record that
            # directly follows a male record.
            if (not values[12] and lastvalues
                    and lastvalues[5] == "Male" and values[5] == "Female"):
                values[12] = lastvalues[12]

            # Insert the previous row into the database
            if lastvalues:
                lines.append('~'.join(lastvalues))

            lastrow = row
            lastvalues = values

        # The final record is still pending; flush it even when it is the
        # only record of the sheet.
        if lastvalues:
            lines.append('~'.join(lastvalues))
        if lines:
            engine.add_to_table(lines)

    return engine