def download(self, engine=None, debug=False):
    """Fetch the Global Wood Density workbook and load its two tables.

    After delegating to ``Script.download``, the workbook is downloaded
    through the engine. Sheet index 1 becomes the tab-delimited "data"
    table and sheet index 2 becomes the tab-delimited "reference" table.
    The first row of each sheet is skipped, as are rows whose cells are all
    empty.

    Returns:
        ``self.engine`` once both tables have been created and filled.
    """
    Script.download(self, engine, debug)

    self.engine.download_file(self.urls["GWDD"],
                              "GlobalWoodDensityDatabase.xls")
    workbook_name = os.path.basename("GlobalWoodDensityDatabase.xls")
    book = xlrd.open_workbook(self.engine.format_filename(workbook_name))

    def format_value(cell):
        """Return the cell as title-cased text with '\\' -> '/' and no '"'."""
        text = str(Excel.cell_value(cell)).title()
        return text.replace("\\", "/").replace('"', '')

    def tab_separated_rows(sheet, labels):
        """Return one tab-joined string per non-empty row below row 0."""
        records = []
        for index in range(1, sheet.nrows):
            cells = sheet.row(index)
            if all(Excel.empty_cell(cell) for cell in cells):
                continue
            # One formatted value per label, taken from the leading columns.
            values = [format_value(cells[col]) for col in range(len(labels))]
            records.append('\t'.join(values))
        return records

    # Creating data table
    data_labels = ["Number", "Family", "Binomial", "Wood_Density",
                   "Region", "Reference_Number"]
    data = tab_separated_rows(book.sheet_by_index(1), data_labels)

    table = Table("data", delimiter="\t")
    table.columns = [
        ("Number", ("pk-int",)),
        ("Family", ("char",)),
        ("Binomial", ("char",)),
        ("Wood_Density", ("double",)),
        ("Region", ("char",)),
        ("Reference_Number", ("int",)),
    ]
    table.pk = 'Number'
    table.contains_pk = True

    self.engine.table = table
    self.engine.create_table()
    self.engine.add_to_table(data)

    # Creating reference table
    reference_labels = ["Reference_Number", "Reference"]
    data = tab_separated_rows(book.sheet_by_index(2), reference_labels)

    table = Table("reference", delimiter="\t")
    table.columns = [
        ("Reference_Number", ("pk-int",)),
        ("Reference", ("char",)),
    ]
    table.pk = 'Reference_Number'
    table.contains_pk = True

    self.engine.table = table
    self.engine.create_table()
    self.engine.add_to_table(data)

    return self.engine
def download(self, engine=None, debug=False):
    """Download the Gentry site and stem data and load four tables.

    The "sites" table is created and filled from ``self.urls["sites"]``.
    The first sheet of every workbook named in the "stems" archive is then
    read (plus CURUYUQU.xls, fetched separately when the engine cannot find
    it) and the "species", "stems" and "counts" tables are built from those
    rows.

    Args:
        engine: Passed through to ``Script.download``.
        debug: Passed through to ``Script.download``.

    Returns:
        ``self.engine`` after all tables have been created and filled.

    Raises:
        KeyError: If a workbook has no "line", "family", "genus" or
            "species" header column.
    """
    Script.download(self, engine, debug)

    self.engine.auto_create_table(Table("sites"), url=self.urls["sites"],
                                  filename='gentry_sites.csv')
    self.engine.insert_data_from_url(self.urls["sites"])

    self.engine.download_file(self.urls["stems"], "all_Excel.zip")
    local_zip = zipfile.ZipFile(self.engine.format_filename("all_Excel.zip"))
    filelist = local_zip.namelist()
    local_zip.close()
    self.engine.download_files_from_archive(self.urls["stems"], filelist)

    filelist = [os.path.basename(filename) for filename in filelist]

    # Currently all_Excel.zip is missing CURUYUQU.xls
    # Download it separately and add it to the file list
    # NOTE(review): the file is only appended when it had to be downloaded;
    # confirm that a previously cached copy is meant to be skipped.
    if not self.engine.find_file('CURUYUQU.xls'):
        self.engine.download_file(
            "http://www.mobot.org/mobot/gentry/123/samerica/CURUYUQU.xls",
            "CURUYUQU.xls")
        filelist.append('CURUYUQU.xls')

    lines = []
    tax = []
    for filename in filelist:
        print("Extracting data from " + filename + "...")
        book = xlrd.open_workbook(self.engine.format_filename(filename))
        sh = book.sheet_by_index(0)
        rows = sh.nrows

        # cn maps a normalised header name to its column number; "stems"
        # collects the numbers of every stem/dbh column.
        cn = {'stems': []}
        n = 0
        for colnum, c in enumerate(sh.row(0)):
            if not Excel.empty_cell(c):
                cid = c.value.lower().strip()
                # line number column is sometimes named differently
                if cid in ["sub", "number"]:
                    cid = "line"
                # the "number of individuals" column is named in various
                # different ways; they always at least contain "nd"
                if "nd" in cid:
                    cid = "count"
                # in QUIAPACA.xls the "number of individuals" column is
                # misnamed "STEMDBH" just like the stems columns, so weep
                # for the state of scientific data and then fix manually
                if filename == "QUIAPACA.xls" and colnum == 13:
                    cid = "count"

                # if column is a stem, add it to the list of stems;
                # otherwise, make note of the column name/number
                if "stem" in cid or "dbh" in cid:
                    cn["stems"].append(n)
                else:
                    cn[cid] = n
            n += 1

        # sometimes, a data file does not contain a liana or count column
        if "liana" not in cn:
            cn["liana"] = -1
        if "count" not in cn:
            cn["count"] = -1

        for row_index in range(1, rows):
            row = sh.row(row_index)
            # make sure the row is real, not just empty cells
            if all(Excel.empty_cell(cell) for cell in row):
                continue

            this_line = {}
            # get the following information from the appropriate columns
            for field in ["line", "family", "genus", "species",
                          "liana", "count"]:
                if cn[field] > -1:
                    cell = row[cn[field]]
                    if cell.ctype != 2:
                        # the cell type (ctype) is not a number
                        this_line[field] = cell.value.lower().strip().replace(
                            "\\", "/").replace('"', '')
                    else:
                        this_line[field] = cell.value
                    if this_line[field] == '`':
                        this_line[field] = 1

            this_line["stems"] = [row[c] for c in cn["stems"]
                                  if not Excel.empty_cell(row[c])]
            this_line["site"] = filename[0:-4]

            # Manually correct CEDRAL data, which has a single line
            # that is shifted by one to the left starting at Liana
            if (this_line["site"] == "CEDRAL"
                    and isinstance(this_line["liana"], float)):
                this_line["liana"] = ""
                this_line["count"] = 3
                this_line["stems"] = [2.5, 2.5, 30, 18, 25]

            lines.append(this_line)

            # Check how far the species is identified
            full_id = 0
            if len(this_line["species"]) < 3:
                if len(this_line["genus"]) < 3:
                    id_level = "family"
                else:
                    id_level = "genus"
            else:
                id_level = "species"
                full_id = 1
            tax.append((this_line["family"],
                        this_line["genus"],
                        this_line["species"],
                        id_level,
                        str(full_id)))

    tax = sorted(tax,
                 key=lambda group: group[0] + " " + group[1] + " " + group[2])
    unique_tax = []
    seen_tax = set()  # O(1) membership companion to the ordered unique_tax
    tax_dict = {}
    tax_count = 0

    # Get all unique families/genera/species
    print("\n")
    for group in tax:
        if group not in seen_tax:
            seen_tax.add(group)
            unique_tax.append(group)
            tax_count += 1
            tax_dict[group[0:3]] = tax_count
            if tax_count % 10 == 0:
                msg = ("Generating taxonomic groups: " + str(tax_count)
                       + " / " + str(TAX_GROUPS))
                # Write first, then flush, so the progress text shows up
                # immediately; the backspaces let the next message overwrite it.
                sys.stdout.write(msg + "\b" * len(msg))
                sys.stdout.flush()
    print("\n")

    # Create species table
    table = Table("species", delimiter=",")
    table.columns = [("species_id", ("pk-int",)),
                     ("family", ("char",)),
                     ("genus", ("char",)),
                     ("species", ("char",)),
                     ("id_level", ("char", 10)),
                     ("full_id", ("int",))]

    data = [[str(tax_dict[group[:3]])] + ['"%s"' % g for g in group]
            for group in unique_tax]
    table.pk = 'species_id'
    table.contains_pk = True

    self.engine.table = table
    self.engine.create_table()
    self.engine.add_to_table(data)

    # Create stems table
    table = Table("stems", delimiter=",")
    table.columns = [("stem_id", ("pk-auto",)),
                     ("line", ("int",)),
                     ("species_id", ("int",)),
                     ("site_code", ("char", 12)),
                     ("liana", ("char", 10)),
                     ("stem", ("double",))]
    stems = []
    counts = []
    for line in lines:
        # Rows from workbooks without a liana column carry no "liana" key.
        liana = line.get("liana", "")
        species_info = [line["line"],
                        tax_dict[(line["family"],
                                  line["genus"],
                                  line["species"])],
                        line["site"],
                        liana]
        # Rows from workbooks without a count column contribute no count.
        if "count" in line:
            counts.append(species_info + [line["count"]])

        for stem_value in line["stems"]:
            stems.append(species_info + [str(stem_value)])

    self.engine.table = table
    self.engine.create_table()
    self.engine.add_to_table(stems)

    # Create counts table
    table = Table("counts", delimiter=",", contains_pk=False)
    table.columns = [("count_id", ("pk-auto",)),
                     ("line", ("int",)),
                     ("species_id", ("int",)),
                     ("site_code", ("char", 12)),
                     ("liana", ("char", 10)),
                     ("count", ("double",))]
    self.engine.table = table
    self.engine.create_table()
    self.engine.add_to_table(counts)

    return self.engine
def download(self, engine=None, debug=False):
    """Download the Global Wood Density workbook and load it via CSV files.

    Sheet index 1 is written to "gwdd_data.csv" and loaded into the "data"
    table; sheet index 2 is written to "gwdd_ref.csv" and loaded into the
    "reference" table. Every row after the first is copied, with each cell
    value converted by ``to_str``.

    Args:
        engine: Passed through to ``Script.download``.
        debug: Passed through to ``Script.download``.

    Returns:
        ``self.engine`` after both tables have been created and filled.
    """
    Script.download(self, engine, debug)

    # Python 2 idiom: reloading sys re-exposes setdefaultencoding.
    # NOTE(review): `reload` is not a builtin on Python 3 — presumably a
    # compatibility import elsewhere in the file provides it; confirm.
    reload(sys)
    if hasattr(sys, 'setdefaultencoding'):
        sys.setdefaultencoding("utf-8")

    self.engine.download_file(self.urls["GWDD"],
                              "GlobalWoodDensityDatabase.xls")
    filename = os.path.basename("GlobalWoodDensityDatabase.xls")
    book = xlrd.open_workbook(self.engine.format_filename(filename))
    sh = book.sheet_by_index(1)
    rows = sh.nrows

    # Creating data files
    file_path = self.engine.format_filename("gwdd_data.csv")
    gwdd_data = open_fw(file_path)
    try:
        csv_writer = open_csvw(gwdd_data)
        csv_writer.writerow(["Number", "Family", "Binomial", "Wood_Density",
                             "Region", "Reference_Number"])
        for index in range(1, rows):
            row = sh.row(index)
            # get each row and format the cell value.
            row_as_list = [to_str(column_value.value) for column_value in row]
            csv_writer.writerow(row_as_list)
    finally:
        # Close the file even when a row fails to convert or write.
        gwdd_data.close()

    table = Table("data", delimiter=",")
    table.columns = [("Number", ("pk-int",)),
                     ("Family", ("char",)),
                     ("Binomial", ("char",)),
                     ("Wood_Density", ("double",)),
                     ("Region", ("char",)),
                     ("Reference_Number", ("int",))]
    table.pk = 'Number'
    table.contains_pk = True

    self.engine.table = table
    self.engine.create_table()
    # Use self.engine, not the raw `engine` argument, which may be None.
    self.engine.insert_data_from_file(self.engine.format_filename(file_path))

    # Creating reference table file
    file_path = self.engine.format_filename("gwdd_ref.csv")
    ref_file = open_fw(file_path)
    try:
        csv_writerd = open_csvw(ref_file)
        csv_writerd.writerow(["Reference_Number", "Reference"])
        sh = book.sheet_by_index(2)
        rows = sh.nrows
        for index in range(1, rows):
            row = sh.row(index)
            # get each row and format the cell value.
            row_as_list = [to_str(column_value.value,
                                  object_encoding=sys.stdout)
                           for column_value in row]
            csv_writerd.writerow(row_as_list)
    finally:
        ref_file.close()

    table = Table("reference", delimiter=",")
    table.columns = [("Reference_Number", ("pk-int",)),
                     ("Reference", ("char",))]
    table.pk = 'Reference_Number'
    table.contains_pk = True

    self.engine.table = table
    self.engine.create_table()
    self.engine.insert_data_from_file(self.engine.format_filename(file_path))

    return self.engine
def download(self, engine=None, debug=False):
    """Download the Gentry site and stem data and load four tables.

    The "sites" table is created and filled from ``self.urls["sites"]``.
    The first sheet of every workbook named in the "stems" archive is then
    read and the "species", "stems" and "counts" tables are built from
    comma-joined text rows.

    Args:
        engine: Passed through to ``Script.download``.
        debug: Passed through to ``Script.download``.

    Returns:
        ``self.engine`` after all tables have been created and filled.

    Raises:
        KeyError: If a workbook has no "line", "family", "genus" or
            "species" header column.
    """
    Script.download(self, engine, debug)

    self.engine.auto_create_table(Table("sites"), url=self.urls["sites"])
    self.engine.insert_data_from_url(self.urls["sites"])

    self.engine.download_file(self.urls["stems"], "all_Excel.zip")
    local_zip = zipfile.ZipFile(self.engine.format_filename("all_Excel.zip"))
    filelist = local_zip.namelist()
    local_zip.close()
    self.engine.download_files_from_archive(self.urls["stems"], filelist)

    filelist = [os.path.basename(filename) for filename in filelist]

    def format_value(s):
        """Return the cell as title-cased text with '\\' -> '/' and no '"'."""
        s = Excel.cell_value(s)
        return str(s).title().replace("\\", "/").replace('"', '')

    lines = []
    tax = []
    for filename in filelist:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("Extracting data from " + filename + "...")
        book = xlrd.open_workbook(self.engine.format_filename(filename))
        sh = book.sheet_by_index(0)
        rows = sh.nrows

        # cn maps a normalised header name to its column number; "stems"
        # collects the numbers of every stem column.
        cn = {'stems': []}
        n = 0
        for c in sh.row(0):
            if not Excel.empty_cell(c):
                cid = Excel.cell_value(c).lower()
                # line number column is sometimes named differently
                if cid in ["sub", "number"]:
                    cid = "line"
                # the "number of individuals" column is named in various
                # different ways; they always at least contain "nd"
                if "nd" in cid:
                    cid = "count"
                # if column is a stem, add it to the list of stems;
                # otherwise, make note of the column name/number
                if "stem" in cid:
                    cn["stems"].append(n)
                else:
                    cn[cid] = n
            n += 1

        # sometimes, a data file does not contain a liana or count column
        if "liana" not in cn:
            cn["liana"] = -1
        if "count" not in cn:
            cn["count"] = -1

        for row_index in range(1, rows):
            row = sh.row(row_index)
            cellcount = len(row)
            # make sure the row is real, not just empty cells
            if not (cellcount > 4 and not Excel.empty_cell(row[0])):
                continue

            this_line = {}
            # get the following information from the appropriate columns
            for field in ["line", "family", "genus", "species",
                          "liana", "count"]:
                if cn[field] > -1:
                    this_line[field] = format_value(row[cn[field]])
                    if this_line[field] == '`':
                        this_line[field] = 1

            this_line["stems"] = [Excel.cell_value(row[c])
                                  for c in cn["stems"]
                                  if not Excel.empty_cell(row[c])]
            this_line["site"] = filename[0:-4]

            lines.append(this_line)

            # Check how far the species is identified
            full_id = 0
            if len(this_line["species"]) < 3:
                if len(this_line["genus"]) < 3:
                    id_level = "family"
                else:
                    id_level = "genus"
            else:
                id_level = "species"
                full_id = 1
            tax.append((this_line["family"],
                        this_line["genus"],
                        this_line["species"].lower().replace('\\', '').replace('"', ''),
                        id_level,
                        str(full_id)))

    tax = sorted(tax,
                 key=lambda group: group[0] + " " + group[1] + " " + group[2])
    unique_tax = []
    seen_tax = set()  # O(1) membership companion to the ordered unique_tax
    tax_dict = dict()
    tax_count = 0

    # Get all unique families/genera/species
    for group in tax:
        if group not in seen_tax:
            seen_tax.add(group)
            unique_tax.append(group)
            tax_count += 1
            tax_dict[group[0:3]] = tax_count
            if tax_count % 10 == 0:
                msg = ("Generating taxonomic groups: " + str(tax_count)
                       + " / " + str(TAX_GROUPS))
                # Backspaces let the next progress message overwrite this one.
                sys.stdout.write(msg + "\b" * len(msg))
    print("Generating taxonomic groups: " + str(TAX_GROUPS)
          + " / " + str(TAX_GROUPS))

    # Create species table
    table = Table("species", delimiter=",")
    table.columns = [("species_id", ("pk-int",)),
                     ("family", ("char",)),
                     ("genus", ("char",)),
                     ("species", ("char",)),
                     ("id_level", ("char", 10)),
                     ("full_id", ("bool",))]

    data = [','.join([str(tax_dict[group[:3]])] + ['"%s"' % g for g in group])
            for group in unique_tax]
    table.pk = 'species_id'
    table.contains_pk = True

    self.engine.table = table
    self.engine.create_table()
    self.engine.add_to_table(data)

    # Create stems table
    table = Table("stems", delimiter=",", contains_pk=False)
    table.columns = [("stem_id", ("pk-auto",)),
                     ("line", ("int",)),
                     ("species_id", ("int",)),
                     ("site_code", ("char", 12)),
                     ("liana", ("char", 10)),
                     ("stem", ("double",))]
    stems = []
    counts = []
    for line in lines:
        # Rows from workbooks without a liana column carry no "liana" key.
        liana = line.get("liana", "")
        species_info = [line["line"],
                        tax_dict[(line["family"],
                                  line["genus"],
                                  line["species"].lower())],
                        line["site"],
                        liana]
        # Rows from workbooks without a count column contribute no count.
        if "count" in line:
            counts.append([str(value)
                           for value in species_info + [line["count"]]])

        for stem_value in line["stems"]:
            stem = species_info + [stem_value]
            stems.append([str(value) for value in stem])

    data = [','.join(stem) for stem in stems]
    self.engine.table = table
    self.engine.create_table()
    self.engine.add_to_table(data)

    # Create counts table
    table = Table("counts", delimiter=",", contains_pk=False)
    table.columns = [("count_id", ("pk-auto",)),
                     ("line", ("int",)),
                     ("species_id", ("int",)),
                     ("site_code", ("char", 12)),
                     ("liana", ("char", 10)),
                     ("count", ("double",))]
    data = [','.join(count) for count in counts]
    self.engine.table = table
    self.engine.create_table()
    self.engine.add_to_table(data)

    return self.engine
def download(self, engine=None, debug=False):
    """Download the Global Wood Density workbook and load it via CSV files.

    Sheet index 1 is written to "gwdd_data.csv" and loaded into the "data"
    table; sheet index 2 is written to "gwdd_ref.csv" and loaded into the
    "reference" table. Every row after the first is copied, with each cell
    value converted by ``to_str``.

    Args:
        engine: Passed through to ``Script.download``.
        debug: Passed through to ``Script.download``.

    Returns:
        ``self.engine`` after both tables have been created and filled.
    """
    Script.download(self, engine, debug)

    # Python 2 idiom: reloading sys re-exposes setdefaultencoding.
    # NOTE(review): `reload` is not a builtin on Python 3 — presumably a
    # compatibility import elsewhere in the file provides it; confirm.
    reload(sys)
    if hasattr(sys, 'setdefaultencoding'):
        sys.setdefaultencoding("utf-8")

    self.engine.download_file(self.urls["GWDD"],
                              "GlobalWoodDensityDatabase.xls")
    filename = os.path.basename("GlobalWoodDensityDatabase.xls")
    book = xlrd.open_workbook(self.engine.format_filename(filename))

    def write_sheet_csv(sheet, handle, header, **to_str_options):
        """Write `header` and every row after row 0 of `sheet` to `handle`."""
        writer = open_csvw(handle)
        writer.writerow(header)
        for index in range(1, sheet.nrows):
            # get each row and format the cell value.
            writer.writerow([to_str(cell.value, **to_str_options)
                             for cell in sheet.row(index)])

    # Creating data files
    sh = book.sheet_by_index(1)
    file_path = self.engine.format_filename("gwdd_data.csv")
    gwdd_data = open_fw(file_path)
    try:
        write_sheet_csv(sh, gwdd_data,
                        ["Number", "Family", "Binomial", "Wood_Density",
                         "Region", "Reference_Number"])
    finally:
        # Close the file even when a row fails to convert or write.
        gwdd_data.close()

    table = Table("data", delimiter=",")
    table.columns = [("Number", ("pk-int",)),
                     ("Family", ("char",)),
                     ("Binomial", ("char",)),
                     ("Wood_Density", ("double",)),
                     ("Region", ("char",)),
                     ("Reference_Number", ("int",))]
    table.pk = 'Number'
    table.contains_pk = True

    self.engine.table = table
    self.engine.create_table()
    # Use self.engine, not the raw `engine` argument, which may be None.
    self.engine.insert_data_from_file(self.engine.format_filename(file_path))

    # Creating reference table file
    file_path = self.engine.format_filename("gwdd_ref.csv")
    ref_file = open_fw(file_path)
    try:
        sh = book.sheet_by_index(2)
        write_sheet_csv(sh, ref_file, ["Reference_Number", "Reference"],
                        object_encoding=sys.stdout)
    finally:
        ref_file.close()

    table = Table("reference", delimiter=",")
    table.columns = [("Reference_Number", ("pk-int",)),
                     ("Reference", ("char",))]
    table.pk = 'Reference_Number'
    table.contains_pk = True

    self.engine.table = table
    self.engine.create_table()
    self.engine.insert_data_from_file(self.engine.format_filename(file_path))

    return self.engine
def download(self, engine=None, debug=False):
    """Download the Gentry site and stem data and load four tables.

    The "sites" table is created and filled from ``self.urls["sites"]``.
    The first sheet of every workbook named in the "stems" archive is then
    read (plus CURUYUQU.xls, fetched separately when the engine cannot find
    it) and the "species", "stems" and "counts" tables are built from those
    rows.

    Args:
        engine: Passed through to ``Script.download``.
        debug: Passed through to ``Script.download``.

    Returns:
        ``self.engine`` after all tables have been created and filled.

    Raises:
        KeyError: If a workbook has no "line", "family", "genus" or
            "species" header column.
    """
    Script.download(self, engine, debug)

    self.engine.auto_create_table(Table("sites"), url=self.urls["sites"],
                                  filename='gentry_sites.csv')
    self.engine.insert_data_from_url(self.urls["sites"])

    self.engine.download_file(self.urls["stems"], "all_Excel.zip")
    local_zip = zipfile.ZipFile(self.engine.format_filename("all_Excel.zip"))
    filelist = local_zip.namelist()
    local_zip.close()
    self.engine.download_files_from_archive(self.urls["stems"], filelist)

    filelist = [os.path.basename(filename) for filename in filelist]

    # Currently all_Excel.zip is missing CURUYUQU.xls
    # Download it separately and add it to the file list
    # NOTE(review): the file is only appended when it had to be downloaded;
    # confirm that a previously cached copy is meant to be skipped.
    if not self.engine.find_file('CURUYUQU.xls'):
        self.engine.download_file(
            "http://www.mobot.org/mobot/gentry/123/samerica/CURUYUQU.xls",
            "CURUYUQU.xls")
        filelist.append('CURUYUQU.xls')

    lines = []
    tax = []
    for filename in filelist:
        print("Extracting data from " + filename + "...")
        book = xlrd.open_workbook(self.engine.format_filename(filename))
        sh = book.sheet_by_index(0)
        rows = sh.nrows

        # cn maps a normalised header name to its column number; "stems"
        # collects the numbers of every stem/dbh column.
        cn = {'stems': []}
        n = 0
        for colnum, c in enumerate(sh.row(0)):
            if not Excel.empty_cell(c):
                cid = c.value.lower().strip()
                # line number column is sometimes named differently
                if cid in ["sub", "number"]:
                    cid = "line"
                # the "number of individuals" column is named in various
                # different ways; they always at least contain "nd"
                if "nd" in cid:
                    cid = "count"
                # in QUIAPACA.xls the "number of individuals" column is
                # misnamed "STEMDBH" just like the stems columns, so weep
                # for the state of scientific data and then fix manually
                if filename == "QUIAPACA.xls" and colnum == 13:
                    cid = "count"

                # if column is a stem, add it to the list of stems;
                # otherwise, make note of the column name/number
                if "stem" in cid or "dbh" in cid:
                    cn["stems"].append(n)
                else:
                    cn[cid] = n
            n += 1

        # sometimes, a data file does not contain a liana or count column
        if "liana" not in cn:
            cn["liana"] = -1
        if "count" not in cn:
            cn["count"] = -1

        for row_index in range(1, rows):
            row = sh.row(row_index)
            # make sure the row is real, not just empty cells
            if all(Excel.empty_cell(cell) for cell in row):
                continue

            this_line = {}
            # get the following information from the appropriate columns
            for field in ["line", "family", "genus", "species",
                          "liana", "count"]:
                if cn[field] > -1:
                    cell = row[cn[field]]
                    if cell.ctype != 2:
                        # the cell type (ctype) is not a number
                        this_line[field] = cell.value.lower().strip().replace(
                            "\\", "/").replace('"', '')
                    else:
                        this_line[field] = cell.value
                    if this_line[field] == '`':
                        this_line[field] = 1

            this_line["stems"] = [row[c] for c in cn["stems"]
                                  if not Excel.empty_cell(row[c])]
            this_line["site"] = filename[0:-4]

            # Manually correct CEDRAL data, which has a single line
            # that is shifted by one to the left starting at Liana
            if (this_line["site"] == "CEDRAL"
                    and isinstance(this_line["liana"], float)):
                this_line["liana"] = ""
                this_line["count"] = 3
                this_line["stems"] = [2.5, 2.5, 30, 18, 25]

            lines.append(this_line)

            # Check how far the species is identified
            full_id = 0
            if len(this_line["species"]) < 3:
                if len(this_line["genus"]) < 3:
                    id_level = "family"
                else:
                    id_level = "genus"
            else:
                id_level = "species"
                full_id = 1
            tax.append((this_line["family"],
                        this_line["genus"],
                        this_line["species"],
                        id_level,
                        str(full_id)))

    tax = sorted(tax,
                 key=lambda group: group[0] + " " + group[1] + " " + group[2])
    unique_tax = []
    seen_tax = set()  # O(1) membership companion to the ordered unique_tax
    tax_dict = {}
    tax_count = 0

    # Get all unique families/genera/species
    print("\n")
    for group in tax:
        if group not in seen_tax:
            seen_tax.add(group)
            unique_tax.append(group)
            tax_count += 1
            tax_dict[group[0:3]] = tax_count
            if tax_count % 10 == 0:
                msg = ("Generating taxonomic groups: " + str(tax_count)
                       + " / " + str(TAX_GROUPS))
                # Write first, then flush, so the progress text shows up
                # immediately; the backspaces let the next message overwrite it.
                sys.stdout.write(msg + "\b" * len(msg))
                sys.stdout.flush()
    print("\n")

    # Create species table
    table = Table("species", delimiter=",")
    table.columns = [("species_id", ("pk-int",)),
                     ("family", ("char",)),
                     ("genus", ("char",)),
                     ("species", ("char",)),
                     ("id_level", ("char", 10)),
                     ("full_id", ("int",))]

    data = [[str(tax_dict[group[:3]])] + ['"%s"' % g for g in group]
            for group in unique_tax]
    table.pk = 'species_id'
    table.contains_pk = True

    self.engine.table = table
    self.engine.create_table()
    self.engine.add_to_table(data)

    # Create stems table
    table = Table("stems", delimiter=",")
    table.columns = [("stem_id", ("pk-auto",)),
                     ("line", ("int",)),
                     ("species_id", ("int",)),
                     ("site_code", ("char", 12)),
                     ("liana", ("char", 10)),
                     ("stem", ("double",))]
    stems = []
    counts = []
    for line in lines:
        # Rows from workbooks without a liana column carry no "liana" key.
        liana = line.get("liana", "")
        species_info = [line["line"],
                        tax_dict[(line["family"],
                                  line["genus"],
                                  line["species"])],
                        line["site"],
                        liana]
        # Rows from workbooks without a count column contribute no count.
        if "count" in line:
            counts.append(species_info + [line["count"]])

        for stem_value in line["stems"]:
            stems.append(species_info + [str(stem_value)])

    self.engine.table = table
    self.engine.create_table()
    self.engine.add_to_table(stems)

    # Create counts table
    table = Table("counts", delimiter=",", contains_pk=False)
    table.columns = [("count_id", ("pk-auto",)),
                     ("line", ("int",)),
                     ("species_id", ("int",)),
                     ("site_code", ("char", 12)),
                     ("liana", ("char", 10)),
                     ("count", ("double",))]
    self.engine.table = table
    self.engine.create_table()
    self.engine.add_to_table(counts)

    return self.engine
def download(self, engine=None, debug=False):
    """Load the Global Wood Density workbook into two tab-delimited tables.

    ``Script.download`` runs first, then the workbook is downloaded through
    the engine. Sheet index 1 populates the "data" table and sheet index 2
    populates the "reference" table. Row 0 of each sheet is skipped, as are
    rows in which every cell is empty.

    Returns:
        ``self.engine`` once both tables have been created and filled.
    """
    Script.download(self, engine, debug)

    self.engine.download_file(self.urls["GWDD"],
                              "GlobalWoodDensityDatabase.xls")
    xls_name = os.path.basename("GlobalWoodDensityDatabase.xls")
    book = xlrd.open_workbook(self.engine.format_filename(xls_name))

    def clean(cell):
        """Title-case the cell text, turn '\\' into '/' and drop '"'."""
        titled = str(Excel.cell_value(cell)).title()
        return titled.replace("\\", "/").replace('"', '')

    def read_sheet(sheet_index, labels):
        """Return tab-joined text for each non-empty row of the sheet."""
        sheet = book.sheet_by_index(sheet_index)
        joined = []
        for row_number in range(1, sheet.nrows):
            cells = sheet.row(row_number)
            if not all(Excel.empty_cell(cell) for cell in cells):
                record = {}
                for position, label in enumerate(labels):
                    record[label] = clean(cells[position])
                joined.append('\t'.join([record[label] for label in labels]))
        return joined

    def load(table, rows_of_text):
        """Hand the table definition and its rows to the engine."""
        self.engine.table = table
        self.engine.create_table()
        self.engine.add_to_table(rows_of_text)

    # Creating data table
    data = read_sheet(1, ["Number", "Family", "Binomial", "Wood_Density",
                          "Region", "Reference_Number"])
    data_table = Table("data", delimiter="\t")
    data_table.columns = [("Number", ("pk-int",)),
                          ("Family", ("char",)),
                          ("Binomial", ("char",)),
                          ("Wood_Density", ("double",)),
                          ("Region", ("char",)),
                          ("Reference_Number", ("int",))]
    data_table.pk = 'Number'
    data_table.contains_pk = True
    load(data_table, data)

    # Creating reference table
    data = read_sheet(2, ["Reference_Number", "Reference"])
    reference_table = Table("reference", delimiter="\t")
    reference_table.columns = [("Reference_Number", ("pk-int",)),
                               ("Reference", ("char",))]
    reference_table.pk = 'Reference_Number'
    reference_table.contains_pk = True
    load(reference_table, data)

    return self.engine