예제 #1
0
    def download(self, engine=None, debug=False):
        """Download the Gentry transect workbooks and load them into tables.

        The ``sites`` table is created and filled straight from
        ``self.urls["sites"]``. Every workbook listed in the ``stems``
        archive (plus ``CURUYUQU.xls``, which the archive lacks) is then
        parsed to build three more tables: ``species`` (unique
        family/genus/species groups), ``stems`` (one row per stem value) and
        ``counts`` (one row per data line that has a count value).

        Args:
            engine: Passed through to ``Script.download``.
            debug: Passed through to ``Script.download``.

        Returns:
            ``self.engine`` once all tables have been created and filled.

        Raises:
            KeyError: If a workbook has no ``line``, ``family``, ``genus`` or
                ``species`` header column.
        """
        Script.download(self, engine, debug)

        self.engine.auto_create_table(Table("sites"), url=self.urls["sites"])
        self.engine.insert_data_from_url(self.urls["sites"])

        self.engine.download_file(self.urls["stems"], "all_Excel.zip")
        # The context manager closes the archive even if namelist() raises
        with zipfile.ZipFile(
                self.engine.format_filename("all_Excel.zip")) as local_zip:
            filelist = local_zip.namelist()
        self.engine.download_files_from_archive(self.urls["stems"], filelist)

        filelist = [os.path.basename(filename) for filename in filelist]

        # Currently all_Excel.zip is missing CURUYUQU.xls, so fetch it
        # separately when the engine cannot find an existing copy.
        if not self.engine.find_file('CURUYUQU.xls'):
            self.engine.download_file(
                "http://www.mobot.org/mobot/gentry/123/samerica/CURUYUQU.xls",
                "CURUYUQU.xls",
                clean_line_endings=False)
        # Always process the file. It used to be added to the list only
        # right after a fresh download, so a rerun that found an existing
        # copy silently dropped this site.
        # NOTE(review): assumes find_file() reports a copy that
        # format_filename() can also resolve - confirm against the engine.
        if 'CURUYUQU.xls' not in filelist:
            filelist.append('CURUYUQU.xls')

        def format_value(cell):
            # Title-case the cell text, turn backslashes into forward
            # slashes and drop double quotes
            text = Excel.cell_value(cell)
            return str(text).title().replace("\\", "/").replace('"', '')

        fields = ("line", "family", "genus", "species", "liana", "count")

        lines = []
        tax = []
        for filename in filelist:
            print("Extracting data from " + filename + "...")
            book = xlrd.open_workbook(self.engine.format_filename(filename))
            sh = book.sheet_by_index(0)
            rows = sh.nrows
            # Maps a normalised header name to its column index; "stems"
            # collects the indices of every stem column
            cn = {'stems': []}
            for colnum, c in enumerate(sh.row(0)):
                if Excel.empty_cell(c):
                    continue
                cid = Excel.cell_value(c).lower()
                # line number column is sometimes named differently
                if cid in ["sub", "number"]:
                    cid = "line"
                # the "number of individuals" column is named in various
                # different ways; they always at least contain "nd"
                if "nd" in cid:
                    cid = "count"
                # in QUIAPACA.xls the "number of individuals" column is
                # misnamed "STEMDBH" just like the stems columns, so it is
                # corrected by position
                if filename == "QUIAPACA.xls" and colnum == 13:
                    cid = "count"

                # if column is a stem, add it to the list of stems;
                # otherwise, make note of the column name/number
                if "stem" in cid or "dbh" in cid:
                    cn["stems"].append(colnum)
                else:
                    cn[cid] = colnum
            # sometimes, a data file does not contain a liana or count
            # column; -1 marks the column as absent
            cn.setdefault("liana", -1)
            cn.setdefault("count", -1)

            for rownum in range(1, rows):
                row = sh.row(rownum)
                # make sure the row is real, not just empty cells
                if all(Excel.empty_cell(cell) for cell in row):
                    continue

                this_line = {}
                # get the following information from the appropriate columns
                for field in fields:
                    if cn[field] > -1:
                        this_line[field] = format_value(row[cn[field]])
                        # a lone backtick in the data is stored as 1
                        if this_line[field] == '`':
                            this_line[field] = 1

                this_line["stems"] = [
                    Excel.cell_value(row[c]) for c in cn["stems"]
                    if not Excel.empty_cell(row[c])
                ]
                # site code is the file name without its 4-character suffix
                this_line["site"] = filename[0:-4]

                lines.append(this_line)

                # Check how far the species is identified: names shorter
                # than three characters count as missing
                if len(this_line["species"]) >= 3:
                    id_level, full_id = "species", 1
                elif len(this_line["genus"]) >= 3:
                    id_level, full_id = "genus", 0
                else:
                    id_level, full_id = "family", 0
                tax.append((this_line["family"], this_line["genus"],
                            this_line["species"].lower().replace(
                                '\\', '').replace('"', ''), id_level,
                            str(full_id)))

        tax = sorted(
            tax, key=lambda group: group[0] + " " + group[1] + " " + group[2])
        unique_tax = []
        seen = set()
        tax_dict = dict()

        # Get all unique families/genera/species; ids are assigned in sorted
        # order starting at 1. The set keeps the membership test O(1).
        for group in tax:
            if group in seen:
                continue
            seen.add(group)
            unique_tax.append(group)
            tax_count = len(unique_tax)
            tax_dict[group[0:3]] = tax_count
            if tax_count % 10 == 0:
                msg = "Generating taxonomic groups: " + str(
                    tax_count) + " / " + str(TAX_GROUPS)
                # backspaces move the cursor back so the next message
                # overwrites this one
                sys.stdout.write(msg + "\b" * len(msg))
        print("Generating taxonomic groups: " + str(TAX_GROUPS) + " / " +
              str(TAX_GROUPS))

        # Create species table
        table = Table("species", delimiter=",")
        table.columns = [("species_id", ("pk-int", )), ("family", ("char", )),
                         ("genus", ("char", )), ("species", ("char", )),
                         ("id_level", ("char", 10)), ("full_id", ("bool", ))]

        data = [
            ','.join([str(tax_dict[group[:3]])] + ['"%s"' % g for g in group])
            for group in unique_tax
        ]
        table.pk = 'species_id'
        table.contains_pk = True

        self.engine.table = table
        self.engine.create_table()
        self.engine.add_to_table(data)

        # Create stems table
        table = Table("stems", delimiter=",", contains_pk=False)
        table.columns = [("stem_id", ("pk-auto", )), ("line", ("int", )),
                         ("species_id", ("int", )),
                         ("site_code", ("char", 12)), ("liana", ("char", 10)),
                         ("stem", ("double", ))]
        stems = []
        counts = []
        for line in lines:
            liana = line.get("liana", "")
            species_info = [
                line["line"], tax_dict[(line["family"], line["genus"],
                                        line["species"].lower())],
                line["site"], liana
            ]
            # lines from workbooks without a count column have no count
            if "count" in line:
                counts.append(
                    [str(value) for value in species_info + [line["count"]]])

            for stem_value in line["stems"]:
                stem = species_info + [stem_value]
                stems.append([str(value) for value in stem])

        data = [','.join(stem) for stem in stems]
        self.engine.table = table
        self.engine.create_table()
        self.engine.add_to_table(data)

        # Create counts table
        table = Table("counts", delimiter=",", contains_pk=False)
        table.columns = [("count_id", ("pk-auto", )), ("line", ("int", )),
                         ("species_id", ("int", )),
                         ("site_code", ("char", 12)), ("liana", ("char", 10)),
                         ("count", ("double", ))]
        data = [','.join(count) for count in counts]
        self.engine.table = table
        self.engine.create_table()
        self.engine.add_to_table(data)

        return self.engine
예제 #2
0
 def format_value(s):
     """Return the cell's text title-cased, with every backslash turned
     into a forward slash and every double quote removed."""
     text = str(Excel.cell_value(s)).title()
     text = text.replace("\\", "/")
     return text.replace('"', '')
예제 #3
0
    def download(self, engine=None, debug=False):
        """Parse the bird body-mass workbooks and insert them into ``mass``.

        Every workbook named in ``file_list`` must already exist at the path
        returned by ``engine.format_filename``. Each data row becomes one
        ``~``-joined record; the records of a workbook are handed to
        ``engine.add_to_table`` in a single call.

        A record is only queued once the following data row has been read,
        because follow-up rows may still amend it (a lone subspecies cell or
        a partial scientific name).

        Args:
            engine: Passed through to ``Script.download``.
            debug: Passed through to ``Script.download``.

        Returns:
            The engine stored on ``self.engine``.

        Raises:
            Exception: If one of the raw data files is missing.
        """
        Script.download(self, engine, debug)

        engine = self.engine

        table = self.tables["mass"]

        # Database column names and their data types. Use data type "skip" to skip the value, and
        # "combine" to merge a string value into the previous column
        table.columns = [("record_id", ("pk-auto", )),
                         ("family", ("char", 20)), ("genus", ("char", 20)),
                         ("species", ("char", 20)),
                         ("subspecies", ("char", 20)),
                         ("common_name", ("char", 50)), ("sex", ("char", 20)),
                         ("N", ("double", )), ("mean", ("double", )),
                         ("std_dev", ("double", )), ("min", ("double", )),
                         ("max", ("double", )), ("season", ("char", 2)),
                         ("location", ("char", 50)),
                         ("source_num", ("char", 50))]
        engine.table = table
        engine.create_table()

        file_list = [
            "broadbills - tapaculos", "cotingas - NZ wrens",
            "HA honeycreepers - icterids", "honeyeaters - corvids",
            "jacanas - doves", "larks - accentors", "muscicapids - babblers",
            "ostrich - waterfowl", "parrotbills - sugarbirds",
            "parrots - nightjars", "starlings - finches",
            "swifts - woodpeckers", "thrushes - gnatcatchers",
            "vultures - bustards"
        ]

        # Spreadsheet sex codes and the labels stored in the database;
        # any other value is stored unchanged
        sex_names = {"M": "Male", "F": "Female", "B": "Both", "U": "Unknown"}

        for basename in file_list:
            filename = basename + ".xls"
            full_filename = engine.format_filename(filename)

            # Make sure file exists
            if not os.path.isfile(full_filename):
                raise Exception("Missing raw data file: " + full_filename)

            # Open excel file with xlrd
            book = xlrd.open_workbook(full_filename)
            sh = book.sheet_by_index(0)

            print("Inserting data from " + filename + " . . .")
            rows = sh.nrows
            cols = 11
            lines = []
            lastrow = None
            lastvalues = None
            family = ""
            for n in range(rows):
                row = sh.row(n)
                if len(row) == 0:
                    continue

                empty_cols = len(
                    [cell for cell in row[0:cols] if Excel.empty_cell(cell)])

                # Skip this row if all cells are empty or if it's the
                # legend row
                if ((empty_cols == cols)
                        or Excel.cell_value(row[0]) == "Scientific Name"
                        or Excel.cell_value(row[0])[0:7] == "Species"):
                    pass
                elif empty_cols == cols - 1:
                    # Exactly one filled cell: either a family heading or a
                    # subspecies name belonging to the pending record
                    label = Excel.cell_value(row[0])
                    if "Family" in label:
                        # Drop the leading "Family" word as a prefix.
                        # str.lstrip("Family ") strips a *set of characters*
                        # and so also ate the start of names such as
                        # "Falconidae" or "Laniidae".
                        name = label.lstrip()
                        if name.startswith("Family"):
                            name = name[len("Family"):].lstrip()
                        family = name.title()
                        continue
                    else:
                        if not Excel.empty_cell(row[0]):
                            lastvalues[3] = Excel.cell_value(row[0])
                else:
                    # Values: 0=Family 1=Genus 2=Species 3=Subspecies 4=common name 5=sex
                    # 6=N 7=Mean 8=std_dev 9=min 10=max 11=season 12=location 13=source_num
                    # (assumes sci_name() yields genus, species, subspecies
                    # - TODO confirm against its definition)
                    values = [family]
                    # If the first two columns are empty, but not all of them are,
                    # use the first two columns from the previous row
                    if Excel.empty_cell(row[0]) and Excel.empty_cell(row[1]):
                        values.extend(sci_name(Excel.cell_value(lastrow[0])))
                        values.append(Excel.cell_value(lastrow[1]))
                    else:
                        if len(Excel.cell_value(row[0]).split()) == 1:
                            # If the scientific name is missing genus/species, fill it
                            # in from the previous row
                            values.append(lastvalues[1])
                            values.append(lastvalues[2])
                            values.append(lastvalues[3])
                            # The single word goes into the last empty slot
                            # among subspecies, species, genus (in that order)
                            for i in range(0, 3):
                                if not values[3 - i]:
                                    values[3 - i] = Excel.cell_value(row[0])
                                    break
                            # Add new information to the previous scientific name
                            if lastvalues:
                                lastvalues[1:4] = values[1:4]
                        else:
                            values.extend(sci_name(Excel.cell_value(row[0])))
                        values.append(Excel.cell_value(row[1]))

                    sex_code = Excel.cell_value(row[2])
                    values.append(sex_names.get(sex_code, sex_code))

                    # Enter remaining values from cells
                    for i in range(3, cols):
                        values.append(Excel.cell_value(row[i]))

                    # If there isn't a common name, get it from the
                    # previous row
                    if not values[4]:
                        values[4] = lastvalues[4]
                    # A "Female" row directly after a "Male" row inherits
                    # the male row's location when its own is empty
                    if (not values[12] and lastvalues
                            and lastvalues[5] == "Male"
                            and values[5] == "Female"):
                        values[12] = lastvalues[12]

                    # Queue the previous row for insertion into the database
                    if lastvalues:
                        lines.append('~'.join(lastvalues))

                    lastrow = row
                    lastvalues = values

            # Queue the final pending record. Testing lastvalues rather than
            # lines keeps a workbook with a single data row from losing it.
            if lastvalues:
                lines.append('~'.join(lastvalues))
            if lines:
                engine.add_to_table(lines)

        return engine
예제 #4
0
파일: gentry.py 프로젝트: imclab/retriever
 def format_value(s):
     """Return the cell's text title-cased, with every backslash turned
     into a forward slash and every double quote removed."""
     titled = str(Excel.cell_value(s)).title()
     slashed = titled.replace("\\", "/")
     return slashed.replace('"', '')
예제 #5
0
파일: gentry.py 프로젝트: imclab/retriever
    def download(self, engine=None, debug=False):
        """Download the Gentry transect workbooks and load them into tables.

        The ``sites`` table is created and filled straight from
        ``self.urls["sites"]``. Every workbook listed in the ``stems``
        archive is then parsed to build three more tables: ``species``
        (unique family/genus/species groups), ``stems`` (one row per stem
        value) and ``counts`` (one row per data line that has a count value).

        Args:
            engine: Passed through to ``Script.download``.
            debug: Passed through to ``Script.download``.

        Returns:
            ``self.engine`` once all tables have been created and filled.

        Raises:
            KeyError: If a workbook has no ``line``, ``family``, ``genus`` or
                ``species`` header column.
        """
        Script.download(self, engine, debug)

        self.engine.auto_create_table(Table("sites"), url=self.urls["sites"])
        self.engine.insert_data_from_url(self.urls["sites"])

        self.engine.download_file(self.urls["stems"], "all_Excel.zip")
        # The context manager closes the archive even if namelist() raises
        with zipfile.ZipFile(
                self.engine.format_filename("all_Excel.zip")) as local_zip:
            filelist = local_zip.namelist()
        self.engine.download_files_from_archive(self.urls["stems"], filelist)

        filelist = [os.path.basename(filename) for filename in filelist]

        def format_value(cell):
            # Title-case the cell text, turn backslashes into forward
            # slashes and drop double quotes
            text = Excel.cell_value(cell)
            return str(text).title().replace("\\", "/").replace('"', '')

        fields = ("line", "family", "genus", "species", "liana", "count")

        lines = []
        tax = []
        for filename in filelist:
            # Parenthesised single-argument print works as both the
            # Python 2 statement and the Python 3 function
            print("Extracting data from " + filename + "...")
            book = xlrd.open_workbook(self.engine.format_filename(filename))
            sh = book.sheet_by_index(0)
            rows = sh.nrows
            # Maps a normalised header name to its column index; "stems"
            # collects the indices of every stem column
            cn = {'stems': []}
            for colnum, c in enumerate(sh.row(0)):
                if Excel.empty_cell(c):
                    continue
                cid = Excel.cell_value(c).lower()
                # line number column is sometimes named differently
                if cid in ["sub", "number"]:
                    cid = "line"
                # the "number of individuals" column is named in various
                # different ways; they always at least contain "nd"
                if "nd" in cid:
                    cid = "count"
                # if column is a stem, add it to the list of stems;
                # otherwise, make note of the column name/number
                if "stem" in cid:
                    cn["stems"].append(colnum)
                else:
                    cn[cid] = colnum
            # sometimes, a data file does not contain a liana or count
            # column; -1 marks the column as absent
            cn.setdefault("liana", -1)
            cn.setdefault("count", -1)

            for rownum in range(1, rows):
                row = sh.row(rownum)
                # make sure the row is real: more than four cells and a
                # non-empty first cell
                if len(row) <= 4 or Excel.empty_cell(row[0]):
                    continue

                this_line = {}
                # get the following information from the appropriate columns
                for field in fields:
                    if cn[field] > -1:
                        this_line[field] = format_value(row[cn[field]])
                        # a lone backtick in the data is stored as 1
                        if this_line[field] == '`':
                            this_line[field] = 1

                this_line["stems"] = [Excel.cell_value(row[c])
                                      for c in cn["stems"]
                                      if not Excel.empty_cell(row[c])]
                # site code is the file name without its 4-character suffix
                this_line["site"] = filename[0:-4]

                lines.append(this_line)

                # Check how far the species is identified: names shorter
                # than three characters count as missing
                if len(this_line["species"]) >= 3:
                    id_level, full_id = "species", 1
                elif len(this_line["genus"]) >= 3:
                    id_level, full_id = "genus", 0
                else:
                    id_level, full_id = "family", 0
                tax.append((this_line["family"],
                            this_line["genus"],
                            this_line["species"].lower().replace('\\', '').replace('"', ''),
                            id_level,
                            str(full_id)))

        tax = sorted(tax, key=lambda group: group[0] + " " + group[1] + " " + group[2])
        unique_tax = []
        seen = set()
        tax_dict = dict()

        # Get all unique families/genera/species; ids are assigned in sorted
        # order starting at 1. The set keeps the membership test O(1).
        for group in tax:
            if group in seen:
                continue
            seen.add(group)
            unique_tax.append(group)
            tax_count = len(unique_tax)
            tax_dict[group[0:3]] = tax_count
            if tax_count % 10 == 0:
                msg = "Generating taxonomic groups: " + str(tax_count) + " / " + str(TAX_GROUPS)
                # backspaces move the cursor back so the next message
                # overwrites this one
                sys.stdout.write(msg + "\b" * len(msg))
        print("Generating taxonomic groups: " + str(TAX_GROUPS) + " / " + str(TAX_GROUPS))

        # Create species table
        table = Table("species", delimiter=",")
        table.columns = [("species_id", ("pk-int",)),
                         ("family", ("char", )),
                         ("genus", ("char", )),
                         ("species", ("char", )),
                         ("id_level", ("char", 10)),
                         ("full_id", ("bool",))]

        data = [','.join([str(tax_dict[group[:3]])] + ['"%s"' % g for g in group])
                for group in unique_tax]
        table.pk = 'species_id'
        table.contains_pk = True

        self.engine.table = table
        self.engine.create_table()
        self.engine.add_to_table(data)

        # Create stems table
        table = Table("stems", delimiter=",", contains_pk=False)
        table.columns = [("stem_id", ("pk-auto",)),
                         ("line", ("int",)),
                         ("species_id", ("int",)),
                         ("site_code", ("char", 12)),
                         ("liana", ("char", 10)),
                         ("stem", ("double",))]
        stems = []
        counts = []
        for line in lines:
            liana = line.get("liana", "")
            species_info = [line["line"],
                            tax_dict[(line["family"],
                                      line["genus"],
                                      line["species"].lower())],
                            line["site"],
                            liana]
            # lines from workbooks without a count column have no count
            if "count" in line:
                counts.append([str(value) for value in species_info + [line["count"]]])

            for stem_value in line["stems"]:
                stem = species_info + [stem_value]
                stems.append([str(value) for value in stem])

        data = [','.join(stem) for stem in stems]
        self.engine.table = table
        self.engine.create_table()
        self.engine.add_to_table(data)

        # Create counts table
        table = Table("counts", delimiter=",", contains_pk=False)
        table.columns = [("count_id", ("pk-auto",)),
                         ("line", ("int",)),
                         ("species_id", ("int",)),
                         ("site_code", ("char", 12)),
                         ("liana", ("char", 10)),
                         ("count", ("double",))]
        data = [','.join(count) for count in counts]
        self.engine.table = table
        self.engine.create_table()
        self.engine.add_to_table(data)

        return self.engine
예제 #6
0
    def download(self, engine=None, debug=False):
        """Parse the bird body-mass workbooks and insert them into ``mass``.

        Every workbook named in ``file_list`` must already exist at the path
        returned by ``engine.format_filename``. Each data row becomes one
        ``~``-joined record; the records of a workbook are handed to
        ``engine.add_to_table`` in a single call.

        A record is only queued once the following data row has been read,
        because follow-up rows may still amend it (a lone subspecies cell or
        a partial scientific name).

        Args:
            engine: Passed through to ``Script.download``.
            debug: Passed through to ``Script.download``.

        Returns:
            The engine stored on ``self.engine``.

        Raises:
            Exception: If one of the raw data files is missing.
        """
        Script.download(self, engine, debug)

        engine = self.engine

        table = self.tables["mass"]

        # Database column names and their data types. Use data type "skip" to skip the value, and
        # "combine" to merge a string value into the previous column
        table.columns = [("record_id", ("pk-auto",)),
                         ("family", ("char", 20)),
                         ("genus", ("char", 20)),
                         ("species", ("char", 20)),
                         ("subspecies", ("char", 20)),
                         ("common_name", ("char", 50)),
                         ("sex", ("char", 20)),
                         ("N", ("double",)),
                         ("mean", ("double",)),
                         ("std_dev", ("double",)),
                         ("min", ("double",)),
                         ("max", ("double",)),
                         ("season", ("char", 2)),
                         ("location", ("char", 50)),
                         ("source_num", ("char", 50))]
        engine.table = table
        engine.create_table()

        file_list = ["broadbills - tapaculos", "cotingas - NZ wrens",
                     "HA honeycreepers - icterids", "honeyeaters - corvids",
                     "jacanas - doves", "larks - accentors",
                     "muscicapids - babblers", "ostrich - waterfowl",
                     "parrotbills - sugarbirds", "parrots - nightjars",
                     "starlings - finches", "swifts - woodpeckers",
                     "thrushes - gnatcatchers", "vultures - bustards"]

        # Spreadsheet sex codes and the labels stored in the database;
        # any other value is stored unchanged
        sex_names = {"M": "Male", "F": "Female", "B": "Both", "U": "Unknown"}

        for basename in file_list:
            filename = basename + ".xls"
            full_filename = engine.format_filename(filename)

            # Make sure file exists
            if not os.path.isfile(full_filename):
                raise Exception("Missing raw data file: " + full_filename)

            # Open excel file with xlrd
            book = xlrd.open_workbook(full_filename)
            sh = book.sheet_by_index(0)

            # Parenthesised single-argument print works as both the
            # Python 2 statement and the Python 3 function
            print("Inserting data from " + filename + " . . .")
            rows = sh.nrows
            cols = 11
            lines = []
            lastrow = None
            lastvalues = None
            family = ""
            for n in range(rows):
                row = sh.row(n)
                if len(row) == 0:
                    continue

                empty_cols = len([cell for cell in row[0:cols] if Excel.empty_cell(cell)])

                # Skip this row if all cells are empty or if it's the
                # legend row
                if ((empty_cols == cols)
                        or Excel.cell_value(row[0]) == "Scientific Name"
                        or Excel.cell_value(row[0])[0:7] == "Species"):
                    pass
                elif empty_cols == cols - 1:
                    # Exactly one filled cell: either a family heading or a
                    # subspecies name belonging to the pending record
                    label = Excel.cell_value(row[0])
                    if "Family" in label:
                        # Drop the leading "Family" word as a prefix.
                        # str.lstrip("Family ") strips a *set of characters*
                        # and so also ate the start of names such as
                        # "Falconidae" or "Laniidae".
                        name = label.lstrip()
                        if name.startswith("Family"):
                            name = name[len("Family"):].lstrip()
                        family = name.title()
                        continue
                    else:
                        if not Excel.empty_cell(row[0]):
                            lastvalues[3] = Excel.cell_value(row[0])
                else:
                    # Values: 0=Family 1=Genus 2=Species 3=Subspecies 4=common name 5=sex
                    # 6=N 7=Mean 8=std_dev 9=min 10=max 11=season 12=location 13=source_num
                    # (assumes sci_name() yields genus, species, subspecies
                    # - TODO confirm against its definition)
                    values = [family]
                    # If the first two columns are empty, but not all of them are,
                    # use the first two columns from the previous row
                    if Excel.empty_cell(row[0]) and Excel.empty_cell(row[1]):
                        values.extend(sci_name(Excel.cell_value(lastrow[0])))
                        values.append(Excel.cell_value(lastrow[1]))
                    else:
                        if len(Excel.cell_value(row[0]).split()) == 1:
                            # If the scientific name is missing genus/species, fill it
                            # in from the previous row
                            values.append(lastvalues[1])
                            values.append(lastvalues[2])
                            values.append(lastvalues[3])
                            # The single word goes into the last empty slot
                            # among subspecies, species, genus (in that order)
                            for i in range(0, 3):
                                if not values[3 - i]:
                                    values[3 - i] = Excel.cell_value(row[0])
                                    break
                            # Add new information to the previous scientific name
                            if lastvalues:
                                lastvalues[1:4] = values[1:4]
                        else:
                            values.extend(sci_name(Excel.cell_value(row[0])))
                        values.append(Excel.cell_value(row[1]))

                    sex_code = Excel.cell_value(row[2])
                    values.append(sex_names.get(sex_code, sex_code))

                    # Enter remaining values from cells
                    for i in range(3, cols):
                        values.append(Excel.cell_value(row[i]))

                    # If there isn't a common name, get it from the
                    # previous row
                    if not values[4]:
                        values[4] = lastvalues[4]
                    # A "Female" row directly after a "Male" row inherits
                    # the male row's location when its own is empty
                    if (not values[12] and lastvalues
                            and lastvalues[5] == "Male"
                            and values[5] == "Female"):
                        values[12] = lastvalues[12]

                    # Queue the previous row for insertion into the database
                    if lastvalues:
                        lines.append('~'.join(lastvalues))

                    lastrow = row
                    lastvalues = values

            # Queue the final pending record. Testing lastvalues rather than
            # lines keeps a workbook with a single data row from losing it.
            if lastvalues:
                lines.append('~'.join(lastvalues))
            if lines:
                engine.add_to_table(lines)

        return engine