Example #1
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        engine = self.engine

        # Files are nested inside a second baad_data folder.
        # Only the key files (baad_data.csv, baad_methods.csv) are used here;
        # other relevant files can be added in the same manner.

        file_names = ["baad_data/baad_data.csv", "baad_data/baad_methods.csv"]
        engine.download_files_from_archive(self.urls["BAAD"], file_names)

        # create the data table from baad_data.csv
        engine.auto_create_table(Table("data",
                                       cleanup=Cleanup(correct_invalid_value,
                                                       nulls=['NA'])),
                                 filename="baad_data.csv")
        engine.insert_data_from_file(engine.format_filename("baad_data.csv"))

        # create the methods table from baad_methods.csv
        engine.auto_create_table(Table("methods",
                                       cleanup=Cleanup(correct_invalid_value,
                                                       nulls=['NA'])),
                                 filename="baad_methods.csv")
        engine.insert_data_from_file(
            engine.format_filename("baad_methods.csv"))
Example #2
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)

        # structure_plot_year table
        self.engine.auto_create_table(Table("structure_plot_year"), url=self.urls["structure_plot_year"])
        self.engine.insert_data_from_url(self.urls["structure_plot_year"])

        # plots table
        self.engine.auto_create_table(Table("plots"), url=self.urls["plots"])
        self.engine.insert_data_from_url(self.urls["plots"])

        # species table
        self.engine.download_file(self.urls["species"], "original_MSH_SPECIES_DESCRIPTORS.csv")
        data_path = self.engine.format_filename("MSH_SPECIES_DESCRIPTORS.csv")

        old_data = os.path.normpath(self.engine.find_file("original_MSH_SPECIES_DESCRIPTORS.csv"))

        # Copy the file, re-encoding the contents as UTF-8
        with open(old_data, 'r', encoding='utf-8', errors='replace') as infile, \
                open(data_path, 'w', encoding='utf-8') as new_data:
            for line in infile:
                new_data.write(line)

        self.engine.auto_create_table(Table("species"),
                                      filename="MSH_SPECIES_DESCRIPTORS.csv")
        self.engine.insert_data_from_file(data_path)

        # species_plot_year table
        table = Table("species_plot_year")
        table.delimiter = ','
        table.columns = [
            ('record_id', ('pk-auto',)),
            ('plot_id_year', ('char',)),
            ('plot_name', ('char',)),
            ('plot_number', ('int',)),
            ('year', ('int',)),
            ('species', ('ct-column',)),
            ('count', ('ct-double',))
        ]

        table.ct_column = 'species'
        table.ct_names = ['Abilas', 'Abipro', 'Achmil', 'Achocc', 'Agoaur', 'Agrexa', 'Agrpal', 'Agrsca', 'Alnvir',
                          'Anamar', 'Antmic', 'Antros', 'Aqifor', 'Arcnev', 'Arnlat', 'Astled', 'Athdis', 'Blespi',
                          'Brocar', 'Brosit', 'Carmer', 'Carmic', 'Carpac', 'Carpay', 'Carpha', 'Carros', 'Carspe',
                          'Casmin', 'Chaang', 'Cirarv', 'Cisumb', 'Crycas', 'Danint', 'Descae', 'Elyely', 'Epiana',
                          'Eriova', 'Eripyr', 'Fesocc', 'Fravir', 'Gencal', 'Hiealb', 'Hiegra', 'Hyprad', 'Junmer',
                          'Junpar', 'Juncom', 'Leppun', 'Lommar', 'Luepec', 'Luihyp', 'Luplat', 'Luplep', 'Luzpar',
                          'Maiste', 'Pencar', 'Pencon', 'Penser', 'Phahas', 'Phlalp', 'Phldif', 'Phyemp', 'Pincon',
                          'Poasec', 'Poldav', 'Polmin', 'Pollon', 'Poljun', 'Popbal', 'Potarg', 'Psemen', 'Raccan',
                          'Rumace', 'Salsit', 'Saxfer', 'Senspp', 'Sibpro', 'Sorsit', 'Spiden', 'Trispi', 'Tsumer',
                          'Vacmem', 'Vervir', 'Vioadu', 'Xerten']

        self.engine.table = table
        self.engine.create_table()
        self.engine.insert_data_from_url(self.urls["species_plot_year"])
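
The ct_column/ct_names settings above tell the engine to melt the wide per-species columns of species_plot_year into long (species, count) rows. A rough sketch of that reshaping with hypothetical two-species data (the engine performs this internally; this is not its code):

import csv
import io

wide_csv = "plot_name,year,Abilas,Abipro\nP1,2009,3,0\n"  # stand-in data
ct_names = ["Abilas", "Abipro"]
long_rows = []
for rec in csv.DictReader(io.StringIO(wide_csv)):
    for species in ct_names:
        long_rows.append((rec["plot_name"], rec["year"], species, rec[species]))
print(long_rows)
# [('P1', '2009', 'Abilas', '3'), ('P1', '2009', 'Abipro', '0')]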
Example #3
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        engine = self.engine

        filename = 'Aquatic_animal_excretion_data.csv'
        tablename = 'aquatic_animals'

        table = Table(str(tablename), delimiter=',')
        table.columns = [
            ("index", ("pk-int",)),
            ("sourcenumber", ("int",)),
            ("sourcename", ("char",)),
            ("speciesname", ("char",)),
            ("speciescode", ("char",)),
            ("invert/vert", ("char",)),
            ("phylum", ("char",)),
            ("class", ("char",)),
            ("order", ("char",)),
            ("family", ("char",)),
            ("trophicgild", ("char",)),
            ("drymass", ("double",)),
            ("logdrymass", ("double",)),
            ("ecosystemtype", ("char",)),
            ("energysource", ("char",)),
            ("habitat", ("char",)),
            ("residentecosystem", ("char",)),
            ("temperature", ("double",)),
            ("nexcretionrate", ("double",)),
            ("pexcretionrate", ("double",)),
            ("lognexcretionrate", ("double",)),
            ("logpexcretionrate", ("double",)),
            ("incubationtime", ("double",)),
            ("nform", ("char",)),
            ("pform", ("char",)),
            ("bodyc", ("double",)),
            ("bodyn", ("double",)),
            ("bodyp", ("double",)),
            ("bodyc:n", ("double",)),
            ("bodyc:p", ("double",)),
            ("bodyn:p", ("double",)),
            ("bodydatasource", ("char",)),
            ("datasource", ("char",)),
            ("dataproviders", ("char",))]

        engine.table = table
        if not os.path.isfile(engine.format_filename(filename)):
            engine.download_files_from_archive(self.urls[tablename], [filename], filetype="zip")

        engine.create_table()
        engine.insert_data_from_file(engine.format_filename(str(filename)))
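
The os.path.isfile guard above avoids re-downloading the archive when the extracted file is already cached. Factored into a helper, assuming the engine object exposes the same format_filename and download_files_from_archive methods used above:

import os

def ensure_archive_file(engine, url, filename):
    """Download and extract the archive only if filename is not yet cached."""
    if not os.path.isfile(engine.format_filename(filename)):
        engine.download_files_from_archive(url, [filename], filetype="zip")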
Example #4
 def download(self, engine=None, debug=False):
     data_file_name = "eBird_Observation_Dataset_2013.csv"
     Script.download(self, engine, debug)
     engine = self.engine
     self.engine.download_files_from_archive(self.urls["main"],
                                             [data_file_name],
                                             filetype='gz')
     table = Table("main", delimiter=",")
     table.columns = [("BASISOFRECORD", ("char", )),
                      ("INSTITUTIONCODE", ("char", )),
                      ("COLLECTIONCODE", ("char", )),
                      ("CATALOGNUMBER", ("char", )),
                      ("OCCURRENCEID", ("char", )),
                      ("RECORDEDBY", ("char", )), ("YEAR", ("int", )),
                      ("MONTH", ("int", )), ("DAY", ("int", )),
                      ("COUNTRY", ("char", )),
                      ("STATEPROVINCE", ("char", )), ("COUNTY", ("char", )),
                      ("DECIMALLATITUDE", ("double", )),
                      ("DECIMALLONGITUDE", ("double", )),
                      ("LOCALITY", ("char", )), ("KINGDOM", ("char", )),
                      ("PHYLUM", ("char", )), ("CLASS", ("char", )),
                      ("SPORDER", ("char", )), ("FAMILY", ("char", )),
                      ("GENUS", ("char", )),
                      ("SPECIFICEPITHET", ("char", )),
                      ("SCIENTIFICNAME", ("char", )),
                      ("VERNACULARNAME", ("char", )),
                      ("INDIVIDUALCOUNT", ("int", ))]
     engine.table = table
     engine.create_table()
     engine.insert_data_from_file(engine.format_filename(data_file_name))
     return engine
Example #5
    def __init__(self, **kwargs):
        Script.__init__(self, **kwargs)
        self.title = "Gulf of Maine intertidal density/cover (Petraitis et al. 2008)"
        self.citation = "Peter S. Petraitis, Harrison Liu, and " \
                        "Erika C. Rhile. 2008. Densities and cover " \
                        "data for intertidal organisms in the Gulf of " \
                        "Maine, USA, from 2003 to 2007. Ecology 89:588."
        self.name = "intertidal-abund-me"
        self.ref = "https://figshare.com/collections/DENSITIES_AND_COVER_DATA_FOR_INTERTIDAL_ORGANISMS_IN_THE_GULF_OF_MAINE_USA_FROM_2003_TO_2007/3300200"
        self.description = "The data on densities and percent cover in the " \
                           "60 experimental plots from 2003 to 2007 and to " \
                           "update data from 1996 to 2002 that are already " \
                           "published in Ecological Archives." \
                           "Includes densities of mussels, " \
                           "herbivorous limpet, herbivorous snails, " \
                           "predatory snail, barnacle , fucoid algae and " \
                           "percent cover by mussels, barnacles, fucoids, " \
                           "and other sessile organisms."
        self.retriever_minimum_version = '2.0.dev'
        self.version = '1.5.3'
        self.urls = {"main": "https://ndownloader.figshare.com/files/5600831"}
        self.cleanup_func_table = Cleanup(correct_invalid_value,
                                          missing_values=[-999.9])

        if parse_version(VERSION) <= parse_version("2.0.0"):
            self.shortname = self.name
            self.name = self.title
            self.cleanup_func_table = Cleanup(correct_invalid_value,
                                              nulls=[-999.9])
        self.tables = {"main": Table("main", cleanup=self.cleanup_func_table)}
Example #6
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        for key in self.urls:
            self.engine.download_file(self.urls[key],
                                      self.urls[key].rpartition('/')[-1])
            new_file_path = self.engine.format_filename("new" + key)
            old_data = open_fr(
                self.engine.find_file(self.urls[key].rpartition('/')[-1]))
            new_data = open_fw(new_file_path)
            with old_data as file_block:
                # after the metadata lines, set data to True
                data = False
                for line in file_block:
                    # metadata lines contain no ";" and may be ";;;;" runs or empty lines
                    if not data and (";" not in line or ";;;;" in line):
                        continue
                    data = True
                    new_data.write(line)
            new_data.close()
            self.engine.auto_create_table(Table(
                key, cleanup=self.cleanup_func_table),
                                          filename=str("new" + key))
            self.engine.insert_data_from_file(new_file_path)
Example #7
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        engine = self.engine

        for key in self.urls:
            original_file_name = "trade_prdct_{}.txt".format(key)
            new_file_name = "trade_prdct_{}.csv".format(key)

            engine.download_file(self.urls[key], original_file_name)

            old_path = self.engine.format_filename(original_file_name)
            new_path = self.engine.format_filename(new_file_name)

            # Re-write the file with a single delimiter
            old_data = open_fr(old_path)
            new_data = open_fw(new_path)

            # Read header line and convert "," to "|"
            line1 = old_data.readline().strip().replace(",", "|")
            new_data.write(line1 + "\n")
            for line in old_data:
                # Remove leading "|" from the data
                new_data.write(line.strip("|"))
            new_data.close()
            old_data.close()
            table = Table(key, delimiter="|")
            engine.auto_create_table(table, filename=new_file_name)
            engine.insert_data_from_file(new_path)
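
The rewrite above boils down to one transformation: commas in the header become pipes, and stray leading pipes on data lines are dropped. The same step as a standalone helper (hypothetical name; assumes the header is the only comma-delimited line):

def normalize_delimiter(old_path, new_path):
    """Rewrite a mixed-delimiter file so "|" is the only delimiter."""
    with open(old_path) as src, open(new_path, "w") as dst:
        dst.write(src.readline().strip().replace(",", "|") + "\n")
        for line in src:
            dst.write(line.lstrip("|"))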
Example #8
 def __init__(self):
     Script.__init__(self,
                     tables={'trees': Table('trees', cleanup=Cleanup(correct_invalid_value, nulls=[-999]))},
                     name="Tree growth, mortality, physical condition - Clark, 2006",
                     tags=['Taxon > Plants'],
                     urls={'trees': 'http://esapubs.org/archive/ecol/E087/132/LS_trees_1983_2000.txt'},
                     shortname="Clark2006",
                     description="David B. Clark and Deborah A. Clark. 2006. Tree growth, mortality, physical condition, and microsite in an old-growth lowland tropical rain forest. Ecology 87:2132.")
Example #9
 def __init__(self, **kwargs):
     Script.__init__(self, **kwargs)
     self.name = "CRC Avian Body Masses"
     self.shortname = "AvianBodyMass"
     self.public = False
     self.ref = "http://www.crcpress.com/ecommerce_product/product_detail.jsf?isbn=1420064444"
     self.tables = {"mass": Table("mass", delimiter="~")}
     self.urls = {"mass": ""}
     self.tags = ["Taxon > Birds", "Data Type > Compilation"]
Example #10
 def __init__(self):
     Script.__init__(self,
                     tables={'trees': Table('trees', cleanup=Cleanup(correct_invalid_value, nulls=[-999]))},
                     name="Tree growth, mortality, physical condition - Clark, 2006",
                     tags=['Taxon > Plants'],
                     urls={'trees': 'http://esapubs.org/archive/ecol/E087/132/LS_trees_1983_2000.txt'},
                     shortname="Clark2006",
                     description = "The data set helps to examine the post-establishment ecology of 10 species of tropical wet forest trees selected to span a range of predicted life history patterns at the La Selva Biological Station in Costa Rica.",
                     ref = "http://esapubs.org/archive/ecol/E087/132/",
                     citation="David B. Clark and Deborah A. Clark. 2006. Tree growth, mortality, physical condition, and microsite in an old-growth lowland tropical rain forest. Ecology 87:2132.")
Example #11
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)

        engine = self.engine

        taxa = ('Plant', 'Animal')

        for tax in taxa:
            table = Table(tax.lower() + 's', delimiter=',', header_rows=3,
                          pk='record_id', contains_pk=True)

            columns =     [("record_id"             ,   ("pk-int",)     ),
                           ("station_id"            ,   ("int",)        ),
                           ("obs_date"              ,   ("char",)       ),
                           ("ind_id"                ,   ("int",)        ),
                           ("sci_name"              ,   ("char",)       ),
                           ("com_name"              ,   ("char",)       ),
                           ("kingdom"               ,   ("char",)       ),
                           ("pheno_cat"             ,   ("char",)       ),
                           ("pheno_name"            ,   ("char",)       ),
                           ("pheno_status"          ,   ("char",)       ),
                           ("lat"                   ,   ("double",)     ),
                           ("lon"                   ,   ("double",)     ),
                           ("elevation"             ,   ("int",)        ),
                           ("network_name"          ,   ("char",)       )]
            table.columns = columns

            engine.table = table
            engine.create_table()

            base_url = 'http://www.usanpn.org/getObs/observations/'
            years = range(2009, 2013)

            for year in years:
                if year == 2009 and tax == 'Animal':
                    continue

                url = base_url + 'get%s%sDataNoDefinitions' % (year, tax)

                filename = '%s_%s.csv' % (tax, year)
                engine.download_file(url, filename)

                engine.insert_data_from_file(engine.find_file(filename))

        return engine
Example #12
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        engine = self.engine

        filename = 'Aquatic_animal_excretion_data.csv'
        tablename = 'aquatic_animals'

        table = Table(str(tablename), delimiter=',')
        table.columns = [("index", ("pk-int", )), ("sourcenumber", ("int", )),
                         ("sourcename", ("char", )),
                         ("speciesname", ("char", )),
                         ("speciescode", ("char", )),
                         ("invert/vert", ("char", )), ("phylum", ("char", )),
                         ("class", ("char", )), ("order", ("char", )),
                         ("family", ("char", )), ("trophicgild", ("char", )),
                         ("drymass", ("double", )),
                         ("logdrymass", ("double", )),
                         ("ecosystemtype", ("char", )),
                         ("energysource", ("char", )), ("habitat", ("char", )),
                         ("residentecosystem", ("char", )),
                         ("temperature", ("double", )),
                         ("nexcretionrate", ("double", )),
                         ("pexcretionrate", ("double", )),
                         ("lognexcretionrate", ("double", )),
                         ("logpexcretionrate", ("double", )),
                         ("incubationtime", ("double", )),
                         ("nform", ("char", )), ("pform", ("char", )),
                         ("bodyc", ("double", )), ("bodyn", ("double", )),
                         ("bodyp", ("double", )), ("bodyc:n", ("double", )),
                         ("bodyc:p", ("double", )), ("bodyn:p", ("double", )),
                         ("bodydatasource", ("char", )),
                         ("datasource", ("char", )),
                         ("dataproviders", ("char", ))]

        engine.table = table
        if not os.path.isfile(engine.format_filename(filename)):
            engine.download_files_from_archive(self.urls[tablename],
                                               [filename],
                                               filetype="zip")

        engine.create_table()
        engine.insert_data_from_file(engine.format_filename(str(filename)))
Example #13
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        engine = self.engine
        engine.download_files_from_archive(self.urls["data"],
                                           ["UPSP_Demo_data.txt",
                                            "UPSP_Species_list2.txt"],
                                           archive_type="zip")
        # Create table sp_list(Species)
        filename = "UPSP_Species_list2.txt"
        engine.auto_create_table(
            Table('sp_list', cleanup=self.cleanup_func_table),
            filename=filename)
        engine.insert_data_from_file(engine.format_filename(filename))

        # Create table ind_loc_girth
        filename = "UPSP_Demo_data.txt"
        engine.auto_create_table(
            Table('ind_loc_girth', cleanup=self.cleanup_func_table),
            filename=filename)
        engine.insert_data_from_file(engine.format_filename(filename))
Example #14
 def __init__(self, **kwargs):
     Script.__init__(self, **kwargs)
     self.name = "CRC Avian Body Masses"
     self.shortname = "AvianBodyMass"
     self.public = False
     self.ref = "http://www.crcnetbase.com/isbn/9781420064452"
     self.citation = "Robert B. Payne, CRC Handbook of Avian Body Masses. Second Edition. The Wilson Journal of Ornithology Sep 2009 : Vol. 121, Issue 3, pg(s) 661-662 doi: 10.1676/1559-4491-121.3.661."
     self.description = "Body masses of birds of the world."
     self.tables = {"mass": Table("mass", delimiter="~")}
     self.urls = {"mass": ""}
     self.tags = ["Taxon > Birds", "Data Type > Compilation"]
Example #15
 def download(self, engine=None, debug=False):
     Script.download(self, engine, debug)
     engine = self.engine
     file_name = "PanTHERIA_1-0_WR05_Aug2008.txt"
     engine.download_files_from_archive(self.urls["data"], [file_name],
                                        "zip")
     # Create table Species
     engine.auto_create_table(Table('species',
                                    cleanup=self.cleanup_func_table),
                              filename=file_name)
     engine.insert_data_from_file(engine.format_filename(file_name))
Example #16
 def __init__(self):
     Script.__init__(self,
                     tables={'trees': Table('trees', cleanup=Cleanup(correct_invalid_value, nulls=[-999]))},
                     name="Tree growth, mortality, physical condition - Clark, 2006",
                     tags=['plants', 'time-series'],
                     urls={'trees': 'https://ndownloader.figshare.com/files/5597693'},
                     shortname="la-selva-trees",
                     description="The data set helps to examine the post-establishment ecology of 10 species of tropical wet forest trees selected to span a range of predicted life history patterns at the La Selva Biological Station in Costa Rica.",
                     ref="https://doi.org/10.6084/m9.figshare.c.3299324.v1",
                     retriever_minimum_version= "2.0.dev",
                     version='1.3.0',
                     citation="David B. Clark and Deborah A. Clark. 2006. Tree growth, mortality, physical condition, and microsite in an old-growth lowland tropical rain forest. Ecology 87:2132.")
Example #17
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        engine = self.engine

        # Files are nested inside a second baad_data folder.
        # Only the key files (baad_data.csv, baad_methods.csv) are used here;
        # other relevant files can be added in the same manner.

        file_names = ["baad_data/baad_data.csv", "baad_data/baad_methods.csv"]
        engine.download_files_from_archive(self.urls["BAAD"], file_names)

        # create the data table from baad_data.csv
        if parse_version(VERSION) >= parse_version("2.1.dev"):
            filename = "baad_data/baad_data.csv"
            engine.auto_create_table(Table("data",
                                           cleanup=self.cleanup_func_table),
                                     filename=filename)
            engine.insert_data_from_file(engine.format_filename(filename))
        else:
            filename = "baad_data.csv"
            engine.auto_create_table(Table("data",
                                           cleanup=self.cleanup_func_table),
                                     filename=filename)
            engine.insert_data_from_file(engine.format_filename(filename))

        # create the methods table from baad_methods.csv
        if parse_version(VERSION) >= parse_version("2.1.dev"):
            filename = "baad_data/baad_methods.csv"
            engine.auto_create_table(Table("methods",
                                           cleanup=self.cleanup_func_table),
                                     filename=filename)
            engine.insert_data_from_file(engine.format_filename(filename))
        else:
            filename = "baad_methods.csv"
            engine.auto_create_table(Table("methods",
                                           cleanup=self.cleanup_func_table),
                                     filename=filename)
            engine.insert_data_from_file(engine.format_filename(filename))
Example #18
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        engine = self.engine
        filenames = [
            'Aquatic_animal_excretion_data.csv',
            'Aquatic_animal_excretion_variable_descriptions.csv'
        ]
        for file_path in filenames:
            if not os.path.isfile(engine.format_filename(file_path)):
                url = self.urls["aquatic_animals"]
                engine.download_files_from_archive(url, filenames, "zip")

        # processing Aquatic_animal_excretion_data.csv
        filename = 'Aquatic_animal_excretion_data.csv'
        tablename = 'aquatic_animals'
        table = Table(str(tablename), delimiter=',')
        table.columns = [("index", ("pk-int", )), ("sourcenumber", ("int", )),
                         ("sourcename", ("char", )),
                         ("speciesname", ("char", )),
                         ("speciescode", ("char", )),
                         ("invert/vert", ("char", )), ("phylum", ("char", )),
                         ("class", ("char", )), ("order", ("char", )),
                         ("family", ("char", )), ("trophicgild", ("char", )),
                         ("drymass", ("double", )),
                         ("logdrymass", ("double", )),
                         ("ecosystemtype", ("char", )),
                         ("energysource", ("char", )), ("habitat", ("char", )),
                         ("residentecosystem", ("char", )),
                         ("temperature", ("double", )),
                         ("nexcretionrate", ("double", )),
                         ("pexcretionrate", ("double", )),
                         ("lognexcretionrate", ("double", )),
                         ("logpexcretionrate", ("double", )),
                         ("incubationtime", ("double", )),
                         ("nform", ("char", )), ("pform", ("char", )),
                         ("bodyc", ("double", )), ("bodyn", ("double", )),
                         ("bodyp", ("double", )), ("bodyc:n", ("double", )),
                         ("bodyc:p", ("double", )), ("bodyn:p", ("double", )),
                         ("bodydatasource", ("char", )),
                         ("datasource", ("char", )),
                         ("dataproviders", ("char", ))]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_file(engine.format_filename(str(filename)))

        # processing Aquatic_animal_excretion_variable_descriptions.csv
        filename = 'Aquatic_animal_excretion_variable_descriptions.csv'
        tablename = 'variable_descriptions'
        table = Table(str(tablename), delimiter=',')
        table.columns = [("Column", ("char", )), ("Variable", ("char", )),
                         ("Description", ("char", )),
                         ("Data Class", ("char", )), ("Units", ("char", )),
                         ("Minimum_value", ("char", )),
                         ("Maximum_value", ("char", )),
                         ("Possible_values", ("char", )),
                         ("Missing_data_symbol", ("char", )),
                         ("Notes", ("char", ))]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_file(engine.format_filename(str(filename)))
Example #19
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        engine = self.engine
        filename = "Predator_and_prey_body_sizes_in_marine_food_webs_vsn4.txt"
        engine.download_files_from_archive(self.urls["data"], [filename],
                                           filetype="zip")

        # Create table Species

        engine.auto_create_table(Table('main',
                                       cleanup=self.cleanup_func_table),
                                 filename=filename)
        engine.insert_data_from_file(engine.format_filename(filename))
Example #20
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        engine = self.engine
        file_names = [('isotopes.csv', 'isotopes'),
                      ('sources.csv', 'sources'),
                      ('diet.csv', 'diet')]

        engine.download_files_from_archive(self.urls["zip"],
                                           [i[0] for i in file_names],
                                           filetype="zip",
                                           archivename="ECOL_92_97")

        for (filename, tablename) in file_names:
            data_path = self.engine.format_filename(filename)
            self.engine.auto_create_table(
                Table(str(tablename), cleanup=self.cleanup_func_table),
                filename=filename)
            self.engine.insert_data_from_file(data_path)
Example #21
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        engine = self.engine
        engine.download_files_from_archive(self.urls["data"],
                                           ["PanTHERIA_1-0_WR05_Aug2008.txt"],
                                           filetype="zip")

        # Create table Species
        engine.auto_create_table(Table('species',
                                       cleanup=Cleanup(correct_invalid_value,
                                                       nulls=['NA'])),
                                 filename="PanTHERIA_1-0_WR05_Aug2008.txt")
        engine.insert_data_from_file(
            engine.format_filename("PanTHERIA_1-0_WR05_Aug2008.txt"))
Example #22
 def __init__(self, **kwargs):
     Script.__init__(self, **kwargs)
     self.title = "Mammal Super Tree"
     self.name = 'mammal-super-tree'
     self.ref = 'http://doi.org/10.1111/j.1461-0248.2009.01307.x'
     self.citation = "Fritz, S. A., Bininda-Emonds, O. R. P. and Purvis, A. (2009), Geographical variation in predictors of mammalian extinction risk: big is bad, but only in the tropics. Ecology Letters, 12: 538-549. doi:10.1111/j.1461-0248.2009.01307.x"
     self.description = "Mammal Super Tree from Fritz, S.A., O.R.P Bininda-Emonds, and A. Purvis. 2009. Geographical variation in predictors of mammalian extinction risk: big is bad, but only in the tropics. Ecology Letters 12:538-549"
     self.retriever_minimum_version = '2.0.dev'
     self.version = "2.0.0"
     self.urls = {
         'mammal_super_tree_fritz2009.tre':
         'http://onlinelibrary.wiley.com/store/10.1111/j.1461-0248.2009.01307.x/asset/supinfo/ELE_1307_sm_SA1.tre?v=1&s=366b28651a9b5d1a3148ef9a8620f8aa31a7df44'
     }
     self.download_only = True
     self.tables = {'trees': Table("trees")}
Example #23
 def __init__(self):
     Script.__init__(self,
                     name="Gulf of Maine intertidal density/cover (Ecological Archives 2008)",
                     description="Peter S. Petraitis, Harrison Liu, and Erika C. Rhile. 2008. Densities and cover data for intertidal organisms in the Gulf of Maine, USA, from 2003 to 2007. Ecology 89:588.",
                     shortname="Petraitis2008",
                     ref="http://www.esapubs.org/archive/ecol/E089/032/",
                     urls = {
                             "main": "http://www.esapubs.org/archive/ecol/E089/032/Succession_sampling_03-07_data.txt",
                            },
                     tables = {
                               "main": Table("main",
                                             cleanup=Cleanup(correct_invalid_value,
                                                             nulls=[-999.9])),
                               }
                     )
Example #24
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        engine = self.engine
        file_names = [('Flensburg_Data_Links.csv', 'links'),
                      ('Flensburg_Data_Nodes.csv', 'nodes')]

        engine.download_files_from_archive(self.urls["zip"],
                                           [i[0] for i in file_names], "zip",
                                           False, "ECOL_92_174")

        for (filename, tablename) in file_names:
            data_path = self.engine.format_filename(filename)
            self.engine.auto_create_table(Table(
                str(tablename), cleanup=self.cleanup_func_table),
                                          filename=filename)
            self.engine.insert_data_from_file(data_path)
Example #25
 def __init__(self):
     Script.__init__(self,
                     name = "Gulf of Maine intertidal density/cover (Ecological Archives 2008)",
                     citation = "Peter S. Petraitis, Harrison Liu, and Erika C. Rhile. 2008. Densities and cover data for intertidal organisms in the Gulf of Maine, USA, from 2003 to 2007. Ecology 89:588.",
                     shortname = "Petraitis2008",
                     ref = "http://www.esapubs.org/archive/ecol/E089/032/",
                     description = "The data set provides access to data on densities and percent cover in the 60 experimental plots from 2003 to 2007 and to update data from 1996 to 2002 that are already published in Ecological Archives.It includes densities of mussels, an herbivorous limpet, herbivorous snails, a predatory snail, a barnacle , and fucoid algae and percent cover by mussels, barnacles, fucoids, and other sessile organisms.",
                     urls = {
                             "main": "http://www.esapubs.org/archive/ecol/E089/032/Succession_sampling_03-07_data.txt",
                            },
                     tables = {
                               "main": Table("main",
                                             cleanup=Cleanup(correct_invalid_value,
                                                             nulls=[-999.9])),
                               }
                     )
Example #26
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        engine = self.engine
        files = ["Macroplot_data_Rev.txt", "Microplot_data.txt", "Site_variables.txt", "Species_list.txt"]
        engine.download_files_from_archive(self.urls["data"], files, filetype="zip")

        # Create table species
        engine.auto_create_table(Table('species', cleanup=self.cleanup_func_table),
                                 filename="Species_list.txt")
        engine.insert_data_from_file(engine.format_filename("Species_list.txt"))

        # Create table sites
        engine.auto_create_table(Table('sites', cleanup=self.cleanup_func_table),
                                 filename="Site_variables.txt")
        engine.insert_data_from_file(engine.format_filename("Site_variables.txt"))

        # Create table microplots
        table = Table('microplots')
        table.columns = [('record_id', ('pk-auto',)), ('SpCode', ('char', '30')), ('Count', ('ct-int',))]
        table.ct_names = ['BSP1', 'BSP2', 'BSP3', 'BSP4', 'BSP5', 'BSP6', 'BSP7', 'BSP8', 'BSP9',
                          'BSP10', 'BSP11', 'BSP12', 'BSP13', 'BSP14', 'BSP15', 'BSP16', 'BSP17',
                          'BSP18', 'BSP20', 'BSP21', 'BSP22', 'BSP23', 'BSP24', 'BSP25', 'BSP26',
                          'BSP27', 'BSP28', 'BSP29', 'BSP30', 'BSP31', 'BSP33', 'BSP34', 'BSP35',
                          'BSP36', 'BSP37', 'BSP41', 'BSP42', 'BSP43', 'BSP44', 'BSP45', 'BSP46',
                          'BSP47', 'BSP48', 'BSP49', 'BSP50', 'BSP51', 'BSP52', 'BSP53', 'BSP54',
                          'BSP55', 'BSP56', 'BSP57', 'BSP58', 'BSP59', 'BSP60', 'BSP61', 'BSP62',
                          'BSP63', 'BSP64', 'BSP65', 'BSP66', 'BSP67', 'BSP68', 'BSP69', 'BSP70',
                          'BSP71', 'BSP72', 'BSP73', 'BSP74', 'BSP75', 'BSP76', 'BSP78', 'BSP79',
                          'BSP80', 'BSP82', 'BSP83', 'BSP84', 'BSP85', 'BSP86', 'BSP87', 'BSP88',
                          'BSP89', 'BSP90', 'BSP91', 'BSP92', 'BSP93', 'BSP94', 'BSP95', 'BSP96',
                          'BSP97', 'BSP98', 'BSP99', 'BSP100', 'BSP101', 'BSP102', 'BSP104']
        table.ct_column = 'PlotID'
        engine.auto_create_table(table, filename="Microplot_data.txt")
        engine.insert_data_from_file(engine.format_filename("Microplot_data.txt"))

        # Create table macroplots
        table = Table('macroplots')
        table.ct_names = ['TreeGirth1', 'TreeGirth2', 'TreeGirth3', 'TreeGirth4', 'TreeGirth5']
        table.ct_column = 'Tree'
        table.columns = [('record_id', ('pk-auto',)), ('PlotID', ('char', '20')), ('SpCode', ('char', '30')),
                         ('Girth', ('ct-int',))]
        engine.auto_create_table(table, filename="Macroplot_data_Rev.txt")
        engine.insert_data_from_file(engine.format_filename("Macroplot_data_Rev.txt"))
Example #27
 def __init__(self):
     Script.__init__(
         self,
         name=
         "Gulf of Maine intertidal density/cover (Petraitis et al. 2008)",
         citation=
         "Peter S. Petraitis, Harrison Liu, and Erika C. Rhile. 2008. Densities and cover data for intertidal organisms in the Gulf of Maine, USA, from 2003 to 2007. Ecology 89:588.",
         shortname="intertidal-abund-me",
         ref=
         "https://figshare.com/collections/DENSITIES_AND_COVER_DATA_FOR_INTERTIDAL_ORGANISMS_IN_THE_GULF_OF_MAINE_USA_FROM_2003_TO_2007/3300200",
         description=
         "The data set provides access to data on densities and percent cover in the 60 experimental plots from 2003 to 2007 and to update data from 1996 to 2002 that are already published in Ecological Archives.It includes densities of mussels, an herbivorous limpet, herbivorous snails, a predatory snail, a barnacle , and fucoid algae and percent cover by mussels, barnacles, fucoids, and other sessile organisms.",
         retriever_minimum_version='2.0.dev',
         version='1.4.0',
         urls={"main": "https://ndownloader.figshare.com/files/5600831"},
         tables={
             "main":
             Table("main",
                   cleanup=Cleanup(correct_invalid_value, nulls=[-999.9]))
         })
Example #28
def create_resources(file, skip_lines):
    """Creates resources for the script or errors out if not possible"""
    engine = Engine()
    table = engine.auto_create_table(Table(str(file), header_rows=skip_lines),
                                     filename=file,
                                     make=False)
    clean_table = table.__dict__
    resource_dict = {}
    path_to_table = os.path.basename(clean_table["name"])
    resource_dict["name"] = os.path.splitext(path_to_table)[0]
    resource_dict["schema"] = {}
    resource_dict["dialect"] = {}
    resource_dict["schema"]["fields"] = []
    for cname, ctuple in clean_table["columns"]:
        resource_dict["schema"]["fields"].append({
            "name": cname,
            "type": ctuple[0]
        })
    resource_dict["url"] = "FILL"
    return resource_dict
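
For a hypothetical people.csv whose header is "name,age", the dict returned by create_resources would be shaped roughly like this (field types depend on the engine's inference; the url stays a placeholder):

expected = {
    "name": "people",
    "schema": {"fields": [{"name": "name", "type": "char"},
                          {"name": "age", "type": "int"}]},
    "dialect": {},
    "url": "FILL",
}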
Example #29
 def create_tabular_resources(self, file, skip_lines, encoding):
     """Create resources for tabular scripts"""
     engine = Engine()
     self.encoding = encoding
     engine.encoding = encoding
     table_val = Table(str(file), header_rows=skip_lines)
     table = engine.auto_create_table(table_val, filename=file, make=False)
     clean_table = table.__dict__
     resource_dict = {}
     path_to_table = os.path.basename(clean_table["name"])
     print("Processing... {file_name}".format(file_name=path_to_table))
     r_name = os.path.splitext(path_to_table)[0].lower()
     resource_dict["name"] = clean_table_name(r_name)
     resource_dict["path"] = path_to_table
     resource_dict["schema"] = {}
     resource_dict["dialect"] = {"delimiter": ","}
     resource_dict["schema"]["fields"] = []
     for cname, ctuple in clean_table["columns"]:
         if len(ctuple) >= 2:
             if ctuple[0] == "char":
                 # char sizes need quotes
                 char_size = "{a}".format(a=ctuple[1])
                 resource_dict["schema"]["fields"].append({
                     "name": cname,
                     "type": ctuple[0],
                     "size": char_size
                 })
             else:
                 resource_dict["schema"]["fields"].append({
                     "name": cname,
                     "type": ctuple[0],
                     "size": ctuple[1]
                 })
         else:
             resource_dict["schema"]["fields"].append({
                 "name": cname,
                 "type": ctuple[0]
             })
     resource_dict["url"] = "fill"
     return resource_dict
Example #30
def create_resources(file, skip_lines):
    """Creates resources for the script or errors out if not possible"""
    engine = Engine()
    table = engine.auto_create_table(Table(str(file), header_rows=skip_lines),
                                     filename=file,
                                     make=False)
    clean_table = table.__dict__
    resource_dict = {}
    path_to_table = os.path.basename(clean_table["name"])
    print("Processing... {file_name}".format(file_name=path_to_table))
    resource_dict["name"] = os.path.splitext(path_to_table)[0].lower()
    resource_dict["path"] = path_to_table
    resource_dict["schema"] = {}
    resource_dict["dialect"] = {"delimiter": ","}
    resource_dict["schema"]["fields"] = []
    for cname, ctuple in clean_table["columns"]:
        if len(ctuple) >= 2:
            if ctuple[0] == 'char':
                # char sizes need quotes
                char_size = "{a}".format(a=ctuple[1])
                resource_dict["schema"]["fields"].append({
                    "name": cname,
                    "type": ctuple[0],
                    "size": char_size
                })
            else:
                resource_dict["schema"]["fields"].append({
                    "name": cname,
                    "type": ctuple[0],
                    "size": ctuple[1]
                })
        else:
            resource_dict["schema"]["fields"].append({
                "name": cname,
                "type": ctuple[0]
            })
    resource_dict["url"] = "FILL"
    return resource_dict
Example #31
    def __init__(self, **kwargs):
        Script.__init__(self, **kwargs)
        self.cleanup_func_table = Cleanup(correct_invalid_value,
                                          missing_values=[-999])
        self.title = "Tree growth, mortality, physical condition - Clark, 2006"
        self.keywords = ['plants', 'time-series']
        self.urls = {'trees': 'https://ndownloader.figshare.com/files/5597693'}
        self.name = "la-selva-trees"
        self.description = "The data set helps to examine the post-establishment ecology of 10 species of tropical wet forest trees selected to span a range of predicted life history patterns at the La Selva Biological Station in Costa Rica."
        self.ref = "https://doi.org/10.6084/m9.figshare.c.3299324.v1"
        self.retriever_minimum_version = "2.0.dev"
        self.version = '1.4.1'
        self.citation = "David B. Clark and Deborah A. Clark. 2006. Tree growth, mortality, physical condition, and microsite in an old-growth lowland tropical rain forest. Ecology 87:2132."

        if parse_version(VERSION) <= parse_version("2.0.0"):
            self.shortname = self.name
            self.name = self.title
            self.tags = self.keywords
            self.cleanup_func_table = Cleanup(correct_invalid_value,
                                              nulls=[-999])
        self.tables = {
            'trees': Table('trees', cleanup=self.cleanup_func_table)
        }
Example #32
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        engine = self.engine

        filename = 'vertnet_latest_mammals.csv'
        tablename = 'mammals'

        table = Table(str(tablename), delimiter=',')
        table.columns = [
            ("record_id", ("pk-auto",)),
            ("beginrecord", ("char",)),
            ("icode", ("char",)),
            ("title", ("char",)),
            ("citation", ("char",)),
            ("contact", ("char",)),
            ("email", ("char",)),
            ("emlrights", ("char",)),
            ("gbifdatasetid", ("char",)),
            ("gbifpublisherid", ("char",)),
            ("doi", ("char",)),
            ("migrator", ("char",)),
            ("networks", ("char",)),
            ("orgcountry", ("char",)),
            ("orgname", ("char",)),
            ("orgstateprovince", ("char",)),
            ("pubdate", ("char",)),
            ("source_url", ("char",)),
            ("iptrecordid", ("char",)),
            ("associatedmedia", ("char",)),
            ("associatedoccurrences", ("char",)),
            ("associatedorganisms", ("char",)),
            ("associatedreferences", ("char",)),
            ("associatedsequences", ("char",)),
            ("associatedtaxa", ("char",)),
            ("bed", ("char",)),
            ("behavior", ("char",)),
            ("catalognumber", ("char",)),
            ("continent", ("char",)),
            ("coordinateprecision", ("char",)),
            ("coordinateuncertaintyinmeters", ("char",)),
            ("country", ("char",)),
            ("countrycode", ("char",)),
            ("county", ("char",)),
            ("dateidentified", ("char",)),
            ("day", ("char",)),
            ("decimallatitude", ("char",)),
            ("decimallongitude", ("char",)),
            ("disposition", ("char",)),
            ("earliestageorloweststage", ("char",)),
            ("earliesteonorlowesteonothem", ("char",)),
            ("earliestepochorlowestseries", ("char",)),
            ("earliesteraorlowesterathem", ("char",)),
            ("earliestperiodorlowestsystem", ("char",)),
            ("enddayofyear", ("char",)),
            ("establishmentmeans", ("char",)),
            ("eventdate", ("char",)),
            ("eventid", ("char",)),
            ("eventremarks", ("char",)),
            ("eventtime", ("char",)),
            ("fieldnotes", ("char",)),
            ("fieldnumber", ("char",)),
            ("footprintspatialfit", ("char",)),
            ("footprintsrs", ("char",)),
            ("footprintwkt", ("char",)),
            ("formation", ("char",)),
            ("geodeticdatum", ("char",)),
            ("geologicalcontextid", ("char",)),
            ("georeferencedby", ("char",)),
            ("georeferenceddate", ("char",)),
            ("georeferenceprotocol", ("char",)),
            ("georeferenceremarks", ("char",)),
            ("georeferencesources", ("char",)),
            ("georeferenceverificationstatus", ("char",)),
            ("group", ("char",)),
            ("habitat", ("char",)),
            ("highergeography", ("char",)),
            ("highergeographyid", ("char",)),
            ("highestbiostratigraphiczone", ("char",)),
            ("identificationid", ("char",)),
            ("identificationqualifier", ("char",)),
            ("identificationreferences", ("char",)),
            ("identificationremarks", ("char",)),
            ("identificationverificationstatus", ("char",)),
            ("identifiedby", ("char",)),
            ("individualcount", ("char",)),
            ("island", ("char",)),
            ("islandgroup", ("char",)),
            ("latestageorhigheststage", ("char",)),
            ("latesteonorhighesteonothem", ("char",)),
            ("latestepochorhighestseries", ("char",)),
            ("latesteraorhighesterathem", ("char",)),
            ("latestperiodorhighestsystem", ("char",)),
            ("lifestage", ("char",)),
            ("lithostratigraphicterms", ("char",)),
            ("locality", ("char",)),
            ("locationaccordingto", ("char",)),
            ("locationid", ("char",)),
            ("locationremarks", ("char",)),
            ("lowestbiostratigraphiczone", ("char",)),
            ("materialsampleid", ("char",)),
            ("maximumdepthinmeters", ("char",)),
            ("maximumdistanceabovesurfaceinmeters", ("char",)),
            ("maximumelevationinmeters", ("char",)),
            ("member", ("char",)),
            ("minimumdepthinmeters", ("char",)),
            ("minimumdistanceabovesurfaceinmeters", ("char",)),
            ("minimumelevationinmeters", ("char",)),
            ("month", ("char",)),
            ("municipality", ("char",)),
            ("occurrenceid", ("char",)),
            ("occurrenceremarks", ("char",)),
            ("occurrencestatus", ("char",)),
            ("organismid", ("char",)),
            ("organismname", ("char",)),
            ("organismremarks", ("char",)),
            ("organismscope", ("char",)),
            ("othercatalognumbers", ("char",)),
            ("pointradiusspatialfit", ("char",)),
            ("preparations", ("char",)),
            ("previousidentifications", ("char",)),
            ("recordedby", ("char",)),
            ("recordnumber", ("char",)),
            ("reproductivecondition", ("char",)),
            ("samplingeffort", ("char",)),
            ("samplingprotocol", ("char",)),
            ("sex", ("char",)),
            ("startdayofyear", ("char",)),
            ("stateprovince", ("char",)),
            ("typestatus", ("char",)),
            ("verbatimcoordinates", ("char",)),
            ("verbatimcoordinatesystem", ("char",)),
            ("verbatimdepth", ("char",)),
            ("verbatimelevation", ("char",)),
            ("verbatimeventdate", ("char",)),
            ("verbatimlatitude", ("char",)),
            ("verbatimlocality", ("char",)),
            ("verbatimlongitude", ("char",)),
            ("verbatimsrs", ("char",)),
            ("waterbody", ("char",)),
            ("year", ("char",)),
            ("dctype", ("char",)),
            ("modified", ("char",)),
            ("language", ("char",)),
            ("license", ("char",)),
            ("rightsholder", ("char",)),
            ("accessrights", ("char",)),
            ("bibliographiccitation", ("char",)),
            ("dc_references", ("char",)),
            ("institutionid", ("char",)),
            ("collectionid", ("char",)),
            ("datasetid", ("char",)),
            ("institutioncode", ("char",)),
            ("collectioncode", ("char",)),
            ("datasetname", ("char",)),
            ("ownerinstitutioncode", ("char",)),
            ("basisofrecord", ("char",)),
            ("informationwithheld", ("char",)),
            ("datageneralizations", ("char",)),
            ("dynamicproperties", ("char",)),
            ("scientificnameid", ("char",)),
            ("namepublishedinid", ("char",)),
            ("scientificname", ("char",)),
            ("acceptednameusage", ("char",)),
            ("originalnameusage", ("char",)),
            ("namepublishedin", ("char",)),
            ("namepublishedinyear", ("char",)),
            ("higherclassification", ("char",)),
            ("kingdom", ("char",)),
            ("phylum", ("char",)),
            ("class", ("char",)),
            ("order", ("char",)),
            ("family", ("char",)),
            ("genus", ("char",)),
            ("subgenus", ("char",)),
            ("specificepithet", ("char",)),
            ("infraspecificepithet", ("char",)),
            ("taxonrank", ("char",)),
            ("verbatimtaxonrank", ("char",)),
            ("scientificnameauthorship", ("char",)),
            ("vernacularname", ("char",)),
            ("nomenclaturalcode", ("char",)),
            ("taxonomicstatus", ("char",)),
            ("keyname", ("char",)),
            ("haslicense", ("int",)),
            ("vntype", ("char",)),
            ("rank", ("int",)),
            ("mappable", ("int",)),
            ("hashid", ("char",)),
            ("hastypestatus", ("int",)),
            ("wascaptive", ("int",)),
            ("wasinvasive", ("int",)),
            ("hastissue", ("int",)),
            ("hasmedia", ("int",)),
            ("isfossil", ("int",)),
            ("haslength", ("int",)),
            ("haslifestage", ("int",)),
            ("hasmass", ("int",)),
            ("hassex", ("int",)),
            ("lengthinmm", ("double",)),
            ("massing", ("double",)),
            ("lengthunitsinferred", ("char",)),
            ("massunitsinferred", ("char",)),
            ("underivedlifestage", ("char",)),
            ("underivedsex", ("char",))]

        engine.table = table
        if not os.path.isfile(engine.format_filename(filename)):
            engine.download_files_from_archive(self.urls[tablename],
                                               [filename],
                                               filetype="zip",
                                               archivename="vertnet_latest_" + str(tablename))
        engine.create_table()
        engine.insert_data_from_file(engine.format_filename(str(filename)))
Example #33
    def download(self, engine=None, debug=False):
        try:
            Script.download(self, engine, debug)
            
            engine = self.engine
            
            # Routes table            
            if not os.path.isfile(engine.format_filename("routes_new.csv")):
                engine.download_files_from_archive(self.urls["routes"],
                                                   ["routes.csv"])
                read = open(engine.format_filename("routes.csv"), "rb")
                write = open(engine.format_filename("routes_new.csv"), "wb")
                print "Cleaning routes data..."
                write.write(read.readline())
                for line in read:
                    values = line.split(',')
                    v = Decimal(values[5])
                    if  v > 0:
                        values[5] = str(v * Decimal("-1"))
                    write.write(','.join(str(value) for value in values))
                write.close()
                read.close()
                
            engine.auto_create_table(Table("routes", cleanup=Cleanup()), 
                                     filename="routes_new.csv")
                
            engine.insert_data_from_file(engine.format_filename("routes_new.csv"))

            
            # Weather table                
            if not os.path.isfile(engine.format_filename("weather_new.csv")):
                engine.download_files_from_archive(self.urls["weather"], 
                                                   ["weather.csv"])            
                read = open(engine.format_filename("weather.csv"), "rb")
                write = open(engine.format_filename("weather_new.csv"), "wb")
                print "Cleaning weather data..."
                for line in read:
                    values = line.split(',')
                    newvalues = []
                    for value in values:
                        
                        if ':' in value:
                            newvalues.append(value.replace(':', ''))
                        elif value == "N":
                            newvalues.append(None)
                        else:
                            newvalues.append(value)
                    write.write(','.join(str(value) for value in newvalues))
                write.close()
                read.close()
            
            engine.auto_create_table(Table("weather", pk="RouteDataId", cleanup=Cleanup()), 
                                     filename="weather_new.csv")
            engine.insert_data_from_file(engine.format_filename("weather_new.csv"))
            
            
            # Species table
            table = Table("species", pk=False, delimiter=',')
            
            table.columns=[("species_id"            ,   ("pk-auto",)        ),
                           ("AOU"                   ,   ("int",)            ),
                           ("genus"                 ,   ("char",30)         ),
                           ("species"               ,   ("char",50)         ),
                           ("subspecies"            ,   ("char",30)         ),
                           ("id_to_species"         ,   ("bool",)           )]
            
            engine.table = table
            engine.create_table()
            
            engine.download_file(self.urls["species"], "SpeciesList.txt")
            species_list = open(engine.format_filename("SpeciesList.txt"), "r")
            for n in range(8):
                species_list.readline()
            
            rows = []
            for line in species_list:
                if line and len(line) > 273:
                    latin_name = line[273:].split()
                    if len(latin_name) < 2:
                        # If there's no species given, add "None" value
                        latin_name.append("None")
                    subspecies = ' '.join(latin_name[2:]) if len(latin_name) > 2 else "None"                    
                    id_to_species = "1" if latin_name[1] != "None" else "0"
                    if latin_name[1] == "sp.":
                        latin_name[1] = "None"
                        id_to_species = "0"
                    if ("x" in latin_name or "/" in latin_name
                        or "/" in subspecies or "or" in latin_name):
                        # Hybrid species or only identified to a group of species
                        latin_name[1] = ' '.join(latin_name[1:])
                        subspecies = "None"
                        id_to_species = "0"
                    
                    rows.append(','.join([
                                          line.split()[1], 
                                          latin_name[0],
                                          latin_name[1],
                                          subspecies,
                                          id_to_species
                                          ]))
                    
            engine.add_to_table(rows)
            
            species_list.close()
            
            
            # Region_codes table
            table = Table("region_codes", pk=False, header_rows=11,
                          fixed_width=[11, 11, 30])
            def regioncodes_cleanup(value, engine):
                replace = {chr(225):"a", chr(233):"e", chr(237):"i", chr(243):"o"}
                newvalue = str(value)
                for key in replace.keys():
                    if key in newvalue:
                        newvalue = newvalue.replace(key, replace[key])
                return newvalue
            table.cleanup = Cleanup(regioncodes_cleanup)
            
            table.columns=[("countrynum"            ,   ("int",)        ),
                           ("regioncode"            ,   ("int",)        ),
                           ("regionname"            ,   ("char",30)     )]
            
            engine.table = table
            engine.create_table()
                                    
            engine.insert_data_from_url(self.urls["region_codes"])
                        
            # Counts table
            table = Table("counts", delimiter=',')
            table.columns=[("countrynum"            ,   ("int",)        ),
                           ("statenum"              ,   ("int",)        ),
                           ("Route"                 ,   ("int",)        ),
                           ("RPID"                  ,   ("int",)        ),
                           ("year"                  ,   ("int",)        ),
                           ("AOU"                   ,   ("int",)        ),
                           ("Stop1"                 ,   ("int",)        ),
                           ("Stop2"                 ,   ("int",)        ),
                           ("Stop3"                 ,   ("int",)        ),
                           ("Stop4"                 ,   ("int",)        ),
                           ("Stop5"                 ,   ("int",)        ),
                           ("Stop6"                 ,   ("int",)        ),
                           ("Stop7"                 ,   ("int",)        ),
                           ("Stop8"                 ,   ("int",)        ),
                           ("Stop9"                 ,   ("int",)        ),
                           ("Stop10"                ,   ("int",)        ),
                           ("Stop11"                ,   ("int",)        ),
                           ("Stop12"                ,   ("int",)        ),
                           ("Stop13"                ,   ("int",)        ),
                           ("Stop14"                ,   ("int",)        ),
                           ("Stop15"                ,   ("int",)        ),
                           ("Stop16"                ,   ("int",)        ),
                           ("Stop17"                ,   ("int",)        ),
                           ("Stop18"                ,   ("int",)        ),
                           ("Stop19"                ,   ("int",)        ),
                           ("Stop20"                ,   ("int",)        ),
                           ("Stop21"                ,   ("int",)        ),
                           ("Stop22"                ,   ("int",)        ),
                           ("Stop23"                ,   ("int",)        ),
                           ("Stop24"                ,   ("int",)        ),
                           ("Stop25"                ,   ("int",)        ),
                           ("Stop26"                ,   ("int",)        ),
                           ("Stop27"                ,   ("int",)        ),
                           ("Stop28"                ,   ("int",)        ),
                           ("Stop29"                ,   ("int",)        ),
                           ("Stop30"                ,   ("int",)        ),
                           ("Stop31"                ,   ("int",)        ),
                           ("Stop32"                ,   ("int",)        ),
                           ("Stop33"                ,   ("int",)        ),
                           ("Stop34"                ,   ("int",)        ),
                           ("Stop35"                ,   ("int",)        ),
                           ("Stop36"                ,   ("int",)        ),
                           ("Stop37"                ,   ("int",)        ),
                           ("Stop38"                ,   ("int",)        ),
                           ("Stop39"                ,   ("int",)        ),
                           ("Stop40"                ,   ("int",)        ),
                           ("Stop41"                ,   ("int",)        ),
                           ("Stop42"                ,   ("int",)        ),
                           ("Stop43"                ,   ("int",)        ),
                           ("Stop44"                ,   ("int",)        ),
                           ("Stop45"                ,   ("int",)        ),
                           ("Stop46"                ,   ("int",)        ),
                           ("Stop47"                ,   ("int",)        ),
                           ("Stop48"                ,   ("int",)        ),
                           ("Stop49"                ,   ("int",)        ),
                           ("Stop50"                ,   ("int",)        )]
            
            part = ""
            engine.table = table
            engine.create_table()

            for part in range(1,11):
                part = str(part)
                try:
                    print "Inserting data from part " + part + "..."
                    try:
                        engine.table.cleanup = Cleanup()
                        engine.insert_data_from_archive(self.urls["counts"] + 
                                                        "Fifty" + part + ".exe", 
                                                        ["fifty" + part + ".csv"])
                    except:               
                        print "Failed bulk insert on " + part + ", inserting manually."
                        engine.connection.rollback()
                        engine.table.cleanup = Cleanup(correct_invalid_value,
                                                       nulls=['*'])
                        engine.insert_data_from_archive(self.urls["counts"] + 
                                                        "Fifty" + part + ".exe", 
                                                        ["fifty" + part + ".csv"])
                            
                except:
                    print "There was an error in part " + part + "."
                    raise
            
            
        except zipfile.BadZipfile:            
            print "There was an unexpected error in the Breeding Bird Survey archives."
            raise    
        
        return engine
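
A note on the pattern above: each archive part is first inserted through the fast bulk path, and only on failure does the loop roll back and retry with per-value null cleaning. A minimal standalone sketch of that fallback logic, with hypothetical fast_insert/careful_insert callables standing in for the engine methods:

def insert_with_fallback(part, fast_insert, careful_insert, rollback):
    try:
        fast_insert(part)          # bulk path, no per-value cleanup
    except Exception:
        rollback()                 # discard the partial transaction
        careful_insert(part)       # slower path that handles bad nulls

def failing_bulk_insert(part):
    raise ValueError("bad value in part %d" % part)

inserted = []
insert_with_fallback(1, failing_bulk_insert, inserted.append, lambda: None)
print(inserted)                    # [1]
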
Exemplo n.º 34
0
    def download(self, engine=None, debug=False):
        try:
            Script.download(self, engine, debug)

            engine = self.engine

            # Species table
            table = Table("species", cleanup=Cleanup(), contains_pk=True,
                          header_rows=9)

            table.columns=[("species_id",               ("pk-int",)         ),
                           ("AOU",                      ("int",)            ),
                           ("english_common_name",      ("char",50)         ),
                           ("french_common_name",       ("char",50)         ),
                           ("spanish_common_name",      ("char",50)         ),
                           ("sporder",                  ("char",30)         ),
                           ("family",                   ("char",30)         ),
                           ("genus",                    ("char",30)         ),
                           ("species",                  ("char",50)         ),
                           ]
            table.fixed_width = [7,6,51,51,51,51,51,51,50]

            engine.table = table
            engine.create_table()
            engine.insert_data_from_url(self.urls["species"])

            # Routes table
            engine.download_files_from_archive(self.urls["routes"], ["routes.csv"])
            engine.auto_create_table(Table("routes", cleanup=Cleanup()),
                                     filename="routes.csv")
            engine.insert_data_from_file(engine.format_filename("routes.csv"))

            # Weather table
            if not os.path.isfile(engine.format_filename("weather_new.csv")):
                engine.download_files_from_archive(self.urls["weather"],
                                                   ["weather.csv"])
                read = open_fr(engine.format_filename("weather.csv"))
                write = open_fw(engine.format_filename("weather_new.csv"))
                print("Cleaning weather data...")
                for line in read:
                    values = line.split(',')
                    newvalues = []
                    for value in values:

                        if ':' in value:
                            newvalues.append(value.replace(':', ''))
                        elif value == "N":
                            newvalues.append(None)
                        else:
                            newvalues.append(value)
                    write.write(','.join(str(value) for value in newvalues))
                write.close()
                read.close()

            engine.auto_create_table(Table("weather", pk="RouteDataId",
                                           cleanup=Cleanup(correct_invalid_value, nulls=['NULL'])),
                                     filename="weather_new.csv")
            engine.insert_data_from_file(engine.format_filename("weather_new.csv"))

            # Region_codes table
            table = Table("region_codes", pk=False, header_rows=11,
                          fixed_width=[11, 11, 30])

            def regioncodes_cleanup(value, engine):
                replace = {chr(225):"a", chr(233):"e", chr(237):"i", chr(243):"o"}
                newvalue = str(value)
                for key in list(replace.keys()):
                    if key in newvalue:
                        newvalue = newvalue.replace(key, replace[key])
                return newvalue
            table.cleanup = Cleanup(regioncodes_cleanup)

            table.columns=[("countrynum"            ,   ("int",)        ),
                           ("regioncode"            ,   ("int",)        ),
                           ("regionname"            ,   ("char",30)     )]

            engine.table = table
            engine.create_table()

            engine.insert_data_from_url(self.urls["region_codes"])

            # Counts table
            table = Table("counts", delimiter=',')

            table.columns=[("record_id"             ,   ("pk-auto",)    ),
                           ("countrynum"            ,   ("int",)        ),
                           ("statenum"              ,   ("int",)        ),
                           ("Route"                 ,   ("int",)        ),
                           ("RPID"                  ,   ("int",)        ),
                           ("Year"                  ,   ("int",)        ),
                           ("Aou"                   ,   ("int",)        ),
                           ("Count10"               ,   ("int",)        ),
                           ("Count20"               ,   ("int",)        ),
                           ("Count30"               ,   ("int",)        ),
                           ("Count40"               ,   ("int",)        ),
                           ("Count50"               ,   ("int",)        ),
                           ("StopTotal"             ,   ("int",)        ),
                           ("SpeciesTotal"          ,   ("int",)        )]

            stateslist = ["Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado",
                          "Connecticut", "Delaware", "Florida", "Georgia", "Idaho",
                          "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky", "Louisiana", "Maine",
                          "Maryland", "Massachusetts", "Michigan", "Minnesota", "Mississippi",
                          "Missouri", "Montana", "Nebraska", "Nevada",
                          ["New Hampshire", "NHampsh"], ["New Jersey", "NJersey"],
                          ["New Mexico", "NMexico"], ["New York", "NYork"],
                          ["North Carolina", "NCaroli"], ["North Dakota", "NDakota"], "Ohio",
                          "Oklahoma", "Oregon", "Pennsylvania", ["Rhode Island", "RhodeIs"],
                          ["South Carolina", "SCaroli"], ["South Dakota", "SDakota"], "Tennessee",
                          "Texas", "Utah", "Vermont", "Virginia", "Washington",
                          ["West Virginia", "W_Virgi"], "Wisconsin", "Wyoming", "Alberta",
                          ["British Columbia", "BritCol"], "Manitoba", ["New Brunswick", "NBrunsw"],
                          ["Northwest Territories", "NWTerri"], "Newfoundland",
                          ["Nova Scotia", "NovaSco"], "Nunavut", "Ontario",
                          ["Prince Edward Island", "PEI"], "Quebec", "Saskatchewan", "Yukon"]

            state = ""
            shortstate = ""

            engine.table = table
            engine.create_table()

            for state in stateslist:
                try:
                    if len(state) > 2:
                        shortstate = state[0:7]
                    else:
                        state, shortstate = state[0], state[1]

                    print("Inserting data from " + state + "...")
                    try:
                        engine.table.cleanup = Cleanup()
                        engine.insert_data_from_archive(self.urls["counts"] + shortstate + ".zip",
                                                        [shortstate + ".csv"])
                    except:
                        print("Failed bulk insert on " + state + ", inserting manually.")
                        engine.connection.rollback()
                        engine.table.cleanup = Cleanup(correct_invalid_value,
                                                       nulls=['*'])
                        engine.insert_data_from_archive(self.urls["counts"] + shortstate + ".zip",
                                                        [shortstate + ".csv"])

                except:
                    print("There was an error in " + state + ".")
                    raise

        except zipfile.BadZipfile:
            print("There was an unexpected error in the Breeding Bird Survey archives.")
            raise

        return engine
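
The stateslist above mixes two shapes: a bare state name, or a [full name, short name] pair where the archive's 7-character truncation is irregular. A standalone sketch of that normalization, using isinstance in place of the len(state) > 2 test:

def full_and_short(entry):
    # entry is either "Alabama" or ["New Hampshire", "NHampsh"]
    if isinstance(entry, list):
        return entry[0], entry[1]
    return entry, entry[:7]        # archive names use the first 7 characters

print(full_and_short("Massachusetts"))               # ('Massachusetts', 'Massach')
print(full_and_short(["New Hampshire", "NHampsh"]))  # ('New Hampshire', 'NHampsh')
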
Exemplo n.º 35
0
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)

        engine = self.engine
        csv_files = []
        request_src = "http://www.data-retriever.org/"
        base_url = "http://www.usanpn.org/npn_portal/observations/getObservations.xml?start_date={startYear}&end_date={endYear_date}&request_src={request_src}"
        header_values = ["observation_id",
                         "update_datetime",
                         "site_id",
                         "latitude",
                         "longitude",
                         "elevation_in_meters",
                         "state",
                         "species_id",
                         "genus",
                         "species",
                         "common_name",
                         "kingdom",
                         "individual_id",
                         "phenophase_id",
                         "phenophase_description",
                         "observation_date",
                         "day_of_year",
                         "phenophase_status",
                         "intensity_category_id",
                         "intensity_value",
                         "abundance_value"
                         ]

        columns = [("record_id", ("pk-auto",)),
                   ("observation_id", ("int",)),  # subsequently refered to as "status record"
                   ("update_datetime", ("char",)),
                   ("site_id", ("int",)),
                   ("latitude", ("double",)),
                   ("longitude", ("double",)),
                   ("elevation_in_meters", ("char",)),
                   ("state", ("char",)),
                   ("species_id", ("int",)),
                   ("genus", ("char",)),
                   ("species", ("char",)),
                   ("common_name", ("char",)),
                   ("kingdom", ("char",)),  # skip kingdom
                   ("individual_id", ("char",)),
                   ("phenophase_id", ("int",)),
                   ("phenophase_description", ("char",)),
                   ("observation_date", ("char",)),
                   ("day_of_year", ("char",)),
                   ("phenophase_status", ("char",)),
                   ("intensity_category_id", ("char",)),
                   ("intensity_value", ("char",)),
                   ("abundance_value", ("char",))
                   ]

        start_date = datetime.date(2009, 1, 1)
        end_date = datetime.date.today()

        while start_date < end_date:
            to_date = start_date + datetime.timedelta(90)
            if to_date >= end_date:
                data_url = base_url.format(startYear=str(start_date), endYear_date=str(end_date),
                                           request_src=request_src)
            else:
                data_url = base_url.format(startYear=str(start_date), endYear_date=str(to_date),
                                           request_src=request_src)

            xml_file_name = '{}'.format(start_date) + ".xml"
            engine.download_file(data_url, xml_file_name)

            # Create csv files for 3 months
            csv_observation = '{}'.format(start_date) + ".csv"
            csv_files.append(csv_observation)
            csv_buff = open_fw(engine.format_filename(csv_observation))
            csv_writer = open_csvw(csv_buff)

            csv_writer.writerow(header_values)

            # Parse xml to read data
            file_read = ""
            # strip() removes a *set* of characters; replace() removes the
            # literal '{dataset}' placeholder, which is what is intended here
            fname = DATA_WRITE_PATH.replace('{dataset}', '') + 'NPN/' + xml_file_name
            with open(fname, 'r') as fp1:
                file_read = fp1.read()

            root = ET.fromstring(file_read)

            index_map = {val: i for i, val in enumerate(header_values)}
            for elements in root:
                diction = sorted(elements.attrib.items(), key=lambda pair: index_map[pair[0]])
                csv_writer.writerow([x[1] for x in diction])

            csv_buff.close()
            start_date = to_date + datetime.timedelta(1)

        # Create table
        table = Table('observations', delimiter=',', pk='record_id', contains_pk=True)
        table.columns = columns
        engine.table = table
        engine.create_table()
        for data_file in csv_files:
            engine.insert_data_from_file(engine.find_file(data_file))
        return engine
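
The download loop above chunks the NPN observations service into 90-day windows so no single XML response grows too large, stepping start_date past each window as it goes. The window arithmetic in isolation, as a sketch:

import datetime

def date_windows(start, end, days=90):
    # Yield (window_start, window_end) pairs covering [start, end]
    while start < end:
        stop = min(start + datetime.timedelta(days), end)
        yield start, stop
        start = stop + datetime.timedelta(1)

for a, b in date_windows(datetime.date(2009, 1, 1), datetime.date(2009, 7, 1)):
    print(a, b)    # two windows: Jan-Apr and Apr-Jul
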
Exemplo n.º 36
0
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        engine = self.engine
        engine.download_files_from_archive(self.urls["data"], ["Data_Files/Amniote_Database_Aug_2015.csv",
                                                               "Data_Files/Amniote_Database_References_Aug_2015.csv",
                                                               "Data_Files/Amniote_Range_Count_Aug_2015.csv"],
                                           filetype="zip")

        ct_column = 'trait'  # all tables use the same ct_column name

        # Create tables from Amniote_Database_Aug_2015.csv and Amniote_Database_References_Aug_2015.csv
        # Both reference and main have the same headers

        ct_names = ['female_maturity_d', 'litter_or_clutch_size_n', 'litters_or_clutches_per_y', 'adult_body_mass_g',
                    'maximum_longevity_y', 'gestation_d', 'weaning_d', 'birth_or_hatching_weight_g', 'weaning_weight_g',
                    'egg_mass_g', 'incubation_d', 'fledging_age_d', 'longevity_y', 'male_maturity_d',
                    'inter_litter_or_interbirth_interval_y', 'female_body_mass_g', 'male_body_mass_g',
                    'no_sex_body_mass_g', 'egg_width_mm', 'egg_length_mm', 'fledging_mass_g', 'adult_svl_cm',
                    'male_svl_cm', 'female_svl_cm', 'birth_or_hatching_svl_cm', 'female_svl_at_maturity_cm',
                    'female_body_mass_at_maturity_g', 'no_sex_svl_cm', 'no_sex_maturity_d']

        # Create table main from Amniote_Database_Aug_2015.csv

        columns = [
            ('record_id', ('pk-auto',)), ('class', ('char', '20')), ('order', ('char', '20')),
            ('family', ('char', '20')), ('genus', ('char', '20')), ('species', ('char', '50')),
            ('subspecies', ('char', '20')), ('common_name', ('char', '400')), ('trait_value', ('ct-double',))]
        table_main = Table('main', delimiter=',', cleanup=self.cleanup_func_table)
        table_main.ct_column = ct_column
        table_main.ct_names = ct_names
        table_main.columns = columns
        engine.auto_create_table(table_main,
                                 filename="Amniote_Database_Aug_2015.csv")
        engine.insert_data_from_file(engine.format_filename("Amniote_Database_Aug_2015.csv"))

        # Create table reference from Amniote_Database_References_Aug_2015.csv
        reference_columns = [
            ('record_id', ('pk-auto',)), ('class', ('char', '20')), ('order', ('char', '20')),
            ('family', ('char', '20')), ('genus', ('char', '20')), ('species', ('char', '50')),
            ('subspecies', ('char', '20')), ('common_name', ('char', '400')), ('reference', ('ct-char',))]

        table_references = Table('references', delimiter=',', cleanup=self.cleanup_func_table)
        table_references.ct_column = ct_column
        table_references.ct_names = ct_names
        table_references.columns = reference_columns
        engine.auto_create_table(table_references,
                                 filename="Amniote_Database_References_Aug_2015.csv")
        engine.insert_data_from_file(engine.format_filename("Amniote_Database_References_Aug_2015.csv"))

        # Create table Range
        # This table has different values for headers from the above tables.
        range_ct_names = ["min_female_maturity", "max_female_maturity", "count_female_maturity", "min_litter_clutch_size",
                    "max_litter_clutch_size", "count_litter_clutch_size", "min_litters_clutches",
                    "max_litters_clutches", "count_litters_clutches", "min_adult_body_mass", "max_adult_body_mass",
                    "count_adult_body_mass", "min_maximum_longevity", "max_maximum_longevity",
                    "count_maximum_longevity", "min_gestation", "max_gestation", "count_gestation", "min_weaning",
                    "max_weaning", "count_weaning", "min_birth_hatching_weight", "max_birth_hatching_weight",
                    "count_birth_hatching_weight", "min_weaning_weight", "max_weaning_weight", "count_weaning_weight",
                    "min_egg_mass", "max_egg_mass", "count_egg_mass", "min_incubation", "max_incubation",
                    "count_incubation", "min_fledging_age", "max_fledging_age", "count_fledging_age",
                    "min_male_maturity", "max_male_maturity", "count_male_maturity",
                    "min_inter_litter_interbirth_interval", "max_inter_litter_interbirth_interval",
                    "count_inter_litter_interbirth_interval", "min_female_body_mass", "max_female_body_mass",
                    "count_female_body_mass", "min_male_body_mass", "max_male_body_mass", "count_male_body_mass",
                    "min_no_sex_body_mass", "max_no_sex_body_mass", "count_no_sex_body_mass", "min_egg_width",
                    "max_egg_width", "count_egg_width", "min_egg_length", "max_egg_length", "count_egg_length",
                    "min_fledging_mass", "max_fledging_mass", "count_fledging_mass", "min_adult_svl", "max_adult_svl",
                    "count_adult_svl", "min_male_svl", "max_male_svl", "count_male_svl", "min_female_svl",
                    "max_female_svl", "count_female_svl", "min_hatching_svl", "max_hatching_svl", "count_hatching_svl",
                    "min_female_svl_at_maturity", "max_female_svl_at_maturity", "count_female_svl_at_maturity",
                    "min_female_body_mass_at_maturity", "max_female_body_mass_at_maturity",
                    "count_female_body_mass_at_maturity", "min_no_sex_svl", "max_no_sex_svl", "count_no_sex_svl",
                    "min_no_sex_maturity", "max_no_sex_maturity", "count_no_sex_maturity"]
        range_columns = [
            ('record_id', ('pk-auto',)), ('classx', ('char', '20')), ('orderx', ('char', '20')),
            ('familyx', ('char', '20')), ('genus', ('char', '20')), ('species', ('char', '50')),
            ('subspecies', ('char', '20')), ('common_name', ('char', '400')), ('trait_value', ('ct-double',))]

        table_range = Table('range', delimiter=',', cleanup=self.cleanup_func_table)
        table_range.ct_column = ct_column
        table_range.ct_names = range_ct_names
        table_range.columns = range_columns
        engine.auto_create_table(table_range,
                                 filename="Amniote_Range_Count_Aug_2015.csv")
        engine.insert_data_from_file(engine.format_filename("Amniote_Range_Count_Aug_2015.csv"))
Exemplo n.º 37
0
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)

        self.engine.auto_create_table(Table("sites"), url=self.urls["sites"], filename='gentry_sites.csv')
        self.engine.insert_data_from_url(self.urls["sites"])

        self.engine.download_file(self.urls["stems"], "all_Excel.zip")
        local_zip = zipfile.ZipFile(self.engine.format_filename("all_Excel.zip"))
        filelist = local_zip.namelist()
        local_zip.close()
        self.engine.download_files_from_archive(self.urls["stems"], filelist)

        filelist = [os.path.basename(filename) for filename in filelist]

        # Currently all_Excel.zip is missing CURUYUQU.xls
        # Download it separately and add it to the file list
        if not self.engine.find_file('CURUYUQU.xls'):
            self.engine.download_file("http://www.mobot.org/mobot/gentry/123/samerica/CURUYUQU.xls", "CURUYUQU.xls")
            filelist.append('CURUYUQU.xls')

        lines = []
        tax = []
        for filename in filelist:
            print("Extracting data from " + filename + "...")
            book = xlrd.open_workbook(self.engine.format_filename(filename))
            sh = book.sheet_by_index(0)
            rows = sh.nrows
            cn = {'stems': []}
            n = 0
            for colnum, c in enumerate(sh.row(0)):
                if not Excel.empty_cell(c):
                    cid = c.value.lower().strip()
                    # line number column is sometimes named differently
                    if cid in ["sub", "number"]:
                        cid = "line"
                    # the "number of individuals" column is named in various
                    # different ways; they always at least contain "nd"
                    if "nd" in cid:
                        cid = "count"
                    # in QUIAPACA.xls the "number of individuals" column is
                    # misnamed "STEMDBH" just like the stems columns, so weep
                    # for the state of scientific data and then fix manually
                    if filename == "QUIAPACA.xls" and colnum == 13:
                        cid = "count"

                    # if column is a stem, add it to the list of stems;
                    # otherwise, make note of the column name/number
                    if "stem" in cid or "dbh" in cid:
                        cn["stems"].append(n)
                    else:
                        cn[cid] = n
                n += 1
            # sometimes, a data file does not contain a liana or count column
            if not "liana" in list(cn.keys()):
                cn["liana"] = -1
            if not "count" in list(cn.keys()):
                cn["count"] = -1
            for i in range(1, rows):
                row = sh.row(i)
                cellcount = len(row)
                # make sure the row is real, not just empty cells
                if not all(Excel.empty_cell(cell) for cell in row):
                    try:
                        this_line = {}

                        # get the following information from the appropriate columns
                        for i in ["line", "family", "genus", "species",
                                  "liana", "count"]:
                            if cn[i] > -1:
                                if row[cn[i]].ctype != 2:
                                    # if the cell type(ctype) is not a number
                                    this_line[i] = row[cn[i]].value.lower().strip().replace("\\", "/").replace('"', '')
                                else:
                                    this_line[i] = row[cn[i]].value
                                if this_line[i] == '`':
                                    this_line[i] = 1
                        this_line["stems"] = [row[c]
                                              for c in cn["stems"]
                                              if not Excel.empty_cell(row[c])]
                        this_line["site"] = filename[0:-4]

                        # Manually correct CEDRAL data, which has a single line
                        # that is shifted by one to the left starting at Liana
                        if this_line["site"] == "CEDRAL" and type(this_line["liana"]) == float:
                            this_line["liana"] = ""
                            this_line["count"] = 3
                            this_line["stems"] = [2.5, 2.5, 30, 18, 25]

                        lines.append(this_line)

                        # Check how far the species is identified
                        full_id = 0
                        if len(this_line["species"]) < 3:
                            if len(this_line["genus"]) < 3:
                                id_level = "family"
                            else:
                                id_level = "genus"
                        else:
                            id_level = "species"
                            full_id = 1
                        tax.append((this_line["family"],
                                    this_line["genus"],
                                    this_line["species"],
                                    id_level,
                                    str(full_id)))
                    except:
                        raise

        tax = sorted(tax, key=lambda group: group[0] + " " + group[1] + " " + group[2])
        unique_tax = []
        tax_dict = {}
        tax_count = 0

        # Get all unique families/genera/species
        print("\n")
        for group in tax:
            if not (group in unique_tax):
                unique_tax.append(group)
                tax_count += 1
                tax_dict[group[0:3]] = tax_count
                if tax_count % 10 == 0:
                    msg = "Generating taxonomic groups: " + str(tax_count) + " / " + str(TAX_GROUPS)
                    sys.stdout.flush()
                    sys.stdout.write(msg + "\b" * len(msg))
        print("\n")
        # Create species table
        table = Table("species", delimiter=",")
        table.columns=[("species_id"            ,   ("pk-int",)    ),
                       ("family"                ,   ("char", )    ),
                       ("genus"                 ,   ("char", )    ),
                       ("species"               ,   ("char", )    ),
                       ("id_level"              ,   ("char", 10)    ),
                       ("full_id"               ,   ("int",)       )]

        data = [[str(tax_dict[group[:3]])] + ['"%s"' % g for g in group]
                for group in unique_tax]
        table.pk = 'species_id'
        table.contains_pk = True

        self.engine.table = table
        self.engine.create_table()
        self.engine.add_to_table(data)

        # Create stems table
        table = Table("stems", delimiter=",")
        table.columns=[("stem_id"               ,   ("pk-auto",)    ),
                       ("line"                  ,   ("int",)        ),
                       ("species_id"            ,   ("int",)        ),
                       ("site_code"             ,   ("char", 12)    ),
                       ("liana"                 ,   ("char", 10)    ),
                       ("stem"                  ,   ("double",)     )]
        stems = []
        counts = []
        for line in lines:
            try:
                liana = line["liana"]
            except KeyError:
                liana = ""
            species_info = [line["line"],
                            tax_dict[(line["family"],
                                      line["genus"],
                                      line["species"])],
                            line["site"],
                            liana
                            ]
            try:
                counts.append([value for value in species_info + [line["count"]]])
            except KeyError:
                pass

            for i in line["stems"]:
                stem = species_info + [str(i)]
                stems.append(stem)

        self.engine.table = table
        self.engine.create_table()
        self.engine.add_to_table(stems)

        # Create counts table
        table = Table("counts", delimiter=",", contains_pk=False)
        table.columns=[("count_id"              ,   ("pk-auto",)    ),
                       ("line"                  ,   ("int",)        ),
                       ("species_id"            ,   ("int",)        ),
                       ("site_code"             ,   ("char", 12)    ),
                       ("liana"                 ,   ("char", 10)    ),
                       ("count"                 ,   ("double",)     )]
        self.engine.table = table
        self.engine.create_table()
        self.engine.add_to_table(counts)

        return self.engine
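
The species table above is built by giving every unique taxon tuple a 1-based integer id in sorted order, which the stems and counts tables then reference. A compact standalone equivalent of that numbering (deduplicating with set and sorting the tuples):

def number_taxa(groups):
    # Map each unique taxon tuple to a 1-based id, in sorted order
    tax_dict = {}
    for group in sorted(set(groups)):
        tax_dict[group] = len(tax_dict) + 1
    return tax_dict

tax = [("Fabaceae", "Inga", "edulis"),
       ("Moraceae", "Ficus", "insipida"),
       ("Fabaceae", "Inga", "edulis")]
print(number_taxa(tax))    # two unique taxa, ids 1 and 2
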
Exemplo n.º 38
0
    def download(self, engine=None, debug=False):
        try:
            Script.download(self, engine, debug)

            engine = self.engine

            # Species table
            table = Table("species", cleanup=Cleanup(), contains_pk=True,
                          header_rows=6)

            table.columns=[("species_id", ("pk-int",) ),
                           ("AOU", ("int",) ),
                           ("english_common_name", ("char",50) ),
                           ("french_common_name", ("char",50) ),
                           ("spanish_common_name", ("char",50) ),
                           ("sporder", ("char",30) ),
                           ("family", ("char",30) ),
                           ("genus", ("char",30) ),
                           ("species", ("char",50) ),
                           ]
            table.fixed_width = [7,6,51,51,51,51,51,51,50]

            engine.table = table
            engine.create_table()
            engine.insert_data_from_url(self.urls["species"])

            # Routes table
            if not os.path.isfile(engine.format_filename("routes_new.csv")):
                engine.download_files_from_archive(self.urls["routes"],
                                                   ["routes.csv"])
                read = open(engine.format_filename("routes.csv"), "rb")
                write = open(engine.format_filename("routes_new.csv"), "wb")
                print "Cleaning routes data..."
                write.write(read.readline())
                for line in read:
                    values = line.split(',')
                    v = Decimal(values[5])
                    if v > 0:
                        values[5] = str(v * Decimal("-1"))
                    write.write(','.join(str(value) for value in values))
                write.close()
                read.close()

            engine.auto_create_table(Table("routes", cleanup=Cleanup()),
                                     filename="routes_new.csv")

            engine.insert_data_from_file(engine.format_filename("routes_new.csv"))


            # Weather table
            if not os.path.isfile(engine.format_filename("weather_new.csv")):
                engine.download_files_from_archive(self.urls["weather"],
                                                   ["weather.csv"])
                read = open(engine.format_filename("weather.csv"), "rb")
                write = open(engine.format_filename("weather_new.csv"), "wb")
                print "Cleaning weather data..."
                for line in read:
                    values = line.split(',')
                    newvalues = []
                    for value in values:

                        if ':' in value:
                            newvalues.append(value.replace(':', ''))
                        elif value == "N":
                            newvalues.append(None)
                        else:
                            newvalues.append(value)
                    write.write(','.join(str(value) for value in newvalues))
                write.close()
                read.close()

            engine.auto_create_table(Table("weather", pk="RouteDataId", cleanup=Cleanup()),
                                     filename="weather_new.csv")
            engine.insert_data_from_file(engine.format_filename("weather_new.csv"))


            # Region_codes table
            table = Table("region_codes", pk=False, header_rows=11,
                          fixed_width=[11, 11, 30])
            def regioncodes_cleanup(value, engine):
                replace = {chr(225):"a", chr(233):"e", chr(237):"i", chr(243):"o"}
                newvalue = str(value)
                for key in replace.keys():
                    if key in newvalue:
                        newvalue = newvalue.replace(key, replace[key])
                return newvalue
            table.cleanup = Cleanup(regioncodes_cleanup)

            table.columns=[("countrynum"            ,   ("int",)        ),
                           ("regioncode"            ,   ("int",)        ),
                           ("regionname"            ,   ("char",30)     )]

            engine.table = table
            engine.create_table()

            engine.insert_data_from_url(self.urls["region_codes"])

            # Counts table
            table = Table("counts", pk=False, delimiter=',')
            table.columns=[("RouteDataID"           ,   ("int",)        ),
                           ("countrynum"            ,   ("int",)        ),
                           ("statenum"              ,   ("int",)        ),
                           ("Route"                 ,   ("int",)        ),
                           ("RPID"                  ,   ("int",)        ),
                           ("year"                  ,   ("int",)        ),
                           ("AOU"                   ,   ("int",)        ),
                           ("Stop1"                 ,   ("int",)        ),
                           ("Stop2"                 ,   ("int",)        ),
                           ("Stop3"                 ,   ("int",)        ),
                           ("Stop4"                 ,   ("int",)        ),
                           ("Stop5"                 ,   ("int",)        ),
                           ("Stop6"                 ,   ("int",)        ),
                           ("Stop7"                 ,   ("int",)        ),
                           ("Stop8"                 ,   ("int",)        ),
                           ("Stop9"                 ,   ("int",)        ),
                           ("Stop10"                ,   ("int",)        ),
                           ("Stop11"                ,   ("int",)        ),
                           ("Stop12"                ,   ("int",)        ),
                           ("Stop13"                ,   ("int",)        ),
                           ("Stop14"                ,   ("int",)        ),
                           ("Stop15"                ,   ("int",)        ),
                           ("Stop16"                ,   ("int",)        ),
                           ("Stop17"                ,   ("int",)        ),
                           ("Stop18"                ,   ("int",)        ),
                           ("Stop19"                ,   ("int",)        ),
                           ("Stop20"                ,   ("int",)        ),
                           ("Stop21"                ,   ("int",)        ),
                           ("Stop22"                ,   ("int",)        ),
                           ("Stop23"                ,   ("int",)        ),
                           ("Stop24"                ,   ("int",)        ),
                           ("Stop25"                ,   ("int",)        ),
                           ("Stop26"                ,   ("int",)        ),
                           ("Stop27"                ,   ("int",)        ),
                           ("Stop28"                ,   ("int",)        ),
                           ("Stop29"                ,   ("int",)        ),
                           ("Stop30"                ,   ("int",)        ),
                           ("Stop31"                ,   ("int",)        ),
                           ("Stop32"                ,   ("int",)        ),
                           ("Stop33"                ,   ("int",)        ),
                           ("Stop34"                ,   ("int",)        ),
                           ("Stop35"                ,   ("int",)        ),
                           ("Stop36"                ,   ("int",)        ),
                           ("Stop37"                ,   ("int",)        ),
                           ("Stop38"                ,   ("int",)        ),
                           ("Stop39"                ,   ("int",)        ),
                           ("Stop40"                ,   ("int",)        ),
                           ("Stop41"                ,   ("int",)        ),
                           ("Stop42"                ,   ("int",)        ),
                           ("Stop43"                ,   ("int",)        ),
                           ("Stop44"                ,   ("int",)        ),
                           ("Stop45"                ,   ("int",)        ),
                           ("Stop46"                ,   ("int",)        ),
                           ("Stop47"                ,   ("int",)        ),
                           ("Stop48"                ,   ("int",)        ),
                           ("Stop49"                ,   ("int",)        ),
                           ("Stop50"                ,   ("int",)        )]

            part = ""
            engine.table = table
            engine.create_table()

            for part in range(1,11):
                part = str(part)
                try:
                    print "Inserting data from part " + part + "..."
                    try:
                        engine.table.cleanup = Cleanup()
                        engine.insert_data_from_archive(self.urls["counts"] +
                                                        "Fifty" + part + ".zip",
                                                        ["fifty" + part + ".csv"])
                    except:
                        print "Failed bulk insert on " + part + ", inserting manually."
                        engine.connection.rollback()
                        engine.table.cleanup = Cleanup(correct_invalid_value,
                                                       nulls=['*'])
                        engine.insert_data_from_archive(self.urls["counts"] +
                                                        "Fifty" + part + ".zip",
                                                        ["fifty" + part + ".csv"])

                except:
                    print "There was an error in part " + part + "."
                    raise


        except zipfile.BadZipfile:
            print "There was an unexpected error in the Breeding Bird Survey archives."
            raise

        return engine
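
The routes cleanup in this version flips any positive longitude negative, since all BBS routes lie in the western hemisphere and positive values are sign errors. A minimal sketch of that one-field fix, using decimal so the text round-trips exactly:

from decimal import Decimal

def fix_longitude(field):
    # Western-hemisphere longitudes must be negative
    v = Decimal(field)
    return str(-v) if v > 0 else field

print(fix_longitude("98.25"))     # -98.25
print(fix_longitude("-98.25"))    # unchanged
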
Exemplo n.º 39
0
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        if sys.version_info[0] == 2:
            # Python 2 only: reload(sys) restores setdefaultencoding,
            # which site.py removes at startup
            reload(sys)
            sys.setdefaultencoding("utf-8")

        self.engine.download_file(self.urls["GWDD"], "GlobalWoodDensityDatabase.xls")
        filename = os.path.basename("GlobalWoodDensityDatabase.xls")
        book = xlrd.open_workbook(self.engine.format_filename(filename))
        sh = book.sheet_by_index(1)
        rows = sh.nrows

        # Creating data files
        file_path = self.engine.format_filename("gwdd_data.csv")
        gwdd_data = open_fw(file_path)
        csv_writer = open_csvw(gwdd_data)
        csv_writer.writerow(["Number", "Family", "Binomial", "Wood_Density", "Region", "Reference_Number"])

        for index in range(1, rows):
            row = sh.row(index)
            # get each row and format the cell value.
            row_as_list = [to_str(column_value.value) for column_value in row]
            csv_writer.writerow(row_as_list)
        gwdd_data.close()

        table = Table("data", delimiter=",")
        table.columns = [("Number", ("pk-int",)),
                         ("Family", ("char",)),
                         ("Binomial", ("char",)),
                         ("Wood_Density", ("double",)),
                         ("Region", ("char",)),
                         ("Reference_Number", ("int",))]
        table.pk = 'Number'
        table.contains_pk = True

        self.engine.table = table
        self.engine.create_table()
        # file_path was produced by format_filename above, so pass it directly
        self.engine.insert_data_from_file(file_path)

        # Creating reference table file
        file_path = self.engine.format_filename("gwdd_ref.csv")
        ref_file = open_fw(file_path)
        csv_writerd = open_csvw(ref_file)
        csv_writerd.writerow(["Reference_Number", "Reference"])
        sh = book.sheet_by_index(2)
        rows = sh.nrows
        for index in range(1, rows):
            row = sh.row(index)
            # get each row and format the cell value.
            row_as_list = [to_str(column_value.value, object_encoding=sys.stdout) for column_value in row]
            csv_writerd.writerow(row_as_list)
        ref_file.close()

        table = Table("reference", delimiter=",")
        table.columns = [("Reference_Number", ("pk-int",)), ("Reference", ("char",))]
        table.pk = 'Reference_Number'
        table.contains_pk = True
        self.engine.table = table
        self.engine.create_table()
        # file_path was produced by format_filename above, so pass it directly
        self.engine.insert_data_from_file(file_path)

        return self.engine
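
Both GWDD tables above come out of the same xlrd pattern: open the workbook, pick a sheet by index, and stream its rows into a csv writer. A self-contained sketch of that conversion (the call at the bottom is illustrative and assumes the file is already on disk):

import csv
import xlrd

def sheet_to_csv(xls_path, sheet_index, csv_path, header):
    book = xlrd.open_workbook(xls_path)
    sh = book.sheet_by_index(sheet_index)
    with open(csv_path, "w", newline="") as out:
        writer = csv.writer(out)
        writer.writerow(header)
        for i in range(1, sh.nrows):       # skip the sheet's header row
            writer.writerow([cell.value for cell in sh.row(i)])

# sheet_to_csv("GlobalWoodDensityDatabase.xls", 1, "gwdd_data.csv",
#              ["Number", "Family", "Binomial", "Wood_Density",
#               "Region", "Reference_Number"])
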
Exemplo n.º 40
0
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        
        self.engine.auto_create_table(Table("sites"), url=self.urls["sites"])
        self.engine.insert_data_from_url(self.urls["sites"])
              
        self.engine.download_file(self.urls["stems"], "all_Excel.zip")
        local_zip = zipfile.ZipFile(self.engine.format_filename("all_Excel.zip"))        
        filelist = local_zip.namelist()
        local_zip.close()        
        self.engine.download_files_from_archive(self.urls["stems"], filelist)
        
        filelist = [os.path.basename(filename) for filename in filelist]
        
        lines = []
        tax = []
        for filename in filelist:
            print "Extracting data from " + filename + "..."
            book = xlrd.open_workbook(self.engine.format_filename(filename))
            sh = book.sheet_by_index(0)
            rows = sh.nrows
            cn = {'stems': []}
            n = 0
            for c in sh.row(0):
                if not Excel.empty_cell(c):
                    cid = Excel.cell_value(c).lower()
                    # line number column is sometimes named differently
                    if cid in ["sub", "number"]:
                        cid = "line"
                    # the "number of individuals" column is named in various
                    # different ways; they always at least contain "nd"
                    if "nd" in cid:
                        cid = "count"
                    # if column is a stem, add it to the list of stems;
                    # otherwise, make note of the column name/number
                    if "stem" in cid:
                        cn["stems"].append(n)
                    else:
                        cn[cid] = n
                n += 1
            # sometimes, a data file does not contain a liana or count column
            if not "liana" in cn.keys():
                cn["liana"] = -1
            if not "count" in cn.keys():
                cn["count"] = -1
            for i in range(1, rows):
                row = sh.row(i)
                cellcount = len(row)
                # make sure the row is real, not just empty cells
                if cellcount > 4 and not Excel.empty_cell(row[0]):
                    try:
                        this_line = {}
                        
                        def format_value(s):
                            s = Excel.cell_value(s)
                            return str(s).title().replace("\\", "/").replace('"', '')
                        
                        # get the following information from the appropriate columns
                        for i in ["line", "family", "genus", "species", 
                                  "liana", "count"]:
                            if cn[i] > -1:
                                this_line[i] = format_value(row[cn[i]])
                                if this_line[i] == '`':
                                    this_line[i] = 1

                        this_line["stems"] = [Excel.cell_value(row[c]) 
                                              for c in cn["stems"]
                                              if not Excel.empty_cell(row[c])]
                        this_line["site"] = filename[0:-4]
                        
                        lines.append(this_line)
                        
                        # Check how far the species is identified
                        full_id = 0
                        if len(this_line["species"]) < 3:
                            if len(this_line["genus"]) < 3:
                                id_level = "family"
                            else:
                                id_level = "genus"
                        else:
                            id_level = "species"
                            full_id = 1
                        tax.append((this_line["family"], 
                                    this_line["genus"], 
                                    this_line["species"].lower().replace('\\', '').replace('"', ''), 
                                    id_level, 
                                    str(full_id)))
                    except:
                        raise
        
        tax = sorted(tax, key=lambda group: group[0] + " " + group[1] + " " + group[2])
        unique_tax = []
        tax_dict = dict()
        tax_count = 0
        
        # Get all unique families/genera/species
        for group in tax:
            if not (group in unique_tax):
                unique_tax.append(group)
                tax_count += 1
                tax_dict[group[0:3]] = tax_count
                if tax_count % 10 == 0:
                    msg = "Generating taxonomic groups: " + str(tax_count) + " / " + str(TAX_GROUPS)
                    sys.stdout.write(msg + "\b" * len(msg))
        print "Generating taxonomic groups: " + str(TAX_GROUPS) + " / " + str(TAX_GROUPS)
        
        
        # Create species table
        table = Table("species", delimiter=",")
        table.columns=[("species_id"            ,   ("pk-int",)    ),
                       ("family"                ,   ("char", )    ),
                       ("genus"                 ,   ("char", )    ),
                       ("species"               ,   ("char", )    ),
                       ("id_level"              ,   ("char", 10)    ),
                       ("full_id"               ,   ("bool",)       )]

        data = [','.join([str(tax_dict[group[:3]])] + ['"%s"' % g for g in group]) 
                for group in unique_tax]
        table.pk = 'species_id'
        table.contains_pk = True
        
        self.engine.table = table
        self.engine.create_table()
        self.engine.add_to_table(data)
        
        
        # Create stems table
        table = Table("stems", delimiter=",", contains_pk=False)
        table.columns=[("stem_id"               ,   ("pk-auto",)    ),
                       ("line"                  ,   ("int",)        ),
                       ("species_id"            ,   ("int",)        ),
                       ("site_code"             ,   ("char", 12)    ),
                       ("liana"                 ,   ("char", 10)    ),
                       ("stem"                  ,   ("double",)     )]
        stems = []
        counts = []
        for line in lines:
            try:
                liana = line["liana"]
            except KeyError:
                liana = ""
            species_info = [line["line"], 
                            tax_dict[(line["family"], 
                                      line["genus"], 
                                      line["species"].lower())],
                            line["site"],
                            liana
                            ]
            try:
                counts.append([str(value) for value in species_info + [line["count"]]])
            except KeyError:
                pass

            for i in line["stems"]:
                stem = species_info + [i]
                stems.append([str(value) for value in stem])
            
        data = [','.join(stem) for stem in stems]
        self.engine.table = table
        self.engine.create_table()
        self.engine.add_to_table(data)
        
        
        # Create counts table
        table = Table("counts", delimiter=",", contains_pk=False)
        table.columns=[("count_id"              ,   ("pk-auto",)    ),
                       ("line"                  ,   ("int",)        ),
                       ("species_id"            ,   ("int",)        ),
                       ("site_code"             ,   ("char", 12)    ),
                       ("liana"                 ,   ("char", 10)    ),
                       ("count"                 ,   ("double",)     )]
        data = [','.join(count) for count in counts]
        self.engine.table = table
        self.engine.create_table()
        self.engine.add_to_table(data)
            
        return self.engine
Exemplo n.º 41
0
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        engine = self.engine
        filenames = ['Aquatic_animal_excretion_data.csv',
                     'Aquatic_animal_excretion_variable_descriptions.csv']
        for file_path in filenames:
            if not os.path.isfile(engine.format_filename(file_path)):
                url = self.urls["aquatic_animals"]
                engine.download_files_from_archive(url, filenames, "zip")

        # processing Aquatic_animal_excretion_data.csv
        filename = 'Aquatic_animal_excretion_data.csv'
        tablename = 'aquatic_animals'
        table = Table(str(tablename), delimiter=',')
        table.columns = [
            ("index", ("pk-int",)),
            ("sourcenumber", ("int",)),
            ("sourcename", ("char",)),
            ("speciesname", ("char",)),
            ("speciescode", ("char",)),
            ("invert/vert", ("char",)),
            ("phylum", ("char",)),
            ("class", ("char",)),
            ("order", ("char",)),
            ("family", ("char",)),
            ("trophicgild", ("char",)),
            ("drymass", ("double",)),
            ("logdrymass", ("double",)),
            ("ecosystemtype", ("char",)),
            ("energysource", ("char",)),
            ("habitat", ("char",)),
            ("residentecosystem", ("char",)),
            ("temperature", ("double",)),
            ("nexcretionrate", ("double",)),
            ("pexcretionrate", ("double",)),
            ("lognexcretionrate", ("double",)),
            ("logpexcretionrate", ("double",)),
            ("incubationtime", ("double",)),
            ("nform", ("char",)),
            ("pform", ("char",)),
            ("bodyc", ("double",)),
            ("bodyn", ("double",)),
            ("bodyp", ("double",)),
            ("bodyc:n", ("double",)),
            ("bodyc:p", ("double",)),
            ("bodyn:p", ("double",)),
            ("bodydatasource", ("char",)),
            ("datasource", ("char",)),
            ("dataproviders", ("char",))]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_file(engine.format_filename(str(filename)))

        # processing Aquatic_animal_excretion_variable_descriptions.csv
        filename = 'Aquatic_animal_excretion_variable_descriptions.csv'
        tablename = 'variable_descriptions'
        table = Table(str(tablename), delimiter=',')
        table.columns = [
            ("Column", ("char",)),
            ("Variable", ("char",)),
            ("Description", ("char",)),
            ("Data Class", ("char",)),
            ("Units", ("char",)),
            ("Minimum_value", ("char",)),
            ("Maximum_value", ("char",)),
            ("Possible_values", ("char",)),
            ("Missing_data_symbol", ("char",)),
            ("Notes", ("char",))]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_file(engine.format_filename(str(filename)))
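Several of these scripts share the same guard: only download the archive when an expected extracted file is missing. A sketch of that idiom pulled into a helper; fetch_archive is a hypothetical stand-in for engine.download_files_from_archive, whose real signature may differ:

import os

def ensure_archive_files(engine, url, filenames, fetch_archive):
    """Download and extract the archive only if an expected file is missing."""
    missing = [name for name in filenames
               if not os.path.isfile(engine.format_filename(name))]
    if missing:
        # A single download covers every file in the archive.
        fetch_archive(url, filenames, "zip")

Collecting the missing names first makes the at-most-one-download intent explicit, which the break in the loop above only achieves implicitly.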
Example No. 42
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)

        self.engine.download_file(self.urls["GWDD"], "GlobalWoodDensityDatabase.xls")
        filename = "GlobalWoodDensityDatabase.xls"

        book = xlrd.open_workbook(self.engine.format_filename(filename))
        sh = book.sheet_by_index(1)
        rows = sh.nrows

        # Create the data table
        def format_value(cell):
            value = Excel.cell_value(cell)
            return str(value).title().replace("\\", "/").replace('"', '')

        lines = []
        for i in range(1, rows):
            row = sh.row(i)
            if not all(Excel.empty_cell(cell) for cell in row):
                this_line = {}
                for num, label in enumerate(["Number", "Family", "Binomial",
                                             "Wood_Density", "Region",
                                             "Reference_Number"]):
                    this_line[label] = format_value(row[num])
                lines.append(this_line)

        table = Table("data", delimiter="\t")
        table.columns=[("Number"                ,   ("pk-int",) ),
                       ("Family"                ,   ("char",)   ),
                       ("Binomial"              ,   ("char",)   ),
                       ("Wood_Density"          ,   ("double",) ),
                       ("Region"                ,   ("char",)   ),
                       ("Reference_Number"      ,   ("int",)    )]
        table.pk = 'Number'
        table.contains_pk = True

        gwdd = []
        for line in lines:
            gwdd_data = [line["Number"],
                         line["Family"],
                         line["Binomial"],
                         line["Wood_Density"],
                         line["Region"],
                         line["Reference_Number"]]
            gwdd.append(gwdd_data)

        data = ['\t'.join(gwdd_line) for gwdd_line in gwdd]
        self.engine.table = table
        self.engine.create_table()
        self.engine.add_to_table(data)

        # Create the reference table; format_value from above is reused
        lines = []
        sh = book.sheet_by_index(2)
        rows = sh.nrows
        for i in range(1, rows):
            row = sh.row(i)
            if not all(Excel.empty_cell(cell) for cell in row):
                this_line = {}
                for num, label in enumerate(["Reference_Number", "Reference"]):
                    this_line[label] = format_value(row[num])
                lines.append(this_line)

        table = Table("reference", delimiter="\t")
        table.columns=[("Reference_Number"  ,   ("pk-int",) ),
                       ("Reference"         ,   ("char",)   )]
        table.pk = 'Reference_Number'
        table.contains_pk = True

        gwdd = []
        for line in lines:
            gwdd_ref = [line["Reference_Number"],
                        line["Reference"]]
            gwdd.append(gwdd_ref)

        data = ['\t'.join(gwdd_line) for gwdd_line in gwdd]
        self.engine.table = table
        self.engine.create_table()
        self.engine.add_to_table(data)
        
        return self.engine
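The two sheet-reading passes above differ only in the sheet index and the column labels. A hedged sketch of a shared helper built on the same xlrd/Excel utilities these scripts already use (sheet_to_dicts is a hypothetical name; it assumes format_value and Excel are in scope as above):

def sheet_to_dicts(sheet, labels):
    """Yield one {label: formatted value} dict per non-empty row,
    skipping the header row at index 0."""
    for i in range(1, sheet.nrows):
        row = sheet.row(i)
        if not all(Excel.empty_cell(cell) for cell in row):
            yield {label: format_value(cell)
                   for label, cell in zip(labels, row)}

# Usage with the workbook opened above:
# lines = list(sheet_to_dicts(book.sheet_by_index(1),
#                             ["Number", "Family", "Binomial",
#                              "Wood_Density", "Region", "Reference_Number"]))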
Example No. 43
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        engine = self.engine
        filename = "database.csv"
        tablename = "predicts_main"
        table = Table(str(tablename), delimiter=',')
        table.columns = [
            ("Source_ID", ("char",)),
            ("Reference", ("char",)),
            ("Study_number", ("int",)),
            ("Study_name", ("char",)),
            ("SS", ("char",)),
            ("Diversity_metric", ("char",)),
            ("Diversity_metric_unit", ("char",)),
            ("Diversity_metric_type", ("char",)),
            ("Diversity_metric_is_effort_sensitive", ("char",)),
            ("Diversity_metric_is_suitable_for_Chao", ("char",)),
            ("Sampling_method", ("char",)),
            ("Sampling_effort_unit", ("char",)),
            ("Study_common_taxon", ("char",)),
            ("Rank_of_study_common_taxon", ("char",)),
            ("Site_number", ("int",)),
            ("Site_name", ("char",)),
            ("Block", ("char",)),
            ("SSS", ("char",)),
            ("SSB", ("char",)),
            ("SSBS", ("char",)),
            ("Sample_start_earliest", ("char",)),
            ("Sample_end_latest", ("char",)),
            ("Sample_midpoint", ("char",)),
            ("Sample_date_resolution", ("char",)),
            ("Max_linear_extent_metres", ("double",)),
            ("Habitat_patch_area_square_metres", ("double",)),
            ("Sampling_effort", ("double",)),
            ("Rescaled_sampling_effort", ("double",)),
            ("Habitat_as_described", ("char",)),
            ("Predominant_land_use", ("char",)),
            ("Source_for_predominant_land_use", ("char",)),
            ("Use_intensity", ("char",)),
            ("Km_to_nearest_edge_of_habitat", ("double",)),
            ("Years_since_fragmentation_or_conversion", ("double",)),
            ("Transect_details", ("char",)),
            ("Coordinates_method", ("char",)),
            ("Longitude", ("double",)),
            ("Latitude", ("double",)),
            ("Country_distance_metres", ("double",)),
            ("Country", ("char",)),
            ("UN_subregion", ("char",)),
            ("UN_region", ("char",)),
            ("Ecoregion_distance_metres", ("double",)),
            ("Ecoregion", ("char",)),
            ("Biome", ("char",)),
            ("Realm", ("char",)),
            ("Hotspot", ("char",)),
            ("Wilderness_area", ("char",)),
            ("N_samples", ("double",)),
            ("Taxon_number", ("double",)),
            ("Taxon_name_entered", ("char",)),
            ("Indication", ("char",)),
            ("Parsed_name", ("char",)),
            ("Taxon", ("char",)),
            ("COL_ID", ("double",)),
            ("Name_status", ("char",)),
            ("Rank", ("char",)),
            ("Kingdom", ("char",)),
            ("Phylum", ("char",)),
            ("Class", ("char",)),
            ("Order", ("char",)),
            ("Family", ("char",)),
            ("Genus", ("char",)),
            ("Species", ("char",)),
            ("Best_guess_binomial", ("char",)),
            ("Higher_taxa", ("char",)),
            ("Higher_taxon", ("char",)),
            ("Measurement", ("double",)),
            ("Effort_corrected_measurement", ("double",))]
        engine.table = table
        if not os.path.isfile(engine.format_filename(filename)):
            engine.download_files_from_archive(self.urls["PREDICTS"],
                                               [filename],
                                               "zip",
                                               False,
                                               "download.zip")
        engine.create_table()
        engine.insert_data_from_file(engine.format_filename(str(filename)))
Example No. 44
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        engine = self.engine

        filename = 'vertnet_latest_reptiles.csv'
        tablename = 'reptiles'

        table = Table(str(tablename), delimiter=',')
        table.columns = [
            ("record_id", ("pk-auto",)),
            ("beginrecord", ("char",)),
            ("icode", ("char",)),
            ("title", ("char",)),
            ("citation", ("char",)),
            ("contact", ("char",)),
            ("email", ("char",)),
            ("emlrights", ("char",)),
            ("gbifdatasetid", ("char",)),
            ("gbifpublisherid", ("char",)),
            ("doi", ("char",)),
            ("migrator", ("char",)),
            ("networks", ("char",)),
            ("orgcountry", ("char",)),
            ("orgname", ("char",)),
            ("orgstateprovince", ("char",)),
            ("pubdate", ("char",)),
            ("source_url", ("char",)),
            ("iptrecordid", ("char",)),
            ("associatedmedia", ("char",)),
            ("associatedoccurrences", ("char",)),
            ("associatedorganisms", ("char",)),
            ("associatedreferences", ("char",)),
            ("associatedsequences", ("char",)),
            ("associatedtaxa", ("char",)),
            ("bed", ("char",)),
            ("behavior", ("char",)),
            ("catalognumber", ("char",)),
            ("continent", ("char",)),
            ("coordinateprecision", ("char",)),
            ("coordinateuncertaintyinmeters", ("char",)),
            ("country", ("char",)),
            ("countrycode", ("char",)),
            ("county", ("char",)),
            ("dateidentified", ("char",)),
            ("day", ("char",)),
            ("decimallatitude", ("char",)),
            ("decimallongitude", ("char",)),
            ("disposition", ("char",)),
            ("earliestageorloweststage", ("char",)),
            ("earliesteonorlowesteonothem", ("char",)),
            ("earliestepochorlowestseries", ("char",)),
            ("earliesteraorlowesterathem", ("char",)),
            ("earliestperiodorlowestsystem", ("char",)),
            ("enddayofyear", ("char",)),
            ("establishmentmeans", ("char",)),
            ("eventdate", ("char",)),
            ("eventid", ("char",)),
            ("eventremarks", ("char",)),
            ("eventtime", ("char",)),
            ("fieldnotes", ("char",)),
            ("fieldnumber", ("char",)),
            ("footprintspatialfit", ("char",)),
            ("footprintsrs", ("char",)),
            ("footprintwkt", ("char",)),
            ("formation", ("char",)),
            ("geodeticdatum", ("char",)),
            ("geologicalcontextid", ("char",)),
            ("georeferencedby", ("char",)),
            ("georeferenceddate", ("char",)),
            ("georeferenceprotocol", ("char",)),
            ("georeferenceremarks", ("char",)),
            ("georeferencesources", ("char",)),
            ("georeferenceverificationstatus", ("char",)),
            ("group", ("char",)),
            ("habitat", ("char",)),
            ("highergeography", ("char",)),
            ("highergeographyid", ("char",)),
            ("highestbiostratigraphiczone", ("char",)),
            ("identificationid", ("char",)),
            ("identificationqualifier", ("char",)),
            ("identificationreferences", ("char",)),
            ("identificationremarks", ("char",)),
            ("identificationverificationstatus", ("char",)),
            ("identifiedby", ("char",)),
            ("individualcount", ("char",)),
            ("island", ("char",)),
            ("islandgroup", ("char",)),
            ("latestageorhigheststage", ("char",)),
            ("latesteonorhighesteonothem", ("char",)),
            ("latestepochorhighestseries", ("char",)),
            ("latesteraorhighesterathem", ("char",)),
            ("latestperiodorhighestsystem", ("char",)),
            ("lifestage", ("char",)),
            ("lithostratigraphicterms", ("char",)),
            ("locality", ("char",)),
            ("locationaccordingto", ("char",)),
            ("locationid", ("char",)),
            ("locationremarks", ("char",)),
            ("lowestbiostratigraphiczone", ("char",)),
            ("materialsampleid", ("char",)),
            ("maximumdepthinmeters", ("char",)),
            ("maximumdistanceabovesurfaceinmeters", ("char",)),
            ("maximumelevationinmeters", ("char",)),
            ("member", ("char",)),
            ("minimumdepthinmeters", ("char",)),
            ("minimumdistanceabovesurfaceinmeters", ("char",)),
            ("minimumelevationinmeters", ("char",)),
            ("month", ("char",)),
            ("municipality", ("char",)),
            ("occurrenceid", ("char",)),
            ("occurrenceremarks", ("char",)),
            ("occurrencestatus", ("char",)),
            ("organismid", ("char",)),
            ("organismname", ("char",)),
            ("organismremarks", ("char",)),
            ("organismscope", ("char",)),
            ("othercatalognumbers", ("char",)),
            ("pointradiusspatialfit", ("char",)),
            ("preparations", ("char",)),
            ("previousidentifications", ("char",)),
            ("recordedby", ("char",)),
            ("recordnumber", ("char",)),
            ("reproductivecondition", ("char",)),
            ("samplingeffort", ("char",)),
            ("samplingprotocol", ("char",)),
            ("sex", ("char",)),
            ("startdayofyear", ("char",)),
            ("stateprovince", ("char",)),
            ("typestatus", ("char",)),
            ("verbatimcoordinates", ("char",)),
            ("verbatimcoordinatesystem", ("char",)),
            ("verbatimdepth", ("char",)),
            ("verbatimelevation", ("char",)),
            ("verbatimeventdate", ("char",)),
            ("verbatimlatitude", ("char",)),
            ("verbatimlocality", ("char",)),
            ("verbatimlongitude", ("char",)),
            ("verbatimsrs", ("char",)),
            ("waterbody", ("char",)),
            ("year", ("char",)),
            ("dctype", ("char",)),
            ("modified", ("char",)),
            ("language", ("char",)),
            ("license", ("char",)),
            ("rightsholder", ("char",)),
            ("accessrights", ("char",)),
            ("bibliographiccitation", ("char",)),
            ("dc_references", ("char",)),
            ("institutionid", ("char",)),
            ("collectionid", ("char",)),
            ("datasetid", ("char",)),
            ("institutioncode", ("char",)),
            ("collectioncode", ("char",)),
            ("datasetname", ("char",)),
            ("ownerinstitutioncode", ("char",)),
            ("basisofrecord", ("char",)),
            ("informationwithheld", ("char",)),
            ("datageneralizations", ("char",)),
            ("dynamicproperties", ("char",)),
            ("scientificnameid", ("char",)),
            ("namepublishedinid", ("char",)),
            ("scientificname", ("char",)),
            ("acceptednameusage", ("char",)),
            ("originalnameusage", ("char",)),
            ("namepublishedin", ("char",)),
            ("namepublishedinyear", ("char",)),
            ("higherclassification", ("char",)),
            ("kingdom", ("char",)),
            ("phylum", ("char",)),
            ("class", ("char",)),
            ("order", ("char",)),
            ("family", ("char",)),
            ("genus", ("char",)),
            ("subgenus", ("char",)),
            ("specificepithet", ("char",)),
            ("infraspecificepithet", ("char",)),
            ("taxonrank", ("char",)),
            ("verbatimtaxonrank", ("char",)),
            ("scientificnameauthorship", ("char",)),
            ("vernacularname", ("char",)),
            ("nomenclaturalcode", ("char",)),
            ("taxonomicstatus", ("char",)),
            ("keyname", ("char",)),
            ("haslicense", ("int",)),
            ("vntype", ("char",)),
            ("rank", ("int",)),
            ("mappable", ("int",)),
            ("hashid", ("char",)),
            ("hastypestatus", ("int",)),
            ("wascaptive", ("int",)),
            ("wasinvasive", ("int",)),
            ("hastissue", ("int",)),
            ("hasmedia", ("int",)),
            ("isfossil", ("int",)),
            ("haslength", ("int",)),
            ("haslifestage", ("int",)),
            ("hasmass", ("int",)),
            ("hassex", ("int",)),
            ("lengthinmm", ("double",)),
            ("massing", ("double",)),
            ("lengthunitsinferred", ("char",)),
            ("massunitsinferred", ("char",)),
            ("underivedlifestage", ("char",)),
            ("underivedsex", ("char",))]

        engine.table = table
        if not os.path.isfile(engine.format_filename(filename)):
            engine.download_files_from_archive(self.urls[tablename],
                                               [filename],
                                               "zip",
                                               False,
                                               "vertnet_latest_" + str(tablename))
        engine.create_table()
        engine.insert_data_from_file(engine.format_filename(str(filename)))
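Maintaining roughly 190 column tuples by hand, as above, is error-prone when nearly every column is ("char",). A hypothetical sketch that derives the list from the CSV header itself, with an override map for the few non-text columns (the override names here are copied from the list above):

import csv

def columns_from_header(csv_path, overrides):
    """Build retriever-style column tuples from a CSV header row,
    defaulting every column to ("char",)."""
    with open(csv_path, newline='') as infile:
        header = next(csv.reader(infile))
    columns = [("record_id", ("pk-auto",))]
    columns += [(name, overrides.get(name, ("char",))) for name in header]
    return columns

# e.g. columns_from_header(engine.format_filename(filename),
#                          {"haslicense": ("int",), "rank": ("int",),
#                           "lengthinmm": ("double",), "massing": ("double",)})

This only works when the header names match the desired column names exactly, so a hand-written list is still needed wherever database-safe names diverge from the file's header.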
Example No. 45
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        engine = self.engine

        engine.download_files_from_archive(self.urls["capture"], archive_type="zip")

        # Convert xlsx to csv.
        xlsx_file = self.engine.format_filename("DSD_FI_CAPTURE.xlsx")
        file_path = self.engine.format_filename("DSD_CAPTURE.csv")
        book = xlrd.open_workbook(xlsx_file)
        sh = book.sheet_by_index(0)
        rows = sh.nrows

        # Create the converted CSV data file
        new_data = open_fw(file_path)
        csv_writer = open_csvw(new_data)
        csv_writer.writerow(["Order", "Concept_id",
                             "Role_Type", "Codelist_id",
                             "Codelist_Code_id", "Description"])

        for index in range(2, rows):
            row = sh.row(index)
            # Format each cell value; data rows start at sheet index 2.
            row_as_list = [to_str(column_value.value) for column_value in row]
            csv_writer.writerow(row_as_list)
        new_data.close()

        file_names = [
            ('CL_FI_UNIT.csv', 'unit_data'),
            ('CL_FI_WATERAREA_GROUPS.csv', 'waterarea_groups'),
            ('DSD_CAPTURE.csv', 'dsd_capture_data'),
            ('CL_FI_SPECIES_GROUPS.csv', 'species_group')
        ]

        for (filename, tablename) in file_names:
            data_path = self.engine.format_filename(filename)
            table = Table(tablename, delimiter=',', cleanup=self.cleanup_func_table)
            self.engine.auto_create_table(table, filename=filename)
            self.engine.insert_data_from_file(data_path)

        # CL_FI_COUNTRY_GROUPS.csv mixes encodings, so its columns are declared explicitly
        file_names_encoded = [
            ('CL_FI_COUNTRY_GROUPS.csv', 'country_groups'),
        ]
        for (filename, tablename) in file_names_encoded:
            data_path = self.engine.format_filename(filename)
            table = Table(tablename, delimiter=',', cleanup=self.cleanup_func_table)
            table.columns = [('UN_Code', ('int', )),
                             ('Identifier', ('int', )),
                             ('ISO2_Code', ('char', '5')),
                             ('ISO3_Code', ('char', '5')),
                             ('Name_En', ('char', '50')),
                             ('Name_Fr', ('char', '50')),
                             ('Name_Es', ('char', '50')),
                             ('Name_Ar', ('char', '120')),
                             ('Name_Cn', ('char', '90')),
                             ('Name_Ru', ('char', '150')),
                             ('Official_Name_En', ('char', '70')),
                             ('Official_Name_Fr', ('char', '70')),
                             ('Official_Name_Es', ('char', '70')),
                             ('Official_Name_Ar', ('char', '1100')),
                             ('Official_Name_Cn', ('char', '70')),
                             ('Official_Name_Ru', ('char', '130')),
                             ('Continent_Group', ('char', '15')),
                             ('EcoClass_Group', ('char', '50')),
                             ('GeoRegion_Group', ('char', '30'))]
            self.engine.auto_create_table(table, filename=filename)
            self.engine.insert_data_from_file(data_path)

        # TS_FI_CAPTURE.csv holds the capture time series itself
        file_names_encoded = [
            ('TS_FI_CAPTURE.csv', 'ts_capture_data'),
        ]
        for (filename, tablename) in file_names_encoded:
            data_path = self.engine.format_filename(filename)
            table = Table(tablename, delimiter=',', cleanup=self.cleanup_func_table)
            table.columns = [('COUNTRY', ('int', )),
                             ('FISHING_AREA', ('int', )),
                             ('SPECIES', ('char', '10')),
                             ('YEAR', ('int', )),
                             ('UNIT', ('char', '5')),
                             ('QUANTITY', ('double', )),
                             ('SYMBOL', ('char', '4'))]
            self.engine.auto_create_table(table, filename=filename)
            self.engine.insert_data_from_file(data_path)
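For reference, the xlsx-to-csv step earlier in this example can be written with just xlrd and the standard csv module, without the retriever's open_fw/open_csvw helpers. A sketch under those assumptions (note that xlrd 2.0 dropped .xlsx support, so this presumes an older xlrd, as the original code does):

import csv
import xlrd

def sheet_to_csv(xlsx_path, csv_path, header, first_data_row=2):
    """Dump the first worksheet to CSV, writing `header` first.
    first_data_row=2 mirrors the loop above, which skips two header rows."""
    book = xlrd.open_workbook(xlsx_path)
    sheet = book.sheet_by_index(0)
    with open(csv_path, "w", newline="", encoding="utf-8") as outfile:
        writer = csv.writer(outfile)
        writer.writerow(header)
        for i in range(first_data_row, sheet.nrows):
            writer.writerow([cell.value for cell in sheet.row(i)])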