Example #1
    def set_table_delimiter(self, file_path):
        """Get the delimiter from the data file and set it."""
        if os.name == "nt":
            dataset_file = open_fr(file_path)
        else:
            dataset_file = open_fr(file_path, encoding=self.encoding)
        self.auto_get_delimiter(dataset_file.readline())
        dataset_file.close()
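All of the snippets on this page revolve around the open_fr / open_fw (and occasionally open_csvw) helpers. As a rough sketch of what such wrappers typically do, assuming nothing beyond the standard library (an illustration, not the library's actual implementation):

import csv
import io


def open_fr(file_name, encoding="utf-8"):
    """Open a file for reading with an explicit encoding (sketch)."""
    return io.open(file_name, mode="r", encoding=encoding)


def open_fw(file_name, encoding="utf-8"):
    """Open a file for writing with an explicit encoding (sketch)."""
    return io.open(file_name, mode="w", encoding=encoding)


def open_csvw(csv_file):
    """Wrap an already open file object in a csv writer (sketch)."""
    return csv.writer(csv_file, escapechar="\\", lineterminator="\n")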
Example #2
def json2csv(input_file,
             output_file=None,
             header_values=None,
             encoding=ENCODING):
    """Convert Json file to CSV.

    This function is used only for testing and can handle files small enough to fit in memory.
    """
    file_out = open_fr(input_file, encoding=encoding)
    # set output file name and write header
    if output_file is None:
        output_file = os.path.splitext(
            os.path.basename(input_file))[0] + ".csv"
    csv_out = open_fw(output_file, encoding=encoding)
    if os.name == 'nt':
        outfile = csv.DictWriter(csv_out,
                                 dialect='excel',
                                 escapechar="\\",
                                 lineterminator='\n',
                                 fieldnames=header_values)
    else:
        outfile = csv.DictWriter(csv_out,
                                 dialect='excel',
                                 escapechar="\\",
                                 fieldnames=header_values)
    raw_data = json.loads(file_out.read())
    outfile.writeheader()

    for item in raw_data:
        outfile.writerow(item)
    file_out.close()
    subprocess.call(['rm', '-r', input_file])
    return output_file
Example #3
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        engine = self.engine

        for key in self.urls:
            original_file_name = "trade_prdct_{}.txt".format(key)
            new_file_name = "trade_prdct_{}.csv".format(key)

            engine.download_file(self.urls[key], original_file_name)

            old_path = self.engine.format_filename(original_file_name)
            new_path = self.engine.format_filename(new_file_name)

            # Re-write the file with a single delimiter
            old_data = open_fr(old_path)
            new_data = open_fw(new_path)

            # Read header line and convert "," to "|"
            line1 = old_data.readline().strip().replace(",", "|")
            new_data.write(line1 + "\n")
            for line in old_data:
                # Remove leading "|" from the data
                new_data.write(line.strip("|"))
            new_data.close()
            old_data.close()
            table = Table(key, delimiter="|")
            engine.auto_create_table(table, filename=new_file_name)
            engine.insert_data_from_file(new_path)
Example #5
def xml2csv_test(input_file,
                 outputfile=None,
                 header_values=None,
                 row_tag="row"):
    """Convert xml to csv.

    This function is used only for testing and can handle files small enough to fit in memory.
    """
    file_output = open_fr(input_file, encoding=ENCODING)
    # set output file name and write header
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(input_file))[0] + ".csv"
    csv_out = open_fw(outputfile)
    if os.name == 'nt':
        csv_writer = csv.writer(csv_out,
                                dialect='excel',
                                escapechar='\\',
                                lineterminator='\n')
    else:
        csv_writer = csv.writer(csv_out, dialect='excel', escapechar='\\')

    v = file_output.read()
    csv_writer.writerow(header_values)
    tree = ET.parse(NewFile(v))
    root = tree.getroot()
    for rows in root.findall(row_tag):
        x = [
            col.text for name in header_values for col in rows.findall(name)
        ]
        csv_writer.writerow(x)
    file_output.close()
    subprocess.call(['rm', '-r', input_file])
    return outputfile
Example #6
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)

        self.engine.download_file(
            self.urls["main"], "Succession_sampling_03-07_data_original.txt")
        data_path = self.engine.format_filename(
            "Succession_sampling_03-07_data.txt")
        old_data = open_fr(
            self.engine.find_file(
                "Succession_sampling_03-07_data_original.txt"))
        new_data = open_fw(data_path)
        # The original file's header contains an end-of-line character
        # in the middle, splitting it across two lines.
        # Read in the two lines and create the full header
        line1 = old_data.readline().strip()
        line2 = old_data.readline()
        newline = line1 + "\t" + line2
        new_data.write(newline)
        for line in old_data:
            new_data.write(line)
        new_data.close()
        old_data.close()

        self.engine.auto_create_table(
            self.tables["main"], filename="Succession_sampling_03-07_data.txt")
        self.engine.insert_data_from_file(data_path)
Example #7
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        for key in self.urls:
            self.engine.download_file(self.urls[key],
                                      self.urls[key].rpartition('/')[-1])
            new_file_path = self.engine.format_filename("new" + key)
            old_data = open_fr(
                self.engine.find_file(self.urls[key].rpartition('/')[-1]))
            new_data = open_fw(new_file_path)
            with old_data as file_block:

                # after the metadata lines, set data to True
                data = False
                for lines in file_block.readlines():
                    # metadata lines contain no ";" and may have "(;;;;)+" or empty lines
                    if not data and (";" not in lines or ";;;;" in lines):
                        pass
                    else:
                        data = True
                        new_data.write(lines)
            file_block.close()
            new_data.close()
            self.engine.auto_create_table(Table(
                key, cleanup=self.cleanup_func_table),
                                          filename=str("new" + key))
            self.engine.insert_data_from_file(new_file_path)
Example #8
def read_json(json_file, debug=False):
    """Read Json dataset package files"""
    json_object = OrderedDict()
    json_file = str(json_file) + ".json"

    try:
        json_object = json.load(open_fr(json_file))
    except ValueError:
        pass
    if isinstance(json_object, dict) and "resources" in json_object.keys():

        # Note: formats described by frictionless data may need to change
        tabular_formats = ["csv", "tab"]
        vector_formats = ["shp", "kmz"]
        raster_formats = ["tif", "tiff", "bil", "hdr", "h5", "hdf5", "hr", "image"]

        for resource_item in json_object["resources"]:
            if "format" in resource_item:
                if resource_item["format"] in tabular_formats:
                    resource_item["format"] = "tabular"
                elif resource_item["format"] in vector_formats:
                    resource_item["format"] = "vector"
                elif resource_item["format"] in raster_formats:
                    resource_item["format"] = "raster"
            else:
                resource_item["format"] = "tabular"

            # Check for required resource fields
            spec_list = ["name", "url"]

            rspec = set(spec_list)
            if not rspec.issubset(resource_item.keys()):
                raise ValueError("One of the required attributes is missing from the {} dataset script.\
                    Make sure it has all of the following attributes: {}".format(json_file, rspec))

            for spec in spec_list:
                if not resource_item[spec]:
                    raise ValueError("Check either {} for missing values.\n Package {}".format(rspec, json_file))

        json_object["tables"] = {}
        temp_tables = {}
        table_names = [item["name"] for item in json_object["resources"]]
        temp_tables["tables"] = dict(zip(table_names, json_object["resources"]))

        for table_name, table_spec in temp_tables["tables"].items():
            json_object["tables"][table_name] = myTables[temp_tables["tables"][table_name]["format"]](**table_spec)
        json_object.pop("resources", None)
        json_object["urls"] = {table: json_object["tables"][table].url for table in json_object["tables"]}

        return TEMPLATES["default"](**json_object)
    return None
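For reference, a minimal dataset package that would pass the required-field check above could look like the following; the dataset name and URL are made up for illustration:

package = {
    "name": "example_dataset",
    "resources": [
        {
            # "name" and "url" are the required attributes checked above
            "name": "plots",
            "url": "http://example.com/plots.csv",
            # "csv" is mapped to "tabular" by read_json
            "format": "csv",
        },
    ],
}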
Example #9
    def load_data(self, filename):
        """Generator returning lists of values from lines in a data file.

        1. Works on both delimited (csv module)
        and fixed width data (extract_fixed_width)
        2. Identifies the delimiter if not known
        3. Removes extra line endings

        """
        if not self.table.delimiter:
            self.set_table_delimiter(filename)
        if os.name == "nt":
            dataset_file = open_fr(filename)
        else:
            dataset_file = open_fr(filename, encoding=self.encoding)
        if self.table.fixed_width:
            for row in dataset_file:
                yield self.extract_fixed_width(row)
        else:
            reg = re.compile("\\r\\n|\n|\r")
            for row in csv.reader(dataset_file,
                                  delimiter=self.table.delimiter):
                yield [reg.sub(" ", values) for values in row]
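A hypothetical call site for this generator, assuming an engine whose table has already been configured ("plots.csv" is an illustrative file name):

for row in engine.load_data(engine.find_file("plots.csv")):
    print(row)  # each row is a list of string values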
Example #10
def sort_csv(filename, encoding=ENCODING):
    """Sort CSV rows minus the header and return the file.

    This function is used only for testing and can handle files small enough to fit in memory.
    """
    filename = os.path.normpath(filename)
    input_file = open_fr(filename, encoding)
    csv_reader_infile = csv.reader(input_file, escapechar="\\")
    #  write the data to a temporary file and sort it
    temp_path = os.path.normpath("tempfile")
    temp_file = open_fw(temp_path, encoding)

    csv_writer = open_csvw(temp_file)
    i = 0
    infields = None
    for row in csv_reader_infile:
        if i == 0:
            # The first entry is the header line
            infields = row
            i += 1
        else:
            csv_writer.writerow(row)
    input_file.close()
    temp_file.close()

    # sort the temp file
    sorted_txt = sort_file(temp_path, encoding)
    tmp = open_fr(sorted_txt, encoding)
    in_txt = csv.reader(tmp, delimiter=',', escapechar="\\")
    csv_file = open_fw(filename, encoding)
    csv_writer = open_csvw(csv_file)
    csv_writer.writerow(infields)
    csv_writer.writerows(in_txt)
    tmp.close()
    csv_file.close()
    os.remove(os.path.normpath(temp_path))
    return filename
Example #11
def sort_file(file_path, encoding=ENCODING):
    """Sort file by line and return the file.

    This function is used only for testing and can handle files small enough to fit in memory.
    """
    file_path = os.path.normpath(file_path)
    input_file = open_fr(file_path, encoding)
    lines = [line.strip() for line in input_file]
    input_file.close()
    outfile = open_fw(file_path, encoding)
    lines.sort()
    for line in lines:
        outfile.write(line + "\n")
    outfile.close()
    return file_path
Example #12
    def disconnect(self):
        """Close out the xml files

        Close all the file objects that have been created
        Re-write the files stripping off the last comma and then close with a closing tag)
        """
        if self.table_names:
            for output_file_i, file_name in self.table_names:
                output_file_i.close()
                current_input_file = open_fr(file_name)
                file_contents = current_input_file.readlines()
                current_input_file.close()
                file_contents[-1] = file_contents[-1].strip(',')
                current_output_file = open_fw(file_name)
                current_output_file.writelines(file_contents)
                current_output_file.write(u'\n</root>')
                current_output_file.close()
            self.table_names = []
Example #13
    def disconnect(self):
        """Close out the JSON with a `\\n]}` and close the file.

        Close all the file objects that have been created
        Re-write the files stripping off the last comma and then close with a `\\n]}`.
        """
        if self.table_names:
            for output_file_i, file_name in self.table_names:
                output_file_i.close()
                current_input_file = open_fr(file_name)
                file_contents = current_input_file.readlines()
                current_input_file.close()
                file_contents[-1] = file_contents[-1].strip(',\n')
                current_output_file = open_fw(file_name)
                current_output_file.writelines(file_contents)
                current_output_file.writelines(['\n]'])
                current_output_file.close()
            self.table_names = []
Example #15
def convert_to_csv(dir_name):
    """Change the file delimiter to comma delimiter"""
    for file_name in os.listdir(dir_name):
        file_path = os.path.join(dir_name, file_name)
        if file_path.endswith(".txt"):
            csv_file_name = file_name.replace(".txt", ".csv").lower()
            output_file = os.path.join(dir_name, csv_file_name)
            with open_fr(file_path,
                         encoding="latin-1") as read_object, open_fw(
                             output_file) as outputfw:
                fr = csv.reader(read_object, delimiter="^", quotechar="~")
                fw = csv.writer(outputfw,
                                delimiter=",",
                                quoting=csv.QUOTE_MINIMAL)
                for line in fr:
                    if line:
                        fw.writerow(line)
            # delete the text files
            os.remove(file_path)
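A hypothetical invocation, where "raw_data" stands in for whatever directory the caret-delimited .txt files were downloaded to:

convert_to_csv("raw_data")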
Example #16
    def load_data(self, filename):
        """Generator returning lists of values from lines in a data file.

        1. Works on both delimited (csv module)
        and fixed width data (extract_fixed_width)
        2. Identifies the delimiter if not known
        3. Removes extra line endings

        """
        if not self.table.delimiter:
            self.set_table_delimiter(filename)

        dataset_file = open_fr(filename)

        if self.table.fixed_width:
            for row in dataset_file:
                yield self.extract_fixed_width(row)
        else:
            reg = re.compile("\\r\\n|\n|\r")
            for row in csv.reader(dataset_file, delimiter=self.table.delimiter):
                yield [reg.sub(" ", values) for values in row]
Example #17
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        original_data = "Succession_sampling_03-07_data_original.txt"
        new_file_name = "Succession_sampling_03-07_data.txt"
        self.engine.download_file(self.urls["main"], original_data)
        data_path = self.engine.format_filename(new_file_name)
        old_data = open_fr(self.engine.find_file(original_data))
        new_data = open_fw(data_path)
        # The original file's header contains an end-of-line character
        # in the middle, splitting it across two lines.
        # Read in the two lines and create the full header
        line1 = old_data.readline().strip()
        line2 = old_data.readline()
        newline = line1 + "\t" + line2
        new_data.write(newline)
        for line in old_data:
            new_data.write(line)
        new_data.close()
        old_data.close()

        self.engine.auto_create_table(self.tables["main"], filename=new_file_name)
        self.engine.insert_data_from_file(data_path)
Example #18
def json2csv(input_file,
             output_file=None,
             header_values=None,
             encoding=ENCODING,
             row_key=None):
    """Convert Json file to CSV."""
    file_out = open_fr(input_file, encoding=encoding)
    # set output file name and write header
    if output_file is None:
        output_file = os.path.splitext(
            os.path.basename(input_file))[0] + ".csv"
    csv_out = open_fw(output_file, encoding=encoding)
    if os.name == 'nt':
        outfile = csv.writer(csv_out,
                             dialect='excel',
                             escapechar="\\",
                             lineterminator='\n')
    else:
        outfile = csv.writer(csv_out, dialect='excel', escapechar="\\")

    raw_data = json.loads(file_out.read(), object_pairs_hook=OrderedDict)

    raw_data, header_values = walker(raw_data,
                                     row_key=row_key,
                                     header_values=header_values,
                                     rows=[],
                                     normalize=False)

    if isinstance(raw_data[0], dict):
        # row values are in a list of dictionaries
        raw_data = [list(row.values()) for row in raw_data]
    else:
        raw_data = [row.tolist() for row in raw_data]
    if header_values:
        outfile.writerow(header_values)
    outfile.writerows(raw_data)
    file_out.close()
    subprocess.call(['rm', '-r', input_file])
    return output_file
Example #19
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        for key in self.urls:
            self.engine.download_file(self.urls[key], self.urls[key].rpartition('/')[-1])
            new_file_path = self.engine.format_filename("new" + key)
            old_data = open_fr(self.engine.find_file(self.urls[key].rpartition('/')[-1]))
            new_data = open_fw(new_file_path)
            with old_data as file_block:

                # after the metadata lines, set data to True
                data = False
                for lines in file_block.readlines():
                    # metadata lines contain no ";" and may have "(;;;;)+" or empty lines
                    if not data and (";" not in lines or ";;;;" in lines):
                        pass
                    else:
                        data = True
                        new_data.write(lines)
            file_block.close()
            new_data.close()
            self.engine.auto_create_table(Table(key,
                                                cleanup=self.cleanup_func_table), filename=str("new" + key))
            self.engine.insert_data_from_file(new_file_path)
Example #20
    def set_table_delimiter(self, file_path):
        """Get the delimiter from the data file and set it."""
        dataset_file = open_fr(file_path)
        self.auto_get_delimiter(dataset_file.readline())
        dataset_file.close()
Example #21
    def download(self, engine=None, debug=False):
        try:
            Script.download(self, engine, debug)

            engine = self.engine

            # Species table
            table = Table("species",
                          cleanup=Cleanup(),
                          contains_pk=True,
                          header_rows=9)

            table.columns = [
                ("species_id", ("pk-int", )),
                ("AOU", ("int", )),
                ("english_common_name", ("char", 50)),
                ("french_common_name", ("char", 50)),
                ("spanish_common_name", ("char", 50)),
                ("sporder", ("char", 30)),
                ("family", ("char", 30)),
                ("genus", ("char", 30)),
                ("species", ("char", 50)),
            ]
            table.fixed_width = [7, 6, 51, 51, 51, 51, 51, 51, 50]

            engine.table = table
            engine.create_table()
            engine.insert_data_from_url(self.urls["species"])

            # Routes table
            engine.download_files_from_archive(self.urls["routes"],
                                               ["routes.csv"])
            engine.auto_create_table(Table("routes", cleanup=Cleanup()),
                                     filename="routes.csv")
            engine.insert_data_from_file(engine.format_filename("routes.csv"))

            # Weather table
            if not os.path.isfile(engine.format_filename("weather_new.csv")):
                engine.download_files_from_archive(self.urls["weather"],
                                                   ["weather.csv"])
                read = open_fr(engine.format_filename("weather.csv"))
                write = open_fw(engine.format_filename("weather_new.csv"))
                print("Cleaning weather data...")
                for line in read:
                    values = line.split(',')
                    newvalues = []
                    for value in values:

                        if ':' in value:
                            newvalues.append(value.replace(':', ''))
                        elif value == "N":
                            newvalues.append(None)
                        else:
                            newvalues.append(value)
                    write.write(','.join(str(value) for value in newvalues))
                write.close()
                read.close()

            engine.auto_create_table(Table("weather",
                                           pk="RouteDataId",
                                           cleanup=self.cleanup_func_table),
                                     filename="weather_new.csv")
            engine.insert_data_from_file(
                engine.format_filename("weather_new.csv"))

            # Region_codes table
            table = Table("region_codes",
                          pk=False,
                          header_rows=11,
                          fixed_width=[11, 11, 30])

            def regioncodes_cleanup(value, engine):
                replace = {
                    chr(225): "a",
                    chr(233): "e",
                    chr(237): "i",
                    chr(243): "o"
                }
                newvalue = str(value)
                for key in list(replace.keys()):
                    if key in newvalue:
                        newvalue = newvalue.replace(key, replace[key])
                return newvalue

            table.cleanup = Cleanup(regioncodes_cleanup)

            table.columns = [("countrynum", ("int", )),
                             ("regioncode", ("int", )),
                             ("regionname", ("char", 30))]

            engine.table = table
            engine.create_table()

            engine.insert_data_from_url(self.urls["region_codes"])

            # Counts table
            table = Table("counts", delimiter=',')

            table.columns = [("record_id", ("pk-auto", )),
                             ("countrynum", ("int", )),
                             ("statenum", ("int", )), ("Route", ("int", )),
                             ("RPID", ("int", )), ("Year", ("int", )),
                             ("Aou", ("int", )), ("Count10", ("int", )),
                             ("Count20", ("int", )), ("Count30", ("int", )),
                             ("Count40", ("int", )), ("Count50", ("int", )),
                             ("StopTotal", ("int", )),
                             ("SpeciesTotal", ("int", ))]

            stateslist = [
                "Alabama", "Alaska", "Arizona", "Arkansas", "California",
                "Colorado", "Connecticut", "Delaware", "Florida", "Georgia",
                "Idaho", "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky",
                "Louisiana", "Maine", "Maryland", "Massachusetts", "Michigan",
                "Minnesota", "Mississippi", "Missouri", "Montana", "Nebraska",
                "Nevada", ["New Hampshire", "NHampsh"],
                ["New Jersey", "NJersey"], ["New Mexico", "NMexico"],
                ["New York", "NYork"], ["North Carolina", "NCaroli"],
                ["North Dakota",
                 "NDakota"], "Ohio", "Oklahoma", "Oregon", "Pennsylvania",
                ["Rhode Island", "RhodeIs"], ["South Carolina", "SCaroli"],
                ["South Dakota", "SDakota"], "Tennessee", "Texas", "Utah",
                "Vermont", "Virginia", "Washington",
                ["West Virginia",
                 "W_Virgi"], "Wisconsin", "Wyoming", "Alberta",
                ["British Columbia", "BritCol"], "Manitoba",
                ["New Brunswick", "NBrunsw"],
                ["Northwest Territories", "NWTerri"], "Newfoundland",
                ["Nova Scotia", "NovaSco"], "Nunavut", "Ontario",
                ["Prince Edward Island",
                 "PEI"], "Quebec", "Saskatchewan", "Yukon"
            ]

            state = ""
            shortstate = ""

            engine.table = table
            engine.create_table()

            for state in stateslist:
                try:
                    if len(state) > 2:
                        shortstate = state[0:7]
                    else:
                        state, shortstate = state[0], state[1]

                    print("Inserting data from " + state + "...")
                    try:
                        engine.table.cleanup = Cleanup()
                        engine.insert_data_from_archive(
                            self.urls["counts"] + shortstate + ".zip",
                            [shortstate + ".csv"])
                    except:
                        print("Failed bulk insert on " + state +
                              ", inserting manually.")
                        engine.connection.rollback()
                        engine.table.cleanup = self.cleanup_func_clean
                        engine.insert_data_from_archive(
                            self.urls["counts"] + shortstate + ".zip",
                            [shortstate + ".csv"])

                except:
                    print("There was an error in " + state + ".")
                    raise

        except zipfile.BadZipfile:
            print(
                "There was an unexpected error in the Breeding Bird Survey archives."
            )
            raise

        return engine
Example #22
    def set_table_delimiter(self, file_path):
        dataset_file = open_fr(file_path)
        self.auto_get_delimiter(dataset_file.readline())
        dataset_file.close()
Example #23
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        engine = self.engine

        # Complete Plants Checklist
        file_name = "complete_plant_checklist.csv"
        table_name = "complete_plant_checklist"
        complete_plant_url = "https://plants.sc.egov.usda.gov/java/downloadData?fileName=plantlst.txt&static=true"
        self.engine.download_file(complete_plant_url, filename=file_name)
        data_path = self.engine.format_filename(file_name)
        table = Table(table_name, delimiter=",")
        table.columns = [
            ("symbol", ("char", "7")),
            ("synonym_symbol", ("char", "7")),
            ("scientific_name_with_author", ("char", "183")),
            ("common_name", ("char", "42")),
            ("family", ("char", "30")),
        ]
        self.engine.auto_create_table(table, filename=file_name)
        self.engine.insert_data_from_file(data_path)

        # Symbols for Unknown Plants
        file_name = "symbols_unknown_plants.csv"
        table_name = "unknown_plants"
        unknown_plants_url = "https://plants.sc.egov.usda.gov/Data/unknown_plants.txt"
        self.engine.download_file(unknown_plants_url, filename=file_name)
        data_path = self.engine.format_filename(file_name)
        table = Table(table_name, delimiter=",")
        table.columns = [("symbol", ("char", "7")),
                         ("common_name", ("char", "56"))]
        self.engine.auto_create_table(table, filename=file_name)
        self.engine.insert_data_from_file(data_path)

        # State PLANTS Checklist
        base_url = "https://plants.sc.egov.usda.gov/"
        state_plant_checklist_base_url = "{base}java/stateDownload?statefips={id}"
        state_plant_checklist_file = "all_state_plant_checklist.csv"
        table_name = "state_plant_checklist"
        state_plant_checklist = [
            ("US01", "Alabama", "US"),
            ("US02", "Alaska", "US"),
            ("US05", "Arkansas", "US"),
            ("US04", "Arizona", "US"),
            ("US06", "California", "US"),
            ("US08", "Colorado", "US"),
            ("US09", "Connecticut", "US"),
            ("US10", "Delaware", "US"),
            ("US11", "District of Columbia", "US"),
            ("US12", "Florida", "US"),
            ("US13", "Georgia", "US"),
            ("US15", "Hawaii", "US"),
            ("US16", "Idaho", "US"),
            ("US17", "Illinois", "US"),
            ("US18", "Indiana", "US"),
            ("US19", "Iowa", "US"),
            ("US20", "Kansas", "US"),
            ("US21", "Kentucky", "US"),
            ("US22", "Louisiana", "US"),
            ("US23", "Maine", "US"),
            ("US24", "Maryland", "US"),
            ("US25", "Massachusetts", "US"),
            ("US26", "Michigan", "US"),
            ("US27", "Minnesota", "US"),
            ("US28", "Mississippi", "US"),
            ("US29", "Missouri", "US"),
            ("US30", "Montana", "US"),
            ("US31", "Nebraska", "US"),
            ("US32", "Nevada", "US"),
            ("US33", "New Hampshire", "US"),
            ("US34", "New Jersey", "US"),
            ("US35", "New Mexico", "US"),
            ("US36", "New York", "US"),
            ("US37", "North Carolina", "US"),
            ("US38", "North Dakota", "US"),
            ("US39", "Ohio", "US"),
            ("US40", "Oklahoma", "US"),
            ("US41", "Oregon", "US"),
            ("US42", "Pennsylvania", "US"),
            ("US44", "Rhode Island", "US"),
            ("US45", "South Carolina", "US"),
            ("US46", "South Dakota", "US"),
            ("US47", "Tennessee", "US"),
            ("US48", "Texas", "US"),
            ("US49", "Utah", "US"),
            ("US50", "Vermont", "US"),
            ("US51", "Virginia", "US"),
            ("US53", "Washington", "US"),
            ("US54", "West Virginia", "US"),
            ("US55", "Wisconsin", "US"),
            ("US56", "Wyoming", "US"),
            ("US72", "Puerto Rico", "US"),
            ("US78", "Virgin Islands", "US"),
            ("CA01", "Alberta", "Canada"),
            ("CA02", "British Columbia", "Canada"),
            ("CA03", "Manitoba", "Canada"),
            ("CA04", "New Brunswick", "Canada"),
            ("CALB", "Labrador", "Canada"),
            ("CANF", "Newfoundland", "Canada"),
            ("CA13", "Northwest Territories", "Canada"),
            ("CA07", "Nova Scotia", "Canada"),
            ("CA14", "Nunavut", "Canada"),
            ("CA08", "Ontario", "Canada"),
            ("CA09", "Prince Edward Island", "Canada"),
            ("CA10", "Québec", "Canada"),
            ("CA11", "Saskatchewan", "Canada"),
            ("CA12", "Yukon", "Canada"),
            ("GL", "Greenland", "Denmark"),
            ("SB", "St. Pierre and Miquelon", "France"),
        ]

        with open_fw(engine.format_filename(
                state_plant_checklist_file)) as write_object:
            csv_writer = open_csvw(write_object)
            for state_info in state_plant_checklist:
                file_name = state_info[1].replace(".", "").replace(
                    " ", "_").lower() + ".csv"
                file_name = "old_state_plant_checklist_" + file_name
                state_url = state_plant_checklist_base_url.format(
                    base=base_url, id=state_info[0])
                self.engine.download_file(state_url, filename=file_name)
                with open_fr(engine.format_filename(file_name)) as read_object:
                    # Read state file and only write the data minus header
                    next(read_object)
                    for row in csv.reader(read_object, delimiter=","):
                        csv_writer.writerow([state_info[2]] + [state_info[1]] +
                                            row)

        data_path = self.engine.format_filename(state_plant_checklist_file)
        table = Table(table_name, delimiter=",", header_rows=0)
        table.columns = [
            ("country", ("char", "7")),
            ("state", ("char", "23")),
            ("symbol", ("char", "7")),
            ("synonym_symbol", ("char", "7")),
            ("scientific_name_with_author", ("char", "183")),
            ("national_common_name", ("char", "42")),
            ("family", ("char", "17")),
        ]
        self.engine.auto_create_table(table,
                                      filename=state_plant_checklist_file)
        self.engine.insert_data_from_file(data_path)

        # NRCS State GSAT Lists
        base_url = "https://www.plants.usda.gov/"
        nrcs_state_gsat_base_url = "{base}java/gsatDownload?gsatid={id}"
        nrcs_state_gsat_file = "all_nrcs_state_gsat.csv"
        table_name = "nrcs_state_gsat"
        nrcs_state_gsat = [
            ("Alabama", "2"),
            ("Alaska", ""),
            ("Arkansas", ""),
            ("Arizona", "2"),
            ("California", ""),
            ("Colorado", ""),
            ("Connecticut", ""),
            ("Delaware", ""),
            ("Florida", ""),
            ("Georgia", ""),
            ("Hawaii", ""),
            ("Idaho", "9"),
            ("Illinois", ""),
            ("Indiana", ""),
            ("Iowa ", ""),
            ("Kansas", "6"),
            ("Kentucky", ""),
            ("Louisiana", "16"),
            ("Maine", ""),
            ("Maryland", ""),
            ("Massachusetts", ""),
            ("Michigan", ""),
            ("Minnesota", "11"),
            ("Mississippi", ""),
            ("Missouri", "14"),
            ("Montana", ""),
            ("Nebraska", "17"),
            ("Nevada", "4"),
            ("New Hampshire", ""),
            ("New Jersey ", ""),
            ("New Mexico", "1"),
            ("New York", ""),
            ("Noth Carolina", ""),
            ("North Dakota", "5"),
            ("Ohio", ""),
            ("Oklahoma", "12"),
            ("Oregon", "3"),
            ("Pennsylvania", "15"),
            ("Rhode Island", ""),
            ("South Carolina", ""),
            ("South Dakota", "7"),
            ("Tennessee", ""),
            ("Texas", "13"),
            ("Utah", ""),
            ("Vermont ", ""),
            ("Virginia", ""),
            ("Washington", "8"),
            ("West Virginia", ""),
            ("Wisconsin", ""),
            ("Wyoming", "10"),
        ]

        with open_fw(
                engine.format_filename(nrcs_state_gsat_file)) as write_object:
            for state_info in nrcs_state_gsat:
                if state_info[1]:
                    # skip states with no data ("state", ""),
                    file_name = state_info[0].replace(" ", "_").replace(
                        ".", "").lower() + ".csv"
                    file_name = "old_nrcs_state_gsat_" + file_name
                    state_url = nrcs_state_gsat_base_url.format(
                        base=base_url, id=state_info[1])
                    self.engine.download_file(state_url, filename=file_name)
                    with open_fr(
                            engine.format_filename(file_name)) as read_object:
                        # Read state file and only write the data minus header
                        next(read_object)
                        state_quoted = '"{state}",'.format(state=state_info[0])
                        for line in read_object:
                            write_object.write(state_quoted + line)

        data_path = self.engine.format_filename(nrcs_state_gsat_file)
        table = Table(table_name, delimiter=",", header_rows=0)
        table.columns = [
            ("state", ("char", "12")),
            ("symbol", ("char", "7")),
            ("scientific_name_with_author", ("char", "183")),
            ("gsat_common_name", ("char", "93")),
        ]
        self.engine.auto_create_table(table, filename=nrcs_state_gsat_file)
        self.engine.insert_data_from_file(data_path)

        base_url = "https://plants.sc.egov.usda.gov/"
        nrcs_state_plant_lists_url = "{base}java/nrcsStateDownload?statefips={id}"
        nrcs_state_plant_file = "all_nrcs_state_plant.csv"
        table_name = "nrcs_state_plant"
        nrcs_state_plant_lists = [
            ("01", "Alabama"),
            ("02", "Alaska"),
            ("05", "Arkansas"),
            ("04", "Arizona"),
            ("06", "California"),
            ("08", "Colorado"),
            ("09", "Connecticut"),
            ("10", "Delaware"),
            ("12", "Florida"),
            ("13", "Georgia"),
            ("15", "Hawaii"),
            ("16", "Idaho"),
            ("17", "Illinois"),
            ("18", "Indiana"),
            ("19", "Iowa"),
            ("20", "Kansas"),
            ("21", "Kentucky"),
            ("22", "Louisiana"),
            ("23", "Maine"),
            ("24", "Maryland"),
            ("25", "Massachusetts"),
            ("26", "Michigan"),
            ("27", "Minnesota"),
            ("28", "Mississippi"),
            ("29", "Missouri"),
            ("30", "Montana"),
            ("31", "Nebraska"),
            ("32", "Nevada"),
            ("33", "New Hampshire"),
            ("34", "New Jersey"),
            ("35", "New Mexico"),
            ("36", "New York"),
            ("37", "North Carolina"),
            ("38", "North Dakota"),
            ("39", "Ohio"),
            ("40", "Oklahoma"),
            ("41", "Oregon"),
            ("42", "Pennsylvania"),
            ("44", "Rhode Island"),
            ("45", "South Carolina"),
            ("46", "South Dakota"),
            ("47", "Tennessee"),
            ("48", "Texas"),
            ("49", "Utah"),
            ("50", "Vermont"),
            ("51", "Virginia"),
            ("53", "Washington"),
            ("54", "West Virginia"),
            ("55", "Wisconsin"),
            ("56", "Wyoming"),
            ("72", "Puerto Rico"),
            ("78", "Virgin Islands"),
        ]

        with open_fw(
                engine.format_filename(nrcs_state_plant_file)) as write_object:
            for state_info in nrcs_state_plant_lists:
                file_name = state_info[1].replace(" ", "_").replace(
                    ".", "").lower() + ".csv"
                file_name = "old_nrcs_state_plant_" + file_name
                state_url = nrcs_state_plant_lists_url.format(base=base_url,
                                                              id=state_info[0])
                self.engine.download_file(state_url, filename=file_name)
                with open_fr(engine.format_filename(file_name)) as read_object:
                    # Read state file and only write the data minus header
                    next(read_object)
                    state_quoted = '"{state}",'.format(state=state_info[1])
                    for line in read_object:
                        write_object.write(state_quoted + line)

        data_path = self.engine.format_filename(nrcs_state_plant_file)
        table = Table(table_name, delimiter=",", header_rows=0)
        table.columns = [
            ("state", ("char", "17")),
            ("symbol", ("char", "7")),
            ("synonym_symbol", ("char", "7")),
            ("scientific_name_with_author", ("char", "183")),
            ("state_common_name", ("char", "42")),
            ("family", ("char", "17")),
        ]
        self.engine.auto_create_table(table, filename=nrcs_state_plant_file)
        self.engine.insert_data_from_file(data_path)
Example #24
    def set_table_delimiter(self, file_path):
        dataset_file = open_fr(file_path)
        self.auto_get_delimiter(dataset_file.readline())
        dataset_file.close()
Example #25
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        engine = self.engine

        # download and create species table
        table = Table('species')
        self.engine.auto_create_table(table, url=self.urls['species'])
        self.engine.insert_data_from_url(self.urls['species'])

        # State abbreviations with the year annual inventory began for that state
        stateslist = [('AL', 2001), ('AK', 2004), ('AZ', 2001), ('AR', 2000),
                      ('CA', 2001), ('CO', 2002), ('CT', 2003), ('DE', 2004),
                      ('FL', 2003), ('GA', 1998), ('ID', 2004), ('IL', 2001),
                      ('IN', 1999), ('IA', 1999), ('KS', 2001), ('KY', 1999),
                      ('LA', 2001), ('ME', 1999), ('MD', 2004), ('MA', 2003),
                      ('MI', 2000), ('MN', 1999), ('MO', 1999), ('MS', 2006),
                      ('MT', 2003), ('NE', 2001), ('NV', 2004), ('NH', 2002),
                      ('NJ', 2004), ('NM', 1999), ('NY', 2002), ('NC', 2003),
                      ('ND', 2001), ('OH', 2001), ('OK', 2008), ('OR', 2001),
                      ('PA', 2000), ('RI', 2003), ('SC', 1999), ('SD', 2001),
                      ('TN', 2000), ('TX', 2001), ('UT', 2000), ('VT', 2003),
                      ('VA', 1998), ('WA', 2002), ('WV', 2004), ('WI', 2000),
                      ('WY', 2000), ('PR', 2001)]

        tablelist = [
            "SURVEY", "PLOT", "COND", "SUBPLOT", "SUBP_COND", "TREE",
            "SEEDLING"
        ]

        for table in tablelist:
            for state, year in stateslist:

                engine.download_files_from_archive(
                    self.urls["main"] + state + "_" + table + ".ZIP",
                    [state + "_" + table + ".csv"])

        for table in tablelist:
            print("Scanning data for table %s..." % table)
            prep_file_name = "%s.csv" % table
            prep_file = open_fw(engine.format_filename(prep_file_name))
            this_file = open_fr(
                engine.format_filename(stateslist[0][0] + "_" + table +
                                       ".csv"))
            col_names = this_file.readline()
            prep_file.write(col_names)
            column_names = [col.strip('"') for col in col_names.split(',')]
            year_column = column_names.index("INVYR")
            this_file.close()

            for state, year in stateslist:
                this_file = open_fr(
                    engine.format_filename(state + "_" + table + ".csv"))
                this_file.readline()
                for line in this_file:
                    values = line.split(',')
                    this_year = values[year_column]
                    if int(this_year) >= year:
                        prep_file.write(line)
            prep_file.close()
            engine.auto_create_table(Table(table), filename=prep_file_name)
            engine.insert_data_from_file(
                engine.format_filename(prep_file_name))
            try:
                os.remove(engine.format_filename(prep_file_name))
            except:
                pass
        return engine
Example #26
    def download(self, engine=None, debug=False):
        try:
            Script.download(self, engine, debug)

            engine = self.engine

            # Species table
            table = Table("species",
                          cleanup=Cleanup(),
                          contains_pk=True,
                          header_rows=9)

            table.columns = [
                ("species_id", ("pk-int", )),
                ("AOU", ("int", )),
                ("english_common_name", ("char", 50)),
                ("french_common_name", ("char", 50)),
                ("spanish_common_name", ("char", 50)),
                ("sporder", ("char", 30)),
                ("family", ("char", 30)),
                ("genus", ("char", 30)),
                ("species", ("char", 50)),
            ]
            table.fixed_width = [7, 6, 51, 51, 51, 51, 51, 51, 50]

            engine.table = table
            engine.create_table()
            engine.insert_data_from_url(self.urls["species"])

            # Routes table
            engine.download_files_from_archive(self.urls["routes"],
                                               ["routes.csv"])
            engine.auto_create_table(Table("routes", cleanup=Cleanup()),
                                     filename="routes.csv")
            engine.insert_data_from_file(engine.format_filename("routes.csv"))

            # Weather table
            if not os.path.isfile(engine.format_filename("weather_new.csv")):
                engine.download_files_from_archive(self.urls["weather"],
                                                   ["weather.csv"])
                read = open_fr(engine.format_filename("weather.csv"))
                write = open_fw(engine.format_filename("weather_new.csv"))
                print("Cleaning weather data...")
                for line in read:
                    values = line.split(',')
                    newvalues = []
                    for value in values:

                        if ':' in value:
                            newvalues.append(value.replace(':', ''))
                        elif value == "N":
                            newvalues.append(None)
                        else:
                            newvalues.append(value)
                    write.write(','.join(str(value) for value in newvalues))
                write.close()
                read.close()
            engine.auto_create_table(Table("weather",
                                           pk="RouteDataId",
                                           cleanup=self.cleanup_func_table),
                                     filename="weather_new.csv")
            engine.insert_data_from_file(
                engine.format_filename("weather_new.csv"))

            # Region_codes table
            table = Table("region_codes",
                          pk=False,
                          header_rows=11,
                          fixed_width=[11, 11, 30])

            def regioncodes_cleanup(value, engine):
                replace = {
                    chr(225): "a",
                    chr(233): "e",
                    chr(237): "i",
                    chr(243): "o"
                }
                newvalue = str(value)
                for key in list(replace.keys()):
                    if key in newvalue:
                        newvalue = newvalue.replace(key, replace[key])
                return newvalue

            table.cleanup = Cleanup(regioncodes_cleanup)

            table.columns = [("countrynum", ("int", )),
                             ("regioncode", ("int", )),
                             ("regionname", ("char", 30))]

            engine.table = table
            engine.create_table()

            engine.insert_data_from_url(self.urls["region_codes"])

            # Counts table
            table = Table("counts", pk=False, delimiter=',')
            table.columns = [("RouteDataID", ("int", )),
                             ("countrynum", ("int", )),
                             ("statenum", ("int", )), ("Route", ("int", )),
                             ("RPID", ("int", )), ("year", ("int", )),
                             ("AOU", ("int", )), ("Stop1", ("int", )),
                             ("Stop2", ("int", )), ("Stop3", ("int", )),
                             ("Stop4", ("int", )), ("Stop5", ("int", )),
                             ("Stop6", ("int", )), ("Stop7", ("int", )),
                             ("Stop8", ("int", )), ("Stop9", ("int", )),
                             ("Stop10", ("int", )), ("Stop11", ("int", )),
                             ("Stop12", ("int", )), ("Stop13", ("int", )),
                             ("Stop14", ("int", )), ("Stop15", ("int", )),
                             ("Stop16", ("int", )), ("Stop17", ("int", )),
                             ("Stop18", ("int", )), ("Stop19", ("int", )),
                             ("Stop20", ("int", )), ("Stop21", ("int", )),
                             ("Stop22", ("int", )), ("Stop23", ("int", )),
                             ("Stop24", ("int", )), ("Stop25", ("int", )),
                             ("Stop26", ("int", )), ("Stop27", ("int", )),
                             ("Stop28", ("int", )), ("Stop29", ("int", )),
                             ("Stop30", ("int", )), ("Stop31", ("int", )),
                             ("Stop32", ("int", )), ("Stop33", ("int", )),
                             ("Stop34", ("int", )), ("Stop35", ("int", )),
                             ("Stop36", ("int", )), ("Stop37", ("int", )),
                             ("Stop38", ("int", )), ("Stop39", ("int", )),
                             ("Stop40", ("int", )), ("Stop41", ("int", )),
                             ("Stop42", ("int", )), ("Stop43", ("int", )),
                             ("Stop44", ("int", )), ("Stop45", ("int", )),
                             ("Stop46", ("int", )), ("Stop47", ("int", )),
                             ("Stop48", ("int", )), ("Stop49", ("int", )),
                             ("Stop50", ("int", ))]

            part = ""
            engine.table = table
            engine.create_table()

            for part in range(1, 11):
                part = str(part)
                try:
                    print("Inserting data from part " + part + "...")
                    try:
                        engine.table.cleanup = Cleanup()
                        engine.insert_data_from_archive(
                            self.urls["counts"] + "Fifty" + part + ".zip",
                            ["fifty" + part + ".csv"])
                    except:
                        print("Failed bulk insert on " + part +
                              ", inserting manually.")
                        engine.connection.rollback()
                        engine.table.cleanup = self.cleanup_func_clean
                        engine.insert_data_from_archive(
                            self.urls["counts"] + "Fifty" + part + ".zip",
                            ["fifty" + part + ".csv"])

                except:
                    print("There was an error in part " + part + ".")
                    raise

        except zipfile.BadZipfile:
            print(
                "There was an unexpected error in the Breeding Bird Survey archives."
            )
            raise

        return engine
Example #27
def read_json(json_file):
    """Read Json dataset package files

    Load each json and get the appropriate encoding for the dataset
    Reload the json using the encoding to ensure correct character sets
    """
    json_object = OrderedDict()
    json_file_encoding = None
    json_file = str(json_file) + ".json"

    try:
        file_obj = open_fr(json_file)
        json_object = json.load(file_obj)
        if "encoding" in json_object:
            json_file_encoding = json_object['encoding']
        file_obj.close()
    except ValueError:
        return None

    # Reload json using encoding if available
    try:
        if json_file_encoding:
            file_obj = open_fr(json_file, encoding=json_file_encoding)
        else:
            file_obj = open_fr(json_file)
        json_object = json.load(file_obj)
        file_obj.close()

    except ValueError:
        return None

    if isinstance(json_object, dict) and "resources" in json_object.keys():
        # Note: formats described by frictionless data may need to change
        tabular_exts = {"csv", "tab", "geojson", "sqlite", "db", "json", "xml"}
        vector_exts = {"shp", "kmz"}
        raster_exts = {
            "tif", "tiff", "bil", "hdr", "h5", "hdf5", "hr", "image"
        }
        for resource_item in json_object["resources"]:
            if "format" not in resource_item:
                if "format" in json_object:
                    resource_item["format"] = json_object["format"]
                else:
                    resource_item["format"] = "tabular"
            if "extensions" in resource_item:
                exts = set(resource_item["extensions"])
                if exts <= tabular_exts:
                    resource_item["format"] = "tabular"
                elif exts <= vector_exts:
                    resource_item["format"] = "vector"
                elif exts <= raster_exts:
                    resource_item["format"] = "raster"
            if "url" in resource_item:
                if "urls" in json_object:
                    json_object["urls"][
                        resource_item["name"]] = resource_item["url"]

        json_object["tables"] = OrderedDict()
        temp_tables = {}
        table_names = [item["name"] for item in json_object["resources"]]
        temp_tables["tables"] = OrderedDict(
            zip(table_names, json_object["resources"]))
        for table_name, table_spec in temp_tables["tables"].items():
            json_object["tables"][table_name] = myTables[
                temp_tables["tables"][table_name]["format"]](**table_spec)
        json_object.pop("resources", None)
        return TEMPLATES["default"](**json_object)
    return None
Example #28
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        engine = self.engine
        original_sql_file = "BioTIMESQL02_04_2018.sql"
        engine.download_file(self.urls["sql_file"], original_sql_file)
        sql_data = open_fr(self.engine.format_filename(original_sql_file))

        set_open = False
        csv_writer = None
        csv_file = None
        table_name = None
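        # Map SQL's NULL literal to Python None so the eval() below can parse it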
        NULL = None
        for line in sql_data:
            table_indicator = "-- Table structure for table "
            if line.startswith(table_indicator):
                st = line[len(table_indicator):].replace("`", "")
                table_name = st.strip()
                # Close the previous table's file before starting a new one
                if set_open:
                    csv_file.close()
                out_file = "{name}.csv".format(name=table_name)
                csv_file = open_fw(engine.format_filename(out_file))
                csv_writer = csv.writer(csv_file, quoting=csv.QUOTE_ALL)
                set_open = True

            if line.startswith("INSERT INTO `{table_name}`".format(
                    table_name=table_name)):
                row_val = line[line.index("VALUES (") + 8:-3]
                table_rows = row_val.replace("\r\n", "").split("),(")
                for i_row in table_rows:
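                    # eval() trusts the dump's row syntax; bare NULL tokens
                    # resolve to Python None via the NULL variable defined above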
                    v = eval('[' + str(i_row) + ']')
                    csv_writer.writerows([v])
        if csv_file:
            csv_file.close()

        # Create abundance table
        table = Table("ID_ABUNDANCE",
                      delimiter=",",
                      header_rows=0,
                      contains_pk=False)
        table.columns = [
            ("ID_ABUNDANCE", ("int", )),
            ("ABUNDANCE_TYPE", ("char", "100")),
        ]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_file(engine.format_filename("abundance.csv"))

        # Create allrawdata table
        table = Table("allrawdata",
                      delimiter=",",
                      header_rows=0,
                      contains_pk=False)
        table.columns = [
            ("ID_ALL_RAW_DATA", ("int", )),
            ("ABUNDANCE", ("double", )),
            ("BIOMASS", ("double", )),
            ("ID_SPECIES", ("int", )),
            ("SAMPLE_DESC", ("char", 200)),
            ("PLOT", ("char", 150)),
            ("LATITUDE", ("double", )),
            ("LONGITUDE", ("double", )),
            ("DEPTH", ("double", )),
            ("DAY", ("int", )),
            ("MONTH", ("int", )),
            ("YEAR", ("int", )),
            ("STUDY_ID", ("int", )),
        ]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_file(engine.format_filename("allrawdata.csv"))

        # Create biomass table
        table = Table("biomass",
                      delimiter=",",
                      header_rows=0,
                      contains_pk=False)
        table.columns = [("ID_BIOMASS", ("int", )),
                         ("BIOMASS_TYPE", ("char", "100"))]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_file(engine.format_filename("biomass.csv"))

        # Create citation1 table
        table = Table("citation1",
                      delimiter=",",
                      header_rows=0,
                      contains_pk=False)
        table.columns = [
            ("ID_CITATION1", ("int", )),
            ("STUDY_ID", ("int", )),
            ("CITATION_LINE", ("char", )),
        ]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_file(engine.format_filename("citation1.csv"))

        # Create contacts table
        table = Table("contacts",
                      delimiter=",",
                      header_rows=0,
                      contains_pk=False)
        table.columns = [
            ("ID_CONTACTS", ("int", )),
            ("STUDY_ID", ("int", )),
            ("CONTACT_1", ("char", 500)),
            ("CONTACT_2", ("char", 500)),
            ("CONT_1_MAIL", ("char", 60)),
            ("CONT_2_MAIL", ("char", 60)),
            ("LICENSE", ("char", 200)),
            ("WEB_LINK", ("char", 200)),
            ("DATA_SOURCE", ("char", 250)),
        ]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_file(engine.format_filename("contacts.csv"))

        # Create countries table
        table = Table("countries",
                      delimiter=",",
                      header_rows=0,
                      contains_pk=False)
        table.columns = [("COUNT_ID", ("int", )),
                         ("COUNTRY_NAME", ("char", 200))]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_file(engine.format_filename("countries.csv"))

        # Create curation table
        table = Table("curation",
                      delimiter=",",
                      header_rows=0,
                      contains_pk=False)
        table.columns = [
            ("ID_CURATION", ("int", )),
            ("STUDY_ID", ("int", )),
            ("LINK_ID", ("int", )),
            ("COMMENTS", ("char", )),
            ("DATE_STUDY_ADDED", ("char", 50)),
        ]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_file(engine.format_filename("curation.csv"))

        # Create datasets table
        table = Table("datasets",
                      delimiter=",",
                      header_rows=0,
                      contains_pk=False)
        table.columns = [
            ("ID_DATASETS", ("int", )),
            ("STUDY_ID", ("int", )),
            ("TAXA", ("char", 50)),
            ("ORGANISMS", ("char", 200)),
            ("TITLE", ("char", 800)),
            ("AB_BIO", ("char", 2)),
            ("HAS_PLOT", ("char", 10)),
            ("DATA_POINTS", ("char", )),
            ("START_YEAR", ("char", )),
            ("END_YEAR", ("char", )),
            ("CENT_LAT", ("double", )),
            ("CENT_LONG", ("double", )),
            ("NUMBER_OF_SPECIES", ("char", )),
            ("NUMBER_OF_SAMPLES", ("char", )),
            ("NUMBER_LAT_LONG", ("char", )),
            ("TOTAL", ("char", )),
            ("GRAIN_SIZE_TEXT", ("char", )),
            ("GRAIN_SQ_KM", ("double", )),
            ("AREA_SQ_KM", ("double", )),
            ("AB_TYPE", ("char", )),
            ("BIO_TYPE", ("char", )),
            ("SAMPLE_TYPE", ("char", )),
        ]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_file(engine.format_filename("datasets.csv"))

        # Create downloads table
        table = Table("downloads",
                      delimiter=",",
                      header_rows=0,
                      contains_pk=False)
        table.columns = [
            ("D_ID", ("int", )),
            ("STUDY", ("char", 25)),
            ("NAME", ("char", 150)),
            ("EMAIL", ("char", 150)),
            ("COUNTRY", ("char", 200)),
            ("ROLE", ("char", 150)),
            ("PURPOSE", ("char", 500)),
            ("LOCATION", ("char", 250)),
            ("DATE_STAMP", ("char", )),
        ]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_file(engine.format_filename("downloads.csv"))

        # Create methods table
        table = Table("methods",
                      delimiter=",",
                      header_rows=0,
                      contains_pk=False)
        table.columns = [
            ("ID_METHODS", ("int", )),
            ("STUDY_ID", ("int", )),
            ("METHODS", ("char", )),
            ("SUMMARY_METHODS", ("char", 500)),
        ]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_file(engine.format_filename("methods.csv"))

        # Create sample table
        table = Table("sample",
                      delimiter=",",
                      header_rows=0,
                      contains_pk=False)
        table.columns = [
            ("ID_SAMPLE", ("int", )),
            ("ID_TREAT", ("int", )),
            ("SAMPLE_DESC_NAME", ("char", 200)),
        ]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_file(engine.format_filename("sample.csv"))

        # Create site table
        table = Table("site", delimiter=",", header_rows=0, contains_pk=False)
        table.columns = [("ID_SITE", ("int", )), ("STUDY_ID", ("int", )),
                         ("REALM", ("char", 11)), ("CLIMATE", ("char", 20)),
                         ("GENERAL_TREAT", ("char", 200)),
                         ("TREATMENT", ("char", 200)),
                         ("TREAT_COMMENTS", ("char", 250)),
                         ("TREAT_DATE", ("char", 100)),
                         ("CEN_LATITUDE", ("double", )),
                         ("CEN_LONGITUDE", ("double", )),
                         ("HABITAT", ("char", 100)),
                         ("PROTECTED_AREA", ("char", 50)),
                         ("AREA", ("double", )), ("BIOME_MAP", ("char", 500))]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_file(engine.format_filename("site.csv"))

        # Create species table
        table = Table("species",
                      delimiter=",",
                      header_rows=0,
                      contains_pk=False)
        table.columns = [("ID_SPECIES", ("int", )), ("GENUS", ("char", 100)),
                         ("SPECIES", ("char", 100)),
                         ("GENUS_SPECIES", ("char", 100))]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_file(engine.format_filename("species.csv"))
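The loop at the top of this example converts a MySQL dump into per-table CSV files by slicing each INSERT statement and letting eval() rebuild the row values. A minimal, self-contained sketch of that trick (the dump_line literal is a made-up sample, not real BioTIME data):

import csv

NULL = None  # lets eval() turn SQL NULL into Python None

# Hypothetical single-line dump entry of the form the loop consumes
dump_line = 'INSERT INTO `abundance` VALUES (1,"Count"),(2,NULL);\n'

row_block = dump_line[dump_line.index("VALUES (") + 8:-3]  # drop "VALUES (" and ");\n"
with open("abundance_demo.csv", "w", newline="") as out:
    writer = csv.writer(out, quoting=csv.QUOTE_ALL)
    for row in row_block.split("),("):
        writer.writerow(eval("[" + row + "]"))  # e.g. [2, None]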
Example #29
0
    def download(self, engine=None, debug=False):
        try:
            Script.download(self, engine, debug)

            engine = self.engine

            # Species table
            table = Table("species", cleanup=Cleanup(), contains_pk=True,
                          header_rows=9)

            table.columns = [("species_id", ("pk-int",)),
                             ("AOU", ("int",)),
                             ("english_common_name", ("char", 50)),
                             ("french_common_name", ("char", 50)),
                             ("spanish_common_name", ("char", 50)),
                             ("sporder", ("char", 30)),
                             ("family", ("char", 30)),
                             ("genus", ("char", 30)),
                             ("species", ("char", 50)),
                             ]
            table.fixed_width = [7, 6, 51, 51, 51, 51, 51, 51, 50]

            engine.table = table
            engine.create_table()
            engine.insert_data_from_url(self.urls["species"])

            # Routes table
            engine.download_files_from_archive(
                self.urls["routes"], ["routes.csv"])
            engine.auto_create_table(Table("routes", cleanup=Cleanup()),
                                     filename="routes.csv")
            engine.insert_data_from_file(engine.format_filename("routes.csv"))

            # Weather table
            if not os.path.isfile(engine.format_filename("weather_new.csv")):
                engine.download_files_from_archive(self.urls["weather"],
                                                   ["weather.csv"])
                read = open_fr(engine.format_filename("weather.csv"))
                write = open_fw(engine.format_filename("weather_new.csv"))
                print("Cleaning weather data...")
                for line in read:
                    values = line.split(',')
                    newvalues = []
                    for value in values:
                        if ':' in value:
                            newvalues.append(value.replace(':', ''))
                        elif value == "N":
                            newvalues.append(None)
                        else:
                            newvalues.append(value)
                    write.write(','.join(str(value) for value in newvalues))
                write.close()
                read.close()
            engine.auto_create_table(Table("weather", pk="RouteDataId",
                                           cleanup=self.cleanup_func_table),
                                     filename="weather_new.csv")
            engine.insert_data_from_file(
                engine.format_filename("weather_new.csv"))

            # Region_codes table
            table = Table("region_codes", pk=False, header_rows=11,
                          fixed_width=[11, 11, 30])

            def regioncodes_cleanup(value, engine):
                replace = {
                    chr(225): "a",
                    chr(233): "e",
                    chr(237): "i",
                    chr(243): "o"}
                newvalue = str(value)
                for key in list(replace.keys()):
                    if key in newvalue:
                        newvalue = newvalue.replace(key, replace[key])
                return newvalue

            table.cleanup = Cleanup(regioncodes_cleanup)

            table.columns = [("countrynum", ("int",)),
                             ("regioncode", ("int",)),
                             ("regionname", ("char", 30))]

            engine.table = table
            engine.create_table()

            engine.insert_data_from_url(self.urls["region_codes"])

            # Counts table
            table = Table("counts", pk=False, delimiter=',')
            table.columns = [("RouteDataID", ("int",)),
                             ("countrynum", ("int",)),
                             ("statenum", ("int",)),
                             ("Route", ("int",)),
                             ("RPID", ("int",)),
                             ("year", ("int",)),
                             ("AOU", ("int",)),
                             ("Stop1", ("int",)),
                             ("Stop2", ("int",)),
                             ("Stop3", ("int",)),
                             ("Stop4", ("int",)),
                             ("Stop5", ("int",)),
                             ("Stop6", ("int",)),
                             ("Stop7", ("int",)),
                             ("Stop8", ("int",)),
                             ("Stop9", ("int",)),
                             ("Stop10", ("int",)),
                             ("Stop11", ("int",)),
                             ("Stop12", ("int",)),
                             ("Stop13", ("int",)),
                             ("Stop14", ("int",)),
                             ("Stop15", ("int",)),
                             ("Stop16", ("int",)),
                             ("Stop17", ("int",)),
                             ("Stop18", ("int",)),
                             ("Stop19", ("int",)),
                             ("Stop20", ("int",)),
                             ("Stop21", ("int",)),
                             ("Stop22", ("int",)),
                             ("Stop23", ("int",)),
                             ("Stop24", ("int",)),
                             ("Stop25", ("int",)),
                             ("Stop26", ("int",)),
                             ("Stop27", ("int",)),
                             ("Stop28", ("int",)),
                             ("Stop29", ("int",)),
                             ("Stop30", ("int",)),
                             ("Stop31", ("int",)),
                             ("Stop32", ("int",)),
                             ("Stop33", ("int",)),
                             ("Stop34", ("int",)),
                             ("Stop35", ("int",)),
                             ("Stop36", ("int",)),
                             ("Stop37", ("int",)),
                             ("Stop38", ("int",)),
                             ("Stop39", ("int",)),
                             ("Stop40", ("int",)),
                             ("Stop41", ("int",)),
                             ("Stop42", ("int",)),
                             ("Stop43", ("int",)),
                             ("Stop44", ("int",)),
                             ("Stop45", ("int",)),
                             ("Stop46", ("int",)),
                             ("Stop47", ("int",)),
                             ("Stop48", ("int",)),
                             ("Stop49", ("int",)),
                             ("Stop50", ("int",))]

            part = ""
            engine.table = table
            engine.create_table()

            for part in range(1, 11):
                part = str(part)
                try:
                    print("Inserting data from part " + part + "...")
                    try:
                        engine.table.cleanup = Cleanup()
                        engine.insert_data_from_archive(self.urls["counts"] +
                                                        "Fifty" + part + ".zip",
                                                        ["fifty" + part + ".csv"])
                    except Exception:
                        print(
                            "Failed bulk insert on " +
                            part +
                            ", inserting manually.")
                        engine.connection.rollback()
                        engine.table.cleanup = self.cleanup_func_clean
                        engine.insert_data_from_archive(self.urls["counts"] +
                                                        "Fifty" + part + ".zip",
                                                        ["fifty" + part + ".csv"])

                except Exception:
                    print("There was an error in part " + part + ".")
                    raise

        except zipfile.BadZipfile:
            print("There was an unexpected error in the Breeding Bird Survey archives.")
            raise

        return engine
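The weather clean-up step above does two things per field: it strips colons out of time values and maps the missing-data marker "N" to a null. The same logic as a standalone helper (note this sketch writes an empty field where the original writes the literal string None):

def clean_weather_line(line):
    """Strip ':' from time fields and blank out 'N' missing-data markers."""
    newvalues = []
    for value in line.rstrip("\n").split(","):
        if ":" in value:
            newvalues.append(value.replace(":", ""))  # "06:30" -> "0630"
        elif value == "N":
            newvalues.append("")  # missing value
        else:
            newvalues.append(value)
    return ",".join(newvalues)

# clean_weather_line("5,06:30,N,72\n") == "5,0630,,72"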
Example #30
0
    def download(self, engine=None, debug=False):
        try:
            Script.download(self, engine, debug)

            engine = self.engine

            # Species table
            table = Table("species", cleanup=Cleanup(), contains_pk=True,
                          header_rows=9)

            table.columns = [("species_id", ("pk-int",)),
                             ("AOU", ("int",)),
                             ("english_common_name", ("char", 50)),
                             ("french_common_name", ("char", 50)),
                             ("spanish_common_name", ("char", 50)),
                             ("sporder", ("char", 30)),
                             ("family", ("char", 30)),
                             ("genus", ("char", 30)),
                             ("species", ("char", 50)),
                             ]
            table.fixed_width = [7, 6, 51, 51, 51, 51, 51, 51, 50]

            engine.table = table
            engine.create_table()
            engine.insert_data_from_url(self.urls["species"])

            # Routes table
            engine.download_files_from_archive(self.urls["routes"],
                                               ["routes.csv"])
            engine.auto_create_table(Table("routes", cleanup=Cleanup()),
                                     filename="routes.csv")
            engine.insert_data_from_file(engine.format_filename("routes.csv"))

            # Weather table
            if not os.path.isfile(engine.format_filename("weather_new.csv")):
                engine.download_files_from_archive(self.urls["weather"],
                                                   ["weather.csv"])
                read = open_fr(engine.format_filename("weather.csv"))
                write = open_fw(engine.format_filename("weather_new.csv"))
                print("Cleaning weather data...")
                for line in read:
                    values = line.split(',')
                    newvalues = []
                    for value in values:
                        if ':' in value:
                            newvalues.append(value.replace(':', ''))
                        elif value == "N":
                            newvalues.append(None)
                        else:
                            newvalues.append(value)
                    write.write(','.join(str(value) for value in newvalues))
                write.close()
                read.close()

            engine.auto_create_table(Table("weather", pk="RouteDataId",
                                           cleanup=self.cleanup_func_table),
                                     filename="weather_new.csv")
            engine.insert_data_from_file(
                engine.format_filename("weather_new.csv"))

            # Region_codes table
            table = Table("region_codes", pk=False, header_rows=11,
                          fixed_width=[11, 11, 30])

            def regioncodes_cleanup(value, engine):
                replace = {chr(225): "a", chr(233): "e", chr(237): "i", chr(243): "o"}
                newvalue = str(value)
                for key in list(replace.keys()):
                    if key in newvalue:
                        newvalue = newvalue.replace(key, replace[key])
                return newvalue

            table.cleanup = Cleanup(regioncodes_cleanup)

            table.columns = [("countrynum", ("int",)),
                             ("regioncode", ("int",)),
                             ("regionname", ("char", 30))]

            engine.table = table
            engine.create_table()

            engine.insert_data_from_url(self.urls["region_codes"])

            # Counts table
            table = Table("counts", delimiter=',')

            table.columns = [("record_id", ("pk-auto",)),
                             ("RouteDataID", ("int",)),
                             ("countrynum", ("int",)),
                             ("statenum", ("int",)),
                             ("Route", ("int",)),
                             ("RPID", ("int",)),
                             ("Year", ("int",)),
                             ("Aou", ("int",)),
                             ("Count10", ("int",)),
                             ("Count20", ("int",)),
                             ("Count30", ("int",)),
                             ("Count40", ("int",)),
                             ("Count50", ("int",)),
                             ("StopTotal", ("int",)),
                             ("SpeciesTotal", ("int",))]

            stateslist = ["Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado",
                          "Connecticut", "Delaware", "Florida", "Georgia", "Idaho",
                          "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky", "Louisiana", "Maine",
                          "Maryland", "Massachusetts", "Michigan", "Minnesota", "Mississippi",
                          "Missouri", "Montana", "Nebraska", "Nevada",
                          ["New Hampshire", "NHampsh"], ["New Jersey", "NJersey"],
                          ["New Mexico", "NMexico"], ["New York", "NYork"],
                          ["North Carolina", "NCaroli"], ["North Dakota", "NDakota"], "Ohio",
                          "Oklahoma", "Oregon", "Pennsylvania", ["Rhode Island", "RhodeIs"],
                          ["South Carolina", "SCaroli"], ["South Dakota", "SDakota"], "Tennessee",
                          "Texas", "Utah", "Vermont", "Virginia", "Washington",
                          ["West Virginia", "W_Virgi"], "Wisconsin", "Wyoming", "Alberta",
                          ["British Columbia", "BritCol"], "Manitoba", ["New Brunswick", "NBrunsw"],
                          ["Northwest Territories", "NWTerri"], "Newfoundland",
                          ["Nova Scotia", "NovaSco"], "Nunavut", "Ontario",
                          ["Prince Edward Island", "PEI"], "Quebec", "Saskatchewan", "Yukon"]

            state = ""
            shortstate = ""

            engine.table = table
            engine.create_table()

            for state in stateslist:
                try:
                    if isinstance(state, list):
                        state, shortstate = state[0], state[1]
                    else:
                        shortstate = state[:7]

                    print("Inserting data from " + state + "...")
                    try:
                        engine.table.cleanup = Cleanup()
                        engine.insert_data_from_archive(
                            self.urls["counts"] + shortstate + ".zip", [shortstate + ".csv"])
                    except Exception:
                        print(
                            "Failed bulk insert on " +
                            state +
                            ", inserting manually.")
                        engine.connection.rollback()
                        engine.table.cleanup = self.cleanup_func_clean
                        engine.insert_data_from_archive(
                            self.urls["counts"] + shortstate + ".zip",
                            [shortstate + ".csv"])

                except Exception:
                    print("There was an error in " + state + ".")
                    raise

        except zipfile.BadZipfile:
            print("There was an unexpected error in the Breeding Bird Survey archives.")
            raise

        return engine
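stateslist above mixes plain strings with [full name, short name] pairs because the archive files use truncated names; the loop normalizes both shapes as in this sketch (resolve_state is a hypothetical helper name):

def resolve_state(entry):
    """Return (full_name, archive_short_name) for a stateslist entry."""
    if isinstance(entry, list):
        return entry[0], entry[1]
    return entry, entry[:7]  # archives use the first seven characters

# resolve_state("Alabama") == ("Alabama", "Alabama")
# resolve_state(["New Hampshire", "NHampsh"]) == ("New Hampshire", "NHampsh")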
Example #31
0
 def set_table_delimiter(self, file_path):
     """Get the delimiter from the data file and set it."""
     dataset_file = open_fr(file_path)
     self.auto_get_delimiter(dataset_file.readline())
     dataset_file.close()
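auto_get_delimiter is part of the retriever engine; a comparable standalone guess using the standard library's csv.Sniffer might look like this (the candidate delimiter set is an assumption):

import csv

def guess_delimiter(header_line, candidates=",\t;|"):
    """Guess a data file's delimiter from its first line."""
    try:
        return csv.Sniffer().sniff(header_line, delimiters=candidates).delimiter
    except csv.Error:
        return ","  # fall back to comma if sniffing fails

# guess_delimiter("genus|species|count\n") == "|"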
Example #32
0
def read_json(json_file, debug=False):
    """Read Json dataset package files

    Load each json and get the appropriate encoding for the dataset
    Reload the json using the encoding to ensure correct character sets
    """
    json_object = OrderedDict()
    json_file_encoding = None
    json_file = str(json_file) + ".json"

    try:
        file_obj = open_fr(json_file)
        json_object = json.load(file_obj)
        if "encoding" in json_object:
            json_file_encoding = json_object['encoding']
        file_obj.close()
    except ValueError:
        pass

    # Reload json using encoding if available
    try:
        if json_file_encoding:
            file_obj = open_fr(json_file, encoding=json_file_encoding)
        else:
            file_obj = open_fr(json_file)
        json_object = json.load(file_obj)
        file_obj.close()

    except ValueError:
        pass
    if isinstance(json_object, dict) and "resources" in json_object:
        # Note: formats described by frictionlessdata may need to change
        tabular_exts = {"csv", "tab"}
        vector_exts = {"shp", "kmz"}
        raster_exts = {"tif", "tiff", "bil",
                       "hdr", "h5", "hdf5", "hr", "image"}
        for resource_item in json_object["resources"]:
            if "format" not in resource_item:
                if "format" in json_object:
                    resource_item["format"] = json_object["format"]
                else:
                    resource_item["format"] = "tabular"
            if "extensions" in resource_item:
                exts = set(resource_item["extensions"])
                if exts <= tabular_exts:
                    resource_item["format"] = "tabular"
                elif exts <= vector_exts:
                    resource_item["format"] = "vector"
                elif exts <= raster_exts:
                    resource_item["format"] = "raster"
            if "url" in resource_item:
                if "urls" in json_object:
                    json_object["urls"][resource_item["name"]] = resource_item["url"]

        json_object["tables"] = OrderedDict()
        temp_tables = {}
        table_names = [item["name"] for item in json_object["resources"]]
        temp_tables["tables"] = OrderedDict(zip(table_names, json_object["resources"]))
        for table_name, table_spec in temp_tables["tables"].items():
            json_object["tables"][table_name] = myTables[
                table_spec["format"]](**table_spec)
        json_object.pop("resources", None)
        return TEMPLATES["default"](**json_object)
    return None
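The resource-format detection in read_json leans on set inclusion: exts <= tabular_exts is True only when every listed extension is tabular. A quick standalone check of that rule (classify is a hypothetical name):

tabular_exts = {"csv", "tab"}
vector_exts = {"shp", "kmz"}
raster_exts = {"tif", "tiff", "bil", "hdr", "h5", "hdf5", "hr", "image"}

def classify(extensions):
    exts = set(extensions)
    if exts <= tabular_exts:
        return "tabular"
    if exts <= vector_exts:
        return "vector"
    if exts <= raster_exts:
        return "raster"
    return "tabular"  # mirrors read_json's earlier default

assert classify(["csv"]) == "tabular"
assert classify(["shp", "kmz"]) == "vector"
assert classify(["tif", "hdr"]) == "raster"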