def sort_csv(filename):
    """Sort CSV rows minus the header and return the file.

    This function is used only for testing; sort_file loads every line
    into memory, so it is limited to files that fit in memory.
    """
    filename = os.path.normpath(filename)
    input_file = open_fr(filename)
    csv_reader_infile = csv.reader(input_file, escapechar="\\")
    # write the data to a temporary file and sort it
    temp_path = os.path.normpath("tempfile")
    temp_file = open_fw(temp_path)
    csv_writer = open_csvw(temp_file)
    i = 0
    for row in csv_reader_infile:
        if i == 0:
            # The first entry is the header line
            infields = row
            i += 1
        else:
            csv_writer.writerow(row)
    input_file.close()
    temp_file.close()
    # sort the temp file, then write the header followed by the sorted rows
    sorted_txt = sort_file(temp_path)
    tmp = open_fr(sorted_txt)
    in_txt = csv.reader(tmp, delimiter=',', escapechar="\\")
    csv_file = open_fw(filename)
    csv_writer = open_csvw(csv_file)
    csv_writer.writerow(infields)
    csv_writer.writerows(in_txt)
    tmp.close()
    csv_file.close()
    os.remove(os.path.normpath(temp_path))
    return filename
def sort_file(file_path):
    """Sort a file by line and return the file.

    This function is used only for testing; it reads every line into
    memory, so it is limited to files that fit in memory.
    """
    file_path = os.path.normpath(file_path)
    input_file = open_fr(file_path)
    lines = [line.strip().replace('\x00', '') for line in input_file]
    input_file.close()
    outfile = open_fw(file_path)
    lines.sort()
    for line in lines:
        outfile.write(line + "\n")
    outfile.close()
    return file_path
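# A minimal usage sketch for the two helpers above, assuming they are
# importable from this module. Sorting both files first makes a test
# comparison order-independent; the function and file names below are
# hypothetical, for illustration only.
def files_match_ignoring_row_order(produced_csv, expected_csv):
    with open(sort_csv(produced_csv)) as got, open(sort_csv(expected_csv)) as want:
        return got.read() == want.read()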
def disconnect(self):
    """Close out the JSON files and close all created file objects.

    Re-write each file, stripping off the trailing comma, then close it
    with a `\\n]`.
    """
    if self.table_names:
        for output_file_i, file_name in self.table_names:
            output_file_i.close()
            current_input_file = open_fr(file_name)
            file_contents = current_input_file.readlines()
            current_input_file.close()
            file_contents[-1] = file_contents[-1].strip(',\n')
            current_output_file = open_fw(file_name)
            current_output_file.writelines(file_contents)
            current_output_file.writelines(['\n]'])
            current_output_file.close()
        self.table_names = []
def disconnect(self):
    """Close out the XML files and close all created file objects.

    Re-write each file, stripping off the trailing comma, then close it
    with a closing tag.
    """
    if self.table_names:
        for output_file_i, file_name in self.table_names:
            output_file_i.close()
            current_input_file = open_fr(file_name, encode=False)
            file_contents = current_input_file.readlines()
            current_input_file.close()
            file_contents[-1] = file_contents[-1].strip(',')
            current_output_file = open_fw(file_name)
            current_output_file.writelines(file_contents)
            current_output_file.write(u'\n</root>')
            current_output_file.close()
        self.table_names = []
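# A standalone sketch of the finalize pattern shared by both disconnect()
# methods above: row-by-row writes leave a trailing comma, so the file is
# re-written without it and a closing token is appended. This version uses
# plain open() instead of open_fr/open_fw; the function name is hypothetical.
def finalize_streamed_file(path, closing_token):
    with open(path) as f:
        contents = f.readlines()
    contents[-1] = contents[-1].rstrip(',\n')  # drop the dangling comma
    with open(path, 'w') as f:
        f.writelines(contents)
        f.write('\n' + closing_token)  # e.g. ']' for JSON, '</root>' for XML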
def load_data(self, filename):
    """Generator returning lists of values from lines in a data file.

    1. Works on both delimited (csv module) and fixed width data
       (extract_fixed_width)
    2. Identifies the delimiter if not known
    3. Removes extra line endings
    """
    if not self.table.delimiter:
        self.set_table_delimiter(filename)
    dataset_file = open_fr(filename)
    if self.table.fixed_width:
        for row in dataset_file:
            yield self.extract_fixed_width(row)
    else:
        reg = re.compile("\r\n|\n|\r")
        for row in csv.reader(dataset_file, delimiter=self.table.delimiter):
            yield [reg.sub(" ", values) for values in row]
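# A self-contained illustration of step 3 above: a line ending embedded in a
# quoted CSV field is collapsed to a single space. The sample data is made up.
import csv
import io
import re

line_endings = re.compile("\r\n|\n|\r")
sample = io.StringIO('id,notes\n1,"line one\nline two"\n')
for row in csv.reader(sample, delimiter=','):
    print([line_endings.sub(" ", value) for value in row])
# prints ['id', 'notes'] then ['1', 'line one line two']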
def download(self, engine=None, debug=False): Script.download(self, engine, debug) self.engine.download_file(self.urls["main"], "Succession_sampling_03-07_data_original.txt") data_path = self.engine.format_filename("Succession_sampling_03-07_data.txt") old_data = open_fr(self.engine.find_file("Succession_sampling_03-07_data_original.txt")) new_data = open_fw(data_path) # original file's header contains an end of line charactor in the middle hence creating two lines # Read in the two lines and create the full header line1 = old_data.readline().strip() line2 = old_data.readline() newline = line1 + "\t" + line2 new_data.write(newline) for line in old_data: new_data.write(line) new_data.close() old_data.close() self.engine.auto_create_table(self.tables["main"], filename="Succession_sampling_03-07_data.txt") self.engine.insert_data_from_file(data_path)
def json2csv(input_file, output_file=None, header_values=None):
    """Convert a JSON file to CSV.

    This function is used only for testing; it reads the whole file into
    memory, so it is limited to files that fit in memory.
    """
    file_out = open_fr(input_file)
    # set output file name and write header
    if output_file is None:
        output_file = os.path.splitext(os.path.basename(input_file))[0] + ".csv"
    csv_out = open_fw(output_file, encode=False)
    if os.name == 'nt':
        outfile = csv.DictWriter(csv_out, dialect='excel', escapechar="\\",
                                 lineterminator='\n', fieldnames=header_values)
    else:
        outfile = csv.DictWriter(csv_out, dialect='excel', escapechar="\\",
                                 fieldnames=header_values)
    raw_data = json.loads(file_out.read())
    outfile.writeheader()
    for item in raw_data:
        outfile.writerow(item)
    file_out.close()
    # remove the input file portably instead of shelling out to `rm`
    os.remove(input_file)
    return output_file
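# A minimal usage sketch for json2csv, assuming it and its open_fr/open_fw
# helpers are importable from this module. The file name and keys are
# hypothetical; note that json2csv deletes its input file when done.
import json

with open("records.json", "w") as f:
    json.dump([{"id": 1, "name": "a"}, {"id": 2, "name": "b"}], f)
csv_path = json2csv("records.json", header_values=["id", "name"])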
def download(self, engine=None, debug=False): Script.download(self, engine, debug) for key in self.urls: self.engine.download_file(self.urls[key], self.urls[key].rpartition('/')[-1]) new_file_path = self.engine.format_filename("new" + key) old_data = open_fr(self.engine.find_file(self.urls[key].rpartition('/')[-1])) new_data = open_fw(new_file_path) with old_data as file_block: # after the metadata lines, set data to True data = False for lines in file_block.readlines(): # meta data contins line with no ";" and may have "(;;;;)+" or empty lines if not data and (";" not in lines or ";;;;" in lines): pass else: data = True new_data.write(lines) file_block.close() new_data.close() self.engine.auto_create_table(Table(key, cleanup=cleanup_func_table), filename=str("new" + key)) self.engine.insert_data_from_file(new_file_path)
def download(self, engine=None, debug=False): Script.download(self, engine, debug) for key in self.urls: self.engine.download_file(self.urls[key], self.urls[key].rpartition('/')[-1]) new_file_path = self.engine.format_filename("new" + key) old_data = open_fr(self.engine.find_file(self.urls[key].rpartition('/')[-1])) new_data = open_fw(new_file_path) with old_data as file_block: # after the metadata lines, set data to True data = False for lines in file_block.readlines(): # meta data contins line with no ";" and may have "(;;;;)+" or empty lines if not data and (";" not in lines or ";;;;" in lines): pass else: data = True new_data.write(lines) file_block.close() new_data.close() self.engine.auto_create_table(Table(key, cleanup=Cleanup(correct_invalid_value, nulls=[-999.9])),filename=str("new" + key)) self.engine.insert_data_from_file(new_file_path)
def xml2csv(input_file, outputfile=None, header_values=None, row_tag="row"):
    """Convert an XML file to CSV.

    This function is used only for testing; it reads the whole file into
    memory, so it is limited to files that fit in memory.
    """
    file_output = open_fr(input_file, encode=False)
    # set output file name and write header
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(input_file))[0] + ".csv"
    csv_out = open_fw(outputfile)
    if os.name == 'nt':
        csv_writer = csv.writer(csv_out, dialect='excel', escapechar='\\',
                                lineterminator='\n')
    else:
        csv_writer = csv.writer(csv_out, dialect='excel', escapechar='\\')
    v = file_output.read()
    csv_writer.writerow(header_values)
    tree = ET.parse(newfile(v))
    root = tree.getroot()
    for rows in root.findall(row_tag):
        x = [element.text for name in header_values for element in rows.findall(name)]
        csv_writer.writerow(x)
    file_output.close()
    # remove the input file portably instead of shelling out to `rm`
    os.remove(input_file)
    return outputfile
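# A matching usage sketch for xml2csv, under the same assumptions as the
# json2csv example (it also relies on the module's newfile helper); the file
# name, tags, and header values are hypothetical.
with open("records.xml", "w") as f:
    f.write("<root><row><id>1</id><name>a</name></row></root>")
csv_path = xml2csv("records.xml", header_values=["id", "name"], row_tag="row")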
def download(self, engine=None, debug=False):
    Script.download(self, engine, debug)
    engine = self.engine

    # download and create species table
    table = Table('species')
    self.engine.auto_create_table(table, url=self.urls['species'])
    self.engine.insert_data_from_url(self.urls['species'])

    # State abbreviations with the year annual inventory began for that state
    stateslist = [('AL', 2001), ('AK', 2004), ('AZ', 2001), ('AR', 2000),
                  ('CA', 2001), ('CO', 2002), ('CT', 2003), ('DE', 2004),
                  ('FL', 2003), ('GA', 1998), ('ID', 2004), ('IL', 2001),
                  ('IN', 1999), ('IA', 1999), ('KS', 2001), ('KY', 1999),
                  ('LA', 2001), ('ME', 1999), ('MD', 2004), ('MA', 2003),
                  ('MI', 2000), ('MN', 1999), ('MO', 1999), ('MS', 2006),
                  ('MT', 2003), ('NE', 2001), ('NV', 2004), ('NH', 2002),
                  ('NJ', 2004), ('NM', 1999), ('NY', 2002), ('NC', 2003),
                  ('ND', 2001), ('OH', 2001), ('OK', 2008), ('OR', 2001),
                  ('PA', 2000), ('RI', 2003), ('SC', 1999), ('SD', 2001),
                  ('TN', 2000), ('TX', 2001), ('UT', 2000), ('VT', 2003),
                  ('VA', 1998), ('WA', 2002), ('WV', 2004), ('WI', 2000),
                  ('WY', 2000), ('PR', 2001)]

    tablelist = ["SURVEY", "PLOT", "COND", "SUBPLOT", "SUBP_COND", "TREE", "SEEDLING"]

    for table in tablelist:
        for state, year in stateslist:
            engine.download_files_from_archive(
                self.urls["main"] + state + "_" + table + ".ZIP",
                [state + "_" + table + ".csv"])

    for table in tablelist:
        print("Scanning data for table %s..." % table)
        prep_file_name = "%s.csv" % table
        prep_file = open_fw(engine.format_filename(prep_file_name))
        this_file = open_fr(engine.format_filename(stateslist[0][0] + "_" + table + ".csv"))
        col_names = this_file.readline()
        prep_file.write(col_names)
        column_names = [col.strip('"') for col in col_names.split(',')]
        year_column = column_names.index("INVYR")
        this_file.close()

        for state, year in stateslist:
            this_file = open_fr(engine.format_filename(state + "_" + table + ".csv"))
            this_file.readline()
            # keep only rows from the year the annual inventory began onwards
            for line in this_file:
                values = line.split(',')
                this_year = values[year_column]
                if int(this_year) >= year:
                    prep_file.write(line)
            this_file.close()  # close each state's file before moving on
        prep_file.close()
        engine.auto_create_table(Table(table), filename=prep_file_name)
        engine.insert_data_from_file(engine.format_filename(prep_file_name))
        try:
            os.remove(engine.format_filename(prep_file_name))
        except OSError:
            pass

    return engine
def download(self, engine=None, debug=False):
    try:
        Script.download(self, engine, debug)
        engine = self.engine

        # Species table
        table = Table("species", cleanup=Cleanup(), contains_pk=True, header_rows=9)
        table.columns = [("species_id", ("pk-int",)),
                         ("AOU", ("int",)),
                         ("english_common_name", ("char", 50)),
                         ("french_common_name", ("char", 50)),
                         ("spanish_common_name", ("char", 50)),
                         ("sporder", ("char", 30)),
                         ("family", ("char", 30)),
                         ("genus", ("char", 30)),
                         ("species", ("char", 50))]
        table.fixed_width = [7, 6, 51, 51, 51, 51, 51, 51, 50]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_url(self.urls["species"])

        # Routes table
        engine.download_files_from_archive(self.urls["routes"], ["routes.csv"])
        engine.auto_create_table(Table("routes", cleanup=Cleanup()),
                                 filename="routes.csv")
        engine.insert_data_from_file(engine.format_filename("routes.csv"))

        # Weather table
        if not os.path.isfile(engine.format_filename("weather_new.csv")):
            engine.download_files_from_archive(self.urls["weather"], ["weather.csv"])
            read = open_fr(engine.format_filename("weather.csv"))
            write = open_fw(engine.format_filename("weather_new.csv"))
            print("Cleaning weather data...")
            for line in read:
                values = line.split(',')
                newvalues = []
                for value in values:
                    if ':' in value:
                        newvalues.append(value.replace(':', ''))
                    elif value == "N":
                        newvalues.append(None)
                    else:
                        newvalues.append(value)
                write.write(','.join(str(value) for value in newvalues))
            write.close()
            read.close()
        engine.auto_create_table(
            Table("weather", pk="RouteDataId",
                  cleanup=Cleanup(correct_invalid_value, nulls=['NULL'])),
            filename="weather_new.csv")
        engine.insert_data_from_file(engine.format_filename("weather_new.csv"))

        # Region_codes table
        table = Table("region_codes", pk=False, header_rows=11,
                      fixed_width=[11, 11, 30])

        def regioncodes_cleanup(value, engine):
            replace = {chr(225): "a", chr(233): "e", chr(237): "i", chr(243): "o"}
            newvalue = str(value)
            for key in list(replace.keys()):
                if key in newvalue:
                    newvalue = newvalue.replace(key, replace[key])
            return newvalue

        table.cleanup = Cleanup(regioncodes_cleanup)
        table.columns = [("countrynum", ("int",)),
                         ("regioncode", ("int",)),
                         ("regionname", ("char", 30))]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_url(self.urls["region_codes"])

        # Counts table
        table = Table("counts", delimiter=',')
        table.columns = [("record_id", ("pk-auto",)),
                         ("countrynum", ("int",)),
                         ("statenum", ("int",)),
                         ("Route", ("int",)),
                         ("RPID", ("int",)),
                         ("Year", ("int",)),
                         ("Aou", ("int",)),
                         ("Count10", ("int",)),
                         ("Count20", ("int",)),
                         ("Count30", ("int",)),
                         ("Count40", ("int",)),
                         ("Count50", ("int",)),
                         ("StopTotal", ("int",)),
                         ("SpeciesTotal", ("int",))]

        stateslist = ["Alabama", "Alaska", "Arizona", "Arkansas", "California",
                      "Colorado", "Connecticut", "Delaware", "Florida", "Georgia",
                      "Idaho", "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky",
                      "Louisiana", "Maine", "Maryland", "Massachusetts", "Michigan",
                      "Minnesota", "Mississippi", "Missouri", "Montana", "Nebraska",
                      "Nevada", ["New Hampshire", "NHampsh"], ["New Jersey", "NJersey"],
                      ["New Mexico", "NMexico"], ["New York", "NYork"],
                      ["North Carolina", "NCaroli"], ["North Dakota", "NDakota"],
                      "Ohio", "Oklahoma", "Oregon", "Pennsylvania",
                      ["Rhode Island", "RhodeIs"], ["South Carolina", "SCaroli"],
                      ["South Dakota", "SDakota"], "Tennessee", "Texas", "Utah",
                      "Vermont", "Virginia", "Washington", ["West Virginia", "W_Virgi"],
                      "Wisconsin", "Wyoming", "Alberta", ["British Columbia", "BritCol"],
                      "Manitoba", ["New Brunswick", "NBrunsw"],
                      ["Northwest Territories", "NWTerri"], "Newfoundland",
                      ["Nova Scotia", "NovaSco"], "Nunavut", "Ontario",
                      ["Prince Edward Island", "PEI"], "Quebec", "Saskatchewan",
                      "Yukon"]

        state = ""
        shortstate = ""

        engine.table = table
        engine.create_table()

        for state in stateslist:
            try:
                # plain strings use their first seven characters as the
                # archive name; [name, shortname] pairs are unpacked instead
                if len(state) > 2:
                    shortstate = state[0:7]
                else:
                    state, shortstate = state[0], state[1]

                print("Inserting data from " + state + "...")
                try:
                    engine.table.cleanup = Cleanup()
                    engine.insert_data_from_archive(
                        self.urls["counts"] + shortstate + ".zip",
                        [shortstate + ".csv"])
                except:
                    print("Failed bulk insert on " + state + ", inserting manually.")
                    engine.connection.rollback()
                    engine.table.cleanup = Cleanup(correct_invalid_value, nulls=['*'])
                    engine.insert_data_from_archive(
                        self.urls["counts"] + shortstate + ".zip",
                        [shortstate + ".csv"])
            except:
                print("There was an error in " + state + ".")
                raise
    except zipfile.BadZipfile:
        print("There was an unexpected error in the Breeding Bird Survey archives.")
        raise
    return engine
def set_table_delimiter(self, file_path):
    dataset_file = open_fr(file_path)
    self.auto_get_delimiter(dataset_file.readline())
    dataset_file.close()
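# auto_get_delimiter is given only the first line of the file. A comparable
# standalone approach using the stdlib's csv.Sniffer (an assumption, not
# necessarily how auto_get_delimiter is implemented):
import csv

def sniff_delimiter(file_path, candidates=",\t;|"):
    with open(file_path) as f:
        return csv.Sniffer().sniff(f.readline(), delimiters=candidates).delimiter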
def download(self, engine=None, debug=False): try: Script.download(self, engine, debug) engine = self.engine # Species table table = Table("species", cleanup=Cleanup(), contains_pk=True, header_rows=9) table.columns=[("species_id", ("pk-int",) ), ("AOU", ("int",) ), ("english_common_name", ("char",50) ), ("french_common_name", ("char",50) ), ("spanish_common_name", ("char",50) ), ("sporder", ("char",30) ), ("family", ("char",30) ), ("genus", ("char",30) ), ("species", ("char",50) ), ] table.fixed_width = [7,6,51,51,51,51,51,51,50] engine.table = table engine.create_table() engine.insert_data_from_url(self.urls["species"]) # Routes table engine.download_files_from_archive(self.urls["routes"], ["routes.csv"]) engine.auto_create_table(Table("routes", cleanup=Cleanup()), filename="routes.csv") engine.insert_data_from_file(engine.format_filename("routes.csv")) # Weather table if not os.path.isfile(engine.format_filename("weather_new.csv")): engine.download_files_from_archive(self.urls["weather"], ["weather.csv"]) read = open_fr(engine.format_filename("weather.csv")) write = open_fw(engine.format_filename("weather_new.csv")) print("Cleaning weather data...") for line in read: values = line.split(',') newvalues = [] for value in values: if ':' in value: newvalues.append(value.replace(':', '')) elif value == "N": newvalues.append(None) else: newvalues.append(value) write.write(','.join(str(value) for value in newvalues)) write.close() read.close() engine.auto_create_table(Table("weather", pk="RouteDataId", cleanup=Cleanup(correct_invalid_value, nulls=['NULL'])), filename="weather_new.csv") engine.insert_data_from_file(engine.format_filename("weather_new.csv")) # Region_codes table table = Table("region_codes", pk=False, header_rows=11, fixed_width=[11, 11, 30]) def regioncodes_cleanup(value, engine): replace = {chr(225):"a", chr(233):"e", chr(237):"i", chr(243):"o"} newvalue = str(value) for key in list(replace.keys()): if key in newvalue: newvalue = newvalue.replace(key, replace[key]) return newvalue table.cleanup = Cleanup(regioncodes_cleanup) table.columns=[("countrynum" , ("int",) ), ("regioncode" , ("int",) ), ("regionname" , ("char",30) )] engine.table = table engine.create_table() engine.insert_data_from_url(self.urls["region_codes"]) # Counts table table = Table("counts", delimiter=',') table.columns=[("record_id" , ("pk-auto",) ), ("countrynum" , ("int",) ), ("statenum" , ("int",) ), ("Route" , ("int",) ), ("RPID" , ("int",) ), ("Year" , ("int",) ), ("Aou" , ("int",) ), ("Count10" , ("int",) ), ("Count20" , ("int",) ), ("Count30" , ("int",) ), ("Count40" , ("int",) ), ("Count50" , ("int",) ), ("StopTotal" , ("int",) ), ("SpeciesTotal" , ("int",) )] stateslist = ["Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado", "Connecticut", "Delaware", "Florida", "Georgia", "Idaho", "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky", "Louisiana", "Maine", "Maryland", "Massachusetts", "Michigan", "Minnesota", "Mississippi", "Missouri", "Montana", "Nebraska", "Nevada", ["New Hampshire", "NHampsh"], ["New Jersey", "NJersey"], ["New Mexico", "NMexico"], ["New York", "NYork"], ["North Carolina", "NCaroli"], ["North Dakota", "NDakota"], "Ohio", "Oklahoma", "Oregon", "Pennsylvania", ["Rhode Island", "RhodeIs"], ["South Carolina", "SCaroli"], ["South Dakota", "SDakota"], "Tennessee", "Texas", "Utah", "Vermont", "Virginia", "Washington", ["West Virginia", "W_Virgi"], "Wisconsin", "Wyoming", "Alberta", ["British Columbia", "BritCol"], "Manitoba", ["New Brunswick", "NBrunsw"], ["Northwest 
Territories", "NWTerri"], "Newfoundland", ["Nova Scotia", "NovaSco"], "Nunavut", "Ontario", ["Prince Edward Island", "PEI"], "Quebec", "Saskatchewan", "Yukon"] state = "" shortstate = "" engine.table = table engine.create_table() for state in stateslist: try: if len(state) > 2: shortstate = state[0:7] else: state, shortstate = state[0], state[1] print("Inserting data from " + state + "...") try: engine.table.cleanup = Cleanup() engine.insert_data_from_archive(self.urls["counts"] + shortstate + ".zip", [shortstate + ".csv"]) except: print("Failed bulk insert on " + state + ", inserting manually.") engine.connection.rollback() engine.table.cleanup = Cleanup(correct_invalid_value, nulls=['*']) engine.insert_data_from_archive(self.urls["counts"] + shortstate + ".zip", [shortstate + ".csv"]) except: print("There was an error in " + state + ".") raise except zipfile.BadZipfile: print("There was an unexpected error in the Breeding Bird Survey archives.") raise return engine
def download(self, engine=None, debug=False): try: Script.download(self, engine, debug) engine = self.engine # Species table table = Table("species", cleanup=Cleanup(), contains_pk=True, header_rows=9) table.columns=[("species_id", ("pk-int",) ), ("AOU", ("int",) ), ("english_common_name", ("char",50) ), ("french_common_name", ("char",50) ), ("spanish_common_name", ("char",50) ), ("sporder", ("char",30) ), ("family", ("char",30) ), ("genus", ("char",30) ), ("species", ("char",50) ), ] table.fixed_width = [7,6,51,51,51,51,51,51,50] engine.table = table engine.create_table() engine.insert_data_from_url(self.urls["species"]) # Routes table engine.download_files_from_archive(self.urls["routes"], ["routes.csv"]) engine.auto_create_table(Table("routes", cleanup=Cleanup()), filename="routes.csv") engine.insert_data_from_file(engine.format_filename("routes.csv")) # Weather table if not os.path.isfile(engine.format_filename("weather_new.csv")): engine.download_files_from_archive(self.urls["weather"], ["weather.csv"]) read = open_fr(engine.format_filename("weather.csv")) write = open_fw(engine.format_filename("weather_new.csv")) print("Cleaning weather data...") for line in read: values = line.split(',') newvalues = [] for value in values: if ':' in value: newvalues.append(value.replace(':', '')) elif value == "N": newvalues.append(None) else: newvalues.append(value) write.write(','.join(str(value) for value in newvalues)) write.close() read.close() engine.auto_create_table(Table("weather", pk="RouteDataId", cleanup=Cleanup(correct_invalid_value, nulls=['NULL'])), filename="weather_new.csv") engine.insert_data_from_file(engine.format_filename("weather_new.csv")) # Region_codes table table = Table("region_codes", pk=False, header_rows=11, fixed_width=[11, 11, 30]) def regioncodes_cleanup(value, engine): replace = {chr(225):"a", chr(233):"e", chr(237):"i", chr(243):"o"} newvalue = str(value) for key in list(replace.keys()): if key in newvalue: newvalue = newvalue.replace(key, replace[key]) return newvalue table.cleanup = Cleanup(regioncodes_cleanup) table.columns=[("countrynum" , ("int",) ), ("regioncode" , ("int",) ), ("regionname" , ("char",30) )] engine.table = table engine.create_table() engine.insert_data_from_url(self.urls["region_codes"]) # Counts table table = Table("counts", pk=False, delimiter=',') table.columns=[("RouteDataID" , ("int",) ), ("countrynum" , ("int",) ), ("statenum" , ("int",) ), ("Route" , ("int",) ), ("RPID" , ("int",) ), ("year" , ("int",) ), ("AOU" , ("int",) ), ("Stop1" , ("int",) ), ("Stop2" , ("int",) ), ("Stop3" , ("int",) ), ("Stop4" , ("int",) ), ("Stop5" , ("int",) ), ("Stop6" , ("int",) ), ("Stop7" , ("int",) ), ("Stop8" , ("int",) ), ("Stop9" , ("int",) ), ("Stop10" , ("int",) ), ("Stop11" , ("int",) ), ("Stop12" , ("int",) ), ("Stop13" , ("int",) ), ("Stop14" , ("int",) ), ("Stop15" , ("int",) ), ("Stop16" , ("int",) ), ("Stop17" , ("int",) ), ("Stop18" , ("int",) ), ("Stop19" , ("int",) ), ("Stop20" , ("int",) ), ("Stop21" , ("int",) ), ("Stop22" , ("int",) ), ("Stop23" , ("int",) ), ("Stop24" , ("int",) ), ("Stop25" , ("int",) ), ("Stop26" , ("int",) ), ("Stop27" , ("int",) ), ("Stop28" , ("int",) ), ("Stop29" , ("int",) ), ("Stop30" , ("int",) ), ("Stop31" , ("int",) ), ("Stop32" , ("int",) ), ("Stop33" , ("int",) ), ("Stop34" , ("int",) ), ("Stop35" , ("int",) ), ("Stop36" , ("int",) ), ("Stop37" , ("int",) ), ("Stop38" , ("int",) ), ("Stop39" , ("int",) ), ("Stop40" , ("int",) ), ("Stop41" , ("int",) ), ("Stop42" , ("int",) ), ("Stop43" , ("int",) ), 
("Stop44" , ("int",) ), ("Stop45" , ("int",) ), ("Stop46" , ("int",) ), ("Stop47" , ("int",) ), ("Stop48" , ("int",) ), ("Stop49" , ("int",) ), ("Stop50" , ("int",) )] part = "" engine.table = table engine.create_table() for part in range(1,11): part = str(part) try: print("Inserting data from part " + part + "...") try: engine.table.cleanup = Cleanup() engine.insert_data_from_archive(self.urls["counts"] + "Fifty" + part + ".zip", ["fifty" + part + ".csv"]) except: print("Failed bulk insert on " + part + ", inserting manually.") engine.connection.rollback() engine.table.cleanup = Cleanup(correct_invalid_value, nulls=['*']) engine.insert_data_from_archive(self.urls["counts"] + "Fifty" + part + ".zip", ["fifty" + part + ".csv"]) except: print("There was an error in part " + part + ".") raise except zipfile.BadZipfile: print("There was an unexpected error in the Breeding Bird Survey archives.") raise return engine