Example #1
    def test_limit(self):
        # TODO: also exercise limit against a file without a header
        reader = FileReader(f("data/2018_Yellow_Taxi_Trip_Data_1000.csv"),
                            delimiter=";",
                            limit=10,
                            has_header=True)
        count = 0
        for doc in reader.readline(limit=10):
            count += 1

        self.assertEqual(count, 10)
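For contrast, a minimal sketch that counts every row when no limit is set, reusing only the FileReader calls visible in these examples (including the f() path helper they all rely on):

reader = FileReader(f("data/2018_Yellow_Taxi_Trip_Data_1000.csv"),
                    delimiter=";",
                    has_header=True)
count = sum(1 for _ in reader.readline())  # presumably 1000 data rows, per the file name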
Example #2
    def test_fieldfile_nomatch(self):
        # The field file describes the A&E dataset, so parsing rows from
        # inventory.csv must fail with a ValueError.
        fc = FieldFile(f("data/AandE_Data_2011-04-10.tff"))
        parser = LineToDictParser(fc)
        reader = FileReader(f('data/inventory.csv'), has_header=True)
        bw = FileWriter(self._col, reader=reader, parser=parser)
        with self.assertRaises(ValueError):
            bw.write()
Example #3
    def test_delimiter_header(self):
        start_count = self._col.count_documents({})
        fc = FieldFile(f("data/AandE_Data_2011-04-10.tff"))
        parser = LineToDictParser(fc)
        reader = FileReader(f('data/AandE_Data_2011-04-10.csv'), has_header=True)
        bw = FileWriter(self._col, reader=reader, parser=parser)
        bw.write()
        self.assertEqual(self._col.count_documents({}) - start_count, 300)
Example #4
    def test_delimiter_no_header(self):
        start_count = self._col.count_documents({})
        fc = FieldFile(f("data/10k.tff"))
        parser = LineToDictParser(fc)
        reader = FileReader(f("data/10k.txt"), has_header=False, delimiter="|")
        bw = FileWriter(self._col, reader=reader, parser=parser)
        bw.write()
        self.assertEqual(self._col.count_documents({}) - start_count, 10000)
Example #5
    def test_new_delimiter_and_timeformat_header(self):
        start_count = self._col.count_documents({})
        fc = FieldFile(f("data/mot.tff"))
        parser = LineToDictParser(fc)
        reader = FileReader(f('data/mot_test_set_small.csv'), has_header=False, delimiter="|")
        self.assertIsInstance(reader.name, str)
        bw = FileWriter(self._col, reader=reader, parser=parser)
        total = bw.write()
        lines = LineCounter(f('data/mot_test_set_small.csv')).line_count
        inserted_count = self._col.count_documents({}) - start_count
        self.assertEqual(inserted_count, total)
        self.assertEqual(inserted_count, lines)
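LineCounter itself is not defined in these excerpts; its only visible contract is a line_count attribute. A purely hypothetical stand-in consistent with that usage:

class LineCounter:
    # Hypothetical reconstruction: expose the number of lines in a file
    # as .line_count; the project's real class may do more than this.
    def __init__(self, filename):
        with open(filename, "r") as fh:
            self.line_count = sum(1 for _ in fh)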
Example #6
    def test_date(self):
        config = FieldFile(f("data/inventory_dates.tff"))
        parser = LineToDictParser(config, locator=False)  # the comparison below fails if locator is True
        reader = FileReader(f("data/inventory.csv"), has_header=True)
        start_count = self._col.count_documents({})
        writer = FileWriter(self._col, reader=reader, parser=parser)
        docs_written = writer.write()
        line_count = LineCounter(f("data/inventory.csv")).line_count
        self.assertEqual(self._col.count_documents({}) - start_count, line_count - 1)  # header must be subtracted
        self.assertEqual(self._col.count_documents({}), docs_written)

        nuts_doc = self._col.find_one({"Last Order": dateutil.parser.parse("29-Feb-2016")})
        self.assertTrue(nuts_doc)
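The find_one lookup above relies on dateutil parsing "29-Feb-2016" into a datetime (2016 is a leap year, so the date is valid). A standalone check:

import dateutil.parser

# Verify that the literal used in the query parses to the expected date.
dt = dateutil.parser.parse("29-Feb-2016")
assert (dt.year, dt.month, dt.day) == (2016, 2, 29)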
Example #7
    def test_generate_fieldfile(self):
        fc = FieldFile.generate_field_file(f("data/inventory.csv"), ext="testff")
        self.assertEqual(fc.field_filename, f("data/inventory.testff"), fc.field_filename)
        self.assertTrue(os.path.isfile(f("data/inventory.testff")), f("data/inventory.testff"))
        parser = LineToDictParser(fc)
        reader = FileReader(f("data/inventory.csv"), has_header=True)
        start_count = self._col.count_documents({})
        writer = FileWriter(self._col, reader=reader, parser=parser)
        write_count = writer.write()
        line_count = LineCounter(f("data/inventory.csv")).line_count
        new_inserted_count = self._col.count_documents({}) - start_count
        self.assertEqual(new_inserted_count, write_count)
        self.assertEqual(new_inserted_count, line_count - 1)  # header must be subtracted
        os.unlink(f("data/inventory.testff"))
Example #8
    def test_http_import(self):
        if check_internet():
            csv_parser = LineToDictParser(self._ff)
            reader = FileReader(
                "https://data.cityofnewyork.us/api/views/biws-g3hs/rows.csv?accessType=DOWNLOAD&bom=true&format=true&delimiter=%3B",
                has_header=True,
                delimiter=';')

            writer = FileWriter(self._collection, reader, csv_parser)
            before_doc_count = self._collection.count_documents({})
            writer.write(1000)
            after_doc_count = self._collection.count_documents({})
            self.assertEqual(after_doc_count - before_doc_count, 1000)
        else:
            print("Warning: no internet connection, test_http_import() skipped")
Example #9
    def test_local_import(self):
        reader = FileReader(f("data/2018_Yellow_Taxi_Trip_Data_1000.csv"),
                            has_header=True,
                            delimiter=";")

        before_doc_count = self._collection.count_documents({})

        writer = FileWriter(self._collection,
                            reader=reader,
                            parser=self._parser)
        writer.write(10)

        after_doc_count = self._collection.count_documents({})

        self.assertEqual(after_doc_count - before_doc_count, 10)
Example #10
    def pre_execute(self, arg):
        super().pre_execute(arg)
        self._log.info(f"Using collection:'{self._collection.full_name}'")

        if self._field_filename is None:
            self._field_filename = FieldFile.make_default_tff_name(arg)

        self._log.info(f"Using field file:'{self._field_filename}'")

        if not os.path.isfile(self._field_filename):
            raise OSError(f"No such field file:'{self._field_filename}'")

        self._fieldinfo = FieldFile(self._field_filename)

        self._reader = FileReader(arg,
                                  limit=self._limit,
                                  has_header=self._has_header,
                                  delimiter=self._delimiter)
        self._parser = LineToDictParser(self._fieldinfo,
                                        locator=self._locator,
                                        timestamp=self._timestamp,
                                        onerror=self._onerror)
        self._writer = FileWriter(self._collection, self._reader, self._parser)
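FieldFile.make_default_tff_name(arg) is not shown in these excerpts. Judging by how Example #12 derives field-file names with os.path.splitext, a hypothetical reconstruction might be:

import os

def make_default_tff_name(filename):
    # Hypothetical sketch: derive 'data/10k.tff' from 'data/10k.txt' by
    # swapping the extension; the real method may behave differently.
    return os.path.splitext(filename)[0] + ".tff"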
Example #11
    def test_reader(self):
        fc = FieldFile.generate_field_file(f("data/inventory.csv"), f("data/inventory_test.tff"))
        ff = FieldFile(fc.field_filename)
        reader = FileReader(f("data/inventory.csv"), has_header=True)
        parser = LineToDictParser(ff)
        for i, row in enumerate(reader.readline(), 1):
            doc = parser.parse_list(row, i)
            for field in ff.fields():
                self.assertIn(field, doc, f"'{field}'")

        os.unlink(fc.field_filename)

        ff = FieldFile(f("data/uk_property_prices.tff"))
        reader = FileReader(f("data/uk_property_prices.csv"), has_header=True)

        parser = LineToDictParser(ff)
        for i, row in enumerate(reader.readline(), 1):  # restart line numbering from 1
            doc = parser.parse_list(row, i)
            for field in ff.fields():
                if field == "txn":  # converted to _id field
                    continue
                self.assertIn(field, doc, f"{field} not present")
                self.assertIsInstance(doc["Price"], int)
                self.assertIsInstance(doc["Date of Transfer"], datetime)
Example #12
    @staticmethod
    def generate_field_file(csv_filename,
                            ff_filename=None,
                            ext=DEFAULT_EXTENSION,
                            delimiter=","):

        toml_dict: dict = {}
        if not ext.startswith("."):
            ext = f".{ext}"

        if ff_filename is None:
            if csv_filename.startswith("http://") or csv_filename.startswith("https://"):
                ff_filename = csv_filename.split('/')[-1]
                ff_filename = os.path.splitext(ff_filename)[0] + ext
            else:
                ff_filename = os.path.splitext(csv_filename)[0] + ext

        reader = FileReader(csv_filename, has_header=True, delimiter=delimiter)
        first_line = next(reader.readline())
        header_line = reader.header_line
        if len(first_line) > len(header_line):
            raise ValueError(
                f"First line has more columns than the header line: "
                f"{len(first_line)} > {len(header_line)}")
        elif len(first_line) < len(header_line):
            raise ValueError(
                f"First line has fewer columns than the header line: "
                f"{len(first_line)} < {len(header_line)}")
        else:
            for i, (key, value) in enumerate(zip(header_line, first_line)):
                value = value.strip()
                if value == "":
                    value = f"blank-{i}"

                if value.startswith('"'):  # strip out quotes if they exist
                    value = value.strip('"')
                if value.startswith("'"):
                    value = value.strip("'")
                key = key.replace('$', '_')  # '$' is not valid in MongoDB keys
                key = key.replace('.', '_')  # '.' is not valid in MongoDB keys
                t = Converter.guess_type(value)
                key = key.strip()  # remove any whitespace inside quotes
                toml_dict[key] = {"type": t, "name": key}

            with open(ff_filename, "w") as ff_file:
                toml_string = toml.dumps(toml_dict)
                ff_file.write("#\n")
                ts = datetime.utcnow()
                ff_file.write(
                    f"# Created '{ff_filename}' at UTC:{ts} by class {__name__}\n"
                )
                ff_file.write("#\n")
                ff_file.write(toml_string)
                ff_file.write("#end\n")

        return FieldFile(ff_filename)
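To illustrate the output format of generate_field_file: each CSV column becomes a TOML table carrying type and name keys. A sketch with invented column names (hypothetical, for illustration only):

import toml

# Hypothetical columns; a real field file is derived from the CSV header
# and the guessed type of each value in the first data row.
toml_dict = {
    "Part No": {"type": "int", "name": "Part No"},
    "Description": {"type": "str", "name": "Description"},
}
print(toml.dumps(toml_dict))
# Expected shape of the output (one TOML table per column):
#   ["Part No"]
#   type = "int"
#   name = "Part No"
#   [Description]
#   type = "str"
#   name = "Description"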