def test_limit(self):
    """A limit of 10 passed to readline() must yield exactly 10 documents."""
    # BUG FIX: the FileReader construction was commented out, leaving
    # `reader` undefined so the loop raised NameError.
    # TODO(review): original comment says a no-header file should also be
    # covered here — add that case when a suitable fixture exists.
    reader = FileReader(f("data/2018_Yellow_Taxi_Trip_Data_1000.csv"),
                        delimiter=";", limit=10, has_header=True)
    count = 0
    for doc in reader.readline(limit=10):
        count = count + 1
    self.assertEqual(count, 10)
def test_fieldfile_nomatch(self):
    """Writing a CSV with a non-matching field file must raise ValueError."""
    field_file = FieldFile(f("data/AandE_Data_2011-04-10.tff"))
    line_parser = LineToDictParser(field_file)
    csv_reader = FileReader(f('data/inventory.csv'), has_header=True)
    writer = FileWriter(self._col, reader=csv_reader, parser=line_parser)
    with self.assertRaises(ValueError):
        writer.write()
def test_delimiter_header(self):
    """Import a comma-delimited CSV with a header; expect 300 new docs."""
    docs_before = self._col.count_documents({})
    field_file = FieldFile(f("data/AandE_Data_2011-04-10.tff"))
    line_parser = LineToDictParser(field_file)
    csv_reader = FileReader(f('data/AandE_Data_2011-04-10.csv'), has_header=True)
    writer = FileWriter(self._col, reader=csv_reader, parser=line_parser)
    writer.write()
    docs_after = self._col.count_documents({})
    self.assertEqual(docs_after - docs_before, 300)
def test_delimiter_no_header(self):
    """Import a pipe-delimited, headerless file; expect 10000 new docs."""
    docs_before = self._col.count_documents({})
    field_file = FieldFile(f("data/10k.tff"))
    line_parser = LineToDictParser(field_file)
    pipe_reader = FileReader(f("data/10k.txt"), has_header=False, delimiter="|")
    writer = FileWriter(self._col, reader=pipe_reader, parser=line_parser)
    writer.write()
    docs_after = self._col.count_documents({})
    self.assertEqual(docs_after - docs_before, 10000)
def test_new_delimiter_and_timeformat_header(self):
    """Import a pipe-delimited MOT data set and check every line is inserted.

    Verifies that the writer's return value, the collection delta and the
    physical line count of the file all agree.
    """
    start_count = self._col.count_documents({})
    fc = FieldFile(f("data/mot.tff"))
    parser = LineToDictParser(fc)
    reader = FileReader(f('data/mot_test_set_small.csv'),
                        has_header=False, delimiter="|")
    # IMPROVED: assertIsInstance is the idiomatic type check and gives a
    # clearer failure message than assertTrue(type(...) == str).
    self.assertIsInstance(reader.name, str)
    bw = FileWriter(self._col, reader=reader, parser=parser)
    total = bw.write()
    lines = LineCounter(f('data/mot_test_set_small.csv')).line_count
    inserted_count = self._col.count_documents({}) - start_count
    self.assertEqual(inserted_count, total)
    self.assertEqual(inserted_count, lines)
def test_date(self):
    """Date fields declared in the field file must round-trip as datetimes."""
    config = FieldFile(f("data/inventory_dates.tff"))
    # locator must stay off: a locator field would break the comparison below
    parser = LineToDictParser(config, locator=False)
    reader = FileReader(f("data/inventory.csv"), has_header=True)
    docs_before = self._col.count_documents({})
    writer = FileWriter(self._col, reader=reader, parser=parser)
    docs_written = writer.write()
    line_count = LineCounter(f("data/inventory.csv")).line_count
    docs_after = self._col.count_documents({})
    # header line is not inserted, hence the minus one
    self.assertEqual(docs_after - docs_before, line_count - 1)
    self.assertEqual(docs_after, docs_written)
    nuts_doc = self._col.find_one(
        {"Last Order": dateutil.parser.parse("29-Feb-2016")})
    self.assertTrue(nuts_doc)
def test_generate_fieldfile(self):
    """generate_field_file must create a usable field file with a custom ext."""
    generated = FieldFile.generate_field_file(f("data/inventory.csv"),
                                              ext="testff")
    expected_path = f("data/inventory.testff")
    self.assertEqual(generated.field_filename, expected_path,
                     generated.field_filename)
    self.assertTrue(os.path.isfile(expected_path), expected_path)
    line_parser = LineToDictParser(generated)
    csv_reader = FileReader(f("data/inventory.csv"), has_header=True)
    docs_before = self._col.count_documents({})
    writer = FileWriter(self._col, reader=csv_reader, parser=line_parser)
    write_count = writer.write()
    line_count = LineCounter(f("data/inventory.csv")).line_count
    new_inserted_count = self._col.count_documents({}) - docs_before
    self.assertEqual(new_inserted_count, write_count)
    # header line is not inserted, hence the minus one
    self.assertEqual(new_inserted_count, line_count - 1)
    os.unlink(expected_path)
def test_http_import(self):
    """Stream a remote CSV over HTTP and insert the first 1000 documents."""
    if not check_internet():
        # IMPROVED: skipTest makes the runner report a skip instead of a
        # silent pass with only a printed warning.
        self.skipTest("No internet: test_http_import() skipped")
    csv_parser = LineToDictParser(self._ff)
    reader = FileReader(
        "https://data.cityofnewyork.us/api/views/biws-g3hs/rows.csv?accessType=DOWNLOAD&bom=true&format=true&delimiter=%3B",
        has_header=True,
        delimiter=';')
    writer = FileWriter(self._collection, reader, csv_parser)
    before_doc_count = self._collection.count_documents({})
    # NOTE(review): the subtraction below assumes write() returns the
    # post-write total document count — confirm against FileWriter.write.
    after_doc_count = writer.write(1000)
    self.assertEqual(after_doc_count - before_doc_count, 1000)
def test_local_import(self):
    """Importing a local CSV with write(10) must insert exactly 10 docs."""
    taxi_reader = FileReader(f("data/2018_Yellow_Taxi_Trip_Data_1000.csv"),
                             has_header=True, delimiter=";")
    docs_before = self._collection.count_documents({})
    writer = FileWriter(self._collection, reader=taxi_reader,
                        parser=self._parser)
    writer.write(10)
    docs_after = self._collection.count_documents({})
    self.assertEqual(docs_after - docs_before, 10)
def pre_execute(self, arg):
    """Prepare the import pipeline for file *arg* before execution.

    Resolves the field file (falling back to the default .tff name derived
    from *arg*), then builds the reader, parser and writer used by the
    subsequent execute step.

    Raises:
        OSError: if the resolved field file does not exist on disk.
    """
    # print(f"'{arg}'")
    super().pre_execute(arg)
    self._log.info("Using collection:'{}'".format(self._collection.full_name))
    # No explicit field file given: derive the default name from the input file.
    if self._field_filename is None:
        self._field_filename = FieldFile.make_default_tff_name(arg)
    self._log.info(f"Using field file:'{self._field_filename}'")
    # Fail fast before constructing any pipeline objects.
    if not os.path.isfile(self._field_filename):
        raise OSError(f"No such field file:'{self._field_filename}'")
    self._fieldinfo = FieldFile(self._field_filename)
    # Reader -> parser -> writer: each stage feeds the next in execute().
    self._reader = FileReader(arg, limit=self._limit, has_header=self._has_header, delimiter=self._delimiter)
    self._parser = LineToDictParser(self._fieldinfo, locator=self._locator, timestamp=self._timestamp, onerror=self._onerror)
    self._writer = FileWriter(self._collection,self._reader,self._parser)
def test_reader(self):
    """Parsed docs must contain every field declared in the field file.

    Covers a generated field file (inventory.csv) and a pre-existing one
    (uk_property_prices) including type conversion of int and date fields.
    """
    fc = FieldFile.generate_field_file(f("data/inventory.csv"),
                                       f("data/inventory_test.tff"))
    ff = FieldFile(fc.field_filename)
    reader = FileReader(f("data/inventory.csv"), has_header=True)
    parser = LineToDictParser(ff)
    for i, row in enumerate(reader.readline(), 1):
        doc = parser.parse_list(row, i)
        for field in ff.fields():
            self.assertTrue(field in doc, f"'{field}'")
    os.unlink(fc.field_filename)

    ff = FieldFile(f("data/uk_property_prices.tff"))
    reader = FileReader(f("data/uk_property_prices.csv"), has_header=True)
    parser = LineToDictParser(ff)
    # BUG FIX: the second enumerate started from the leftover counter of the
    # first loop (enumerate(..., i)), so parse_list received wrong line
    # numbers for the second file; line numbering must restart at 1.
    for i, row in enumerate(reader.readline(), 1):
        doc = parser.parse_list(row, i)
        for field in ff.fields():
            if field == "txn":  # converted to _id field
                continue
            self.assertTrue(field in doc, f"{field} not present")
        # IMPROVED: assertIsInstance over assertTrue(type(...) == T).
        self.assertIsInstance(doc["Price"], int)
        self.assertIsInstance(doc["Date of Transfer"], datetime)
def generate_field_file(csv_filename, ff_filename=None, ext=DEFAULT_EXTENSION, delimiter=","):
    """Create a TOML field file describing the columns of *csv_filename*.

    The header line supplies the field names; the first data line is used
    to guess each field's type via Converter.guess_type.

    Args:
        csv_filename: path or http(s) URL of the source CSV file.
        ff_filename: output field-file path; defaults to the CSV base name
            (last URL path segment for http/https sources) plus *ext*.
        ext: extension for the generated field file (leading dot optional).
        delimiter: CSV delimiter passed to FileReader.

    Returns:
        A FieldFile loaded from the newly written file.

    Raises:
        ValueError: if the header line and first data line have different
            numbers of columns.
    """
    toml_dict: dict = {}
    if not ext.startswith("."):
        ext = f".{ext}"
    if ff_filename is None:
        if csv_filename.startswith("http://") or csv_filename.startswith("https://"):
            # For URLs, use only the final path segment as the base name.
            ff_filename = csv_filename.split('/')[-1]
            ff_filename = os.path.splitext(ff_filename)[0] + ext
        else:
            ff_filename = os.path.splitext(csv_filename)[0] + ext

    reader = FileReader(csv_filename, has_header=True, delimiter=delimiter)
    first_line = next(reader.readline())
    header_line = reader.header_line

    # BUG FIX: the original error messages put the f-prefix on the wrong
    # string fragment (placeholders were emitted literally), referenced
    # undefined names (column_names/column_values), and described the
    # mismatch the wrong way round relative to the condition tested.
    if len(first_line) > len(header_line):
        raise ValueError(
            f"First data line has more columns than the header line: "
            f"{len(first_line)} > {len(header_line)}")
    if len(first_line) < len(header_line):
        raise ValueError(
            f"Header line has more columns than the first data line: "
            f"{len(header_line)} > {len(first_line)}")

    for i, (key, value) in enumerate(zip(header_line, first_line)):
        value = value.strip()
        if value == "":
            # Placeholder so type guessing has something to work with.
            value = f"blank-{i}"
        # Strip surrounding quotes if they exist.
        if value.startswith('"'):
            value = value.strip('"')
        if value.startswith("'"):
            value = value.strip("'")
        key = key.replace('$', '_')  # '$' is not valid in mongodb keys
        key = key.replace('.', '_')  # '.' is not valid in mongodb keys
        t = Converter.guess_type(value)
        key = key.strip()  # remove any white space inside quotes
        toml_dict[key] = {"type": t, "name": key}

    with open(ff_filename, "w") as ff_file:
        toml_string = toml.dumps(toml_dict)
        ff_file.write("#\n")
        ts = datetime.utcnow()
        ff_file.write(f"# Created '{ff_filename}' at UTC:{ts} by class {__name__}\n")
        ff_file.write("#\n")
        ff_file.write(toml_string)
        ff_file.write(f"#end\n")
    return FieldFile(ff_filename)