def test_nyc_2016_genfieldfile(self):
    fc = FieldFile.generate_field_file(f('data/2018_Yellow_Taxi_Trip_Data_1000.csv'),
                                       delimiter=";")
    fc_new = FieldFile(fc.field_filename)
    self.assertEqual(fc.fields(), fc_new.fields())
    os.unlink(fc.field_filename)

def test_fieldfile_nomatch(self):
    fc = FieldFile(f("data/AandE_Data_2011-04-10.tff"))
    parser = LineToDictParser(fc)
    reader = FileReader(f('data/inventory.csv'), has_header=True)
    bw = FileWriter(self._col, reader=reader, parser=parser)
    with self.assertRaises(ValueError):
        bw.write()

def test_delimiter_header(self):
    start_count = self._col.count_documents({})
    fc = FieldFile(f("data/AandE_Data_2011-04-10.tff"))
    parser = LineToDictParser(fc)
    reader = FileReader(f('data/AandE_Data_2011-04-10.csv'), has_header=True)
    bw = FileWriter(self._col, reader=reader, parser=parser)
    bw.write()
    self.assertEqual(self._col.count_documents({}) - start_count, 300)

def test_delimiter_no_header(self):
    start_count = self._col.count_documents({})
    fc = FieldFile(f("data/10k.tff"))
    parser = LineToDictParser(fc)
    reader = FileReader(f("data/10k.txt"), has_header=False, delimiter="|")
    bw = FileWriter(self._col, reader=reader, parser=parser)
    bw.write()
    self.assertEqual(self._col.count_documents({}) - start_count, 10000)

def test_FieldConfig(self):
    fc = FieldFile(f("data/test_fieldconfig.tff"))
    self.assertEqual(len(fc.fields()), 4)
    self.assertEqual(fc.fields()[0], "Test 1")
    self.assertEqual(fc.fields()[3], "Test 4")

    fc: FieldFile = FieldFile(f("data/uk_property_prices.tff"))
    self.assertEqual(len(fc.fields()), 16)
    self.assertEqual(fc.fields()[0], "txn")
    self.assertEqual(fc.fields()[2], "Date of Transfer")
    self.assertEqual(fc.fields()[14], "PPD Category Type")

def test_new_delimiter_and_timeformat_header(self):
    start_count = self._col.count_documents({})
    fc = FieldFile(f("data/mot.tff"))
    parser = LineToDictParser(fc)
    reader = FileReader(f('data/mot_test_set_small.csv'), has_header=False, delimiter="|")
    self.assertTrue(isinstance(reader.name, str))
    bw = FileWriter(self._col, reader=reader, parser=parser)
    total = bw.write()
    lines = LineCounter(f('data/mot_test_set_small.csv')).line_count
    inserted_count = self._col.count_documents({}) - start_count
    self.assertEqual(inserted_count, total)
    self.assertEqual(inserted_count, lines)

def test_Config_File(self):
    ff = FieldFile(f("data/10k.tff"))
    self.assertTrue("test_id" in ff.fields())
    self.assertTrue("cylinder_capacity" in ff.fields())
    self.assertEqual(ff.type_value("test_id"), "int")
    self.assertEqual(ff.type_value("test_date"), "datetime")

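# For context on the assertions above: a field file (.tff) maps each CSV
# column to the type used when parsing its values. A minimal sketch of the
# assumed layout (INI-style, one section per field; illustrative only, not
# the verbatim contents of data/10k.tff):
#
#   [test_id]
#   type=int
#
#   [test_date]
#   type=datetime
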
def test_date(self):
    config = FieldFile(f("data/inventory_dates.tff"))
    # locator=True adds extra fields that would break the comparison below
    parser = LineToDictParser(config, locator=False)
    reader = FileReader(f("data/inventory.csv"), has_header=True)
    start_count = self._col.count_documents({})
    writer = FileWriter(self._col, reader=reader, parser=parser)
    docs_written = writer.write()
    line_count = LineCounter(f("data/inventory.csv")).line_count
    # the header line is not inserted, so subtract it from the line count
    self.assertEqual(self._col.count_documents({}) - start_count, line_count - 1)
    self.assertEqual(self._col.count_documents({}), docs_written)
    nuts_doc = self._col.find_one({"Last Order": dateutil.parser.parse("29-Feb-2016")})
    self.assertTrue(nuts_doc)

def processOneFile(self, input_filename, field_filename=None, hasheader=False,
                   restart=False, batchID=None):
    if not field_filename:
        field_filename = FieldFile.make_default_tff_name(input_filename)
    cmd = ImportCommand(collection=self._collection,
                        field_filename=field_filename,
                        delimiter=self._delimiter,
                        has_header=hasheader,
                        onerror=self._onerror,
                        limit=self._limit)
    cmd.run(input_filename)
    return cmd.total_written()

def test_generate_fieldfile(self):
    fc = FieldFile.generate_field_file(f("data/inventory.csv"), ext="testff")
    self.assertEqual(fc.field_filename, f("data/inventory.testff"), fc.field_filename)
    self.assertTrue(os.path.isfile(f("data/inventory.testff")), f("data/inventory.testff"))
    parser = LineToDictParser(fc)
    reader = FileReader(f("data/inventory.csv"), has_header=True)
    start_count = self._col.count_documents({})
    writer = FileWriter(self._col, reader=reader, parser=parser)
    write_count = writer.write()
    line_count = LineCounter(f("data/inventory.csv")).line_count
    new_inserted_count = self._col.count_documents({}) - start_count
    self.assertEqual(new_inserted_count, write_count)
    self.assertEqual(new_inserted_count, line_count - 1)  # the header line is not inserted
    os.unlink(f("data/inventory.testff"))

def pre_execute(self, arg):
    super().pre_execute(arg)
    self._log.info(f"Using collection:'{self._collection.full_name}'")

    if self._field_filename is None:
        self._field_filename = FieldFile.make_default_tff_name(arg)

    self._log.info(f"Using field file:'{self._field_filename}'")

    if not os.path.isfile(self._field_filename):
        raise OSError(f"No such field file:'{self._field_filename}'")

    self._fieldinfo = FieldFile(self._field_filename)
    self._reader = FileReader(arg,
                              limit=self._limit,
                              has_header=self._has_header,
                              delimiter=self._delimiter)
    self._parser = LineToDictParser(self._fieldinfo,
                                    locator=self._locator,
                                    timestamp=self._timestamp,
                                    onerror=self._onerror)
    self._writer = FileWriter(self._collection, self._reader, self._parser)

def run(self, filename):
    if not self._log:
        self._log = Logger(self._args.logname, self._args.loglevel).log()
        if not self._args.silent:
            Logger.add_stream_handler(self._args.logname)

    self._log.info("Started pymongoimport")

    if self._field_filename is None:
        self._field_filename = FieldFile.make_default_tff_name(filename)

    if self._write_concern == 0:  # pymongo won't allow other args with w=0 even if they are false
        client = pymongo.MongoClient(self._host, w=self._write_concern)
    else:
        client = pymongo.MongoClient(self._host,
                                     w=self._write_concern,
                                     fsync=self._fsync,
                                     j=self._journal)

    database = client[self._database_name]
    self._collection = database[self._collection_name]

    self._log.info(f"Write concern : {self._write_concern}")
    self._log.info(f"journal : {self._journal}")
    self._log.info(f"fsync : {self._fsync}")
    self._log.info(f"has header : {self._has_header}")

    cmd = ImportCommand(collection=self._collection,
                        field_filename=self._field_filename,
                        delimiter=self._delimiter,
                        has_header=self._has_header,
                        onerror=self._onerror,
                        limit=self._limit,
                        audit=self._audit,
                        locator=self._locator,
                        timestamp=self._timestamp,
                        id=self._batch_ID)
    cmd.run(filename)
    return 1

def test_http_generate_fieldfile(self):
    if check_internet():
        # Demographic_Statistics_By_Zip_Code.csv
        url = "https://data.cityofnewyork.us/api/views/kku6-nxdu/rows.csv?accessType=DOWNLOAD"
        ff_file = FieldFile.generate_field_file(
            url,
            delimiter=",",
            ff_filename=f("data/Demographic_Statistics_By_Zip_Code.tff"))
        self.assertTrue("JURISDICTION NAME" in ff_file.fields(), ff_file.fields())
        self.assertEqual(len(ff_file.fields()), 46)
        self.assertTrue("PERCENT PUBLIC ASSISTANCE TOTAL" in ff_file.fields())
        os.unlink(f("data/Demographic_Statistics_By_Zip_Code.tff"))
    else:
        print("Warning: no internet connection, skipping test that generates a field file from a URL")

def test_reader(self):
    fc = FieldFile.generate_field_file(f("data/inventory.csv"), f("data/inventory_test.tff"))
    ff = FieldFile(fc.field_filename)
    reader = FileReader(f("data/inventory.csv"), has_header=True)
    parser = LineToDictParser(ff)
    for i, row in enumerate(reader.readline(), 1):
        doc = parser.parse_list(row, i)
        for field in ff.fields():
            self.assertTrue(field in doc, f"'{field}'")
    os.unlink(fc.field_filename)

    ff = FieldFile(f("data/uk_property_prices.tff"))
    reader = FileReader(f("data/uk_property_prices.csv"), has_header=True)
    parser = LineToDictParser(ff)
    for i, row in enumerate(reader.readline(), 1):  # restart line numbering at 1 for the second file
        doc = parser.parse_list(row, i)
        for field in ff.fields():
            if field == "txn":  # converted to the _id field
                continue
            self.assertTrue(field in doc, f"{field} not present")
        self.assertTrue(isinstance(doc["Price"], int))
        self.assertTrue(isinstance(doc["Date of Transfer"], datetime))

def pymongoimport_main(input_args=None):
    """
    Expect to receive an array of args.

    1.3 : Added lots of support for the NHS Public Data sets project:
    --addfilename and --addtimestamp. Also we now fall back to string
    when type conversions fail.

    >>> pymongoimport_main( [ 'test_set_small.txt' ] )
    database: test, collection: test
    files ['test_set_small.txt']
    Processing : test_set_small.txt
    Completed processing : test_set_small.txt, (100 records)
    Processed test_set_small.txt
    """

    usage_message = """
pymongoimport is a python program that will import data into a mongodb
database (default 'test') and a mongodb collection (default 'test').

Each file in the input list must correspond to a fieldfile format that is
common across all the files. The fieldfile is specified by the --fieldfile
parameter.

An example run:

python pymongoimport.py --database demo --collection demo --fieldfile test_set_small.ff test_set_small.txt
"""

    parser = argparse.ArgumentParser(usage=usage_message)
    parser = add_standard_args(parser)

    if input_args:
        cmd = input_args
    else:
        cmd = tuple(sys.argv[1:])

    args = parser.parse_args(cmd)
    cmd_args = " ".join(cmd)

    log = Logger(args.logname, args.loglevel).log()
    if not args.silent:
        Logger.add_stream_handler(args.logname)

    if args.filelist:
        try:
            with open(args.filelist) as input_file:
                for line in input_file:
                    args.filenames.append(line.strip())  # strip the trailing newline
        except OSError as e:
            log.error(f"{e}")

    if args.writeconcern == 0:  # pymongo won't allow other args with w=0 even if they are false
        client = pymongo.MongoClient(args.host, w=args.writeconcern)
    else:
        client = pymongo.MongoClient(args.host, w=args.writeconcern, fsync=args.fsync, j=args.journal)

    if args.genfieldfile:
        args.has_header = True
        log.info('Forcing has_header true for --genfieldfile')
        cmd = GenerateFieldfileCommand(field_filename=args.fieldfile, delimiter=args.delimiter)
        for i in args.filenames:
            cmd.run(i)

    if args.audit:
        audit = Audit(client=client)
        batch_ID = audit.start_batch({"command": input_args})
    else:
        audit = None
        batch_ID = None

    if args.database:
        database_name = args.database
    else:
        database_name = "PYIM"

    if args.collection:
        collection_name = args.collection
    else:
        collection_name = "imported"

    database = client[database_name]
    collection = database[collection_name]

    if args.drop:
        if args.restart:
            log.info("Warning --restart overrides --drop, ignoring drop command")
        else:
            cmd = Drop_Command(audit=audit, id=batch_ID, database=database)
            cmd.run(collection_name)

    if args.fieldinfo:
        cfg = FieldFile(args.fieldinfo)
        for i, field in enumerate(cfg.fields(), 1):
            print(f"{i:3}. {field:25}:{cfg.type_value(field)}")
        print(f"Total fields: {len(cfg.fields())}")

    if not args.genfieldfile:
        if args.filenames:
            if args.audit:
                audit = Audit(client=client)
                batch_ID = audit.start_batch({"command": sys.argv})
            else:
                audit = None
                batch_ID = None

            process = Importer(audit, batch_ID, args)
            for i in args.filenames:
                try:
                    process.run(i)
                except OSError as e:
                    log.error(f"{e}")
                except exceptions.HTTPError as e:
                    log.error(f"{e}")

            if args.audit:
                audit.end_batch(batch_ID)
        else:
            log.info("No input files: Nothing to do")

    return 1

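# A conventional module entry point (a sketch; an installed package would
# typically expose pymongoimport_main via a console-script entry point instead):
if __name__ == "__main__":
    pymongoimport_main()
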
def test_generate_field_filename(self):
    gfc = FieldFile.generate_field_file(f('data/inventory.csv'), ext="xx")
    self.assertEqual(gfc.field_filename, f("data/inventory.xx"))
    rfc = FieldFile(gfc.field_filename)
    self.assertTrue("Inventory Item" in rfc.fields())
    self.assertTrue("Amount" in rfc.fields())
    self.assertTrue("Last Order" in rfc.fields())
    self.assertEqual(len(rfc.fields()), 3)
    os.unlink(gfc.field_filename)

    fc = FieldFile.generate_field_file(f('data/inventory.csv'))
    self.assertEqual(fc.field_filename, f("data/inventory.tff"))
    os.unlink(fc.field_filename)

    fc = FieldFile.generate_field_file(f('data/inventory.csv.1'))
    self.assertEqual(fc.field_filename, f("data/inventory.csv.tff"), fc.field_filename)
    os.unlink(fc.field_filename)

    fc = FieldFile.generate_field_file(f('data/yellow_tripdata_2015-01-06-200k.csv.1'))
    self.assertEqual(fc.field_filename,
                     f("data/yellow_tripdata_2015-01-06-200k.csv.tff"),
                     fc.field_filename)
    os.unlink(fc.field_filename)

    fc = FieldFile.generate_field_file(f('data/yellow_tripdata_2015-01-06-200k.csv.10'))
    self.assertEqual(fc.field_filename,
                     f("data/yellow_tripdata_2015-01-06-200k.csv.tff"),
                     fc.field_filename)
    os.unlink(fc.field_filename)

    fc = FieldFile.generate_field_file(f('data/test_results_2016_10.txt.1'))
    self.assertEqual(fc.field_filename, f("data/test_results_2016_10.txt.tff"), fc.field_filename)
    os.unlink(fc.field_filename)

def execute(self, arg):
    ff = FieldFile.generate_field_file(csv_filename=arg, ff_filename=self._field_filename)
    self._field_filename = ff.field_filename
    return self._field_filename

def testFieldDict(self):
    d = FieldFile(f("data/testresults.tff")).field_dict
    self.assertTrue("TestID" in d)
    self.assertTrue("FirstUseDate" in d)
    self.assertTrue("Colour" in d)
    self.assertEqual(d["TestID"]["type"], "int")

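# Judging by the assertions above, field_dict maps each field name to its
# per-field settings, e.g. (illustrative shape, not the verbatim contents of
# data/testresults.tff):
#
#   {"TestID": {"type": "int"}, "FirstUseDate": {"type": "datetime"}, ...}
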
def test_property_prices(self):
    ff = FieldFile(f("data/uk_property_prices.tff"))
    self.assertTrue(ff.has_new_name("txn"))
    self.assertIsNotNone(ff.name_value("txn"))

def setUp(self):
    self._client = pymongo.MongoClient()
    self._db = self._client["PYIM_HTTP_TEST"]
    self._collection = self._db["PYIM_HTTP_TEST"]
    self._ff = FieldFile(f("data/2018_Yellow_Taxi_Trip_Data_1000.ff"))
    self._parser = LineToDictParser(self._ff)