def test_generate_fieldfile(self):
        # Round-trip test: generate a field file from the CSV, import every
        # row, then re-read the CSV, type-convert each row and confirm each
        # one can be found back in the collection.
        fc_filename = FieldConfig.generate_field_file(f("data/inventory.csv"), ext="testff")
        self.assertTrue(os.path.isfile(f("data/inventory.testff")))
        fc = FieldConfig(None, fc_filename, hasheader=True)
        config = fc.config()
        start_count = self._col.count_documents({})
        writer = File_Writer(self._col, fc)
        writer.insert_file(f("data/inventory.csv"))
        line_count = LineCounter(f("data/inventory.csv")).line_count()
        self.assertEqual(self._col.count_documents({}) - start_count, line_count - 1)  # header must be subtracted

        os.unlink(f("data/inventory.testff"))

        c = Converter()
        with open(f("data/inventory.csv"), "r")  as file:
            if fc.hasheader():
                # Skip the header so the reader only sees data rows.
                _ = file.readline()
            reader = fc.get_dict_reader(file)
            fields = config.fields()
            for row in reader:
                # print( row )
                for field in fields:
                    row[field] = c.convert(config.type_value(field), row[field])  # remember we type convert fields

                # The typed row must match a document written by insert_file.
                doc = self._col.find_one(row)
                self.assertTrue(doc)
 def testFieldDict(self):
     """A '|'-delimited field file exposes the expected columns and types."""
     field_config = FieldConfig(None, f("data/testresults.ff"), delimiter="|")
     field_dict = field_config.config().field_dict()
     for column in ("TestID", "FirstUseDate", "Colour"):
         self.assertTrue(column in field_dict)
     self.assertTrue(field_dict["TestID"]["type"] == "int")
# --- Example 3 (scraped snippet separator; not part of the code) ---
 def testFieldDict(self):
     """Check the parsed field file exposes its columns and the TestID type.

     FIX: the local was previously named ``f``, shadowing the module-level
     path helper ``f()`` used by sibling tests — renamed to avoid confusion.
     """
     field_config = FieldConfig("data/testresults.ff", delimiter="|")
     field_dict = field_config.fieldDict()
     self.assertTrue("TestID" in field_dict)
     self.assertTrue("FirstUseDate" in field_dict)
     self.assertTrue("Colour" in field_dict)
     self.assertTrue(field_dict["TestID"]["type"] == "int")
# --- Example 4 (scraped snippet separator; not part of the code) ---
    def test_generate_field_filename(self):
        """Derived field-file names replace the source extension (dot optional)."""
        cases = [
            ({}, "data/inventory.ff"),              # default extension
            ({"ext": "xx"}, "data/inventory.xx"),   # bare extension
            ({"ext": ".xx"}, "data/inventory.xx"),  # leading dot tolerated
            ({}, "data/inventory.ff"),
        ]
        for kwargs, expected in cases:
            generated = FieldConfig.generate_field_filename('data/inventory.csv',
                                                            **kwargs)
            self.assertEqual(generated, expected)

        os.unlink("data/inventory.ff")
# --- Example 5 (scraped snippet separator; not part of the code) ---
    def test_generate_fieldfile(self):
        """Generate a field file, import the CSV and verify every row round-trips.

        FIX: the per-field loop variable was named ``f``, which shadowed the
        open file handle ``f`` from the enclosing ``with`` block — renamed
        both so the handle and the field name can no longer collide.
        """
        fc_filename = FieldConfig.generate_field_file("data/inventory.csv",
                                                      ext="testff")
        self.assertTrue(os.path.isfile("data/inventory.testff"))
        fc = FieldConfig(fc_filename, hasheader=True)

        start_count = self._col.count()
        writer = File_Writer(self._col, fc)
        writer.insert_file("data/inventory.csv")
        line_count = File_Splitter("data/inventory.csv").count_lines()
        self.assertEqual(self._col.count() - start_count,
                         line_count - 1)  # header must be subtracted

        os.unlink("data/inventory.testff")

        with open("data/inventory.csv", "r") as csv_file:
            if fc.hasheader():
                _ = csv_file.readline()
            reader = fc.get_dict_reader(csv_file)
            fields = fc.fields()
            for row in reader:
                # print( row )
                for field in fields:
                    row[field] = fc.type_convert(
                        row[field],
                        fc.typeData(field))  # remember we type convert fields

                doc = self._col.find_one(row)
                self.assertTrue(doc)
    def test_FieldConfig(self):
        """Field files parse with the expected number and ordering of fields."""
        cfg = FieldConfig(None, f("data/test_fieldconfig.ff")).config()
        fields = cfg.fields()
        self.assertEqual(len(fields), 4)
        self.assertEqual(fields[0], "Test 1")
        self.assertEqual(fields[3], "Test 4")

        cfg = FieldConfig(None, f("data/uk_property_prices.ff")).config()
        fields = cfg.fields()
        self.assertEqual(len(fields), 15)
        self.assertEqual(fields[0], "txn")
        self.assertEqual(fields[2], "Date of Transfer")
        self.assertEqual(fields[14], "PPD Category Type")
# --- Example 7 (scraped snippet separator; not part of the code) ---
    def test_Restart(self):
        """Two restart-enabled imports of 10k rows leave two completed audit records."""
        batch_size = 500
        fc = FieldConfig(self._root.root_path("data", "10k.ff"),
                         hasheader=False, delimiter="|")
        writer = File_Writer(self._collection, fc, batch_size=batch_size)

        # First run: a single audit record covering all 10,000 documents.
        writer.insert_file(self._root.root_path("data", "10k.txt"), restart=True)
        audit = self._db["audit"]
        self.assertEqual(audit.count(), 1)
        audit_doc = audit.find_one()
        self.assertEqual(audit_doc["state"], "completed")

        last_doc = self.get_last_doc(self._collection)
        self.assertEqual(audit_doc["last_doc_id"], last_doc["_id"])
        self.assertEqual(audit_doc["count"], 10000)

        # Second run appends a second, equally complete audit record.
        writer.insert_file(self._root.root_path("data", "10k.txt"), restart=True)
        self.assertEqual(audit.count(), 2)
        last_audit_doc = self.get_last_doc(audit)
        self.assertEqual(last_audit_doc["state"], "completed")

        last_doc = self.get_last_doc(self._collection)
        self.assertEqual(last_audit_doc["last_doc_id"], last_doc["_id"])
        self.assertEqual(last_audit_doc["count"], 10000)
# --- Example 8 (scraped snippet separator; not part of the code) ---
 def test_delimiter_header(self):
     """A comma-delimited file with a header imports exactly 300 documents."""
     before = self._col.count()
     field_config = FieldConfig("data/AandE_Data_2011-04-10.ff",
                                delimiter=',',
                                hasheader=True)
     writer = File_Writer(self._col, field_config)
     writer.insert_file("data/AandE_Data_2011-04-10.csv")
     self.assertEqual(self._col.count() - before, 300)
# --- Example 9 (scraped snippet separator; not part of the code) ---
    def test_FieldConfig(self):
        """Field files expose the expected count and ordering of fields."""
        fields = FieldConfig("data/test_fieldconfig.ff").fields()
        self.assertEqual(len(fields), 4)
        self.assertEqual(fields[0], "Test 1")
        self.assertEqual(fields[3], "Test 4")

        fields = FieldConfig("data/uk_property_prices.ff").fields()
        self.assertEqual(len(fields), 15)
        self.assertEqual(fields[0], "txn")
        self.assertEqual(fields[2], "Date of Transfer")
        self.assertEqual(fields[14], "PPD Category Type")
# --- Example 10 (scraped snippet separator; not part of the code) ---
    def processOneFile(self, input_filename, field_filename=None, hasheader=False, restart=False, batchID=None):
        """Import one file via Import_Command and return the record count written.

        If no field file is given, one is derived from the input filename.
        NOTE(review): ``restart`` and ``batchID`` are accepted but never used
        or forwarded to Import_Command — confirm whether they should be
        passed on or removed from the signature.
        """
        if not field_filename:
            field_filename = FieldConfig.generate_field_filename(input_filename)
        cmd = Import_Command(log=self._logger,
                             collection=self._collection,
                             field_filename=field_filename,
                             delimiter=self._delimiter,
                             hasheader=hasheader,
                             onerror=self._onerror,
                             limit=self._limit)

        cmd.run(input_filename)
        return cmd.total_written()
# --- Example 11 (scraped snippet separator; not part of the code) ---
    def processOneFile(self,
                       input_filename,
                       field_filename=None,
                       hasheader=False,
                       restart=False):
        """Import a single file and return the number of documents written.

        When no field file is supplied, a sibling ``<input>.ff`` is assumed.
        Local names changed from camelCase to snake_case per PEP 8; the
        public signature is unchanged.
        """
        if field_filename:
            self._logger.info("using field file: '%s'", field_filename)
        else:
            # Default to the input filename with its extension swapped for .ff.
            field_filename = os.path.splitext(input_filename)[0] + ".ff"

        field_config = FieldConfig(field_filename, self._delimiter, hasheader,
                                   self._gen_id, self._onerror)

        writer = File_Writer(self._collection, field_config, self._batchsize)
        total_written = writer.insert_file(input_filename, restart)
        return total_written
# --- Example 12 (scraped snippet separator; not part of the code) ---
    def test_dict_reader(self):
        """Every row from the dict reader carries a key per configured field.

        FIX: the Price / Date-of-Transfer type assertions are per-row checks;
        they were nested inside the per-field loop and ran redundantly once
        per field. Hoisted to the row level — the pass/fail outcome is
        unchanged.
        """
        fc_filename = FieldConfig.generate_field_file(f("data/inventory.csv"))
        fc = FieldConfig(None, fc_filename)
        cfg = fc.config()
        with open(f("data/inventory.csv"), "r") as file:
            if fc.hasheader():
                _ = file.readline()
            reader = fc.get_dict_reader(file)
            for row in reader:
                for field in cfg.fields():
                    self.assertTrue(field in row)

        fc = FieldConfig(None, f("data/uk_property_prices.ff"))
        cfg = fc.config()
        with open(f("data/uk_property_prices.csv"), "r") as file:
            if fc.hasheader():
                _ = file.readline()
            reader = fc.get_dict_reader(file)
            for row in reader:
                for field in cfg.fields():
                    self.assertTrue(field in row)
                # Raw reader values are untyped strings at this point.
                self.assertTrue(type(row["Price"]) == str)
                self.assertTrue(type(row["Date of Transfer"]) == str)
# --- Example 13 (scraped snippet separator; not part of the code) ---
    def test_generate_field_filename(self):
        """Field-file names are derived by swapping the (possibly numbered) extension."""
        cases = [
            ('data/inventory.csv', {}, "data/inventory.ff"),
            ('data/inventory.csv', {"ext": "xx"}, "data/inventory.xx"),
            ('data/inventory.csv', {"ext": ".xx"}, "data/inventory.xx"),
            ('data/inventory.csv', {}, "data/inventory.ff"),
            # Split-file numeric suffixes (.1, .10) are stripped before the swap.
            ('data/inventory.csv.1', {}, "data/inventory.ff"),
            ('data/yellow_tripdata_2015-01-06-200k.csv.1', {},
             "data/yellow_tripdata_2015-01-06-200k.ff"),
            ('data/yellow_tripdata_2015-01-06-200k.csv.10', {},
             "data/yellow_tripdata_2015-01-06-200k.ff"),
            ('test_result_2016.txt.1', {}, "test_result_2016.ff"),
        ]
        for source, kwargs, expected in cases:
            generated = FieldConfig.generate_field_filename(f(source), **kwargs)
            self.assertEqual(generated, f(expected), generated)
# --- Example 14 (scraped snippet separator; not part of the code) ---
def mongo_import(input_args=None):
    """
    Expect to receive an array of args

    1.3 : Added lots of support for the NHS Public Data sets project. --addfilename and --addtimestamp.
    Also we now fall back to string when type conversions fail.

    >>> mongo_import( [ 'test_set_small.txt' ] )
    database: test, collection: test
    files ['test_set_small.txt']
    Processing : test_set_small.txt
    Completed processing : test_set_small.txt, (100 records)
    Processed test_set_small.txt
    """

    usage_message = '''
    
    pymongo_import is a python program that will import data into a mongodb
    database (default 'test' ) and a mongodb collection (default 'test' ).
    
    Each file in the input list must correspond to a fieldfile format that is
    common across all the files. The fieldfile is specified by the 
    --fieldfile parameter.
    
    An example run:
    
    python pymongo_import.py --database demo --collection demo --fieldfile test_set_small.ff test_set_small.txt
    '''

    # if arglist:
    #     print("args %s" % str(arglist))
    parser = argparse.ArgumentParser(usage=usage_message)
    parser = add_standard_args(parser)
    # print( "Argv: %s" % argv )
    # print(argv)

    # Parse either the caller-supplied arg list or the process command line;
    # cmd is kept for the audit record below.
    if input_args:
        cmd = input_args
        args = parser.parse_args(cmd)
    else:
        cmd = tuple(sys.argv[1:])
        args = parser.parse_args(cmd)
    # print("args: %s" % args)

    log = Logger(args.logname, args.loglevel).log()

    Logger.add_file_handler(args.logname)

    if not args.silent:
        Logger.add_stream_handler(args.logname)

    log.info("Started pymongo_import")
    log.info("Write concern : %i", args.writeconcern)
    log.info("journal       : %i", args.journal)
    log.info("fsync         : %i", args.fsync)
    log.info("genfieldfile  : %s", args.genfieldfile)
    if args.genfieldfile:
        # Field-file generation needs the header line to name the fields.
        args.hasheader = True
        log.info("Forcing hasheader true for --genfieldfile")
    log.info("hasheader     : %s", args.hasheader)

    if args.writeconcern == 0:  # pymongo won't allow other args with w=0 even if they are false
        client = pymongo.MongoClient(args.host, w=args.writeconcern)
    else:
        client = pymongo.MongoClient(args.host,
                                     w=args.writeconcern,
                                     fsync=args.fsync,
                                     j=args.journal)
    database = client[args.database]
    collection = database[args.collection]

    if args.drop:
        if args.restart:
            # Dropping would destroy the restart bookkeeping, so restart wins.
            log.info(
                "Warning --restart overrides --drop ignoring drop commmand")
        else:
            database.drop_collection(args.collection)
            log.info("dropped collection: %s.%s", args.database,
                     args.collection)

    if args.genfieldfile:
        # Generation-only mode: write a field file per input, then exit the
        # process (no import is attempted).
        for i in args.filenames:
            fc_filename = FieldConfig.generate_field_file(i, args.delimiter)
            log.info("Creating '%s' from '%s'", fc_filename, i)
        sys.exit(0)
    elif args.filenames:
        log.info("Using database: %s, collection: %s", args.database,
                 args.collection)
        # log.info( "processing %i files", len( args.filenames ))

        if args.batchsize < 1:
            # NOTE(review): ``warn`` is a deprecated alias of ``warning``
            # (assuming a stdlib logging.Logger here — confirm).
            log.warn("Chunksize must be 1 or more. Chunksize : %i",
                     args.batchsize)
            sys.exit(1)
        try:
            if args.audit:
                # Record the command line as an audit batch; both uses of
                # ``audit``/``batchID`` are guarded by args.audit.
                log.info("Auditing output")
                audit = Audit(database)
                batchID = audit.start_batch({"cmd": str(cmd)})

            file_processor = FileProcessor(collection, args.delimiter,
                                           args.onerror, args.id,
                                           args.batchsize)
            file_processor.processFiles(filenames=args.filenames,
                                        field_filename=args.fieldfile,
                                        hasheader=args.hasheader,
                                        restart=args.restart)

            if args.audit:
                audit.end_batch(batchID)

        except KeyboardInterrupt:
            log.warn("exiting due to keyboard interrupt...")
    else:
        log.info("No input files: Nothing to do")

    # NOTE(review): returns 1 even on success — confirm whether callers treat
    # this as a status code; 0 would be the conventional success value.
    return 1
# --- Example 15 (scraped snippet separator; not part of the code) ---
    def test_dict_reader(self):
        """Each row from the dict reader carries every configured field.

        FIX: the Price / Date-of-Transfer type assertions are per-row checks;
        they were nested inside the per-field loop and ran redundantly once
        per field. Hoisted to the row level — the pass/fail outcome is
        unchanged.
        """
        fc_filename = FieldConfig.generate_field_file("data/inventory.csv")
        fc = FieldConfig(fc_filename)
        with open("data/inventory.csv", "r") as f:
            if fc.hasheader():
                _ = f.readline()
            reader = fc.get_dict_reader(f)
            for row in reader:
                for field in fc.fields():
                    self.assertTrue(field in row)

        fc = FieldConfig("data/uk_property_prices.ff")
        with open("data/uk_property_prices.csv", "r") as f:
            if fc.hasheader():
                _ = f.readline()
            reader = fc.get_dict_reader(f)
            for row in reader:
                for field in fc.fields():
                    self.assertTrue(field in row)
                # Raw reader values are untyped strings at this point.
                self.assertTrue(type(row["Price"]) == str)
                self.assertTrue(type(row["Date of Transfer"]) == str)
# --- Example 16 (scraped snippet separator; not part of the code) ---
 def test_delimiter_no_header(self):
     """A pipe-delimited, headerless file imports all 10000 lines."""
     before = self._col.count()
     field_config = FieldConfig("data/10k.ff", delimiter='|', hasheader=False)
     writer = File_Writer(self._col, field_config)
     writer.insert_file("data/10k.txt")
     self.assertEqual(self._col.count() - before, 10000)