def test_in(self):
     self.assertTrue(
         dq.match({'role': 'admin'}, 'role in ["admin", "observer"]'))
     self.assertTrue(
         dq.match({'age': 18}, 'age in [12, 56, 78, 18, 90, 20]'))
     self.assertFalse(
         dq.match({'role': 'user'}, 'role in ["admin", "observer"]'))
 def test_contains(self):
     self.assertTrue(
         dq.match({'roles': ['admin', 'observer']},
                  'roles CONTAINS "admin"'))
     self.assertFalse(
         dq.match({'roles': ['admin', 'observer']},
                  'roles CONTAINS "user"'))
    def test_validation(self):
        data = {}
        with self.assertRaises(DQValidationError):
            dq.match(data, "44 == 44")

        with self.assertRaises(DQValidationError):
            dq.compile("44 == 44")
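The same query string can be parsed once and reused across many records. A minimal sketch, assuming dictquery is imported as dq and that the object returned by dq.compile exposes a match() method:

import dictquery as dq

# Parse once, match many times. Sample data is hypothetical; note that
# a query referencing no data keys at all (e.g. "44 == 44") raises
# DQValidationError, as test_validation above demonstrates.
query = dq.compile('age >= 18 AND role == "admin"')
users = [{'age': 26, 'role': 'admin'}, {'age': 17, 'role': 'admin'}]
admins = [u for u in users if query.match(u)]
assert len(admins) == 1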
 def test_pars(self):
     data = {'a': 1, 'b': 0, 'c': 1, 'x': 0, 'y': 1, 'z': 0}
     self.assertTrue(dq.match(data, "(a) AND (c)"))
     self.assertTrue(dq.match(data, "((a) AND (c))"))
     self.assertTrue(dq.match(data, "((((a)) AND ((c))))"))
     with self.assertRaises(DQSyntaxError):
         self.assertTrue(dq.match(data, "(a) AND (c"))
     with self.assertRaises(DQSyntaxError):
         self.assertTrue(dq.match(data, ")a AND c"))
Example #5
 def test_pars(self):
     data = {'a': 1, 'b': 0, 'c': 1, 'x': 0, 'y': 1, 'z': 0}
     self.assertTrue(match(data, "(`a`) AND (`c`)"))
     self.assertTrue(match(data, "((`a`) AND (`c`))"))
     self.assertTrue(match(data, "((((`a`)) AND ((`c`))))"))
     with self.assertRaises(DQSyntaxError):
         self.assertTrue(match(data, "(`a`) AND (`c`"))
     with self.assertRaises(DQSyntaxError):
         self.assertTrue(match(data, ")`a` AND `c`"))
 def test_key_order(self):
     data1 = {'age': 26}
     data2 = {'x': 12, 'y': 33}
     data3 = {
         'age': 12,
         'friends': [
             {
                 'age': 14
             },
             {
                 'age': 16
             },
             {
                 'age': 18
             },
             {
                 'age': 20
             },
         ]
     }
     self.assertTrue(dq.match(data1, "26 == age"))
     self.assertTrue(dq.match(data1, "[23, 45, 12, 26] CONTAINS age"))
     self.assertTrue(dq.match(data1, "age == age"))
     self.assertTrue(dq.match(data2, "x < y"))
     self.assertFalse(dq.match(data2, "x >= y"))
     self.assertTrue(dq.match(data2, "x != y"))
     self.assertTrue(dq.match(data3, "age < `friends.age`"))
Example #7
 def test_key_order(self):
     data1 = {'age': 26}
     data2 = {'x': 12, 'y': 33}
     data3 = {
         'age': 12,
         'friends': [
             {
                 'age': 14
             },
             {
                 'age': 16
             },
             {
                 'age': 18
             },
             {
                 'age': 20
             },
         ]
     }
     self.assertTrue(match(data1, "26 == `age`"))
     self.assertTrue(match(data1, "[23, 45, 12, 26] CONTAIN `age`"))
     self.assertTrue(match(data1, "`age` == `age`"))
     self.assertTrue(match(data2, "`x` < `y`"))
     self.assertFalse(match(data2, "`x` >= `y`"))
     self.assertTrue(match(data2, "`x` != `y`"))
     self.assertTrue(match(data3, "`age` < `friends.age`"))
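A standalone sketch of the dotted-key traversal exercised above: a backtick-quoted path such as `friends.age` collects the matching values from nested dicts and lists of dicts, and the comparison is evaluated against the collected values. The data below is hypothetical:

from dictquery import match

# `friends.age` gathers the age of every entry in the friends list.
data = {'age': 12, 'friends': [{'age': 14}, {'age': 16}]}
assert match(data, '`age` < `friends.age`')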
 def test_only_keys(self):
     self.assertTrue(
         dq.match({
             'username': '******',
             'age': 26
         }, "username AND age"))
     self.assertTrue(
         dq.match({
             'username': '******',
             'age': 26
         }, "username"))
     self.assertFalse(dq.match({'username': '******', 'age': 0}, "age"))
     self.assertFalse(dq.match({'username': '******'}, "age"))
     self.assertFalse(
         dq.match({
             'username': '******',
             'age': 0
         }, "username AND age"))
Example #9
 def test_only_keys(self):
     self.assertTrue(
         match({
             'username': '******',
             'age': 26
         }, "`username` AND `age`"))
     self.assertTrue(
         match({
             'username': '******',
             'age': 26
         }, "`username`"))
     self.assertFalse(match({'username': '******', 'age': 0}, "`age`"))
     self.assertFalse(match({'username': '******'}, "`age`"))
     self.assertFalse(
         match({
             'username': '******',
             'age': 0
         }, "`username` AND `age`"))
 def test_match(self):
     data = {'username': '******'}
     self.assertTrue(dq.match(data, r'username MATCH /.*admin.*/'))
     self.assertTrue(dq.match(data, r'username MATCH /test.*/'))
     self.assertTrue(dq.match({'age': '98'}, r'age MATCH /\d+/'))
     self.assertFalse(dq.match(data, r'username MATCH /qwerty/'))
     with self.assertRaises(DQSyntaxError):
         self.assertTrue(dq.match(data, r'/\d+/ MATCH username'))
     with self.assertRaises(DQSyntaxError):
         self.assertTrue(dq.match(data, r'username MATCH "test"'))
 def test_like(self):
     data = {'username': '******'}
     self.assertTrue(dq.match(data, 'username LIKE "*admin*"'))
     self.assertTrue(dq.match(data, 'username LIKE "test*"'))
     self.assertTrue(dq.match(data, 'username LIKE "test?admin?username"'))
     self.assertFalse(dq.match(data, 'username LIKE "test"'))
     with self.assertRaises(DQSyntaxError):
         self.assertTrue(dq.match(data, 'username LIKE 23'))
     with self.assertRaises(DQSyntaxError):
         self.assertTrue(dq.match(data, '"test" LIKE username'))
 def test_now(self):
     utcnow = datetime.utcnow()
     self.assertTrue(
         dq.match({'time': utcnow - timedelta(hours=1)}, "time < NOW"))
     self.assertFalse(
         dq.match({'time': utcnow - timedelta(hours=1)}, "time == NOW"))
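A minimal sketch of the NOW token used above, assuming it expands to the current UTC timestamp at match time:

import dictquery as dq
from datetime import datetime, timedelta

# A timestamp from an hour ago sorts strictly before NOW, mirroring
# test_now above. The event dict is hypothetical.
event = {'time': datetime.utcnow() - timedelta(hours=1)}
assert dq.match(event, 'time < NOW')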
Example #13
 def test_lt(self):
     self.assertTrue(match({'age': 18}, '`age` < 20'))
     self.assertFalse(match({'age': 18}, '`age` < 17'))
     self.assertFalse(match({'age': 18}, '`age` < 18'))
Example #14
 def test_gt(self):
     self.assertTrue(match({'age': 18}, '`age` > 12'))
     self.assertFalse(match({'age': 18}, '`age` > 20'))
     self.assertFalse(match({'age': 18}, '`age` > 18'))
 def test_gte(self):
     self.assertTrue(dq.match({'age': 18}, 'age >= 12'))
     self.assertTrue(dq.match({'age': 18}, 'age >= 18'))
     self.assertFalse(dq.match({'age': 18}, 'age >= 20'))
Example #16
 def test_equal(self):
     self.assertTrue(match({'age': 18}, '`age` == 18'))
     self.assertFalse(match({'age': 18}, '`age` == 12'))
Example #17
    def split(self, fromfile, options={}):
        """Splits the given file with data into chunks based on chunk size or field value"""
        f_type = get_file_type(
            fromfile) if options['format_in'] is None else options['format_in']
        if options['zipfile']:
            z = zipfile.ZipFile(fromfile, mode='r')
            fnames = z.namelist()
            finfilename = fnames[0]
            if f_type == 'bson':
                infile = z.open(fnames[0], 'rb')
            else:
                infile = z.open(fnames[0], 'r')
        else:
            finfilename = fromfile
            if f_type == 'bson':
                infile = open(fromfile, 'rb')
            else:
                infile = open(fromfile,
                              'r',
                              encoding=get_option(options, 'encoding'))
        fields = options['fields'].split(
            ',') if options['fields'] is not None else None
        valuedict = {}
        delimiter = get_option(options, 'delimiter')
        if f_type == 'csv':
            reader = csv.DictReader(infile, delimiter=delimiter)
            n = 0
            chunknum = 1
            if options['fields'] is None:
                splitname = finfilename.rsplit('.',
                                               1)[0] + '_%d.csv' % (chunknum)
                out = open(splitname,
                           'w',
                           encoding=get_option(options, 'encoding'))
                writer = csv.DictWriter(out,
                                        fieldnames=reader.fieldnames,
                                        delimiter=delimiter)
                writer.writeheader()
                for r in reader:
                    n += 1
                    if n % 10000 == 0:
                        logging.info('split: processing %d records of %s' %
                                     (n, fromfile))
                    if options['filter'] is not None:
                        if not dq.match(r, options['filter']):
                            continue
                    writer.writerow(r)
                    if n % options['chunksize'] == 0:
                        out.close()
                        chunknum += 1
                        splitname = finfilename.rsplit(
                            '.', 1)[0] + '_%d.csv' % (chunknum)
                        out = open(splitname,
                                   'w',
                                   encoding=get_option(options, 'encoding'))
                        writer = csv.DictWriter(out,
                                                fieldnames=reader.fieldnames,
                                                delimiter=delimiter)
                        writer.writeheader()
                out.close()
        elif f_type == 'jsonl':
            n = 0
            chunknum = 1
            if options['fields'] is None:
                splitname = finfilename.rsplit('.',
                                               1)[0] + '_%d.jsonl' % (chunknum)
                out = open(splitname,
                           'w',
                           encoding=get_option(options, 'encoding'))

                for l in infile:
                    n += 1
                    if n % 10000 == 0:
                        logging.info('split: processing %d records of %s' %
                                     (n, fromfile))
                    r = json.loads(l)
                    if options['filter'] is not None:
                        if not dq.match(r, options['filter']):
                            continue
                    out.write(json.dumps(r) + '\n')
                    if n % options['chunksize'] == 0:
                        out.close()
                        chunknum += 1
                        splitname = finfilename.rsplit(
                            '.', 1)[0] + '_%d.jsonl' % (chunknum)
                        logging.info('split: new chunk %s' % splitname)
                        out = open(splitname,
                                   'w',
                                   encoding=get_option(options, 'encoding'))
            else:
                for l in infile:
                    n += 1
                    if n % 10000 == 0:
                        logging.info('split: processing %d records of %s' %
                                     (n, fromfile))
                    r = json.loads(l)
                    if options['filter'] is not None:
                        if not dq.match(r, options['filter']):
                            continue
                    try:
                        kx = get_dict_value(r, fields[0].split('.'))[0]
                    except IndexError:
                        # No value at this path for this record: skip it.
                        continue
                    v = valuedict.get(kx, None)
                    if v is None:
                        splitname = finfilename.rsplit(
                            '.', 1)[0] + '_%s.jsonl' % (kx)
                        valuedict[kx] = open(splitname, 'w', encoding='utf8')
                    valuedict[kx].write(l)
                for opened in valuedict.values():
                    opened.close()
        elif f_type == 'bson':
            bson_iter = bson.decode_file_iter(infile)
            n = 0
            for r in bson_iter:
                n += 1
                # Note: this bson branch only counts records and strips
                # fields; it does not write any chunk files.
                r_selected = strip_dict_fields(r, fields, 0)
                if n % 10000 == 0:
                    logging.info('split: processing %d records of %s' %
                                 (n, fromfile))

        else:
            logging.info('File type not supported')
            return
        logging.debug('split: %d records processed' % (n))
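A hedged usage sketch for split(): the class name DataProcessor is a placeholder and the values shown are assumptions; the option keys are the ones the method reads above.

# Hypothetical driver; only the option keys are taken from the code above.
processor = DataProcessor()
processor.split('records.jsonl', options={
    'format_in': None,       # None -> detect the type from the extension
    'zipfile': False,        # True would read the first member of a .zip
    'fields': None,          # None -> split into fixed-size chunks ...
    'chunksize': 10000,      # ... of this many records each
    'filter': 'status == "active"',  # optional dictquery filter
    'delimiter': ',',
    'encoding': 'utf8',
})
# Produces records_1.jsonl, records_2.jsonl, ... alongside the input.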
Example #18
    def uniq(self, fromfile, options={}):
        logging.debug('Processing %s' % fromfile)
        f_type = get_file_type(
            fromfile) if options['format_in'] is None else options['format_in']
        if options['zipfile']:
            z = zipfile.ZipFile(fromfile, mode='r')
            fnames = z.namelist()
            if f_type == 'bson':
                infile = z.open(fnames[0], 'rb')
            else:
                infile = z.open(fnames[0], 'r')
        else:
            if f_type == 'bson':
                infile = open(fromfile, 'rb')
            else:
                infile = open(fromfile,
                              'r',
                              encoding=get_option(options, 'encoding'))
        to_file = get_option(options, 'output')
        if to_file:
            to_type = get_file_type(to_file)
            if not to_type:
                logging.debug('Output file type not supported')
                return
            out = open(to_file, 'w', encoding='utf8')
        else:
            to_type = 'csv'
            out = sys.stdout
        fields = options['fields'].split(',')
        logging.info('uniq: looking for fields: %s' % (options['fields']))
        if f_type == 'csv':
            delimiter = get_option(options, 'delimiter')
            uniqval = []
            reader = csv.DictReader(infile, delimiter=delimiter)
            n = 0
            for r in reader:
                n += 1
                if n % 1000 == 0:
                    logging.info('uniq: processing %d records of %s' %
                                 (n, fromfile))
                if options['filter'] is not None:
                    if not dq.match(r, options['filter']):
                        continue
                k = [r[x] for x in fields]
                if k not in uniqval:
                    uniqval.append(k)

        elif f_type == 'jsonl':
            uniqval = []
            n = 0
            for l in infile:
                n += 1
                if n % 10000 == 0:
                    logging.info('uniq: processing %d records of %s' %
                                 (n, fromfile))
                r = json.loads(l)
                if options['filter'] is not None:
                    if not dq.match(r, options['filter']):
                        continue
                try:
                    allvals = []
                    for field in fields:
                        allvals.append(get_dict_value(r, field.split('.')))

                    for n1 in range(0, len(allvals[0]), 1):
                        k = []
                        for n2 in range(0, len(allvals)):
                            k.append(str(allvals[n2][n1]))
                        if k not in uniqval:
                            uniqval.append(k)
                except KeyError:
                    pass
        elif f_type == 'bson':
            uniqval = []
            bson_iter = bson.decode_file_iter(infile)
            n = 0
            for r in bson_iter:
                n += 1
                if n % 1000 == 0:
                    logging.info('uniq: processing %d records of %s' %
                                 (n, fromfile))
                if options['filter'] is not None:
                    if not dq.match(r, options['filter']):
                        continue
                try:
                    allvals = []
                    for field in fields:
                        allvals.append(get_dict_value(r, field.split('.')))

                    for n1 in range(0, len(allvals[0]), 1):
                        k = []
                        for n2 in range(0, len(allvals)):
                            k.append(str(allvals[n2][n1]))
                        if k not in uniqval:
                            uniqval.append(k)
                except KeyError:
                    pass
        else:
            logging.error('Invalid file format provided')
            return
        infile.close()
        logging.debug('%d unique values found' % (len(uniqval)))
        write_items(fields, uniqval, filetype=to_type, handle=out)
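uniq() collects the distinct value combinations of the requested fields and hands them to write_items. A usage sketch under the same assumptions (hypothetical class name, option keys as read by the method):

processor = DataProcessor()
processor.uniq('users.jsonl', options={
    'format_in': None,
    'zipfile': False,
    'fields': 'country,city',      # unique (country, city) pairs
    'filter': None,                # or any dictquery expression
    'output': 'unique_pairs.csv',  # None -> CSV to stdout
    'encoding': 'utf8',
    'delimiter': ',',
})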
 def test_not(self):
     self.assertTrue(dq.match({'age': 18}, 'NOT age == 12'))
 def test_eval_order(self):
     data = {'a': 1, 'b': 0, 'c': 1, 'x': 0, 'y': 1, 'z': 0}
     self.assertTrue(dq.match(data, "a == 1 OR c == 0"))
     self.assertFalse(dq.match(data, "a == 0 AND c == 1"))
     self.assertTrue(dq.match(data, "a == 0 AND c == 1 OR z == 0"))
     self.assertFalse(dq.match(data, "a == 0 AND (c == 1 OR z == 0)"))
Example #21
 def test_contain(self):
     self.assertTrue(
         match({'roles': ['admin', 'observer']}, '`roles` CONTAIN "admin"'))
     self.assertFalse(
         match({'roles': ['admin', 'observer']}, '`roles` CONTAIN "user"'))
Example #22
 def test_eval_order(self):
     data = {'a': 1, 'b': 0, 'c': 1, 'x': 0, 'y': 1, 'z': 0}
     self.assertTrue(match(data, "`a` == 1 OR `c` == 0"))
     self.assertFalse(match(data, "`a` == 0 AND `c` == 1"))
     self.assertTrue(match(data, "`a` == 0 AND `c` == 1 OR `z` == 0"))
     self.assertFalse(match(data, "`a` == 0 AND (`c` == 1 OR `z` == 0)"))
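The last two assertions pin down operator precedence: AND binds tighter than OR, so the unparenthesized query parses as (`a` == 0 AND `c` == 1) OR `z` == 0, which succeeds via `z` == 0, while the explicitly grouped variant fails because `a` == 0 is false. A compact restatement with hypothetical data:

from dictquery import match

# AND binds tighter than OR.
data = {'a': 1, 'c': 1, 'z': 0}
assert match(data, '`a` == 0 AND `c` == 1 OR `z` == 0')        # OR rescues via z
assert not match(data, '`a` == 0 AND (`c` == 1 OR `z` == 0)')  # AND fails on a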
Example #23
    def frequency(self, fromfile, options={}):
        """Calculates frequency of the values in the file"""
        f_type = get_file_type(
            fromfile) if options['format_in'] is None else options['format_in']
        if options['zipfile']:
            z = zipfile.ZipFile(fromfile, mode='r')
            fnames = z.namelist()
            if f_type == 'bson':
                infile = z.open(fnames[0], 'rb')
            else:
                infile = z.open(fnames[0], 'r')
        else:
            if f_type == 'bson':
                infile = open(fromfile, 'rb')
            else:
                infile = open(fromfile,
                              'r',
                              encoding=get_option(options, 'encoding'))
        to_file = get_option(options, 'output')
        if to_file:
            to_type = get_file_type(to_file)
            if not to_type:
                print('Output file type not supported')
                return
            out = open(to_file, 'w', encoding='utf8')
        else:
            to_type = 'csv'
            out = sys.stdout
        fields = options['fields'].split(',')
        valuedict = {}
        if f_type == 'csv':
            delimiter = get_option(options, 'delimiter')
            reader = csv.DictReader(infile, delimiter=delimiter)
            n = 0
            for r in reader:
                n += 1
                if n % 10000 == 0:
                    logging.info('frequency: processing %d records of %s' %
                                 (n, fromfile))
                if options['filter'] is not None:
                    if not dq.match(r, options['filter']):
                        continue
                k = [r[x] for x in fields]
                kx = '\t'.join(k)
                v = valuedict.get(kx, 0)
                valuedict[kx] = v + 1
        elif f_type == 'jsonl':
            n = 0
            for l in infile:
                n += 1
                if n % 10000 == 0:
                    logging.info('frequency: processing %d records of %s' %
                                 (n, fromfile))
                r = json.loads(l)
                if options['filter'] is not None:
                    if not dq.match(r, options['filter']):
                        continue
                try:
                    allvals = []
                    for field in fields:
                        allvals.append(get_dict_value(r, field.split('.')))

                    for n1 in range(0, len(allvals[0]), 1):
                        k = []
                        for n2 in range(0, len(allvals)):
                            k.append(str(allvals[n2][n1]))
                        kx = '\t'.join(k)
                        v = valuedict.get(kx, 0)
                        valuedict[kx] = v + 1
                except KeyError:
                    pass
        elif f_type == 'bson':
            bson_iter = bson.decode_file_iter(infile)
            n = 0
            for r in bson_iter:
                n += 1
                if n % 10000 == 0:
                    logging.info('frequency: processing %d records of %s' %
                                 (n, fromfile))
                if options['filter'] is not None:
                    if not dq.match(r, options['filter']):
                        continue

                allvals = []
                for field in fields:
                    allvals.append(get_dict_value(r, field.split('.')))

                for n1 in range(0, len(allvals[0]), 1):
                    k = []
                    for n2 in range(0, len(allvals)):
                        k.append(str(allvals[n2][n1]))
                    kx = '\t'.join(k)
                    v = valuedict.get(kx, 0)
                    valuedict[kx] = v + 1
        else:
            logging.info('File type not supported')
            return
        logging.debug('frequency: %d unique values found' % (len(valuedict)))
        thedict = sorted(valuedict.items(),
                         key=lambda item: item[1],
                         reverse=False)
        output = get_option(options, 'output')
        strkeys = '\t'.join(fields) + '\tcount'
        if output:
            f = open(output, 'w', encoding=get_option(options, 'encoding'))
            f.write(strkeys + '\n')
            for k, v in thedict:
                f.write('%s\t%d\n' % (k, v))
            f.close()
        else:
            print(strkeys)
            for k, v in thedict:
                print('%s\t%d' % (k, v))
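frequency() is the same traversal with counting instead of deduplication; it emits a tab-separated table of field values plus a count column, sorted by ascending count. A sketch with the same hypothetical driver:

processor = DataProcessor()
processor.frequency('users.csv', options={
    'format_in': None,
    'zipfile': False,
    'fields': 'country',
    'filter': 'age > 18',   # optional dictquery filter
    'output': None,         # None -> table goes to stdout
    'delimiter': ',',
    'encoding': 'utf8',
})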
 def test_notequal(self):
     self.assertTrue(dq.match({'age': 18}, 'age != 12'))
     self.assertFalse(dq.match({'age': 18}, 'age != 18'))
Example #25
    def select(self, fromfile, options={}):
        """Select or re-order columns from file"""
        f_type = get_file_type(
            fromfile) if options['format_in'] is None else options['format_in']
        if options['zipfile']:
            z = zipfile.ZipFile(fromfile, mode='r')
            fnames = z.namelist()
            if f_type == 'bson':
                infile = z.open(fnames[0], 'rb')
            else:
                infile = z.open(fnames[0], 'r')
        else:
            if f_type == 'bson':
                infile = open(fromfile, 'rb')
            else:
                infile = open(fromfile,
                              'r',
                              encoding=get_option(options, 'encoding'))
        to_file = get_option(options, 'output')
        if to_file:
            to_type = get_file_type(to_file)
            if not to_type:
                print('Output file type not supported')
                return
            if to_type == 'bson':
                out = open(to_file, 'wb')
            else:
                out = open(to_file, 'w', encoding='utf8')
        else:
            to_type = f_type
            out = sys.stdout
        fields = options['fields'].split(',')
        valuedict = {}
        delimiter = get_option(options, 'delimiter')
        if f_type == 'csv':
            reader = csv.DictReader(infile, delimiter=delimiter)
            if to_type == 'csv':
                writer = csv.DictWriter(out,
                                        fieldnames=fields,
                                        delimiter=delimiter)
                writer.writeheader()
            n = 0
            for r in reader:
                n += 1
                if n % 10000 == 0:
                    logging.info('select: processing %d records of %s' %
                                 (n, fromfile))
                item = {}
                if options['filter'] is not None:
                    if not dq.match(r, options['filter']):
                        continue
                for x in fields:
                    item[x] = r[x]
                if to_type == 'csv':
                    writer.writerow(item)
                elif to_type == 'jsonl':
                    out.write(json.dumps(item) + "\n")
        elif f_type == 'jsonl':
            n = 0
            fields = [field.split('.') for field in fields]
            for l in infile:
                n += 1
                if n % 10000 == 0:
                    logging.info('select: processing %d records of %s' %
                                 (n, fromfile))
                r = json.loads(l)
                if options['filter'] is not None:
                    res = dq.match(r, options['filter'])
                    if not res:
                        continue
                r_selected = strip_dict_fields(r, fields, 0)
                out.write(json.dumps(r_selected) + '\n')
        elif f_type == 'bson':
            bson_iter = bson.decode_file_iter(infile)
            n = 0
            fields = [field.split('.') for field in fields]
            for r in bson_iter:
                n += 1
                if n % 10000 == 0:
                    logging.info('select: processing %d records of %s' %
                                 (n, fromfile))
                if options['filter'] is not None:
                    res = dq.match(r, options['filter'])
                    if not res:
                        continue
                r_selected = strip_dict_fields(r, fields, 0)
                out.write(json.dumps(r_selected) + '\n')

        else:
            logging.info('File type not supported')
            return
        logging.debug('select: %d records processed' % (n))
        out.close()
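select() keeps only the requested columns, or dotted subtrees for jsonl/bson input via strip_dict_fields. A usage sketch under the same assumptions:

processor = DataProcessor()
processor.select('people.csv', options={
    'format_in': None,
    'zipfile': False,
    'fields': 'name,age',        # also controls output column order
    'filter': None,
    'output': 'people_slim.csv',
    'delimiter': ',',
    'encoding': 'utf8',
})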
 def test_lte(self):
     self.assertTrue(dq.match({'age': 18}, 'age <= 20'))
     self.assertTrue(dq.match({'age': 18}, 'age <= 18'))
     self.assertFalse(dq.match({'age': 18}, 'age <= 17'))
Example #27
    def validate(self, fromfile, options={}):
        """Validates selected field against validation rule"""
        logging.debug('Processing %s' % fromfile)
        f_type = get_file_type(
            fromfile) if options['format_in'] is None else options['format_in']
        if options['zipfile']:
            z = zipfile.ZipFile(fromfile, mode='r')
            fnames = z.namelist()
            if f_type == 'bson':
                infile = z.open(fnames[0], 'rb')
            else:
                infile = z.open(fnames[0], 'r')
        else:
            if f_type == 'bson':
                infile = open(fromfile, 'rb')
            else:
                infile = open(fromfile,
                              'r',
                              encoding=get_option(options, 'encoding'))
        to_file = get_option(options, 'output')
        if to_file:
            to_type = get_file_type(to_file)
            if not to_type:
                logging.debug('Output file type not supported')
                return
            out = open(to_file, 'w', encoding='utf8')
        else:
            to_type = 'csv'
            out = sys.stdout
        fields = options['fields'].split(',')
        val_func = VALIDATION_RULEMAP[options['rule']]
        logging.info('validate: looking for fields: %s' % (options['fields']))
        validated = []
        stats = {'total': 0, 'invalid': 0, 'novalue': 0}
        if f_type == 'csv':
            delimiter = get_option(options, 'delimiter')
            reader = csv.DictReader(infile, delimiter=delimiter)
            n = 0
            for r in reader:
                n += 1
                if n % 1000 == 0:
                    logging.info('validate: processing %d records of %s' %
                                 (n, fromfile))
                if options['filter'] is not None:
                    if not dq.match(r, options['filter']):
                        continue
                res = val_func(r[fields[0]])
                stats['total'] += 1
                if not res:
                    stats['invalid'] += 1
                validated.append({
                    fields[0]: r[fields[0]],
                    fields[0] + '_valid': res
                })

        elif f_type == 'jsonl':
            n = 0
            for l in infile:
                n += 1
                if n % 10000 == 0:
                    logging.info('validate: processing %d records of %s' %
                                 (n, fromfile))
                r = json.loads(l)
                if options['filter'] is not None:
                    if not dq.match(r, options['filter']):
                        continue
                stats['total'] += 1
                values = get_dict_value(r, fields[0].split('.'))
                if len(values) > 0:
                    res = val_func(values[0])
                    if not res:
                        stats['invalid'] += 1
                    validated.append({
                        fields[0]: values[0],
                        fields[0] + '_valid': res
                    })
                else:
                    stats['novalue'] += 1

        elif f_type == 'bson':
            bson_iter = bson.decode_file_iter(infile)
            n = 0
            for r in bson_iter:
                n += 1
                if n % 1000 == 0:
                    logging.info('validate: processing %d records of %s' %
                                 (n, fromfile))
                if options['filter'] is not None:
                    if not dq.match(r, options['filter']):
                        continue
                stats['total'] += 1
                values = get_dict_value(r, fields[0].split('.'))
                if len(values) > 0:
                    res = val_func(values[0])
                    if not res:
                        stats['invalid'] += 1
                    validated.append({
                        fields[0]: values[0],
                        fields[0] + '_valid': res
                    })
                else:
                    stats['novalue'] += 1
        else:
            logging.error('Invalid file format provided')
            return
        infile.close()
        stats['share'] = 100.0 * stats['invalid'] / stats['total']
        logging.debug(
            'validate: complete, %d records (%.2f%%) not valid and %d (%.2f%%) not found of %d against %s'
            % (stats['invalid'], stats['share'], stats['novalue'],
               100.0 * stats['novalue'] / stats['total'], stats['total'],
               options['rule']))
        if options['mode'] != 'stats':
            writer = csv.DictWriter(
                out,
                fieldnames=[fields[0], fields[0] + '_valid'],
                delimiter=get_option(options, 'delimiter'))
            for row in validated:
                if options['mode'] == 'invalid':
                    if not row[fields[0] + '_valid']:
                        writer.writerow(row)
                elif options['mode'] == 'all':
                    writer.writerow(row)
        else:
            out.write(json.dumps(stats, indent=4))
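validate() looks up a rule callable in VALIDATION_RULEMAP and reports per-record validity plus summary statistics. In this sketch the rule name 'email' is a placeholder, since the actual keys of VALIDATION_RULEMAP are not shown here:

processor = DataProcessor()
processor.validate('contacts.jsonl', options={
    'format_in': None,
    'zipfile': False,
    'fields': 'contact.email',  # the first field listed is validated
    'rule': 'email',            # hypothetical rule name
    'mode': 'stats',            # or 'invalid' / 'all' to list rows
    'filter': None,
    'output': None,
    'delimiter': ',',
    'encoding': 'utf8',
})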
Example #28
 def test_not(self):
     self.assertTrue(match({'age': 18}, 'NOT `age` == 12'))