Example #1

import argparse
import sys

import csv_unicode
import file_ops_common


def main():
    '''Output the intersection of two files, based on key columns in each
    file. This is equivalent to a SQL JOIN. Keys are loaded into memory, so
    this only works on small/medium-sized files. It outputs the full lines
    from the left file, optionally appending columns from the right file'''

    parent_argparser = file_ops_common.set_ops_parser()
    argparser = argparse.ArgumentParser(description=main.__doc__,
                                        parents=[parent_argparser])
    argparser.add_argument('--insert-cols', dest='insert_cols', default='',
                           help='Columns from the right file to insert into the left.'
                           ' If multiple rows in the right file match a key,'
                           ' only the first matching row is used')
    args = argparser.parse_args()

    left_key_cols = [int(col) for col in args.left_columns.split(',')]
    right_key_cols = [int(col) for col in args.right_columns.split(',')]
    insert_cols = [int(col) for col in args.insert_cols.split(',')] if (
        args.insert_cols) else []

    # Go through the left file and collect the keys
    all_keys = dict()
    for cols in csv_unicode.UnicodeReader(open(args.left_file, 'r'),
                                          delimiter=args.left_delim):
        key = file_ops_common.get_key(cols, left_key_cols)
        if args.lower_case:
            key = key.lower()
        if key not in all_keys:
            all_keys[key] = []
        all_keys[key].append(cols)

    output = csv_unicode.UnicodeWriter(sys.stdout, delimiter=args.left_delim)

    # Right file
    for cols in csv_unicode.UnicodeReader(open(args.right_file, 'r'),
                                          delimiter=args.right_delim):
        key = file_ops_common.get_key(cols, right_key_cols)
        if args.lower_case:
            key = key.lower()
        if key in all_keys:
            for line in all_keys[key]:
                insert_values = [cols[i] for i in insert_cols]
                line = line + insert_values
                output.writerow(line)
            # Drop the key so only the first matching right-file row is used
            all_keys.pop(key)
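
All three examples lean on file_ops_common.get_key (and, in Example #3,
split_key), which is not shown here. Below is a minimal sketch of what those
helpers plausibly look like, assuming a composite key is simply the selected
column values joined on a separator; the KEY_SEP name and value are
assumptions, not the actual implementation.

# Hypothetical reconstruction of the file_ops_common helpers; the real
# module is not shown, so the separator and signatures are assumptions.
KEY_SEP = u'\x01'  # assumed: a separator unlikely to occur in the data

def get_key(cols, key_col_nums):
    '''Join the selected column values into one composite key.'''
    return KEY_SEP.join(cols[i] for i in key_col_nums)

def split_key(key):
    '''Split a composite key back into its column values.'''
    return key.split(KEY_SEP)

For example, with key column 0 on both sides, a made-up left line
[u'alice', u'10'] and right line [u'alice', u'NYC'] (insert column 1), the
join above would emit [u'alice', u'10', u'NYC'].
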
Example #2

import argparse
import sys
from collections import OrderedDict

import csv_unicode
import file_ops_common


def main():
    '''Output the diff of two files, based on key columns in each file.
    Keys are loaded into memory, so this only works on small/medium-sized
    files. It outputs the entire line from the left file'''

    parent_argparser = file_ops_common.set_ops_parser()
    argparser = argparse.ArgumentParser(description=main.__doc__,
                                        parents=[parent_argparser])
    args = argparser.parse_args()

    left_key_cols = [int(col) for col in args.left_columns.split(',')]
    right_key_cols = [int(col) for col in args.right_columns.split(',')]

    # We use an ordered dict to maintain the original order of the lines
    all_keys = OrderedDict()

    # Go through the left file and collect the keys
    for cols in csv_unicode.UnicodeReader(open(args.left_file, 'r'),
                                          delimiter=args.left_delim):
        key = file_ops_common.get_key(cols, left_key_cols)
        if args.lower_case:
            key = key.lower()
        if key not in all_keys:
            all_keys[key] = []
        all_keys[key].append(cols)

    output = csv_unicode.UnicodeWriter(sys.stdout, delimiter=args.left_delim)

    # Go through the right file and remove those keys from all_keys
    for cols in csv_unicode.UnicodeReader(open(args.right_file, 'r'),
                                          delimiter=args.right_delim):
        key = file_ops_common.get_key(cols, right_key_cols)
        if args.lower_case:
            key = key.lower()
        # pop() with a default is already a no-op for absent keys, so no
        # membership check is needed
        all_keys.pop(key, None)

    # Output the remaining lines: those whose keys never appeared in the right file
    for key in all_keys:
        for line in all_keys[key]:
            output.writerow(line)
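
For concreteness, a small worked example of the diff (the file names,
contents, and column choices are all made up):

# left.txt (',' delimited):      right.txt:
#   a,1                            a,9
#   b,2                            c,7
#   c,3
#
# With key column 0 on both sides, the tool prints the left lines whose
# keys never appear in the right file:
#   b,2
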
Example #3

import sys
from collections import defaultdict

import csv_unicode
import file_ops_common

# parse_args() and print_query() are assumed to be defined elsewhere in
# this module; they are not shown in the snippet.


def main():
    '''Do SQL-like operations (WHERE filters, GROUP BY with sum/count
    aggregates) on a delimited text file'''

    args = parse_args()

    # Parse the where clauses
    where_filters = {}
    if args.where_clauses:
        for clause in args.where_clauses:
            # '!=' must be checked first, since '=' is a substring of '!='
            if '!=' in clause:
                col, vals = clause.split('!=', 1)
                where_filters[int(col)] = {'op': 'NOT IN',
                                           'vals': vals.split(',')}
            elif '=' in clause:
                col, vals = clause.split('=', 1)
                where_filters[int(col)] = {'op': 'IN',
                                           'vals': vals.split(',')}
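
        # e.g. the clause '3=US,CA' parses to {3: {'op': 'IN', 'vals': ['US', 'CA']}}
        # and '3!=US' to {3: {'op': 'NOT IN', 'vals': ['US']}}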

    select_cols = [int(col) for col in args.select_cols.split(',')]
    aggregate_cols = [int(col) for col in args.aggregate_cols.split(',')]

    if args.verbose:
        print_query(args, select_cols, aggregate_cols, where_filters)

    # The structure where we save the aggregated values.
    # It is a nested dictionary of the form:
    #   select key -> aggregate column -> aggregate value
    aggregates = {}

    # Go through the input file and do the aggregation
    for cols in csv_unicode.UnicodeReader(open(args.file, 'r'),
                                          delimiter=args.delim):

        # Where filters
        skip = False
        for col_num, filtr in where_filters.iteritems():
            if ((filtr['op'] == 'IN' and cols[col_num] not in filtr['vals']) or
                (filtr['op'] == 'NOT IN' and cols[col_num] in filtr['vals'])):
                skip = True
                break
        if skip:
            continue

        # Generate the key from the select columns
        key = file_ops_common.get_key(cols, select_cols)
        if key not in aggregates:
            aggregates[key] = defaultdict(int)

        # Go over each aggregate column and add it to the final structure
        for col_num in aggregate_cols:
            if args.agg_function == 'sum':
                aggregates[key][col_num] += float(cols[col_num])
            elif args.agg_function == 'count':
                aggregates[key][col_num] += 1
            else:
                sys.stderr.write('Invalid aggregate function: %s\n'
                                 % args.agg_function)
                sys.exit(1)

    output = csv_unicode.UnicodeWriter(sys.stdout, delimiter=args.delim)

    # Output one row per group: the select-key columns, then the aggregates
    for key in aggregates:
        op_cols = file_ops_common.split_key(key)
        # Output the aggregate columns in the order they appear in the input file
        for _col, value in sorted(aggregates[key].items()):
            op_cols += [unicode(value)]
        output.writerow(op_cols)
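
A small worked example of the aggregation (data and arguments are made up):

# With delimiter ',', select_cols=[0], aggregate_cols=[2], and
# agg_function='sum', the input
#   a,x,1
#   a,y,2
#   b,z,5
# produces one row per distinct select key:
#   a,3.0
#   b,5.0
# (sums are floats because the code accumulates float(cols[col_num]);
# row order is not guaranteed, since aggregates is a plain dict)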