示例#1
0
def main():
    '''Output the intersection of 2 files, based on certain columns in the
    files. This is equivalent to the SQL JOIN operation. Loads the keys in memory,
    so only works on small/medium sized files. It outputs the full lines from the
    left file, optionally adding columns from the right file into it'''

    parent_argparser = file_ops_common.set_ops_parser()
    argparser = argparse.ArgumentParser(description=main.__doc__,
                                        parents=[parent_argparser])
    argparser.add_argument('--insert-cols', dest='insert_cols', default='',
                           help='Columns from the right file to insert into the left.'
                           ' If there are multiple rows matching from the right file, we'
                           ' only consider the 1st match row')
    args = argparser.parse_args()

    left_key_cols = [int(col) for col in args.left_columns.split(',')]
    right_key_cols = [int(col) for col in args.right_columns.split(',')]
    insert_cols = [int(col) for col in args.insert_cols.split(',')] if (
        args.insert_cols) else []

    # Go through the left file and collect the keys
    all_keys = dict()
    for cols in csv_unicode.UnicodeReader(open(args.left_file, 'r'),
                                          delimiter=args.left_delim):
        key = file_ops_common.get_key(cols, left_key_cols)
        if args.lower_case: key = key.lower()
        if key not in all_keys:
            all_keys[key] = []
        all_keys[key].append(cols)

    output = csv_unicode.UnicodeWriter(sys.stdout, delimiter=args.left_delim)

    # Right file
    for cols in csv_unicode.UnicodeReader(open(args.right_file, 'r'),
                                          delimiter=args.right_delim):
        key = file_ops_common.get_key(cols, right_key_cols)
        if args.lower_case: key = key.lower()
        if key in all_keys:
            for line in all_keys[key]:
                insert_values = [cols[i] for i in insert_cols]
                line = line + insert_values
                output.writerow(line)
            all_keys.pop(key)
示例#2
0
def main():
    '''Output the diff of 2 files, based on certain columns in the files.
    Loads the keys in memory, so only works on small/medium sized files.
    It outputs the entire line from the left file'''

    parent_argparser = file_ops_common.set_ops_parser()
    argparser = argparse.ArgumentParser(description=main.__doc__,
                                        parents=[parent_argparser])
    args = argparser.parse_args()

    left_key_cols = [int(col) for col in args.left_columns.split(',')]
    right_key_cols = [int(col) for col in args.right_columns.split(',')]

    # We use an ordered dict to maintain the original order of the lines
    all_keys = OrderedDict()

    # Go through the left file and collect the keys
    for cols in csv_unicode.UnicodeReader(open(args.left_file, 'r'),
                                          delimiter=args.left_delim):
        key = file_ops_common.get_key(cols, left_key_cols)
        if args.lower_case: key = key.lower()
        if key not in all_keys:
            all_keys[key] = []
        all_keys[key].append(cols)

    output = csv_unicode.UnicodeWriter(sys.stdout, delimiter=args.left_delim)

    # Go through the right file and remove those keys from all_keys
    for cols in csv_unicode.UnicodeReader(open(args.right_file, 'r'),
                                          delimiter=args.right_delim):
        key = file_ops_common.get_key(cols, right_key_cols)
        if args.lower_case: key = key.lower()
        if key in all_keys:
            all_keys.pop(key, None)

    # Output the remaining keys
    for key in all_keys:
        for line in all_keys[key]:
            output.writerow(line)