def test_full_outer_join(self):
    # Full outer join of the two fixture tables, keyed on column 0 of each.
    joined = join.full_outer_join(self.tab1, 0, self.tab2, 0)

    # Header is the left header followed by the right header; rows that
    # have no partner on the other side are padded with empty strings.
    expected = [
        ['id', 'name', 'i_work_here', 'id', 'age', 'i_work_here'],
        [u'1', u'Chicago Reader', u'first', u'1', u'first', u'0'],
        [u'1', u'Chicago Reader', u'first', u'1', u'second', u'0'],
        # NOTE(review): this row has one more cell than the header --
        # presumably the matching fixture row in tab2 is ragged; confirm.
        [u'2', u'Chicago Sun-Times', u'only', u'2', u'only', u'0', u'0'],
        [u'3', u'Chicago Tribune', u'only', u'', u'', u''],
        [u'1', u'Chicago Reader', u'second', u'1', u'first', u'0'],
        [u'1', u'Chicago Reader', u'second', u'1', u'second', u'0'],
        [u'', u'', u'', u'4', u'only', u'0'],
    ]

    self.assertEqual(joined, expected)
def test_full_outer_join_no_duplicate_column(self):
    # Raise the diff size limit so a failure shows the differing rows.
    self.maxDiff = 1000

    # Same join as the plain full outer join test, but asking for the
    # right-hand id column to be left out of the result.
    joined = join.full_outer_join(
        self.tab1, 0, self.tab2, 0, no_duplicate_id_column=True)

    # Only one 'id' column is expected; for the right-only row (id 4) the
    # id still appears in that single column while the left cells are empty.
    expected = [
        ['id', 'name', 'i_work_here', 'age', 'i_work_here'],
        [u'1', u'Chicago Reader', u'first', u'first', u'0'],
        [u'1', u'Chicago Reader', u'first', u'second', u'0'],
        # NOTE(review): this row has one more cell than the header --
        # presumably the matching fixture row in tab2 is ragged; confirm.
        [u'2', u'Chicago Sun-Times', u'only', u'only', u'0', u'0'],
        [u'3', u'Chicago Tribune', u'only', u'', u''],
        [u'1', u'Chicago Reader', u'second', u'first', u'0'],
        [u'1', u'Chicago Reader', u'second', u'second', u'0'],
        [u'4', u'', u'', u'only', u'0'],
    ]

    self.assertEqual(joined, expected)
def main(self):
    """
    Join the input CSV files and write the combined rows to the output.

    Steps, in order:

    1. Open every path in ``self.args.input_paths`` (kept on
       ``self.input_files``).
    2. Validate the arguments, reporting problems through
       ``self.argparser.error``: at least two files are needed, the join
       column names must number one or one per file, left/right/outer
       joins need join columns, and left and right joins cannot be
       combined.
    3. Read each file fully into a list of rows and close it.
    4. Resolve each join column name against its table's header row.
    5. Fold the tables together with the selected join (left, right, full
       outer, inner when only columns are given, otherwise sequential).
    6. Write every resulting row with ``CSVKitWriter``.
    """
    args = self.args

    opened = self.input_files = []
    for path in args.input_paths:
        opened.append(self._open_input_file(path))

    file_count = len(opened)
    if file_count < 2:
        self.argparser.error('You must specify at least two files to join.')

    if args.columns:
        join_column_names = self._parse_join_column_names(args.columns)

        # A single column name is applied to every input file.
        if len(join_column_names) == 1:
            join_column_names = join_column_names * file_count

        if len(join_column_names) != file_count:
            self.argparser.error('The number of join column names must match the number of files, or be a single column name that exists in all files.')

    outer_requested = args.left_join or args.right_join or args.outer_join
    if outer_requested and not args.columns:
        self.argparser.error('You must provide join column names when performing an outer join.')

    if args.left_join and args.right_join:
        self.argparser.error('It is not valid to specify both a left and a right join.')

    # Read each file completely, closing its handle before the next one.
    tables = []
    for handle in opened:
        tables.append(list(CSVKitReader(handle, **self.reader_kwargs)))
        handle.close()

    # Resolve each file's join column against that file's header row.
    join_column_ids = []
    if args.columns:
        join_column_ids = [
            match_column_identifier(table[0], join_column_names[idx])
            for idx, table in enumerate(tables)
        ]

    if args.left_join:
        # Left outer join: fold the later tables onto the first one.
        jointab = tables[0]
        for position, table in enumerate(tables[1:], 1):
            jointab = join.left_outer_join(
                jointab, join_column_ids[0],
                table, join_column_ids[position])
    elif args.right_join:
        # Right outer join: start from the last table and walk backwards.
        # NOTE(review): the accumulated table is always matched on
        # join_column_ids[-1]; confirm right_outer_join's output keeps
        # that column at the same index when more than two files are used.
        jointab = tables[-1]
        for back, table in enumerate(reversed(tables[:-1]), 2):
            jointab = join.right_outer_join(
                table, join_column_ids[-back],
                jointab, join_column_ids[-1])
    elif args.outer_join:
        # Full outer join
        jointab = tables[0]
        for position, table in enumerate(tables[1:], 1):
            jointab = join.full_outer_join(
                jointab, join_column_ids[0],
                table, join_column_ids[position])
    elif args.columns:
        # Inner join
        jointab = tables[0]
        for position, table in enumerate(tables[1:], 1):
            jointab = join.inner_join(
                jointab, join_column_ids[0],
                table, join_column_ids[position])
    else:
        # Sequential join: no join columns were given.
        jointab = tables[0]
        for table in tables[1:]:
            jointab = join.sequential_join(jointab, table)

    output = CSVKitWriter(self.output_file, **self.writer_kwargs)
    for row in jointab:
        output.writerow(row)
def main(self):
    """
    Join the input CSV files and write the combined rows to the output.

    Steps, in order:

    1. Open every path in ``self.args.input_paths`` (kept on
       ``self.input_files``).
    2. Validate the arguments, reporting problems through
       ``self.argparser.error``: at least two files are needed, the join
       column names must number one or one per file, left/right/outer
       joins need join columns, and left and right joins cannot be
       combined.
    3. Read each file fully into a list of rows and close it.
    4. Resolve each join column name against its table's header row.
    5. Fold the tables together with the selected join (left, right, full
       outer, inner when only columns are given, otherwise sequential).
    6. Write every resulting row with ``CSVKitWriter``.
    """
    args = self.args

    opened = self.input_files = []
    for path in args.input_paths:
        opened.append(self._open_input_file(path))

    file_count = len(opened)
    if file_count < 2:
        self.argparser.error('You must specify at least two files to join.')

    if args.columns:
        join_column_names = self._parse_join_column_names(args.columns)

        # A single column name is applied to every input file.
        if len(join_column_names) == 1:
            join_column_names = join_column_names * file_count

        if len(join_column_names) != file_count:
            self.argparser.error('The number of join column names must match the number of files, or be a single column name that exists in all files.')

    outer_requested = args.left_join or args.right_join or args.outer_join
    if outer_requested and not args.columns:
        self.argparser.error('You must provide join column names when performing an outer join.')

    if args.left_join and args.right_join:
        self.argparser.error('It is not valid to specify both a left and a right join.')

    # Read each file completely, closing its handle before the next one.
    tables = []
    for handle in opened:
        tables.append(list(CSVKitReader(handle, **self.reader_kwargs)))
        handle.close()

    # Resolve each file's join column against that file's header row.
    join_column_ids = []
    if args.columns:
        join_column_ids = [
            match_column_identifier(table[0], join_column_names[idx])
            for idx, table in enumerate(tables)
        ]

    if args.left_join:
        # Left outer join: fold the later tables onto the first one.
        jointab = tables[0]
        for position, table in enumerate(tables[1:], 1):
            jointab = join.left_outer_join(
                jointab, join_column_ids[0],
                table, join_column_ids[position])
    elif args.right_join:
        # Right outer join: start from the last table and walk backwards.
        # NOTE(review): the accumulated table is always matched on
        # join_column_ids[-1]; confirm right_outer_join's output keeps
        # that column at the same index when more than two files are used.
        jointab = tables[-1]
        for back, table in enumerate(reversed(tables[:-1]), 2):
            jointab = join.right_outer_join(
                table, join_column_ids[-back],
                jointab, join_column_ids[-1])
    elif args.outer_join:
        # Full outer join
        jointab = tables[0]
        for position, table in enumerate(tables[1:], 1):
            jointab = join.full_outer_join(
                jointab, join_column_ids[0],
                table, join_column_ids[position])
    elif args.columns:
        # Inner join
        jointab = tables[0]
        for position, table in enumerate(tables[1:], 1):
            jointab = join.inner_join(
                jointab, join_column_ids[0],
                table, join_column_ids[position])
    else:
        # Sequential join: no join columns were given.
        jointab = tables[0]
        for table in tables[1:]:
            jointab = join.sequential_join(jointab, table)

    output = CSVKitWriter(self.output_file, **self.writer_kwargs)
    for row in jointab:
        output.writerow(row)