def ReadCSVToPandas(
    p: beam.Pipeline,
    *args,
    **kwargs,
) -> PCollection[pd.DataFrame]:
    """Read CSV file(s) into a PCollection of pandas DataFrames.

    All positional and keyword arguments are forwarded verbatim to
    ``df_io.read_csv`` (e.g. the file pattern, parse options).

    Args:
        p: The pipeline to attach the read transform to.

    Returns:
        A PCollection whose elements are pandas DataFrame chunks.
    """
    deferred_frames = p | "Read CSV" >> df_io.read_csv(*args, **kwargs)
    # yield_elements='pandas' emits whole DataFrame batches rather than rows.
    return df_convert.to_pcollection(deferred_frames, yield_elements='pandas')
def run_aggregation_pipeline(pipeline_args, input_path, output_path):
    """Sum passengers dropped off per DOLocationID and write the result as CSV.

    Args:
        pipeline_args: Command-line options used to build PipelineOptions.
        input_path: Path/glob of the taxi ride CSV input.
        output_path: Path prefix for the aggregated CSV output.
    """
    # Leaving the with block triggers pipeline execution.
    with beam.Pipeline(options=PipelineOptions(pipeline_args)) as p:
        taxi_rides = p | read_csv(input_path)
        # Total passengers dropped off at each LocationID.
        per_location = taxi_rides.groupby('DOLocationID').passenger_count.sum()
        per_location.to_csv(output_path)
def run_aggregation_pipeline(pipeline, input_path, output_path):
    """Sum passenger_count per drop-off LocationID and write the result as CSV.

    Args:
        pipeline: A beam.Pipeline (usable as a context manager) to run in.
        input_path: Path/glob of the taxi ride CSV input.
        output_path: Path prefix for the aggregated CSV output.
    """
    # The pipeline will be run on exiting the with block.
    # NOTE: the [START]/[END] tags mark a documentation snippet region;
    # keep the code between them unchanged so published samples stay accurate.
    # [START DataFrame_taxiride_aggregation]
    with pipeline as p:
        rides = p | read_csv(input_path)
        # Count the number of passengers dropped off per LocationID
        agg = rides.groupby('DOLocationID').passenger_count.sum()
        agg.to_csv(output_path)
def test_read_write_csv(self):
    """Round-trip: read two CSV shards, derive a column, write, check lines."""
    # Renamed from `input` to avoid shadowing the builtin.
    src_dir = self.temp_dir({'1.csv': 'a,b\n1,2\n', '2.csv': 'a,b\n3,4\n'})
    dst_dir = self.temp_dir()
    with beam.Pipeline() as p:
        df = p | io.read_csv(src_dir + '*.csv')
        # New column c is the elementwise sum of a and b.
        df['c'] = df.a + df.b
        df.to_csv(dst_dir + 'out.csv', index=False)
    self.assertCountEqual(
        ['a,b,c', '1,2,3', '3,4,7'],
        set(self.read_all_lines(dst_dir + 'out.csv*')))
def run_enrich_pipeline(
        pipeline_args, input_path, output_path, zone_lookup_path):
    """Join taxi rides with the zone lookup table and sum passengers per Borough.

    Args:
        pipeline_args: Command-line options used to build PipelineOptions.
        input_path: Path/glob of the taxi ride CSV input.
        output_path: Path prefix for the aggregated CSV output.
        zone_lookup_path: Path of the zone lookup CSV (LocationID -> Borough).
    """
    # Leaving the with block triggers pipeline execution.
    with beam.Pipeline(options=PipelineOptions(pipeline_args)) as p:
        taxi_rides = p | "Read taxi rides" >> read_csv(input_path)
        zone_lookup = p | "Read zone lookup" >> read_csv(zone_lookup_path)

        # Attach each ride's drop-off Borough: make LocationID the index of
        # the lookup table, then left-join on the ride's DOLocationID.
        enriched = taxi_rides.merge(
            zone_lookup.set_index('LocationID').Borough,
            right_index=True,
            left_on='DOLocationID',
            how='left')

        # Total passengers dropped off in each Borough.
        per_borough = enriched.groupby('Borough').passenger_count.sum()
        per_borough.to_csv(output_path)
def test_file_not_found(self):
    """Reading a nonexistent glob surfaces FileNotFoundError with the pattern."""
    with self.assertRaisesRegex(FileNotFoundError, r'/tmp/fake_dir/\*\*'):
        with beam.Pipeline() as pipeline:
            # The read itself is what raises; the result is intentionally unused.
            _ = pipeline | io.read_csv('/tmp/fake_dir/**')
def read_csv_as_pcoll(pipeline, path):
    """Read the CSV at ``path`` and return it as a PCollection of DataFrames.

    Args:
        pipeline: The pipeline to attach the transforms to.
        path: Path of the CSV file to read.

    Returns:
        A PCollection produced by converting the deferred DataFrame.
    """
    # The file name keys the step labels so several reads can coexist
    # in one pipeline without label collisions.
    label = os.path.basename(path)
    deferred_df = pipeline | f"ReadCSV{label}" >> df_io.read_csv(path)
    return df_convert.to_pcollection(
        deferred_df, pipeline=pipeline, label=f"ToPColl{label}")