def expand(self, input_or_inputs):
  # Read the GitHub dataset from BigQuery and tokenize code/docstring pairs.
  # Rows that fail tokenization are routed to the 'err_rows' tagged output.
  tokenize_result = (
      input_or_inputs
      | "Read Github Dataset" >> io.Read(
          io.BigQuerySource(query=self.query_string, use_standard_sql=True))
      | "Split 'repo_path'" >> beam.ParDo(SplitRepoPath())
      | "Tokenize Code/Docstring Pairs" >> beam.ParDo(
          TokenizeCodeDocstring()).with_outputs('err_rows', main='rows'))

  # pylint: disable=expression-not-assigned
  (tokenize_result.err_rows
   | "Failed Row Tokenization" >> io.WriteToBigQuery(
       project=self.project,
       dataset=self.output_dataset,
       table=self.output_table + '_failed',
       schema=self.create_failed_output_schema()))
  # pylint: enable=expression-not-assigned

  # Enrich the successfully tokenized rows with function-level metadata;
  # failures again go to the 'err_rows' tagged output.
  info_result = (
      tokenize_result.rows
      | "Extract Function Info" >> beam.ParDo(
          ExtractFuncInfo(self.data_columns[2:])).with_outputs(
              'err_rows', main='rows'))

  # pylint: disable=expression-not-assigned
  (info_result.err_rows
   | "Failed Function Info" >> io.WriteToBigQuery(
       project=self.project,
       dataset=self.output_dataset,
       table=self.output_table + '_failed',
       schema=self.create_failed_output_schema()))
  # pylint: enable=expression-not-assigned

  processed_rows = (info_result.rows
                    | "Flatten Rows" >> beam.FlatMap(lambda x: x))

  # Write the function and docstring token columns to sharded text files.
  # pylint: disable=expression-not-assigned
  (processed_rows
   | "Filter Function tokens" >> beam.Map(lambda x: x['function_tokens'])
   | "Write Function tokens" >> io.WriteToText(
       '{}/raw_data/data'.format(self.storage_bucket),
       file_name_suffix='.function',
       num_shards=self.num_shards))

  (processed_rows
   | "Filter Docstring tokens" >> beam.Map(lambda x: x['docstring_tokens'])
   | "Write Docstring tokens" >> io.WriteToText(
       '{}/raw_data/data'.format(self.storage_bucket),
       file_name_suffix='.docstring',
       num_shards=self.num_shards))
  # pylint: enable=expression-not-assigned

  return (processed_rows
          | "Save Tokens" >> io.WriteToBigQuery(
              project=self.project,
              dataset=self.output_dataset,
              table=self.output_table,
              schema=self.create_output_schema()))
def expand(self, input_or_inputs):
  return (input_or_inputs
          | "Read BigQuery Rows" >> io.Read(
              io.BigQuerySource(query=self.query_string,
                                use_standard_sql=True))
          | "Split 'repo_path'" >> beam.ParDo(SplitRepoPath())
          | "Tokenize Code/Docstring Pairs" >> beam.ParDo(
              TokenizeCodeDocstring())
          | "Extract Function Info" >> beam.ParDo(
              ExtractFuncInfo(self.data_columns[2:]))
          | "Flatten Rows" >> beam.FlatMap(lambda x: x)
          | "Write to BigQuery" >> io.WriteToBigQuery(
              project=self.project,
              dataset=self.output_dataset,
              table=self.output_table,
              schema=self.create_output_schema()))
def expand(self, input_or_inputs):
  tokenize_result = (
      input_or_inputs
      | "Read Github Dataset" >> io.Read(
          io.BigQuerySource(query=self.query_string, use_standard_sql=True))
      | "Split 'repo_path'" >> beam.ParDo(SplitRepoPath())
      | "Tokenize Code/Docstring Pairs" >> beam.ParDo(
          TokenizeCodeDocstring()).with_outputs('err_rows', main='rows'))

  # pylint: disable=expression-not-assigned
  (tokenize_result.err_rows
   | "Failed Row Tokenization" >> io.WriteToBigQuery(
       project=self.project,
       dataset=self.output_dataset,
       table=self.output_table + '_failed',
       schema=self.create_failed_output_schema(),
       batch_size=self.batch_size))
  # pylint: enable=expression-not-assigned

  info_result = (
      tokenize_result.rows
      | "Extract Function Info" >> beam.ParDo(
          ExtractFuncInfo(self.data_columns[2:])).with_outputs(
              'err_rows', main='rows'))

  # pylint: disable=expression-not-assigned
  (info_result.err_rows
   | "Failed Function Info" >> io.WriteToBigQuery(
       project=self.project,
       dataset=self.output_dataset,
       table=self.output_table + '_failed',
       schema=self.create_failed_output_schema(),
       batch_size=self.batch_size))
  # pylint: enable=expression-not-assigned

  return (info_result.rows
          | "Flatten Rows" >> beam.FlatMap(lambda x: x)
          | "Save Tokens" >> io.WriteToBigQuery(
              project=self.project,
              dataset=self.output_dataset,
              table=self.output_table,
              schema=self.create_output_schema(),
              batch_size=self.batch_size))
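# The pipelines above rely on DoFns that split output between a main 'rows'
# stream and an 'err_rows' error stream. Below is a minimal sketch of that
# tagged-output pattern. The body of the real TokenizeCodeDocstring is not
# shown in this section, so this stand-in class and its output fields are
# assumptions for illustration, not the actual implementation.
import apache_beam as beam


class TokenizeSketch(beam.DoFn):
  """Hypothetical DoFn: good rows on the main output, failures tagged."""

  def process(self, element):
    try:
      # Placeholder tokenization; the real DoFn tokenizes code/docstrings.
      yield {'tokens': element['content'].split()}
    except Exception as err:  # pylint: disable=broad-except
      # Consumed downstream as `tokenize_result.err_rows`.
      yield beam.pvalue.TaggedOutput('err_rows',
                                     {'element': repr(element),
                                      'err': str(err)})


# Applying it yields a DoOutputsTuple with `.rows` and `.err_rows`:
#   result = pcoll | beam.ParDo(TokenizeSketch()).with_outputs(
#       'err_rows', main='rows')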
def expand(self, xs):
  return (xs
          | io.Read(io.gcp.bigquery.BigQuerySource(query=self.query)))
def expand(self, xs):
  return (xs
          | io.Read(
              io.gcp.bigquery.BigQuerySource(
                  query=self.query,
                  use_standard_sql=self.use_standard_sql)))
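# A minimal usage sketch for the BigQuery-reading transforms above. The
# wrapper class name, its constructor arguments, and the pipeline wiring are
# assumptions for illustration; only the expand body mirrors the code above.
import apache_beam as beam
from apache_beam import io


class ReadBigQuerySketch(beam.PTransform):
  """Hypothetical composite transform wrapping the BigQuery read."""

  def __init__(self, query, use_standard_sql=True):
    super(ReadBigQuerySketch, self).__init__()
    self.query = query
    self.use_standard_sql = use_standard_sql

  def expand(self, xs):
    return (xs
            | io.Read(
                io.gcp.bigquery.BigQuerySource(
                    query=self.query,
                    use_standard_sql=self.use_standard_sql)))


def run():
  # Requires GCP credentials and the BigQuery I/O dependencies at runtime.
  with beam.Pipeline() as pipeline:
    rows = pipeline | "Read Rows" >> ReadBigQuerySketch(query='SELECT 1 AS x')
    rows | "Log Rows" >> beam.Map(print)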