Example #1
def expand(self, input_or_inputs):
    tokenize_result = (
        input_or_inputs
        | "Read Github Dataset" >> io.Read(
            io.BigQuerySource(query=self.query_string, use_standard_sql=True))
        | "Split 'repo_path'" >> beam.ParDo(SplitRepoPath())
        | "Tokenize Code/Docstring Pairs" >> beam.ParDo(
            TokenizeCodeDocstring()).with_outputs('err_rows', main='rows'))

    # pylint: disable=expression-not-assigned
    (tokenize_result.err_rows
     | "Failed Row Tokenization" >> io.WriteToBigQuery(
         project=self.project,
         dataset=self.output_dataset,
         table=self.output_table + '_failed',
         schema=self.create_failed_output_schema()))
    # pylint: enable=expression-not-assigned

    info_result = (
        tokenize_result.rows
        | "Extract Function Info" >> beam.ParDo(
            ExtractFuncInfo(self.data_columns[2:])).with_outputs(
                'err_rows', main='rows'))

    # pylint: disable=expression-not-assigned
    (info_result.err_rows
     | "Failed Function Info" >> io.WriteToBigQuery(
         project=self.project,
         dataset=self.output_dataset,
         table=self.output_table + '_failed',
         schema=self.create_failed_output_schema()))
    # pylint: enable=expression-not-assigned

    processed_rows = (info_result.rows
                      | "Flatten Rows" >> beam.FlatMap(lambda x: x))

    # pylint: disable=expression-not-assigned
    (processed_rows
     | "Filter Function tokens" >> beam.Map(lambda x: x['function_tokens'])
     | "Write Function tokens" >> io.WriteToText(
         '{}/raw_data/data'.format(self.storage_bucket),
         file_name_suffix='.function',
         num_shards=self.num_shards))
    (processed_rows
     | "Filter Docstring tokens" >> beam.Map(lambda x: x['docstring_tokens'])
     | "Write Docstring tokens" >> io.WriteToText(
         '{}/raw_data/data'.format(self.storage_bucket),
         file_name_suffix='.docstring',
         num_shards=self.num_shards))
    # pylint: enable=expression-not-assigned

    return (processed_rows
            | "Save Tokens" >> io.WriteToBigQuery(
                project=self.project,
                dataset=self.output_dataset,
                table=self.output_table,
                schema=self.create_output_schema()))
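
A note on the multi-output pattern above: with_outputs('err_rows', main='rows') makes the ParDo return a DoOutputsTuple, so tokenize_result.rows and tokenize_result.err_rows are separate PCollections that can flow into different sinks. A minimal sketch of a DoFn feeding such a tagged output (the tokenization body below is a placeholder, not the source's actual TokenizeCodeDocstring logic):

import apache_beam as beam
from apache_beam import pvalue

class TokenizeCodeDocstring(beam.DoFn):
    """Illustrative only: shows how rows reach the 'err_rows' tag."""
    def process(self, element):
        try:
            # Placeholder tokenization; the real implementation is not shown here.
            element['function_tokens'] = element['content'].split()
            element['docstring_tokens'] = element['docstring'].split()
            yield element  # main output, named 'rows' by with_outputs above
        except Exception as err:  # pylint: disable=broad-except
            # Failed rows become the 'err_rows' PCollection that the pipeline
            # writes to the '<output_table>_failed' BigQuery table.
            yield pvalue.TaggedOutput('err_rows', {'element': str(element),
                                                   'err': str(err)})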
Example #2
def expand(self, input_or_inputs):
    return (input_or_inputs
            | "Read BigQuery Rows" >> io.Read(
                io.BigQuerySource(query=self.query_string,
                                  use_standard_sql=True))
            | "Split 'repo_path'" >> beam.ParDo(SplitRepoPath())
            | "Tokenize Code/Docstring Pairs" >> beam.ParDo(
                TokenizeCodeDocstring())
            | "Extract Function Info" >> beam.ParDo(
                ExtractFuncInfo(self.data_columns[2:]))
            | "Flatten Rows" >> beam.FlatMap(lambda x: x)
            | "Write to BigQuery" >> io.WriteToBigQuery(
                project=self.project,
                dataset=self.output_dataset,
                table=self.output_table,
                schema=self.create_output_schema()))
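
For context, expand() is never called directly; Beam invokes it when the composite PTransform is applied with the | operator inside a pipeline. A runnable toy analogue of the same structure (the CountWords class and the pipeline below are illustrative, not from the source):

import apache_beam as beam

class CountWords(beam.PTransform):
    """Toy composite transform: expand() receives the input PCollection."""
    def expand(self, input_or_inputs):
        return (input_or_inputs
                | "Pair" >> beam.Map(lambda word: (word, 1))
                | "Sum" >> beam.CombinePerKey(sum))

with beam.Pipeline() as pipeline:
    (pipeline
     | "Create" >> beam.Create(["a", "b", "a"])
     | "Count" >> CountWords()
     | "Print" >> beam.Map(print))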
Example #3
    def expand(self, input_or_inputs):
        tokenize_result = (
            input_or_inputs
            | "Read Github Dataset" >> io.Read(
                io.BigQuerySource(query=self.query_string,
                                  use_standard_sql=True))
            | "Split 'repo_path'" >> beam.ParDo(SplitRepoPath())
            | "Tokenize Code/Docstring Pairs" >> beam.ParDo(
                TokenizeCodeDocstring()).with_outputs('err_rows', main='rows'))

        # pylint: disable=expression-not-assigned
        (tokenize_result.err_rows
         | "Failed Row Tokenization" >> io.WriteToBigQuery(
             project=self.project,
             dataset=self.output_dataset,
             table=self.output_table + '_failed',
             schema=self.create_failed_output_schema(),
             batch_size=self.batch_size))
        # pylint: enable=expression-not-assigned

        info_result = (
            tokenize_result.rows
            | "Extract Function Info" >> beam.ParDo(
                ExtractFuncInfo(self.data_columns[2:])).with_outputs(
                    'err_rows', main='rows'))

        # pylint: disable=expression-not-assigned
        (info_result.err_rows
         | "Failed Function Info" >> io.WriteToBigQuery(
             project=self.project,
             dataset=self.output_dataset,
             table=self.output_table + '_failed',
             schema=self.create_failed_output_schema(),
             batch_size=self.batch_size))
        # pylint: enable=expression-not-assigned

        return (info_result.rows
                | "Flatten Rows" >> beam.FlatMap(lambda x: x)
                | "Save Tokens" >> io.WriteToBigQuery(
                    project=self.project,
                    dataset=self.output_dataset,
                    table=self.output_table,
                    schema=self.create_output_schema(),
                    batch_size=self.batch_size))
Example #4
def expand(self, xs):
    return (
        xs
        | io.Read(io.gcp.bigquery.BigQuerySource(query=self.query))
    )
Example #5
def expand(self, xs):
    return (xs
            | io.Read(
                io.gcp.bigquery.BigQuerySource(
                    query=self.query,
                    use_standard_sql=self.use_standard_sql)))
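
The io.Read(io.gcp.bigquery.BigQuerySource(...)) form in Examples #4 and #5 comes from older Beam SDKs; newer releases expose the same query-based read as beam.io.ReadFromBigQuery. A sketch under that assumption (the query string is a placeholder):

import apache_beam as beam

with beam.Pipeline() as pipeline:
    rows = (pipeline
            | "Read" >> beam.io.ReadFromBigQuery(
                query="SELECT repo_path, content FROM `project.dataset.table`",
                use_standard_sql=True))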