Пример #1
0
    def _pipeline(root):
        """Wire up one classifier-example conversion branch per file pair.

        For each (input, output) pair drawn from ``input_files`` and
        ``output_files`` the branch reads interactions, optionally maps
        ``insert_fake_table_fn`` over them, applies the table-id check and
        numeric-value steps from ``pretrain_utils``, converts the result with
        ``ToClassifierTensorflowExample`` and passes it to
        ``pretrain_utils.write_proto_outputs`` as ``tf.train.Example``.

        Args:
          root: Forwarded unchanged to ``read_interactions``; presumably the
            Beam pipeline root -- confirm against the caller.
        """
        # zip() stops at the shorter of the two sequences.
        for source_path, target_path in zip(input_files, output_files):
            # The input's base name is embedded in every step label below.
            name = os.path.basename(source_path)

            records = read_interactions(root, source_path, name)
            if use_fake_tables:
                records = records | f'InsertFakeTable_{name}' >> beam.Map(
                    insert_fake_table_fn)

            checked = records | f'CheckTableId_{name}' >> beam.FlatMap(
                pretrain_utils.check_table_id_fn)
            with_numbers = checked | f'AddNumericValues_{name}' >> beam.Map(
                pretrain_utils.add_numeric_values_fn)
            examples = (
                with_numbers
                | f'ToClassifierTensorflowExample_{name}' >> beam.ParDo(
                    ToClassifierTensorflowExample(
                        config,
                        name,
                        convert_impl_value=converter_impl.value,
                    )))

            pretrain_utils.write_proto_outputs(
                target_path,
                f'WriteExamples_{name}',
                examples,
                tf.train.Example,
            )
    def _pipeline(root):
        """Assemble the interaction-generation pipeline for the chosen mode.

        Interactions are read from ``input_file`` and expanded with
        ``shard_interaction_fn``. Depending on ``mode`` a contrastive branch,
        a synthetic branch, or both are built and flattened into a single
        collection. When ``conversion_config`` is set, the flattened
        interactions are additionally written out and then converted with
        ``ToClassifierTensorflowExample``. The final collection is reshuffled
        and handed to ``pretrain_utils.split_by_table_id_and_write``.

        Args:
          root: Forwarded unchanged to ``pretrain_utils.read_interactions``;
            presumably the Beam pipeline root -- confirm against the caller.

        Raises:
          ValueError: If ``mode`` selects neither branch.
        """
        raw = pretrain_utils.read_interactions(root, input_file, name="input")
        interactions = raw | "Preprocess" >> beam.FlatMap(shard_interaction_fn)

        branches = []

        if mode in (Mode.CONTRASTIVE, Mode.ALL):
            # The contrastive step receives only the second element of each
            # item produced by the preprocessing step.
            unkeyed = interactions | "DropKey" >> beam.Map(
                lambda keyed: keyed[1])
            contrastive = unkeyed | "ToContrastivePairs" >> beam.FlatMap(
                _to_contrastive_statements_fn,
                use_fake_table=use_fake_table,
                drop_without_support_rate=drop_without_support_rate,
            )
            branches.append(contrastive)

        if mode in (Mode.SYNTHETIC, Mode.ALL):
            synthetic = interactions | "Synthesize" >> beam.FlatMap(
                synthesize_fn,
                config=config,
                add_opposite_table=add_opposite_table,
                use_fake_table=use_fake_table,
            )
            branches.append(synthetic)

        # Neither branch was selected, so the mode is not one handled here.
        if not branches:
            raise ValueError(f"Unknown mode: {mode}")

        output_data = branches | "Flatten" >> beam.Flatten()

        if conversion_config is None:
            proto_message = interaction_pb2.Interaction
        else:
            # Write the interactions themselves before converting them.
            interactions_path = (
                os.path.join(output_dir, "interactions") + output_suffix)
            pretrain_utils.write_proto_outputs(
                interactions_path,
                "interactions",
                output_data,
                interaction_pb2.Interaction,
            )
            output_data = output_data | "ToExamples" >> beam.ParDo(
                ToClassifierTensorflowExample(conversion_config))
            proto_message = tf.train.Example

        reshuffled = (
            output_data | "Reshuffle" >> beam.transforms.util.Reshuffle())
        # Train and test outputs share the same suffix.
        pretrain_utils.split_by_table_id_and_write(
            reshuffled,
            output_dir,
            train_suffix=output_suffix,
            test_suffix=output_suffix,
            num_splits=num_splits,
            proto_message=proto_message,
        )
Пример #3
0
    def _pipeline(root):
        """Wire up one retrieval-example conversion branch per file pair.

        For each (input, output) pair drawn from ``input_files`` and
        ``output_files`` the branch reads the input via
        ``_read_inputs_with_format`` using ``input_format``, applies the
        table-id check and numeric-value steps from ``pretrain_utils``,
        converts the result with ``ToRetrievalTensorflowExample`` and passes
        it to ``pretrain_utils.write_proto_outputs`` as ``tf.train.Example``.

        Args:
          root: Forwarded unchanged to ``_read_inputs_with_format``;
            presumably the Beam pipeline root -- confirm against the caller.
        """
        # zip() stops at the shorter of the two sequences.
        for source_path, target_path in zip(input_files, output_files):
            # The input's base name is embedded in every step label below.
            name = os.path.basename(source_path)

            records = _read_inputs_with_format(
                root, source_path, name, input_format)
            checked = records | f'CheckTableId_{name}' >> beam.FlatMap(
                pretrain_utils.check_table_id_fn)
            with_numbers = checked | f'AddNumericValues_{name}' >> beam.Map(
                pretrain_utils.add_numeric_values_fn)
            examples = (
                with_numbers
                | f'ToRetrievalTensorflowExample_{name}' >> beam.ParDo(
                    ToRetrievalTensorflowExample(
                        config,
                        convert_impl_value=converter_impl.value,
                    )))

            pretrain_utils.write_proto_outputs(
                target_path,
                f'WriteExamples_{name}',
                examples,
                tf.train.Example,
            )