Example #1
import tensorflow as tf
from absl import app, logging

import neurst.utils.flags_core as flags_core
from neurst.data.data_pipelines.data_pipeline import lowercase_and_remove_punctuations
from neurst.data.text import Tokenizer, build_tokenizer

FLAG_LIST = [
    flags_core.Flag("input", dtype=flags_core.Flag.TYPE.STRING, default=None,
                    help="The path to the input text file."),
    flags_core.Flag("output", dtype=flags_core.Flag.TYPE.STRING, default=None,
                    help="The path to the output text file."),
    flags_core.Flag("lowercase", dtype=flags_core.Flag.TYPE.BOOLEAN, default=None,
                    help="Whether to lowercase."),
    flags_core.Flag("remove_punctuation", dtype=flags_core.Flag.TYPE.BOOLEAN, default=None,
                    help="Whether to remove the punctuations."),
    flags_core.ModuleFlag(Tokenizer.REGISTRY_NAME, help="The tokenizer."),
]


def _main(_):
    arg_parser = flags_core.define_flags(FLAG_LIST, with_config_file=False)
    args, remaining_argv = flags_core.intelligent_parse_flags(FLAG_LIST, arg_parser)
    flags_core.verbose_flags(FLAG_LIST, args, remaining_argv)
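    # Hedged continuation (the original is cut off here): build the tokenizer
    # from the parsed args and stream the input file to the output file. Treating
    # `args` as a dict, and the exact signatures of build_tokenizer,
    # lowercase_and_remove_punctuations and Tokenizer.tokenize, are assumptions
    # not confirmed by this excerpt.
    tokenizer = build_tokenizer(args)
    with tf.io.gfile.GFile(args["input"]) as fin, \
            tf.io.gfile.GFile(args["output"], "w") as fout:
        for line in fin:
            line = lowercase_and_remove_punctuations(
                line.strip(), lowercase=args["lowercase"],
                remove_punctuation=args["remove_punctuation"])  # assumed kwargs
            if tokenizer is None:
                fout.write(line + "\n")
            else:
                fout.write(tokenizer.tokenize(line, return_str=True) + "\n")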
Example #2
import tensorflow as tf
from absl import app, logging

import neurst.utils.flags_core as flags_core
from neurst.metrics import Metric, build_metric
from neurst.utils.misc import flatten_string_list

FLAG_LIST = [
    flags_core.Flag("hypo_file", dtype=flags_core.Flag.TYPE.STRING, default=None,
                    help="The path to hypothesis file."),
    flags_core.Flag("ref_file", dtype=flags_core.Flag.TYPE.STRING, default=None, multiple=True,
                    help="The path to reference file. "),
    flags_core.ModuleFlag(Metric.REGISTRY_NAME, help="The metric for evaluation."),
]


def evaluate(metric, hypo_file, ref_file):
    assert metric is not None
    assert hypo_file
    assert ref_file
    with tf.io.gfile.GFile(hypo_file) as fp:
        hypo = [line.strip() for line in fp]

    ref_list = []
    for one_ref_file in flatten_string_list(ref_file):
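        # Hedged continuation (the original is cut off here): collect each
        # reference file, then score the hypotheses. Calling the metric object
        # directly as metric(hypo, ref_list) is an assumption based on the
        # imports above.
        with tf.io.gfile.GFile(one_ref_file) as fp:
            ref_list.append([line.strip() for line in fp])
    logging.info(metric(hypo, ref_list))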
Example #3
        word_with_counts = word_with_counts[:max_vocab_size]

    with tf.io.gfile.GFile(output, "w") as foutput:
        # extra slots
        for x in extra_slots_list:
            foutput.write("{}\t{}\n".format(x, 1000))
        logging.info("Plus extra %d slots to the vocabulary in the front.",
                     len(extra_slots_list))
        for word, count in word_with_counts:
            foutput.write("{}\t{}\n".format(word, count))


FLAG_LIST = [
    flags_core.Flag(
        "min_frequency",
        dtype=flags_core.Flag.TYPE.INTEGER,
        default=0,
        help="Minimum frequency of a word to be included in the vocabulary."),
    flags_core.Flag("max_vocab_size",
                    dtype=flags_core.Flag.TYPE.INTEGER,
                    default=None,
                    help="Maximum number of tokens in the vocabulary."),
    flags_core.Flag(
        "lowercase",
        dtype=flags_core.Flag.TYPE.BOOLEAN,
        default=None,
        help="If set to true, downcase all text before processing."),
    flags_core.Flag("input",
                    dtype=flags_core.Flag.TYPE.STRING,
                    default=None,
                    help="Input full vocabulary file."),
Example #4
import os
import random

import numpy
import tensorflow as tf
from absl import app, logging

import neurst.utils.flags_core as flags_core
from neurst.data.datasets import Dataset, build_dataset
from neurst.tasks import Task, build_task
from neurst.utils.compat import ModeKeys

FLAG_LIST = [
    flags_core.Flag("processor_id",
                    dtype=flags_core.Flag.TYPE.INTEGER,
                    default=None,
                    help="The processor id."),
    flags_core.Flag(
        "num_processors",
        dtype=flags_core.Flag.TYPE.INTEGER,
        default=None,
        help=
        "The number of processors. Must be divisible by `num_output_shards`."),
    flags_core.Flag("num_output_shards",
                    dtype=flags_core.Flag.TYPE.INTEGER,
                    default=None,
                    help="The total number of output shards."),
    flags_core.Flag(
        "output_range_begin",
        dtype=flags_core.Flag.TYPE.INTEGER,
        default=None,
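
# Hedged sketch (illustrative only, not this script's code): with the flags
# above, each processor typically owns a contiguous range of output shards.
# This assumes the shard count divides evenly across processors; the script's
# exact constraint may differ.
def shard_range(processor_id, num_processors, num_output_shards):
    assert num_output_shards % num_processors == 0
    per_proc = num_output_shards // num_processors
    begin = processor_id * per_proc
    return begin, begin + per_proc  # [output_range_begin, output_range_end)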
Example #5
import os

import tensorflow as tf
from absl import app, logging

import neurst.utils.flags_core as flags_core
from neurst.utils.compat import wrapper_var_name
from neurst.utils.configurable import ModelConfigs
from neurst.utils.flags_core import Flag
from neurst.utils.misc import flatten_string_list

FLAG_LIST = [
    flags_core.Flag(
        "checkpoints",
        dtype=Flag.TYPE.STRING,
        default=None,
        multiple=True,
        help="A list or comma-separated string of checkpoints to be averaged. "
        "The averaged checkpoint will be saved to `output_path`."),
    flags_core.Flag("output_path",
                    dtype=flags_core.Flag.TYPE.STRING,
                    help="The path to the averaged checkpoint."),
]


def checkpoint_exists(path):
    return (tf.io.gfile.exists(path) or tf.io.gfile.exists(path + ".meta")
            or tf.io.gfile.exists(path + ".index"))


def checkpoint_list_checking(path_list):
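    # Hedged sketch of a body (the original is cut off here): a minimal check
    # would verify that every listed checkpoint actually exists.
    for path in flatten_string_list(path_list):
        if not checkpoint_exists(path):
            raise ValueError("Checkpoint not found: {}".format(path))


# Hedged sketch of the averaging step itself (assumed, not necessarily this
# script's implementation): read each checkpoint with tf.train.load_checkpoint
# and average every variable element-wise.
def average_checkpoints(paths):
    readers = [tf.train.load_checkpoint(p) for p in paths]
    var_names = readers[0].get_variable_to_shape_map().keys()
    return {name: sum(r.get_tensor(name) for r in readers) / float(len(readers))
            for name in var_names}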
Example #6
from absl import app, logging

import neurst.utils.flags_core as flags_core
from neurst.data.datasets import Dataset, build_dataset
from neurst.exps import BaseExperiment, build_exp
from neurst.layers.quantization import QuantLayer
from neurst.models import BaseModel
from neurst.tasks import Task, build_task
from neurst.training import training_utils
from neurst.utils.configurable import ModelConfigs, deep_merge_dict, load_from_config_path, yaml_load_checking
from neurst.utils.hparams_sets import get_hyper_parameters
from neurst.utils.misc import flatten_string_list

FLAG_LIST = [
    flags_core.Flag("distribution_strategy",
                    dtype=flags_core.Flag.TYPE.STRING,
                    default="mirrored",
                    help="The distribution strategy."),
    flags_core.Flag("dtype",
                    dtype=flags_core.Flag.TYPE.STRING,
                    default="float16",
                    help="The computation type of the whole model."),
    flags_core.Flag(
        "enable_check_numerics",
        dtype=flags_core.Flag.TYPE.BOOLEAN,
        default=None,
        help="Whether to open the tf.debugging.enable_check_numerics. "
        "Note that this may lower down the training speed."),
    flags_core.Flag("enable_xla",
                    dtype=flags_core.Flag.TYPE.BOOLEAN,
                    default=None,
                    help="Whether to enable XLA for training."),
Example #7
from absl import app, logging

import neurst.utils.flags_core as flags_core
from neurst.utils.converters import Converter, build_converter

FLAG_LIST = [
    flags_core.Flag("from",
                    dtype=flags_core.Flag.TYPE.STRING,
                    default=None,
                    required=True,
                    help="The path to pretrained model directory "
                    "or a key indicating the publicly available model name."),
    flags_core.Flag("to",
                    dtype=flags_core.Flag.TYPE.STRING,
                    default=None,
                    required=True,
                    help="The path to save the converted checkpoint."),
    flags_core.Flag("model_name",
                    dtype=flags_core.Flag.TYPE.STRING,
                    default=None,
                    required=True,
                    help="The name of pretrained model, e.g. google_bert."),
]
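
# Hedged sketch of the entry point: the flag parsing mirrors the other neurst
# command-line tools shown above, while the converter-call signatures below are
# assumptions, not confirmed by this excerpt.
def _main(_):
    arg_parser = flags_core.define_flags(FLAG_LIST, with_config_file=False)
    args, remaining_argv = flags_core.intelligent_parse_flags(FLAG_LIST, arg_parser)
    flags_core.verbose_flags(FLAG_LIST, args, remaining_argv)
    converter = build_converter(args["model_name"])  # assumed signature
    converter.convert(args["from"], args["to"])  # assumed signature
    logging.info("Saved the converted checkpoint to %s", args["to"])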

Example #8
import neurst.utils.flags_core as flags_core
from neurst.data.datasets import Dataset, build_dataset
from neurst.data.datasets.audio.audio_dataset import AudioTripleTFRecordDataset
from neurst.data.datasets.parallel_text_dataset import ParallelTextDataset
from neurst.exps import build_exp
from neurst.exps.sequence_generator import SequenceGenerator
from neurst.layers.search import SequenceSearch
from neurst.metrics.metric import Metric
from neurst.tasks import build_task
from neurst.training import training_utils
from neurst.utils.configurable import ModelConfigs

FLAG_LIST = [
    flags_core.Flag("distribution_strategy",
                    dtype=flags_core.Flag.TYPE.STRING,
                    default="mirrored",
                    help="The distribution strategy."),
    flags_core.Flag("dtype",
                    dtype=flags_core.Flag.TYPE.STRING,
                    default="float16",
                    help="The computation type of the whole model."),
    flags_core.Flag(
        "enable_check_numerics",
        dtype=flags_core.Flag.TYPE.BOOLEAN,
        default=None,
        help="Whether to open the tf.debugging.enable_check_numerics. "
        "Note that this may lower down the training speed."),
    flags_core.Flag("asr_model_dir",
                    dtype=flags_core.Flag.TYPE.STRING,
                    help="The path to the ASR model checkpoint."),
    flags_core.Flag("mt_model_dir",
Example #9
import tensorflow as tf
from absl import app, logging

import neurst.utils.flags_core as flags_core
from neurst.data.datasets import Dataset, build_dataset
from neurst.data.datasets.audio.audio_dataset import RawAudioDataset

FLAG_LIST = [
    flags_core.Flag("output_transcript_file",
                    dtype=flags_core.Flag.TYPE.STRING,
                    required=True,
                    help="The path to save transcriptions."),
    flags_core.Flag("output_translation_file",
                    dtype=flags_core.Flag.TYPE.STRING,
                    default=None,
                    help="The path to save transcriptions."),
    flags_core.ModuleFlag(Dataset.REGISTRY_NAME, help="The raw dataset."),
]


def main(dataset, output_transcript_file, output_translation_file=None):
    assert isinstance(dataset, RawAudioDataset)
    transcripts = dataset.transcripts
    translations = dataset.translations
    assert transcripts, "Fail to extract transcripts."
    with tf.io.gfile.GFile(output_transcript_file, "w") as fw:
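        # Hedged continuation (the original is cut off here): write one
        # transcript per line, and do the same for translations when an
        # output_translation_file is provided.
        for transcript in transcripts:
            fw.write(transcript.strip() + "\n")
    if output_translation_file and translations:
        with tf.io.gfile.GFile(output_translation_file, "w") as fw:
            for translation in translations:
                fw.write(translation.strip() + "\n")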