Example #1
    def cleanse(self, src_lang_file, tar_lang_file):
        """
        Cleans the file provided by lowercasing all words and ensuring each line in
        the text file is within min_len and max_len. Operates on two streams
        simultaneously in order to keep line to line correspondence
        """
        self._validate_file(src_lang_file)
        self._validate_file(tar_lang_file)
        src_dest_file = self.destdir + utilities.strip_filename_from_path(src_lang_file) + ".cleansed"
        tar_dest_file = self.destdir + utilities.strip_filename_from_path(tar_lang_file) + ".cleansed"

        if utilities.files_exist([src_dest_file, tar_dest_file]):
            return
        else:
            utilities.wipe_files([src_dest_file, tar_dest_file])
        self._print("""Cleaning data.  Ensuring uniformity of data...""")

        src_buf, tar_buf = [], []
        with open(src_lang_file) as src, open(tar_lang_file) as tar:
            for src_line, tar_line in zip(src, tar):
                src_line = src_line.lower().split()
                tar_line = tar_line.lower().split()

                if self.min_len < len(src_line) < self.max_len and \
                        self.min_len < len(tar_line) < self.max_len:
                    src_buf.append(' '.join(src_line))
                    tar_buf.append(' '.join(tar_line))

                if asizeof.asizeof(src_buf) + asizeof.asizeof(tar_buf) > self.mem_limit:
                    self._dump_bufs_to([src_dest_file, tar_dest_file],
                                       [src_buf, tar_buf])

        self._dump_bufs_to([src_dest_file, tar_dest_file], [src_buf, tar_buf])
        self._print("Done\n")
Example #2
 def _init_filenames(self):
     self.src_fname = self.dir + utilities.strip_filename_from_path(
         self.src_path)
     self.piv1_fname = self.dir + utilities.strip_filename_from_path(
         self.piv1_path)
     self.piv2_fname = self.dir + utilities.strip_filename_from_path(
         self.piv2_path)
     self.tar_fname = self.dir + utilities.strip_filename_from_path(
         self.tar_path)
Example #3
    def build_language_models(self, datafile):
        """
        Building the language model ensures fluid output. Should only
        be built for the target language in each pivot.
        In the Moses tutortial, the .lm file corresponds to the .arpa
        file
        """
        self._validate_file(datafile)
        lm_file = self.lmdir + utilities.strip_filename_from_path(datafile) + ".lm"
        blm_file = self.lmdir + utilities.strip_filename_from_path(datafile) + ".blm"
        if utilities.file_exists(lm_file) and utilities.file_exists(blm_file):
            return

        self._print("Building and binarizing language models... ")
        command = self.path_to_moses + "bin/lmplz "\
              "-o {} ".format(self.NGRAM) + \
              "--text " + datafile + \
              " --arpa " + lm_file + \
              " >> {} 2>&1".format(self.lmdir + "lm.out")
        subprocess.call(command, shell=True)
        self.binarize_language_model(lm_file, blm_file)
        self._print("Done\n")
Example #4
    def subset(self, src_lang_file, tar_lang_file, proportion, subdir=""):
        """ Creates a new proportion of data set to create new datasets.
        Maintains the correspondence of entries between two data files.
        Takes as parameters the two files that must be subset, the fraction
        of data that should be taken as a subset (1/3), and an option subdir
        directory where the files should be placed within the data dir """
        self._validate_file(src_lang_file)
        self._validate_file(tar_lang_file)

        src_dest_file = self.destdir + subdir + utilities.strip_filename_from_path(src_lang_file) + ".subset"
        tar_dest_file = self.destdir + subdir + utilities.strip_filename_from_path(tar_lang_file) + ".subset"

        if utilities.files_exist([src_dest_file, tar_dest_file]):
            return
        else:
            utilities.wipe_files([src_dest_file, tar_dest_file])

        self._print("""Choosing a random subset of the data...""")
        text_size = self._min_text_size(src_lang_file, tar_lang_file)
        subset_size = int(proportion * text_size)
        assert subset_size > 0, "Subset size must be positive"

        subset_lines = deque(sorted(random.sample(range(0, text_size), subset_size)))
        self._get_lines(src_lang_file, tar_lang_file, subset_lines, src_dest_file, tar_dest_file)
        self._print("Done\n")
Example #5
    def tokenize(self, src_file):
        """
        Tokenize the file provided using the mosesdecoder script
        by splitting the symbols in the sentences to be space-delimited
        """
        self._validate_file(src_file)
        dest_file = self.destdir + utilities.strip_filename_from_path(src_file) + ".tok"
        if utilities.file_exists(dest_file):
            return

        self._print("""Running tokenizer. """
            """Splitting into space delimited tokens... """)
        command = self.path_to_moses + "scripts/tokenizer/tokenizer.perl " + \
            "-q -threads {} ".format(NUM_CPUS) + \
            "< {}".format(src_file) + \
            " > {}".format(dest_file)
        subprocess.call(command, shell=True)
        self._print("Done\n")
Example #6
    def train(self, src_file, tar_file, working_dir):
        """
        Carries out the training.  Creates a working directory,
        extracts the root file information and file extension information
        necessary for moses to run.  Sends output messages to working_dir/log
        """
        if utilities.dir_exists(working_dir):
            return

        self._validate_file(src_file)
        self._validate_file(tar_file)

        cwd = os.getcwd() + "/"
        blm = cwd + "lm/" + utilities.strip_filename_from_path(tar_file) + ".blm"

        shared = self._find_common_beginning(src_file, tar_file)
        file1_ext = src_file[shared+1:]
        file2_ext = tar_file[shared+1:]
        fileroot = cwd + src_file[:shared]
        log = "train.out"

        utilities.make_dir(working_dir)
        self._print("Training model at {}. This may take a while... ".format(working_dir))
        trainer = self.path_to_moses + "scripts/training/train-model.perl"
        command = "cd {};".format(working_dir) +\
            " nohup nice " + trainer + \
            " -root-dir train -corpus {}".format(fileroot) + \
            " -f {} -e {} -alignment".format(file1_ext, file2_ext) + \
            " grow-diag-final-and -reordering msd-bidirectional-fe" + \
            " -lm 0:3:{}:8".format(blm) + \
            " -cores {}".format(self.NCPUS) + \
            " -mgiza --parallel" + \
            " -external-bin-dir " + self.path_to_moses + "tools/mgizapp/" + \
            " >& {};".format(log) + \
            " cd .."
        subprocess.call(command, shell=True)
        self._print("Done\n")
Example #7
    def _ttt_filenames(self, src_file, src_piv_file, piv_tar_file, tar_file):
        """
        Constructs the appropriate train tune test file extension names for the data.
        Returns a list of lists, where the list in index 0 is the name of the train
        files, the list in index 1 is the tune files, and index 2 is the test
        """
        files = [src_file, src_piv_file, piv_tar_file, tar_file]
        train_files = [self.traindir + utilities.strip_filename_from_path(f) + ".train"
                       for f in files]
        tune_files = [self.tunedir + utilities.strip_filename_from_path(f) + ".tune"
                      for f in files]
        test_files = [self.testdir + utilities.strip_filename_from_path(f) + ".test"
                      for f in files]
        return train_files, tune_files, test_files
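Callers unpack the three parallel lists directly; positions line up across them (index 0 = src, 1 = src-piv, 2 = piv-tar, 3 = tar). A hypothetical call:

    train_files, tune_files, test_files = self._ttt_filenames(
        src_file, src_piv_file, piv_tar_file, tar_file)
    src_train = train_files[0]   # e.g. <traindir>/<src name>.train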
Example #8
 def _init_filename(self):
     self.base_name = self.dir + utilities.strip_filename_from_path(
         self.raw_file)