Example #1
    def cleanse(self, src_lang_file, tar_lang_file):
        """
        Cleans the file provided by lowercasing all words and ensuring each line in
        the text file is within min_len and max_len. Operates on two streams
        simultaneously in order to keep line to line correspondence
        """
        self._validate_file(src_lang_file)
        self._validate_file(tar_lang_file)
        src_dest_file = self.destdir + utilities.strip_filename_from_path(src_lang_file) + ".cleansed"
        tar_dest_file = self.destdir + utilities.strip_filename_from_path(tar_lang_file) + ".cleansed"

        if utilities.files_exist([src_dest_file, tar_dest_file]):
            return
        else:
            utilities.wipe_files([src_dest_file, tar_dest_file])
        self._print("""Cleaning data.  Ensuring uniformity of data...""")

        src_buf, tar_buf = [], []
        with open(src_lang_file) as src, open(tar_lang_file) as tar:
            for src_line, tar_line in zip(src, tar):
                src_line = src_line.lower().split()
                tar_line = tar_line.lower().split()

                if self.min_len < len(src_line) < self.max_len and \
                        self.min_len < len(tar_line) < self.max_len:
                    src_buf.append(' '.join(src_line))
                    tar_buf.append(' '.join(tar_line))

                if asizeof.asizeof(src_buf) + asizeof.asizeof(tar_buf) > self.mem_limit:
                    self._dump_bufs_to([src_dest_file, tar_dest_file],
                                       [src_buf, tar_buf])

        self._dump_bufs_to([src_dest_file, tar_dest_file], [src_buf, tar_buf])
        self._print("Done\n")
Example #2
 def _init_filenames(self):
     self.src_fname = self.dir + utilities.strip_filename_from_path(
         self.src_path)
     self.piv1_fname = self.dir + utilities.strip_filename_from_path(
         self.piv1_path)
     self.piv2_fname = self.dir + utilities.strip_filename_from_path(
         self.piv2_path)
     self.tar_fname = self.dir + utilities.strip_filename_from_path(
         self.tar_path)
Example #3
    def build_language_models(self, datafile):
        """
        Building the language model ensures fluid output. Should only
        be built for the target language in each pivot.
        In the Moses tutortial, the .lm file corresponds to the .arpa
        file
        """
        self._validate_file(datafile)
        lm_file = self.lmdir + utilities.strip_filename_from_path(datafile) + ".lm"
        blm_file = self.lmdir + utilities.strip_filename_from_path(datafile) + ".blm"
        if utilities.file_exists(lm_file) and utilities.file_exists(blm_file):
            return

        self._print("Building and binarizing language models... ")
        command = self.path_to_moses + "bin/lmplz "\
              "-o {} ".format(self.NGRAM) + \
              "--text " + datafile + \
              " --arpa " + lm_file + \
              " >> {} 2>&1".format(self.lmdir + "lm.out")
        subprocess.call(command, shell=True)
        self.binarize_language_model(lm_file, blm_file)
        self._print("Done\n")
Example #4
    def subset(self, src_lang_file, tar_lang_file, proportion, subdir=""):
        """ Creates a new proportion of data set to create new datasets.
        Maintains the correspondence of entries between two data files.
        Takes as parameters the two files that must be subset, the fraction
        of data that should be taken as a subset (1/3), and an option subdir
        directory where the files should be placed within the data dir """
        self._validate_file(src_lang_file)
        self._validate_file(tar_lang_file)

        src_dest_file = self.destdir + subdir + utilities.strip_filename_from_path(src_lang_file) + ".subset"
        tar_dest_file = self.destdir + subdir + utilities.strip_filename_from_path(tar_lang_file) + ".subset"

        if utilities.files_exist([src_dest_file, tar_dest_file]):
            return
        else:
            utilities.wipe_files([src_dest_file, tar_dest_file])

        self._print("""Choosing a random subset of the data...""")
        text_size = self._min_text_size(src_lang_file, tar_lang_file)
        subset_size = int(proportion * text_size)
        assert subset_size > 0, "Subset size must be positive"

        subset_lines = deque(sorted(random.sample(range(0, text_size), subset_size)))
        self._get_lines(src_lang_file, tar_lang_file, subset_lines, src_dest_file, tar_dest_file)
        self._print("Done\n")
Example #5
    def tokenize(self, src_file):
        """
        Tokenize the file provided using the mosesdecoder script
        by splitting the symbols in the sentences to be space-delimited
        """
        self._validate_file(src_file)
        dest_file = self.destdir + utilities.strip_filename_from_path(src_file) + ".tok"
        if utilities.file_exists(dest_file):
            return

        self._print("""Running tokenizer. """
            """Splitting into space delimited tokens... """)
        command = self.path_to_moses + "scripts/tokenizer/tokenizer.perl " + \
            "-q -threads {} ".format(NUM_CPUS) + \
            "< {}".format(src_file) + \
            " > {}".format(dest_file)
        subprocess.call(command, shell=True)
        self._print("Done\n")
Example #6
    def train(self, src_file, tar_file, working_dir):
        """
        Carries out the training.  Creates a working directory,
        extracts the root file information and file extension information
        necessary for moses to run.  Sends output messages to working_dir/log
        """
        if utilities.dir_exists(working_dir):
            return

        self._validate_file(src_file)
        self._validate_file(tar_file)

        cwd = os.getcwd() + "/"
        blm = cwd + "lm/" + utilities.strip_filename_from_path(tar_file) + ".blm"

        shared = self._find_common_beginning(src_file, tar_file)
        file1_ext = src_file[shared+1:]
        file2_ext = tar_file[shared+1:]
        fileroot = cwd + src_file[:shared]
        log = "train.out"

        utilities.make_dir(working_dir)
        self._print("Training model at {}. This may take a while... ".format(working_dir))
        trainer = self.path_to_moses + "scripts/training/train-model.perl"
        command = "cd {};".format(working_dir) +\
            " nohup nice " + trainer + \
            " -root-dir train -corpus {}".format(fileroot) + \
            " -f {} -e {} -alignment".format(file1_ext, file2_ext) + \
            " grow-diag-final-and -reordering msd-bidirectional-fe" + \
            " -lm 0:3:{}:8".format(blm) + \
            " -cores {}".format(self.NCPUS) + \
            " -mgiza --parallel" + \
            " -external-bin-dir " + self.path_to_moses + "tools/mgizapp/" + \
            " >& {};".format(log) + \
            " cd .."
        subprocess.call(command, shell=True)
        self._print("Done\n")
Example #7
    def _ttt_filenames(self, src_file, src_piv_file, piv_tar_file, tar_file):
        """
        Constructs the appropriate train tune test file extension names for the data.
        Returns a list of lists, where the list in index 0 is the name of the train
        files, the list in index 1 is the tune files, and index 2 is the test
        """
        files = [src_file, src_piv_file, piv_tar_file, tar_file]
        train_files = [self.traindir + utilities.strip_filename_from_path(f) + ".train"
                       for f in files]
        tune_files = [self.tunedir + utilities.strip_filename_from_path(f) + ".tune"
                      for f in files]
        test_files = [self.testdir + utilities.strip_filename_from_path(f) + ".test"
                      for f in files]
        return train_files, tune_files, test_files
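Callers unpack the three parallel lists directly; positions line up across them (index 0 = src, 1 = src-piv, 2 = piv-tar, 3 = tar). A hypothetical call:

    train_files, tune_files, test_files = self._ttt_filenames(
        src_file, src_piv_file, piv_tar_file, tar_file)
    src_train = train_files[0]   # e.g. <traindir>/<src name>.train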
Example #8
 def _init_filename(self):
     self.base_name = self.dir + utilities.strip_filename_from_path(
         self.raw_file)