Exemplo n.º 1
0
    def grid_search(self, kwargs):
        """
        Train and evaluate one classifier per hyper-parameter combination.

        Args:
            kwargs: mapping whose "wordNgrams", "bucket", "lr", "dim",
                "epoch" and "loss" entries are iterables of candidate
                values; every combination (Cartesian product) is tried.
                It must also contain "name", which is handed to
                ``testClassifier``. All remaining entries are forwarded
                unchanged to ``trainClassifier``.

        Returns:
            None. For each combination a record made of the
            space-separated parameter values followed by the output of
            ``testClassifier`` is appended to ../evaluations/results.txt
            through ``save_data``.

        The mapping passed in is not modified: each run works on its own
        copy, so the candidate lists are still intact after the search.
        """
        searched = ("wordNgrams", "bucket", "lr", "dim", "epoch", "loss")

        make_dir("../evaluations")
        grid = product(*(kwargs[key] for key in searched))

        for combination in grid:
            # Per-trial copy: overwriting the caller's dict would replace
            # its candidate lists with the scalars of the last combination.
            trial = dict(kwargs)
            trial.update(zip(searched, combination))
            # The bucket value is always passed on as an int.
            trial["bucket"] = int(trial["bucket"])

            parameters = " ".join(str(trial[key]) for key in searched)

            self.trainClassifier(**trial)
            results = "{}\n{}\n\n".format(
                parameters, self.testClassifier(trial["name"]))
            save_data(directory="../evaluations",
                      name="results.txt",
                      docs=results,
                      mode="a")
Exemplo n.º 2
0
 def __init__(self, path_to_moses, NCPUS, NGRAM, verbose=False):
     """
     Store the configuration and pass the "lm/" directory to
     utilities.make_dir.

     path_to_moses, NCPUS, NGRAM and verbose are kept as attributes of
     the same name; the directory name is kept in self.lmdir.
     """
     self.path_to_moses, self.NCPUS, self.NGRAM = path_to_moses, NCPUS, NGRAM

     lm_directory = "lm/"
     self.lmdir = lm_directory
     utilities.make_dir(lm_directory)

     self.verbose = verbose
Exemplo n.º 3
0
    def __init__(self, path_to_moses, mem_limit, max_len, min_len, verbose=False):
        """
        Store the configuration and derive the data directory layout.

        path_to_moses, mem_limit, max_len, min_len and verbose are kept as
        attributes of the same name. The directory attributes are
        destdir ("data/") with traindir, tunedir and testdir below it.
        Only destdir is handed to utilities.make_dir here.
        """
        self.path_to_moses = path_to_moses
        self.mem_limit, self.max_len, self.min_len = mem_limit, max_len, min_len

        data_root = "data/"
        self.destdir = data_root
        self.traindir = "{}train/".format(data_root)
        self.tunedir = "{}tune/".format(data_root)
        self.testdir = "{}test/".format(data_root)

        self.verbose = verbose
        utilities.make_dir(data_root)
Exemplo n.º 4
0
 def trainClassifier(self, **kwargs):
     """
         Runs the fasttext executable on a training file
         Args:
             kwargs: must contain "name" (inserted into the training-file
                 and model-file names) and "model" (first argument given
                 to the fasttext executable); the whole mapping is also
                 forwarded to self.setParameters, whose result is
                 appended to the command line
         Returns:
             None
     """
     make_dir("../fastTextModels")
     dataset_name = kwargs["name"]
     mode = kwargs["model"]
     extra_args = self.setParameters(**kwargs)
     # Adjacent literals concatenate into the single command template.
     command = (
         "../fastText/fasttext {} "
         "-input ../Dataset/training_set_processed/training_{}.txt "
         "-output ../fastTextModels/model_{} "
         "-label __label__ {}"
     ).format(mode, dataset_name, dataset_name, extra_args)
     system(command)
Exemplo n.º 5
0
    def train(self, src_file, tar_file, working_dir):
        """
        Run Moses' train-model.perl for the src_file/tar_file pair.

        Returns immediately when utilities.dir_exists(working_dir) is true.
        Otherwise both files are validated, working_dir is passed to
        utilities.make_dir, and the training script is run through the
        shell from inside working_dir. Its stdout and stderr are written
        to working_dir/train.out.

        Args:
            src_file: path of the source-side corpus file.
            tar_file: path of the target-side corpus file; its stripped
                file name also selects the language model
                <cwd>/lm/<name>.blm.
            working_dir: directory the training runs in.

        Returns:
            None. The exit status of the shell command is not checked.
        """
        if utilities.dir_exists(working_dir):
            return

        self._validate_file(src_file)
        self._validate_file(tar_file)

        cwd = os.getcwd() + "/"
        blm = cwd + "lm/" + utilities.strip_filename_from_path(tar_file) + ".blm"

        # The two paths are split into a shared root (given to -corpus) and
        # their differing tails (given to -f / -e).
        # NOTE(review): the +1 presumably skips a "." separator that follows
        # the common prefix - confirm against _find_common_beginning.
        shared = self._find_common_beginning(src_file, tar_file)
        file1_ext = src_file[shared+1:]
        file2_ext = tar_file[shared+1:]
        fileroot = cwd + src_file[:shared]
        log = "train.out"

        utilities.make_dir(working_dir)
        self._print("Training model at {}. This may take a while... ".format(working_dir))
        trainer = self.path_to_moses + "scripts/training/train-model.perl"
        command = "".join([
            "cd {};".format(working_dir),
            " nohup nice ", trainer,
            " -root-dir train -corpus {}".format(fileroot),
            " -f {} -e {} -alignment".format(file1_ext, file2_ext),
            " grow-diag-final-and -reordering msd-bidirectional-fe",
            " -lm 0:3:{}:8".format(blm),
            " -cores {}".format(self.NCPUS),
            " -mgiza --parallel",
            " -external-bin-dir ", self.path_to_moses, "tools/mgizapp/",
            # shell=True runs /bin/sh, so use the POSIX spelling; the
            # bash/csh shorthand ">& file" is rejected by shells like dash.
            " > {} 2>&1;".format(log),
            " cd ..",
        ])
        subprocess.call(command, shell=True)
        self._print("Done\n")
Exemplo n.º 6
0
    def split_train_tune_test(self, src_file, src_piv_file, piv_tar_file, tar_file,
        train_split, test_split):
        """
        Randomly distribute the lines of four parallel files over train,
        tune and test sets.

        The four files are read in lockstep and one random draw is made per
        line number, so the lines read together always land in the same set.
        Buffered lines are handed to _dump_ttt_bufs_to whenever the measured
        size of the buffers exceeds self.mem_limit, and once more at the end.

        Nothing is done when utilities.ttt_files_exist reports that the
        output files are already there; otherwise
        utilities.ttt_wipe_files is called on them first.

        Args:
            src_file, src_piv_file, piv_tar_file, tar_file: paths of the
                four input files.
            train_split: draws below this value go to the train set.
            test_split: width of the second band of draws (see the
                review note in the loop).

        Raises:
            AssertionError: if train_split + test_split exceeds 1.
        """
        for directory in (self.traindir, self.tunedir, self.testdir):
            utilities.make_dir(directory)

        for path in (src_file, src_piv_file, piv_tar_file, tar_file):
            self._validate_file(path)
        assert train_split + test_split <= 1 , "Invalid size for train, tune, and test splits"

        train_files, tune_files, test_files = self._ttt_filenames(src_file, src_piv_file, piv_tar_file, tar_file)
        if utilities.ttt_files_exist(train_files, tune_files, test_files):
            return
        utilities.ttt_wipe_files(train_files, tune_files, test_files)

        self._print("""Splitting data into train, tune, and test sets...""")
        # One buffer per input file for each of the three sets.
        train, tune, test = [[], [], [], []], [[], [], [], []], [[], [], [], []]

        # The with-statement closes all four inputs, also on error.
        with open(src_file) as src, open(src_piv_file) as src_piv, \
                open(piv_tar_file) as piv_tar, open(tar_file) as tar:
            # zip_longest pads exhausted files with None, so _add_line_to
            # can receive None when the files differ in length.
            for lines in zip_longest(src, src_piv, piv_tar, tar):
                x = numpy.random.sample()
                # NOTE(review): the band of width test_split fills the
                # *tune* buffers and the remainder fills *test* - confirm
                # this matches the intent of the parameter name.
                if x < train_split:
                    target = train
                elif x < train_split + test_split:
                    target = tune
                else:
                    target = test
                for buf, line in zip(target, lines):
                    self._add_line_to(buf, line)

                if asizeof.asizeof(train) + asizeof.asizeof(tune) + \
                    asizeof.asizeof(test) > self.mem_limit:
                    self._dump_ttt_bufs_to(train, tune, test, train_files, tune_files, test_files)

        self._dump_ttt_bufs_to(train, tune, test, train_files, tune_files, test_files)
        self._print("Done\n")
Exemplo n.º 7
0
 def make_dir(self):
     """Call utilities.make_dir on the site directory path, then on the
     path of every subdirectory listed in json_site_subdirs.
     """
     site_root = self.get_site_dir_path()
     utilities.make_dir(site_root)
     for name in json_site_subdirs:
         subdir_path = self.get_site_dir_subdir_path(name)
         utilities.make_dir(subdir_path)
Exemplo n.º 8
0
 def make_dir(self):
     """Call utilities.make_dir on the path returned by
     get_user_dir_path.
     """
     user_root = self.get_user_dir_path()
     utilities.make_dir(user_root)