def classify(self, index, outfiles, command_by_outputfile):
    """Classify fragments whose length is closest to fragment_len[index].

    Builds the svm_phylo_classify (single model) or
    svm_phylo_classify_ensemble (several models) command line and either
    runs it directly (processors == 1) or queues it as a parallel task in
    command_by_outputfile.

    :param index: index into config.settings["fragment_len"] selecting the
        fragment length (and the first of up to n_classifiers models) to use
    :param outfiles: set collecting the names of produced output files
    :param command_by_outputfile: dict mapping output file -> list of
        parallel.TaskCmd objects; extended in-place in parallel mode
    """
    classifier_command = "{} -v 0 ".format(utils.path_to_binary(self.workingdir, "svm_phylo_classify"))
    ensemble_command = "{} -v 0 -e 1 ".format(utils.path_to_binary(self.workingdir, "svm_phylo_classify_ensemble"))
    fl = self.config.settings["fragment_len"][index]
    test_file = "{fasta}.{fl}.sl".format(fasta=self.fastafile, fl=fl)
    out_file = "{fasta}.{fl}.out".format(fasta=self.fastafile, fl=fl)
    outfiles.add(out_file)
    # Use up to n_classifiers consecutive fragment-length models starting at
    # index (clamped to the end of the fragment_len list).
    last = min(index + self.config.settings["n_classifiers"], len(self.config.settings["fragment_len"]))
    classifiers_to_use = self.config.settings["fragment_len"][index:last]
    classifier_to_use_string = ",".join(str(c) for c in classifiers_to_use)
    sys.stdout.write("\tfragments close to length {fl} with classifiers {c}\n".format(fl=fl,
                                                                                      c=classifier_to_use_string))
    n_classifiers_to_use = len(classifiers_to_use)
    if n_classifiers_to_use == 1:
        # Single model: plain classifier binary, "TEST MODEL OUT" argument order.
        extra_command = "{test} {model} {out}".format(test=test_file, model=self.models[fl], out=out_file)
        final_command = "{classifier}{extra}".format(classifier=classifier_command, extra=extra_command)
    else:
        # Ensemble: "-m N TEST MODEL... OUT" — all models listed before the output file.
        extra_command = "-m {n} {test} ".format(n=n_classifiers_to_use, test=test_file)
        for c in classifiers_to_use:
            extra_command = "{cmd}{model} ".format(cmd=extra_command, model=self.models[c])
        extra_command = "{cmd}{out} ".format(cmd=extra_command, out=out_file)
        final_command = "{cmd}{ext}".format(cmd=ensemble_command, ext=extra_command)
    if self.config.settings["processors"] != 1:
        # Parallel mode: queue the command, grouped by its output file.
        sys.stdout.write("running predictions in parallel\n")
        if out_file not in command_by_outputfile:
            command_by_outputfile[out_file] = [parallel.TaskCmd(final_command)]
        else:
            command_by_outputfile[out_file].append(parallel.TaskCmd(final_command))
    else:
        s = os.system(final_command)
        if s != 0:
            sys.stderr.write("Error in classification: couldn't run the system command.\n")
            sys.exit(1)
def generate_kmers(self):
    """Generate k-mer feature vectors for the filtered FASTA file.

    Builds a fasta2kmers (normalization > 1) or fasta2kmers2 command from the
    configured k-mer settings and runs it via the shell, producing
    "<fastafile>.sl" in sparse/label format. Exits the process on failure.
    """
    kmer_strings = [str(k) for k in self.config.settings["kmer"]]
    sys.stdout.write("\nGenerating k-mer features ({})...\n".format("-".join(kmer_strings)))
    # Two candidate feature generators; which one runs depends on the
    # kmer_normalization setting (see use_fasta2kmers2 below).
    fasta2kmers_command = "{script} -s 1 -o 1 -h 1 -l 0 -C {rm} -t {n} -r {rev}".format(
        script=utils.path_to_binary(self.workingdir, "fasta2kmers"),
        rm=self.config.settings["rm_rev_complement"],
        n=self.config.settings["kmer_normalization"],
        rev=self.config.settings["rev_complement"])
    fasta2kmers2_command = "{script} -a w -s 1 -l 2 -o 1 -b 1 " \
                           "-R {rm} -h 1 " \
                           "-n {n} " \
                           "-r {rev}".format(script=utils.path_to_binary(self.workingdir, "fasta2kmers2"),
                                             rm=self.config.settings["rm_rev_complement"],
                                             n=self.config.settings["kmer_normalization"],
                                             rev=self.config.settings["rev_complement"])
    if len(self.config.settings["kmer"]) == 1:
        # Single k-mer size: -k and -j coincide.
        k = self.config.settings["kmer"][0]
        fasta2kmers_command = "{cmd} -k {kmer}".format(cmd=fasta2kmers_command, kmer=k)
        fasta2kmers2_command = "{cmd} -k {kmer} -j {kmer}".format(cmd=fasta2kmers2_command, kmer=k)
    else:
        # Range of k-mer sizes: -k is the largest, -j the smallest.
        k = max(self.config.settings["kmer"])
        j = min(self.config.settings["kmer"])
        fasta2kmers_command = "{cmd} -k {k} -j {j}".format(cmd=fasta2kmers_command, k=k, j=j)
        fasta2kmers2_command = "{cmd} -k {k} -j {j}".format(cmd=fasta2kmers2_command, k=k, j=j)
    # fasta2kmers writes unlabeled vectors to stdout; sed prepends the dummy
    # label "1 " expected by the SVM tools. fasta2kmers2 writes the file itself.
    fasta2kmers_command_final = "{cmd} {input} | sed 's/^/1 /' > {output}.sl".format(
        cmd=fasta2kmers_command, input=self.fastafile_filtered, output=self.fastafile)
    fasta2kmers2_command_final = "{cmd} -i {input} -f {output}.sl".format(
        cmd=fasta2kmers2_command, input=self.fastafile_filtered, output=self.fastafile)
    use_fasta2kmers2 = int(self.config.settings["kmer_normalization"]) <= 1
    # BUGFIX: previously the else-branch set cmd to the fasta2kmers2 command
    # while actually running fasta2kmers, so the failure message reported the
    # wrong command line. Select cmd once and run exactly that.
    if use_fasta2kmers2:
        cmd = fasta2kmers2_command_final
    else:
        cmd = fasta2kmers_command_final
    s = os.system(cmd)
    if s != 0:
        sys.stderr.write("Generating kmers failed with command:\n{cmd}".format(cmd=cmd))
        sys.exit(1)
    sys.stdout.write("done\n")
def build_models(self):
    """Train one SVM model per configured fragment length.

    If config.settings["c_grid"] holds a single C value, each model is built
    directly with it. Otherwise a 3-fold cross-validation (svm_phylo_cv) is
    run over the full C grid for every fragment length and the model is
    trained with the C that minimizes the average cross-validation loss.
    With processors > 1, the learn commands are collected and executed in
    parallel at the end. Exits the process on any training/CV failure.
    """
    my_log = logging.getLogger('train:build_models')
    # Options shared by the learn and cross-validation commands.
    kernel_opt = "-t {t} -g {g} -d {d} -s {s}".format(t=str(self.config.settings["kernel"]),
                                                      g=str(self.config.settings["kernel_rbf_gamma"]),
                                                      d=str(self.config.settings["kernel_polynomial_degree"]),
                                                      s=str(self.config.settings["kernel_polynomial_s"]))
    loss_opt = "-l {l} --L {L}".format(l=str(self.loss_function), L=str(self.config.settings["loss_action"]))
    other_opt = "--z {z} --v {v} --t {t}".format(z=str(self.z_standardization),
                                                 v=str(self.misc_nodes),
                                                 t=self.tree_file)
    learn_command = "{bin} {kernel} {loss} {other} " \
                    "-v 1 -o 2".format(bin=utils.path_to_binary(self.workingdir, "svm_phylo_learn"),
                                       kernel=kernel_opt, loss=loss_opt, other=other_opt)
    cv_command = "{bin} {kernel} {loss} {other} " \
                 "-x 3 -v 1 -o 2 --r 1 --S 1".format(bin=utils.path_to_binary(self.workingdir, "svm_phylo_cv"),
                                                     kernel=kernel_opt, loss=loss_opt, other=other_opt)
    if self.config.settings["balance_classes"]:
        learn_command = "{} --c 1".format(learn_command)
        cv_command = "{} --c 1".format(cv_command)
    my_log.debug('basic crossvalidation command:\n{} -c CVAL KMER_FILE'.format(cv_command))
    my_log.debug('basic learning command:\n{} -c CVAL KMER_FILE MODEL_FILE'.format(learn_command))
    tasks = []  # only needed when running in parallel
    if len(self.config.settings["c_grid"]) == 1:
        # No grid search: build every model directly with the single C value.
        c_val = self.config.settings["c_grid"][0]
        for fl in self.config.settings["fragment_len"]:
            learn_command_final = "{cmd} -c {cval} " \
                                  "{p}{sep}train_data{sep}{fl}.sl " \
                                  "{p}{sep}models{sep}{fl}_c{cval}.svm".format(
                                      cmd=learn_command, sep=os.path.sep, cval=c_val,
                                      p=self.config.settings["project_dir"], fl=fl)
            if self.config.settings["processors"] == 1:
                my_log.info("build {fl} length model with c={cval}".format(fl=fl, cval=c_val))
                s = os.system(learn_command_final)
                if s != 0:
                    my_log.critical("something went wrong with building the model:\n{}".format(learn_command_final))
                    sys.exit(1)
            else:
                tasks.append(parallel.TaskCmd(learn_command_final))
    else:
        # Cross-validation over the C grid, per fragment length.
        floating_point = re.compile(r'\d+\.\d+')  # hoisted: same pattern for every line/C value
        for fl in self.config.settings["fragment_len"]:
            my_log.info("Cross-validating {} length model.".format(fl))
            cv_loss = []
            cv_zeroone = []
            for c_val in self.config.settings["c_grid"]:
                my_log.debug("c={}".format(c_val))
                cv_command_final = "{cmd} -c {cval} " \
                                   "{p}{sep}train_data{sep}{fl}.sl".format(
                                       cmd=cv_command, sep=os.path.sep, cval=c_val,
                                       p=self.config.settings["project_dir"], fl=fl)
                # BUGFIX: the CV results must be parsed from the command's own
                # stdout. The previous code read from sys.stdin (and ran the
                # command separately via os.system), so it never saw any
                # output. os.popen captures the command's stdout directly.
                fr = os.popen(cv_command_final)
                lines = fr.readlines()
                fr.close()
                for line in lines:
                    if "Average loss in cross-validation" in line:
                        loss = floating_point.findall(line)
                        try:
                            cv_loss.append(float(loss[0]))
                        except IndexError:
                            continue
                    if "one-error in cross-validation" in line:
                        loss = floating_point.findall(line)
                        try:
                            cv_zeroone.append(float(loss[0]))
                        except IndexError:
                            continue
            if len(cv_loss) != len(self.config.settings["c_grid"]):
                # One loss value per grid point is expected; anything else
                # means a CV run failed or produced unparsable output.
                my_log.critical("Error, something went wrong with cross-validation "
                                "of {} length fragment model. Quitting".format(fl))
                sys.exit(1)
            my_log.debug("C grid: " + utils.any_list_to_string(self.config.settings["c_grid"]))
            my_log.debug("CV loss: " + utils.any_list_to_string(cv_loss))
            my_log.debug("CV 0-1: " + utils.any_list_to_string(cv_zeroone))
            # Build the model with the C value achieving minimum CV loss.
            loss_min = min(cv_loss)
            i = cv_loss.index(loss_min)
            c_val = self.config.settings["c_grid"][i]
            learn_command_final = "{cmd} -c {cval} " \
                                  "{p}{sep}train_data{sep}{fl}.sl " \
                                  "{p}{sep}models{sep}{fl}_c{cval}.svm".format(
                                      cmd=learn_command, sep=os.path.sep, cval=c_val,
                                      p=self.config.settings["project_dir"], fl=fl)
            my_log.info("build {fl} length model with c={cval} and CV-loss={loss}".format(fl=fl, cval=c_val,
                                                                                          loss=loss_min))
            if self.config.settings["processors"] == 1:
                s = os.system(learn_command_final)
                if s != 0:
                    my_log.critical("something went wrong with building the model:\n{}".format(learn_command_final))
                    sys.exit(1)
            else:
                tasks.append(parallel.TaskCmd(learn_command_final))
    if self.config.settings["processors"] > 1:
        my_log.info("building models in parallel...")
        if parallel.reportFailedCmd(parallel.runCmdParallel(tasks, self.config.settings["processors"])) is not None:
            sys.exit(-1)