def generate_kmer_features(self):
    """Generate k-mer feature (.sl) files from the sampled fasta fragments.

    For every fragment length <fl>, each fasta file under
    sampled_fasta/<fl> is fed through fasta2kmers (or fasta2kmers2,
    depending on the configured kmer normalization) and the results are
    appended to train_data/<fl>.sl.  With a single processor every command
    runs immediately; with several processors the commands are collected
    per fragment length and executed in rounds, so that two commands never
    append to the same output file at the same time.
    """
    my_log = logging.getLogger('train:generate_kmers')
    fasta2kmers_command = utils.fasta2kmers_base(self.config, self.workingdir)
    fasta2kmers2_command = utils.fasta2kmers2_base(self.config, self.workingdir)
    use_fasta2kmers2 = int(self.config.settings["kmer_normalization"]) <= 1
    if use_fasta2kmers2:
        my_log.debug('basic command for generating kmers:\n{} -i INPUT -f OUTPUT'.format(fasta2kmers2_command))
    else:
        my_log.debug('basic command for generating kmers:\n{} INPUT >> OUTPUT'.format(fasta2kmers_command))

    tasks_by_fl = {}
    max_entries = 0
    for fl in self.config.settings["fragment_len"]:
        fl_tasks = []
        fasta_dir = "{d}{sep}sampled_fasta{sep}{fl}".format(d=self.config.settings["project_dir"],
                                                            fl=str(fl), sep=os.path.sep)
        outfile = "{d}{sep}train_data{sep}{fl}.sl".format(d=self.config.settings["project_dir"],
                                                          fl=str(fl), sep=os.path.sep)
        for entry in os.listdir(fasta_dir):
            fastafile = os.path.join(fasta_dir, entry)
            if use_fasta2kmers2:
                cmd = "{cmd} -i {fasta} -f {out}".format(cmd=fasta2kmers2_command, out=outfile, fasta=fastafile)
            else:
                cmd = "{cmd} {fasta} >> {out}".format(cmd=fasta2kmers_command, out=outfile, fasta=fastafile)
            if self.config.settings["processors"] == 1:
                if os.system('{}'.format(cmd)) != 0:
                    my_log.critical("problem with generating kmers..:\n{}".format(cmd))
                    sys.exit(1)
            else:
                # attention, they all write to the same file
                fl_tasks.append(parallel.TaskCmd(cmd))
        if self.config.settings["processors"] > 1:
            tasks_by_fl[fl] = fl_tasks
            max_entries = max(max_entries, len(fl_tasks))

    if self.config.settings["processors"] > 1:
        # Commands sharing an output file must not overlap, so each round
        # runs at most one command per fragment length — i.e. at most
        # len(fragment_len) commands in parallel.  For full parallelism,
        # concatenate the fasta files first and use
        # generate_kmer_features_concat() instead.
        for i in range(max_entries):
            batch = [fl_tasks[i] for fl_tasks in tasks_by_fl.values() if i < len(fl_tasks)]
            if parallel.reportFailedCmd(parallel.runCmdParallel(batch,
                    self.config.settings["processors"])) is not None:  # Ivan change
                sys.exit(-1)  # Ivan change
def generate_kmer_features_concat(self):
    """Generate k-mer feature (.sl) files, one command per fragment length.

    Unlike generate_kmer_features(), all fasta files of a fragment length
    are first concatenated into a single <fl>.all.fna file, so only one
    fasta2kmers(2) invocation per fragment length is needed and those
    commands can safely run fully in parallel (no two of them write to the
    same output file).

    Fix vs. the original: the cat/rm/mv shell steps now have their exit
    status checked.  Previously a failed `cat` went unnoticed and the
    subsequent `rm *.fna` deleted the source fasta files without any
    combined file having been produced.
    """
    my_log = logging.getLogger('train:generate_kmers')
    fasta2kmers_command = utils.fasta2kmers_base(self.config, self.workingdir)
    fasta2kmers2_command = utils.fasta2kmers2_base(self.config, self.workingdir)
    use_fasta2kmers2 = int(self.config.settings["kmer_normalization"]) <= 1
    if use_fasta2kmers2:
        my_log.debug('basic command for generating kmers:\n{} -i INPUT -f OUTPUT'.format(fasta2kmers2_command))
    else:
        my_log.debug('basic command for generating kmers:\n{} INPUT >> OUTPUT'.format(fasta2kmers_command))
    tasks = []
    for fl in self.config.settings["fragment_len"]:
        p = "{d}{sep}sampled_fasta{sep}{fl}".format(d=self.config.settings["project_dir"],
                                                    fl=str(fl), sep=os.path.sep)
        combined_fasta = os.path.join(p, "{}.all.fna".format(fl))
        outfile = "{d}{sep}train_data{sep}{fl}.sl".format(d=self.config.settings["project_dir"],
                                                          fl=str(fl), sep=os.path.sep)
        # Concatenate into a .tmp file first so the *.fna glob below never
        # matches the combined file itself; abort BEFORE the rm if the
        # concatenation failed, otherwise the inputs would be lost.
        s = os.system("cat {dir}{sep}*.fna > {dir}{sep}{fl}.all.tmp".format(dir=p, fl=fl, sep=os.path.sep))
        if s != 0:
            my_log.critical("problem with concatenating fasta files in {}".format(p))
            sys.exit(1)
        s = os.system("rm {dir}{sep}*.fna".format(dir=p, sep=os.path.sep))
        if s != 0:
            my_log.critical("problem with removing fasta files in {}".format(p))
            sys.exit(1)
        s = os.system("mv {p}{sep}{fl}.all.tmp {combined}".format(p=p, fl=fl,
                                                                  combined=combined_fasta, sep=os.path.sep))
        if s != 0:
            my_log.critical("problem with renaming {p}{sep}{fl}.all.tmp".format(p=p, fl=fl, sep=os.path.sep))
            sys.exit(1)
        if use_fasta2kmers2:
            command = "{cmd} -i {combined} -f {out}".format(cmd=fasta2kmers2_command,
                                                            combined=combined_fasta, out=outfile)
        else:
            command = "{cmd} {combined} >> {out}".format(cmd=fasta2kmers_command,
                                                         combined=combined_fasta, out=outfile)
        if self.config.settings["processors"] > 1:
            tasks.append(parallel.TaskCmd(command))
        else:
            s = os.system('{}'.format(command))
            if s != 0:
                my_log.critical("problem with generating kmers..:\n{}".format(command))
                sys.exit(1)
    if self.config.settings["processors"] > 1:
        if parallel.reportFailedCmd(parallel.runCmdParallel(tasks,
                self.config.settings["processors"])) is not None:  # Ivan change
            sys.exit(-1)  # Ivan change
def predict(self):
    """Run every non-empty classifier on the input fasta and merge the results.

    classify() either executes the commands directly (single processor) or
    fills command_by_outputfile with one command list per output file; each
    such list is then executed in parallel, so commands that share an
    output file never run at the same time.  All per-classifier outputs are
    finally concatenated into <fastafile>.out.
    """
    sys.stdout.write("Predicting...\n")
    outfiles = set()
    # if commands should be run in parallel, store them in a list for each
    # output file and run each list in parallel afterwards
    command_by_outputfile = {}
    for index, classifiers in self.classifier.items():
        if not classifiers:
            continue
        self.classify(index, outfiles, command_by_outputfile)
    if self.config.settings["processors"] != 1:
        for commandlist in command_by_outputfile.values():
            failed = parallel.reportFailedCmd(
                parallel.runCmdParallel(commandlist,
                                        maxProc=self.config.settings["processors"]))  # Ivan change
            if failed is not None:
                sys.exit(-1)  # Ivan change
    # join all outputs
    self.combined_output_file = "{}.out".format(self.fastafile)
    utils.concat_files(outfiles, self.combined_output_file)
def build_models(self):
    """Train the SVM models from the prepared k-mer training data.

    If the configured c_grid holds a single value, one model per fragment
    length is built directly with that C.  Otherwise a 3-fold
    cross-validation is run for every C in the grid and the model is built
    with the C that yielded the minimal average CV loss.  Models are
    written to <project_dir>/models/<fl>_c<C>.svm; with several processors
    the learn commands are collected and executed in parallel at the end.

    Fix vs. the original: the CV output was "captured" by running
    os.system() and then reading sys.stdin — stdin never contains the
    command's output, and it was closed afterwards.  os.popen() is used
    instead, so the loss lines are actually parsed from the CV command.
    """
    my_log = logging.getLogger('train:build_models')
    # now as the training data is ready get the models;
    # if no grid was given then just build models
    # kernel options
    kernel_opt = "-t {t} -g {g} -d {d} -s {s}".format(t=str(self.config.settings["kernel"]),
                                                      g=str(self.config.settings["kernel_rbf_gamma"]),
                                                      d=str(self.config.settings["kernel_polynomial_degree"]),
                                                      s=str(self.config.settings["kernel_polynomial_s"]))
    loss_opt = "-l {l} --L {L}".format(l=str(self.loss_function),
                                       L=str(self.config.settings["loss_action"]))
    other_opt = "--z {z} --v {v} --t {t}".format(z=str(self.z_standardization),
                                                 v=str(self.misc_nodes), t=self.tree_file)
    learn_command = "{bin} {kernel} {loss} {other} " \
                    "-v 1 -o 2".format(bin=utils.path_to_binary(self.workingdir, "svm_phylo_learn"),
                                       kernel=kernel_opt, loss=loss_opt, other=other_opt)
    cv_command = "{bin} {kernel} {loss} {other} " \
                 "-x 3 -v 1 -o 2 --r 1 --S 1".format(bin=utils.path_to_binary(self.workingdir, "svm_phylo_cv"),
                                                     kernel=kernel_opt, loss=loss_opt, other=other_opt)
    if self.config.settings["balance_classes"]:
        learn_command = "{} --c 1".format(learn_command)
        cv_command = "{} --c 1".format(cv_command)
    my_log.debug('basic crossvalidation command:\n{} -c CVAL KMER_FILE'.format(cv_command))
    my_log.debug('basic learning command:\n{} -c CVAL KMER_FILE MODEL_FILE'.format(learn_command))
    tasks = []  # only needed when running in parallel
    if len(self.config.settings["c_grid"]) == 1:
        # no grid search: build every model directly with the single C value
        c_val = self.config.settings["c_grid"][0]
        for fl in self.config.settings["fragment_len"]:
            learn_command_final = "{cmd} -c {cval} " \
                                  "{p}{sep}train_data{sep}{fl}.sl " \
                                  "{p}{sep}models{sep}{fl}_c{cval}.svm".format(cmd=learn_command,
                                                                               sep=os.path.sep, cval=c_val,
                                                                               p=self.config.settings["project_dir"],
                                                                               fl=fl)
            if self.config.settings["processors"] == 1:
                my_log.info("build {fl} length model with c={cval}".format(fl=fl, cval=c_val))
                if os.system(learn_command_final) != 0:
                    my_log.critical("something went wrong with building the model:\n{}".format(learn_command_final))
                    sys.exit(1)
            else:
                tasks.append(parallel.TaskCmd(learn_command_final))
    else:
        # crossvalidation
        floating_point = re.compile(r'\d+\.\d+')  # hoisted: reused for every C and fragment length
        for fl in self.config.settings["fragment_len"]:
            my_log.info("Cross-validating {} length model.".format(fl))
            cv_loss = []
            cv_zeroone = []
            for c_val in self.config.settings["c_grid"]:
                my_log.debug("c={}".format(c_val))
                cv_command_final = "{cmd} -c {cval} " \
                                   "{p}{sep}train_data{sep}{fl}.sl".format(cmd=cv_command, sep=os.path.sep,
                                                                           cval=c_val,
                                                                           p=self.config.settings["project_dir"],
                                                                           fl=fl)
                # BUGFIX: capture the CV command's stdout via os.popen();
                # the old code ran os.system() and read sys.stdin instead.
                fr = os.popen(cv_command_final)
                lines = fr.readlines()
                fr.close()
                for line in lines:
                    if "Average loss in cross-validation" in line:
                        matches = floating_point.findall(line)
                        if matches:
                            cv_loss.append(float(matches[0]))
                    if "one-error in cross-validation" in line:
                        matches = floating_point.findall(line)
                        if matches:
                            cv_zeroone.append(float(matches[0]))
            if len(cv_loss) != len(self.config.settings["c_grid"]):
                my_log.critical("Error, something went wrong with cross-validation "
                                "of {} length fragment model. Quitting".format(fl))
                # exit or continue?
                sys.exit(1)
            my_log.debug("C grid: " + utils.any_list_to_string(self.config.settings["c_grid"]))
            my_log.debug("CV loss: " + utils.any_list_to_string(cv_loss))
            my_log.debug("CV 0-1: " + utils.any_list_to_string(cv_zeroone))
            loss_min = min(cv_loss)
            i = cv_loss.index(loss_min)
            # build model for minimum loss
            c_val = self.config.settings["c_grid"][i]
            learn_command_final = "{cmd} -c {cval} " \
                                  "{p}{sep}train_data{sep}{fl}.sl " \
                                  "{p}{sep}models{sep}{fl}_c{cval}.svm".format(cmd=learn_command,
                                                                               sep=os.path.sep, cval=c_val,
                                                                               p=self.config.settings["project_dir"],
                                                                               fl=fl)
            my_log.info("build {fl} length model with c={cval} and CV-loss={loss}".format(fl=fl, cval=c_val,
                                                                                          loss=loss_min))
            if self.config.settings["processors"] == 1:
                if os.system("{}".format(learn_command_final)) != 0:
                    my_log.critical("something went wrong with building the model:\n{}".format(learn_command_final))
                    sys.exit(1)
            else:
                tasks.append(parallel.TaskCmd(learn_command_final))
    if self.config.settings["processors"] > 1:
        my_log.info("building models in parallel...")
        if parallel.reportFailedCmd(parallel.runCmdParallel(tasks,
                self.config.settings["processors"])) is not None:  # Ivan change
            sys.exit(-1)  # Ivan change