def generate_kmer_features(self):
    """Generate k-mer feature files, running one command per sampled fasta file.

    For every fragment length ``fl`` in ``settings["fragment_len"]`` each file
    in ``<project_dir>/sampled_fasta/<fl>`` is handed to the k-mer command and
    the output goes to ``<project_dir>/train_data/<fl>.sl``.  Which command is
    used depends on ``settings["kmer_normalization"]``: values <= 1 select the
    ``fasta2kmers2`` command line, anything else the ``fasta2kmers`` one.

    With ``settings["processors"] > 1`` the commands are run through
    ``parallel.runCmdParallel``; otherwise they are executed one after the
    other via ``os.system``.

    Exits the process with status 1 if a serially executed command returns a
    non-zero status, and with status -1 if ``parallel.reportFailedCmd``
    reports a failure (returns something other than ``None``).
    """
    my_log = logging.getLogger('train:generate_kmers')
    settings = self.config.settings
    fasta2kmers_command = utils.fasta2kmers_base(self.config, self.workingdir)
    fasta2kmers2_command = utils.fasta2kmers2_base(self.config, self.workingdir)
    use_fasta2kmers2 = int(settings["kmer_normalization"]) <= 1
    if use_fasta2kmers2:
        my_log.debug('basic command for generating kmers:\n{} -i INPUT -f OUTPUT'.format(fasta2kmers2_command))
    else:
        my_log.debug('basic command for generating kmers:\n{} INPUT >> OUTPUT'.format(fasta2kmers_command))

    # Decide the execution mode once.  Everything that is not "more than one
    # processor" runs serially (same rule as generate_kmer_features_concat),
    # so a setting such as 0 can no longer queue commands that never execute.
    run_parallel = settings["processors"] > 1

    tasks_by_fl = {}
    for fl in settings["fragment_len"]:
        fasta_dir = os.path.sep.join([settings["project_dir"], "sampled_fasta", str(fl)])
        outfile = os.path.sep.join([settings["project_dir"], "train_data", "{}.sl".format(fl)])

        fl_tasks = []
        for name in os.listdir(fasta_dir):
            fastafile = os.path.join(fasta_dir, name)
            if use_fasta2kmers2:
                file_cmd = "{cmd} -i {fasta} -f {out}".format(cmd=fasta2kmers2_command, out=outfile, fasta=fastafile)
            else:
                file_cmd = "{cmd} {fasta} >> {out}".format(cmd=fasta2kmers_command, out=outfile, fasta=fastafile)

            if run_parallel:
                # attention: all commands of one fragment length write to the same file
                fl_tasks.append(parallel.TaskCmd(file_cmd))
            elif os.system(file_cmd) != 0:
                my_log.critical("problem with generating kmers..:\n{}".format(file_cmd))
                sys.exit(1)

        if run_parallel:
            tasks_by_fl[fl] = fl_tasks

    if not run_parallel:
        return

    # Commands that write to the same output file must not run concurrently,
    # so every batch takes at most one command per fragment length.  This
    # limits the parallelism to len(fragment_len); concatenating the inputs
    # and using generate_kmer_features_concat() avoids that limit.
    max_entries = max([len(tasks) for tasks in tasks_by_fl.values()] or [0])
    for i in range(max_entries):
        batch = [tasks[i] for tasks in tasks_by_fl.values() if i < len(tasks)]
        failed = parallel.reportFailedCmd(parallel.runCmdParallel(batch, settings["processors"]))
        if failed is not None:
            sys.exit(-1)
def generate_kmer_features_concat(self):
    """Generate k-mer feature files from one concatenated fasta file per fragment length.

    For every fragment length ``fl`` in ``settings["fragment_len"]`` all
    ``*.fna`` files in ``<project_dir>/sampled_fasta/<fl>`` are concatenated
    into ``<fl>.all.fna`` in that directory (the individual files are removed),
    and a single k-mer command is built that writes to
    ``<project_dir>/train_data/<fl>.sl``.  Which command is used depends on
    ``settings["kmer_normalization"]``: values <= 1 select the ``fasta2kmers2``
    command line, anything else the ``fasta2kmers`` one.

    With ``settings["processors"] > 1`` the k-mer commands of all fragment
    lengths are collected and run through ``parallel.runCmdParallel``;
    otherwise each one is executed immediately via ``os.system``.

    Exits the process with status 1 if a concatenation step or a serially
    executed k-mer command returns a non-zero status, and with status -1 if
    ``parallel.reportFailedCmd`` reports a failure (returns something other
    than ``None``).
    """
    my_log = logging.getLogger('train:generate_kmers')
    settings = self.config.settings
    fasta2kmers_command = utils.fasta2kmers_base(self.config, self.workingdir)
    fasta2kmers2_command = utils.fasta2kmers2_base(self.config, self.workingdir)
    use_fasta2kmers2 = int(settings["kmer_normalization"]) <= 1
    if use_fasta2kmers2:
        my_log.debug('basic command for generating kmers:\n{} -i INPUT -f OUTPUT'.format(fasta2kmers2_command))
    else:
        my_log.debug('basic command for generating kmers:\n{} INPUT >> OUTPUT'.format(fasta2kmers_command))

    run_parallel = settings["processors"] > 1
    tasks = []
    for fl in settings["fragment_len"]:
        p = "{d}{sep}sampled_fasta{sep}{fl}".format(d=settings["project_dir"], fl=str(fl), sep=os.path.sep)
        combined_fasta = os.path.join(p, "{}.all.fna".format(fl))
        outfile = "{d}{sep}train_data{sep}{fl}.sl".format(d=settings["project_dir"], fl=str(fl), sep=os.path.sep)

        # Concatenate into a temporary name that does not match *.fna, drop the
        # inputs, then move the result into place.  Every step is checked: if
        # "cat" fails, the following "rm" must not delete the only copies of the
        # sampled sequences.
        prepare_cmds = (
            "cat {dir}{sep}*.fna > {dir}{sep}{fl}.all.tmp".format(dir=p, fl=fl, sep=os.path.sep),
            "rm {dir}{sep}*.fna".format(dir=p, sep=os.path.sep),
            "mv {p}{sep}{fl}.all.tmp {combined}".format(p=p, fl=fl, combined=combined_fasta, sep=os.path.sep),
        )
        for prepare_cmd in prepare_cmds:
            if os.system(prepare_cmd) != 0:
                my_log.critical("problem with concatenating fasta files..:\n{}".format(prepare_cmd))
                sys.exit(1)

        if use_fasta2kmers2:
            command = "{cmd} -i {combined} -f {out}".format(cmd=fasta2kmers2_command, combined=combined_fasta, out=outfile)
        else:
            command = "{cmd} {combined} >> {out}".format(cmd=fasta2kmers_command, combined=combined_fasta, out=outfile)

        if run_parallel:
            # one command per fragment length, each with its own output file
            tasks.append(parallel.TaskCmd(command))
        elif os.system(command) != 0:
            my_log.critical("problem with generating kmers..:\n{}".format(command))
            sys.exit(1)

    if run_parallel:
        failed = parallel.reportFailedCmd(parallel.runCmdParallel(tasks, settings["processors"]))
        if failed is not None:
            sys.exit(-1)