def fimo(motif, bg_file=None, fasta_file=None, tempdir=None,
         motifdatabase=None, thresh=None, largewindow=None):
    '''Run fimo for a single motif on a fasta file and parse its stdout.

    The fimo output is captured in memory (nothing is written to disk) and
    handed to fimo_parse_stdout together with the sequence names obtained
    from fasta_names.

    Parameters
    ----------
    motif : string
        the name of a motif that matches a motif within motifdatabase

    bg_file : string or None
        full path to a markov background model. When None, the --bgfile
        option is not passed to fimo at all.

    fasta_file : string
        full path to a fasta file that fimo will perform motif scanning on

    tempdir : string
        unused by this function; kept so existing callers that pass it
        keep working

    motifdatabase : string
        full path to a motif database file in meme format

    thresh : number or string
        value given to fimo's --thresh option (converted with str())

    largewindow : object
        forwarded unchanged to fimo_parse_stdout

    Returns
    -------
    result : list
        [motif] followed by the elements of the list returned by
        fimo_parse_stdout

    Raises
    ------
    exceptions.SubprocessError
        if fimo exits with a non-zero status; the message is fimo's stderr
    '''
    # Build the command once; the background model is the only optional part.
    command = ["fimo", "--skip-matched-sequence", "--verbosity", "1",
               "--thresh", str(thresh)]
    if bg_file is not None:
        command += ["--bgfile", bg_file]
    command += ["--motif", motif, motifdatabase, fasta_file]

    try:
        fimo_stdout = subprocess.check_output(
            command, stderr=subprocess.PIPE).decode('UTF-8')
    except subprocess.CalledProcessError as e:
        raise exceptions.SubprocessError(e.stderr.decode()) from e

    names = fasta_names(fastafile=fasta_file)
    distances = fimo_parse_stdout(fimo_stdout=fimo_stdout,
                                  largewindow=largewindow,
                                  names=names)
    return [motif] + distances
def fasta_markov(tempdir=None, fastafile=None, order=None):
    '''Run meme's fasta-get-markov to build a background markov model file
    (for use with fimo) from a fasta file.

    Parameters
    ----------
    tempdir : pathlib.Path
        temp directory in the output directory (created by TFEA); the model
        is written to "markov_background.txt" inside it

    fastafile : string
        full path to fasta file that will be used to generate the markov
        background model file

    order : string or int
        the order of the markov model, given to fasta-get-markov's -m
        option (converted with str())

    Returns
    -------
    markov_background : pathlib.Path
        path to the generated background model file

    Raises
    ------
    exceptions.SubprocessError
        if fasta-get-markov exits with a non-zero status; the message is
        the tool's stderr
    '''
    markov_background = tempdir / "markov_background.txt"
    with open(markov_background, 'w') as output:
        try:
            # check=True is required: without it subprocess.run never raises
            # CalledProcessError and a failed run would go unnoticed.
            subprocess.run(["fasta-get-markov", "-m", str(order), fastafile],
                           stdout=output, stderr=subprocess.PIPE, check=True)
        except subprocess.CalledProcessError as e:
            raise exceptions.SubprocessError(e.stderr.decode()) from e
    return markov_background
def create_directories(srcdirectory=None):
    '''Prepare the output directories and, if requested, submit TFEA to
    SLURM via sbatch.

    The behavior depends on config.vars['SBATCH']:

    * equal to False (no sbatch flag): output directories are created, the
      rerun script and 'inputs.txt' are written, and config.vars['JOBID']
      is set to 0.
    * 'SUBMITTED' (internal flag): make_out_directories is called with
      create=False and config.vars['JOBID'] is read from 'jobid.txt' in
      config.vars['TEMPDIR'].
    * anything else (--sbatch specified): directories and bookkeeping files
      are written as in the first case, the current command line is
      re-submitted through sbatch with the --sbatch value replaced by
      'SUBMITTED', the job id reported by sbatch is stored in 'jobid.txt',
      and the process exits.

    Parameters
    ----------
    srcdirectory : pathlib.Path
        directory containing 'main.sbatch'; only used when submitting

    Returns
    -------
    None

    Raises
    ------
    exceptions.SubprocessError
        if sbatch exits with a non-zero status; the message is its stderr
    '''
    from TFEA import config

    sbatch = config.vars['SBATCH']

    # Equality (not identity/truthiness) is kept on purpose so exactly the
    # same values as before select this branch.
    if sbatch == False:  # noqa: E712  -- no sbatch flag
        make_out_directories(create=True)
        write_rerun(args=sys.argv, outputdir=config.vars['OUTPUT'])
        write_vars(config_vars=config.vars,
                   outputfile=config.vars['OUTPUT'] / 'inputs.txt')
        config.vars['JOBID'] = 0
        return

    if str(sbatch) == 'SUBMITTED':  # internal flag
        make_out_directories(create=False)
        jobid_file = config.vars['TEMPDIR'] / 'jobid.txt'
        config.vars['JOBID'] = jobid_file.read_text().strip('\n')
        return

    # --sbatch specified: write bookkeeping files, then submit and exit.
    make_out_directories(create=True)
    write_rerun(args=sys.argv, outputdir=config.vars['OUTPUT'])
    write_vars(config_vars=config.vars,
               outputfile=config.vars['OUTPUT'] / 'inputs.txt')

    script = srcdirectory / 'main.sbatch'
    # The --sbatch value is used as the sbatch --mail-user address.
    email = str(sbatch)
    e_and_o = config.vars['E_AND_O']
    output_name = config.vars['OUTPUT'].name
    error_file = e_and_o / ('TFEA_' + output_name + '.err')

    # Work on a copy so the process-wide sys.argv is not modified.
    args = list(sys.argv)
    if '--sbatch' in args:
        args[args.index('--sbatch') + 1] = 'SUBMITTED'
    else:
        args.extend(['--sbatch', 'SUBMITTED'])

    if '--venv' in args:
        venv = args[args.index('--venv') + 1]
    else:
        venv = '.'

    sbatch_command = [
        "sbatch",
        "--error=" + (e_and_o / "%x.err").as_posix(),
        "--output=" + (e_and_o / "%x.out").as_posix(),
        "--mail-user=" + email,
        "--export=cmd=" + ' '.join(args) + ',' + 'venv=' + venv,
        "--job-name=TFEA_" + output_name,
        "--ntasks=" + str(config.vars['CPUS']),
        "--mem=" + str(config.vars['MEM']),
        "--time=" + str(config.vars['TIME']),
        "--partition=" + str(config.vars['PARTITION']),
        script,
    ]
    try:
        sbatch_out = subprocess.run(sbatch_command,
                                    stderr=subprocess.PIPE,
                                    stdout=subprocess.PIPE,
                                    check=True)
    except subprocess.CalledProcessError as e:
        raise exceptions.SubprocessError(e.stderr.decode()) from e

    # The last whitespace-separated token of sbatch's stdout is stored as
    # the job id.
    (config.vars['TEMPDIR'] / 'jobid.txt').write_text(
        sbatch_out.stdout.decode().split()[-1])
    print(("TFEA has been submitted using an sbatch script. \nIt can be "
           "monitored using:\ntail -f " + error_file.as_posix()))
    sys.exit()
def mumerge(input_file, output_basename, bed1=None, bed2=None, label1=None,
            label2=None,
            mumerge_path=Path(__file__).absolute().parent / 'mumerge.py'):
    '''Write a MuMerge input file and run MuMerge, a script written by
    Jacob T. Stanley that merges a list of bed files in a probabilistic way.

    Parameters
    ----------
    input_file : path to .txt file
        Location where the MuMerge input file is written (any existing file
        is overwritten). It is formatted according to MuMerge
        specifications. From doc:
            Input file containing bedfiles, sample ID's, and replicate
            groupings. Input file (indicated by the '-i' flag) should be of
            the following (tab delimited) format:

            #file    sampid    group
            /full/file/path/filename1.bed    sampid1    A
            /full/file/path/filename2.bed    sampid2    B
            ...

            Header line indicated by '#' character must be included and
            fields must follow the same order as non-header lines. The
            order of subsequent lines does matter. 'group' identifiers
            should group files that are technical/biological replicates.
            Different experimental conditions should receive different
            'group' identifiers. The 'group' identifier can be of type
            'int' or 'str'. If 'sampid' is not specified, then default
            sample ID's will be used.

    output_basename : path to output file without file extension
        From doc:
            Output file basename (full path, sans extension).
            WARNING: will overwrite any existing file)

    bed1, bed2 : iterable of paths or None
        bed files of the first and second group. None (the default) is
        treated as an empty group.

    label1, label2 : string
        group identifier of each group; the sample id of the i-th file
        (counting from 1) is the label followed by i

    mumerge_path : path
        location of the mumerge.py script, run with 'python3'

    Returns
    -------
    combined_file : pathlib.Path
        output_basename with '_MUMERGE.bed' appended

    Raises
    ------
    exceptions.SubprocessError
        if MuMerge exits with a non-zero status; the message is its stderr
    '''
    # None instead of a shared mutable list as the default value.
    groups = ((label1, () if bed1 is None else bed1),
              (label2, () if bed2 is None else bed2))

    with open(input_file, 'w') as F:
        F.write("#file\tsampid\tgroup\n")
        for label, bedpaths in groups:
            for i, bedpath in enumerate(bedpaths, 1):
                F.write(f'{bedpath}\t{label}{i}\t{label}\n')

    mumerge_command = ['python3', mumerge_path,
                       '-i', input_file,
                       '-o', output_basename]
    try:
        subprocess.check_output(mumerge_command, stderr=subprocess.PIPE)
    except subprocess.CalledProcessError as e:
        raise exceptions.SubprocessError(e.stderr.decode()) from e

    combined_file = Path(str(output_basename) + '_MUMERGE.bed')
    return combined_file
def meme_logo(motif_file, motif_ID, figuredir, plot_format=None):
    '''Create logo images for one motif.

    Runs meme2images (with -rc and -eps) for motif_ID into figuredir, then
    runs ImageMagick's convert on 'logo<ID>.eps' and 'logo_rc<ID>.eps' in
    figuredir to produce the matching .png files, where <ID> is motif_ID
    with every '.' replaced by '_'.

    Parameters
    ----------
    motif_file : path
        motif database handed to meme2images

    motif_ID : string
        name of the motif to draw

    figuredir : pathlib.Path
        directory that receives the .eps and .png files

    plot_format : object
        accepted but not used by this function

    Returns
    -------
    None

    Raises
    ------
    exceptions.SubprocessError
        if any of the three commands exits with a non-zero status
    '''
    # meme2images receives the motif name exactly as given ...
    commands = [['meme2images', '-rc', '-eps', '-motif', motif_ID,
                 motif_file, figuredir]]

    # ... while the image file names use the id with dots replaced.
    file_id = motif_ID.replace('.', '_')
    for prefix in ('logo', 'logo_rc'):
        stem = prefix + file_id
        commands.append(['convert',
                         figuredir / (stem + '.eps'),
                         figuredir / (stem + '.png')])

    try:
        for cmd in commands:
            subprocess.check_output(cmd, stderr=subprocess.PIPE)
    except subprocess.CalledProcessError as e:
        raise exceptions.SubprocessError(e.stderr.decode())
    return None
def getfasta(bedfile=None, genomefasta=None, tempdir=None, outname=None):
    '''Extract the sequences of a bed file's regions into a fasta file
    using "bedtools getfasta".

    Parameters
    ----------
    bedfile : string
        full path to a bed file

    genomefasta : string
        full path to a fasta file for the genome of interest

    tempdir : pathlib.Path
        temp directory in output directory (created by TFEA); the fasta
        file is written inside it

    outname : string
        file name of the fasta file within tempdir

    Returns
    -------
    fasta_file : pathlib.Path
        tempdir / outname, the file given to bedtools as its -fo output

    Raises
    ------
    exceptions.SubprocessError
        if bedtools exits with a non-zero status; the message is its stderr
    '''
    fasta_file = tempdir / outname
    bedtools_args = ["bedtools", "getfasta"]
    bedtools_args += ["-fi", genomefasta]
    bedtools_args += ["-bed", bedfile]
    bedtools_args += ["-fo", fasta_file]

    try:
        subprocess.run(bedtools_args,
                       stdout=subprocess.PIPE,
                       stderr=subprocess.PIPE,
                       check=True)
    except subprocess.CalledProcessError as e:
        raise exceptions.SubprocessError(e.stderr.decode())
    else:
        return fasta_file
def bedtools_closest(motif, genomehits=None, ranked_center_file=None,
                     tempdir=None, distance_cutoff=None, rank_index=None):
    '''Find, for every region in ranked_center_file, the distance to the
    closest hit of one motif using "bedtools closest".

    Parameters
    ----------
    motif : string
        file name of the motif hits bed file inside genomehits

    genomehits : pathlib.Path
        directory containing the motif hits bed files

    ranked_center_file : path
        bed file given to bedtools as -a (TFEA provides the centers of the
        inputted regions)

    tempdir : pathlib.Path
        directory for the temporary '<motif>.closest.bed' file, which is
        removed again before returning

    distance_cutoff : int
        distances whose absolute value exceeds this are reported as '.'

    rank_index : int or None
        when given, column index in the bedtools output holding the rank
        (the integer after the last ',' in that column); the distances are
        returned ordered by that rank. When None the bedtools output order
        is kept.

    Returns
    -------
    result : list
        the motif name (with a trailing '.bed' removed) followed by one
        entry per region: an int distance or '.'. If the motif hits file is
        empty, every entry is '.'.

    Raises
    ------
    exceptions.SubprocessError
        if bedtools exits with a non-zero status
    Exception
        any other error is re-raised after its traceback has been printed
    '''
    # Remove only a literal '.bed' suffix. str.strip('.bed') would strip any
    # of the characters '.', 'b', 'e', 'd' from both ends of the name.
    suffix = '.bed'
    motif_name = motif[:-len(suffix)] if motif.endswith(suffix) else motif

    try:
        motif_path = genomehits / motif
        if os.stat(motif_path).st_size == 0:
            # No hits at all: one '.' per region (non-blank line), matching
            # the one-entry-per-region result of the normal path.
            with open(ranked_center_file) as F:
                region_count = sum(1 for line in F if line.strip())
            return [motif_name] + ['.'] * region_count

        command = ("bedtools", "closest", "-D", "ref", "-t", "first",
                   "-a", ranked_center_file, "-b", motif_path)
        closest_out = tempdir / (motif + '.closest.bed')
        try:
            closest_out.write_bytes(
                subprocess.check_output(command, stderr=subprocess.PIPE))
        except subprocess.CalledProcessError as e:
            raise exceptions.SubprocessError(e.stderr.decode()) from e

        distances = []
        ranks = []
        try:
            with open(closest_out) as F:
                for line in F:
                    linelist = line.strip('\n').split('\t')
                    # bedtools closest -D appends the distance as last column
                    distance = int(linelist[-1])
                    if rank_index is not None:
                        ranks.append(int(linelist[rank_index].split(',')[-1]))
                    if abs(distance) <= distance_cutoff:
                        distances.append(distance)
                    else:
                        distances.append('.')
        finally:
            # Remove the temporary file even if parsing failed.
            closest_out.unlink()

        if rank_index is not None:
            # Sort on the rank only, so tied ranks never compare an int
            # distance with the '.' placeholder.
            ordered = sorted(zip(ranks, distances), key=lambda pair: pair[0])
            distances = [distance for _, distance in ordered]
    except Exception:
        # Print the stack trace of the exception being handled before
        # passing it on to the caller.
        traceback.print_exc()
        raise
    return [motif_name] + distances