Exemplo n.º 1
0
def main0():
	"""Run the SpectraST library-building steps inside ``output_directory``.

	Steps, in order:
	1. create the output directory and record ``allcmds`` in ``cmds.txt``;
	2. obtain the minimum probability from ``get_pep_ion_minprob`` (using
	   ``Filter_option.all``) and run the command built by ``spectrast_cmd``
	   (also recorded in ``cmds2.txt``);
	3. filter proteins and, for DIA-Umpire output, modify the splib file;
	4. run shell command groups part1, (optionally) part2 and part3.

	Raises:
		subprocess.CalledProcessError: if any step other than part2 fails.
	"""
	output_directory.mkdir(exist_ok=overwrite)
	print("running:\n" + allcmds)
	(output_directory / "cmds.txt").write_text(allcmds)

	# Filter_option.by_2D_filtering is the alternative that is not used here.
	min_prob = get_pep_ion_minprob(Filter_option.all)

	import_cmd_text = " ".join(spectrast_cmd(min_prob))
	print(f"running:\n{import_cmd_text}\n…")
	(output_directory / "cmds2.txt").write_text(import_cmd_text)
	subprocess.run(spectrast_cmd(min_prob), cwd=os_fspath(output_directory), check=True)

	print("take only proteins from philosopher’s proteins.fas…")
	filter_proteins(fasta, decoy_prefix)
	if is_DIA_Umpire_output:
		print("modifying splib file to combine Q[123] from DIA-umpire…")
		modify_splib()

	def run_in_shell(commands, must_succeed):
		# Every command group runs through the shell from the output directory.
		return subprocess.run(adjust_command(commands), shell=True,
							  cwd=os_fspath(output_directory), check=must_succeed)

	run_in_shell(spectrast_cmds_part1, True)

	# Part2 is allowed to fail; when it fails, or is not requested at all,
	# the 'con001' library is moved to the final library name instead.
	promote_unaligned = True
	if align_with_iRT:
		promote_unaligned = run_in_shell(spectrast_cmds_part2, False).returncode != 0
	if promote_unaligned:
		shutil.move(output_directory / 'output_file_irt_con001.splib', output_directory / 'output_file_irt_con.splib')

	run_in_shell(spectrast_cmds_part3, True)
Exemplo n.º 2
0
def main0():
	"""Run the SpectraST library-building steps inside ``output_directory``.

	Steps, in order:
	1. create the output directory, print the banner and record ``allcmds``
	   in ``cmds.txt``;
	2. obtain the minimum probability from ``get_pep_ion_minprob``, passing
	   the saved filter log text when ``skip_philosopher_filter`` is set;
	3. write ``spectrast.usermods`` and run the command built by
	   ``spectrast_cmd`` (also recorded in ``cmds2.txt``);
	4. filter proteins and, for DIA-Umpire output, modify the splib file;
	5. run shell command groups part1, (optionally) part2 and part3.

	Raises:
		subprocess.CalledProcessError: if any step other than part2 fails.
	"""
	output_directory.mkdir(exist_ok=overwrite)
	banner = f"Spectral library building\nCommands to execute:\n{allcmds}\n{'~' * 69}"
	print(banner, flush=True)
	(output_directory / "cmds.txt").write_text(allcmds)

	# Filter_option.by_2D_filtering is the alternative that is not used here.
	saved_filter_log = philosopher_filter_log_path.read_text() if skip_philosopher_filter else None
	min_prob = get_pep_ion_minprob(Filter_option.all, saved_filter_log)

	# http://tools.proteomecenter.org/wiki/index.php?title=Software:SpectraST#User-defined_Modifications
	# http://tools.proteomecenter.org/wiki/index.php?title=Spectrast.usermods
	# Entries that were tried but are not written:
	#   c|-0.984016|Amidated
	#   c[c[17]]|-0.984016|Amidated
	usermod_lines = (
		'M|+16|',
		'C|+57|',
		'n|+42|',
		'C|119.004099|Cysteinyl',
		'c|-0.02|AmidatedCorrected',
	)
	(output_directory / 'spectrast.usermods').write_text('\n'.join(usermod_lines) + '\n')

	import_cmd_text = " ".join(spectrast_cmd(min_prob))
	print(f'Executing:{import_cmd_text}\n')
	(output_directory / "cmds2.txt").write_text(import_cmd_text)
	subprocess.run(spectrast_cmd(min_prob), cwd=os_fspath(output_directory), check=True)

	filter_proteins(fasta, decoy_prefix)
	if is_DIA_Umpire_output:
		print("modifying splib file to combine Q[123] from DIA-umpire")
		modify_splib()

	def promote_unaligned_library():
		# Move the 'con001' library to the final library name.
		shutil.move(output_directory / 'output_file_irt_con001.splib', output_directory / 'output_file_irt_con.splib')

	print(f'Executing:{spectrast_cmds_part1}\n')
	subprocess.run(adjust_command(spectrast_cmds_part1), shell=True, cwd=os_fspath(output_directory), check=True)

	if not align_with_iRT:
		promote_unaligned_library()
	else:
		print(f'Executing:{spectrast_cmds_part2}\n')
		# Part2 is allowed to fail, so its combined output is captured
		# instead of raising on a non-zero exit status.
		cp = subprocess.run(adjust_command(spectrast_cmds_part2), shell=True,
							cwd=os_fspath(output_directory), check=False,
							stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
		if cp.returncode == 0:
			print(cp.stdout.decode())
			print('iRT alignment done\n')
		else:
			print('Skipping iRT alignment\n')
			(output_directory / 'spectrast2spectrast_irt.log').write_bytes(cp.stdout)
			promote_unaligned_library()

	spectrast2tsv_additional_mods_path.write_text(spectrast2tsv_additional_mods_tsv_txt)
	print(f'Executing:{spectrast_cmds_part3}\n')
	subprocess.run(adjust_command(spectrast_cmds_part3), shell=True, cwd=os_fspath(output_directory), check=True)
Exemplo n.º 3
0
def main_easypqp():
    """Build a spectral library with EasyPQP inside ``output_directory``.

    Steps, in order:
    1. create the output directory and materialise the iRT reference file
       selected by ``irt_choice``;
    2. print and record ``allcmds`` in ``cmds.txt``;
    3. run every command in ``easypqp_convert_cmds`` with at most ``nproc``
       running at once, each logging to ``easypqp_convert_<i>.log``;
    4. echo the log of every failed conversion and abort if any failed;
    5. run the command from ``easypqp_library_cmd(use_iRT)``.

    Raises:
        AssertionError: if any conversion exits with a non-zero status.
        SystemExit: if the library command fails.
    """
    output_directory.mkdir(exist_ok=overwrite)
    if irt_choice is Irt_choice.iRT:
        # NOTE(review): pandas renamed `line_terminator` to `lineterminator`
        # in 1.5 and removed the old spelling in 2.0 — confirm the pandas
        # version this script is pinned to before changing it.
        irt_df.to_csv(irt_file, index=False, sep='\t', line_terminator='\n')
    elif irt_choice is Irt_choice.ciRT:
        shutil.copyfile(script_dir / 'hela_irtkit.tsv', irt_file)
    elif irt_choice is Irt_choice.userRT:
        shutil.copyfile(userRT_file, irt_file)
    print(f'''Spectral library building
Commands to execute:
{allcmds}
{'~' * 69}''',
          flush=True)
    (output_directory / "cmds.txt").write_text(allcmds)
    subprocess.run([os.fspath(easypqp), '--version'], check=True)

    procs = []
    for i, e in enumerate(easypqp_convert_cmds):
        # Throttle: wait until fewer than `nproc` conversions are running.
        while sum(p.poll() is None for p in procs) >= nproc:
            time.sleep(1)
        # The child keeps its own copy of the log handle, so the parent's
        # handle is closed as soon as the process has been started.
        with open(output_directory / f'easypqp_convert_{i}.log',
                  'w') as log_file:
            procs.append(
                subprocess.Popen(e,
                                 cwd=os_fspath(output_directory),
                                 stdout=log_file,
                                 stderr=subprocess.STDOUT))
        print(f'Executing {e}')

    for p in procs:
        p.wait()
    for i, p in enumerate(procs):
        if p.returncode != 0:
            print("EasyPQP convert error BEGIN")
            try:
                with open(output_directory /
                          f'easypqp_convert_{i}.log') as log_file:
                    print(log_file.read(), end="")
            except OSError as err:
                # Best effort: a missing/unreadable log must not hide the
                # conversion failure itself.
                print(err)
            print("EasyPQP convert error END")
    assert all(p.returncode == 0 for p in procs)
    try:
        subprocess.run(easypqp_library_cmd(use_iRT),
                       cwd=os_fspath(output_directory),
                       check=True)
    except subprocess.CalledProcessError:
        print(
            '''Library not generated, not enough peptides could be found for alignment.
Please try using other options for alignment (e.g. ciRT if used other options)'''
        )
        # NOTE(review): sys.exit() without an argument exits with status 0
        # even though no library was produced — confirm callers rely on this.
        sys.exit()
Exemplo n.º 4
0
def spectrast_cmd(prob):
    """Return the SpectraST argument list for the given probability cut-off.

    The list starts with the SpectraST executable and its options (``prob``
    is embedded in the ``-cP`` option, and the ``-cN`` option points at
    ``input000`` inside ``output_directory``), followed by every path in
    ``iproph_pep_xmls``.
    """
    library_stem = output_directory / 'input000'
    options = [
        os_fspath(SPECTRAST_PATH),
        "-c_BIN!",
        f"-cP{prob}",
        "-cIHCD",
        f"-cN{library_stem}",
    ]
    pep_xml_args = [os_fspath(pep_xml) for pep_xml in iproph_pep_xmls]
    return options + pep_xml_args
Exemplo n.º 5
0
def table_from_pep_xml(infile: pathlib.Path):
    """Tabulate the confident peptide identifications of one pepXML file.

    The spectrum file is located from the ``base_name`` attribute of the
    ``msms_run_summary`` elements (with an ``.mzXML`` suffix). If that file
    does not exist, the path stored in the ``spectrum, path`` search
    parameter is used instead and ``infile`` is rewritten in place so that
    every ``base_name`` attribute points at it.

    Args:
        infile: path of the pepXML file to read (and possibly patch).

    Returns:
        A ``pandas.DataFrame`` with columns ``FullUniModPeptideName``,
        ``PrecursorCharge`` and ``Tr_recalibrated``, one row per distinct
        tuple whose probability exceeds ``PEPTIDE_PROB``.

    Raises:
        ValueError: if the run summaries do not resolve to exactly one
            spectrum file, or the fallback parameter is not unique.
        FileNotFoundError: if the fallback spectrum path does not exist.
    """
    import re

    tree = lxml.etree.parse(os_fspath(infile))

    run_summaries = tree.findall(
        "/{http://regis-web.systemsbiology.net/pepXML}msms_run_summary")
    try:
        (msms_file, ) = {
            pathlib.Path(run_summary.get("base_name")).with_suffix(
                ".mzXML").resolve(strict=True)
            for run_summary in run_summaries
        }
    except FileNotFoundError:
        [spectrum_path] = tree.findall(
            # "/{http://regis-web.systemsbiology.net/pepXML}msms_pipeline_analysis"
            "/{http://regis-web.systemsbiology.net/pepXML}msms_run_summary"
            "/{http://regis-web.systemsbiology.net/pepXML}search_summary"
            "/{*}parameter[@name='spectrum, path']")
        msms_file = pathlib.Path(
            spectrum_path.get("value")).resolve(strict=True)
        new_attr = f'base_name="{msms_file}" '
        # A callable replacement is used so that backslashes in the path
        # (e.g. Windows paths) are inserted literally instead of being
        # parsed as regex replacement escapes.
        patched = re.compile('base_name="(.+?)" ').sub(
            lambda _match: new_attr, infile.read_text('utf-8'))
        # Write back with the same encoding the file was read with.
        infile.write_text(patched, 'utf-8')

    scannum_to_rt = get_scannum_to_rt(msms_file)
    gen = (get_pep(ee, scannum_to_rt)
           for ee in tree.findall("/{*}msms_run_summary/{*}spectrum_query"))
    p = {(FullUniModPeptideName, PrecursorCharge, rt)
         for probability, FullUniModPeptideName, PrecursorCharge, rt in gen
         if probability > PEPTIDE_PROB}
    colnames = ["FullUniModPeptideName", "PrecursorCharge", "Tr_recalibrated"]
    # zip(colnames, *p) transposes the row tuples into one list per column
    # and still yields (empty) columns when no row passed the cut-off.
    return pd.DataFrame({colname: e for colname, *e in zip(colnames, *p)})
Exemplo n.º 6
0
def get_prot_group_infos(p: pathlib.Path):
	"""Summarise every ``protein`` entry found under a ``protein_group``.

	Args:
		p: path of the XML file to parse.

	Returns:
		A list with one tuple per protein:
		``(protein_name, [indistinguishable protein names],
		(number of peptide children, sum of their nsp_adjusted_probability))``.
	"""
	import lxml.etree

	def summarize_peptides(protein):
		# Count the peptide children and total their adjusted probabilities.
		peptides = protein.findall("{*}peptide")
		prob_total = sum(float(peptide.get("nsp_adjusted_probability")) for peptide in peptides)
		return (len(peptides), prob_total)

	document_root = lxml.etree.parse(os_fspath(p)).getroot()
	infos = []
	for protein in document_root.iterfind(".//{*}protein_group/{*}protein"):
		alternative_names = [alt.get('protein_name') for alt in protein.findall("{*}indistinguishable_protein")]
		infos.append((protein.get('protein_name'), alternative_names, summarize_peptides(protein)))
	return infos
Exemplo n.º 7
0
def get_pep_ion_minprob(opt: Filter_option, philosopher_filter_log: str = None):
	"""Return the ion probability threshold selected by ``opt``.

	When ``philosopher_filter_log`` is given, the value is parsed from that
	text by ``get_pep_ion_minprob_from_log`` and nothing is executed.
	Otherwise ``phi_cmd_part1`` is run; its log stream is echoed to stdout,
	saved to ``phi_log`` and scanned for the ``Ions ... threshold`` values.

	If ``use_philosopher_fo`` is set and the command exits with status 1,
	dummy ``proteins.fas`` and ``psm.tsv`` files are generated from
	``.meta/pep_pro_mappings.tsv``; otherwise the command must have
	succeeded and ``phi_cmd_part2`` is run.

	Args:
		opt: its ``value`` indexes the list of thresholds found in the log.
		philosopher_filter_log: text of a previously saved log, or ``None``.

	Raises:
		RuntimeError: on platforms other than linux and win32.
		AssertionError: if ``phi_cmd_part1`` failed unexpectedly.
		subprocess.CalledProcessError: if ``phi_cmd_part2`` fails.
		IndexError: if fewer thresholds than expected were found.
	"""
	if philosopher_filter_log is not None:
		return get_pep_ion_minprob_from_log(opt, philosopher_filter_log)

	# The log is read from stderr on linux and from stdout on win32.
	# Other platforms previously fell through both branches and crashed
	# with an unbound `proc1`; fail early with a clear message instead.
	log_stream_name = {'linux': 'stderr', 'win32': 'stdout'}.get(sys.platform)
	if log_stream_name is None:
		raise RuntimeError(f'unsupported platform: {sys.platform!r}')

	outl = []
	console = sys.stdout.buffer
	### get 2D FDR
	with subprocess.Popen(adjust_command(phi_cmd_part1), shell=True, cwd=os_fspath(output_directory),
						  **{log_stream_name: subprocess.PIPE}) as proc1, \
		phi_log.open('wb') as f2:
		for line in getattr(proc1, log_stream_name):
			# Echo and persist each line immediately so progress is visible.
			console.write(line)
			console.flush()
			f2.write(line)
			f2.flush()
			outl.append(line)

	# proc1.returncode is set once the `with` block has waited for the process.
	if use_philosopher_fo and proc1.returncode == 1:
		a = (output_directory/'.meta'/'pep_pro_mappings.tsv').read_text()
		# Records are split in front of every 'sp|'; the first tab-separated
		# field of a record is mapped from each of the remaining fields.
		l = [e.split('\t') for e in re.compile('(?=sp\\|)').split(a)]
		d = {ee2: e for e, *e2 in l for ee2 in e2}
		##create dummy fasta
		with (output_directory / 'proteins.fas').open('x') as fasta_out:
			for prot in sorted(set(d.values())):
				fasta_out.write(f'>{prot}\nDUMMY\n')
		# create dummy psm.tsv
		(output_directory / 'psm.tsv').write_text(
			'Peptide\tProtein\n' +
			'\n'.join(f'{ee2}\t{e}' for e, *e2 in l for ee2 in e2)
		)
	else:
		assert proc1.returncode == 0, [proc1.args, proc1.returncode]
		## filter original fasta file
		subprocess.run(phi_cmd_part2, shell=True, stderr=subprocess.STDOUT, cwd=os_fspath(output_directory), check=True)

	# Lines starting with '+' are excluded before the thresholds are parsed.
	out = b"".join(line for line in outl if not line.startswith(b"+"))
	outtxt = out.decode("ascii")
	res2 = [float(e) for e in re.compile(' Ions.+threshold.*?=([0-9.]+)').findall(outtxt)]
	return res2[opt.value]
Exemplo n.º 8
0
def get_pep_ion_minprob(opt: Filter_option):
    """Return the ion probability threshold selected by ``opt``.

    ``phi_cmd_part1`` is run; its log stream is echoed to stdout, saved to
    ``phi_log`` and scanned for the ``Ions ... threshold`` values. The
    command must succeed, after which ``phi_cmd_part2`` is run.

    Args:
        opt: its ``value`` indexes the list of thresholds found in the log.

    Raises:
        RuntimeError: on platforms other than linux and win32.
        AssertionError: if ``phi_cmd_part1`` exits with a non-zero status.
        subprocess.CalledProcessError: if ``phi_cmd_part2`` fails.
        IndexError: if fewer thresholds than expected were found.
    """
    # The log is read from stderr on linux and from stdout on win32.
    # Other platforms previously fell through both branches and crashed
    # with an unbound `proc1`; fail early with a clear message instead.
    log_stream_name = {'linux': 'stderr', 'win32': 'stdout'}.get(sys.platform)
    if log_stream_name is None:
        raise RuntimeError(f'unsupported platform: {sys.platform!r}')

    outl = []
    console = sys.stdout.buffer
    ### get 2D FDR
    with subprocess.Popen(adjust_command(phi_cmd_part1),
                          shell=True,
                          cwd=os_fspath(output_directory),
                          **{log_stream_name: subprocess.PIPE}) as proc1, \
     phi_log.open('wb') as f2:
        for line in getattr(proc1, log_stream_name):
            # Echo and persist each line immediately so progress is visible.
            console.write(line)
            console.flush()
            f2.write(line)
            f2.flush()
            outl.append(line)

    # proc1.returncode is set once the `with` block has waited for the process.
    assert proc1.returncode == 0, [proc1.args, proc1.returncode]
    ## filter original fasta file
    subprocess.run(phi_cmd_part2,
                   shell=True,
                   stderr=subprocess.STDOUT,
                   cwd=os_fspath(output_directory),
                   check=True)

    # Lines starting with '+' are excluded before the thresholds are parsed.
    out = b"".join(line for line in outl if not line.startswith(b"+"))
    outtxt = out.decode("ascii")
    res2 = [
        float(e)
        for e in re.compile(' Ions.+threshold.*?=([0-9.]+)').findall(outtxt)
    ]
    return res2[opt.value]
Exemplo n.º 9
0
def get_scannum_to_rt(msms_file: pathlib.Path):
    """Build a scan-number -> retention-time lookup array from an mzXML file.

    Every ``<scan>`` element contributes its ``num`` attribute (the index)
    and the numeric part of its ``retentionTime="PT...S"`` attribute (the
    value).

    Args:
        msms_file: path of the mzXML file; the suffix is checked
            case-insensitively.

    Returns:
        A 1-D ``float32`` array of length ``max(scan number) + 1`` where
        index ``i`` holds the retention time of scan ``i`` and ``nan`` where
        no such scan exists.

    Raises:
        AssertionError: on a wrong file suffix or an unexpected
            ``retentionTime`` format.
        ValueError: if the file contains no ``<scan>`` element.
    """
    assert msms_file.suffix.casefold() == '.mzXML'.casefold()
    from xml.dom import pulldom
    doc = pulldom.parse(os_fspath(msms_file))
    scannums, rts = [], []
    try:
        for event, node in doc:
            if event == pulldom.START_ELEMENT and node.tagName == "scan":
                rt_str = node.getAttribute("retentionTime")
                assert rt_str.startswith("PT") and rt_str.endswith("S")
                scannums.append(int(node.getAttribute("num")))
                # Strip the leading "PT" and the trailing "S".
                rts.append(float(rt_str[2:-1]))
    finally:
        # Release the file handle even when parsing or a check above fails.
        doc.stream.close()

    if not scannums:
        raise ValueError(f"no <scan> elements found in {msms_file}")

    scannum_to_rt = np.full((max(scannums) + 1, ), np.nan, dtype=np.float32)
    # Assign one by one so that, for a repeated scan number, the last
    # occurrence in the file wins deterministically.
    for scannum, rt in zip(scannums, rts):
        scannum_to_rt[scannum] = rt

    return scannum_to_rt
Exemplo n.º 10
0
        return m[1]

    import pandas as pd, pathlib
    t = pd.read_table("output_irt_con.tsv")
    philosopher_psm_tsv = pd.read_table('psm.tsv')
    pep_to_razor_prot = {
        pep: razor_prot
        for _, pep, razor_prot in philosopher_psm_tsv[["Peptide", "Protein"
                                                       ]].itertuples()
    }
    t["razor_Protein"] = t["PeptideSequence"].map(pep_to_razor_prot.get)
    pep_init_prob = get_pep_init_prob(p)
    pathlib.Path('con_lib_not_in_psm_tsv.tsv').write_text(
        t[t["razor_Protein"].isnull()].assign(
            init_prob=t["PeptideSequence"].map(pep_init_prob.get).map(
                lambda x: "" if x is None else ','.join(x))).to_csv(
                    sep='\t', index=False).replace('(UniMod:5)', '(UniMod:1)'))
    fout = pathlib.Path('con_lib.tsv')
    print(f'writing {fout.resolve()}')
    fout.write_text(t[t["razor_Protein"].notnull()].to_csv(
        sep='\t', index=False).replace('(UniMod:5)', '(UniMod:1)'))


main0()

# edit_raw_con_lib() is run from inside the output directory (presumably
# because it works on relative paths — confirm against its definition).
# The original working directory is restored even if it raises.
os.chdir(os_fspath(output_directory))
try:
    edit_raw_con_lib()
finally:
    os.chdir(CWD)

# if __name__=='__main__':
# 	main()
Exemplo n.º 11
0
def write_RT_aligned_pepxml(iproph_pep_xml, rt_aligned_pepxml, reg_obj):
    """Write a pepXML copy (and matching mzXML) with re-mapped retention times.

    Every ``retention_time_sec`` value of ``iproph_pep_xml`` and every
    ``retentionTime="PT...S"`` value of its mzXML file is replaced by
    ``predict(value, reg_obj)``. The new mzXML is written next to
    ``rt_aligned_pepxml`` under the original file name and processed with
    ``indexmzXML``; the ``<name>.new`` file found afterwards (presumably
    produced by ``indexmzXML`` — confirm) replaces it.

    Args:
        iproph_pep_xml: path of the source pepXML file.
        rt_aligned_pepxml: path of the pepXML file to write.
        reg_obj: passed to ``predict`` together with each retention time;
            the string ``"ref run"`` marks the reference run.

    Raises:
        AssertionError: if the candidate mzXML files differ, a line holds
            more than one ``retentionTime``, no retention time was replaced,
            a run summary has no ``base_name``, or ``indexmzXML`` fails.
        StopIteration: if no mzXML file referenced by the pepXML exists.
    """
    import filecmp
    import pathlib
    import re
    import shutil

    if reg_obj == "ref run":
        # NOTE(review): there is no early return here, so this copy is
        # overwritten when rt_aligned_pepxml is reopened in "wt" mode
        # below — confirm whether a `return` was intended.
        with rt_aligned_pepxml.open("wt") as newf, \
          iproph_pep_xml.open("rt") as origf:
            shutil.copyfileobj(origf, newf)

    def repl(x):
        # Replace one matched retention time by its predicted value.
        # Fixed-width variants (".6f" truncated to the original length, and
        # ".2f") were tried earlier because spectrast can fail reading the
        # mzXML file if the replacement is not of the same length.
        return str(predict(float(x.group()), reg_obj))

    def get_msms_from_pep_xml(pep_xml):
        # Locate the existing mzXML file referenced by the run summaries.
        text = pep_xml.read_text()
        base_names = [
            pathlib.Path(base_name) for base_name in re.compile(
                '<msms_run_summary base_name="(.+?)"').findall(text)
        ]
        candidates = [(b.parent / b.stem).with_suffix('.mzXML')
                      for b in base_names]
        msms_files = set(filter(pathlib.Path.exists, candidates))
        msms_file = next(iter(msms_files))
        # Several candidates are acceptable only if their contents match.
        assert all([filecmp.cmp(f, msms_file) for f in msms_files]), msms_files
        return msms_file

    msms_file = get_msms_from_pep_xml(iproph_pep_xml)
    recomp = re.compile('(?<=retention_time_sec=")(.+?)(?=")')
    recomp_base_name = re.compile('base_name="(.+?)"')
    with rt_aligned_pepxml.open("wt") as newf, \
      iproph_pep_xml.open("rt") as origf:

        recomp2 = re.compile('(?<=retentionTime="PT)(.+?)(?=S")')
        new_msms_file = rt_aligned_pepxml.parent / msms_file.name
        with msms_file.open("rt") as msms_file_obj, \
          new_msms_file.open("wt") as new_msms_file_obj:
            total_repl = 0
            for line_1 in msms_file_obj:
                line_new, count = recomp2.subn(repl, line_1)
                assert count in (0, 1)
                total_repl += count
                new_msms_file_obj.write(line_new)
            assert total_repl > 0

        # indexmzXML runs while the pepXML is being rewritten below.
        proc = subprocess.Popen(
            [os_fspath(TPP_BIN / 'indexmzXML'),
             os_fspath(new_msms_file)],
            stdout=subprocess.DEVNULL)
        new_base_name = f'''base_name="{msms_file.with_suffix('')}"'''
        for line in origf:
            if '<msms_run_summary' in line:
                # A callable replacement inserts the path literally;
                # a plain string would have its backslashes (e.g. in
                # Windows paths) parsed as replacement escapes.
                newline, count = recomp_base_name.subn(
                    lambda _match: new_base_name, line, 1)
                assert count == 1
                newf.write(newline)
            else:
                newf.write(recomp.sub(repl, line))
    proc.wait()
    assert proc.returncode == 0, [proc.args, proc.returncode]

    new_msms_file.unlink()
    (new_msms_file.parent /
     (new_msms_file.name + ".new")).rename(new_msms_file)
Exemplo n.º 12
0
# Script setup for the retention-time alignment step: create the working
# directory, validate the input directories and collect the pepXML files.

# exist_ok=False: fails with FileExistsError if the directory already exists.
rtalign_data_directory.mkdir(parents=True, exist_ok=False)

rt_dicts_file = rtalign_data_directory / "RT_dicts.pickle"
PEPTIDE_PROB = 0.9
# Absolute versions of the configured paths (None entries are kept as None);
# used only for the diagnostic print below.
abs_paths = [
    None if e is None else e.resolve() for e in [
        rtalign_data_directory, dia_pepxml_directory, dda_pepxml_directory,
        TPP_BIN / 'indexmzXML'
    ]
]
# strict=True: raises FileNotFoundError here if indexmzXML is missing.
print((TPP_BIN / 'indexmzXML').resolve(strict=True))
print("\n".join(str(e) for e in abs_paths))

# Remember the starting directory, then work inside the alignment directory.
CWD = os.getcwd()
os.chdir(os_fspath(rtalign_data_directory))

# dia_pepxml_directory = rtalign_data_directory / "iproph"
assert dia_pepxml_directory.exists()
pep_xml_rt_aligned_dir = rtalign_data_directory / "RT_aligned"

# The DDA names below are only defined when has_DDA is exactly True.
if has_DDA is True:
    assert dda_pepxml_directory.exists()
    dda_iproph_pep_xmls = sorted(
        fn.resolve() for fn in dda_pepxml_directory.glob("*.pep.xml"))
    assert len(dda_iproph_pep_xmls) > 0
    dda_pep_xml_rt_aligned_dir = rtalign_data_directory / "RT_aligned_DDA"

# rt_align_dir = data_directory / "RTalign"
# Sorted absolute paths of every DIA pepXML file.
iproph_pep_xmls = sorted(fn.resolve()
                         for fn in dia_pepxml_directory.glob("*.pep.xml"))
Exemplo n.º 13
0
import pandas as pd, numpy as np, pathlib
import pickle

# Script: run ProteinProphet on the DIA (and, when given, DDA) iProphet
# pepXML files and write interact.prot.xml into the output directory.
#
# Command-line arguments:
#   argv[1]  output directory for the combined protein inference
#   argv[2]  directory holding the DIA *.iproph.pep.xml files
#   argv[3]  directory holding the DDA *.iproph.pep.xml files, or "none"
#   argv[4]  directory holding the TPP binaries

# Disabled developer overrides, kept for manual debugging.
if not True:
	sys.argv = ["%(prog)s", "./workdir/libgen/combined_prots", "./workdir/iproph", "/data/dattam/PROJECTS/CoreFacility/PDLC/dda-lib-atcc-mm-R1/workdir/iproph", "/data/teog/tpp5/bin/"]
	sys.argv = ["%(prog)s", "./DIA/workdir/libgen/combined_prots", "./DIA/workdir/iproph", "./DDA/workdir/iproph", "/data/teog/tpp5/bin/"]

has_DDA = sys.argv[3] != "none"

combined_prot_data_directory = str_to_path(sys.argv[1])
dia_pepxml_directory = str_to_path(sys.argv[2])
dda_pepxml_directory = str_to_path(sys.argv[3]) if has_DDA else None
TPP_BIN = str_to_path(sys.argv[4])

dia_pep_xmls = list(map(os_fspath, dia_pepxml_directory.glob("*.iproph.pep.xml")))
# Without DDA data dda_pepxml_directory is None, so there is nothing to glob.
dda_pep_xmls = list(map(os_fspath, dda_pepxml_directory.glob("*.iproph.pep.xml"))) if has_DDA else []

# The DDA checks only apply when a DDA directory was actually given.
raise_if_not(dia_pepxml_directory.exists(), "nonexistant DIA pep xml directory")
if has_DDA:
	raise_if_not(dda_pepxml_directory.exists(), "nonexistant DDA pep xml directory")
raise_if_not(len(dia_pep_xmls) > 0, "no DIA pep xml found")
if has_DDA:
	raise_if_not(len(dda_pep_xmls) > 0, "no DDA pep xml found")

combined_prot_data_directory.mkdir(exist_ok=True)

# NOTE(review): the exit status of ProteinProphet is not checked
# (no check=True) — confirm whether a failure should abort the script.
subprocess.run([os_fspath(TPP_BIN / "ProteinProphet")] +
			   dia_pep_xmls +
			   dda_pep_xmls +
			   [os_fspath(combined_prot_data_directory / "interact.prot.xml")] +
			   ["IPROPHET", "MINPROB0.9"],
			   cwd=combined_prot_data_directory)