def test_identify_identified(self):
        """Ids are transferred to identified spectra only."""
        analyser = id_transferer.IdTransferer(add_to_identified=True,
                                              add_to_unidentified=False)

        for parsed_cluster in clustering_parser.ClusteringParser(self.testfile):
            analyser.process_cluster(parsed_cluster)

        self.assertEqual(3151, len(analyser.identification_references))
def extract_clusters(cluster_ids, clustering_file):
    """
    Extract the clusters with the given ids from a .clustering file.

    :param cluster_ids: Iterable of cluster ids to extract.
    :param clustering_file: Path to the .clustering file to process.
    :return: A list of the matching cluster objects, in file order.
    """
    # Copy the ids into a set: membership is tested once per cluster in
    # the file, and a list would make every test O(n). The set also
    # de-duplicates the input so repeated ids cannot prevent the
    # early-termination check below from ever firing.
    wanted_ids = set(cluster_ids)
    clusters = list()

    for cluster in clustering_parser.ClusteringParser(clustering_file):
        if cluster.id in wanted_ids:
            clusters.append(cluster)

            # stop reading once every requested cluster was found
            if len(clusters) == len(wanted_ids):
                break

    return clusters
    def test_wrong_spec_counts(self):
        """Spectrum counts of the last cluster are parsed correctly."""
        all_clusters = list(clustering_parser.ClusteringParser(self.testfile3))

        self.assertEqual(27, len(all_clusters))

        last_cluster = all_clusters[26]
        self.assertEqual(13, last_cluster.n_spectra)
        self.assertEqual(5, last_cluster.identified_spectra)
    def test_parse_clustering_file(self):
        """
        Parse the complete test file, making sure no cluster id is
        reported twice, and spot-check the first, second and last
        cluster of the file.
        """
        parser = clustering_parser.ClusteringParser(self.testfile)

        n_clusters = 0
        seen_clusters = set()

        for cluster in parser:
            n_clusters += 1

            if cluster.id in seen_clusters:
                self.fail(cluster.id + " was encountered twice.")

            seen_clusters.add(cluster.id)

            if n_clusters == 1:
                self.assertEqual("1cc813a1-4e75-4c1d-99aa-752312fbe554",
                                 cluster.id)
                self.assertEqual(359.155, cluster.precursor_mz)

                # hoist the repeated get_spectra() calls
                spectra = cluster.get_spectra()
                self.assertEqual(2, len(spectra))
                self.assertEqual(1, len(spectra[0].psms))
                # next(iter(...)) replaces the unidiomatic
                # .__iter__().__next__() chain
                self.assertEqual("RPHFFFPK",
                                 next(iter(spectra[0].psms)).sequence)
                self.assertEqual(1, len(cluster.max_sequences))
                self.assertEqual("RPHFFFPK", cluster.max_sequences[0])
                self.assertEqual(1, cluster.max_ratio)
                self.assertEqual(1, cluster.max_il_ratio)

                # modified PSMs must render with their PTM annotation
                for spectrum in spectra:
                    for psm in spectrum.psms:
                        if len(psm.ptms) > 0:
                            self.assertEqual("R[MOD:1234]PHFFFPK", str(psm))

            # the counter checks are mutually exclusive, so elif avoids
            # testing every cluster against every index
            elif n_clusters == 2:
                self.assertEqual("9a582e74-e8b1-451d-a007-cadc362aa2ce",
                                 cluster.id)
                self.assertEqual(2 / 3, cluster.max_ratio)
                self.assertEqual(2 / 3, cluster.max_il_ratio)
                self.assertEqual(1, len(cluster.max_sequences))
                self.assertEqual("MEGIGLK", cluster.max_sequences[0])

            # make sure the last cluster is read correctly
            elif n_clusters == 838:
                self.assertEqual("25ed3015-f2d8-4df1-ac96-c23076c96bfe",
                                 cluster.id)
                self.assertEqual(1, cluster.max_ratio)
                self.assertEqual(1, cluster.max_il_ratio)
                self.assertEqual(1, len(cluster.max_sequences))
                self.assertEqual("MQEAMTQEVSDVFSDTTTPIK",
                                 cluster.max_sequences[0])

        self.assertEqual(838, n_clusters)
# Example #5
# 0
def main():
    """
    Primary entry function for the CLI.

    Validates the input/output paths, optionally loads project and
    cluster id filter lists, processes every (matching) cluster of the
    input .clustering file and atomically moves the result into place.
    :return:
    """
    arguments = docopt(__doc__, version='cluster_filter.py 1.0 BETA')

    # make sure the input file exists
    if not os.path.isfile(arguments['--input']):
        print("Error: Cannot find input file '" + arguments["--input"] + "'")
        sys.exit(1)

    # make sure the output file does not exist
    if os.path.isfile(arguments["--output"]):
        print("Error: Output file exists '" + arguments["--output"] + "'")
        sys.exit(1)

    project_ids = None
    cluster_ids = None

    # one project accession per line
    if arguments.get("--project_ids") is not None:
        with open(arguments["--project_ids"], "r") as IN:
            project_ids = [line.strip() for line in IN]

    # read the cluster ids into a set: membership is tested once per
    # cluster below, and a list would make every test O(n)
    if arguments.get("--cluster_ids") is not None:
        with open(arguments["--cluster_ids"], "r") as IN:
            cluster_ids = {line.strip() for line in IN}

    # write to a ".part" file first so an aborted run never leaves a
    # seemingly complete output file behind
    with open(arguments["--output"] + ".part", "w") as OUT:
        # create the id transferer based on the settings
        analyser = create_analyser(arguments, OUT)

        # process all clusters
        parser = clustering_parser.ClusteringParser(arguments["--input"])

        print("Parsing input .clustering file...")
        for cluster in parser:
            # filter based on cluster ids if set
            if cluster_ids is not None and cluster.id not in cluster_ids:
                continue
            if project_ids is not None:
                analyser.remove_spectra_by_project(cluster, project_ids)
            analyser.process_cluster(cluster)

    os.rename(arguments["--output"] + ".part", arguments["--output"])
    print("Results written to " + arguments["--output"])
def main():
    """
    Primary entry function for the CLI.
    :return:
    """
    arguments = docopt(__doc__, version='cluster_parameter_extractor 1.0 BETA')

    input_file = arguments['--input']
    output_file = arguments["--output"]
    process_synthetic = arguments["--synthetic_peptides"]

    # make sure the input file exists
    if not os.path.isfile(input_file):
        print("Error: Cannot find input file '" + input_file + "'")
        sys.exit(1)

    # make sure the output file does not exist
    if os.path.isfile(output_file):
        print("Error: Output file exists '" + output_file + "'")
        sys.exit(1)

    with open(output_file, "w") as OUT:
        # write the header
        OUT.write(
            "id\tprecursor_mz\tav_charge\tsize\tidentified_spec_count\tunidentified_spec_count\t"
            "max_ratio\tmax_il_ratio\tprecursor_mz_range\tsequences\t"
            "max_sequence\tmax_sequence_count\tmax_sequence_mods\t"
            "second_max_sequence\tsecond_max_sequence_count\tsecond_max_sequence_mods\tn_input_files\t"
            "max_consensus_peak_rel_tic\tmax_consensus_peak_mz")

        if process_synthetic:
            OUT.write("\tsynth_count\tsynth_ratio\tsynth_max_sequence")

        OUT.write("\n")

        # write one line per cluster of the input file
        for cluster in clustering_parser.ClusteringParser(input_file):
            line_parts = [process_cluster(cluster)]

            # append the synthetic peptide columns when requested
            if process_synthetic:
                line_parts.append("\t" + process_synthetic_peptides(cluster))

            line_parts.append("\n")
            OUT.write("".join(line_parts))

    print("Results written to " + output_file)
    def testClusterAsFeatures(self):
        """Clusters are converted into per-sample feature counts."""
        analyser = cluster_features.ClusterAsFeatures(
            sample_name_extractor=ClusterAsFeaturesTest.pride_project_extractor)

        for cluster in clustering_parser.ClusteringParser(self.testfile):
            analyser.process_cluster(cluster)

        self.assertEqual(838, len(analyser.features))

        first_feature = analyser.features[0]
        self.assertEqual(1, len(first_feature))
        self.assertEqual(2, first_feature["PRD000001"])

        self.assertEqual(1, len(analyser.samples))
        self.assertIn("PRD000001", analyser.samples)
# Example #8
# 0
    def test_retention_time(self):
        """
        Test parsing .clustering files that contain the retention time as
        additional parameter
        """
        test_file = os.path.join(
            os.path.dirname(__file__), "testfiles",
            "retention_time_test.clustering")

        for cluster in clustering_parser.ClusteringParser(test_file):
            self.assertIsNotNone(cluster)

            # every spectrum must carry the retention time property
            for spectrum in cluster.get_spectra():
                self.assertIsNotNone(spectrum.get_property("RT"))
    def test_identifiy_all(self):
        """Ids are transferred to identified and unidentified spectra alike."""
        analyser = id_transferer.IdTransferer(True, True)

        for cluster in clustering_parser.ClusteringParser(self.testfile):
            analyser.process_cluster(cluster)

        references = analyser.identification_references
        self.assertEqual(3149, len(references))

        ref11 = references[11]
        self.assertEqual(1, len(ref11.psms))
        self.assertEqual("HQGVMVGMGQK", ref11.psms[0].sequence)
        self.assertEqual("/home/jg/Projects/ebi-pride/pride-cluster-2/chimeric-spectra-generator/src/test/resources/PRD000001.st.id.mgf",
                         ref11.filename)

        ref4 = references[4]
        self.assertEqual(1, len(ref4.psms))
        self.assertEqual("MEGIGLK", ref4.psms[0].sequence)
def load_spectra_to_cluster(result_file: str, before_cluster_id: str = None):
    """
    Creates a dict holding the spectra ids as keys and the cluster ids the spectrum belongs to as
    value.
    :param result_file: Path to the .clustering file to process
    :param before_cluster_id: If set this string is prepended to every cluster id.
    :return: A dict with the spectrum id as key and the cluster id as value
    """
    # hoist the optional prefix so the inner loop carries no branch
    prefix = "" if before_cluster_id is None else before_cluster_id

    spec_to_cluster = dict()

    for cluster in clustering_parser.ClusteringParser(result_file):
        # the cluster id is identical for all spectra of this cluster
        cluster_id = prefix + cluster.id
        for spectrum in cluster.get_spectra():
            spec_to_cluster[spectrum.get_id()] = cluster_id

    return spec_to_cluster
def main():
    """
    Primary entry function for the CLI.

    Validates the input/output paths, transfers ids across all clusters
    of the input .clustering file while printing dot-based progress, and
    finally prepends the header to the result file.
    :return:
    """
    arguments = docopt(__doc__, version='id_transferer_cli 1.0 BETA')

    # make sure the input file exists
    if not os.path.isfile(arguments['--input']):
        print("Error: Cannot find input file '" + arguments["--input"] + "'")
        sys.exit(1)

    # make sure the output file does not exist
    if os.path.isfile(arguments["--output"]):
        print("Error: Output file exists '" + arguments["--output"] + "'")
        sys.exit(1)

    with open(arguments["--output"], "w") as OUT:
        # create the id transferer based on the settings
        analyser = create_analyser(arguments, OUT)

        # process all clusters
        parser = clustering_parser.ClusteringParser(arguments["--input"])

        print("Parsing input .clustering file...", end="", flush=True)
        processed_clusters = 0
        for cluster in parser:
            analyser.process_cluster(cluster)

            # emit one progress dot per 1,000 processed clusters
            processed_clusters += 1
            if processed_clusters == 1000:
                print(".", end="", flush=True)
                processed_clusters = 0

    # terminate the progress line - the dots above were printed with
    # end="", so without this the next message is appended to them
    print()

    # add the header to the output file
    print("Adding header line...")
    analyser.add_resultfile_header(arguments["--output"])

    print("Results written to " + arguments["--output"])
def main():
    """
    Primary entry function for the CLI.

    Parses the command line via docopt, validates the input, output and
    optional FASTA paths, transfers ids across all clusters of the input
    .clustering file, optionally maps the resulting peptides to proteins
    and writes the results in either moFF-compatible or default format.
    """
    arguments = docopt(__doc__, version='id_transferer_cli 1.0 BETA')

    # make sure the input file exists
    input_file = arguments['--input']
    if not os.path.isfile(input_file):
        print("Error: Cannot find input file '" + input_file + "'")
        sys.exit(1)

    # make sure the output file does not exist
    output_file = arguments["--output"]
    if os.path.isfile(output_file):
        print("Error: Output file exists '" + output_file + "'")
        sys.exit(1)

    # the FASTA file is optional; protein inference below only runs
    # when it was supplied
    fasta_file = arguments["--fasta"]

    if fasta_file is not None:
        if not os.path.isfile(fasta_file):
            print("Error: Cannot find FASTA file '" + fasta_file + "'")
            sys.exit(1)

    # create the id transferer based on the settings
    analyser = create_analyser(arguments)

    # process all clusters
    parser = clustering_parser.ClusteringParser(input_file)

    print("Parsing input .clustering file...")
    for cluster in parser:
        analyser.process_cluster(cluster)

    # perform protein inference
    if fasta_file is not None:
        print("Doing protein inference...")
        # collect every distinct peptide sequence first so the mapping
        # function is called exactly once for the whole set
        all_peptides = set()
        for id_ref in analyser.identification_references:
            for psm in id_ref.psms:
                all_peptides.add(psm.sequence)

        peptide_mappings = peptide_mapping.the_magic_mapping_function(
            all_peptides, fasta_file)
    else:
        # no FASTA file - the writers below accept None mappings
        peptide_mappings = None

    # create the output file
    if arguments["--moff_compatible"]:
        write_moff_results(
            identification_references=analyser.identification_references,
            peptide_mappings=peptide_mappings,
            output_filename=output_file)
    else:
        write_results(
            identification_references=analyser.identification_references,
            peptide_mappings=peptide_mappings,
            output_filename=output_file)

    print("Results written to " + output_file)
# Example #13
# 0
    def test_psi_clustering(self):
        """Every cluster of the PSI test file holds exactly two spectra."""
        for cluster in clustering_parser.ClusteringParser(self.testfile2):
            self.assertEqual(2, len(cluster.get_spectra()))