예제 #1
0
 def test_filelist_standard(self):
     """Check every column returned for the MTBLS79 triplicates file list."""
     # Columns in the file: filename, replicate, batch, injectionOrder,
     # classLabel
     fm_dict = validate_metadata(
         to_test_data("filelist_csl_MTBLS79_mzml_triplicates.txt"))

     # (column, expected values) pairs, checked in this order.
     expected_columns = (
         ('filename', [
             'batch04_B02_rep01_301.mzML',
             'batch04_B02_rep02_302.mzML',
             'batch04_B02_rep03_303.mzML',
             'batch04_QC17_rep01_262.mzML',
             'batch04_QC17_rep02_263.mzML',
             'batch04_QC17_rep03_264.mzML',
             'batch04_S01_rep01_247.mzML',
             'batch04_S01_rep02_248.mzML',
             'batch04_S01_rep03_249.mzML',
         ]),
         ('replicate', [1, 2, 3] * 3),
         ('batch', [1] * 9),
         ('injectionOrder', list(range(1, 10))),
         ('classLabel', ['blank'] * 3 + ['QC'] * 3 + ['sample'] * 3),
     )
     for column, expected in expected_columns:
         self.assertEqual(fm_dict[column], expected)
예제 #2
0
 def test_filelist_replicate_error_zero_value(self):
     """Validation of ``filelist_replicate_error_2.txt`` must fail with
     the replicate-numbering message."""
     expected_fragment = "Incorrect numbering for replicates"
     with self.assertRaises(Exception) as caught:
         validate_metadata(to_test_data("filelist_replicate_error_2.txt"))
     message = str(caught.exception)
     self.assertTrue(expected_fragment in message)
예제 #3
0
 def test_filelist_class_label_error(self):
     """Validation of ``filelist_class_label_error.txt`` must fail with
     the class-name/replicate mismatch message."""
     expected_fragment = "class names do not match with number of replicates"
     with self.assertRaises(Exception) as caught:
         validate_metadata(to_test_data("filelist_class_label_error.txt"))
     message = str(caught.exception)
     self.assertTrue(expected_fragment in message)
예제 #4
0
 def test_filelist_injection_order_error(self):
     """Validation of ``filelist_injection_order_error.txt`` must fail with
     the sample-ordering message."""
     expected_fragment = "samples not in order"
     with self.assertRaises(Exception) as caught:
         validate_metadata(
             to_test_data("filelist_injection_order_error.txt"))
     message = str(caught.exception)
     self.assertTrue(expected_fragment in message)
예제 #5
0
 def test_filelist_multilist_error(self):
     """Validation of ``filelist_multi_error.txt`` must fail with the
     non-integer 'multilist' message."""
     expected_fragment = "Column 'multilist' values should be integers"
     with self.assertRaises(Exception) as caught:
         validate_metadata(to_test_data("filelist_multi_error.txt"))
     message = str(caught.exception)
     self.assertTrue(expected_fragment in message)
예제 #6
0
 def test_filename_error(self):
     """Validation of ``filelist_filename_error.txt`` must fail with the
     duplicate-filename message."""
     expected_fragment = "Duplicate filename in list"
     with self.assertRaises(Exception) as caught:
         validate_metadata(to_test_data("filelist_filename_error.txt"))
     message = str(caught.exception)
     self.assertTrue(expected_fragment in message)
예제 #7
0
 def test_filelist_multi(self):
     """The 'multilist' column of ``filelist_multi.txt`` is returned as
     the integers 1, 1, 2."""
     multi_path = to_test_data("filelist_multi.txt")
     fm_dict = validate_metadata(multi_path)
     expected_multilist = [1, 1, 2]
     self.assertEqual(fm_dict['multilist'], expected_multilist)
def _str_to_bool(value):
    """Convert a command-line token to a real boolean.

    ``argparse`` with ``type=bool`` treats every non-empty string as
    ``True`` (so ``--within False`` enabled the option). This converter
    interprets the usual spellings instead.

    :param value: token supplied on the command line (or a bool).
    :return: the corresponding ``bool``.
    :raises argparse.ArgumentTypeError: if the token is not recognised.
    """
    if isinstance(value, bool):
        return value
    token = str(value).strip().lower()
    if token in ("true", "t", "yes", "y", "1"):
        return True
    # The empty string was already falsy under ``type=bool``.
    if token in ("false", "f", "no", "n", "0", ""):
        return False
    raise argparse.ArgumentTypeError(
        "expected a boolean value (true/false), got {!r}".format(value))


def _build_parser():
    """Build the command-line parser with its four sub-commands.

    Sub-commands: ``process-scans``, ``process-samples``,
    ``hdf5-pm-to-txt`` and ``hdf5-pls-to-txt``. The chosen sub-command is
    stored in ``args.step``.

    :return: the configured ``argparse.ArgumentParser``.
    """
    parser = argparse.ArgumentParser(
        description=
        'Python package for processing acoustic mist ionisation-mass spectrometry -based metabolomics and lipidomics data',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # subparsers
    subparsers = parser.add_subparsers(dest='step')

    parser_scans = subparsers.add_parser(
        'process-scans', help='Process and align scans within samples.')
    parser_samples = subparsers.add_parser('process-samples',
                                           help='Process and align samples.')
    parser_hpmt = subparsers.add_parser(
        'hdf5-pm-to-txt',
        help='Write HDF5 output (peak matrix) to text format.')
    parser_hplt = subparsers.add_parser(
        'hdf5-pls-to-txt',
        help='Write HDF5 output (peak lists) to text format.')

    #####################
    # Process Scans
    #####################

    parser_scans.add_argument(
        "-i",
        "--input",
        type=str,
        nargs='+',
        required=True,
        metavar='source',
        help=
        "Absolute or relative path to the *.mzml file(s). Must be in same order as 'metascans *txt files'"
    )

    parser_scans.add_argument(
        '-ms',
        '--metascans',
        type=str,
        nargs='+',
        required=True,
        metavar='source',
        help=
        "Absolute or relative path to the comma-delimited *.txt metadata file. Must be in same order and 'input' *mzml files. Header names must contain and be in the following order names =['barcode', 'date/time', 'row', 'col', 'scan', 'ejection time', 'NA'] as output by MS-Parser tool"
    )

    parser_scans.add_argument(
        "-o",
        "--output",
        help="Absolute or relative path to the output file",
        action="store",
        type=str,
        required=True)

    parser_scans.add_argument(
        "-f",
        "--failed-wells",
        help=
        "Absolute or relative path to the *.txt output of which well failed",
        action="store",
        type=str,
        required=True)

    # Help text previously duplicated the --failed-wells description; this
    # file receives the tab-delimited table of scans processed per well.
    parser_scans.add_argument(
        "-pr",
        "--processed_scans",
        help=
        "Absolute or relative path to the tab-delimited *.txt output listing the scans processed for each well",
        action="store",
        type=str,
        required=True)

    # The default ("on_scans_no_edge") was missing from ``choices``, so it
    # could not be passed explicitly. The previously accepted spelling
    # ("on_scan_no_edge") is kept for backward compatibility.
    parser_scans.add_argument(
        "-m",
        "--method",
        help=
        "Method to define which scans to extract data from. DEFAULT = on_scans_no_edge",
        action="store",
        type=str,
        choices=[
            "all_scans", "on_scans", "off_scans", "on_scans_no_edge",
            "on_scan_no_edge"
        ],
        default="on_scans_no_edge")

    parser_scans.add_argument(
        "-d",
        "--id-snr",
        help=
        "For identifying on/off scans: Hard SNR threshold for differentiating between on/off scans. DEFAULT = 15",
        action="store",
        type=int,
        default=15)

    parser_scans.add_argument(
        "-t",
        "--id-tol",
        help=
        "For identifying on/off scans: Number of features with SNR > threshold to tolerate in off scans. DEFAULT = 3",
        action="store",
        type=int,
        default=3)

    # Help text now states the default that is actually applied (3).
    parser_scans.add_argument(
        "-s",
        "--snr-threshold",
        help="SNR threshold to remove noise features. DEFAULT = 3",
        action="store",
        type=int,
        default=3)

    parser_scans.add_argument(
        "-n",
        "--min-scans",
        help=
        "Minimum number of scans required to be labelled on within a well for sample to be taken forward. DEFAULT = 0",
        action="store",
        type=int,
        default=0)

    parser_scans.add_argument(
        "-r",
        "--rsd-threshold",
        help=
        "RSD filter (scan level): Threshold of RSD of features across scans in sample for it to be retained. DEFAULT = None",
        action="store",
        type=int,
        default=None)

    parser_scans.add_argument(
        "-fr",
        "--min-fraction",
        help=
        "Minimum fraction a peak has to be present. Use 0.0 to not apply this filter.",
        action="store",
        type=float,
        default=None)

    parser_scans.add_argument(
        "-p",
        "--ppm",
        help=
        "Aligning scans: m/z precision (ppm) to align scans in sample - REQUIRED PARAMETER!",
        action="store",
        type=int,
        required=True)

    parser_scans.add_argument(
        '-l',
        '--metalist',
        type=str,
        required=False,
        help=
        "Absolute or relative path to the tab-delimited *.txt file that include the name of the data files (*.mzml) and meta data. "
        "Column names: filename, replicate, batch, injectionOrder, classLabel."
    )

    #################################
    # Process Samples
    #################################

    parser_samples.add_argument(
        "-i",
        "--input",
        help=
        "Absolute or relative path to the *.hdf5 file containing all peaklists from process scans",
        action="store",
        type=str,
        required=True)

    parser_samples.add_argument(
        "-o",
        "--output",
        help="Absolute or relative path to the output file",
        action="store",
        type=str,
        required=True)

    parser_samples.add_argument(
        "-p",
        "--ppm",
        help=
        "Aligning samples: m/z precision (ppm) to align samples in study - REQUIRED PARAMETER!",
        action="store",
        type=int,
        required=True)

    parser_samples.add_argument(
        "-b",
        "--block-size",
        help=
        "Aligning samples: Number peaks in each centre clustering block for alignment of samples. DEFAULT = 5000 (should increase for large studies)",
        action="store",
        type=int,
        default=5000)

    parser_samples.add_argument(
        "-fr",
        "--min-fraction",
        help="Minimum percentage of samples a peak has to be present.",
        action="store",
        type=float,
        required=False,
        default=None)

    parser_samples.add_argument(
        '-r',
        '--rsd-threshold',
        default=None,
        type=float,
        required=False,
        help=
        "Peaks where the associated QC peaks are above this threshold will be removed."
    )

    # ``type=bool`` turned "--within False" into True; parse the token
    # properly. A bare "-w" still yields ``const`` (True).
    parser_samples.add_argument(
        '-w',
        '--within',
        type=_str_to_bool,
        nargs='?',
        const=True,
        default=False,
        help="Apply sample filter within each sample class.")

    parser_samples.add_argument('-q',
                                '--qc-label',
                                default=None,
                                type=str,
                                required=False,
                                help="Class label for QCs")

    #################################
    # HDF5 peaklists to text
    #################################

    parser_hplt.add_argument(
        '-i',
        '--input',
        type=str,
        required=True,
        help=
        "Absolute or relative path to the HDF5 file that contains a list of peaklist objects from one of the processing steps."
    )

    parser_hplt.add_argument("-o",
                             "--output",
                             help="Directory to write to.",
                             action="store",
                             type=str,
                             default=os.getcwd())

    parser_hplt.add_argument(
        '-d',
        '--delimiter',
        default="tab",
        choices=["tab", "comma"],
        help="Values on each line of the file are separated by this character."
    )

    #################################
    # HDF5 peak matrix to text
    #################################

    parser_hpmt.add_argument(
        '-i',
        '--input',
        type=str,
        required=True,
        help=
        "Absolute or relative path to the HDF5 file that contains a peak matrix object from one of the processing steps."
    )

    parser_hpmt.add_argument('-o',
                             '--output',
                             type=str,
                             required=True,
                             help="Directory to write to.")

    parser_hpmt.add_argument('-a',
                             '--attribute_name',
                             default="intensity",
                             choices=["intensity", "mz", "snr"],
                             required=False,
                             help="Type of matrix to print.")

    parser_hpmt.add_argument(
        '-l',
        '--class-label-rsd',
        action='append',
        required=False,
        default=(),
        help="Class label to select samples for RSD calculatons (e.g. QC).")

    parser_hpmt.add_argument(
        '-d',
        '--delimiter',
        default="tab",
        choices=["tab", "comma"],
        help="Values on each line of the file are separated by this character."
    )

    parser_hpmt.add_argument(
        '-s',
        '--representation-samples',
        default="rows",
        choices=["rows", "columns"],
        help="Should the rows or columns respresent the samples?")

    parser_hpmt.add_argument(
        '-c',
        '--comprehensive',
        action='store_true',
        required=False,
        help=
        "Whether to output simple or comprehensive version of the peak matrix. Do not use argument if want simple output, use -c or --comprehensive for comprehensive output"
    )

    return parser


def _process_scans(args):
    """Run the ``process-scans`` step.

    For every (mzML, metascans) pair the scans are grouped by well, the
    scans selected by ``args.method`` are filtered on SNR and aligned, and
    one aligned peak list per well is collected. Three outputs are
    written: the failed-wells text file, the tab-delimited table of scans
    processed per well, and ``<args.output>.hdf5`` with the peak lists.

    :param args: parsed ``process-scans`` namespace.
    """
    peaklists = []
    failed_wells = []
    scans_processed = {}

    # The metadata file does not depend on the acquisition, so validate it
    # once instead of once per input file.
    metadata = None
    if args.metalist is not None:
        metadata = validate_metadata(args.metalist)

    alphabet = string.ascii_uppercase

    for mzml_path, metascans_path in zip(args.input, args.metascans):

        print("Acquisition; {}".format(mzml_path))
        # Store spectral data
        run = Mzml(mzml_path)

        # Define which wells scans are associated with
        df = pd.read_csv(metascans_path,
                         header=None,
                         names=[
                             "barcode", "date/time", "row", "col", "scan",
                             "ejection time", "NA"
                         ])
        df = df[["barcode", "row", "col", "scan"]]
        # Well label is "<barcode>_<row letter><two-digit column>"; the
        # row number is 1-based.
        df['well_label'] = df.apply(
            lambda row: "%s_%s%02d" %
            (row.barcode, alphabet[row.row - 1], row.col),
            axis=1)

        for _, well in df[["well_label"]].drop_duplicates().iterrows():

            # Label-based access; positional ``well[0]`` on a
            # label-indexed Series is deprecated in pandas.
            well_label = well["well_label"]

            well_scans = list(df[df["well_label"] == well_label]["scan"])

            well_info = Scans(run, well, well_scans, args.id_snr,
                              args.id_tol)

            scan_ids = well_info.extract(args.method)

            # Recorded for every well, whether or not it is taken forward.
            scans_processed[well_label] = scan_ids

            # NOTE(review): ``well`` is the pandas row, so its full repr
            # ends up in the failure lines below; kept unchanged to
            # preserve the existing output format.
            if len(scan_ids) < args.min_scans:
                line = "Well: {}, failed due to: < {} scans in well taken forward. Scan_ids for well: {}".format(
                    well, args.min_scans, scan_ids)
                failed_wells.append(line)
                continue

            # Regenerates peak lists for each well (pl is individual
            # scan) with user defined snr, rsd and min fraction
            # thresholds
            # pls is the spectral data (mz, intensity, snr, flags) for
            # all scans
            pls = run.peaklists(scan_ids, function_noise="median")

            pls = [
                filter_attr(pl, "snr", min_threshold=args.snr_threshold)
                if len(pl.mz) > 0 else pl for pl in pls
            ]  # Filters out noise using SNR
            # Keep only the extracted scans/peaklists
            pls = [pl for pl in pls if int(pl.ID) in scan_ids]

            try:
                # Forms aligned peak matrix from peakLists
                pm = align_peaks(pls,
                                 ppm=args.ppm,
                                 block_size=5000,
                                 edge_extend=(2 * args.ppm))

            except ValueError as e:
                line = "Well: {}, failed due to: {}.".format(well, e)
                failed_wells.append(line)
                continue

            # Generates peakLists from aligned peak matrix
            pl_aligned = pm.to_peaklist(ID="{}".format(well_label))

            if "snr" in pm.attributes:
                pl_aligned.add_attribute("snr",
                                         pm.attr_mean_vector("snr"),
                                         on_index=2)

            pl_aligned.add_attribute("rsd",
                                     pm.rsd(flagged_only=False),
                                     on_index=4)

            pl_aligned.add_attribute('snr_flag',
                                     np.ones(pl_aligned.full_size),
                                     flagged_only=False,
                                     is_flag=True)

            if args.rsd_threshold is not None:
                # ``map`` is lazy on Python 3; build a concrete list so
                # one flag per peak is actually handed over.
                rsd_values = pl_aligned.get_attribute("rsd",
                                                      flagged_only=False)
                rsd_flag = [
                    not np.isnan(x) and x < args.rsd_threshold
                    for x in rsd_values
                ]
                pl_aligned.add_attribute("rsd_flag",
                                         rsd_flag,
                                         flagged_only=False,
                                         is_flag=True)

            if args.min_fraction is not None:
                pl_aligned.add_attribute(
                    "internal_fraction_flag",
                    (pm.present / float(pm.shape[0])) >= args.min_fraction,
                    flagged_only=False,
                    is_flag=True)

            if metadata is not None:
                pl_aligned = update_metadata_and_labels([pl_aligned],
                                                        metadata)
                peaklists.append(pl_aligned[0])
            else:
                peaklists.append(pl_aligned)

    with open(args.failed_wells, "w") as out:
        for line in failed_wells:
            out.write("{}\n".format(line))

    out_df = pd.DataFrame.from_dict(scans_processed, orient='index')
    out_df.to_csv(args.processed_scans, sep='\t')

    hdf5_portal.save_peaklists_as_hdf5(peaklists,
                                       "{}.hdf5".format(args.output))


def main():
    """Command-line entry point.

    Parses ``sys.argv``, prints the parsed namespace and dispatches on the
    selected sub-command (``args.step``):

    * ``process-scans``   -- per-well scan extraction and alignment.
    * ``process-samples`` -- align the saved peak lists across samples,
      apply the sample filter and save the peak matrix.
    * ``hdf5-pls-to-txt`` -- write saved peak lists as text.
    * ``hdf5-pm-to-txt``  -- write a saved peak matrix as text.

    Exits through ``parser.error`` when ``process-scans`` receives a
    different number of ``--input`` and ``--metascans`` files.
    """
    parser = _build_parser()
    args = parser.parse_args()

    print(args)

    if args.step == "process-scans":
        # The two lists are paired by position, so they must be the same
        # length; fail early rather than part-way through processing.
        if len(args.input) != len(args.metascans):
            parser.error(
                "process-scans: the number of --input files ({}) must match "
                "the number of --metascans files ({})".format(
                    len(args.input), len(args.metascans)))
        _process_scans(args)

    elif args.step == "process-samples":

        peaklists = hdf5_portal.load_peaklists_from_hdf5(args.input)

        # align peaks into mz bins... ppm = ppm_precision
        peakmatrix = align_peaks(peaklists,
                                 ppm=args.ppm,
                                 block_size=args.block_size,
                                 edge_extend=(2 * args.ppm))

        peakmatrix = sample_filter(peakmatrix,
                                   min_fraction=args.min_fraction,
                                   within=args.within,
                                   qc_label=args.qc_label,
                                   rsd_thres=args.rsd_threshold)

        hdf5_portal.save_peak_matrix_as_hdf5(peakmatrix, args.output)

    elif args.step == 'hdf5-pls-to-txt':
        hdf5_peaklists_to_txt(args.input,
                              path_out=args.output,
                              delimiter=map_delimiter(args.delimiter))

    elif args.step == 'hdf5-pm-to-txt':
        samples_in_rows = args.representation_samples == "rows"

        hdf5_peak_matrix_to_txt(args.input,
                                path_out=args.output,
                                attr_name=args.attribute_name,
                                delimiter=map_delimiter(args.delimiter),
                                rsd_tags=args.class_label_rsd,
                                samples_in_rows=samples_in_rows,
                                comprehensive=args.comprehensive)