def launchAddClf(args, models_path, reports_path):
    """
    Launch second classification with a list of classifiers and update reports file.

    :param args: Arguments of the script.
    :type args: argparse.NameSpace
    :param models_path: Path to the models file (format: MSIReport).
    :type models_path: str
    :param reports_path: Path to the report file obtained with first classification (format: MSIReport).
    :type reports_path: str
    """
    for clf_name in args.add_classifiers:
        method_name = clf_name
        clf_params = None
        # "RandomForest:<n>" encodes the number of estimators in the classifier name
        if clf_name.startswith("RandomForest:"):
            n_estimators = clf_name.split(":")[1]
            clf_name = "RandomForest"
            clf_params = '{"n_estimators": ' + n_estimators + '}'
        # Copy combination produced by MIAmS in data of the new method.
        # Fix: the original body used the undefined name "out_reports_path"
        # instead of the "reports_path" parameter it was given.
        reports = MSIReport.parse(reports_path)
        lociInitData(reports, args.default_classifier, method_name)
        MSIReport.write(reports, reports_path)
        # Submit classification (reads and rewrites the same report file in place)
        submitAddClf(models_path, reports_path, reports_path, args, method_name, clf_name, clf_params)
def process(args):
    """
    Filter loci usable for instability status prediction.

    :param args: The namespace extracted from the script arguments.
    :type args: Namespace
    """
    msi_samples = MSIReport.parse(args.input_reports)
    for curr_spl in msi_samples:
        # Invalidate loci whose distribution support is below the threshold
        for curr_locus in curr_spl.loci.values():
            locus_res = curr_locus.results[args.method_name]
            if len(locus_res.data) != 0 and locus_res.getCount() < args.min_distrib_support:
                locus_res.status = Status.undetermined
                locus_res.score = None
        # Re-process the sample status from the filtered loci
        if args.consensus_method == "majority":
            curr_spl.setStatusByMajority(args.method_name, args.min_voting_loci)
        elif args.consensus_method == "ratio":
            curr_spl.setStatusByInstabilityRatio(args.method_name, args.min_voting_loci, args.instability_ratio)
        elif args.consensus_method == "count":
            curr_spl.setStatusByInstabilityCount(args.method_name, args.min_voting_loci, args.instability_count)
        curr_spl.setScore(args.method_name, args.undetermined_weight, args.locus_weight_is_score)
    # Write the filtered report
    MSIReport.write(msi_samples, args.output_reports)
def getHigherPeakByLocus(models, min_support_reads):
    """
    Return length of the higher peak of each model by locus.

    :param models: The list of MSIReport representing the models (status known and stored in Expected result).
    :type models: list
    :param min_support_reads: The minimum number of reads on locus to use the stability status of the current model.
    :type min_support_reads: int
    :return: By locus the list of higher peak length.
    :rtype: dict
    """
    higher_by_locus = {}
    for curr_spl in MSIReport.parse(models):
        for locus_id, curr_locus in curr_spl.loci.items():
            higher_by_locus.setdefault(locus_id, [])
            if "model" in curr_locus.results:
                model_res = curr_locus.results["model"]
                # Only stable models with enough fragment support contribute a peak
                if model_res.status == Status.stable and model_res.getNbFrag() > (min_support_reads / 2):
                    tallest_len = None
                    tallest_count = -1
                    for length, count in model_res.data["nb_by_length"].items():
                        # ">=" so that among equally tall peaks the last one iterated wins
                        if count >= tallest_count:
                            tallest_count = count
                            tallest_len = int(length)
                    higher_by_locus[locus_id].append(tallest_len)
    return higher_by_locus
def process(args):
    """
    Complete each sample of the MSI report with the loci results loaded from the
    loci annotations file, then write the updated report.

    :param args: The namespace extracted from the script arguments.
    :type args: Namespace
    """
    # Annotations are indexed by sample name; each sample in the report must have an entry
    data_by_spl = getLocusAnnotDict(args.input_loci_annotations)
    msi_samples = MSIReport.parse(args.input_report)
    for curr_spl in msi_samples:
        addLociResToSpl(curr_spl, data_by_spl[curr_spl.name])
    MSIReport.write(msi_samples, args.output_report)
def process(args):
    """
    Predict classification (status and score) for all samples loci.

    :param args: The namespace extracted from the script arguments.
    :type args: Namespace
    """
    references = MSIReport.parse(args.input_references)
    evaluated = MSIReport.parse(args.input_evaluated)
    # Classification by locus
    for locus_id in sorted(references[0].loci.keys()):
        # Keep only the samples with enough fragments to classify the distribution
        retained = []
        for curr_spl in evaluated:
            locus_res = curr_spl.loci[locus_id].results[args.method_name]
            if locus_res.getNbFrag() < args.min_support_fragments:
                locus_res.status = Status.undetermined
                locus_res.score = None
            else:
                retained.append(curr_spl)
        # Classify the retained samples on this locus
        if retained:
            clf = MIAmSClassifier(locus_id, args.method_name, "model", args.classifier, args.classifier_params)
            clf.fit(references)
            clf.set_status(retained)
    # Classification by sample
    for curr_spl in evaluated:
        if args.consensus_method == "majority":
            curr_spl.setStatusByMajority(args.method_name, args.min_voting_loci)
        elif args.consensus_method == "ratio":
            curr_spl.setStatusByInstabilityRatio(args.method_name, args.min_voting_loci, args.instability_ratio)
        elif args.consensus_method == "count":
            curr_spl.setStatusByInstabilityCount(args.method_name, args.min_voting_loci, args.instability_count)
        curr_spl.setScore(args.method_name, args.undetermined_weight, args.locus_weight_is_score)
    MSIReport.write(evaluated, args.output_report)
def getAggregatedSpl(in_reports):
    """
    Return one list of MSISample from several MSReport.

    :param in_reports: Pathes to the MSIReport files.
    :type in_reports: list of MSIReport
    :return: List of MSISample.
    :rtype: list
    """
    samples = []
    for report_path in in_reports:
        # Each parsed report yields its samples, appended in file order
        samples.extend(MSIReport.parse(report_path))
    return samples
def process(args):
    """
    Tag stability for loci and sample from length distribution on loci.

    :param args: The namespace extracted from the script arguments.
    :type args: Namespace
    """
    # Determine the sample name; fallback: deduce it from the output filename
    if args.sample_name is not None:
        spl_name = args.sample_name
    else:
        spl_name = os.path.basename(args.output_report).split(".")[0]
        if spl_name.endswith("_report"):
            spl_name = spl_name[:-7]
    msi_spl = MSISample(spl_name)
    # Parse lengths metrics by loci
    with HashedSVIO(args.input_combined_list) as loci_list:
        for curr_record in loci_list:
            with open(curr_record["Filepath"]) as metrics_fh:
                locus_metrics = json.load(metrics_fh)
            nb_aligned = locus_metrics["nb_uncombined_pairs"] + locus_metrics["nb_combined_pairs"]
            msi_spl.addLocus(MSILocus.fromDict({
                "name": curr_record["Locus_name"],
                "position": curr_record["Locus_position"],
                "results": {
                    "PairsCombi": {
                        "_class": "LocusResPairsCombi",
                        "status": Status.none,
                        "data": {
                            "nb_by_length": locus_metrics["nb_by_length"],
                            "nb_pairs_aligned": nb_aligned
                        }
                    }
                }
            }))
    # Process status
    msi_models = MSIReport.parse(args.input_models)
    for locus_id in msi_spl.loci:
        PairsCombiProcessor(locus_id, msi_models, [msi_spl], args.min_support).setLocusStatus()
    msi_spl.setStatus("PairsCombi")
    # Write report
    MSIReport.write([msi_spl], args.output_report)
spl_name for idx, spl_name in enumerate(ordered_spl_names) if idx in test_idx } train_samples = [ lib for lib in librairies if lib["spl_name"] in train_names ] # Select all libraries corresponding to the train samples test_samples = [ lib for lib in librairies if lib["spl_name"] in test_names ] # Select all libraries corresponding to the test samples # Process learn and tag train(train_samples, annotation_path, design_folder, baseline_path, models_path, learn_log_path, args) predict(test_samples, design_folder, baseline_path, models_path, out_folder, args) models = MSIReport.parse(models_path) reports = getMSISamples(os.path.join(out_folder, "data")) if len(args.add_classifiers) > 0: log.info( "Process {} additionnal classifiers on dataset {}/{} ({})." .format(len(args.add_classifiers), dataset_id, args.nb_tests - 1, dataset_md5)) MSIReport.write(reports, out_reports_path) launchAddClf(args, models_path, out_reports_path) reports = MSIReport.parse(out_reports_path) # Write results and dataset use_header = False out_mode = "a" if dataset_id == 0: use_header = True out_mode = "w"
# Get status by locus status_by_spl = {} with HashedSVIO(args.input_status, title_starter="") as FH_in: for record in FH_in: status_by_spl[record["sample"]] = { locus: status for locus, status in record.items() if locus not in ["sample", "sample_status"] } # Get min and max amplicon size by locus range_by_locus = {} for filename in os.listdir(args.input_data): filepath = os.path.join(args.input_data, filename) report = MSIReport.parse(filepath) for spl in report: for locus_id, locus in spl.loci.items(): if locus_id not in range_by_locus: range_by_locus[locus_id] = {"min": 300, "max": 0} range_by_locus[locus_id]["min"] = min( locus.results[args.reference_method].getMinLength(), range_by_locus[locus_id]["min"]) range_by_locus[locus_id]["max"] = max( locus.results[args.reference_method].getMaxLength(), range_by_locus[locus_id]["max"]) # Write lengths distributions for filename in os.listdir(args.input_data): filepath = os.path.join(args.input_data, filename) report = MSIReport.parse(filepath)