示例#1
0
    def encode_dataset(dataset,
                       hp_setting: HPSetting,
                       path: Path,
                       learn_model: bool,
                       context: dict,
                       number_of_processes: int,
                       label_configuration: LabelConfiguration,
                       encode_labels: bool = True,
                       store_encoded_data: bool = False):
        PathBuilder.build(path)

        encoded_dataset = DataEncoder.run(
            DataEncoderParams(dataset=dataset,
                              encoder=hp_setting.encoder,
                              encoder_params=EncoderParams(
                                  model=hp_setting.encoder_params,
                                  result_path=path,
                                  pool_size=number_of_processes,
                                  label_config=label_configuration,
                                  learn_model=learn_model,
                                  filename="train_dataset.pkl"
                                  if learn_model else "test_dataset.pkl",
                                  encode_labels=encode_labels),
                              store_encoded_data=store_encoded_data))
        return encoded_dataset
示例#2
0
def encode_dataset_by_kmer_freq(path_to_dataset_directory: str, result_path: str, metadata_path: str = None):
    """
    encodes the repertoire dataset using KmerFrequencyEncoder

    Arguments:
        path_to_dataset_directory (str): path to directory containing all repertoire files with .tsv extension in MiXCR format
        result_path (str): where to store the results
        metadata_path(str): csv file with columns "filename", "subject_id", "disease" which is filled by default if value of argument is None,
            otherwise any metadata csv file passed to the function, must include filename and subject_id columns, and an arbitrary disease column
    Returns:
         encoded dataset with encoded data in encoded_dataset.encoded_data.examples
    """
    path_to_dataset_directory = Path(path_to_dataset_directory)
    result_path = Path(result_path)

    if metadata_path is None:
        metadata_path = generate_random_metadata(path_to_dataset_directory, result_path)
    else:
        metadata_path = Path(metadata_path)

    loader = MiXCRImport()
    dataset = loader.import_dataset({
        "is_repertoire": True,
        "path": path_to_dataset_directory,
        "metadata_file": metadata_path,
        "region_type": "IMGT_CDR3",  # import_dataset in only cdr3
        "number_of_processes": 4,  # number of parallel processes for loading the data
        "result_path": result_path,
        "separator": "\t",
        "columns_to_load": ["cloneCount", "allVHitsWithScore", "allJHitsWithScore", "aaSeqCDR3", "nSeqCDR3"],
        "column_mapping": {
            "cloneCount": "counts",
            "allVHitsWithScore": "v_alleles",
            "allJHitsWithScore": "j_alleles"
        },
    }, "mixcr_dataset")

    label_name = list(dataset.labels.keys())[0]  # label that can be used for ML prediction - by default: "disease" with values True/False

    encoded_dataset = DataEncoder.run(DataEncoderParams(dataset, KmerFrequencyEncoder.build_object(dataset, **{
        "normalization_type": "relative_frequency",  # encode repertoire by the relative frequency of k-mers in repertoire
        "reads": "unique",  # count each sequence only once, do not use clonal count
        "k": 2,  # k-mer length
        "sequence_type": "amino_acid",
        "sequence_encoding": "continuous_kmer"  # split each sequence in repertoire to overlapping k-mers
    }), EncoderParams(result_path=result_path,
                      label_config=LabelConfiguration([Label(label_name, dataset.labels[label_name])]))))

    dataset_exporter = DesignMatrixExporter(dataset=encoded_dataset,
                                            result_path=result_path / "csv_exported", file_format='csv')
    dataset_exporter.generate_report()

    return encoded_dataset
 def encode(self, unit: ExploratoryAnalysisUnit, result_path: Path) -> Dataset:
     if unit.encoder is not None:
         encoded_dataset = DataEncoder.run(DataEncoderParams(dataset=unit.dataset, encoder=unit.encoder,
                                                             encoder_params=EncoderParams(result_path=result_path,
                                                                                          label_config=unit.label_config,
                                                                                          filename="encoded_dataset.pkl",
                                                                                          pool_size=unit.number_of_processes,
                                                                                          learn_model=True,
                                                                                          encode_labels=unit.label_config is not None),
                                                             store_encoded_data=True))
     else:
         encoded_dataset = unit.dataset
     return encoded_dataset
示例#4
0
    def test_run(self):
        path = EnvironmentSettings.root_path / "test/tmp/dataencoder/"
        PathBuilder.build(path)

        rep1 = Repertoire.build_from_sequence_objects(
            [ReceptorSequence("AAA", identifier="1")],
            metadata={
                "l1": 1,
                "l2": 2
            },
            path=path)

        rep2 = Repertoire.build_from_sequence_objects(
            [ReceptorSequence("ATA", identifier="2")],
            metadata={
                "l1": 0,
                "l2": 3
            },
            path=path)

        lc = LabelConfiguration()
        lc.add_label("l1", [1, 2])
        lc.add_label("l2", [0, 3])

        dataset = RepertoireDataset(repertoires=[rep1, rep2])
        encoder = Word2VecEncoder.build_object(
            dataset, **{
                "k": 3,
                "model_type": ModelType.SEQUENCE.name,
                "vector_size": 6
            })

        res = DataEncoder.run(
            DataEncoderParams(dataset=dataset,
                              encoder=encoder,
                              encoder_params=EncoderParams(
                                  model={},
                                  pool_size=2,
                                  label_config=lc,
                                  result_path=path,
                                  filename="dataset.csv"),
                              store_encoded_data=False))

        self.assertTrue(isinstance(res, RepertoireDataset))
        self.assertTrue(res.encoded_data.examples.shape[0] == 2)

        shutil.rmtree(path)