class VariantSparkAPITestCase(VariantSparkPySparkTestCase):
    """API-level tests that drive ``VariantsContext`` through a Spark session."""

    def setUp(self):
        """Build a ``SparkSession`` from ``self.sc`` and a ``VariantsContext`` on top of it."""
        session = SparkSession(self.sc)
        self.spark = session
        self.vc = VariantsContext(session)

    def test_variants_context_parameter_type(self):
        """``load_label`` must reject non-string arguments with a ``TypeError``."""
        expected_message = 'keyword argument label_file_path = 123 doesn\'t match signature str'
        with self.assertRaises(TypeError) as raised:
            self.vc.load_label(label_file_path=123, col_name=456)
        self.assertEqual(expected_message, str(raised.exception))

    def test_importance_analysis_from_vcf(self):
        """Run importance analysis on the chr22 sample VCF and check its results."""
        expected_top_variable = '22_16050678_C_T'

        # Load the label column first, then the features, as the original test does.
        labels_csv = os.path.join(PROJECT_DIR, 'data/chr22-labels.csv')
        label = self.vc.load_label(label_file_path=labels_csv, col_name='22_16050678')
        vcf_file = os.path.join(PROJECT_DIR, 'data/chr22_1000.vcf')
        features = self.vc.import_vcf(vcf_file_path=vcf_file)

        analysis = features.importance_analysis(label, 200, None, True, 17, 50, 3)

        # The top entry of the ranked list must be the expected variant.
        top_twenty = analysis.important_variables(20)
        first_entry = top_twenty[0]
        self.assertEqual(expected_top_variable, first_entry[0])

        # The same variant must come first when the DataFrame is sorted by importance.
        importance_df = analysis.variable_importance()
        ranked_rows = importance_df.orderBy('importance', ascending=False).collect()
        best_row = ranked_rows[0]
        self.assertEqual(expected_top_variable, str(best_row[0]))

        # Out-of-bag error is compared to 4 decimal places.
        self.assertAlmostEqual(0.004578754578754579, analysis.oob_error(), 4)
def main():
    """Run a variable-importance analysis on the chr22 sample data and print the results.

    Creates (or reuses) a Spark session configured from
    ``VariantsContext.spark_conf()`` under the application name
    ``HipsterIndex``, imports ``data/chr22_1000.vcf`` as features and the
    ``22_16050408`` column of ``data/chr22-labels.csv`` as labels (both paths
    relative to ``PROJECT_DIR``), then prints the out-of-bag error followed by
    the entries returned by ``important_variables(10)``.
    """
    spark = (
        SparkSession.builder
        .config(conf=VariantsContext.spark_conf())
        .appName("HipsterIndex")
        .getOrCreate()
    )
    vs = VariantsContext(spark)

    features = vs.import_vcf(os.path.join(PROJECT_DIR, 'data/chr22_1000.vcf'))
    labels = vs.load_label(os.path.join(PROJECT_DIR, 'data/chr22-labels.csv'), '22_16050408')

    model = features.importance_analysis(labels, mtry_fraction=0.1, seed=13, n_trees=200)

    print("Oob = %s" % model.oob_error())
    for entry in model.important_variables(10):
        # Call form instead of the Python 2 `print entry` statement, which is a
        # SyntaxError on Python 3. With a single argument the output is the
        # same on both interpreters.
        print(entry)
def setUp(self):
    """Build a ``SparkSession`` from ``self.sc`` and a ``VariantsContext`` on top of it."""
    session = SparkSession(self.sc)
    self.spark = session
    self.vc = VariantsContext(session)