Пример #1
0
class VariantSparkAPITestCase(VariantSparkPySparkTestCase):
    """Tests for the VariantsContext Python API.

    Relies on ``self.sc`` being available on the test case; it is not set in
    this class, so it is presumably provided by the base class (not visible
    here).
    """

    def setUp(self):
        """Build a SparkSession from ``self.sc`` and wrap it in a VariantsContext."""
        session = SparkSession(self.sc)
        self.spark = session
        self.vc = VariantsContext(session)

    def test_variants_context_parameter_type(self):
        # Passing non-string values for the keyword arguments must raise a
        # TypeError whose message matches the text below exactly.
        expected_message = 'keyword argument label_file_path = 123 doesn\'t match signature str'
        with self.assertRaises(TypeError) as raised:
            self.vc.load_label(label_file_path=123, col_name=456)
        self.assertEqual(expected_message, str(raised.exception))

    def test_importance_analysis_from_vcf(self):
        # The same variable is expected to come out on top both in the list
        # returned by important_variables() and in the importance DataFrame.
        expected_top_variable = '22_16050678_C_T'
        labels_csv = os.path.join(PROJECT_DIR, 'data/chr22-labels.csv')
        vcf_file = os.path.join(PROJECT_DIR, 'data/chr22_1000.vcf')

        label = self.vc.load_label(label_file_path=labels_csv, col_name='22_16050678')
        features = self.vc.import_vcf(vcf_file_path=vcf_file)
        # Arguments are passed positionally, exactly as the API is exercised here.
        analysis = features.importance_analysis(label, 200, None, True, 17, 50, 3)

        top_variables = analysis.important_variables(20)
        self.assertEqual(expected_top_variable, top_variables[0][0])

        importance_df = analysis.variable_importance()
        ranked_rows = importance_df.orderBy('importance', ascending=False).collect()
        self.assertEqual(expected_top_variable, str(ranked_rows[0][0]))

        # Compared to 4 decimal places.
        self.assertAlmostEqual(0.004578754578754579, analysis.oob_error(), 4)
Пример #2
0
def main():
    """Run an importance analysis on the bundled chr22 sample data and print the results.

    Creates (or reuses) a SparkSession configured from
    ``VariantsContext.spark_conf()``, imports the sample VCF as features,
    loads the ``22_16050408`` column of the labels CSV, runs the importance
    analysis, then prints the out-of-bag error followed by the 10 entries
    returned by ``important_variables``.
    """
    spark = (
        SparkSession.builder
        .config(conf=VariantsContext.spark_conf())
        .appName("HipsterIndex")
        .getOrCreate()
    )
    vs = VariantsContext(spark)
    features = vs.import_vcf(os.path.join(PROJECT_DIR, 'data/chr22_1000.vcf'))
    labels = vs.load_label(os.path.join(PROJECT_DIR, 'data/chr22-labels.csv'),
                           '22_16050408')
    model = features.importance_analysis(labels,
                                         mtry_fraction=0.1,
                                         seed=13,
                                         n_trees=200)
    print("Oob = %s" % model.oob_error())
    for entry in model.important_variables(10):
        # Call form of print: valid on Python 3 and, with a single argument,
        # prints the same text on Python 2 as the old print statement did.
        print(entry)
Пример #3
0
 def setUp(self):
     """Build a SparkSession from ``self.sc`` and wrap it in a VariantsContext."""
     session = SparkSession(self.sc)
     self.spark = session
     self.vc = VariantsContext(session)