Exemplo n.º 1
0
def test_mdr_init():
    """Check that the MDR constructor stores its parameters and starts unfitted.

    Two instances are built: one with default arguments and one with an
    explicit ``tie_break``/``default_label`` pair.  For each instance the
    stored parameters are checked, and the fitted state
    (``class_count_matrix`` and ``feature_map``) must still be ``None``.
    """
    mdr_obj = MDR()

    assert mdr_obj.tie_break == 1
    assert mdr_obj.default_label == 0
    assert mdr_obj.class_count_matrix is None
    assert mdr_obj.feature_map is None

    mdr_obj2 = MDR(tie_break=1, default_label=2)

    assert mdr_obj2.tie_break == 1
    assert mdr_obj2.default_label == 2
    # Inspect the second instance here: the first one was already verified
    # above, so re-checking it would leave mdr_obj2's state untested.
    assert mdr_obj2.class_count_matrix is None
    assert mdr_obj2.feature_map is None
Exemplo n.º 2
0
def test_mdr_fit():
    """Check the per-cell class counts and the cell-to-label map built by ``MDR.fit``."""
    features = np.array([
        [2, 0], [0, 0], [0, 1], [0, 0], [0, 0],
        [0, 0], [0, 1], [0, 0], [0, 0], [0, 1],
        [0, 0], [0, 0], [0, 0], [1, 1], [1, 1],
    ])
    classes = np.array([1] * 10 + [0] * 5)

    mdr = MDR()
    mdr.fit(features, classes)

    # Four distinct feature combinations appear in the training data.
    assert len(mdr.class_count_matrix) == 4
    assert len(mdr.feature_map) == 4

    # (feature combination, class label, expected number of samples)
    expected_counts = (
        ((2, 0), 1, 1),
        ((0, 0), 0, 3),
        ((0, 0), 1, 6),
        ((1, 1), 0, 2),
        ((0, 1), 1, 3),
    )
    for cell, label, count in expected_counts:
        assert mdr.class_count_matrix[cell][label] == count

    # (feature combination, expected mapped label)
    expected_labels = (
        ((2, 0), 1),
        ((0, 0), 1),
        ((1, 1), 0),
        ((0, 1), 1),
    )
    for cell, label in expected_labels:
        assert mdr.feature_map[cell] == label
Exemplo n.º 3
0
def test_mdr_fit_raise_ValueError():
    """Check that ``MDR.fit`` raises ValueError when the labels are not binary.

    Two label vectors are tried against the same estimator instance: one
    with three distinct classes and one with a single class.
    """
    features = np.array([
        [2, 0], [0, 0], [0, 1], [0, 0], [0, 0],
        [0, 0], [0, 1], [0, 0], [0, 0], [0, 1],
        [0, 0], [0, 0], [0, 0], [1, 1], [1, 1],
    ])

    three_class_labels = np.array([1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0])
    single_class_labels = np.array([1] * 15)

    mdr = MDR()
    for classes in (three_class_labels, single_class_labels):
        raised = False
        try:
            mdr.fit(features, classes)
        except ValueError:
            raised = True
        # Any other exception type propagates; no exception at all fails here.
        assert raised
Exemplo n.º 4
0
    def test_extract_with_seed2(self):
        """Extract records from 'fragment1' using a seed record taken from 'htmlpage1'.

        The seed is built from the second and third entries of the first
        candidate found on 'htmlpage1'.  The test then checks the shape of
        the returned seed copy, the number of mapped records, and the first
        and last extracted review dates and descriptions.
        """
        mdr = MDR()
        page1 = get_page('htmlpage1')
        candidates, _ = mdr.list_candidates(page1, 'utf8')
        seed_record = Record(candidates[0][1], candidates[0][2])

        fragment1 = fragment_fromstring(get_page('fragment1'))
        seed_record_copy, mappings = mdr.extract(fragment1, seed_record)

        # assertEqual replaces the deprecated assertEquals alias, which was
        # removed from unittest in Python 3.12.
        self.assertEqual(2, len(seed_record_copy))
        self.assertEqual('hreview', seed_record_copy[1].attrib.get('class'))
        # 27 items (records)
        self.assertEqual(27, len(mappings))

        extracted_dates = []
        extracted_texts = []

        # Only the per-record mappings are needed, not the record keys.
        for mapping in mappings.values():
            for k, v in mapping.items():
                css_class = k.attrib.get('class')
                if css_class == 'dtreviewed':
                    extracted_dates.append(v.text)
                elif css_class == 'description':
                    extracted_texts.append(v.text)

        # extracted items are expected to keep their original document order
        self.assertEqual(extracted_dates[0], '27-05-2014')
        self.assertEqual(extracted_dates[-1], '07-07-2013')
        self.assertEqual(extracted_texts[0], 'Kwaliteit van het eten matig')
        self.assertEqual(
            extracted_texts[-1],
            'Paviljoen Strand 90 te Domburg is een uiterst sfeervol restaurant. De inrichting is smaakvol met mooie kleuren. De bediening is vriendelijk en behulpzaam. Het eten was lekker. Kortom, we zullen er zeker terug komen.'
        )
Exemplo n.º 5
0
    def test_extract_with_seed(self):
        """Extract records from 'fragment0' using a seed record taken from 'htmlpage0'.

        Checks the size of the returned seed copy, the number of mapped
        records, and the first and last extracted ``datePublished`` values.
        """
        mdr = MDR()

        page = get_page('htmlpage0')
        candidates, _ = mdr.list_candidates(page, 'utf8')
        # we know the first element can be used as seed
        seed_record = Record(candidates[0][0])

        fragment = fragment_fromstring(get_page('fragment0'))
        seed_record_copy, mappings = mdr.extract(fragment, seed_record)

        # assertEqual replaces the deprecated assertEquals alias, which was
        # removed from unittest in Python 3.12.
        # the record only has 1 <li> element
        self.assertEqual(1, len(seed_record_copy))
        # 40 items (records)
        self.assertEqual(40, len(mappings))

        # Only the per-record mappings are needed, not the record keys.
        extracted_dates = [
            v.attrib.get('content')
            for mapping in mappings.values()
            for k, v in mapping.items()
            if k.attrib.get('itemprop') == 'datePublished'
        ]

        self.assertEqual(extracted_dates[0], '2014-07-02')
        self.assertEqual(extracted_dates[-1], '2014-05-18')
Exemplo n.º 6
0
    def test_extract(self):
        """Run ``MDR.extract`` without an explicit seed on three inputs.

        Inputs are the first candidate of 'htmlpage0', the first candidate
        of 'htmlpage1', and the parsed 'fragment2'.  For each, the size (and
        where relevant the CSS classes) of the returned seed record and the
        number of mapped records are checked.
        """
        mdr = MDR()

        # assertEqual is used throughout: the assertEquals alias is
        # deprecated and was removed from unittest in Python 3.12.
        page = get_page('htmlpage0')
        candidates, _ = mdr.list_candidates(page, 'utf8')
        seed_record, mappings = mdr.extract(candidates[0])

        # the record only has 1 <li> element
        self.assertEqual(1, len(seed_record))

        # div is the top element of <li>, and there are 40 items in total
        self.assertEqual(40, len(mappings))

        page1 = get_page('htmlpage1')
        candidates, _ = mdr.list_candidates(page1, 'utf8')
        seed_record, mappings = mdr.extract(candidates[0])

        # the record has 2 elements: <div class='divider-horizontal'> and <div class='hreview'>
        self.assertEqual(2, len(seed_record))
        self.assertEqual('divider-horizontal',
                         seed_record[0].attrib.get('class'))
        self.assertEqual('hreview', seed_record[1].attrib.get('class'))

        self.assertEqual(30, len(mappings))

        fragment2 = fragment_fromstring(get_page('fragment2'))
        seed_record, mappings = mdr.extract(fragment2)

        # the record has 2 elements: <div class='row'> and <div class='row'>
        self.assertEqual(2, len(seed_record))
        self.assertEqual('row', seed_record[0].attrib.get('class'))
        self.assertEqual(7, len(mappings))
Exemplo n.º 7
0
    def test_detect(self):
        """Check the first candidate element reported for each sample page."""
        mdr = MDR()

        # (page name, first three arguments handed to assert_element)
        cases = (
            ('htmlpage0', ('ul', "ylist ylist-bordered reviews", '')),
            ('htmlpage1', ('div', "tab-pane fade in active", 'reviews')),
        )
        for page_name, (tag, css_class, third_arg) in cases:
            candidates, doc = mdr.list_candidates(get_page(page_name), 'utf8')
            assert_element(tag, css_class, third_arg, candidates[0])
Exemplo n.º 8
0
    def test_cluster(self):
        """Check the number of distinct clusters ``hcluster`` yields per sample page.

        For each page, a similarity matrix is computed from the first
        candidate and clustered; the number of distinct cluster labels is
        then compared with the expected count.
        """
        mdr = MDR()

        # assertEqual replaces the deprecated assertEquals alias, which was
        # removed from unittest in Python 3.12.
        page = get_page('htmlpage0')
        candidates, _ = mdr.list_candidates(page, 'utf8')
        m = mdr.calculate_similarity_matrix(candidates[0])
        self.assertEqual(1, len(set(mdr.hcluster(m))))

        page1 = get_page('htmlpage1')
        candidates, _ = mdr.list_candidates(page1, 'utf8')
        m = mdr.calculate_similarity_matrix(candidates[0])
        # the first element is different from the rest
        self.assertEqual(3, len(set(mdr.hcluster(m))))
Exemplo n.º 9
0
def test_mdr_sklearn_pipeline():
    """Check that MDR works as a pipeline step ahead of LogisticRegression.

    The pipeline is cross-validated on a small dataset with shuffled
    stratified 5-fold splits; the mean score only has to be positive.
    """
    features = np.array([
        [2, 0], [0, 0], [0, 1], [0, 0], [0, 0],
        [0, 0], [0, 1], [0, 0], [0, 0], [0, 1],
        [0, 0], [0, 0], [0, 0], [1, 1], [1, 1],
    ])
    classes = np.array([1] * 10 + [0] * 5)

    pipeline = make_pipeline(MDR(), LogisticRegression())
    folds = StratifiedKFold(n_splits=5, shuffle=True)
    cv_scores = cross_val_score(pipeline, features, classes, cv=folds)

    assert np.mean(cv_scores) > 0.
Exemplo n.º 10
0
def test_mdr_fit_transform():
    """Check the single-column output of ``MDR.fit_transform`` on the training data."""
    features = np.array([
        [2, 0], [0, 0], [0, 1], [0, 0], [0, 0],
        [0, 0], [0, 1], [0, 0], [0, 0], [0, 1],
        [0, 0], [0, 0], [0, 0], [1, 1], [1, 1],
    ])
    classes = np.array([1] * 10 + [0] * 5)

    # The first thirteen rows are expected to map to 1, the two [1, 1] rows to 0.
    expected = [[1]] * 13 + [[0]] * 2

    mdr = MDR()
    new_features = mdr.fit_transform(features, classes)
    assert np.array_equal(new_features, expected)
Exemplo n.º 11
0
    def __init__(self, name, loc=None, value=None, otype=None):
        """Set up a node's properties from its operator/terminal name.

        Attributes set:
            name: the name passed in.
            arity: dict keyed by ``None``, ``'f'`` and ``'b'``; the ``'f'``
                and ``'b'`` entries come from per-name tables and default
                to 0 for names not listed.
            in_type: looked up by name; raises ``KeyError`` for unknown names.
            out_type: ``otype`` when given, otherwise looked up by name
                (``KeyError`` for unknown names).
            model, evaluate: only set when ``'mdr'`` occurs in the name.
            loc, value: stored as passed.
        """
        self.name = name

        # Arity tables for the 'f' and 'b' type codes (the type tables below
        # label these float and bool).
        f_arity = {
            'sin': 1, 'cos': 1, 'exp': 1, 'log': 1, '^2': 1, '^3': 1,
            'sqrt': 1, 'if': 1,
            'ife': 2, '+': 2, '-': 2, '*': 2, '/': 2, '>_f': 2, '<_f': 2,
            '>=_f': 2, '<=_f': 2, 'xor_f': 2, 'mdr2': 2,
        }
        b_arity = {
            '!': 1, 'if': 1, 'ife': 1,
            '&': 2, '|': 2, '==': 2, '>_b': 2, '<_b': 2, '>=_b': 2,
            '<=_b': 2, 'xor_b': 2,
        }
        # Names missing from a table have arity 0 for that type code.
        self.arity = {
            None: 0,
            'f': f_arity.get(name, 0),
            'b': b_arity.get(name, 0),
        }

        in_types = {
            # float operations
            '+': 'f', '-': 'f', '*': 'f', '/': 'f', 'sin': 'f', 'cos': 'f',
            'exp': 'f', 'log': 'f', 'x': None, 'k': None, '^2': 'f',
            '^3': 'f', 'sqrt': 'f',
            # bool operations
            '!': 'b', '&': 'b', '|': 'b', '==': 'b', '>_f': 'f', '<_f': 'f',
            '>=_f': 'f', '<=_f': 'f', '>_b': 'b', '<_b': 'b', '>=_b': 'b',
            '<=_b': 'b', 'xor_b': 'b', 'xor_f': 'f',
            # mixed
            'mdr2': 'f', 'if': ('f', 'b'), 'ife': ('f', 'b'),
        }
        self.in_type = in_types[name]

        out_types = {
            # float operations
            '+': 'f', '-': 'f', '*': 'f', '/': 'f', 'sin': 'f', 'cos': 'f',
            'exp': 'f', 'log': 'f', 'x': 'f', 'k': 'f', '^2': 'f',
            '^3': 'f', 'sqrt': 'f',
            # bool operations
            '!': 'b', '&': 'b', '|': 'b', '==': 'b', '>_f': 'b', '<_f': 'b',
            '>=_f': 'b', '<=_f': 'b', '>_b': 'b', '<_b': 'b', '>=_b': 'b',
            '<=_b': 'b', 'xor_f': 'b', 'xor_b': 'b',
            # mixed
            'mdr2': 'b', 'if': 'f', 'ife': 'f',
        }
        # An explicit otype overrides the table; the lookup is skipped then.
        self.out_type = out_types[name] if otype is None else otype

        if 'mdr' in self.name:
            self.model = MDR()
            self.evaluate = run_MDR

        self.loc = loc
        self.value = value
Exemplo n.º 12
0
def test_mdr_transform():
    """Check the labels ``MDR.transform`` assigns to new rows after fitting."""
    train_features = np.array([
        [2, 0], [0, 0], [0, 1], [0, 0], [0, 0],
        [0, 0], [0, 1], [0, 0], [0, 0], [0, 1],
        [0, 0], [0, 0], [0, 0], [1, 1], [1, 1],
    ])
    train_classes = np.array([1] * 10 + [0] * 5)

    mdr = MDR()
    mdr.fit(train_features, train_classes)

    # Includes [2, 2] and [1, 0], which do not occur in the training rows.
    test_features = np.array([
        [2, 2], [1, 1], [0, 0], [0, 0], [0, 0],
        [0, 0], [1, 1], [0, 0], [0, 0], [0, 0],
        [0, 1], [1, 0], [0, 0], [1, 0], [0, 0],
    ])
    expected = [
        [0], [0], [1], [1], [1],
        [1], [0], [1], [1], [1],
        [1], [0], [1], [0], [1],
    ]

    new_features = mdr.transform(test_features)
    assert np.array_equal(new_features, expected)
Exemplo n.º 13
0
        for features in itertools.combinations(range(X.shape[1]), cur_n):
            mdr_model = copy.deepcopy(mdr_instance)
            mdr_model.fit(X[:, features], y)
            mdr_model_score = mdr_model.score(X[:, features], y)
            model_features = [feature_names[feature] for feature in features]
            yield mdr_model, mdr_model_score, model_features



# Feature frames produced by _ekf for the training and testing sets.
xtr = _ekf(training_features, ekf_index=0)
xte = _ekf(testing_features, ekf_index=0)

# An earlier variant built separate train/test MDR instances from
# tie_break_choice and default_label_choice; a single default MDR is used now.
mymdr = MDR()
clf = GaussianNB()

# Score and feature-name list of every model yielded by n_way_models,
# accumulated over n = 2 and n = 3.
n_way_results = []
n_way_features = []
for nw in range(2, 4):
    m1 = n_way_models(
        mymdr,
        xtr.values,
        training_classes,
        n=[nw],
        feature_names=list(xtr.columns),
    )
    # Materialise the generator before recording anything from it.
    m2 = list(m1)

    for i, model_entry in enumerate(m2):
        n_way_results.append(model_entry[1])
        n_way_features.append(model_entry[2])
Exemplo n.º 14
0
    a5000_01h, a5000_02h, a5000_04h
]

# Labels written to the output file, in the same order as gametes_all.
dataset_names = [
    'a10_005h', 'a10_01h', 'a10_02h', 'a10_04h', 'a100_005h', 'a100_01h',
    'a100_02h', 'a100_04h', 'a1000_005h', 'a1000_01h', 'a1000_02h',
    'a1000_04h', 'a5000_005h', 'a5000_01h', 'a5000_02h', 'a5000_04h'
]

output_txt = '/home/ansohn/Python/venvs/mdr/gametes_logs/target_scores.txt'
with open(output_txt, 'w') as t1:
    # Pair each dataset with its label directly instead of indexing both
    # lists with a hard-coded range(16).
    for dataset, dataset_name in zip(gametes_all, dataset_names):
        load_dataset = pd.read_csv(dataset, sep='\t')
        phenotype = load_dataset['Class'].values
        # Only these two columns are used as features, so selecting them
        # directly makes a prior drop of 'Class' unnecessary.
        individuals = load_dataset[['M0P0', 'M0P1']].values

        # 30 independent random 75/25 splits per dataset.  The counter is
        # unused, so it no longer shadows an outer loop variable.
        for _ in range(30):

            X_train, X_test, y_train, y_test = train_test_split(
                individuals, phenotype, train_size=0.75, test_size=0.25)

            target_pipeline = MDR()
            target_pipeline.fit(X_train, y_train)

            # One tab-separated line per split: dataset label, test score, tag.
            t1.write('{}\t{}\tmdr-perfect\n'.format(
                dataset_name, target_pipeline.score(X_test, y_test)))