Пример #1
0
    def test_extract_with_seed(self):
        """Extract the records of ``fragment0`` using a seed record from ``htmlpage0``.

        The seed record is built from ``candidates[0][0]`` of the full page and
        handed to ``MDR.extract`` together with the parsed fragment.
        """
        mdr = MDR()

        page = get_page('htmlpage0')
        candidates, doc = mdr.list_candidates(page, 'utf8')
        # we know the first element can be used as the seed
        seed_record = Record(candidates[0][0])

        fragment = fragment_fromstring(get_page('fragment0'))
        seed_record_copy, mappings = mdr.extract(fragment, seed_record)

        # the record has only one <li> element
        self.assertEqual(1, len(seed_record_copy))
        # 40 items (records)
        self.assertEqual(40, len(mappings))

        # Collect the publication date of every record, in mapping order.
        extracted_dates = [
            v.attrib.get('content')
            for mapping in mappings.values()
            for k, v in mapping.items()
            if k.attrib.get('itemprop') == 'datePublished'
        ]

        self.assertEqual(extracted_dates[0], '2014-07-02')
        self.assertEqual(extracted_dates[-1], '2014-05-18')
Пример #2
0
    def test_extract_with_seed(self):
        """Extract the records of ``fragment0`` using a seed record from ``htmlpage0``.

        The seed record is built from ``candidates[0][0]`` of the full page and
        handed to ``MDR.extract`` together with the parsed fragment.
        """
        mdr = MDR()

        page = get_page('htmlpage0')
        candidates, doc = mdr.list_candidates(page, 'utf8')
        # we know the first element can be used as the seed
        seed_record = Record(candidates[0][0])

        fragment = fragment_fromstring(get_page('fragment0'))
        seed_record_copy, mappings = mdr.extract(fragment, seed_record)

        # the record has only one <li> element
        self.assertEqual(1, len(seed_record_copy))
        # 40 items (records)
        self.assertEqual(40, len(mappings))

        extracted_dates = []

        # ``items()``/``values()`` exist on both Python 2 and Python 3 mappings,
        # unlike ``iteritems()``.
        for mapping in mappings.values():
            for k, v in mapping.items():
                if k.attrib.get('itemprop') == 'datePublished':
                    extracted_dates.append(v.attrib.get('content'))

        self.assertEqual(extracted_dates[0], '2014-07-02')
        self.assertEqual(extracted_dates[-1], '2014-05-18')
Пример #3
0
    def test_extract_with_seed2(self):
        """Extract the records of ``fragment1`` using a two-element seed from ``htmlpage1``."""
        mdr = MDR()
        page1 = get_page('htmlpage1')
        candidates, doc = mdr.list_candidates(page1, 'utf8')
        seed_record = Record(candidates[0][1], candidates[0][2])

        fragment1 = fragment_fromstring(get_page('fragment1'))
        seed_record_copy, mappings = mdr.extract(fragment1, seed_record)

        self.assertEqual(2, len(seed_record_copy))
        self.assertEqual('hreview', seed_record_copy[1].attrib.get('class'))
        # 27 items (records)
        self.assertEqual(27, len(mappings))

        extracted_dates = []
        extracted_texts = []

        for mapping in mappings.values():
            for k, v in mapping.items():
                css_class = k.attrib.get('class')
                if css_class == 'dtreviewed':
                    extracted_dates.append(v.text)
                elif css_class == 'description':
                    extracted_texts.append(v.text)

        # extracted items are kept in their original order
        self.assertEqual(extracted_dates[0], '27-05-2014')
        self.assertEqual(extracted_dates[-1], '07-07-2013')
        self.assertEqual(extracted_texts[0], 'Kwaliteit van het eten matig')
        self.assertEqual(
            extracted_texts[-1],
            'Paviljoen Strand 90 te Domburg is een uiterst sfeervol restaurant. De inrichting is smaakvol met mooie kleuren. De bediening is vriendelijk en behulpzaam. Het eten was lekker. Kortom, we zullen er zeker terug komen.'
        )
Пример #4
0
    def test_extract_with_seed2(self):
        """Extract the records of ``fragment1`` using a two-element seed from ``htmlpage1``."""
        mdr = MDR()
        page1 = get_page('htmlpage1')
        candidates, doc = mdr.list_candidates(page1, 'utf8')
        seed_record = Record(candidates[0][1], candidates[0][2])

        fragment1 = fragment_fromstring(get_page('fragment1'))
        seed_record_copy, mappings = mdr.extract(fragment1, seed_record)

        self.assertEqual(2, len(seed_record_copy))
        self.assertEqual('hreview', seed_record_copy[1].attrib.get('class'))
        # 27 items (records)
        self.assertEqual(27, len(mappings))

        extracted_dates = []
        extracted_texts = []

        # ``items()``/``values()`` exist on both Python 2 and Python 3 mappings,
        # unlike ``iteritems()``.
        for mapping in mappings.values():
            for k, v in mapping.items():
                if k.attrib.get('class') == 'dtreviewed':
                    extracted_dates.append(v.text)
                elif k.attrib.get('class') == 'description':
                    extracted_texts.append(v.text)

        # extracted items are kept in their original order
        self.assertEqual(extracted_dates[0], '27-05-2014')
        self.assertEqual(extracted_dates[-1], '07-07-2013')
        self.assertEqual(extracted_texts[0], 'Kwaliteit van het eten matig')
        self.assertEqual(extracted_texts[-1], 'Paviljoen Strand 90 te Domburg is een uiterst sfeervol restaurant. De inrichting is smaakvol met mooie kleuren. De bediening is vriendelijk en behulpzaam. Het eten was lekker. Kortom, we zullen er zeker terug komen.')
Пример #5
0
def test_mdr_fit():
    """Check the class-count matrix and the feature-to-label map produced by ``MDR.fit``."""
    features = np.array([
        [2, 0], [0, 0], [0, 1], [0, 0], [0, 0],
        [0, 0], [0, 1], [0, 0], [0, 0], [0, 1],
        [0, 0], [0, 0], [0, 0], [1, 1], [1, 1],
    ])
    classes = np.array([1] * 10 + [0] * 5)

    mdr = MDR()
    mdr.fit(features, classes)

    # Four distinct feature combinations occur in the training data.
    assert len(mdr.class_count_matrix) == 4
    assert len(mdr.feature_map) == 4

    # (feature combination, class label, expected number of samples)
    expected_counts = [
        ((2, 0), 1, 1),
        ((0, 0), 0, 3),
        ((0, 0), 1, 6),
        ((1, 1), 0, 2),
        ((0, 1), 1, 3),
    ]
    for cell, label, count in expected_counts:
        assert mdr.class_count_matrix[cell][label] == count

    # (feature combination, expected mapped label)
    expected_labels = [
        ((2, 0), 1),
        ((0, 0), 1),
        ((1, 1), 0),
        ((0, 1), 1),
    ]
    for cell, label in expected_labels:
        assert mdr.feature_map[cell] == label
Пример #6
0
    def test_detect(self):
        """Check the first candidate element reported for each of the two sample pages."""
        mdr = MDR()

        # (page name, first three arguments passed to ``assert_element``)
        expectations = (
            ('htmlpage0', ('ul', "ylist ylist-bordered reviews", '')),
            ('htmlpage1', ('div', "tab-pane fade in active", 'reviews')),
        )
        for page_name, (first, second, third) in expectations:
            candidates, doc = mdr.list_candidates(get_page(page_name), 'utf8')
            assert_element(first, second, third, candidates[0])
Пример #7
0
def test_mdr_fit_transform():
    """Check that ``MDR.fit_transform`` returns the expected single-column labels."""
    features = np.array([
        [2, 0], [0, 0], [0, 1], [0, 0], [0, 0],
        [0, 0], [0, 1], [0, 0], [0, 0], [0, 1],
        [0, 0], [0, 0], [0, 0], [1, 1], [1, 1],
    ])
    classes = np.array([1] * 10 + [0] * 5)

    # The first thirteen rows are expected to map to 1, the last two to 0.
    expected = [[1]] * 13 + [[0]] * 2

    new_features = MDR().fit_transform(features, classes)
    assert np.array_equal(new_features, expected)
Пример #8
0
def test_mdr_init():
    """Ensure that the MDR constructor stores its parameters and leaves the fitted state empty."""

    mdr_obj = MDR()

    assert mdr_obj.tie_break == 1
    assert mdr_obj.default_label == 0
    assert mdr_obj.class_count_matrix is None
    assert mdr_obj.feature_map is None

    mdr_obj2 = MDR(tie_break=1, default_label=2)

    assert mdr_obj2.tie_break == 1
    assert mdr_obj2.default_label == 2
    # The explicitly configured instance must start with empty fitted state too.
    assert mdr_obj2.class_count_matrix is None
    assert mdr_obj2.feature_map is None
Пример #9
0
    def test_extract(self):
        """Run ``MDR.extract`` without a seed on two pages and one fragment."""
        mdr = MDR()

        page = get_page('htmlpage0')
        candidates, doc = mdr.list_candidates(page, 'utf8')
        seed_record, mappings = mdr.extract(candidates[0])

        # the record has only one <li> element
        self.assertEqual(1, len(seed_record))

        # div is the top element of <li>, and there are 40 items in total
        self.assertEqual(40, len(mappings))

        page1 = get_page('htmlpage1')
        candidates, doc = mdr.list_candidates(page1, 'utf8')
        seed_record, mappings = mdr.extract(candidates[0])

        # the record has 2 elements: <div class='divider-horizontal'> and <div class='hreview'>
        self.assertEqual(2, len(seed_record))
        self.assertEqual('divider-horizontal',
                         seed_record[0].attrib.get('class'))
        self.assertEqual('hreview', seed_record[1].attrib.get('class'))

        self.assertEqual(30, len(mappings))

        fragment2 = fragment_fromstring(get_page('fragment2'))
        seed_record, mappings = mdr.extract(fragment2)

        # the record has 2 elements: <div class='row'> and <div class='row'>
        self.assertEqual(2, len(seed_record))
        self.assertEqual('row', seed_record[0].attrib.get('class'))
        self.assertEqual(7, len(mappings))
Пример #10
0
def test_fit():
    """Check the label set, class fraction, count matrix and feature map built by ``MDR.fit``."""
    features = np.array([
        [2, 0], [0, 0], [0, 1], [0, 0], [0, 0],
        [0, 0], [0, 1], [0, 0], [0, 0], [0, 1],
        [0, 0], [0, 0], [0, 0], [1, 1], [1, 1],
    ])
    classes = np.array([1] * 10 + [0] * 5)

    mdr = MDR()
    mdr.fit(features, classes)

    assert len(mdr.unique_labels) == 2
    assert mdr.class_fraction == 1. / 3.
    # Sizes are checked before the per-cell lookups below, which include a
    # combination, (2, 2), that does not occur in the training data.
    assert len(mdr.class_count_matrix) == 4
    assert len(mdr.feature_map) == 4

    # (feature combination, (expected count for class 0, expected count for class 1))
    expected_counts = [
        ((2, 0), (0, 1)),
        ((0, 0), (3, 6)),
        ((1, 1), (2, 0)),
        ((0, 1), (0, 3)),
        ((2, 2), (0, 0)),
    ]
    for cell, (count_0, count_1) in expected_counts:
        assert mdr.class_count_matrix[cell][0] == count_0
        assert mdr.class_count_matrix[cell][1] == count_1

    # (feature combination, expected mapped label)
    expected_labels = [
        ((2, 0), 1),
        ((0, 0), 0),
        ((1, 1), 0),
        ((0, 1), 1),
    ]
    for cell, label in expected_labels:
        assert mdr.feature_map[cell] == label
Пример #11
0
def test_mdr_transform():
    """Check that ``MDR.transform`` maps unseen feature rows to the expected labels."""
    train_features = np.array([
        [2, 0], [0, 0], [0, 1], [0, 0], [0, 0],
        [0, 0], [0, 1], [0, 0], [0, 0], [0, 1],
        [0, 0], [0, 0], [0, 0], [1, 1], [1, 1],
    ])
    train_classes = np.array([1] * 10 + [0] * 5)

    mdr = MDR()
    mdr.fit(train_features, train_classes)

    test_features = np.array([
        [2, 2], [1, 1], [0, 0], [0, 0], [0, 0],
        [0, 0], [1, 1], [0, 0], [0, 0], [0, 0],
        [0, 1], [1, 0], [0, 0], [1, 0], [0, 0],
    ])
    expected = [
        [0], [0], [1], [1], [1],
        [1], [0], [1], [1], [1],
        [1], [0], [1], [0], [1],
    ]

    assert np.array_equal(mdr.transform(test_features), expected)
Пример #12
0
    def test_cluster(self):
        """Check the number of distinct clusters ``MDR.hcluster`` finds on each sample page."""
        mdr = MDR()

        page = get_page('htmlpage0')
        candidates, doc = mdr.list_candidates(page, 'utf8')
        m = mdr.calculate_similarity_matrix(candidates[0])
        self.assertEqual(1, len(set(mdr.hcluster(m))))

        page1 = get_page('htmlpage1')
        candidates, doc = mdr.list_candidates(page1, 'utf8')
        m = mdr.calculate_similarity_matrix(candidates[0])
        # the first element is different from the rest
        self.assertEqual(3, len(set(mdr.hcluster(m))))
Пример #13
0
def test_custom_score():
    """Check that ``MDR.score`` applies a user-supplied scoring function and its keyword arguments."""
    features = np.array([
        [2, 0], [0, 0], [0, 1], [0, 0], [0, 0],
        [0, 0], [0, 1], [0, 0], [0, 0], [0, 1],
        [0, 0], [0, 0], [0, 0], [1, 1], [1, 1],
    ])
    classes = np.array([1] * 10 + [0] * 5)

    mdr = MDR()
    mdr.fit(features, classes)

    accuracy = mdr.score(features=features, classes=classes,
                         scoring_function=accuracy_score)
    assert accuracy == 9./15

    loss_fraction = mdr.score(features=features, classes=classes,
                              scoring_function=zero_one_loss)
    assert loss_fraction == 1 - 9./15

    # Extra keyword arguments are passed along with the scoring function.
    loss_count = mdr.score(features=features, classes=classes,
                           scoring_function=zero_one_loss, normalize=False)
    assert loss_count == 15 - 9
Пример #14
0
    def test_extract(self):
        """Run ``MDR.extract`` without a seed on two pages and one fragment."""
        mdr = MDR()

        page = get_page('htmlpage0')
        candidates, doc = mdr.list_candidates(page, 'utf8')
        seed_record, mappings = mdr.extract(candidates[0])

        # the record has only one <li> element
        self.assertEqual(1, len(seed_record))

        # div is the top element of <li>, and there are 40 items in total
        self.assertEqual(40, len(mappings))

        page1 = get_page('htmlpage1')
        candidates, doc = mdr.list_candidates(page1, 'utf8')
        seed_record, mappings = mdr.extract(candidates[0])

        # the record has 2 elements: <div class='divider-horizontal'> and <div class='hreview'>
        self.assertEqual(2, len(seed_record))
        self.assertEqual('divider-horizontal', seed_record[0].attrib.get('class'))
        self.assertEqual('hreview', seed_record[1].attrib.get('class'))

        self.assertEqual(30, len(mappings))

        fragment2 = fragment_fromstring(get_page('fragment2'))
        seed_record, mappings = mdr.extract(fragment2)

        # the record has 2 elements: <div class='row'> and <div class='row'>
        self.assertEqual(2, len(seed_record))
        self.assertEqual('row', seed_record[0].attrib.get('class'))
        self.assertEqual(7, len(mappings))
Пример #15
0
def test_mdr_fit_raise_ValueError():
    """Check that ``MDR.fit`` raises ValueError for class vectors with three labels or one label."""
    features = np.array([
        [2, 0], [0, 0], [0, 1], [0, 0], [0, 0],
        [0, 0], [0, 1], [0, 0], [0, 0], [0, 1],
        [0, 0], [0, 0], [0, 0], [1, 1], [1, 1],
    ])

    invalid_class_vectors = (
        # three distinct labels
        [1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
        # a single label
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
    )

    mdr = MDR()
    for labels in invalid_class_vectors:
        classes = np.array(labels)
        raised = False
        try:
            mdr.fit(features, classes)
        except ValueError:
            raised = True
        assert raised
Пример #16
0
def test_transform():
    """Check that ``MDR.transform`` maps unseen feature rows to the expected flat label vector."""
    train_features = np.array([
        [2, 0], [0, 0], [0, 1], [0, 0], [0, 0],
        [0, 0], [0, 1], [0, 0], [0, 0], [0, 1],
        [0, 0], [0, 0], [0, 0], [1, 1], [1, 1],
    ])
    train_classes = np.array([1] * 10 + [0] * 5)

    mdr = MDR()
    mdr.fit(train_features, train_classes)

    test_features = np.array([
        [2, 2], [1, 1], [0, 0], [0, 0], [0, 0],
        [0, 0], [1, 1], [0, 0], [0, 0], [0, 0],
        [0, 1], [1, 0], [0, 0], [1, 0], [0, 0],
    ])
    # Only the (0, 1) row, at index 10, is expected to be labelled 1.
    expected = [0] * 10 + [1] + [0] * 4

    assert np.array_equal(mdr.transform(test_features), expected)
Пример #17
0
def test_mdr_sklearn_pipeline():
    """Ensure that MDR can be used as a transformer in a scikit-learn pipeline."""
    features = np.array([
        [2, 0], [0, 0], [0, 1], [0, 0], [0, 0],
        [0, 0], [0, 1], [0, 0], [0, 0], [0, 1],
        [0, 0], [0, 0], [0, 0], [1, 1], [1, 1],
    ])

    classes = np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0])
    clf = make_pipeline(MDR(), LogisticRegression())
    # A fixed seed keeps the shuffled folds, and therefore the test outcome,
    # identical from run to run.
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    cv_scores = cross_val_score(clf, features, classes, cv=folds)
    assert np.mean(cv_scores) > 0.
Пример #18
0
def test_fit_transform():
    """Check that ``MDR.fit_transform`` returns the expected flat label vector for its training data."""
    features = np.array([
        [2, 0], [0, 0], [0, 1], [0, 0], [0, 0],
        [0, 0], [0, 1], [0, 0], [0, 0], [0, 1],
        [0, 0], [0, 0], [0, 0], [1, 1], [1, 1],
    ])
    classes = np.array([1] * 10 + [0] * 5)

    expected = [
        1, 0, 1, 0, 0,
        0, 1, 0, 0, 1,
        0, 0, 0, 0, 0,
    ]

    new_features = MDR().fit_transform(features, classes)
    assert np.array_equal(new_features, expected)
Пример #19
0
def test_score():
    """Check the value ``MDR.score`` returns when no scoring function is supplied."""
    features = np.array([
        [2, 0], [0, 0], [0, 1], [0, 0], [0, 0],
        [0, 0], [0, 1], [0, 0], [0, 0], [0, 1],
        [0, 0], [0, 0], [0, 0], [1, 1], [1, 1],
    ])
    classes = np.array([1] * 10 + [0] * 5)

    mdr = MDR()
    mdr.fit(features, classes)

    default_score = mdr.score(features, classes)
    assert default_score == 9./15
Пример #20
0
    def test_cluster(self):
        """Check the number of distinct clusters ``MDR.hcluster`` finds on each sample page."""
        mdr = MDR()

        page = get_page('htmlpage0')
        candidates, doc = mdr.list_candidates(page, 'utf8')
        m = mdr.calculate_similarity_matrix(candidates[0])
        self.assertEqual(1, len(set(mdr.hcluster(m))))

        page1 = get_page('htmlpage1')
        candidates, doc = mdr.list_candidates(page1, 'utf8')
        m = mdr.calculate_similarity_matrix(candidates[0])
        # the first element is different from the rest
        self.assertEqual(3, len(set(mdr.hcluster(m))))
Пример #21
0
def test_mdr_fit_raise_ValueError():
    """Check that ``MDR.fit`` raises ValueError for class vectors with three labels or one label."""
    features = np.array([
        [2, 0], [0, 0], [0, 1], [0, 0], [0, 0],
        [0, 0], [0, 1], [0, 0], [0, 0], [0, 1],
        [0, 0], [0, 0], [0, 0], [1, 1], [1, 1],
    ])

    mdr = MDR()

    def fit_raises_value_error(classes):
        """Return True when fitting on *classes* raises ValueError, False when it succeeds."""
        try:
            mdr.fit(features, classes)
        except ValueError:
            return True
        return False

    # three distinct labels
    assert fit_raises_value_error(
        np.array([1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]))

    # a single label
    assert fit_raises_value_error(
        np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]))
Пример #22
0
    def __init__(self, name, loc=None, value=None, otype=None):
        """Set up arity, input/output types and payload for the node called *name*.

        ``otype`` overrides the output type looked up from *name*. An unknown
        *name* raises ``KeyError`` from the type lookup. Names containing
        ``'mdr'`` additionally get an ``MDR`` model and ``run_MDR`` as evaluator.
        """
        self.name = name

        # Number of float arguments per operator; operators not listed take none.
        float_arity = dict.fromkeys(
            ('sin', 'cos', 'exp', 'log', '^2', '^3', 'sqrt', 'if'), 1)
        float_arity.update(dict.fromkeys(
            ('ife', '+', '-', '*', '/', '>_f', '<_f', '>=_f', '<=_f',
             'xor_f', 'mdr2'), 2))

        # Number of boolean arguments per operator; operators not listed take none.
        bool_arity = dict.fromkeys(('!', 'if', 'ife'), 1)
        bool_arity.update(dict.fromkeys(
            ('&', '|', '==', '>_b', '<_b', '>=_b', '<=_b', 'xor_b'), 2))

        self.arity = {None: 0}
        self.arity['f'] = float_arity.get(name, 0)
        self.arity['b'] = bool_arity.get(name, 0)

        # Input type per operator.
        in_types = dict.fromkeys(
            ('+', '-', '*', '/', 'sin', 'cos', 'exp', 'log', '^2', '^3',
             'sqrt', '>_f', '<_f', '>=_f', '<=_f', 'xor_f', 'mdr2'), 'f')
        in_types.update(dict.fromkeys(
            ('!', '&', '|', '==', '>_b', '<_b', '>=_b', '<=_b', 'xor_b'), 'b'))
        # terminals take no input
        in_types.update(dict.fromkeys(('x', 'k'), None))
        # mixed: both float and boolean inputs
        in_types.update(dict.fromkeys(('if', 'ife'), ('f', 'b')))
        self.in_type = in_types[name]

        if otype is not None:
            self.out_type = otype
        else:
            # Output type per operator.
            out_types = dict.fromkeys(
                ('+', '-', '*', '/', 'sin', 'cos', 'exp', 'log', 'x', 'k',
                 '^2', '^3', 'sqrt', 'if', 'ife'), 'f')
            out_types.update(dict.fromkeys(
                ('!', '&', '|', '==', '>_f', '<_f', '>=_f', '<=_f', '>_b',
                 '<_b', '>=_b', '<=_b', 'xor_f', 'xor_b', 'mdr2'), 'b'))
            self.out_type = out_types[name]

        if 'mdr' in self.name:
            self.model = MDR()
            self.evaluate = run_MDR

        self.loc = loc
        self.value = value
Пример #23
0
    a5000_01h, a5000_02h, a5000_04h
]

dataset_names = [
    'a10_005h', 'a10_01h', 'a10_02h', 'a10_04h', 'a100_005h', 'a100_01h',
    'a100_02h', 'a100_04h', 'a1000_005h', 'a1000_01h', 'a1000_02h',
    'a1000_04h', 'a5000_005h', 'a5000_01h', 'a5000_02h', 'a5000_04h'
]

output_txt = '/home/ansohn/Python/venvs/mdr/gametes_logs/target_scores.txt'

# For every dataset in ``gametes_all``, fit MDR on the M0P0/M0P1 columns over
# 30 random 75/25 train/test splits and write one tab-separated line per split:
# dataset name, held-out score, and the fixed tag "mdr-perfect".
with open(output_txt, 'w') as t1:
    # Pair each dataset with its name directly instead of indexing both lists
    # with a hard-coded count.
    for dataset, dataset_name in zip(gametes_all, dataset_names):
        load_dataset = pd.read_csv(dataset, sep='\t')
        phenotype = load_dataset['Class'].values
        individuals = load_dataset[['M0P0', 'M0P1']].values

        # The repetition counter is unused, so it does not reuse a name from
        # the enclosing loop.
        for _ in range(30):

            X_train, X_test, y_train, y_test = train_test_split(
                individuals, phenotype, train_size=0.75, test_size=0.25)

            target_pipeline = MDR()
            target_pipeline.fit(X_train, y_train)

            t1.write('{}\t{}\tmdr-perfect\n'.format(
                dataset_name, target_pipeline.score(X_test, y_test)))
Пример #24
0
        for features in itertools.combinations(range(X.shape[1]), cur_n):
            mdr_model = copy.deepcopy(mdr_instance)
            mdr_model.fit(X[:, features], y)
            mdr_model_score = mdr_model.score(X[:, features], y)
            model_features = [feature_names[feature] for feature in features]
            yield mdr_model, mdr_model_score, model_features



# Feature frames passed through ``_ekf`` with ``ekf_index=0``.
xtr = _ekf(training_features, ekf_index=0)
xte = _ekf(testing_features, ekf_index=0)

mymdr = MDR()
clf = GaussianNB()

# For n = 2 and n = 3, run ``n_way_models`` over the training frame and keep
# the second and third item of every yielded entry.
n_way_results = []
n_way_features = []
for nw in (2, 3):
    m1 = n_way_models(mymdr, xtr.values, training_classes, n=[nw],
                      feature_names=list(xtr.columns))
    m2 = list(m1)

    n_way_results.extend(entry[1] for entry in m2)
    n_way_features.extend(entry[2] for entry in m2)
Пример #25
0
# Applied on top of the standard &, <, > escaping so that scraped text cannot
# introduce Django template syntax ({{ }}, {% %}, {# #}) into the generated
# template file.
_TEMPLATE_ENTITIES = {'{': '&#123;', '}': '&#125;'}


def _escape_text(text):
    """Escape *text* for use as HTML element content inside a Django template."""
    from xml.sax.saxutils import escape
    return escape(text, _TEMPLATE_ENTITIES)


def _quote_attr(value):
    """Return *value* escaped and wrapped in quotes, ready to follow ``attr=``."""
    from xml.sax.saxutils import quoteattr
    return quoteattr(value, _TEMPLATE_ENTITIES)


def _absolute_url(base, link):
    """Resolve *link* against *base*; return *link* unchanged if it cannot be joined."""
    try:  # Python 3
        from urllib.parse import urljoin
    except ImportError:  # Python 2
        from urlparse import urljoin
    try:
        return urljoin(base, link)
    except ValueError:
        return link


def _render_cell(template_node, matched, base_url):
    """Return the ``<td>`` markup for *matched*, chosen by the tag of *template_node*."""
    if template_node.tag == 'a':
        href = matched.get('href')
        if href is None:
            inner = _escape_text(matched.text or '')
        else:
            # Fall back to the raw href when the anchor has no text of its own.
            label = matched.text or href
            inner = '<a href=%s >%s</a>' % (
                _quote_attr(_absolute_url(base_url, href)), _escape_text(label))
    elif template_node.tag == 'img':
        src = matched.get('src')
        if src is None:
            inner = ''
        else:
            inner = '<img height="100" src=%s >' % _quote_attr(
                _absolute_url(base_url, src))
    else:
        inner = _escape_text(matched.text) if matched.text else ''
    return "<td>" + inner + "</td>"


def extract(request):
    """Scrape the page named by the ``url`` query parameter into an HTML table.

    The page is fetched with ``requests`` and the first MDR candidate region
    is extracted. The table has one column per descendant of the seed
    record's first tree and one row per extracted record. It is written to
    the ``output.html`` template, and the view then redirects to
    ``/output/``. Requests without a ``url``, failed fetches and pages with
    no candidate region redirect to ``index`` instead.
    """
    import codecs
    import logging

    log = logging.getLogger(__name__)

    url = request.GET.get('url')
    if not url:
        return redirect(index)

    # NOTE(review): the URL is user supplied and fetched without a timeout or
    # host restrictions -- consider adding both.
    try:
        r = requests.get(url)
    except Exception:
        # Best effort: any fetch failure sends the user back to the form.
        return redirect(index)

    mdr = MDR()
    candidates, _doc = mdr.list_candidates(r.text)
    if len(candidates) == 0:
        return redirect(index)

    seed, mappings = mdr.extract(candidates[0])

    # NOTE(review): this path is relative to the process working directory,
    # not to this module -- confirm that is where the template loader looks.
    rel_path = "templates/autoscrapper/output.html"

    # Every descendant of the seed record's first tree becomes one column.
    columns = list(seed.trees[0].iterdescendants())

    parts = [
        "{%  load static %}",
        "<html><h1>Extracted Data</h1>",
        '<link href="bootstrap.min.css" rel="stylesheet" >',
        """<link href="{%  static 'bootstrap.min.css' %} " rel="stylesheet" >""",
        """<link href="{%  static 'cover.css' %} " rel="stylesheet">""",
        '<table class="table table-bordered ">',
    ]

    # Header: the element's class attribute, or "_<tag>" when it has none.
    for node in columns:
        label = node.get('class')
        if label is None:
            label = "_%s" % (node.tag,)
        parts.append("<th>" + _escape_text(label) + "</th>")

    # Body: one row per extracted record, one cell per column.
    for i, value in enumerate(mappings.values()):
        log.debug("data item %d", i)
        parts.append("<tr>")
        for node in columns:
            try:
                val = value[node]
            except KeyError:
                # This record has no element matching the column.
                parts.append("<td></td>")
                continue
            log.debug("%s --------> %s", node.tag, val.tag)
            # Relative links are resolved against the final (post-redirect) URL.
            parts.append(_render_cell(node, val, r.url))
        parts.append("</tr>")

    parts.append("</table>")
    parts.append("</html>")

    # The whole document is built first and written in one go, so a failure
    # while rendering never leaves a truncated template or an open handle.
    with codecs.open(rel_path, 'w', encoding='utf-8') as f:
        f.write("".join(parts))

    return redirect('/output/')