def test_extract(self):
    mdr = MDR()

    page = get_page('htmlpage0')
    candidates, doc = mdr.list_candidates(page, 'utf8')
    seed_record, mappings = mdr.extract(candidates[0])
    # the record only has 1 <li> element
    self.assertEquals(1, len(seed_record))
    # div is the top element of <li>, and there are 40 items in total
    self.assertEquals(40, len(mappings))

    page1 = get_page('htmlpage1')
    candidates, doc = mdr.list_candidates(page1, 'utf8')
    seed_record, mappings = mdr.extract(candidates[0])
    # the record has 2 elements: <div class='divider-horizontal'> and <div class='hreview'>
    self.assertEquals(2, len(seed_record))
    self.assertEquals('divider-horizontal', seed_record[0].attrib.get('class'))
    self.assertEquals('hreview', seed_record[1].attrib.get('class'))
    self.assertEquals(30, len(mappings))

    fragment2 = fragment_fromstring(get_page('fragment2'))
    seed_record, mappings = mdr.extract(fragment2)
    # the record has 2 elements: <div class='row'> and <div class='row'>
    self.assertEquals(2, len(seed_record))
    self.assertEquals('row', seed_record[0].attrib.get('class'))
    self.assertEquals(7, len(mappings))
def test_detect(self):
    mdr = MDR()

    page = get_page('htmlpage0')
    candidates, doc = mdr.list_candidates(page, 'utf8')
    assert_element('ul', "ylist ylist-bordered reviews", '', candidates[0])

    page1 = get_page('htmlpage1')
    candidates, doc = mdr.list_candidates(page1, 'utf8')
    assert_element('div', "tab-pane fade in active", 'reviews', candidates[0])
def test_cluster(self):
    mdr = MDR()

    page = get_page('htmlpage0')
    candidates, doc = mdr.list_candidates(page, 'utf8')
    m = mdr.calculate_similarity_matrix(candidates[0])
    self.assertEquals(1, len(set(mdr.hcluster(m))))

    page1 = get_page('htmlpage1')
    candidates, doc = mdr.list_candidates(page1, 'utf8')
    m = mdr.calculate_similarity_matrix(candidates[0])
    # the first element is different from the rest
    self.assertEquals(3, len(set(mdr.hcluster(m))))
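# A minimal sketch (not part of the test class) of how the clustering pieces
# fit together: the similarity matrix is computed for one candidate region and
# hcluster() groups its rows into clusters.  `get_page` and the 'htmlpage1'
# fixture are the test helpers used above; treating each row of the matrix as
# one direct child of the candidate region is an assumption these tests do not
# state explicitly.
def _cluster_sketch():
    mdr = MDR()
    candidates, doc = mdr.list_candidates(get_page('htmlpage1'), 'utf8')
    m = mdr.calculate_similarity_matrix(candidates[0])
    labels = list(mdr.hcluster(m))
    print('%d rows grouped into %d clusters' % (len(labels), len(set(labels))))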
def test_extract_with_seed2(self):
    mdr = MDR()

    page1 = get_page('htmlpage1')
    candidates, doc = mdr.list_candidates(page1, 'utf8')
    seed_record = Record(candidates[0][1], candidates[0][2])

    fragment1 = fragment_fromstring(get_page('fragment1'))
    seed_record_copy, mappings = mdr.extract(fragment1, seed_record)
    # the record has 2 elements
    self.assertEquals(2, len(seed_record_copy))
    self.assertEquals('hreview', seed_record_copy[1].attrib.get('class'))
    # 27 items (records)
    self.assertEquals(27, len(mappings))

    extracted_dates = []
    extracted_texts = []
    for record, mapping in mappings.items():
        for k, v in mapping.items():
            if k.attrib.get('class') == 'dtreviewed':
                extracted_dates.append(v.text)
            elif k.attrib.get('class') == 'description':
                extracted_texts.append(v.text)

    # extracted items keep their original document order
    self.assertEquals(extracted_dates[0], '27-05-2014')
    self.assertEquals(extracted_dates[-1], '07-07-2013')
    self.assertEquals(extracted_texts[0], 'Kwaliteit van het eten matig')
    self.assertEquals(
        extracted_texts[-1],
        'Paviljoen Strand 90 te Domburg is een uiterst sfeervol restaurant. De inrichting is smaakvol met mooie kleuren. De bediening is vriendelijk en behulpzaam. Het eten was lekker. Kortom, we zullen er zeker terug komen.'
    )
def test_extract_with_seed(self):
    mdr = MDR()

    page = get_page('htmlpage0')
    candidates, doc = mdr.list_candidates(page, 'utf8')
    # we know the first element can be used as the seed
    seed_record = Record(candidates[0][0])

    fragment = fragment_fromstring(get_page('fragment0'))
    seed_record_copy, mappings = mdr.extract(fragment, seed_record)
    # the record only has 1 <li> element
    self.assertEquals(1, len(seed_record_copy))
    # 40 items (records)
    self.assertEquals(40, len(mappings))

    extracted_dates = []
    for record, mapping in mappings.items():
        for k, v in mapping.items():
            if k.attrib.get('itemprop') == 'datePublished':
                extracted_dates.append(v.attrib.get('content'))

    self.assertEquals(extracted_dates[0], '2014-07-02')
    self.assertEquals(extracted_dates[-1], '2014-05-18')
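# A minimal end-to-end sketch (not part of the test suite) of the seed-record
# workflow the two tests above exercise: detect candidate regions on one page,
# wrap a known-good candidate in a Record, then reuse that Record to extract
# the same kind of records from another page built from the same template.
# The URLs are hypothetical placeholders; MDR, Record and the `requests`
# dependency are used the same way as elsewhere in this project.
def _seed_reuse_sketch():
    import requests

    mdr = MDR()

    # page A: detect repeating regions and pick a seed record
    listing_html = requests.get('http://example.com/reviews?page=1').text
    candidates, doc = mdr.list_candidates(listing_html, 'utf8')
    seed_record = Record(candidates[0][0])

    # page B (same template): align its records against the seed
    other_html = requests.get('http://example.com/reviews?page=2').text
    candidates_b, doc_b = mdr.list_candidates(other_html, 'utf8')
    seed_copy, mappings = mdr.extract(candidates_b[0], seed_record)

    for record, mapping in mappings.items():
        for seed_elem, matched_elem in mapping.items():
            print('%s -> %r' % (seed_elem.tag, matched_elem.text))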
def extract(request):
    if not request.GET.get('url'):
        return redirect(index)

    url = request.GET['url']
    mdr = MDR()
    try:
        r = requests.get(url)
        parsed_uri = urlparse(url)
    except requests.RequestException:
        return redirect(index)
    domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)

    candidates, doc = mdr.list_candidates(r.text)
    seed, mappings = mdr.extract(candidates[0])

    # render the extracted records into a static output template
    script_dir = os.path.dirname(__file__)  # absolute directory this module lives in
    rel_path = "templates/autoscrapper/output.html"
    abs_file_path = os.path.join(script_dir, rel_path)

    x = seed.trees[0]
    values = list(mappings.values())

    with open(abs_file_path, 'w') as f:
        f.write("{% load static %}")
        f.write("<html><h1>Extracted Data</h1>")
        f.write("""<link href="{% static 'bootstrap.min.css' %}" rel="stylesheet">""")
        f.write("""<link href="{% static 'cover.css' %}" rel="stylesheet">""")
        f.write('<table class="table table-bordered">')

        # header row: one column per element of the seed tree,
        # labelled by its class attribute (or "_<tag>" when it has none)
        f.write("<tr>")
        for k in x.iterdescendants():
            f.write("<th>")
            f.write(k.attrib.get('class', '_' + k.tag))
            f.write("</th>")
        f.write("</tr>")

        # one table row per extracted record, aligned against the seed elements
        for value in values:
            f.write("<tr>")
            for k in x.iterdescendants():
                val = value.get(k)
                if val is None:
                    f.write("<td></td>")
                    continue
                f.write("<td>")
                if k.tag == 'a':
                    href = val.attrib.get('href', '')
                    text = val.text or href
                    if not href.startswith('http'):
                        href = domain + href
                    f.write('<a href="' + href + '">' + text + '</a>')
                elif k.tag == 'img':
                    src = val.attrib.get('src', '')
                    if not src.startswith('http'):
                        src = domain + src
                    f.write('<img height="100" src="' + src + '">')
                elif val.text:
                    f.write(val.text)
                f.write("</td>")
            f.write("</tr>")

        f.write("</table>")
        f.write("</html>")

    return redirect('/output/')
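# The view above resolves relative hrefs/srcs by concatenating them onto the
# site root, which misbehaves for path-relative links ("../img/x.png") and
# protocol-relative ones ("//cdn.example.com/x.js").  A sketch of a more
# robust alternative using the standard library's urljoin, resolving against
# the URL that was actually fetched; `page_url` and `raw_link` are placeholder
# names, not part of the view above.
try:
    from urllib.parse import urljoin   # Python 3
except ImportError:
    from urlparse import urljoin       # Python 2

def absolutize(page_url, raw_link):
    """Resolve a possibly-relative link against the URL the page came from."""
    return urljoin(page_url, raw_link)

# Example: absolutize('http://example.com/reviews/page1.html', '../img/logo.png')
# returns 'http://example.com/img/logo.png'.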