Пример #1
0
    def test_extract(self):
        """Check seed-record size and record count for the sample inputs.

        Uses ``assertEqual``; the ``assertEquals`` alias is deprecated and
        was removed from ``unittest`` in Python 3.12.
        """
        mdr = MDR()

        page = get_page('htmlpage0')
        candidates, _ = mdr.list_candidates(page, 'utf8')
        seed_record, mappings = mdr.extract(candidates[0])

        # the record consists of a single <li> element
        self.assertEqual(1, len(seed_record))

        # div is the top element of <li>, and there are 40 items in total
        self.assertEqual(40, len(mappings))

        page1 = get_page('htmlpage1')
        candidates, _ = mdr.list_candidates(page1, 'utf8')
        seed_record, mappings = mdr.extract(candidates[0])

        # the record has 2 elements: <div class='divider-horizontal'> and
        # <div class='hreview'>
        self.assertEqual(2, len(seed_record))
        self.assertEqual('divider-horizontal', seed_record[0].attrib.get('class'))
        self.assertEqual('hreview', seed_record[1].attrib.get('class'))

        self.assertEqual(30, len(mappings))

        fragment2 = fragment_fromstring(get_page('fragment2'))
        seed_record, mappings = mdr.extract(fragment2)

        # the record has 2 elements: <div class='row'> and <div class='row'>
        self.assertEqual(2, len(seed_record))
        self.assertEqual('row', seed_record[0].attrib.get('class'))
        self.assertEqual(7, len(mappings))
Пример #2
0
    def test_extract(self):
        """Check seed-record size and record count for the sample inputs.

        Uses ``assertEqual``; the ``assertEquals`` alias is deprecated and
        was removed from ``unittest`` in Python 3.12.
        """
        mdr = MDR()

        page = get_page('htmlpage0')
        candidates, _ = mdr.list_candidates(page, 'utf8')
        seed_record, mappings = mdr.extract(candidates[0])

        # the record consists of a single <li> element
        self.assertEqual(1, len(seed_record))

        # div is the top element of <li>, and there are 40 items in total
        self.assertEqual(40, len(mappings))

        page1 = get_page('htmlpage1')
        candidates, _ = mdr.list_candidates(page1, 'utf8')
        seed_record, mappings = mdr.extract(candidates[0])

        # the record has 2 elements: <div class='divider-horizontal'> and
        # <div class='hreview'>
        self.assertEqual(2, len(seed_record))
        self.assertEqual('divider-horizontal',
                         seed_record[0].attrib.get('class'))
        self.assertEqual('hreview', seed_record[1].attrib.get('class'))

        self.assertEqual(30, len(mappings))

        fragment2 = fragment_fromstring(get_page('fragment2'))
        seed_record, mappings = mdr.extract(fragment2)

        # the record has 2 elements: <div class='row'> and <div class='row'>
        self.assertEqual(2, len(seed_record))
        self.assertEqual('row', seed_record[0].attrib.get('class'))
        self.assertEqual(7, len(mappings))
Пример #3
0
    def test_detect(self):
        """Check the first candidate detected on each sample page.

        Each case pairs a page name with the three values handed to
        ``assert_element`` ahead of the candidate (presumably the expected
        tag, class and id -- confirm against the helper's definition).
        """
        detector = MDR()
        cases = (
            ('htmlpage0', 'ul', "ylist ylist-bordered reviews", ''),
            ('htmlpage1', 'div', "tab-pane fade in active", 'reviews'),
        )

        for page_name, first, second, third in cases:
            markup = get_page(page_name)
            found, _ = detector.list_candidates(markup, 'utf8')
            assert_element(first, second, third, found[0])
Пример #4
0
    def test_cluster(self):
        """Check how many distinct clusters ``hcluster`` yields per page.

        Uses ``assertEqual``; the ``assertEquals`` alias is deprecated and
        was removed from ``unittest`` in Python 3.12.
        """
        mdr = MDR()

        page = get_page('htmlpage0')
        candidates, _ = mdr.list_candidates(page, 'utf8')
        m = mdr.calculate_similarity_matrix(candidates[0])
        self.assertEqual(1, len(set(mdr.hcluster(m))))

        page1 = get_page('htmlpage1')
        candidates, _ = mdr.list_candidates(page1, 'utf8')
        m = mdr.calculate_similarity_matrix(candidates[0])
        # the first element is different from the rest
        self.assertEqual(3, len(set(mdr.hcluster(m))))
Пример #5
0
    def test_cluster(self):
        """Check how many distinct clusters ``hcluster`` yields per page.

        Uses ``assertEqual``; the ``assertEquals`` alias is deprecated and
        was removed from ``unittest`` in Python 3.12.
        """
        mdr = MDR()

        page = get_page('htmlpage0')
        candidates, _ = mdr.list_candidates(page, 'utf8')
        m = mdr.calculate_similarity_matrix(candidates[0])
        self.assertEqual(1, len(set(mdr.hcluster(m))))

        page1 = get_page('htmlpage1')
        candidates, _ = mdr.list_candidates(page1, 'utf8')
        m = mdr.calculate_similarity_matrix(candidates[0])
        # the first element is different from the rest
        self.assertEqual(3, len(set(mdr.hcluster(m))))
Пример #6
0
    def test_extract_with_seed2(self):
        """Extract from ``fragment1`` using a two-element seed record.

        The seed is built from the second and third children of the first
        candidate found on ``htmlpage1``.  Uses ``assertEqual``; the
        ``assertEquals`` alias was removed from ``unittest`` in Python 3.12.
        """
        mdr = MDR()
        page1 = get_page('htmlpage1')
        candidates, _ = mdr.list_candidates(page1, 'utf8')
        seed_record = Record(candidates[0][1], candidates[0][2])

        fragment1 = fragment_fromstring(get_page('fragment1'))
        seed_record_copy, mappings = mdr.extract(fragment1, seed_record)

        self.assertEqual(2, len(seed_record_copy))
        self.assertEqual('hreview', seed_record_copy[1].attrib.get('class'))
        # 27 items (records)
        self.assertEqual(27, len(mappings))

        extracted_dates = []
        extracted_texts = []

        # Only the per-record mapping is needed; the record key is unused.
        for mapping in mappings.values():
            for k, v in mapping.items():
                css_class = k.attrib.get('class')
                if css_class == 'dtreviewed':
                    extracted_dates.append(v.text)
                elif css_class == 'description':
                    extracted_texts.append(v.text)

        # extracted items are sorted in original order
        self.assertEqual(extracted_dates[0], '27-05-2014')
        self.assertEqual(extracted_dates[-1], '07-07-2013')
        self.assertEqual(extracted_texts[0], 'Kwaliteit van het eten matig')
        self.assertEqual(
            extracted_texts[-1],
            'Paviljoen Strand 90 te Domburg is een uiterst sfeervol restaurant. De inrichting is smaakvol met mooie kleuren. De bediening is vriendelijk en behulpzaam. Het eten was lekker. Kortom, we zullen er zeker terug komen.'
        )
Пример #7
0
    def test_extract_with_seed(self):
        """Extract from ``fragment0`` using a single-element seed record.

        Uses ``assertEqual``; the ``assertEquals`` alias is deprecated and
        was removed from ``unittest`` in Python 3.12.
        """
        mdr = MDR()

        page = get_page('htmlpage0')
        candidates, _ = mdr.list_candidates(page, 'utf8')
        # we know the first element can be used as seed
        seed_record = Record(candidates[0][0])

        fragment = fragment_fromstring(get_page('fragment0'))
        seed_record_copy, mappings = mdr.extract(fragment, seed_record)

        # the record consists of a single <li> element
        self.assertEqual(1, len(seed_record_copy))
        # 40 items (records)
        self.assertEqual(40, len(mappings))

        extracted_dates = []

        # Only the per-record mapping is needed; the record key is unused.
        for mapping in mappings.values():
            for k, v in mapping.items():
                if k.attrib.get('itemprop') == 'datePublished':
                    extracted_dates.append(v.attrib.get('content'))

        self.assertEqual(extracted_dates[0], '2014-07-02')
        self.assertEqual(extracted_dates[-1], '2014-05-18')
Пример #8
0
    def test_extract_with_seed2(self):
        """Extract from ``fragment1`` using a two-element seed record.

        The seed is built from the second and third children of the first
        candidate found on ``htmlpage1``.

        Compatibility fixes: ``assertEqual`` replaces the deprecated
        ``assertEquals`` alias, and ``values()`` / ``items()`` replace the
        Python 2 only ``iteritems()`` so the test runs on Python 2 and 3.
        """
        mdr = MDR()
        page1 = get_page('htmlpage1')
        candidates, _ = mdr.list_candidates(page1, 'utf8')
        seed_record = Record(candidates[0][1], candidates[0][2])

        fragment1 = fragment_fromstring(get_page('fragment1'))
        seed_record_copy, mappings = mdr.extract(fragment1, seed_record)

        self.assertEqual(2, len(seed_record_copy))
        self.assertEqual('hreview', seed_record_copy[1].attrib.get('class'))
        # 27 items (records)
        self.assertEqual(27, len(mappings))

        extracted_dates = []
        extracted_texts = []

        # Only the per-record mapping is needed; the record key is unused.
        for mapping in mappings.values():
            for k, v in mapping.items():
                css_class = k.attrib.get('class')
                if css_class == 'dtreviewed':
                    extracted_dates.append(v.text)
                elif css_class == 'description':
                    extracted_texts.append(v.text)

        # extracted items are sorted in original order
        self.assertEqual(extracted_dates[0], '27-05-2014')
        self.assertEqual(extracted_dates[-1], '07-07-2013')
        self.assertEqual(extracted_texts[0], 'Kwaliteit van het eten matig')
        self.assertEqual(extracted_texts[-1], 'Paviljoen Strand 90 te Domburg is een uiterst sfeervol restaurant. De inrichting is smaakvol met mooie kleuren. De bediening is vriendelijk en behulpzaam. Het eten was lekker. Kortom, we zullen er zeker terug komen.')
Пример #9
0
    def test_extract_with_seed(self):
        """Extract from ``fragment0`` using a single-element seed record.

        Compatibility fixes: ``assertEqual`` replaces the deprecated
        ``assertEquals`` alias, and ``values()`` / ``items()`` replace the
        Python 2 only ``iteritems()`` so the test runs on Python 2 and 3.
        """
        mdr = MDR()

        page = get_page('htmlpage0')
        candidates, _ = mdr.list_candidates(page, 'utf8')
        # we know the first element can be used as seed
        seed_record = Record(candidates[0][0])

        fragment = fragment_fromstring(get_page('fragment0'))
        seed_record_copy, mappings = mdr.extract(fragment, seed_record)

        # the record consists of a single <li> element
        self.assertEqual(1, len(seed_record_copy))
        # 40 items (records)
        self.assertEqual(40, len(mappings))

        extracted_dates = []

        # Only the per-record mapping is needed; the record key is unused.
        for mapping in mappings.values():
            for k, v in mapping.items():
                if k.attrib.get('itemprop') == 'datePublished':
                    extracted_dates.append(v.attrib.get('content'))

        self.assertEqual(extracted_dates[0], '2014-07-02')
        self.assertEqual(extracted_dates[-1], '2014-05-18')
Пример #10
0
def _escape_markup(text):
    """Return *text* escaped for inclusion in the generated template.

    Besides the HTML special characters and both quote characters, curly
    braces are replaced by character references so scraped content can never
    be read as Django template syntax (``{{ ... }}`` / ``{% ... %}``) when
    the output file is rendered.
    """
    # Function-scope import keeps this block self-contained.
    from xml.sax.saxutils import escape

    return escape(text, {
        '"': '&quot;',
        "'": '&#39;',
        '{': '&#123;',
        '}': '&#125;',
    })


def _absolute_url(base_url, link):
    """Resolve *link* against *base_url*; return '' unless it is http(s).

    Rejecting every other scheme keeps ``javascript:`` and similar URLs
    taken from the scraped page out of the generated markup.
    """
    try:
        from urllib.parse import urljoin  # Python 3
    except ImportError:
        from urlparse import urljoin  # Python 2

    if not link:
        return ''
    resolved = urljoin(base_url, link)
    if resolved.lower().startswith(('http://', 'https://')):
        return resolved
    return ''


def _render_cell(seed_node, matched, base_url):
    """Return the inner HTML of the table cell showing *matched*.

    The tag of *seed_node* selects the rendering: a link for ``a``, an image
    for ``img``, and the escaped element text for everything else.
    """
    if seed_node.tag == 'a':
        href = matched.attrib.get('href', '')
        # Fall back to the raw href when the anchor has no direct text.
        label = _escape_markup(matched.text or href)
        target = _absolute_url(base_url, href)
        if not target:
            return label
        return '<a href="' + _escape_markup(target) + '" >' + label + '</a>'

    if seed_node.tag == 'img':
        source = _absolute_url(base_url, matched.attrib.get('src', ''))
        if not source:
            return ''
        return '<img height="100" src="' + _escape_markup(source) + '" >'

    return _escape_markup(matched.text) if matched.text else ''


def extract(request):
    """Scrape the page named by ``?url=`` and write it out as an HTML table.

    The page is downloaded with ``requests``, the first candidate returned by
    ``MDR.list_candidates`` is passed to ``MDR.extract``, and one table row
    per extracted mapping is written to
    ``templates/autoscrapper/output.html``.  Columns are the element
    descendants of the first seed tree; a column is labelled with the
    element's ``class`` attribute, or ``_<tag>`` when it has none.

    Returns a redirect to ``/output/`` on success.  Redirects to ``index``
    when the ``url`` parameter is missing or empty, when the download fails,
    or when no candidate is found.

    Changes from the previous implementation:
      * link text is read from the matched element (it used to reference an
        undefined name, so the href was always shown instead);
      * a column without a class no longer gets ``</th>`` twice, the ``<h1>``
        is closed, and header cells are wrapped in a ``<tr>``;
      * scraped text and attribute values are escaped before being written;
      * relative links are resolved with ``urljoin`` against the requested
        URL instead of being appended to ``scheme://host/``;
      * a missing ``href``/``src`` or empty element text no longer cuts the
        row short;
      * the file is written as UTF-8 in a ``with`` block;
      * Python 2 only constructs were replaced by forms valid on 2 and 3.
    """
    import io
    import logging

    logger = logging.getLogger(__name__)

    url = request.GET.get('url')
    if not url:
        return redirect(index)

    mdr = MDR()

    # NOTE(security): the URL comes straight from the query string, so this
    # view fetches arbitrary addresses on behalf of the caller (SSRF risk).
    # Restrict the allowed hosts before exposing it publicly.
    try:
        # A timeout stops an unresponsive host from blocking the worker.
        r = requests.get(url, timeout=30)
    except (requests.RequestException, ValueError):
        return redirect(index)

    candidates, _ = mdr.list_candidates(r.text)
    try:
        best_candidate = candidates[0]
    except IndexError:
        # Nothing that looks like a data region was detected on the page.
        return redirect(index)

    seed, mappings = mdr.extract(best_candidate)

    # NOTE(review): this path is resolved against the current working
    # directory.  The previous code also computed an absolute path next to
    # this module but never used it -- confirm which location is intended.
    # NOTE(review): every request overwrites the same template file, so
    # concurrent requests will clobber each other's output.
    rel_path = "templates/autoscrapper/output.html"

    # Comments and processing instructions expose a callable as ``tag`` in
    # lxml; they cannot be labelled or rendered, so they get no column.
    columns = [node for node in seed.trees[0].iterdescendants()
               if not callable(node.tag)]

    parts = [
        "{%  load static %}",
        "<html><h1>Extracted Data</h1>",
        '<link href="bootstrap.min.css" rel="stylesheet" >',
        """<link href="{%  static 'bootstrap.min.css' %} " rel="stylesheet" >""",
        """<link href="{%  static 'cover.css' %} " rel="stylesheet">""",
        '<table class="table table-bordered ">',
    ]

    # Header row: one cell per seed element.
    parts.append("<tr>")
    for node in columns:
        css_class = node.attrib.get('class')
        label = css_class if css_class is not None else "_" + node.tag
        parts.append("<th>" + _escape_markup(label) + "</th>")
    parts.append("</tr>")

    # Data rows: one per extracted record, cells in the same column order.
    for i, value in enumerate(mappings.values()):
        logger.debug("data item %d", i)
        parts.append("<tr>")
        for node in columns:
            matched = value.get(node)
            if matched is None:
                # This record has no element aligned with the seed element.
                parts.append("<td></td>")
                continue
            logger.debug("%s --------> %s", node.tag, matched.tag)
            parts.append("<td>" + _render_cell(node, matched, url) + "</td>")
        parts.append("</tr>")

    parts.append("</table>")
    parts.append("</html>")

    # io.open with an explicit encoding behaves the same on Python 2 and 3
    # and keeps non-ASCII scraped text from raising on write.
    with io.open(rel_path, 'w', encoding='utf-8') as out:
        out.write(u"".join(parts))

    return redirect('/output/')