示例#1
0
    def test_scrapt_body(self):
        """scrap_body returns the markup nested in the ``index,en`` div for 'en'."""
        html = u"""<html><header></header><body><div class="content"><div class="index,en"><div class="title">Crazy <i>Title</i></div><p>Crazy Body</p><p>Really Crazy Body</p></div></div></body></html>"""
        # Only the children of the language div are expected back, not the
        # wrapping ``content`` / ``index,en`` containers.
        expected = '<div class="title">Crazy <i>Title</i></div><p>Crazy Body</p><p>Really Crazy Body</p>'

        self.assertEqual(load_body.scrap_body(html, 'en'), expected)
示例#2
0
    def test_scrapt_body_not_found_for_a_given_language(self):
        """scrap_body returns None when the requested language is absent.

        The document only carries an ``index,en`` section while 'pt' is
        requested. The markup is handed over as UTF-8 encoded bytes.
        """
        data = u"""<html><header></header><body><div class="content"><div class="index,en"><div class="title">Crazy <i>Title</i></div><p>Crazy Body</p><p>Really Crazy Body</p></div></div></body></html>"""

        result = load_body.scrap_body(data.encode('utf-8'), 'pt')

        # Identity check: only the None singleton itself is acceptable, not
        # an object that merely compares equal to None.
        self.assertIsNone(result)
示例#3
0
    def test_scrapt_body_not_found(self):
        """scrap_body returns None when the ``content`` div holds no sections."""
        data = u"""<html><header></header><body><div class="content"></div></body></html>"""

        result = load_body.scrap_body(data, 'pt')

        # Identity check: only the None singleton itself is acceptable, not
        # an object that merely compares equal to None.
        self.assertIsNone(result)
示例#4
0
    def test_scrapt_body_not_found_for_a_given_language(self):
        """scrap_body returns None when the requested language is absent.

        The document only carries an ``index,en`` section while 'pt' is
        requested.
        """
        data = u"""<html><header></header><body><div class="content"><div class="index,en"><div class="title">Crazy <i>Title</i></div><p>Crazy Body</p><p>Really Crazy Body</p></div></div></body></html>"""

        result = load_body.scrap_body(data, 'pt')

        # Identity check: only the None singleton itself is acceptable, not
        # an object that merely compares equal to None.
        self.assertIsNone(result)
示例#5
0
    def test_scrapt_body(self):
        """For 'en', scrap_body yields the inner markup of the ``index,en`` div."""
        document = u"""<html><header></header><body><div class="content"><div class="index,en"><div class="title">Crazy <i>Title</i></div><p>Crazy Body</p><p>Really Crazy Body</p></div></div></body></html>"""
        # The enclosing ``content`` and ``index,en`` divs must be dropped;
        # only their nested title and paragraphs are expected.
        wanted = '<div class="title">Crazy <i>Title</i></div><p>Crazy Body</p><p>Really Crazy Body</p>'

        scraped = load_body.scrap_body(document, 'en')
        self.assertEqual(scraped, wanted)
示例#6
0
    def test_scrapt_body_not_found(self):
        """scrap_body returns None when the ``content`` div holds no sections.

        The markup is handed over as UTF-8 encoded bytes.
        """
        data = u"""<html><header></header><body><div class="content"></div></body></html>"""

        result = load_body.scrap_body(data.encode('utf-8'), 'pt')

        # Identity check: only the None singleton itself is acceptable, not
        # an object that merely compares equal to None.
        self.assertIsNone(result)
示例#7
0
    def test_body_sample_7(self):
        """scrap_body keeps both ends of fixture ``body_sample_7.html`` ('en').

        The fixture is decoded as UTF-8, every line is stripped and the lines
        are joined with single spaces before being scraped.
        """
        # os.path.join keeps the path relative when dirname(__file__) is '';
        # plain concatenation would produce the absolute '/fixtures/...'.
        fixture = os.path.join(
            os.path.dirname(__file__), 'fixtures', 'body_sample_7.html')

        # The context manager closes the handle even if reading fails.
        with codecs.open(fixture, 'r', encoding='utf-8') as handle:
            data = ' '.join([line.strip() for line in handle.readlines()])

        result = load_body.scrap_body(data, 'en')

        # Text at the beginning of the document
        self.assertIn(u'caso da bacia    do Amazonas', result)
        # Text at the end of the document
        self.assertIn(u'com o Embasamento. Universidade Federal', result)
示例#8
0
    def test_body_sample_6(self):
        """scrap_body keeps both ends of fixture ``body_sample_6.html`` ('pt').

        The fixture is decoded as UTF-8, every line is stripped and the lines
        are joined with single spaces before being scraped.
        """
        # os.path.join keeps the path relative when dirname(__file__) is '';
        # plain concatenation would produce the absolute '/fixtures/...'.
        fixture = os.path.join(
            os.path.dirname(__file__), 'fixtures', 'body_sample_6.html')

        # The context manager closes the handle even if reading fails.
        with codecs.open(fixture, 'r', encoding='utf-8') as handle:
            data = ' '.join([line.strip() for line in handle.readlines()])

        result = load_body.scrap_body(data, 'pt')

        # Text at the beginning of the document
        self.assertIn(u'Editorial', result)
        # Text at the end of the document
        self.assertIn(u'Boa leitura!', result)
示例#9
0
    def test_body_sample_5(self):
        """scrap_body keeps both ends of fixture ``body_sample_5.html`` ('pt').

        The fixture is decoded as UTF-8, every line is stripped and the lines
        are joined with single spaces before being scraped.
        """
        # os.path.join keeps the path relative when dirname(__file__) is '';
        # plain concatenation would produce the absolute '/fixtures/...'.
        fixture = os.path.join(
            os.path.dirname(__file__), 'fixtures', 'body_sample_5.html')

        # The context manager closes the handle even if reading fails.
        with codecs.open(fixture, 'r', encoding='utf-8') as handle:
            data = ' '.join([line.strip() for line in handle.readlines()])

        result = load_body.scrap_body(data, 'pt')

        # Text at the beginning of the document
        self.assertIn(u'Molestia de Carlos Chagas', result)
        # Text at the end of the document
        self.assertIn(u'Full text available only in PDF format', result)
示例#10
0
    def test_body_sample_4(self):
        """scrap_body keeps both ends of fixture ``body_sample_4.html`` ('pt').

        The fixture is decoded as UTF-8, every line is stripped and the lines
        are joined with single spaces before being scraped.
        """
        # os.path.join keeps the path relative when dirname(__file__) is '';
        # plain concatenation would produce the absolute '/fixtures/...'.
        fixture = os.path.join(
            os.path.dirname(__file__), 'fixtures', 'body_sample_4.html')

        # The context manager closes the handle even if reading fails.
        with codecs.open(fixture, 'r', encoding='utf-8') as handle:
            data = ' '.join([line.strip() for line in handle.readlines()])

        result = load_body.scrap_body(data, 'pt')

        # Text at the beginning of the document
        self.assertIn(u'Aquarelas de um Brasil', result)
        # Text at the end of the document
        self.assertIn(u'São Paulo, Companhia das Letras.', result)
示例#11
0
    def test_body_sample_3(self):
        """scrap_body keeps both ends of fixture ``body_sample_3.html`` ('pt').

        The fixture is decoded as UTF-8, every line is stripped and the lines
        are joined with single spaces before being scraped.
        """
        # os.path.join keeps the path relative when dirname(__file__) is '';
        # plain concatenation would produce the absolute '/fixtures/...'.
        fixture = os.path.join(
            os.path.dirname(__file__), 'fixtures', 'body_sample_3.html')

        # The context manager closes the handle even if reading fails.
        with codecs.open(fixture, 'r', encoding='utf-8') as handle:
            data = ' '.join([line.strip() for line in handle.readlines()])

        result = load_body.scrap_body(data, 'pt')

        # Text at the beginning of the document
        self.assertIn(u'A TRIBUTAÇÃO NA PRODUÇÃO DE CARVÃO VEGETAL', result)
        # Text at the end of the document
        self.assertIn(u'Recebido: 03 de Fevereiro de 2012; Aceito: 14 de Abril de 2014', result)
示例#12
0
    def test_body_sample_2(self):
        """scrap_body keeps both ends of fixture ``body_sample_2.html`` ('pt').

        The fixture is decoded as UTF-8, every line is stripped and the lines
        are joined with single spaces before being scraped.
        """
        # os.path.join keeps the path relative when dirname(__file__) is '';
        # plain concatenation would produce the absolute '/fixtures/...'.
        fixture = os.path.join(
            os.path.dirname(__file__), 'fixtures', 'body_sample_2.html')

        # The context manager closes the handle even if reading fails.
        with codecs.open(fixture, 'r', encoding='utf-8') as handle:
            data = ' '.join([line.strip() for line in handle.readlines()])

        result = load_body.scrap_body(data, 'pt')

        # Text at the beginning of the document
        self.assertIn(u'meio para isolamento de', result)
        # Text at the end of the document
        self.assertIn(u'Recebido    para publicação em 31-7-1967', result)
示例#13
0
    def test_body_sample_4(self):
        """Scraping fixture ``body_sample_4.html`` for 'pt' preserves its ends.

        The fixture is read as UTF-8; its stripped lines are joined with a
        single space to form the input handed to scrap_body.
        """
        # Build the path with os.path.join: string concatenation turns an
        # empty dirname(__file__) into the absolute path '/fixtures/...'.
        fixture_path = os.path.join(
            os.path.dirname(__file__), 'fixtures', 'body_sample_4.html')

        # ``with`` guarantees the fixture file is closed after reading.
        with codecs.open(fixture_path, 'r', encoding='utf-8') as fixture:
            data = ' '.join([row.strip() for row in fixture.readlines()])

        result = load_body.scrap_body(data, 'pt')

        # Text at the beginning of the document
        self.assertIn(u'Aquarelas de um Brasil', result)
        # Text at the end of the document
        self.assertIn(u'São Paulo, Companhia das Letras.', result)
示例#14
0
    def test_body_sample_2(self):
        """Scraping fixture ``body_sample_2.html`` for 'pt' preserves its ends.

        The fixture is read as UTF-8; its stripped lines are joined with a
        single space to form the input handed to scrap_body.
        """
        # Build the path with os.path.join: string concatenation turns an
        # empty dirname(__file__) into the absolute path '/fixtures/...'.
        fixture_path = os.path.join(
            os.path.dirname(__file__), 'fixtures', 'body_sample_2.html')

        # ``with`` guarantees the fixture file is closed after reading.
        with codecs.open(fixture_path, 'r', encoding='utf-8') as fixture:
            data = ' '.join([row.strip() for row in fixture.readlines()])

        result = load_body.scrap_body(data, 'pt')

        # Text at the beginning of the document
        self.assertIn(u'meio para isolamento de', result)
        # Text at the end of the document
        self.assertIn(u'Recebido    para publicação em 31-7-1967', result)
示例#15
0
    def test_body_sample_7(self):
        """Scraping fixture ``body_sample_7.html`` for 'en' preserves its ends.

        The fixture is read as UTF-8; its stripped lines are joined with a
        single space to form the input handed to scrap_body.
        """
        # Build the path with os.path.join: string concatenation turns an
        # empty dirname(__file__) into the absolute path '/fixtures/...'.
        fixture_path = os.path.join(
            os.path.dirname(__file__), 'fixtures', 'body_sample_7.html')

        # ``with`` guarantees the fixture file is closed after reading.
        with codecs.open(fixture_path, 'r', encoding='utf-8') as fixture:
            data = ' '.join([row.strip() for row in fixture.readlines()])

        result = load_body.scrap_body(data, 'en')

        # Text at the beginning of the document
        self.assertIn(u'caso da bacia    do Amazonas', result)
        # Text at the end of the document
        self.assertIn(u'com o Embasamento. Universidade Federal', result)
示例#16
0
    def test_body_sample_6(self):
        """Scraping fixture ``body_sample_6.html`` for 'pt' preserves its ends.

        The fixture is read as UTF-8; its stripped lines are joined with a
        single space to form the input handed to scrap_body.
        """
        # Build the path with os.path.join: string concatenation turns an
        # empty dirname(__file__) into the absolute path '/fixtures/...'.
        fixture_path = os.path.join(
            os.path.dirname(__file__), 'fixtures', 'body_sample_6.html')

        # ``with`` guarantees the fixture file is closed after reading.
        with codecs.open(fixture_path, 'r', encoding='utf-8') as fixture:
            data = ' '.join([row.strip() for row in fixture.readlines()])

        result = load_body.scrap_body(data, 'pt')

        # Text at the beginning of the document
        self.assertIn(u'Editorial', result)
        # Text at the end of the document
        self.assertIn(u'Boa leitura!', result)
示例#17
0
    def test_body_sample_5(self):
        """Scraping fixture ``body_sample_5.html`` for 'pt' preserves its ends.

        The fixture is read as UTF-8; its stripped lines are joined with a
        single space to form the input handed to scrap_body.
        """
        # Build the path with os.path.join: string concatenation turns an
        # empty dirname(__file__) into the absolute path '/fixtures/...'.
        fixture_path = os.path.join(
            os.path.dirname(__file__), 'fixtures', 'body_sample_5.html')

        # ``with`` guarantees the fixture file is closed after reading.
        with codecs.open(fixture_path, 'r', encoding='utf-8') as fixture:
            data = ' '.join([row.strip() for row in fixture.readlines()])

        result = load_body.scrap_body(data, 'pt')

        # Text at the beginning of the document
        self.assertIn(u'Molestia de Carlos Chagas', result)
        # Text at the end of the document
        self.assertIn(u'Full text available only in PDF format', result)
示例#18
0
    def test_body_sample_1(self):
        """scrap_body keeps both ends of fixture ``body_sample_1.html`` ('pt').

        The fixture is decoded as UTF-8, every line is stripped and the lines
        are joined with single spaces before being scraped.
        """
        # os.path.join keeps the path relative when dirname(__file__) is '';
        # plain concatenation would produce the absolute '/fixtures/...'.
        fixture = os.path.join(
            os.path.dirname(__file__), 'fixtures', 'body_sample_1.html')

        # The context manager closes the handle even if reading fails.
        with codecs.open(fixture, 'r', encoding='utf-8') as handle:
            data = ' '.join([line.strip() for line in handle.readlines()])

        result = load_body.scrap_body(data, 'pt')

        # Text at the beginning of the document
        self.assertIn(u'On the one pot syntheses', result)
        # Text at the end of the document
        self.assertIn(u'Web Release Date: November 26, 2009', result)
示例#19
0
    def test_body_sample_1(self):
        """Scraping fixture ``body_sample_1.html`` for 'pt' preserves its ends.

        The fixture is read as UTF-8; its stripped lines are joined with a
        single space to form the input handed to scrap_body.
        """
        # Build the path with os.path.join: string concatenation turns an
        # empty dirname(__file__) into the absolute path '/fixtures/...'.
        fixture_path = os.path.join(
            os.path.dirname(__file__), 'fixtures', 'body_sample_1.html')

        # ``with`` guarantees the fixture file is closed after reading.
        with codecs.open(fixture_path, 'r', encoding='utf-8') as fixture:
            data = ' '.join([row.strip() for row in fixture.readlines()])

        result = load_body.scrap_body(data, 'pt')

        # Text at the beginning of the document
        self.assertIn(u'On the one pot syntheses', result)
        # Text at the end of the document
        self.assertIn(u'Web Release Date: November 26, 2009', result)
示例#20
0
    def test_body_sample_3(self):
        """Scraping fixture ``body_sample_3.html`` for 'pt' preserves its ends.

        The fixture is read as UTF-8; its stripped lines are joined with a
        single space to form the input handed to scrap_body.
        """
        # Build the path with os.path.join: string concatenation turns an
        # empty dirname(__file__) into the absolute path '/fixtures/...'.
        fixture_path = os.path.join(
            os.path.dirname(__file__), 'fixtures', 'body_sample_3.html')

        # ``with`` guarantees the fixture file is closed after reading.
        with codecs.open(fixture_path, 'r', encoding='utf-8') as fixture:
            data = ' '.join([row.strip() for row in fixture.readlines()])

        result = load_body.scrap_body(data, 'pt')

        # Text at the beginning of the document
        self.assertIn(u'A TRIBUTAÇÃO NA PRODUÇÃO DE CARVÃO VEGETAL', result)
        # Text at the end of the document
        self.assertIn(u'Recebido: 03 de Fevereiro de 2012; Aceito: 14 de Abril de 2014', result)