Example #1
# Assumes: import re, import ebooklib, from ebooklib import epub, plus a
# module-level logger and a MyHTMLParser in scope (sketched below).
async def read_book(self, book_file):
    # Break sentences onto their own lines: a period followed by whitespace.
    sentence_break = re.compile(r'\.\s+')
    self._book = epub.read_epub(book_file)
    for item in self._book.get_items():
        if item.get_type() == ebooklib.ITEM_DOCUMENT:
            name = item.get_name()
            self._parsed_book[name] = []
            logger.debug('==================================')
            logger.debug('NAME : %s', name)
            logger.debug('----------------------------------')
            # get_content() returns bytes; decode it rather than calling
            # str(), which would bake literal b'...' escapes into the text.
            content = item.get_content().decode('utf-8')
            logger.debug(content)
            parser = MyHTMLParser()
            parser.feed(content)
            for string in parser.get_result():
                string = sentence_break.sub('.\n', string)
                for line in string.split('\n'):
                    new_line = line.strip()
                    self._parsed_book[name].append(new_line)
                    logger.debug(new_line)
                    #translated_string = await self.translate_text(new_line, 'en')
                    #content = content.replace(new_line, translated_string)
                    #logger.debug(":" + str(translated_string) + ":")
            logger.debug('==================================')
    logger.debug("Book:")
    logger.debug(str(self._parsed_book))
    logger.debug('==================================')
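None of these examples include the MyHTMLParser class they rely on, and each expects a slightly different interface. For Example #1, which calls get_result() to obtain a list of text blocks, a minimal hypothetical sketch:

from html.parser import HTMLParser

class MyHTMLParser(HTMLParser):
    # Hypothetical stand-in: collects the non-empty text nodes of a document.
    def __init__(self):
        super().__init__()
        self._chunks = []

    def handle_data(self, data):
        if data.strip():
            self._chunks.append(data)

    def get_result(self):
        return self._chunks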
Example #2
# Assumes: import re, from pyquery import PyQuery as pq, a MyHTMLParser that
# collects hrefs into .links (sketched after this example), and a module-level
# myurls list.
def fetch(furl):
    print('fetch ' + furl)
    # Capture scheme, host, path, and a six-digit item id from each link.
    wwwp = re.compile(r'(http://)?([^/]*)(/?.*)(\d{6})')
    #html = get(url)
    html = pq(url=furl)('.maxPicList').html()
    if html:
        hp = MyHTMLParser()
        hp.feed(html)
        hp.close()
        for link in hp.links:
            m = wwwp.match(link)
            if m:
                # group(1) is None when the link has no scheme, so default it.
                scheme = m.group(1) or 'http://'
                myurls.append(scheme + m.group(2) + '/detail/apply/' +
                              m.group(4) + '/?callback=?')
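Examples #2 and #3 expect MyHTMLParser to expose the anchors it has seen as a .links list, a different interface from Example #1. A hypothetical sketch of that variant:

from html.parser import HTMLParser

class MyHTMLParser(HTMLParser):
    # Hypothetical variant: records the href of every <a> tag in .links.
    def __init__(self):
        super().__init__()
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value:
                    self.links.append(value)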
Example #3
# Assumes: import re, import time, from pyquery import PyQuery as pq, plus the
# same link-collecting MyHTMLParser and module-level myurls as Example #2.
def fetch(furl):
    # Same as Example #2, with a timestamp on the log line.
    print('fetch ' + furl + ' ' + time.strftime('%Y-%m-%d %H:%M:%S'))
    wwwp = re.compile(r'(http://)?([^/]*)(/?.*)(\d{6})')
    #html = get(url)
    html = pq(url=furl)('.maxPicList').html()
    # Commented-out attempt at skipping pages whose content hash is unchanged
    # (see the working sketch after this example):
    #md5.update(html.encode('utf8'))
    #keytxt = md5.hexdigest()
    #global lastdigest
    #print(lastdigest)
    #if lastdigest == keytxt:
    #    return
    #lastdigest = keytxt
    if html:
        hp = MyHTMLParser()
        hp.feed(html)
        hp.close()
        for link in hp.links:
            m = wwwp.match(link)
            if m:
                # group(1) is None when the link has no scheme, so default it.
                scheme = m.group(1) or 'http://'
                myurls.append(scheme + m.group(2) + '/detail/apply/' +
                              m.group(4) + '/?callback=?')
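The commented-out md5 lines above sketch a change-detection idea: hash the fetched fragment and return early when it matches the previous fetch. A working sketch of that idea, assuming hashlib and a module-level lastdigest (neither appears in the original):

import hashlib

lastdigest = None

def unchanged(html):
    # Return True when this HTML fragment hashes the same as last time.
    global lastdigest
    keytxt = hashlib.md5(html.encode('utf8')).hexdigest()
    if keytxt == lastdigest:
        return True
    lastdigest = keytxt
    return False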
Example #4
# Assumes: import requests and a MyHTMLParser that collects the text of
# <p> tags into .p_data (sketched after this example).
url_list = [
    'https://www.seattletimes.com/seattle-news/politics/how-amazon-gets-whatever-it-wants/',
    'https://www.seattletimes.com/seattle-news/homeless/homeless-man-dies-from-exposure-service-providers-prepare-for-more-cold-weather/',
    'https://www.seattletimes.com/seattle-news/transportation/more-snow-is-headed-toward-seattle-and-road-clearing-crews-are-getting-ready/',
    'https://www.seattletimes.com/seattle-news/politics/should-seattle-make-trims-to-neighborhood-upzones-plan-city-council-wades-into-debate/',
    'https://www.seattletimes.com/business/tensions-over-political-resistance-to-amazon-boil-over-in-new-york/',
    'https://www.seattletimes.com/seattle-news/health/washington-lawmakers-weigh-stricter-vaccine-bill-amid-outbreak/'
]

# Save the URL, status code, and raw body of each page to its own file.
for count, url2 in enumerate(url_list):
    res = requests.get(url2)
    with open('file' + str(count) + '.txt', 'w', encoding='utf-8') as raw_file:
        raw_file.write(url2)
        raw_file.write('\n')
        raw_file.write(str(res.status_code))
        raw_file.write('\n')
        raw_file.write(res.text)
        raw_file.write('\n')

# Parse the first page and dump the paragraph text it collected.
res = requests.get(url_list[0])
parser = MyHTMLParser()
parser.feed(res.text)

with open('sampledata.txt', 'w', encoding='utf-8') as raw_file:
    for stuff in parser.p_data:
        raw_file.write(stuff)
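Example #4 assumes yet another MyHTMLParser shape, one that exposes the text found inside <p> tags as .p_data. A hypothetical sketch:

from html.parser import HTMLParser

class MyHTMLParser(HTMLParser):
    # Hypothetical variant: collects text that appears inside <p> tags.
    def __init__(self):
        super().__init__()
        self.p_data = []
        self._in_p = False

    def handle_starttag(self, tag, attrs):
        if tag == 'p':
            self._in_p = True

    def handle_endtag(self, tag):
        if tag == 'p':
            self._in_p = False

    def handle_data(self, data):
        if self._in_p and data.strip():
            self.p_data.append(data)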