def mangareader(url, download_chapters, args):
    html = get_html(url)
    global last
    if hasattr(args, 'last'):
        last = args.last
    series = title(re.search('<td.*?>\\s*Name:.*?<h2.*?>\\s*(.*?)\\s*</h2>\\s*</td>', html.replace('\n', '')).group(1))
    status = re.search('<td.*?>\\s*Status:.*?<td>\\s*(.*?)\\s*</td>', html.replace('\n', '')).group(1)
    author = re.search('<td.*?>\\s*Author:.*?<td>\\s*(.*?)\\s*</td>', html.replace('\n', '')).group(1).partition('(')[0].strip()
    tags = re.findall('<a.*?><span class="genretags">(.*?)</span></a>', html)
    for j in range(len(tags)):
        for k in tag_dict:
            tags[j] = re.sub(k, tag_dict[k], tags[j])
    chapters = []
    for j in re.findall('<tr>\\s*<td>\\s*<div.*?</div>(.*?)</tr>', html, re.DOTALL | re.MULTILINE):
        match = re.search('<a.*?([\\d.,-]+)</a>(\\s*:\\s*)(.*?)\\s*</td>', j)
        num = float(match.group(1))
        name = match.group(3)
        link = 'http://www.mangareader.net' + re.search('<a\\s*href="(/.*?)">', j).group(1)
        date = re.search('<td>(\\d{2})/(\\d{2})/(\\d{4})</td>', j)
        date = '{:04}-{:02}-{:02}'.format(int(date.group(3)), int(date.group(1)), int(date.group(2)))
        if name:
            name = '{} - {} : {}'.format(series, '{:3.1f}'.format(num).zfill(5), name)
        else:
            name = '{} - {}'.format(series, '{:3.1f}'.format(num).zfill(5))
        if (download_chapters and num in download_chapters) or (not download_chapters and num > last):
            if args.debug or args.verbose:
                print(' Gathering info: "{}"'.format(name))
            chap_html = get_html(link)
            links = ['http://www.mangareader.net' + i
                     for i in re.findall('<option value="(.*?)".*?>\\d+</option>', chap_html)]
            chapters.append({'name': name, 'links': links, 'backup_links': links,
                             'date': date, 'pages': len(links), 'num': num})
    if chapters:
        function_name(chapters, series, tags, author, status, args)
def scan_fr(url, download_chapters, args):
    print("getting url " + url)
    html = get_html(url)
    global last
    if hasattr(args, 'last'):
        last = args.last
    series = title(re.search('(<h2 class="widget-title" style="display: inline-block;">)([^<]*)(</h2>)', html.replace('\n', '')).group(2))
    print("series: " + series)
    # status, author and tags are not provided by this source
    status = ""
    author = ""
    tags = ""
    # for j in range(len(tags)):
    #     for k in tag_dict:
    #         tags[j] = re.sub(k, tag_dict[k], tags[j])
    chapters = []
    # grab the chapter list; the '?' quantifiers keep the matches non-greedy
    chapitres = re.search('(<ul class="chapters">(.*)</ul>)', html.replace('\n', '').replace('\r', ''))
    for j in re.findall('<h5 class="chapter-title-rtl">(.*?)</h5>', chapitres.group(1), re.DOTALL | re.MULTILINE)[::-1]:
        print("line found: " + j)
        match = re.search('<a.*[-/]([0-9.]+).*>(.*) ([0-9.]+)</a>', j, re.DOTALL | re.MULTILINE)
        num = float(match.group(1))
        link = "http://" + re.search('href=".*(www.*?)"', j).group(1)
        # no chapter name on this source; the series title is used instead
        name = ''
        date = "01/01/2000"
        serie_short = match.group(2)
        if name:
            name = '{} - {} : {}'.format(serie_short, '{:3.1f}'.format(num).zfill(5), name)
        else:
            name = '{} - {}'.format(series, '{:3.1f}'.format(num).zfill(5))
        if (download_chapters and num in download_chapters) or (not download_chapters and num > last):
            if args.debug or args.verbose:
                print(' Gathering info: "{}"'.format(series))
            print(' downloading chapter ' + link)
            chap_html = get_html(link)
            links = ['']
            image_regex = "data-src='(.*?) '"
            links = [i for i in re.findall(image_regex, chap_html)]
            chapters.append({'name': name, 'links': links, 'backup_links': links,
                             'date': date, 'pages': len(links), 'num': num})
    args.url = url
    if chapters:
        function_name(chapters, series, tags, author, status, args)
def goodmanga(url, download_chapters, args):
    html = get_html(url)
    global last
    if hasattr(args, 'last'):
        last = args.last
    series = title(re.search('<h1>([^<>]*?)</h1>', html.replace('\n', '')).group(1))
    status = re.search('<span>Status:</span>\\s*(.*?)\\s*</div>', html.replace('\n', '')).group(1)
    author = re.search('<span>Authors?:</span>\\s*(.*?)\\s*</div>', html.replace('\n', '')).group(1)
    tags = re.findall('<a.*?>(.*?)</a>', re.search('<span>Genres:</span>(.*?)\\s*</div>', html, re.DOTALL | re.MULTILINE).group(1))
    for j in range(len(tags)):
        for k in tag_dict:
            tags[j] = re.sub(k, tag_dict[k], tags[j])
    chapters = []
    while True:
        for j in re.findall('<li>\\s*(.{1,300}?\\d{4}</span>)\\s*</li>', html, re.DOTALL | re.MULTILINE):
            match = re.search('<a.*?>.*?([\\d,.]+)\\s*</a>\\s*<span.*?>\\s*(.*?)\\s*</span>', j, re.DOTALL | re.MULTILINE)
            name = match.group(2)
            num = float(match.group(1))
            link = re.search('href="(.*?)"', j).group(1)
            try:
                date = datetime.strptime(re.search('([A-Za-z]*? \\d{1,2}, \\d{4})</span>', j).group(1), '%b %d, %Y').strftime('%Y-%m-%d')
            except:
                date = datetime.today().strftime('%Y-%m-%d')
            if name:
                name = '{} - {} : {}'.format(series, '{:3.1f}'.format(num).zfill(5), name)
            else:
                name = '{} - {}'.format(series, '{:3.1f}'.format(num).zfill(5))
            if (download_chapters and num in download_chapters) or (not download_chapters and num > last):
                if args.debug or args.verbose:
                    print(' Gathering info: "{}"'.format(name))
                chap_html = get_html(link)
                img_url = re.sub('1.([jpgnig]{3})', '{}.\\1', re.search('</div>\\s*<a.*?>\\s*<img[^<]*?src="(.*?)".*?>\\s*</a>', chap_html, re.DOTALL | re.MULTILINE).group(1))
                pages = max([int(i) for i in re.findall('<option value=".*?".*?>\\s*(\\d+)\\s*</option>', chap_html)])
                b_links = {float(i[1]): i[0] for i in re.findall('<option value="(.*?)".*?>\\s*(\\d+)\\s*</option>', chap_html)}
                b_links = [b_links[i + 1] for i in range(pages)]
                links = [img_url.format(i + 1) for i in range(pages)]
                chapters.insert(0, {'name': name, 'links': links, 'backup_links': b_links,
                                    'date': date, 'pages': pages, 'num': num})
        match = re.search('<a href="(.*?)">Next</a>', html)
        if match:
            html = get_html(match.group(1))
        else:
            break
    if chapters:
        function_name(chapters, series, tags, author, status, args)
def run_revo(self, args):
    if not args:
        return "Necesas specifi vorton. Por helpo vidu %s" % self.help_url("revo")
    if args.lower() == "sal":
        return "%s/revo/sal.html" % config.zz9pza
    word = urllib.quote(util.x_to_unicode(args))
    html = util.get_html(config.revo_search % word)
    if "Neniu trafo" in html:
        return 'Nenio trovita por "%s".' % args
    ret = []
    esperante = False
    for line in html.splitlines():
        if line.startswith("<h1>"):
            lang = re.search(r"<h1>(.+?)</h1>", line).group(1).split()[0]
            esperante = lang == "esperante"
            ret.append("-%s-" % lang)
        if line.startswith("<a"):
            if esperante:
                m = re.search(r'href="(.+?)">(.+?)</a>', line)
                eo_word, link = m.group(2).split()[0], config.revo_site + m.group(1)
                ret.append("%s %s" % (eo_word, link))
            else:
                m = re.search(r'>(.+?)</a>', line)
                word = m.group(1)
                m = re.search(r'\(.+?href="(.+?)">(.+?)</a>', line)
                eo_word, link = m.group(2).split()[0], config.revo_site + m.group(1)
                ret.append("%s (%s) %s" % (word, eo_word, link))
    return "\n".join(ret)
def scrape_company_url(results, browser, leads_to_filter):
    """
    The Angarita automation
    :return:
    """
    for profile, row in results.iterrows():
        if row[COMPANY_URL]:
            try:
                print(f'browser.get({row[COMPANY_URL]}), ...')
                browser.get('http://www.' + row[COMPANY_URL])
                html = util.get_html(browser)
                emails = util.get_list_from_print(results.loc[profile, EMAILS]) + \
                    util.get_patterns(util.EMAIL_REGEX, html)
                emails = util.filter_emails(emails)
                phones = util.get_list_from_print(results.loc[profile, PHONES]) + \
                    util.get_patterns(util.PHONE_REGEX, html)
                phones = util.filter_phones(phones)
                results.loc[profile, EMAILS] = util.print_list(emails)
                results.loc[profile, PHONES] = util.print_list(phones)
            except WebDriverException:
                print(f'failed to load {row[COMPANY_URL]}, continuing...')
    save_leads_to_excel(results, leads_to_filter)
def scrape_all(browser):
    results = pd.DataFrame(columns=COLUMNS)
    leads_to_filter = get_leads_to_filter()
    for idx, (group_name, group_url, scroll_steps) in enumerate(values.get_groups()):
        print(f'browser.get({group_name}), ...')
        browser.get(group_url)
        scroll_down(scroll_steps, browser)
        html = util.get_html(browser)
        try:
            for word in values.get_keywords():
                results = scrap_word(word=word.lower().replace('\n', ''),
                                     df=results,
                                     html=html,
                                     group_url=group_url,
                                     group_name=group_name)
                print(f'scraped word: {word}, done')
            save_leads_to_excel(results, leads_to_filter)
            print(f'saved results for: {group_name}')
        except MemoryError:
            pass
    scrape_company_url(results, browser, leads_to_filter)
    return results
def login(username=batoto_username, password=batoto_password):
    global session
    if not username:
        print('It seems like you want to use bato.to, but did not provide a ' +
              'username or password')
        global batoto_username
        batoto_username = username = input('please enter your bato.to username: ')
        # The statements between the username prompt and the auth_key lookup were
        # redacted in the source; a password prompt and the fetch of the bato.to
        # login form page are reconstructed here.
        global batoto_password
        batoto_password = password = input('please enter your bato.to password: ')
    html = get_html('https://bato.to')  # login form page; the exact URL was lost in the source
    auth_key = re.search('auth_key.*?value=[\'"]([^\'"]+)', html).group(1)
    referer = re.search('referer.*?value=[\'"]([^\'"]+)', html).group(1)
    url = 'https://bato.to/forums/index.php?app=core&module=global&section=login&do=process'
    fields = {
        'anonymous': 1,
        'rememberMe': 1,
        'auth_key': auth_key,
        'referer': referer,
        'ips_username': username,
        'ips_password': password,
    }
    r = session.post(url, data=fields)
    if 'set-cookie' in r.headers:
        session.headers.update({'cookie': r.headers['set-cookie']})
        return True
    else:
        return False  # login failed
def train_net(self, lr=0.001):
    running_loss = 0.0
    criterion = nn.BCEWithLogitsLoss()
    losses = []
    optimizer = optim.Adam(self.parameters(), lr)
    for i, item in enumerate(util.get_html()):
        start = time.time()
        input_tensor = util.example_to_tensor((item[0] + item[1])[:2])
        if is_cuda:
            input_tensor = input_tensor.cuda()
        input_var = Variable(input_tensor)
        output = self(input_var)
        optimizer.zero_grad()
        loss = criterion(output, input_var)
        loss.backward()
        optimizer.step()
        # print statistics
        running_loss += loss.data[0]
        if i % 2000 == 1999:  # print every 2000 mini-batches
            print('[%5d] loss: %.3f' % (i + 1, running_loss / 2000))
            losses.append(running_loss / 2000)
            running_loss = 0.0
    self.save_model()
    print("Epoch took " + str(time.time() - start) + " to complete")
    return losses
def run_seed(self, args):
    if not args:
        return "Necesas specifi vorton. Por helpo vidu %s" % self.help_url("seed")
    word = urllib.quote(util.x_to_unicode(args))
    url = config.seed_search % word
    html = util.get_html(url)
    if "Word not found" in html:
        return 'Nenio trovita por "%s".' % args
    return url
def run_komputeko(self, args):
    if not args:
        return "Necesas specifi vorton. Por helpo vidu %s" % self.help_url("komputeko")
    word = urllib.quote(util.x_to_unicode(args))
    url = config.komputeko_search % word
    html = util.get_html(url)
    if "ne ekzistas en Komputeko" in html or "Bonvolu tajpi almenaŭ" in html:
        return 'Nenio trovita por "%s".' % args
    else:
        return url
def get_custom_date_ranges(group):
    html = get_html(scoreboard=group, time_span='custom')
    soup = BeautifulSoup(html, "html.parser")
    # soup.find('input', {'name': 'startDate'}).attrs['value']
    for span in soup('span'):
        if span.text.startswith('Last refresh:'):
            return (datetime.datetime.strptime(span.text[42:61], '%Y-%m-%d %H:%M:%S'),
                    datetime.datetime.strptime(span.text[65:], '%Y-%m-%d %H:%M:%S'))
def run_guglu(self, args):
    if not args:
        return "Necesas specifi serĉterminon. Por helpo vidu %s" % self.help_url("guglu")
    term = urllib.quote(args)
    url = config.google_search % term
    html = util.get_html(url, masquerade=True)
    m = re.search(r'<div id="?resultStats"?>((About )?(.+?) results)<(nobr|/div)>', html, re.I)
    if not m:
        return "Ne povis trovi la nombron da rezultoj."
    else:
        return m.group(1)
def run_trancxu(self, args):
    if not args:
        return "Necesas specifi vorton. Por helpo vidu %s" % self.help_url("tranĉu")
    word = urllib.quote(util.x_to_unicode(args))
    url = config.sivo_search % ("ser%c4%89o", word)
    html = util.get_html(url)
    html = re.search(r"<h2>Vortfarada Serĉo</h2>(.+?)<h2>", html, re.S).group(1)
    if "Neniu trovita" in html:
        return 'Nenio trovita por "%s".' % args
    else:
        ret = [util.strip_tags(line) for line in html.splitlines() if "<li>" in line]
        return "\n".join(ret)
def check_for_applicants(group):
    html = get_html(scoreboard=group)
    soup = BeautifulSoup(html, "html.parser")
    applicants = None
    for elem in soup(text='Agents waiting for validation:'):
        applicants = elem.parent.parent.text.replace('\n', '').split('@')[1:]
        break
    message = []
    if applicants:
        message.append('Agent(s) awaiting validation to the {} group:'.format(group))
        for agent in applicants:
            message.append(' @{}'.format(agent))
        admin_url = html.partition('give them this url: <a href="')[2] \
            .partition('">https://www.agent-stats.com/groups.php')[0].partition('&')[0]
        message.append('\nGo to {} and click on the [View admin panel] button to take care of it.'.format(admin_url))
    return '\n'.join(message)
def _get_words_for_kanji(kanji, log):
    try:
        doc = util.get_html(TG_BASE_URL + TG_KANJI_PATH + '/' + kanji, log)
        if doc is not None:
            kanji_words = {}
            for tr in doc.xpath('//table[@class="k-compounds-table"]//tr'):
                reading, reading_words = Tangorin._process_reading_row(tr)
                if not reading:
                    log.debug('invalid kanji: %s', kanji)
                    return None
                kanji_words[reading] = reading_words
            return kanji_words
    except:
        log.exception('failed to load words for kanji %s', kanji)
        return None
def url(self, nick, msg, to):
    if re.match("^https?:", msg, re.I):
        try:
            html = util.get_html(msg, masquerade=True)
            if html:
                m = re.search(r"<title>(.*?)</title>", html, re.I | re.S)
                if m:
                    title = m.group(1).strip()
                    parser = HTMLParser.HTMLParser()
                    title = parser.unescape(title)
                    if type(title) == unicode:
                        title = title.encode('utf-8')
                    self.send(to, "Titolo: %s" % title)
        except Exception as e:
            error = "ERROR: %s" % str(e)
            if config.do_syslog:
                syslog.syslog(syslog.LOG_ERR, error)
def fetch_66ip():
    """
    http://www.66ip.cn/
    Each request to this URL returns a fresh batch of proxies; speed is not guaranteed.
    """
    proxies = []
    try:
        # change getnum to fetch a different number of proxies per request
        url = "http://www.66ip.cn/nmtq.php?getnum=10&isp=0&anonymoustype=3&start=&ports=&export=&ipaddress=&area=1&proxytype=0&api=66ip"
        content = get_html(url)
        urls = content.split("</script>")[-1].split("<br />")
        for u in urls:
            if u.strip():
                proxies.append(u.strip())
    except Exception as e:
        logger.warning("fail to fetch from 66ip: %s" % e)
    return proxies
def run_av(self, args):
    if not args:
        return "Necesas specifi radikon. Por helpo vidu %s" % self.help_url("av")
    root = urllib.quote(util.x_to_unicode(args))
    html = util.get_html(config.av_search % root)
    mlnk = re.search(r'<td class="formo"><a href="(.+?)" title="Ligilo al la Fundamento">(.+?)</a></td>', html)
    mele = re.search(r'<td class="formo">(.+?)</td>', html)
    mbaz = re.search(r'<td class="bazformo">(.+?)</td>', html)
    msta = re.search(r'<td class="statuso">(.+?)</td>', html)
    mbro = re.search(r'<td class="bro">(.+?)</td>', html)
    if not mele:
        return 'Nenio trovita por "%s".' % args
    else:
        link = config.av_site + mlnk.group(1).strip() if mlnk else ""
        elemento = mlnk.group(2).strip() if mlnk else mele.group(1).strip()
        bazformo = mbaz.group(1).strip() if mbaz else ""
        statuso = re.sub(r".*? \+?", "", msta.group(1).strip()).upper()[0] if msta else ""
        bro = mbro.group(1).strip()[3] if mbro else ""
        ret = []
        ret.append("Elemento: %s %s" % (elemento, link))
        ret.append("Bazformo: %s" % bazformo)
        last = []
        if statuso:
            last.append("La radiko %s troviĝas en la" % elemento)
            if statuso == "F":
                last.append("Fundamento de Esperanto")
            else:
                last.append({"1": "Unua", "2": "Dua", "3": "Tria", "4": "Kvara", "5": "Kvina",
                             "6": "Sesa", "7": "Sepa", "8": "Oka", "9": "Naŭa"}[statuso])
                last.append("Aldono al la Universala Vortaro")
        if bro:
            if statuso:
                last.append("kaj")
            else:
                last.append("La radiko %s troviĝas en" % elemento)
            last.append("Grupo %s de la Baza Radikaro Oficiala" % bro)
        ret.append(" ".join(last))
        return "\n".join(ret) + "."
def get_proxies_66(site=util.PROXY_SITES[0], filepath='proxy.txt'):
    '''
    crawl and test proxies from 66ip and save them to file
    :param site: proxy site (66ip)
    :return: None
    '''
    print('start crawl proxies:')
    soup = BeautifulSoup(util.get_html(site, header=util.HEADER), 'lxml')
    tbs = soup.find_all('table')[2]
    trs = tbs.find_all('tr')[1:]
    for tr in trs:
        ip = tr.find_all('td')[0].text
        port = tr.find_all('td')[1].text
        proxy_value = 'http:' + str(ip) + ':' + str(port)
        proxy = {'http': proxy_value, 'https': proxy_value}
        if test_proxy(proxy):
            with open(filepath, 'a+') as f:
                f.write(str(ip) + ':' + str(port) + '\n')
def download_and_unzip_file(url_, path):
    try:
        zfobj = ZipFile(StringIO(util.get_html(url_)))
        for name in zfobj.namelist():
            if name.endswith('/'):
                sub_path = os.path.join(path, name)
                if not os.path.exists(sub_path):
                    os.mkdir(sub_path)
                continue
            uncompressed = zfobj.read(name)
            out_ = path + '/' + name
            output = open(out_, 'wb')
            output.write(uncompressed)
            output.close()
        return True
    except Exception, e:
        print e
        return False
def trans_google(self, fr, to, translate):
    translate = urllib.quote(translate)
    url = config.google_translate_search % (translate, fr, to)
    jsn = util.get_html(url, masquerade=True)
    dic = json.loads(jsn)
    src = dic["src"].encode('utf-8')
    trans = dic["sentences"][0]["trans"].encode('utf-8')
    translit = dic["sentences"][0]["translit"].encode('utf-8')
    ret = []
    if fr == "auto":
        ret.append('Tradukis de lingvo "%s"' % src)
    if not translit:
        ret.append(trans)
    else:
        ret.append("Traduko: %s" % trans)
        if trans != translit:
            ret.append("Transliterumo: %s" % translit)
    return "\n".join(ret)
def trans_majstro(self, fr, to, word):
    qword = urllib.quote(word)
    url = config.majstro_search % (fr, to, qword)
    html = util.get_html(url)
    if "could not be translated" in html:
        return 'Nenio trovita por "%s".' % word
    results = re.findall(r"<li>.+?</li>", html)
    ret = "\n".join(results)
    ret = util.strip_tags(ret)
    parser = HTMLParser.HTMLParser()
    ret = ret.decode('utf-8')
    ret = parser.unescape(ret)
    if type(ret) == unicode:
        ret = ret.encode('utf-8')
    ret = re.sub(": ", " → ", ret)
    ret = re.sub("; ", ", ", ret)
    return ret
def batoto(url, download_chapters, args):
    batoto_username = args.username
    batoto_password = args.password
    login()
    for i in range(3):
        try:
            html = get_html(url + '/')
            break
        except:
            if i == 2:
                raise
            else:
                pass
    global last
    global session
    if hasattr(args, 'last'):
        last = args.last
    series = title(re.search('<h1.*?>[\\s\n]*(.*?)[\\s\n]*</h1>', html, re.DOTALL | re.MULTILINE).group(1))
    status = re.search('<td.*?>Status:</td>\\s*<td>\\s*(.*?)\\s*</td>', html.replace('\n', '')).group(1)
    author = ', '.join(re.findall('<a.*?>(.*?)</a>', re.search('<td.*?>\\s*Authors?\\s*:?\\s*</td>\\s*<td>(.*?)</td>', html.replace('\n', '')).group(1)))
    tags = re.findall('<a.*?>\\s*<span.*?>\\s*([A-Za-z]*?)\\s*</span>\\s*</a>', re.search('<td.*?>\\s*Genres?\\s*:?\\s*</td>\\s*<td>(.*?)</td>', html.replace('\n', '')).group(1))
    for j in range(len(tags)):
        for k in tag_dict:
            tags[j] = re.sub(k, tag_dict[k], tags[j])
    chapters = []
    for j in re.findall('<tr class="row lang_([A-Za-z]*?) chapter_row".*?>(.*?)</tr>', html, re.DOTALL | re.MULTILINE)[::-1]:
        if j[0] == batoto_lang:
            match = re.search('<a href="([^"]*?)".*?>\\s*<img.*?>\\s*([^"<>]*)(\\s*:\\s*)?(.*?)\\s*</a>', j[1], re.DOTALL | re.MULTILINE)
            name = match.group(4)
            m2 = re.search('[Cc]h(ap)?(ter)?\\.?\\s*([Ee]xtra:?)?\\s*([\\d\\.]+)\\s*(-\\s*[\\d\\.]+)?', match.group(2))
            try:
                num = float(m2.group(4))
            except:
                if args.debug:
                    print(j[1])
                raise
            '''
            #TODO
            if m2.group(3):
                if chapters:
                    num = chapters[-1]['num'] + .4
                else:
                    num = last + .4
            '''
            try:
                vol = int(re.search('[Vv]ol(ume)?\\.\\s*(\\d+)', match.group(2)).group(2))
            except:
                vol = 0
            link = match.group(1)
            uuid = link.rpartition('#')[2]
            ref = link.rpartition('/')[0] + '/' + "reader#" + uuid + "_1"
            head = {'Referer': ref, 'supress_webtoon': 't'}
            link = link.rpartition('/')[0] + '/' + 'areader?id=' + uuid + '&p=1'
            session.headers.update(head)
            try:
                date = datetime.strptime(re.search('<td.*?>(\\d{2} [A-Za-z]* \\d{4}.*?([Aa][Mm]|[Pp][Mm])).*?</td>', j[1]).group(1), '%d %B %Y - %I:%M %p').strftime('%Y-%m-%dT%H:%M:00')
            except:
                try:
                    t = re.search('(\\d+) [Mm]inutes ago', j[1]).group(1)
                except:
                    t = '1' if re.search('A minute ago', j[1]) else ''
                if t:
                    unit = '%M'
                else:
                    try:
                        t = re.search('(\\d+) [Hh]ours ago', j[1]).group(1)
                    except:
                        t = '1' if re.search('An hour ago', j[1]) else ''
                    if t:
                        unit = '%H'
                    else:
                        try:
                            t = re.search('(\\d+) [Dd]ays ago', j[1]).group(1)
                        except:
                            t = '1' if re.search('A day ago', j[1]) else ''
                        if t:
                            unit = '%d'
                        else:
                            try:
                                t = re.search('(\\d+) [Ww]eeks ago', j[1]).group(1)
                            except:
                                t = '1' if re.search('A week ago', j[1]) else ''
                            if t:
                                unit = '%W'
                            else:
                                t = '0'
                                unit = '%M'
                date = datetime.fromtimestamp((datetime.today() - datetime.strptime(t, unit)).total_seconds()).strftime('%Y-%m-%dT%H:%M:00')
            if name:
                name = '{} - {} : {}'.format(series, '{:3.1f}'.format(num).zfill(5), name)
            else:
                name = '{} - {}'.format(series, '{:3.1f}'.format(num).zfill(5))
            if (download_chapters and num in download_chapters) or (not download_chapters and num > last):
                if args.debug or args.verbose:
                    print(' Gathering info: "{}"'.format(name))
                chap_html = get_html(link)
                img_url = re.sub('001\\.([A-Za-z]{3})', '{:03}.\\1', re.search('<div.*?>\\s*<a.*?>\\s*<img[^<]*?src="([^"]*?)"[^>]*?/>\\s*</div>', chap_html, re.DOTALL | re.MULTILINE).group(1))
                zero = False
                if '{:03}' not in img_url:
                    img_url = re.sub('000\\.([A-Za-z]{3})', '{:03}.\\1', img_url)
                    zero = True
                if '{:03}' not in img_url:
                    img_url = re.sub('01\\.([A-Za-z]{3})', '{:02}.\\1', img_url)
                    zero = False
                if '{:02}' not in img_url:
                    img_url = re.sub('00\\.([A-Za-z]{3})', '{:02}.\\1', img_url)
                    zero = True
                if re.findall('<option value=".*?".*?>page (\\d+)</option>', chap_html):
                    pages = max([int(i) for i in re.findall('<option value=".*?".*?>page (\\d+)</option>', chap_html)])
                else:
                    continue
                b_links = {float(i[1]): i[0] for i in re.findall('<option value="(.*?)".*?>page (\\d+)</option>', chap_html)}
                b_links = [b_links[i + 1] for i in range(pages)]
                if zero:
                    links = [img_url.format(i) for i in range(pages)]
                else:
                    links = [img_url.format(i + 1) for i in range(pages)]
                chapters.append({'name': name, 'links': links, 'backup_links': b_links,
                                 'date': date, 'pages': pages, 'num': num})
    if chapters:
        function_name(chapters, series, tags, author, status, args)
pre_url_list = [
    "http://tieba.baidu.com/p/2738151262",
    'http://[email protected]/portal/home',
]
post_url_list = []
count = 0
util.init_path([html_folder, img_folder])
while pre_url_list and count < 100:
    # print 'pre_url_list: %s' % len(pre_url_list)
    # print 'post_url_list: %s' % len(post_url_list)
    url = pre_url_list.pop()
    # print 'current url: %s' % url
    html = util.get_html(url)
    # print 'html => ', html
    if not html:
        continue
    hp = parser.HP(url)
    hp.feed(html)
    hp.close()
    # print hp.links
    for link in hp.links:
        if not link.startswith('http'):
            continue
        if link not in post_url_list and link not in pre_url_list and link != url and \
                link not in util.exclude_url_list and link not in util.error_url_list:
            pre_url_list.append(link)
    write_html(html_folder, html)
    post_url_list.append(url)
    if p.returncode is not None:
        return False
    else:
        return True


if __name__ == '__main__':
    dir_path = 'download_space'
    last_cmd = None
    client_name = sys.argv[1]
    HTTP_SERVER = 'http://localhost:8080'
    p = None
    while True:
        post_state = 'beeping'
        cmd = util.get_html(HTTP_SERVER + '/' + 'client?client_name=%s' % client_name)
        if cmd.startswith('dr:'):
            terminate_process()
            p = None
            para = cmd.replace('dr:', '')
            prepare_dir(dir_path)
            if download_and_unzip_file(HTTP_SERVER + '/static/' + para, dir_path):
                p = start_run_process()
                post_state = cmd
            else:
                remove_dir(dir_path)
        elif cmd.startswith('t:'):
            terminate_process()
            p = None
            post_state = cmd
        elif cmd.startswith('c:'):
def japscan(url, download_chapters, args):
    print("getting url " + url)
    html = get_html(url)
    global last
    if hasattr(args, 'last'):
        last = args.last
    series = title(re.search('(<h1 class="bg-header">).*>(.*)</a>(</h1>)', html.replace('\n', '')).group(2))
    # general info block
    info_gen = re.findall('(<div class="cell">\\s*(.*?)\\s*</div>)', html.replace('\n', ''))
    status = info_gen[7][1]
    author = info_gen[5][1]
    tags = info_gen[7][1]
    # for j in range(len(tags)):
    #     for k in tag_dict:
    #         tags[j] = re.sub(k, tag_dict[k], tags[j])
    chapters = []
    # grab the chapter list
    chapitres = re.search('(<div id="liste_chapitres">(.*)</div>.*<div class="col-1-3")', html.replace('\n', ''))
    for j in re.findall('<li>(.*?)</li>', chapitres.group(1), re.DOTALL | re.MULTILINE)[::-1]:
        match = re.search('<a.*[-/]([0-9.]+).*>Scan (.*) ([0-9.]+) VF( : )?(.*)?<.*', j, re.DOTALL | re.MULTILINE)
        num = float(match.group(1))
        link = "http://" + re.search('href=".*(www.*?)"', j).group(1)
        name = match.group(5)
        date = "01/01/2000"
        serie_short = match.group(2)
        if name:
            name = '{} - {} : {}'.format(serie_short, '{:3.1f}'.format(num).zfill(5), name)
        else:
            name = '{} - {}'.format(series, '{:3.1f}'.format(num).zfill(5))
        if (download_chapters and num in download_chapters) or (not download_chapters and num > last):
            if args.debug or args.verbose:
                print(' Gathering info: "{}"'.format(series))
            chap_html = get_html(link)
            links = ['']
            # HACK: every page has to be fetched and parsed to retrieve its image
            for content in re.findall('<option .* value="(.*?)".*?>.*</option>', chap_html)[::-1]:
                content_html = get_html("http://www.japscan.com" + content)
                search = '<div itemscope itemtype="http://schema.org/Article">.*src="(.*[.][a-z]{0,4})" />'
                link_page = re.search(search, content_html.replace('\n', ''), re.MULTILINE)
                try:
                    links.append(link_page.group(1))
                except:
                    print('An error occurred, unable to find the page image')
                    print(content_html.replace('\n', ''))
            links.remove('')
            links = list(reversed(links))
            chapters.append({'name': name, 'links': links, 'backup_links': links,
                             'date': date, 'pages': len(links), 'num': num})
    args.url = url
    if chapters:
        function_name(chapters, series, tags, author, status, args)
def mymanga(url, download_chapters, args):
    html = get_html(url)
    global last
    if hasattr(args, 'last'):
        last = args.last
    series = title(re.search('(<h2 class="text-border">)(.*)(</h2>)', html.replace('\n', '')).group(2))
    # general info block (not used by this source)
    info_gen = re.findall('(<div class="cell">\\s*(.*?)\\s*</div>)', html.replace('\n', ''))
    status = 'default'  # info_gen[7][1]
    author = 'default'  # info_gen[5][1]
    tags = 'default'  # info_gen[7][1]
    # for j in range(len(tags)):
    #     for k in tag_dict:
    #         tags[j] = re.sub(k, tag_dict[k], tags[j])
    chapters = []
    # grab the chapter list
    chapitres = re.search('(<section class="listchapseries fiche block sep">(.*)</section>)', html.replace('\n', ''))
    for j in re.findall('<li>(.*?)</li>', chapitres.group(1), re.DOTALL | re.MULTILINE)[::-1]:
        match = re.search('<a.* href=".*/chapitre-([0-9.]+)/', j, re.DOTALL | re.MULTILINE)
        num = float(match.group(1))
        link = "http://" + re.search('href=".*(www.*?)" title', j).group(1)
        name = match.group(1)
        date = "01/01/2000"
        serie_short = series
        if name:
            name = '{} - {} : {}'.format(serie_short, '{:3.1f}'.format(num).zfill(5), name)
        else:
            name = '{} - {}'.format(series, '{:3.1f}'.format(num).zfill(5))
        if (download_chapters and num in download_chapters) or (not download_chapters and num > last):
            if args.debug or args.verbose:
                print(' Gathering info: "{}"'.format(series))
            chap_html = get_html(link)
            links = ['']
            # HACK: every page has to be fetched and parsed to retrieve its image
            for content in re.findall('<option value="(.*[0-9]?)" ', chap_html)[::-1]:
                content_html = get_html("http://www.hitmanga.eu/" + content)
                search = '<table id="picture">.*src="(.*[.][a-z]{0,4}?)" alt=.*</table>'
                link_page = re.search(search, content_html.replace('\n', ''), re.MULTILINE)
                try:
                    links.append(link_page.group(1))
                except:
                    print('An error occurred, unable to find the page image')
                    print(content_html.replace('\n', ''))
            links.remove('')
            links = list(reversed(links))
            chapters.append({'name': name, 'links': links, 'backup_links': links,
                             'date': date, 'pages': len(links), 'num': num})
    args.url = url
    if chapters:
        function_name(chapters, series, tags, author, status, args)
    b = sp.sub(" ", t.string)
    t.replace_with(b)
for f in soup.findAll("a", text=re.compile(u".*(Eventos de Linux Foundation).*")):
    texts = f.find_parent("fieldset").div.ul.find_all("strong")
    for t in texts:
        t.unwrap()
e = soup.find("span", attrs={'class': "enlace"})
if e and e.parent.name == "li":
    e.parent.extract()
util.set_menu(soup)
h = util.get_html(soup)
# typo fixes applied to the generated (Spanish) HTML
h = h.replace("Objectivos de aprendizaje", "Objetivos de aprendizaje")  # 7, 11
h = h.replace(">31</a></h1>", ">31. zypper</a></h1>")  # 31
h = h.replace(">31</option>", ">31. zypper</option>")  # 31
h = h.replace(" del sisco ", " del disco ")
h = h.replace("miltihebra", "multihebra")
h = h.replace("el ajusta de E/S", "el ajuste de E/S")
h = h.replace(". Se este", ". Si este")
h = h.replace(" tital ", " total ")
h = h.replace(" para para ", " para ")
h = h.replace("revision_umber", "revision_number")
h = h.replace("cuentasde", "cuentas de")
h = h.replace("/opt/dolphy_app /man", "/opt/dolphy_app/man")
h = h.replace("archivosy propietarios", "archivos y propietarios")
h = h.replace("$tar", "$ tar")
    util.h_to_a(out, h, ct)
    ct = ct + 1
for ol in out.findAll("ol"):
    if ol.parent and ol.parent.name == "ol":
        ol.unwrap()
for p in out.findAll("pre"):
    for s in p.findAll("strong"):
        if "style" in s.attrs and "class" not in s.attrs:
            del s.attrs["style"]
            s.attrs["class"] = "resaltar"
for i in out.findAll("iframe"):
    if "src" in i.attrs and i.attrs["src"].startswith("//"):
        i.attrs["src"] = "http:" + i.attrs["src"]
util.set_menu(out)
html = util.get_html(out, True)
html = html.replace(u"–", "-")
html = html.replace(u"—", "-")
r = re.compile(r"([rwx])=2([210])")
html = r.sub("\\1=2<sup>\\2</sup>", html)
r = re.compile(r"</p>\s*<li>")
html = r.sub("</p><ul><li>", html)
r = re.compile(r"</li>\s*<p>")
html = r.sub("</li></ul><p>", html)
util.escribir(html, oht)
# out.prettify("utf-8", formatter="html")
# with open(oht, "wb") as file:
#     file.write(html)  # .encode('utf8')
def update(self):
    model = self.get_model()
    deck = self.get_deck()
    note_map = self.get_notes()
    model['did'] = deck['id']
    self._update_templates(model)
    # processes data from kd website
    self.log.info('loading data from kanji damage website (this will take quite a while)...')
    self.col.models.setCurrent(model)
    self.col.decks.select(deck['id'])
    url = KD_DAMAGE_BASE_URL + KD_KANJI_PATH + '/1'
    tries = 0
    while url:
        try:
            doc = util.get_html(url, self.log)
            if doc is None:
                return
            util.add_base_url(doc, KD_DAMAGE_BASE_URL)
            # retrieves the data
            kanji = self._get_kanji(doc)
            meaning = self._get_meaning(doc)
            # get map key
            key = None
            if type(kanji) is lxml.html.HtmlElement:
                # kanji is an image
                self._download_images(kanji, KD_DAMAGE_BASE_URL)
                kanji = util.html_to_string(kanji)
                key = meaning
            elif KD_VALID_KANJI.match(kanji):
                key = kanji
            # update/create note
            if key:
                note = note_map[key] if key in note_map else self.col.newNote()
                note['Kanji'] = kanji
                note['Meaning'] = meaning
                note['Number'] = self._get_number(doc)
                note['Description'] = self._get_description(doc, KD_DAMAGE_BASE_URL)
                note['Usefulness'] = self._get_usefulness(doc)
                note['Full used In'] = self._get_used_in(doc, KD_DAMAGE_BASE_URL)
                onyomi_full, onyomi = self._get_onyomi(doc, KD_DAMAGE_BASE_URL)
                note['Full onyomi'] = onyomi_full
                note['Onyomi'] = onyomi
                kun_full, kun, kun_meaning, kun_use = self._get_kunyomi(doc, KD_DAMAGE_BASE_URL)
                note['Full kunyomi'] = kun_full
                note['First kunyomi'] = kun
                note['First kunyomi meaning'] = kun_meaning
                note['First kunyomi usefulness'] = kun_use
                mnemonic_full, mnemonic = self._get_mnemonic(doc, KD_DAMAGE_BASE_URL)
                note['Full mnemonic'] = mnemonic_full
                note['Mnemonic'] = mnemonic
                note['Components'] = self._get_components(doc, KD_DAMAGE_BASE_URL)
                jk_full, jk, jk_meaning, jk_use = self._get_jukugo(doc, KD_DAMAGE_BASE_URL)
                note['Full jukugo'] = jk_full
                note['First jukugo'] = jk
                note['First jukugo meaning'] = jk_meaning
                note['First jukugo usefulness'] = jk_use
                note['Full header'] = self._get_header(doc, KD_DAMAGE_BASE_URL)
                note['Full lookalikes'] = self._get_lookalikes(doc, KD_DAMAGE_BASE_URL)
                if key not in note_map:
                    self.col.addNote(note)
                    note_map[key] = note
                else:
                    note.flush()
                self.log.debug(util.note_to_json(note))
            else:
                self.log.info('ignored kanji: %s', kanji)
            # finds the link to the next kanji
            url = next(iter(doc.xpath('//div[@class="span2 text-righted"]/a[1]/@href')), None)
            tries = 0
        except OSError as e:
            if (e.errno == 101) and (tries < 3):
                tries += 1
            else:
                self.log.exception('failed to retrieve from %s', url)
                url = None
    self.col.save()
def get(self):
    spide_url = getattr(self.resource, 'spide_url')
    self.content = get_html(self.resource.spide_url)
def mangahere(url, download_chapters, args):
    html = get_html(url)
    global last
    if hasattr(args, 'last'):
        last = args.last
    series = title(re.search('<h1 class="title"><span class="title_icon"></span>(.*?)</h1>', html.replace('\n', '')).group(1))
    status = re.search('<li><label>Status:</label>(.*?)<', html.replace('\n', '')).group(1)
    author = ', '.join(re.findall('<a.*?>(.*?)</a>', re.search('<li><label>Author\\(?s?\\)?:</label>(.*?)</li>', html.replace('\n', '')).group(1)))
    tags = re.search('<li><label>Genre\\(s\\):</label>(.*?)</li>', html).group(1).split(', ')
    for j in range(len(tags)):
        for k in tag_dict:
            tags[j] = re.sub(k, tag_dict[k], tags[j])
    chapters = []
    for j in re.findall('<li>\\s*<span class="left">\\s*(.*?\\d{4}</span>)\\s*</li>', html, re.DOTALL | re.MULTILINE)[::-1]:
        match = re.search('<a.*?>.*?([\\d,.]+)\\s*</a>\\s*<span.*?>\\s*(.*?)\\s*</span>', j, re.DOTALL | re.MULTILINE)
        name = match.group(2)
        num = float(match.group(1))
        link = re.search('href="(.*?)"', j).group(1)
        try:
            date = datetime.strptime(re.search('([A-Za-z]*? \\d{1,2}, \\d{4})</span>', j).group(1), '%b %d, %Y').strftime('%Y-%m-%d')
        except:
            date = datetime.today().strftime('%Y-%m-%d')
        if name:
            name = '{} - {} : {}'.format(series, '{:3.1f}'.format(num).zfill(5), name)
        else:
            name = '{} - {}'.format(series, '{:3.1f}'.format(num).zfill(5))
        if (download_chapters and num in download_chapters) or (not download_chapters and num > last):
            if args.debug or args.verbose:
                print(' Gathering info: "{}"'.format(name))
            chap_html = get_html(link)
            img_url = re.sub('001.([A-Za-z]{3})', '{:03}.\\1', re.search('<a.*?>\\s*<img[^<]*?src="(.*?)".*?>\\s*</a>', chap_html, re.DOTALL | re.MULTILINE).group(1))
            if '{:03}' not in img_url:
                img_url = re.sub('01.([A-Za-z]{3})', '{:02}.\\1', img_url)
            pages = max([int(i) for i in re.findall('<option value=".*?".*?>(\\d+)</option>', chap_html)])
            b_links = {float(i[1]): i[0] for i in re.findall('<option value="(.*?)".*?>(\\d+)</option>', chap_html)}
            b_links = [b_links[i + 1] for i in range(pages)]
            links = [img_url.format(i + 1) for i in range(pages)]
            chapters.append({'name': name, 'links': links, 'backup_links': b_links,
                             'date': date, 'pages': pages, 'num': num})
    if chapters:
        function_name(chapters, series, tags, author, status, args)