def mangareader(url, download_chapters, args):
    html = get_html(url)
    global last
    if hasattr(args, 'last'):
        last = args.last
    series = title(re.search('<td.*?>\\s*Name:.*?<h2.*?>\\s*(.*?)\\s*</h2>\\s*</td>', html.replace('\n', '')).group(1))
    status = re.search('<td.*?>\\s*Status:.*?<td>\\s*(.*?)\\s*</td>', html.replace('\n', '')).group(1)
    author = re.search('<td.*?>\\s*Author:.*?<td>\\s*(.*?)\\s*</td>', html.replace('\n', '')).group(1).partition('(')[0].strip()
    tags = re.findall('<a.*?><span class="genretags">(.*?)</span></a>', html)
    for j in range(len(tags)):
        for k in tag_dict:
            tags[j] = re.sub(k, tag_dict[k], tags[j])
    chapters = []
    for j in re.findall('<tr>\\s*<td>\\s*<div.*?</div>(.*?)</tr>', html, re.DOTALL | re.MULTILINE):
        match = re.search('<a.*?([\\d.,-]+)</a>(\\s*:\\s*)(.*?)\\s*</td>', j)
        num = float(match.group(1))
        name = match.group(3)
        link = 'http://www.mangareader.net' + re.search('<a\\s*href="(/.*?)">', j).group(1)
        date = re.search('<td>(\\d{2})/(\\d{2})/(\\d{4})</td>', j)
        date = '{:04}-{:02}-{:02}'.format(int(date.group(3)), int(date.group(1)), int(date.group(2)))
        if name:
            name = '{} - {} : {}'.format(series, '{:3.1f}'.format(num).zfill(5), name)
        else:
            name = '{} - {}'.format(series, '{:3.1f}'.format(num).zfill(5))
        if (download_chapters and num in download_chapters) or (not download_chapters and num > last):
            if args.debug or args.verbose:
                print(' Gathering info: "{}"'.format(name))
            chap_html = get_html(link)
            links = ['http://www.mangareader.net' + i
                     for i in re.findall('<option value="(.*?)".*?>\\d+</option>', chap_html)]
            chapters.append({'name': name, 'links': links, 'backup_links': links,
                             'date': date, 'pages': len(links), 'num': num})
    if chapters:
        function_name(chapters, series, tags, author, status, args)
def scan_fr(url, download_chapters, args):
    print("getting url " + url)
    html = get_html(url)
    global last
    if hasattr(args, 'last'):
        last = args.last
    series = title(re.search('(<h2 class="widget-title" style="display: inline-block;">)([^<]*)(</h2>)', html.replace('\n', '')).group(2))
    print("series: " + series)
    # status, author and tags are not provided by this source
    status = ""
    author = ""
    tags = ""
    # for j in range(len(tags)):
    #     for k in tag_dict:
    #         tags[j] = re.sub(k, tag_dict[k], tags[j])
    chapters = []
    # grab the chapter list; the '?' quantifiers keep the matches non-greedy
    chapitres = re.search('(<ul class="chapters">(.*)</ul>)', html.replace('\n', '').replace('\r', ''))
    for j in re.findall('<h5 class="chapter-title-rtl">(.*?)</h5>', chapitres.group(1), re.DOTALL | re.MULTILINE)[::-1]:
        print("line found: " + j)
        match = re.search('<a.*[-/]([0-9.]+).*>(.*) ([0-9.]+)</a>', j, re.DOTALL | re.MULTILINE)
        num = float(match.group(1))
        link = "http://" + re.search('href=".*(www.*?)"', j).group(1)
        # no chapter name on this source; the series title is used instead
        name = ''
        date = "01/01/2000"
        serie_short = match.group(2)
        if name:
            name = '{} - {} : {}'.format(serie_short, '{:3.1f}'.format(num).zfill(5), name)
        else:
            name = '{} - {}'.format(series, '{:3.1f}'.format(num).zfill(5))
        if (download_chapters and num in download_chapters) or (not download_chapters and num > last):
            if args.debug or args.verbose:
                print(' Gathering info: "{}"'.format(series))
            print(' downloading chapter ' + link)
            chap_html = get_html(link)
            links = ['']
            image_regex = "data-src='(.*?) '"
            links = [i for i in re.findall(image_regex, chap_html)]
            chapters.append({'name': name, 'links': links, 'backup_links': links,
                             'date': date, 'pages': len(links), 'num': num})
    args.url = url
    if chapters:
        function_name(chapters, series, tags, author, status, args)
def goodmanga(url, download_chapters, args):
    html = get_html(url)
    global last
    if hasattr(args, 'last'):
        last = args.last
    series = title(re.search('<h1>([^<>]*?)</h1>', html.replace('\n', '')).group(1))
    status = re.search('<span>Status:</span>\\s*(.*?)\\s*</div>', html.replace('\n', '')).group(1)
    author = re.search('<span>Authors?:</span>\\s*(.*?)\\s*</div>', html.replace('\n', '')).group(1)
    tags = re.findall('<a.*?>(.*?)</a>', re.search('<span>Genres:</span>(.*?)\\s*</div>', html, re.DOTALL | re.MULTILINE).group(1))
    for j in range(len(tags)):
        for k in tag_dict:
            tags[j] = re.sub(k, tag_dict[k], tags[j])
    chapters = []
    while True:
        for j in re.findall('<li>\\s*(.{1,300}?\\d{4}</span>)\\s*</li>', html, re.DOTALL | re.MULTILINE):
            match = re.search('<a.*?>.*?([\\d,.]+)\\s*</a>\\s*<span.*?>\\s*(.*?)\\s*</span>', j, re.DOTALL | re.MULTILINE)
            name = match.group(2)
            num = float(match.group(1))
            link = re.search('href="(.*?)"', j).group(1)
            try:
                date = datetime.strptime(re.search('([A-Za-z]*? \\d{1,2}, \\d{4})</span>', j).group(1), '%b %d, %Y').strftime('%Y-%m-%d')
            except:
                date = datetime.today().strftime('%Y-%m-%d')
            if name:
                name = '{} - {} : {}'.format(series, '{:3.1f}'.format(num).zfill(5), name)
            else:
                name = '{} - {}'.format(series, '{:3.1f}'.format(num).zfill(5))
            if (download_chapters and num in download_chapters) or (not download_chapters and num > last):
                if args.debug or args.verbose:
                    print(' Gathering info: "{}"'.format(name))
                chap_html = get_html(link)
                img_url = re.sub('1.([jpgnig]{3})', '{}.\\1', re.search('</div>\\s*<a.*?>\\s*<img[^<]*?src="(.*?)".*?>\\s*</a>', chap_html, re.DOTALL | re.MULTILINE).group(1))
                pages = max([int(i) for i in re.findall('<option value=".*?".*?>\\s*(\\d+)\\s*</option>', chap_html)])
                b_links = {float(i[1]): i[0] for i in re.findall('<option value="(.*?)".*?>\\s*(\\d+)\\s*</option>', chap_html)}
                b_links = [b_links[i + 1] for i in range(pages)]
                links = [img_url.format(i + 1) for i in range(pages)]
                chapters.insert(0, {'name': name, 'links': links, 'backup_links': b_links,
                                    'date': date, 'pages': pages, 'num': num})
        match = re.search('<a href="(.*?)">Next</a>', html)
        if match:
            html = get_html(match.group(1))
        else:
            break
    if chapters:
        function_name(chapters, series, tags, author, status, args)
def run_revo(self, args):
    if not args:
        return "Necesas specifi vorton. Por helpo vidu %s" % self.help_url("revo")
    if args.lower() == "sal":
        return "%s/revo/sal.html" % config.zz9pza
    word = urllib.quote(util.x_to_unicode(args))
    html = util.get_html(config.revo_search % word)
    if "Neniu trafo" in html:
        return 'Nenio trovita por "%s".' % args
    ret = []
    esperante = False
    for line in html.splitlines():
        if line.startswith("<h1>"):
            lang = re.search(r"<h1>(.+?)</h1>", line).group(1).split()[0]
            esperante = lang == "esperante"
            ret.append("-%s-" % lang)
        if line.startswith("<a"):
            if esperante:
                m = re.search(r'href="(.+?)">(.+?)</a>', line)
                eo_word, link = m.group(2).split()[0], config.revo_site + m.group(1)
                ret.append("%s %s" % (eo_word, link))
            else:
                m = re.search(r'>(.+?)</a>', line)
                word = m.group(1)
                m = re.search(r'\(.+?href="(.+?)">(.+?)</a>', line)
                eo_word, link = m.group(2).split()[0], config.revo_site + m.group(1)
                ret.append("%s (%s) %s" % (word, eo_word, link))
    return "\n".join(ret)
def scrape_company_url(results, browser, leads_to_filter):
    """
    The Angarita automation
    :return:
    """
    for profile, row in results.iterrows():
        if row[COMPANY_URL]:
            try:
                print(f'browser.get({row[COMPANY_URL]}), ...')
                browser.get('http://www.' + row[COMPANY_URL])
                html = util.get_html(browser)
                emails = util.get_list_from_print(results.loc[profile, EMAILS]) + \
                    util.get_patterns(util.EMAIL_REGEX, html)
                emails = util.filter_emails(emails)
                phones = util.get_list_from_print(results.loc[profile, PHONES]) + \
                    util.get_patterns(util.PHONE_REGEX, html)
                phones = util.filter_phones(phones)
                results.loc[profile, EMAILS] = util.print_list(emails)
                results.loc[profile, PHONES] = util.print_list(phones)
            except WebDriverException:
                print(f'failed to load {row[COMPANY_URL]}, continuing...')
    save_leads_to_excel(results, leads_to_filter)
def scrape_all(browser):
    results = pd.DataFrame(columns=COLUMNS)
    leads_to_filter = get_leads_to_filter()
    for idx, (group_name, group_url, scroll_steps) in enumerate(values.get_groups()):
        print(f'browser.get({group_name}), ...')
        browser.get(group_url)
        scroll_down(scroll_steps, browser)
        html = util.get_html(browser)
        try:
            for word in values.get_keywords():
                results = scrap_word(word=word.lower().replace('\n', ''),
                                     df=results,
                                     html=html,
                                     group_url=group_url,
                                     group_name=group_name)
                print(f'scraped word: {word}, done')
            save_leads_to_excel(results, leads_to_filter)
            print(f'saved results for: {group_name}')
        except MemoryError:
            pass
    scrape_company_url(results, browser, leads_to_filter)
    return results
def login(username=batoto_username, password=batoto_password):
    global session
    if not username:
        print('It seems like you want to use bato.to, but did not provide a ' +
              'username or password')
        global batoto_username
        batoto_username = username = input('please enter your bato.to username: ')
        # The statements between the username prompt and the auth_key lookup were
        # redacted in the source; a password prompt and the fetch of the bato.to
        # login form page are reconstructed here.
        global batoto_password
        batoto_password = password = input('please enter your bato.to password: ')
    html = get_html('https://bato.to')  # login form page; the exact URL was lost in the source
    auth_key = re.search('auth_key.*?value=[\'"]([^\'"]+)', html).group(1)
    referer = re.search('referer.*?value=[\'"]([^\'"]+)', html).group(1)
    url = 'https://bato.to/forums/index.php?app=core&module=global&section=login&do=process'
    fields = {
        'anonymous': 1,
        'rememberMe': 1,
        'auth_key': auth_key,
        'referer': referer,
        'ips_username': username,
        'ips_password': password,
    }
    r = session.post(url, data=fields)
    if 'set-cookie' in r.headers:
        session.headers.update({'cookie': r.headers['set-cookie']})
        return True
    else:
        return False  # login failed
def train_net(self, lr=0.001):
    running_loss = 0.0
    criterion = nn.BCEWithLogitsLoss()
    losses = []
    optimizer = optim.Adam(self.parameters(), lr)
    for i, item in enumerate(util.get_html()):
        start = time.time()
        input_tensor = util.example_to_tensor((item[0] + item[1])[:2])
        if is_cuda:
            input_tensor = input_tensor.cuda()
        input_var = Variable(input_tensor)
        output = self(input_var)
        optimizer.zero_grad()
        loss = criterion(output, input_var)
        loss.backward()
        optimizer.step()
        # print statistics
        running_loss += loss.data[0]
        if i % 2000 == 1999:  # print every 2000 mini-batches
            print('[%5d] loss: %.3f' % (i + 1, running_loss / 2000))
            losses.append(running_loss / 2000)
            running_loss = 0.0
    self.save_model()
    print("Epoch took " + str(time.time() - start) + " to complete")
    return losses
def run_seed(self, args):
    if not args:
        return "Necesas specifi vorton. Por helpo vidu %s" % self.help_url("seed")
    word = urllib.quote(util.x_to_unicode(args))
    url = config.seed_search % word
    html = util.get_html(url)
    if "Word not found" in html:
        return 'Nenio trovita por "%s".' % args
    return url
def run_komputeko(self, args):
    if not args:
        return "Necesas specifi vorton. Por helpo vidu %s" % self.help_url("komputeko")
    word = urllib.quote(util.x_to_unicode(args))
    url = config.komputeko_search % word
    html = util.get_html(url)
    if "ne ekzistas en Komputeko" in html or "Bonvolu tajpi almenaŭ" in html:
        return 'Nenio trovita por "%s".' % args
    else:
        return url
def get_custom_date_ranges(group):
    html = get_html(scoreboard=group, time_span='custom')
    soup = BeautifulSoup(html, "html.parser")
    # soup.find('input', {'name': 'startDate'}).attrs['value']
    for span in soup('span'):
        if span.text.startswith('Last refresh:'):
            return (datetime.datetime.strptime(span.text[42:61], '%Y-%m-%d %H:%M:%S'),
                    datetime.datetime.strptime(span.text[65:], '%Y-%m-%d %H:%M:%S'))
def run_guglu(self, args):
    if not args:
        return "Necesas specifi serĉterminon. Por helpo vidu %s" % self.help_url("guglu")
    term = urllib.quote(args)
    url = config.google_search % term
    html = util.get_html(url, masquerade=True)
    m = re.search(r'<div id="?resultStats"?>((About )?(.+?) results)<(nobr|/div)>', html, re.I)
    if not m:
        return "Ne povis trovi la nombron da rezultoj."
    else:
        return m.group(1)
def run_trancxu(self, args):
    if not args:
        return "Necesas specifi vorton. Por helpo vidu %s" % self.help_url("tranĉu")
    word = urllib.quote(util.x_to_unicode(args))
    url = config.sivo_search % ("ser%c4%89o", word)
    html = util.get_html(url)
    html = re.search(r"<h2>Vortfarada Serĉo</h2>(.+?)<h2>", html, re.S).group(1)
    if "Neniu trovita" in html:
        return 'Nenio trovita por "%s".' % args
    else:
        ret = [util.strip_tags(line) for line in html.splitlines() if "<li>" in line]
        return "\n".join(ret)
def check_for_applicants(group):
    html = get_html(scoreboard=group)
    soup = BeautifulSoup(html, "html.parser")
    applicants = None
    for elem in soup(text='Agents waiting for validation:'):
        applicants = elem.parent.parent.text.replace('\n', '').split('@')[1:]
        break
    message = []
    if applicants:
        message.append('Agent(s) awaiting validation to the {} group:'.format(group))
        for agent in applicants:
            message.append(' @{}'.format(agent))
        admin_url = html.partition('give them this url: <a href="')[2] \
            .partition('">https://www.agent-stats.com/groups.php')[0].partition('&')[0]
        message.append('\nGo to {} and click on the [View admin panel] button to take care of it.'.format(admin_url))
    return '\n'.join(message)
def _get_words_for_kanji(kanji, log):
    try:
        doc = util.get_html(TG_BASE_URL + TG_KANJI_PATH + '/' + kanji, log)
        if doc is not None:
            kanji_words = {}
            for tr in doc.xpath('//table[@class="k-compounds-table"]//tr'):
                reading, reading_words = Tangorin._process_reading_row(tr)
                if not reading:
                    log.debug('invalid kanji: %s', kanji)
                    return None
                kanji_words[reading] = reading_words
            return kanji_words
    except:
        log.exception('failed to load words for kanji %s', kanji)
        return None
def url(self, nick, msg, to):
    if re.match("^https?:", msg, re.I):
        try:
            html = util.get_html(msg, masquerade=True)
            if html:
                m = re.search(r"<title>(.*?)</title>", html, re.I | re.S)
                if m:
                    title = m.group(1).strip()
                    parser = HTMLParser.HTMLParser()
                    title = parser.unescape(title)
                    if type(title) == unicode:
                        title = title.encode('utf-8')
                    self.send(to, "Titolo: %s" % title)
        except Exception as e:
            error = "ERROR: %s" % str(e)
            if config.do_syslog:
                syslog.syslog(syslog.LOG_ERR, error)
def fetch_66ip():
    """
    http://www.66ip.cn/
    Each request to this URL returns a fresh batch of proxies; speed is not guaranteed.
    """
    proxies = []
    try:
        # change getnum to fetch a different number of proxies per request
        url = "http://www.66ip.cn/nmtq.php?getnum=10&isp=0&anonymoustype=3&start=&ports=&export=&ipaddress=&area=1&proxytype=0&api=66ip"
        content = get_html(url)
        urls = content.split("</script>")[-1].split("<br />")
        for u in urls:
            if u.strip():
                proxies.append(u.strip())
    except Exception as e:
        logger.warning("fail to fetch from 66ip: %s" % e)
    return proxies
def run_av(self, args):
    if not args:
        return "Necesas specifi radikon. Por helpo vidu %s" % self.help_url("av")
    root = urllib.quote(util.x_to_unicode(args))
    html = util.get_html(config.av_search % root)
    mlnk = re.search(r'<td class="formo"><a href="(.+?)" title="Ligilo al la Fundamento">(.+?)</a></td>', html)
    mele = re.search(r'<td class="formo">(.+?)</td>', html)
    mbaz = re.search(r'<td class="bazformo">(.+?)</td>', html)
    msta = re.search(r'<td class="statuso">(.+?)</td>', html)
    mbro = re.search(r'<td class="bro">(.+?)</td>', html)
    if not mele:
        return 'Nenio trovita por "%s".' % args
    else:
        link = config.av_site + mlnk.group(1).strip() if mlnk else ""
        elemento = mlnk.group(2).strip() if mlnk else mele.group(1).strip()
        bazformo = mbaz.group(1).strip() if mbaz else ""
        statuso = re.sub(r".*? \+?", "", msta.group(1).strip()).upper()[0] if msta else ""
        bro = mbro.group(1).strip()[3] if mbro else ""
        ret = []
        ret.append("Elemento: %s %s" % (elemento, link))
        ret.append("Bazformo: %s" % bazformo)
        last = []
        if statuso:
            last.append("La radiko %s troviĝas en la" % elemento)
            if statuso == "F":
                last.append("Fundamento de Esperanto")
            else:
                last.append({"1": "Unua", "2": "Dua", "3": "Tria", "4": "Kvara", "5": "Kvina",
                             "6": "Sesa", "7": "Sepa", "8": "Oka", "9": "Naŭa"}[statuso])
                last.append("Aldono al la Universala Vortaro")
        if bro:
            if statuso:
                last.append("kaj")
            else:
                last.append("La radiko %s troviĝas en" % elemento)
            last.append("Grupo %s de la Baza Radikaro Oficiala" % bro)
        ret.append(" ".join(last))
        return "\n".join(ret) + "."
def get_proxies_66(site=util.PROXY_SITES[0], filepath='proxy.txt'):
    '''
    crawl and test proxies from 66ip and save them to file
    :param site: proxy site (66ip)
    :return: None
    '''
    print('start crawl proxies:')
    soup = BeautifulSoup(util.get_html(site, header=util.HEADER), 'lxml')
    tbs = soup.find_all('table')[2]
    trs = tbs.find_all('tr')[1:]
    for tr in trs:
        ip = tr.find_all('td')[0].text
        port = tr.find_all('td')[1].text
        proxy_value = 'http:' + str(ip) + ':' + str(port)
        proxy = {'http': proxy_value, 'https': proxy_value}
        if test_proxy(proxy):
            with open(filepath, 'a+') as f:
                f.write(str(ip) + ':' + str(port) + '\n')
def download_and_unzip_file(url_, path):
    try:
        zfobj = ZipFile(StringIO(util.get_html(url_)))
        for name in zfobj.namelist():
            if name.endswith('/'):
                sub_path = os.path.join(path, name)
                if not os.path.exists(sub_path):
                    os.mkdir(sub_path)
                continue
            uncompressed = zfobj.read(name)
            out_ = path + '/' + name
            output = open(out_, 'wb')
            output.write(uncompressed)
            output.close()
        return True
    except Exception, e:
        print e
        return False
def trans_google(self, fr, to, translate):
    translate = urllib.quote(translate)
    url = config.google_translate_search % (translate, fr, to)
    jsn = util.get_html(url, masquerade=True)
    dic = json.loads(jsn)
    src = dic["src"].encode('utf-8')
    trans = dic["sentences"][0]["trans"].encode('utf-8')
    translit = dic["sentences"][0]["translit"].encode('utf-8')
    ret = []
    if fr == "auto":
        ret.append('Tradukis de lingvo "%s"' % src)
    if not translit:
        ret.append(trans)
    else:
        ret.append("Traduko: %s" % trans)
        if trans != translit:
            ret.append("Transliterumo: %s" % translit)
    return "\n".join(ret)
def trans_majstro(self, fr, to, word):
    qword = urllib.quote(word)
    url = config.majstro_search % (fr, to, qword)
    html = util.get_html(url)
    if "could not be translated" in html:
        return 'Nenio trovita por "%s".' % word
    results = re.findall(r"<li>.+?</li>", html)
    ret = "\n".join(results)
    ret = util.strip_tags(ret)
    parser = HTMLParser.HTMLParser()
    ret = ret.decode('utf-8')
    ret = parser.unescape(ret)
    if type(ret) == unicode:
        ret = ret.encode('utf-8')
    ret = re.sub(": ", " → ", ret)
    ret = re.sub("; ", ", ", ret)
    return ret
def batoto(url, download_chapters, args):
    batoto_username = args.username
    batoto_password = args.password
    login()
    for i in range(3):
        try:
            html = get_html(url + '/')
            break
        except:
            if i == 2:
                raise
            else:
                pass
    global last
    global session
    if hasattr(args, 'last'):
        last = args.last
    series = title(re.search('<h1.*?>[\\s\n]*(.*?)[\\s\n]*</h1>', html, re.DOTALL | re.MULTILINE).group(1))
    status = re.search('<td.*?>Status:</td>\\s*<td>\\s*(.*?)\\s*</td>', html.replace('\n', '')).group(1)
    author = ', '.join(re.findall('<a.*?>(.*?)</a>', re.search('<td.*?>\\s*Authors?\\s*:?\\s*</td>\\s*<td>(.*?)</td>', html.replace('\n', '')).group(1)))
    tags = re.findall('<a.*?>\\s*<span.*?>\\s*([A-Za-z]*?)\\s*</span>\\s*</a>', re.search('<td.*?>\\s*Genres?\\s*:?\\s*</td>\\s*<td>(.*?)</td>', html.replace('\n', '')).group(1))
    for j in range(len(tags)):
        for k in tag_dict:
            tags[j] = re.sub(k, tag_dict[k], tags[j])
    chapters = []
    for j in re.findall('<tr class="row lang_([A-Za-z]*?) chapter_row".*?>(.*?)</tr>', html, re.DOTALL | re.MULTILINE)[::-1]:
        if j[0] == batoto_lang:
            match = re.search('<a href="([^"]*?)".*?>\\s*<img.*?>\\s*([^"<>]*)(\\s*:\\s*)?(.*?)\\s*</a>', j[1], re.DOTALL | re.MULTILINE)
            name = match.group(4)
            m2 = re.search('[Cc]h(ap)?(ter)?\\.?\\s*([Ee]xtra:?)?\\s*([\\d\\.]+)\\s*(-\\s*[\\d\\.]+)?', match.group(2))
            try:
                num = float(m2.group(4))
            except:
                if args.debug:
                    print(j[1])
                raise
            '''
            #TODO
            if m2.group(3):
                if chapters:
                    num = chapters[-1]['num'] + .4
                else:
                    num = last + .4
            '''
            try:
                vol = int(re.search('[Vv]ol(ume)?\\.\\s*(\\d+)', match.group(2)).group(2))
            except:
                vol = 0
            link = match.group(1)
            uuid = link.rpartition('#')[2]
            ref = link.rpartition('/')[0] + '/' + "reader#" + uuid + "_1"
            head = {'Referer': ref, 'supress_webtoon': 't'}
            link = link.rpartition('/')[0] + '/' + 'areader?id=' + uuid + '&p=1'
            session.headers.update(head)
            try:
                date = datetime.strptime(re.search('<td.*?>(\\d{2} [A-Za-z]* \\d{4}.*?([Aa][Mm]|[Pp][Mm])).*?</td>', j[1]).group(1), '%d %B %Y - %I:%M %p').strftime('%Y-%m-%dT%H:%M:00')
            except:
                try:
                    t = re.search('(\\d+) [Mm]inutes ago', j[1]).group(1)
                except:
                    t = '1' if re.search('A minute ago', j[1]) else ''
                if t:
                    unit = '%M'
                else:
                    try:
                        t = re.search('(\\d+) [Hh]ours ago', j[1]).group(1)
                    except:
                        t = '1' if re.search('An hour ago', j[1]) else ''
                    if t:
                        unit = '%H'
                    else:
                        try:
                            t = re.search('(\\d+) [Dd]ays ago', j[1]).group(1)
                        except:
                            t = '1' if re.search('A day ago', j[1]) else ''
                        if t:
                            unit = '%d'
                        else:
                            try:
                                t = re.search('(\\d+) [Ww]eeks ago', j[1]).group(1)
                            except:
                                t = '1' if re.search('A week ago', j[1]) else ''
                            if t:
                                unit = '%W'
                            else:
                                t = '0'
                                unit = '%M'
                date = datetime.fromtimestamp((datetime.today() - datetime.strptime(t, unit)).total_seconds()).strftime('%Y-%m-%dT%H:%M:00')
            if name:
                name = '{} - {} : {}'.format(series, '{:3.1f}'.format(num).zfill(5), name)
            else:
                name = '{} - {}'.format(series, '{:3.1f}'.format(num).zfill(5))
            if (download_chapters and num in download_chapters) or (not download_chapters and num > last):
                if args.debug or args.verbose:
                    print(' Gathering info: "{}"'.format(name))
                chap_html = get_html(link)
                img_url = re.sub('001\\.([A-Za-z]{3})', '{:03}.\\1', re.search('<div.*?>\\s*<a.*?>\\s*<img[^<]*?src="([^"]*?)"[^>]*?/>\\s*</div>', chap_html, re.DOTALL | re.MULTILINE).group(1))
                zero = False
                if '{:03}' not in img_url:
                    img_url = re.sub('000\\.([A-Za-z]{3})', '{:03}.\\1', img_url)
                    zero = True
                if '{:03}' not in img_url:
                    img_url = re.sub('01\\.([A-Za-z]{3})', '{:02}.\\1', img_url)
                    zero = False
                if '{:02}' not in img_url:
                    img_url = re.sub('00\\.([A-Za-z]{3})', '{:02}.\\1', img_url)
                    zero = True
                if re.findall('<option value=".*?".*?>page (\\d+)</option>', chap_html):
                    pages = max([int(i) for i in re.findall('<option value=".*?".*?>page (\\d+)</option>', chap_html)])
                else:
                    continue
                b_links = {float(i[1]): i[0] for i in re.findall('<option value="(.*?)".*?>page (\\d+)</option>', chap_html)}
                b_links = [b_links[i + 1] for i in range(pages)]
                if zero:
                    links = [img_url.format(i) for i in range(pages)]
                else:
                    links = [img_url.format(i + 1) for i in range(pages)]
                chapters.append({'name': name, 'links': links, 'backup_links': b_links,
                                 'date': date, 'pages': pages, 'num': num})
    if chapters:
        function_name(chapters, series, tags, author, status, args)
pre_url_list = [
    "http://tieba.baidu.com/p/2738151262",
    'http://[email protected]/portal/home',
]
post_url_list = []
count = 0
util.init_path([html_folder, img_folder])
while pre_url_list and count < 100:
    # print 'pre_url_list: %s' % len(pre_url_list)
    # print 'post_url_list: %s' % len(post_url_list)
    url = pre_url_list.pop()
    # print 'current url: %s' % url
    html = util.get_html(url)
    # print 'html => ', html
    if not html:
        continue
    hp = parser.HP(url)
    hp.feed(html)
    hp.close()
    # print hp.links
    for link in hp.links:
        if not link.startswith('http'):
            continue
        if link not in post_url_list and link not in pre_url_list and link != url and \
                link not in util.exclude_url_list and link not in util.error_url_list:
            pre_url_list.append(link)
    write_html(html_folder, html)
    post_url_list.append(url)
    if p.returncode is not None:
        return False
    else:
        return True


if __name__ == '__main__':
    dir_path = 'download_space'
    last_cmd = None
    client_name = sys.argv[1]
    HTTP_SERVER = 'http://localhost:8080'
    p = None
    while True:
        post_state = 'beeping'
        cmd = util.get_html(HTTP_SERVER + '/' + 'client?client_name=%s' % client_name)
        if cmd.startswith('dr:'):
            terminate_process()
            p = None
            para = cmd.replace('dr:', '')
            prepare_dir(dir_path)
            if download_and_unzip_file(HTTP_SERVER + '/static/' + para, dir_path):
                p = start_run_process()
                post_state = cmd
            else:
                remove_dir(dir_path)
        elif cmd.startswith('t:'):
            terminate_process()
            p = None
            post_state = cmd
        elif cmd.startswith('c:'):
def japscan(url, download_chapters, args):
    print("getting url " + url)
    html = get_html(url)
    global last
    if hasattr(args, 'last'):
        last = args.last
    series = title(re.search('(<h1 class="bg-header">).*>(.*)</a>(</h1>)', html.replace('\n', '')).group(2))
    # general info block
    info_gen = re.findall('(<div class="cell">\\s*(.*?)\\s*</div>)', html.replace('\n', ''))
    status = info_gen[7][1]
    author = info_gen[5][1]
    tags = info_gen[7][1]
    # for j in range(len(tags)):
    #     for k in tag_dict:
    #         tags[j] = re.sub(k, tag_dict[k], tags[j])
    chapters = []
    # grab the chapter list
    chapitres = re.search('(<div id="liste_chapitres">(.*)</div>.*<div class="col-1-3")', html.replace('\n', ''))
    for j in re.findall('<li>(.*?)</li>', chapitres.group(1), re.DOTALL | re.MULTILINE)[::-1]:
        match = re.search('<a.*[-/]([0-9.]+).*>Scan (.*) ([0-9.]+) VF( : )?(.*)?<.*', j, re.DOTALL | re.MULTILINE)
        num = float(match.group(1))
        link = "http://" + re.search('href=".*(www.*?)"', j).group(1)
        name = match.group(5)
        date = "01/01/2000"
        serie_short = match.group(2)
        if name:
            name = '{} - {} : {}'.format(serie_short, '{:3.1f}'.format(num).zfill(5), name)
        else:
            name = '{} - {}'.format(series, '{:3.1f}'.format(num).zfill(5))
        if (download_chapters and num in download_chapters) or (not download_chapters and num > last):
            if args.debug or args.verbose:
                print(' Gathering info: "{}"'.format(series))
            chap_html = get_html(link)
            links = ['']
            # HACK: every page has to be fetched and parsed to retrieve its image
            for content in re.findall('<option .* value="(.*?)".*?>.*</option>', chap_html)[::-1]:
                content_html = get_html("http://www.japscan.com" + content)
                search = '<div itemscope itemtype="http://schema.org/Article">.*src="(.*[.][a-z]{0,4})" />'
                link_page = re.search(search, content_html.replace('\n', ''), re.MULTILINE)
                try:
                    links.append(link_page.group(1))
                except:
                    print('An error occurred, unable to find the page image')
                    print(content_html.replace('\n', ''))
            links.remove('')
            links = list(reversed(links))
            chapters.append({'name': name, 'links': links, 'backup_links': links,
                             'date': date, 'pages': len(links), 'num': num})
    args.url = url
    if chapters:
        function_name(chapters, series, tags, author, status, args)
def mymanga(url, download_chapters, args):
    html = get_html(url)
    global last
    if hasattr(args, 'last'):
        last = args.last
    series = title(re.search('(<h2 class="text-border">)(.*)(</h2>)', html.replace('\n', '')).group(2))
    # general info block (not used by this source)
    info_gen = re.findall('(<div class="cell">\\s*(.*?)\\s*</div>)', html.replace('\n', ''))
    status = 'default'  # info_gen[7][1]
    author = 'default'  # info_gen[5][1]
    tags = 'default'  # info_gen[7][1]
    # for j in range(len(tags)):
    #     for k in tag_dict:
    #         tags[j] = re.sub(k, tag_dict[k], tags[j])
    chapters = []
    # grab the chapter list
    chapitres = re.search('(<section class="listchapseries fiche block sep">(.*)</section>)', html.replace('\n', ''))
    for j in re.findall('<li>(.*?)</li>', chapitres.group(1), re.DOTALL | re.MULTILINE)[::-1]:
        match = re.search('<a.* href=".*/chapitre-([0-9.]+)/', j, re.DOTALL | re.MULTILINE)
        num = float(match.group(1))
        link = "http://" + re.search('href=".*(www.*?)" title', j).group(1)
        name = match.group(1)
        date = "01/01/2000"
        serie_short = series
        if name:
            name = '{} - {} : {}'.format(serie_short, '{:3.1f}'.format(num).zfill(5), name)
        else:
            name = '{} - {}'.format(series, '{:3.1f}'.format(num).zfill(5))
        if (download_chapters and num in download_chapters) or (not download_chapters and num > last):
            if args.debug or args.verbose:
                print(' Gathering info: "{}"'.format(series))
            chap_html = get_html(link)
            links = ['']
            # HACK: every page has to be fetched and parsed to retrieve its image
            for content in re.findall('<option value="(.*[0-9]?)" ', chap_html)[::-1]:
                content_html = get_html("http://www.hitmanga.eu/" + content)
                search = '<table id="picture">.*src="(.*[.][a-z]{0,4}?)" alt=.*</table>'
                link_page = re.search(search, content_html.replace('\n', ''), re.MULTILINE)
                try:
                    links.append(link_page.group(1))
                except:
                    print('An error occurred, unable to find the page image')
                    print(content_html.replace('\n', ''))
            links.remove('')
            links = list(reversed(links))
            chapters.append({'name': name, 'links': links, 'backup_links': links,
                             'date': date, 'pages': len(links), 'num': num})
    args.url = url
    if chapters:
        function_name(chapters, series, tags, author, status, args)
    b = sp.sub(" ", t.string)
    t.replace_with(b)
for f in soup.findAll("a", text=re.compile(u".*(Eventos de Linux Foundation).*")):
    texts = f.find_parent("fieldset").div.ul.find_all("strong")
    for t in texts:
        t.unwrap()
e = soup.find("span", attrs={'class': "enlace"})
if e and e.parent.name == "li":
    e.parent.extract()
util.set_menu(soup)
h = util.get_html(soup)
# typo fixes applied to the generated (Spanish) HTML
h = h.replace("Objectivos de aprendizaje", "Objetivos de aprendizaje")  # 7, 11
h = h.replace(">31</a></h1>", ">31. zypper</a></h1>")  # 31
h = h.replace(">31</option>", ">31. zypper</option>")  # 31
h = h.replace(" del sisco ", " del disco ")
h = h.replace("miltihebra", "multihebra")
h = h.replace("el ajusta de E/S", "el ajuste de E/S")
h = h.replace(". Se este", ". Si este")
h = h.replace(" tital ", " total ")
h = h.replace(" para para ", " para ")
h = h.replace("revision_umber", "revision_number")
h = h.replace("cuentasde", "cuentas de")
h = h.replace("/opt/dolphy_app /man", "/opt/dolphy_app/man")
h = h.replace("archivosy propietarios", "archivos y propietarios")
h = h.replace("$tar", "$ tar")
    util.h_to_a(out, h, ct)
    ct = ct + 1
for ol in out.findAll("ol"):
    if ol.parent and ol.parent.name == "ol":
        ol.unwrap()
for p in out.findAll("pre"):
    for s in p.findAll("strong"):
        if "style" in s.attrs and "class" not in s.attrs:
            del s.attrs["style"]
            s.attrs["class"] = "resaltar"
for i in out.findAll("iframe"):
    if "src" in i.attrs and i.attrs["src"].startswith("//"):
        i.attrs["src"] = "http:" + i.attrs["src"]
util.set_menu(out)
html = util.get_html(out, True)
html = html.replace(u"–", "-")
html = html.replace(u"—", "-")
r = re.compile(r"([rwx])=2([210])")
html = r.sub("\\1=2<sup>\\2</sup>", html)
r = re.compile(r"</p>\s*<li>")
html = r.sub("</p><ul><li>", html)
r = re.compile(r"</li>\s*<p>")
html = r.sub("</li></ul><p>", html)
util.escribir(html, oht)
# out.prettify("utf-8", formatter="html")
# with open(oht, "wb") as file:
#     file.write(html)  # .encode('utf8')
def update(self):
    model = self.get_model()
    deck = self.get_deck()
    note_map = self.get_notes()
    model['did'] = deck['id']
    self._update_templates(model)
    # processes data from kd website
    self.log.info('loading data from kanji damage website (this will take quite a while)...')
    self.col.models.setCurrent(model)
    self.col.decks.select(deck['id'])
    url = KD_DAMAGE_BASE_URL + KD_KANJI_PATH + '/1'
    tries = 0
    while url:
        try:
            doc = util.get_html(url, self.log)
            if doc is None:
                return
            util.add_base_url(doc, KD_DAMAGE_BASE_URL)
            # retrieves the data
            kanji = self._get_kanji(doc)
            meaning = self._get_meaning(doc)
            # get map key
            key = None
            if type(kanji) is lxml.html.HtmlElement:
                # kanji is an image
                self._download_images(kanji, KD_DAMAGE_BASE_URL)
                kanji = util.html_to_string(kanji)
                key = meaning
            elif KD_VALID_KANJI.match(kanji):
                key = kanji
            # update/create note
            if key:
                note = note_map[key] if key in note_map else self.col.newNote()
                note['Kanji'] = kanji
                note['Meaning'] = meaning
                note['Number'] = self._get_number(doc)
                note['Description'] = self._get_description(doc, KD_DAMAGE_BASE_URL)
                note['Usefulness'] = self._get_usefulness(doc)
                note['Full used In'] = self._get_used_in(doc, KD_DAMAGE_BASE_URL)
                onyomi_full, onyomi = self._get_onyomi(doc, KD_DAMAGE_BASE_URL)
                note['Full onyomi'] = onyomi_full
                note['Onyomi'] = onyomi
                kun_full, kun, kun_meaning, kun_use = self._get_kunyomi(doc, KD_DAMAGE_BASE_URL)
                note['Full kunyomi'] = kun_full
                note['First kunyomi'] = kun
                note['First kunyomi meaning'] = kun_meaning
                note['First kunyomi usefulness'] = kun_use
                mnemonic_full, mnemonic = self._get_mnemonic(doc, KD_DAMAGE_BASE_URL)
                note['Full mnemonic'] = mnemonic_full
                note['Mnemonic'] = mnemonic
                note['Components'] = self._get_components(doc, KD_DAMAGE_BASE_URL)
                jk_full, jk, jk_meaning, jk_use = self._get_jukugo(doc, KD_DAMAGE_BASE_URL)
                note['Full jukugo'] = jk_full
                note['First jukugo'] = jk
                note['First jukugo meaning'] = jk_meaning
                note['First jukugo usefulness'] = jk_use
                note['Full header'] = self._get_header(doc, KD_DAMAGE_BASE_URL)
                note['Full lookalikes'] = self._get_lookalikes(doc, KD_DAMAGE_BASE_URL)
                if key not in note_map:
                    self.col.addNote(note)
                    note_map[key] = note
                else:
                    note.flush()
                self.log.debug(util.note_to_json(note))
            else:
                self.log.info('ignored kanji: %s', kanji)
            # finds the link to the next kanji
            url = next(iter(doc.xpath('//div[@class="span2 text-righted"]/a[1]/@href')), None)
            tries = 0
        except OSError as e:
            if (e.errno == 101) and (tries < 3):
                tries += 1
            else:
                self.log.exception('failed to retrieve from %s', url)
                url = None
    self.col.save()
def get(self):
    spide_url = getattr(self.resource, 'spide_url')
    self.content = get_html(self.resource.spide_url)
def mangahere(url, download_chapters, args):
    html = get_html(url)
    global last
    if hasattr(args, 'last'):
        last = args.last
    series = title(re.search('<h1 class="title"><span class="title_icon"></span>(.*?)</h1>', html.replace('\n', '')).group(1))
    status = re.search('<li><label>Status:</label>(.*?)<', html.replace('\n', '')).group(1)
    author = ', '.join(re.findall('<a.*?>(.*?)</a>', re.search('<li><label>Author\\(?s?\\)?:</label>(.*?)</li>', html.replace('\n', '')).group(1)))
    tags = re.search('<li><label>Genre\\(s\\):</label>(.*?)</li>', html).group(1).split(', ')
    for j in range(len(tags)):
        for k in tag_dict:
            tags[j] = re.sub(k, tag_dict[k], tags[j])
    chapters = []
    for j in re.findall('<li>\\s*<span class="left">\\s*(.*?\\d{4}</span>)\\s*</li>', html, re.DOTALL | re.MULTILINE)[::-1]:
        match = re.search('<a.*?>.*?([\\d,.]+)\\s*</a>\\s*<span.*?>\\s*(.*?)\\s*</span>', j, re.DOTALL | re.MULTILINE)
        name = match.group(2)
        num = float(match.group(1))
        link = re.search('href="(.*?)"', j).group(1)
        try:
            date = datetime.strptime(re.search('([A-Za-z]*? \\d{1,2}, \\d{4})</span>', j).group(1), '%b %d, %Y').strftime('%Y-%m-%d')
        except:
            date = datetime.today().strftime('%Y-%m-%d')
        if name:
            name = '{} - {} : {}'.format(series, '{:3.1f}'.format(num).zfill(5), name)
        else:
            name = '{} - {}'.format(series, '{:3.1f}'.format(num).zfill(5))
        if (download_chapters and num in download_chapters) or (not download_chapters and num > last):
            if args.debug or args.verbose:
                print(' Gathering info: "{}"'.format(name))
            chap_html = get_html(link)
            img_url = re.sub('001.([A-Za-z]{3})', '{:03}.\\1', re.search('<a.*?>\\s*<img[^<]*?src="(.*?)".*?>\\s*</a>', chap_html, re.DOTALL | re.MULTILINE).group(1))
            if '{:03}' not in img_url:
                img_url = re.sub('01.([A-Za-z]{3})', '{:02}.\\1', img_url)
            pages = max([int(i) for i in re.findall('<option value=".*?".*?>(\\d+)</option>', chap_html)])
            b_links = {float(i[1]): i[0] for i in re.findall('<option value="(.*?)".*?>(\\d+)</option>', chap_html)}
            b_links = [b_links[i + 1] for i in range(pages)]
            links = [img_url.format(i + 1) for i in range(pages)]
            chapters.append({'name': name, 'links': links, 'backup_links': b_links,
                             'date': date, 'pages': pages, 'num': num})
    if chapters:
        function_name(chapters, series, tags, author, status, args)