from grab import Grab


def crawl(keyword):
    """Search yinghuacd.com for a keyword and collect episode video URLs."""
    urls = []
    g = Grab()
    g.go(f'http://www.yinghuacd.com/search/{keyword}/')
    candidate_selectors = g.doc('//div[@class="lpic"]/ul/li/a/@href')
    # tmpfix: avoid too many inaccurate results
    if len(candidate_selectors) >= 3:
        candidate_selectors = candidate_selectors[0:3]
        print(f'[WARN] too many candidates for {keyword}; keeping only the first three.')
    # for candidates
    for candidate in candidate_selectors:
        g.go(f'http://www.yhdm.so{candidate.text()}')
        episode_selectors = g.doc('//div[@class="movurl"]/ul/li/a/@href')
        # tmpfix: avoid too many episodes
        if len(episode_selectors) >= 30:
            episode_selectors = episode_selectors[:30]
            print('[WARN] too many episodes; keeping only the first 30.')
        # for episodes
        for episode in episode_selectors:
            g.go(f'http://www.yhdm.so{episode.text()}')
            title_selectors = g.doc('//div[@class="gohome l"]/h1')
            data_selectors = g.doc('//div[@id="playbox"]/@data-vid')
            title = title_selectors[0].text()
            url = data_selectors[0].text()
            # tmpfix: strip a trailing "$mp4" marker from the video URL
            if url[-4:] == "$mp4":
                url = url[:-4]
            # append to results
            print(f'[INFO] crawled {title} {url}')
            urls.append({
                'formattedUrl': url,
                'title': title,
                'snipped': '',
            })
    return urls
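# Usage sketch (not part of the original): crawl() returns a list of dicts with
# 'formattedUrl' and 'title' keys, so the result can be printed or serialized
# directly. The keyword below is only an illustrative placeholder.
if __name__ == '__main__':
    results = crawl('keyword')
    for item in results:
        print(item['title'], item['formattedUrl'])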
import os
import shutil
import urllib.request
from multiprocessing.pool import ThreadPool

from grab import Grab


def download():
    url = input('url:')
    pool = ThreadPool(4)
    main_page = Grab()
    main_page.go(url)
    title = main_page.doc('//*[@id="novel_color"]/p').text()
    # Start from a clean temp directory.
    if os.path.isdir('temp'):
        shutil.rmtree('temp')
    os.mkdir('temp')
    path = 'temp/'
    main_page.doc.save(path + 'main.html')
    urls_xpath = main_page.doc('//*[@id="novel_color"]/div/dl/dd/a')
    urls = []
    titles = {}
    for link in urls_xpath:
        urls.append('http://novel18.syosetu.com' + link.attr('href'))
        key = link.attr('href').split('/')[2]
        titles[key] = link.text().replace('/', ' ')
    print(titles)

    def pages(page_url):
        num = page_url.split('/')[4]
        f = urllib.request.urlopen(page_url)
        data = f.read()
        with open(path + '%04u.%s.html' % (int(num), titles[num]), "wb") as code:
            code.write(data)
        print(num, titles[num])

    pool.map(pages, urls)
def get_new_address(self):
    base_url = "http://www.fakemailgenerator.com/"
    g = Grab()
    g.setup(connect_timeout=20, timeout=20)
    g.go(base_url)
    if g.response.code == 200:
        email_name = g.doc('//*[@id="home-email"]/@value').text()
        email_site = g.doc('//*[@id="domain"]').text()
        # Store the generated address as a (local part, domain) tuple.
        self.full_email = email_name, email_site
from urllib.parse import urlparse

import weblib.error
from feedgen.feed import FeedGenerator
from flask import make_response
from grab import Grab


def feed_http(request):
    """HTTP Cloud Function.

    Args:
        request (flask.Request): The request object.
        <http://flask.pocoo.org/docs/1.0/api/#flask.Request>
    Returns:
        The response text, or any set of values that can be turned into a
        Response object using `make_response`
        <http://flask.pocoo.org/docs/1.0/api/#flask.Flask.make_response>.
    """
    request_args = request.args
    url = request_args['url']
    g = Grab()
    fg = FeedGenerator()
    g.go(url)
    fg.id(url)
    fg.title('Rabota.UA | rss feed')
    url_parsed = urlparse(g.response.url)
    fg.link(href=url_parsed.scheme + '://' + url_parsed.hostname, rel='alternate')
    fg.description(g.doc('/html/head/title').text())
    count = int(
        g.doc('//span[@id="ctl00_content_vacancyList_ltCount"]/span').one().text())
    if count == 0:
        itm_list = []
    else:
        articles = g.doc.select(
            '//table[contains(@class, "f-vacancylist-tablewrap")]').one()
        itm_list = articles.select(
            'tr[@id]/td/article/div[contains(@class, "card-body")]')
    for item in itm_list:
        vac_title = item.select(
            'div[1]//h2[contains(@class, "card-title")]/a/@title').text().strip()
        vac_url = g.make_url_absolute(
            item.select(
                'div[1]//h2[contains(@class, "card-title")]/a/@href').text())
        try:
            vac_description = item.select(
                'div[contains(@class, "card-description")]').text().strip()
        except weblib.error.DataNotFound:
            vac_description = 'N/A'
        fe = fg.add_entry()
        print(vac_title)
        fe.id(vac_url)
        fe.link({'href': vac_url})
        fe.source(vac_url)
        fe.title(vac_title)
        fe.description(vac_description)
    response = make_response(fg.atom_str(pretty=True, extensions=False))
    response.headers['Content-Type'] = 'application/rss+xml; charset=UTF-8'
    return response
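# Local-test sketch (assumption, not from the original): wire feed_http into a
# small Flask app so it can be exercised outside the Cloud Functions runtime.
# The module layout, route path '/feed', and app name are hypothetical.
from flask import Flask, request

app = Flask(__name__)


@app.route('/feed')
def feed():
    # Expects a query string like /feed?url=<vacancy listing URL>
    return feed_http(request)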
from grab import Grab


class MultsInfo:
    def __init__(self, url):
        self.url = url
        self.g = Grab()
        self.go()

    def go(self):
        self.g.go(self.url)
        self.name = self.g.doc('//html/body/center/div/div[2]/h1').text().split('"')[1]
        self.year = self.g.doc('//html/body/center/div/div[2]/div[3]/p[1]/a[1]').text().split()[1]
        self.turl = "http://mults.info/" + self.g.doc('//a[b="torrent"]/@href').text()
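# Usage sketch (assumption, not from the original): MultsInfo fetches and parses
# the page in its constructor, so the extracted fields are available right after
# instantiation. The URL below is a hypothetical example page.
info = MultsInfo('http://mults.info/mults/?id=1')
print(info.name, info.year, info.turl)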
from urllib.parse import quote

from grab import Grab


def get_by_name(name):
    # KINOPOISK_SEARCH is expected to be defined elsewhere as a search URL template.
    url = KINOPOISK_SEARCH % quote(name)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:45.0) Gecko/20100101 Firefox/45.0'
    }
    g = Grab(log_dir='tmp', referer='https://www.kinopoisk.ru/', headers=headers)
    g.go(url)
    return g.doc('//meta[@name="twitter:app:url:iphone"]/@content').text().split('/')[-1]
from urllib.parse import quote

from grab import Grab


class KPMovie:
    def __init__(self, kid, cached=None):
        # Movie id.
        self.kid = kid
        self.g = Grab()
        self.cached = cached
        self.go()

    def go(self):
        if self.cached:
            print('Loading from cache', self.kid)
            c = ('%s' % self.cached).encode('cp1251', 'ignore')
            self.g.doc.body = c
            self.g.doc.parse()
        else:
            print('Downloading', self.kid)
            self.g.go(self._url())
            self.cached = self.g.doc.unicode_body()
        self.name_ru = self.g.doc('//h1[contains(@itemprop, "name")]').text()
        self.name_en = self.g.doc('//span[contains(@itemprop, "alternativeHeadline")]').text()
        self.year = self.g.doc('//table[contains(@class, "info")]/tr/*/div/a').text()
        self.genres = [el.text() for el in self.g.doc('//span[contains(@itemprop, "genre")]/a')]

    def _url(self):
        # KINOPOISK_URL is expected to be defined elsewhere as a movie URL template.
        return KINOPOISK_URL % self.kid

    @staticmethod
    def get_by_name(name):
        url = KINOPOISK_SEARCH % quote(name)
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:45.0) Gecko/20100101 Firefox/45.0'
        }
        g = Grab(log_dir='tmp', referer='https://www.kinopoisk.ru/', headers=headers)
        g.go(url)
        return g.doc('//meta[@name="twitter:app:url:iphone"]/@content').text().split('/')[-1]
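# Usage sketch (assumption, not from the original): look up a movie id by title,
# then build a KPMovie from it. KINOPOISK_SEARCH and KINOPOISK_URL must already
# be defined as URL templates for this to run; the title is a placeholder.
kid = KPMovie.get_by_name('Some movie title')
movie = KPMovie(kid)
print(movie.name_ru, movie.year, movie.genres)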
from grab import Grab


def ycombinator():
    spider = Grab()
    main_domain = 'https://news.ycombinator.com/'
    spider.go(main_domain)
    items = spider.doc('//td[@class="title"]/a[@class="storylink"]')
    links = items.attr_list('href')
    # Make relative links absolute against the main domain.
    links = list(
        map(lambda url: url if 'https://' in url or 'http://' in url
            else "{}{}".format(main_domain, url), links)
    )
    title = items.text_list()
    result = [{'url': data[0], 'title': data[1]} for data in list(zip(links, title))]
    return result
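# Usage sketch (not part of the original): ycombinator() returns a list of
# {'url': ..., 'title': ...} dicts for the current front-page stories.
for story in ycombinator():
    print(story['title'], '->', story['url'])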
import os
from collections import defaultdict
from json import loads
from logging import DEBUG, basicConfig

from grab import Grab
from pandas import DataFrame, read_excel


def fetch_contacts(username, password):
    basicConfig(level=DEBUG)
    g = Grab()
    home_url = 'https://www.linkedin.com'
    # Log in with the provided credentials.
    g.go(home_url + '/uas/login')
    g.doc.set_input('session_key', username)
    g.doc.set_input('session_password', password)
    g.doc.submit()

    def get_nb_contacts():
        elem = g.doc('//li[@class="nav-item account-settings-tab"]/a')
        own_page = elem.attr('href')
        g.go(own_page)
        # g.doc.save('z.html')
        html = g.doc.select('//*[@id="top_card-content"]').html()
        start = html.find('{')
        com = html[start:-10]
        content = loads(com)["content"]
        res = content["ContactInfo"]["distance"]["numberOfConnections"]
        return res

    nb_contacts = 0
    while nb_contacts == 0:
        try:
            nb_contacts = get_nb_contacts()
        except:
            pass

    contacts_url = g.doc('//*[@id="advanced-search"]/@href').text()
    g.go(contacts_url)

    def process_comments(commented_line):
        start = commented_line.find('{')
        tmp = commented_line[start:-10]
        res = loads(tmp)["content"]["page"]["voltron_unified_search_json"]["search"]
        next_page_url = res["baseData"]["resultPagination"]["nextPage"]["pageURL"]
        results = res["results"]
        contacts = [next(iter(X.values())) for X in results]
        return contacts, home_url + next_page_url

    def process1contact(contact):
        s_dict = defaultdict(lambda: '')
        l_dict = defaultdict(lambda: [])
        s_dict["lastname"] = contact["lastName"]
        s_dict["firstname"] = contact["firstName"]
        try:
            s_dict["id"] = contact["id"]
        except:
            s_dict["id"] = ''
        try:
            s_dict["job_title"] = contact["fmt_headline"]
        except:
            s_dict["job_title"] = ''
        print(s_dict)
        links_profile = []
        for key in contact.keys():
            if 'link_nprofile_view' in key:
                links_profile.append(key)
        if len(links_profile) == 0:
            print("ERROR: No 'link_profile' provided in contact.keys()")
        link_profile = links_profile[0]
        url_contact = contact[link_profile]
        g.go(url_contact)
        # email
        try:
            s_dict["email"] = g.doc.select('//a[contains(@href,"mailto")]').text()
        except:
            s_dict["email"] = ''
        # phone number
        try:
            s_dict["phone"] = g.doc.select('//div[@id="phone-view"]/ul/li').text()
        except:
            s_dict["phone"] = ''
        # skills
        for elem in g.doc.select('//ul[@class="skills-section"]//span[contains(@class, "endorse-item-name-text")]'):
            l_dict["main_skills"].append(elem.text())
        for elem in g.doc.select('//ul[@class="skills-section compact-view"]//span[contains(@class, "endorse-item-name-text")]'):
            l_dict["other_skills"].append(elem.text())
        # companies
        tmp = g.doc.select('//div[@class="editable-item section-item current-position"]//a[contains(@href, "company-name")]')
        for elem in tmp:
            s_dict["current_company"] += elem.text()
        tmp = g.doc.select('//div[@class="editable-item section-item past-position"]//a[contains(@href, "company-name")]')
        for elem in tmp:
            if len(elem.text()) > 0:
                l_dict["former_companies"].append(elem.text())
        # summary
        try:
            s_dict["summary"] = g.doc.select('//div[@class="summary"]/p').text()
        except:
            s_dict["summary"] = ''
        # languages
        tmp = g.doc.select('//div[@id="languages"]//li[@class="section-item"]/h4/span')
        for elem in tmp:
            l_dict["languages"].append(elem.text())
        # projects
        project_names = g.doc('//div[@id="background-projects"]//span[@dir="auto"]')
        project_dates = g.doc('//div[@id="background-projects"]//span[@class="projects-date"]/time')
        for name, date in zip(project_names, project_dates):
            l_dict["projects"].append((name.text(), date.text()))
        # certifications
        certification_titles = g.doc('//div[@id="background-certifications"]//a[contains(@href,"certification_company_title")]')
        certification_orgs = g.doc('//div[@id="background-certifications"]//a[contains(@href,"certification-org_name")]')
        certification_dates = g.doc('//div[@id="background-certifications"]//span[@class="certification-date"]/time')
        for title, org, date in zip(certification_titles, certification_orgs, certification_dates):
            title_text = title.text()
            end = title_text.find('(')
            l_dict["certifications"].append((title_text[:end], org.text(), date.text()))
        # coursework
        schools = g.doc('//div[@id="background-education-container"]//a[contains(@href,"edu-school-name")]')
        dates = g.doc('//div[@id="background-education-container"]//span[@class="education-date"]')
        for school, date in zip(schools, dates):
            l_dict["coursework"].append((school.text(), date.text()))
        # graduation year
        try:
            end_schools = [int(date.text().split(' ')[-1]) for date in dates]
            s_dict["graduation_year"] = max(end_schools)
        except:
            s_dict["graduation_year"] = ''
        # experiences
        xp_titles = g.doc('//div[@id="background-experience"]//a[contains(@href,"profile_title")]')
        xp_comps = g.doc('//div[@id="background-experience"]//a[contains(@href,"company-name")]')
        xp_comps = filter(lambda x: len(x.text()) > 0, xp_comps)
        xp_dates = g.doc('//div[@id="background-experience"]//span[@class="experience-date-locale"]/time')
        xp_places = g.doc('//div[@id="background-experience"]//span[@class="locality"]')
        xp_summaries = g.doc('//div[@id="background-experience"]//p[@class="description summary-field-show-more"]')
        for title, company, date, place, summary in zip(xp_titles, xp_comps, xp_dates, xp_places, xp_summaries):
            l_dict["experiences"].append((title.text(), company.text(), date.text(), place.text(), summary.text()))

        def pretty_tuple(T):
            if type(T) == tuple:
                return ', '.join(str(t) for t in T)
            else:
                return T

        pretty_list = lambda L: '\n'.join(pretty_tuple(l) for l in L)
        s_l_dict = {k: pretty_list(v) for k, v in l_dict.items()}
        s_dict.update(s_l_dict)
        return s_dict

    path_to_desktop = os.path.expanduser('~/Desktop/')
    filename = path_to_desktop + 'contacts.xlsx'
    if os.path.isfile(filename):
        saved_df = read_excel(filename)
        saved_IDs = saved_df["id"].values
    else:
        saved_IDs = []
    contacts = []
    nb_pages = int((nb_contacts - 1) / 10 + 1)
    # nb_pages = 2
    for i in range(nb_pages):
        comments = g.doc.select('//*[@id="voltron_srp_main-content"]').html()
        new_contacts, next_page_url = process_comments(comments)
        if i == nb_pages - 1:
            new_contacts = filter(lambda contact: contact["distance"] == 1, new_contacts)
        new_contacts = filter(lambda contact: contact["id"] not in saved_IDs, new_contacts)
        contacts.extend(new_contacts)
        g.go(next_page_url)
    processed_contacts = [process1contact(contact) for contact in contacts]
    df = DataFrame(processed_contacts)
    cols_sorted = ["lastname", "firstname", "email", "phone", "job_title",
                   "main_skills", "other_skills", "current_company",
                   "former_companies", "certifications", "projects",
                   "graduation_year", "coursework", "summary", "experiences",
                   "languages", "id"]
    df_cols = df.columns.tolist()
    cols = [col for col in cols_sorted if col in df_cols]
    df = df[cols]
    if os.path.isfile(filename):
        # NOTE: DataFrame.append was removed in pandas 2.x; pandas.concat is the modern equivalent.
        df = saved_df.append(df)
    df.to_excel(filename, sheet_name='sheet1', index=False)
    return processed_contacts
from grab import Grab

spider = Grab()
main_domain = 'https://news.ycombinator.com/'
spider.go(main_domain)
items = spider.doc('//td[@class="title"]/a[@class="storylink"]')
links = items.attr_list('href')
links = list(
    map(lambda url: url if 'https://' in url or 'http://' in url
        else "{}{}".format(main_domain, url), links))
title = items.text_list()
result = list(zip(links, title))
import logging

from grab import Grab

logging.basicConfig(level=logging.DEBUG)
g = Grab()
g.setup(
    headers={
        'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
    })
g.go('https://www.youtube.com/channel/UCy3eOZh9n5P3aWWKXmrq6yg')
# g.doc.save('x.html')
result = g.doc('//a[@id="video-title"]').exists()
print(result)
# crawl github project list
import logging

from grab import Grab

logging.basicConfig(level=logging.DEBUG)
g = Grab()
g.go('https://github.com/login')
print(g.doc.form)
g.doc.set_input('login', '*****@*****.**')
g.doc.set_input('password', '')
g.doc.submit()
g.doc.save('/tmp/x.html')
home_url = g.doc('//a[contains(@class, "header-nav-link name")]/@href').text()
repo_url = home_url + '?tab=repositories'
g.go(repo_url)
for elem in g.doc.select('//h3[@class="repo-list-name"]/a'):
    print('%s: %s' % (elem.text(), g.make_url_absolute(elem.attr('href'))))

# from grab.spider import Spider, Task
# import logging
#
# class ExampleSpider(Spider):
#     def task_generator(self):