def scrape(self, username):
    # Load the user's media timeline and scroll until everything is rendered.
    self._connect('{}/{}/media'.format(self.base_url, username))
    if self._mode != 'silent':
        print('Crawling...')
    self.scrollToBottom()
    source = self.source()
    soup = bs(source, 'html.parser')
    # title = soup.find('title')
    # name = title.get_text().replace('Media Tweets by ', '').replace(' | Twitter', '')
    # avatar_url = soup.find("a", {"class": "ProfileCardMini-avatar"}).get('data-resolved-url-large')
    # background_url = soup.find("div", {"class": "ProfileCanopy-headerBg"}).find('img').get('src')

    # Collect one download task per photo: (url, username, filename).
    tasks = []
    for div in soup.find_all('div', {'class': 'AdaptiveMedia-photoContainer'}):
        url = div.get('data-image-url')
        tasks.append((url + ':large', username, get_filename(url)))
    if self._mode != 'silent':
        print('{} media found.'.format(len(tasks)))
    return tasks
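# The method above relies on a `get_filename` helper to turn a media URL into
# a local file name. A minimal sketch, assuming the helper simply takes the
# last path component (the project's real implementation may differ):

import os
from urllib.parse import urlparse


def get_filename(url):
    """Extract the file name from a media URL,
    e.g. 'https://pbs.twimg.com/media/abc.jpg' -> 'abc.jpg'."""
    return os.path.basename(urlparse(url).path)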
def scrape(self, username):
    # Load the user's media timeline and scroll until everything is rendered.
    self._connect('{}/{}/media'.format(self.base_url, username))
    if self._mode != 'silent':
        print('Crawling...')
    self.scrollToBottom()
    source = self.source()
    soup = bs(source, 'html.parser')

    # Walk the stream item by item so tweets without a photo container can be
    # treated as video tweets and resolved through their tweet id.
    tasks = []
    for li in soup.find_all('li', {'class': 'js-stream-item stream-item stream-item '}):
        photos = li.find_all('div', {'class': 'AdaptiveMedia-photoContainer'})
        if not photos:
            try:
                img_url, vid_url = get_twitter_video_url(li['data-item-id'])
                tasks.append((img_url + ':large', username, get_basename(get_filename(img_url))))
                tasks.append((vid_url, username, get_basename(get_filename(vid_url))))
            except Exception as e:
                # Append instead of overwrite so earlier errors are preserved.
                with open('error.txt', 'a', encoding='utf-8') as f:
                    f.write(str(e) + '\n')
                    f.write(str(li) + '\n')
        else:
            for photo in photos:
                url = photo['data-image-url']
                tasks.append((url + ':large', username, get_basename(get_filename(url))))
    if self._mode != 'silent':
        print('{} media found.'.format(len(tasks)))
    return tasks
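# This variant additionally assumes `get_basename`, which strips the file
# extension so photo and video names can be derived consistently, and a
# `get_twitter_video_url` helper that resolves a tweet id to its thumbnail
# and video URLs (not sketched here, since it depends on Twitter internals).
# A minimal sketch of `get_basename` under that assumption:

import os


def get_basename(filename):
    """Drop the extension from a file name, e.g. 'abc.jpg' -> 'abc'."""
    return os.path.splitext(filename)[0]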
def scrape(self, id, content_type='all'):
    self._connect(self.post_url + id)
    self.id = id
    self.type = content_type
    if self._mode != 'silent':
        print('Crawling...')

    # Read the page count from the last entry of the pager.
    # Assumes `self._driver` is a Selenium WebDriver; the original used
    # `get_element_by_*` calls, which do not exist in Selenium.
    pager_container = self._driver.find_element_by_class_name('page-list')
    last_pager = pager_container.find_elements_by_tag_name('li')[-1]
    num_page = int(last_pager.find_element_by_tag_name('a').text)
    print('# of pages: {}'.format(num_page))

    # Crawl each listing page, pausing between pages to avoid hammering the site.
    tasks = []
    for p in range(1, num_page + 1):
        url = 'https://www.pixiv.net/member_illust.php?id={}&type={}&p={}'.format(self.id, self.type, p)
        self._connect(url)
        time.sleep(self._next_page_pause_time)
        print(url)
        # TODO: scrape each post on this page and append download tasks.
    return tasks
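# The Twitter scrapers above return (url, username, filename) tasks rather
# than downloading in place (the Pixiv version is still a work in progress
# and returns an empty list). A minimal sketch of a consumer, assuming plain
# HTTP downloads with `requests` (the project may use its own routine):

import os

import requests


def download_tasks(tasks, out_dir='downloads'):
    for url, username, filename in tasks:
        folder = os.path.join(out_dir, username)
        os.makedirs(folder, exist_ok=True)  # one folder per user
        resp = requests.get(url, timeout=30)
        resp.raise_for_status()
        with open(os.path.join(folder, filename), 'wb') as f:
            f.write(resp.content)  # save the media file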