def download(self, queue, folder=''):
    if not isinstance(folder, text):
        folder = str(folder)

    if self.path:
        folder = os.path.join(self.path, folder)

    if not os.path.exists(folder):
        logger.warn('Path \'{0}\' does not exist, creating.'.format(folder))
        try:
            os.makedirs(folder)
        except EnvironmentError as e:
            logger.critical('{0}'.format(str(e)))
    else:
        logger.warn('Path \'{0}\' already exists.'.format(folder))

    queue = [(self, url, folder) for url in queue]

    pool = multiprocessing.Pool(self.size, init_worker)
    [pool.apply_async(download_wrapper, args=item) for item in queue]

    pool.close()
    pool.join()
def __api_suspended_tag_parser(tag_id, sorting, max_page=1):
    logger.info('Searching for doujinshi with tag id {0}'.format(tag_id))
    result = []
    response = request('get', url=constant.TAG_API_URL,
                       params={'sort': sorting, 'tag_id': tag_id}).json()
    page = max_page if max_page <= response['num_pages'] else int(response['num_pages'])

    for i in range(1, page + 1):
        logger.info('Getting page {} ...'.format(i))

        if page != 1:
            response = request('get', url=constant.TAG_API_URL,
                               params={'sort': sorting, 'tag_id': tag_id}).json()

        for row in response['result']:
            title = row['title']['english']
            title = title[:85] + '..' if len(title) > 85 else title
            result.append({'id': row['id'], 'title': title})

    if not result:
        logger.warn('No results for tag id {}'.format(tag_id))

    return result
def __api_suspended_search_parser(keyword, page):
    logger.debug('Searching doujinshis using keywords {0}'.format(keyword))
    result = []
    i = 0
    while i < 5:
        try:
            response = request('get', url=constant.SEARCH_URL,
                               params={'query': keyword, 'page': page}).json()
        except Exception as e:
            i += 1
            if not i < 5:
                logger.critical(str(e))
                logger.warn('If you are in China, please configure the proxy to fu*k GFW.')
                exit(1)
            continue
        break

    if 'result' not in response:
        raise Exception('No result in response')

    for row in response['result']:
        title = row['title']['english']
        title = title[:85] + '..' if len(title) > 85 else title
        result.append({'id': row['id'], 'title': title})

    if not result:
        logger.warn('No results for keywords {}'.format(keyword))

    return result
def search_parser(keyword, page):
    logger.debug('Searching doujinshis of keyword {0}'.format(keyword))
    result = []
    try:
        response = request('get', url=constant.SEARCH_URL,
                           params={'q': keyword, 'page': page}).content
    except requests.ConnectionError as e:
        logger.critical(e)
        logger.warn('If you are in China, please configure the proxy to fu*k GFW.')
        raise SystemExit

    html = BeautifulSoup(response, 'html.parser')
    doujinshi_search_result = html.find_all('div', attrs={'class': 'gallery'})
    for doujinshi in doujinshi_search_result:
        doujinshi_container = doujinshi.find('div', attrs={'class': 'caption'})
        title = doujinshi_container.text.strip()
        title = title if len(title) < 85 else title[:82] + '...'
        id_ = re.search(r'/g/(\d+)/', doujinshi.a['href']).group(1)
        result.append({'id': id_, 'title': title})

    if not result:
        logger.warn('No results for keyword {}'.format(keyword))

    return result
def tag_parser(tag_name, max_page=1):
    result = []
    tag_name = tag_name.lower()
    tag_name = tag_name.replace(' ', '-')

    for p in range(1, max_page + 1):
        logger.debug('Fetching page {0} for doujinshi with tag \'{1}\''.format(p, tag_name))

        response = request('get', url='%s/%s?page=%d' % (constant.TAG_URL, tag_name, p)).content
        html = BeautifulSoup(response, 'html.parser')
        doujinshi_items = html.find_all('div', attrs={'class': 'gallery'})
        if not doujinshi_items:
            logger.error('Cannot find doujinshi id of tag \'{0}\''.format(tag_name))
            return

        for i in doujinshi_items:
            # href looks like '/g/<id>/'; strip() removes the surrounding '/' and 'g' characters
            doujinshi_id = i.a.attrs['href'].strip('/g')
            doujinshi_title = i.a.text.strip()
            doujinshi_title = doujinshi_title if len(doujinshi_title) < 85 else doujinshi_title[:82] + '...'
            result.append({'title': doujinshi_title, 'id': doujinshi_id})

    if not result:
        logger.warn('No results for tag \'{}\''.format(tag_name))

    return result
def download(self, queue, folder=''):
    if not isinstance(folder, text):
        folder = str(folder)

    if self.path:
        folder = os.path.join(self.path, folder)

    if not os.path.exists(folder):
        logger.warn('Path \'{0}\' does not exist, creating.'.format(folder))
        try:
            os.makedirs(folder)
        except EnvironmentError as e:
            logger.critical('{0}'.format(str(e)))
            exit(1)
    else:
        logger.warn('Path \'{0}\' already exists.'.format(folder))

    queue = [([url], {'folder': folder}) for url in queue]

    self.thread_pool = threadpool.ThreadPool(self.thread_count)
    requests_ = threadpool.makeRequests(self._download, queue, self._download_callback)
    [self.thread_pool.putRequest(req) for req in requests_]

    self.thread_pool.wait()
def tag_parser(tag_id, max_page=1):
    logger.info('Searching for doujinshi with tag id {0}'.format(tag_id))
    result = []

    # Retry the initial request up to five times before giving up.
    retried = 0
    while retried < 5:
        try:
            response = request('get', url=constant.TAG_API_URL,
                               params={'sort': 'popular', 'tag_id': tag_id}).json()
        except Exception as e:
            retried += 1
            if not retried < 5:
                logger.critical(str(e))
                exit(1)
            continue
        break

    page = max_page if max_page <= response['num_pages'] else int(response['num_pages'])

    for i in range(1, page + 1):
        logger.info('Getting page {} ...'.format(i))

        if page != 1:
            # Re-fetch for every page after the first, again with up to five retries.
            retried = 0
            while retried < 5:
                try:
                    response = request('get', url=constant.TAG_API_URL,
                                       params={'sort': 'popular', 'tag_id': tag_id}).json()
                except Exception as e:
                    retried += 1
                    if not retried < 5:
                        logger.critical(str(e))
                        exit(1)
                    continue
                break

        for row in response['result']:
            title = row['title']['english']
            title = title[:85] + '..' if len(title) > 85 else title
            result.append({'id': row['id'], 'title': title})

    if not result:
        logger.warn('No results for tag id {}'.format(tag_id))

    return result
def tag_parser(tag_id):
    logger.info('Getting doujinshi of tag id: {0}'.format(tag_id))
    result = []
    response = request('get', url=constant.TAG_API_URL,
                       params={'sort': 'popular', 'tag_id': tag_id}).json()

    for row in response['result']:
        title = row['title']['english']
        title = title[:85] + '..' if len(title) > 85 else title
        result.append({'id': row['id'], 'title': title})

    if not result:
        logger.warn('No results for tag id {}'.format(tag_id))

    return result
def search_parser(keyword, sorting, page, is_page_all=False):
    # keyword = '+'.join([i.strip().replace(' ', '-').lower() for i in keyword.split(',')])
    result = []
    if not page:
        page = [1]

    if is_page_all:
        url = request('get', url=constant.SEARCH_URL, params={'query': keyword}).url
        init_response = request('get', url.replace('%2B', '+')).json()
        page = range(1, init_response['num_pages'] + 1)

    total = '/{0}'.format(page[-1]) if is_page_all else ''
    for p in page:
        i = 0
        logger.info('Searching doujinshis using keywords "{0}" on page {1}{2}'.format(keyword, p, total))
        while i < 3:
            try:
                url = request('get', url=constant.SEARCH_URL,
                              params={'query': keyword, 'page': p, 'sort': sorting}).url
                response = request('get', url.replace('%2B', '+')).json()
            except Exception as e:
                logger.critical(str(e))
                break

            if 'result' not in response:
                logger.warn('No result in response in page {}'.format(p))
                break

            for row in response['result']:
                title = row['title']['english']
                title = title[:85] + '..' if len(title) > 85 else title
                result.append({'id': row['id'], 'title': title})

            # Stop once the page has been processed successfully.
            break

    if not result:
        logger.warn('No results for keywords {}'.format(keyword))

    return result
def search_parser(keyword, sorting='date', page=1):
    logger.debug('Searching doujinshis of keyword {0}'.format(keyword))
    response = request('get', url=constant.SEARCH_URL,
                       params={'q': keyword, 'page': page, 'sort': sorting}).content

    result = _get_title_and_id(response)
    if not result:
        logger.warn('No results for keyword {}'.format(keyword))

    return result
def tag_parser(tag_name, sorting='date', max_page=1, index=0):
    result = []
    tag_name = tag_name.lower()
    if ',' in tag_name:
        tag_name = [i.strip().replace(' ', '-') for i in tag_name.split(',')]
    else:
        tag_name = tag_name.strip().replace(' ', '-')

    if sorting == 'date':
        sorting = ''

    for p in range(1, max_page + 1):
        if sys.version_info >= (3, 0, 0):
            unicode_ = str
        else:
            unicode_ = unicode

        if isinstance(tag_name, (str, unicode_)):
            logger.debug('Fetching page {0} for doujinshi with tag \'{1}\''.format(p, tag_name))
            response = request('get', url='%s/%s/%s?page=%d' % (constant.TAG_URL[index], tag_name, sorting, p)).content
            result += _get_title_and_id(response)
        else:
            for i in tag_name:
                logger.debug('Fetching page {0} for doujinshi with tag \'{1}\''.format(p, i))
                response = request('get', url='%s/%s/%s?page=%d' % (constant.TAG_URL[index], i, sorting, p)).content
                result += _get_title_and_id(response)

        if not result:
            logger.error('Cannot find doujinshi id of tag \'{0}\''.format(tag_name))
            return

    if not result:
        logger.warn('No results for tag \'{}\''.format(tag_name))

    return result
def search_parser(keyword, page):
    logger.debug('Searching doujinshis of keyword {0}'.format(keyword))
    try:
        response = request('get', url=constant.SEARCH_URL,
                           params={'q': keyword, 'page': page}).content
    except requests.ConnectionError as e:
        logger.critical(e)
        logger.warn('If you are in China, please configure the proxy to fu*k GFW.')
        raise SystemExit

    result = _get_title_and_id(response)
    if not result:
        logger.warn('No results for keyword {}'.format(keyword))

    return result
def __api_suspended_tag_parser(tag_id, max_page=1):
    logger.info('Searching for doujinshi with tag id {0}'.format(tag_id))
    result = []
    response = request('get', url=constant.TAG_API_URL,
                       params={'sort': 'popular', 'tag_id': tag_id}).json()
    page = max_page if max_page <= response['num_pages'] else int(response['num_pages'])

    for i in range(1, page + 1):
        logger.info('Getting page {} ...'.format(i))

        if page != 1:
            response = request('get', url=constant.TAG_API_URL,
                               params={'sort': 'popular', 'tag_id': tag_id}).json()

        for row in response['result']:
            title = row['title']['english']
            title = title[:85] + '..' if len(title) > 85 else title
            result.append({'id': row['id'], 'title': title})

    if not result:
        logger.warn('No results for tag id {}'.format(tag_id))

    return result
def search_parser(keyword, page):
    logger.debug('Searching doujinshis of keyword {0}'.format(keyword))
    result = []
    try:
        response = request('get', url=constant.SEARCH_URL,
                           params={'query': keyword, 'page': page}).json()
        if 'result' not in response:
            raise Exception('No result in response')
    except requests.ConnectionError as e:
        logger.critical(e)
        logger.warn('If you are in China, please configure the proxy to fu*k GFW.')
        exit(1)

    for row in response['result']:
        title = row['title']['english']
        title = title[:85] + '..' if len(title) > 85 else title
        result.append({'id': row['id'], 'title': title})

    if not result:
        logger.warn('No results for keyword {}'.format(keyword))

    return result
def tag_parser(tag_name, max_page=1):
    result = []
    tag_name = tag_name.lower()
    tag_name = tag_name.replace(' ', '-')

    for p in range(1, max_page + 1):
        logger.debug('Fetching page {0} for doujinshi with tag \'{1}\''.format(p, tag_name))

        response = request('get', url='%s/%s?page=%d' % (constant.TAG_URL, tag_name, p)).content
        # Accumulate results across pages instead of overwriting them on each iteration.
        result += _get_title_and_id(response)
        if not result:
            logger.error('Cannot find doujinshi id of tag \'{0}\''.format(tag_name))
            return

    if not result:
        logger.warn('No results for tag \'{}\''.format(tag_name))

    return result
def search_parser(keyword, page):
    logger.debug('Searching doujinshis of keyword {0}'.format(keyword))
    result = []
    try:
        response = request('get', url=constant.SEARCH_URL,
                           params={'q': keyword, 'page': page}).content
    except requests.ConnectionError as e:
        logger.critical(e)
        logger.warn('If you are in China, please configure the proxy to fu*k GFW.')
        exit(1)

    html = BeautifulSoup(response, 'html.parser')
    doujinshi_search_result = html.find_all('div', attrs={'class': 'gallery'})
    for doujinshi in doujinshi_search_result:
        doujinshi_container = doujinshi.find('div', attrs={'class': 'caption'})
        title = doujinshi_container.text.strip()
        title = (title[:85] + '..') if len(title) > 85 else title
        id_ = re.search(r'/g/(\d+)/', doujinshi.a['href']).group(1)
        result.append({'id': id_, 'title': title})

    if not result:
        logger.warn('No results for keyword {}'.format(keyword))

    return result
def search_parser(keyword, sorting, page):
    logger.debug('Searching doujinshis using keywords {0}'.format(keyword))
    keyword = '+'.join([i.strip().replace(' ', '-').lower() for i in keyword.split(',')])
    result = []
    i = 0
    while i < 5:
        try:
            url = request('get', url=constant.SEARCH_URL,
                          params={'query': keyword, 'page': page, 'sort': sorting}).url
            response = request('get', url.replace('%2B', '+')).json()
        except Exception as e:
            i += 1
            if not i < 5:
                logger.critical(str(e))
                logger.warn('If you are in China, please configure the proxy to fu*k GFW.')
                exit(1)
            continue
        break

    if 'result' not in response:
        raise Exception('No result in response')

    for row in response['result']:
        title = row['title']['english']
        title = title[:85] + '..' if len(title) > 85 else title
        result.append({'id': row['id'], 'title': title})

    if not result:
        logger.warn('No results for keywords {}'.format(keyword))

    return result
def download(self, queue, folder=''):
    if not isinstance(folder, text):
        folder = str(folder)

    if self.path:
        folder = os.path.join(self.path, folder)

    if not os.path.exists(folder):
        logger.warn('Path \'{0}\' does not exist, creating it.'.format(folder))
        try:
            os.makedirs(folder)
        except EnvironmentError as e:
            logger.critical('{0}'.format(str(e)))
            exit(1)
    else:
        logger.warn('Path \'{0}\' already exists.'.format(folder))

    queue = [([url], {'folder': folder}) for url in queue]

    self.thread_pool = threadpool.ThreadPool(self.thread_count)
    requests_ = threadpool.makeRequests(self._download, queue, self._download_callback)
    [self.thread_pool.putRequest(req) for req in requests_]

    self.thread_pool.wait()
def doujinshi_parser(id_):
    if not isinstance(id_, (int,)) and (isinstance(id_, (str,)) and not id_.isdigit()):
        raise Exception('Doujinshi id({0}) is not valid'.format(id_))

    id_ = int(id_)
    logger.log(15, 'Fetching doujinshi information of id {0}'.format(id_))
    doujinshi = dict()
    doujinshi['id'] = id_
    url = '{0}/{1}/'.format(constant.DETAIL_URL, id_)

    try:
        response = request('get', url)
        if response.status_code in (200,):
            response = response.content
        else:
            logger.debug('Slow down and retry ({}) ...'.format(id_))
            time.sleep(1)
            return doujinshi_parser(str(id_))
    except Exception as e:
        logger.warn('Error: {}, ignored'.format(str(e)))
        return None

    html = BeautifulSoup(response, 'html.parser')
    doujinshi_info = html.find('div', attrs={'id': 'info'})

    title = doujinshi_info.find('h1').text
    subtitle = doujinshi_info.find('h2')

    doujinshi['name'] = title
    doujinshi['subtitle'] = subtitle.text if subtitle else ''

    doujinshi_cover = html.find('div', attrs={'id': 'cover'})
    img_id = re.search(r'/galleries/([\d]+)/cover\.(jpg|png)$',
                       doujinshi_cover.a.img.attrs['data-src'])

    ext = []
    for i in html.find_all('div', attrs={'class': 'thumb-container'}):
        _, ext_name = os.path.basename(i.img.attrs['data-src']).rsplit('.', 1)
        ext.append(ext_name)

    if not img_id:
        logger.critical('Failed to get the image id of the doujinshi')
        exit(1)

    doujinshi['img_id'] = img_id.group(1)
    doujinshi['ext'] = ext

    pages = 0
    for _ in doujinshi_info.find_all('div', class_=''):
        pages = re.search(r'([\d]+) pages', _.text)
        if pages:
            pages = pages.group(1)
            break
    doujinshi['pages'] = int(pages)

    # gather the remaining information of the doujinshi
    information_fields = doujinshi_info.find_all('div', attrs={'class': 'field-name'})
    needed_fields = ['Characters', 'Artists', 'Languages', 'Tags']
    for field in information_fields:
        field_name = field.contents[0].strip().strip(':')
        if field_name in needed_fields:
            data = [sub_field.contents[0].strip() for sub_field in
                    field.find_all('a', attrs={'class': 'tag'})]
            doujinshi[field_name.lower()] = ', '.join(data)

    return doujinshi
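A minimal usage sketch of how these parsers could be wired together; this is an illustration only, assuming the search_parser(keyword, sorting='date', page=1) and doujinshi_parser(id_) variants above, with request, constant and logger provided at module level by the surrounding package.

# Usage sketch (assumption: the keyword 'full color' and the printed summary
# format are arbitrary examples, not part of the project itself).
if __name__ == '__main__':
    for item in search_parser('full color', sorting='date', page=1):
        info = doujinshi_parser(item['id'])
        if info:
            # Print a short summary of each parsed doujinshi.
            print('{0}: {1} ({2} pages)'.format(info['id'], info['name'], info['pages']))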