def getCover_small(number):  # fetch the small cover image from avsox
    htmlcode = get_html('https://avsox.host/cn/search/' + number)
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']")
    if result == '' or result == 'null' or result == 'None':
        # retry with '-' swapped for '_'
        htmlcode = get_html('https://avsox.host/cn/search/' + number.replace('-', '_'))
        html = etree.fromstring(htmlcode, etree.HTMLParser())
        result = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']")
        if result == '' or result == 'null' or result == 'None':
            # last retry with the separator dropped entirely
            htmlcode = get_html('https://avsox.host/cn/search/' + number.replace('_', ''))
            html = etree.fromstring(htmlcode, etree.HTMLParser())
    counts = len(html.xpath("//div[@id='waterfall']/div/a/div"))
    if counts == 0:
        return ''
    for count in range(1, counts + 1):  # walk the search results to find the wanted ID
        number_get = html.xpath(
            "//div[@id='waterfall']/div[" + str(count) +
            "]/a/div[@class='photo-info']/span/date[1]/text()")
        if len(number_get) > 0 and number_get[0] == number:
            cover_small = html.xpath(
                "//div[@id='waterfall']/div[" + str(count) +
                "]/a/div[@class='photo-frame']/img/@src")[0]
            return cover_small
    return ''

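# Every scraper in this section leans on shared get_html()/post_html() helpers
# (imported from the project's helper module, alongside re, json, lxml.etree and
# BeautifulSoup). A minimal sketch of the behaviour the callers assume: page text
# on success, the sentinel string 'ProxyError' on a network failure. The name
# get_html_sketch and the timeout value are illustrative assumptions, not the
# project's actual implementation:
import requests

def get_html_sketch(url, cookies=None):
    try:
        resp = requests.get(url, cookies=cookies, timeout=10)
        resp.encoding = resp.apparent_encoding
        return resp.text
    except requests.exceptions.RequestException:
        return 'ProxyError'
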
def find_number(number):
    # ==================================================================== censored search
    if not (re.match(r'^\d{4,}', number) or re.match(r'n\d{4}', number) or 'HEYZO' in number.upper()):
        htmlcode = get_html('https://www.javbus.com/search/' + number + '&type=1')
        html = etree.fromstring(htmlcode, etree.HTMLParser())
        counts = len(html.xpath("//div[@id='waterfall']/div[@id='waterfall']/div"))
        if counts != 0:
            for count in range(1, counts + 1):  # walk the search results to find the wanted ID
                number_get = html.xpath(
                    "//div[@id='waterfall']/div[@id='waterfall']/div[" + str(count) +
                    "]/a[@class='movie-box']/div[@class='photo-info']/span/date[1]/text()")[0]
                number_get = number_get.upper()
                number = number.upper()
                if number_get == number or number_get == number.replace('-', '') or number_get == number.replace('_', ''):
                    result_url = html.xpath(
                        "//div[@id='waterfall']/div[@id='waterfall']/div[" + str(count) +
                        "]/a[@class='movie-box']/@href")[0]
                    return result_url
    # ==================================================================== uncensored search
    htmlcode = get_html('https://www.javbus.com/uncensored/search/' + number + '&type=1')
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    counts = len(html.xpath("//div[@id='waterfall']/div[@id='waterfall']/div"))
    if counts == 0:
        return 'not found'
    for count in range(1, counts + 1):  # walk the search results to find the wanted ID
        number_get = html.xpath(
            "//div[@id='waterfall']/div[@id='waterfall']/div[" + str(count) +
            "]/a[@class='movie-box']/div[@class='photo-info']/span/date[1]/text()")[0]
        number_get = number_get.upper()
        number = number.upper()
        if number_get == number or number_get == number.replace('-', '') or number_get == number.replace('_', ''):
            result_url = html.xpath(
                "//div[@id='waterfall']/div[@id='waterfall']/div[" + str(count) +
                "]/a[@class='movie-box']/@href")[0]
            return result_url
        elif number_get == number.replace('-', '_') or number_get == number.replace('_', '-'):
            result_url = html.xpath(
                "//div[@id='waterfall']/div[@id='waterfall']/div[" + str(count) +
                "]/a[@class='movie-box']/@href")[0]
            return result_url
    return 'not found'

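# Usage sketch (hypothetical ID), in the style of the commented examples later in
# this section: find_number() returns either the javbus detail-page URL or the
# sentinel 'not found', so callers must check the sentinel before fetching.
# result_url = find_number('ABC-123')
# if result_url != 'not found':
#     htmlcode = get_html(result_url)
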
def getOutlineScore(number):  # fetch outline and score
    outline = ''
    score = ''
    try:
        # jav321 first: outline, plus either an image-based or a text score
        response = post_html("https://www.jav321.com/search", query={"sn": number})
        detail_page = etree.fromstring(response, etree.HTMLParser())
        outline = str(detail_page.xpath('/html/body/div[2]/div[1]/div[1]/div[2]/div[3]/div/text()')).strip(" ['']")
        if re.search(r'<b>评分</b>: <img data-original="/img/(\d+).gif" />', response):
            score = re.findall(r'<b>评分</b>: <img data-original="/img/(\d+).gif" />', response)[0]
            score = str(float(score) / 10.0)
        else:
            score = str(re.findall(r'<b>评分</b>: ([^<]+)<br>', response)).strip(" [',']").replace('\'', '')
        if outline == '':
            # fall back to a dmm search for the outline
            dmm_htmlcode = get_html(
                "https://www.dmm.co.jp/search/=/searchstr=" + number.replace('-', '') + "/sort=ranking/")
            if 'に一致する商品は見つかりませんでした' not in dmm_htmlcode:  # "no matching products found"
                dmm_page = etree.fromstring(dmm_htmlcode, etree.HTMLParser())
                url_detail = str(dmm_page.xpath('//*[@id="list"]/li[1]/div/p[2]/a/@href')).split(',', 1)[0].strip(" ['']")
                if url_detail != '':
                    dmm_detail = get_html(url_detail)
                    html = etree.fromstring(dmm_detail, etree.HTMLParser())
                    outline = str(html.xpath('//*[@class="mg-t0 mg-b20"]/text()')).strip(" ['']").replace('\\n', '').replace('\n', '')
    except Exception as error_info:
        print('Error in javbus.getOutlineScore : ' + str(error_info))
    return outline, score

def main(number):
    htmlcode = get_html('https://www.dmm.co.jp/digital/videoa/-/detail/=/cid=' + number)
    url = 'https://www.dmm.co.jp/digital/videoa/-/detail/=/cid=' + number
    if '404 Not Found' in htmlcode:
        # fall back from the digital section to the DVD section
        htmlcode = get_html('https://www.dmm.co.jp/mono/dvd/-/detail/=/cid=' + number)
        url = 'https://www.dmm.co.jp/mono/dvd/-/detail/=/cid=' + number
        if '404 Not Found' in htmlcode:
            dic = {
                'title': '',
                'website': '',
            }
            js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))
            return js
    try:
        actor = getActor(htmlcode)
        dic = {
            'title': getTitle(htmlcode).strip(getActor(htmlcode)),
            'studio': getStudio(htmlcode),
            'publisher': getPublisher(htmlcode),
            'outline': getOutline(htmlcode),
            'runtime': getRuntime(htmlcode),
            'director': getDirector(htmlcode),
            'actor': actor,
            'release': getRelease(htmlcode),
            'number': getNum(htmlcode),
            'cover': getCover(htmlcode, number),
            'imagecut': 1,
            'tag': getTag(htmlcode),
            'series': getSeries(htmlcode),
            'year': getYear(getRelease(htmlcode)),
            'actor_photo': getActorPhoto(actor),
            'website': url,
            'source': 'dmm.py',
        }
    except Exception:
        if htmlcode == 'ProxyError':
            dic = {
                'title': '',
                'website': 'timeout',
            }
        else:
            dic = {
                'title': '',
                'website': '',
            }
    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))
    return js

def getOutline(number):  # fetch the outline from dmm
    try:
        dww_htmlcode = get_html('https://www.dmm.co.jp/digital/videoa/-/detail/=/cid=' + number.replace("-", '00'))
        if '404 Not Found' in dww_htmlcode:
            dww_htmlcode = get_html('https://www.dmm.co.jp/mono/dvd/-/detail/=/cid=' + number.replace("-", '00'))
    except Exception:
        dww_htmlcode = ''
    html = etree.fromstring(dww_htmlcode, etree.HTMLParser())
    result = str(html.xpath("//div[@class='mg-b20 lh4']/text()")).strip(" ['']")
    return result.replace('\n', '').replace('\\n', '').replace('\'', '').replace(',', '').replace(' ', '')

def getOutlineScore(number):  # fetch outline and score
    dmm_htmlcode = get_html("https://www.dmm.co.jp/search/=/searchstr=" + number + "/sort=ranking/")
    dmm_page = etree.fromstring(dmm_htmlcode, etree.HTMLParser())
    dmm_detail = get_html(
        str(dmm_page.xpath('//*[@id="list"]/li[1]/div/p[2]/a/@href')).split(',', 1)[0].strip(" ['']"))
    html = etree.fromstring(dmm_detail, etree.HTMLParser())
    outline = str(html.xpath('//*[@class="mg-t0 mg-b20"]/text()')).strip(" ['']")
    if outline.strip() == "":
        # fall back to jav321 for the outline
        response = post_html("https://www.jav321.com/search", query={"sn": number})
        detail_page = etree.fromstring(response, etree.HTMLParser())
        outline = str(detail_page.xpath('/html/body/div[2]/div[1]/div[1]/div[2]/div[3]/div/text()')).strip(" ['']")
    score = str(html.xpath('//*[@class="d-review__average"]/strong/text()')).strip(" ['']点")
    return outline, score

def main(number):
    try:
        htmlcode = get_html('https://www.dmm.co.jp/digital/videoa/-/detail/=/cid=' + number)
        url = 'https://www.dmm.co.jp/digital/videoa/-/detail/=/cid=' + number
        if '404 Not Found' in htmlcode:
            htmlcode = get_html('https://www.dmm.co.jp/mono/dvd/-/detail/=/cid=' + number)
            url = 'https://www.dmm.co.jp/mono/dvd/-/detail/=/cid=' + number
            if '404 Not Found' in htmlcode:
                raise Exception('Movie Data not found in dmm!')
        if str(htmlcode) == 'ProxyError':
            raise TimeoutError
        actor = getActor(htmlcode)
        dic = {
            'title': getTitle(htmlcode).strip(getActor(htmlcode)),
            'studio': getStudio(htmlcode),
            'publisher': getPublisher(htmlcode),
            'outline': getOutline(htmlcode),
            'score': getScore(htmlcode),
            'runtime': getRuntime(htmlcode),
            'director': getDirector(htmlcode),
            'actor': actor,
            'release': getRelease(htmlcode),
            'number': getNum(htmlcode),
            'tag': getTag(htmlcode),
            'series': getSeries(htmlcode).replace('-', ''),
            'year': getYear(getRelease(htmlcode)),
            'actor_photo': getActorPhoto(actor),
            'cover': getCover(htmlcode, number),
            'extrafanart': getExtraFanart(htmlcode),
            'imagecut': 1,
            'website': url,
            'source': 'dmm.py',
        }
    except TimeoutError:
        dic = {
            'title': '',
            'website': 'timeout',
        }
    except Exception as error_info:
        print('Error in dmm.main : ' + str(error_info))
        dic = {
            'title': '',
            'website': '',
        }
    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))
    return js

def main(number):
    a = get_html('https://avsox.host/cn/search/' + number)
    html = etree.fromstring(a, etree.HTMLParser())
    result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']")
    if result1 == '' or result1 == 'null' or result1 == 'None':
        # retry with '-' swapped for '_'
        a = get_html('https://avsox.host/cn/search/' + number.replace('-', '_'))
        html = etree.fromstring(a, etree.HTMLParser())
        result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']")
        if result1 == '' or result1 == 'null' or result1 == 'None':
            # last retry with the separator dropped entirely
            a = get_html('https://avsox.host/cn/search/' + number.replace('_', ''))
            html = etree.fromstring(a, etree.HTMLParser())
            result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']")
    web = get_html(result1)
    soup = BeautifulSoup(web, 'lxml')
    info = str(soup.find(attrs={'class': 'row movie'}))
    try:
        dic = {
            'actor': getActor(web),
            'title': getTitle(web).strip(getNum(web)).strip().replace(' ', '-'),
            'studio': getStudio(info),
            'publisher': '',
            'outline': '',
            'runtime': getRuntime(info),
            'director': '',
            'release': getRelease(info),
            'number': getNum(info),
            'cover': getCover(web),
            'cover_small': getCover_small(a, number),
            'imagecut': 3,
            'tag': getTag(web),
            'series': getSeries(info),
            'year': getYear(getRelease(info)),
            'actor_photo': getActorPhoto(web),
            'website': result1,
            'source': 'avsox.py',
        }
    except Exception:
        if a == 'ProxyError':
            dic = {
                'title': '',
                'website': 'timeout',
            }
        else:
            dic = {
                'title': '',
                'website': '',
            }
    js = json.dumps(
        dic,
        ensure_ascii=False,
        sort_keys=True,
        indent=4,
        separators=(',', ':'),
    )
    return js

def main(number, appoint_url):
    try:
        count, response, url = getUrl(number)
        if str(response) == 'ProxyError':
            raise TimeoutError
        if appoint_url != '':
            url = appoint_url
        elif url == '':
            raise Exception('Movie Data not found in avsox!')
        web = get_html(url)
        soup = BeautifulSoup(web, 'lxml')
        info = str(soup.find(attrs={'class': 'row movie'}))
        number = getNum(web)
        dic = {
            'actor': getActor(web),
            'title': getTitle(web).strip(number).strip().replace(' ', '-'),
            'studio': getStudio(info),
            'runtime': getRuntime(info),
            'release': getRelease(info),
            'number': getNum(info),
            'tag': getTag(web),
            'series': getSeries(info),
            'year': getYear(getRelease(info)),
            'actor_photo': getActorPhoto(web),
            'cover': getCover(web),
            'cover_small': getCover_small(response, count),
            'extrafanart': '',
            'imagecut': 3,
            'director': '',
            'publisher': '',
            'outline': '',
            'score': '',
            'website': url,
            'source': 'avsox.py',
        }
    except TimeoutError:
        dic = {
            'title': '',
            'website': 'timeout',
        }
    except Exception as error_info:
        print('Error in avsox.main : ' + str(error_info))
        dic = {
            'title': '',
            'website': '',
        }
    js = json.dumps(
        dic,
        ensure_ascii=False,
        sort_keys=True,
        indent=4,
        separators=(',', ':'),
    )
    return js


# print(main('051119-917'))
# print(main('032620_001'))
# print(main('032620_001', 'https://avsox.host/cn/movie/cb8d28437cff4e90'))

def main(number):
    htmlcode2 = get_html('http://adult.contents.fc2.com/article_search.php?id=' + number +
                         '&utm_source=aff_php&utm_medium=source_code&utm_campaign=from_aff_php')
    htmlcode = get_html('https://fc2club.com//html/FC2-' + number + '.html')
    actor = getActor(htmlcode)
    if len(actor) == 0:
        actor = 'FC2系列'
    try:
        dic = {
            'title': getTitle(htmlcode).replace(' ', '-'),
            'studio': getStudio(htmlcode),
            'publisher': '',
            'year': '',
            'outline': getOutline(htmlcode2).replace('\n', ''),
            'runtime': getYear(getRelease(htmlcode)),  # original behaviour: the year string is reused as runtime
            'director': '',
            'actor': actor.replace('/', ','),
            'release': getRelease(number),
            'number': 'FC2-' + number,
            'cover': getCover(htmlcode, number, htmlcode2),
            'imagecut': 0,
            'series': '',
            'tag': getTag(htmlcode),
            'actor_photo': getActorPhoto(actor),
            'website': 'https://fc2club.com//html/FC2-' + number + '.html',
            'source': 'fc2fans_club.py',
        }
    except Exception:
        if htmlcode2 == 'ProxyError':
            dic = {
                'title': '',
                'website': 'timeout',
            }
        else:
            dic = {
                'title': '',
                'website': '',
            }
    js = json.dumps(
        dic,
        ensure_ascii=False,
        sort_keys=True,
        indent=4,
        separators=(',', ':'),
    )
    return js

def getActorPhoto(htmlcode):
    # map each actor name on the detail page to the avatar on her profile page
    soup = BeautifulSoup(htmlcode, 'lxml')
    stars = soup.find_all(attrs={'class': 'star-name'})
    photos = {}
    for star in stars:
        link = star.a['href']
        name = star.get_text()
        html = etree.fromstring(get_html(link), etree.HTMLParser())
        photo = str(html.xpath('//*[@id="waterfall"]/div[1]/div/div[1]/img/@src')).strip(" ['']")
        photos.update({name: photo})
    return photos

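# Usage sketch: getActorPhoto() returns a name -> avatar-URL dict (htmlcode is
# assumed to be a javbus detail page already fetched with get_html; the value
# shown is illustrative only):
# actor_photo = getActorPhoto(htmlcode)
# # -> {'Actor Name': 'https://.../avatar.jpg', ...}
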
def find_number(number):
    htmlcode = get_html('https://xcity.jp/result_published/?q=' + number.replace('-', ''))
    if '該当する作品はみつかりませんでした' in htmlcode:  # "no matching works were found"
        return 'not found', ''
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    counts = len(html.xpath("//div[@id='searchResult']/table[@class='resultList']/tr"))
    if counts >= 2:
        # start at row 2: the first <tr> is the header. Walk the result rows to find the wanted ID
        for count in range(2, counts + 1):
            result_url = 'https://xcity.jp' + html.xpath(
                "//div[@id='searchResult']/table[@class='resultList']/tr[" + str(count) + "]/td[1]/a/@href")[0]
            detail_page = get_html(result_url)
            detail_page_html = etree.fromstring(detail_page, etree.HTMLParser())
            number_get = str(detail_page_html.xpath("//span[@id='hinban']/text()")[0])
            if number_get.upper() == number.replace('-', '').upper():
                return result_url, detail_page
    return 'not found', ''

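# Usage sketch (hypothetical ID): the xcity variant returns a
# (result_url, detail_page) tuple so the caller can reuse the already-downloaded
# detail page; on a miss the first slot holds the sentinel 'not found'.
# result_url, detail_page = find_number('ABC-123')
# if result_url != 'not found':
#     html = etree.fromstring(detail_page, etree.HTMLParser())
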
def main(number, appoint_url):
    try:
        url = 'https://fc2club.com//html/FC2-' + number + '.html'
        if appoint_url:
            url = appoint_url
        htmlcode = get_html(url)
        if str(htmlcode) == 'ProxyError':
            raise TimeoutError
        actor = getActor(htmlcode)
        if len(actor) == 0:
            actor = 'FC2系列'
        dic = {
            'title': getTitle(htmlcode).strip(' '),
            'studio': getStudio(htmlcode),
            'score': getScore(htmlcode),
            'runtime': getYear(getRelease(htmlcode)),  # original behaviour: the year string is reused as runtime
            'actor': actor.replace('/', ','),
            'release': getRelease(number),
            'number': 'FC2-' + number,
            'tag': getTag(htmlcode),
            'actor_photo': getActorPhoto(actor),
            'cover': getCover(htmlcode),
            'extrafanart': getExtraFanart(htmlcode),
            'imagecut': 0,
            'director': '',
            'series': '',
            'publisher': '',
            'year': '',
            'outline': '',
            'website': 'https://fc2club.com//html/FC2-' + number + '.html',
            'source': 'fc2fans_club.py',
        }
    except TimeoutError:
        dic = {
            'title': '',
            'website': 'timeout',
        }
    except Exception as error_info:
        print('Error in fc2fans_club.main : ' + str(error_info))
        dic = {
            'title': '',
            'website': '',
        }
    js = json.dumps(
        dic,
        ensure_ascii=False,
        sort_keys=True,
        indent=4,
        separators=(',', ':'),
    )
    return js

def main_uncensored(number):
    try:
        result_url = find_number(number)
        if result_url == 'not found':
            raise Exception('Movie Data not found in javbus.main_uncensored!')
        htmlcode = get_html(result_url)
        if str(htmlcode) == 'ProxyError':
            raise TimeoutError
        number = getNum(htmlcode)
        outline = ''
        score = ''
        if 'HEYZO' in number.upper():
            outline, score = getOutlineScore(number)
        dic = {
            'title': getTitle(htmlcode).replace(number, '').strip().replace(' ', '-'),
            'studio': getStudio(htmlcode),
            'publisher': '',
            'year': getYear(getRelease(htmlcode)),
            'outline': outline,
            'score': score,
            'runtime': getRuntime(htmlcode).replace('分鐘', '').strip(),
            'director': getDirector(htmlcode),
            'actor': getActor(htmlcode),
            'release': getRelease(htmlcode),
            'number': getNum(htmlcode),
            'cover': getCover(htmlcode),
            'extrafanart': getExtraFanart(htmlcode),
            'tag': getTag(htmlcode),
            'series': getSeries(htmlcode),
            'imagecut': 3,
            'cover_small': getCover_small(number),
            'actor_photo': getActorPhoto(htmlcode),
            'website': result_url,
            'source': 'javbus.py',
        }
        if dic['cover_small'] == '':
            dic['imagecut'] = 0
    except TimeoutError:
        dic = {
            'title': '',
            'website': 'timeout',
        }
    except Exception as error_info:
        print('Error in javbus.main_uncensored : ' + str(error_info))
        dic = {
            'title': '',
            'website': '',
        }
    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))
    return js

def getCover_small(number):  # fetch the small cover image from avsox
    try:
        htmlcode = get_html('https://avsox.host/cn/search/' + number)
        html = etree.fromstring(htmlcode, etree.HTMLParser())
        counts = len(html.xpath("//div[@id='waterfall']/div/a/div"))
        if counts == 0:
            return ''
        for count in range(1, counts + 1):  # walk the search results to find the wanted ID
            number_get = html.xpath(
                "//div[@id='waterfall']/div[" + str(count) +
                "]/a/div[@class='photo-info']/span/date[1]/text()")
            if len(number_get) > 0 and number_get[0].upper() == number.upper():
                cover_small = html.xpath(
                    "//div[@id='waterfall']/div[" + str(count) +
                    "]/a/div[@class='photo-frame']/img/@src")[0]
                return cover_small
    except Exception as error_info:
        print('Error in javbus.getCover_small : ' + str(error_info))
    return ''

def getUrl(number):
    response = get_html('https://avsox.website/cn/search/' + number)
    html = etree.fromstring(response, etree.HTMLParser())
    url_list = html.xpath('//*[@id="waterfall"]/div/a/@href')
    if len(url_list) > 0:
        for i in range(1, len(url_list) + 1):
            number_get = str(html.xpath(
                '//*[@id="waterfall"]/div[' + str(i) + ']/a/div[@class="photo-info"]/span/date[1]/text()'
            )).strip(" ['']")
            if number.upper() == number_get.upper():
                page_url = 'https:' + url_list[i - 1]
                return i, response, page_url
    return 0, response, ''

def main(number):
    try:
        number = number.upper()
        htmlcode = str(get_html('https://www.mgstage.com/product/product_detail/' + str(number) + '/',
                                cookies={'adc': '1'}))
        htmlcode = htmlcode.replace('ahref', 'a href')  # fix 'a' tags whose name and attribute are run together
        if str(htmlcode) == 'ProxyError':
            raise TimeoutError
        actor = getActor(htmlcode).replace(' ', '')
        dic = {
            'title': getTitle(htmlcode).replace("\\n", '').replace(' ', '').strip(','),
            'studio': getStudio(htmlcode).strip(','),
            'publisher': getPublisher(htmlcode).strip(','),
            'outline': getOutline(htmlcode).replace('\n', '').strip(','),
            'score': getScore(htmlcode).strip(','),
            'runtime': getRuntime(htmlcode).strip(','),
            'actor': actor.strip(','),
            'release': getRelease(htmlcode).strip(','),
            'number': getNum(htmlcode).strip(','),
            'cover': getCover(htmlcode).strip(','),
            'extrafanart': getExtraFanart(htmlcode).strip(','),
            'imagecut': 0,
            'tag': getTag(htmlcode).strip(','),
            'series': getSeries(htmlcode).strip(','),
            'year': getYear(getRelease(htmlcode)).strip(','),
            'actor_photo': getActorPhoto(actor.split(',')),
            'director': '',
            'website': 'https://www.mgstage.com/product/product_detail/' + str(number) + '/',
            'source': 'mgstage.py',
        }
    except TimeoutError:
        dic = {
            'title': '',
            'website': 'timeout',
        }
    except Exception as error_info:
        print('Error in mgstage.main : ' + str(error_info))
        dic = {
            'title': '',
            'website': '',
        }
    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))
    return js

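# Note on the request above: the cookies={'adc': '1'} argument presumably
# pre-answers mgstage's age check so get_html() receives the product page
# rather than the confirmation page. Commented usage with a hypothetical ID,
# matching the examples elsewhere in this section:
# print(main('SIRO-0000'))
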
def main(number):
    number = number.upper()
    htmlcode = str(get_html('https://www.mgstage.com/product/product_detail/' + str(number) + '/',
                            cookies={'adc': '1'}))
    soup = BeautifulSoup(htmlcode, 'lxml')
    # collapse the pretty-printed whitespace inside the detail_data block
    # (the exact space counts of the original chained .replace() calls were lost
    # in formatting; this normalization is equivalent in intent)
    a = re.sub(r'\n\s*', '', str(soup.find(attrs={'class': 'detail_data'})))
    try:
        actor = getActor(a).replace(' ', '')
        dic = {
            'title': getTitle(htmlcode).replace("\\n", '').replace(' ', ''),
            'studio': getStudio(a),
            'publisher': getPublisher(a),
            'outline': getOutline(htmlcode).replace('\n', ''),
            'runtime': getRuntime(a),
            'director': '',
            'actor': actor,
            'release': getRelease(a),
            'number': getNum(a),
            'cover': getCover(htmlcode),
            'imagecut': 0,
            'tag': getTag(a).strip(','),
            'series': getSeries(a).strip(','),
            'year': getYear(getRelease(a)),
            'actor_photo': getActorPhoto(actor.split(',')),
            'website': 'https://www.mgstage.com/product/product_detail/' + str(number) + '/',
            'source': 'mgstage.py',
        }
    except Exception:
        if htmlcode == 'ProxyError':
            dic = {
                'title': '',
                'website': 'timeout',
            }
        else:
            dic = {
                'title': '',
                'website': '',
            }
    js = json.dumps(
        dic,
        ensure_ascii=False,
        sort_keys=True,
        indent=4,
        separators=(',', ':'),
    )
    return js

def main_us(number):
    try:
        htmlcode = get_html('https://www.javbus.zone/search/' + number)
        if str(htmlcode) == 'ProxyError':
            raise TimeoutError
        html = etree.fromstring(htmlcode, etree.HTMLParser())
        counts = len(html.xpath("//div[@class='row']/div[@id='waterfall']/div"))
        if counts == 0:
            raise Exception('Movie Data not found in javbus.main_us!')
        result_url = ''
        cover_small = ''
        for count in range(1, counts + 1):  # walk the search results to find the wanted ID
            number_get = html.xpath(
                "//div[@id='waterfall']/div[" + str(count) +
                "]/a[@class='movie-box']/div[@class='photo-info']/span/date[1]/text()")[0]
            if number_get.upper() == number.upper() or number_get.replace('-', '').upper() == number.upper():
                result_url = html.xpath(
                    "//div[@id='waterfall']/div[" + str(count) + "]/a[@class='movie-box']/@href")[0]
                cover_small = html.xpath(
                    "//div[@id='waterfall']/div[" + str(count) +
                    "]/a[@class='movie-box']/div[@class='photo-frame']/img[@class='img']/@src")[0]
                break
        if result_url == '':
            raise Exception('Movie Data not found in javbus.main_us!')
        htmlcode = get_html(result_url)
        if str(htmlcode) == 'ProxyError':
            raise TimeoutError
        number = getNum(htmlcode)
        dic = {
            'title': getTitle(htmlcode).replace(number, '').strip(),
            'studio': getStudio(htmlcode),
            'year': getYear(getRelease(htmlcode)),
            'runtime': getRuntime(htmlcode).replace('分鐘', '').strip(),
            'director': getDirector(htmlcode),
            'actor': getActor(htmlcode),
            'release': getRelease(htmlcode),
            'number': getNum(htmlcode),
            'tag': getTag(htmlcode),
            'series': getSeries(htmlcode),
            'cover': getCover(htmlcode),
            'cover_small': cover_small,
            'imagecut': 3,
            'actor_photo': getActorPhoto(htmlcode),
            'publisher': '',
            'outline': '',
            'score': '',
            'website': result_url,
            'source': 'javbus.py',
        }
    except TimeoutError:
        dic = {
            'title': '',
            'website': 'timeout',
        }
    except Exception as error_info:
        print('Error in javbus.main_us : ' + str(error_info))
        dic = {
            'title': '',
            'website': '',
        }
    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))
    return js

def main(number, appoint_url=''):
    # fanza allows letters + digits + underscore; normalize the input here
    # @note: the only underscore usage seen so far is the h_ prefix, e.g. h_test123456789
    fanza_search_number = number
    # AV_Data_Capture.py getNumber() over-formats the input; restore the h_ prefix
    if fanza_search_number.startswith("h-"):
        fanza_search_number = fanza_search_number.replace("h-", "h_")
    fanza_search_number = re.sub(r"[^0-9a-zA-Z_]", "", fanza_search_number).lower()
    fanza_urls = [
        "https://www.dmm.co.jp/digital/videoa/-/detail/=/cid=",
        "https://www.dmm.co.jp/mono/dvd/-/detail/=/cid=",
        "https://www.dmm.co.jp/digital/anime/-/detail/=/cid=",
        "https://www.dmm.co.jp/mono/anime/-/detail/=/cid=",
        "https://www.dmm.co.jp/digital/videoc/-/detail/=/cid=",
        "https://www.dmm.co.jp/digital/nikkatsu/-/detail/=/cid=",
        "https://www.dmm.co.jp/rental/-/detail/=/cid=",
    ]
    chosen_url = ""
    htmlcode = ''
    if appoint_url:
        chosen_url = appoint_url
        htmlcode = get_html(
            "https://www.dmm.co.jp/age_check/=/declared=yes/?{}".format(
                urlencode({"rurl": appoint_url})
            ))
    else:
        # probe each fanza section until one returns a real page
        for url in fanza_urls:
            chosen_url = url + fanza_search_number
            final_url = "https://www.dmm.co.jp/age_check/=/declared=yes/?{}".format(
                urlencode({"rurl": chosen_url})
            )
            htmlcode = get_html(final_url)
            if "404 Not Found" not in htmlcode:
                break
    if "404 Not Found" in htmlcode:
        return json.dumps({"title": "", "website": ""})
    try:
        # for some old pages the input number does not match the page:
        # the url may be cid=test012 while the hinban on the page is test00012,
        # so read the hinban first and pass it to the following functions
        fanza_hinban = getNum(htmlcode)
        release = getRelease(htmlcode)
        dic = {
            "title": getTitle(htmlcode).strip(),
            "publisher": getPublisher(htmlcode),
            "score": getScore(htmlcode),
            "studio": getStudio(htmlcode),
            "outline": getOutline(htmlcode),
            "runtime": getRuntime(htmlcode),
            "director": getDirector(htmlcode) if "anime" not in chosen_url else "",
            "actor": getActor(htmlcode) if "anime" not in chosen_url else "",
            "release": release,
            "number": fanza_hinban,
            "cover": getCover(htmlcode, fanza_hinban),
            "imagecut": 1,
            "tag": getTag(htmlcode),
            "extrafanart": getExtrafanart(htmlcode),
            "label": getLabel(htmlcode),
            "year": getYear(release),
            "actor_photo": "",
            "website": chosen_url,
            "source": "fanza.py",
            "series": getSeries(htmlcode),
        }
    except TimeoutError:
        dic = {
            "title": "",
            "website": "timeout",
        }
    except Exception as error_info:
        print('Error in fanza.main : ' + str(error_info))
        dic = {
            "title": "",
            "website": "",
        }
    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))
    return js

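# The age_check redirect used above: fanza routes the real detail URL through
# /age_check/=/declared=yes/ in the `rurl` query parameter, which urlencode()
# percent-encodes. A runnable sketch with an illustrative cid:
from urllib.parse import urlencode

example_detail = 'https://www.dmm.co.jp/digital/videoa/-/detail/=/cid=abc00012'
example_final_url = 'https://www.dmm.co.jp/age_check/=/declared=yes/?{}'.format(
    urlencode({'rurl': example_detail}))
# example_final_url is now
# https://www.dmm.co.jp/age_check/=/declared=yes/?rurl=https%3A%2F%2Fwww.dmm.co.jp%2Fdigital%2Fvideoa%2F-%2Fdetail%2F%3D%2Fcid%3Dabc00012
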
def main_us(number):
    try:
        # ==================================================================== search for the ID
        htmlcode = get_html('https://javdb.com/search?q=' + number + '&f=all').replace(u'\xa0', u' ')
        if str(htmlcode) == 'ProxyError':
            raise TimeoutError
        html = etree.fromstring(htmlcode, etree.HTMLParser())
        counts = len(html.xpath(
            '//div[@id=\'videos\']/div[@class=\'grid columns\']/div[@class=\'grid-item column\']'))
        if counts == 0:
            raise Exception('Movie Data not found in javdb.main_us!')
        # ==================================================================== walk the results for the matching URL
        # US releases look like 'Series.YY.MM.DD': split off the series name and
        # rebuild a full 'YYYY-MM-DD' date for matching
        number_series = number.split('.')[0]
        number_date = '20' + number.replace(number_series, '').strip('.')
        number_date = number_date.replace('.', '-')
        count = 1
        movie_found = 0
        for count in range(1, counts + 1):
            series_get = html.xpath(
                '//div[@id=\'videos\']/div[@class=\'grid columns\']/div[@class=\'grid-item column\'][' +
                str(count) + ']/a[@class=\'box\']/div[@class=\'uid2\']/text()')[0]
            date_get = html.xpath(
                '//div[@id=\'videos\']/div[@class=\'grid columns\']/div[@class=\'grid-item column\'][' +
                str(count) + ']/a[@class=\'box\']/div[@class=\'meta\']/text()')[0]
            if re.search(r'\d{4}-\d{1,2}-\d{1,2}', date_get):
                date_get = re.findall(r'\d{4}-\d{1,2}-\d{1,2}', date_get)[0]
            series_get = series_get.replace(' ', '')
            if (series_get.upper() == number_series.upper() or
                    series_get.replace('-', '').upper() == number_series.upper()) and number_date == date_get:
                movie_found = 1
                break
        if movie_found == 0:
            raise Exception('Movie Data not found in javdb.main_us!')
        result_url = 'https://javdb.com' + html.xpath('//*[@id="videos"]/div/div/a/@href')[count - 1]
        # ==================================================================== fetch and validate the detail page
        html_info = get_html(result_url).replace(u'\xa0', u' ')
        if str(html_info) == 'ProxyError':
            raise TimeoutError
        # ==================================================================== collect the metadata
        actor = getActor(html_info)
        number = getNumber(html_info)
        dic = {
            'actor': str(actor).strip(" [',']").replace('\'', ''),
            'title': getTitle(html_info).replace('中文字幕', '').replace("\\n", '').replace(
                '_', '-').replace(number, '').strip(),
            'studio': getStudio(html_info),
            'publisher': getPublisher(html_info),
            'outline': '',
            'score': getScore(html_info),
            'runtime': getRuntime(html_info).replace(' 分鍾', ''),
            'director': getDirector(html_info),
            'release': getRelease(html_info),
            'number': number,
            'cover': getCover_us(html_info),
            'cover_small': getCover_small(htmlcode, count - 1),
            'extrafanart': getExtraFanart(html_info),
            'imagecut': 3,
            'tag': getTag(html_info),
            'series': getSeries(html_info),
            'year': getYear(getRelease(html_info)),
            'actor_photo': getActorPhoto(actor),
            'website': result_url,
            'source': 'javdb.py',
        }
    except TimeoutError:
        dic = {
            'title': '',
            'website': 'timeout',
        }
    except Exception as error_info:
        print('Error in javdb.main_us : ' + str(error_info))
        dic = {
            'title': '',
            'website': '',
        }
    js = json.dumps(
        dic,
        ensure_ascii=False,
        sort_keys=True,
        indent=4,
        separators=(',', ':'),
    )
    return js

def main(number, isuncensored=False):
    try:
        # ==================================================================== search for the ID
        htmlcode = get_html('https://javdb.com/search?q=' + number + '&f=all').replace(u'\xa0', u' ')
        if str(htmlcode) == 'ProxyError':
            raise TimeoutError
        html = etree.fromstring(htmlcode, etree.HTMLParser())
        counts = len(html.xpath(
            '//div[@id=\'videos\']/div[@class=\'grid columns\']/div[@class=\'grid-item column\']'))
        if counts == 0:
            raise Exception('Movie Data not found in javdb.main!')
        # ==================================================================== walk the results for the matching URL
        count = 1
        number_get = ''
        movie_found = 0
        for count in range(1, counts + 1):
            number_get = html.xpath(
                '//div[@id=\'videos\']/div[@class=\'grid columns\']/div[@class=\'grid-item column\'][' +
                str(count) + ']/a[@class=\'box\']/div[@class=\'uid\']/text()')[0]
            if number_get.upper() == number.upper():
                movie_found = 1
                break
        if movie_found == 0:
            raise Exception('Movie Data not found in javdb.main!')
        result_url = 'https://javdb.com' + html.xpath('//*[@id="videos"]/div/div/a/@href')[count - 1]
        # ==================================================================== fetch and validate the detail page
        html_info = get_html(result_url).replace(u'\xa0', u' ')
        if str(html_info) == 'ProxyError':
            raise TimeoutError
        # ==================================================================== score and outline
        imagecut = 1
        cover_small = ''
        outline = ''
        if isuncensored or re.match(r'^\d{4,}', number) or re.match(r'n\d{4}', number):
            # uncensored: collect the small cover and the score
            imagecut = 3
            cover_small = getCover_small(htmlcode, count - 1)
            score = getScore(html_info)
        elif 'HEYZO' in number.upper():
            # HEYZO: collect the small cover, the score and the outline
            imagecut = 3
            cover_small = getCover_small(htmlcode, count - 1)
            outline, score = getOutlineScore(number)
        else:
            # everything else: collect the score and the outline
            outline, score = getOutlineScore(number)
        # ==================================================================== collect the metadata
        actor = getActor(html_info)
        if len(actor) == 0 and 'FC2-' in number_get:
            actor.append('FC2-NoActor')
        dic = {
            'actor': str(actor).strip(" [',']").replace('\'', ''),
            'title': getTitle(html_info).replace('中文字幕', '').replace('無碼', '').replace(
                "\\n", '').replace('_', '-').replace(number_get, '').strip().replace(
                ' ', '-').replace('--', '-'),
            'studio': getStudio(html_info),
            'publisher': getPublisher(html_info),
            'outline': outline,
            'score': score,
            'runtime': getRuntime(html_info).replace(' 分鍾', ''),
            'director': getDirector(html_info),
            'release': getRelease(html_info),
            'number': number_get,
            'cover': getCover(html_info),
            'cover_small': cover_small,
            'extrafanart': getExtraFanart(html_info),
            'imagecut': imagecut,
            'tag': getTag(html_info),
            'series': getSeries(html_info),
            'year': getYear(getRelease(html_info)),
            'actor_photo': getActorPhoto(actor),
            'website': result_url,
            'source': 'javdb.py',
        }
    except TimeoutError:
        dic = {
            'title': '',
            'website': 'timeout',
        }
    except Exception as error_info:
        print('Error in javdb.main : ' + str(error_info))
        dic = {
            'title': '',
            'website': '',
        }
    js = json.dumps(
        dic,
        ensure_ascii=False,
        sort_keys=True,
        indent=4,
        separators=(',', ':'),
    )
    return js

def main(number):
    try:
        htmlcode = get_html('https://javdb.com/search?q=' + number + '&f=all').replace(u'\xa0', u' ')
        html = etree.fromstring(htmlcode, etree.HTMLParser())
        counts = len(html.xpath(
            '//div[@id=\'videos\']/div[@class=\'grid columns\']/div[@class=\'grid-item column\']'))
        if counts == 0:
            dic = {
                'title': '',
                'actor': '',
                'website': '',
            }
            js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))
            return js
        count = 1
        number_get = ''
        movie_found = 0
        result_url = ''
        for count in range(1, counts + 1):  # walk the search results to find the wanted ID
            number_get = html.xpath(
                '//div[@id=\'videos\']/div[@class=\'grid columns\']/div[@class=\'grid-item column\'][' +
                str(count) + ']/a[@class=\'box\']/div[@class=\'uid\']/text()')[0]
            # number_get = number_get.replace('_', '-')
            if number_get == number.upper() or number_get == number.lower():
                movie_found = 1
                break
        result_url = 'https://javdb.com' + html.xpath('//*[@id="videos"]/div/div/a/@href')[count - 1]
        b = get_html(result_url).replace(u'\xa0', u' ')
        actor = getActor(b)
        if len(actor) == 0 and 'FC2-' in number_get:
            actor.append('FC2-NoActor')
        if movie_found == 1:
            dic = {
                'actor': str(actor).strip(" [',']").replace('\'', ''),
                'title': getTitle(b).replace('中文字幕', '').replace("\\n", '').replace('_', '-').replace(
                    number_get, '').strip().replace(' ', '-').replace('--', '-'),
                'studio': getStudio(b),
                'publisher': getPublisher(b),
                'outline': getOutline(b).replace('\n', ''),
                'runtime': getRuntime(b).replace(' 分鍾', ''),
                'director': getDirector(b),
                'release': getRelease(b),
                'number': number_get,
                'cover': getCover(b),
                'cover_small': getCover_small(htmlcode, count - 1),
                'imagecut': 3,
                'tag': getTag(b),
                'series': getSeries(b),
                'year': getYear(getRelease(b)),
                'actor_photo': getActorPhoto(actor),
                'website': result_url,
                'source': 'javdb.py',
            }
        else:  # the ID was not found among the results
            dic = {
                'title': '',
                'actor': str(actor).strip(" [',']").replace('\'', ''),
                'website': '',
            }
    except Exception:
        # the actor field lets the caller tell whether the IP has been banned
        if htmlcode == 'ProxyError':
            dic = {
                'title': '',
                'actor': '',
                'website': 'timeout',
            }
        else:
            dic = {
                'title': '',
                'actor': '',
                'website': '',
            }
    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))
    return js

def main_uncensored(number):
    result_url = find_number(number)
    if result_url == 'not found':
        dic = {
            'title': '',
            'actor': '',
            'website': '',
        }
        js = json.dumps(
            dic,
            ensure_ascii=False,
            sort_keys=True,
            indent=4,
            separators=(',', ':'),
        )
        return js
    htmlcode = get_html(result_url)
    try:
        number = getNum(htmlcode)
        dic = {
            'title': getTitle(htmlcode).replace(number, '').strip().replace(' ', '-'),
            'studio': getStudio(htmlcode),
            'publisher': '',
            'year': getYear(getRelease(htmlcode)),
            'outline': '',
            'runtime': getRuntime(htmlcode).replace('分鐘', '').strip(),
            'director': getDirector(htmlcode),
            'actor': getActor(htmlcode),
            'release': getRelease(htmlcode),
            'number': getNum(htmlcode),
            'cover': getCover(htmlcode),
            'tag': getTag(htmlcode),
            'series': getSeries(htmlcode),
            'imagecut': 3,
            'cover_small': getCover_small(number),
            'actor_photo': getActorPhoto(htmlcode),
            'website': result_url,
            'source': 'javbus.py',
        }
        if dic['cover_small'] == '':
            dic['imagecut'] = 0
    except Exception as error_info:
        print('Error in javbus.main_uncensored : ' + str(error_info))
        if htmlcode == 'ProxyError':
            dic = {
                'title': '',
                'website': 'timeout',
            }
        else:
            dic = {
                'title': '',
                'website': '',
            }
    js = json.dumps(
        dic,
        ensure_ascii=False,
        sort_keys=True,
        indent=4,
        separators=(',', ':'),
    )
    return js

def main(number, javlibrary_url):
    try:
        htmlcode = get_html('http://' + javlibrary_url + '/ja/vl_searchbyid.php?keyword=' + number).replace(
            u'\xa0', u' ')
        title = getTitle(htmlcode)
        movie_found = 1
        if title == '':
            # we landed on a search-result page rather than a detail page: walk the results
            movie_found = 0
            html = etree.fromstring(htmlcode, etree.HTMLParser())
            count_all = len(html.xpath(
                "//div[@class='videothumblist']/div[@class='videos']/div[@class='video']"))
            for count in range(1, count_all + 1):
                number_get = str(html.xpath(
                    "//div[@class='videothumblist']/div[@class='videos']/div[" +
                    str(count) + "]/a/div[1]/text()")).strip(" ['']")
                if number_get == number.upper():
                    url_get = str(html.xpath(
                        "//div[@class='videothumblist']/div[@class='videos']/div[" +
                        str(count) + "]/a/@href")).strip(" ['.']")
                    htmlcode = get_html('http://' + javlibrary_url + '/ja' + url_get).replace(u'\xa0', u' ')
                    movie_found = 1
                    break
        if movie_found == 1:
            try:
                # fetch the outline from dmm
                dww_htmlcode = get_html(
                    "https://www.dmm.co.jp/digital/videoa/-/detail/=/cid=" + number.replace("-", '00'))
            except Exception:
                dww_htmlcode = ''
            actor = getActor(htmlcode)
            number = getNum(htmlcode)
            release = getRelease(htmlcode)
            dic = {
                'actor': str(actor).strip(" [',']").replace('\'', ''),
                'title': getTitle(htmlcode).replace('中文字幕', '').replace("\\n", '').replace(
                    '_', '-').replace(number, '').strip().replace(' ', '-').replace('--', '-'),
                'studio': getStudio(htmlcode),
                'publisher': getPublisher(htmlcode),
                'outline': getOutline(dww_htmlcode).replace('\n', '').replace(
                    '\\n', '').replace('\'', '').replace(',', '').replace(' ', ''),
                'runtime': getRuntime(htmlcode),
                'director': str(getDirector(htmlcode)).replace('----', ''),
                'release': release,
                'number': number,
                'cover': getCover(htmlcode),
                'imagecut': 1,
                'tag': getTag(htmlcode),
                'series': '',
                'year': getYear(release),
                'actor_photo': getActorPhoto(actor),
                'website': getWebsite(htmlcode),
                'source': 'javlibrary.py',
            }
        else:
            dic = {
                'title': '',
                'website': '',
            }
    except Exception:
        if htmlcode == 'ProxyError':
            dic = {
                'title': '',
                'website': 'timeout',
            }
        else:
            dic = {
                'title': '',
                'website': '',
            }
    js = json.dumps(
        dic,
        ensure_ascii=False,
        sort_keys=True,
        indent=4,
        separators=(',', ':'),
    )
    return js

def main(number):
    result_url = find_number(number)
    if result_url == 'not found':
        dic = {
            'title': '',
            'actor': '',
            'website': '',
        }
        js = json.dumps(
            dic,
            ensure_ascii=False,
            sort_keys=True,
            indent=4,
            separators=(',', ':'),
        )
        return js
    htmlcode = get_html(result_url)
    try:
        # fetch the outline from dmm
        dww_htmlcode = get_html(
            "https://www.dmm.co.jp/digital/videoa/-/detail/=/cid=" + number.replace("-", '00'))
    except Exception:
        dww_htmlcode = ''
    try:
        number = getNum(htmlcode)
        dic = {
            'title': str(getTitle(htmlcode)).replace(number, '').strip().replace(' ', '-'),
            'studio': getStudio(htmlcode),
            'publisher': getPublisher(htmlcode),
            'year': getYear(getRelease(htmlcode)),
            'outline': getOutline(dww_htmlcode).replace('\n', '').replace(
                '\\n', '').replace('\'', '').replace(',', '').replace(' ', ''),
            'runtime': getRuntime(htmlcode).replace('分鐘', '').strip(),
            'director': getDirector(htmlcode),
            'actor': getActor(htmlcode),
            'release': getRelease(htmlcode),
            'number': number,
            'cover': getCover(htmlcode),
            'imagecut': 1,
            'tag': getTag(htmlcode),
            'series': getSeries(htmlcode),
            'actor_photo': getActorPhoto(htmlcode),
            'website': result_url,
            'source': 'javbus.py',
        }
    except Exception as error_info:
        print('Error in javbus.main : ' + str(error_info))
        if htmlcode == 'ProxyError':
            dic = {
                'title': '',
                'website': 'timeout',
            }
        else:
            dic = {
                'title': '',
                'website': '',
            }
    js = json.dumps(
        dic,
        ensure_ascii=False,
        sort_keys=True,
        indent=4,
        separators=(',', ':'),
    )
    return js

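# Usage sketch, in the style of the commented examples elsewhere in this section:
# every main() above returns a JSON string whose 'title' is empty on failure and
# whose 'website' is 'timeout' when the proxy dropped, so callers can branch on
# those two fields (ID hypothetical):
# import json
# data = json.loads(main('ABC-123'))
# if data['title'] == '':
#     is_timeout = data.get('website') == 'timeout'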