def get_scenes(self, response):
    """Parse the Sins Life listing page into a list of SceneItem dicts.

    Returns a list of completed items; rows without a usable title are
    now skipped by the completeness check instead of raising.
    """
    scenelist = []
    scenes = response.xpath('//div[@class="item "]')
    for scene in scenes:
        item = SceneItem()
        item['performers'] = []
        item['tags'] = []
        item['trailer'] = ''
        item['image'] = ''
        item['description'] = ''
        item['network'] = "Sins Life"
        item['parent'] = "Sins Life"
        item['site'] = "Sins Life"
        # BUGFIX: default title/id so the completeness check below cannot
        # raise KeyError when the title node is missing from a tile.
        item['title'] = ''
        item['id'] = ''
        title = scene.xpath(
            './/div[contains(@class,"item-title")]/a/text()').get()
        if title:
            item['title'] = self.cleanup_title(title)
            # External id: title reduced to alphanumerics/dashes, lowered.
            externalid = re.sub('[^a-zA-Z0-9-]', '', item['title'])
            item['id'] = externalid.lower().strip().replace(" ", "-")
        item['url'] = response.url
        description = scene.xpath(
            './/div[@class="item-meta"]/div/text()').getall()
        if description:
            description = " ".join(description)
            # NOTE(review): this replace() looks like a no-op; it may
            # originally have stripped non-breaking spaces -- confirm.
            description = description.replace(" ", " ")
            item['description'] = self.cleanup_description(description)
        # Listing carries no release date; stamp with today.
        item['date'] = self.parse_date('today').isoformat()
        image = scene.xpath('.//img/@src0_3x').get()
        if not image:
            image = scene.xpath('.//img/@src').get()
        if image:
            item['image'] = image.strip()
        else:
            item['image'] = None
        item['image_blob'] = None
        if item['id'] and item['title'] and item['date']:
            scenelist.append(item.copy())
        item.clear()
    return scenelist
def parse_scene(self, response):
    """Build a partial SceneItem from the Watch4Beauty issue JSON and hand
    it off to parse_models, which fills in the performer list."""
    payload = response.json()
    item = SceneItem()
    if not len(payload):
        return
    entry = payload[0]
    item['title'] = entry['issue_title']
    issue_date = entry['issue_datetime']
    # Drop a trailing character when a "Z" suffix is present so
    # datetime.fromisoformat below accepts the timestamp.
    if "Z" in issue_date:
        issue_date = issue_date[:-1]
    item['date'] = issue_date
    item['description'] = entry['issue_text']
    cleaned = [tag.strip().capitalize()
               for tag in entry['issue_tags'].split(",")]
    item['tags'] = [tag for tag in cleaned if tag]
    item['site'] = "Watch4Beauty"
    item['network'] = "Watch4Beauty"
    item['parent'] = "Watch4Beauty"
    item['url'] = "https://www.watch4beauty.com/updates/" + entry[
        'issue_simple_title']
    item['id'] = entry['issue_simple_title']
    item['trailer'] = ''
    item[
        'image'] = "https://s5q3w2t8.ssl.hwcdn.net/production/%s-issue-cover-wide-2560.jpg" % (
            datetime.fromisoformat(item['date']).strftime('%Y%m%d'))
    item['image_blob'] = None
    item['performers'] = []
    modelurl = response.url + "/models"
    yield scrapy.Request(modelurl,
                         callback=self.parse_models,
                         meta={'item': item})
def parse_scene(self, response):
    """Assemble a SceneItem from values carried in response.meta plus the
    performers/image/tags scraped from the landing page."""
    meta = response.meta
    item = SceneItem()
    item['title'] = self.cleanup_title(meta['title'])
    # NOTE(review): this replace() is a no-op ("&" -> "&"); it was probably
    # meant to decode "&amp;" -- confirm against the upstream history.
    item['title'] = item['title'].replace("&", "&")
    # NOTE(review): the description is built from meta['title'], not a
    # separate description field -- confirm this is intentional.
    item['description'] = self.cleanup_description(meta['title'])
    item['description'] = item['description'].replace("&", "&")
    item['performers'] = self.get_performers(response)
    item['id'] = meta['id']
    item['site'] = meta['site']
    item['parent'] = meta['parent']
    item['network'] = meta['network']
    # No release date available; stamp with today.
    item['date'] = self.parse_date('today').isoformat()
    item['trailer'] = ''
    # Strip the "&nats=..." affiliate tracking suffix from the URL.
    item['url'] = re.search(r'(.*)\&nats', meta['url']).group(1)
    item['image'] = self.get_image(response)
    item['image_blob'] = None
    item['tags'] = self.get_tags(response)
    if self.debug:
        print(item)
    else:
        yield item
def parse_scene(self, response):
    """Parse a MetArt-network movie JSON document into a SceneItem.

    Yields the item when it passes the self.days date filter (days >
    27375 disables filtering).
    """
    movie = response.json()
    item = SceneItem()
    item['title'] = movie['name']
    item['description'] = movie['description']
    item['performers'] = []
    for performer in movie['models']:
        item['performers'].append(performer['name'])
    # BUGFIX: prefer the splash image and fall back to the clean cover.
    # The original pair of independent ifs reset the image to None
    # whenever 'splashImagePath' was absent, even if a cover was found.
    if 'splashImagePath' in movie:
        item['image'] = movie['splashImagePath']
    elif 'coverCleanImagePath' in movie:
        item['image'] = movie['coverCleanImagePath']
    else:
        item['image'] = None
    item['image_blob'] = None
    # Per-brand CDN prefixes; the default path resolves a relative link.
    if 'hustler' in response.url:
        item['image'] = ('https://cdn-hustlernetwork.metartnetwork.com/'
                         + movie['media']['siteUUID'] + item['image'])
    elif 'lovehairy' in response.url:
        item['image'] = ('https://cdn.metartnetwork.com/'
                         + movie['siteUUID'] + movie['splashImagePath'])
    else:
        item['image'] = self.format_link(response, item['image'])
    item['date'] = self.parse_date(movie['publishedAt']).isoformat()
    item['tags'] = movie['tags']
    item['trailer'] = self.format_url(
        response.url, '/api/m3u8/' + movie['UUID'] + '.m3u8')
    item['site'] = self.get_site(response)
    item['url'] = self.format_link(response, movie['path'])
    item['network'] = self.network
    item['parent'] = self.get_parent(response)
    # Path looks like ".../movie/<numeric id>/<slug>".
    res = re.search('movie/(\\d+)/(.+)', movie['path'])
    item['id'] = res.group(1) + "_" + res.group(2)
    days = int(self.days)
    if days > 27375:
        filterdate = "0000-00-00"
    else:
        filterdate = date.today() - timedelta(days)
        filterdate = filterdate.strftime('%Y-%m-%d')
    if self.debug:
        if not item['date'] > filterdate:
            item['filtered'] = "Scene filtered due to date restraint"
        print(item)
    else:
        if filterdate:
            if item['date'] > filterdate:
                yield item
        else:
            yield item
def parse_scene(self, response):
    """Yield SceneItems from the ItsPOV scene-list JSON payload."""
    meta = response.meta
    # The site uses a rate limit of 40 requests in a given minute.
    # BUGFIX: default the value; it was unbound (NameError) in the error
    # message below whenever the header was missing.
    ratelimit = None
    if 'X-Ratelimit-Remaining' in response.headers:
        ratelimit = int(response.headers['X-Ratelimit-Remaining'])
    try:
        jsondata = json.loads(response.text)
    except ValueError:
        # BUGFIX: narrowed the bare except and bail out; the original fell
        # through and hit a NameError on the unbound 'jsondata'.
        print(
            f'Failed retrieving {response.url}. X-Ratelimit was: {ratelimit}'
        )
        return
    data = jsondata['payload']['scenes']
    for row in data:
        item = SceneItem()
        item['id'] = data[row]['id']
        # Titles/stories arrive as HTML fragments: strip tags, decode
        # entities.
        item['title'] = html.unescape(
            re.sub('<[^<]+?>', '', data[row]['title']))
        item['description'] = html.unescape(
            re.sub('<[^<]+?>', '', data[row]['story']))
        item['url'] = "https://itspov.com/" + data[row]['url']
        item['image'] = data[row]['video_cover']['1500']
        item['image_blob'] = None
        item['date'] = self.parse_date(
            data[row]['translations'][0]['created_at']).isoformat()
        item['performers'] = []
        for model in data[row]['models']:
            item['performers'].append(
                data[row]['models'][model]['stage_name'])
        item['tags'] = []
        for tag in data[row]['main_scenetags']:
            item['tags'].append(
                data[row]['main_scenetags'][tag]['name'].strip().title())
        item['trailer'] = ''
        item['site'] = meta['site']
        item['parent'] = meta['site']
        item['network'] = 'Its POV'
        if item['id'] and item['title']:
            days = int(self.days)
            # days > 27375 (~75 years) disables the date filter.
            if days > 27375:
                filterdate = "0000-00-00"
            else:
                filterdate = date.today() - timedelta(days)
                filterdate = filterdate.strftime('%Y-%m-%d')
            if self.debug:
                if not item['date'] > filterdate:
                    item[
                        'filtered'] = "Scene filtered due to date restraint"
                print(item)
            else:
                if filterdate:
                    if item['date'] > filterdate:
                        yield item
                else:
                    yield item
def get_scenes(self, response):
    """Yield one image-lookup GraphQL request per Thicc 18 scene.

    The scene metadata is parsed here and stashed in meta['item']; the
    get_images callback completes the item with the thumbnail URI from
    the BatchFindAssetQuery response.
    """
    meta = response.meta
    jsondata = response.json()['data']['video']['list']['result']['edges']
    for jsonrow in jsondata:
        item = SceneItem()
        sceneid = jsonrow['node']['videoId']
        item['id'] = sceneid.replace(":", "-")
        item['title'] = self.cleanup_title(jsonrow['node']['title'])
        item['description'] = self.cleanup_description(
            jsonrow['node']['description']['long'])
        item['performers'] = []
        for performer in jsonrow['node']['talent']:
            item['performers'].append(performer['talent']['name'])
        item['site'] = "Thicc 18"
        item['network'] = "Thicc 18"
        item['parent'] = "Thicc 18"
        # The colon in the raw videoId must be %-encoded in the public URL.
        item['url'] = "https://thicc18.com/videos/" + sceneid.replace(
            ':', '%3A')
        item['date'] = self.parse_date('today').isoformat()
        item['trailer'] = ''
        item['tags'] = ['Big Ass']
        # Stash a copy for the GraphQL callback to finish and yield.
        meta['item'] = item.copy()
        # videoId is split into the two path segments of the member
        # thumbnail path.
        imagedata = jsonrow['node']['videoId'].split(":")
        imagequery = {
            "operationName": "BatchFindAssetQuery",
            "variables": {
                "paths": [
                    "/members/models/" + imagedata[0] + "/scenes/" +
                    imagedata[1] + "/videothumb.jpg",
                ]
            },
            "query":
            "query BatchFindAssetQuery($paths: [String!]!) {\n asset {\n batch(input: {paths: $paths}) {\n result {\nserve {\n uri\n}\n}\n}\n}\n}\n"
        }
        url = "https://thicc18.team18.app/graphql"
        imagequery = json.dumps(imagequery)
        yield Request(url,
                      headers=self.headers,
                      body=imagequery,
                      method="POST",
                      callback=self.get_images,
                      meta=meta)
def parse_scene(self, json):
    """Convert one Bang! search hit into a SceneItem.

    Returns the item when it passes the self.days date filter, otherwise
    None.  The parameter name 'json' shadows the stdlib module but is
    kept for interface compatibility.
    """
    item = SceneItem()
    item['id'] = json['_id']
    json = json['_source']
    if 'preview' in json:
        item['trailer'] = 'https://i.bang.com/v/%s/%s/preview720.mp4' % (
            json['dvd']['id'], json['identifier'])
    else:
        item['trailer'] = ''
    item['site'] = json['studio']['name'].title()
    # "Bang! Originals" is an umbrella studio; use the series name instead.
    if item['site'].lower().strip() == 'bang! originals' or item[
            'site'].lower().strip() == 'bang originals':
        item['site'] = json['series']['name'].title()
    item['title'] = json['name']
    item['description'] = json['description']
    item['date'] = json['releaseDate']
    item['tags'] = list(map(lambda x: x['name'].title(), json['genres']))
    item['performers'] = list(map(lambda x: x['name'], json['actors']))
    try:
        item[
            'image'] = 'https://i.bang.com/screenshots/%s/movie/%s/%s.jpg' % (
                json['dvd']['id'], json['order'],
                json['screenshots'][0]['screenId'])
    except (KeyError, IndexError):
        # BUGFIX: record the failure and default the field; leaving
        # 'image' unset made the requests.get() below raise KeyError.
        # Also narrowed the former 'except BaseException'.
        print(f"Index out of Range: {item['id']}")
        item['image'] = None
    item['image_blob'] = None
    if item['image']:
        imagereq = requests.get(item['image'])
        item['image_blob'] = base64.b64encode(
            imagereq.content).decode('utf-8')
    item['url'] = 'https://bang.com/video/%s' % item['id']
    item['network'] = 'Bang'
    item['parent'] = 'Bang'
    if item['title']:
        days = int(self.days)
        if days > 27375:
            filterdate = "0000-00-00"
        else:
            filterdate = date.today() - timedelta(days)
            filterdate = filterdate.strftime('%Y-%m-%d')
        if self.debug:
            if not item['date'] > filterdate:
                item['filtered'] = "Scene filtered due to date restraint"
            print(item)
        else:
            if filterdate:
                if item['date'] > filterdate:
                    return item
            else:
                return item
    return None
def parse_scene(self, response):
    """Extract a Virtual Taboo scene from the embedded JSON script block."""
    raw = response.xpath('//script[contains(@type, "json")]').get()
    payload = json.loads(re.search(r'(\{.*\})', raw).group(1))
    video = payload['video']
    item = SceneItem()
    item['performers'] = [actor['name'].title() for actor in video['actor']]
    item['title'] = self.cleanup_title(video['name'])
    item['description'] = self.cleanup_description(video['description']) or ''
    item['image'] = video['thumbnail'] or None
    item['image_blob'] = None
    item['trailer'] = ''
    item['url'] = video['url']
    item['id'] = re.search(r'videos/(.*)', item['url']).group(1)
    item['date'] = self.parse_date(
        video['datePublished'].strip()).isoformat()
    item['site'] = "Virtual Taboo"
    item['parent'] = "POVR"
    item['network'] = "POVR"
    # Drop resolution badges like "4K"; title-case and dedupe the rest.
    keywords = [kw for kw in video['keywords'] if not re.match(r'\d+K', kw)]
    item['tags'] = list(map(lambda x: x.strip().title(), set(keywords)))
    days = int(self.days)
    if days > 27375:
        filterdate = "0000-00-00"
    else:
        filterdate = (date.today() - timedelta(days)).strftime('%Y-%m-%d')
    if self.debug:
        if not item['date'] > filterdate:
            item['filtered'] = "Scene filtered due to date restraint"
        print(item)
    elif not filterdate or item['date'] > filterdate:
        yield item
def parse_scene(self, response):
    """Parse an In The Crack scene page delivered via FlareSolverr.

    The incoming response is the FlareSolverr JSON envelope; the page
    HTML lives in solution.response and the scene date travels in a
    'mydate' cookie.
    """
    jsondata = json.loads(response.body)
    htmlcode = jsondata['solution']['response']
    response = HtmlResponse(url=response.url,
                            body=htmlcode,
                            encoding='utf-8')
    # BUGFIX: default the value; it was unbound (NameError) whenever no
    # 'mydate' cookie arrived.  Also dropped an unused 'imagedata' dict.
    scenedate = None
    for cookie in jsondata['solution']['cookies']:
        if cookie['name'] == 'mydate':
            scenedate = cookie['value']
    item = SceneItem()
    if scenedate:
        item['date'] = dateparser.parse(scenedate).isoformat()
    else:
        item['date'] = dateparser.parse('today').isoformat()
    item['title'] = self.get_title(response)
    item['description'] = self.get_description(response)
    item['image'] = self.get_image(response)
    item['image_blob'] = self.get_image_blob(response)
    item['performers'] = self.get_performers(response)
    item['tags'] = self.get_tags(response)
    # The numeric scene id is the leading digits of the title.
    item['id'] = re.search(r'(\d+) ', item['title']).group(1)
    item['trailer'] = self.get_trailer(response)
    item['url'] = jsondata['solution']['url']
    item['network'] = "In The Crack"
    item['parent'] = "In The Crack"
    item['site'] = "In The Crack"
    days = int(self.days)
    if days > 27375:
        filterdate = "0000-00-00"
    else:
        filterdate = date.today() - timedelta(days)
        filterdate = filterdate.strftime('%Y-%m-%d')
    if self.debug:
        if not item['date'] > filterdate:
            item['filtered'] = "Scene filtered due to date restraint"
        print(item)
    else:
        if filterdate:
            if item['date'] > filterdate:
                yield item
        else:
            yield item
def get_scenes(self, response):
    """Yield SceneItems scraped from the episode preview tiles.

    Serves both the glaminogirls and lifepornstories front ends; site and
    parent labels are chosen from the response URL.
    """
    scenes = response.xpath('//div[contains(@class,"episode__preview")]')
    for scene in scenes:
        item = SceneItem()
        title = scene.xpath('.//h2/text()')
        if title:
            item['title'] = self.cleanup_title(title.get())
        else:
            item['title'] = ''
        item['image'] = None
        image = scene.xpath(
            './/div[@class="thumbnail_wrapper"]/img/@src').get()
        if image:
            # Drop the query string from the image URL.
            image = re.search(r'(.*)\?', image)
            if image:
                item['image'] = image.group(1).strip()
        item['image_blob'] = None
        item['performers'] = []
        performers = scene.xpath(
            './/span[@class="episode__artist__name"]/text()').get()
        if performers:
            item['performers'] = [
                html.unescape(string.capwords(performers.strip()))
            ]
        item['url'] = ''
        item['id'] = ''
        url = scene.xpath(
            './/div[contains(@class,"description")]/a/@href').get()
        if url:
            # Rebuild an absolute URL from the page's registered domain.
            item['url'] = "https://" + tldextract.extract(
                response.url).domain + ".com" + url.strip()
            # The id is the last path segment of the relative link.
            item['id'] = re.search(r'.*\/(.*?)\/', url).group(1)
        item['date'] = self.parse_date('today').isoformat()
        item['description'] = ''
        item['tags'] = []
        item['trailer'] = ''
        # NOTE(review): network is hard-wired to 'Czech Casting' even for
        # the Glamino/LifePornStories hosts -- confirm this is intended.
        item['network'] = 'Czech Casting'
        if "glaminogirls" in response.url:
            item['parent'] = "Glamino Girls"
            item['site'] = "Glamino Girls"
        if "lifepornstories" in response.url:
            item['parent'] = "Life P**n Stories"
            item['site'] = "Life P**n Stories"
        if item['id'] and item['title']:
            yield item
def get_scenes(self, response):
    """Walk the JSON movie list and yield one SceneItem per entry that
    survives the self.days date filter."""
    for entry in json.loads(response.text):
        item = SceneItem()
        item['title'] = self.cleanup_title(entry['fullName'])
        item['description'] = ''
        item['performers'] = [
            model['fullName'].strip().title() for model in entry['models']
        ]
        item['image'] = entry['previewImage960'] or None
        item['image_blob'] = None
        item['date'] = self.parse_date(entry['publishDate']).isoformat()
        item['tags'] = [
            tag['fullName'].strip().title() for tag in entry['tags']
        ]
        item['trailer'] = ''
        item['site'] = "MrBigfatdick"
        item['parent'] = "MrBigfatdick"
        item['network'] = "MrBigfatdick"
        item['url'] = "https://www.mrbigfatdick.com/videos/" + entry[
            'permaLink']
        item['id'] = entry['id']
        days = int(self.days)
        if days > 27375:
            filterdate = "0000-00-00"
        else:
            filterdate = (date.today() -
                          timedelta(days)).strftime('%Y-%m-%d')
        if self.debug:
            if not item['date'] > filterdate:
                item['filtered'] = "Scene filtered due to date restraint"
            print(item)
        elif not filterdate or item['date'] > filterdate:
            yield item
def get_scenes(self, response):
    """Yield one SceneItem per scene wrapper on an MMP Network page.

    Shared by myfirstpublic / shootourself / teenyplayground; site and
    parent are selected from the response URL.
    """
    scenes = response.xpath(
        '//div[@class="wrapper" and ./div[@class="scenewrapper"]]')
    for scene in scenes:
        item = SceneItem()
        title = scene.xpath('.//h1/text()').get()
        if title:
            item['title'] = self.cleanup_title(title)
        else:
            item['title'] = ''
        item['description'] = ''
        item['performers'] = []
        # No release date on the listing page; stamp with today.
        item['date'] = self.parse_date('today').isoformat()
        image = scene.xpath(
            './div/div[@class="snapshot"]/img/@src|./div/div//video/@poster'
        ).get()
        if image:
            item['image'] = self.format_link(response, image)
            # The scene id is the first path segment of the image URL.
            item['id'] = re.search(r'\.com/(.*?)/.*', image).group(1).strip()
        else:
            item['image'] = ''
            item['id'] = ''
        tags = scene.xpath('.//div[@class="tags"]/a/text()')
        if tags:
            item['tags'] = list(
                map(lambda x: x.strip().title(), tags.getall()))
        else:
            item['tags'] = []
        if "myfirstpublic" in response.url:
            item['site'] = "My First Public"
            item['parent'] = "My First Public"
        if "shootourself" in response.url:
            item['site'] = "Shoot Ourself"
            item['parent'] = "Shoot Ourself"
        if "teenyplayground" in response.url:
            item['site'] = "Teeny Playground"
            item['parent'] = "Teeny Playground"
        item['network'] = 'MMP Network'
        item['image_blob'] = ''
        item['trailer'] = ''
        item['url'] = response.url
        if item['id'] and item['title']:
            yield item
def get_json_scene(self, response):
    """Build a SceneItem from an AlterPic-network per-scene JSON document.

    site/parent/url are chosen from the response URL (alterpic vs
    kinkyponygirl).
    """
    jsonrow = response.json()
    item = SceneItem()
    item['id'] = str(jsonrow['id'])
    item['title'] = self.cleanup_title(jsonrow['title'])
    item['description'] = self.cleanup_description(jsonrow['description'])
    item['performers'] = []
    for performer in jsonrow['casts']:
        item['performers'].append(string.capwords(
            performer['screen_name']))
    item['network'] = "AlterPic"
    # NOTE(review): if the URL matches neither host, site/parent/url stay
    # unset -- confirm every start URL hits one of these branches.
    if "alterpic" in response.url:
        item['site'] = "Fetish Clinic"
        item['parent'] = "Fetish Clinic"
        item['url'] = "https://alterpic.com/videos/" + item['id']
    if "kinkyponygirl" in response.url:
        item['site'] = "Kinky Pony Girl"
        item['parent'] = "Kinky Pony Girl"
        item['url'] = "https://kinkyponygirl.com/videos/" + item['id']
    item['date'] = self.parse_date(jsonrow['publish_date']).isoformat()
    item['trailer'] = ''
    item['tags'] = []
    for tag in jsonrow['tags']:
        item['tags'].append(string.capwords(tag['name']))
    item['image'] = jsonrow['poster_src']
    item['image_blob'] = None
    if item['id'] and item['title']:
        days = int(self.days)
        # days > 27375 (~75 years) disables the date filter.
        if days > 27375:
            filterdate = "0000-00-00"
        else:
            filterdate = date.today() - timedelta(days)
            filterdate = filterdate.strftime('%Y-%m-%d')
        if self.debug:
            if not item['date'] > filterdate:
                item['filtered'] = "Scene filtered due to date restraint"
            print(item)
        else:
            if filterdate:
                if item['date'] > filterdate:
                    yield item
            else:
                yield item
def get_scenes(self, response):
    """Yield SceneItems from an ARX Bucks GraphQL scenes->edges payload.

    Falls back to Trans Day Spa site/url values when a scene node carries
    no sites entry.
    """
    jsondata = response.json()['data']['scenes']['edges']
    for jsonrow in jsondata:
        item = SceneItem()
        item['id'] = jsonrow['node']['id']
        item['title'] = jsonrow['node']['title']
        if len(jsonrow['node']['sites']):
            # match_site turns the numeric site id into its URL prefix.
            prefix = match_site(str(jsonrow['node']['sites'][0]['id']))
        else:
            prefix = "https://transdayspa.com"
        item['url'] = prefix + jsonrow['node']['url']
        item['image'] = jsonrow['node']['primaryPhotoUrl']
        item['image_blob'] = None
        item['date'] = jsonrow['node']['createdAt']
        item['trailer'] = jsonrow['node']['videoUrls']['trailer']
        item['description'] = jsonrow['node']['summary']
        item['tags'] = []
        for tag in jsonrow['node']['genres']:
            item['tags'].append(tag['name'])
        item['performers'] = []
        for performer in jsonrow['node']['actors']:
            item['performers'].append(performer['stageName'])
        if len(jsonrow['node']['sites']):
            item['site'] = jsonrow['node']['sites'][0]['name']
            item['parent'] = jsonrow['node']['sites'][0]['name']
        else:
            item['site'] = "Trans Day Spa"
            item['parent'] = "Trans Day Spa"
        item['network'] = "ARX Bucks"
        if item['id'] and item['title']:
            days = int(self.days)
            # days > 27375 (~75 years) disables the date filter.
            if days > 27375:
                filterdate = "0000-00-00"
            else:
                filterdate = date.today() - timedelta(days)
                filterdate = filterdate.strftime('%Y-%m-%d')
            if self.debug:
                if not item['date'] > filterdate:
                    item[
                        'filtered'] = "Scene filtered due to date restraint"
                print(item)
            else:
                if filterdate:
                    if item['date'] > filterdate:
                        yield item
                else:
                    yield item
def get_all_performers(self, response):
    """Override performers with correct value

    ifeelmyself displays videos with multiple performers as multiple
    videos, one under each name.  We search for other copies of the
    video and collect every artist that posted one.
    """
    # Rebuild the SceneItem that was serialized into the request meta.
    scene = SceneItem()
    stored = response.meta["scene"]
    for field in stored:
        scene[field] = stored[field]
    # Artist links name each copy of the video; underscores stand in for
    # spaces in the site's artist slugs.
    found = [
        link.replace("_", " ")
        for link in response.xpath(
            "//a[contains(@href,'artist')]/text()").getall()
    ]
    # Only override when the search actually turned up artists.
    if found:
        scene["performers"] = found
    yield scene
def parse_scene(self, response):
    """Parse the JSON-LD block embedded in a Bi Guys F**k scene page."""
    item = SceneItem()
    jsondata = response.xpath(
        '//script[@type="application/ld+json"]/text()').get()
    jsondata = jsondata.replace("\r\n", "")
    try:
        data = json.loads(jsondata.strip())
    except ValueError:
        # BUGFIX: narrowed the bare except and bail out; the original
        # printed and fell through to a NameError on the unbound 'data'.
        print(f'JSON Data: {jsondata}')
        return
    data = data[0]
    item['title'] = self.cleanup_title(data['name'])
    item['description'] = self.cleanup_description(
        data['description'].strip())
    item['date'] = self.parse_date(data['uploadDate'].strip()).isoformat()
    tags = data['keywords'].split(",")
    item['tags'] = list(map(lambda x: string.capwords(x.strip()), tags))
    item['performers'] = list(
        map(lambda x: string.capwords(x['name'].strip()), data['actor']))
    item['url'] = response.url
    item['image'] = data['thumbnailUrl'].replace(" ", "%20")
    item['image_blob'] = None
    item['trailer'] = ''
    item['site'] = 'Bi Guys F**k'
    item['parent'] = 'Bi Guys F**k'
    item['network'] = 'Bi Guys F**k'
    # The id is the final path segment of the page URL.
    item['id'] = re.search(r'.*/(.*?)$', response.url).group(1)
    days = int(self.days)
    if days > 27375:
        filterdate = "0000-00-00"
    else:
        filterdate = date.today() - timedelta(days)
        filterdate = filterdate.strftime('%Y-%m-%d')
    if self.debug:
        if not item['date'] > filterdate:
            item['filtered'] = "Scene filtered due to date restraint"
        print(item)
    else:
        if filterdate:
            if item['date'] > filterdate:
                yield item
        else:
            yield item
def get_scenes(self, response):
    """Yield a SceneItem for each series block, up to self.max_pages."""
    scenes = response.xpath(r'//div[@class="serie"]')
    if response.meta['page'] < self.max_pages:
        for scene in scenes:
            item = SceneItem()
            title = scene.xpath(r'./div/div[@class="serie_tekst"]'
                                '/strong/text()').get()
            if title:
                item['title'] = self.cleanup_title(title)
            else:
                item['title'] = ''
            description = scene.xpath(
                r'./div/div[@class="serie_tekst"]'
                '/strong/following-sibling::text()').get()
            if description:
                item['description'] = self.cleanup_description(description)
            else:
                item['description'] = ''
            item['performers'] = ['Lara Latex']
            item['tags'] = []
            # No release date on the listing; stamp with today.
            item['date'] = self.parse_date('today').isoformat()
            image = scene.xpath(
                r'./div/div[@class="serie_pic01"]/img/@src')
            if image:
                item['image'] = image.get().strip()
            else:
                item['image'] = None
            item['image_blob'] = None
            item['trailer'] = ''
            item['site'] = "Laras Playground"
            item['parent'] = "Laras Playground"
            item['network'] = "Laras Playground"
            # BUGFIX: the id is mined out of the image URL; guard the
            # regex so a missing image no longer raises TypeError, and
            # default the id so every yielded item carries the field.
            item['id'] = ''
            if item['image']:
                extern_id = re.search(r'.*\/(\d+)\/.*?\.jpg',
                                      item['image'])
                if extern_id:
                    item['id'] = extern_id.group(1).strip()
            item['url'] = response.url
            yield item
def get_scenes(self, response):
    """Yield SceneItems from a Mood Universe movie grid.

    cruel-mistresses uses a different page layout, hence the alternate
    xpath.  Tiles whose id matches "bestNNN" are skipped by the final
    check.
    """
    if "cruel-mistresses" in response.url:
        xpath = '//div[contains(@class,"movie")]'
    else:
        xpath = '//h2[contains(@id,"newestmovie") or contains(@id,"featuremovies") or contains(@id,"classics")]/following-sibling::div'
    scenes = response.xpath(xpath)
    for scene in scenes:
        item = SceneItem()
        title = scene.xpath('./h3/text()')
        if title:
            item['title'] = string.capwords(title.get())
            # Replace stray U+0096/U+0092 characters (likely cp1252
            # leftovers) with apostrophes and strip backticks.
            item['title'] = re.sub(u'\u0096', u"\u0027", item['title'])
            item['title'] = re.sub(u'\u0092', u"\u0027", item['title'])
            item['title'] = item['title'].replace('`', '')
        else:
            item['title'] = ''
        image = scene.xpath('.//img/@src')
        if image:
            image = image.get()
            item['image'] = self.format_link(response, image)
        else:
            item['image'] = None
        item['image_blob'] = None
        item['url'] = self.format_link(response,
                                       scene.xpath('./a[1]/@href').get())
        # No release date on the listing; stamp with today.
        item['date'] = self.parse_date('today').isoformat()
        item['id'] = scene.xpath('./@id').get().strip()
        item['tags'] = site_tags(tldextract.extract(response.url).domain)
        item['performers'] = []
        item['trailer'] = ''
        item['description'] = ''
        item['network'] = "Mood Universe"
        item['parent'] = match_site(tldextract.extract(response.url).domain)
        item['site'] = match_site(tldextract.extract(response.url).domain)
        if item['id'] and item['title'] and not re.match(
                r'best\d{1,3}', item['id']):
            yield item
def parse_scene(self, response):
    """Build a SceneItem from the page's JSON-LD VideoObject."""
    extractor = JsonLdExtractor()
    # Note: the original bound this list to a local named 'json',
    # shadowing the module; renamed for clarity.
    documents = extractor.extract(response.text)
    data = {}
    for candidate in documents:
        if candidate['@type'] == 'VideoObject':
            data = candidate
            break
    item = SceneItem()
    item['title'] = self.cleanup_title(data['name'])
    item['description'] = self.cleanup_description(data['description'])
    item['image'] = data['thumbnail']
    item['image_blob'] = None
    item['id'] = self.get_id(response)
    item['trailer'] = data['contentUrl']
    item['url'] = response.url
    item['date'] = self.parse_date(data['datePublished']).isoformat()
    item['site'] = data['author']['name']
    item['network'] = self.network
    item['parent'] = item['site']
    item['performers'] = [model['name'] for model in data['actor']]
    item['tags'] = self.get_tags(response)
    days = int(self.days)
    if days > 27375:
        filterdate = "0000-00-00"
    else:
        filterdate = (date.today() - timedelta(days)).strftime('%Y-%m-%d')
    if self.debug:
        if not item['date'] > filterdate:
            item['filtered'] = "Scene filtered due to date restraint"
        print(item)
    elif not filterdate or item['date'] > filterdate:
        yield item
def get_scenes(self, response):
    """Return a list of SceneItems parsed from the FemJoy API results.

    Returns a list rather than yielding; items outside the self.days
    window are dropped unless days > 27375 (~75 years), which disables
    the filter.
    """
    itemlist = []
    jsondata = json.loads(response.text)
    data = jsondata['results']
    for jsonentry in data:
        item = SceneItem()
        item['performers'] = []
        for model in jsonentry['actors']:
            item['performers'].append(model['name'].title())
        item['title'] = self.cleanup_title(jsonentry['title'])
        item['description'] = self.cleanup_description(
            jsonentry['long_description'])
        if not item['description']:
            item['description'] = ''
        item['image'] = jsonentry['thumb']['image']
        if not item['image']:
            item['image'] = None
        item['image_blob'] = None
        item['id'] = jsonentry['id']
        item['trailer'] = ''
        item['url'] = "https://femjoy.com" + jsonentry['url']
        item['date'] = self.parse_date(
            jsonentry['release_date'].strip()).isoformat()
        item['site'] = "FemJoy"
        item['parent'] = "FemJoy"
        item['network'] = "FemJoy"
        item['tags'] = []
        days = int(self.days)
        if days > 27375:
            filterdate = "0000-00-00"
        else:
            filterdate = date.today() - timedelta(days)
            filterdate = filterdate.strftime('%Y-%m-%d')
        if self.debug:
            if not item['date'] > filterdate:
                item['filtered'] = "Scene filtered due to date restraint"
            print(item)
        else:
            if filterdate:
                if item['date'] > filterdate:
                    itemlist.append(item.copy())
            else:
                itemlist.append(item.copy())
        # Copies were appended above, so clearing the working item is safe.
        item.clear()
    return itemlist
def parse_scenes(self, response):
    """Parse a Chastity Babes blog post into a SceneItem.

    Date and external id are mined from the "Posted on ... Update NNN |"
    info line; incomplete items are dropped by the final check instead of
    raising.
    """
    item = SceneItem()
    modelname = response.meta['name']
    item['performers'] = [modelname]
    # BUGFIX: default the conditionally-assigned fields so the
    # completeness check at the bottom cannot raise KeyError.
    item['title'] = ''
    item['description'] = ''
    item['date'] = ''
    title = response.xpath('//h1[@id="post-title"]/text()').get()
    if title:
        item['title'] = self.cleanup_title(title)
    description = response.xpath(
        '//div[@class="postcontent"]//p/text()').get()
    if description:
        item['description'] = self.cleanup_description(description)
    item['site'] = "Chastity Babes"
    item['parent'] = "Chastity Babes"
    item['network'] = "Chastity Babes"
    postinfo = response.xpath('//div[@class="post_info"]/text()').get()
    if postinfo:
        postinfo = postinfo.replace("\r\n", " ")
        postinfo = postinfo.replace("\n", " ")
        scenedate = re.search(r'Posted\s+on\s?(.*)\s?in', postinfo)
        if scenedate:
            scenedate = scenedate.group(1)
            item['date'] = self.parse_date(scenedate.strip()).isoformat()
        externalid = re.search(r'Update\s?(.*)\ ?\|', postinfo)
        if externalid:
            externalid = externalid.group(1)
            item['id'] = re.sub(r'\s+', '', externalid)
        else:
            item['id'] = ''
    else:
        item['id'] = ''
    image = response.xpath(
        '//div[@class="postcontent"]/a[1]/img/@src').get()
    if image:
        item['image'] = image.strip()
    else:
        item['image'] = None
    item['image_blob'] = None
    tags = response.xpath('//a[@rel="category tag"]/text()').getall()
    if tags:
        item['tags'] = list(map(lambda x: x.strip().title(), tags))
        # "Featured" is a site navigation category, not a real tag.
        if "Featured" in item['tags']:
            item['tags'].remove('Featured')
    else:
        item['tags'] = []
    item['url'] = response.url
    item['trailer'] = ''
    if item['id'] and item['title'] and item['date']:
        days = int(self.days)
        if days > 27375:
            filterdate = "0000-00-00"
        else:
            filterdate = date.today() - timedelta(days)
            filterdate = filterdate.strftime('%Y-%m-%d')
        if self.debug:
            if not item['date'] > filterdate:
                item['filtered'] = "Scene filtered due to date restraint"
            print(item)
        else:
            if filterdate:
                if item['date'] > filterdate:
                    yield item
            else:
                yield item
def parse_scene(self, response):
    """Generic scene parser: prefer values pre-scraped into response.meta,
    fall back to the per-field get_* helpers on the page itself.

    Only produces an item when the video title area is present.
    """
    item = SceneItem()
    if response.xpath(
            '//div[contains(@class,"video-name-area")]/h3/text()'):
        if 'title' in response.meta and response.meta['title']:
            item['title'] = response.meta['title']
        else:
            item['title'] = self.get_title(response)
        if 'description' in response.meta:
            item['description'] = response.meta['description']
        else:
            item['description'] = self.get_description(response)
        if 'site' in response.meta:
            item['site'] = response.meta['site']
        else:
            item['site'] = self.get_site(response)
        if 'date' in response.meta:
            item['date'] = response.meta['date']
        else:
            item['date'] = self.get_date(response)
        if 'image' in response.meta:
            item['image'] = response.meta['image']
        else:
            item['image'] = self.get_image(response)
        item['image_blob'] = None
        if 'performers' in response.meta:
            item['performers'] = response.meta['performers']
        else:
            item['performers'] = self.get_performers(response)
        if 'tags' in response.meta:
            item['tags'] = response.meta['tags']
        else:
            item['tags'] = self.get_tags(response)
        if 'id' in response.meta:
            item['id'] = response.meta['id']
        else:
            item['id'] = self.get_id(response)
        if 'trailer' in response.meta:
            item['trailer'] = response.meta['trailer']
        else:
            item['trailer'] = self.get_trailer(response)
        item['url'] = self.get_url(response)
        # network/parent may be declared on the spider class; otherwise
        # derive them from the page.
        if hasattr(self, 'network'):
            item['network'] = self.network
        else:
            item['network'] = self.get_network(response)
        if hasattr(self, 'parent'):
            item['parent'] = self.parent
        else:
            item['parent'] = self.get_parent(response)
        days = int(self.days)
        # days > 27375 (~75 years) disables the date filter.
        if days > 27375:
            filterdate = "0000-00-00"
        else:
            filterdate = date.today() - timedelta(days)
            filterdate = filterdate.strftime('%Y-%m-%d')
        if self.debug:
            if not item['date'] > filterdate:
                item['filtered'] = "Scene filtered due to date restraint"
            print(item)
        else:
            if filterdate:
                if item['date'] > filterdate:
                    yield item
            else:
                yield item
def parse_scene(self, response):
    """Parse an ATK scene page fetched through FlareSolverr.

    The page HTML lives in solution.response; the scene date and the
    performer name travel in 'mydate' / 'performer' cookies.
    """
    jsondata = response.json()
    htmlcode = jsondata['solution']['response']
    response = HtmlResponse(url=response.url,
                            body=htmlcode,
                            encoding='utf-8')
    response_url = jsondata['solution']['url']
    # BUGFIX: default both cookie-borne values; they were unbound
    # (NameError) whenever the corresponding cookie was missing.
    scenedate = None
    performer = None
    for cookie in jsondata['solution']['cookies']:
        if cookie['name'] == 'mydate':
            scenedate = cookie['value']
        if cookie['name'] == 'performer':
            performer = cookie['value']
    item = SceneItem()
    if scenedate:
        item['date'] = self.parse_date(scenedate).isoformat()
    else:
        item['date'] = self.parse_date('today').isoformat()
    if performer:
        item['performers'] = [performer]
    else:
        item['performers'] = []
    item['title'] = self.get_title(response)
    item['description'] = self.get_description(response)
    item['image'] = self.get_image(response)
    item['image_blob'] = self.get_image_blob(response)
    item['tags'] = self.get_tags(response)
    if "" in item['tags']:
        item['tags'].remove("")
    item['id'] = re.search(r'/movie/(.*?)/',
                           jsondata['solution']['url']).group(1)
    item['trailer'] = self.get_trailer(response)
    item['url'] = jsondata['solution']['url']
    item['network'] = "ATK Girlfriends"
    # Map the host in the resolved URL onto the concrete ATK brand.
    if "atkarchives" in response_url:
        item['parent'] = "ATK Archives"
        item['site'] = "ATK Archives"
    if "atkexotics" in response_url:
        item['parent'] = "ATK Exotics"
        item['site'] = "ATK Exotics"
    if "atkpremium" in response_url:
        item['parent'] = "ATK Premium"
        item['site'] = "ATK Premium"
    if "atkpetites" in response_url:
        item['parent'] = "ATK Petites"
        item['site'] = "ATK Petites"
    if "atkhairy" in response_url:
        item['parent'] = "ATK Hairy"
        item['site'] = "ATK Hairy"
    if "amkingdom" in response_url:
        item['parent'] = "ATK Galleria"
        item['site'] = "ATK Galleria"
    days = int(self.days)
    if days > 27375:
        filterdate = "0000-00-00"
    else:
        filterdate = date.today() - timedelta(days)
        filterdate = filterdate.strftime('%Y-%m-%d')
    if self.debug:
        if not item['date'] > filterdate:
            item['filtered'] = "Scene filtered due to date restraint"
        print(item)
    else:
        if filterdate:
            if item['date'] > filterdate:
                yield item
        else:
            yield item
def get_scenes(self, response):
    """Yield SceneItems from the window.__DATA__ blob embedded in the
    BJ Raw / Got Filled listing page."""
    responseresult = response.xpath(
        '//script[contains(text(),"window.__DATA__")]/text()').get()
    responsedata = re.search(r'__DATA__\ =\ (.*)', responseresult).group(1)
    jsondata = json.loads(responsedata)
    data = jsondata['videos']['items']
    for jsonentry in data:
        item = SceneItem()
        item['title'] = jsonentry['title']
        item['description'] = jsonentry['description']
        # Descriptions arrive as HTML; strip the tags.
        item['description'] = re.sub('<[^<]+?>', '',
                                     item['description']).strip()
        item['image'] = jsonentry['trailer']['poster']
        if not isinstance(item['image'], str):
            item['image'] = None
        item['image_blob'] = None
        item['id'] = jsonentry['id']
        item['trailer'] = jsonentry['trailer']['src']
        if item['trailer'] == "https://c2d8j4g8.ssl.hwcdn.net/6/0/2/5/8/60258852ed44c/bjr0005_rachaelcavalli _trailer.mp4":
            # For some reason shows this scene trailer as invalid
            item['trailer'] = ''
        item['date'] = jsonentry['release_date']
        # Build the URL slug from the title: ascii-only, lowered, hyphens.
        urltext = re.sub(r'[^A-Za-z0-9 ]+', '', jsonentry['title']).lower()
        # NOTE(review): the first replace looks like a no-op (space to
        # space) -- it may originally have collapsed double spaces.
        urltext = urltext.replace(" ", " ")
        urltext = urltext.replace(" ", "-")
        if 'bjraw' in response.url:
            urltext = "https://www.bjraw.com/tour/videos/" + str(
                jsonentry['id']) + "/" + urltext
            item['tags'] = ['B*****b']
            item['site'] = "BJ Raw"
            item['parent'] = "BJ Raw"
            item['network'] = "BJ Raw"
        if 'gotfilled' in response.url:
            urltext = "https://www.gotfilled.com/tour/videos/" + str(
                jsonentry['id']) + "/" + urltext
            item['tags'] = ['Creampie']
            item['site'] = "Got Filled"
            item['parent'] = "Got Filled"
            item['network'] = "Got Filled"
        item['url'] = urltext
        item['performers'] = []
        for model in jsonentry['models']:
            item['performers'].append(model['name'])
        days = int(self.days)
        if days > 27375:
            filterdate = "0000-00-00"
        else:
            filterdate = date.today() - timedelta(days)
            filterdate = filterdate.strftime('%Y-%m-%d')
        if self.debug:
            if not item['date'] > filterdate:
                item['filtered'] = "Scene filtered due to date restraint"
            print(item)
        else:
            if filterdate:
                if item['date'] > filterdate:
                    yield item
            else:
                yield item
        # NOTE(review): clearing the item after yielding empties the dict
        # the downstream pipeline received if processing is deferred --
        # confirm this is intended.
        item.clear()
def parse_scene(self, response):
    """Build a SceneItem from a Teamskeet-network JSON scene response.

    Supports both the legacy API and the v2 ("store2") API, whose payload
    nests the scene under ``_source`` and uses the shared teamskeet.com
    movie URL.
    """
    data = response.json()
    item = SceneItem()
    is_v2 = "store2" in response.url
    # FIX: reuse the is_v2 flag instead of re-testing the same substring.
    if is_v2:
        data = data['_source']
    item['title'] = data['title']
    item['description'] = data['description']
    item['image'] = data['img']
    item['image_blob'] = None
    item['tags'] = data.get('tags', [])
    item['id'] = data['id']
    if 'videoTrailer' in data:
        item['trailer'] = data['videoTrailer']
    elif 'video' in data:
        # Cloudflare Stream manifest built from the video id.
        item['trailer'] = 'https://videodelivery.net/' + \
            data['video'] + '/manifest/video.m3u8'
    else:
        item['trailer'] = ''
    item['network'] = self.network
    item['parent'] = response.meta['site']
    if 'publishedDate' in data:
        item['date'] = self.parse_date(data['publishedDate']).isoformat()
    else:
        item['date'] = self.parse_date('today').isoformat()
    # Site name from the payload when present, else from request meta.
    if 'site' in data and 'name' in data['site']:
        item['site'] = data['site']['name']
    else:
        item['site'] = response.meta['site']
    if is_v2:
        item['url'] = "https://www.teamskeet.com/movies/" + data['id']
    else:
        item['url'] = "https://www." + response.meta['site'].replace(
            " ", "").lower() + ".com/movies/" + data['id']
    item['performers'] = [model['modelName']
                          for model in data.get('models', [])]
    days = int(self.days)
    if days > 27375:
        # Sentinel: effectively "no filtering".
        filterdate = "0000-00-00"
    else:
        filterdate = date.today() - timedelta(days)
        filterdate = filterdate.strftime('%Y-%m-%d')
    if self.debug:
        if not item['date'] > filterdate:
            item['filtered'] = "Scene filtered due to date restraint"
        print(item)
    else:
        if filterdate:
            if item['date'] > filterdate:
                yield item
        else:
            yield item
def get_scenes(self, response):
    """Yield Legs Japan scenes scraped from the paired left/right
    ``tContent`` divs on a listing page."""
    scenes = response.xpath(
        '(//div[@class="tContent left"]|//div[@class="tContent right"])')

    # The date cutoff is loop-invariant: compute it once up front
    # (was recomputed inside the per-scene loop).
    days = int(self.days)
    if days > 27375:
        # Sentinel: effectively "no filtering".
        filterdate = "0000-00-00"
    else:
        filterdate = date.today() - timedelta(days)
        filterdate = filterdate.strftime('%Y-%m-%d')

    for scene in scenes:
        item = SceneItem()
        title = scene.xpath('.//h3[1]/strong/text()').get()
        item['title'] = self.cleanup_title(title) if title else ''
        # NOTE(review): description uses the exact same xpath as the title —
        # presumably there is no separate blurb on the page; confirm.
        description = scene.xpath('.//h3[1]/strong/text()').get()
        item['description'] = self.cleanup_description(description) if description else ''
        performers = scene.xpath('.//h1/text()').getall()
        item['performers'] = [x.strip() for x in performers] if performers else []
        tags = scene.xpath('.//h4[contains(text(),"tags")]/strong/a/text()').getall()
        item['tags'] = [x.strip() for x in tags] if tags else []
        scenedate = scene.xpath('.//h3[contains(text(),"released")]/strong/text()').get()
        if scenedate:
            item['date'] = self.parse_date(scenedate,
                                           date_formats=['%m/%d/%Y']).isoformat()
        else:
            # BUGFIX: was `item['date'] = []`, which made the later
            # `item['date'] > filterdate` comparison raise TypeError.
            item['date'] = self.parse_date('today').isoformat()
        image = scene.xpath('./preceding-sibling::div[1]/@style').get()
        if image:
            # FIX: dot before "jpg" is now escaped, and the match is
            # guarded — the original called .group(1) unconditionally and
            # raised AttributeError when the style held no jpg URL.
            match = re.search(r'(https:.*\.jpg)', image)
            image = match.group(1) if match else None
        item['image'] = image.strip() if image else None
        item['image_blob'] = None
        # Defaults so the item is always complete even without an image
        # (the original left 'id' unset and then crashed building the URL).
        item['id'] = None
        item['trailer'] = ''
        if item['image']:
            id_match = re.search(r'samples/(.*?)/', item['image'])
            if id_match and id_match.group(1):
                extern_id = id_match.group(1).strip()
                item['id'] = extern_id
                item['trailer'] = ("https://cdn.legsjapan.com/samples/"
                                   + extern_id + "/sample.mp4")
        item['site'] = "Legs Japan"
        item['parent'] = "Legs Japan"
        item['network'] = "Digital J Media"
        if item['id']:
            item['url'] = "https://www.legsjapan.com/en/samples/" + item['id']
        else:
            # Fall back to the listing URL when no sample id was found.
            item['url'] = response.url
        if self.debug:
            if not item['date'] > filterdate:
                item['filtered'] = "Scene filtered due to date restraint"
            print(item)
        else:
            if filterdate:
                if item['date'] > filterdate:
                    yield item
            else:
                yield item
def get_scenes(self, response):
    """Walk the ``videoBlock`` cards: follow links matching the external-id
    selector into parse_scene, otherwise build a SceneItem directly from
    the card contents."""
    scenes = response.xpath('//div[@class="videoBlock"]')

    # The date cutoff is loop-invariant: compute it once up front.
    days = int(self.days)
    if days > 27375:
        # Sentinel: effectively "no filtering".
        filterdate = "0000-00-00"
    else:
        filterdate = date.today() - timedelta(days)
        filterdate = filterdate.strftime('%Y-%m-%d')

    for scene in scenes:
        # The release date lives in an HTML comment containing "Release".
        scenedate = scene.xpath(
            './/comment()[contains(.,"Release")]').get()
        if scenedate:
            # FIX: guard the match — the original called .group(1)
            # unconditionally and raised AttributeError when the comment
            # contained no mm/dd/yyyy date.
            datematch = re.search(r'(\d{2}/\d{2}/\d{4})', scenedate)
            if datematch:
                scenedate = self.parse_date(
                    datematch.group(1),
                    date_formats=['%m/%d/%Y']).isoformat()
            else:
                scenedate = None
        scenelink = scene.xpath('./div/a/@href').get()
        # FIX: also require scenelink to be non-None before re.search,
        # which previously crashed on cards without a link.
        if scenelink and re.search(self.get_selector_map('external_id'),
                                   scenelink) and "signup.php" not in scenelink:
            yield scrapy.Request(url=self.format_link(response, scenelink),
                                 callback=self.parse_scene,
                                 meta={'date': scenedate})
        else:
            item = SceneItem()
            # BUGFIX: was `item['date'] = scenedate` even when None, which
            # made the later string comparison raise TypeError.
            item['date'] = scenedate if scenedate else self.parse_date(
                'today').isoformat()
            title = scene.xpath('.//span/text()').get()
            item['title'] = string.capwords(title.strip()) if title else ''
            # Derive a slug-style id from the title.
            extern_id = item['title'].replace(" ", "-").replace(
                "_", "-").strip().lower()
            extern_id = re.sub('[^a-zA-Z0-9-]', '', extern_id)
            item['id'] = extern_id.strip() if extern_id else None
            # Prefer the highest-resolution thumbnail available.
            image = scene.xpath('.//img/@src0_4x').get()
            if not image:
                image = scene.xpath('.//img/@src0_3x').get()
            if not image:
                image = scene.xpath('.//img/@src0_2x').get()
            if not image:
                image = scene.xpath('.//img/@src0_1x').get()
            item['image'] = image.strip() if image else None
            item['image_blob'] = None
            performers = scene.xpath('./p/a/text()').getall()
            item['performers'] = [x.strip() for x in performers] if performers else []
            item['tags'] = []
            item['trailer'] = ''
            item['url'] = scenelink
            item['network'] = "Siren XXX Studios"
            item['parent'] = "Siren XXX Studios"
            item['site'] = self.get_site(response)
            item['description'] = ''
            if self.debug:
                if not item['date'] > filterdate:
                    item['filtered'] = "Scene filtered due to date restraint"
                print(item)
            else:
                if filterdate:
                    if item['date'] > filterdate:
                        yield item
                else:
                    yield item
def get_scenes(self, response):
    """Yield POV Perv scenes from the listing page's ``content-item-medium``
    cards."""
    scenes = response.xpath('//div [@class="content-item-medium"]')

    # The date cutoff is loop-invariant: compute it once up front
    # (was recomputed for every qualifying scene).
    days = int(self.days)
    if days > 27375:
        # Sentinel: effectively "no filtering".
        filterdate = "0000-00-00"
    else:
        filterdate = date.today() - timedelta(days)
        filterdate = filterdate.strftime('%Y-%m-%d')

    for scene in scenes:
        item = SceneItem()
        title = scene.xpath('./div//h3/a/text()')
        item['title'] = self.cleanup_title(title.get()) if title else None
        scenedate = scene.xpath(
            './div//span[@class="date"]/span/following-sibling::text()')
        if scenedate:
            scenedate = scenedate.get()
            # Strip ordinal suffixes ("1st", "22nd") so the date parses.
            scenedate = re.sub(r"([0123]?[0-9])(st|th|nd|rd)?", r"\1",
                               scenedate)
            item['date'] = self.parse_date(scenedate,
                                           date_formats=['%d %b %Y']).isoformat()
        else:
            item['date'] = self.parse_date('today').isoformat()
        performers = scene.xpath('.//h4[@class="models"]/a/text()')
        if performers:
            item['performers'] = [self.cleanup_title(x)
                                  for x in performers.getall()]
        else:
            item['performers'] = []
        item['image'] = None
        image = scene.xpath('.//a/@data-images')
        if image:
            # FIX: guard the match — the original called .group(1)
            # unconditionally and raised AttributeError when data-images
            # contained no .jpg URL. (Also drops the odd chained
            # `item['image'] = image = ...` assignment.)
            match = re.search(r'(http.*?\.jpg)', image.get())
            if match:
                item['image'] = self.format_link(
                    response, match.group(1).replace("\\", ""))
        item['image_blob'] = None
        item['description'] = ''
        item['tags'] = ['POV']
        item['trailer'] = None
        item['url'] = response.url
        item['network'] = 'POV Perv'
        item['parent'] = 'POV Perv'
        item['site'] = 'POV Perv'
        if item['title']:
            externid = item['title'].replace(" ", "-").lower()
            item['id'] = re.sub('[^a-zA-Z0-9-]', '', externid)
        else:
            item['id'] = None
        # NOTE(review): hard cutoff — scenes dated on/before 2021-02-26 are
        # always skipped; the reason is not visible here, confirm upstream.
        if item['title'] and item['id'] and item['date'] > '2021-02-26':
            if self.debug:
                if not item['date'] > filterdate:
                    item['filtered'] = "Scene filtered due to date restraint"
                print(item)
            else:
                if filterdate:
                    if item['date'] > filterdate:
                        yield item
                else:
                    yield item
def get_scenes(self, response):
    """Yield XSiteAbility-network scenes from ``li.first`` listing entries.

    Per-site variations (description location, performer/tag extraction)
    are driven by ``response.meta['site']`` and the ``site_performers`` /
    ``site_tags`` helpers. Scenes without a derivable id are dropped.
    """
    meta = response.meta
    scenes = response.xpath('//li[contains(@class,"first")]')
    for scene in scenes:
        item = SceneItem()
        url = scene.xpath('./a/@href')
        if url:
            url = url.get()
            item['url'] = self.format_link(response, url)
        else:
            item['url'] = ''
        title = scene.xpath('.//h3/text()')
        if title:
            title = title.get()
            title = title.replace("*", "")
            item['title'] = string.capwords(title.strip())
        else:
            # No heading: fall back to the "setid=NNN" query fragment of the
            # scene URL as a synthetic title ("Setid NNN").
            # NOTE(review): .group(1) is unguarded — crashes if the URL has
            # no setid parameter; confirm all sites carry it.
            title = re.search(r'(setid=\d+)', item['url']).group(1)
            item['title'] = title.replace('=', ' ').title()
        item['title'] = self.cleanup_title(item['title'])
        image = scene.xpath('.//img/@src')
        if image:
            image = image.get().replace(" ", "%20")
            item['image'] = self.format_link(response, image.strip())
            # Derive the scene id from the image filename (basename without
            # extension), stripping thumbnail/encoding artifacts.
            item['id'] = re.search(r'.*/(.*)\.', item['image']).group(1)
            item['id'] = item['id'].replace("_tn", "").replace('%20', '')
            item['id'] = re.sub(r'[^a-zA-Z0-9-]', '', item['id'])
        else:
            item['image'] = None
            item['id'] = None
        item['image_blob'] = None
        # Brendas Bound stores the blurb in a styled <span>; the other
        # sites keep it in the link body.
        if meta['site'] == "Brendas Bound":
            description = scene.xpath(
                './/span[contains(@style,"font-size: medium;")]/em/text()')
        else:
            description = scene.xpath('./a/div/p//text()')
        if description:
            description = description.getall()
            item['description'] = " ".join(description).replace('\xa0', '').strip()
            # Remove photo-count / runtime boilerplate from the blurb.
            item['description'] = re.sub(r'\d{1,3} photos', '',
                                         item['description'],
                                         flags=re.IGNORECASE)
            item['description'] = re.sub(r'\d{1,3}:\d{1,3} video', '',
                                         item['description'],
                                         flags=re.IGNORECASE)
            # NOTE(review): single-space -> single-space substitution is a
            # no-op as written; likely meant to collapse doubled spaces.
            item['description'] = re.sub(' ', ' ', item['description'])
            item['description'] = self.cleanup_description(item['description'])
        else:
            item['description'] = ''
        # The release date is only available embedded in the description:
        # try "Month D, YYYY" first, then "DD.DD.DD", else default to today.
        scenedate = re.search(r' (\w+ \d{1,2}, \d{4}) ', item['description'])
        if not scenedate:
            scenedate = re.search(r'(\d{2}\.\d{2}\.\d{2})', item['description'])
        if scenedate:
            item['date'] = self.parse_date(scenedate.group(1).strip()).isoformat()
        else:
            item['date'] = self.parse_date('today').isoformat()
        item['performers'] = self.site_performers(scene, meta)
        item['tags'] = self.site_tags(scene, meta)
        item['trailer'] = ''
        item['site'] = meta['site']
        item['parent'] = meta['site']
        item['network'] = 'XSiteAbility'
        # Only scenes with a usable id are considered for emission.
        if item['id']:
            days = int(self.days)
            if days > 27375:
                # Sentinel: effectively "no filtering".
                filterdate = "0000-00-00"
            else:
                filterdate = date.today() - timedelta(days)
                filterdate = filterdate.strftime('%Y-%m-%d')
            if self.debug:
                if not item['date'] > filterdate:
                    item['filtered'] = "Scene filtered due to date restraint"
                print(item)
            else:
                if filterdate:
                    if item['date'] > filterdate:
                        yield item
                else:
                    yield item
def parse_scene(self, response):
    """Assemble a SceneItem for a scene page, preferring values handed over
    in ``response.meta`` and falling back to the per-field ``get_*``
    scrapers; spider-level attributes override both for site/network/parent.
    The finished item is passed through ``check_item`` before being yielded.
    """
    meta = response.meta
    item = SceneItem()

    def pick(key, scraper):
        # Meta wins when the key is present (even if falsy); otherwise
        # scrape the value from the page. The scraper is only invoked
        # when actually needed.
        return meta[key] if key in meta else scraper(response)

    # Title is special: the meta value must also be non-empty to be used.
    item['title'] = meta['title'] if meta.get('title') else self.get_title(response)

    item['description'] = pick('description', self.get_description)

    # Site: spider attribute beats meta beats page scrape.
    if hasattr(self, 'site'):
        item['site'] = self.site
    else:
        item['site'] = pick('site', self.get_site)

    item['date'] = pick('date', self.get_date)

    item['image'] = pick('image', self.get_image)
    if 'image' not in item or not item['image']:
        item['image'] = None

    item['image_blob'] = pick('image_blob', self.get_image_blob)
    # If no blob was produced but we do have an image URL, fetch the blob
    # from that link; otherwise normalize a missing blob to None.
    if ('image_blob' not in item or not item['image_blob']) and item['image']:
        item['image_blob'] = self.get_image_blob_from_link(item['image'])
    if 'image_blob' not in item:
        item['image_blob'] = None

    item['performers'] = pick('performers', self.get_performers)
    item['tags'] = pick('tags', self.get_tags)
    item['id'] = pick('id', self.get_id)
    item['trailer'] = pick('trailer', self.get_trailer)

    # The canonical URL always comes from the page, never from meta.
    item['url'] = self.get_url(response)

    # Network and parent mirror the site precedence rules.
    if hasattr(self, 'network'):
        item['network'] = self.network
    else:
        item['network'] = pick('network', self.get_network)
    if hasattr(self, 'parent'):
        item['parent'] = self.parent
    else:
        item['parent'] = pick('parent', self.get_parent)

    yield self.check_item(item, self.days)