def get_scenes(self, response):
    """Parse a FemJoy JSON API listing into a list of SceneItems.

    Reads the ``results`` array from the JSON response body, builds one
    SceneItem per entry, and keeps only scenes whose date string compares
    newer than the configured day window (``self.days``).  In debug mode
    every item is printed (with a ``filtered`` marker when it would have
    been dropped) instead of being collected.

    :param response: scrapy Response whose body is the JSON listing.
    :return: list of SceneItem copies that passed the date filter.
    """
    itemlist = []
    jsondata = json.loads(response.text)

    # The cutoff only depends on self.days, so compute it once instead of
    # once per scene (it was loop-invariant in the original).
    days = int(self.days)
    if days > 27375:
        # ~75 years: effectively "no date filtering".
        filterdate = "0000-00-00"
    else:
        filterdate = (date.today() - timedelta(days)).strftime('%Y-%m-%d')

    for jsonentry in jsondata['results']:
        item = SceneItem()
        item['performers'] = [model['name'].title()
                              for model in jsonentry['actors']]
        item['title'] = self.cleanup_title(jsonentry['title'])
        item['description'] = self.cleanup_description(
            jsonentry['long_description'])
        if not item['description']:
            item['description'] = ''
        item['image'] = jsonentry['thumb']['image']
        if not item['image']:
            # Normalize falsy image values ('' etc.) to None.
            item['image'] = None
        item['image_blob'] = None
        item['id'] = jsonentry['id']
        item['trailer'] = ''
        item['url'] = "https://femjoy.com" + jsonentry['url']
        item['date'] = self.parse_date(
            jsonentry['release_date'].strip()).isoformat()
        item['site'] = "FemJoy"
        item['parent'] = "FemJoy"
        item['network'] = "FemJoy"
        item['tags'] = []

        if self.debug:
            if not item['date'] > filterdate:
                item['filtered'] = "Scene filtered due to date restraint"
            print(item)
        else:
            # filterdate is always a non-empty string, so the former
            # "if filterdate:" guard was redundant.
            if item['date'] > filterdate:
                itemlist.append(item.copy())
        item.clear()
    return itemlist
def get_scenes(self, response):
    """Parse a Sins Life listing page into a list of SceneItems.

    One item per ``div.item `` node.  Fix: ``item['title']`` and
    ``item['id']`` are now initialized to ``''`` up front — previously a
    scene without a title node left both fields unset and the final
    ``if item['id'] and item['title']`` guard raised KeyError on the
    scrapy Item.

    :param response: scrapy Response for the listing page.
    :return: list of SceneItem copies that have an id, title and date.
    """
    scenelist = []
    scenes = response.xpath('//div[@class="item "]')
    for scene in scenes:
        item = SceneItem()
        item['performers'] = []
        item['tags'] = []
        item['trailer'] = ''
        item['image'] = ''
        item['description'] = ''
        item['network'] = "Sins Life"
        item['parent'] = "Sins Life"
        item['site'] = "Sins Life"
        # Defaults so the bottom guard can't KeyError when no title node.
        item['title'] = ''
        item['id'] = ''
        title = scene.xpath(
            './/div[contains(@class,"item-title")]/a/text()').get()
        if title:
            item['title'] = self.cleanup_title(title)
            # External id: title with everything but [a-zA-Z0-9-] removed.
            externalid = re.sub('[^a-zA-Z0-9-]', '', item['title'])
            item['id'] = externalid.lower().strip().replace(" ", "-")
        item['url'] = response.url
        description = scene.xpath(
            './/div[@class="item-meta"]/div/text()').getall()
        if description:
            description = " ".join(description)
            # Collapse double spaces introduced by the join.
            description = description.replace("  ", " ")
            item['description'] = self.cleanup_description(description)
        # No per-scene date on the page; stamp with today.
        item['date'] = self.parse_date('today').isoformat()
        image = scene.xpath('.//img/@src0_3x').get()
        if not image:
            image = scene.xpath('.//img/@src').get()
        if image:
            item['image'] = image.strip()
        else:
            item['image'] = None
        item['image_blob'] = None
        if item['id'] and item['title'] and item['date']:
            scenelist.append(item.copy())
        item.clear()
    return scenelist
def get_scenes(self, response):
    """Parse the embedded ``window.__DATA__`` JSON on BJ Raw / Got Filled
    listing pages and yield one SceneItem per video.

    Scenes older than the configured day window (``self.days``) are
    skipped; in debug mode items are printed instead of yielded.
    Site/parent/network/tags and the URL prefix are chosen by which
    domain the response came from.
    """
    responseresult = response.xpath(
        '//script[contains(text(),"window.__DATA__")]/text()').get()
    responsedata = re.search(r'__DATA__\ =\ (.*)', responseresult).group(1)
    jsondata = json.loads(responsedata)

    # Loop-invariant date cutoff, hoisted out of the scene loop.
    days = int(self.days)
    if days > 27375:
        filterdate = "0000-00-00"
    else:
        filterdate = (date.today() - timedelta(days)).strftime('%Y-%m-%d')

    for jsonentry in jsondata['videos']['items']:
        item = SceneItem()
        item['title'] = jsonentry['title']
        item['description'] = re.sub('<[^<]+?>', '',
                                     jsonentry['description']).strip()
        item['image'] = jsonentry['trailer']['poster']
        if not isinstance(item['image'], str):
            item['image'] = None
        item['image_blob'] = None
        item['id'] = jsonentry['id']
        item['trailer'] = jsonentry['trailer']['src']
        if item['trailer'] == "https://c2d8j4g8.ssl.hwcdn.net/6/0/2/5/8/60258852ed44c/bjr0005_rachaelcavalli _trailer.mp4":
            # For some reason shows this scene trailer as invalid
            item['trailer'] = ''
        item['date'] = jsonentry['release_date']

        # Build a URL slug from the title: strip punctuation, collapse
        # double spaces, then dash-separate.
        urltext = re.sub(r'[^A-Za-z0-9 ]+', '', jsonentry['title']).lower()
        urltext = urltext.replace("  ", " ")
        urltext = urltext.replace(" ", "-")
        if 'bjraw' in response.url:
            urltext = ("https://www.bjraw.com/tour/videos/"
                       + str(jsonentry['id']) + "/" + urltext)
            item['tags'] = ['B*****b']
            item['site'] = "BJ Raw"
            item['parent'] = "BJ Raw"
            item['network'] = "BJ Raw"
        if 'gotfilled' in response.url:
            urltext = ("https://www.gotfilled.com/tour/videos/"
                       + str(jsonentry['id']) + "/" + urltext)
            item['tags'] = ['Creampie']
            item['site'] = "Got Filled"
            item['parent'] = "Got Filled"
            item['network'] = "Got Filled"
        item['url'] = urltext
        item['performers'] = [model['name'] for model in jsonentry['models']]

        if self.debug:
            if not item['date'] > filterdate:
                item['filtered'] = "Scene filtered due to date restraint"
            print(item)
        else:
            # filterdate is always truthy; the old "if filterdate:" guard
            # (and its unconditional-yield else) was dead code.
            if item['date'] > filterdate:
                yield item
        item.clear()
def get_scenes(self, response):
    """Parse the embedded ``window.__DATA__`` JSON on TopWebModels listing
    pages and yield one SceneItem per video.

    Bug fix: the original performer-splitting code rebound the loop
    variable ``model`` to a *string* while splitting "A and B" credits,
    so the following ``model['name']`` raised TypeError (string index).
    Rewritten with if/elif/else on a separate ``name`` variable.
    Scenes older than the configured day window are skipped; in debug
    mode items are printed instead of yielded.
    """
    responseresult = response.xpath(
        '//script[contains(text(),"window.__DATA__")]/text()').get()
    responsedata = re.search(r'__DATA__\ =\ (.*)', responseresult).group(1)
    jsondata = json.loads(responsedata)

    # Loop-invariant date cutoff, hoisted out of the scene loop.
    days = int(self.days)
    if days > 27375:
        filterdate = "0000-00-00"
    else:
        filterdate = (date.today() - timedelta(days)).strftime('%Y-%m-%d')

    for jsonentry in jsondata['data']['videos']['items']:
        item = SceneItem()
        item['title'] = jsonentry['title']
        item['description'] = re.sub('<[^<]+?>', '',
                                     jsonentry['description']).strip()
        item['image'] = jsonentry['thumb']
        if not isinstance(item['image'], str):
            item['image'] = None
        else:
            item['image'] = item['image'].replace(" ", "%20")
        item['image_blob'] = None
        item['id'] = jsonentry['id']
        item['trailer'] = ''
        # Build a URL slug from the title: strip punctuation, collapse
        # double spaces, then dash-separate.
        urltext = re.sub(r'[^A-Za-z0-9 ]+', '', jsonentry['title']).lower()
        urltext = urltext.replace("  ", " ")
        urltext = urltext.replace(" ", "-")
        item['url'] = ("https://tour.topwebmodels.com/scenes/"
                       + str(jsonentry['id']) + "/" + urltext)
        item['date'] = jsonentry['release_date']
        item['site'] = match_site(jsonentry['sites'][0]['name'])
        item['network'] = 'TopWebModels'
        item['parent'] = 'TopWebModels'

        item['performers'] = []
        for model in jsonentry['models']:
            name = model['name']
            # Multi-performer credits arrive as "A and B" / "A & B";
            # split them into individual, title-cased names.
            if " and " in name.lower():
                for part in name.split(" and "):
                    item['performers'].append(part.title())
            elif " & " in name.lower():
                for part in name.split(" & "):
                    item['performers'].append(part.title())
            else:
                item['performers'].append(name)

        item['tags'] = []
        for tag in jsonentry['tags']:
            # "Scott's Picks" is an editorial shelf, not a real tag.
            if "scott's picks" not in tag['name'].lower():
                item['tags'].append(string.capwords(tag['name']))

        if self.debug:
            if not item['date'] > filterdate:
                item['filtered'] = "Scene filtered due to date restraint"
            print(item)
        else:
            # filterdate is always truthy; the old "if filterdate:" guard
            # was dead code.
            if item['date'] > filterdate:
                yield item
        item.clear()
def parse_scenepage(self, response):
    """Parse a Teen Core Club JSON scene listing into a list of SceneItems.

    The site name is carried in ``response.meta['site']``.  Performer
    credits use "+" / "&" as joiners and are split into individuals.
    Single-word titles get the performer name(s) prefixed for clarity.
    Scenes older than the configured day window (``self.days``) are
    dropped; in debug mode items are printed instead of collected.

    :param response: scrapy Response whose body is the JSON listing.
    :return: list of SceneItem copies that passed the date filter.
    """
    itemlist = []
    meta = response.meta
    parsed_uri = urlparse(response.url)
    domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
    jsondata = json.loads(response.text)

    # Loop-invariant date cutoff, hoisted out of the scene loop.
    days = int(self.days)
    if days > 27375:
        filterdate = "0000-00-00"
    else:
        filterdate = (date.today() - timedelta(days)).strftime('%Y-%m-%d')

    for jsonentry in jsondata['data']:
        item = SceneItem()
        item['performers'] = []
        for model in jsonentry['actors']:
            # "+" acts as a joiner in multi-performer credits.
            model['name'] = model['name'].replace("+", "&").strip()
            if "&" in model['name']:
                for star in model['name'].split("&"):
                    item['performers'].append(star.strip().title())
            else:
                item['performers'].append(model['name'].title())
        item['title'] = jsonentry['title_en']
        # Single-word titles are ambiguous; prefix the performer name(s).
        if len(re.findall(r'\w+', item['title'])) == 1 and len(
                item['performers']):
            if len(item['performers']) > 1:
                item['title'] = ", ".join(
                    item['performers']) + " (" + item['title'] + ")"
            else:
                item['title'] = item['performers'][0] + " (" + item[
                    'title'] + ")"
        item['description'] = jsonentry['description_en']
        if not item['description']:
            item['description'] = ''
        item['image'] = jsonentry['screenshots'][0]
        if isinstance(item['image'], str):
            # Screenshots are protocol-relative URLs.
            item['image'] = "https:" + item['image']
        else:
            item['image'] = None
        item['image_blob'] = None
        item['id'] = jsonentry['id']
        item['trailer'] = ''
        item['url'] = domain + "video/" + str(
            jsonentry['id']) + "/" + jsonentry['slug']
        item['date'] = self.parse_date(
            jsonentry['publication_start'].strip()).isoformat()
        if not item['date']:
            # Fall back to the creation timestamp.
            item['date'] = self.parse_date(
                jsonentry['created_at'].strip()).isoformat()
        item['site'] = meta['site']
        item['parent'] = "Teen Core Club"
        item['network'] = "Teen Core Club"
        item['tags'] = []

        if self.debug:
            if not item['date'] > filterdate:
                item['filtered'] = "Scene filtered due to date restraint"
            print(item)
        else:
            # filterdate is always a non-empty string; the old
            # "if filterdate:" guard was redundant.
            if item['date'] > filterdate:
                itemlist.append(item.copy())
        item.clear()
    return itemlist
def parse_scenepage(self, response):
    """Parse an XXX Horror blog-style listing page into SceneItems.

    Bug fix: when the ``<time datetime>`` attribute is missing, the
    original passed the ``None`` value straight into ``parse_date``; it
    now falls back to ``parse_date('today')`` like the other scrapers in
    this file.  Only scenes with an id are considered, and those older
    than the configured day window are dropped (printed in debug mode).

    :param response: scrapy Response for the listing page.
    :return: list of SceneItem copies that passed the filters.
    """
    scenelist = []

    # Loop-invariant date cutoff, hoisted out of the scene loop.
    days = int(self.days)
    if days > 27375:
        filterdate = "0000-00-00"
    else:
        filterdate = (date.today() - timedelta(days)).strftime('%Y-%m-%d')

    for scene in response.xpath('//article[contains(@class,"post")]'):
        item = SceneItem()
        item['performers'] = []
        item['title'] = ''
        item['id'] = ''
        title = scene.xpath('./header/h2/a/text()').get()
        if title:
            item['title'] = self.cleanup_title(title)
        url = scene.xpath('./header/h2/a/@href').get()
        if url:
            item['url'] = url.strip()
            # Scene id is the last path segment of the post URL.
            scene_id = re.search(r'.*/(.*?)/$', item['url']).group(1)
            if scene_id:
                item['id'] = scene_id.strip()
        scenedate = scene.xpath(
            './/time[contains(@class,"published")]/@datetime').get()
        if scenedate:
            scenedate = scenedate.strip()
        else:
            # BUG FIX: was self.parse_date(scenedate) with scenedate=None.
            scenedate = self.parse_date('today').isoformat()
        item['date'] = scenedate
        description = scene.xpath(
            './/div[@class="entry-content"]/p/text()').getall()
        if not description:
            description = scene.xpath(
                './/div[@class="entry-content"]//img/following-sibling::text()'
            ).getall()
        if description:
            description = list(map(lambda x: x.strip(), description))
            description = " ".join(description)
            item['description'] = self.cleanup_description(description)
        else:
            item['description'] = ''
        image = scene.xpath(
            './/div[@class="entry-content"]/figure//img/@src').get()
        if not image:
            image = scene.xpath(
                './/img[contains(@src,"uploads")]/@src').get()
        if image:
            image = image.strip()
        else:
            image = None
        item['image'] = image
        item['image_blob'] = None
        performers = scene.xpath(
            './/span[@class="cat-links"]/a/text()').getall()
        if performers:
            item['performers'] = list(map(lambda x: x.strip(), performers))
        else:
            item['performers'] = []
        tags = scene.xpath(
            './/span[@class="tags-links"]/a/text()').getall()
        if tags:
            item['tags'] = list(map(lambda x: x.strip().title(), tags))
        else:
            item['tags'] = []
        item['trailer'] = ''
        item['parent'] = "XXX Horror"
        item['network'] = "XXX Horror"
        item['site'] = "XXX Horror"
        if item['id']:
            if self.debug:
                if not item['date'] > filterdate:
                    item[
                        'filtered'] = "Scene filtered due to date restraint"
                print(item)
            else:
                # filterdate is always truthy; the old guard was redundant.
                if item['date'] > filterdate:
                    scenelist.append(item.copy())
        item.clear()
    return scenelist
def get_scenes(self, response):
    """Parse an Arch Angel listing page into a list of SceneItems.

    Fixes two latent crashes: (a) the scene-id regex previously called
    ``.group(1)`` unconditionally and raised AttributeError for URLs not
    ending in ``.html``; (b) when no URL was found, ``item['id']`` was
    never assigned and the bottom guard raised KeyError.  Scenes older
    than the configured day window are dropped (printed in debug mode).

    :param response: scrapy Response for the listing page.
    :return: list of SceneItem copies that passed the filters.
    """
    scenelist = []

    # Loop-invariant date cutoff, hoisted out of the scene loop.
    days = int(self.days)
    if days > 27375:
        filterdate = "0000-00-00"
    else:
        filterdate = (date.today() - timedelta(days)).strftime('%Y-%m-%d')

    for scene in response.xpath('//div[@class="item-episode"]'):
        item = SceneItem()
        item['site'] = "Arch Angel"
        item['parent'] = "Arch Angel"
        item['network'] = "Arch Angel"
        title = scene.xpath('.//h3/a/text()')
        if title:
            item['title'] = self.cleanup_title(title.get())
        else:
            item['title'] = 'No Title Available'
        scenedate = scene.xpath(
            './/strong[contains(text(),"Date")]/following-sibling::text()')
        if scenedate:
            item['date'] = self.parse_date(scenedate.get()).isoformat()
        else:
            item['date'] = self.parse_date('today').isoformat()
        performers = scene.xpath(
            './div[@class="item-info"]//a[contains(@href,"/models/") or contains(@href,"sets.php")]/text()'
        ).getall()
        if len(performers):
            item['performers'] = list(
                map(lambda x: string.capwords(x.strip()), performers))
        else:
            item['performers'] = []
        image = scene.xpath('.//span[@class="left"]/a/img/@src0_1x').get()
        if image:
            # Relative path cleanup before prefixing the domain.
            image = image.replace('//', '/').strip()
            image = image.replace('#id#', '').strip()
            image = "https://www.archangelvideo.com" + image
            item['image'] = image.strip()
        else:
            item['image'] = None
        item['image_blob'] = None
        item['trailer'] = ''
        url = scene.xpath('.//span[@class="left"]/a/@href').get()
        if url:
            item['url'] = url.strip()
            # Guard: URLs not ending in ".html" used to crash .group(1).
            idmatch = re.search(r'.*/(.*).html', url)
            item['id'] = idmatch.group(1).strip().lower() if idmatch else ''
        else:
            item['url'] = ''
            # Guard: was left unset, causing KeyError at the check below.
            item['id'] = ''
        item['description'] = ''
        item['tags'] = []
        if item['title'] and item['id']:
            if self.debug:
                if not item['date'] > filterdate:
                    item[
                        'filtered'] = "Scene filtered due to date restraint"
                print(item)
            else:
                # filterdate is always truthy; the old guard was redundant.
                if item['date'] > filterdate:
                    scenelist.append(item.copy())
        item.clear()
    return scenelist
def parse_scenepage(self, response):
    """Parse a Fetish Network scene page into a list of SceneItems.

    Layout differs per domain (brutaldungeon uses a download-box layout;
    the others use a content-image-video layout), and performer extraction
    is special-cased per site based on the title text.  Bug fix: the
    scene-id regex was applied to ``item['image']`` even when no image was
    found (``None``), raising TypeError; it is now guarded.

    :param response: scrapy Response for the scene page.
    :return: list of SceneItem copies that have an id.
    """
    scenelist = []
    if "brutaldungeon" in response.url:
        scenes = response.xpath(
            '//div[contains(@class,"download-box-large")]/label/..')
    else:
        scenes = response.xpath(
            '//div[@class="row"]/div[contains(@class,"content-image-video")]'
        )
    for scene in scenes:
        item = SceneItem()
        item['performers'] = []
        item['title'] = ''
        item['id'] = ''
        if "brutaldungeon" in response.url:
            title = scene.xpath('.//h1/text()').get()
        else:
            title = scene.xpath(
                './/div[contains(@class,"vedio-text-area")]/h4/text()').get()
        if title:
            title = title.strip()
            # Per-site performer extraction from the title text.
            if "Latina Patrol" in title:
                performers = title.replace("Latina Patrol", "")
                word_list = performers.split()
                if len(word_list) == 2 or len(word_list) == 3:
                    item['performers'] = [performers.strip()]
            if "Teens In The Woods" in title:
                performers = title.replace("Teens In The Woods", "")
                performers = performers.replace("-", "")
                performers = performers.replace("&", ",")
                performers = performers.strip()
                for performeritem in performers.split(","):
                    word_list = performeritem.split()
                    if len(word_list) == 2 or len(word_list) == 3:
                        item['performers'].append(performeritem.strip())
            if "brutalpickups" in response.url or "bdsmprison" in response.url:
                item['performers'] = [title]
            if "brutaldungeon" in response.url:
                item['performers'] = []
            item['title'] = title
        # Renamed local from "date" to avoid shadowing datetime.date.
        if "brutaldungeon" in response.url:
            scenedate = scene.xpath(
                './/span[contains(@class,"date")]/text()').get()
        else:
            scenedate = scene.xpath(
                './/div[contains(@class,"image-text-area")]/h4/text()').get()
        if scenedate:
            item['date'] = self.parse_date(scenedate.strip()).isoformat()
        else:
            item['date'] = self.parse_date('today').isoformat()
        if "brutaldungeon" in response.url:
            description = scene.xpath('.//p/text()').get()
        else:
            description = scene.xpath('.//h5/text()').get()
        if description:
            item['description'] = description.replace(
                "Description: ", "").strip()
        else:
            item['description'] = ''
        image = scene.xpath('.//video/@poster').get()
        if not image:
            image = scene.xpath(
                './/div[contains(@class,"image-section-blk")]/a/img/@src'
            ).get()
        if not image:
            image = scene.xpath(
                './/div[contains(@class,"image-section-blk")]/a/img/@data-src'
            ).get()
        if not image:
            image = scene.xpath(
                './/label[@class="player"]//video/@poster').get()
        if not image:
            image = scene.xpath(
                './/label[@class="player"]/a/img/@src').get()
        if image:
            # Images are relative to the ".../t2/" base of the page URL.
            baseurl = re.search(r'(.*\/t2\/)', response.url).group(1)
            image = baseurl + image.strip()
        else:
            image = None
        item['image'] = image
        item['image_blob'] = None
        idcode = ''
        # BUG FIX: guard against item['image'] being None before re.search.
        if item['image'] and re.search(r'p\d{3,4}_s\d{3,4}_\d{3,4}_',
                                       item['image']):
            idcode = re.search(r'p\d{3,4}_s\d{3,4}_(\d{3,4})_',
                               item['image']).group(1)
        if idcode:
            item['id'] = idcode.strip()
        elif item['title']:
            # Fall back to a slug built from the title.
            item['id'] = item['title'].replace(" ", "-")
        item['url'] = response.url
        item['tags'] = []
        item['trailer'] = ''
        item['parent'] = "Fetish Network"
        item['network'] = "Fetish Network"
        sitename = tldextract.extract(response.url).domain
        item['site'] = match_site(sitename)
        if item['id']:
            scenelist.append(item.copy())
        item.clear()
    return scenelist
def get_scenes(self, response):
    """Parse an Aletta Ocean Live listing page into a list of SceneItems.

    Fixes two latent crashes: (a) the background-image regex on the
    inline ``style`` attribute called ``.group(1)`` unconditionally;
    (b) when no URL was found, ``item['id']`` was never assigned and the
    bottom guard raised KeyError.  Scenes older than the configured day
    window are dropped (printed in debug mode).

    :param response: scrapy Response for the listing page.
    :return: list of SceneItem copies that passed the filters.
    """
    scenelist = []

    # Loop-invariant date cutoff, hoisted out of the scene loop.
    days = int(self.days)
    if days > 27375:
        filterdate = "0000-00-00"
    else:
        filterdate = (date.today() - timedelta(days)).strftime('%Y-%m-%d')

    for scene in response.xpath('//div[@class="movie-set-list-item"]'):
        item = SceneItem()
        item['site'] = "Aletta Ocean Live"
        item['parent'] = "Aletta Ocean Live"
        item['network'] = "Aletta Ocean"
        title = scene.xpath('.//div[contains(@class,"title")]/text()')
        if title:
            item['title'] = self.cleanup_title(title.get())
        else:
            item['title'] = 'No Title Available'
        scenedate = scene.xpath('.//div[contains(@class,"date")]/text()')
        if scenedate:
            item['date'] = self.parse_date(
                scenedate.get().strip()).isoformat()
        else:
            item['date'] = self.parse_date('today').isoformat()
        item['performers'] = ['Aletta Ocean']
        style = scene.xpath('./@style').get()
        image = None
        if style:
            # The thumbnail URL is embedded in the inline style attribute.
            imagematch = re.search(r'url\((.*.jpg)', style)
            if imagematch:
                image = imagematch.group(1)
        if image:
            item['image'] = image.strip()
        else:
            item['image'] = ''
        # NOTE(review): other scrapers in this file use None here;
        # keeping '' to preserve this scraper's existing output.
        item['image_blob'] = ''
        item['trailer'] = ''
        url = scene.xpath('./a/@href').get()
        if url:
            item['url'] = url.strip()
            # Guard: URLs not ending in ".html" used to crash .group(1).
            idmatch = re.search(r'.*/(.*).html', url)
            item['id'] = idmatch.group(1).strip().lower() if idmatch else ''
        else:
            item['url'] = ''
            # Guard: was left unset, causing KeyError at the check below.
            item['id'] = ''
        item['description'] = ''
        item['tags'] = []
        if item['title'] and item['id']:
            if self.debug:
                if not item['date'] > filterdate:
                    item[
                        'filtered'] = "Scene filtered due to date restraint"
                print(item)
            else:
                # filterdate is always truthy; the old guard was redundant.
                if item['date'] > filterdate:
                    scenelist.append(item.copy())
        item.clear()
    return scenelist