class FilmReprocess:
    def __init__(self):
        self.db = MongoPipeline(
            MONGODB_HOST,
            MONGODB_PORT,
            MONGODB_DB
        )

    def reprocess(self):
        collection = "reprocess_item_film"
        response = self.db.get(collection, where={"status": "checked"})
        for item in response.get("data"):
            for child in item.get("data"):
                status = self.db.get("film", where={"film_id": child})
                if status.get("count") == 0:
                    self.db.reprocess_item("film", child)
            self.db.updateOne(collection, {"status": "save"}, {
                'key': "id",
                'value': str(item.get("id"))
            })
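# A minimal usage sketch (not part of the original module, shown only for context):
# assuming MONGODB_HOST, MONGODB_PORT and MONGODB_DB are available from the project
# settings, the reprocess pass can be triggered directly.
if __name__ == "__main__":
    FilmReprocess().reprocess()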
class FilmDetailSpider(scrapy.Spider):
    name = "film-detail"
    allowed_domains = []
    base_url = None
    start_urls = []
    collection = None
    pipeline = set([MongoPipeline, RequiredFieldsPipeline])
    required_fields = ["film_id"]
    mongo_requirement = {
        "primary": "film_id",
        "collection": "film",
        "source": "list"
    }

    def __init__(self, data):
        scrapy.Spider.__init__(self)
        self.base_url = data.get("base_url")
        self.collection = data.get("collection").get("film")
        self.allowed_domains = [x for x in data.get("allowed_domains")]
        self.db = MongoPipeline(
            data.get("database").get("host"),
            data.get("database").get("port"),
            data.get("database").get("db"))
        self.start_urls = self.populate_start_urls()
        # self.start_urls = ["http://www.imdb.com/title/tt1398426"]

    @classmethod
    def from_crawler(cls, crawler):
        data = dict()
        data.update({"base_url": crawler.settings.get("BASE_URL")})
        data.update({"collection": crawler.settings.get("COLLECTION")})
        data.update(
            {"allowed_domains": crawler.settings.get("ALLOWED_DOMAIN")})
        data.update({
            "database": {
                "host": crawler.settings.get("MONGODB_HOST"),
                "port": crawler.settings.get("MONGODB_PORT"),
                "db": crawler.settings.get("MONGODB_DB")
            }
        })
        return cls(data)

    def populate_start_urls(self):
        # Only crawl films that have not been enriched with detail data yet.
        BASE_URL = "http://www.imdb.com/title/"
        db = self.db.get("film", where={"film_stars": {"$exists": False}})
        return [
            '{}{}'.format(BASE_URL, a.get('film_id'))
            for a in db.get('data')
        ]

    def parse(self, response):
        name = response.css(
            ".titleBar > .title_wrapper > h1::text").extract_first()
        if name:
            name = name.strip()

        temp_year = response.css("#titleYear > a::text").extract_first()
        if temp_year:
            # "5432112345" is a sentinel that keeps the year-range separator
            # alive across the digit-only filter below.
            temp_year = re.sub("[()]", "", temp_year)\
                .replace(u"\u2013", "5432112345").replace(u"-", "5432112345")
        else:
            temp_year = ""
        temp_year = re.sub("[^0-9.]+", "", temp_year).split("5432112345")
        if len(temp_year) > 1:
            temp_year[1] = int(
                temp_year[1]) if temp_year[1] != '' else 'On Going'
        if len(temp_year) > 0 and temp_year[0] != "":
            year = {'start': int(temp_year[0]), 'end': temp_year[1]} if len(temp_year) > 1 \
                else {'start': int(temp_year[0]), 'end': int(temp_year[0])}
            type = "Movie"
        else:
            component = response.css(
                ".titleBar > .title_wrapper > .subtext > a[href*='title']::text"
            ).extract_first()
            year = self.get_year_movie(component)
            type = "TV Series"

        length = response.css(
            ".titleBar > .title_wrapper > .subtext > time[itemprop='duration']::attr(datetime)"
        ).extract_first()
        length = int(re.sub("[^0-9.]+", "",
                            length.strip())) if length else 'Data not found'

        short_description = response.css(
            ".plot_summary_wrapper > .plot_summary > .summary_text::text"
        ).extract_first()
        if short_description:
            short_description = unicodedata.normalize(
                'NFKD', short_description.strip()).encode('ascii', 'ignore')

        person_title = response.css(
            "div.plot_summary > .credit_summary_item > h4::text").extract()
        person_title = [
            re.sub("[^a-zA-Z]+", "", item).lower() if item else None
            for item in person_title
        ]
        person_value_html = response.css(
            "div.plot_summary > .credit_summary_item")
        person_value = list()
        for item in person_value_html:
            user = item.css("span > a > span::text").extract()
            user = [{
                "name": str(
                    unicodedata.normalize('NFKD', x.strip()).encode(
                        'ascii', 'ignore'))
            } if x else {
                "name": None
            } for x in user]
            link = item.css("span > a::attr(href)").extract()
            link = [{
                "id": str(x.strip().split("?")[0].replace("/name/", ""))
            } if x else {
                "id": None
            } for x in link]
            # Merge the id of each link into the matching name dict.
            [x.update(link[i]) for i, x in enumerate(user)]
            person_value.append(user)
        person = dict(zip(person_title, person_value))

        rating = dict()
        imdbRating = response.css(
            "div.imdbRating > div.ratingValue > strong > span::text"
        ).extract_first()
        if imdbRating:
            imdbRating = float(imdbRating.strip().replace(",", "."))
        else:
            imdbRating = "IMDB rating is not found"
        metascoreRating = response.css(
            "div.titleReviewBar > .titleReviewBarItem > a > div.metacriticScore > span::text"
        ).extract_first()
        if metascoreRating:
            metascoreRating = int(metascoreRating.strip().replace(",", "."))
        else:
            metascoreRating = "Metascore rating is not found"
        rating.update({"imdb": imdbRating, "metascore": metascoreRating})

        genreElement = response.css(
            ".titleBar > .title_wrapper > .subtext > a[href*='genre']")
        genre = list()
        for item in genreElement:
            genreItems = item.css("span::text").extract_first()
            if genreItems:
                genre.append(str(genreItems.strip()))

        time = dict()
        date_release = response.css(
            ".titleBar > .title_wrapper > .subtext > a[href*='title'] > meta::attr(content)"
        ).extract_first()
        if date_release:
            try:
                time.update({
                    "iso": datetime.strptime(date_release.strip(), '%Y-%m-%d')
                })
            except Exception:
                # Some titles only expose a year, so fall back to that.
                time.update(
                    {"iso": datetime.strptime(date_release.strip(), '%Y')})
            finally:
                print 'done'
            time.update({"year": time.get("iso").strftime("%Y")})
            time.update({"month": time.get("iso").strftime("%m")})
            time.update({"date": time.get("iso").strftime("%d")})
        else:
            time.update({"iso": None})
            time.update({"year": None})
            time.update({"month": None})
            time.update({"date": None})

        contentRating = response.css(
            ".titleBar > .title_wrapper > .subtext > meta[itemprop='contentRating']::attr(content)"
        ).extract_first()
        if contentRating:
            contentRating = contentRating.strip()

        storyline = response.css(
            "#titleStoryLine > div[itemprop='description'] > p::text").extract()
        storyline = "\n".join([
            unicodedata.normalize('NFKD', x.strip()).encode('ascii', 'ignore')
            if x else "" for x in storyline
        ])

        image = response.css("div.poster > a > img::attr(src)").extract_first()
        if image:
            image = convert_photo(image.strip(), "film-list")

        url = self.replaceText(response.url.replace(self.base_url, ''), '?')
        item = FilmDetail()
        item.update({"film_id": re.sub(r"title.+?", "", url).strip("/")})
        item.update({"film_title": name})
        item.update({"film_description_short": short_description})
        item.update({"film_director": person.get("director")})
        item.update({
            "film_writer":
            person.get("writer") if "writer" in person else person.get("writers")
        })
        item.update({
            "film_stars":
            person.get("stars") if "stars" in person else person.get("star")
        })
        item.update({
            "film_creator":
            person.get("creators") if "creators" in person else person.get("creator")
        })
        item.update({"film_rating": rating})
        item.update({"film_genre": genre})
        item.update({"film_photo": image})
        item.update({"film_year": year})
        item.update({"film_length": length})
        item.update({"film_type": type})
        item.update({"film_date_release": time})
        item.update({"film_content_rating": contentRating})
        item.update({"film_storyline": storyline})
        yield item

    def get_year_movie(self, component):
        year = dict()
        if component:
            component = re.sub("[()]", "", component).replace(
                u"\u2013", "5432112345")
            component = re.sub("[^0-9.]+", "", component).split("5432112345")
            if len(component) > 1:
                component[1] = int(
                    component[1]) if component[1] != '' else 'On Going'
            if len(component) > 0 and component[0] != "":
                year = {'start': int(component[0]), 'end': component[1]} if len(component) > 1 \
                    else {'start': int(component[0]), 'end': int(component[0])}
            else:
                year = {'start': 'Info Not Found', 'end': 'Info Not Found'}
            return year
        else:
            return {'start': 'Info Not Found', 'end': 'Info Not Found'}

    def replaceText(self, text, keyword):
        there = re.compile(re.escape('{}'.format(keyword)) + '.*')
        return there.sub('', text)
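# Worked examples (an illustration added for documentation, not part of the original
# code) of how FilmDetailSpider.get_year_movie above normalises IMDB year strings;
# the "5432112345" placeholder only preserves the range separator through the
# digit-only filter:
#
#   u"(2014\u20132016)"  ->  {'start': 2014, 'end': 2016}
#   u"(2014\u2013 )"     ->  {'start': 2014, 'end': 'On Going'}
#   None                 ->  {'start': 'Info Not Found', 'end': 'Info Not Found'}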
class ActressBioSpider(scrapy.Spider):
    name = "actress-bio"
    allowed_domains = []
    base_url = None
    start_urls = []
    collection = None
    pipeline = set([MongoPipeline, RequiredFieldsPipeline])
    required_fields = ["actress_id", "actress_bio"]
    mongo_requirement = {
        "primary": "actress_id",
        "collection": "actress",
        "source": "bio"
    }

    def __init__(self, data):
        scrapy.Spider.__init__(self)
        self.base_url = data.get("base_url")
        self.collection = data.get("collection").get("actress")
        self.allowed_domains = [x for x in data.get("allowed_domains")]
        self.db = MongoPipeline(
            data.get("database").get("host"),
            data.get("database").get("port"),
            data.get("database").get("db"))
        self.start_urls = self.populate_start_urls()

    def populate_start_urls(self):
        # Only crawl actresses whose birth and bio data are still missing.
        BASE_URL = "http://www.imdb.com/name/"
        db = self.db.get("actress", where={
            "actress_birth": {
                "$exists": False
            },
            "actress_bio": {
                "$exists": False
            }
        })
        return [
            '{}{}/bio'.format(BASE_URL, a.get('actress_id'))
            for a in db.get('data')
        ]

    @classmethod
    def from_crawler(cls, crawler):
        data = dict()
        data.update({"base_url": crawler.settings.get("BASE_URL")})
        data.update({"collection": crawler.settings.get("COLLECTION")})
        data.update(
            {"allowed_domains": crawler.settings.get("ALLOWED_DOMAIN")})
        data.update({
            "database": {
                "host": crawler.settings.get("MONGODB_HOST"),
                "port": crawler.settings.get("MONGODB_PORT"),
                "db": crawler.settings.get("MONGODB_DB")
            }
        })
        return cls(data)

    def parse(self, response):
        birth = dict()
        height = None
        birth_date = response.css(
            "#overviewTable > tr:nth-child(1) > td:nth-child(2) > time::attr(datetime)"
        ).extract_first()
        if birth_date:
            birth_date = birth_date.strip()
            birth.update({"date": convert_date(birth_date, "%Y-%m-%d")})
        else:
            birth.update({"date": convert_date('', "%Y-%m-%d")})

        place = response.css(
            "#overviewTable > tr.even > td:nth-child(2) > a::text"
        ).extract_first()
        if place:
            place = place.strip()
            birth.update({
                "place": str(
                    unicodedata.normalize('NFKD', place).encode(
                        'ascii', 'ignore'))
            })
        else:
            birth.update({"place": "Oops data not found"})

        field_key = [
            str(i).lower() for i in response.css(
                "#overviewTable > tr > td.label::text").extract()
        ]
        field_value = [
            unicodedata.normalize('NFKD', self.strip_html(i).strip()).encode(
                'ascii', 'ignore') if i else '-'
            for i in response.css(
                "#overviewTable > tr > td:last-child").extract()
        ]
        key = dict(zip(field_key, field_value))
        # Birth and height go into dedicated fields, so drop them from the
        # generic personal-detail dict.
        if 'born' in key:
            del key['born']
        if 'height' in key:
            height = key.get('height').strip()
            del key['height']

        bio_element = response.css("#bio_content > div.soda")
        bio = ''.join([
            self.strip_html(a.strip())
            for a in bio_element[0].css('p').extract()
        ])

        url = self.replaceText(response.url.replace(self.base_url, ''), '?')
        item = ActressBio()
        item.update({"actress_id": re.sub(r"name.+?", "", url).strip("/")})
        item.update({"actress_height": height})
        item.update({"actress_birth": birth})
        item.update({"actress_personal_detail": key})
        item.update({"actress_bio": bio})
        yield item

    def strip_html(self, data):
        # Preserve <br> breaks through the tag-stripping step, then restore them.
        data = "\line".join(data.split("<br>"))
        p = re.compile(r'<.*?>')
        data = p.sub('', data)
        data = re.sub("\n + ? +", "", data)
        return "\n".join(data.split("\line"))

    def replaceText(self, text, keyword):
        there = re.compile(re.escape('{}'.format(keyword)) + '.*')
        return there.sub('', text)[1:].replace('/bio', '')
class FilmMediaSpider(scrapy.Spider):
    name = "film-media"
    allowed_domains = []
    base_url = None
    start_urls = []
    collection = None
    pipeline = set([MongoPipeline, RequiredFieldsPipeline])
    required_fields = ["film_id"]
    mongo_requirement = {
        "primary": "film_id",
        "collection": "film",
        "source": "media"
    }

    def __init__(self, data):
        scrapy.Spider.__init__(self)
        self.base_url = data.get("base_url")
        self.collection = data.get("collection").get("film")
        self.allowed_domains = [x for x in data.get("allowed_domains")]
        self.db = MongoPipeline(
            data.get("database").get("host"),
            data.get("database").get("port"),
            data.get("database").get("db"))
        self.start_urls = self.populate_start_urls()

    @classmethod
    def from_crawler(cls, crawler):
        data = dict()
        data.update({"base_url": crawler.settings.get("BASE_URL")})
        data.update({"collection": crawler.settings.get("COLLECTION")})
        data.update(
            {"allowed_domains": crawler.settings.get("ALLOWED_DOMAIN")})
        data.update({
            "database": {
                "host": crawler.settings.get("MONGODB_HOST"),
                "port": crawler.settings.get("MONGODB_PORT"),
                "db": crawler.settings.get("MONGODB_DB")
            }
        })
        return cls(data)

    def populate_start_urls(self):
        # Only crawl films that do not have media attached yet.
        BASE_URL = "http://www.imdb.com/title/"
        db = self.db.get(
            "film",
            where={"film_media": {"$exists": False}}
            # , limit=100
        )
        return [
            '{}{}/mediaindex'.format(BASE_URL, a.get('film_id'))
            for a in db.get('data')
        ]

    def parse(self, response):
        tag = response.css(
            "#media_index_thumbnail_grid > a > img::attr(src)").extract()
        media = [convert_photo(index, "media") for index in tag]
        url = self.replaceText(response.url.replace(self.base_url, ''), '?')
        item = FilmPhoto()
        item.update({"film_id": re.sub(r"title.+?", "", url).strip("/")})
        item.update({"film_media": media if type(media) is list else []})
        yield item

    def replaceText(self, text, keyword):
        there = re.compile(re.escape('{}'.format(keyword)) + '.*')
        return there.sub('', text)[1:].replace('/mediaindex', '')
class FilmSynopsisSpider(scrapy.Spider):
    name = "film-synopsis"
    allowed_domains = []
    base_url = None
    start_urls = []
    collection = None
    pipeline = set([MongoPipeline, RequiredFieldsPipeline])
    required_fields = ["film_id"]
    mongo_requirement = {
        "primary": "film_id",
        "collection": "film",
        "source": "synopsis"
    }

    def __init__(self, data):
        scrapy.Spider.__init__(self)
        self.base_url = data.get("base_url")
        self.collection = data.get("collection").get("film")
        self.allowed_domains = [x for x in data.get("allowed_domains")]
        self.db = MongoPipeline(
            data.get("database").get("host"),
            data.get("database").get("port"),
            data.get("database").get("db"))
        self.start_urls = self.populate_start_urls()

    @classmethod
    def from_crawler(cls, crawler):
        data = dict()
        data.update({"base_url": crawler.settings.get("BASE_URL")})
        data.update({"collection": crawler.settings.get("COLLECTION")})
        data.update(
            {"allowed_domains": crawler.settings.get("ALLOWED_DOMAIN")})
        data.update({
            "database": {
                "host": crawler.settings.get("MONGODB_HOST"),
                "port": crawler.settings.get("MONGODB_PORT"),
                "db": crawler.settings.get("MONGODB_DB")
            }
        })
        return cls(data)

    def populate_start_urls(self):
        # Only crawl films that do not have a synopsis yet.
        BASE_URL = "http://www.imdb.com/title/"
        db = self.db.get(
            "film",
            where={"film_synopsis": {"$exists": False}}
            # , limit=78
        )
        return [
            '{}{}/synopsis'.format(BASE_URL, a.get('film_id'))
            for a in db.get('data')
        ]

    def parse(self, response):
        synopsis = response.css(
            "#swiki_body > div:nth-child(2) > .display > div[id*='swiki'] > div::text"
        ).extract()
        synopsis = [
            str(
                unicodedata.normalize('NFKD', x.strip()).encode(
                    'ascii', 'ignore')) if x else "Synopsis is not found"
            for x in synopsis
        ]
        # Fall back to the default message when nothing was extracted at all.
        synopsis = "Synopsis is not found" if not synopsis else "\n".join(synopsis)
        url = self.replaceText(response.url.replace(self.base_url, ''), '?')
        item = FilmSynopsis()
        item.update({"film_id": re.sub(r"title.+?", "", url).strip("/")})
        item.update({"film_synopsis": synopsis})
        yield item

    def replaceText(self, text, keyword):
        there = re.compile(re.escape('{}'.format(keyword)) + '.*')
        return there.sub('', text)[1:].replace('/synopsis', '').replace(
            '/plotsummary', '')
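# Illustrative shape (an assumption for documentation, values are placeholders) of a
# "film" document after the film-detail, film-media and film-synopsis spiders have
# run; every spider upserts against the shared primary key "film_id", which is why
# each populate_start_urls() only selects documents still missing its own fields:
#
#   {
#       "film_id": "tt1398426",
#       "film_title": "...",
#       "film_rating": {"imdb": ..., "metascore": ...},
#       "film_year": {"start": ..., "end": ...},
#       "film_media": ["..."],
#       "film_synopsis": "..."
#   }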
class FilmCrewSpider(scrapy.Spider):
    name = "film-crew"
    allowed_domains = []
    base_url = None
    start_urls = []
    collection = None
    pipeline = set([MongoPipeline, RequiredFieldsPipeline])
    required_fields = ["film_id"]
    mongo_requirement = {
        "primary": "film_id",
        "collection": "film",
        "source": "crew"
    }

    def __init__(self, data):
        scrapy.Spider.__init__(self)
        self.base_url = data.get("base_url")
        self.collection = data.get("collection").get("film")
        self.allowed_domains = [x for x in data.get("allowed_domains")]
        self.db = MongoPipeline(
            data.get("database").get("host"),
            data.get("database").get("port"),
            data.get("database").get("db"))
        self.start_urls = self.populate_start_urls()
        # self.start_urls = ["http://www.imdb.com/title/tt3315342/fullcredits"]

    @classmethod
    def from_crawler(cls, crawler):
        data = dict()
        data.update({"base_url": crawler.settings.get("BASE_URL")})
        data.update({"collection": crawler.settings.get("COLLECTION")})
        data.update(
            {"allowed_domains": crawler.settings.get("ALLOWED_DOMAIN")})
        data.update({
            "database": {
                "host": crawler.settings.get("MONGODB_HOST"),
                "port": crawler.settings.get("MONGODB_PORT"),
                "db": crawler.settings.get("MONGODB_DB")
            }
        })
        return cls(data)

    def populate_start_urls(self):
        # Only crawl films that are still missing crew and cast data.
        BASE_URL = "http://www.imdb.com/title/"
        db = self.db.get("film", where={
            "film_crew": {
                "$exists": False
            },
            "film_cast": {
                "$exists": False
            }
        })
        return [
            '{}{}/fullcredits'.format(BASE_URL, a.get('film_id'))
            for a in db.get('data')
        ]

    def parse(self, response):
        reprocess = dict()
        reprocess.update({"data": []})
        heading = response.css("#fullcredits_content > h4::text").extract()
        table = response.css("#fullcredits_content > table")
        tic()
        heading = [i.strip() if i.strip() != '' else None for i in heading]
        heading = filter(lambda x: x is not None and x != 'Cast', heading)

        table_data = list()
        for item in table:
            temporary = list()
            name = item.css("tr > td.name > a::text").extract()
            link = item.css("tr > td.name > a::attr(href)").extract()
            credit = item.css("tr > td.credit::text").extract()
            for href in link:
                reprocess["data"].append(
                    re.sub(r"name.+?", "", href.strip()).split("?")[0].strip("/"))
            temporary = [{
                "name":
                unicodedata.normalize('NFKD', x.strip()).encode(
                    'ascii', 'ignore'),
                "actress_id":
                re.sub(r"name.+?", "",
                       link[k].strip()).split("?")[0].strip("/"),
                "credit":
                unicodedata.normalize('NFKD', credit[k].strip()).encode(
                    'ascii', 'ignore') if len(credit) > k else None
            } for k, x in enumerate(name)]
            if len(temporary) > 0:
                table_data.append(temporary)
        data = dict(zip(heading, table_data))

        # Cast
        list_cast = list()
        cast_table = response.css(
            "#fullcredits_content > table.cast_list > tr")
        for item in cast_table:
            temporary = dict()
            name = item.css("td.itemprop > a > span::text").extract_first()
            link = item.css("td.itemprop > a::attr(href)").extract_first()
            credit = ""
            credit_div = item.css("td.character > div::text").extract()
            credit_div = ''.join(
                [re.sub("\s\s+", " ", x).lstrip(" ") for x in credit_div])
            credit_link = item.css(
                "td.character > div > a::text").extract_first()
            if credit_link:
                credit = credit_link.strip()
                credit = unicodedata.normalize('NFKD', credit).encode(
                    'ascii', 'ignore')
            if credit_div:
                credit_div = re.sub("\s\s+", " ", credit_div).lstrip(" ")
                credit_div = unicodedata.normalize('NFKD', credit_div).encode(
                    'ascii', 'ignore')
                credit = "{} {}".format(credit, credit_div)
            if name is not None and link is not None:
                temporary.update({
                    "name":
                    unicodedata.normalize('NFKD', name.strip()).encode(
                        'ascii', 'ignore')
                })
                temporary.update({"credit": credit.lstrip(" ")})
                temporary.update({
                    "actress_id":
                    re.sub(r"name.+?", "",
                           link.strip()).split("?")[0].strip("/")
                })
                reprocess["data"].append(temporary.get("actress_id"))
                list_cast.append(temporary)
        toc(save=True, fmt=True)

        url = self.replaceText(response.url.replace(self.base_url, ''), '?')
        reprocess.update({"id": re.sub(r"title.+?", "", url).strip("/")})
        reprocess.update({"status": "checked"})
        self.reprocessActress(reprocess, reprocess.get("id"))

        item = FilmCrew()
        item.update({"film_id": re.sub(r"title.+?", "", url).strip("/")})
        item.update({"film_crew": data})
        item.update({"film_cast": list_cast})
        yield item

    def replaceText(self, text, keyword):
        there = re.compile(re.escape('{}'.format(keyword)) + '.*')
        return there.sub('', text)[1:].replace('/fullcredits', '')

    def reprocessActress(self, data, id):
        # Upsert the list of actress ids that still need their own crawl.
        collection = "reprocess_item_actress"
        result = self.db.get(collection, where={"id": id})
        if result.get("count") == 0:
            self.db.insertOne(collection, data, id)
        else:
            self.db.updateOne(collection, data, {
                'key': "id",
                'value': str(id)
            })
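# Illustrative shape (an assumption for documentation; the ids are placeholders taken
# from the commented example URLs above) of the "reprocess_item_actress" document
# written by reprocessActress: the film id is the key, and "data" lists every actress
# id seen in the credits so a later pass can crawl the ones that are still missing.
#
#   {
#       "id": "tt3315342",
#       "status": "checked",
#       "data": ["nm3592338", "..."]
#   }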
class ActressDetailSpider(scrapy.Spider):
    name = "actress-detail"
    allowed_domains = []
    base_url = None
    start_urls = []
    collection = None
    pipeline = set([MongoPipeline, RequiredFieldsPipeline])
    required_fields = ["actress_filmography", "actress_height"]
    mongo_requirement = {
        "primary": "actress_id",
        "collection": "actress",
        "source": "detail"
    }

    def __init__(self, data):
        scrapy.Spider.__init__(self)
        self.base_url = data.get("base_url")
        self.collection = data.get("collection").get("actress")
        self.allowed_domains = [x for x in data.get("allowed_domains")]
        self.db = MongoPipeline(
            data.get("database").get("host"),
            data.get("database").get("port"),
            data.get("database").get("db"))
        self.start_urls = self.populate_start_urls()
        # self.start_urls = ["http://www.imdb.com/name/nm3592338"]

    @classmethod
    def from_crawler(cls, crawler):
        data = dict()
        data.update({"base_url": crawler.settings.get("BASE_URL")})
        data.update({"collection": crawler.settings.get("COLLECTION")})
        data.update(
            {"allowed_domains": crawler.settings.get("ALLOWED_DOMAIN")})
        data.update({
            "database": {
                "host": crawler.settings.get("MONGODB_HOST"),
                "port": crawler.settings.get("MONGODB_PORT"),
                "db": crawler.settings.get("MONGODB_DB")
            }
        })
        return cls(data)

    def populate_start_urls(self):
        # Only crawl actresses still missing category and filmography data.
        BASE_URL = "http://www.imdb.com/name/"
        db = self.db.get("actress", where={
            "actress_category": {
                "$exists": False
            },
            "actress_filmography": {
                "$exists": False
            }
        })
        return [
            '{}{}'.format(BASE_URL, a.get('actress_id'))
            for a in db.get('data')
        ]

    def parse(self, response):
        reprocess = dict()
        reprocess.update({"data": []})
        name = response.css(
            "#overview-top > h1 > span.itemprop::text").extract_first()
        if name is not None:
            name = name.strip()
        else:
            # Profiles without a photo use a slightly different header markup.
            name = response.css(
                "#overview-top > div.no-pic-wrapper > div > h1 > span.itemprop::text"
            ).extract_first()
            name = name.strip() if name is not None else None

        category = [
            str(item.css('span::text').extract_first().strip().lower())
            for item in response.css("#name-job-categories > a")
        ]

        film_component = response.css("#filmography > div > div.filmo-row")
        filmography = [self.parsingFilm(item) for item in film_component]
        for item in filmography:
            reprocess["data"].append(item.get("film_id"))

        height = response.css("#details-height::text").extract()
        height = unicodedata.normalize('NFKD', height[1].strip()).encode(
            'ascii', 'ignore') if height else '-'

        image = response.css("#name-poster::attr(src)").extract_first()
        if image:
            image = convert_photo(image.strip())

        url = self.replaceText(response.url.replace(self.base_url, ''), '?')
        reprocess.update({"id": re.sub(r"name.+?", "", url).strip("/")})
        reprocess.update({"status": "checked"})
        self.reprocessFilm(reprocess, reprocess.get("id"))

        item = ActressDetail()
        item.update({"actress_id": re.sub(r"name.+?", "", url).strip("/")})
        item.update({"actress_name": name})
        item.update({"actress_category": category})
        item.update({"actress_photo": image})
        item.update({"actress_filmography": filmography})
        item.update({"actress_height": height})
        yield item

    def replaceText(self, text, keyword):
        there = re.compile(re.escape('{}'.format(keyword)) + '.*')
        return there.sub('', text)

    def parsingFilm(self, component):
        tempYear = filter(
            None,
            component.css(
                "span.year_column::text").extract_first().strip().split('-'))
        if len(tempYear) > 0:
            year = {'start': int(re.sub('[^0-9.]+', '', tempYear[0])),
                    'end': int(re.sub('[^0-9.]+', '', tempYear[1]))} \
                if len(tempYear) > 1 \
                else {'start': int(re.sub('[^0-9.]+', '', tempYear[0])),
                      'end': int(re.sub('[^0-9.]+', '', tempYear[0]))}
        else:
            year = {'start': 'Info Not Found', 'end': 'Info Not Found'}
        film = component.css("b > a::text").extract_first().strip()
        id = component.css("b > a::attr(href)").extract_first().strip()
        id = re.sub(r"title.+?", "", id).split("/")[1]
        return {"year": year, "film": film, "film_id": id}

    def reprocessFilm(self, data, id):
        # Upsert the list of film ids that still need their own crawl.
        collection = "reprocess_item_film"
        result = self.db.get(collection, where={"id": id})
        if result.get("count") == 0:
            self.db.insertOne(collection, data, id)
        else:
            self.db.updateOne(collection, data, {
                'key': "id",
                'value': str(id)
            })
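# Typical run order, as a sketch (assuming the standard Scrapy CLI and that the
# spider names above are registered in this project's SPIDER_MODULES): each spider
# only queues documents whose fields its populate_start_urls() query marks as
# missing, and FilmReprocess.reprocess() re-queues film ids discovered by the
# crew/filmography crawls that are not stored yet.
#
#   scrapy crawl film-detail
#   scrapy crawl film-crew
#   scrapy crawl film-media
#   scrapy crawl film-synopsis
#   scrapy crawl actress-detail
#   scrapy crawl actress-bio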