Example #1
    def __init__(self, data):
        scrapy.Spider.__init__(self)
        self.base_url = data.get("base_url")
        self.collection = data.get("collection").get("film")
        self.allowed_domains = [x for x in data.get("allowed_domains")]
        self.db = MongoPipeline(
            data.get("database").get("host"),
            data.get("database").get("port"),
            data.get("database").get("db"))
        self.start_urls = self.populate_start_urls()
Example #2
class FilmReprocess:
    def __init__(self):
        self.db = MongoPipeline(
            MONGODB_HOST,
            MONGODB_PORT,
            MONGODB_DB
        )

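    # Re-queue film ids referenced by "checked" reprocess entries that are
    # still missing from the film collection, then mark each entry as saved.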
    def reprocess(self):
        collection = "reprocess_item_film"

        response = self.db.get(collection, where={"status": "checked"})

        for item in response.get("data"):
            for child in item.get("data"):
                status = self.db.get("film", where={"film_id": child})
                if status.get("count") == 0:
                    self.db.reprocess_item("film", child)

            self.db.updateOne(collection, {"status": "save"}, {'key': "id", 'value': str(item.get("id"))})
Example #3
class FilmDetailSpider(scrapy.Spider):
    name = "film-detail"
    allowed_domains = []
    base_url = None
    start_urls = []
    collection = None
    pipeline = set([MongoPipeline, RequiredFieldsPipeline])
    required_fields = ["film_id"]
    mongo_requirement = {
        "primary": "film_id",
        "collection": "film",
        "source": "list"
    }

    def __init__(self, data):
        scrapy.Spider.__init__(self)
        self.base_url = data.get("base_url")
        self.collection = data.get("collection").get("film")
        self.allowed_domains = [x for x in data.get("allowed_domains")]
        self.db = MongoPipeline(
            data.get("database").get("host"),
            data.get("database").get("port"),
            data.get("database").get("db"))
        self.start_urls = self.populate_start_urls()
        # self.start_urls = ["http://www.imdb.com/title/tt1398426"]

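    # Build the constructor payload (base URL, collections, allowed domains
    # and MongoDB connection info) from the crawler settings.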
    @classmethod
    def from_crawler(cls, crawler):
        data = dict()
        data.update({"base_url": crawler.settings.get("BASE_URL")})
        data.update({"collection": crawler.settings.get("COLLECTION")})
        data.update(
            {"allowed_domains": crawler.settings.get("ALLOWED_DOMAIN")})
        data.update({
            "database": {
                "host": crawler.settings.get("MONGODB_HOST"),
                "port": crawler.settings.get("MONGODB_PORT"),
                "db": crawler.settings.get("MONGODB_DB")
            }
        })
        return cls(data)

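    # Start from films whose detail page has not been scraped yet, i.e. those
    # without a film_stars field in MongoDB.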
    def populate_start_urls(self):
        BASE_URL = "http://www.imdb.com/title/"
        db = self.db.get("film", where={"film_stars": {"$exists": False}})

        return [
            '{}{}'.format(BASE_URL, a.get('film_id')) for a in db.get('data')
        ]

    def parse(self, response):
        name = response.css(
            ".titleBar > .title_wrapper > h1::text").extract_first()
        if name:
            name = name.strip()

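        # The title-bar year looks like "(2013)" or "(2011-2015)"; swap any
        # dash for a numeric sentinel so it survives the digit-only filter,
        # then split it back into start/end parts.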
        temp_year = response.css("#titleYear > a::text").extract_first()
        temp_year = re.sub("[()]", "", temp_year)\
            .replace(u"\u2013", "5432112345").replace(u"-", "5432112345") if temp_year else ""
        temp_year = re.sub("[^0-9.]+", "", temp_year).split("5432112345")

        if len(temp_year) > 1:
            temp_year[1] = int(
                temp_year[1]) if temp_year[1] != '' else 'On Going'

        if len(temp_year) > 0 and temp_year[0] != "":
            year = {'start': int(temp_year[0]), 'end': temp_year[1]} if len(temp_year) > 1 \
                else {'start': int(temp_year[0]), 'end': int(temp_year[0])}
            type = "Movie"
        else:
            component = response.css(
                ".titleBar > .title_wrapper > .subtext > a[href*='title']::text"
            ).extract_first()
            year = self.get_year_movie(component)
            type = "TV Series"

        length = response.css(
            ".titleBar > .title_wrapper > .subtext > time[itemprop='duration']::attr(datetime)"
        ).extract_first()
        length = int(re.sub("[^0-9.]+", "",
                            length.strip())) if length else 'Data not found'

        short_description = response.css(
            ".plot_summary_wrapper > .plot_summary > .summary_text::text"
        ).extract_first()
        if short_description:
            short_description = unicodedata.normalize(
                'NFKD', short_description.strip()).encode('ascii', 'ignore')

        person_title = response.css("div.plot_summary > .credit_summary_item > h4::text") \
            .extract()
        person_title = [
            re.sub("[^a-zA-Z]+", "", item).lower() if item else None
            for item in person_title
        ]

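        # Pair each credit heading (director, writers, stars, creators) with
        # the list of {name, id} entries scraped from the same block below.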
        person_value_html = response.css(
            "div.plot_summary > .credit_summary_item")
        person_value = list()
        for item in person_value_html:
            user = item.css("span > a > span::text").extract()
            user = [{
                "name":
                str(
                    unicodedata.normalize('NFKD', x.strip()).encode(
                        'ascii', 'ignore'))
            } if x else {
                "name": None
            } for x in user]

            link = item.css("span > a::attr(href)").extract()
            link = [{
                "id": str(x.strip().split("?")[0].replace("/name/", ""))
            } if x else {
                "id": None
            } for x in link]

            for i, x in enumerate(user):
                x.update(link[i])

            person_value.append(user)

        person = dict(zip(person_title, person_value))

        rating = dict()
        imdbRating = response.css(
            "div.imdbRating > div.ratingValue > strong > span::text"
        ).extract_first()
        if imdbRating:
            imdbRating = float(imdbRating.strip().replace(",", "."))
        else:
            imdbRating = "IMDB rating is not found"

        metascoreRating = response.css(
            "div.titleReviewBar > .titleReviewBarItem > a > div.metacriticScore > span::text"
        ).extract_first()
        if metascoreRating:
            metascoreRating = int(metascoreRating.strip().replace(",", "."))
        else:
            metascoreRating = "Metascore rating is not found"

        rating.update({"imdb": imdbRating, "metascore": metascoreRating})

        genreElement = response.css(
            ".titleBar > .title_wrapper > .subtext > a[href*='genre']")
        genre = list()
        for item in genreElement:
            genreItems = item.css("span::text").extract_first()
            if genreItems:
                genre.append(str(genreItems.strip()))

        time = dict()
        date_release = response.css(
            ".titleBar > .title_wrapper > .subtext > a[href*='title'] > meta::attr(content)"
        ).extract_first()
        if date_release:
            try:
                time.update({
                    "iso":
                    datetime.strptime(date_release.strip(), '%Y-%m-%d')
                })
            except ValueError:
                # Some releases only expose the year.
                time.update(
                    {"iso": datetime.strptime(date_release.strip(), '%Y')})

            time.update({"year": time.get("iso").strftime("%Y")})
            time.update({"month": time.get("iso").strftime("%m")})
            time.update({"date": time.get("iso").strftime("%d")})
        else:
            time.update({"iso": None})
            time.update({"year": None})
            time.update({"month": None})
            time.update({"date": None})

        contentRating = response.css(
            ".titleBar > .title_wrapper > .subtext > meta[itemprop='contentRating']::attr(content)"
        ).extract_first()
        if contentRating:
            contentRating = contentRating.strip()

        storyline = response.css(
            "#titleStoryLine > div[itemprop='description'] > p::text").extract(
            )
        storyline = "\n".join([
            unicodedata.normalize('NFKD', x.strip()).encode('ascii', 'ignore')
            if x else "" for x in storyline
        ])

        image = response.css("div.poster > a > img::attr(src)").extract_first()
        if image:
            image = convert_photo(image.strip(), "film-list")

        url = self.replaceText(response.url.replace(self.base_url, ''), '?')

        item = FilmDetail()
        item.update({"film_id": re.sub(r"title.+?", "", url).strip("/")})
        item.update({"film_title": name})
        item.update({"film_description_short": short_description})
        item.update({"film_director": person.get("director")})
        item.update({
            "film_writer":
            person.get("writer")
            if "writer" in person else person.get("writers")
        })
        item.update({
            "film_stars":
            person.get("stars") if "stars" in person else person.get("star")
        })
        item.update({
            "film_creator":
            person.get("creators")
            if "creators" in person else person.get("creator")
        })
        item.update({"film_rating": rating})
        item.update({"film_genre": genre})
        item.update({"film_photo": image})
        item.update({"film_year": year})
        item.update({"film_length": length})
        item.update({"film_type": type})
        item.update({"film_date_release": time})
        item.update({"film_content_rating": contentRating})
        item.update({"film_storyline": storyline})

        yield item

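    # Same year-range parsing as in parse(), applied to the subtext link used
    # by TV series that have no #titleYear element.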
    def get_year_movie(self, component):
        year = dict()
        if component:
            component = re.sub("[()]", "", component).replace(
                u"\u2013", "5432112345") if component else ""
            component = re.sub("[^0-9.]+", "", component).split("5432112345")

            if len(component) > 1:
                component[1] = int(
                    component[1]) if component[1] != '' else 'On Going'

            if len(component) > 0 and component[0] != "":
                year = {'start': int(component[0]), 'end': component[1]} if len(component) > 1 \
                    else {'start': int(component[0]), 'end': int(component[0])}
            else:
                year = {'start': 'Info Not Found', 'end': 'Info Not Found'}

            return year
        else:
            return {'start': 'Info Not Found', 'end': 'Info Not Found'}

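    # Drop everything from `keyword` onward; used here to strip query strings
    # from response URLs.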
    def replaceText(self, text, keyword):
        there = re.compile(re.escape('{}'.format(keyword)) + '.*')
        return there.sub('', text)
Example #4
class ActressDetailSpider(scrapy.Spider):
    name = "actress-bio"
    allowed_domains = []
    base_url = None
    start_urls = []
    collection = None
    pipeline = set([MongoPipeline, RequiredFieldsPipeline])
    required_fields = ["actress_id", "actress_bio"]
    mongo_requirement = {
        "primary": "actress_id",
        "collection": "actress",
        "source": "bio"
    }

    def __init__(self, data):
        scrapy.Spider.__init__(self)
        self.base_url = data.get("base_url")
        self.collection = data.get("collection").get("actress")
        self.allowed_domains = [x for x in data.get("allowed_domains")]
        self.db = MongoPipeline(
            data.get("database").get("host"),
            data.get("database").get("port"),
            data.get("database").get("db"))

        self.start_urls = self.populate_start_urls()

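    # Start from actresses that have neither birth data nor a bio stored yet.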
    def populate_start_urls(self):
        BASE_URL = "http://www.imdb.com/name/"
        db = self.db.get("actress",
                         where={
                             "actress_birth": {
                                 "$exists": False
                             },
                             "actress_bio": {
                                 "$exists": False
                             }
                         })

        return [
            '{}{}/bio'.format(BASE_URL, a.get('actress_id'))
            for a in db.get('data')
        ]

    @classmethod
    def from_crawler(cls, crawler):
        data = dict()
        data.update({"base_url": crawler.settings.get("BASE_URL")})
        data.update({"collection": crawler.settings.get("COLLECTION")})
        data.update(
            {"allowed_domains": crawler.settings.get("ALLOWED_DOMAIN")})
        data.update({
            "database": {
                "host": crawler.settings.get("MONGODB_HOST"),
                "port": crawler.settings.get("MONGODB_PORT"),
                "db": crawler.settings.get("MONGODB_DB")
            }
        })
        return cls(data)

    def parse(self, response):
        birth = dict()
        height = None

        birth_date = response.css("#overviewTable > tr:nth-child(1) > td:nth-child(2) > time::attr(datetime)")\
            .extract_first()
        if birth_date:
            birth_date = birth_date.strip()
            birth.update({"date": convert_date(birth_date, "%Y-%m-%d")})
        else:
            birth.update({"date": convert_date('', "%Y-%m-%d")})

        place = response.css(
            "#overviewTable > tr.even > td:nth-child(2) > a::text"
        ).extract_first()
        if place:
            place = place.strip()
            birth.update({
                "place":
                str(
                    unicodedata.normalize('NFKD',
                                          place).encode('ascii', 'ignore'))
            })
        else:
            birth.update({"place": "Oops data not found"})

        field_key = [
            str(i).lower() for i in response.css(
                "#overviewTable > tr > td.label::text").extract()
        ]
        field_value = [
            unicodedata.normalize('NFKD',
                                  self.strip_html(i).strip()).encode(
                                      'ascii', 'ignore') if i else '-' for i in
            response.css("#overviewTable > tr > td:last-child").extract()
        ]

        key = dict(zip(field_key, field_value))
        if 'born' in key: del key['born']
        if 'height' in key:
            height = key.get('height').strip()
            del key['height']

        bio_element = response.css("#bio_content > div.soda")
        bio = ''.join([
            self.strip_html(a.strip())
            for a in bio_element[0].css('p').extract()
        ])

        url = self.replaceText(response.url.replace(self.base_url, ''), '?')

        item = ActressBio()
        item.update({"actress_id": re.sub(r"name.+?", "", url).strip("/")})
        item.update({"actress_height": height})
        item.update({"actress_birth": birth})
        item.update({"actress_personal_detail": key})
        item.update({"actress_bio": bio})

        yield item

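    # Strip HTML tags from a fragment while keeping <br> line breaks.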
    def strip_html(self, data):
        data = "\line".join(data.split("<br>"))
        p = re.compile(r'<.*?>')
        data = p.sub('', data)
        data = re.sub("\n + ? +", "", data)
        return "\n".join(data.split("\line"))

    def replaceText(self, text, keyword):
        there = re.compile(re.escape('{}'.format(keyword)) + '.*')
        return there.sub('', text)[1:].replace('/bio', '')
Example #5
class FilmSynopsisSpider(scrapy.Spider):
    name = "film-media"
    allowed_domains = []
    base_url = None
    start_urls = []
    collection = None
    pipeline = set([MongoPipeline, RequiredFieldsPipeline])
    required_fields = ["film_id"]
    mongo_requirement = {
        "primary": "film_id",
        "collection": "film",
        "source": "media"
    }

    def __init__(self, data):
        scrapy.Spider.__init__(self)
        self.base_url = data.get("base_url")
        self.collection = data.get("collection").get("film")
        self.allowed_domains = [x for x in data.get("allowed_domains")]
        self.db = MongoPipeline(
            data.get("database").get("host"),
            data.get("database").get("port"),
            data.get("database").get("db"))

        self.start_urls = self.populate_start_urls()

    @classmethod
    def from_crawler(cls, crawler):
        data = dict()
        data.update({"base_url": crawler.settings.get("BASE_URL")})
        data.update({"collection": crawler.settings.get("COLLECTION")})
        data.update(
            {"allowed_domains": crawler.settings.get("ALLOWED_DOMAIN")})
        data.update({
            "database": {
                "host": crawler.settings.get("MONGODB_HOST"),
                "port": crawler.settings.get("MONGODB_PORT"),
                "db": crawler.settings.get("MONGODB_DB")
            }
        })
        return cls(data)

    def populate_start_urls(self):
        BASE_URL = "http://www.imdb.com/title/"
        db = self.db.get("film",
                         where={"film_media": {
                             "$exists": False
                         }}
                         # ,
                         # limit=100
                         )

        return [
            '{}{}/mediaindex'.format(BASE_URL, a.get('film_id'))
            for a in db.get('data')
        ]

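    # Collect the thumbnail URLs from the media index grid and store them as
    # film_media.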
    def parse(self, response):
        tag = response.css(
            "#media_index_thumbnail_grid > a > img::attr(src)").extract()
        media = [convert_photo(index, "media") for index in tag]
        url = self.replaceText(response.url.replace(self.base_url, ''), '?')

        item = FilmPhoto()
        item.update({"film_id": re.sub(r"title.+?", "", url).strip("/")})
        item.update({"film_media": media if type(media) is list else []})
        yield item

    def replaceText(self, text, keyword):
        there = re.compile(re.escape('{}'.format(keyword)) + '.*')
        return there.sub('', text)[1:].replace('/mediaindex', '')
Example #6
class FilmSynopsisSpider(scrapy.Spider):
    name = "film-synopsis"
    allowed_domains = []
    base_url = None
    start_urls = []
    collection = None
    pipeline = set([MongoPipeline, RequiredFieldsPipeline])
    required_fields = ["film_id"]
    mongo_requirement = {
        "primary": "film_id",
        "collection": "film",
        "source": "synopsis"
    }

    def __init__(self, data):
        scrapy.Spider.__init__(self)
        self.base_url = data.get("base_url")
        self.collection = data.get("collection").get("film")
        self.allowed_domains = [x for x in data.get("allowed_domains")]
        self.db = MongoPipeline(
            data.get("database").get("host"),
            data.get("database").get("port"),
            data.get("database").get("db"))

        self.start_urls = self.populate_start_urls()

    @classmethod
    def from_crawler(cls, crawler):
        data = dict()
        data.update({"base_url": crawler.settings.get("BASE_URL")})
        data.update({"collection": crawler.settings.get("COLLECTION")})
        data.update(
            {"allowed_domains": crawler.settings.get("ALLOWED_DOMAIN")})
        data.update({
            "database": {
                "host": crawler.settings.get("MONGODB_HOST"),
                "port": crawler.settings.get("MONGODB_PORT"),
                "db": crawler.settings.get("MONGODB_DB")
            }
        })
        return cls(data)

    def populate_start_urls(self):
        BASE_URL = "http://www.imdb.com/title/"
        db = self.db.get("film",
                         where={"film_synopsis": {
                             "$exists": False
                         }}
                         # ,
                         # limit=78
                         )

        return [
            '{}{}/synopsis'.format(BASE_URL, a.get('film_id'))
            for a in db.get('data')
        ]

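    # Join the synopsis paragraphs, falling back to a "not found" marker when
    # the page has no text.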
    def parse(self, response):
        synopsis = response.css(
            "#swiki_body > div:nth-child(2) > .display > div[id*='swiki'] > div::text"
        ).extract()
        synopsis = [
            str(
                unicodedata.normalize('NFKD', x.strip()).encode(
                    'ascii', 'ignore')) if x else "Synopsis is not found"
            for x in synopsis
        ]

        synopsis = "Synopsis is not found" if len(
            synopsis) == 1 and synopsis[0] == "" else "\n".join(synopsis)
        url = self.replaceText(response.url.replace(self.base_url, ''), '?')

        item = FilmSynopsis()
        item.update({"film_id": re.sub(r"title.+?", "", url).strip("/")})
        item.update({"film_synopsis": synopsis})

        yield item

    def replaceText(self, text, keyword):
        there = re.compile(re.escape('{}'.format(keyword)) + '.*')
        return there.sub('', text)[1:].replace('/synopsis',
                                               '').replace('/plotsummary', '')
Example #7
    def __init__(self):
        self.db = MongoPipeline(
            MONGODB_HOST,
            MONGODB_PORT,
            MONGODB_DB
        )
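All of these snippets rely on a project-local MongoPipeline helper whose implementation is not shown here. A minimal sketch of the interface they assume (method names and return shapes inferred from the calls above, not from the real class; the reprocess_item method used by FilmReprocess is omitted):

# Assumed sketch only: the real MongoPipeline in this project is not shown.
import pymongo


class MongoPipeline(object):
    def __init__(self, host, port, db):
        self.client = pymongo.MongoClient(host, int(port))
        self.db = self.client[db]

    def get(self, collection, where=None):
        # The spiders expect a dict with "count" and "data" keys.
        docs = list(self.db[collection].find(where or {}))
        return {"count": len(docs), "data": docs}

    def insertOne(self, collection, data, id):
        # `id` is accepted for call compatibility but unused in this sketch.
        self.db[collection].insert_one(dict(data))

    def updateOne(self, collection, data, selector):
        # selector is {"key": <field name>, "value": <field value>}.
        self.db[collection].update_one(
            {selector["key"]: selector["value"]}, {"$set": dict(data)})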
Example #8
class FilmDetailSpider(scrapy.Spider):
    name = "film-crew"
    allowed_domains = []
    base_url = None
    start_urls = []
    collection = None
    pipeline = set([MongoPipeline, RequiredFieldsPipeline])
    required_fields = ["film_id"]
    mongo_requirement = {
        "primary": "film_id",
        "collection": "film",
        "source": "crew"
    }

    def __init__(self, data):
        scrapy.Spider.__init__(self)
        self.base_url = data.get("base_url")
        self.collection = data.get("collection").get("film")
        self.allowed_domains = [x for x in data.get("allowed_domains")]
        self.db = MongoPipeline(
            data.get("database").get("host"),
            data.get("database").get("port"),
            data.get("database").get("db"))
        self.start_urls = self.populate_start_urls()
        # self.start_urls = ["http://www.imdb.com/title/tt3315342/fullcredits"]

    @classmethod
    def from_crawler(cls, crawler):
        data = dict()
        data.update({"base_url": crawler.settings.get("BASE_URL")})
        data.update({"collection": crawler.settings.get("COLLECTION")})
        data.update(
            {"allowed_domains": crawler.settings.get("ALLOWED_DOMAIN")})
        data.update({
            "database": {
                "host": crawler.settings.get("MONGODB_HOST"),
                "port": crawler.settings.get("MONGODB_PORT"),
                "db": crawler.settings.get("MONGODB_DB")
            }
        })
        return cls(data)

    def populate_start_urls(self):
        BASE_URL = "http://www.imdb.com/title/"
        db = self.db.get("film",
                         where={
                             "film_crew": {
                                 "$exists": False
                             },
                             "film_cast": {
                                 "$exists": False
                             }
                         })

        return [
            '{}{}/fullcredits'.format(BASE_URL, a.get('film_id'))
            for a in db.get('data')
        ]

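    # Parse the full-credits page: crew tables keyed by their heading (Cast
    # excluded), the cast list itself, and a reprocess queue of every person
    # id seen on the page.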
    def parse(self, response):
        reprocess = dict()
        reprocess.update({"data": []})

        heading = response.css("#fullcredits_content > h4::text").extract()
        table = response.css("#fullcredits_content > table")
        tic()
        heading = [i.strip() if i.strip() != '' else None for i in heading]
        heading = filter(lambda x: x is not None and x != 'Cast', heading)

        table_data = list()
        for item in table:
            temporary = list()
            name = item.css("tr > td.name > a::text").extract()
            link = item.css("tr > td.name > a::attr(href)").extract()
            credit = item.css("tr > td.credit::text").extract()

            for href in link:
                reprocess["data"].append(
                    re.sub(r"name.+?", "",
                           href.strip()).split("?")[0].strip("/"))

            temporary = [{
                "name":
                unicodedata.normalize('NFKD',
                                      x.strip()).encode('ascii', 'ignore'),
                "actress_id":
                re.sub(r"name.+?", "",
                       link[k].strip()).split("?")[0].strip("/"),
                "credit":
                unicodedata.normalize('NFKD', credit[k].strip()).encode(
                    'ascii', 'ignore') if len(credit) > k else None
            } for k, x in enumerate(name)]

            if len(temporary) > 0:
                table_data.append(temporary)

        data = dict(zip(heading, table_data))

        # Cast
        list_cast = list()
        cast_table = response.css(
            "#fullcredits_content > table.cast_list > tr")
        for item in cast_table:
            temporary = dict()
            name = item.css("td.itemprop > a > span::text").extract_first()
            link = item.css("td.itemprop > a::attr(href)").extract_first()
            credit = ""
            credit_div = item.css("td.character > div::text").extract()
            credit_div = ''.join(
                [re.sub("\s\s+", " ", x).lstrip(" ") for x in credit_div])
            credit_link = item.css(
                "td.character > div > a::text").extract_first()
            if credit_link:
                credit = credit_link.strip()
                credit = unicodedata.normalize('NFKD', credit).encode(
                    'ascii', 'ignore')

            if credit_div:
                credit_div = re.sub("\s\s+", " ", credit_div).lstrip(" ")
                credit_div = unicodedata.normalize('NFKD', credit_div).encode(
                    'ascii', 'ignore')
                credit = "{} {}".format(credit, credit_div)

            if name is not None and link is not None:
                temporary.update({
                    "name":
                    unicodedata.normalize('NFKD', name.strip()).encode(
                        'ascii', 'ignore')
                })
                temporary.update({"credit": credit.lstrip(" ")})
                temporary.update({
                    "actress_id":
                    re.sub(r"name.+?", "",
                           link.strip()).split("?")[0].strip("/")
                })

                reprocess["data"].append(temporary.get("actress_id"))

                list_cast.append(temporary)

        toc(save=True, fmt=True)
        url = self.replaceText(response.url.replace(self.base_url, ''), '?')

        reprocess.update({"id": re.sub(r"title.+?", "", url).strip("/")})
        reprocess.update({"status": "checked"})
        self.reprocessActress(reprocess, reprocess.get("id"))

        item = FilmCrew()
        item.update({"film_id": re.sub(r"title.+?", "", url).strip("/")})
        item.update({"film_crew": data})
        item.update({"film_cast": list_cast})

        yield item

    def replaceText(self, text, keyword):
        there = re.compile(re.escape('{}'.format(keyword)) + '.*')
        return there.sub('', text)[1:].replace('/fullcredits', '')

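    # Upsert the collected person ids into reprocess_item_actress, keyed by
    # film id.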
    def reprocessActress(self, data, id):
        collection = "reprocess_item_actress"

        result = self.db.get(collection, where={"id": id})

        if result.get("count") == 0:
            self.db.insertOne(collection, data, id)
        else:
            self.db.updateOne(collection, data, {
                'key': "id",
                'value': str(id)
            })
Example #9
class ActressDetailSpider(scrapy.Spider):
    name = "actress-detail"
    allowed_domains = []
    base_url = None
    start_urls = []
    collection = None
    pipeline = set([MongoPipeline, RequiredFieldsPipeline])
    required_fields = ["actress_filmography", "actress_height"]
    mongo_requirement = {
        "primary": "actress_id",
        "collection": "actress",
        "source": "detail"
    }

    def __init__(self, data):
        scrapy.Spider.__init__(self)
        self.base_url = data.get("base_url")
        self.collection = data.get("collection").get("actress")
        self.allowed_domains = [x for x in data.get("allowed_domains")]
        self.db = MongoPipeline(
            data.get("database").get("host"),
            data.get("database").get("port"),
            data.get("database").get("db"))

        self.start_urls = self.populate_start_urls()
        # self.start_urls = ["http://www.imdb.com/name/nm3592338"]

    @classmethod
    def from_crawler(cls, crawler):
        data = dict()
        data.update({"base_url": crawler.settings.get("BASE_URL")})
        data.update({"collection": crawler.settings.get("COLLECTION")})
        data.update(
            {"allowed_domains": crawler.settings.get("ALLOWED_DOMAIN")})
        data.update({
            "database": {
                "host": crawler.settings.get("MONGODB_HOST"),
                "port": crawler.settings.get("MONGODB_PORT"),
                "db": crawler.settings.get("MONGODB_DB")
            }
        })
        return cls(data)

    def populate_start_urls(self):
        BASE_URL = "http://www.imdb.com/name/"
        db = self.db.get("actress",
                         where={
                             "actress_category": {
                                 "$exists": False
                             },
                             "actress_filmography": {
                                 "$exists": False
                             }
                         })

        return [
            '{}{}'.format(BASE_URL, a.get('actress_id'))
            for a in db.get('data')
        ]

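    # Scrape the actress overview page: name, job categories, filmography,
    # height and photo, and queue the listed film ids for reprocessing.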
    def parse(self, response):
        reprocess = dict()
        reprocess.update({"data": []})
        name = response.css(
            "#overview-top > h1 > span.itemprop::text").extract_first()
        if name is not None:
            name = name.strip()
        else:
            name = response.css(
                "#overview-top > div.no-pic-wrapper > div > h1 > span.itemprop::text"
            ).extract_first()
            name = name.strip() if name is not None else None

        category = [
            str(item.css('span::text').extract_first().strip().lower())
            for item in response.css("#name-job-categories > a")
        ]

        film_component = response.css("#filmography > div > div.filmo-row")
        filmography = [self.parsingFilm(item) for item in film_component]
        for item in filmography:
            reprocess["data"].append(item.get("film_id"))

        height = response.css("#details-height::text").extract()
        height = unicodedata.normalize('NFKD', height[1].strip()).encode(
            'ascii', 'ignore') if height else '-'

        image = response.css("#name-poster::attr(src)").extract_first()
        if image:
            image = convert_photo(image.strip())

        url = self.replaceText(response.url.replace(self.base_url, ''), '?')

        reprocess.update({"id": re.sub(r"name.+?", "", url).strip("/")})
        reprocess.update({"status": "checked"})
        self.reprocessFilm(reprocess, reprocess.get("id"))

        item = ActressDetail()
        item.update({"actress_id": re.sub(r"name.+?", "", url).strip("/")})
        item.update({"actress_name": name})
        item.update({"actress_category": category})
        item.update({"actress_photo": image})
        item.update({"actress_filmography": filmography})
        item.update({"actress_height": height})

        yield item

    def replaceText(self, text, keyword):
        there = re.compile(re.escape('{}'.format(keyword)) + '.*')
        return there.sub('', text)

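    # Turn one filmography row into {"year": ..., "film": ..., "film_id": ...}.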
    def parsingFilm(self, component):
        tempYear = filter(
            None,
            component.css(
                "span.year_column::text").extract_first().strip().split('-'))

        if len(tempYear) > 0:
            year = {'start': int(re.sub('[^0-9.]+', '', tempYear[0])), 'end': int(re.sub('[^0-9.]+', '', tempYear[1]))} \
                if len(tempYear) > 1 \
                else {'start': int(re.sub('[^0-9.]+', '', tempYear[0])), 'end': int(re.sub('[^0-9.]+', '', tempYear[0]))}
        else:
            year = {'start': 'Info Not Found', 'end': 'Info Not Found'}

        film = component.css("b > a::text").extract_first().strip()
        id = component.css("b > a::attr(href)").extract_first().strip()
        id = re.sub(r"title.+?", "", id).split("/")[1]

        return {"year": year, "film": film, "film_id": id}

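    # Upsert the collected film ids into reprocess_item_film, keyed by
    # actress id.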
    def reprocessFilm(self, data, id):
        collection = "reprocess_item_film"

        result = self.db.get(collection, where={"id": id})

        if result.get("count") == 0:
            self.db.insertOne(collection, data, id)
        else:
            self.db.updateOne(collection, data, {
                'key': "id",
                'value': str(id)
            })