예제 #1
0
    def __init__(self, name=None, **kwargs):
        """Wire up the cache/history/item database handles and the parser.

        Connection settings (host, port, user, passwd, db, collection_name)
        are expected in ``kwargs``, as injected by ``from_crawler``.
        """
        from cwharaj.database_factory import DatabaseFactory, CollectionTypes

        factory = DatabaseFactory(kwargs['host'], kwargs['port'],
                                  kwargs['user'], kwargs['passwd'],
                                  kwargs['db'], kwargs['collection_name'])

        # One handle per collection kind.
        self._cache_db = factory.get_database(CollectionTypes.cache)
        self._history_db = factory.get_database(CollectionTypes.history)
        self._item_db = factory.get_database(CollectionTypes.item)

        from cwharaj.parser.opensooq_parser import OpensooqParse
        self.opensooq_parse = OpensooqParse()

        super(OpensooqDebugWatchSpider, self).__init__(name, **kwargs)
예제 #2
0
class OpensooqDebugWatchSpider(scrapy.Spider):
    """Debug spider that fetches one Opensooq listing page, records the
    pagination rows, and parses the page as an item via the cwharaj layer."""

    name = "opensooqwatch_debug"
    # Bug fix: Scrapy's allowed_domains takes bare domains, not URLs —
    # full URLs trigger a URLWarning and break the offsite filter.
    allowed_domains = ["sa.opensooq.com", "www.mstaml.com"]

    start_urls = [
        # paginate
        "https://sa.opensooq.com/ar/find?term=&cat_id=&scid=&city=&allposts_cb=true&allposts=no&price_from=&price_to=",
        # detail
    ]

    def __init__(self, name=None, **kwargs):
        """Build the cache/history/item database handles and the page parser.

        :param name: optional spider name override.
        :param kwargs: must contain ``host``, ``port``, ``user``, ``passwd``,
            ``db`` and ``collection_name`` (injected by ``from_crawler``).
        """
        from cwharaj.database_factory import DatabaseFactory, CollectionTypes
        database_factory = DatabaseFactory(kwargs['host'], kwargs['port'],
                                           kwargs['user'], kwargs['passwd'],
                                           kwargs['db'],
                                           kwargs['collection_name'])

        self._cache_db = database_factory.get_database(CollectionTypes.cache)
        self._history_db = database_factory.get_database(
            CollectionTypes.history)
        self._item_db = database_factory.get_database(CollectionTypes.item)

        from cwharaj.parser.opensooq_parser import OpensooqParse
        self.opensooq_parse = OpensooqParse()

        super(OpensooqDebugWatchSpider, self).__init__(name, **kwargs)

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Create the spider, injecting SQL settings as keyword arguments."""
        # Bug fix: forward positional arguments with *args; previously the
        # tuple itself was passed as a single positional argument and ended
        # up as the spider ``name``.
        return super(OpensooqDebugWatchSpider, cls).from_crawler(
            crawler,
            *args,
            host=crawler.settings.get('SQL_HOST'),
            port=crawler.settings.get('SQL_PORT'),
            user=crawler.settings.get('SQL_USER'),
            passwd=crawler.settings.get('SQL_PASSWD'),
            db=crawler.settings.get('SQL_DB'),
            collection_name=crawler.settings.get('SQL_COLLECTION_NAME'))

    def parse(self, response):
        """Record the pagination rows of the page, then parse it as an item."""
        self.opensooq_parse.parse_paginate(response.url, response,
                                           self._cache_db, self._history_db)
        # Return value intentionally discarded (debug spider).
        self.opensooq_parse.parse(response.url, response, self._item_db)
    def __init__(self, name=None, **kwargs):
        """Create the database handles and the Opensooq parser.

        Connection settings (host, port, user, passwd, db, collection_name)
        are expected in ``kwargs``, as injected by ``from_crawler``.
        """
        from cwharaj.database_factory import DatabaseFactory, CollectionTypes

        factory = DatabaseFactory(kwargs['host'], kwargs['port'],
                                  kwargs['user'], kwargs['passwd'],
                                  kwargs['db'], kwargs['collection_name'])

        # One handle per collection kind.
        self._cache_db = factory.get_database(CollectionTypes.cache)
        self._history_db = factory.get_database(CollectionTypes.history)
        self._item_db = factory.get_database(CollectionTypes.item)

        from cwharaj.parser.opensooq_parser import OpensooqParse
        self.opensooq_parse = OpensooqParse()

        super(OpensooqDebugCommentDateSpider, self).__init__(name, **kwargs)
예제 #4
0
    def __init__(self, name=None, **kwargs):
        """Pick crawl targets for the configured website and open the DBs.

        Connection settings (host, port, user, passwd, db, collection_name)
        are expected in ``kwargs``, as injected by ``from_crawler``.
        """
        self.allowed_domains = [websites_allowed_domains.get(self.url_from)]

        # Pagination mode crawls the listing URL; otherwise the detail URLs.
        self.start_urls = (
            [WebsiteTypes.get_pagination_url(self.url_from)]
            if is_pagination else self.details_urls)

        from cwharaj.database_factory import DatabaseFactory, CollectionTypes

        factory = DatabaseFactory(kwargs['host'], kwargs['port'],
                                  kwargs['user'], kwargs['passwd'],
                                  kwargs['db'], kwargs['collection_name'])

        # One handle per collection kind.
        self._cache_db = factory.get_database(CollectionTypes.cache)
        self._history_db = factory.get_database(CollectionTypes.history)
        self._item_db = factory.get_database(CollectionTypes.item)

        from cwharaj.parser.opensooq_parser import OpensooqParse
        self._parser = OpensooqParse()

        super(OpensooqDebugSpider, self).__init__(name, **kwargs)
    def __init__(self, name=None, **kwargs):
        """Select the crawl targets for this website and open the databases.

        Connection settings (host, port, user, passwd, db, collection_name)
        are expected in ``kwargs``, as injected by ``from_crawler``.
        """
        self.allowed_domains = [websites_allowed_domains.get(self.url_from)]

        # Pagination mode crawls the listing URL; otherwise the detail URLs.
        self.start_urls = (
            [WebsiteTypes.get_pagination_url(self.url_from)]
            if is_pagination else self.details_urls)

        from cwharaj.database_factory import DatabaseFactory, CollectionTypes

        factory = DatabaseFactory(kwargs['host'], kwargs['port'],
                                  kwargs['user'], kwargs['passwd'],
                                  kwargs['db'], kwargs['collection_name'])

        # One handle per collection kind.
        self._cache_db = factory.get_database(CollectionTypes.cache)
        self._history_db = factory.get_database(CollectionTypes.history)
        self._item_db = factory.get_database(CollectionTypes.item)

        from cwharaj.parser.opensooq_parser import OpensooqParse
        self._parser = OpensooqParse()

        super(OpensooqDebugSpider, self).__init__(name, **kwargs)
class OpensooqDebugCommentDateSpider(scrapy.Spider):
    """Debug spider that walks cached Opensooq detail pages and persists the
    date of every comment found on them."""

    name = "opensooq_commentdate_debug"
    # Bug fix: Scrapy's allowed_domains takes bare domains, not URLs —
    # full URLs trigger a URLWarning and break the offsite filter.
    allowed_domains = [
        "sa.opensooq.com",
    ]

    opensooq_pagination = 'https://sa.opensooq.com/ar/find?term=&cat_id=&scid=&city=&allposts_cb=true&allposts=no&price_from=&price_to=&page={}'
    opensooq_pagination_start_page = 20
    opensooq_pagination_total_page = 5
    start_urls = [
        # paginate
        opensooq_pagination.format(opensooq_pagination_start_page)
        # detail
        # 'https://sa.opensooq.com/ar/search/29602021/بيت-شعبي-مع-مجلس-مسلح-للبيع'  # 8 comments
    ]

    def __init__(self, name=None, **kwargs):
        """Build the cache/history/item database handles and the page parser.

        :param name: optional spider name override.
        :param kwargs: must contain ``host``, ``port``, ``user``, ``passwd``,
            ``db`` and ``collection_name`` (injected by ``from_crawler``).
        """
        from cwharaj.database_factory import DatabaseFactory, CollectionTypes
        database_factory = DatabaseFactory(kwargs['host'], kwargs['port'],
                                           kwargs['user'], kwargs['passwd'],
                                           kwargs['db'], kwargs['collection_name'])

        self._cache_db = database_factory.get_database(CollectionTypes.cache)
        self._history_db = database_factory.get_database(CollectionTypes.history)
        self._item_db = database_factory.get_database(CollectionTypes.item)

        from cwharaj.parser.opensooq_parser import OpensooqParse
        self.opensooq_parse = OpensooqParse()

        super(OpensooqDebugCommentDateSpider, self).__init__(name, **kwargs)

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Create the spider, injecting SQL settings as keyword arguments."""
        # Bug fix: forward positional arguments with *args; previously the
        # tuple itself was passed as a single positional argument and ended
        # up as the spider ``name``.
        return super(OpensooqDebugCommentDateSpider, cls).from_crawler(
            crawler,
            *args,
            host=crawler.settings.get('SQL_HOST'),
            port=crawler.settings.get('SQL_PORT'),
            user=crawler.settings.get('SQL_USER'),
            passwd=crawler.settings.get('SQL_PASSWD'),
            db=crawler.settings.get('SQL_DB'),
            collection_name=crawler.settings.get('SQL_COLLECTION_NAME'))

    def parse(self, response):
        """Kick off the crawl from the oldest cached Opensooq row.

        The commented-out branch below re-crawls the pagination listing
        instead; kept as an alternative debug mode.
        """
        # self.opensooq_parse.parse_paginate(response.url, response, self._cache_db, self._history_db)
        # self.opensooq_pagination_start_page -= 1
        # _next_pagination = self.opensooq_pagination.format(self.opensooq_pagination_start_page)
        #
        # yield scrapy.Request(_next_pagination, callback=self.parse, dont_filter=True)

        _row = self._cache_db.get_oldest_row('', WebsiteTypes.opensooq.value)
        if _row:
            yield scrapy.Request(_row['url'], callback=self.parse_page_from_opensooq, dont_filter=True)

    def parse_page_from_opensooq(self, response):
        """Save this page's comment dates, then chain to the next cached row."""
        self._save_for_opensooq(response)

        _row = self._cache_db.get_oldest_row(response.url, WebsiteTypes.opensooq.value)
        if _row:
            yield scrapy.Request(_row['url'], callback=self.parse_page_from_opensooq, dont_filter=True)

    def _save_for_opensooq(self, hxs):
        """Extract and persist the date of every comment on the page.

        Bug fix: the 1-based position index now advances on every <li>.
        Previously it only advanced for comments with a non-empty date, so a
        single date-less comment made all later iterations re-read the same
        node and every following comment was dropped.
        """
        _comments_selector = '//*[@class="commentItems clear"]/li'
        _comments_div = hxs.xpath(_comments_selector)

        for _position, _ in enumerate(_comments_div, start=1):
            _selector = _comments_selector + '[' + str(_position) + ']'

            _comment_date = self.opensooq_parse.get_value_response(hxs, _selector + '/div/span/text()')
            if _comment_date == '':
                continue

            opensooq_comment_date = OpensooqCommentDateItem.get_default(_comment_date)
            self._item_db.save_opensooq_comment_date(opensooq_comment_date)
예제 #7
0
class OpensooqDebugSpider(scrapy.Spider):
    """Debug spider for Opensooq: crawls either the pagination listing or a
    fixed set of detail URLs, depending on the module-level ``is_pagination``
    flag."""

    url_from = WebsiteTypes.opensooq
    name = "{}_debug".format(url_from.value)
    details_urls = [
        # ajax
        # 'https://sa.opensooq.com/ar/post/get-phone-number?model_id=42946557&model_type=post'
        # detail
        # 'https://sa.opensooq.com/ar/search/42054599/شقة-للإيجار-حي-النعيم-5-غرف',
        # 'https://sa.opensooq.com/ar/search/43012611/فيلا-شمال-التخصصي-غرب-ابوبكر-حي-الياسمين'
        # detail without phone number
        # 'https://sa.opensooq.com/ar/post/get-phone-number?model_id=42552861&model_type=post',
        # 'https://sa.opensooq.com/ar/post/get-phone-number?model_id=39509897&model_type=post'
        # 'https://sa.opensooq.com/ar/search/42552861/%D9%85%D9%86%D8%B8%D9%88%D9%85%D8%A9-%D9%85%D8%A8%D9%8A%D8%B9%D8%A7%D8%AA-%D9%84%D9%84%D8%A7%D8%B3%D9%88%D8%A7%D9%82-%D9%88%D8%A7%D9%84%D9%85%D8%AD%D9%84%D8%A7%D8%AA'
        # Fix phone number
        # 'https://sa.opensooq.com/ar/search/43152549/إفطار-صائم-بمكه-المكرمه'
        # 'https://sa.opensooq.com/ar/search/17978455/دهن-عود-ملكي'
        # mysql: insert the members row failure, (1406, "Data too long for column 'username' at row 1")
        # 'https://sa.opensooq.com/ar/search/29602021/بيت-شعبي-مع-مجلس-مسلح-للبيع'  # 8 comments
        # 'https://sa.opensooq.com/ar/search/43796687/للبيع-لوحه-قمه-التميز-س-م-و-٨٨٨٨'  # 10 comments(no comment_date)
        # Parsing comment_date failure.
        # 'https://sa.opensooq.com/ar/search/38053621/لاصق-تثبيت-الجوال-والاغراض-على-طبلون-السيارة-والبيت'
        # No member register time.
        'https://sa.opensooq.com/ar/search/11956749/مستودع-للايجار-في-الصناعيه'
    ]

    def __init__(self, name=None, **kwargs):
        """Pick crawl targets for this website and open the databases.

        :param name: optional spider name override.
        :param kwargs: must contain ``host``, ``port``, ``user``, ``passwd``,
            ``db`` and ``collection_name`` (injected by ``from_crawler``).
        """
        self.allowed_domains = [websites_allowed_domains.get(self.url_from)]

        # Pagination mode crawls the listing URL; otherwise the detail URLs.
        if is_pagination:
            self.start_urls = [WebsiteTypes.get_pagination_url(self.url_from)]
        else:
            self.start_urls = self.details_urls

        from cwharaj.database_factory import DatabaseFactory, CollectionTypes
        database_factory = DatabaseFactory(kwargs['host'], kwargs['port'],
                                           kwargs['user'], kwargs['passwd'],
                                           kwargs['db'],
                                           kwargs['collection_name'])

        self._cache_db = database_factory.get_database(CollectionTypes.cache)
        self._history_db = database_factory.get_database(
            CollectionTypes.history)
        self._item_db = database_factory.get_database(CollectionTypes.item)

        from cwharaj.parser.opensooq_parser import OpensooqParse
        self._parser = OpensooqParse()

        super(OpensooqDebugSpider, self).__init__(name, **kwargs)

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Create the spider, injecting SQL settings as keyword arguments."""
        # Bug fix: forward positional arguments with *args; previously the
        # tuple itself was passed as a single positional argument and ended
        # up as the spider ``name``.
        return super(OpensooqDebugSpider, cls).from_crawler(
            crawler,
            *args,
            host=crawler.settings.get('SQL_HOST'),
            port=crawler.settings.get('SQL_PORT'),
            user=crawler.settings.get('SQL_USER'),
            passwd=crawler.settings.get('SQL_PASSWD'),
            db=crawler.settings.get('SQL_DB'),
            collection_name=crawler.settings.get('SQL_COLLECTION_NAME'))

    def parse(self, response):
        """Dispatch to pagination or detail parsing based on the debug flag."""
        if is_pagination:
            self._parser.parse_paginate(response.url, response, self._cache_db,
                                        self._history_db)
        else:
            item = self._parser.parse(response.url, response, self._item_db)
            # Kept for debugger inspection; also asserts the key exists.
            _ids_id = item["id_ads"]
class OpensooqDebugSpider(scrapy.Spider):
    """Debug spider for Opensooq: crawls either the pagination listing or a
    fixed set of detail URLs, depending on the module-level ``is_pagination``
    flag."""

    url_from = WebsiteTypes.opensooq
    name = "{}_debug".format(url_from.value)
    details_urls = [
        # ajax
        # 'https://sa.opensooq.com/ar/post/get-phone-number?model_id=42946557&model_type=post'
        # detail
        # 'https://sa.opensooq.com/ar/search/42054599/شقة-للإيجار-حي-النعيم-5-غرف',
        # 'https://sa.opensooq.com/ar/search/43012611/فيلا-شمال-التخصصي-غرب-ابوبكر-حي-الياسمين'
        # detail without phone number
        # 'https://sa.opensooq.com/ar/post/get-phone-number?model_id=42552861&model_type=post',
        # 'https://sa.opensooq.com/ar/post/get-phone-number?model_id=39509897&model_type=post'
        # 'https://sa.opensooq.com/ar/search/42552861/%D9%85%D9%86%D8%B8%D9%88%D9%85%D8%A9-%D9%85%D8%A8%D9%8A%D8%B9%D8%A7%D8%AA-%D9%84%D9%84%D8%A7%D8%B3%D9%88%D8%A7%D9%82-%D9%88%D8%A7%D9%84%D9%85%D8%AD%D9%84%D8%A7%D8%AA'
        # Fix phone number
        # 'https://sa.opensooq.com/ar/search/43152549/إفطار-صائم-بمكه-المكرمه'
        # 'https://sa.opensooq.com/ar/search/17978455/دهن-عود-ملكي'
        # mysql: insert the members row failure, (1406, "Data too long for column 'username' at row 1")
        # 'https://sa.opensooq.com/ar/search/29602021/بيت-شعبي-مع-مجلس-مسلح-للبيع'  # 8 comments
        # 'https://sa.opensooq.com/ar/search/43796687/للبيع-لوحه-قمه-التميز-س-م-و-٨٨٨٨'  # 10 comments(no comment_date)
        # Parsing comment_date failure.
        # 'https://sa.opensooq.com/ar/search/38053621/لاصق-تثبيت-الجوال-والاغراض-على-طبلون-السيارة-والبيت'
        # No member register time.
        'https://sa.opensooq.com/ar/search/11956749/مستودع-للايجار-في-الصناعيه'
    ]

    def __init__(self, name=None, **kwargs):
        """Pick crawl targets for this website and open the databases.

        :param name: optional spider name override.
        :param kwargs: must contain ``host``, ``port``, ``user``, ``passwd``,
            ``db`` and ``collection_name`` (injected by ``from_crawler``).
        """
        self.allowed_domains = [websites_allowed_domains.get(self.url_from)]

        # Pagination mode crawls the listing URL; otherwise the detail URLs.
        if is_pagination:
            self.start_urls = [WebsiteTypes.get_pagination_url(self.url_from)]
        else:
            self.start_urls = self.details_urls

        from cwharaj.database_factory import DatabaseFactory, CollectionTypes
        database_factory = DatabaseFactory(kwargs['host'], kwargs['port'],
                                           kwargs['user'], kwargs['passwd'],
                                           kwargs['db'], kwargs['collection_name'])

        self._cache_db = database_factory.get_database(CollectionTypes.cache)
        self._history_db = database_factory.get_database(CollectionTypes.history)
        self._item_db = database_factory.get_database(CollectionTypes.item)

        from cwharaj.parser.opensooq_parser import OpensooqParse
        self._parser = OpensooqParse()

        super(OpensooqDebugSpider, self).__init__(name, **kwargs)

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Create the spider, injecting SQL settings as keyword arguments."""
        # Bug fix: forward positional arguments with *args; previously the
        # tuple itself was passed as a single positional argument and ended
        # up as the spider ``name``.
        return super(OpensooqDebugSpider, cls).from_crawler(
            crawler,
            *args,
            host=crawler.settings.get('SQL_HOST'),
            port=crawler.settings.get('SQL_PORT'),
            user=crawler.settings.get('SQL_USER'),
            passwd=crawler.settings.get('SQL_PASSWD'),
            db=crawler.settings.get('SQL_DB'),
            collection_name=crawler.settings.get('SQL_COLLECTION_NAME'))

    def parse(self, response):
        """Dispatch to pagination or detail parsing based on the debug flag."""
        if is_pagination:
            self._parser.parse_paginate(response.url, response, self._cache_db, self._history_db)
        else:
            item = self._parser.parse(response.url, response, self._item_db)
            # Kept for debugger inspection; also asserts the key exists.
            _ids_id = item["id_ads"]
예제 #9
0
class OpensooqDebugCommentDateSpider(scrapy.Spider):
    """Debug spider that walks cached Opensooq detail pages and persists the
    date of every comment found on them."""

    name = "opensooq_commentdate_debug"
    # Bug fix: Scrapy's allowed_domains takes bare domains, not URLs —
    # full URLs trigger a URLWarning and break the offsite filter.
    allowed_domains = [
        "sa.opensooq.com",
    ]

    opensooq_pagination = 'https://sa.opensooq.com/ar/find?term=&cat_id=&scid=&city=&allposts_cb=true&allposts=no&price_from=&price_to=&page={}'
    opensooq_pagination_start_page = 20
    opensooq_pagination_total_page = 5
    start_urls = [
        # paginate
        opensooq_pagination.format(opensooq_pagination_start_page)
        # detail
        # 'https://sa.opensooq.com/ar/search/29602021/بيت-شعبي-مع-مجلس-مسلح-للبيع'  # 8 comments
    ]

    def __init__(self, name=None, **kwargs):
        """Build the cache/history/item database handles and the page parser.

        :param name: optional spider name override.
        :param kwargs: must contain ``host``, ``port``, ``user``, ``passwd``,
            ``db`` and ``collection_name`` (injected by ``from_crawler``).
        """
        from cwharaj.database_factory import DatabaseFactory, CollectionTypes
        database_factory = DatabaseFactory(kwargs['host'], kwargs['port'],
                                           kwargs['user'], kwargs['passwd'],
                                           kwargs['db'],
                                           kwargs['collection_name'])

        self._cache_db = database_factory.get_database(CollectionTypes.cache)
        self._history_db = database_factory.get_database(
            CollectionTypes.history)
        self._item_db = database_factory.get_database(CollectionTypes.item)

        from cwharaj.parser.opensooq_parser import OpensooqParse
        self.opensooq_parse = OpensooqParse()

        super(OpensooqDebugCommentDateSpider, self).__init__(name, **kwargs)

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Create the spider, injecting SQL settings as keyword arguments."""
        # Bug fix: forward positional arguments with *args; previously the
        # tuple itself was passed as a single positional argument and ended
        # up as the spider ``name``.
        return super(OpensooqDebugCommentDateSpider, cls).from_crawler(
            crawler,
            *args,
            host=crawler.settings.get('SQL_HOST'),
            port=crawler.settings.get('SQL_PORT'),
            user=crawler.settings.get('SQL_USER'),
            passwd=crawler.settings.get('SQL_PASSWD'),
            db=crawler.settings.get('SQL_DB'),
            collection_name=crawler.settings.get('SQL_COLLECTION_NAME'))

    def parse(self, response):
        """Kick off the crawl from the oldest cached Opensooq row.

        The commented-out branch below re-crawls the pagination listing
        instead; kept as an alternative debug mode.
        """
        # self.opensooq_parse.parse_paginate(response.url, response, self._cache_db, self._history_db)
        # self.opensooq_pagination_start_page -= 1
        # _next_pagination = self.opensooq_pagination.format(self.opensooq_pagination_start_page)
        #
        # yield scrapy.Request(_next_pagination, callback=self.parse, dont_filter=True)

        _row = self._cache_db.get_oldest_row('', WebsiteTypes.opensooq.value)
        if _row:
            yield scrapy.Request(_row['url'],
                                 callback=self.parse_page_from_opensooq,
                                 dont_filter=True)

    def parse_page_from_opensooq(self, response):
        """Save this page's comment dates, then chain to the next cached row."""
        self._save_for_opensooq(response)

        _row = self._cache_db.get_oldest_row(response.url,
                                             WebsiteTypes.opensooq.value)
        if _row:
            yield scrapy.Request(_row['url'],
                                 callback=self.parse_page_from_opensooq,
                                 dont_filter=True)

    def _save_for_opensooq(self, hxs):
        """Extract and persist the date of every comment on the page.

        Bug fix: the 1-based position index now advances on every <li>.
        Previously it only advanced for comments with a non-empty date, so a
        single date-less comment made all later iterations re-read the same
        node and every following comment was dropped.
        """
        _comments_selector = '//*[@class="commentItems clear"]/li'
        _comments_div = hxs.xpath(_comments_selector)

        for _position, _ in enumerate(_comments_div, start=1):
            _selector = _comments_selector + '[' + str(_position) + ']'

            _comment_date = self.opensooq_parse.get_value_response(
                hxs, _selector + '/div/span/text()')
            if _comment_date == '':
                continue

            opensooq_comment_date = OpensooqCommentDateItem.get_default(
                _comment_date)
            self._item_db.save_opensooq_comment_date(opensooq_comment_date)
# Token inserted between concatenated content fragments.
# NOTE(review): the name keeps the original "seperator" misspelling because
# other modules may import it by this exact name.
content_seperator = '\n' + '\n'

# Base URL each supported website's spider is allowed to crawl.
# NOTE(review): Scrapy's allowed_domains expects bare domains, so callers
# presumably strip the scheme — confirm against spider usage.
websites_allowed_domains = {
    WebsiteTypes.opensooq: "https://sa.opensooq.com/",
    WebsiteTypes.mstaml: 'http://www.mstaml.com',
    WebsiteTypes.harajsa: 'https://haraj.com.sa',
}

# Reverse lookup: first pagination URL -> the website type it belongs to.
scraped_websites_pagination = {
    'https://sa.opensooq.com/ar/find?term=&cat_id=&scid=&city=&allposts_cb=true&allposts=no&price_from=&price_to=&page=1': WebsiteTypes.opensooq,
    'http://www.mstaml.com/market/?t=0&l=0&d=0&x=&u=&o=3': WebsiteTypes.mstaml,
    'https://haraj.com.sa': WebsiteTypes.harajsa,
}

# One shared parser instance per website.
websites_parses = {
    WebsiteTypes.opensooq: OpensooqParse(),
    WebsiteTypes.mstaml: MstamlParse(),
    WebsiteTypes.harajsa: HarajSaParse()
}


# ===
# for debug
# ===

def get_crawler_name():
    # Extensions

    # is_pagination = True
    is_pagination = False