import scrapy


class OpensooqDebugWatchSpider(scrapy.Spider):
    name = "opensooqwatch_debug"
    # Scrapy's OffsiteMiddleware expects bare domain names here, not URLs.
    allowed_domains = ["sa.opensooq.com", "www.mstaml.com"]
    start_urls = [
        # paginate
        "https://sa.opensooq.com/ar/find?term=&cat_id=&scid=&city=&allposts_cb=true&allposts=no&price_from=&price_to=",
        # detail
    ]

    def __init__(self, name=None, **kwargs):
        # Wire up the cache/history/item stores from the connection kwargs
        # that from_crawler() forwards out of the Scrapy settings.
        from cwharaj.database_factory import DatabaseFactory, CollectionTypes
        database_factory = DatabaseFactory(kwargs['host'], kwargs['port'],
                                           kwargs['user'], kwargs['passwd'],
                                           kwargs['db'], kwargs['collection_name'])
        self._cache_db = database_factory.get_database(CollectionTypes.cache)
        self._history_db = database_factory.get_database(CollectionTypes.history)
        self._item_db = database_factory.get_database(CollectionTypes.item)

        from cwharaj.parser.opensooq_parser import OpensooqParse
        self.opensooq_parse = OpensooqParse()

        super(OpensooqDebugWatchSpider, self).__init__(name, **kwargs)

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        # Forward positional arguments unpacked, plus the SQL connection
        # settings as keyword arguments.
        return super(OpensooqDebugWatchSpider, cls).from_crawler(
            crawler, *args,
            host=crawler.settings.get('SQL_HOST'),
            port=crawler.settings.get('SQL_PORT'),
            user=crawler.settings.get('SQL_USER'),
            passwd=crawler.settings.get('SQL_PASSWD'),
            db=crawler.settings.get('SQL_DB'),
            collection_name=crawler.settings.get('SQL_COLLECTION_NAME'))

    def parse(self, response):
        self.opensooq_parse.parse_paginate(response.url, response,
                                           self._cache_db, self._history_db)
        item = self.opensooq_parse.parse(response.url, response, self._item_db)
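
# A minimal, hypothetical smoke test for the settings plumbing above: Scrapy's
# scrapy.utils.test.get_crawler builds a crawler from a settings dict, and
# from_crawler() then forwards the SQL_* values into __init__ as keyword
# arguments. The SQL_* values below are placeholders, and DatabaseFactory
# still needs a reachable database for this to run.
if __name__ == '__main__':
    from scrapy.utils.test import get_crawler

    _crawler = get_crawler(settings_dict={
        'SQL_HOST': 'localhost',       # placeholder
        'SQL_PORT': 3306,              # placeholder
        'SQL_USER': 'root',            # placeholder
        'SQL_PASSWD': '',              # placeholder
        'SQL_DB': 'cwharaj',           # placeholder
        'SQL_COLLECTION_NAME': 'ads',  # placeholder
    })
    _spider = OpensooqDebugWatchSpider.from_crawler(_crawler)
    print(_spider.name)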
class OpensooqDebugCommentDateSpider(scrapy.Spider):
    name = "opensooq_commentdate_debug"
    # Scrapy's OffsiteMiddleware expects bare domain names here, not URLs.
    allowed_domains = [
        "sa.opensooq.com",
    ]
    opensooq_pagination = 'https://sa.opensooq.com/ar/find?term=&cat_id=&scid=&city=&allposts_cb=true&allposts=no&price_from=&price_to=&page={}'
    opensooq_pagination_start_page = 20
    opensooq_pagination_total_page = 5
    start_urls = [
        # paginate
        opensooq_pagination.format(opensooq_pagination_start_page)
        # detail
        # 'https://sa.opensooq.com/ar/search/29602021/بيت-شعبي-مع-مجلس-مسلح-للبيع'  # 8 comments
    ]

    def __init__(self, name=None, **kwargs):
        from cwharaj.database_factory import DatabaseFactory, CollectionTypes
        database_factory = DatabaseFactory(kwargs['host'], kwargs['port'],
                                           kwargs['user'], kwargs['passwd'],
                                           kwargs['db'], kwargs['collection_name'])
        self._cache_db = database_factory.get_database(CollectionTypes.cache)
        self._history_db = database_factory.get_database(CollectionTypes.history)
        self._item_db = database_factory.get_database(CollectionTypes.item)

        from cwharaj.parser.opensooq_parser import OpensooqParse
        self.opensooq_parse = OpensooqParse()

        super(OpensooqDebugCommentDateSpider, self).__init__(name, **kwargs)

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        return super(OpensooqDebugCommentDateSpider, cls).from_crawler(
            crawler, *args,
            host=crawler.settings.get('SQL_HOST'),
            port=crawler.settings.get('SQL_PORT'),
            user=crawler.settings.get('SQL_USER'),
            passwd=crawler.settings.get('SQL_PASSWD'),
            db=crawler.settings.get('SQL_DB'),
            collection_name=crawler.settings.get('SQL_COLLECTION_NAME'))

    def parse(self, response):
        # self.opensooq_parse.parse_paginate(response.url, response, self._cache_db, self._history_db)
        # self.opensooq_pagination_start_page -= 1
        # _next_pagination = self.opensooq_pagination.format(self.opensooq_pagination_start_page)
        #
        # yield scrapy.Request(_next_pagination, callback=self.parse, dont_filter=True)
        _row = self._cache_db.get_oldest_row('', WebsiteTypes.opensooq.value)
        if _row:
            yield scrapy.Request(_row['url'], callback=self.parse_page_from_opensooq, dont_filter=True)

    def parse_page_from_opensooq(self, response):
        self._save_for_opensooq(response)

        _row = self._cache_db.get_oldest_row(response.url, WebsiteTypes.opensooq.value)
        if _row:
            yield scrapy.Request(_row['url'], callback=self.parse_page_from_opensooq, dont_filter=True)

    def _save_for_opensooq(self, hxs):
        _comments_selector = '//*[@class="commentItems clear"]/li'
        _comments_div = hxs.xpath(_comments_selector)
        # enumerate keeps the positional index in step with the <li> being
        # visited even when an entry is skipped for having no date; the
        # original incremented the counter after `continue`, so a dateless
        # comment caused the same <li> to be read repeatedly.
        for _count, _comment_div in enumerate(_comments_div, start=1):
            _selector = '{}[{}]'.format(_comments_selector, _count)
            _comment_date = self.opensooq_parse.get_value_response(hxs, _selector + '/div/span/text()')
            if _comment_date == '':
                continue
            opensooq_comment_date = OpensooqCommentDateItem.get_default(_comment_date)
            self._item_db.save_opensooq_comment_date(opensooq_comment_date)
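
# The XPath walk in _save_for_opensooq can be checked without a live crawl.
# A minimal sketch against inline HTML; the markup is an assumption
# reconstructed from the selectors above, not a captured opensooq page.
if __name__ == '__main__':
    from scrapy.selector import Selector

    _html = '''
    <ul class="commentItems clear">
      <li><div><span>2016-11-01</span></div></li>
      <li><div><span></span></div></li>
      <li><div><span>2016-11-03</span></div></li>
    </ul>
    '''
    _sel = Selector(text=_html)
    _lis = _sel.xpath('//*[@class="commentItems clear"]/li')
    for _count in range(1, len(_lis) + 1):
        # Same positional-index trick the spider uses to address each <li>.
        _date = _sel.xpath(
            '//*[@class="commentItems clear"]/li[{}]/div/span/text()'.format(_count)
        ).extract_first(default='')
        if _date == '':
            continue  # comments without a date are skipped, as in the spider
        print(_count, _date)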
class OpensooqDebugSpider(scrapy.Spider):
    url_from = WebsiteTypes.opensooq
    name = "{}_debug".format(url_from.value)
    details_urls = [
        # ajax
        # 'https://sa.opensooq.com/ar/post/get-phone-number?model_id=42946557&model_type=post'
        # detail
        # 'https://sa.opensooq.com/ar/search/42054599/شقة-للإيجار-حي-النعيم-5-غرف',
        # 'https://sa.opensooq.com/ar/search/43012611/فيلا-شمال-التخصصي-غرب-ابوبكر-حي-الياسمين'
        # detail without phone number
        # 'https://sa.opensooq.com/ar/post/get-phone-number?model_id=42552861&model_type=post',
        # 'https://sa.opensooq.com/ar/post/get-phone-number?model_id=39509897&model_type=post'
        # 'https://sa.opensooq.com/ar/search/42552861/%D9%85%D9%86%D8%B8%D9%88%D9%85%D8%A9-%D9%85%D8%A8%D9%8A%D8%B9%D8%A7%D8%AA-%D9%84%D9%84%D8%A7%D8%B3%D9%88%D8%A7%D9%82-%D9%88%D8%A7%D9%84%D9%85%D8%AD%D9%84%D8%A7%D8%AA'
        # Fix phone number
        # 'https://sa.opensooq.com/ar/search/43152549/إفطار-صائم-بمكه-المكرمه'
        # 'https://sa.opensooq.com/ar/search/17978455/دهن-عود-ملكي'
        # mysql: inserting the members row failed: (1406, "Data too long for column 'username' at row 1")
        # 'https://sa.opensooq.com/ar/search/29602021/بيت-شعبي-مع-مجلس-مسلح-للبيع'  # 8 comments
        # 'https://sa.opensooq.com/ar/search/43796687/للبيع-لوحه-قمه-التميز-س-م-و-٨٨٨٨'  # 10 comments (no comment_date)
        # Parsing comment_date failed.
        # 'https://sa.opensooq.com/ar/search/38053621/لاصق-تثبيت-الجوال-والاغراض-على-طبلون-السيارة-والبيت'
        # No member registration time.
        'https://sa.opensooq.com/ar/search/11956749/مستودع-للايجار-في-الصناعيه'
    ]

    def __init__(self, name=None, **kwargs):
        self.allowed_domains = [websites_allowed_domains.get(self.url_from)]
        # The is_pagination toggle (see the debug settings below) switches the
        # spider between the listing pages and the detail URLs above.
        if is_pagination:
            self.start_urls = [WebsiteTypes.get_pagination_url(self.url_from)]
        else:
            self.start_urls = self.details_urls

        from cwharaj.database_factory import DatabaseFactory, CollectionTypes
        database_factory = DatabaseFactory(kwargs['host'], kwargs['port'],
                                           kwargs['user'], kwargs['passwd'],
                                           kwargs['db'], kwargs['collection_name'])
        self._cache_db = database_factory.get_database(CollectionTypes.cache)
        self._history_db = database_factory.get_database(CollectionTypes.history)
        self._item_db = database_factory.get_database(CollectionTypes.item)

        from cwharaj.parser.opensooq_parser import OpensooqParse
        self._parser = OpensooqParse()

        super(OpensooqDebugSpider, self).__init__(name, **kwargs)

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        return super(OpensooqDebugSpider, cls).from_crawler(
            crawler, *args,
            host=crawler.settings.get('SQL_HOST'),
            port=crawler.settings.get('SQL_PORT'),
            user=crawler.settings.get('SQL_USER'),
            passwd=crawler.settings.get('SQL_PASSWD'),
            db=crawler.settings.get('SQL_DB'),
            collection_name=crawler.settings.get('SQL_COLLECTION_NAME'))

    def parse(self, response):
        if is_pagination:
            self._parser.parse_paginate(response.url, response,
                                        self._cache_db, self._history_db)
        else:
            item = self._parser.parse(response.url, response, self._item_db)
            _ids_id = item["id_ads"]
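
# A hedged sketch of driving the debug spider from a plain script instead of
# `scrapy crawl`. CrawlerProcess is standard Scrapy; the SQL_* values are
# placeholders and must point at a database DatabaseFactory can reach.
if __name__ == '__main__':
    from scrapy.crawler import CrawlerProcess

    _process = CrawlerProcess(settings={
        'SQL_HOST': 'localhost',       # placeholder
        'SQL_PORT': 3306,              # placeholder
        'SQL_USER': 'root',            # placeholder
        'SQL_PASSWD': '',              # placeholder
        'SQL_DB': 'cwharaj',           # placeholder
        'SQL_COLLECTION_NAME': 'ads',  # placeholder
    })
    _process.crawl(OpensooqDebugSpider)
    _process.start()  # blocks until the crawl finishes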
content_seperator = '\n' + '\n'

# Bare domain names, as Scrapy expects for allowed_domains.
websites_allowed_domains = {
    WebsiteTypes.opensooq: "sa.opensooq.com",
    WebsiteTypes.mstaml: "www.mstaml.com",
    WebsiteTypes.harajsa: "haraj.com.sa",
}

scraped_websites_pagination = {
    'https://sa.opensooq.com/ar/find?term=&cat_id=&scid=&city=&allposts_cb=true&allposts=no&price_from=&price_to=&page=1': WebsiteTypes.opensooq,
    'http://www.mstaml.com/market/?t=0&l=0&d=0&x=&u=&o=3': WebsiteTypes.mstaml,
    'https://haraj.com.sa': WebsiteTypes.harajsa,
}

websites_parses = {
    WebsiteTypes.opensooq: OpensooqParse(),
    WebsiteTypes.mstaml: MstamlParse(),
    WebsiteTypes.harajsa: HarajSaParse(),
}


# ===
# for debug
# ===
def get_crawler_name():
    pass


# Extensions
# is_pagination = True
is_pagination = False
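
# A minimal sketch of how the three tables above fit together: resolve a
# pagination URL to its WebsiteTypes member, then pick the matching allowed
# domain and parser. resolve_website() is illustrative, not part of the
# original module.
def resolve_website(pagination_url):
    site = scraped_websites_pagination.get(pagination_url)
    if site is None:
        raise KeyError('unknown pagination url: {}'.format(pagination_url))
    return websites_allowed_domains[site], websites_parses[site]

# Example: resolve_website('https://haraj.com.sa') yields
# ('haraj.com.sa', <HarajSaParse instance>).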