def __init__(self):
    """Announce the target domain and wire up the scraping collaborators.

    NOTE(review): fragment of a dubizzle extractor class — relies on a
    class-level DOMAIN attribute defined on the enclosing class.
    """
    # Was collapsed onto one syntactically-invalid line; restored one
    # statement per line and converted the py2 print statement.
    print(self.DOMAIN)
    self.logger = Logger(name='dubizzle_data_log')
    self.err_logger = Logger(name='err_dubizzle_data_log')
    self.request_manager = RequestManager()
    self.source_code_manager = SourceCodeManager()
    self.generator = Generator()
    self.db = DatabaseManager()
class DataExtractor:
    """Extracts car-listing details (make, model, trim, price, phone, ...)
    from dubicars.com listing pages and stores them via DatabaseManager."""

    DOMAIN = 'dubicars.com'
    PROJECT_ID = 13
    PATH = 'phones/'

    def __init__(self):
        print(self.DOMAIN)
        self.logger = Logger(name='dubicars_data_log')
        self.err_logger = Logger(name='err_dubicars_data_log')
        self.request_manager = RequestManager()
        self.source_code_manager = SourceCodeManager()
        self.generator = Generator()
        self.db = DatabaseManager()
        # Known {'make': ..., 'trim': ...} rows used by the trim matchers.
        self.trim_list = self.db.get_trim_list()

    def extract_data(self, url_data):
        """Fetch one listing URL, parse its fields and insert them in the DB.

        url_data: dict with 'id' (URL-table PK), 'url' and 'listing_id'.
        """
        print(url_data)
        url_id = url_data['id']
        url = url_data['url']
        listing_id = url_data['listing_id']
        data = {}
        response = self.request_manager.take_get_request(url)
        source_code = response['source_code']
        parsed_code = self.source_code_manager.parse_code(source_code)
        # A 'sold' badge or a 404 means the listing is gone: deactivate it.
        expired = parsed_code.find('img', {'class': 'sold'})
        if expired is not None:
            self.db.set_url_inactive(url_id)
            self.err_logger.error("EXPIRED " + str(url_data))
            return
        elif response['status_code'] == 404:
            self.db.set_url_inactive(url_id)
            self.err_logger.error("404 " + str(url_data))
            return
        try:
            marka = self.__find_make(parsed_code)
            year = self.__find_year(parsed_code)
            kilometres = self.__find_km(parsed_code)
            color = self.__find_color(parsed_code)
            specs = self.__find_specs(parsed_code)
            price = self.__find_price(parsed_code)
            model = self.__find_model(parsed_code, make=marka)
            trim = self.__find_trim(parsed_code, marka=marka, model=model)
            if trim == 'Other':
                # 'Other' trims are not worth storing.
                self.db.set_url_processed(url_id)
                self.db.set_url_inactive(url_id)
                return
            phone = self.__find_phone(parsed_code)
        except Exception as exc:
            self.err_logger.error(str(exc) + str(url_data))
            self.db.set_url_processed(url_id)
            return
        try:
            data['year'] = int(year)
            data['price'] = int(price)
            data['kilometres'] = int(kilometres)
            data['color'] = color
            data['specs'] = specs
            data['trim'] = trim
            data['model'] = model
            data['make'] = marka
            data['phone'] = phone
            print(data)
        except Exception as exc:
            # BUG FIX: original did `str(exc) + url_data` (str + dict),
            # which raises TypeError inside the error handler itself.
            self.err_logger.error(str(exc) + str(url_data))
            self.db.set_url_processed(url_id)
            self.db.set_url_inactive(url_id)
            return
        self.db.insert_data(data=data, listing_id=listing_id, url=url,
                            source=self.DOMAIN)
        self.db.set_url_processed(url_id)

    def update_data(self, url_data):
        """Re-visit a stored listing: mark it sold/inactive, or refresh its
        price and days-on-market."""
        timestamp = generate_timestamp()
        url_id = url_data['id']
        listing_id = url_data['listing_id']
        print(listing_id)
        url = url_data['url']
        first_timestamp = url_data['timestamp']
        # Days between first sighting and now; sign/order kept exactly as in
        # the original implementation.
        time_dif = first_timestamp - datetime.strptime(timestamp,
                                                       "%Y.%m.%d:%H:%M:%S")
        time_dif = time_dif.days
        response = self.request_manager.take_get_request(url)
        source_code = response['source_code']
        parsed_code = self.source_code_manager.parse_code(source_code)
        expired = parsed_code.find('img', {'class': 'sold'})
        if expired is not None:
            self.db.set_sold_status(listing_id=listing_id,
                                    days_for_selling=time_dif)
            self.db.set_url_inactive(url_id)
            return
        elif response['status_code'] == 404:
            print(404, listing_id)
            self.db.set_sold_status(listing_id=listing_id,
                                    days_for_selling=time_dif)
            self.db.set_url_inactive(url_id)
            return
        try:
            price = self.__find_price(parsed_code)
        except Exception:
            price = 0
        self.db.update_listing(listing_id=listing_id, price=int(price),
                               days_on_market=time_dif)
        self.db.set_updated(listing_id=listing_id)

    def __find_make(self, code):
        # The 'Make:' label's sibling holds the manufacturer name.
        try:
            return self.__find_tag_by_text(code, text='Make:')
        except Exception:
            return ''

    def __find_year(self, code):
        # First integer token of the 'Year:' field; '' when the field is
        # missing (and None when no token parses, as in the original).
        try:
            year = self.__find_tag_by_text(code, text='Year:')
            for token in year.split():
                try:
                    return int(token)
                except ValueError:
                    continue
        except Exception:
            return ''

    def __find_km(self, code):
        # Mileage with thousands separators stripped; 0 when unparsable.
        try:
            km = self.__find_tag_by_text(code, text='Kilometers:')
            km = km.replace(",", "").replace(".", "").replace(" ", "")
            return int(km)
        except Exception:
            return 0

    def __find_color(self, code):
        try:
            return self.__find_tag_by_text(code, text='Color:').strip()
        except Exception:
            return ''

    def __find_specs(self, code):
        try:
            return self.__find_tag_by_text(code, text='Specs:').strip()
        except Exception:
            return ''

    # ============= TRIM ===============
    def __generateEditedTrims(self, marka, trim):
        """Match known trims whose canonical spelling contains '-' against a
        listing trim that spells them with spaces (e.g. 'GT R' vs 'GT-R').

        Returns the canonical trim string, or '' when nothing matches.
        """
        for example_trim in self.trim_list:
            try:
                # Only worth doing for trims longer than 3 characters.
                if len(example_trim['trim']) <= 3:
                    continue
            except Exception:
                continue
            if '-' in example_trim['trim']:
                if example_trim['make'] == marka:
                    dashless = example_trim['trim'].replace('-', ' ')
                    if dashless in trim:
                        print(example_trim['trim'])
                        return example_trim['trim']
                    # Retry with title-case, e.g. 'Gt R'.
                    if dashless.title() in trim:
                        print(example_trim['trim'])
                        return example_trim['trim']
        return ''

    def __find_trim(self, code, marka, model):
        """Strip the model name out of the 'Model:' field and match the
        remainder against the known trim list; '' on any parsing failure.

        NOTE(review): reconstructed from whitespace-mangled source; the
        short-trim `continue` nesting is the most plausible reading.
        """
        try:
            to_return_trim = ''
            not_edited_trim = self.__find_tag_by_text(code,
                                                      text='Model:').strip()
            trim = not_edited_trim.replace(model, '').strip()
            if len(trim.split()) == 0:
                print(not_edited_trim, 'there is no Trim!!!!')
                return not_edited_trim.strip()
            for example_trim in self.trim_list:
                if example_trim['make'] == marka:
                    if example_trim['trim'] in trim:
                        if len(example_trim['trim']) <= 2:
                            # Very short trims must match as a whole word.
                            if (' ' + example_trim['trim'] + ' '
                                    in ' ' + trim + ' '):
                                if (len(example_trim['trim'])
                                        > len(to_return_trim)):
                                    print(example_trim['trim'])
                                    to_return_trim = example_trim['trim']
                            continue
                        # Prefer the longest matching trim.
                        if len(example_trim['trim']) > len(to_return_trim):
                            print(example_trim['trim'])
                            to_return_trim = example_trim['trim']
            edited_trim = self.__generateEditedTrims(marka=marka, trim=trim)
            if len(edited_trim) > len(to_return_trim):
                return edited_trim
            elif to_return_trim == '':
                # No known trim matched: accept a short free-form trim.
                if len(trim.split()) <= 2 and len(trim.split()) > 0:
                    return trim
            else:
                return to_return_trim
        except Exception:
            return ''
    # ============= TRIM ===============

    def __find_model(self, code, make):
        # Last breadcrumb is "<make> <model>"; drop the make's tokens.
        try:
            breadcrumbs = code.findAll('span', {'typeof': 'v:Breadcrumb'})
            name = breadcrumbs[-1].text
            len_make = len(make.split())
            model = ' '.join(name.split()[len_make:])
            return model.strip()
        except Exception as exc:
            print(exc)
            return ''

    def __find_phone(self, code):
        # Phone number hidden in the contact button's data-reveal attribute.
        try:
            phone = code.find('p', {
                'id': 'contact-buttons'
            }).find('a')['data-reveal']
            phone = (phone.replace('"', "").replace(" ", "")
                     .replace("[", "").replace("]", ""))
            return phone.strip()
        except Exception as exc:
            print(exc)
            return ''

    def __find_price(self, code):
        # Regular price first, then the 'reduced' variant; 0 when absent.
        try:
            price = code.find('strong', {'class': 'money'}).text
            price = (price.replace('AED', "").replace(" ", "")
                     .replace(",", "").replace(".", "").replace("-", ""))
            return int(price)
        except Exception:
            try:
                price = code.find('strong', {'class': 'money reduced'}).text
                price = (price.replace('AED', "").replace(" ", "")
                         .replace(",", "").replace(".", "").replace("-", ""))
                return int(price)
            except Exception:
                return 0

    def __find_tag_by_text(self, code, text):
        # Locate the label node containing `text` and return the text of the
        # label's next sibling (the field value).
        tag_with_text = code.find(text=text)
        needed_tag = tag_with_text.parent.find_next_sibling()
        return needed_tag.text
class DataExtractor:
    """Extracts car-listing details from dubai.dubizzle.com pages and stores
    them via DatabaseManager; phone numbers are OCR'd from an inline image."""

    DOMAIN = 'dubai.dubizzle.com'
    PROJECT_ID = 13
    PATH = 'phones/'  # scratch directory for phone-number images

    def __init__(self):
        print(self.DOMAIN)
        self.logger = Logger(name='dubizzle_data_log')
        self.err_logger = Logger(name='err_dubizzle_data_log')
        self.request_manager = RequestManager()
        self.source_code_manager = SourceCodeManager()
        self.generator = Generator()
        self.db = DatabaseManager()

    def extract_data(self, url_data):
        """Fetch one listing URL, parse its fields and insert them in the DB.

        url_data: dict with 'id' (URL-table PK), 'url' and 'listing_id'.
        """
        print(url_data)
        url_id = url_data['id']
        url = url_data['url']
        listing_id = url_data['listing_id']
        data = {}
        response = self.request_manager.take_get_request(url)
        source_code = response['source_code']
        parsed_code = self.source_code_manager.parse_code(source_code)
        # Expired ad banner or 404 means the listing is gone.
        expired = parsed_code.find('div', {'id': 'expired-ad-message'})
        if expired is not None:
            self.db.set_url_inactive(url_id)
            self.err_logger.error("EXPIRED " + str(url_data))
            return
        elif response['status_code'] == 404:
            self.err_logger.error("404 " + str(url_data))
            self.db.set_url_inactive(url_id)
            return
        # Breadcrumb divs: [-2] is the make, [-1] the model.
        bread = parsed_code.find('span', {'id': 'browse_in_breadcrumb'})
        items = bread.findAll('div')
        try:
            year = parsed_code.find('img', attrs={
                'alt': 'Year'
            }).parent.text.replace('Year', '').strip()
            kilometres = (parsed_code.find('img', attrs={
                'alt': 'Kilometers'
            }).parent.text.replace('Kilometers', '').strip()
                .replace(',', '').replace('.', ''))
            color = parsed_code.find('img', attrs={
                'alt': 'Color'
            }).parent.text.replace('Color', '').strip()
            specs = parsed_code.find('img', attrs={
                'alt': 'Specs'
            }).parent.text.replace('Specs', '').strip()
            trim = parsed_code.find('img', attrs={
                'alt': 'Trim'
            }).parent.parent.text.replace('Trim', '').strip()
            if trim == 'Other':
                # 'Other' trims are not worth storing.
                self.db.set_url_processed(url_id)
                return
            price = parsed_code.find('span', {
                'id': 'actualprice'
            }).text.replace(',', '').replace('.', '')
            model = items[-1].find('a').text.strip()
            marka = items[-2].find('a').text.strip()
            phone = self.extract_phone(parsed_code, id=url_id)
        except Exception as exc:
            self.err_logger.error(str(exc) + str(url_data))
            self.db.set_url_processed(url_id)
            return
        try:
            data['year'] = int(year)
            data['price'] = int(price)
            data['kilometres'] = int(kilometres)
            data['color'] = color
            data['specs'] = specs
            data['trim'] = trim
            data['model'] = model
            data['make'] = marka
            data['phone'] = phone
        except Exception as exc:
            # ROBUSTNESS FIX: the int() conversions were unguarded, so one
            # malformed listing crashed the whole run (the dubicars extractor
            # in this file already guards them the same way).
            self.err_logger.error(str(exc) + str(url_data))
            self.db.set_url_processed(url_id)
            return
        self.db.insert_data(data=data, listing_id=listing_id, url=url,
                            source=self.DOMAIN)
        self.db.set_url_processed(url_id)

    def update_data(self, url_data):
        """Re-visit a stored listing: mark it sold/inactive, or refresh its
        price and days-on-market."""
        timestamp = generate_timestamp()
        url_id = url_data['id']
        listing_id = url_data['listing_id']
        print(listing_id)
        url = url_data['url']
        first_timestamp = url_data['timestamp']
        # Days between first sighting and now; sign/order kept exactly as in
        # the original implementation.
        time_dif = first_timestamp - datetime.strptime(timestamp,
                                                       "%Y.%m.%d:%H:%M:%S")
        time_dif = time_dif.days
        response = self.request_manager.take_get_request(url)
        source_code = response['source_code']
        parsed_code = self.source_code_manager.parse_code(source_code)
        expired = parsed_code.find('div', {'id': 'expired-ad-message'})
        if expired is not None:
            self.db.set_sold_status(listing_id=listing_id,
                                    days_for_selling=time_dif)
            self.db.set_url_inactive(url_id)
            print("updated")
            return
        elif response['status_code'] == 404:
            print(404, listing_id)
            self.db.set_sold_status(listing_id=listing_id,
                                    days_for_selling=time_dif)
            self.db.set_url_inactive(url_id)
            print("updated")
            return
        try:
            price = parsed_code.find('span', {
                'id': 'actualprice'
            }).text.replace(',', '').replace('.', '')
        except Exception:
            price = 0
        self.db.update_listing(listing_id=listing_id, price=int(price),
                               days_on_market=time_dif)
        self.db.set_updated(listing_id=listing_id)
        print("updated")

    def extract_phone(self, code, id):
        """Decode the base64 phone-number image, OCR it with textract and
        return the number normalised to a '+971' prefix.

        `id` is used to name the temporary image file (name kept for
        backward compatibility even though it shadows the builtin).
        """
        img = code.find('img', {'class': 'phone-num-img'})['src']
        # data URI looks like 'data:image/<ext>;base64,<payload>'.
        ext = img.partition('data:image/')[2].split(';')[0]
        image_path = self.PATH + str(id) + '.' + ext
        with open(image_path, 'wb') as f:
            f.write(ba.a2b_base64(img.partition('base64,')[2]))
        text = textract.process(image_path).replace(' ', '')
        if '+971' not in text:
            text = '+971' + text
        os.remove(image_path)
        return text.strip()
def __init__(self):
    """Set up the scraper with its own reddit request manager."""
    self._request_manager = RequestManager()
class CommentScraper:
    """Scrapes full comment trees from old.reddit.com post pages, including
    'load more comments' stubs via the morechildren API."""

    def __init__(self):
        self._request_manager = RequestManager()

    def scrape_comments(self, url, sort_by):
        """Return parsed comment objects for a post URL (www -> old host,
        which serves the parseable legacy markup)."""
        if url.startswith("https://www"):
            url = url.replace("www", "old", 1)
        soup = self._request_manager.get_reddit_soup(url)
        return self._parse_comments_from_document(soup)

    def _parse_comments_from_document(self, document, get_children=False):
        """Parse every top-level comment in `document`.

        get_children=True means `document` is a permalink page whose first
        comment is the parent itself, so parsing starts at index 1.
        """
        comment_objects_list = []
        try:
            container = document.find_all("div", class_=["nestedlisting"])[0]
        except IndexError:
            return comment_objects_list
        container_comments = container.find_all("div", class_="comment")
        if get_children and len(container_comments) == 1:
            # Only the parent comment itself: nothing to collect.
            return comment_objects_list
        # Reuse the list already fetched instead of re-querying (original
        # called find_all a second time for the same result).
        first_comment = container_comments[1 if get_children else 0]
        comment_objects_list.append(self._extract_comment_data(first_comment))
        for sibling in first_comment.next_siblings:
            # BUG FIX: the original evaluated sibling["class"] before using
            # the isinstance(sibling, Tag) result, so a NavigableString
            # sibling (e.g. whitespace) raised TypeError; a class-less tag
            # raised KeyError. Guard first, then read the class list safely.
            if not isinstance(sibling, Tag):
                continue
            classes = sibling.get("class") or []
            if "comment" in classes:
                comment_objects_list.append(
                    self._extract_comment_data(sibling))
            elif "morechildren" in classes:
                # Canonical link path: /r/<subreddit>/comments/...
                subreddit = (document.find(
                    "link", {"rel": "canonical"})["href"].split("/")[4])
                comment_objects_list.extend(
                    self._get_more_comments(sibling, subreddit))
        return comment_objects_list

    def _extract_comment_data(self, comment_tag, recursive=True):
        """Build a dict for one comment; when recursive and the comment has
        children, fetch its permalink page and attach parsed 'replies'."""
        top_level_comment_object = {}
        score_tag = comment_tag.find("span", class_="score unvoted")
        score = score_tag["title"] if score_tag is not None else "???"
        author_tag = comment_tag.find("a", class_="author")
        author = (author_tag.text.strip()
                  if author_tag is not None else "[deleted]")
        date_posted = comment_tag.find("time", class_="live-timestamp")
        date_posted_timestamp = date_posted["datetime"]
        date_posted_readable = date_posted["title"]
        date_edited = comment_tag.find("time", class_="edited-timestamp")
        date_edited_timestamp = (date_edited["datetime"]
                                 if date_edited is not None else None)
        # "(3 children)" -> 3
        num_children = int(
            comment_tag.find("a", class_="numchildren").text.strip().replace(
                "(", "").replace(")", "").split(" ")[0])
        permalink_old = comment_tag.find("a", class_="bylink")["href"]
        permalink = permalink_old.replace("old", "www", 1)
        comment_container = comment_tag.find(
            "div", class_="usertext-body may-blank-within md-container").find(
                "div", class_="md")
        comment_formatted = comment_container.prettify()
        comment_raw = " ".join([
            p.text for p in comment_container.find_all("p")
        ]).strip().rstrip()
        top_level_comment_object["score"] = score
        top_level_comment_object["author"] = author
        top_level_comment_object["date_posted_timestamp"] = (
            date_posted_timestamp)
        top_level_comment_object["date_posted_readable"] = date_posted_readable
        top_level_comment_object["date_edited_timestamp"] = (
            date_edited_timestamp)
        top_level_comment_object["num_children"] = num_children
        top_level_comment_object["permalink_old"] = permalink_old
        top_level_comment_object["permalink"] = permalink
        top_level_comment_object["comment_formatted"] = comment_formatted
        top_level_comment_object["comment_raw"] = comment_raw
        if num_children == 0 or not recursive:
            return top_level_comment_object
        # Fetch the comment's own page to collect its reply subtree.
        nested_soup = self._request_manager.get_reddit_soup(permalink_old)
        parsed_replies = self._parse_comments_from_document(nested_soup, True)
        if len(parsed_replies) == 0:
            return top_level_comment_object
        top_level_comment_object["replies"] = parsed_replies
        return top_level_comment_object

    def _get_more_comments(self, morecomment_tag, subreddit):
        """Resolve a 'load more comments' stub via the morechildren API and
        return the extracted comment dicts (recursing into nested stubs)."""
        # onclick="return morechildren(this, 't3_..', 'confidence', '..,..')"
        morecomments_args = (morecomment_tag.a["onclick"].replace(
            "return morechildren", "").replace("(", "").replace(
                ")", "").replace("'", "").split(","))
        data_id = morecomment_tag["data-fullname"]
        link_id = morecomments_args[1].strip()
        sort = morecomments_args[2].strip()
        renderstyle = "html"
        limit_children = False
        r = subreddit
        children = (",".join(
            morecomments_args[3:len(morecomments_args) - 1]).strip())
        payload = {
            "id": data_id,
            "link_id": link_id,
            "sort": sort,
            "renderstyle": renderstyle,
            "limit_children": limit_children,
            "r": r,
            "children": children
        }
        more_soup = self._request_manager.post_reddit_soup(
            "https://old.reddit.com/api/morechildren", payload)
        json_comments = json.loads(more_soup.prettify())
        # presumably jquery[10][3][0] is where reddit nests the comment
        # payload in this legacy API response — verify against a live call
        json_comments_list = json_comments["jquery"][10][3][0]
        more_comments = []
        for comment in json_comments_list:
            comment_content = comment["data"]["content"]
            comment_tag_string = html.unescape(comment_content)
            comment_tag_soup = BeautifulSoup(comment_tag_string, "html.parser")
            if comment["kind"] == "more":
                more_comments.extend(
                    self._get_more_comments(
                        comment_tag_soup.find("div", class_="morechildren"),
                        subreddit))
            else:
                more_comments.append(
                    self._extract_comment_data(comment_tag_soup))
        return more_comments
def __init__(self):
    """Wire up the scraping collaborators this extractor depends on."""
    # Was collapsed onto one syntactically-invalid line (multiple statements
    # with no separators); restored one statement per line.
    self.request_manager = RequestManager()
    self.source_code_manager = SourceCodeManager()
    self.generator = Generator()
    self.db = DatabaseManager()
class PostScraper:
    """Scrapes post listings from reddit, paging until a requested number of
    posts has been collected."""

    def __init__(self):
        self._request_manager = RequestManager()

    def _get_posts_from_first_soup(self, first_soup, limit):
        """Extract post models embedded in the page's `script#data` JSON.

        Returns at most `limit` non-stickied, non-crosspost subreddit posts;
        [] when the script tag is absent.
        """
        try:
            script_data = first_soup.select('script#data')
            script_data_content = json.dumps(script_data[0].contents[0])
            script_data_content = (script_data_content.replace(
                "window.___r = ", ""))
            script_data_content = json.loads(script_data_content)
            # Drop the trailing ';' before parsing the payload itself.
            script_data_content = script_data_content[:len(
                script_data_content) - 1]
            script_data_dictionary = json.loads(script_data_content)
            script_data_list = list(
                script_data_dictionary["posts"]["models"].values())
            filtered_list = [
                post for post in script_data_list
                if post["belongsTo"]["type"] == "subreddit"
                and not post["isStickied"]
                and post["crosspostParentId"] is None
            ]
            # Slicing already handles lists shorter than limit.
            return filtered_list[:limit]
        except IndexError:
            return []

    def _get_posts_after_first_soup(self, soup, limit):
        """Extract posts from a JSON continuation response (pages after the
        first); same filtering and limit rules as the first page."""
        post_list = list(json.loads(soup.text)["posts"].values())
        filtered_list = [
            post for post in post_list
            if post["belongsTo"]["type"] == "subreddit"
            and not post["isStickied"]
            and post["crosspostParentId"] is None
        ]
        return filtered_list[:limit]

    def _get_processed_posts(self, posts, return_keys=None, verbose=False):
        """Project raw post dicts onto output objects.

        return_keys: optional key subset to copy verbatim (BUG FIX: was a
            mutable default `[]`). When empty/None and verbose is False, a
            standard summary set of keys is used; verbose=True copies the
            whole post dict.
        Returns (post_objects, post_ids).
        """
        if return_keys is None:
            return_keys = []
        post_objects = []
        post_ids = []
        for value in posts:
            post_object = {}
            if len(return_keys) > 0:
                for return_key in return_keys:
                    post_object[return_key] = value[return_key]
            elif verbose:
                post_object = value
            else:
                post_object["id"] = value["id"]
                post_object["title"] = value["title"]
                post_object["numComments"] = value["numComments"]
                post_object["created"] = value["created"]
                post_object["score"] = value["score"]
                post_object["author"] = value["author"]
                post_object["upvoteRatio"] = value["upvoteRatio"]
                post_object["permalink"] = value["permalink"]
                post_object["media"] = value["media"]
            post_ids.append(value["id"])
            post_objects.append(post_object)
        return post_objects, post_ids

    def scrape_posts(self, subreddit, limit, sort_by, verbose):
        """Collect up to `limit` posts, paging with the last seen post id.

        subreddit may be None/'' for the front page.
        """
        post_objects_list = []
        post_ids_list = []
        posts_count = 0
        subreddit_entered = subreddit is not None and len(subreddit) > 0
        while posts_count < limit:
            if posts_count == 0:
                url = BASE_URL
                if subreddit_entered:
                    url += "/r/{sub_name}"
                    url = url.format(sub_name=subreddit)
                url += "/{sort_by}"
                url = url.format(sort_by=sort_by)
                subreddit_post_soup = self._request_manager.get_reddit_soup(
                    url)
                posts = self._get_posts_from_first_soup(
                    subreddit_post_soup, limit)
            else:
                remaining_limit = limit - posts_count
                if subreddit_entered:
                    url = URL_AFTER_ID.format(sub_name=subreddit,
                                              last_id=post_ids_list[-1],
                                              sort_by=sort_by)
                else:
                    url = BASE_URL + "/{sort_by}/?after={last_id}"
                    url = url.format(sort_by=sort_by,
                                     last_id=post_ids_list[-1])
                subreddit_post_soup = self._request_manager.get_reddit_soup(
                    url)
                posts = (self._get_posts_after_first_soup(
                    subreddit_post_soup, remaining_limit)
                         if subreddit_entered else
                         self._get_posts_from_first_soup(
                             subreddit_post_soup, remaining_limit))
            post_objects, post_ids = self._get_processed_posts(
                posts, return_keys=[], verbose=verbose)
            post_objects_list.extend(post_objects)
            post_ids_list.extend(post_ids)
            posts_count = len(post_objects_list)
            # BUG FIX: the original only broke when the FIRST page was empty
            # (posts_count == 0); an empty later page with posts_count still
            # below limit looped forever. Stop whenever a page adds nothing.
            if len(post_objects) == 0:
                break
        return post_objects_list
class LinksExtractor:
    """Walks dubicars.com search-result pages and stores listing links
    via DatabaseManager."""

    DOMAIN = 'dubicars.com'
    PROJECT_ID = 13

    def __init__(self):
        self.request_manager = RequestManager()
        self.source_code_manager = SourceCodeManager()
        self.generator = Generator()
        self.db = DatabaseManager()

    def __createUrl(self, templateUrl, page):
        # templateUrl is a format string with one positional slot for the
        # page number.
        return templateUrl.format(page)

    def findLinks(self, sourceCode):
        """Collect {'url', 'listing_id'} dicts from one search-results page.

        Listings whose km value parses below 100 are skipped (missing or
        unparsable km defaults to 101, i.e. kept).
        """
        links = []
        status = True
        sourceCode = sourceCode.find('section', {'data-item-hash': "search"})
        listOfTags = sourceCode.findAll('li')
        for block in listOfTags:
            try:
                data = block['data-sp-item']
            except Exception:
                # Not a listing <li>; ignore.
                continue
            data = json.loads(data)
            listing_id = data['id']
            try:
                km = int(data['km'])
            except Exception:
                km = 101
            if km < 100:
                continue
            tag_a = block.find('a')
            href = tag_a['href']
            links.append({'url': href, 'listing_id': listing_id})
        return {'links': links, 'status': status}

    def main(self, sourceUrl):
        """Page through results for `sourceUrl`, inserting links until the
        last page (or a request failure)."""
        page = 1
        while True:
            url = self.__createUrl(sourceUrl, page)
            print(url)
            try:
                response = self.request_manager.take_get_request(
                    url, proxy_using=False)
            except Exception as exc:
                print(exc)
                break
            parseSourceCode = self.source_code_manager.parse_code(
                response['source_code'])
            links_data = self.findLinks(parseSourceCode)
            links = links_data['links']
            self.db.insert_urls(urls_list=links, source=self.DOMAIN)
            if self.isLastPage(parseSourceCode):
                print("last")
                break
            page += 1

    def find_last_page(self, code):
        # Second-to-last pagination anchor carries the last page number.
        pagination = code.find('div', {'class': 'paging '})
        pages = pagination.findAll('a')
        last_page = int(pages[-2].text)
        print(last_page)
        return last_page

    def isLastPage(self, code):
        # No 'next' link means we are on the final results page.
        return code.find('a', {'class': 'next'}) is None