def get_vendor_main_info(self, response):
    """Scrape the vendor summary card of a profile page into a User item.

    Returns the populated ``items.User`` (note: *returns*, it does not
    yield), or ``None`` if any extraction step raises — the error is only
    logged, matching the other callbacks in this spider.
    """
    try:
        vendor = items.User()
        vendor["username"] = self.get_text(
            response.xpath(
                ".//div[@class='user info']//div[@class='content']//a[@class='header']"
            ))
        # Usernames are displayed with a leading "@" which is not part of the name.
        vendor["username"] = vendor["username"].replace("@", "")
        vendor["relativeurl"] = response.xpath(
            ".//div[@class='user info']//div[@class='content']//a[@class='header']/@href"
        ).extract_first(default="")
        vendor["fullurl"] = self.make_url(vendor["relativeurl"])
        # "Last seen <date>" — strip the label, then parse the remainder.
        vendor["last_active"] = response.xpath(
            ".//div[@class='user info']//div[@class='meta']//span[contains(text(),'Last seen')]/text()"
        ).extract_first(default="").strip()
        vendor["last_active"] = vendor["last_active"].replace(
            "Last seen", "").strip()
        vendor["last_active"] = self.parse_datetime(vendor["last_active"])
        # "Registered <date>" — same strip-label-then-parse pattern; only the
        # date part is kept for join_date.
        vendor["join_date"] = response.xpath(
            ".//div[@class='user info']//div[@class='meta']//span[contains(text(),'Registered')]/text()"
        ).extract_first(default="").strip()
        vendor["join_date"] = vendor["join_date"].replace(
            "Registered", "").strip()
        vendor["join_date"] = self.parse_datetime(
            vendor["join_date"]).date()
        vendor["level"] = response.xpath(
            ".//div[@class='user info']//div[@class='header']/div[contains(text(),'Level:')]/text()"
        ).extract_first(default="").strip()
        vendor["level"] = vendor["level"].replace("Level:", "").strip()
        # Presence of the "Trusted Vendor" badge div is the flag; the
        # SelectorList is coerced to a boolean just below.
        vendor["trusted_seller"] = response.xpath(
            ".//div[@class='user info']//div[@class='header']/div[text()[contains(.,'Trusted Vendor')]]"
        )
        vendor[
            "trusted_seller"] = True if vendor["trusted_seller"] else False
        vendor["average_rating"] = response.xpath(
            ".//div[@class='user info']//div[@class='extra content']//i[@class='icon thumbs up']/following-sibling::span/text()"
        ).extract_first(default="").strip()
        vendor["feedback_received"] = self.get_text(
            response.xpath(
                ".//div[@class='ui vertical menu tiny basic fluid secondary']/a[contains(@href,'/reviews')]/span"
            ))
        vendor["warnings_number"] = self.get_text(
            response.xpath(
                ".//div[@class='ui vertical menu tiny basic fluid secondary']/a[contains(@href,'/warnings')]/span"
            ))
        # Any non-empty, non-zero warnings counter flags the vendor.
        vendor["has_warning"] = True if (
            vendor["warnings_number"] != "") and (
            vendor["warnings_number"] != "0") else False
        vendor["member_class"] = response.xpath(
            ".//div[@class='user info']//div[@class='header']/div[contains(text(),'Vendor')]/text()"
        ).extract_first(default="").strip()
        # Country is encoded as a CSS flag class, e.g. "de flag" -> "de".
        vendor["ship_from"] = response.xpath(
            ".//div[@class='user info']//a[@class='header']/i[contains(@class,'flag')]/@class"
        ).extract_first(default="")
        vendor["ship_from"] = vendor["ship_from"].replace("flag", "").strip()
        return vendor
    except Exception as error:
        self.logger.warning("Couldn't yield vendor at %s (Error: %s)" %
                            (response.url, error))
def parse_user(self, response):
    """Parse a user profile page and yield a populated User item.

    Any extraction failure aborts the whole item (logged, not re-raised).
    """
    try:
        user = items.User()
        user['username'] = self.get_text(
            response.css('h1::text').extract_first())
        # Banned users have the name struck through: <h1><i><del>name</del>...
        if user['username'] == '':
            user['username'] = response.xpath(
                ".//h1/i/del/text()").extract_first()
        if response.xpath(".//h1/i/text()") and 'Banned' in response.xpath(
                ".//h1/i/text()").extract_first():
            user['is_banned'] = True
        user['relativeurl'] = self.get_relative_url(response.url)
        user['fullurl'] = response.url
        user['profile'] = self.get_text(
            response.css('ul.nav-tabs').xpath('following-sibling::p'))
        # The three a.no-style counters appear in fixed page order:
        # positive, neutral, negative.
        user['positive_feedback'] = self.get_text(
            response.css('a.no-style')[0].css('strong'))
        user['neutral_feedback'] = self.get_text(
            response.css('a.no-style')[1].css('strong'))
        user['negative_feedback'] = self.get_text(
            response.css('a.no-style')[2].css('strong'))
        info = self.get_text(response.css('p.text-muted')[0])
        # Collapse all whitespace so the single-line regexes below match.
        info = re.sub('\s+', ' ', info)
        user['last_active'] = self.parse_datetime(
            re.search("Last seen - (.*?) UTC", info).group(1))
        user['join_date'] = self.parse_datetime(
            re.search("Vendor since - (.*?) UTC", info).group(1))
        user['ship_from'] = re.search("Ships From - (.*)", info).group(1)
        yield user
    except Exception as error:
        self.logger.warning("Failed to yield user at %s because '%s'" %
                            (response.url, error))
def parse_userpgp(self, response):
    """Yield a minimal User item carrying only a username and PGP key.

    The username travels in the request meta; it is the mandatory key the
    pipeline uses to attach this PGP key to the right user record.
    """
    pgp_item = items.User()
    pgp_item['username'] = response.meta['username']
    pgp_item['public_pgp_key'] = self.get_text(
        response.css('.main-content textarea'))
    yield pgp_item
def parse_vendor(self, response):
    """Parse a vendor/buyer account page and yield a User item.

    Walks the key/value table (French labels) and maps each known row onto
    a User field; unknown rows are logged so new fields can be added.
    """
    try:
        user = items.User()
        user['relativeurl'] = self.get_relative_url(response.url)
        user['fullurl'] = response.url
        user['profile'] = self.get_text(response.css('.mb-3 p.card-text'))
        user['username'] = re.search(r'/account/([\w-]+)/',
                                     response.url).group(1)
        # Vendors expose an "au shop" link in the card body; its absence
        # marks a plain buyer.
        user['is_buyer'] = not response.xpath(
            './/div[@class="card-body"]/a[contains(text(), "au shop")]')
        pgp = self.get_text(response.css('div.card-body pre'))
        if 'PGP' in pgp:
            user['public_pgp_key'] = pgp
        for tr in response.css("table.m-0 tr"):
            key = self.get_text(tr.css('th')).lower()
            value = self.get_text(tr.css('td'))
            # BUG FIX: the original used `value is not ''` — an identity
            # comparison against a literal, which is implementation-
            # dependent (and a SyntaxWarning on modern CPython). Use
            # equality instead.
            if value != '':
                if key == 'likes':
                    user['positive_feedback'] = int(value)
                elif key == 'unlikes':
                    user['negative_feedback'] = int(value)
                elif key == 'moyenne':
                    # e.g. "97,5%" -> 97.5 (French decimal comma).
                    user['average_rating_percent'] = float(
                        re.search(r'([\d,]+)%',
                                  value).group(1).replace(',', '.'))
                elif key == 'inscr.':
                    user['join_date'] = self.parse_datetime(value)
                elif key == 'dern.co.':
                    user['last_active'] = self.parse_datetime(value)
                elif key == 'identité fdw':
                    user['forum_username'] = value
                elif key == 'e-mail':
                    user['email'] = value
                elif key == 'irc':
                    user['irc'] = value
                elif key == 'ricochet':
                    user['ricochet'] = value
                elif key == 'bitmessage':
                    user['bitmessage'] = value
                elif key == 'btc':
                    user['btc_address'] = value
                elif key == 'jid':
                    pass  # deliberately ignored
                else:
                    self.logger.warning("Found a new piece of user information, '%s', with value '%s' at %s" % (key, value, response.url))
        yield user
    except Exception as error:
        self.logger.warning("Failed to yield user at %s because '%s'" % (response.url, error))
def parse_user(self, response):
    """Parse a member profile page and yield the resulting User item.

    Reads the tabular details block (label/value rows), collects the list
    of "verified" entries as JSON, then scans the messaging tab for the
    PGP key and terms-and-conditions text.
    """
    user_item = items.User()
    details = response.css('div.tabularDetails>div')
    verified_list = []
    # The member name is carried in the "member" GET parameter of the URL.
    user_item['username'] = dict(parse_qsl(urlparse(response.url).query))[
        'member']
    for div in details:
        label = div.css('label:first-child')
        label_txt = self.get_text(label).lower()
        content = div.css('div>:not(label)')
        if label_txt == 'username':
            # The username row also carries the rating widgets.
            ratings = self.parse_ratings(content)
            for key in ratings:
                user_item[key] = ratings[key]
        else:
            content = div.css('div>:not(label)')
            if label_txt == 'trusted seller':
                user_item['trusted_seller'] = self.get_text(content)
            elif label_txt == 'verified':
                verified_list.append(self.get_text(content))
            elif label_txt == 'fe enabled':
                user_item['fe_enabled'] = self.get_text(content)
            elif label_txt == 'join date':
                user_item['join_date'] = self.get_text(content)
            elif label_txt == 'last active':
                user_item['last_active'] = self.get_text(content)
            else:
                self.logger.warning(
                    'Found a user detail (%s) that is unknown to this spider. Consider handling it. It should be on URL: %s'
                    % (label_txt, response.url))
    user_item['verified'] = json.dumps(verified_list)
    for div in response.css('div.messagingTab>div'):
        try:
            title = self.get_text(div.css('div>div:first-child'))
            content = self.get_text(div.css('div>div:nth-child(2)'))
            lower_title = title.lower()
            if lower_title == 'public pgp key':
                user_item['public_pgp_key'] = content
            elif lower_title == 'terms and conditions':
                user_item['terms_and_conditions'] = content
        # BUG FIX: was Python-2-only `except Exception, e:` — a
        # SyntaxError under Python 3.
        except Exception as e:
            self.logger.warning(
                'Error while reading messaging tab. Error : %s' % e)
    # BUG FIX: the item was fully built but never emitted; yield it like
    # every sibling parse callback does.
    yield user_item
def parse_category(self, response):
    """Walk a category page.

    Emits a stub User item per vendor link, schedules the vendor's profile
    page, every listing on the page (excluding "also-available" cross
    links), and each pagination link.
    """
    for anchor in response.xpath('//a[starts-with(@href, "/vendor/")]'):
        href = anchor.xpath('@href').extract_first()
        vendor = items.User()
        vendor['username'] = self.get_text(anchor)
        vendor['relativeurl'] = href
        vendor['fullurl'] = self.make_url(href)
        yield vendor
        yield self.make_request('userprofile', url=href)
    listing_links = response.xpath(
        '//a[starts-with(@href, "/listing/") and '
        'not(contains(@href, "also-available"))]/@href').extract()
    for href in listing_links:
        yield self.make_request('listing', url=href)
    for href in response.css('ul.pagination li a::attr(href)').extract():
        yield self.make_request('category', url=href)
def parse_user(self, response):
    """Parse a Libertas user page: yield the User item plus follow-up
    requests for the feedback page and each of the vendor's ads."""
    username = self.get_url_id(response.url)
    user = items.User()
    user['username'] = username
    user['public_pgp_key'] = self.get_text(
        response.css('section.main_items textarea'))
    tables = response.css('div.containing-div table')
    for table in tables:
        table_headers = table.css('thead th')
        table_columns = table.css('tbody td')
        # Only process well-formed tables where headers and cells line up
        # one-to-one.
        if len(table_headers) == len(
                table_columns) and len(table_headers) > 0:
            for i in range(len(table_headers)):
                header = self.get_text(table_headers[i]).lower()
                text = self.get_text(table_columns[i])
                if header == 'rank':
                    user['level'] = text
                elif header == 'last active:':
                    user['last_active'] = self.parse_timestr(text)
                elif header == 'registered on libertas:':
                    user['join_date'] = self.parse_timestr(text)
                elif header == 'ships from:':
                    user['ship_from'] = text
                elif header == 'sales:':
                    user['successful_transactions'] = text
    vendor_feedback = response.css(
        'a[href*="/feedback/"]::attr(href)').extract_first()
    if vendor_feedback:
        feedback_text = self.get_text(
            response.css('a[href*="/feedback/"]'))
        # The rating is rendered as repeated filled-star glyphs; counting
        # them yields the star rating.
        user['average_rating'] = feedback_text.count(
            '★')
        yield self.make_request('user_ratings',
                                url=vendor_feedback,
                                username=username)
    vendor_ads_link = response.css(
        'article a[href*="/item/"]::attr(href)').extract()
    for link in vendor_ads_link:
        yield self.make_request('ads',
                                url=link,
                                ads_id=self.get_url_id(link))
    yield user
def parse_user_pgp(self, response):
    """Yield a User item holding the PGP key shown on the PGP tab.

    The canonical profile URL is recovered by stripping the "&tab" query
    suffix from the current page URL.
    """
    try:
        pgp_item = items.User()
        name = self.get_text(response.css('h1::text').extract_first())
        if name == '':
            # Banned users: the name sits struck-through in <h1><i><del>.
            name = response.xpath(".//h1/i/del/text()").extract_first()
        pgp_item['username'] = name
        pgp_item['fullurl'] = re.search("(^.*)&tab", response.url).group(1)
        pgp_item['relativeurl'] = self.get_relative_url(pgp_item['fullurl'])
        pgp_item['public_pgp_key'] = self.get_text(
            response.css('ul.nav-tabs').xpath('following-sibling::p'))
        yield pgp_item
    except Exception as error:
        self.logger.warning(
            "Failed to yield user PGP from %s because '%s'" %
            (response.url, error))
def parse_buyer(self, response):
    """Scrape a buyer profile page into a User item (is_buyer=True).

    Mirrors get_vendor_main_info's extraction pattern but yields instead
    of returning. Failures are logged and the item is dropped.
    """
    try:
        buyer = items.User()
        buyer["is_buyer"] = True
        buyer["username"] = self.get_text(
            response.xpath(
                ".//div[@class='user info']//div[@class='content']//a[@class='header']"
            ))
        # Displayed names carry a leading "@" that is not part of the name.
        buyer["username"] = buyer["username"].replace("@", "")
        buyer["relativeurl"] = response.xpath(
            ".//div[@class='user info']//div[@class='content']//a[@class='header']/@href"
        ).extract_first(default="")
        buyer["fullurl"] = self.make_url(buyer["relativeurl"])
        # "Last seen <date>" — strip the label, then parse.
        buyer["last_active"] = response.xpath(
            ".//div[@class='user info']//div[@class='meta']//span[contains(text(),'Last seen')]/text()"
        ).extract_first(default="").strip()
        buyer["last_active"] = buyer["last_active"].replace(
            "Last seen", "").strip()
        buyer["last_active"] = self.parse_datetime(buyer["last_active"])
        # "Registered <date>" — same pattern; date-only for join_date.
        buyer["join_date"] = response.xpath(
            ".//div[@class='user info']//div[@class='meta']//span[contains(text(),'Registered')]/text()"
        ).extract_first(default="").strip()
        buyer["join_date"] = buyer["join_date"].replace("Registered",
                                                        "").strip()
        buyer["join_date"] = self.parse_datetime(buyer["join_date"]).date()
        buyer["buyer_profile"] = self.get_text(
            response.xpath(
                ".//div[@class='segment ui']/h3[contains(text(),'About')]/following-sibling::div[@class='ui container']"
            ))
        # Country is encoded as a CSS flag class, e.g. "de flag" -> "de".
        buyer["buyer_country"] = response.xpath(
            ".//div[@class='user info']//a[@class='header']/i[contains(@class,'flag')]/@class"
        ).extract_first(default="")
        buyer["buyer_country"] = buyer["buyer_country"].replace(
            "flag", "").strip()
        yield buyer
    except Exception as error:
        self.logger.warning("Couldn't yield buyer at %s (Error: %s)" %
                            (response.url, error))
def parse_userprofile(self, response):
    """Parse a vendor profile (#content1 pane) and yield a User item.

    Properties are laid out as ``<div><label>Name</label> value</div>``;
    the loop normalizes the div text, locates the label text inside it and
    treats the remainder as the value.
    """
    user = items.User()
    profile_content = response.css("#content1")
    user['username'] = self.get_url_param(response.url, 'user')
    user['fullurl'] = response.url
    user['profile'] = self.get_text_first(profile_content.css('.bubble p'))
    up = urlparse(response.url)
    user['relativeurl'] = up.path
    if up.query:
        user['relativeurl'] += '?%s' % up.query
    # The username span has a title like "Vendor Steve has xx feedback...";
    # the word right before the username is the user's title.
    user_title_text = self.get_text(
        profile_content.css('.grid .grid').xpath(
            './/span[contains(text(), "%s")]/@title' %
            user['username']).extract_first())
    m = re.search('(\w+)\s*%s' % user['username'], user_title_text,
                  re.IGNORECASE)
    if m:
        user['title'] = m.group(1)
    for label in profile_content.css('.grid .grid label'):
        fulltext = self.get_text(label.xpath('..'))
        label_text = self.get_text(label).lower()
        m = re.search('%s(.+)' % label_text, fulltext,
                      re.IGNORECASE | re.S)
        if m:
            value = m.group(1).strip()
            if label_text == 'feedback score':
                user['average_rating'] = value
            # BUG FIX: was `label_text == 'Sales'` — unreachable because
            # label_text is lowercased above.
            elif label_text == 'sales':
                user['successful_transactions'] = value
            elif label_text == 'last logged':
                user['last_active'] = self.parse_timestr(value)
            elif label_text == 'member since':
                user['join_date'] = self.parse_timestr(value)
            elif label_text == 'shipping from':
                user['ship_from'] = value
            elif label_text == 'shipping to':
                user['ship_to'] = value
    # PGP key — normalized later by the pipeline.
    pgp_key = self.get_text(profile_content.css(".pgp_box"))
    if 'has not been set yet' not in pgp_key:
        user['public_pgp_key'] = pgp_key
    # Scores imported from other markets.
    for score in response.css(".externalFeedback"):
        score_text = self.get_text(score)
        score_title = self.get_text(score.xpath("@title").extract_first())
        if re.search('dreammarket', score_title, re.IGNORECASE | re.S):
            user['dreammarket_rating'] = score_text
        elif re.search('hansa', score_title, re.IGNORECASE | re.S):
            user['hansa_rating'] = score_text
        elif re.search('alphabay', score_title, re.IGNORECASE | re.S):
            user['alphabay_rating'] = score_text
        else:
            self.logger.warning(
                'Unknown other website score. Title is : %s' %
                (score_title))
    # Number of sales. NOTE(review): this unconditionally overwrites the
    # 'sales' label value captured above — confirm which source is
    # authoritative on this market.
    user['successful_transactions'] = self.get_text(
        profile_content.xpath(
            './/div[@class = "col-2"]/span[@class = "bigInfo"]/text()').
        extract_first())
    yield user
def parse_user(self, response):
    """Parse a user page: yield a User item, then one UserRating item per
    row of the feedback table.

    The two phases are wrapped in separate try blocks so a failure in one
    does not abort the other.
    """
    try:
        last_active_span = response.css(
            '.panel-heading .row div:nth-child(2) span')
        last_active = self.parse_datetime(
            re.search(r'Last seen: (.+)',
                      self.get_text(last_active_span)).group(1))
        user = items.User()
        user['username'] = self.get_text(
            response.css('.breadcrumb li.active'))
        user['relativeurl'] = self.get_relative_url(response.url)
        user['fullurl'] = response.url
        user['profile'] = self.get_text(response.css('#profile .col-md-9'))
        user['average_rating'] = self.get_text(
            response.css('center span')[0])
        user['last_active'] = last_active
        user['terms_and_conditions'] = self.get_text(
            response.css('#tac .col-md-9'))
        user['public_pgp_key'] = self.normalize_pgp_key(
            self.get_text(response.css('#pgp pre.well')))
        level_match = re.search(
            'Level (\d+)', self.get_text(response.css('.label-success')))
        if level_match:
            user['level'] = level_match.group(1)
        # An "FE" badge among the default labels marks finalize-early.
        if 'FE' in response.xpath(
                ".//div/span[@class='label label-default']/text()"
        ).extract():
            user['fe_enabled'] = True
        else:
            user['fe_enabled'] = False
        # Imported Dream Market stats render as "<sales>, <rating>".
        dream_rating = response.xpath(
            ".//small[preceding-sibling::img[contains(@title, 'Dream')]]/text()"
        )
        if dream_rating:
            dream_rating = dream_rating.extract_first()
            user['dreammarket_sales'] = re.search("([0-9]*),",
                                                  dream_rating).group(1)
            user['dreammarket_rating'] = re.search(", ([0-9\.]*)",
                                                   dream_rating).group(1)
        yield user
    except Exception as error:
        self.logger.warning("Failed to yield user at %s because '%s'" %
                            (response.url, error))
    try:
        ratings = response.xpath(
            ".//div[@id='feedback']/div/div/div/table[1]/tbody/tr")
        if ratings and 'No available feedback' not in ratings.extract_first(
        ):
            for rating in ratings:
                tds = rating.css('td')
                user_rating = items.UserRating()
                # NOTE(review): relies on `user` from the first try block;
                # if that block failed early, this raises NameError and is
                # swallowed by the except below — confirm this is intended.
                user_rating['username'] = user['username']
                user_rating['submitted_by'] = self.get_text(tds[0])
                # Star rating = number of <i> icon elements in the cell.
                user_rating['rating'] = len(tds[1].css('i'))
                user_rating['comment'] = self.get_text(tds[2])
                user_rating['price_usd'] = re.search(
                    '([\d\.]+)', self.get_text(tds[3])).group(1)
                user_rating['submitted_on'] = self.parse_datetime(
                    self.get_text(tds[4])).date()
                user_rating['submitted_on_string'] = self.get_text(tds[4])
                yield user_rating
    except Exception as error:
        self.logger.warning(
            "Failed to yield user ratings at %s because '%s'" %
            (response.url, error))
def parse_userprofile(self, response):
    """Parse a vendor profile page and yield a User item, plus follow-up
    requests for the feedback pages and the other bottom tabs.

    Only one bottom tab (profile / terms / PGP) is rendered per page load,
    so the profile is re-requested per tab; the pipeline merges the items.
    """
    try:
        user = items.User()
        user['username'] = self.get_username_from_profile(response)
        user['relativeurl'] = '/vendor/%s' % user['username']
        user['fullurl'] = self.make_url(user['relativeurl'])
        containertop = response.xpath('//h1/..')
        labels = self.get_text(containertop.css("span.label")).lower()
        user['trusted_seller'] = True if 'trusted vendor' in labels else False
        m = re.search('level (\d+)', labels)
        if m:
            user['level'] = m.group(1)
        containertext = self.get_text(containertop)
        m = re.search(r'last seen\s*-\s*([^\s]+)', containertext,
                      re.IGNORECASE)
        if m:
            user['last_active'] = self.parse_timestr(m.group(1))
        m = re.search(r'vendor since\s*-\s*([^\s]+)', containertext,
                      re.IGNORECASE)
        if m:
            user['join_date'] = self.parse_timestr(m.group(1))
        m = re.search(r'(\d+) subscribers', containertext, re.IGNORECASE)
        if m:
            user['subscribers'] = m.group(1)
        fbcontainer = response.xpath(
            "//h3[contains(text(), 'Feedback Ratings')]/..")
        m = re.search(r'\d+', self.get_text(
            fbcontainer.xpath('.//a[contains(@href, "show=positive")]')))
        if m:
            user['positive_feedback'] = m.group(0)
        m = re.search(r'\d+', self.get_text(
            fbcontainer.xpath('.//a[contains(@href, "show=neutral")]')))
        if m:
            user['neutral_feedback'] = m.group(0)
        m = re.search(r'\d+', self.get_text(
            fbcontainer.xpath('.//a[contains(@href, "show=negative")]')))
        if m:
            user['negative_feedback'] = m.group(0)
        m = re.search(r"(\d+(.\d+)?\s*\%)\s*positive feedback",
                      self.get_text(fbcontainer), re.IGNORECASE)
        if m:
            user['average_rating'] = m.group(1)
        user['successful_transactions'] = self.get_text(
            response.xpath("//h3[contains(text(), 'Orders')]/../p"))
        avg_volume_str = self.get_text(
            response.xpath("//h3[contains(text(), 'Average Volume')]/../p"))
        m = re.search(
            r'(\d+(.\d+)?)\s*\(\s*USD\s*(\d+(.\d+)?)\s*\)\s*per order',
            avg_volume_str)
        if m:
            user['avg_volume'] = m.group(1)
        # Bottom tabs: only the active one is present in this response.
        active_presentation = self.get_text(
            response.css("ul.nav li[role='presentation'].active")).lower()
        if active_presentation == 'profile':
            user['profile'] = self.get_text(
                response.xpath(
                    '//h4[contains(text(), "Vendor Profile")]/../p'))
        elif active_presentation == 'terms & conditions':
            user['terms_and_conditions'] = self.get_text(
                response.xpath(
                    '//h4[contains(text(), "Terms & Conditions")]/../p'))
        elif active_presentation == 'pgp':
            user['public_pgp_key'] = self.get_text(
                response.xpath(
                    '//h4[contains(text(), "Vendor Public PGP Key")]/../code'))
        # Ratings imported from other marketplaces.
        fbhistory_container = response.xpath(
            "//h3[contains(text(), 'Feedback History')]/..")
        agora_score = self.get_text(
            fbhistory_container.xpath("span[contains(@title, 'Agora')]"))
        # BUG FIX: the original pattern ended in '*', which matches the
        # empty string — the search always succeeded and could assign
        # agora_rating = None. Without '*' the field is left unset when
        # there is no score, like the other markets below.
        m = re.search(r'(\d+(.\d+)?\/\d+)', agora_score)
        if m:
            user['agora_rating'] = m.group(1)
        # NOTE(review): 'Abaxas' spelling kept as-is — it must match the
        # site's own markup; confirm against a live page.
        abraxas_score = self.get_text(
            fbhistory_container.xpath("span[contains(@title, 'Abaxas')]"))
        m = re.search(r'(\d+(.\d+)?\/\d+)', abraxas_score)
        if m:
            user['abraxas_rating'] = m.group(1)
        nucleus_score = self.get_text(
            fbhistory_container.xpath("span[contains(@title, 'Nucleus')]"))
        m = re.search(r'(\d+(.\d+)?\/\d+)', nucleus_score)
        if m:
            user['nucleus_rating'] = m.group(1)
        dreammarket_score = self.get_text(
            fbhistory_container.xpath(
                "span[contains(@title, 'Dream Market')]"))
        m = re.search(r'(\d+(.\d+)?)\/\d+,', dreammarket_score)
        if m:
            user['dreammarket_rating'] = "%s/5" % m.group(1)
        valhalla_score = self.get_text(
            fbhistory_container.xpath("span[contains(@title, 'Valhalla')]"))
        m = re.search(r'\d+/\d+', valhalla_score)
        if m:
            user['valhalla_rating'] = m.group(0)
        oasis_score = self.get_text(
            fbhistory_container.xpath("span[contains(@title, 'Oasis')]"))
        m = re.search(r'\d+/\d+/\d+', oasis_score)
        if m:
            user['oasis_rating'] = m.group(0)
        yield user
        for url in response.css(
                "ul li[role='presentation'] a::attr(href)").extract():
            if 'feedback' in url:
                yield self.make_request('user_feedback', url=url,
                                        username=user['username'])
            else:
                # Reload the profile to capture the other tab's content;
                # the pipeline merges it into the existing user record.
                yield self.make_request('userprofile', url=url)
    except WarningException as e:
        self.logger.warning("Could not parse profile at %s. %s" %
                            (response.url, e))
def parse_userprofile(self, response):
    """Parse a vendor profile and yield a User item.

    Reads the top property list, then runs a small state machine over the
    right-hand feed to collect news entries (h3 title + following p
    content), and finally splits the feed's raw HTML on the Terms of
    Service heading to capture the terms text.
    """
    user = items.User()
    user['username'] = response.meta['username']
    user['fullurl'] = response.url
    user['relativeurl'] = self.get_relative_url(response.url)
    user['profile'] = self.get_text(
        response.css('.vendor-profile-details'))
    # =============== Top list of properties
    for line in response.css('.vendor-profile .vendor-profile-list li'):
        key = self.get_text(line.xpath('./span[1]')).lower()
        val_span = line.xpath('./span[2]')
        if key:
            if key == 'ships from':
                # Country is rendered as a flag image; its alt text holds
                # the country name.
                user['ship_from'] = self.get_text(
                    val_span.xpath(
                        './/img[1]/@alt').extract_first()).upper()
            elif key == 'last seen':
                user['last_active'] = self.parse_timestr(
                    self.get_text(val_span))
            elif key == 'vendor deals':
                user['successful_transactions'] = self.get_text(val_span)
            elif key == 'vendor %':
                user['level'] = self.get_text(val_span)
            elif key == 'vendor rating':
                user['average_rating'] = self.get_text(val_span)
            elif key == 'fans':
                user['subscribers'] = self.get_text(val_span)
            else:
                self.logger.warning(
                    'New property found in user profile. Property = %s. URL=%s'
                    % (key, response.url))
    # ====== State machine over the rightmost feed (news + terms) =========
    parse_state = 'unknown'
    news_entry = None
    news = []
    for node in response.css('.vendor-profile-news>*'):
        if node.root.tag == 'h2':
            if self.get_text(node).lower() == 'news':
                parse_state = 'news'
            elif self.get_text(node).lower() == 'terms of service':
                parse_state = 'terms'
        if parse_state == 'news':
            # A new heading closes the entry under construction.
            if news_entry != None and (node.root.tag == 'h3'
                                       or node.root.tag == 'h2'):
                news.append(news_entry)
                news_entry = None
            if node.root.tag == 'h3':
                news_entry = {'title': self.get_text(node), 'content': ''}
            # BUG FIX: guard against a <p> appearing before any <h3>
            # (news_entry would be None and crash).
            elif node.root.tag == 'p' and news_entry is not None:
                news_entry['content'] += self.get_text(node)
        elif parse_state == 'terms':
            pass
    # BUG FIX: the original dropped the last entry — when the feed ended
    # (or switched to Terms of Service) while an entry was still being
    # built, it was never appended. Flush it here.
    if news_entry is not None:
        news.append(news_entry)
    if len(news) > 0:
        user['news'] = json.dumps(news)
    # === Terms of service by splitting HTML; best we can do with scrapy ===
    blocks = ''.join(
        response.css('.vendor-profile-news').extract()).split(
            '<h2>Terms of Service</h2>')
    if len(blocks) == 2:
        sel = scrapy.Selector(text='<article>' + blocks[1])
        user['terms_and_conditions'] = self.get_text(sel)
    # =====================
    yield user
def parse_ads(self, response):
    """Parse a Flugsvamp listing page (Swedish labels): yield the Ads item,
    a stub User item for the seller, and one AdsImage item per picture.

    Label keys are matched on ASCII substrings ("omd", "ljare") because the
    full words ("omdömen", "säljare") contain non-ASCII characters.
    """
    title = response.xpath(".//div[@id='main']/h1/text()").extract_first()
    # "Produkten finns inte." is Swedish for "The product does not exist."
    if title is None and response.xpath(
            ".//div[contains(text(), 'Produkten finns inte.')]"):
        self.logger.warning(
            "Found what is likely an empty page at %s. Flugsvamp writes: %s"
            % (response.url,
               response.xpath(
                   ".//div[contains(text(), 'Produkten finns inte.')]/text()"
               ).extract_first().strip()))
    else:
        ads_item = items.Ads()
        user_item = items.User()
        ads_item['title'] = title
        ads_item['offer_id'] = response.url.split("=")[-1]
        ads_item['fullurl'] = response.url
        ads_item['relativeurl'] = self.get_relative_url(response.url)
        # Strip the "Beskrivning:" (description) label from the block text.
        description = self.get_text(
            response.xpath(
                '//strong[contains(text(), "Beskrivning:")]/parent::div')
        ).replace('Beskrivning:', '')
        if description:
            ads_item['description'] = description
        try:
            keys = response.xpath(".//div[@class='lightrow']")
            for key_ele in keys:
                key = key_ele.xpath("strong/text()").extract_first()
                if key == None:
                    continue
                key = key.lower()
                # "omdömen" (reviews): value like "4.8 (123 omdömen)".
                if "omd" in key:
                    value = key_ele.xpath(
                        './/span[@class="grey"]/text()').extract_first()
                    m = re.search('(.*?)\ \((.*?)\ omd', value,
                                  re.M | re.I | re.S)
                    if m:
                        ads_item['product_rating'] = m.group(1)
                        ads_item['already_sold'] = m.group(2)
                # "säljare" (seller): link + rating summary.
                elif "ljare" in key:
                    ads_item['vendor_username'] = key_ele.xpath(
                        './/a/text()').extract_first()
                    user_item['username'] = ads_item['vendor_username']
                    user_item['relativeurl'] = key_ele.xpath(
                        './/a/@href').extract_first()
                    user_item['fullurl'] = response.urljoin(
                        user_item['relativeurl'])
                    value = key_ele.xpath(
                        './/span[@class="grey"]/text()').extract_first()
                    m = re.search('(.*?)\ \((.*?)\ omd', value,
                                  re.M | re.I | re.S)
                    if m:
                        user_item['average_rating'] = m.group(1)
                        user_item['feedback_received'] = m.group(2)
                # "kategori" = category.
                elif key == "kategori:":
                    ads_item['category'] = key_ele.xpath(
                        './/a/text()').extract_first()
                # "kvantitet" = quantity.
                elif key == "kvantitet:":
                    ads_item['quantity'] = self.get_text(
                        key_ele.xpath('span[@class="float-right"]'))
                # "ditt pris inkl. frakt" = your price incl. shipping,
                # rendered as "<fiat> (<btc>)".
                elif key == "ditt pris inkl. frakt:":
                    value = self.get_text(
                        key_ele.xpath('.//span[@class="float-right"]'))
                    m = re.search('(.*?)\ \((.*?)\)', value,
                                  re.M | re.I | re.S)
                    if m:
                        ads_item['price_btc'] = m.group(2)
                # "pristabell" = price table: one option per <br>-separated line.
                elif key == "pristabell:":
                    price_options = []
                    priceList = key_ele.xpath(
                        './/span[@class="float-right"]').extract_first(
                        ).split('<br>')
                    for list_item in priceList:
                        linesel = scrapy.Selector(text=list_item)
                        line_txt = self.get_text(linesel)
                        price_options.append(line_txt)
                    if len(price_options) > 0:
                        ads_item['price_options'] = price_options
                else:
                    self.logger.warning(
                        "Found a new piece of product information, '%s', at %s"
                        % (key, response.url))
            yield ads_item
            yield user_item
        except Exception as error:
            self.logger.warning(
                "Failed to parse listing (Error: '%s'). See URL %s" %
                (error, response.url))
        # ===================== IMAGES =====================
        images_url = response.css('img.float-right::attr(src)').extract()
        for url in images_url:
            if url:
                img_item = items.AdsImage(image_urls=[])
                # Image downloads go through Tor-browser headers.
                img_item['image_urls'].append(
                    self.make_request(reqtype='image',
                                      url=url,
                                      headers=self.tor_browser))
                img_item['ads_id'] = ads_item['offer_id']
                yield img_item
def parse_vendor(self, response):
    """Parse a vendor profile page and yield a User item.

    Skips the profiles belonging to the spider's own login accounts. Most
    fields are "<Label :> value" rows inside <small> elements; badges,
    feedback counters and the active tab (Profile / PGP / Feedback) are
    handled separately.
    """
    username = re.search(r"u_id=([^&]+)", response.url, re.M | re.I)
    username = username.group(1) if username else None
    # Never scrape our own login accounts.
    if username not in self.spider_settings['logins']:
        vendor = items.User()
        vendor["username"] = username
        vendor["fullurl"] = response.url.split("&")[0]
        vendor["relativeurl"] = self.get_relative_url(vendor["fullurl"])
        # NOTE(review): the original source was corrupted here (a '******'
        # redaction fused the 'Last Login :' and 'Member since :'
        # extractions). Reconstructed from the surviving .replace() calls
        # and the sibling field patterns — confirm against live markup.
        vendor["last_active"] = self.get_text(
            response.xpath(
                "//small//b[contains(text(),'Last Login :')]/ancestor::small"
            )).replace("Last Login :", "").strip()
        vendor["last_active"] = self.parse_datetime(vendor["last_active"])
        vendor["join_date"] = self.get_text(
            response.xpath(
                "//small//b[contains(text(),'Member since :')]/ancestor::small"
            )).replace("Member since :", "").strip()
        vendor["join_date"] = self.parse_datetime(
            vendor["join_date"]).date()
        vendor["successful_transactions"] = self.get_text(
            response.xpath(
                "//small//b[contains(text(),'Sales :')]/ancestor::small")
        ).replace("Sales :", "").strip()
        vendor["successful_transactions_as_buyer"] = self.get_text(
            response.xpath(
                "//small//b[contains(text(),'Orders :')]/ancestor::small")
        ).replace("Orders :", "").strip()
        # e.g. "(98%)" -> "98%" after stripping the parentheses.
        vendor["average_rating_percent"] = self.get_text(
            response.xpath(
                "//small//b[contains(text(),'Positive Feedback :')]/ancestor::small"
            )).replace("Positive Feedback :", "").strip()
        vendor["average_rating_percent"] = vendor[
            "average_rating_percent"].replace("(", "")
        vendor["average_rating_percent"] = vendor[
            "average_rating_percent"].replace(")", "")
        # Trusted/verified are icons; the image filename encodes the state.
        vendor["trusted_seller"] = response.xpath(
            "//small//b[contains(text(),'Seller Trusted :')]/ancestor::small//img/@src"
        ).extract_first(default="")
        vendor["trusted_seller"] = True if "yes_trusted" in vendor[
            "trusted_seller"] else False
        vendor["verified"] = response.xpath(
            "//small//b[contains(text(),'Seller Verified :')]/ancestor::small//img/@src"
        ).extract_first(default="")
        vendor["verified"] = True if "yes_verified" in vendor[
            "verified"] else False
        fe = self.get_text(
            response.xpath(
                "//small//b[contains(text(),'FE :')]/ancestor::small")
        ).replace("FE :", "").strip()
        # Finalize-early is allowed unless explicitly denied.
        vendor["fe_enabled"] = False if fe.lower() == "deny" else True
        vendor["badges"] = []
        badges = response.xpath("//small/span[@class='badge']")
        for badge in badges:
            badge = self.get_text(badge)
            if "Seller Level" in badge:
                vendor["level"] = badge.replace("Seller Level", "").strip()
            elif "Trust Level" in badge:
                vendor["trust_level"] = badge.replace("Trust Level",
                                                      "").strip()
            else:
                self.logger.warning("New badge: '%s' at %s" %
                                    (badge, response.url))
            # All badges are recorded, known or not.
            vendor["badges"].append(badge)
        vendor["disputes"] = self.get_text(
            response.xpath(
                "//small//b[contains(text(),'Disputes :')]/ancestor::small"
            )).replace("Disputes :", "").strip()
        vendor["positive_feedback"] = self.get_text(
            response.xpath(
                "//div/button/b[contains(text(),'Positive :')]")).replace(
                    "Positive :", "").strip()
        vendor["neutral_feedback"] = self.get_text(
            response.xpath(
                "//div/button/b[contains(text(),'Neutral :')]")).replace(
                    "Neutral :", "").strip()
        vendor["negative_feedback"] = self.get_text(
            response.xpath(
                "//div/button/b[contains(text(),'Negative :')]")).replace(
                    "Negative :", "").strip()
        # Only the active tab's content is present in this response.
        active_tab = self.get_text(
            response.xpath(
                "//ul[@class='nav nav-tabs']/li[@class='active']/a"))
        if "Profile" in active_tab:
            vendor["profile"] = self.get_text(
                response.xpath("//div[@class='tab-content']//i"))
            # Contact fields shown on the profile tab.
            vendor["icq"] = self.get_text(
                response.xpath(
                    "//div[@class='tab-content']//span/b[contains(text(),'ICQ :')]/ancestor::span"
                )).replace("ICQ :", "").strip()
            vendor["jabber"] = self.get_text(
                response.xpath(
                    "//div[@class='tab-content']//span/b[contains(text(),'Jabber :')]/ancestor::span"
                )).replace("Jabber :", "").strip()
            vendor["email"] = self.get_text(
                response.xpath(
                    "//div[@class='tab-content']//span/b[contains(text(),'E-Mail :')]/ancestor::span"
                )).replace("E-Mail :", "").strip()
            vendor["website"] = self.get_text(
                response.xpath(
                    "//div[@class='tab-content']//span/b[contains(text(),'My WebSite :')]/ancestor::span"
                )).replace("My WebSite :", "").strip()
        elif "PGP Public Key" in active_tab:
            vendor["public_pgp_key"] = self.get_text(
                response.xpath("//div[@class='tab-content']//pre"))
        elif "Feedback" in active_tab:
            pass
        else:
            self.logger.warning("Unknown tab: %s at %s" %
                                (active_tab, response.url))
        yield vendor
def parse_user(self, response):
    """Parse a vendor profile page into a User item.

    Scrapes username, PGP key, news, ratings, activity stats and the
    per-section tabs (terms, shipping, news, refund, reship); any
    unrecognized tab content is concatenated into ``profile``.
    Yields the User item, then a follow-up request for the vendor's
    ratings page when a reviews link is present.
    """
    user = items.User()
    user['username'] = self.get_text(
        response.css('section#main .vendor-box h2'))
    user['public_pgp_key'] = self.get_text(
        response.css('.textarea.pgp textarea'))
    news = response.css(
        'section#main .vendor-box .grey-box.formatted div.formatted')
    if news:
        user['news'] = self.get_text(news)
    # News history in /blog/[username], is there a way to collect all news for a vendor?
    ratings = response.css('section#main .vendor-box .rating.stars')
    if ratings:
        ratings_text = self.get_text(ratings)
        # Raw string: '\[' / '\(' / '\d' are invalid escapes in a plain
        # string literal (SyntaxWarning on modern Python).
        match = re.search(r'\[(.*)\]\((\d+) ratings\)', ratings_text)
        if match:
            user['average_rating'] = match.group(1)
            user['feedback_received'] = match.group(2)
    user['last_active'] = self.parse_timestr(
        self.get_text(
            response.css(
                'section#main .vendor-box .corner li:first-child>div:first-child'
            )))
    user['forum_posts'] = self.get_text(
        response.css(
            'section#main .vendor-box .corner li:nth-child(2)>div:first-child'
        ))
    user['subscribers'] = self.get_text(
        response.css(
            'section#main .vendor-box .corner li:last-child>div:first-child'
        ))
    user['relativeurl'] = urlparse(response.url).path
    user['fullurl'] = response.url
    tabs_buttons_list = response.css(
        '.special-tabs input[name="vendor-section"]')
    tabs_list = response.css('.special-tabs .right .contents .formatted')
    if tabs_buttons_list and tabs_list and len(tabs_buttons_list) == len(
            tabs_list):
        profile = []
        # zip() pairs each tab button with its content panel, replacing
        # the original manual index counter.
        for tab_button, tab in zip(tabs_buttons_list, tabs_list):
            section = self.get_text(
                tab_button.css('::attr(id)').extract_first())
            # BUGFIX: 'reship' must be tested BEFORE 'ship' — since
            # "ship" is a substring of "reship", the original order made
            # the reship branch unreachable and reship_policy was never
            # filled in.
            if 'terms' in section:
                user['terms_and_conditions'] = self.get_text(tab)
            elif 'reship' in section:
                user['reship_policy'] = self.get_text(tab)
            elif 'ship' in section:
                user['shipping_information'] = self.get_text(tab)
            elif 'news' in section:
                user['news'] = self.get_text(tab)
            elif 'refund' in section:
                user['refund_policy'] = self.get_text(tab)
            else:
                # Unknown sections are folded into the free-form profile.
                profile.append(self.get_text(tab))
        user['profile'] = "".join(profile)
    yield user
    reviews_url = response.css(
        'section#main .vendor-box .rating.stars a::attr(href)'
    ).extract_first()
    if reviews_url:
        yield self.make_request('user_ratings',
                                url=reviews_url,
                                username=user['username'],
                                priority=5)
def parse_vendor(self, response):
    """Parse a vendor profile page and its feedback section.

    Yields one User item (vendor details from the ``content1`` section)
    and one UserRating item per feedback entry in ``content2``. Buyer
    profiles are logged and skipped. Each item is built inside its own
    try/except so a single bad entry cannot abort the whole page.
    """
    vendor_profile = response.xpath(
        ".//section[@id='content1']//span[contains(text(),'Vendor')]/text()"
    ).extract_first()
    if vendor_profile and vendor_profile.strip() == "Vendor":
        # Yield vendor.
        try:
            vendor = items.User()
            vendor['username'] = response.xpath(
                ".//section[@id='content1']//span[@class='feedbackScore']/../text()"
            ).extract_first(default="").strip()
            vendor['relativeurl'] = self.get_relative_url(response.url)
            vendor['fullurl'] = response.url
            vendor['last_active'] = response.xpath(
                ".//section[@id='content1']//label[contains(text(),'Last Logged')]/following-sibling::span/text()"
            ).extract_first(default="").strip()
            vendor['last_active'] = self.parse_datetime(
                vendor['last_active'])
            vendor['public_pgp_key'] = self.get_text(
                response.xpath(
                    ".//section[@id='content1']//div[@class='bubble']//div[@class='pgp_box']"
                ))
            # Some pages truncate the armor tail ("...BLOCK----"); repair
            # it before normalizing.
            if vendor['public_pgp_key'].endswith("BLOCK----"):
                self.logger.warning(
                    "PGP key is missing a last letter '-' so adding it. Page %s"
                    % response.url)
                vendor['public_pgp_key'] = vendor['public_pgp_key'] + "-"
            vendor['public_pgp_key'] = self.normalize_pgp_key(
                vendor['public_pgp_key'])
            vendor['join_date'] = response.xpath(
                ".//section[@id='content1']//label[contains(text(),'Member Since')]/following-sibling::span/text()"
            ).extract_first(default="").strip()
            vendor['join_date'] = self.parse_datetime(vendor['join_date'])
            vendor['feedback_received'] = response.xpath(
                ".//section[@id='content1']//label[contains(text(),'Feedback Score')]/following-sibling::span/text()"
            ).extract_first(default="").strip()
            vendor['ship_from'] = response.xpath(
                ".//section[@id='content1']//label[contains(text(),'Shipping From')]/following-sibling::span/text()"
            ).extract_first(default="").strip()
            vendor['ship_to'] = response.xpath(
                ".//section[@id='content1']//label[contains(text(),'Shipping To')]/following-sibling::span/text()"
            ).extract_first(default="").strip()
            vendor['profile'] = self.get_text(
                response.xpath(
                    ".//section[@id='content1']//div[@class='bubble']/p"))
            vendor['successful_transactions'] = response.xpath(
                ".//section[@id='content1']//label[contains(text(), 'Sales')]/following-sibling::span/text()"
            ).extract_first(default="").strip()
            # new fields
            vendor['response_time'] = response.xpath(
                ".//section[@id='content1']//label[contains(text(), 'Average Message Response Time')]/following-sibling::span/text()"
            ).extract_first(default="").strip()
            vendor['vacation_mode'] = self.get_text(
                response.xpath(
                    ".//section[@id='content1']//div[@class='row nomargin']//div[@class='col-2']/span[contains(@style,'color')]"
                ))
            # Keep only the leading word characters/whitespace of the
            # vacation banner text.
            vacation_mode_normalized = re.search(r"([\w\s]+)",
                                                 vendor['vacation_mode'],
                                                 re.M | re.I)
            if vacation_mode_normalized:
                vendor['vacation_mode'] = vacation_mode_normalized.group(
                    1).strip()
            yield vendor
        except Exception as error:
            self.logger.warning(
                "Couldn't yield vendor from %s (Error: %s)" %
                (response.url, error))
        # Yield ratings.
        feedbacks = response.xpath(
            ".//section[@id='content2']//div[@class='feedback']")
        if feedbacks:
            for feedback in feedbacks:
                try:
                    rating = items.UserRating()
                    rating['username'] = response.xpath(
                        ".//section[@id='content1']//span[@class='feedbackScore']/../text()"
                    ).extract_first(default="").strip()
                    if rating['username'] is None or len(
                            rating['username']) < 2:
                        inspect_response(response, self)
                    ads_id = feedback.xpath(
                        "div[@class='feedback_header']/a/@href"
                    ).extract_first()
                    if ads_id is not None:
                        rating['ads_id'] = self.get_url_param(ads_id, 'lid')
                    rating['submitted_by'] = feedback.xpath(
                        "div[@class='feedback_header']//span[@class='feedbackScore']/../text()"
                    ).extract_first(default="").strip()
                    rating['item_name'] = feedback.xpath(
                        "div[@class='feedback_header']/a/text()"
                    ).extract_first(default="").strip()
                    submitted_on_string = feedback.xpath(
                        "div[@class='feedback_header']/span/text()"
                    ).extract_first(default="").strip()
                    # Private listings nest the real date deeper.
                    if 'Private Listing' in submitted_on_string:
                        submitted_on_string = feedback.xpath(
                            "div[@class='feedback_header']/span/span/span/text()"
                        ).extract_first()
                    rating['submitted_on_string'] = submitted_on_string
                    rating['submitted_on'] = self.parse_datetime(
                        submitted_on_string)
                    rating[
                        'submitted_by_number_transactions'] = self.get_text(
                            feedback.xpath(
                                "div[@class='feedback_header']//span[@class='feedbackScore']/sub"
                            ))
                    rating['submitter_rating'] = self.get_text(
                        feedback.xpath(
                            "div[@class='feedback_header']//span[@class='feedbackScore']/sup"
                        ))
                    rating['comment'] = self.get_text(feedback.xpath("p"))
                    rating['price_usd'] = feedback.xpath(
                        "div[@class='feedback_subheader']/div/span/text()[contains(., 'USD')]"
                    ).extract_first()
                    # BUGFIX: extract_first() has no default here, so it
                    # returns None when no USD price is shown; the
                    # unconditional .replace() raised AttributeError and
                    # the broad except silently dropped the whole rating.
                    if rating['price_usd'] is not None:
                        rating['price_usd'] = rating['price_usd'].replace(
                            "~", "").replace("USD", "").replace(" ", "")
                    rating_star = feedback.xpath(
                        "div[@class='feedback_subheader']//div[contains(@style,'img/star.png')]/@style"
                    ).extract_first(default="")
                    rating_star = re.search(r"width:(\d+)px;height",
                                            rating_star, re.M | re.S)
                    if rating_star:
                        rating_star = float(rating_star.group(1))
                        # Star bar is 120px wide for a full 5-star score.
                        rating['rating'] = rating_star / 120 * 5
                    warning = feedback.xpath(
                        "div[@class='feedback_subheader']/div/span")
                    if warning and len(warning) > 1:
                        rating['warnings'] = self.get_text(warning[0])
                    yield rating
                except Exception as error:
                    self.logger.warning(
                        "Couldn't yield feedbacks from %s (Error: %s)" %
                        (response.url, error))
    else:
        self.logger.warning(
            "Encountered a buyer profile. Skipping page %s. This should NOT happen."
            % response.url)
def parse_userprofile(self, response):
    """Parse a user profile page (and its /info, /pgp, /offers sub-pages).

    Fills a User item from the main property table, then dispatches on
    the URL suffix: 'info' -> profile text, 'pgp' -> public key,
    'offers' -> yields one 'offer' request per listing (with its
    category), otherwise (the bare profile URL) re-requests the three
    sub-pages so their fields accumulate. Finally derives the title
    (Vendor/Buyer) from the transaction counters and yields the item.
    """
    user = items.User()
    user['username'] = self.get_text(''.join(
        response.xpath('.//h2/text()').extract()))
    user['trusted_seller'] = True if 'trusted vendor' in self.get_text(
        response.css('h2')).lower() else False
    user['relativeurl'] = '/profile/%s' % user['username']
    user['fullurl'] = self.make_url(user['relativeurl'])
    ## =========== Main property table ========
    for line in response.xpath('.//h2/../table[1]//tr'):
        key_txt = self.get_text(
            line.xpath('.//td[1]/text()').extract_first()).lower()
        val_cell = line.xpath('.//td[2]')
        if val_cell:
            val_cell = val_cell[0]
        if key_txt in ['buyer-statistics', 'vendor-statistics']:
            # Empty lines
            pass
        elif key_txt == 'last online':
            user['last_active'] = self.parse_timestr(
                self.get_text(val_cell))
        elif key_txt == 'member since':
            user['join_date'] = self.parse_timestr(self.get_text(val_cell))
        elif key_txt == 'completed orders':
            m = re.search(r'(\d+)/(\d+)', self.get_text(val_cell))
            if m:
                user['successful_transactions_as_buyer'] = m.group(2)
        elif key_txt == 'disputes involved as buyer':
            # Not relevant
            pass
        elif key_txt == 'rated orders':
            # Not relevant
            pass
        elif key_txt == 'average rating':
            m = re.search(r'\((\d+(.\d+)?)\)', self.get_text(val_cell))
            if m:
                user['average_rating'] = '%s/5' % m.group(
                    1)  # Score is on 5 stars
        elif key_txt == 'vendor-level':
            exp = self.get_text(val_cell.css('span.badge-primary'))
            user['exp'] = exp.lower().replace('exp', '').strip()
            level = self.get_text(val_cell.css('span.badge-success'))
            m = re.search(r'level (\d+)', level, re.IGNORECASE)
            if m:
                user['level'] = m.group(1)
        elif key_txt == 'vendor since':
            # We have "member since"
            pass
        elif 'rating of the last' in key_txt:
            # Not relevant
            pass
        elif 'open/completed orders' in key_txt:
            # BUGFIX: the original read `elif 'open/completed orders':`
            # (missing `in key_txt`) — a non-empty string literal is
            # always truthy, so EVERY unrecognized key landed here and
            # the "New property" warning below was unreachable.
            m = re.search(r'(\d+)/(\d+)', self.get_text(val_cell))
            if m:
                user['successful_transactions'] = m.group(2)
        else:
            self.logger.warning(
                'New property on user profile page : %s at %s' %
                (key_txt, response.url))
    # ================================
    # ===== Main tab =============
    if response.url.endswith('info'):
        user['profile'] = self.get_text(response.css("#tabcontent"))
    elif response.url.endswith('pgp'):
        user['public_pgp_key'] = self.get_text(response.css("#tabcontent"))
    elif response.url.endswith(
            'offers'
    ):  # Find the link plus its category as it need to be passed by parameter
        actual_category = ''
        for line in response.css("#tabcontent table tr"):
            td = line.xpath('./td[1]')
            if len(td.css("a")) > 0:
                for url in td.css('a::attr(href)').extract():
                    yield self.make_request('offer',
                                            url=url,
                                            category=actual_category)
            else:
                # Category header row: remember it for the listing rows
                # that follow.
                actual_category = '/'.join([
                    self.get_text(x) for x in td.xpath('.//text()').extract()
                ])
    # We reload the same page with additional data. Fields will add on each other.
    elif response.url.endswith(user['username']):
        yield self.make_request('userprofile', url='%s/offers' % response.url)
        yield self.make_request('userprofile', url='%s/pgp' % response.url)
        yield self.make_request('userprofile', url='%s/info' % response.url)
    # ===============================
    # Some vendors also buys. We know by the amount of transaction.
    # They can be Vendor, Buyer, Vendor/Buyer titles
    titles = []
    if 'successful_transactions' in user and user[
            'successful_transactions'] not in ['', ' ', '0']:
        titles.append('Vendor')
    if 'successful_transactions_as_buyer' in user and user[
            'successful_transactions_as_buyer'] not in ['', ' ', '0']:
        titles.append('Buyer')
    if len(titles) > 0:
        user['title'] = '/'.join(titles)
    yield user