def parse_threadlisting(self, response):
    """Parse a board listing page: yield one Thread item per row and,
    when the author link carries a data-flair attribute, a User item too.
    """
    # Each <li> of the zebra-striped list is one thread row.
    topics = response.css('ul.row.big-list.zebra > li')
    for topic in topics:
        threaditem = items.Thread()
        threaditem['title'] = self.get_text(
            topic.css("div.main > div > a"))
        href = topic.css("div.main > div > a::attr(href)").extract_first()
        # NOTE(review): extract_first() can return None here, in which case
        # get_relative_url(None) is called — confirm the helper tolerates it.
        threaditem['relativeurl'] = self.get_relative_url(href)
        if href != "":
            threaditem['fullurl'] = self.make_url(href)
            threadid = self.get_thread_id(href)
            threaditem['threadid'] = threadid
        threaditem['author_username'] = topic.css(
            "div.main > div > span a::text").extract_first("").strip()
        replies = self.get_text(
            topic.css("div.main > div > span strong:last-child"))
        # Non-numeric reply text (missing/placeholder cell) is coerced to 0.
        if re.match(r'^\d+$', replies) is None:
            replies = 0
        threaditem['replies'] = replies
        yield threaditem
        # The data-flair attribute is only present when author info exists.
        flair = topic.css(
            "div.main > div > span a::attr(data-flair)"
        ).extract_first()
        if flair is not None:
            user = items.User()
            user["username"] = topic.css(
                "div.main > div > span a::text").extract_first("").strip()
            user["flair"] = flair.strip()
            user['fullurl'] = topic.css(
                "div.main > div > span a::attr(href)").extract_first("").strip()
            user["relativeurl"] = self.get_relative_url(user['fullurl'])
            yield user
def parse_user(self, response):
    """Parse a phpBB "view profile" form into a User item.

    Fix: the `username:` branch had been corrupted (masked tokens
    `'username:'******'username']`) and was a syntax error; reconstructed
    as the standard key/assignment used by the sibling parsers.
    """
    # self.logger.info("Yielding profile from %s" % response.url)
    user = items.User()
    user['relativeurl'] = urlparse(response.url).path
    user['fullurl'] = response.url
    # Profile data is a <dl> of dt (label) / dd (value) pairs.
    dts = response.css("form#viewprofile dl dt")
    for dt in dts:
        key = self.get_text(dt).lower()
        value = dt.xpath('following-sibling::dd[1]')
        ddtext = self.get_text(value)
        if key == 'username:':
            user['username'] = ddtext
        elif key == 'groups:':
            user['group'] = value.css('*::text').extract_first()
        elif key == 'joined:':
            user['joined_on'] = self.parse_datetime(ddtext)
        elif key == 'last active:':
            user['last_post'] = self.parse_datetime(ddtext)
        elif key == 'total posts:':
            # Value looks like "123 | ..." — keep only the leading number.
            m = re.match(r"^(\d+).+", ddtext)
            if m:
                user['post_count'] = m.group(1)
        elif key == 'pgpkey:':
            user['signature'] = ddtext
    yield user
def parse_thread(self, response):
    """Parse a MyBB thread page: yield a Message and a User per post.

    Fixes over the original:
    - the six message fields common to registered and guest posters were
      duplicated in both branches; they are extracted once now,
    - postid used str.lstrip('post_'), which strips any of the characters
      p/o/s/t/_ rather than the literal prefix; replaced by slicing,
    - the 'tid\\=' pattern is now a raw string with a plain '='.
    """
    posts = response.xpath('.//div[@class="post "]')
    for post in posts:
        messageitem = items.Message()
        # An "Unregistered" span marks guests; guests that still expose a
        # member link are "special" and treated as registered.
        guest_user = len(post.xpath('.//span[contains(text(), "Unregistered")]')) > 0
        special_user = guest_user and post.xpath('.//div[@class="author_information"]/strong/span/a[contains(@href, "member")]//text()').extract_first() is not None
        if not guest_user or special_user:
            messageitem['author_username'] = post.xpath('.//div[@class="author_information"]//a[contains(@href, "member")]//text()').extract_first()
            if messageitem['author_username'] is None:
                # Fallback selector for themes nesting the link deeper.
                messageitem['author_username'] = post.xpath('.//div[@class="author_information"]/strong/span/a[contains(@href, "member")]//text()').extract_first()
        else:
            messageitem['author_username'] = post.xpath('div/div/strong/span/text()').extract_first()
        # Fields identical for both poster kinds (deduplicated).
        raw_postid = post.xpath('@id').extract_first()
        # Strip the literal "post_" prefix, not a character set.
        if raw_postid.startswith('post_'):
            raw_postid = raw_postid[len('post_'):]
        messageitem['postid'] = raw_postid
        messageitem['threadid'] = re.search(r'tid=([0-9]+)', response.url).group(1)
        msg = post.xpath('.//div[contains(@class, "post_body")]')
        messageitem['contenttext'] = self.get_text(msg)
        messageitem['contenthtml'] = self.get_text(msg.extract_first())
        # Post date handling.
        posted_on = post.xpath('.//span[@class="post_date"]/text()').extract_first()
        messageitem['posted_on'] = self.parse_datetime(posted_on)
        if messageitem['author_username'] is None:
            self.logger.warning("Author username is still None at URL: %s. Can't yield item." % response.url)
        yield messageitem
        # Yield user.
        useritem = items.User()
        if not guest_user or special_user:
            useritem['username'] = messageitem['author_username']
            useritem['fullurl'] = post.xpath('.//div[@class="author_information"]//span[@class="largetext"]/a/@href').extract_first()
            useritem['relativeurl'] = useritem['fullurl'].split('.onion')[1]
            useritem['title'] = post.xpath('.//div[@class="author_information"]//span[@class="smalltext"]/text()[1]').extract_first().strip()
            message_count = post.xpath('.//div[@class="author_statistics"]/text()[2]').extract_first()
            useritem['message_count'] = int(re.sub('[^0-9]', '', message_count))
            post_count = post.xpath('.//div[@class="author_statistics"]/text()[3]').extract_first()
            useritem['post_count'] = int(re.sub('[^0-9]', '', post_count))
            # "Registrato:" label is Italian ("Registered:") on this board.
            useritem['joined_on'] = self.parse_datetime(post.xpath('.//div[@class="author_statistics"]/text()[4]').extract_first().replace("Registrato: ", ''))
            useritem['reputation'] = post.xpath('.//strong[contains(@class, "reputation")]/text()').extract_first()
            useritem['username_id'] = re.search('([0-9]+)', useritem['relativeurl']).group(1)
            useritem['membergroup'] = post.xpath('.//img[not(@class="buddy_status")]/@title').extract_first()
        else:
            # Unregistered users: synthesize URLs from the username.
            useritem['username'] = messageitem['author_username']
            useritem['fullurl'] = self.spider_settings['endpoint'] + "/" + useritem['username']
            useritem['relativeurl'] = useritem['username']
            useritem['title'] = post.xpath('.//div[@class="author_information"]//span[@class="smalltext"]/text()[1]').extract_first().strip()
        yield useritem
def parse_userprofile(self, response):
    """Yield a User item from a XenForo-style profile page.

    Private/unavailable profiles only get the basics carried in the
    request meta; full profiles are scraped from the dt/dd info list.
    """
    user = items.User()
    user['relativeurl'] = response.meta['relativeurl']
    user['fullurl'] = response.url
    limited = (self.is_private_userprofile(response) is True
               or self.is_unavailable_userprofile(response) is True)
    if limited:
        self.logger.warning(
            "Encountered a limited/private/banned profile at %s. Basic info filled using meta-keys." % response.url)
        user['username'] = response.meta['username']
        yield user
        return
    user['username'] = self.get_text(response.css("h1.username"))
    user['title'] = self.get_text(response.css("span.userTitle"))
    user['signature'] = self.get_text(response.css("div.signature"))
    # dt labels that copy straight into an item field.
    field_for = {
        'messages:': 'message_count',
        'likes received:': 'likes_received',
        'home page:': 'website',
        'gender:': 'gender',
        'location:': 'location',
        'last activity:': 'last_activity',
        'email': 'email',
        'trophy points:': 'trophy_points',
        'birthday:': 'birthday',
        'occupation:': 'occupation',
    }
    for dt in response.css("#content .mast dl dt"):
        key = self.get_text(dt).lower()
        ddtext = self.get_text(dt.xpath('following-sibling::dd[1]'))
        if key == 'joined:':
            user['joined_on'] = self.parse_datetime(ddtext)
        elif key in field_for:
            user[field_for[key]] = ddtext
        elif key in ('avatar', 'pm'):
            pass
        else:
            self.logger.warning(
                'New information found on use profile page: "%s"' % key)
    yield user
def parse_userprofile(self, response):
    """Parse an SMF-style profile summary into a User item.

    Fix: the two date branches used bare `except:` clauses, which also
    swallow SystemExit/KeyboardInterrupt; narrowed to `except Exception`.
    The best-effort fallback (keep the raw string) is preserved.
    """
    user = items.User()
    user['username'] = self.get_text(response.css("#basicinfo .username h4::text").extract_first())
    user['relativeurl'] = response.meta['relativeurl']
    user['fullurl'] = response.url
    user['membergroup'] = self.get_text(response.css("#basicinfo .username h4 span.position"))
    user['icq'] = self.extract_icq(response.css("#basicinfo a.icq"))
    user['msn'] = self.extract_msn(response.css("#basicinfo a.msn"))
    signature = self.get_text(response.css("#detailedinfo .signature"))
    # The block is prefixed with a literal "Signature:" label — drop it.
    user['signature'] = self.get_text(re.sub("^Signature:", "", signature))
    dts = response.css("#detailedinfo .content dl dt")
    for dt in dts:
        key = self.get_text(dt).lower().rstrip(':')
        ddtext = self.get_text(dt.xpath('following-sibling::dd[1]'))
        if key == 'posts':
            # "123 (4.5 per day)" or a bare count.
            m = re.search(r'(\d+)\s*\((.+) per day\)', ddtext)
            if m:
                user['post_count'] = m.group(1)
                user['post_per_day'] = m.group(2)
            else:
                user['post_count'] = ddtext
        elif key == 'karma':
            user['karma'] = ddtext
        elif key == 'age':
            user['age'] = ddtext
        elif key == 'position 1':
            user['group'] = ddtext
        elif key == 'gender':
            user['gender'] = ddtext
        elif key == 'personal text':
            user['personal_text'] = ddtext
        elif key == 'date registered':
            try:
                user['joined_on'] = self.parse_timestr(ddtext)
            except Exception:
                # Keep the raw string when parsing fails (was bare except).
                user['joined_on'] = ddtext
        elif key == 'last active':
            try:
                user['last_active'] = self.parse_timestr(ddtext)
            except Exception:
                user['last_active'] = ddtext
        elif key == 'location':
            user['location'] = ddtext
        elif key == 'custom title':
            user['custom_title'] = ddtext
        elif key in ['local time']:
            pass
        else:
            self.logger.warning('New information found on user profile page : %s. (%s)' % (key, response.url))
    yield user
def parse_userprofile(self, response):
    """Parse a FluxBB-style profile page, skipping the crawler's own account."""
    myprofile_username = self.get_text(response.css(".blockform h2"))
    if self.login['username'].lower() in myprofile_username.lower():
        self.logger.info("Skipping my own profile")
        return
    user = items.User()
    user['relativeurl'] = response.meta['relativeurl']
    user['fullurl'] = response.url
    # Labels whose dd text copies straight into an item field.
    direct = {
        'username': 'username',
        'title': 'title',
        'signature': 'signature',
        'location': 'location',
        'jabber': 'jabber',
        'icq': 'icq',
        'real name': 'realname',
        'microsoft account': 'microsoft_account',
        'yahoo! messenger': 'yahoo_messenger',
        'website': 'website',
    }
    for dt in response.css("#viewprofile dl dt"):
        key = self.get_text(dt).lower()
        ddtext = self.get_text(dt.xpath('following-sibling::dd[1]'))
        if key in direct:
            user[direct[key]] = ddtext
        elif key == 'registered':
            user['joined_on'] = self.parse_timestr(ddtext)
        elif key == 'last post':
            user['last_post'] = self.parse_timestr(ddtext)
        elif key == 'posts':
            m = re.match("^(\d+).+", ddtext)
            if m:
                user['post_count'] = m.group(1)
        elif key in ('avatar', 'email', 'pm', 'contacts'):
            pass
        else:
            self.logger.warning('New information found on use profile page : %s (%s)' % (key, response.url))
    yield user
def parse_user(self, response):
    """Parse a phpBB profile page into a User item.

    Fixes: the `username:` branch had been corrupted (masked tokens
    `'username:'******'username']`, a syntax error) and is reconstructed;
    the deleted-user check evaluated the same xpath three times — it is
    now extracted once.
    """
    first_paragraph = response.xpath('.//div[@class="inner"]/p/text()').extract_first()
    if first_paragraph and "The requested user does not exist." in first_paragraph:
        self.logger.warning(
            'User profile not available. Likely deleted: "%s"' % response.url)
        return
    user = items.User()
    user['relativeurl'] = response.url.replace(
        'http://satri4bb5r56y253.onion/.', '')
    user['fullurl'] = response.url
    dts = response.css("#viewprofile dl dt")
    for dt in dts:
        key = self.get_text(dt).lower()
        ddtext = self.get_text(dt.xpath('following-sibling::dd[1]'))
        if key == 'username:':
            user['username'] = ddtext
        elif key == 'total posts:':
            # "123 | ..." — keep only the count before the pipe.
            user['message_count'] = ddtext.split('|')[0].strip()
        elif key == 'joined:':
            user['joined_on'] = self.parse_timestr(ddtext)
        elif key == 'last active:':
            user['last_activity'] = self.parse_timestr(ddtext)
        elif key == 'rank:':
            user['rank'] = ddtext
        elif key == 'groups:':
            # Groups are rendered as a <select>; join all options.
            ddtext1 = '|'.join(
                dt.xpath('following-sibling::dd[1]/select/option/text()').extract())
            user['membergroup'] = ddtext1
        elif key == 'most active forum:':
            pass  # user['most active forum'] = ddtext
        elif key == 'most active topic:':
            pass  # user['most active topic'] = ddtext
        elif key == '':
            pass
        else:
            self.logger.warning(
                'New information found on use profile page : "%s"' % key)
    yield user
def parse_userprofile(self, response):
    """Parse a XenForo member page into a User item.

    Fixes: the user-id extraction used a bare `except: pass` around two
    regexes that can return None — replaced with explicit None checks;
    the info-block loop's bare silent except is narrowed to Exception
    and now logs, keeping the best-effort behavior.
    """
    if response.status == 403:
        # Unauthorized profile. Happens for private profiles.
        return
    content = response.css(".profilePage")
    if not content:
        return
    content = content[0]
    useritem = items.User()
    useritem['username'] = self.get_text_first(content.css(".username"))
    urlparsed = urlparse(response.url)
    useritem['relativeurl'] = "%s?%s" % (urlparsed.path, urlparsed.query)
    useritem['fullurl'] = response.url
    useritem['title'] = self.get_text_first(content.css(".userTitle"))
    useritem['banner'] = self.get_text_first(content.css(".userBanner"))
    # Best-effort numeric id from ".../members/<name>.<id>" style URLs.
    m = re.match('members/([^/]+)', urlparse(response.url).query.strip('/'))
    if m:
        m2 = re.match(r"(.+\.)?(\d+)$", m.group(1))
        if m2:
            useritem['user_id'] = m2.group(2)
    infos = content.css(".infoBlock dl")
    for info in infos:
        name = info.css('dt::text').extract_first().strip()
        try:
            if name == 'Last Activity:':
                useritem['last_activity'] = self.read_datetime_div(
                    info.css('dd .DateTime'))
            elif name == 'Joined:':
                useritem['joined_on'] = self.read_datetime_div(
                    info.css('dd'))
            elif name == 'Messages:':
                numberstr = self.get_text_first(info.css('dd'))
                useritem['message_count'] = int(numberstr.replace(',', ''))
            elif name == 'Likes Received:':
                numberstr = self.get_text_first(info.css('dd'))
                useritem['likes_received'] = int(numberstr.replace(',', ''))
        except Exception:
            # Best-effort per field, but leave a trace (was a silent pass).
            self.logger.warning(
                "Could not parse profile field %r at %s" % (name, response.url))
    yield useritem
def parse_message(self, response):
    """Yield a Message and a User item for each post of a PunBB topic page."""
    topic_id = self.get_url_param(response.url, 'id')
    for block in response.css("#brdmain .blockpost"):
        # --- Message item -------------------------------------------------
        message = items.Message()
        body = block.xpath(".//div[@class='postmsg']")
        message['contenthtml'] = body.extract_first()
        message['contenttext'] = self.get_text(body)
        message['postid'] = self.get_url_param(
            block.css("h2 span a::attr(href)").extract_first(), 'pid')
        message['threadid'] = topic_id
        message['author_username'] = self.get_text(
            block.css(".postleft dl dt strong span"))
        message['posted_on'] = self.parse_timestr(
            self.get_text(block.css("h2 span a")))
        yield message
        # --- User item ----------------------------------------------------
        author = items.User()
        author['username'] = self.get_text(
            block.css(".postleft dl dt strong span"))
        group_sel = block.css(".postleft dd.usertitle")
        if len(group_sel) > 0:
            author['membergroup'] = self.get_text(group_sel)
        site_sel = block.css(
            ".postleft dd.usercontacts span.website a::attr(href)")
        if len(site_sel) > 0:
            author['website'] = self.get_text(site_sel)
        # Remaining <dd> entries hold "Label: value" pairs (EN or FR).
        for dd in block.css(".postleft dd"):
            if dd.css("span::attr(class)"):
                continue
            pair = re.search('(.+): (.+)', self.get_text(dd.css("span")))
            if not pair:
                continue
            label, value = pair.group(1), pair.group(2)
            if 'From' in label or 'Lieu' in label:
                author['location'] = value
            elif 'Posts' in label or 'Messages' in label:
                author['post_count'] = value
            elif 'Registered' in label or 'Inscription' in label:
                author['joined_on'] = self.parse_timestr(value)
            else:
                self.logger.warning('New information found : %s' % label)
        yield author
def parse_user(self, response):
    """Extract a User item from a FluxBB "view profile" page."""
    user = items.User()
    user['relativeurl'] = self.get_relative_url(response.url)
    user['fullurl'] = response.url
    # dt labels that map one-to-one onto item fields.
    simple = {
        'username': 'username',
        'title': 'title',
        'signature': 'signature',
        'location': 'location',
        'jabber': 'jabber',
        'icq': 'icq',
        'real name': 'realname',
        'microsoft account': 'microsoft_account',
        'yahoo! messenger': 'yahoo_messenger',
        'website': 'website',
        'email': 'email',
    }
    for label in response.css("#viewprofile dl dt"):
        key = self.get_text(label).lower()
        ddtext = self.get_text(label.xpath('following-sibling::dd[1]'))
        if key in simple:
            user[simple[key]] = ddtext
        elif key == 'registered':
            user['joined_on'] = self.parse_timestr(ddtext)
        elif key == 'last post':
            user['last_post'] = self.parse_timestr(ddtext)
        elif key == 'posts':
            count = re.match(r"^(\d+).+", ddtext)
            if count:
                user['post_count'] = count.group(1)
        elif key in ('avatar', 'pm'):
            pass
        else:
            self.logger.warning(
                'New information found on use profile page : "%s"' % key)
    yield user
def parse_user(self, response):
    """Build a User item from a market-style member page (ratings, PGP key,
    activity list)."""
    user = items.User()
    user['relativeurl'] = self.get_relative_url(response.url)
    user['fullurl'] = response.url
    user['username'] = self.get_text(response.css("div.main-infos h2"))
    if user["username"] == "":
        self.logger.warning("Couldn't get username at %s. Field empty." % response.url)
    # Ratings render like "[4.5] (12 ratings)"; users without ratings give "".
    rating_text = self.get_text(response.css("div.rating.stars"))
    if rating_text != "":
        avg_match = re.search(r"\[([\d\.]+)\]", rating_text, re.M | re.I)
        if avg_match is not None:
            user["average_rating"] = avg_match.group(1).strip()
        count_match = re.search(r"\(([\d]+)[\s]rating", rating_text, re.M | re.I)
        if count_match is not None:
            user["rating_count"] = count_match.group(1).strip()
    user["membergroup"] = self.get_text(response.css("div.main-infos p"))
    activity_rows = response.css("div.corner ul.zebra.big-list li")
    pgp_text = self.get_text(response.css("div.right div.contents label.textarea textarea"))
    if pgp_text != "":
        user["pgp_key"] = self.normalize_pgp_key(pgp_text)
    for row in activity_rows:
        key = self.get_text(row.css("div.main div span"))
        value = self.get_text(row.css("div.aux div span"))
        if key == "":
            self.logger.warning("Key is ''. Value is %s at URL %s" % (value, response.url))
        if key == "Last Seen":
            user["last_activity"] = self.parse_timestr(value)
        elif key == "Forum Posts":
            user["post_count"] = value
        elif key == "Followers":
            user["followers"] = value
        else:
            self.logger.warning('New information found on use profile page: "{}", {}'.format(key, response.url))
    yield user
def get_user_item_from_postwrapper(self, postwrapper, response):
    """Build and return a User item from an SMF post's poster sidebar."""
    user = items.User()
    poster_header = postwrapper.css(".poster h4")
    # Profile link, skipping the "send PM" action link.
    profile_anchor = poster_header.xpath(
        ".//a[not(contains(@href, 'action=pm'))]")
    user['username'] = self.get_text(poster_header)
    user['relativeurl'] = profile_anchor.xpath("@href").extract_first()
    user['fullurl'] = self.make_url(user['relativeurl'])
    details = postwrapper.css(".poster ul")
    user['postgroup'] = self.get_text(details.css("li.postgroup"))
    user['membergroup'] = self.get_text(details.css("li.membergroup"))
    count_match = re.search('(\d+)', self.get_text(details.css("li.postcount")))
    if count_match:
        user['post_count'] = count_match.group(1)
    user['karma'] = self.get_text(details.css("li.karma"))
    # Rank is conveyed by the number of star images.
    user['stars'] = str(len(details.css("li.stars img")))
    return user
def get_user_item_from_postwrapper(self, postwrapper, response):
    """Build and return a User item from a post wrapper; guest posters with
    no profile link get synthetic URL fields.

    Fix: removed a leftover debugging hook — `inspect_response(response,
    self)` fired whenever the scraped username was "1" and dropped the
    crawl into an interactive shell — plus dead commented-out code and
    the resulting unused local.
    """
    useritem = items.User()
    useritem['username'] = self.get_text(postwrapper.css(".poster h4"))
    extrainfo = postwrapper.css(".poster ul")
    useritem['postgroup'] = self.get_text(extrainfo.css("li.postgroup"))
    useritem['membergroup'] = self.get_text(
        extrainfo.css("li.membergroup"))
    m = re.search('(\d+)', self.get_text(extrainfo.css("li.postcount")))
    if m:
        useritem['post_count'] = m.group(1)
    useritem['karma'] = self.get_text(extrainfo.css("li.karma"))
    # Rank is conveyed by the number of star images.
    useritem['stars'] = str(len(extrainfo.css("li.stars img")))
    relativeurl = postwrapper.xpath(
        './/h4/a[contains(@title, "View the profile")]/@href'
    ).extract_first()
    if relativeurl is None:
        self.logger.warning(
            "No relative URL could be generated from %s. User: %s Userinfo: %s"
            % (response.url, useritem['username'], self.get_text(
                postwrapper.xpath(".//div[@class='poster']/ul"))))
        self.logger.warning(
            "Because no relative URL could be generated, URL-value fields are set as GUEST NAME and ENDPOINT + GUEST NAME."
        )
        # Fallback: synthesize URL fields from the guest's display name.
        useritem['relativeurl'] = useritem['username']
        useritem['fullurl'] = self.spider_settings['endpoint'] + useritem[
            'username']
    else:
        useritem['relativeurl'] = relativeurl
        useritem['fullurl'] = self.make_url(useritem['relativeurl'])
    return useritem
def parse_user(self, response):
    """Parse a MyBB member page into a User item.

    Fixes: in the Reputation branch the same `get_text(...)` expression
    was evaluated twice with the first result discarded — now computed
    once; the posts-per-day warning string (garbled by a broken line
    wrap) is reconstructed as a single-line message.
    """
    # self.logger.info("Yielding profile from %s" % response.url)
    user = items.User()
    user['relativeurl'] = urlparse(response.url).path + "?" + urlparse(
        response.url).query
    user['fullurl'] = response.url
    user_info_td = response.xpath("//fieldset[not(@id)]/table//td[1]")
    user['username'] = self.get_text(
        user_info_td.xpath(".//span[@class='largetext']/strong"))
    if user["username"] == "":
        self.logger.warning("Could not get username. %s %s" %
                            (response.url, response.body))
        # return
    # Rating is rendered as one star image per point.
    user["rating_count"] = len(
        user_info_td.xpath(".//span[@class='smalltext']/img"))
    text_html = self.get_text(
        user_info_td.xpath(".//span[@class='smalltext']"))
    try:
        user['membergroup'] = re.search(r"\((.*)\)Registration Date",
                                        text_html,
                                        re.M | re.I | re.S).group(1).strip()
    except Exception as e:
        self.logger.warning("membergroup error %s with value %s" %
                            (response.url, e))
    birthday_str = ""
    try:
        birthday_str = re.search("Date of Birth:(.*)Local Time", text_html,
                                 re.M | re.I | re.S).group(1).strip()
    except Exception as e:
        self.logger.warning("birthday error %s value %s" %
                            (response.url, e))
    if birthday_str == "Not Specified":
        birthday_str = ""
    user['birthday'] = birthday_str
    # Forum statistics table: label cell + value cell per row.
    forum_info_list = response.xpath(
        "//fieldset[not(@id)]/following-sibling::table[1]//table[1]//tr")
    for tr_item in forum_info_list:
        key = self.get_text(
            tr_item.xpath("td[not(@class='thead')]/strong"))
        value = self.get_text(tr_item.xpath("td[2]"))
        if key == "":
            continue
        if key == "Last Visit:":
            # Drop the trailing "(... ago)" annotation.
            user["last_activity"] = self.parse_datetime(
                value.split("(")[0])
        elif key == "Total Posts:":
            user["post_count"] = value.split(" (")[0].strip()
            try:
                user["post_per_day"] = re.search(r"\((.*)posts per day",
                                                 value, re.M | re.I
                                                 | re.S).group(1).strip()
            except Exception:
                self.logger.warning(
                    "Couldn't get posts per day. Please verify at %s" %
                    response.url)
        elif key == "Joined:":
            user['joined_on'] = self.parse_datetime(value)
        elif "Reputation:" in key:
            # Computed once (the original evaluated this twice, discarding
            # the first result).
            user['reputation'] = self.get_text(
                tr_item.xpath(
                    ".//strong[contains(@class, 'reputation_')]"))
        elif "Sex:" == key:
            if value != "Undisclosed":
                user["gender"] = value
        elif "Location:" == key:
            user["location"] = value
        elif "Bio" in key:
            pass
        elif "Total Threads" in key:
            pass
        elif "Public PGP Key:" in key:
            pass
        elif "Time Spent Online" in key:
            pass
        elif "Warning Level:" in key:
            pass
        else:
            self.logger.warning(
                'New information found on use profile page: "{}", {}'.
                format(key, response.url))
    yield user
def parse_user(self, response):
    """Parse a MyBB profile page into a User item.

    The whole body is wrapped in a broad try/except so one malformed
    profile does not abort the crawl; failures are logged with the URL.
    """
    # self.logger.info("Yielding profile from %s" % response.url)
    try:
        useritem = items.User()
        useritem['username'] = self.get_text(
            response.css("fieldset span.largetext span"))
        # Fallback selector for themes where the name is not nested.
        if useritem['username'] == '' or useritem['username'] is None:
            useritem['username'] = self.get_text(
                response.xpath('.//span[@class="largetext"]'))
        useritem['relativeurl'] = urlparse(response.url).path
        useritem['fullurl'] = response.url
        useritem['username_id'] = self.get_url_param(response.url, 'uid')
        # Profile details are two-column table rows (label / value).
        trs = response.css("#content div.wrapper table.tborder tr")
        for tr in trs:
            if (len(tr.css('td')) == 2):
                key = self.get_text(
                    tr.css('td:first-child strong')).lower()
                content = self.get_text(
                    tr.xpath('.//td[last()]/text()').extract_first())
                if key == 'joined:':
                    useritem['joined_on'] = self.parse_timestr(
                        content, response)
                elif key == 'last visit:':
                    useritem['last_active'] = self.parse_timestr(
                        content, response)
                elif key == 'total posts:':
                    # Count may use a thousands separator, e.g. "1.234".
                    match = re.match('(\d+\.?\d*)', content)
                    if match:
                        useritem['message_count'] = match.group(1)
                    else:
                        self.logger.warning(
                            "Couldn't get user's total number of posts at %s"
                            % response.url)
                elif key == 'total threads:':
                    match = re.match('(\d+\.?\d*)', content)
                    if match:
                        useritem['post_count'] = match.group(1)
                elif key == 'reputation:':
                    useritem['reputation'] = self.get_text(
                        tr.css('.reputation_positive'))
                elif key == 'sex:':
                    useritem['gender'] = content
                elif key == 'warning level:':
                    useritem['warning_level'] = self.get_text(
                        tr.xpath('.//td[last()]'))
                elif key == 'homepage:':
                    useritem['website'] = self.get_text(
                        tr.xpath('.//td[last()]'))
                elif key in [
                        'avatar', 'email:', 'private message:', 'bio:',
                        'time spent online:'
                ]:
                    # Known labels that carry nothing worth scraping.
                    pass
                else:
                    self.logger.warning(
                        'New information found on user profile page: "%s" with value "%s"'
                        % (key, content))
        yield useritem
    except Exception as e:
        self.logger.warning("Cannot parse user item at URL %s because %s" %
                            (response.url, e))
        pass
def parse_thread(self, response):
    """Parse a market-forum thread: yield the opening Message/User, then one
    Message and one User per comment.

    Fixes over the original:
    - removed `inspect_response(response, self)` at the top — a debugging
      leftover that dropped every thread parse into an interactive shell,
    - the stars regex `'[Vendor|Buyer]: ...'` used a character class (one
      char from the set) where an alternation was intended; it matched
      only by accident and is now `(?:Vendor|Buyer)` — the captured
      number (group 1) is unchanged.
    """
    threadid = self.get_id_from_url(response.url)
    # --- Opening post ------------------------------------------------------
    messageitem = items.Message()
    messageitem['threadid'] = threadid
    messageitem['postid'] = "thread" + threadid
    msg = response.xpath('.//div[@class="col-xs-10 alert alert-info whitebg"]')
    messageitem['contenttext'] = self.get_text(msg)
    messageitem['contenthtml'] = self.get_text(msg.extract_first())
    # There are 3 user classes: buyer, vendor and support.
    vendor = response.xpath(".//div[@class='col-xs-12']/small/a/text()").extract_first() is not None
    support = response.xpath(".//div[@class='col-xs-12']/small/b/text()").extract_first() == 'Support'
    buyer = vendor is False and support is False
    if buyer is True:
        # Buyers appear as plain "... by <name>" text.
        author_username = response.xpath(".//div[@class='col-xs-12']/small/text()").extract_first().strip()
        author_username = re.search('by (.*)$', author_username).group(1)
        messageitem['author_username'] = author_username
        membergroup = "Buyer"
    elif support is True:
        author_username = response.xpath(".//div[@class='col-xs-12']/small/b/text()").extract_first().strip()
        messageitem['author_username'] = author_username
        membergroup = "Support"
    elif vendor is True:
        author_username = response.xpath(".//div[@class='col-xs-12']/small/a/text()").extract_first()
        messageitem['author_username'] = author_username
        membergroup = "Vendor"
    else:
        self.logger.warning('Unknown member group at %s' % response.url)
    # Get info about the post ("<n> <unit> ago by <name>").
    postinfo = self.get_text(response.xpath(".//div[@class='col-xs-12']/small"))
    if postinfo:
        matches = re.search(r'(\d+) (.+) ago by ([^ ]+)', postinfo)
        messageitem['posted_on'] = self.parse_datetime(matches.group(0))
    else:
        self.logger.warning("No postinfo yielded at %s" % response.url)
    yield messageitem
    # --- Opening poster ----------------------------------------------------
    user = items.User()
    user['username'] = author_username
    user['membergroup'] = membergroup
    if membergroup in ["Buyer", "Support"]:
        # No profile page for buyers/support: synthesize URL fields.
        user['relativeurl'] = user['username']
        user['fullurl'] = self.spider_settings['endpoint'] + user['username']
    elif membergroup == "Vendor":
        user['relativeurl'] = response.xpath(".//div[@class='col-xs-12']/small/a/@href").extract_first()
        user['fullurl'] = self.spider_settings['endpoint'] + user['relativeurl']
    else:
        self.logger.warning('Unknown member group at %s' % response.url)
    poster_block = response.xpath(".//div[@class='col-xs-12']")
    if membergroup in ['Buyer', 'Vendor']:
        stars = poster_block.xpath('.//span[@class="nowrap btn-xs alert brightBlueBG"]/text()').extract_first()
        if stars:
            stars = re.search(r'(?:Vendor|Buyer): ([0-9]{1,1000})', stars).group(1)
            user['stars'] = stars
        else:
            self.logger.warning('No stars at URL %s' % response.url)
    yield user
    # --- Comments ----------------------------------------------------------
    post = response.css('.row .col-lg-8 > div')
    # Post IDs are not caught by the comment selector; walk them by index.
    reply_index = 0
    msg_ids = post.xpath(".//span[@class='forumMsgOffset']")
    for comment in post.css('div.comment p'):
        messageitem = items.Message()
        messageitem['threadid'] = threadid
        messageitem['postid'] = msg_ids[reply_index].xpath("@id").extract_first()
        reply_index += 1
        post_info = comment.css('small::text').extract_first()
        if post_info:
            matches = re.search(r'(\d+) point([s]*) (.+)', post_info)
            if matches:
                messageitem['posted_on'] = self.parse_timestr(matches.group(3))
        author_name = comment.css('a.vendorname::text').extract_first()
        if not author_name:
            author_name = comment.css('*::text').extract_first()
        messageitem['author_username'] = author_name.strip()
        # First <p> text node is the header; the body is everything after.
        messageitem['contenttext'] = ''.join(comment.css('p::text').extract()[1:])
        messageitem['contenthtml'] = self.get_text(comment.css('p').extract_first())
        yield messageitem
    # --- Comment authors ----------------------------------------------------
    for comment in post.css('div.comment p'):
        useritem = items.User()
        vendor = comment.xpath('.//a[@class="vendorname"]/text()').extract_first() is not None
        buyer = comment.xpath('.//span[@class="left lightGrey"]').extract_first() is not None and self.get_text(comment).startswith('Support') is False
        # NOTE(review): xpath() returns a SelectorList, never None, so the
        # first clause is always true and 'support' reduces to the
        # startswith() test — preserved as-is, flagged for confirmation.
        support = comment.xpath('.//span/b') is not None and self.get_text(comment).startswith('Support') is True
        if vendor is True:
            useritem['username'] = comment.xpath('.//a[@class="vendorname"]/text()').extract_first()
            useritem['relativeurl'] = comment.xpath('.//a[@class="vendorname"]/@href').extract_first()
            useritem['fullurl'] = self.spider_settings['endpoint'] + useritem['relativeurl']
            membergroup = "Vendor"
            useritem['stars'] = comment.xpath('.//span[@class="nowrap btn-xs alert brightBlueBG"]/text()').extract_first().replace('Vendor: ', '')
        elif support is True:
            username = self.get_text(comment)
            username = re.search('^(Support)[0-9]{1,100} ', username).group(1)
            useritem['username'] = username
            useritem['relativeurl'] = useritem['username']
            useritem['fullurl'] = self.spider_settings['endpoint'] + useritem['username']
            membergroup = "Support"
        elif buyer is True:
            username = self.get_text(comment)
            username = re.search('^(.*?) Buyer', username).group(1)
            useritem['username'] = username
            useritem['relativeurl'] = useritem['username']
            useritem['fullurl'] = self.spider_settings['endpoint'] + useritem['username']
            membergroup = "Buyer"
            useritem['stars'] = comment.xpath('.//span[@class="nowrap btn-xs alert brightBlueBG"]/text()').extract_first().replace('Buyer: ', '')
        else:
            # NOTE(review): membergroup keeps its previous value here; the
            # yielded item may carry a stale group — confirm intent.
            self.logger.warning("Unknown commenter group at %s" % response.url)
        useritem['membergroup'] = membergroup
        yield useritem
def parse_message(self, response):
    """Parse an SMF topic page: first yield every Message, then every
    poster as a User item (two passes over the same post list).
    """
    #self.logger.info("Yielding messages from %s" % response.url)
    # Thread id is the part of the 'topic' parameter before the dot.
    threadid = self.get_url_param(response.url, 'topic').split(".")[0]
    # Posts alternate between the two zebra classes.
    posts = response.css("#forumposts div.windowbg") + response.css(
        "#forumposts div.windowbg2")
    for post in posts:
        messageitem = items.Message()
        # Post time sits between guillemets: "« ... on: <time> »".
        posttime = self.parse_timestr(
            re.search("«.*on:(.*?)»",
                      self.get_text(post.css("div.keyinfo div.smalltext")),
                      re.S | re.M).group(1).strip())
        author_username = post.xpath(".//h4/a/text()").extract_first()
        if author_username is not None:
            # Verified posters.
            messageitem['author_username'] = author_username.strip()
        elif post.xpath(".//h4/text()").extract_first() is not None:
            # Guests: name is bare text, not a link.
            messageitem['author_username'] = post.xpath(
                ".//h4/text()").extract_first().strip()
        else:
            self.logger.warning('Unknown problem yielding user at URL %s' %
                                response.url)
        messageitem['postid'] = post.css(
            "div.post div.inner::attr(id)").extract_first().replace(
                "msg_", "")
        messageitem['threadid'] = threadid
        messageitem['posted_on'] = posttime
        msg = post.css("div.post")
        messageitem['contenttext'] = self.get_text(msg)
        messageitem['contenthtml'] = self.get_text(msg.extract_first())
        yield messageitem
    for post in posts:
        useritem = items.User()
        username = post.xpath(".//h4/a/text()").extract_first()
        if username is not None:
            # Verified posters.
            useritem['username'] = username.strip()
            useritem["relativeurl"] = self.get_relative_url(
                post.css(".poster h4 a::attr(href)").extract_first())
            useritem["fullurl"] = self.make_url(
                post.css(".poster h4 a::attr(href)").extract_first())
        elif post.xpath(".//h4/text()").extract_first() is not None:
            # Guests: synthesize URL fields from the display name.
            useritem['username'] = post.xpath(
                ".//h4/text()").extract_first().strip()
            useritem["relativeurl"] = useritem['username']
            useritem["fullurl"] = self.spider_settings[
                'endpoint'] + useritem['username']
        else:
            self.logger.warning('Unknown problem yielding user at URL %s' %
                                response.url)
        # Sidebar <li> elements are keyed by their CSS class.
        for li in post.xpath(".//ul/li"):
            key = li.xpath(".//@class").extract_first()
            keytext = li.xpath(".//text()").extract_first()
            if key == "postgroup":
                useritem['postgroup'] = keytext
            elif key == "membergroup":
                useritem['membergroup'] = keytext
            elif key == 'karma':
                useritem['karma'] = keytext.replace('Karma: ', '')
            elif key == 'title':
                useritem['title'] = keytext
            elif key == 'stars':
                useritem['stars'] = keytext
            elif key == 'postcount':
                useritem['post_count'] = keytext.replace('Posts: ', '')
            elif key == 'custom':
                awards = li.xpath(".//text()").extract()
                useritem['awards'] = '|'.join(awards).replace(
                    'Awards: |', '')
            elif key is None or key in [
                    'blurb', 'avatar', 'profile', 'new_win', 'quote',
                    'quote_button'
            ]:
                # Presentation-only elements: nothing to scrape.
                pass
            else:
                self.logger.warning(
                    "Unknown key in user profile '%s' with value '%s'" %
                    (key, keytext))
        yield useritem
def parse_user(self, response):
    """Parse an SMF profile summary page into a User item; fields whose
    value is the "N/A" placeholder are skipped entirely.
    """
    user = items.User()
    # Endpoint and the ";area=summary" suffix are stripped from the URL.
    user['relativeurl'] = response.url.replace("http://avengersdutyk3xf.onion", "").replace(";area=summary", "")
    user['username'] = self.get_text(response.css("#basicinfo .username h4::text").extract_first())
    user['fullurl'] = response.url.replace(";area=summary", "")
    user['membergroup'] = self.get_text(response.css("#basicinfo .username h4 span.position"))
    checkEmail = response.xpath('//li[@class="custom_field"]/text()').extract_first()
    if checkEmail:
        user['email'] = checkEmail
    checkSignature = response.xpath('//div[@class="signature"]/h5/following-sibling::text()').extract_first()
    if checkSignature:
        user['signature'] = checkSignature
    dts = response.css("#detailedinfo .content dl dt")
    for dt in dts:
        key = self.get_text(dt).lower().rstrip(':')
        ddtext = self.get_text(dt.xpath('following-sibling::dd[1]'))
        if key == 'posts':
            # "123 (4.5 per day)" or a bare count.
            m = re.search('(\d+)\s*\((.+) per day\)', ddtext)
            if m:
                if "N/A" not in m.group(1):
                    user['post_count'] = m.group(1)
                if "N/A" not in m.group(2):
                    user['post_per_day'] = m.group(2)
            else:
                if "N/A" not in ddtext:
                    user['post_count'] = ddtext
        elif key == 'karma':
            if "N/A" not in ddtext:
                user['karma'] = ddtext
        elif key == 'age':
            if "N/A" not in ddtext:
                user['age'] = ddtext
        elif key == 'position 1':
            if "N/A" not in ddtext:
                user['group'] = ddtext
        elif key == 'gender':
            if "N/A" not in ddtext:
                user['gender'] = ddtext
        elif key == 'personal text':
            if "N/A" not in ddtext:
                user['personal_text'] = ddtext
        elif key == 'date registered':
            if "N/A" not in ddtext:
                user['joined_on'] = self.parse_timestr(ddtext)
        elif key == 'last active':
            if "N/A" not in ddtext:
                user['last_active'] = self.parse_timestr(ddtext)
        elif key == 'location':
            if "N/A" not in ddtext:
                user['location'] = ddtext
        elif key == 'custom title':
            if "N/A" not in ddtext:
                user['custom_title'] = ddtext
        elif key == 'pgp':
            if "just ask me" not in ddtext:
                # NOTE(review): 'normlaize_pgp_key' looks like a typo of
                # 'normalize_pgp_key' (used elsewhere in this project) —
                # confirm which name the helper is actually defined under.
                user['pgp_key'] = self.normlaize_pgp_key(ddtext)
        elif key == 'email':
            if "N/A" not in ddtext:
                user['email'] = ddtext
        elif key in ['local time']:
            pass
        else:
            self.logger.warning('New information found on use profile page : %s. (%s)' % (key, response.url))
    yield user