def parseComments(data):
    """
    Parse comments from site

    Scans the lines of ``data`` between the indices returned by
    ``commentsStartStopLineNmbr(data)`` (both ends inclusive). Every line
    containing the review start marker opens a new ``Comment``; the lines
    that follow are examined until the review end marker is found, filling
    in ``stars``, ``header``, ``helpful`` and ``comment`` along the way.
    The result of ``getonelinecomment()`` for each review is appended to
    the module-level ``comments`` collection.

    Args:
        data: Indexable, mutable sequence of page lines (strings).
            NOTE: the line holding a review body is overwritten in place
            with its tag-stripped text, as the original implementation did.

    Returns:
        None. Results are delivered through ``comments``.
    """
    review_begins = '<div style="margin-left:0.5em;">'
    review_ends = '<div style="padding-top: 10px; clear: both; width: 100%;">'
    stars_line = 'margin-right:5px;'
    # Raw strings keep the patterns identical while avoiding invalid
    # escape sequences such as '\d' in ordinary string literals.
    stars = re.compile(r'\d+.\d+ out of 5 stars')
    header_line = '<span style="vertical-align:middle;"'
    helpful_line = 'people found the following review helpful'
    helpful = re.compile(
        r'\d+ of \d+ people found the following review helpful')
    review_text = '<span class="h3color tiny">'  # Actual review
    whitespace = re.compile(r'\s+')

    total = len(data)
    boundaries = commentsStartStopLineNmbr(data)
    for start in range(boundaries[0], boundaries[1] + 1):
        if review_begins not in data[start]:
            continue

        curcomment = Comment()
        # Separate cursor: advancing it never affects the outer loop, which
        # still visits every line in the boundaries range.
        pos = start
        # The bounds check stops a review without an end marker from
        # running off the end of the data (previously an IndexError).
        while pos < total and review_ends not in data[pos]:
            line = data[pos]
            if stars_line in line:
                # Parse stars
                stars_found = stars.search(line)
                if stars_found is not None:
                    curcomment.stars = stars_found.group()
            elif header_line in line:
                # Parse header: the text between the first <b> and </b>.
                begin = line.find('<b>') + 3
                end = line.find('</b>')
                curcomment.header = line[begin:end]
            elif helpful_line in line:
                # Parse helpfulness; commas are removed first so that
                # counts written with thousands separators still match.
                helpful_found = helpful.search(line.replace(",", ""))
                if helpful_found is not None:
                    curcomment.helpful = helpful_found.group()
            elif review_text in line:
                # Parse body text, expected three lines below the marker.
                pos += 3
                # Yep, dirty trick :( -- when that line holds a "small"
                # span instead, the body is three lines further down
                # (presumably an optional extra block; verify against the
                # page markup).
                if pos < total and '<span class="small"' in data[pos]:
                    pos += 3
                if pos < total:
                    data[pos] = stripHtmlTags(data[pos])
                    curcomment.comment = whitespace.sub(" ", data[pos])
            pos += 1
        comments.append(curcomment.getonelinecomment())