Пример #1
0
def parseComments(data):
    """
    Parse reviews from the page lines and append them to the global ``comments``.

    The region to scan is obtained from ``commentsStartStopLineNmbr(data)``,
    whose result is used as an inclusive ``(first, last)`` pair of indexes
    into ``data``. Every line in that region containing the review start
    marker opens a new ``Comment``; the following lines are scanned until
    the review end marker and fill in ``stars``, ``header``, ``helpful``
    and ``comment``. The result of ``getonelinecomment()`` is appended to
    ``comments``.

    Args:
        data: Indexable, mutable sequence of strings (presumably the lines
            of the downloaded page). The line holding a review body is
            overwritten in place with its tag-stripped text.

    Notes:
        A review whose end marker is missing is scanned up to the end of
        ``data`` and still recorded with whatever fields were found,
        instead of failing with ``IndexError``.
    """
    global comments
    reviewBegins = '<div style="margin-left:0.5em;">'
    reviewEnds = '<div style="padding-top: 10px; clear: both; width: 100%;">'
    stars_line = 'margin-right:5px;'
    # NOTE(review): the "." is unescaped and therefore matches any character;
    # kept as-is so that exactly the same lines match as before.
    stars = re.compile(r'\d+.\d+ out of 5 stars')
    header_line = '<span style="vertical-align:middle;"'
    helpful_line = 'people found the following review helpful'
    helpful = re.compile(r'\d+ of \d+ people found the following review helpful')
    reviewText = '<span class="h3color tiny">'  # Actual review
    whitespace = re.compile(r'\s+')

    total = len(data)
    boundaries = commentsStartStopLineNmbr(data)
    # Never look past the last available line, even if the reported
    # boundary is larger than the data.
    last = min(boundaries[1], total - 1)

    for i in range(boundaries[0], last + 1):
        if reviewBegins not in data[i]:
            continue

        curcomment = Comment()
        # Separate cursor: the review is scanned forward without touching
        # the outer loop index, and may run past ``last`` until the end
        # marker (or the end of ``data``) is reached.
        cursor = i
        while cursor < total and reviewEnds not in data[cursor]:
            line = data[cursor]
            # Parse stars
            if stars_line in line:
                stars_found = stars.search(line)
                if stars_found is not None:
                    curcomment.stars = stars_found.group()
            # Parse header
            elif header_line in line:
                begin = line.find('<b>')
                end = line.find('</b>', begin + 3) if begin != -1 else -1
                # Only take the header when both tags are really there;
                # otherwise the -1 from find() would slice garbage.
                if end != -1:
                    curcomment.header = line[begin + 3:end]
            # Parse helpfulness
            elif helpful_line in line:
                # Thousands separators would break the \d+ groups.
                helpful_found = helpful.search(line.replace(",", ""))
                if helpful_found is not None:
                    curcomment.helpful = helpful_found.group()
            # Parse body text
            elif reviewText in line:
                # The body sits three lines below the marker, or six when
                # an extra '<span class="small"' block is in between.
                cursor += 3
                if cursor < total and '<span class="small"' in data[cursor]:
                    cursor += 3
                if cursor >= total:
                    break  # truncated page: no body line to read
                data[cursor] = stripHtmlTags(data[cursor])
                curcomment.comment = whitespace.sub(" ", data[cursor])
            cursor += 1
        comments.append(curcomment.getonelinecomment())