示例#1
0
def read_review_data(filename):
    """Load reviews from a JSON file into a list of ``Review`` objects.

    The parsed document is indexed with ``'reviews'`` and each entry found
    there is indexed with ``'rating'``, ``'date'`` and ``'review_text'``;
    those values are stored on a new ``Review`` as ``rating``, ``date`` and
    ``text`` respectively.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        A list of ``Review`` instances, in the order the entries appear
        under ``'reviews'``.

    Raises:
        KeyError: If ``'reviews'`` or any of the per-entry keys is missing.
    """
    def _build(entry):
        # Copy the three JSON fields onto a fresh Review instance.
        item = Review()
        item.rating = entry['rating']
        item.date = entry['date']
        item.text = entry['review_text']
        return item

    with open(filename) as handle:
        payload = json.load(handle)

    return [_build(entry) for entry in payload['reviews']]
    # NOTE(review): the enclosing `def` header and the setup of `paths`,
    # `data_dir`, `db` and `count` are not visible in this fragment.
    #
    # Keep only the entries to parse: skip any name containing 'index.html'
    # or '.arff', and the entry named exactly "urls".
    paths = [relpath for relpath in paths if \
        relpath.find('index.html') == -1 and \
        relpath != "urls" and relpath.find('.arff') == -1]

    for relpath in paths:
        # Plain string concatenation: presumably data_dir already ends with a
        # path separator -- TODO confirm.
        path = data_dir + relpath
        print path
        # NOTE(review): this sits outside the try below, so a failure to open
        # the file is not caught; the file object is never explicitly closed.
        soup = BeautifulSoup(open(path))
        try:
            # All review metadata is looked up inside <ul class="review-meta">.
            # If any lookup returns None, the chained call raises and the
            # except branch below reports the file as unparsable.
            meta = soup.find('ul', {"class": "review-meta"})
            rev = Review()
            # The id is the first run of digits in the relative path.
            rev.id = int(re.search('\d+', relpath).group(0))
            rev.artist = meta.find('h1').find('a').get_text()
            rev.album = meta.find('h2').get_text()
            str_date = meta.find('span', {"class": "pub-date"}).get_text()
            # Expected format: full month name, day, four-digit year.
            rev.date = datetime.strptime(str_date, '%B %d, %Y')
            rev.score = float(meta.find('span', {"class": "score"}).get_text())
            rev.text = soup.find('div', {"class": "editorial"}).get_text()

            # Store the instance attributes as one record; `db.reviews` is
            # presumably a MongoDB collection -- verify against the setup code.
            db.reviews.insert(rev.__dict__)
            # Running total of successfully inserted reviews.
            count = count + 1
            print count
            #out.write(rev.arff_row())

            #print str(rev)
        except Exception as e:
            # Best-effort import: report the failure and move on to the next file.
            print e
            print 'failed to parse ' + path

#out.close()
    # NOTE(review): the enclosing `def` header and the setup of `paths`,
    # `data_dir`, `db` and `count` are not visible in this fragment.
    #
    # Keep only the entries to parse: skip any name containing 'index.html'
    # or '.arff', and the entry named exactly "urls".
    paths = [relpath for relpath in paths if \
        relpath.find('index.html') == -1 and \
        relpath != "urls" and relpath.find('.arff') == -1]

    for relpath in paths:
        # Plain string concatenation: presumably data_dir already ends with a
        # path separator -- TODO confirm.
        path = data_dir + relpath
        print path
        # NOTE(review): this sits outside the try below, so a failure to open
        # the file is not caught; the file object is never explicitly closed.
        soup = BeautifulSoup(open(path))
        try:
            # All review metadata is looked up inside <ul class="review-meta">.
            # If any lookup returns None, the chained call raises and the
            # except branch below reports the file as unparsable.
            meta = soup.find('ul', {"class":"review-meta"})
            rev = Review()
            # The id is the first run of digits in the relative path.
            rev.id = int(re.search('\d+', relpath).group(0))
            rev.artist = meta.find('h1').find('a').get_text()
            rev.album = meta.find('h2').get_text()
            str_date = meta.find('span', {"class":"pub-date"}).get_text()
            # Expected format: full month name, day, four-digit year.
            rev.date = datetime.strptime(str_date, '%B %d, %Y')
            rev.score = float(meta.find('span', {"class":"score"}).get_text())
            rev.text = soup.find('div', {"class":"editorial"}).get_text()

            # Store the instance attributes as one record; `db.reviews` is
            # presumably a MongoDB collection -- verify against the setup code.
            db.reviews.insert(rev.__dict__)
            # Running total of successfully inserted reviews.
            count = count + 1
            print count 
            #out.write(rev.arff_row())

            #print str(rev)
        except Exception as e:
            # Best-effort import: report the failure and move on to the next file.
            print e
            print 'failed to parse ' + path

#out.close()