def printDF(filename='df.txt'):
    """Serialise the result of ``df()`` as JSON into ``OUT_DIR + filename``.

    ``OUT_DIR`` is created through ``checkMkdir`` before writing.

    Args:
        filename: Name of the output file; it is appended directly to
            ``OUT_DIR`` (no path separator is inserted).
    """
    import json

    wc = df()
    checkMkdir(OUT_DIR)
    # The context manager guarantees the handle is closed even when
    # json.dump raises (e.g. on a value that is not JSON-serialisable).
    with open(OUT_DIR + filename, 'w') as out:
        json.dump(wc, out)
def products2text(products, category):
    """Write the reviews of every product to its own UTF-8 text file.

    Files are created in ``PLAIN_DIR + category + '/'`` (the directory is
    prepared with ``checkMkdir``). Each file is named after
    ``unicode(prod)`` with ``'/'`` removed and spaces replaced by ``'_'``,
    followed by ``'.txt'``, and holds the product's reviews joined by
    newlines.

    Args:
        products: Iterable of product objects exposing ``getReviews``.
        category: Category name used as the sub-directory of ``PLAIN_DIR``.
    """
    dirname = PLAIN_DIR + category + '/'
    checkMkdir(dirname)
    for prod in products:
        # NOTE(review): htmlStyle=True is requested although the output is
        # a plain-text file -- kept as in the original; confirm intent.
        content = prod.getReviews(htmlStyle=True)
        filename = unicode(prod).replace('/', '').replace(' ', '_') + '.txt'
        # ``with`` closes the file even if joining or encoding the reviews
        # fails part-way, so one bad product no longer leaks a handle.
        with codecs.open(dirname + filename, 'w', 'utf-8') as fout:
            print >>fout, '\n'.join(content)
def outDF(filename='df.db'):
    """Persist the mapping returned by ``df()`` to a shelve database.

    Three things happen, in order:

    1. The number of entries is printed to stdout.
    2. ``pp_str(wc.keys())`` is written to
       ``OUT_DIR + filename + 'keys.txt'``.
    3. Every ``key -> value`` pair is stored in the shelf at
       ``OUT_DIR + filename``, with the key encoded as UTF-8 bytes.

    Args:
        filename: Base name of the shelf inside ``OUT_DIR``; the key
            listing uses the same name with ``'keys.txt'`` appended.
    """
    wc = df()
    # Parenthesised single argument: prints the same text under the
    # Python 2 print statement.
    print(len(wc))
    checkMkdir(OUT_DIR)

    with open(OUT_DIR + filename + 'keys.txt', 'w') as out:
        print >>out, pp_str(wc.keys())

    d = shelve.open(OUT_DIR + filename)
    # A Python 2 Shelf is not a context manager, so close it explicitly in
    # a ``finally`` to make sure it is closed if a key fails to encode.
    try:
        for k, v in wc.iteritems():
            d[k.encode('utf-8')] = v
    finally:
        d.close()
def outNgram(n, filesuffix='gram.db'):
    """Persist the mapping returned by ``ngram(n)`` to a shelve database.

    The shelf is named ``str(n) + filesuffix`` inside ``OUT_DIR``. The
    number of entries is printed to stdout, ``pp_str(wc.keys())`` is
    written to ``<shelf name>keys.txt``, and every entry is stored in the
    shelf under its key's items joined with ``'-'`` and encoded as UTF-8.

    Args:
        n: Value passed to ``ngram`` and used as the file-name prefix.
        filesuffix: Suffix appended to ``str(n)`` to form the shelf name.
    """
    filename = str(n) + filesuffix
    wc = ngram(n)
    # Parenthesised single argument: prints the same text under the
    # Python 2 print statement.
    print(len(wc))
    checkMkdir(OUT_DIR)

    with open(OUT_DIR + filename + 'keys.txt', 'w') as out:
        print >>out, pp_str(wc.keys())

    d = shelve.open(OUT_DIR + filename)
    # A Python 2 Shelf is not a context manager, so close it explicitly in
    # a ``finally`` to make sure it is closed if a key fails to join/encode.
    try:
        for k, v in wc.iteritems():
            d[u'-'.join(k).encode('utf-8')] = v
    finally:
        d.close()
def createRank(minReviewCount=0, minProductCount=0, onlyValidCategory=False):
    """Write one ranking file per category into ``RANK_DIR``.

    For every ``(category, products)`` pair yielded by
    ``iterAllProducts(minReviewCount)`` a UTF-8 file named after the
    category is written. Its first line is the number of products; each
    following line is ``unicode(prod)``, a tab, and the product's
    ``'CategoryName'`` value.

    Args:
        minReviewCount: Forwarded unchanged to ``iterAllProducts``.
        minProductCount: Categories with fewer products are skipped.
        onlyValidCategory: When true, a category is skipped entirely as
            soon as one of its products has a ``'CategoryName'`` whose
            last ``'>'``-separated component differs from the category.
    """
    checkMkdir(RANK_DIR)
    for category, products in iterAllProducts(minReviewCount):
        if len(products) < minProductCount:
            continue

        ranking = []
        for prod in products:
            prodCategory = prod['CategoryName']
            if onlyValidCategory and category != prodCategory.split('>')[-1]:
                # One mismatching product rejects the whole category: the
                # ``break`` bypasses the ``else`` clause that writes it.
                break
            ranking.append(unicode(prod) + '\t' + prodCategory)
        else:
            # Reached only when the loop finished without ``break``.
            # ``with`` closes the file even if the write fails.
            with codecs.open(RANK_DIR + category, 'w', 'utf-8') as fout:
                print >>fout, str(len(products)) + '\n' + '\n'.join(ranking)
def products2html(products, category=None, maxReview=None):
    """Render each product's reviews into its own UTF-8 HTML file.

    Files go to ``HTML_DIR``, or to ``HTML_DIR + category + '/'`` when a
    category is given. Each review is rendered by ``getHtmlContent`` and
    wrapped in its own ``<div class="review">``; the page is finished by
    ``wrapHtml``. The file name is ``unicode(prod)`` with ``'/'`` removed
    and spaces replaced by ``'_'``, followed by ``'.html'``.

    Args:
        products: Iterable of product objects exposing ``getReviews``.
        category: Optional sub-directory name below ``HTML_DIR``.
        maxReview: Forwarded to ``getReviews`` as ``max``.
    """
    # ``needRoot`` is passed to wrapHtml exactly when the files live in a
    # category sub-directory.
    needRoot = category is not None
    dirname = HTML_DIR
    if needRoot:
        dirname += category + '/'
    checkMkdir(dirname)

    separator = '</div>\n\n<div class="review">'
    for prod in products:
        reviews = prod.getReviews(max=maxReview, htmlStyle=True)
        # Equivalent to appending piece by piece: an opening div, the
        # rendered reviews separated by "close div / open div", and a final
        # closing div (an empty review list yields one empty div).
        rendered = [getHtmlContent(review, i)
                    for i, review in enumerate(reviews)]
        content = '<div class="review">' + separator.join(rendered) + '</div>'

        filename = unicode(prod).replace('/', '').replace(' ', '_') + '.html'
        # ``with`` closes the file even if wrapHtml or the write raises.
        with codecs.open(dirname + filename, 'w', 'utf-8') as fout:
            print >>fout, wrapHtml(content, needRoot=needRoot)