示例#1
0
    # Save a map of biz ID to categories.  For every business, the results of
    # categories.get_yelp_roots_for_cat() over its listed categories are
    # merged into a set (dropping duplicates) and stored as a list, since a
    # set cannot be serialized to JSON directly.
    biz_id_to_categories = {}
    # Files are opened through context managers so each handle is closed
    # deterministically, even if parsing or serialization raises.
    with open(business_in_path, "rb") as business_in_file:
        businesses = simplejson.load(business_in_file)
    for business in businesses:
        roots = set()
        for cat in business["categories"]:
            roots.update(categories.get_yelp_roots_for_cat(cat))
        biz_id_to_categories[business["id"]] = list(roots)

    # NOTE(review): the "c" in the "wbc" mode is not a portable mode
    # character (Python 3's open() rejects it) -- confirm it is intended.
    with open(biz_to_cats_out_path, "wbc") as biz_to_cats_out_file:
        simplejson.dump(biz_id_to_categories, biz_to_cats_out_file, indent=2)

    print >>sys.stderr, "Saved biz to category map to %s" % biz_to_cats_out_path

    # Save the map of categories to biz counts.  A business with no
    # categories is tallied under the "__uncategorized" key.
    category_to_biz_count = defaultdict(int)
    for cats in biz_id_to_categories.itervalues():
        if not cats:
            category_to_biz_count["__uncategorized"] += 1
        else:
            for cat in cats:
                category_to_biz_count[cat] += 1

    with open(cat_to_biz_count_out_path, "wbc") as cat_to_biz_count_out_file:
        simplejson.dump(category_to_biz_count, cat_to_biz_count_out_file, indent=2)

    print >>sys.stderr, "Saved category to biz count to %s" % cat_to_biz_count_out_path

    # Now build the review dump: one encode_document() result per line.
    # The output file is opened first, matching the original ordering.
    with open(review_dump_out_path, "wbc") as reviews_out_file:
        with open(reviews_in_path, "rb") as reviews_in_file:
            reviews = simplejson.load(reviews_in_file)
        for review in reviews:
            print >> reviews_out_file, encode_document(review["biz_id"], review["text"])
    print >>sys.stderr, "Saved review dump to %s" % review_dump_out_path
示例#2
0
                    open(biz_to_cats_out_path, 'wbc'),
                    indent=2)
   
    print >>sys.stderr, "Saved biz to category map to %s" % biz_to_cats_out_path
    
    
    
    # Save the map of categories to biz counts.  A business with no
    # categories is tallied under the '__uncategorized' key.
    category_to_biz_count = defaultdict(int)
    for cats in biz_id_to_categories.itervalues():
        if not cats:
            category_to_biz_count['__uncategorized'] += 1
        else:
            for cat in cats:
                category_to_biz_count[cat] += 1

    # Open the output through a context manager so the handle is closed
    # (and its contents flushed) deterministically, even if dump() raises.
    # NOTE(review): the 'c' in the 'wbc' mode is not a portable mode
    # character (Python 3's open() rejects it) -- confirm it is intended.
    with open(cat_to_biz_count_out_path, 'wbc') as cat_to_biz_count_out_file:
        simplejson.dump(category_to_biz_count,
                        cat_to_biz_count_out_file,
                        indent=2)

    print >>sys.stderr, "Saved category to biz count to %s" % cat_to_biz_count_out_path

    # Now build the review dump: one encode_document() result per line.
    # The output file is opened first, matching the original ordering; the
    # input file is closed as soon as its JSON has been fully loaded.
    with open(review_dump_out_path, 'wbc') as reviews_out_file:
        with open(reviews_in_path, 'rb') as reviews_in_file:
            reviews = simplejson.load(reviews_in_file)
        for review in reviews:
            print >>reviews_out_file, \
                    encode_document(review['biz_id'],
                                    review['text'])
    print >>sys.stderr, "Saved review dump to %s" % review_dump_out_path
示例#3
0
    # Write the product -> categories map as indented JSON.  The file is
    # opened through a context manager so the handle is closed (and its
    # contents flushed) deterministically, even if dump() raises.
    # NOTE(review): the 'c' in the 'wbc' mode is not a portable mode
    # character (Python 3's open() rejects it) -- confirm it is intended.
    with open(product_to_cats_out_path, 'wbc') as product_to_cats_out_file:
        simplejson.dump(product_id_to_categories,
                        product_to_cats_out_file,
                        indent=2)

    print >>sys.stderr, "Saved product to category map to %s" % product_to_cats_out_path

    # Save the map of categories to product counts.  A product with no
    # categories is tallied under the '__uncategorized' key.
    category_to_product_count = defaultdict(int)
    for cats in product_id_to_categories.itervalues():
        if not cats:
            category_to_product_count['__uncategorized'] += 1
        else:
            for cat in cats:
                category_to_product_count[cat] += 1

    with open(cat_to_doc_count_out_path, 'wbc') as cat_to_doc_count_out_file:
        simplejson.dump(category_to_product_count,
                        cat_to_doc_count_out_file,
                        indent=2)

    print >>sys.stderr, "Saved category to product count to %s" % cat_to_doc_count_out_path

    # Now build the review dump: one encode_document() result per line.
    # csv.DictReader pulls rows lazily, so the input file has to stay open
    # for the whole loop.  The output file is still opened first, matching
    # the original ordering.
    with open(review_dump_out_path, 'wbc') as reviews_out_file:
        with open(reviews_in_path, 'rb') as reviews_in_file:
            for review in csv.DictReader(reviews_in_file):
                print >>reviews_out_file, \
                        encode_document(review['product_id'],
                                        review['text'])

    print >>sys.stderr, "Saved review dump to %s" % review_dump_out_path
示例#4
0
                categories.get_amazon_roots_for_cat_id(product['category_id'])

    # Write the product -> categories map as indented JSON.  The file is
    # opened through a context manager so the handle is closed (and its
    # contents flushed) deterministically, even if dump() raises.
    # NOTE(review): the 'c' in the 'wbc' mode is not a portable mode
    # character (Python 3's open() rejects it) -- confirm it is intended.
    with open(product_to_cats_out_path, 'wbc') as product_to_cats_out_file:
        simplejson.dump(product_id_to_categories,
                        product_to_cats_out_file,
                        indent=2)

    print >> sys.stderr, "Saved product to category map to %s" % product_to_cats_out_path

    # Save the map of categories to product counts.  A product with no
    # categories is tallied under the '__uncategorized' key.
    category_to_product_count = defaultdict(int)
    for cats in product_id_to_categories.itervalues():
        if not cats:
            category_to_product_count['__uncategorized'] += 1
        else:
            for cat in cats:
                category_to_product_count[cat] += 1

    with open(cat_to_doc_count_out_path, 'wbc') as cat_to_doc_count_out_file:
        simplejson.dump(category_to_product_count,
                        cat_to_doc_count_out_file,
                        indent=2)

    print >> sys.stderr, "Saved category to product count to %s" % cat_to_doc_count_out_path

    # Now build the review dump: one encode_document() result per line.
    # csv.DictReader pulls rows lazily, so the input file has to stay open
    # for the whole loop.  The output file is still opened first, matching
    # the original ordering.
    with open(review_dump_out_path, 'wbc') as reviews_out_file:
        with open(reviews_in_path, 'rb') as reviews_in_file:
            for review in csv.DictReader(reviews_in_file):
                print >>reviews_out_file, \
                        encode_document(review['product_id'],
                                        review['text'])

    print >> sys.stderr, "Saved review dump to %s" % review_dump_out_path