# Example #1
0
def factor_categories():
    """Run the category-factoring pipeline and return the resulting tree.

    Executes the steps below in order, printing a progress message before
    each one: load ``categories.json``, build the category listing, load
    all products, recount and roll up product counts, prune three
    categories by id, tally / roll up / summarize attributes, eliminate
    singletons, coalesce by Jaccard using ``JACCARD_THRESHHOLD``, build the
    tree, then strip down a deep copy of it.

    Relies on names defined outside this function: ``j`` (json), ``s``
    (survey), ``JACCARD_THRESHHOLD`` and ``c``.
    NOTE(review): ``c`` is presumably the ``copy`` module -- confirm it is
    imported somewhere in this file.

    Returns:
        A ``(tree, cats)`` tuple: the stripped-down deep copy of the tree
        built by ``s.make_tree`` and the processed categories (the
        listing's ``'phonebook'`` entry after all pipeline steps).
    """
    # print(...) with a single string argument produces the same output on
    # Python 2 and Python 3, unlike the bare print statement.
    print('load cats')
    # Context manager so the handle is closed even if JSON parsing fails.
    with open('categories.json', 'r') as cats_file:
        cats = j.load(cats_file)

    print('make categories phonebook')
    aggregator = s.make_category_listing(cats)
    cats = aggregator['phonebook']

    print('load products')
    prods = s.read_all_products()

    print('fix product counts')
    # Return values are not used here, so these presumably update cats in place.
    s.count_products(cats, prods)
    s.roll_up_counts(cats)

    print('prune food, money, and baby stuff')
    for category_id in ('28985', '33546', '34458'):
        s.prune_category(category_id, cats)

    print('tally attributes')
    cats = s.tally_attributes(cats, prods)

    print('roll up attributes')
    cats = s.roll_up_attributes(cats)

    print('summarize_attributes')
    cats = s.summarize_attributes(cats)

    print('eliminate singletons')
    cats = s.eliminate_singletons(cats)

    print('coalesce by jaccard')
    s.coalesce_by_jaccard(cats, JACCARD_THRESHHOLD)

    print('make tree')
    tree = s.make_tree(cats)

    print('simplify tree')
    # Strip a deep copy so the tree returned by make_tree is left untouched.
    tree = c.deepcopy(tree)
    s.strip_down_tree(tree)

    return tree, cats
import survey as s
import codecs
import json as j
import re


if __name__ == "__main__":
    # NOTE(review): this guard executes before extract_all_equivalence is
    # defined further down the file, so running the module as a script
    # raises NameError on the call below. The guard most likely belongs at
    # the very end of the file.
    # Context manager so the categories file is closed deterministically.
    with open("categories_flat.json") as cats_file:
        cats = j.load(cats_file)
    prods = s.read_all_products()
    extract_all_equivalence(cats, prods)


def extract_all_equivalence(cats, prods, write_folder=""):
    """Write one equivalence text CSV per franchise.

    For every franchise returned by ``s.get_franchises(cats)``, selects the
    entries of ``cats`` whose ``"franchise"`` field equals the franchise's
    ``"id"`` and passes them to ``extract_equivalence``. The output file is
    named after the first whitespace-separated word of the franchise's
    ``"singularName"`` plus ``"_eq_text.csv"``.

    Args:
        cats: Mapping of category key -> category dict; every value must
            have a ``"franchise"`` entry.
        prods: Product data, forwarded unchanged to ``extract_equivalence``.
        write_folder: Output directory prefix. A trailing ``"/"`` is added
            when missing. An empty string is left empty, so the file path
            stays relative to the current working directory.
    """
    # An empty folder must stay empty: turning "" into "/" would send the
    # output to the filesystem root, because extract_equivalence builds the
    # path as write_folder + fname. Normalised once, outside the loop.
    if write_folder and not write_folder.endswith("/"):
        write_folder += "/"

    for franch in s.get_franchises(cats):
        fname = franch["singularName"].split()[0] + "_eq_text.csv"
        franch_id = franch["id"]
        # Keep only the categories that belong to this franchise.
        filt_cats = {
            key: cat for key, cat in cats.items() if cat["franchise"] == franch_id
        }
        extract_equivalence(
            filt_cats,
            prods,
            fname,
            write_folder=write_folder,
            headings_in_file=False,
        )


# Build a list of rows that will be entries in the db
def extract_equivalence(cats, prods, fname, write_folder="", headings_in_file=False):

    # By default, we'll index equivalence classes
    USE_FIELD_TYPE = "equivalence_class"

    fh = codecs.open(write_folder + fname, "w", "utf-8")
    is_first = True