示例#1
0
def main(_):
    """Build description/attribute vocabularies and a value set from JSON inputs.

    Every file listed in ``FLAGS.inputs`` is loaded as JSON and iterated as a
    collection of products. Each product must provide ``clean_text`` and
    ``clean_title`` (whitespace-tokenised into description words) and
    ``specs`` (a mapping of attribute -> value).

    Filtering, in order:
      1. values seen fewer than ``FLAGS.min_value`` times for their attribute
         are dropped;
      2. attributes left with at most one distinct value are dropped;
      3. if ``FLAGS.max_attr`` is not None, only that many most frequent
         attributes are kept in the attribute vocabulary;
      4. description words seen fewer than ``FLAGS.min_desc`` times are
         dropped.

    The results are written to ``desc.txt``, ``attr.txt``, ``value.txt`` and
    ``stats.txt`` inside ``FLAGS.output_dir`` (created if missing).

    Args:
        _: Unused positional argument (presumably the argv handed over by the
            flags/app runner -- confirm against the caller).
    """
    if not os.path.exists(FLAGS.output_dir):
        print('Creating directory: %s' % FLAGS.output_dir)
        # makedirs (rather than mkdir) so a nested output path whose parents
        # do not exist yet is created instead of raising.
        os.makedirs(FLAGS.output_dir)

    desc_counter = Counter()
    attr_counter = Counter()
    partial_counts = defaultdict(Counter)

    print('Processing data...')
    n = len(FLAGS.inputs)
    # Count from 1 so the progress line reads "File 1 of n" ... "File n of n".
    for i, fname in enumerate(FLAGS.inputs, 1):
        print('File %i of %i: %s' % (i, n, fname))
        with open(fname, 'r') as f:
            data = json.load(f)
        for product in data:
            desc = product['clean_text'].split() + \
                product['clean_title'].split()
            desc_counter.update(desc)
            for attr, value in product['specs'].items():
                attr_counter[attr] += 1
                partial_counts[attr][value] += 1

    # Filter values: keep only values seen at least FLAGS.min_value times.
    partial_counts = {
        attr: {value: count for value, count in value_counts.items()
               if count >= FLAGS.min_value}
        for attr, value_counts in partial_counts.items()
    }

    # Remove singular attributes, i.e. those left with <= 1 distinct value.
    singular = {attr for attr, value_counts in partial_counts.items()
                if len(value_counts) <= 1}
    attr_counter = Counter({attr: count for attr, count in attr_counter.items()
                            if attr not in singular})
    partial_counts = {attr: value_counts
                      for attr, value_counts in partial_counts.items()
                      if attr not in singular}

    # Filter attrs: keep the FLAGS.max_attr most frequent ones. The result is
    # wrapped in a Counter so Vocab.build_from_counter receives the same type
    # whether or not this filter is active.
    # NOTE(review): partial_counts is not restricted to the attributes kept
    # here, so the value set may cover attributes absent from the attribute
    # vocabulary -- confirm this is intended.
    if FLAGS.max_attr is not None:
        attr_counter = Counter(dict(attr_counter.most_common(FLAGS.max_attr)))

    # Filter desc: keep only words seen at least FLAGS.min_desc times.
    desc_counter = Counter({word: count for word, count in desc_counter.items()
                            if count >= FLAGS.min_desc})
    desc_vocab = Vocab.build_from_counter(desc_counter)
    attr_vocab = Vocab.build_from_counter(attr_counter)
    value_set = ValueSet.build_from_partial_counts(partial_counts)

    print('Writing to disk...')
    desc_fname = os.path.join(FLAGS.output_dir, 'desc.txt')
    with open(desc_fname, 'w') as f:
        desc_vocab.write(f)
    attr_fname = os.path.join(FLAGS.output_dir, 'attr.txt')
    with open(attr_fname, 'w') as f:
        attr_vocab.write(f)
    value_fname = os.path.join(FLAGS.output_dir, 'value.txt')
    with open(value_fname, 'w') as f:
        value_set.write(f)
    stats_fname = os.path.join(FLAGS.output_dir, 'stats.txt')
    with open(stats_fname, 'w') as f:
        f.write('num_attrs: %i\n' % len(attr_vocab))
        f.write('num_vals: %i\n' % len(value_set))
        f.write('num_words: %i\n' % len(desc_vocab))

    print('Done')