num = 0 print "Finding counts: ", for ky in data_table.keys(): if ((num % 1000) == 0): print "{} ".format(num), sys.stdout.flush() xact = data_table[ky] if ((language_filter != 'none') and (xact['lid_lui'] != language_filter) ): # do topic only for docs that match language filter print('Not processing transaction {}'.format(xact)) continue # Topic normalization topic_norm.normalize_msg(xact, rw_hash, debug) # Get counts xact['counts'] = mt.get_counts(xact['msg_topic']) num += 1 print # Write out counts to a file and perform topic clustering fn_counts = dir_temp + fn_table + '.{}.counts.txt'.format(num_topics) if (os.path.exists(fn_counts)): os.remove(fn_counts) mt.write_counts_file(data_table, fn_counts) # Run topic clustering binary fn_feat = dir_temp + fn_table + '.{}.feat.txt'.format(num_topics) fn_model = dir_temp + fn_table + '.{}.plsa'.format(num_topics) cmd = '{}/bin/plsa_estimation_combined_file -vector_list_in {} '.format(dir_topic, fn_counts) + \ '-stop_list_in {} '.format(stop_list) + \
# Normalization and counts rw_hash = mt.create_utf8_rewrite_hash() num = 0 print "Finding counts: ", for ky in data_table.keys(): if ((num % 1000)==0): print "{} ".format(num), sys.stdout.flush() xact = data_table[ky] if ((language_filter != 'none') and (xact['lid_lui'] != language_filter)): # do topic only for docs that match language filter print('Not processing transaction {}'.format(xact)) continue # Topic normalization topic_norm.normalize_msg(xact, rw_hash, debug) # Get counts xact['counts'] = mt.get_counts(xact['msg_topic']) num += 1 print # Write out counts to a file and perform topic clustering fn_counts = dir_temp + fn_table + '.{}.counts.txt'.format(num_topics) if (os.path.exists(fn_counts)): os.remove(fn_counts) mt.write_counts_file(data_table, fn_counts) # Run topic clustering binary fn_feat = dir_temp + fn_table + '.{}.feat.txt'.format(num_topics) fn_model = dir_temp + fn_table + '.{}.plsa'.format(num_topics) cmd = '{}/bin/plsa_estimation_combined_file -vector_list_in {} '.format(dir_topic, fn_counts) + \ '-stop_list_in {} '.format(stop_list) + \