def main():
    """Dump the strongest co-occurrence weights for every time slice.

    Command line: ``-in <model_file> -out <output_name> [-short]``.

    Loads the JSON model, optionally prunes every slice down to the short
    term list, and then walks the slice starts ordered by their value from
    ``boringmatrix.get_vectorsums`` (largest first).  For each start it
    keeps up to ten of the highest-valued entries returned by
    ``boringmatrix.cooccurrence_weights`` for the two NOTE_BEGINS sets and
    writes everything as JSON to ``<output_name>.out``.

    Exits with -1 on a bad argument count, a missing -in/-out, or when
    NOTE_BEGINS does not hold exactly two entries; exits with -2 when a
    flag is missing its value.
    """
    # Did they provide the correct args?
    argc = len(sys.argv)
    if argc < 5 or argc > 6:
        usage()
        sys.exit(-1)

    # Initialised up front so a missing -in/-out reaches the None checks
    # below instead of raising UnboundLocalError.
    model_file = None
    output_name = None
    use_short_terms = False

    # could use the stdargs parser, but that is meh.
    try:
        for idx in range(1, argc):
            arg = sys.argv[idx]
            if arg == "-in":
                model_file = sys.argv[idx + 1]
            elif arg == "-out":
                output_name = sys.argv[idx + 1]
            elif arg == "-short":
                use_short_terms = True
    except IndexError:
        usage()
        sys.exit(-2)

    if len(NOTE_BEGINS) != 2:
        sys.stderr.write("use this to compare two sets.\n")
        sys.exit(-1)

    # not building the model.
    if model_file is None or output_name is None:
        sys.exit(-1)

    with open(model_file, 'r') as moin:
        results = loads(moin.read(), object_hook=boringmatrix.as_boring)

    # ----------------------------------------------------------------------
    # Compute the term weights.
    boringmatrix.fix_boringmatrix_dicts(results)

    # (An earlier, now removed, sanity check noted that the term weights of
    # each slice total 1.0.)

    print("number of slices: %d" % len(results[NOTE_BEGINS[0]]))

    # length of these is used to normalize
    term_list = boringmatrix.build_termlist(results)
    sterm_list = boringmatrix.build_termlist2(results)

    print("Full Dictionary: %d" % len(term_list))
    print("Short Dictionary: %d" % len(sterm_list))

    # ----------------------------------------------------------------------
    # Prune out low term counts; re-compute.
    if use_short_terms:
        for note in results:
            for start in results[note]:
                model = results[note][start]
                model.drop_not_in(sterm_list)
                model.compute()

    first_set = results[NOTE_BEGINS[0]]
    second_set = results[NOTE_BEGINS[1]]

    vector_sums = boringmatrix.get_vectorsums(results, NOTE_BEGINS)
    # itemgetter(1) sorts on the dictionary value.
    sorted_sums = sorted(vector_sums.items(),
                         key=itemgetter(1),
                         reverse=True)

    neato_out = []
    for start, vector_sum in sorted_sums:
        weights = boringmatrix.cooccurrence_weights(first_set[start],
                                                    second_set[start])
        sorted_weights = sorted(weights.items(),
                                key=itemgetter(1),
                                reverse=True)

        # Keep at most the ten heaviest entries; the slice clamps itself
        # when fewer are available.
        neato_out.append((str(boringmatrix.datetime_from_long(start)),
                          vector_sum,
                          len(sorted_weights),
                          sorted_weights[:10]))

    with open("%s.out" % output_name, 'w') as fout:
        fout.write(dumps(neato_out, indent=4))
def main():
    """Write the top terms of the whole model, summed over every slice.

    Command line: ``-in <model_file> -out <output_name>``.

    Loads the JSON model, adds up the ``term_matrix`` values of every slice
    of every note into a single dictionary, and writes
    ``vectorspace.top_terms_tuples(<summed matrix>, 10000)`` as JSON to
    ``<output_name>_top_terms_tuples.json``.

    Exits with -1 on a bad argument count, a missing -in/-out, or when
    NOTE_BEGINS does not hold exactly two entries; exits with -2 when a
    flag is missing its value.
    """
    # Did they provide the correct args?
    argc = len(sys.argv)
    if argc != 5:
        usage()
        sys.exit(-1)

    # Initialised up front so a missing -in/-out reaches the None checks
    # below instead of raising UnboundLocalError.
    model_file = None
    output_name = None

    # could use the stdargs parser, but that is meh.
    try:
        for idx in range(1, argc):
            arg = sys.argv[idx]
            if arg == "-in":
                model_file = sys.argv[idx + 1]
            elif arg == "-out":
                output_name = sys.argv[idx + 1]
    except IndexError:
        usage()
        sys.exit(-2)

    if len(NOTE_BEGINS) != 2:
        sys.stderr.write("use this to compare two sets.\n")
        sys.exit(-1)

    # not building the model.
    if model_file is None or output_name is None:
        sys.exit(-1)

    with open(model_file, 'r') as moin:
        results = loads(moin.read(), object_hook=boringmatrix.as_boring)

    # ----------------------------------------------------------------------
    # Compute the term weights.
    boringmatrix.fix_boringmatrix_dicts(results)

    # (An earlier, now removed, sanity check noted that the term weights of
    # each slice total 1.0.)

    print("number of slices: %d" % len(results[NOTE_BEGINS[0]]))

    # length of this is used to normalize
    term_list = boringmatrix.build_termlist(results)

    print("Full Dictionary: %d" % len(term_list))

    # Collapse every slice of every note into one term -> total mapping.
    new_matrix = {}
    for note in results:
        for start in results[note]:
            for term, value in results[note][start].term_matrix.items():
                new_matrix[term] = new_matrix.get(term, 0) + value

    with open("%s_top_terms_tuples.json" % output_name, 'w') as fout:
        fout.write(
            dumps(vectorspace.top_terms_tuples(new_matrix, 10000), indent=4))
def main():
    """Load the model and hand both sets to ``output_count_graphs``.

    Command line:
    ``-in <model_file> -out <output_name> -value <int> [-short] [-file]``.

    After loading (and, with -short, pruning every slice to the short term
    list) the two NOTE_BEGINS sets are passed to ``output_count_graphs``
    together with the output stem ``<output_name>_counters_gt_<value>``,
    the integer ``value`` and the -file flag.

    Exits with -1 on a bad argument count, a missing -in/-out, or when
    NOTE_BEGINS does not hold exactly two entries; exits with -2 when a
    flag is missing its value or -value is not an integer.
    """
    # Did they provide the correct args?
    argc = len(sys.argv)
    if argc < 7 or argc > 9:
        usage()
        sys.exit(-1)

    # Initialised up front so a missing -in/-out reaches the None checks
    # below instead of raising UnboundLocalError.
    model_file = None
    output_name = None
    use_short_terms = False
    use_file_out = False
    value = 0

    # could use the stdargs parser, but that is meh.
    try:
        for idx in range(1, argc):
            arg = sys.argv[idx]
            if arg == "-in":
                model_file = sys.argv[idx + 1]
            elif arg == "-out":
                output_name = sys.argv[idx + 1]
            elif arg == "-short":
                use_short_terms = True
            elif arg == "-file":
                use_file_out = True
            elif arg == "-value":
                value = int(sys.argv[idx + 1])
    except (IndexError, ValueError):
        # A missing operand or a non-numeric -value is a usage error, not
        # a reason for a traceback.
        usage()
        sys.exit(-2)

    if len(NOTE_BEGINS) != 2:
        sys.stderr.write("use this to compare two sets.\n")
        sys.exit(-1)

    # not building the model.
    if model_file is None or output_name is None:
        sys.exit(-1)

    with open(model_file, 'r') as moin:
        results = loads(moin.read(), object_hook=boringmatrix.as_boring)

    # ----------------------------------------------------------------------
    # Compute the term weights.
    boringmatrix.fix_boringmatrix_dicts(results)

    # (An earlier, now removed, sanity check noted that the term weights of
    # each slice total 1.0.)

    print("number of slices: %d" % len(results[NOTE_BEGINS[0]]))

    # length of these is used to normalize
    term_list = boringmatrix.build_termlist(results)
    sterm_list = boringmatrix.build_termlist2(results)

    print("Full Dictionary: %d" % len(term_list))
    print("Short Dictionary: %d" % len(sterm_list))

    # ----------------------------------------------------------------------
    # Prune out low term counts; re-compute.
    if use_short_terms:
        for note in results:
            for start in results[note]:
                model = results[note][start]
                model.drop_not_in(sterm_list)
                model.compute()

    output_count_graphs(results[NOTE_BEGINS[0]],
                        results[NOTE_BEGINS[1]],
                        "%s_counters_gt_%d" % (output_name, value),
                        value,
                        use_file_out)
def main():
    """Load the model and hand both sets to ``output_distinct_graphs``.

    Command line: ``-in <model_file> -out <output_name> [-short | -file]``
    (the argument-count check leaves room for one optional flag).

    After loading (and, with -short, pruning every slice to the short term
    list) the two NOTE_BEGINS sets are passed to ``output_distinct_graphs``
    with the output stem ``<output_name>_distinct`` and the -file flag.

    Exits with -1 on a bad argument count, a missing -in/-out, or when
    NOTE_BEGINS does not hold exactly two entries; exits with -2 when a
    flag is missing its value.
    """
    # Did they provide the correct args?
    argc = len(sys.argv)
    if argc < 5 or argc > 6:
        usage()
        sys.exit(-1)

    # Initialised up front so a missing -in/-out reaches the None checks
    # below instead of raising UnboundLocalError.
    model_file = None
    output_name = None
    use_short_terms = False
    use_file_out = False

    # could use the stdargs parser, but that is meh.
    try:
        for idx in range(1, argc):
            arg = sys.argv[idx]
            if arg == "-in":
                model_file = sys.argv[idx + 1]
            elif arg == "-out":
                output_name = sys.argv[idx + 1]
            elif arg == "-short":
                use_short_terms = True
            elif arg == "-file":
                use_file_out = True
    except IndexError:
        usage()
        sys.exit(-2)

    if len(NOTE_BEGINS) != 2:
        sys.stderr.write("use this to compare two sets.\n")
        sys.exit(-1)

    # not building the model.
    if model_file is None or output_name is None:
        sys.exit(-1)

    with open(model_file, 'r') as moin:
        results = loads(moin.read(), object_hook=boringmatrix.as_boring)

    # ----------------------------------------------------------------------
    # Compute the term weights.
    boringmatrix.fix_boringmatrix_dicts(results)

    # (An earlier, now removed, sanity check noted that the term weights of
    # each slice total 1.0.)

    print("number of slices: %d" % len(results[NOTE_BEGINS[0]]))

    # length of these is used to normalize
    term_list = boringmatrix.build_termlist(results)
    sterm_list = boringmatrix.build_termlist2(results)

    print("Full Dictionary: %d" % len(term_list))
    print("Short Dictionary: %d" % len(sterm_list))

    # ----------------------------------------------------------------------
    # Prune out low term counts; re-compute.
    if use_short_terms:
        for note in results:
            for start in results[note]:
                model = results[note][start]
                model.drop_not_in(sterm_list)
                model.compute()

    output_distinct_graphs(results[NOTE_BEGINS[0]],
                           results[NOTE_BEGINS[1]],
                           "%s_distinct" % (output_name),
                           use_file_out)
def main():
    """Write term matrices from the model as CSV files.

    Command line:
    ``-in <model_file> -out <output_name> [-short] [-ftm | -mtm]``.

    Output depends on the flags:

    * ``-mtm``: for every slice start the ``term_matrix`` counts of both
      NOTE_BEGINS sets are added into a fresh ``BoringMatrix`` and the
      merged models are written to ``<output_name>_merged.csv`` (using the
      short term list when -short is also given, the full list otherwise).
    * ``-ftm``: one ``<output_name>_<note>_full.csv`` per NOTE_BEGINS
      entry, using the full term list.
    * ``-short`` alone: one ``<output_name>_<note>.csv`` per note in the
      model, using the short term list.
    * no flag: nothing is written.

    Raises Exception when -short and -ftm are combined.  Exits with -1 on
    a bad argument count, a missing -in/-out, or when NOTE_BEGINS does not
    hold exactly two entries; exits with -2 when a flag is missing its
    value.
    """
    # Did they provide the correct args?  Up to two optional flags are
    # meaningful together (-short with -mtm); the previous limit of six
    # arguments made that combination, and the -short/-ftm conflict check
    # below, unreachable.
    argc = len(sys.argv)
    if argc < 5 or argc > 7:
        usage()
        sys.exit(-1)

    # Initialised up front so a missing -in/-out reaches the None checks
    # below instead of raising UnboundLocalError.
    model_file = None
    output_name = None
    use_short_terms = False
    full_term_matrix_out = False
    merged_term_matrix_out = False

    # could use the stdargs parser, but that is meh.
    try:
        for idx in range(1, argc):
            arg = sys.argv[idx]
            if arg == "-in":
                model_file = sys.argv[idx + 1]
            elif arg == "-out":
                output_name = sys.argv[idx + 1]
            elif arg == "-short":
                use_short_terms = True
            elif arg == "-ftm":
                full_term_matrix_out = True
            elif arg == "-mtm":
                merged_term_matrix_out = True
    except IndexError:
        usage()
        sys.exit(-2)

    if len(NOTE_BEGINS) != 2:
        sys.stderr.write("use this to compare two sets.\n")
        sys.exit(-1)

    # not building the model.
    if model_file is None or output_name is None:
        sys.exit(-1)

    with open(model_file, 'r') as moin:
        results = loads(moin.read(), object_hook=boringmatrix.as_boring)

    # ----------------------------------------------------------------------
    # Compute the term weights.
    boringmatrix.fix_boringmatrix_dicts(results)

    print("number of slices: %d" % len(results[NOTE_BEGINS[0]]))

    # length of these is used to normalize
    term_list = boringmatrix.build_termlist(results)
    sterm_list = boringmatrix.build_termlist2(results)

    print("Full Dictionary: %d" % len(term_list))
    print("Short Dictionary: %d" % len(sterm_list))

    # ----------------------------------------------------------------------
    # Prune out low term counts; re-compute.
    if use_short_terms:
        for note in results:
            for start in results[note]:
                model = results[note][start]
                model.drop_not_in(sterm_list)
                model.compute()

    if use_short_terms and full_term_matrix_out:
        raise Exception("Cannot use short and full at the same time buddy")

    # ----------------------------------------------------------------------
    # Output a CSV with a model built from merging boston and i495 for each
    # t.  Using the short list, or whatever is set.
    if merged_term_matrix_out:
        merged = {}
        for start in results[NOTE_BEGINS[0]]:
            combined = boringmatrix.BoringMatrix(None)

            for note in NOTE_BEGINS:
                source_matrix = results[note][start].term_matrix
                for term in source_matrix:
                    val = source_matrix[term]
                    try:
                        combined.term_matrix[term] += val
                    except KeyError:
                        combined.term_matrix[term] = val
                    combined.total_count += val

            if use_short_terms:
                combined.drop_not_in(sterm_list)
            combined.compute()
            merged[start] = combined

        active_terms = sterm_list if use_short_terms else term_list
        boringmatrix.output_full_matrix(active_terms,
                                        merged,
                                        "%s_merged.csv" % output_name)
    elif full_term_matrix_out:
        for note in NOTE_BEGINS:
            output = "%s_%s_full.csv" % (output_name, note)
            boringmatrix.output_full_matrix(term_list, results[note], output)
    elif use_short_terms:
        for note in results:
            output = "%s_%s.csv" % (output_name, note)
            boringmatrix.output_full_matrix(sterm_list, results[note], output)
def main():
    """Count the distinct sorted-index orderings found in each note.

    Command line: ``-in <model_file> -out <output_name> [-short]``.
    ``-out`` is required by the argument handling but no file is written.

    For every slice of a note the full value list is built with
    ``build_fulllist(term_list)`` and turned into an ordering by
    ``sorted_indices``; the number of different orderings seen is printed
    once per note so it can be compared with the number of slices.

    Exits with -1 on a bad argument count, a missing -in/-out, or when
    NOTE_BEGINS does not hold exactly two entries; exits with -2 when a
    flag is missing its value.
    """
    # Did they provide the correct args?
    argc = len(sys.argv)
    if argc < 5 or argc > 6:
        usage()
        sys.exit(-1)

    # Initialised up front so a missing -in/-out reaches the None checks
    # below instead of raising UnboundLocalError.
    model_file = None
    output_name = None
    use_short_terms = False

    # could use the stdargs parser, but that is meh.
    try:
        for idx in range(1, argc):
            arg = sys.argv[idx]
            if arg == "-in":
                model_file = sys.argv[idx + 1]
            elif arg == "-out":
                output_name = sys.argv[idx + 1]
            elif arg == "-short":
                use_short_terms = True
    except IndexError:
        usage()
        sys.exit(-2)

    if len(NOTE_BEGINS) != 2:
        sys.stderr.write("use this to compare two sets.\n")
        sys.exit(-1)

    # not building the model.
    if model_file is None or output_name is None:
        sys.exit(-1)

    with open(model_file, 'r') as moin:
        results = loads(moin.read(), object_hook=boringmatrix.as_boring)

    # ----------------------------------------------------------------------
    # Compute the term weights.
    boringmatrix.fix_boringmatrix_dicts(results)

    # (An earlier, now removed, sanity check noted that the term weights of
    # each slice total 1.0.)

    print("number of slices: %d" % len(results[NOTE_BEGINS[0]]))

    # length of these is used to normalize
    term_list = boringmatrix.build_termlist(results)
    sterm_list = boringmatrix.build_termlist2(results)

    print("Full Dictionary: %d" % len(term_list))
    print("Short Dictionary: %d" % len(sterm_list))

    # ----------------------------------------------------------------------
    # Prune out low term counts; re-compute.
    if use_short_terms:
        for note in results:
            for start in results[note]:
                model = results[note][start]
                model.drop_not_in(sterm_list)
                model.compute()

    # ----------------------------------------------------------------------
    # Compute the permutation entropy for the window.
    #
    # Use set resemblance to get entropy probability value.
    #
    # NOTE(review): the full term list is used here even when -short was
    # given -- confirm that is intended.
    for note in results:
        # Only the number of distinct orderings is reported, so a set of
        # their string forms is enough.
        seen_orderings = set()

        for start in results[note]:
            full_list = results[note][start].build_fulllist(term_list)
            seen_orderings.add(str(sorted_indices(full_list)))

        # Compare to the number of slices.
        print("number of sorted indices: %d" % len(seen_orderings))
def main():
    """Write every slice of every note as a newline-separated count file.

    Command line: ``-in <model_file> -out <output_name> [-short]``.

    A directory named ``<output_name>_pca1`` is (re)created and one file
    ``<note>-<start>`` is written per slice.  Each file holds one integer
    per line: the slice's ``term_matrix`` value for each term of the
    active term list (short list with -short, full list otherwise), or 0
    when the slice does not contain the term.  These values aren't
    normalized.

    Exits with -1 on a bad argument count, a missing -in/-out, or when
    NOTE_BEGINS does not hold exactly two entries; exits with -2 when a
    flag is missing its value or a count cannot be formatted as an
    integer.
    """
    # Local import: only this entry point needs it.
    import shutil

    # Did they provide the correct args?
    argc = len(sys.argv)
    if argc < 5 or argc > 6:
        usage()
        sys.exit(-1)

    # Initialised up front so a missing -in/-out reaches the None checks
    # below instead of raising UnboundLocalError.
    model_file = None
    output_name = None
    use_short_terms = False

    # could use the stdargs parser, but that is meh.
    try:
        for idx in range(1, argc):
            arg = sys.argv[idx]
            if arg == "-in":
                model_file = sys.argv[idx + 1]
            elif arg == "-out":
                output_name = sys.argv[idx + 1]
            elif arg == "-short":
                use_short_terms = True
    except IndexError:
        usage()
        sys.exit(-2)

    if len(NOTE_BEGINS) != 2:
        sys.stderr.write("use this to compare two sets.\n")
        sys.exit(-1)

    # not building the model.
    if model_file is None or output_name is None:
        sys.exit(-1)

    with open(model_file, 'r') as moin:
        results = loads(moin.read(), object_hook=boringmatrix.as_boring)

    # ----------------------------------------------------------------------
    # Compute the term weights.
    boringmatrix.fix_boringmatrix_dicts(results)

    # (An earlier, now removed, sanity check noted that the term weights of
    # each slice total 1.0.)

    print("number of slices: %d" % len(results[NOTE_BEGINS[0]]))

    # length of these is used to normalize
    term_list = boringmatrix.build_termlist(results)
    sterm_list = boringmatrix.build_termlist2(results)

    print("Full Dictionary: %d" % len(term_list))
    print("Short Dictionary: %d" % len(sterm_list))

    # ----------------------------------------------------------------------
    # Prune out low term counts; re-compute.
    if use_short_terms:
        for note in results:
            for start in results[note]:
                model = results[note][start]
                model.drop_not_in(sterm_list)
                model.compute()

    # ----------------------------------------------------------------------
    # Output each slice for each area as a new-line broken up term count
    # file.  These values aren't normalized, so they're not terribly useful
    # yet.
    outdir = "%s_%s" % (output_name, "pca1")

    # os.rmdir only removes empty directories, so a second run against the
    # files left by the first one would crash; remove the whole tree.
    if os.path.exists(outdir):
        shutil.rmtree(outdir)
    os.mkdir(outdir)

    the_terms = sterm_list if use_short_terms else term_list

    for note in results:
        for start in results[note]:
            filename = "%s-%d" % (note, start)
            term_matrix = results[note][start].term_matrix

            # Terms missing from this slice count as zero.
            values = [term_matrix.get(term, 0) for term in the_terms]

            try:
                data_str = "\n".join("%d" % count for count in values)
            except TypeError as e:
                # Show what the offending data looks like before bailing.
                print("%s %s %s %s" % (type(values), type(values[0]),
                                       values[0], values[1]))
                print(e)
                sys.exit(-2)

            with open(os.path.join(outdir, filename), 'w') as fout:
                fout.write(data_str)
def main():
    """Compute per-slice similarities between the two sets and plot them.

    Command line: ``-in <model_file> -out <output_name> [-short | -file]``
    (the argument-count check leaves room for one optional flag).

    For every slice start the two NOTE_BEGINS models are compared with
    ``boringmatrix.boring_count_similarity`` and
    ``boringmatrix.boring_weight_similarity``.  Those two series, plus the
    result of ``boringmatrix.get_vectorsums``, are passed to
    ``output_similarity_gnuplot`` with the stems ``<output_name>_sims``,
    ``<output_name>_sims_count`` and ``<output_name>_sims_weight``.
    Finally, for every slice whose count similarity exceeds 0.8, the start,
    ``terms_in_common`` and ``set_resemblance`` are printed.

    Exits with -1 on a bad argument count, a missing -in/-out, or when
    NOTE_BEGINS does not hold exactly two entries; exits with -2 when a
    flag is missing its value.
    """
    # Did they provide the correct args?
    argc = len(sys.argv)
    if argc < 5 or argc > 6:
        usage()
        sys.exit(-1)

    # Initialised up front so a missing -in/-out reaches the None checks
    # below instead of raising UnboundLocalError.
    model_file = None
    output_name = None
    use_short_terms = False
    use_file_out = False

    # could use the stdargs parser, but that is meh.
    try:
        for idx in range(1, argc):
            arg = sys.argv[idx]
            if arg == "-in":
                model_file = sys.argv[idx + 1]
            elif arg == "-out":
                output_name = sys.argv[idx + 1]
            elif arg == "-short":
                use_short_terms = True
            elif arg == "-file":
                use_file_out = True
    except IndexError:
        usage()
        sys.exit(-2)

    if len(NOTE_BEGINS) != 2:
        sys.stderr.write("use this to compare two sets.\n")
        sys.exit(-1)

    # not building the model.
    if model_file is None or output_name is None:
        sys.exit(-1)

    with open(model_file, 'r') as moin:
        results = loads(moin.read(), object_hook=boringmatrix.as_boring)

    # ----------------------------------------------------------------------
    # Compute the term weights.
    boringmatrix.fix_boringmatrix_dicts(results)

    # (An earlier, now removed, sanity check noted that the term weights of
    # each slice total 1.0.)

    print("number of slices: %d" % len(results[NOTE_BEGINS[0]]))

    # length of these is used to normalize
    term_list = boringmatrix.build_termlist(results)
    sterm_list = boringmatrix.build_termlist2(results)

    print("Full Dictionary: %d" % len(term_list))
    print("Short Dictionary: %d" % len(sterm_list))

    # ----------------------------------------------------------------------
    # Prune out low term counts; re-compute.
    if use_short_terms:
        for note in results:
            for start in results[note]:
                model = results[note][start]
                model.drop_not_in(sterm_list)
                model.compute()

    first_set = results[NOTE_BEGINS[0]]
    second_set = results[NOTE_BEGINS[1]]

    # ----------------------------------------------------------------------
    # Compute the cosine similarities.
    # YOU NEED TO CALL .compute() before this or you'll get garbage.
    vector_sums = boringmatrix.get_vectorsums(results, NOTE_BEGINS)

    count_cosine = {}
    weight_cosine = {}

    # ----------------------------------------------------------------------
    # Compute the similarity and counts for the given models as well as the
    # entropy.
    for start in first_set:
        count_cosine[int(start)] = \
            boringmatrix.boring_count_similarity(first_set[start],
                                                 second_set[start])

        weight_cosine[int(start)] = \
            boringmatrix.boring_weight_similarity(first_set[start],
                                                  second_set[start])

    # Consider using a few panes.
    output_similarity_gnuplot(vector_sums,
                              "%s_sims" % output_name,
                              use_file_out)
    output_similarity_gnuplot(count_cosine,
                              "%s_sims_count" % output_name,
                              use_file_out)
    output_similarity_gnuplot(weight_cosine,
                              "%s_sims_weight" % output_name,
                              use_file_out)

    # Show the details of the slices that look most alike.
    for start in count_cosine:
        if count_cosine[start] > 0.8:
            print(start)
            print(terms_in_common(first_set[start], second_set[start]))
            print(set_resemblance(first_set[start], second_set[start]))
            print("x" * 20)
def main():
    """Compute per-slice similarities between the two sets and plot them.

    Command line: ``-in <model_file> -out <output_name> [-short | -file]``
    (the argument-count check leaves room for one optional flag).

    For every slice start the two NOTE_BEGINS models are compared with
    ``boringmatrix.boring_count_similarity`` and
    ``boringmatrix.boring_weight_similarity``.  Those two series, plus the
    result of ``boringmatrix.get_vectorsums``, are passed to
    ``output_similarity_gnuplot`` with the stems ``<output_name>_sims``,
    ``<output_name>_sims_count`` and ``<output_name>_sims_weight``.
    Finally, for every slice whose count similarity exceeds 0.8, the start,
    ``terms_in_common`` and ``set_resemblance`` are printed.

    Exits with -1 on a bad argument count, a missing -in/-out, or when
    NOTE_BEGINS does not hold exactly two entries; exits with -2 when a
    flag is missing its value.
    """
    # Did they provide the correct args?
    argc = len(sys.argv)
    if argc < 5 or argc > 6:
        usage()
        sys.exit(-1)

    # Initialised up front so a missing -in/-out reaches the None checks
    # below instead of raising UnboundLocalError.
    model_file = None
    output_name = None
    use_short_terms = False
    use_file_out = False

    # could use the stdargs parser, but that is meh.
    try:
        for idx in range(1, argc):
            arg = sys.argv[idx]
            if arg == "-in":
                model_file = sys.argv[idx + 1]
            elif arg == "-out":
                output_name = sys.argv[idx + 1]
            elif arg == "-short":
                use_short_terms = True
            elif arg == "-file":
                use_file_out = True
    except IndexError:
        usage()
        sys.exit(-2)

    if len(NOTE_BEGINS) != 2:
        sys.stderr.write("use this to compare two sets.\n")
        sys.exit(-1)

    # not building the model.
    if model_file is None or output_name is None:
        sys.exit(-1)

    with open(model_file, 'r') as moin:
        results = loads(moin.read(), object_hook=boringmatrix.as_boring)

    # ----------------------------------------------------------------------
    # Compute the term weights.
    boringmatrix.fix_boringmatrix_dicts(results)

    # (An earlier, now removed, sanity check noted that the term weights of
    # each slice total 1.0.)

    print("number of slices: %d" % len(results[NOTE_BEGINS[0]]))

    # length of these is used to normalize
    term_list = boringmatrix.build_termlist(results)
    sterm_list = boringmatrix.build_termlist2(results)

    print("Full Dictionary: %d" % len(term_list))
    print("Short Dictionary: %d" % len(sterm_list))

    # ----------------------------------------------------------------------
    # Prune out low term counts; re-compute.
    if use_short_terms:
        for note in results:
            for start in results[note]:
                model = results[note][start]
                model.drop_not_in(sterm_list)
                model.compute()

    first_set = results[NOTE_BEGINS[0]]
    second_set = results[NOTE_BEGINS[1]]

    # ----------------------------------------------------------------------
    # Compute the cosine similarities.
    # YOU NEED TO CALL .compute() before this or you'll get garbage.
    vector_sums = boringmatrix.get_vectorsums(results, NOTE_BEGINS)

    count_cosine = {}
    weight_cosine = {}

    # ----------------------------------------------------------------------
    # Compute the similarity and counts for the given models as well as the
    # entropy.
    for start in first_set:
        count_cosine[int(start)] = \
            boringmatrix.boring_count_similarity(first_set[start],
                                                 second_set[start])

        weight_cosine[int(start)] = \
            boringmatrix.boring_weight_similarity(first_set[start],
                                                  second_set[start])

    # Consider using a few panes.
    output_similarity_gnuplot(vector_sums,
                              "%s_sims" % output_name,
                              use_file_out)
    output_similarity_gnuplot(count_cosine,
                              "%s_sims_count" % output_name,
                              use_file_out)
    output_similarity_gnuplot(weight_cosine,
                              "%s_sims_weight" % output_name,
                              use_file_out)

    # Show the details of the slices that look most alike.
    for start in count_cosine:
        if count_cosine[start] > 0.8:
            print(start)
            print(terms_in_common(first_set[start], second_set[start]))
            print(set_resemblance(first_set[start], second_set[start]))
            print("x" * 20)
def main():
    """Write term matrices from the model as CSV files.

    Command line:
    ``-in <model_file> -out <output_name> [-short] [-ftm | -mtm]``.

    Output depends on the flags:

    * ``-mtm``: for every slice start the ``term_matrix`` counts of both
      NOTE_BEGINS sets are added into a fresh ``BoringMatrix`` and the
      merged models are written to ``<output_name>_merged.csv`` (using the
      short term list when -short is also given, the full list otherwise).
    * ``-ftm``: one ``<output_name>_<note>_full.csv`` per NOTE_BEGINS
      entry, using the full term list.
    * ``-short`` alone: one ``<output_name>_<note>.csv`` per note in the
      model, using the short term list.
    * no flag: nothing is written.

    Raises Exception when -short and -ftm are combined.  Exits with -1 on
    a bad argument count, a missing -in/-out, or when NOTE_BEGINS does not
    hold exactly two entries; exits with -2 when a flag is missing its
    value.
    """
    # Did they provide the correct args?  Up to two optional flags are
    # meaningful together (-short with -mtm); the previous limit of six
    # arguments made that combination, and the -short/-ftm conflict check
    # below, unreachable.
    argc = len(sys.argv)
    if argc < 5 or argc > 7:
        usage()
        sys.exit(-1)

    # Initialised up front so a missing -in/-out reaches the None checks
    # below instead of raising UnboundLocalError.
    model_file = None
    output_name = None
    use_short_terms = False
    full_term_matrix_out = False
    merged_term_matrix_out = False

    # could use the stdargs parser, but that is meh.
    try:
        for idx in range(1, argc):
            arg = sys.argv[idx]
            if arg == "-in":
                model_file = sys.argv[idx + 1]
            elif arg == "-out":
                output_name = sys.argv[idx + 1]
            elif arg == "-short":
                use_short_terms = True
            elif arg == "-ftm":
                full_term_matrix_out = True
            elif arg == "-mtm":
                merged_term_matrix_out = True
    except IndexError:
        usage()
        sys.exit(-2)

    if len(NOTE_BEGINS) != 2:
        sys.stderr.write("use this to compare two sets.\n")
        sys.exit(-1)

    # not building the model.
    if model_file is None or output_name is None:
        sys.exit(-1)

    with open(model_file, 'r') as moin:
        results = loads(moin.read(), object_hook=boringmatrix.as_boring)

    # ----------------------------------------------------------------------
    # Compute the term weights.
    boringmatrix.fix_boringmatrix_dicts(results)

    print("number of slices: %d" % len(results[NOTE_BEGINS[0]]))

    # length of these is used to normalize
    term_list = boringmatrix.build_termlist(results)
    sterm_list = boringmatrix.build_termlist2(results)

    print("Full Dictionary: %d" % len(term_list))
    print("Short Dictionary: %d" % len(sterm_list))

    # ----------------------------------------------------------------------
    # Prune out low term counts; re-compute.
    if use_short_terms:
        for note in results:
            for start in results[note]:
                model = results[note][start]
                model.drop_not_in(sterm_list)
                model.compute()

    if use_short_terms and full_term_matrix_out:
        raise Exception("Cannot use short and full at the same time buddy")

    # ----------------------------------------------------------------------
    # Output a CSV with a model built from merging boston and i495 for each
    # t.  Using the short list, or whatever is set.
    if merged_term_matrix_out:
        merged = {}
        for start in results[NOTE_BEGINS[0]]:
            combined = boringmatrix.BoringMatrix(None)

            for note in NOTE_BEGINS:
                source_matrix = results[note][start].term_matrix
                for term in source_matrix:
                    val = source_matrix[term]
                    try:
                        combined.term_matrix[term] += val
                    except KeyError:
                        combined.term_matrix[term] = val
                    combined.total_count += val

            if use_short_terms:
                combined.drop_not_in(sterm_list)
            combined.compute()
            merged[start] = combined

        active_terms = sterm_list if use_short_terms else term_list
        boringmatrix.output_full_matrix(active_terms,
                                        merged,
                                        "%s_merged.csv" % output_name)
    elif full_term_matrix_out:
        for note in NOTE_BEGINS:
            output = "%s_%s_full.csv" % (output_name, note)
            boringmatrix.output_full_matrix(term_list, results[note], output)
    elif use_short_terms:
        for note in results:
            output = "%s_%s.csv" % (output_name, note)
            boringmatrix.output_full_matrix(sterm_list, results[note], output)
def main():
    """Print a histogram of pairwise set resemblances between all slices.

    Command line: ``-in <model_file> -out <output_name> [-short]``.
    ``-out`` is required by the argument handling but no file is written.

    Every slice of every note is wrapped in a ``termset.TermSet``; the
    value of ``termset.set_resemblance`` is computed for each unordered
    pair and tallied into ten buckets keyed by their upper bound (0.1,
    0.2, ... 1.0, where the last bucket takes everything above 0.9).  The
    histogram is printed as JSON.

    Exits with -1 on a bad argument count, a missing -in/-out, or when
    NOTE_BEGINS does not hold exactly two entries; exits with -2 when a
    flag is missing its value.
    """
    # Did they provide the correct args?
    argc = len(sys.argv)
    if argc < 5 or argc > 6:
        usage()
        sys.exit(-1)

    # Initialised up front so a missing -in/-out reaches the None checks
    # below instead of raising UnboundLocalError.
    model_file = None
    output_name = None
    use_short_terms = False

    # could use the stdargs parser, but that is meh.
    try:
        for idx in range(1, argc):
            arg = sys.argv[idx]
            if arg == "-in":
                model_file = sys.argv[idx + 1]
            elif arg == "-out":
                output_name = sys.argv[idx + 1]
            elif arg == "-short":
                use_short_terms = True
    except IndexError:
        usage()
        sys.exit(-2)

    if len(NOTE_BEGINS) != 2:
        sys.stderr.write("use this to compare two sets.\n")
        sys.exit(-1)

    # not building the model.
    if model_file is None or output_name is None:
        sys.exit(-1)

    with open(model_file, 'r') as moin:
        results = loads(moin.read(), object_hook=boringmatrix.as_boring)

    # ----------------------------------------------------------------------
    # Compute the term weights.
    boringmatrix.fix_boringmatrix_dicts(results)

    # (An earlier, now removed, sanity check noted that the term weights of
    # each slice total 1.0.)

    print("number of slices: %d" % len(results[NOTE_BEGINS[0]]))

    # length of these is used to normalize
    term_list = boringmatrix.build_termlist(results)
    sterm_list = boringmatrix.build_termlist2(results)

    print("Full Dictionary: %d" % len(term_list))
    print("Short Dictionary: %d" % len(sterm_list))

    # ----------------------------------------------------------------------
    # Prune out low term counts; re-compute.
    if use_short_terms:
        for note in results:
            for start in results[note]:
                model = results[note][start]
                model.drop_not_in(sterm_list)
                model.compute()

    # ----------------------------------------------------------------------
    # Convert to sets and compute the set resemblances, see if any are
    # high, compared to each other at each t.
    #
    # NOTE(review): termSets is only consumed by a debugging print that is
    # currently disabled; kept so that print can be re-enabled.
    termSets = {}
    for start in results[NOTE_BEGINS[0]]:
        set_a = termset.TermSet(results[NOTE_BEGINS[0]][start],
                                "%s.%s" % (NOTE_BEGINS[0], str(start)))
        set_b = termset.TermSet(results[NOTE_BEGINS[1]][start],
                                "%s.%s" % (NOTE_BEGINS[1], str(start)))
        termSets[start] = termset.set_resemblance(set_a, set_b)

    # ------------------------------------------------------------------
    # Convert to sets and compute the set resemblances for the goal of
    # clustering all the sets so that I can build a "table" for each t
    # in T, the bin ID of Xt, Yt | counts --> so I have probabilities
    # to build the entropy computation for the window.
    termSetsFull = []
    for note in results:
        for start in results[note]:
            termSetsFull.append(termset.TermSet(results[note][start],
                                                "%s.%s" % (note, str(start))))

    # Tally how often each resemblance value occurs over all unordered
    # pairs; the values are counted directly instead of first being stored
    # in an n-by-n table.
    resem_values = {}
    length = len(termSetsFull)
    for i in range(length):
        for j in range(i + 1, length):
            resemblance = termset.set_resemblance(termSetsFull[i],
                                                  termSetsFull[j])
            resem_values[resemblance] = resem_values.get(resemblance, 0) + 1

    print("Resemblance Values Computed")

    # Buckets are keyed by their inclusive upper bound; anything above the
    # last explicit bound lands in the 1.0 bucket.
    upper_bounds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    resem_histogram = dict.fromkeys(upper_bounds + [1.0], 0)

    for value, count in resem_values.items():
        for bound in upper_bounds:
            if value <= bound:
                resem_histogram[bound] += count
                break
        else:
            resem_histogram[1.0] += count

    print(dumps(resem_histogram, indent=4))
def main():
    """Rank terms by tf-idf and raw count, then export the top-term matrix.

    Command line: ``-in <model_file> -out <output_name> [-short]``.

    Every slice is treated as a document named ``<note>-<start>``.  Three
    kinds of output are produced:

    * ``<output_name>_top_tfidf.json``: ``vectorspace.top_terms_overall``
      applied to the tf-idf values, limited to TOP_TERM_CNT.
    * ``<output_name>_top_tf.json``: ``vectorspace.top_terms_overall``
      applied to the raw term counts, limited to 10% of the number of
      distinct terms.
    * ``<output_name>_<note>_tops.csv`` for every note: the slices pruned
      to that raw-count top list and written with
      ``boringmatrix.output_full_matrix``.

    Exits with -1 on a bad argument count, a missing -in/-out, or when
    NOTE_BEGINS does not hold exactly two entries; exits with -2 when a
    flag is missing its value.
    """
    # Did they provide the correct args?
    argc = len(sys.argv)
    if argc < 5 or argc > 6:
        usage()
        sys.exit(-1)

    # Initialised up front so a missing -in/-out reaches the None checks
    # below instead of raising UnboundLocalError.
    model_file = None
    output_name = None
    use_short_terms = False

    # could use the stdargs parser, but that is meh.
    try:
        for idx in range(1, argc):
            arg = sys.argv[idx]
            if arg == "-in":
                model_file = sys.argv[idx + 1]
            elif arg == "-out":
                output_name = sys.argv[idx + 1]
            elif arg == "-short":
                use_short_terms = True
    except IndexError:
        usage()
        sys.exit(-2)

    if len(NOTE_BEGINS) != 2:
        sys.stderr.write("use this to compare two sets.\n")
        sys.exit(-1)

    # not building the model.
    if model_file is None or output_name is None:
        sys.exit(-1)

    with open(model_file, 'r') as moin:
        results = loads(moin.read(), object_hook=boringmatrix.as_boring)

    # ----------------------------------------------------------------------
    # Compute the term weights.
    boringmatrix.fix_boringmatrix_dicts(results)

    # (An earlier, now removed, sanity check noted that the term weights of
    # each slice total 1.0.)

    print("number of slices: %d" % len(results[NOTE_BEGINS[0]]))

    # length of these is used to normalize
    term_list = boringmatrix.build_termlist(results)
    sterm_list = boringmatrix.build_termlist2(results)

    print("Full Dictionary: %d" % len(term_list))
    print("Short Dictionary: %d" % len(sterm_list))

    # ----------------------------------------------------------------------
    # Prune out low term counts; re-compute.
    if use_short_terms:
        for note in results:
            for start in results[note]:
                model = results[note][start]
                model.drop_not_in(sterm_list)
                model.compute()

    # ----------------------------------------------------------------------
    # Just build a dictionary of the documents.
    results_as_dict = {}   # doc_id -> copy of the slice's term counts
    doc_length = {}        # doc_id -> total_count of the slice
    doc_freq = {}          # term -> number of documents containing it

    for note in results:
        for start in results[note]:
            doc_id = "%s-%d" % (note, start)
            slice_model = results[note][start]

            results_as_dict[doc_id] = slice_model.term_matrix.copy()
            doc_length[doc_id] = slice_model.total_count

            for term in results_as_dict[doc_id]:
                doc_freq[term] = doc_freq.get(term, 0) + 1

    invdoc_freq = vectorspace.calculate_invdf(len(results_as_dict), doc_freq)

    doc_tfidf = \
        vectorspace.calculate_tfidf(doc_length, results_as_dict, invdoc_freq)

    with open("%s_%s" % (output_name, "top_tfidf.json"), 'w') as fout:
        fout.write(dumps(vectorspace.top_terms_overall(doc_tfidf,
                                                       TOP_TERM_CNT),
                         indent=4))

    # The raw-count ranking keeps the top tenth of the distinct terms.
    top_terms_slist = \
        vectorspace.top_terms_overall(results_as_dict,
                                      int(len(doc_freq) * .10))

    with open("%s_%s" % (output_name, "top_tf.json"), 'w') as fout:
        fout.write(dumps(top_terms_slist, indent=4))

    for note in results:
        for start in results[note]:
            results[note][start].drop_not_in(top_terms_slist)
            results[note][start].compute()

        # One CSV per note, written once all of its slices are pruned.
        boringmatrix.output_full_matrix(top_terms_slist,
                                        results[note],
                                        "%s_%s_tops.csv" % (output_name, note))