def tag_json():
    """
    Tags all the files in the JSON directory.

    Every entry returned by db_util.get_all_records() is opened with
    db_util.open_NASARecord().  The keys that tag_analyzer.hash_string()
    produces for each phrase length from 1 to tag_analyzer.MAX_PHRASE_LENGTH
    are pooled into one set, intersected with canonical_set, and each match
    is translated to an index via canonical_tags and tag_index.  The indices
    are stored on record.tags and the record is saved.
    """
    all_records = db_util.get_all_records()
    bar = ProgressBar(widgets=[SimpleProgress()], maxval=len(all_records))
    progress = bar.start()

    for done, entry in enumerate(all_records, start=1):
        nasa_record = db_util.open_NASARecord(entry)

        # Bag-of-words approach: it does not matter which part of the record
        # a word or phrase came from, so everything goes into a single set.
        phrases = set()
        for phrase_len in range(1, tag_analyzer.MAX_PHRASE_LENGTH + 1):
            found = {}
            # hash_string fills `found` in place; only its keys are used here.
            tag_analyzer.hash_string(nasa_record, phrase_len, found)
            phrases.update(found)

        # Keep only the phrases that are known tags, then store their indices.
        matches = canonical_set.intersection(phrases)
        nasa_record.tags = [tag_index[canonical_tags[name]] for name in matches]
        nasa_record.save()

        # TODO: save back into the file
        # TODO: create a tag_list.txt file which has all the possible tags
        # in a list, so that you don't have to write the whole string into
        # the file
        progress.update(done)

    progress.finish()
def tag_json():
    """
    Tags all the files in the JSON directory.

    Every entry returned by db_util.get_all_records() is opened with
    db_util.open_NASARecord().  The keys that tag_analyzer.hash_string()
    produces for each phrase length from 1 to tag_analyzer.MAX_PHRASE_LENGTH
    are pooled into one set, intersected with canonical_set, and each match
    is translated to an index via canonical_tags and tag_index.  The indices
    are stored on record.tags and the record is saved.
    """
    all_records = db_util.get_all_records()
    bar = ProgressBar(widgets=[SimpleProgress()], maxval=len(all_records))
    progress = bar.start()

    for done, entry in enumerate(all_records, start=1):
        nasa_record = db_util.open_NASARecord(entry)

        # Bag-of-words approach: it does not matter which part of the record
        # a word or phrase came from, so everything goes into a single set.
        phrases = set()
        for phrase_len in range(1, tag_analyzer.MAX_PHRASE_LENGTH + 1):
            found = {}
            # hash_string fills `found` in place; only its keys are used here.
            tag_analyzer.hash_string(nasa_record, phrase_len, found)
            phrases.update(found)

        # Keep only the phrases that are known tags, then store their indices.
        matches = canonical_set.intersection(phrases)
        nasa_record.tags = [tag_index[canonical_tags[name]] for name in matches]
        nasa_record.save()

        # TODO: save back into the file
        # TODO: create a tag_list.txt file which has all the possible tags
        # in a list, so that you don't have to write the whole string into
        # the file
        progress.update(done)

    progress.finish()
def generate_html():
    """
    Write an HTML preview of every record and its tags.

    The output goes to "tags_preview.html" in the current working directory
    (overwriting any existing file) and links to the "preview.css"
    stylesheet.  For each record the page shows the record id (linked to
    record.med_image), the category and description as returned by
    get_highlighted_text(), and a comma-separated list of the record's tags,
    or "None" when the record has no tags.

    Side effects: rebinds the module-level `tag_list` to the result of
    tag_json.get_tag_list() and advances a console progress bar.
    """
    # Module-level so that other functions in this file can see the list
    # (presumably get_highlighted_text / format_tag -- confirm).
    global tag_list
    tag_list = tag_json.get_tag_list()

    records = db_util.get_all_records()

    # The context manager guarantees the file is flushed and closed, even if
    # rendering one of the records raises part-way through.
    with open("tags_preview.html", "w") as htmlfile:
        htmlfile.write("<html>\n<head>\n")
        htmlfile.write('<link rel="stylesheet" type="text/css" href="preview.css" />')
        htmlfile.write("</head>\n<body>\n")

        pbar = ProgressBar(widgets=[SimpleProgress()],
                           maxval=len(records)).start()

        for idx, record in enumerate(records):
            record = db_util.open_NASARecord(record)

            htmlfile.write("<p>\n")

            # NOTE(review): record fields are interpolated into the markup
            # as-is; confirm they cannot contain HTML-special characters.
            record_w_link = '<a href="%s">%s</a>' % (record.med_image,
                                                     record.record_id)
            category = get_highlighted_text(record.category, record.tags)
            description = get_highlighted_text(record.description, record.tags)

            htmlfile.write(format_div(record_w_link, "record_id"))
            htmlfile.write(format_div(category, "category"))
            htmlfile.write("<br/>")
            htmlfile.write(format_div(description, "description"))
            htmlfile.write("<br/>")
            htmlfile.write(format_div("Tags:", "tag_divider"))

            # TODO: make the tags the same colors
            if record.tags:
                # tag_num is the tag's position within this record; tag_id
                # indexes into tag_list, whose entries hold the tag text at
                # position 0.
                tag_text = ", ".join(
                    format_tag(tag_list[tag_id][0], tag_num)
                    for tag_num, tag_id in enumerate(record.tags))
            else:
                tag_text = "None"

            htmlfile.write(format_div(tag_text, "tag_text"))
            htmlfile.write("<br/>")
            htmlfile.write("</p>\n")

            pbar.update(idx + 1)

        pbar.finish()
        htmlfile.write("</body></html>\n")