def get_doc_page(request, alg_db, doc_title, docid, docloc, doc_cutoff=10, topic_cutoff=10, alg=''):
    """
    Return the document page with related topics, related documents, and the
    document's text.

    @param request: the django request object
    @param alg_db: the algorithm database
    @param doc_title: title of the document to display
    @param docid: id of the document
    @param docloc: directory containing the raw document text files
    @param doc_cutoff: maximum number of related documents to display
    @param topic_cutoff: maximum number of related topics to display
    @param alg: the name of the algorithm e.g. 'LDA' or 'HDP'

    TODO limit the length of the document returned to first XY bytes
    """
    myrelations = relations(alg_db)
    doc = Document(docid, doc_title)
    topics = myrelations.get_top_related_topics(doc, topic_cutoff)
    piearray = get_js_doc_topic_pie_array(topics)

    # related topics, sorted by descending relation strength
    topic_keys = topics.keys()
    topic_keys.sort(lambda x, y: -cmp(topics[x], topics[y]))
    leftcol = {'piearray': piearray, 'data': topic_keys[:topic_cutoff], 'webname': 'topics'}

    # related documents, sorted by descending relation strength
    docs = myrelations.get_top_related_docs(doc, doc_cutoff)
    doc_keys = docs.keys()
    doc_keys.sort(lambda x, y: -cmp(docs[x], docs[y]))
    # FIX: previously sliced with topic_cutoff; the document column is limited by doc_cutoff
    rightcol = {'data': doc_keys[:doc_cutoff], 'webname': 'documents'}

    # locate the raw text: try the slugified title, then with a '.txt' suffix
    try:
        doc_text_file = open(os.path.join(docloc, slugify(unicode(doc_title))), 'r')
    except IOError:
        doc_text_file = open(os.path.join(docloc, slugify(unicode(doc_title)) + '.txt'), 'r')  # TODO fix hack
    try:
        midcol = {'doc': gen_clean_text(doc_text_file)}
    finally:
        doc_text_file.close()  # FIX: the file handle was previously leaked

    return render_to_response("three-column-vis.html",
                              {'leftcol': leftcol, 'rightcol': rightcol,
                               'midcol': midcol, 'title': doc.title},
                              context_instance=RequestContext(request))
def get_term_page(request, alg_db, term_title, termid, term_cutoff=NUM_TERMS, doc_cutoff=10, topic_cutoff=10, alg=''):
    """
    Return the term page with related topics, documents, and terms.

    @param request: the django request object
    @param alg_db: the algorithm database
    @param term_title: title of the term to display
    @param termid: id of the term
    @param term_cutoff: maximum number of related terms to display
    @param doc_cutoff: maximum number of related documents to display
    @param topic_cutoff: maximum number of related topics to display
    @param alg: the name of the algorithm e.g. 'LDA' or 'HDP'
    """
    # init
    myrelations = relations(alg_db)
    term = Term(termid, term_title, count=-1)

    # related topics + pie array
    # FIX: the topics were previously queried and sorted twice identically;
    # one query/sort suffices for both the pie array and the left column
    topics = myrelations.get_top_related_topics(term, topic_cutoff)
    topic_keys = topics.keys()
    topic_keys.sort(lambda x, y: -cmp(topics[x], topics[y]))
    piearray = get_js_term_topics_pie_array(myrelations, term, topic_keys)
    leftcol = {'piearray': piearray, 'data': topic_keys[:topic_cutoff], 'webname': 'topics'}

    # related docs, sorted by descending relation strength
    docs = myrelations.get_top_related_docs(term, doc_cutoff)
    doc_keys = docs.keys()
    doc_keys.sort(lambda x, y: -cmp(docs[x], docs[y]))
    midcol = {'data': doc_keys[:doc_cutoff], 'webname': 'documents'}

    # related terms
    top_related_terms = myrelations.get_top_related_terms(term, term_cutoff)
    rightcol = {'data': top_related_terms[:term_cutoff], 'webname': 'terms'}

    return render_to_response("three-column-vis.html",
                              {'leftcol': leftcol, 'midcol': midcol,
                               'rightcol': rightcol, 'title': term.title},
                              context_instance=RequestContext(request))
def get_topic_page(request, alg_db, topic_title, topicid, term_cutoff=NUM_TERMS, doc_cutoff=10, topic_cutoff=10, alg=''):
    """
    Render the topic page: related terms, documents, and topics.

    @param request: the django request object
    @param alg_db: the algorithm database
    @param topic_title: title of the topic (space-separated terms or '{a, b}' form)
    @param topicid: id of the topic
    @param term_cutoff: maximum number of related terms to display
    @param doc_cutoff: maximum number of related documents to display
    @param topic_cutoff: maximum number of related topics to display
    @param alg: the name of the algorithm e.g. 'LDA' or 'HDP'
    """
    myrelations = relations(alg_db)

    # normalize a space-separated title into the canonical '{a, b, c}' form
    if topic_title[0] != '{':
        topic_title = '{' + ', '.join(topic_title.strip().split()) + '}'
    topic = Topic(myrelations, topicid, topic_title)

    # left column: top terms plus an interactive pie chart
    terms = topic.get_terms(term_cutoff)
    piearray = get_js_topic_term_pie_array(topic, terms)  # TODO replace this in template
    leftcol = {'piearray': piearray, 'data': terms, 'webname': 'terms'}

    # middle column: documents most related to this topic, best first
    related_docs = myrelations.get_top_related_docs(topic, doc_cutoff)
    ranked_docs = sorted(related_docs.keys(), key=lambda k: related_docs[k], reverse=True)
    midcol = {'data': ranked_docs[:doc_cutoff], 'webname': 'documents'}

    # right column: other topics most related to this topic, best first
    related_topics = myrelations.get_top_related_topics(topic, topic_cutoff)
    ranked_topics = sorted(related_topics.keys(), key=lambda k: related_topics[k], reverse=True)
    rightcol = {'data': ranked_topics[:topic_cutoff], 'webname': 'topics'}

    context = {'leftcol': leftcol, 'midcol': midcol, 'rightcol': rightcol, 'title': topic.title}
    return render_to_response("three-column-vis.html", context,
                              context_instance=RequestContext(request))
def get_rel_page(request, alg_db, dataloc, alg=''):
    """
    Render the interactive (d3) relationship page.

    @param request: the django request object
    @param alg_db: the algorithm database
    @param dataloc: the location of the data for this analysis
    @param alg: the name of the algorithm e.g. 'LDA' or 'HDP'
    """
    myrelations = relations(alg_db)

    # document-document graph
    doc_graph = build_graph_json(myrelations.get_docs(), myrelations,
                                 myrelations.get_top_related_docs)

    # term-term graph (currently disabled)
    term_graph = {}
    #term_graph = build_graph_json(myrelations.get_terms(), myrelations, myrelations.get_top_related_terms)

    # topic data
    topic_data = build_topic_json(myrelations)

    # feed the results into the heavily customized template
    return render_to_response("model-relationships.html",
                              {'alg': alg,
                               'doc_data': doc_graph,
                               'term_data': term_graph,
                               'top_data': topic_data},
                              context_instance=RequestContext(request))
def presence_graph(request, item, alg_db, alg='', template='table-graph.html', extra_context=None, RPP=49):
    """
    Return a bar graph of the relative presence of each item.

    @param request: the django request object
    @param item: which objects to chart: "topics" or "terms"
    @param alg_db: the algorithm database
    @param alg: the name of the algorithm e.g. 'LDA' or 'HDP'
    @param template: template used to render the chart
    @param extra_context: optional dict merged into the template context
    @param RPP: results per page for ajax pagination
    @raise ValueError: if item is neither "topics" nor "terms"
    """
    # ajax pagination ('in' replaces the deprecated dict.has_key)
    if 'page' in request.GET:
        page = int(request.GET['page'])
        start_val = (page - 1) * RPP
        end_val = page * RPP
    else:
        start_val = 0
        end_val = RPP

    myrelations = relations(alg_db)
    if item == "topics":
        mobjs = myrelations.get_topics()
        score_fnct = lambda topic: topic.score
        max_score = Topic.max_score
        title_fnct = lambda score, width: str(width) + '% of max'
    elif item == "terms":
        mobjs = myrelations.get_terms(start_val=start_val, end_val=end_val)
        score_fnct = lambda term: term.count
        max_score = Term.max_occ
        # NOTE: 'occurences' spelling kept as-is -- it is user-visible runtime text
        title_fnct = lambda score, width: str(score) + ' occurences, ' + str(width) + '% of max'
    else:
        # FIX: previously fell through and raised an obscure NameError on mobjs
        raise ValueError("unsupported item type: %r" % (item,))

    context = {'input': get_bar_chart(mobjs, score_fnct, max_score, title_fnct),
               'alg': alg, 'group_data_type': item, 'RPP': RPP}
    if extra_context:
        context.update(extra_context)
    return render_to_response(template, context,
                              context_instance=RequestContext(request))
def get_summary_page(request, alg_db, numterms=NUM_TERMS, numcolumns=3, alg=''):
    """
    Render the Analyzer's summary page.

    @param request: the django request object
    @param alg_db: the algorithm database
    @param numterms: the number of terms to display for the topics
    @param numcolumns: the number of columns to use to display the topics
    @param alg: the name of the algorithm e.g. 'LDA' or 'HDP'
    """
    myrelations = relations(alg_db)
    topics = myrelations.get_topics()
    ncol = min(len(topics), numcolumns)  # number of columns for the data table

    # one cell per topic: its title, id, and top numterms terms
    cells = []
    for tpc in topics:
        term_dicts = [{'title': t.title, 'id': t.id} for t in tpc.get_terms(numterms)]
        cells.append({'title': tpc.title, 'id': tpc.topic_id, 'terms': term_dicts})

    # group the cells into rows of ncol so django can lay them out nicely
    disp_tops = []
    if ncol:
        disp_tops = [cells[i:i + ncol] for i in xrange(0, len(cells), ncol)]

    return render_to_response('summary.html',
                              {'disp_topics': disp_tops, 'alg': alg},
                              context_instance=RequestContext(request))
def get_doc_page(request, alg_db, doc_title, docid, docloc, doc_cutoff=10, topic_cutoff=10, alg=''):
    """
    Return the document page with related topics, related documents, and the
    document's text.

    @param request: the django request object
    @param alg_db: the algorithm database
    @param doc_title: title of the document to display
    @param docid: id of the document
    @param docloc: directory containing the raw document text files
    @param doc_cutoff: maximum number of related documents to display
    @param topic_cutoff: maximum number of related topics to display
    @param alg: the name of the algorithm e.g. 'LDA' or 'HDP'

    TODO limit the length of the document returned to first XY bytes
    """
    myrelations = relations(alg_db)
    doc = Document(docid, doc_title)
    topics = myrelations.get_top_related_topics(doc, topic_cutoff)
    piearray = get_js_doc_topic_pie_array(topics)

    # related topics, strongest relation first
    topic_keys = topics.keys()
    topic_keys.sort(lambda x, y: -cmp(topics[x], topics[y]))
    leftcol = {
        'piearray': piearray,
        'data': topic_keys[:topic_cutoff],
        'webname': 'topics'
    }

    # related documents, strongest relation first
    docs = myrelations.get_top_related_docs(doc, doc_cutoff)
    doc_keys = docs.keys()
    doc_keys.sort(lambda x, y: -cmp(docs[x], docs[y]))
    # FIX: previously sliced with topic_cutoff; the document column is limited by doc_cutoff
    rightcol = {'data': doc_keys[:doc_cutoff], 'webname': 'documents'}

    # locate the raw text: try the slugified title, then with a '.txt' suffix
    try:
        doc_text_file = open(
            os.path.join(docloc, slugify(unicode(doc_title))), 'r')
    except IOError:
        doc_text_file = open(
            os.path.join(docloc, slugify(unicode(doc_title)) + '.txt'),
            'r')  # TODO fix hack
    try:
        midcol = {'doc': gen_clean_text(doc_text_file)}
    finally:
        doc_text_file.close()  # FIX: the file handle was previously leaked

    return render_to_response("three-column-vis.html", {
        'leftcol': leftcol,
        'rightcol': rightcol,
        'midcol': midcol,
        'title': doc.title
    }, context_instance=RequestContext(request))
def get_term_page(request, alg_db, term_title, termid, term_cutoff=NUM_TERMS, doc_cutoff=10, topic_cutoff=10, alg=''):
    """
    Return the term page with related topics, documents, and terms.

    @param request: the django request object
    @param alg_db: the algorithm database
    @param term_title: title of the term to display
    @param termid: id of the term
    @param term_cutoff: maximum number of related terms to display
    @param doc_cutoff: maximum number of related documents to display
    @param topic_cutoff: maximum number of related topics to display
    @param alg: the name of the algorithm e.g. 'LDA' or 'HDP'
    """
    # init
    myrelations = relations(alg_db)
    term = Term(termid, term_title, count=-1)

    # related topics + pie array
    # FIX: the related-topics query and sort were previously executed twice
    # back-to-back with identical arguments; one pass feeds both uses
    topics = myrelations.get_top_related_topics(term, topic_cutoff)
    topic_keys = topics.keys()
    topic_keys.sort(lambda x, y: -cmp(topics[x], topics[y]))
    piearray = get_js_term_topics_pie_array(myrelations, term, topic_keys)
    leftcol = {
        'piearray': piearray,
        'data': topic_keys[:topic_cutoff],
        'webname': 'topics'
    }

    # related docs, strongest relation first
    docs = myrelations.get_top_related_docs(term, doc_cutoff)
    doc_keys = docs.keys()
    doc_keys.sort(lambda x, y: -cmp(docs[x], docs[y]))
    midcol = {'data': doc_keys[:doc_cutoff], 'webname': 'documents'}

    # related terms
    top_related_terms = myrelations.get_top_related_terms(term, term_cutoff)
    rightcol = {'data': top_related_terms[:term_cutoff], 'webname': 'terms'}

    return render_to_response("three-column-vis.html", {
        'leftcol': leftcol,
        'midcol': midcol,
        'rightcol': rightcol,
        'title': term.title
    }, context_instance=RequestContext(request))
def get_summary_page(request, alg_db, numterms=NUM_TERMS, numcolumns=3, alg=''):
    """
    Render the Analyzer's summary page.

    @param request: the django request object
    @param alg_db: the algorithm database
    @param numterms: the number of terms to display for the topics
    @param numcolumns: the number of columns to use to display the topics
    @param alg: the name of the algorithm e.g. 'LDA' or 'HDP'
    """
    myrelations = relations(alg_db)
    all_topics = myrelations.get_topics()
    row_width = min(len(all_topics), numcolumns)  # columns in the data table

    # build one display dict per topic, carrying its top numterms terms
    topic_cells = []
    for topic in all_topics:
        top_terms = [{'title': trm.title, 'id': trm.id}
                     for trm in topic.get_terms(numterms)]
        topic_cells.append({'title': topic.title,
                            'id': topic.topic_id,
                            'terms': top_terms})

    # chunk the flat list into rows of row_width for the django template
    display_rows = []
    if row_width:
        display_rows = [topic_cells[start:start + row_width]
                        for start in xrange(0, len(topic_cells), row_width)]

    return render_to_response('summary.html',
                              {'disp_topics': display_rows, 'alg': alg},
                              context_instance=RequestContext(request))
def get_rel_page(request, alg_db, dataloc, alg=''):
    """
    Render the interactive (d3) relationship page.

    @param request: the django request object
    @param alg_db: the algorithm database
    @param dataloc: the location of the data for this analysis
    @param alg: the name of the algorithm e.g. 'LDA' or 'HDP'
    """
    myrelations = relations(alg_db)

    # assemble the three data sets the page needs:
    # 1) the document-document graph
    doc_graph = build_graph_json(myrelations.get_docs(), myrelations,
                                 myrelations.get_top_related_docs)
    # 2) the term-term graph (currently disabled)
    term_graph = {}
    #term_graph = build_graph_json(myrelations.get_terms(), myrelations, myrelations.get_top_related_terms)
    # 3) the topics
    topic_data = build_topic_json(myrelations)

    # feed the results into the heavily customized template
    context = {'alg': alg,
               'doc_data': doc_graph,
               'term_data': term_graph,
               'top_data': topic_data}
    return render_to_response("model-relationships.html", context,
                              context_instance=RequestContext(request))
def table_graph_rel(request, type, alg_db, alg='', template='table-graph.html', extra_context=None, RPP=49):
    """
    Construct a table-graph to display relationships in the data.

    @param request: the django request object
    @param type: graph flavor: 'doc-graph', 'term-graph', or 'topic-graph'
                 (name shadows the builtin, but is kept for keyword callers)
    @param alg_db: the algorithm database
    @param alg: the name of the algorithm e.g. 'LDA' or 'HDP'
    @param template: template used to render the table-graph
    @param extra_context: optional dict merged into the template context
    @param RPP: results per page for ajax pagination
    @raise ValueError: if type is not one of the three supported flavors
    """
    # check for ajax pagination ('in' replaces the deprecated dict.has_key)
    if 'page' in request.GET:
        page = int(request.GET['page'])
        start_val = (page - 1) * RPP
        end_val = page * RPP
    else:
        start_val = 0
        end_val = RPP

    myrelations = relations(alg_db)
    num_topics = myrelations.get_num_topics()
    if type == 'doc-graph':
        main_objs = myrelations.get_docs(start_val=start_val, end_val=end_val)
        rel_pct_fct = lambda top, doc, tops: 100 * tops[top] / sum(tops.values())
        get_top_related_fnct = myrelations.get_top_related_topics
        group_data_type = "documents"
        data_type = "topics"
        color_bars = True
    elif type == 'term-graph':
        main_objs = myrelations.get_terms(start_val=start_val, end_val=end_val)
        rel_pct_fct = myrelations.get_top_in_term_rel_pct
        get_top_related_fnct = myrelations.get_top_related_topics
        group_data_type = "terms"
        data_type = "topics"
        color_bars = True
    elif type == 'topic-graph':
        main_objs = myrelations.get_topics(start_val=start_val, end_val=end_val)
        rel_pct_fct = myrelations.get_term_in_top_rel_pct
        get_top_related_fnct = myrelations.get_topic_terms
        group_data_type = "topics"
        data_type = "terms"
        color_bars = False
    else:
        # FIX: previously fell through and raised an obscure NameError on main_objs
        raise ValueError("unsupported graph type: %r" % (type,))

    context = {'input': table_object_gen(main_objs, get_top_related_fnct, rel_pct_fct),
               "num_topics": num_topics, "color_bars": color_bars, 'alg': alg,
               'group_data_type': group_data_type, 'data_type': data_type, 'RPP': RPP}
    if extra_context:
        context.update(extra_context)
    return render_to_response(template, context,
                              context_instance=RequestContext(request))
def presence_graph(request, item, alg_db, alg='', template='table-graph.html', extra_context=None, RPP=49):
    """
    Return a bar graph of the relative presence of each item.

    @param request: the django request object
    @param item: which objects to chart: "topics" or "terms"
    @param alg_db: the algorithm database
    @param alg: the name of the algorithm e.g. 'LDA' or 'HDP'
    @param template: template used to render the chart
    @param extra_context: optional dict merged into the template context
    @param RPP: results per page for ajax pagination
    @raise ValueError: if item is neither "topics" nor "terms"
    """
    # ajax pagination ('in' replaces the deprecated dict.has_key)
    if 'page' in request.GET:
        page = int(request.GET['page'])
        start_val = (page - 1) * RPP
        end_val = page * RPP
    else:
        start_val = 0
        end_val = RPP

    myrelations = relations(alg_db)
    if item == "topics":
        mobjs = myrelations.get_topics()
        score_fnct = lambda topic: topic.score
        max_score = Topic.max_score
        title_fnct = lambda score, width: str(width) + '% of max'
    elif item == "terms":
        mobjs = myrelations.get_terms(start_val=start_val, end_val=end_val)
        score_fnct = lambda term: term.count
        max_score = Term.max_occ
        # NOTE: 'occurences' spelling kept as-is -- it is user-visible runtime text
        title_fnct = lambda score, width: str(score) + ' occurences, ' + str(
            width) + '% of max'
    else:
        # FIX: previously fell through and raised an obscure NameError on mobjs
        raise ValueError("unsupported item type: %r" % (item,))

    context = {
        'input': get_bar_chart(mobjs, score_fnct, max_score, title_fnct),
        'alg': alg,
        'group_data_type': item,
        'RPP': RPP
    }
    if extra_context:
        context.update(extra_context)
    return render_to_response(template, context,
                              context_instance=RequestContext(request))
def get_topic_page(request, alg_db, topic_title, topicid, term_cutoff=NUM_TERMS, doc_cutoff=10, topic_cutoff=10, alg=''):
    """
    Render the topic page: related terms, documents, and topics.

    @param request: the django request object
    @param alg_db: the algorithm database
    @param topic_title: title of the topic (space-separated terms or '{a, b}' form)
    @param topicid: id of the topic
    @param term_cutoff: maximum number of related terms to display
    @param doc_cutoff: maximum number of related documents to display
    @param topic_cutoff: maximum number of related topics to display
    @param alg: the name of the algorithm e.g. 'LDA' or 'HDP'
    """
    myrelations = relations(alg_db)

    # canonicalize a plain space-separated title to the '{a, b, c}' form
    if topic_title[0] != '{':
        topic_title = '{' + ', '.join(topic_title.strip().split()) + '}'
    topic = Topic(myrelations, topicid, topic_title)

    # left column: the topic's top terms with an interactive pie chart
    top_terms = topic.get_terms(term_cutoff)
    pie = get_js_topic_term_pie_array(topic, top_terms)  # TODO replace this in template
    leftcol = {'piearray': pie, 'data': top_terms, 'webname': 'terms'}

    # middle column: most related documents, best first
    doc_scores = myrelations.get_top_related_docs(topic, doc_cutoff)
    ordered_docs = sorted(doc_scores.keys(),
                          key=lambda d: doc_scores[d], reverse=True)
    midcol = {'data': ordered_docs[:doc_cutoff], 'webname': 'documents'}

    # right column: most related topics, best first
    topic_scores = myrelations.get_top_related_topics(topic, topic_cutoff)
    ordered_topics = sorted(topic_scores.keys(),
                            key=lambda t: topic_scores[t], reverse=True)
    rightcol = {'data': ordered_topics[:topic_cutoff], 'webname': 'topics'}

    return render_to_response("three-column-vis.html",
                              {'leftcol': leftcol,
                               'midcol': midcol,
                               'rightcol': rightcol,
                               'title': topic.title},
                              context_instance=RequestContext(request))
def get_model_page(request, alg_db, corpus_dbloc, dataloc, alg='', num_terms=NUM_TERMS, bing_dict=None, form=None):
    # TODO stem the terms once for bing and wiki calculations
    """
    Obtain/save the appropriate data for the model-description page.

    @param request: the django request object
    @param alg_db: the algorithm database
    @param corpus_dbloc: the location of the corpus specific database (contains cooccurence information)
    @param dataloc: the location of the data for this analysis
    @param alg: the algorithm used for this analysis
    @param num_terms: the number of terms to display for the topics
    @param bing_dict: previously acquired bing scores, if any
                      (FIX: default changed from a shared mutable {} to None)
    @param form: optional form passed through to the template
    """
    if bing_dict is None:
        bing_dict = {}
    alg_loc = "%s/%s" % (dataloc, alg)

    # prep the model likelihood results (FIX: 'with' so the handle always closes)
    with open(os.path.join(alg_loc, 'js_likelihood.dat'), 'r') as lin:
        ldata = lin.readline().strip()

    # per-topic log likelihood
    myrelations = relations(alg_db)
    topics = myrelations.get_topics()
    log_like = {}
    for tpc in topics:
        terms = myrelations.get_topic_terms(tpc, num_terms)
        log_like[tpc.id] = round(sum(terms.values()), 2)

    # topic coherence
    tc_scores = get_topic_coherence_scores(topics, corpus_dbloc)
    for tid in tc_scores:
        tc_scores[tid] = round(sum(tc_scores[tid]), 2)

    # see if we already acquired the bing scores, if not, save the topic_terms for AJAX queries
    search_title_scores = {}
    if bing_dict != {}:
        search_title_scores = bing_dict
    else:
        save_topic_terms(topics, alg_loc, num_terms)

    # wikipedia scores
    wiki_abs_scores = get_wiki_pmi_coherence(topics)
    for tid in wiki_abs_scores:
        if len(wiki_abs_scores[tid]) > 0:
            wiki_abs_scores[tid] = round(median(wiki_abs_scores[tid]), 2)
        else:
            wiki_abs_scores[tid] = -1  # TODO verify why this happens --- it's usually with strange words

    # build "topic dictionaries" for faster django access
    srt_tc_scores = sorted(tc_scores.values())
    srt_search_title_scores = sorted(search_title_scores.values())
    srt_wiki_abs_scores = sorted(wiki_abs_scores.values())
    srt_log_like = sorted(log_like.values())
    ntopics = len(topics)

    def _alpha(srt_scores, score):
        # rank of score normalized to [0, 1]; shared by all four score types
        # (removes the old code repetition). FIX: guard ntopics == 1, which
        # previously raised ZeroDivisionError.
        if ntopics < 2:
            return 0.0
        return round(srt_scores.index(score) / float(ntopics - 1), 3)

    top_dicts = []
    for i in xrange(ntopics):
        tpc = topics[i]
        top_dicts.append({})
        # result unused in the original; presumably primes the topic's term
        # data for the get_term calls below -- TODO confirm
        tpc.get_terms(num_terms)
        top_dicts[i]['id'] = tpc.id
        top_dicts[i]['terms'] = ', '.join([tpc.get_term(x).title for x in range(num_terms)])
        if tc_scores != {}:
            top_dicts[i]['tc_score'] = tc_scores[tpc.id]
            top_dicts[i]['tc_score_alpha'] = _alpha(srt_tc_scores, tc_scores[tpc.id])
        if search_title_scores != {}:
            top_dicts[i]['search_title_score'] = search_title_scores[tpc.id]
            top_dicts[i]['search_title_score_alpha'] = _alpha(srt_search_title_scores, search_title_scores[tpc.id])
        if wiki_abs_scores != {}:
            top_dicts[i]['wiki_abs_score'] = wiki_abs_scores[tpc.id]
            top_dicts[i]['wiki_abs_score_alpha'] = _alpha(srt_wiki_abs_scores, wiki_abs_scores[tpc.id])
        if log_like != {}:
            top_dicts[i]['topic_likelihood'] = log_like[tpc.id]
            top_dicts[i]['topic_likelihood_alpha'] = _alpha(srt_log_like, log_like[tpc.id])

    rgb = {"r": 255, "g": 171, "b": 115}
    return render_to_response("model-analysis.html",
                              {'form': form, 'max_ntopics': MAX_NUM_TOPICS,
                               'rgb': rgb, 'like_data': ldata,
                               'topics': top_dicts,
                               "query_bing": search_title_scores == {},
                               'do_ppx': ALLOW_PERPLEX},
                              context_instance=RequestContext(request))
def table_graph_rel(request, type, alg_db, alg='', template='table-graph.html', extra_context=None, RPP=49):
    """
    Construct a table-graph to display relationships in the data.

    @param request: the django request object
    @param type: graph flavor: 'doc-graph', 'term-graph', or 'topic-graph'
                 (name shadows the builtin, but is kept for keyword callers)
    @param alg_db: the algorithm database
    @param alg: the name of the algorithm e.g. 'LDA' or 'HDP'
    @param template: template used to render the table-graph
    @param extra_context: optional dict merged into the template context
    @param RPP: results per page for ajax pagination
    @raise ValueError: if type is not one of the three supported flavors
    """
    # check for ajax pagination ('in' replaces the deprecated dict.has_key)
    if 'page' in request.GET:
        page = int(request.GET['page'])
        start_val = (page - 1) * RPP
        end_val = page * RPP
    else:
        start_val = 0
        end_val = RPP

    myrelations = relations(alg_db)
    num_topics = myrelations.get_num_topics()
    if type == 'doc-graph':
        main_objs = myrelations.get_docs(start_val=start_val, end_val=end_val)
        rel_pct_fct = lambda top, doc, tops: 100 * tops[top] / sum(tops.values())
        get_top_related_fnct = myrelations.get_top_related_topics
        group_data_type = "documents"
        data_type = "topics"
        color_bars = True
    elif type == 'term-graph':
        main_objs = myrelations.get_terms(start_val=start_val, end_val=end_val)
        rel_pct_fct = myrelations.get_top_in_term_rel_pct
        get_top_related_fnct = myrelations.get_top_related_topics
        group_data_type = "terms"
        data_type = "topics"
        color_bars = True
    elif type == 'topic-graph':
        main_objs = myrelations.get_topics(start_val=start_val,
                                           end_val=end_val)
        rel_pct_fct = myrelations.get_term_in_top_rel_pct
        get_top_related_fnct = myrelations.get_topic_terms
        group_data_type = "topics"
        data_type = "terms"
        color_bars = False
    else:
        # FIX: previously fell through and raised an obscure NameError on main_objs
        raise ValueError("unsupported graph type: %r" % (type,))

    context = {
        'input': table_object_gen(main_objs, get_top_related_fnct, rel_pct_fct),
        "num_topics": num_topics,
        "color_bars": color_bars,
        'alg': alg,
        'group_data_type': group_data_type,
        'data_type': data_type,
        'RPP': RPP
    }
    if extra_context:
        context.update(extra_context)
    return render_to_response(template, context,
                              context_instance=RequestContext(request))
def get_model_page(request, alg_db, corpus_dbloc, dataloc, alg='',
                   num_terms=NUM_TERMS, bing_dict=None, form=None):
    # TODO stem the terms once for bing and wiki calculations
    """
    Obtain/save the appropriate data for the model-description page.

    @param request: the django request object
    @param alg_db: the algorithm database
    @param corpus_dbloc: the location of the corpus specific database (contains cooccurence information)
    @param dataloc: the location of the data for this analysis
    @param alg: the algorithm used for this analysis
    @param num_terms: the number of terms to display for the topics
    @param bing_dict: previously acquired bing scores, if any
                      (FIX: default changed from a shared mutable {} to None)
    @param form: optional form passed through to the template
    """
    if bing_dict is None:
        bing_dict = {}
    alg_loc = "%s/%s" % (dataloc, alg)

    # prep the model likelihood results (FIX: 'with' so the handle always closes)
    with open(os.path.join(alg_loc, 'js_likelihood.dat'), 'r') as lin:
        ldata = lin.readline().strip()

    # per-topic log likelihood
    myrelations = relations(alg_db)
    topics = myrelations.get_topics()
    log_like = {}
    for tpc in topics:
        terms = myrelations.get_topic_terms(tpc, num_terms)
        log_like[tpc.id] = round(sum(terms.values()), 2)

    # topic coherence
    tc_scores = get_topic_coherence_scores(topics, corpus_dbloc)
    for tid in tc_scores:
        tc_scores[tid] = round(sum(tc_scores[tid]), 2)

    # see if we already acquired the bing scores, if not, save the topic_terms for AJAX queries
    search_title_scores = {}
    if bing_dict != {}:
        search_title_scores = bing_dict
    else:
        save_topic_terms(topics, alg_loc, num_terms)

    # wikipedia scores
    wiki_abs_scores = get_wiki_pmi_coherence(topics)
    for tid in wiki_abs_scores:
        if len(wiki_abs_scores[tid]) > 0:
            wiki_abs_scores[tid] = round(median(wiki_abs_scores[tid]), 2)
        else:
            wiki_abs_scores[
                tid] = -1  # TODO verify why this happens --- it's usually with strange words

    # build "topic dictionaries" for faster django access
    srt_tc_scores = sorted(tc_scores.values())
    srt_search_title_scores = sorted(search_title_scores.values())
    srt_wiki_abs_scores = sorted(wiki_abs_scores.values())
    srt_log_like = sorted(log_like.values())
    ntopics = len(topics)

    def _alpha(srt_scores, score):
        # rank of score normalized to [0, 1]; shared by all four score types
        # (removes the old code repetition). FIX: guard ntopics == 1, which
        # previously raised ZeroDivisionError.
        if ntopics < 2:
            return 0.0
        return round(srt_scores.index(score) / float(ntopics - 1), 3)

    top_dicts = []
    for i in xrange(ntopics):
        tpc = topics[i]
        top_dicts.append({})
        # result unused in the original; presumably primes the topic's term
        # data for the get_term calls below -- TODO confirm
        tpc.get_terms(num_terms)
        top_dicts[i]['id'] = tpc.id
        top_dicts[i]['terms'] = ', '.join(
            [tpc.get_term(x).title for x in range(num_terms)])
        if tc_scores != {}:
            top_dicts[i]['tc_score'] = tc_scores[tpc.id]
            top_dicts[i]['tc_score_alpha'] = _alpha(srt_tc_scores,
                                                    tc_scores[tpc.id])
        if search_title_scores != {}:
            top_dicts[i]['search_title_score'] = search_title_scores[tpc.id]
            top_dicts[i]['search_title_score_alpha'] = _alpha(
                srt_search_title_scores, search_title_scores[tpc.id])
        if wiki_abs_scores != {}:
            top_dicts[i]['wiki_abs_score'] = wiki_abs_scores[tpc.id]
            top_dicts[i]['wiki_abs_score_alpha'] = _alpha(
                srt_wiki_abs_scores, wiki_abs_scores[tpc.id])
        if log_like != {}:
            top_dicts[i]['topic_likelihood'] = log_like[tpc.id]
            top_dicts[i]['topic_likelihood_alpha'] = _alpha(
                srt_log_like, log_like[tpc.id])

    rgb = {"r": 255, "g": 171, "b": 115}
    return render_to_response("model-analysis.html", {
        'form': form,
        'max_ntopics': MAX_NUM_TOPICS,
        'rgb': rgb,
        'like_data': ldata,
        'topics': top_dicts,
        "query_bing": search_title_scores == {},
        'do_ppx': ALLOW_PERPLEX
    }, context_instance=RequestContext(request))