def getEvidenceByTopic(request):
    if request.method == 'POST':
        data = json.loads(request.body)
        collection_id = data['collection_id']
        topic_id = data['topic_id']
        user_id = data['user_id']
        # Top 500 evidence items in this collection whose primary topic
        # matches, most probable first.
        evidence = Evidence.objects.filter(
            Q(evidencetopic__created_by=collection_id) &
            Q(evidencetopic__primary_topic=topic_id)
        ).order_by('-evidencetopic__primary_topic_prob').distinct()[:500]
        evidenceBookmarks = EvidenceBookmark.objects.filter(user_id=user_id)
        evidencePersonal = Evidence.objects.filter(created_by=user_id)
        serialized_json = serializers.serialize('json', evidence)
        evidence_json = flattenSerializedJson(serialized_json)
        serialized_json = serializers.serialize('json', evidenceBookmarks)
        evidenceBookmark_json = flattenSerializedJson(serialized_json)
        serialized_json = serializers.serialize('json', evidencePersonal)
        evidencePersonal_json = flattenSerializedJson(serialized_json)
        evidencePersonal = json.loads(evidencePersonal_json)
        output = {}
        output['evidencePersonal'] = []
        # Personal evidence has no cached topic assignment, so infer each
        # abstract's primary topic on the fly and keep only matches.
        for e in evidencePersonal:
            if len(e['abstract']) > 50:
                name = Collection.objects.get(collection_id=collection_id).collection_name
                topic_dist, primary_topic_terms = TopicModeler.get_document_topics(e['abstract'], name)
                primary_topic_tuple = max(topic_dist, key=lambda x: x[1])
                this_topic = primary_topic_tuple[0]
                if this_topic == topic_id:
                    output['evidencePersonal'].append(e)
        output['evidence'] = json.loads(evidence_json)
        output['evidenceBookmarks'] = json.loads(evidenceBookmark_json)
        return HttpResponse(json.dumps(output), status=status.HTTP_200_OK)
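# flattenSerializedJson is called throughout this module but defined
# elsewhere. A minimal sketch of the behavior the call sites rely on (an
# assumption, not the actual implementation): Django's serializer nests model
# fields under 'fields' and the primary key under 'pk', while the code above
# reads e['abstract'] and e['id'] directly, so the helper presumably merges
# the two levels. Kept commented out to avoid shadowing the real definition.
#
# def flattenSerializedJson(serialized_json):
#     records = json.loads(serialized_json)
#     flattened = []
#     for record in records:
#         flat = dict(record['fields'])  # lift model fields to the top level
#         flat['id'] = record['pk']      # expose the primary key as 'id'
#         flattened.append(flat)
#     return json.dumps(flattened)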
def getTopicsForDocuments(documentIds, documents):
    # Run the topic model over the documents, then key each document's topic
    # assignment by its id rather than by positional index.
    topics, index_topic_map = TopicModeler.run(documents)
    id_topic_map = {}
    for i in range(len(documentIds)):
        id_topic_map[documentIds[i]] = index_topic_map[i]
    return topics, id_topic_map
def getEvidenceRecommendationAcrossTopics(topic_dist, name):
    top_documents = TopicModeler.compute_documents_similarity(topic_dist, name)
    # The similarity matrix indexes documents from 0; the corresponding
    # Evidence rows were inserted contiguously starting at this pk, so row i
    # maps to pk i + start_id.
    start_id = 62164  # TODO: change this based on corpus name
    evidence_ids = map(lambda doc_id: doc_id + start_id, top_documents)
    # Fetch the rows, then re-order them to match the similarity ranking.
    evidence = Evidence.objects.filter(id__in=evidence_ids)
    evidence = dict([(obj.id, obj) for obj in evidence])
    sorted_evidence = [evidence[id] for id in evidence_ids]
    serialized_json = serializers.serialize('json', sorted_evidence)
    evidence_json = flattenSerializedJson(serialized_json)
    return json.loads(evidence_json)
def getEvidenceRecommendationWithinTopics(topic_dist, name, collection_id):
    if len(topic_dist) > 0:
        primary_topic_tuple = max(topic_dist, key=lambda x: x[1])
    else:
        return []
    # Restrict candidates to evidence in this collection that shares the
    # document's primary topic, then rank those candidates by similarity.
    evidence = Evidence.objects.filter(
        Q(evidencetopic__primary_topic=primary_topic_tuple[0]) &
        Q(created_by=collection_id)
    ).distinct()
    serialized_json = serializers.serialize('json', evidence)
    evidence_json = flattenSerializedJson(serialized_json)
    evidence = json.loads(evidence_json)
    abstracts = [e['abstract'] for e in evidence]
    evidence_ids = TopicModeler.compute_documents_similarity_sub(topic_dist, abstracts, name)
    sorted_evidence = map(lambda index: evidence[index], evidence_ids)
    return sorted_evidence
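# Typical call site (this mirrors getEvidenceRecommendation below): infer a
# topic distribution for new text, then rank the collection's stored
# evidence against it.
#
# topic_dist, terms = TopicModeler.get_document_topics(text, name)
# recommendations = getEvidenceRecommendationWithinTopics(topic_dist, name, collection_id)[:100]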
def searchEvidenceByTitle(request):
    if request.method == 'POST':
        data = json.loads(request.body)
        collection_id = data['collection_id']
        title = data['title']
        result_limit = data['result_limit']
        include_personal = data['include_personal']
        user_id = data['user_id']
        # Treat the given title as a series of separate terms and AND one
        # icontains filter per term, e.g. for 'upset sets' the filter is
        # Q(title__icontains='upset') & Q(title__icontains='sets').
        title_terms = title.split(' ')
        evidence = Evidence.objects.filter(
            Q(created_by=collection_id) &
            reduce(lambda x, y: x & y, [Q(title__icontains=word) for word in title_terms])
        )
        if include_personal:
            personal_evidence = Evidence.objects.filter(
                Q(created_by=user_id) &
                reduce(lambda x, y: x & y, [Q(title__icontains=word) for word in title_terms])
            )
            evidence = chain(evidence, personal_evidence)
        serialized_json = serializers.serialize('json', evidence)
        evidence_json = flattenSerializedJson(serialized_json)
        evidence = json.loads(evidence_json)
        # Rank matches by edit distance to the full query and keep the closest.
        for e in evidence:
            e['dist'] = edit_distance(title, e['title'])
        evidence = sorted(evidence, key=lambda e: e['dist'])[:result_limit]
        # Attach each result's primary topic, inferring it on the fly when no
        # cached assignment exists.
        for e in evidence:
            e['topic'] = -1
            try:
                e['topic'] = EvidenceTopic.objects.get(evidence=e['id']).primary_topic
            except ObjectDoesNotExist:
                if len(e['abstract']) > 50:
                    name = Collection.objects.get(collection_id=collection_id).collection_name
                    topic_dist, primary_topic_terms = TopicModeler.get_document_topics(e['abstract'], name)
                    primary_topic_tuple = max(topic_dist, key=lambda x: x[1])
                    e['topic'] = primary_topic_tuple[0]
                else:
                    print 'warning: evidence with no topic'
        return HttpResponse(json.dumps(evidence), status=status.HTTP_200_OK)
    elif request.method == 'GET':
        # Debug path: run a fixed query against a fixed collection.
        collection_id = 13
        title = 'UpSet: Visualization of Intersecting Sets'
        evidence = Evidence.objects.filter(created_by=collection_id)
        serialized_json = serializers.serialize('json', evidence)
        evidence_json = flattenSerializedJson(serialized_json)
        evidence = json.loads(evidence_json)
        for e in evidence:
            e['dist'] = edit_distance(title, e['title'])
        evidence = sorted(evidence, key=lambda e: e['dist'])
        return HttpResponse(json.dumps(evidence[:20]), status=status.HTTP_200_OK)
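# Example POST payload for searchEvidenceByTitle (illustrative; the
# collection id and title are taken from the debug path above, the other
# values are made up):
#
# {"collection_id": 13,
#  "title": "UpSet: Visualization of Intersecting Sets",
#  "result_limit": 20,
#  "include_personal": true,
#  "user_id": 42}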
def createOnlineLDA(request, collection_id):
    if request.method == 'GET':
        print '>> preparing data for online lda...'
        collection_id = int(collection_id)
        evidence = Evidence.objects.filter(created_by=collection_id)
        serialized_json = serializers.serialize('json', evidence)
        evidence_json = flattenSerializedJson(serialized_json)
        loaded_evidence = json.loads(evidence_json)
        abstracts = [e['abstract'] for e in loaded_evidence]
        evidencePks = [e['id'] for e in loaded_evidence]
        name = Collection.objects.get(collection_id=collection_id).collection_name
        numDocs = len(loaded_evidence)
        # The last argument scales with corpus size (roughly one per ten
        # documents), matching the topic count used in cacheTopics below.
        evidenceTopicMap, topics = TopicModeler.create_online_lda(
            abstracts, evidencePks, name, math.ceil(numDocs / 10))
        # saveTopicsForEvidence(evidenceTopicMap, collection_id)
        return HttpResponse(json.dumps({}), status=status.HTTP_200_OK)
def loadOnlineLDA(request, collection_id):
    if request.method == 'GET':
        print '>> preparing stored evidence...'
        collection_id = int(collection_id)
        evidence = Evidence.objects.filter(created_by=collection_id)
        serialized_json = serializers.serialize('json', evidence)
        evidence_json = flattenSerializedJson(serialized_json)
        loaded_evidence = json.loads(evidence_json)
        abstracts = [e['abstract'] for e in loaded_evidence]
        evidencePks = [e['id'] for e in loaded_evidence]
        name = Collection.objects.get(collection_id=collection_id).collection_name
        print '>> loading lda model...'
        evidenceTopicMap, topicList = TopicModeler.load_online_lda(abstracts, evidencePks, name)
        print '>> saving topics for evidence...'
        saveTopicsForEvidence(evidenceTopicMap, collection_id)
        return HttpResponse(json.dumps({}), status=status.HTTP_200_OK)
def addReview(self, reviewDict):
    polarity = reviewDict["sentiment"]
    # Build a per-year aggregate record for this review.
    tempDict = {}
    tempDict["date"] = dt.datetime.strptime(reviewDict["date"], '%Y-%m-%d').date().year
    tempDict["negative"] = 0
    tempDict["positive"] = 0
    tempDict["neutral"] = 0
    tempDict["sentiment"] = polarity
    tempDict["count"] = 1
    tempDict["stars"] = reviewDict["stars"]
    # Classify polarity with a +/-0.05 neutral band; negative reviews also
    # feed the corpus used later for topic mining.
    if polarity < -0.05:
        tempDict["negative"] += 1
        for word in tm.getWordVector(reviewDict["text"]):
            self.negativeCorpus.append(word)
    elif polarity > 0.05:
        tempDict["positive"] += 1
    else:
        tempDict["neutral"] += 1
    # Merge into the existing entry for this year, if any; entries without a
    # "sentiment" key have not seen a review yet and are initialized instead.
    isPresent = False
    for item in self.dict["violations"]:
        if item["date"] == tempDict["date"]:
            if "sentiment" not in item:
                item["negative"] = tempDict["negative"]
                item["positive"] = tempDict["positive"]
                item["neutral"] = tempDict["neutral"]
                item["sentiment"] = tempDict["sentiment"]
                item["stars"] = tempDict["stars"]
                item["count"] = tempDict["count"]
            else:
                item["negative"] += tempDict["negative"]
                item["positive"] += tempDict["positive"]
                item["neutral"] += tempDict["neutral"]
                item["sentiment"] += tempDict["sentiment"]
                item["stars"] += tempDict["stars"]
                item["count"] += tempDict["count"]
            isPresent = True
    if not isPresent:
        self.dict["violations"].append(tempDict)
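# Illustrative input, inferred from the keys read above (the instance name
# and all values are made up): an ISO date, a compound polarity score in
# [-1, 1], a star rating, and the raw review text.
#
# analyzer.addReview({
#     "date": "2015-03-21",
#     "sentiment": -0.31,
#     "stars": 2,
#     "text": "Cold food and slow service.",
# })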
def cacheTopics(request, collection_id):
    if request.method == 'GET':
        evidence_count = Evidence.objects.filter(created_by=collection_id).count()
        # Rebuild the cached Topic rows for this collection from scratch.
        Topic.objects.filter(collection_id=collection_id).delete()
        collection_name = Collection.objects.get(collection_id=int(collection_id)).collection_name
        topicList = TopicModeler.get_online_lda_topics(collection_name, evidence_count / 10)
        for i in range(len(topicList)):
            topic_id = topicList[i][0]
            evidence_count = Evidence.objects.filter(
                Q(evidencetopic__primary_topic=topic_id) &
                Q(created_by=collection_id)
            ).count()
            t = Topic(
                collection_id=collection_id,
                index=topic_id,
                terms=json.dumps(topicList[i][1]),
                document_count=evidence_count
            )
            t.save()
        return HttpResponse(json.dumps({}), status=status.HTTP_200_OK)
def getDictionary(self):
    # Convert the per-year accumulators into averages.
    for item in self.dict["violations"]:
        if "sentiment" in item:
            item["stars"] /= float(item["count"])
            item["sentiment"] /= item["count"]
            item.pop("count")
        if "count*" in item:
            # Average each per-severity field by its counter, guarding
            # against division by zero.
            item["*"] /= item["count*"] if item["count*"] > 0 else 1
            item["**"] /= item["count**"] if item["count**"] > 0 else 1
            item["***"] /= item["count***"] if item["count***"] > 0 else 1
            item.pop("count*")
            item.pop("count**")
            item.pop("count***")
    # Mine discussion topics from the accumulated words of negative reviews.
    self.dict["checklist"] = []
    if len(self.negativeCorpus) > 0:
        self.dict["checklist"] = tm.getTopics(self.negativeCorpus)
    return self.dict
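# Illustrative shape of the returned dictionary (values made up; fields such
# as the per-severity averages are omitted):
#
# {
#     "violations": [
#         {"date": 2014, "negative": 3, "positive": 7, "neutral": 2,
#          "sentiment": 0.12, "stars": 3.8},
#         ...
#     ],
#     "checklist": [...]   # topics mined from the negative-review corpus
# }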
def getEvidenceRecommendation(request):
    if request.method == 'POST':
        data = json.loads(request.body)
        # data = {}
        # data['text'] = 'Using brain imaging in humans, we showed that the lateral PFC is organized as a cascade of executive processes from premotor to anterior PFC regions that control behavior according to stimuli, the present perceptual context, and the temporal episode in which stimuli occur, respectively.'
        collection_id = int(data['collectionId'])
        name = Collection.objects.get(collection_id=collection_id).collection_name
        topic_dist, primary_topic_terms = TopicModeler.get_document_topics(data['text'], name)
        if len(topic_dist) > 0:
            primary_topic_tuple = max(topic_dist, key=lambda x: x[1])
        else:
            primary_topic_tuple = ('', 0)
        # Report the primary topic's terms and probability alongside up to
        # 100 recommended evidence items.
        output = {}
        output['topics'] = [{}]
        output['topics'][0]['terms'] = primary_topic_terms
        output['topics'][0]['prob'] = primary_topic_tuple[1]
        # evidence = getEvidenceRecommendationAcrossTopics(topic_dist, name)
        output['evidence'] = getEvidenceRecommendationWithinTopics(topic_dist, name, collection_id)[:100]
        return HttpResponse(json.dumps(output), status=status.HTTP_200_OK)
# businesses_results.count()
# for item in range(1000):
#     query = {"business_id": unicode(businesses_results[item]["business_id"])}

# Collect the text of every negative review (compound sentiment < -0.05).
count = 1
query = {"sentiment": {"$lt": -0.05}}
for review in db.reviewsSentiment.find(query):
    reviewsSet.append(review["text"])
    sys.stdout.write("\r" + "%d reviews processed" % count)
    sys.stdout.flush()
    count += 1
    # if count == 10:
    #     break
print "\nReviews processing done!"

# Train the topic model on the negative reviews.
tm.buildTopicModel(reviewsSet)

# count = 1
# for tip in db.tips.find():
#     tipsSet.append(tip["text"])
#     sys.stdout.write("\r" + "%d tips processed" % count)
#     sys.stdout.flush()
#     count += 1
#     # if count == 10000:
#     #     break
# print "\nTips processing done!"

# tm.analyzeSentiment(reviewsSet)
# print "Reviews %d" % db.reviews.find(query).count()
# print "Tips %d" % db.tips.find(query).count()
def createSimilarityMatrix(request):
    if request.method == 'GET':
        # name = 'pfc and executive functions'
        name = 'visualization'
        TopicModeler.create_similarity_matrix(name)
        return HttpResponse(json.dumps({}), status=status.HTTP_200_OK)
# "Fast Food Items", # "Seafood Menu Items", # "Ambiance & Hospitality", # "Dinner & Drinks"] # pickle.dump(topics, open("trained-model-topics",'w')) # topics = [] topics = pickle.load(file("trained-model-topics")) ldaModel = pickle.load(file("trained-negative-model")) ldaDictionary = pickle.load(file("model-dictionary")) print "Dictionary and Trained Model loaded!" # print ldaModel.show_topics(num_topics=10, num_words=10, formatted=False) for item in reviewsSet: topics_found = [] models = ldaModel[ldaDictionary.doc2bow(tm.getWordVector(item))] models = sorted(models, key=lambda k: k[1], reverse = True) # print models # if len(models) == 2: # print "-"*5 + str(topics[models[0][0]]) + "-"*5 + str(topics[models[1][0]]) + "-"*5 # print item for single_topic in models: topics_found.append(topics[single_topic[0]]) if len(topics_found) > 2: break print "-"*5 + str(topics_found) + "-"*5 print item c.close()