def get_by_model(self, queryset_or_model, tags):
    """
    Create a ``QuerySet`` containing instances of the specified
    model associated with a given tag or list of tags.
    """
    tag_list = get_tag_list(tags)
    if len(tag_list) == 0:
        # No existing tags were given - nothing can match.
        queryset, model = get_queryset_and_model(queryset_or_model)
        return model._default_manager.none()
    if len(tag_list) > 1:
        # Multiple tags: delegate to the intersection query.
        return self.get_intersection_by_model(queryset_or_model, tag_list)
    # Single tag - optimised path using a simple extra() join below.
    single_tag = tag_list[0]
    queryset, model = get_queryset_and_model(queryset_or_model)
    content_type = ContentType.objects.get_for_model(model)
    tagged_item_opts = self.model._meta
    tagged_item_table = qn(tagged_item_opts.db_table)
    model_table = qn(model._meta.db_table)
    model_pk_column = qn(model._meta.pk.column)
    return queryset.extra(
        tables=[tagged_item_opts.db_table],
        where=[
            '%s.content_type_id = %%s' % tagged_item_table,
            '%s.tag_id = %%s' % tagged_item_table,
            '%s.%s = %s.object_id' % (model_table,
                                      model_pk_column,
                                      tagged_item_table)
        ],
        params=[content_type.pk, single_tag.pk],
    )
def user_rep():
    """
    Build a per-user tag-preference representation from the pickled
    training posts.

    Returns a dict mapping user index (via the module-level ``user_id``
    mapping) to a softmax-normalised vector of tag counts of length
    ``len(tag_dict)``.  Posts whose owner is not present in ``user_id``
    are skipped.
    """
    # Fix: the original left the file handle open; use a context manager.
    with open("Training_Body_Title_user.p", "rb") as openfile:
        posts = pickle.load(openfile)
    repre = {}
    for count, y in enumerate(posts, 1):
        tag_list = utils.get_tag_list(y['tags'].encode('utf-8'))
        tag_enc = get_tag_encoding(tag_list)
        print(count)
        # Fix: replaced nested bare ``except`` blocks (which silently
        # swallowed *any* error and repeated the dict lookups) with one
        # explicit KeyError check for unknown users.
        try:
            uid = user_id[y['OwnerUserId']]
        except KeyError:
            continue  # owner has no assigned user index
        if uid not in repre:
            repre[uid] = np.zeros(len(tag_dict))
        repre[uid] += tag_enc
    for key in repre:
        repre[key] = softmax(repre[key])
        print(repre[key].shape)
    return repre
def related_for_model(self, tags, model, counts=False, min_count=None):
    """
    Obtain a list of tags related to a given list of tags - that is,
    other tags used by items which have all the given tags.

    If ``counts`` is True, a ``count`` attribute will be added to each
    tag, indicating the number of items which have it in addition to the
    given list of tags.

    If ``min_count`` is given, only tags which have a ``count`` greater
    than or equal to ``min_count`` will be returned.  Passing a value for
    ``min_count`` implies ``counts=True``.
    """
    from models import TaggedItem
    if min_count is not None: counts = True
    tags = get_tag_list(tags)
    tag_count = len(tags)
    tagged_item_table = qn(TaggedItem._meta.db_table)
    # Inner subquery: ids of objects carrying *all* given tags
    # (GROUP BY object_id HAVING COUNT = tag_count).  Outer query then
    # collects every *other* tag attached to those objects, optionally
    # counting and filtering by min_count.
    query = """
    SELECT %(tag)s.id, %(tag)s.name%(count_sql)s
    FROM %(tagged_item)s
    INNER JOIN %(tag)s ON %(tagged_item)s.tag_id = %(tag)s.id
    WHERE %(tagged_item)s.content_type_id = %(content_type_id)s
      AND %(tagged_item)s.object_id IN
      (
          SELECT %(tagged_item)s.object_id
          FROM %(tagged_item)s, %(tag)s
          WHERE %(tagged_item)s.content_type_id = %(content_type_id)s
            AND %(tag)s.id = %(tagged_item)s.tag_id
            AND %(tag)s.id IN (%(tag_id_placeholders)s)
          GROUP BY %(tagged_item)s.object_id
          HAVING COUNT(%(tagged_item)s.object_id) = %(tag_count)s
      )
      AND %(tag)s.id NOT IN (%(tag_id_placeholders)s)
    GROUP BY %(tag)s.id, %(tag)s.name
    %(min_count_sql)s
    ORDER BY %(tag)s.name ASC""" % {
        'tag': qn(self.model._meta.db_table),
        # Py2 and/or idiom: expands to ', COUNT(...)' only when counting.
        'count_sql': counts and ', COUNT(%s.object_id)' % tagged_item_table or '',
        'tagged_item': tagged_item_table,
        'content_type_id': ContentType.objects.get_for_model(model).pk,
        'tag_id_placeholders': ','.join(['%s'] * tag_count),
        'tag_count': tag_count,
        'min_count_sql': min_count is not None and ('HAVING COUNT(%s.object_id) >= %%s' % tagged_item_table) or '',
    }
    # Tag ids appear in two IN (...) placeholder lists above, hence the
    # doubled params list; min_count (if any) binds the trailing HAVING.
    params = [tag.pk for tag in tags] * 2
    if min_count is not None:
        params.append(min_count)
    cursor = connection.cursor()
    cursor.execute(query, params)
    related = []
    for row in cursor.fetchall():
        # Rebuild a tag model instance from the (id, name) columns.
        tag = self.model(*row[:2])
        if counts is True:
            tag.count = row[2]
        related.append(tag)
    return related
def train(): openfile = open("Training_Body_Title_user.p", "rb") x = pickle.load(openfile) #print "x:",len(x) for loop in xrange(5): #print "loop: ",loop x_train = [] user = [] y_train = [] cnt = 0 trace = 0 for o, y in enumerate(x): try: question = y['Title'].encode('utf-8') + ' ' except: question = '' question = question + y['Body'].encode('utf-8') question = utils.clean_question(question) tag_string = y['tags'].encode('utf-8') #print tag_string tag_list = utils.get_tag_list(tag_string) question_enc = get_question_embedding(question) tag_enc = get_tag_encoding(tag_list) cnt = cnt + 1 x_train.append(question_enc) y_train.append(tag_enc) try: user.append(meta_model[str(user_num[y['OwnerUserId']])]) except: #trace += 1 #print trace user.append(np.zeros(128)) if cnt == batch_size: x_train = np.asarray(x_train) y_train = np.asarray(y_train) user = np.asarray(user) print "cnt: ", cnt, " loop: ", loop, " o: ", o # print (x_train.shape) #model.fit([x_train, user],y_train, epochs=1) model.fit(x_train, y_train, epochs=1) #model.save('model4_train_add_Body_Title_gru_epochs10.h5') #model.save_weights('model4_train_add_Body_Title_weights_gru_epochs10.h5') model.save( 'model4_train_DeepTagRecContent_usingAdd_sigmoid.h5') model.save_weights( 'model4_train_DeepTagRecContent_weights_usingAdd_sigmoid.h5' ) x_train = [] user = [] y_train = [] cnt = 0
def get_intersection_by_model(self, queryset_or_model, tags):
    """
    Create a ``QuerySet`` containing instances of the specified
    model associated with *all* of the given list of tags.
    """
    tag_list = get_tag_list(tags)
    queryset, model = get_queryset_and_model(queryset_or_model)
    if not tag_list:
        return model._default_manager.none()
    model_table = qn(model._meta.db_table)
    # Select the ids of every object carrying all of the given tags:
    # group by object id and require the match count to equal the number
    # of tags requested.
    query = """
    SELECT %(model_pk)s
    FROM %(model)s, %(tagged_item)s
    WHERE %(tagged_item)s.content_type_id = %(content_type_id)s
      AND %(tagged_item)s.tag_id IN (%(tag_id_placeholders)s)
      AND %(model_pk)s = %(tagged_item)s.object_id
    GROUP BY %(model_pk)s
    HAVING COUNT(%(model_pk)s) = %(tag_count)s""" % {
        'model_pk': '%s.%s' % (model_table, qn(model._meta.pk.column)),
        'model': model_table,
        'tagged_item': qn(self.model._meta.db_table),
        'content_type_id': ContentType.objects.get_for_model(model).pk,
        'tag_id_placeholders': ','.join(['%s'] * len(tag_list)),
        'tag_count': len(tag_list),
    }
    cursor = connection.cursor()
    cursor.execute(query, [tag.pk for tag in tag_list])
    object_ids = [row[0] for row in cursor.fetchall()]
    if not object_ids:
        return model._default_manager.none()
    return queryset.filter(pk__in=object_ids)
def __init__(self, corpus):
    """
    Build a PCFG from ``corpus``: extract grammar/lexicon counts via
    ``get_pcfg``, compute overall token frequencies and terminal-tag
    frequencies, binarise the grammar (``chomskyfy``) and convert the
    raw counts into probabilities (``get_prob``).
    """
    self.grammar = {}
    self.lexicon = {}
    self.get_pcfg(corpus)
    # Aggregate per-tag word counts into overall token frequencies.
    # Fix: the original tested ``word in self.freq_tokens.keys()`` (an
    # O(n) list scan on Python 2); dict.get makes it a single lookup.
    self.freq_tokens = {}
    for word_counts in self.lexicon.values():
        for word, count in word_counts.items():
            self.freq_tokens[word] = self.freq_tokens.get(word, 0) + count
    # Fix: renamed the accumulator from ``sum`` - never shadow builtins.
    total = np.sum(list(self.freq_tokens.values()))
    for word in self.freq_tokens:
        # NOTE(review): if the counts are ints this is integer division
        # under Python 2 - confirm counts are floats or Python 3 is used.
        self.freq_tokens[word] /= total
    self.set_artificial_tags = set()
    self.chomskyfy()
    # Relative frequency of each tag that can emit a terminal.
    self.freq_terminal_tags = {tag: np.sum(list(counts.values()))
                               for (tag, counts) in self.lexicon.items()}
    total = np.sum(list(self.freq_terminal_tags.values()))
    for tag in self.freq_terminal_tags:
        self.freq_terminal_tags[tag] /= total
    # Normalise raw grammar/lexicon counts into probabilities.
    self.grammar = get_prob(self.grammar)
    self.lexicon = get_prob(self.lexicon)
    # Partition tags into real tags vs. symbols added by chomskyfy().
    list_all_tags = get_tag_list(self.grammar)
    self.list_artificial_symbols = list(self.set_artificial_tags)
    self.list_tags = list(set(list_all_tags).difference(self.set_artificial_tags))
    self.list_all_tags = self.list_tags + self.list_artificial_symbols
    self.nb_tags = len(self.list_tags)
    self.nb_all_tags = len(self.list_all_tags)
def test(): # openfile = open("ValidationPosts_15000.pickle", "rb") openfile = open("score_low.p", "rb") x = pickle.load(openfile) x_test = [] user = [] actual = [] cnt = 0 correct = 0 precision = 0.0 recall = 0.0 total = 0 count = 0 for y in x: #if total > 5: # break #question = y['Body'].encode('utf-8') question = y['Title'].encode('utf-8') question = question + ' ' + y['Body'].encode('utf-8') question = utils.clean_question(question) tag_string = y['Tags'].encode('utf-8') tag_list = utils.get_tag_list(tag_string) question_enc = get_question_embedding(question) tag_enc = get_tag_encoding(tag_list) try: user.append(meta_model[str(user_num[y['OwnerUserId']])]) except: #trace += 1 #print trace user.append(np.zeros(128)) cnt = cnt + 1 #print cnt x_test.append(question_enc) actual.append(np.asarray(tag_enc)) #print(cnt) user = np.asarray(user) x_test = np.asarray(x_test) #s = model.predict([x_test, user]) s = model.predict(x_test) actual = np.asarray(actual) predicted = s #run_user(s, user) #print predicted #break # (t_correct, t_total) = calc_precision_new(actual,s) #(t_num, t_den_p, t_den_r, t_total) = evaluate(actual,s) #num += t_num #den_p += t_den_p #den_r += t_den_r # correct += t_correct #total += t_total # print "correct = ", correct #print "total = ", total #print "=============" #predicted = [] #dict1 = pickle.load(open("predicted.p", 'rb')) #for i in dict1.keys(): # predicted.append(dict1[i]) #predicted = np.asarray(predicted) #actual = np.asarray(actual) #''' #print 'I m done' #break #break for i in [3, 5, 10]: #print actual precision = 0.0 recall = 0.0 total = 0 p, r, t = evaluate(actual, predicted, i) precision += p * t recall += r * t total += t precision = precision / total recall = recall / total #precision = (float(num)/den_p)/ float(total) #recall = (float(num)/den_r)/ float(total) print "Precision @" + str(i) + ": ", precision print "Recall @" + str(i) + ": ", recall