def is_fashion(text):
    try:
        # In a Unix environment, bound mmseg's running time with SIGALRM
        from signal import signal, SIGALRM, alarm  #@UnresolvedImport

        def handler(signum, frame):
            raise Exception("This code block runs for too long!")

        signal(SIGALRM, handler)
        alarm(3)
        mmseg_text = mmseg.Algorithm(text.encode('utf-8'))
        alarm(0)  # cancel the alarm once segmentation has finished
    except ImportError:
        # On Windows, SIGALRM and alarm are not available in the signal module
        mmseg_text = mmseg.Algorithm(text.encode('utf-8'))
    except:
        # mmseg halted for too long; skip this tweet
        return False
    for token in mmseg_text:
        try:
            term = token.text.decode('utf-8').lower()
        except:
            continue
        train_value = area_train_data.get(term, None)
        if train_value:
            for index in train_value:
                if index == 1:  # Fashion is the 1st area
                    return True
    return False
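
# The SIGALRM guard above is repeated verbatim inside classify_influence() below.
# A minimal sketch of how that pattern could be factored into a reusable context
# manager, using only the standard library. `time_limit` is a hypothetical helper
# that is not used elsewhere in this module, and it assumes a Unix environment
# (SIGALRM is unavailable on Windows).
from contextlib import contextmanager
import signal as _signal


@contextmanager
def time_limit(seconds):
    def _handler(signum, frame):
        raise Exception("This code block runs for too long!")
    old_handler = _signal.signal(_signal.SIGALRM, _handler)
    _signal.alarm(seconds)
    try:
        yield
    finally:
        _signal.alarm(0)                              # always cancel the pending alarm
        _signal.signal(_signal.SIGALRM, old_handler)  # restore the previous handler

# Usage would then look like:
#     with time_limit(3):
#         mmseg_text = mmseg.Algorithm(text.encode('utf-8'))
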
def classify_influence(inf_id, other=False):
    """
    TODO:
    1. Calculate the most popular tags, so that we can assign an idf score.
    2. Accumulate more meaningful tags.
    """
    inf_id = inf_id.decode('gbk')
    print inf_id.encode('gbk')
    try:
        inf_id = int(inf_id)
        inf = Influence.objects.get(pk=inf_id)
    except:
        inf, created = Influence.objects.get_or_create(screen_name=inf_id)
        if created:
            auth = OAuthHandler(settings.SINA_CONSUMER_KEY, settings.SINA_CONSUMER_SECRET)
            auth.setToken('128021658f2bfdae185d89bdffb3cede', '1713185d5c8208e8f1ef27a1f484ebc9')
            api = API(auth)
            user = api.get_user(screen_name=inf_id)
            inf.sina_account = getAtt(user, 'id')
            inf.verified = getAtt(user, 'verified')
            inf.screen_name = getAtt(user, 'screen_name')
            inf.description = getAtt(user, 'description')
            inf.follower_count = getAtt(user, 'followers_count')
            inf.following_count = getAtt(user, 'friends_count')
            inf.status_count = getAtt(user, 'statuses_count')
            inf.favourites_count = getAtt(user, 'favourites_count')
            inf.create_date = getAtt(user, 'created_at')
            inf.save()
    auth = OAuthHandler(settings.SINA_CONSUMER_KEY, settings.SINA_CONSUMER_SECRET)
    if other:
        auth.setToken('128021658f2bfdae185d89bdffb3cede', '1713185d5c8208e8f1ef27a1f484ebc9')
    else:
        auth.setToken(inf.sina_key, inf.sina_secret)
    api = API(auth)
    mmseg.dict_load_defaults()
    # Put this in db first
    candidate_tags = KeyValue.objects.get(key='CANDIDATE_TAGS')
    area_dict = {}
#    id_list = api.followers_ids(user_id=inf.sina_account, count=100)  # default is 500, maximum is 5000; this consumes a lot of api limit
#    ids = id_list[0].ids  # Weird that getAtt won't work
#    for id in ids:
#        tags = api.tags(user_id=id)  # user_id is required!
#        tag_list = []
#        for tag in tags:
#            tag_list.append(getAtt(tag, 'value').lower().encode('utf-8'))
#        mmseg_text = mmseg.Algorithm(' '.join(tag_list))
#        for token in mmseg_text:
#            try:
#                term = token.text.decode('utf-8').lower()
#                #next_term = mmseg_text[i+1].text.decode('utf-8') if i < len_list - 1 else ''
#            except:
#                continue
#            train_value = area_train_data.get(term, None)
#            #if not train_value:
#            #    train_value = area_train_data.get(term + next_term, None)
#            if train_value:
#                print 'in dict'
#                for index in train_value:
#                    if index in area_dict:
#                        area_dict[index] += 1
#                    else:
#                        area_dict[index] = 1
#            else:
#                candidate_tags.value += ' ' + term
    candidate_tags.save()
    area_distr_dict = {}
    mid_list = []
    ids_list = []
    tweet_list = []  # Store the text of tweets and retweets
    rt_count_list = []
    tried_count = 0
    while True:
        timeline = api.user_timeline(user_id=inf.sina_account, count=200)
        if len(timeline) == 0 and inf.status_count > 0:
            tried_count += 1
            print 'try again in getting timeline'
        else:
            break
        if tried_count > 3:
            raise Exception('weibo api error. No timeline got')
    for line in timeline:
        text = getAtt(line, 'text')
        retweet = getAtt(line, 'retweeted_status')
        retweet_text = getAtt(retweet, 'text')
        if retweet_text:
            text += retweet_text
        tweet_list.append(text)
        mid_list.append(str(getAtt(line, "id")))
        if len(mid_list) == 20:
            ids_list.append(','.join(mid_list))
            mid_list = []
    if mid_list:  # append the remaining ids
        ids_list.append(','.join(mid_list))
    if inf.status_count > 0 and not ids_list:
        raise Exception('weibo api fails')
    tweet_list_correct = []
    correct_index = 20
    for ids in ids_list:
        counts = api.counts(ids=ids)
        if len(counts) == 0:
            print 'error in counts!'
            correct_index += 20
            continue
        for obj in counts:
            rt_count_list.append(getAtt(obj, 'rt'))
        tweet_list_correct.extend(tweet_list[correct_index - 20:correct_index])
        correct_index += 20
    if len(tweet_list_correct) == 0 or len(tweet_list_correct) != len(rt_count_list):
        raise Exception('weibo api fails')
    print 'length of tweet list and rt_count list', len(tweet_list_correct), len(rt_count_list)
    # Remedy for users who have posted fewer than 200 statuses
    amplify_ratio = 1.0 if len(tweet_list_correct) == 200 else 200.0 / len(tweet_list_correct)
    for i in range(len(tweet_list_correct)):
        print i
        # The number 100 should be replaced by avg_follower_count.
        # Use math.sqrt to boost tweets that have not been retweeted,
        # and to smooth the effect of famous people tweeting about things not related to them.
        added_count = (rt_count_list[i] * 100 + math.sqrt(inf.follower_count)) * amplify_ratio
        assigned_area = {}
        try:
            # In a Unix environment, bound mmseg's running time with SIGALRM
            from signal import signal, SIGALRM, alarm  #@UnresolvedImport

            def handler(signum, frame):
                #print 'Signal handler called with signal', signum
                raise Exception("This code block runs for too long!")

            signal(SIGALRM, handler)
            alarm(3)
            mmseg_text = mmseg.Algorithm(tweet_list_correct[i].encode('utf-8'))
            alarm(0)  # cancel the alarm once segmentation has finished
        except ImportError:
            # On Windows, SIGALRM and alarm are not available in the signal module
            mmseg_text = mmseg.Algorithm(tweet_list_correct[i].encode('utf-8'))
        except:
            # mmseg halted for too long; process the next tweet
            continue
        for token in mmseg_text:
            try:
                term = token.text.decode('utf-8').lower()
            except:
                continue
            train_value = area_train_data.get(term, None)
            if train_value:
                print 'in dict'
                for index in train_value:
                    if index not in assigned_area:  # This tweet has not yet been assigned to this area
                        if index in area_dict:
                            area_dict[index] += added_count
                        else:
                            area_dict[index] = added_count
                        assigned_area[index] = True
                        if index in area_distr_dict:
                            area_distr_dict[index] += 1
                        else:
                            area_distr_dict[index] = 1
                    else:
                        area_distr_dict[index] += 1
    candidate_tags.save()
    sorted_tuple = sorted(area_dict.iteritems(), key=operator.itemgetter(1), reverse=True)
    if inf.follower_count > 100000:
        for i in range(1, len(sorted_tuple)):  # Only normalize secondary influence areas and below
            index = sorted_tuple[i][0]
            model_follower_count = areas[index][4]
            if inf.follower_count > model_follower_count:
                area_dict[index] = area_dict[index] * 1.0 / inf.follower_count * model_follower_count
    num_areas = len(area_distr_dict)
    total_keyword_count = 0
    for index in area_distr_dict:
        total_keyword_count += area_distr_dict[index]
    for k in area_dict:
        area_distr_ratio = num_areas * area_distr_dict[k] * 1.0 / total_keyword_count
        print k, area_distr_ratio, area_distr_dict[k]
        area_dict[k] = 100.0 / math.log(areas[k][3]) * math.log(area_dict[k] * area_distr_ratio)
        if area_dict[k] > 100:
            area_dict[k] = 100.0
    sorted_tuple = sorted(area_dict.iteritems(), key=operator.itemgetter(1), reverse=True)
    for st in sorted_tuple:
        print areas[st[0]][1].encode('gbk'), st[1]
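
# The final scoring loop above maps each raw accumulated area weight onto a 0-100 scale:
# the weight is first scaled by how concentrated the user's matched keywords are in that
# area (area_distr_ratio), then log-compressed and normalized against a per-area model
# value (areas[k][3]).  A minimal standalone restatement of that formula is given below;
# normalize_area_score is a hypothetical name, not part of this module, and model_score
# stands in for areas[k][3].
import math


def normalize_area_score(raw_score, keyword_count, total_keyword_count, num_areas, model_score):
    # How over- or under-represented this area is among the user's matched keywords
    area_distr_ratio = num_areas * keyword_count * 1.0 / total_keyword_count
    # Log-compress the weighted score and scale it so that the model value maps to 100
    score = 100.0 / math.log(model_score) * math.log(raw_score * area_distr_ratio)
    # Cap the score at 100, as in the loop above
    return min(score, 100.0)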