Example #1
def is_fashion(text):
    try: # On Unix, use SIGALRM to bound mmseg's running time
        from signal import signal, SIGALRM, alarm
        def handler(signum, frame):
            raise Exception("This code block ran for too long!")
        signal(SIGALRM, handler)
        alarm(3)
        mmseg_text = mmseg.Algorithm(text.encode('utf-8'))
        alarm(0) # cancel the alarm once segmentation finishes
    except ImportError: # On Windows, SIGALRM and alarm are not available in the signal module
        mmseg_text = mmseg.Algorithm(text.encode('utf-8'))
    except Exception: # mmseg halted for too long, skip this text
        return False
    
    for token in mmseg_text:
        try:
            term = token.text.decode('utf-8').lower()
        except UnicodeDecodeError: # skip tokens that are not valid UTF-8
            continue 
        train_value = area_train_data.get(term, None)
        if train_value:
            for index in train_value:
                if index == 1: #Fashion is the 1st area
                    return True
    return False
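The SIGALRM guard above is repeated verbatim in Example #2. As a minimal sketch, the pattern could be factored into a reusable context manager (the helper name time_limit is an assumption, not part of the original code; it runs without any limit on platforms where SIGALRM is unavailable, e.g. Windows):

from contextlib import contextmanager

@contextmanager
def time_limit(seconds):
    # Hypothetical helper: raise an Exception if the wrapped block runs
    # longer than `seconds`. Degrades to a no-op where SIGALRM is missing.
    try:
        from signal import signal, SIGALRM, alarm
    except ImportError: # e.g. Windows: run without a time limit
        yield
        return
    def handler(signum, frame):
        raise Exception("This code block ran for too long!")
    old_handler = signal(SIGALRM, handler)
    alarm(seconds)
    try:
        yield
    finally:
        alarm(0) # cancel any pending alarm
        signal(SIGALRM, old_handler) # restore the previous handler

# Usage, mirroring is_fashion above:
# try:
#     with time_limit(3):
#         mmseg_text = mmseg.Algorithm(text.encode('utf-8'))
# except Exception:
#     return False # segmentation took too long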
Example #2
def classify_influence(inf_id, other=False):
    """
    TODO:
    1, Calculate the most popular tags, so that we can assign idf score
    2, Accumulate more meaningful tags
    """
    inf_id = inf_id.decode('gbk')
    print inf_id.encode('gbk')
    try:
        inf_id = int(inf_id)
        inf = Influence.objects.get(pk=inf_id)
    except (ValueError, Influence.DoesNotExist): # inf_id is a screen name rather than a pk
        inf, created = Influence.objects.get_or_create(screen_name=inf_id)
        if created:
            auth = OAuthHandler(settings.SINA_CONSUMER_KEY, settings.SINA_CONSUMER_SECRET)
            auth.setToken('128021658f2bfdae185d89bdffb3cede','1713185d5c8208e8f1ef27a1f484ebc9')
            api = API(auth)
            
            user = api.get_user(screen_name=inf_id)
            inf.sina_account = getAtt(user, 'id')
            inf.verified = getAtt(user,'verified')
            inf.screen_name = getAtt(user, 'screen_name')
            inf.description = getAtt(user, 'description')
            inf.follower_count = getAtt(user, 'followers_count')
            inf.following_count = getAtt(user, 'friends_count')
            inf.status_count = getAtt(user, 'statuses_count')
            inf.favourites_count = getAtt(user, 'favourites_count')
            inf.create_date = getAtt(user, 'created_at')
            inf.save()
    
    auth = OAuthHandler(settings.SINA_CONSUMER_KEY, settings.SINA_CONSUMER_SECRET)
    if other:
        auth.setToken('128021658f2bfdae185d89bdffb3cede', '1713185d5c8208e8f1ef27a1f484ebc9')
    else:
        auth.setToken(inf.sina_key, inf.sina_secret)
    api = API(auth)
    mmseg.dict_load_defaults()
    """Put this in db first"""
    candidate_tags = KeyValue.objects.get(key='CANDIDATE_TAGS')
    
    area_dict = {}
#    id_list = api.followers_ids(user_id=inf.sina_account, count=100) #default to 500, maximum is 5000; This consumes a lot of api limit
#    ids = id_list[0].ids  #Weird that getAtt won't work
#    for id in ids:
#        tags = api.tags(user_id=id)  #user_id is required!
#        tag_list = []
#        for tag in tags:
#            tag_list.append(getAtt(tag, 'value').lower().encode('utf-8'))
#        mmseg_text = mmseg.Algorithm(' '.join(tag_list))
#        for token in mmseg_text:
#            try:
#                term = token.text.decode('utf-8').lower()
#                #next_term = mmseg_text[i+1].text.decode('utf-8') if i < len_list - 1 else ''
#            except:
#                continue
#            train_value = area_train_data.get(term, None)
#            #if not train_value:
#            #    train_value = area_train_data.get(term + next_term, None)
#            if train_value:
#                print 'in dict'
#                for index in train_value:
#                    if index in area_dict:
#                        area_dict[index] += 1
#                    else:
#                        area_dict[index] = 1
#            else:
#                candidate_tags.value += ' ' + term
        
    candidate_tags.save()
    area_distr_dict = {}
    mid_list = []
    ids_list = []
    tweet_list = [] #Store the text of tweet and retweet
    rt_count_list = []
    tried_count = 0
    while True:
        timeline = api.user_timeline(user_id=inf.sina_account, count=200)
        if len(timeline) == 0 and inf.status_count > 0:
            tried_count += 1
            print 'retry fetching the timeline'
        else:
            break
        if tried_count > 3:
            raise Exception('weibo api error: no timeline fetched')
        
    for line in timeline:
        text = getAtt(line, 'text')
        retweet = getAtt(line, 'retweeted_status')
        retweet_text = getAtt(retweet, 'text')
        if retweet_text:
            text += retweet_text
        tweet_list.append(text)   
        mid_list.append(str(getAtt(line, "id")))
        if len(mid_list) == 20:
            ids_list.append(','.join(mid_list))
            mid_list = []
    if mid_list: #append the remaining ids
        ids_list.append(','.join(mid_list))
    if inf.status_count > 0 and not ids_list:
        raise Exception('weibo api fails')
    tweet_list_correct = []
    correct_index = 20 # advances in steps of 20, matching the id batches built above
    for ids in ids_list:
        counts = api.counts(ids=ids)
        if len(counts) == 0:
            print 'error in counts!'
            correct_index += 20
            continue
        for obj in counts:
            rt_count_list.append(getAtt(obj, 'rt'))
        tweet_list_correct.extend(tweet_list[correct_index-20:correct_index])
        correct_index += 20    
    if len(tweet_list_correct) == 0 or len(tweet_list_correct) != len(rt_count_list):
        raise Exception('weibo api fails')
    print 'length of tweet list and rt_count list', len(tweet_list_correct), len(rt_count_list)
    #Remedy for users who have posted fewer than 200 statuses
    amplify_ratio = 1.0 if len(tweet_list_correct) == 200 else 200.0/len(tweet_list_correct)
    for i in range(len(tweet_list_correct)):
        print i
        #This number 100 should be replaced by avg_follower_count
        #Use math.sqrt to boost tweets that have not been retweeted,
        #and to smooth the effect of famous people tweeting about things unrelated to them
        added_count = (rt_count_list[i]*100 + math.sqrt(inf.follower_count)) * amplify_ratio
        assigned_area = {}
        try: # On Unix, use SIGALRM to bound mmseg's running time
            from signal import signal, SIGALRM, alarm
            def handler(signum, frame):
                #print 'Signal handler called with signal', signum
                raise Exception("This code block ran for too long!")
            signal(SIGALRM, handler)
            alarm(3)
            mmseg_text = mmseg.Algorithm(tweet_list_correct[i].encode('utf-8'))
            alarm(0) # cancel the alarm once segmentation finishes
        except ImportError: # On Windows, SIGALRM and alarm are not available in the signal module
            mmseg_text = mmseg.Algorithm(tweet_list_correct[i].encode('utf-8'))
        except Exception: # mmseg halted for too long, process the next tweet
            continue
        for token in mmseg_text:
            try:
                term = token.text.decode('utf-8').lower()
            except UnicodeDecodeError: # skip tokens that are not valid UTF-8
                continue 
            train_value = area_train_data.get(term, None)
            if train_value:
                print 'in dict'
                for index in train_value:
                    if index not in assigned_area: # count this tweet toward each area only once
                        if index in area_dict:
                            area_dict[index] += added_count
                        else:
                            area_dict[index] = added_count
                        assigned_area[index] = True
                        if index in area_distr_dict:
                            area_distr_dict[index] += 1
                        else:
                            area_distr_dict[index] = 1
                    else:
                        area_distr_dict[index] += 1
    candidate_tags.save()
    
    sorted_tuple = sorted(area_dict.iteritems(), key=operator.itemgetter(1), reverse=True)
    if inf.follower_count > 100000: 
        for i in range(1, len(sorted_tuple)): #Only normalize the secondary influence areas and below
            index = sorted_tuple[i][0]
            model_follower_count = areas[index][4]
            if inf.follower_count > model_follower_count:
                area_dict[index] = area_dict[index]*1.0/inf.follower_count*model_follower_count  
    
    num_areas = len(area_distr_dict)
    total_keyword_count = 0
    for index in area_distr_dict:
        total_keyword_count += area_distr_dict[index]
    for k in area_dict:
        area_distr_ratio = num_areas * area_distr_dict[k]*1.0/total_keyword_count
        print k, area_distr_ratio, area_distr_dict[k]
        area_dict[k] = 100.0/math.log(areas[k][3]) * math.log(area_dict[k]*area_distr_ratio)
        if area_dict[k] > 100:
            area_dict[k] = 100.0
                    
    sorted_tuple = sorted(area_dict.iteritems(), key=operator.itemgetter(1), reverse=True) 
    for st in sorted_tuple:
        print areas[st[0]][1].encode('gbk'), st[1]