예제 #1
0
def separater(user_weibos):
    """Segment the text of each weibo and count how often each word occurs.

    Args:
        user_weibos: iterable of mappings; the text of each one is read from
            ``user_weibo['_source']['text']``.

    Returns:
        dict mapping each segmented word to its number of occurrences in the
        LAST weibo that was processed, or an empty dict when ``user_weibos``
        yields nothing.
    """
    s = load_scws()
    # Bind the result up front so that an empty input returns {} instead of
    # raising UnboundLocalError at the return statement.
    words_dict = {}
    for user_weibo in user_weibos:
        content = user_weibo['_source']['text']
        # Debug output kept from the original implementation.
        print(str(content))
        content = cut_filter(content)
        content = re_cut(content)
        separated_words = cut(s, content)
        # NOTE(review): the counter is reset for every weibo, so only the
        # counts of the last one are returned. If counts aggregated over all
        # weibos are wanted, drop this reset -- confirm against the callers.
        words_dict = {}
        for word in separated_words:
            print(str(word))
            # dict.get replaces the former bare ``except:``, which also
            # swallowed unrelated errors such as KeyboardInterrupt.
            words_dict[word] = words_dict.get(word, 0) + 1

    return words_dict
예제 #2
0
def triple_classifier(tweet):
    """Return a 0/1 label for a tweet from its emoticons or its word scores.

    The text is read from ``tweet['content168']`` (the original docstring
    states that it is UTF-8 encoded).

    Steps:
      1. Everything from the first ``'//@'`` marker onwards is dropped; if
         nothing is left, ``remove_at`` is applied to the full text instead.
      2. If ``emoticon(pe_set, ne_set, text)`` returns 1 or 2 the label is 1
         and the word-score step is skipped.
      3. Otherwise, for a non-empty text, two products of per-word scores
         from ``step1_score`` are compared: the label is 1 when the first
         product is not larger than the second, else 0.

    Returns:
        int: 0 or 1.
    """
    label = 0
    text = tweet['content168']

    # Keep only the part in front of the first '//@' marker, if there is one.
    marker_pos = text.find('//@')
    if marker_pos != -1:
        text = text[:marker_pos]

    if len(text) == 0:
        text = remove_at(tweet['content168'])

    if emoticon(pe_set, ne_set, text) in (1, 2):
        label = 1
        text = ''

    if text != '':
        tokens = [token.decode('utf-8') for token in cut(sw, text)]
        first_score = 1
        second_score = 1
        for pair in dictionary_1.doc2bow(tokens):
            word_id, count = pair[0], pair[1]
            first_score *= step1_score[word_id][0] ** count
            second_score *= step1_score[word_id][1] ** count
        label = 1 if first_score <= second_score else 0

    return label
예제 #3
0
def cut_words_noun(text):
    """Segment ``text`` and keep the noun terms that are not blacklisted.

    Args:
        text: the string to segment; must be a ``str`` instance (the
            original docstring says UTF-8).

    Returns:
        list of terms, in segmentation order, whose tag is in
        ``cx_dict_noun`` and which are not in ``black_words``.

    Raises:
        ValueError: if ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        raise ValueError("cut words input text must be string")

    kept_terms = []
    for term, tag in cut(s, text, cx=True):
        if tag not in cx_dict_noun:
            continue
        if term in black_words:
            continue
        kept_terms.append(term)

    return kept_terms
예제 #4
0
def prepare_svm_input(texts, y=None, dictionary=dictionary):
    """Turn raw texts into the ``(y, x)`` pair used as SVM input.

    Args:
        texts: sequence of texts; each one is segmented with ``cut(sw, text)``.
        y: optional labels. When missing or empty, a list holding ``1.0``
            for every text is used instead.
        dictionary: object whose ``doc2bow`` turns a word list into
            ``(word_id, count)`` pairs; defaults to the module-level
            ``dictionary``.

    Returns:
        tuple ``(y, x)`` where ``x`` holds one ``{word_id: count}`` dict per
        text, in input order.
    """
    if not y:
        y = [1.0] * len(texts)

    x = [dict(dictionary.doc2bow(cut(sw, text))) for text in texts]

    return y, x
예제 #5
0
def prepare_svm_input_file(texts, dictionary=dictionary):
    """Write the SVM input for ``texts`` to a file and return its path.

    One line is written per text: the constant label ``1`` followed by
    space-separated ``<word_id + 1>:<count>`` entries taken from
    ``dictionary.doc2bow``.

    Args:
        texts: iterable of texts; each one is segmented with ``cut(sw, text)``.
        dictionary: object whose ``doc2bow`` turns a word list into
            ``(word_id, count)`` pairs; defaults to the module-level
            ``dictionary``.

    Returns:
        str: path of the written file, ``<AB_PATH>/./svm_test/<pid>.txt``,
        where ``<pid>`` is the id of the current process.
    """
    pid = os.getpid()
    svm_input_path = os.path.join(AB_PATH, './svm_test/%s.txt' % pid)

    # The context manager closes the file even when segmentation,
    # vectorisation or a write raises part-way through.
    with open(svm_input_path, 'w') as fw:
        for text in texts:
            words = cut(sw, text)
            feature = dictionary.doc2bow(words)
            line = '1 ' + ' '.join([
                str(wordid + 1) + ':' + str(wordcount)
                for wordid, wordcount in feature
            ])
            fw.write('%s\n' % line)

    return svm_input_path
def triple_classifier(tweet):
    """Return a sentiment label for a tweet in up to two scoring stages.

    The text is read from ``tweet['text']`` (the original docstring states
    that it is UTF-8 encoded).

    Steps:
      1. If ``emoticon(text)`` is non-zero, that value is returned directly.
      2. Otherwise, for a non-empty text, two products of per-word scores
         from ``step1_score`` are compared. Only when the first product is
         not larger than the second does the second stage run.
      3. The second stage builds three products from ``step2_score`` and
         returns ``HAPPY``, ``SAD`` or ``ANGRY`` for whichever product is
         strictly the largest.

    Returns:
        The emoticon result, one of ``HAPPY`` / ``SAD`` / ``ANGRY``, or 0
        when no stage picks a label (including ties in the second stage).
    """
    text = tweet['text']

    emoticon_label = emoticon(text)
    if emoticon_label != 0:
        return emoticon_label
    if text == '':
        return 0

    tokens = [token.decode('utf-8', 'ignore') for token in cut(cut_str, text)]

    # Stage 1: vectorise with the first dictionary and compare two products.
    stage1_a = 1
    stage1_b = 1
    for pair in dictionary_1.doc2bow(tokens):
        word_id, count = pair[0], pair[1]
        stage1_a *= step1_score[word_id][0] ** count
        stage1_b *= step1_score[word_id][1] ** count

    if stage1_a <= stage1_b:
        # Stage 2: vectorise with the second dictionary, three products.
        happy_score = 1
        sad_score = 1
        angry_score = 1
        for pair in dictionary_2.doc2bow(tokens):
            word_id, count = pair[0], pair[1]
            happy_score *= step2_score[word_id][0] ** count
            sad_score *= step2_score[word_id][1] ** count
            angry_score *= step2_score[word_id][2] ** count

        if happy_score > sad_score and happy_score > angry_score:
            return HAPPY
        if sad_score > happy_score and sad_score > angry_score:
            return SAD
        if angry_score > sad_score and angry_score > happy_score:
            return ANGRY

    return 0
예제 #7
0
파일: user_domain.py 프로젝트: SwoJa/ruman
def user_domain_classifier_v2(user):
    """Pick the domain label (an entry of ``labels``) for a user record.

    The decision is driven by ``user['verified_type']``. For type 0 and for
    unrecognised types, the nickname plus description are segmented with
    ``cut`` and matched against the keyword collections ``lawyerw``,
    ``adminw``, ``mediaw`` and ``businessw``.

    Args:
        user: mapping with the keys ``verified_type``, ``user_location``,
            ``fansnum``, ``statusnum``, ``nick_name`` and ``description``.

    Returns:
        One entry of ``labels``; ``labels[11]`` is the initial default.
    """
    label = labels[11]

    vtype = user['verified_type']
    location = user['user_location']
    province = location.split(' ')[0]

    fans = user['fansnum']
    statuses = user['statusnum']

    name = user['nick_name']
    description = user['description']

    if vtype == 4:
        label = labels[0]  # university weibo
    elif vtype == 1:
        label = labels[7]  # government bodies and officials
    elif vtype in (8, 7, 2):
        # overseas institution if the province is listed, else domestic
        label = labels[2] if province in outlist else labels[1]
    elif vtype == 3:
        # overseas media if listed, else domestic media
        # NOTE(review): this branch tests the full ``location`` string while
        # the institution branch above tests ``province`` -- confirm which
        # one ``outlist`` is meant to be matched against.
        label = labels[4] if location in outlist else labels[3]
    elif vtype in (5, 6):
        label = labels[5]  # civil organisations
    elif vtype == 0:
        kwdlist = cut(s, name + description)
        lawyer_hits = sum(1 for kw in kwdlist if kw in lawyerw)  # lawyers
        admin_hits = sum(1 for kw in kwdlist if kw in adminw)  # officials
        media_hits = sum(1 for kw in kwdlist if kw in mediaw)  # media people
        business_hits = sum(1 for kw in kwdlist if kw in businessw)  # business people

        # Strict comparisons: on a tie the earlier category keeps the label
        # (business, then officials, then media).
        top_hits = 0
        if business_hits > top_hits:
            top_hits = business_hits
            label = labels[12]
        if admin_hits > top_hits:
            top_hits = admin_hits
            label = labels[7]
        if media_hits > top_hits:
            top_hits = media_hits
            label = labels[8]

        if top_hits == 0:
            label = labels[9]

        # Any lawyer keyword overrides every other category.
        if lawyer_hits != 0:
            label = labels[6]
    elif vtype in (220, 200):
        label = labels[9]
    elif vtype == 400:
        label = labels[11]
    else:
        if fans >= FOLLOWER_THRE and statuses >= STATUS_THRE:
            label = labels[10]  # grassroots

        kwdlist = cut(s, name + description)
        lawyer_hits = sum(1 for kw in kwdlist if kw in lawyerw)
        if lawyer_hits != 0:
            label = labels[6]

    return label
예제 #8
0
파일: user_domain.py 프로젝트: SwoJa/ruman
def user_domain_classifier_v2(user):
    """Classify a user record into one of the domain entries of ``labels``.

    ``user['verified_type']`` selects the rule. Types 4, 1, 8/7/2, 3, 5/6,
    220/200 and 400 map straight to a label (with an ``outlist`` check for
    institutions and media). Type 0 is decided by keyword hits in the
    segmented nickname plus description. Any other type falls back to a
    follower/status threshold check plus a lawyer-keyword check.

    Args:
        user: mapping with the keys ``verified_type``, ``user_location``,
            ``fansnum``, ``statusnum``, ``nick_name`` and ``description``.

    Returns:
        One entry of ``labels``.
    """
    fallback = labels[11]

    verified_type = user['verified_type']
    location = user['user_location']
    province = location.split(' ')[0]

    followers_count = user['fansnum']
    statuses_count = user['statusnum']

    name = user['nick_name']
    description = user['description']

    if verified_type == 4:
        return labels[0]  # university weibo

    if verified_type == 1:
        return labels[7]  # government bodies and officials

    if verified_type in (8, 7, 2):
        if province in outlist:
            return labels[2]  # overseas institution
        return labels[1]  # domestic institution

    if verified_type == 3:
        # NOTE(review): the full ``location`` is tested here whereas the
        # institution rule tests ``province`` -- confirm which is intended.
        if location in outlist:
            return labels[4]  # overseas media
        return labels[3]  # domestic media

    if verified_type in (5, 6):
        return labels[5]  # civil organisations

    if verified_type == 0:
        kwdlist = cut(s, name + description)
        lawyer_weight = sum([1 for word in kwdlist if word in lawyerw])  # lawyers
        adminw_weight = sum([1 for word in kwdlist if word in adminw])  # officials
        mediaw_weight = sum([1 for word in kwdlist if word in mediaw])  # media people
        businessw_weight = sum([1 for word in kwdlist if word in businessw])  # business people

        # Candidates in priority order; a later one replaces the label only
        # when it has strictly more hits than the best seen so far.
        ranked = (
            (businessw_weight, 12),
            (adminw_weight, 7),
            (mediaw_weight, 8),
        )
        label = fallback
        max_weight = 0
        for weight, label_index in ranked:
            if max_weight < weight:
                max_weight = weight
                label = labels[label_index]

        if max_weight == 0:
            label = labels[9]

        # A lawyer keyword wins over every other category.
        if lawyer_weight != 0:
            label = labels[6]
        return label

    if verified_type in (220, 200):
        return labels[9]

    if verified_type == 400:
        return labels[11]

    # Any other verified_type: threshold check, then lawyer keywords.
    label = fallback
    if followers_count >= FOLLOWER_THRE and statuses_count >= STATUS_THRE:
        label = labels[10]  # grassroots

    kwdlist = cut(s, name + description)
    lawyer_weight = sum([1 for word in kwdlist if word in lawyerw])
    if lawyer_weight != 0:
        label = labels[6]

    return label