Example #1
File: segment.py Project: decode/Rpkg
  def fetch(self, quantity=5000, format_str=", "):
    mmseg.dict_load_defaults()
    items = Item.query.from_statement('select id, name from items order by id desc limit ' + str(quantity))

    f = open('data.basket', 'a')  
    o = open('original.txt', 'a')  

    banlist = ['、','(',')','★','【','】','!',':']

    for i in items:
      seg = Segment.query.filter_by(id=i.id).first()
      if seg is None:
        seg = Segment(item_id=i.id)
        #session.commit()
      text = i.name
      text = text.encode("utf-8")
      o.write(i.name.encode('utf-8') + "\n")

      algor = mmseg.Algorithm(text)

      sep = "|"
      s = ""
      for tok in algor:
        if tok.text in banlist:
          continue
        sep += tok.text.decode('utf-8')
        sep += "|"
      seg.content = sep
      session.commit()
      f.write(self.format(sep).encode('utf-8') + "\n")

    f.close()
    o.close()
Example #2
def HY_pymmseg(file1,file2):
    if(os.path.isfile(file1)):
        mmseg.dict_load_defaults()
        Dict={}
        f1=open(file1,'r')
        f2=open(file2,'w')
        for item in f1.readlines():
            alg=mmseg.Algorithm(item)
            wordlist=[]
            for tok in alg:
                wordlist.append(tok.text+"//")
                print "tok.text",tok.text
                if tok.text not in Dict:
                    Dict[tok.text]=1
                else:
                    Dict[tok.text]+=1
                    
            f2.writelines(wordlist)
        f1.close()
        f2.close()
        print "HY_pymmseg FINISHED"
        """
        for item in Dict:
            print "DICT"
            print item
            print Dict[item]
        """
    else:
        print "EROR:HY_pymmseg eror"
Example #3
 def __init__(self):
     self.wordSegFlag = False
     self.idfMethod = 'userIndependent'
     self.segInMemo = False
     self.invInMemo = False
     self.segLst = dict()
     self.invLst = dict()
     self.TFIDFLst = dict()
     #self.mmseg = mmseg
     mmseg.dict_load_defaults()
Example #4
File: views.py Project: loyoen/parser
def SplitKeyword(req):
    mmseg.dict_load_defaults()
    com_list = company.objects.filter()
    for com in com_list:
        words = com.Company_Name.encode("utf-8")
        algor = mmseg.Algorithm(words)
        for tok in algor:
            word = tok.text.decode("utf-8")
            print word
            keytable = keyword(word=word, mycom = com)
            keytable.save()
    
    return HttpResponse("split end")
Example #5
def fileToDict(file):
    if(os.path.isfile(file)):
        mmseg.dict_load_defaults()
        Dict={}
        f=open(file,'r')
        for item in f.readlines():
            alg=mmseg.Algorithm(item)
            for tok in alg:
                if tok.text not in Dict:
                    Dict[tok.text]=1
                else:
                    Dict[tok.text]+=1
        f.close()
        return Dict
    else:
        print "Eror:<segment.py->fileToDict()> File not found"
Example #6
 def __init__(self, *args, **kwargs):
     # parameter check
     if args:
         if len(args) % 2 != 0:
             raise ParameterError("Config requires an equal number of values and scores")
     # dynamically initialize instance attributes
     for i in range(len(args) / 2):
         setattr(self, args[i * 2], args[i * 2 + 1])
     for key in kwargs:
         setattr(self, key, kwargs[key])
     # redis
     pool = redis.ConnectionPool(host=self.config.redis['host'], port=self.config.redis['port'], db=self.config.redis['db'])
     self.r = redis.Redis(connection_pool=pool)
     # self.r = redis.StrictRedis(host=self.config.redis['host'], port=self.config.redis['port'], db=self.config.redis['db'])
     self.pipeline = self.r.pipeline()
     # load the word-segmentation dictionary
     mmseg.dict_load_defaults()
Example #7
File: g_azure.py Project: kymo/GAzure
from pymmseg import mmseg
import thread
import md5
import os
import socket
import time, datetime
from beaker.middleware import SessionMiddleware
from func import SendEmailThread, html

# link the database
db = DBModel("g_azure")
db.link_database()
# rsa token
TOKEN = 2113
# load dict
mmseg.dict_load_defaults()
# the state of mission
WAITING = 0
COMPILING = 1
RUNNING = 2
COMPLETED = 3


def user_auth(func):
    """ user authenticate

        this is a decorator for all urls that need user authentication
        before dealing with the url, we need to get the identity of the visitor
        
        Args:
            is_login: a bool indicating whether the user has logged in or not
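
The excerpt above stops inside the user_auth docstring, so the decorator body itself is not shown. Purely as an illustration of the pattern the docstring describes (check the visitor's identity before handling the URL), here is a minimal sketch; get_session, the 'user' key, and the placeholder redirect string are hypothetical stand-ins, not names from the GAzure project:

import functools

def user_auth_sketch(get_session):
    """Hypothetical sketch: only run the wrapped URL handler for logged-in visitors.

    get_session is an assumed callable that returns the current session dict (or None).
    """
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            session = get_session()
            is_login = bool(session and session.get('user'))
            if not is_login:
                return 'redirect: /login'  # placeholder for a real redirect response
            return func(*args, **kwargs)
        return wrapper
    return decorator

# usage sketch:
# @user_auth_sketch(lambda: {'user': 'alice'})
# def mission_list():
#     return 'missions'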
Example #8
# -*- coding: utf8 -*-

from pymmseg import mmseg
 
mmseg.dict_load_defaults()
text = '工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作'
algor = mmseg.Algorithm(text)
for tok in algor:
    print '%s [%d..%d]' % (tok.text.decode('utf8'), tok.start, tok.end)
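
Most of the examples on this page repeat the same round trip: encode the input as UTF-8 before handing it to mmseg.Algorithm, then decode each tok.text back from UTF-8. A minimal sketch that wraps this round trip into a reusable helper (the name segment_words is hypothetical, not taken from any of the projects above):

# -*- coding: utf8 -*-
from pymmseg import mmseg

mmseg.dict_load_defaults()

def segment_words(text):
    # mmseg.Algorithm expects UTF-8 bytes and tok.text comes back as UTF-8,
    # so encode on the way in and decode on the way out.
    algor = mmseg.Algorithm(text.encode('utf-8'))
    return [tok.text.decode('utf-8') for tok in algor]

# usage sketch:
# for word in segment_words(u'工信处女干事每月经过下属科室'):
#     print word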
Example #9
    def handle(self, *args, **options):
        
        self.stdout.write('Start training videos\n')
        
        mmseg.dict_load_defaults()
        
        #Build feature_dict which stores index of feature names in features list 
        #used for building feature vector for each document
        feature_dict = {}
        index = 0
        for f in features:
            feature_dict[f] = index
            index += 1 
        
        #Build con_1 dictionary which is used to construct con_2
        #Initialize prior dictionary
        con_1 = {}
        prior = {}
        channels = Channel.objects.filter(type=1)
        id_name_mapping = {}
        for c in channels:
            con_1[c.id] = {} #will be filled with: {word:(has_count, total_count),...}
            id_name_mapping[c.id] = c.name
            prior[c.id] = 0
        
        #Update con_1 by processing all training data
        #..Warning..: With the assumption that all the videos in the database are already classified
        # Set share_count to 1 to represent training data; share_count = 2 means test data
        items = Item.objects.raw("select * from main_item where type=1 and channels<>'' and share_count=1")
        self.stdout.write(str(len(list(items))) + ' items to train\n')
        for item in items:
            feature_vector = [0 for f in features] #Changes to 1 if this feature is present
            item_channels = [int(c) for c in item.channels.split(',')]
            
            #update prior
            for c in item_channels:
                prior[c] += 1
            
            algor = mmseg.Algorithm((item.name + item.snippet).encode('utf-8')) #mmseg requires utf-8
            for tok in algor:
                try:
                    token_text = tok.text.decode('utf-8') #The text that comes out is utf-8, turn it into unicode
                except:
                    continue
                token_text = token_text.lower()
                if token_text in feature_dict:
                    feature_vector[feature_dict[token_text]] = 1
            
            #Now update con_1
            for i in range(len(feature_vector)):
                if feature_vector[i] != 0:
                    for c in item_channels:
                        if features[i] in con_1[c]:
                            has_count, total_count = con_1[c][features[i]]
                        else:
                            has_count, total_count = (0,0)
                        has_count += feature_vector[i]
                        total_count += feature_vector[i]
                        con_1[c][features[i]] = (has_count, total_count)     
                else:
                    for c in item_channels:
                        if features[i] in con_1[c]:
                            has_count, total_count = con_1[c][features[i]]
                        else:
                            has_count, total_count = (0,0)
                        total_count += 1
                        con_1[c][features[i]] = (has_count, total_count)
        
        #Builds con_2, pickle it and store it
        self.stdout.write('Start building con_2\n')
        con_2 = []
        for channel_id, d in con_1.items():
            d_2 = {}
            for word, stats in d.items():
                if stats[0] == 0:
                    d_2[word] = 0.001  #normalize the difference of sample size for different channels
                elif stats[0] == stats[1]:
                    d_2[word] = 0.999  #So the opposite won't be 0
                else:
                    d_2[word] = (stats[0]+1)*1.0/(stats[1]+1)
                #Boost the features that are representative of the channel
                #..Warning:some value will exceed 1, so the premise is that we won't use 1-P in classification
                if word in boosting_features[channel_id]:
                    d_2[word] *= boosting_features[channel_id][word]
            con_2.append((channel_id, id_name_mapping[channel_id], d_2))        
        with open(settings.PROJECT_ROOT+'dataset/video_train_conditional.pkl', 'wb') as out:
            pickle.dump(con_2, out, -1) 
        
        #Calculate the prior dict, pickle it and store it
        self.stdout.write('Start building prior\n')
        num_items = len(list(items))
        for k,v in prior.items():
            prior[k] = v*1.0/num_items
        
        with open(settings.PROJECT_ROOT+'dataset/video_train_prior.pkl', 'wb') as out:
            pickle.dump(prior, out, -1) 
                        
        self.stdout.write('Finished building training data for videos\n')

        
Example #10
    def classify_video_channel(self):
        con = pickle.load(open(settings.PROJECT_ROOT + 'dataset/video_train_conditional.pkl', 'rb'))
        prior = pickle.load(open(settings.PROJECT_ROOT + 'dataset/video_train_prior.pkl', 'rb'))
        mmseg.dict_load_defaults()
        feature_dict = {} #The same as that in train_video_channel
        index = 0
        for f in features:
            feature_dict[f] = index
            index += 1 
        
        days_ago = datetime.now() - timedelta(days=2)
        #items = Item.objects.filter(type=1, channels='', share_count=2) #In dev, share_count=2 means test data
        items = Item.objects.filter(type=1, channels='', create_date__gt=days_ago)
        item_count = len(list(items))
        print(str(item_count) + ' items to classify\n')
        
#        if settings.DEBUG:
#            count = 0
#            correct_count = 0
#            no_class_count = 0
#            false_list = []
        for item in items:
            feature_vector = [0 for f in features]
            
            algor = mmseg.Algorithm((item.name + item.snippet).encode('utf-8')) #mmseg requires utf-8
            for tok in algor:
                try:
                    token_text = tok.text.decode('utf-8') #The text that comes out is utf-8, turn it into unicode
                except:
                    continue
                token_text = token_text.lower()
                if token_text in feature_dict:
                    feature_vector[feature_dict[token_text]] = 1
            
            max_score = 0.0
            max_channel = 0
            for c in con:
                score = 1.0
                has_count = 0
                for i in range(len(feature_vector)):
                    if feature_vector[i] != 0: #do not multiply by 1 - c[2][features[i]]; boosted values from training may exceed 1
                        score *= c[2][features[i]]
                        has_count += 1 
                if has_count == 0 or has_count == 1: #no confidence
                    score = 0.0
                score *= prior[c[0]]   
                if score > max_score:
                    max_score = score
                    max_channel = c[0]
            
            if settings.PRODUCTION and item.channels:
                channels = item.channels
                c_list = channels.split(',')
                if not str(max_channel) in c_list:
                    c_list.append(str(max_channel))
                    item.channels = ','.join(c_list)
            else:
                if max_score != 0.0: #do not use default score if we have no confidence
                    item.channels = str(max_channel)
            
            item.save()
            
#            if settings.DEBUG:  #Need to change previous setting of channels to channels_predict
#                if count % 10 == 0:
#                    print count
#                count += 1
#                if not item.channels:
#                    no_class_count += 1
#                elif item.channels_predict in item.channels :
#                    correct_count += 1 
#                else:
#                    false_list.append(item.id)
                  
#        if settings.DEBUG:
#           print('Correct(not false) ratio:' + str((correct_count+no_class_count)*1.0/item_count) +'\n' )
#           print('Number of items that could not be classfied:' + str(no_class_count) +'\n')
#           #print(str(false_list))
Example #11
File: views.py Project: loyoen/parser
def search(req):
    if req.method == 'POST':
        words = req.POST.get('keywords', '')
        #print words
        com_count = {}
        cont = {}
        # word segmentation
        mmseg.dict_load_defaults()
        #print words.encode("utf-8")
        algor = mmseg.Algorithm(words.encode("utf-8"))
        for tok in algor:
            word = tok.text.decode("utf-8")
            print word
            res = keyword.objects.filter(word = word)
            for item in res:
                print "get............."
                try:
                    com_count[str(item.mycom.id)] += 1
                except:
                    com_count[str(item.mycom.id)] = 1
                    cont[str(item.mycom.id)] = item.mycom
        
        sortres = sorted(com_count.iteritems(),key=lambda asd:asd[1], reverse=True)
        resultlist = []
        for each in sortres:
            eachres = {}
            eachres['id'] = str(each[0])
            eachres['count'] = str(each[1])
            eachres['Company_Name'] = cont[str(each[0])].Company_Name
            eachres['company_id'] = cont[str(each[0])].id
            resultlist.append(cont[str(each[0])])
            
        a={}
        if not resultlist:
            resultlist = company.objects.filter()
            
        test=[]
        favor_com = favor.objects.filter(who_id=req.user.id)
        for item in favor_com:
            test.append(item.which_id)
        
        for item in resultlist:
            if item.id in test:
                item.is_in_attention = True
            else:
                item.is_in_attention = False
            
        a["result"] = resultlist
        
        if req.user.is_authenticated():
            req.user.is_authenticated = True
            a['user'] = req.user
        
        a["keyword"] = words
        return render_to_response("search.html",a)
    else:
        a={}
     
        resultlist = company.objects.filter()
              
        a["result"] = resultlist
        
        favor_com = favor.objects.filter(who_id=req.user.id)
        test=[]
        for item in favor_com:
            test.append(item.which_id)
            
        for item in resultlist:
            if item.id in test:
                item.is_in_attention = True
            else:
                item.is_in_attention = False
        
        if req.user.is_authenticated():
            req.user.is_authenticated = True
            a['user'] = req.user
        return render_to_response("search.html",a)
Example #12
def go ():
    while 1:
        focus_id = get_all_fid()
        for fid in focus_id:
            config = load_config()
            scrapy_content (config,fid)
            split_word (config)
        print 'sleep %s seconds ' % config.get('scy_stop')
        time.sleep(int(config.get('scy_stop')))

    #thread.start_new_thread(scrapy_content, (config,))
    #thread.start_new_thread(split_word, (config,))



if __name__ == "__main__":

    dbname = "webpy"
    dbuser = "******"
    dbpawd = "1234"
    conn = psycopg2.connect(database=dbname, user=dbuser, password=dbpawd, host='localhost', port=5432)
    cur  = conn.cursor()

    mmseg.dict_load_defaults()   # split Chinese words

    go()
    
    conn.commit()
    cur.close()
    conn.close()
Example #13
def test(text):
    mmseg.dict_load_defaults()
    algor = mmseg.Algorithm(text)
    for tok in algor:
        print '%s [%d..%d]' % (tok.text, tok.start, tok.end)
Example #14
def classify_influence(inf_id, other=False):
    """
    TODO:
    1, Calculate the most popular tags, so that we can assign idf score
    2, Accumulate more meaningful tags
    """
    inf_id = inf_id.decode('gbk')
    print inf_id.encode('gbk')
    try:
        inf_id = int(inf_id)
        inf = Influence.objects.get(pk=inf_id)
    except:
        inf, created = Influence.objects.get_or_create(screen_name=inf_id)
        if created:
            auth = OAuthHandler(settings.SINA_CONSUMER_KEY, settings.SINA_CONSUMER_SECRET)
            auth.setToken('128021658f2bfdae185d89bdffb3cede','1713185d5c8208e8f1ef27a1f484ebc9')
            api = API(auth)
            
            user = api.get_user(screen_name=inf_id)
            inf.sina_account = getAtt(user, 'id')
            inf.verified = getAtt(user,'verified')
            inf.screen_name = getAtt(user, 'screen_name')
            inf.description = getAtt(user, 'description')
            inf.follower_count = getAtt(user, 'followers_count')
            inf.following_count = getAtt(user, 'friends_count')
            inf.status_count = getAtt(user, 'statuses_count')
            inf.favourites_count = getAtt(user, 'favourites_count')
            inf.create_date = getAtt(user, 'created_at')
            inf.save()
    
    auth = OAuthHandler(settings.SINA_CONSUMER_KEY, settings.SINA_CONSUMER_SECRET)
    if other:
        auth.setToken('128021658f2bfdae185d89bdffb3cede', '1713185d5c8208e8f1ef27a1f484ebc9')
    else:
        auth.setToken(inf.sina_key, inf.sina_secret)
    api = API(auth)
    mmseg.dict_load_defaults()
    """Put this in db first"""
    candidate_tags = KeyValue.objects.get(key='CANDIDATE_TAGS')
    
    area_dict = {}
#    id_list = api.followers_ids(user_id=inf.sina_account, count=100) #default to 500, maximum is 5000; This consumes a lot of api limit
#    ids = id_list[0].ids  #Weird that getAtt won't work
#    for id in ids:
#        tags = api.tags(user_id=id)  #user_id is required!
#        tag_list = []
#        for tag in tags:
#            tag_list.append(getAtt(tag, 'value').lower().encode('utf-8'))
#        mmseg_text = mmseg.Algorithm(' '.join(tag_list))
#        for token in mmseg_text:
#            try:
#                term = token.text.decode('utf-8').lower()
#                #next_term = mmseg_text[i+1].text.decode('utf-8') if i < len_list - 1 else ''
#            except:
#                continue
#            train_value = area_train_data.get(term, None)
#            #if not train_value:
#            #    train_value = area_train_data.get(term + next_term, None)
#            if train_value:
#                print 'in dict'
#                for index in train_value:
#                    if index in area_dict:
#                        area_dict[index] += 1
#                    else:
#                        area_dict[index] = 1
#            else:
#                candidate_tags.value += ' ' + term
        
    candidate_tags.save()
    area_distr_dict = {}
    mid_list = []
    ids_list = []
    tweet_list = [] #Store the text of tweet and retweet
    rt_count_list = []
    tried_count = 0
    while True:
        timeline = api.user_timeline(user_id=inf.sina_account, count=200)
        if len(timeline) == 0 and inf.status_count >0:
            tried_count += 1
            print 'try again in getting timeline'
        else:
            break
        if tried_count > 3:
            raise Exception('weibo api error. No timeline got')
            break
        
    for line in timeline:
        text = getAtt(line, 'text')
        retweet = getAtt(line, 'retweeted_status')
        retweet_text = getAtt(retweet, 'text')
        if retweet_text:
            text += retweet_text
        tweet_list.append(text)   
        mid_list.append(str(getAtt(line, "id")))
        if len(mid_list) == 20:
            ids_list.append(','.join(mid_list))
            mid_list = []
    if mid_list: #append the remaining ids
        ids_list.append(','.join(mid_list))
    if inf.status_count > 0 and not ids_list:
        raise Exception('weibo api fails')
    tweet_list_correct = []
    correct_index = 20 
    for ids in ids_list:
        counts = api.counts(ids=ids)
        if len(counts) == 0:
            print 'error in counts!'
            correct_index += 20
            continue
        for obj in counts:
            rt_count_list.append(getAtt(obj, 'rt'))
        tweet_list_correct.extend(tweet_list[correct_index-20:correct_index])
        correct_index += 20    
    if len(tweet_list_correct) == 0 or len(tweet_list_correct) != len(rt_count_list):
        raise Exception('weibo api fails')
    print 'length of tweet list and rt_count list', len(tweet_list_correct), len(rt_count_list)
    #Remedy for those users who have posted fewer than 200 statuses
    amplify_ratio = 1.0 if len(tweet_list_correct) == 200 else 200.0/len(tweet_list_correct)
    for i in range(len(tweet_list_correct)):
        print i
        #This number 100 should be replaced by avg_follower_count
        #Use math.sqrt to boost those tweets that have not been retweeted,
        #and smooth the effect of famous people tweeting about things not related to them
        added_count = (rt_count_list[i]*100 + math.sqrt(inf.follower_count)) * amplify_ratio
        assigned_area = {}
        try: #In Unix environment
            from signal import signal, SIGALRM, alarm #@UnresolvedImport
            def handler(signum, frame):
                #print 'Signal handler called with signal', signum
                raise Exception("This code block runs for too long time!")
            signal(SIGALRM, handler)
            alarm(3)
            mmseg_text = mmseg.Algorithm(tweet_list_correct[i].encode('utf-8'))
            alarm(0) #cancel the alarm once finished
        except ImportError: # In windows, SIGALRM, alarm is not available in signal module
            mmseg_text = mmseg.Algorithm(tweet_list_correct[i].encode('utf-8'))
        except: #mmseg halts for too long, process next tweet
            continue
        for token in mmseg_text:
            try:
                term = token.text.decode('utf-8').lower()
            except:
                continue 
            train_value = area_train_data.get(term, None)
            if train_value:
                print 'in dict'
                for index in train_value:
                    if index not in assigned_area: #count this tweet's score for each area at most once
                        if index in area_dict:
                            area_dict[index] += added_count
                        else:
                            area_dict[index] = added_count
                        assigned_area[index] = True
                        if index in area_distr_dict:
                            area_distr_dict[index] += 1
                        else:
                            area_distr_dict[index] = 1
                    else:
                        area_distr_dict[index] += 1
    candidate_tags.save()
    
    sorted_tuple = sorted(area_dict.iteritems(), key=operator.itemgetter(1), reverse=True)
    if inf.follower_count > 100000: 
        for i in range(1,len(sorted_tuple)): #Only normalize on secondary influence areas and below
            index = sorted_tuple[i][0]
            model_follower_count = areas[index][4]
            if inf.follower_count > model_follower_count:
                area_dict[index] = area_dict[index]*1.0/inf.follower_count*model_follower_count  
    
    num_areas = len(area_distr_dict)
    total_keyword_count = 0
    for index in area_distr_dict:
        total_keyword_count += area_distr_dict[index]
    for k in area_dict:
        area_distr_ratio = num_areas * area_distr_dict[k]*1.0/total_keyword_count
        print k , area_distr_ratio, area_distr_dict[k]
        area_dict[k] = 100.0/math.log(areas[k][3]) * math.log(area_dict[k]*area_distr_ratio)
        if area_dict[k] > 100:
            area_dict[k] = 100.0
                    
    sorted_tuple = sorted(area_dict.iteritems(), key=operator.itemgetter(1), reverse=True) 
    for st in sorted_tuple:
        print areas[st[0]][1].encode('gbk'), st[1]