Example #1
def get_documents(docs):
    # each doc is a tuple of (docid, tokens, subject keyword length)
    docs_tokens = []
    for doc in docs:
        tokens, sub_tok_len = get_keywords(doc[1], doc[2], tokenizer)
        docs_tokens.append((doc[0], tokens, sub_tok_len))

    for docid, tokens, sub_tok_len in docs_tokens:
        for i in range(sub_tok_len):
            # Double the weight of each subject token
            tokens[i] = (tokens[i][0], tokens[i][1] * 2)
    # merge duplicate tokens, summing their weights
    def merge_tokens(tokens):
        tmp_tokens = {}
        for tok, count in tokens:
            if tok in tmp_tokens:
                tmp_tokens[tok] += count
            else:
                tmp_tokens[tok] = count
        # sort merged tokens by descending weight
        return sorted(tmp_tokens.items(), key=lambda d: d[1], reverse=True)
    docs_tokens_new = []
    for docid, tokens, sub_tok_len in docs_tokens:
        docs_tokens_new.append((docid, merge_tokens(tokens)))
    
    # for every token, record the documents it appears in, and count the terms in each document
    tokens_appear_map = {}
    doc_terms_count = {}
    for docid, tokens in docs_tokens_new:
        terms_count = 0
        for token, count in tokens:
            tokens_appear_map.setdefault(token,[]).append(docid)
            terms_count += count
        doc_terms_count[docid] = terms_count
    #for token, appear_list in tokens_appear_map.items():
    #    print(token, appear_list)
    #for docid, tokens in docs_tokens_new:
    #    print(docid, doc_terms_count[docid])

    # print the final result
    #for docid, tokens in docs_tokens_new:
    #    print(docid)
    #    for toks in tokens:
    #        print(toks[0], toks[1])
    documents = []
    for docid, tokens in docs_tokens_new:
        token_map = {}
        for tok, count in tokens:
            token_map[tok] = {"count": count}
        token_list = [tok for tok, _ in tokens]
        documents.append({"text": "id:%s\ntext:%s" % (docid, " ".join(token_list)),
                          "docid": docid,
                          "token_map": token_map,
                          "token_list": token_list})

    return documents, tokens_appear_map, doc_terms_count
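
A minimal usage sketch for get_documents, assuming stand-in implementations of get_keywords and tokenizer (the real ones live elsewhere in this module) and invented sample docs; it is only meant to show the expected shapes of the inputs and outputs.

# Hypothetical stand-ins: the real tokenizer and get_keywords are defined elsewhere.
def tokenizer(text):
    return text.split()

def get_keywords(text, sub_tok_len, tokenizer):
    # return (token, weight) pairs plus the count of leading subject tokens
    return [(tok, 1) for tok in tokenizer(text)], sub_tok_len

docs = [("d1", "apple banana apple", 1),   # invented sample documents
        ("d2", "banana cherry", 0)]
documents, tokens_appear_map, doc_terms_count = get_documents(docs)
print(documents[0]["token_map"])   # {'apple': {'count': 3}, 'banana': {'count': 1}}
print(doc_terms_count)             # {'d1': 4, 'd2': 2}
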
Example #2
def quota_sensitivity(topic, start_ts, end_ts):
    '''
    L is the number of sensitive words among the keywords, N the number of keywords;
    sensitivity = L / N.
    With sensitive-word set1 and keyword set2: min(1, len(set1 & set2) / len(set2)).
    '''
    limit = 50
    keywords_set = get_keywords(topic, start_ts, end_ts, limit)  # set of the top `limit` keywords
    class_result = db.session.query(ClassSensitivity).filter(ClassSensitivity.topic==topic ,\
                                                             ClassSensitivity.start_ts==start_ts ,\
                                                             ClassSensitivity.end_ts==end_ts).first()
    class_sensitivity_set = set(json.loads(class_result.words))
    L = len(class_sensitivity_set & keywords_set)
    # clamp the ratio at 1 before saving
    ratio_class = min(1.0, float(L) / float(limit))
    classification = 1
    save_sensitivity_quota(topic, start_ts, end_ts, classification, ratio_class)
        
    word_result = db.session.query(WordSensitivity).filter(WordSensitivity.topic==topic ,\
                                                         WordSensitivity.start_ts==start_ts ,\
                                                         WordSensitivity.end_ts==end_ts).first()
    word_sensitivity_set = set(json.loads(word_result.words))
    L = len(word_sensitivity_set & keywords_set)
    ratio_word = min(1.0, float(L) / float(limit))
    classification = 2
    save_sensitivity_quota(topic, start_ts, end_ts, classification, ratio_word)
        
    place_result = db.session.query(PlaceSensitivity).filter(PlaceSensitivity.topic==topic ,\
                                                             PlaceSensitivity.start_ts==start_ts ,\
                                                             PlaceSensitivity.end_ts==end_ts).first()
    place_sensitivity_set = set(json.loads(place_result.words))
    L = len(place_sensitivity_set & keywords_set)
    ratio_place = min(1.0, float(L) / float(limit))
    classification = 3
    save_sensitivity_quota(topic, start_ts, end_ts, classification, ratio_place)
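
For reference, the quota itself is just a clamped set-overlap ratio; the following self-contained sketch reproduces that arithmetic with invented word sets and no database or save_sensitivity_quota call.

# Toy illustration of the clamped overlap ratio; all words are invented.
limit = 5
keywords_set = {"market", "protest", "storm", "election", "festival"}
sensitive_set = {"protest", "election", "riot"}

L = len(sensitive_set & keywords_set)       # overlap size: 2
ratio = min(1.0, float(L) / float(limit))   # 2 / 5 = 0.4
print(ratio)
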