import math
import os
import pickle


def score_computing(document, sparse_base_ranking, query, para_2, para_3):
    """Score a candidate document against the query and the documents already
    selected, trading off relevance against diversity.

    Returns a 5-tuple: (combined score, relevance score, diversity score,
    exponent_a, exponent_b).
    """
    alpha = para_2
    vec_candidate = document.get_sparse_rep()
    relate_score = vector_multiplier(query, vec_candidate)
    # With an empty base ranking there is nothing to diversify against.
    if len(sparse_base_ranking) == 0:
        return relate_score, 0.0, 0.0, 0.0, 0.0
    # exponent_a accumulates the candidate's similarity to every document
    # already selected; vec accumulates their combined coverage.
    exponent_a = 0.0
    vec = [0.0 for _ in range(len(vec_candidate))]
    for document_selected in sparse_base_ranking:
        vec_selected = document_selected.get_sparse_rep()
        vec = vector_blend(vec, vec_selected)
        exponent_a += vector_multiplier(vec_candidate, vec_selected)
    # e_vec keeps the query weight of every dimension not yet covered by the
    # selected documents, so exponent_b rewards novel coverage.
    e_vec = [0.0 for _ in range(len(vec_candidate))]
    for i in range(len(e_vec)):
        if vec[i] == 0.0:
            e_vec[i] = query[i]
    exponent_b = 0.5 * vector_multiplier(vec_candidate, e_vec)
    diversify_score = (1.0 / len(sparse_base_ranking)) * math.exp(
        -(exponent_a - para_3 * exponent_b))
    return (relate_score + alpha * diversify_score, relate_score,
            diversify_score, exponent_a, exponent_b)
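# score_computing relies on two vector helpers defined elsewhere in this
# project. The sketches below are assumptions inferred from how they are
# used above (vector_multiplier as a dot product, vector_blend as an
# element-wise sum); the project's real implementations may differ.


def vector_multiplier(vec_a, vec_b):
    # Assumed behaviour: dot product of two equal-length dense vectors.
    return sum(a * b for a, b in zip(vec_a, vec_b))


def vector_blend(vec_a, vec_b):
    # Assumed behaviour: element-wise sum, used above to accumulate the
    # coverage of the already-selected documents.
    return [a + b for a, b in zip(vec_a, vec_b)]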
def construct_data(basepath, topicID, dictionary_norm, dictionary_rep,
                   document_ori_rep, document_rep_norm, mine_method="1"):
    """Build (or load from the pickle cache) the subtopic candidates, document
    representations, vocabulary, original dictionary A1 and query vector."""
    dictionary_A1 = []
    sid_dict = {}  # subtopic objects grouped by sid index; its size is the number of subtopic sets
    subtopic_candidates = []  # Subtopic objects used for dictionary learning
    documents = []  # document objects prepared for sparse representation
    subtopic_word_dict = {}  # word -> labels of the subtopics containing it
    term_weight = {}  # term -> weight
    query = []  # probability of each subtopic given the original query
    pickle_path = basepath + "pickle_data/" + topicID + "-" + mine_method + "/"
    if not os.path.exists(pickle_path):
        os.mkdir(pickle_path)
    pickle_file = (pickle_path + dictionary_rep + dictionary_norm +
                   document_ori_rep + document_rep_norm)
    if os.path.exists(pickle_file) and os.path.isfile(pickle_file):
        # Cache hit: load everything pickled by a previous run.
        infile = open(pickle_file, "rb")
        subtopic_candidates = pickle.load(infile)
        documents = pickle.load(infile)
        word2id_weight = pickle.load(infile)
        dictionary_A1 = pickle.load(infile)
        query = pickle.load(infile)
        infile.close()
    else:
        subtopic_detail = prepare_subtopic(basepath, topicID, [dictionary_rep],
                                           mine_method)
        (subtopic_dict, word2id_weight, query_dict, subtopic_length_ave,
         topic_words, ori_subtopics,
         subtopic_number) = subtopic_detail[dictionary_rep]
        docs, document_term_frequency, average_document_len, topic_word_num = \
            doc_preprocessing(basepath, topicID, word2id_weight, topic_words)
        # Collect, for every word, the labels of the subtopics that contain it.
        sids = sorted(subtopic_dict.keys())
        for sid in sids:
            for id, label, words, frequency, urls in subtopic_dict[sid]:
                for word in words:
                    if word in subtopic_word_dict:
                        subtopic_word_dict[word].append(label)
                    else:
                        subtopic_word_dict[word] = [label]
        # Build the original dictionary A1, one column per subtopic set.
        query = [0.0 for _ in range(len(sids))]
        for index, sid in enumerate(sids):
            query[index] = query_dict[sid]
            vec = [0.0 for _ in range(len(word2id_weight))]
            for id, label, words, frequency, urls in subtopic_dict[sid]:
                vec_tf_idf = [0.0 for _ in range(len(word2id_weight))]
                score_sum = 0.0
                for word in words:
                    tf = words.count(word) * frequency
                    doc_len = len(words)
                    term_fre = None
                    key_fre = 1
                    num_doc = subtopic_number
                    doc_fre = 0
                    for temp, count in word2id_weight[word][2]:
                        doc_fre += count
                    if dictionary_rep == "S2":
                        # S2 additionally weights the word by how often it
                        # appears under this subtopic label.
                        score_tf_idf = (TF(tf, doc_len, doc_fre, term_fre,
                                           key_fre, num_doc,
                                           subtopic_length_ave) *
                                        subtopic_word_dict[word].count(label) /
                                        (len(subtopic_word_dict[word]) + 1))
                    else:  # "S1" and any other value use the plain TF score.
                        score_tf_idf = TF(tf, doc_len, doc_fre, term_fre,
                                          key_fre, num_doc,
                                          subtopic_length_ave)
                    score_sum += score_tf_idf
                    word_index = word2id_weight[word][0]
                    vec_tf_idf[word_index] = score_tf_idf
                if dictionary_norm == "Y" and score_sum != 0:
                    vec_tf_idf = [vec_tf_idf[i] / score_sum
                                  for i in range(len(word2id_weight))]
                vec = vector_blend(vec, vec_tf_idf)
                true_sparse_vec = [0.0 for _ in range(len(sids))]
                true_sparse_vec[index] = 1.0  # alternative: query_dict[sid]
                subtopic = Subtopic()
                subtopic.set_query_frequency(frequency)
                subtopic.set_query_label(label)
                subtopic.set_query_urls(urls)
                subtopic.set_query_vec_tfidf(vec_tf_idf)
                subtopic.set_query_words(words)
                subtopic.set_query_id(id)
                subtopic.set_true_sprase_vec(true_sparse_vec)
                subtopic_candidates.append(subtopic)
                if index in sid_dict:
                    sid_dict[index].append(subtopic)
                else:
                    sid_dict[index] = [subtopic]
            vec_sum = sum(vec)
            if vec_sum != 0 and dictionary_norm == "Y":
                vec = [vec[i] / vec_sum for i in range(len(vec))]
            dictionary_A1.append(vec)
        # Term-weight computing; document_ori_rep selects the scheme D1-D5.
        for term in document_term_frequency.keys():
            if term not in word2id_weight:
                continue
            document_term = document_term_frequency[term][0]
            document_fre = len(document_term_frequency[term][1])
            topic_word_count = topic_word_num
            if term not in term_weight:
                topic_num = len(set(subtopic_word_dict[term]))
                topic_num_all = len(sid_dict)
                subtopic_num_term = 0.0
                # Bug fix: the original indexed word2id_weight with the stale
                # loop variable `word`; `term` is the intended key here.
                for temp, count in word2id_weight[term][2]:
                    subtopic_num_term += count
                if document_ori_rep == "D1":
                    term_weight[term] = 1
                elif document_ori_rep == "D2":
                    # number of subtopic sets containing this term vs. the
                    # total number of subtopic sets
                    term_weight[term] = math.log(
                        topic_num_all / (topic_num + 1) + 1, 10)
                elif document_ori_rep == "D3":
                    # IDF score of the term in the subtopic collection
                    term_weight[term] = IDF(subtopic_num_term, subtopic_number)
                elif document_ori_rep == "D4":
                    term_weight[term] = math.log(
                        topic_word_count /
                        (len(topic_words) * (document_term + 1) + 1), 10)
                elif document_ori_rep == "D5":
                    term_weight[term] = 1 / math.log(
                        topic_word_count / (document_fre + 1) + 1, 2)
        # Prepare the documents' TF-IDF vectors for sparse representation.
        for doc in docs:
            id = doc.get_id()
            terms = doc.get_term_vec()
            tf_idf_vec = [0.0 for _ in range(len(word2id_weight))]
            tf_idf_score_sum = 0.0
            for term in terms:
                tf = terms.count(term)
                doc_len = len(terms)
                doc_fre = 1
                term_fre = None
                num_doc = len(docs)
                key_fre = word2id_weight[term][1]
                score_tf_idf = TF_IDF(tf, doc_len, doc_fre, term_fre, 1,
                                      num_doc,
                                      average_document_len) * term_weight[term]
                term_index = word2id_weight[term][0]
                tf_idf_vec[term_index] = score_tf_idf
                tf_idf_score_sum += score_tf_idf
            vec = tf_idf_vec
            if document_rep_norm == "Y" and tf_idf_score_sum != 0:
                vec = [tf_idf_vec[i] / tf_idf_score_sum
                       for i in range(len(tf_idf_vec))]
            doc.set_tfidf_vec(vec)
            documents.append(doc)
        # Cache everything for the next call with the same configuration.
        outfile = open(pickle_file, "wb")
        pickle.dump(subtopic_candidates, outfile)
        pickle.dump(documents, outfile)
        pickle.dump(word2id_weight, outfile)
        pickle.dump(dictionary_A1, outfile)
        pickle.dump(query, outfile)
        outfile.close()
    return subtopic_candidates, documents, word2id_weight, dictionary_A1, query
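# A minimal usage sketch tying the two functions together. Everything below
# is illustrative, not part of this module: it assumes a sparse-coding step
# elsewhere populates doc.get_sparse_rep() (coefficients over dictionary_A1,
# so the dimensions match the query vector), and that a greedy loop picks,
# at each step, the candidate with the highest combined score from
# score_computing. The configuration strings and parameter values are
# placeholders.


def greedy_rerank_sketch(basepath, topicID, para_2=0.5, para_3=0.5, k=10):
    subtopic_candidates, documents, word2id_weight, dictionary_A1, query = \
        construct_data(basepath, topicID, "Y", "S1", "D1", "Y")
    ranking = []
    candidates = list(documents)
    for _ in range(min(k, len(candidates))):
        # Re-score every remaining candidate against the current ranking and
        # keep the best one (MMR-style greedy diversification).
        best_doc, best_score = None, None
        for doc in candidates:
            score = score_computing(doc, ranking, query, para_2, para_3)[0]
            if best_score is None or score > best_score:
                best_doc, best_score = doc, score
        ranking.append(best_doc)
        candidates.remove(best_doc)
    return ranking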