Exemplo n.º 1
    def Feedback(self, searchList):
        """Re-rank documents with one round of pseudo-relevance feedback.

        The query is scored against the IDF-weighted document vectors, the
        best-scoring document is selected, and every document is scored again
        against ``query + 0.5 * best_document`` (the best document's raw,
        unweighted vector is used for the boost).

        Side effects: ``self.vectorIDF`` is replaced by ``log10(2048.0 / x)``
        of its previous values and ``self.tfidf`` is rebuilt from it.

        NOTE(review): ``self.vectorIDF`` is transformed in place, so a second
        call on the same instance transforms already-transformed values --
        confirm that is intended.  2048.0 is presumably the corpus size;
        verify against the caller.

        Args:
            searchList: query terms, forwarded to ``self.buildQueryVector``.

        Returns:
            One ``util.cosine`` score per entry of ``self.tfidf``; an empty
            list when there are no documents.
        """
        queryVector = self.buildQueryVector(searchList)
        self.vectorIDF = [float(math.log10(2048.0 / x)) for x in self.vectorIDF]
        # A nested comprehension replaces map() + a tuple-parameter lambda:
        # the lambda form is a syntax error on Python 3, and map() there
        # would yield one-shot iterators instead of lists.
        self.tfidf = [
            [idf * weight for idf, weight in zip(self.vectorIDF, documentVector)]
            for documentVector in self.documentVectors
        ]
        ratings = [
            util.cosine(queryVector, documentVector)
            for documentVector in self.tfidf
        ]
        if not ratings:
            # No documents: nothing to pick as feedback and nothing to score.
            return []

        # First document whose score strictly exceeds every earlier one;
        # stays at index 0 when no score is above zero.
        best_score = 0
        best_index = 0
        for index, score in enumerate(ratings):
            if score > best_score:
                best_score = score
                best_index = index

        best_document = self.documentVectors[best_index]
        newqueryVector = [
            weight + 0.5 * best_document[position]
            for position, weight in enumerate(queryVector)
        ]
        return [
            util.cosine(newqueryVector, documentVector)
            for documentVector in self.tfidf
        ]
Exemplo n.º 2
 def feedbacksearch(self, searchList, wordString, flag):
     """Score every document against the query boosted by feedback text.

     The query vector returned by ``self.buildQueryVector`` is updated in
     place to ``query + 0.5 * feedback`` (each entry converted to float).

     Args:
         searchList: query terms, forwarded to ``self.buildQueryVector``.
         wordString: feedback text, forwarded to ``self.makeVector``.
         flag: forwarded unchanged to both vector builders.

     Returns:
         One ``util.cosine`` score per entry of ``self.documentVectors``.
     """
     queryVector = self.buildQueryVector(searchList, flag)
     feedback = self.makeVector(wordString, flag)
     # 0.5 instead of (1/2): under Python 2 integer division 1/2 == 0, which
     # silently dropped the feedback contribution.
     for index, weight in enumerate(queryVector):
         queryVector[index] = float(weight + feedback[index] * 0.5)
     return [
         util.cosine(queryVector, documentVector)
         for documentVector in self.documentVectors
     ]
Exemplo n.º 3
def baseline_average_cos(article_num, num_times, print_out=True, equal_prob=True):
    """Measure the cosine spread between random spins of one source article.

    Generates two random spins of source article ``article_num``, computes
    their cosine similarity, repeats this ``num_times`` and reports the
    lowest, highest and average cosine.

    Args:
        article_num: source article index passed to
            ``source_articles.spin_articles``.
        num_times: number of pairs to sample; must be positive.
        print_out: when true, print a summary of the run.
        equal_prob: when true, each spin comes from its own
            ``spin_articles(article_num)`` call; otherwise both spins come
            from a single ``spin_articles(article_num, 2)`` call.

    Returns:
        ``(lowest, highest, average)`` over the sampled cosines.

    Raises:
        ValueError: if ``num_times`` is not positive (previously a
            ZeroDivisionError, or a meaningless result for negatives).
    """
    if num_times <= 0:
        raise ValueError("num_times must be a positive integer")

    lowest = float("inf")
    highest = float("-inf")
    total = 0.0
    for _ in range(num_times):
        if equal_prob:
            a1 = source_articles.spin_articles(article_num)[0]
            a2 = source_articles.spin_articles(article_num)[0]
        else:
            # Without this branch the pair call ran unconditionally and
            # overwrote the two independent spins above.
            a1, a2 = source_articles.spin_articles(article_num, 2)

        cos_sim = cosine(a1, a2)
        lowest = min(lowest, cos_sim)
        highest = max(highest, cos_sim)
        total += cos_sim

    average = total / num_times
    if print_out:
        print('Ran {0} times for article {1}'.format(num_times, article_num))
        if equal_prob:
            print('Generated with equal probability')
        else:
            print('Generated using heuristic to produce low cosine')
        print('Lowest : {0}\nHighest : {1}\nAverage : {2}\n'.format(lowest, highest, average))
    return lowest, highest, average
Exemplo n.º 4
    def search(self, searchList):
        """Rate every document against a query built from the given terms.

        Returns one ``util.cosine`` score per entry of
        ``self.documentVectors``, in the same order.
        """
        query = self.buildQueryVector(searchList)
        scores = []
        for candidate in self.documentVectors:
            scores.append(util.cosine(query, candidate))
        return scores
Exemplo n.º 5
 def related(self, documentId):
     """Compare one stored document against every stored document.

     Returns a dict mapping each key of ``self.documentVectors`` to the
     ``util.cosine`` score between the vector stored under ``documentId``
     and the vector stored under that key.
     """
     return {
         key: util.cosine(self.documentVectors[documentId], vector)
         for key, vector in self.documentVectors.items()
     }
Exemplo n.º 6
 def related(self, documentId):
     """Compare one stored document against every stored document.

     Args:
         documentId: index into ``self.documentVectors`` of the reference
             document.

     Returns:
         One ``util.cosine`` score per entry of ``self.documentVectors``,
         in the same order.
     """
     # The closing bracket of this comprehension was missing, which made
     # the method a syntax error.
     ratings = [
         util.cosine(self.documentVectors[documentId], documentVector)
         for documentVector in self.documentVectors
     ]
     return ratings
Exemplo n.º 7
	def search(self, searchList):
		"""Rate every document against a query built from the given terms.

		Returns one ``util.cosine`` score per entry of
		``self.documentVectors``, in the same order.
		"""
		query = self.buildQueryVector(searchList)
		scores = []
		for candidate in self.documentVectors:
			scores.append(util.cosine(query, candidate))
		return scores
Exemplo n.º 8
 def search(self, searchList, compare, flag):
     """Rate every document against the query with the chosen measure.

     Args:
         searchList: query terms, forwarded to ``self.buildQueryVector``.
         compare: ``"cos"`` to score with ``util.cosine`` or ``"dis"`` to
             score with ``util.Euclidean``.
         flag: forwarded unchanged to ``self.buildQueryVector``.

     Returns:
         One score per entry of ``self.documentVectors``, in order.

     Raises:
         ValueError: if ``compare`` is neither ``"cos"`` nor ``"dis"``
             (previously this surfaced as an UnboundLocalError).
     """
     if compare not in ("cos", "dis"):
         raise ValueError("unknown compare mode: %r" % (compare,))
     queryVector = self.buildQueryVector(searchList, flag)
     if compare == "cos":
         return [
             util.cosine(queryVector, documentVector)
             for documentVector in self.documentVectors
         ]
     return [
         util.Euclidean(queryVector, documentVector)
         for documentVector in self.documentVectors
     ]
 def related(self, documentId):
     """Compare one stored document against every stored document.

     Args:
         documentId: key into ``self.documentVectors`` of the reference
             document.

     Returns:
         A dict mapping each key of ``self.documentVectors`` to the
         ``util.cosine`` score against the reference document.
     """
     ratings = {}
     # ``self.`` was missing here, so the loop raised NameError.
     for key, value in self.documentVectors.items():
         ratings[key] = util.cosine(self.documentVectors[documentId], value)
     return ratings
Exemplo n.º 10
    def TF_Cosine(self, query):
        """Score the query against every document TF vector.

        Returns one ``util.cosine`` score per entry of
        ``self.documentTFVectors``, in the same order.

        NOTE(review): the vector from ``self.makeTfVector`` is wrapped in a
        one-element list before being passed to ``util.cosine`` -- confirm
        that ``util.cosine`` expects that shape.
        """
        wrapped_query = [self.makeTfVector(query)]
        return [
            util.cosine(wrapped_query, doc_vector)
            for doc_vector in self.documentTFVectors
        ]
    def searchTFIDFWithCosine(self, searchList):
        """Rate every TF-IDF document vector against the query.

        Args:
            searchList: query terms, forwarded to
                ``self.buildTFIDFQueryVector``.

        Returns:
            One ``util.cosine`` score per entry of ``self.TFIDFVectors``,
            in the same order.
        """
        queryVector = self.buildTFIDFQueryVector(searchList)

        # The closing bracket of this comprehension was missing.
        ratings = [
            util.cosine(queryVector, documentVector)
            for documentVector in self.TFIDFVectors
        ]
        return ratings
 def relevence_search(self, searchVector, formula="cosine", weighting='tf'):
     """Rank every document against an already-built search vector.

     Args:
         searchVector: query vector, passed straight to the scoring function.
         formula: ``"cosine"`` (``util.cosine``) or ``"euclidean"``
             (``util.euclidean``).
         weighting: accepted for interface compatibility; not used here.

     Returns:
         A dict of ``{document key: score}`` whose insertion order is by
         descending score.

         NOTE(review): descending order is also applied to Euclidean
         scores, which lists the most distant documents first -- confirm
         that is what callers want.

     Raises:
         ValueError: if ``formula`` is not a supported name (previously an
             UnboundLocalError on the first document).
     """
     if formula not in ("cosine", "euclidean"):
         raise ValueError("unknown formula: %r" % (formula,))
     ratings = {}
     for key, value in self.documentVectors.items():
         if formula == "cosine":
             ratings[key] = util.cosine(searchVector, value)
         else:
             ratings[key] = util.euclidean(searchVector, value)
     ranked = sorted(ratings.items(), key=lambda item: item[1], reverse=True)
     return dict(ranked)
Exemplo n.º 13
def get_cosines(src_articles):
    """Collect cosine samples for two kinds of article pairs.

    * "positive" samples: two spins of the same randomly chosen source
      article obtained from ``spin_dissimilar_articles(num, 2)``.
    * "negative" samples: one spin of a random article and one spin of a
      randomly chosen member of its ``get_very_similar_articles`` result.

    Sampling continues until ``NUM_POSITIVES`` and ``NUM_NEGATIVES`` values
    have been gathered.

    NOTE(review): if no article has any very-similar article, the second
    loop never terminates -- confirm the data guarantees at least one.

    Args:
        src_articles: article source exposing ``count``,
            ``spin_dissimilar_articles``, ``get_very_similar_articles`` and
            ``spin_articles``.

    Returns:
        ``(positive_cosines, negative_cosines)`` as two lists.
    """
    positive_cosines = []
    negative_cosines = []

    while len(positive_cosines) < NUM_POSITIVES:
        num = random.randint(0, src_articles.count - 1)
        a1, a2 = src_articles.spin_dissimilar_articles(num, 2)
        positive_cosines.append(cosine(" ".join(a1), " ".join(a2)))

    while len(negative_cosines) < NUM_NEGATIVES:
        num1 = random.randint(0, src_articles.count - 1)
        similar_articles = list(src_articles.get_very_similar_articles(num1))
        if not similar_articles:
            # Nothing to pair this article with; draw another one.  (The
            # body of this ``if`` was missing, which was a syntax error.)
            continue
        num2 = random.choice(similar_articles)
        a1 = " ".join(src_articles.spin_articles(num1)[0])
        a2 = " ".join(src_articles.spin_articles(num2)[0])
        negative_cosines.append(cosine(a1, a2))

    return positive_cosines, negative_cosines
Exemplo n.º 14
 def searchTf(self, query):
     """Score the query against every TF vector with two measures.

     Args:
         query: forwarded to ``self.makeTfVector``.

     Returns:
         ``[tf_cos, tf_dist]``: the ``util.cosine`` scores and the
         ``util.euclidean`` scores, each with one entry per element of
         ``self.tfVectors``.
     """
     queryVector = self.makeTfVector(query)
     # Both comprehensions below were missing their closing bracket.
     tf_cos = [
         util.cosine(queryVector, documentVector)
         for documentVector in self.tfVectors
     ]
     tf_dist = [
         util.euclidean(queryVector, documentVector)
         for documentVector in self.tfVectors
     ]
     return [tf_cos, tf_dist]
 def search(self, searchList, formula="cosine", weighting="tf"):
     """Rank every document against a query built from the given terms.

     Args:
         searchList: query terms, forwarded to ``self.buildQueryVector``.
         formula: ``"cosine"`` (``util.cosine``) or ``"euclidean"``
             (``util.euclidean``).
         weighting: forwarded unchanged to ``self.buildQueryVector``.

     Returns:
         A dict of ``{document key: score}`` whose insertion order is by
         descending score.

         NOTE(review): descending order is also applied to Euclidean
         scores, which lists the most distant documents first -- confirm
         that is what callers want.

     Raises:
         ValueError: if ``formula`` is not a supported name (previously an
             UnboundLocalError on the first document).
     """
     if formula not in ("cosine", "euclidean"):
         raise ValueError("unknown formula: %r" % (formula,))
     queryVector = self.buildQueryVector(searchList, weighting)
     ratings = {}
     for key, value in self.documentVectors.items():
         if formula == "cosine":
             ratings[key] = util.cosine(queryVector, value)
         else:
             ratings[key] = util.euclidean(queryVector, value)
     ranked = sorted(ratings.items(), key=lambda item: item[1], reverse=True)
     return dict(ranked)
Exemplo n.º 16
    def search_nltk(self, searchList, method="1"):
        """Return the ten best-scoring documents for the given query vector.

        Args:
            searchList: passed directly to ``util.cosine`` as the query
                side of every comparison (no vector is built here).
            method: accepted for interface compatibility; not used here.

        Returns:
            Up to ten ``(document key, score)`` tuples, highest score first.
        """
        rating_dic = {
            key: util.cosine(searchList, value)
            for key, value in self.documentVectors.items()
        }
        # The dict comprehension and ``sorted(`` call here were unclosed.
        # Sorting the items directly gives the same ordered pairs without
        # the intermediate dict.
        ranked = sorted(rating_dic.items(), key=lambda item: item[1], reverse=True)
        return ranked[:10]
Exemplo n.º 17
    def tf_idf_search(self, searchList):
        """Score the query against TF-IDF weighted document vectors.

        Side effect: ``self.tfidVectors`` is recomputed from
        ``self.documentVectors`` via ``util.tf_idf`` on every call.

        Args:
            searchList: query terms, forwarded to ``self.buildQueryVector``.

        Returns:
            One ``util.cosine`` score per entry of ``self.tfidVectors``.
        """
        queryVector = self.buildQueryVector(searchList)
        self.tfidVectors = util.tf_idf(self.documentVectors)
        # The closing bracket of this comprehension was missing.
        ratings = [
            util.cosine(queryVector, documentVector)
            for documentVector in self.tfidVectors
        ]
        return ratings
Exemplo n.º 18
    def f_search(self, searchList, doc, way):
        """Score documents against the query plus a feedback tag vector.

        The query vector returned by ``self.buildQueryVector`` is updated in
        place by adding the vector from ``self.makeTagVector(doc)``.

        Args:
            searchList: query terms, forwarded to ``self.buildQueryVector``.
            doc: feedback document, forwarded to ``self.makeTagVector``.
            way: ``"cosine"`` (``util.cosine``) or ``"euclid"``
                (``util.euclid``).

        Returns:
            One score per entry of ``self.documentVectors``, in order.

        Raises:
            ValueError: if ``way`` is not a supported name (previously an
                UnboundLocalError at the ``return``).
        """
        if way not in ("cosine", "euclid"):
            raise ValueError("unknown way: %r" % (way,))
        queryVector = self.buildQueryVector(searchList)
        fVector = self.makeTagVector(doc)

        for i in range(len(queryVector)):
            queryVector[i] += fVector[i]

        if way == "cosine":
            return [
                util.cosine(queryVector, documentVector)
                for documentVector in self.documentVectors
            ]
        return [
            util.euclid(queryVector, documentVector)
            for documentVector in self.documentVectors
        ]
Exemplo n.º 19
def baseline_cos_different_articles(a1_num, a2_num, num_times):
    """Measure the cosine spread between spins of two source articles.

    On each of ``num_times`` rounds one spin of article ``a1_num`` and one
    spin of article ``a2_num`` are generated and compared.

    Args:
        a1_num: first source article index.
        a2_num: second source article index.
        num_times: number of pairs to sample; must be positive.

    Returns:
        ``(lowest, highest, average)`` over the sampled cosines.

    Raises:
        ValueError: if ``num_times`` is not positive (previously a
            ZeroDivisionError, or a meaningless result for negatives).
    """
    if num_times <= 0:
        raise ValueError("num_times must be a positive integer")

    lowest = float("inf")
    highest = float("-inf")
    total = 0.0
    # range() instead of xrange(): the latter does not exist on Python 3.
    for _ in range(num_times):
        a1 = source_articles.spin_articles(a1_num)[0]
        a2 = source_articles.spin_articles(a2_num)[0]
        cos_sim = cosine(a1, a2)
        lowest = min(lowest, cos_sim)
        highest = max(highest, cos_sim)
        total += cos_sim

    return lowest, highest, total / num_times
Exemplo n.º 20
 def search1and2(self, searchList, compare, flag):
     """Score every document against the query and return the sorted result.

     Args:
         searchList: query terms, forwarded to ``self.buildQueryVector``.
         compare: ``"cos"`` (``util.cosine``) or ``"el"``
             (``util.Euclidean``).
         flag: forwarded unchanged to ``self.Sort``.

     Returns:
         Whatever ``self.Sort(ratings, flag)`` returns, where ``ratings``
         holds one score per entry of ``self.documentVectors``.

     Raises:
         ValueError: if ``compare`` is neither ``"cos"`` nor ``"el"``
             (previously an UnboundLocalError).
     """
     if compare not in ("cos", "el"):
         raise ValueError("unknown compare mode: %r" % (compare,))
     queryVector = self.buildQueryVector(searchList)
     # Both comprehensions were missing their closing bracket.
     if compare == "cos":
         ratings = [
             util.cosine(queryVector, documentVector)
             for documentVector in self.documentVectors
         ]
     else:
         ratings = [
             util.Euclidean(queryVector, documentVector)
             for documentVector in self.documentVectors
         ]
     return self.Sort(ratings, flag)
Exemplo n.º 21
    def search(self, searchList, method="0"):
        """Return the ten best-scoring documents for the given terms.

        Args:
            searchList: query terms, forwarded to ``self.buildQueryVector``.
            method: forwarded unchanged to ``self.buildQueryVector``.

        Returns:
            Up to ten ``(document key, score)`` tuples, highest
            ``util.cosine`` score first.
        """
        queryVector = self.buildQueryVector(searchList, method)
        rating_dic = {
            key: util.cosine(queryVector, value)
            for key, value in self.documentVectors.items()
        }
        # The dict comprehension and ``sorted(`` call here were unclosed.
        # Sorting the items directly gives the same ordered pairs without
        # the intermediate dict.
        ranked = sorted(rating_dic.items(), key=lambda item: item[1], reverse=True)
        return ranked[:10]
Exemplo n.º 22
    def search(self, relevanceType):
        """Score every document against the query stored on the instance.

        Side effect: ``self.queryVector`` is rebuilt from ``self.queryList``.

        Args:
            relevanceType: ``'cs'`` (``util.cosine``) or ``'eu'``
                (``util.euclidean``).

        Returns:
            One score per entry of ``self.documentVectors``, in order.

        Raises:
            ValueError: if ``relevanceType`` is neither ``'cs'`` nor ``'eu'``
                (previously an UnboundLocalError).
        """
        if relevanceType not in ('cs', 'eu'):
            raise ValueError("unknown relevanceType: %r" % (relevanceType,))
        self.queryVector = self.buildQueryVector(self.queryList)

        # Both comprehensions were missing their closing bracket.
        if relevanceType == 'cs':
            ratings = [
                util.cosine(self.queryVector, documentVector)
                for documentVector in self.documentVectors
            ]
        else:
            ratings = [
                util.euclidean(self.queryVector, documentVector)
                for documentVector in self.documentVectors
            ]
        return ratings
Exemplo n.º 23
 def tfidf(self, queryVector, flag):
     """Apply IDF weights to the query in place, then score every document.

     Each ``queryVector[i]`` whose ``self.idf[i]`` is positive is multiplied
     by ``log10(7034 / self.idf[i])``; other entries are left untouched.

     NOTE(review): 7034 is presumably the number of documents in the
     corpus -- verify against the code that fills ``self.idf``.

     Args:
         queryVector: query weights; modified in place.
         flag: ``"cos"`` (``util.cosine``) or ``"el"`` (``util.Euclidean``).

     Returns:
         One score per entry of ``self.documentVectors``, in order.

     Raises:
         ValueError: if ``flag`` is neither ``"cos"`` nor ``"el"``
             (previously an UnboundLocalError).
     """
     if flag not in ("cos", "el"):
         raise ValueError("unknown flag: %r" % (flag,))
     for i in range(len(queryVector)):
         if self.idf[i] > 0:
             # 7034.0 forces true division; ``float(7034 / idf)`` truncated
             # first under Python 2 when ``idf`` held integers.
             queryVector[i] = queryVector[i] * math.log10(7034.0 / self.idf[i])
     # Both comprehensions were missing their closing bracket.
     if flag == "cos":
         ratings = [
             util.cosine(queryVector, documentVector)
             for documentVector in self.documentVectors
         ]
     else:
         ratings = [
             util.Euclidean(queryVector, documentVector)
             for documentVector in self.documentVectors
         ]
     return ratings
    def searchRelevantFeedback(self, QueryFeedback):
        """Score documents against a query boosted by relevance feedback.

        Args:
            QueryFeedback: a two-item sequence; item 0 is the query and
                item 1 is the relevant feedback.  Each is passed to
                ``self.buildTFIDFQueryVector``.

        Returns:
            One ``util.cosine`` score per entry of ``self.TFIDFVectors``,
            computed against ``1 * query + 0.5 * feedback``.
        """
        query = self.buildTFIDFQueryVector(QueryFeedback[0])
        feedback = self.buildTFIDFQueryVector(QueryFeedback[1])

        # Indexing by the query's length (rather than zip) keeps the original
        # behavior of failing loudly if the feedback vector is shorter.
        NewQuery = [1 * query[i] + 0.5 * feedback[i] for i in range(len(query))]

        # The closing bracket of this comprehension was missing.
        ratings = [
            util.cosine(NewQuery, documentVector)
            for documentVector in self.TFIDFVectors
        ]
        return ratings
Exemplo n.º 25
 def search(self, searchList, way, idf=False):
     """Rate every document against a query built from the given terms.

     Args:
         searchList: query terms, forwarded to ``self.buildQueryVector``.
         way: ``'cos'`` scores with ``util.cosine``; any other value scores
             with ``util.euc_distance``.
         idf: when equal to ``True``, the query vector is first passed
             through ``self.tfidf``.

     Returns:
         One score per entry of ``self.documentVectors``, in order.
     """
     queryVector = self.buildQueryVector(searchList)
     if idf == True:  # noqa: E712 -- kept as an equality test on purpose
         queryVector = self.tfidf(queryVector)
     # The ``else:`` line and both closing brackets were missing; without
     # the ``else`` the cosine ratings were always overwritten.
     if way == 'cos':
         ratings = [
             util.cosine(queryVector, documentVector)
             for documentVector in self.documentVectors
         ]
     else:
         ratings = [
             util.euc_distance(queryVector, documentVector)
             for documentVector in self.documentVectors
         ]
     return ratings
Exemplo n.º 26
    def feedback(self, first_doc):
        """Re-score documents using nouns and verbs from a feedback document.

        ``first_doc`` is tokenized and POS-tagged with NLTK; tokens whose tag
        starts with ``NN`` or ``VB`` form a feedback query.  Documents are
        then scored against ``self.queryVector + 0.5 * feedback``, combined
        element by element.

        Side effect: if ``self.queryVector`` is empty it is first built from
        ``self.queryList``.

        Args:
            first_doc: text of the feedback document.

        Returns:
            One ``util.cosine`` score per entry of ``self.documentVectors``.
        """
        text = nltk.word_tokenize(first_doc)
        pos_tagged = nltk.pos_tag(text)

        feedbackQueryList = [
            word for word, tag in pos_tagged if tag[:2] in ('NN', 'VB')
        ]
        feedbackQueryVector = self.buildQueryVector(feedbackQueryList)
        if len(self.queryVector) == 0:
            self.queryVector = self.buildQueryVector(self.queryList)

        # Both comprehensions below were missing their closing bracket.
        queryVector = [
            e + 0.5 * a for e, a in zip(self.queryVector, feedbackQueryVector)
        ]
        ratings = [
            util.cosine(queryVector, documentVector)
            for documentVector in self.documentVectors
        ]
        return ratings
Exemplo n.º 27
 def get_comparison_stats(self, article1, article2, classified_article1, classified_article2, do_print=True):
     """Return the cosine of every pairing of two articles and their
     classified counterparts.

     Each argument is an iterable of strings that is joined with single
     spaces before being handed to ``cosine``.

     Args:
         article1: first article.
         article2: second article.
         classified_article1: classified version of the first article.
         classified_article2: classified version of the second article.
         do_print: when true, print every cosine plus a cross/self ratio.

     Returns:
         ``(a1_a2, ca1_ca2, a1_ca1, a2_ca2, a1_ca2, a2_ca1)``.
     """
     text1 = " ".join(article1)
     text2 = " ".join(article2)
     classified1 = " ".join(classified_article1)
     classified2 = " ".join(classified_article2)

     a1_a2 = cosine(text1, text2)
     ca1_ca2 = cosine(classified1, classified2)
     a1_ca1 = cosine(classified1, text1)
     a2_ca2 = cosine(classified2, text2)
     a1_ca2 = cosine(classified2, text1)
     a2_ca1 = cosine(classified1, text2)
     if do_print:
         print("COSINE OF ARTICLES: {0}".format(a1_a2))
         print("COSINE OF CLASSIFIED ARTICLES: {0}".format(ca1_ca2))
         print("cosine of A1 and classified A1: {0}".format(a1_ca1))
         print("cosine of A2 and classified A2: {0}".format(a2_ca2))
         print("cosine of A1 and classified A2: {0}".format(a1_ca2))
         print("cosine of A2 and classified A1: {0}".format(a2_ca1))
         if a1_ca1 and a2_ca2:
             print("ratio: {0}".format(((a2_ca1 / a1_ca1) + (a1_ca2 / a2_ca2)) / 2))
         else:
             # A zero self-cosine would make the ratio divide by zero and
             # abort before the stats are returned.
             print("ratio: undefined")
     return (a1_a2, ca1_ca2, a1_ca1, a2_ca2, a1_ca2, a2_ca1)
Exemplo n.º 28
    print('TF Weighting + Cosine Similarity:')

    print('TF Weighting + Euclidean Distance:')

    print('TF-IDF Weighting + Cosine Similarity:')

    print('TF-IDF Weighting + Euclidean Distance:')

    #   for Q2 Relevance feedback
    newSearchIndex = indexList.index(top5_tfidf_cos[0][0])
    documents = doc[newSearchIndex]
    feedBackVector = vectorSpace.makeVecRelevance(documents)
    ansVec = searchVector + feedBackVector

    finalScore = [
        util.cosine(ansVec, docVec) for docVec in vectorSpace.tfidfVec
    Q2 = sorted(list(zip(indexList, finalScore)),

    print('Relevance Feedback + TF-IDF Weighting + Cosine Similarity:')
    time_end = datetime.datetime.now()
    time_cost = time_end - time_start
    print("cost time :", time_cost)
Exemplo n.º 29
 def agreement(self, index1, index2):
     """Return the ``util.cosine`` score between two entries of ``self.allIdeas``."""
     first_idea = self.allIdeas[index1]
     second_idea = self.allIdeas[index2]
     return util.cosine(first_idea, second_idea)
 def searchtfidfcosine(self, searchList):
     """Score IDF-processed documents against the IDF-processed query.

     Both the query vector and every entry of ``self.documentVectors`` are
     passed through ``self.computeidf`` before being compared with
     ``util.cosine``.  Returns one score per document, in order.
     """
     weighted_query = self.computeidf(self.buildQueryVector(searchList))

     # Weight all documents first, then score them, as two separate passes.
     weighted_documents = []
     for raw_vector in self.documentVectors:
         weighted_documents.append(self.computeidf(raw_vector))

     scores = []
     for weighted_vector in weighted_documents:
         scores.append(util.cosine(weighted_query, weighted_vector))
     return scores
Exemplo n.º 31
def main(query):
    """Run the four weighting/measure combinations plus one feedback round.

    Builds a ``VectorSpace`` over the module-level ``documents``, scores
    ``query`` with TF and TF-IDF weighting under cosine and Euclidean
    measures, then repeats the TF-IDF + cosine scoring with a feedback
    vector added to the query.  Only the section headings are printed.

    NOTE(review): every ranking, including the cosine ones, is sorted in
    ascending score order before the first five are kept -- confirm that is
    intended for a similarity measure.
    """

    def first_five_ascending(scores):
        """Pair scores with ``indexList`` ids, sort ascending, keep five."""
        paired = list(zip(indexList, scores))
        paired.sort(key=lambda pair: pair[1])
        return paired[:5]

    # Vector space model over the whole corpus.
    space = VectorSpace(documents)

    # Raw scores for each weighting / measure combination.
    scores_tf_cos = space.TF_Cosine(query)
    scores_tf_euclidean = space.TF_Euclidean(query)
    scores_tfidf_cos = space.TFIDF_Cosine(query)
    scores_tfidf_euclidean = space.TFIDF_Euclidean(query)

    # Five leading entries of each ranking.
    top5_tf_cos = first_five_ascending(scores_tf_cos)
    top5_tf_euclidean = first_five_ascending(scores_tf_euclidean)
    top5_tfidf_cos = first_five_ascending(scores_tfidf_cos)
    top5_tfidf_euclidean = first_five_ascending(scores_tfidf_euclidean)

    print('Term Frequency Weighting + Cosine Similarity:')

    print('Term Frequency Weighting + Euclidean Distance:')

    print('TF-IDF Weighting + Cosine Similarity:')

    print('TF-IDF Weighting + Euclidean Distance:')

    # Relevance feedback: take the document leading the TF-IDF + cosine list.
    leading_id = top5_tfidf_cos[0][0]
    feedback_document = documents[indexList.index(leading_id)]

    # The intended scheme is [1 * original query + 0.5 * feedback query];
    # presumably makeFeedbackVector already applies the 0.5 -- verify.
    feedback_vector = space.makeFeedbackVector(feedback_document)
    query_vector = np.array(space.makeTfIdfVector(query))
    rf_vector = query_vector + feedback_vector

    # Score the combined vector against every TF-IDF document vector.
    rf_scores = [
        util.cosine(rf_vector, tfidf_vector)
        for tfidf_vector in space.documentTFIDFVectors
    ]
    top5_rf_tfidf_cos = first_five_ascending(rf_scores)

    print('Relevance Feedback + TF-IDF Weighting + Cosine Similarity:')
Exemplo n.º 32
            for documentVector in self.documentVectors
        return ratings

    def IDFCOS(self, searchList):
        """Score the query against IDF-weighted document vectors.

        Side effects: ``self.vectorIDF`` is replaced by ``log10(2048.0 / x)``
        of its previous values and ``self.tfidf`` is rebuilt from it.

        NOTE(review): ``self.vectorIDF`` is transformed in place, so a second
        call on the same instance transforms already-transformed values --
        confirm that is intended.  2048.0 is presumably the corpus size;
        verify against the caller.

        Args:
            searchList: query terms, forwarded to ``self.buildQueryVector``.

        Returns:
            One ``util.cosine`` score per entry of ``self.tfidf``.
        """
        queryVector = self.buildQueryVector(searchList)

        self.vectorIDF = [float(math.log10(2048.0 / x)) for x in self.vectorIDF]
        # A nested comprehension replaces map() + a tuple-parameter lambda:
        # the lambda form is a syntax error on Python 3, and map() there
        # would yield one-shot iterators instead of lists.
        self.tfidf = [
            [idf * weight for idf, weight in zip(self.vectorIDF, documentVector)]
            for documentVector in self.documentVectors
        ]
        ratings = [
            util.cosine(queryVector, documentVector)
            for documentVector in self.tfidf
        ]
        return ratings

    def IDFED(self, searchList):
        queryVector = self.buildQueryVector(searchList)

        #tfidf = [map(lambda (a,b):a*b,zip(self.vectorIDF, documentVector)) for documentVector in self.documentVectors]
        ratings = [
            util.Euclidean_Distance(queryVector, documentVector)
            for documentVector in self.tfidf
Exemplo n.º 33
def cal_fq_tfidf_cs(vectorSpace, files, query):
    """Feedback Query + TF-IDF Weighting + Cosine Similarity.

    The top-ranked document from ``cal_tfidf_cs`` is expanded back into
    words, the words are POS-tagged, and the nouns/verbs become a feedback
    vector.  The new query term weighting is::

        1 * original query + 0.5 * feedback query

    For instance, suppose the index vector is
    ["network", "computer", "share", "ask", "soccer", "song"],
    the query is "network", and the content of the feedback document is
    "Jimmy shares songs via the computer network."
    Then the new query vector is
    1 * [1, 0, 0, 0, 0, 0] + 0.5 * [1, 1, 1, 0, 0, 1] = [1.5, 0.5, 0.5, 0, 0, 0.5]

    Args:
        vectorSpace: object exposing ``documentVectors``,
            ``vectorKeywordIndex`` (word -> vector position) and
            ``buildQueryVector``.
        files: sequence indexed by document position; supplies the ids.
        query: query string, passed to ``buildQueryVector`` as ``[query]``.

    Returns:
        A ``pandas.DataFrame`` with columns ``docID`` and ``Score`` holding
        up to five documents, best score first; scores are rounded to six
        decimals.
    """
    sorted_indices, _ = cal_tfidf_cs(vectorSpace, files, query)

    # Vector of the document ranked first by the plain TF-IDF search.
    first_vector = vectorSpace.documentVectors[sorted_indices[0]]

    keyword_index = vectorSpace.vectorKeywordIndex
    # Invert word -> position once instead of scanning the dict per lookup.
    position_to_word = {position: word for word, position in keyword_index.items()}

    # Map the vector's items back into words, one copy per occurrence.
    # NOTE(review): the inner loop body was missing from the source; this
    # reconstruction repeats each word ``count`` times -- confirm.
    words = []
    for position, count in enumerate(first_vector):
        if count > 0 and position in position_to_word:
            words.extend([position_to_word[position]] * int(count))

    # POS-tag the words and count nouns/verbs into the feedback vector.
    tagged = nltk.pos_tag(words)
    feedback = [0] * len(keyword_index)
    pos = ['NN', 'VB', 'VBP', 'VBD', 'VBG']
    for word, tag in tagged:
        if tag in pos and word in keyword_index:
            feedback[keyword_index[word]] += 1

    queryVector = vectorSpace.buildQueryVector([query])
    new_query = list(np.asarray(queryVector) + 0.5 * np.asarray(feedback))

    # feedback rating
    print("Feedback Queries + TF-IDF Weighting + Cosine Similarity feedback")
    scores = [
        util.cosine(new_query, documentVector)
        for documentVector in vectorSpace.documentVectors
    ]

    # Positions of the (up to) five largest scores, best first.  A plain
    # sort also works when there are fewer than five documents, where
    # ``np.argpartition(scores, -5)`` raised.
    top_positions = sorted(
        range(len(scores)), key=lambda position: scores[position], reverse=True
    )[:5]

    # NOTE(review): the source looked up ``files[index]`` but never stored
    # it, leaving ``docid`` empty; the looked-up value is used as the id
    # here -- confirm whether it was meant to be post-processed.
    docid = [files[position] for position in top_positions]
    round_score = [round(scores[position], 6) for position in top_positions]

    return pd.DataFrame(data={'docID': docid, 'Score': round_score})
Exemplo n.º 34
 def searchTFidfCos(self, searchList):
     """Score the query against every vector in ``self.tfidfVec``.

     The query vector comes from ``self.makeVectorforTFidf(searchList)``;
     one ``util.cosine`` score is returned per stored vector, in order.
     """
     query_vector = self.makeVectorforTFidf(searchList)
     scores = []
     for stored_vector in self.tfidfVec:
         scores.append(util.cosine(query_vector, stored_vector))
     return scores
Exemplo n.º 35
	def related(self, documentId):
		"""Compare one stored document against every stored document.

		Returns one ``util.cosine`` score per entry of
		``self.documentVectors``, each computed against the vector stored at
		``documentId``.
		"""
		scores = []
		for other_vector in self.documentVectors:
			# The reference vector is looked up per comparison, so an empty
			# collection never touches ``documentId``.
			scores.append(util.cosine(self.documentVectors[documentId], other_vector))
		return scores
Exemplo n.º 36
        # print(my_dict[tup[0]])
        feedback[my_dict[tup[0]]] += 1

# print(feedback)
feedback = np.array(feedback)

queryVector = vectorSpace.buildQueryVector(["drill wood sharp"])

new_query = queryVector + 0.5 * feedback
new_query = list(new_query)

# feedback rating
print("TF-IDF Weighting + Cosine Similarity feedback")
scores = [
    util.cosine(new_query, documentVector)
    for documentVector in vectorSpace.documentVectors

# Indices of N largest elements in list
indices = np.argpartition(scores, -5)[-5:]

# save as (index, value)
d = {}
for i in indices:
    d[i] = scores[i]

# sort dict by value instead of key
sd = sorted(d.items(), key=lambda item: item[1], reverse=True)
# print(sd)
Exemplo n.º 37
    top5_tf_cos = sorted(list(zip(indexes, tf_cos)), reverse=True, key=sortByRatings)[:5]
    top5_tf_dist = sorted(list(zip(indexes, tf_dist)), reverse=False, key=sortByRatings)[:5]
    top5_tfidf_cos = sorted(list(zip(indexes, tfidf_cos)), reverse=True, key=sortByRatings)[:5]
    top5_tfidf_dist = sorted(list(zip(indexes, tfidf_dist)), reverse=False, key=sortByRatings)[:5]
    print('Term Frequency Weighting + Cosine Similarity:')

    print('Term Frequency Weighting + Euclidean Distance:')

    print('TF-IDF Weighting + Cosine Similarity:')

    print('TF-IDF Weighting + Euclidean Distance:')

    #create feedback-relevance vector
    newQueryIndex = indexes.index(top5_tfidf_cos[0][0])
    doc = contents[newQueryIndex]
    feedbackVector = vectorspace.getRelevanceFeedbackVector(doc)
    qfVector = queryVector+feedbackVector

    # compute the scores and re-rank
    scores = [util.cosine(qfVector, documentVector) for documentVector in vectorspace.tfidfVectors]
    relevanceFeedback = sorted(list(zip(indexes, scores)), reverse=True, key=sortByRatings)[:5]

    print('Feedback Queries + TF-IDF Weighting + Cosine Similarity:')
Exemplo n.º 38
from VectorSpace import *
import util

# NOTE(review): neither file is read in the visible code and ``descs`` is
# never filled, so the vector space is built from an empty list; the lines
# that loaded the descriptions appear to be missing -- restore them before
# relying on the output below.  The files are still opened (and now closed)
# so a missing data file fails exactly as before.
with open('data/data.txt', 'r'):
    pass

descs = []
with open('data/id3v23.txt', 'r'):
    pass

vs = VectorSpace(descs)

# print() calls work on both Python 2 and 3 for a single argument.
print(util.cosine(vs.documentVectors[0], vs.documentVectors[1]))
print(util.cosine(vs.documentVectors[1], vs.documentVectors[0]))

Exemplo n.º 39
def s3(t1, terms):
    """Return the terms that closely match ``t1``.

    A term ``t2`` is kept when it differs from ``t1``, ``t1`` is longer than
    three characters, and ``cosine(t1, t2)`` exceeds 0.8.  The result is a
    set, so duplicates in ``terms`` collapse.
    """
    return {
        candidate
        for candidate in terms
        if t1 != candidate and len(t1) > 3 and cosine(t1, candidate) > 0.8
    }