示例#1
0
 def __init__(self, documents, numDoc, docFreqDict, topK):
     """Store the ranking inputs and build the helper objects.

     Args:
         documents: Stored as ``self.documents``.
         numDoc: Stored as ``self.numDoc`` and passed to ``VectorGenerator``.
         docFreqDict: Stored as ``self.docFreqDict`` and passed to
             ``VectorGenerator``.
         topK: Stored as ``self.topK``.
     """
     # Plain configuration supplied by the caller.
     self.topK = topK
     self.numDoc = numDoc
     self.documents = documents
     self.docFreqDict = docFreqDict

     # Helper objects; the generator is created before the utility object.
     self.vectorGenerator = VectorGenerator(docFreqDict, numDoc)
     self.vectorUtil = VectorUtil()

     # No log writer until one is assigned.
     self.logWriter = None
示例#2
0
def load_vectorspace(filename):
    """Load relation vectors from a ``|``-delimited text file.

    A line is used only when it splits into exactly four fields, none of
    which is the empty string. Fields 1-3 have their spaces removed and are
    parsed with ``VectorUtil.string_to_vector``; every other line is skipped.

    Args:
        filename: Path of the file to read.

    Returns:
        dict mapping the first field of each usable line to a 3-tuple of the
        parsed vectors in column order. If a key occurs on several lines the
        last occurrence wins.
    """
    vectors = {}

    # The context manager guarantees the handle is released, including when
    # parsing a line raises.
    with open(filename, 'r') as f:
        for line in f:
            data = line.split('|')
            if len(data) != 4 or '' in data:
                continue
            a1, p, a2 = (
                VectorUtil.string_to_vector(field.replace(' ', ''))
                for field in data[1:]
            )
            vectors[data[0]] = (a1, p, a2)

    return vectors
def load_vectorspace(filename):
    """Read a ``|``-delimited file of relation vectors into a dict.

    Only lines that split into exactly four non-empty fields are used. The
    first field is the key; the remaining three have their spaces removed and
    are converted with ``VectorUtil.string_to_vector``.

    Args:
        filename: Path of the file to read.

    Returns:
        dict of ``key -> (vector1, vector2, vector3)``. Later lines overwrite
        earlier ones that share a key.
    """
    vectors = dict()

    # Use a context manager so the file is closed on every exit path.
    with open(filename, 'r') as f:
        for line in f:
            data = line.split('|')
            if len(data) == 4 and '' not in data:
                a1 = VectorUtil.string_to_vector(data[1].replace(' ', ''))
                p = VectorUtil.string_to_vector(data[2].replace(' ', ''))
                a2 = VectorUtil.string_to_vector(data[3].replace(' ', ''))
                vectors[data[0]] = (a1, p, a2)

    return vectors
示例#4
0
def main(args):
    """Run an interactive command loop over a loaded vector space.

    The vectors are loaded with ``load_vectorspace(args[1])``. Commands are
    read with ``input()`` until the user types ``quit`` or standard input is
    exhausted. Input that matches no command is ignored silently.

    Commands:
        lr [word]: write to stderr the names of all vectors, or only those
            whose ``_``-separated name contains ``word``.
        cmp a b: print ``VectorUtil.distance`` between two loaded vectors.
        pv name: print the stored entry for ``name``.
        fs x: prints 'Not implemented.'.
        ls: print how many vectors are loaded.
        help: print a short explanation of ``lr`` and ``ls``.

    Args:
        args: Argument list; ``args[1]`` is the vector file path.
    """
    msg = 'COMMANDS: lr [word] | cmp [rel1] [rel2] | pv [rel_name] | quit | ls | help\n'
    vectors = load_vectorspace(args[1])
    prompt = msg

    while True:
        try:
            line = input(prompt)
        except EOFError:
            # End of input (Ctrl-D or a finished pipe) ends the session
            # like 'quit' instead of aborting with a traceback.
            break
        if line == 'quit':
            break

        cmds = line.split(' ')

        if cmds[0] == 'lr' and len(cmds) <= 3:
            if len(cmds) == 1:
                for v in vectors:
                    sys.stderr.write(v + ' | ')
            else:
                for v in vectors:
                    if cmds[1] in v.split('_'):
                        sys.stderr.write(v + ' , ')
            sys.stderr.write('\n\n')

        elif cmds[0] == 'cmp' and len(cmds) == 3:
            # input() already strips the trailing newline, so the names can
            # be used as typed.
            first, second = cmds[1], cmds[2]
            if first in vectors and second in vectors:
                print('DISTANCE: ')
                print(
                    str(VectorUtil.distance(vectors[first], vectors[second]))
                    + '\n')

        elif cmds[0] == 'pv' and len(cmds) == 2:
            if cmds[1] in vectors:
                print(vectors[cmds[1]])

        elif cmds[0] == 'fs' and len(cmds) == 2:
            print('Not implemented.')

        elif cmds[0] == 'ls' and len(cmds) == 1:
            print('Loaded ' + str(len(vectors)) + ' vectors.')

        elif cmds[0] == 'help' and len(cmds) == 1:
            print('COMMANDS explained:')
            print(
                'lr: find relation containing the word [word]. List all relations loaded if no word is provided.'
            )
            print('ls: display amount of relations loaded.')

        prompt = 'Type other command.\n' + msg
def main(args):
    """Run an interactive command loop over a loaded vector space.

    The vectors are loaded with ``load_vectorspace(args[1])``. Commands are
    read with ``input()`` until the user types ``quit`` or standard input is
    exhausted. Input that matches no command is ignored silently.

    Commands:
        lr [word]: write to stderr the names of all vectors, or only those
            whose ``_``-separated name contains ``word``.
        cmp a b: print ``VectorUtil.distance`` between two loaded vectors.
        pv name: print the stored entry for ``name``.
        fs x: prints 'Not implemented.'.
        ls: print how many vectors are loaded.
        help: print a short explanation of ``lr`` and ``ls``.

    Args:
        args: Argument list; ``args[1]`` is the vector file path.
    """
    msg = 'COMMANDS: lr [word] | cmp [rel1] [rel2] | pv [rel_name] | quit | ls | help\n'
    vectors = load_vectorspace(args[1])
    prompt = msg

    while True:
        try:
            line = input(prompt)
        except EOFError:
            # End of input (Ctrl-D or a finished pipe) ends the session
            # like 'quit' instead of aborting with a traceback.
            break
        if line == 'quit':
            break

        cmds = line.split(' ')

        if cmds[0] == 'lr' and len(cmds) <= 3:
            if len(cmds) == 1:
                for v in vectors:
                    sys.stderr.write(v + ' | ')
            else:
                for v in vectors:
                    if cmds[1] in v.split('_'):
                        sys.stderr.write(v + ' , ')
            sys.stderr.write('\n\n')

        elif cmds[0] == 'cmp' and len(cmds) == 3:
            # input() already strips the trailing newline, so the names can
            # be used as typed.
            first, second = cmds[1], cmds[2]
            if first in vectors and second in vectors:
                print('DISTANCE: ')
                print(str(VectorUtil.distance(vectors[first], vectors[second])) + '\n')

        elif cmds[0] == 'pv' and len(cmds) == 2:
            if cmds[1] in vectors:
                print(vectors[cmds[1]])

        elif cmds[0] == 'fs' and len(cmds) == 2:
            print('Not implemented.')

        elif cmds[0] == 'ls' and len(cmds) == 1:
            print('Loaded ' + str(len(vectors)) + ' vectors.')

        elif cmds[0] == 'help' and len(cmds) == 1:
            print('COMMANDS explained:')
            print('lr: find relation containing the word [word]. List all relations loaded if no word is provided.')
            print('ls: display amount of relations loaded.')

        prompt = 'Type other command.\n' + msg
示例#6
0
    def reduce_vectors_of_file_column(self,
                                      vectors_file,
                                      rel_list_filename,
                                      col,
                                      n_dim=100,
                                      n_samples=100,
                                      step=1):
        """Reduce the vectors stored in one column of a ``|``-delimited file.

        Pass 1 reads ``vectors_file``. For every row whose first field
        (spaces removed) is among the first ``n_samples`` entries of
        ``rel_list_filename``, column ``col`` is parsed into ``self.data`` and
        the relation name is appended to ``self.labels``. Rows whose
        ``pred_root_col`` field is empty are skipped with a warning on stderr.
        A ``TruncatedSVD(n_components=n_dim)`` is then fitted on
        ``self.data``.

        Pass 2 writes every matching, non-skipped row with column ``col``
        replaced by its reduced vector:

        * ``step == 1``: rows are re-read from ``vectors_file``, the output
          comes from ``self.create_empty_output_file`` (named
          ``vectors_file + '_reduced_step1.csv'``), and the predicate column
          is rewritten from ``self.preds``.
        * ``step >= 2``: rows are read from the previous step's output file
          and written to ``vectors_file + '_reduced_step<step>.csv'``.

        ``pred_root_col`` is not defined in this method; it is resolved from
        an enclosing scope.

        Args:
            vectors_file: Path of the ``|``-delimited vectors file.
            rel_list_filename: Path of a tab-separated file whose first
                column names the relations to keep.
            col: Index of the column holding the vector to reduce.
            n_dim: Number of components for the reduction.
            n_samples: Maximum number of relation names read from the list.
            step: 1 for the first reduction, >= 2 for a chained reduction.

        Raises:
            ValueError: If ``step`` is neither 1 nor >= 2, or if a row in
                pass 2 names a relation absent from ``self.labels``.
        """
        self.labels = []
        self.data = []
        self.preds = []
        incomplete_relations = set()

        fin = open(vectors_file, 'r+')
        fout = None
        try:
            most_frequent_relations_list = []
            with open(rel_list_filename, 'r') as rel_list_file:
                sys.stderr.write('Loading most frequent relations list.\n')
                for line in rel_list_file:
                    most_frequent_relations_list.append(line.split('\t')[0])
                    if len(most_frequent_relations_list) == n_samples:
                        break

            sys.stderr.write('Done.' + str(len(most_frequent_relations_list)) +
                             '\n')

            # Only membership is tested below, so a set gives O(1) lookups.
            frequent_relations = set(most_frequent_relations_list)

            for line in fin:
                fields = line.split('|')
                vec = fields[col].replace(' ', '')
                rel = fields[0].replace(' ', '')
                if rel not in frequent_relations:
                    continue

                vec = VectorUtil.string_to_vector(vec)
                pred_str = fields[pred_root_col].replace(' ', '')

                # Test emptiness on the raw string, by value. Comparing after
                # parsing (or by identity) does not reliably detect an empty
                # field.
                if pred_str == '':
                    sys.stderr.write(
                        'WARNING: relation "' + rel +
                        '" skipped due to the absence of predicate root vector.\n'
                    )
                    incomplete_relations.add(rel)
                    continue

                if step == 1:
                    self.preds.append(VectorUtil.string_to_vector(pred_str))
                self.labels.append(rel)
                self.data.append(vec)

            sys.stderr.write(str(len(self.data)) + ' relations loaded.\n')

            if step == 1:
                sys.stderr.write('Creating new file..\n')
                fout = self.create_empty_output_file(
                    vectors_file + '_reduced_step1.csv', n_samples,
                    self.labels)
                fin.seek(0)
            elif step >= 2:
                fin.close()
                fin = open(
                    vectors_file + '_reduced_step' + str(step - 1) + '.csv',
                    'r')
                fout = open(
                    vectors_file + '_reduced_step' + str(step) + '.csv', 'w')
            else:
                # Without this there would be no output file to write to.
                raise ValueError('step must be 1 or >= 2, got ' + repr(step))

            sys.stderr.write('Reducing, please wait...\n')
            dimred = TruncatedSVD(n_components=n_dim)
            dimred.fit(self.data)
            sys.stderr.write('Reduction finished.\n')

            for line in fin:
                if line == '\n':
                    continue

                sline = line.split('|')
                rel = sline[0].replace(' ', '')

                if rel not in frequent_relations or rel in incomplete_relations:
                    continue

                if len(sline) - 1 < col:
                    # NOTE(review): only a warning; the column assignment
                    # below will still fail for such a row.
                    sys.stderr.write('WARNING: not enough columns in line\n')

                rel_pos = self.labels.index(rel)
                if step == 1:
                    sline[pred_root_col] = VectorUtil.vector_to_string(
                        self.preds[rel_pos])

                # NOTE(review): a single sample is passed as-is; confirm the
                # installed TruncatedSVD accepts this shape.
                vecred = dimred.transform(self.data[rel_pos])

                # NOTE(review): vec_write keeps its previous value (or is
                # unbound on the first row) when the result is not one row.
                if len(vecred) == 1:
                    vec_write = vecred[0]

                # The reduced vector replaces the original column.
                sline[col] = VectorUtil.vector_to_string(vec_write)

                fout.write('|'.join(sline) + '\n')
                fout.flush()
        finally:
            # Release both handles on every exit path.
            fin.close()
            if fout is not None:
                fout.close()
    def reduce_vectors_of_file_column(self, vectors_file, rel_list_filename, col, n_dim=100, n_samples=100, step=1):
        """Reduce the vectors stored in one column of a ``|``-delimited file.

        Pass 1 reads ``vectors_file``. For every row whose first field
        (spaces removed) is among the first ``n_samples`` entries of
        ``rel_list_filename``, column ``col`` is parsed into ``self.data`` and
        the relation name is appended to ``self.labels``. Rows whose
        ``pred_root_col`` field is empty are skipped with a warning on stderr.
        A ``TruncatedSVD(n_components=n_dim)`` is then fitted on
        ``self.data``.

        Pass 2 writes every matching, non-skipped row with column ``col``
        replaced by its reduced vector:

        * ``step == 1``: rows are re-read from ``vectors_file``, the output
          comes from ``self.create_empty_output_file`` (named
          ``vectors_file + '_reduced_step1.csv'``), and the predicate column
          is rewritten from ``self.preds``.
        * ``step >= 2``: rows are read from the previous step's output file
          and written to ``vectors_file + '_reduced_step<step>.csv'``.

        ``pred_root_col`` is not defined in this method; it is resolved from
        an enclosing scope.

        Args:
            vectors_file: Path of the ``|``-delimited vectors file.
            rel_list_filename: Path of a tab-separated file whose first
                column names the relations to keep.
            col: Index of the column holding the vector to reduce.
            n_dim: Number of components for the reduction.
            n_samples: Maximum number of relation names read from the list.
            step: 1 for the first reduction, >= 2 for a chained reduction.

        Raises:
            ValueError: If ``step`` is neither 1 nor >= 2, or if a row in
                pass 2 names a relation absent from ``self.labels``.
        """
        self.labels = []
        self.data = []
        self.preds = []
        incomplete_relations = set()

        fin = open(vectors_file, 'r+')
        fout = None
        try:
            most_frequent_relations_list = []
            with open(rel_list_filename, 'r') as rel_list_file:
                sys.stderr.write('Loading most frequent relations list.\n')
                for line in rel_list_file:
                    most_frequent_relations_list.append(line.split('\t')[0])
                    if len(most_frequent_relations_list) == n_samples:
                        break

            sys.stderr.write('Done.' + str(len(most_frequent_relations_list)) + '\n')

            # Only membership is tested below, so a set gives O(1) lookups.
            frequent_relations = set(most_frequent_relations_list)

            for line in fin:
                fields = line.split('|')
                vec = fields[col].replace(' ', '')
                rel = fields[0].replace(' ', '')
                if rel not in frequent_relations:
                    continue

                vec = VectorUtil.string_to_vector(vec)
                pred_str = fields[pred_root_col].replace(' ', '')

                # Test emptiness on the raw string, by value. Comparing after
                # parsing (or by identity) does not reliably detect an empty
                # field.
                if pred_str == '':
                    sys.stderr.write('WARNING: relation "' + rel + '" skipped due to the absence of predicate root vector.\n')
                    incomplete_relations.add(rel)
                    continue

                if step == 1:
                    self.preds.append(VectorUtil.string_to_vector(pred_str))
                self.labels.append(rel)
                self.data.append(vec)

            sys.stderr.write(str(len(self.data)) + ' relations loaded.\n')

            if step == 1:
                sys.stderr.write('Creating new file..\n')
                fout = self.create_empty_output_file(vectors_file + '_reduced_step1.csv', n_samples, self.labels)
                fin.seek(0)
            elif step >= 2:
                fin.close()
                fin = open(vectors_file + '_reduced_step' + str(step - 1) + '.csv', 'r')
                fout = open(vectors_file + '_reduced_step' + str(step) + '.csv', 'w')
            else:
                # Without this there would be no output file to write to.
                raise ValueError('step must be 1 or >= 2, got ' + repr(step))

            sys.stderr.write('Reducing, please wait...\n')
            dimred = TruncatedSVD(n_components=n_dim)
            dimred.fit(self.data)
            sys.stderr.write('Reduction finished.\n')

            for line in fin:
                if line == '\n':
                    continue

                sline = line.split('|')
                rel = sline[0].replace(' ', '')

                if rel not in frequent_relations or rel in incomplete_relations:
                    continue

                if len(sline) - 1 < col:
                    # NOTE(review): only a warning; the column assignment
                    # below will still fail for such a row.
                    sys.stderr.write('WARNING: not enough columns in line\n')

                rel_pos = self.labels.index(rel)
                if step == 1:
                    sline[pred_root_col] = VectorUtil.vector_to_string(self.preds[rel_pos])

                # NOTE(review): a single sample is passed as-is; confirm the
                # installed TruncatedSVD accepts this shape.
                vecred = dimred.transform(self.data[rel_pos])

                # NOTE(review): vec_write keeps its previous value (or is
                # unbound on the first row) when the result is not one row.
                if len(vecred) == 1:
                    vec_write = vecred[0]

                # The reduced vector replaces the original column.
                sline[col] = VectorUtil.vector_to_string(vec_write)

                fout.write('|'.join(sline) + '\n')
                fout.flush()
        finally:
            # Release both handles on every exit path.
            fin.close()
            if fout is not None:
                fout.close()
示例#8
0
class RankModel:
    """Rank stored documents against a query.

    Each document is read from ``fileFolder``, turned into a vector with
    ``VectorGenerator.genTFIDFVector`` and scored against the query vector
    with ``VectorUtil.cosineSimilarity``. ``setFileFolder`` and
    ``setLogWriter`` must be called before ``dealWithQuery``.
    """

    def __init__(self, documents, numDoc, docFreqDict, topK):
        """Store the ranking inputs and build the helper objects.

        Args:
            documents: Mapping of document id to a record that has a
                ``"docName"`` entry (the file name inside the file folder).
            numDoc: Passed to ``VectorGenerator`` together with
                ``docFreqDict``.
            docFreqDict: Passed to ``VectorGenerator``.
            topK: Number of top-ranked results returned by ``dealWithQuery``.
        """
        self.documents = documents
        self.numDoc = numDoc
        self.docFreqDict = docFreqDict
        self.topK = topK
        self.vectorGenerator = VectorGenerator(docFreqDict, numDoc)
        self.vectorUtil = VectorUtil()
        self.logWriter = None

    def setFileFolder(self, fileFolderName):
        """Set the folder documents are read from, ensuring a trailing '/'."""
        if not fileFolderName.endswith("/"):
            fileFolderName = fileFolderName + "/"
        self.fileFolder = fileFolderName

    def setLogWriter(self, logWriter):
        """Set the object whose ``write`` method records each query result."""
        self.logWriter = logWriter

    # clean the original data
    def clean(self, content):
        """Strip markup and punctuation from ``content``.

        Removes ``<...>`` tags, apostrophes, periods, newlines and tabs, in
        that order.

        Returns:
            Tuple ``(cleaned_text, terms)`` where ``terms`` is the cleaned
            text split on single spaces.
        """
        # Raw strings keep the backslashes for the regex engine instead of
        # relying on Python leaving unknown string escapes untouched.
        content = re.sub(r"<.+?>", "", content)
        content = re.sub(r"'", "", content)
        content = re.sub(r"\.", "", content)
        content = re.sub(r"\n", "", content)
        content = re.sub(r"\t", "", content)
        terms = content.split(" ")
        return content, terms

    # deal with a query
    def dealWithQuery(self, query):
        """Score every document against ``query`` and log the outcome.

        Args:
            query: Raw query text.

        Returns:
            Tuple ``(cleanQueryTerms, top)`` where ``top`` is the first
            ``self.topK`` ``(documentID, similarity)`` pairs sorted by
            descending similarity.

        Raises:
            Exception: If no log writer has been set. The check runs after
                all documents have been scored.
        """
        rankResult = dict()
        contributionDict = dict()
        cleanQuery, cleanQueryTerms = self.clean(query)  # transform query

        # calculate similarity between current query and each document
        for documentID, record in self.documents.items():
            documentName = record["docName"]
            # Close each document as soon as it has been read.
            with open(self.fileFolder + documentName) as documentFile:
                docContent = documentFile.read()
            cleanContent, cleanTerms = self.clean(docContent)

            # build a bag of words for current query & current document
            bagOfWords = list()
            bagOfWords.extend(cleanTerms)
            bagOfWords.extend(cleanQueryTerms)

            # generate tf-idf vector for current query and current document
            # NOTE(review): the query vector is built from the raw query,
            # not cleanQuery - confirm this is intended.
            vecQuery = self.vectorGenerator.genTFIDFVector("QUERY", query, bagOfWords)
            vecDocument = self.vectorGenerator.genTFIDFVector("DOCUMENT", cleanContent, bagOfWords)

            # calculate the cosine similarity
            cosSimilarity, contributionVec = self.vectorUtil.cosineSimilarity(vecQuery, vecDocument)
            rankResult[documentID] = cosSimilarity
            # Repeated words keep the contribution of their last occurrence.
            contributionDict[documentID] = dict(zip(bagOfWords, contributionVec))

        # sort the ranking result (DESC)
        rankResult = sorted(rankResult.items(), key=lambda pair: pair[1], reverse=True)

        # write log
        if self.logWriter is None:
            raise Exception("Please set your log writer!")
        self.logWriter.write(query, cleanQueryTerms, rankResult, contributionDict)

        return cleanQueryTerms, rankResult[:self.topK]
import VectorUtil
import math
from VectorLength import vector_length
from ScalarProduct import scalar_product


def angle_in_between(u, v):
    """Print the worked computation of the angle between ``u`` and ``v``.

    The cosine is ``scalar_product(u, v)`` divided by the product of the two
    ``vector_length`` values; the angle is its arc cosine.

    Args:
        u: First vector, in whatever form ``scalar_product`` and
            ``vector_length`` accept.
        v: Second vector, same form.

    Returns:
        The angle in radians.

    Raises:
        ZeroDivisionError: If either length is zero.
    """
    product = scalar_product(u, v)
    print()
    ulen = vector_length(u)
    print()
    vlen = vector_length(v)
    print()
    print("cos(γ) = u*v / |u|*|v|")
    print(product, " / (", ulen, " * ", vlen, ")")
    print(product, " / (", ulen * vlen, ")")
    cos = product/(ulen * vlen)
    # Rounding can leave the quotient a hair outside [-1, 1] for (anti)parallel
    # vectors, which math.acos rejects. Explicit comparisons (rather than
    # min/max) leave a NaN untouched.
    if cos > 1.0:
        cos = 1.0
    elif cos < -1.0:
        cos = -1.0
    print("cos(γ) = ", cos)
    print("γ = arccos(cos(γ))")
    gamma = math.acos(cos)
    print("γ = ", gamma, "rad ~", math.degrees(gamma), "°")
    return gamma


if __name__ == "__main__":
    # Script entry: prompt for both vectors, then show the worked angle.
    u, v = (VectorUtil.read_vector(label) for label in ("Input U:", "Input V:"))
    angle_in_between(u, v)
示例#10
0
from OrtogonalVector import ortogonal_vector
from ScalarProduct import scalar_product
import VectorUtil


def triple_product(a, b, c):
    """Print and return the triple product ``[a, b, c]``.

    The value is ``scalar_product(ortogonal_vector(a, b), c)``, which the
    printed formula describes as ``(a×b)·c``.

    Args:
        a: First vector.
        b: Second vector.
        c: Third vector.

    Returns:
        The value returned by ``scalar_product``.
    """
    print("[a, b, c] = (a×b)·c ")
    result = scalar_product(ortogonal_vector(a, b), c)

    print("[a, b, c] =", result)
    return result


if __name__ == "__main__":
    # Script entry: prompt for the three vectors in order, then compute.
    a, b, c = (
        VectorUtil.read_vector(label)
        for label in ("Input A:", "Input B:", "Input C:")
    )
    triple_product(a, b, c)