Example #1
File: Models.py, Project: jsnajder/derivsem
def get_neighbors(vector, space, n_neighbors=5, pos=None):
    if pos is not None:
        space = space_pos_filter(space, pos)
    targets = space.id2row
    if n_neighbors is None:
        n_neighbors = len(targets)
    n_neighbors = min(n_neighbors, len(targets))
    sims_to_matrix = CosSimilarity().get_sims_to_matrix(vector, space.cooccurrence_matrix)
    sorted_perm = sims_to_matrix.sorted_permutation(sims_to_matrix.sum, 1)
    return [(space.id2row[i], sims_to_matrix[i, 0]) for i in sorted_perm[:n_neighbors]]
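
A minimal usage sketch for get_neighbors above, assuming a DISSECT Space pickled at a hypothetical path and a query word that exists in it:

# hedged sketch; the path and the word "car-n" are placeholders
from composes.utils import io_utils

space = io_utils.load("./data/out/example_space.pkl")
vector = space.get_row("car-n")
# five nearest neighbours of "car-n" by cosine similarity
print get_neighbors(vector, space, n_neighbors=5)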
class MixedCompositionalTreeKernel(SyntacticTreeKernel):
    '''
    Mixed Salad Kernel 2 variation
    '''
    
    kernel_name = "mixed_salad_kernel22"


    def __init__(self, lambda_):
        '''
        Constructor
        '''
        self._lambda = lambda_
        self._measure = CosSimilarity()
        
    # default dot product, delegated to SyntacticTreeKernel
    def dot_product(self, tree1, tree2):
        assert_type(tree1, SemanticTree)
        assert_type(tree2, SemanticTree)
        return super(MixedCompositionalTreeKernel, self).dot_product(tree1, tree2)
    
    # new delta computation, overriding SyntacticTreeKernel._delta
    def _delta(self, node1, node2, node2id1, node2id2, delta_matrix):
        
        if (node1.is_terminal() and node2.is_terminal() 
            and node1._label == node2._label 
            and node1._word == node2._word): 
                delta_matrix[node2id1[node1],node2id2[node2]] = 1
        elif not node1.has_same_production(node2):
            if node1._label != node2._label:
                delta_matrix[node2id1[node1],node2id2[node2]] = 0
            else:
                delta_matrix[node2id1[node1],node2id2[node2]] = self._measure.get_sim(node1._vector, node2._vector)
        else:
            product_children_delta = self._lambda 
            for i in xrange(len(node1._children)):
                child1 = node1.get_child(i)
                child2 = node2.get_child(i)
                child_delta = delta_matrix[node2id1[child1],node2id2[child2]]
                if child_delta == -1:
                    raise ValueError("???")
                else:
                    product_children_delta *= (1 + child_delta)
            
            sim_children_product = 1
            for i in xrange(len(node1._children)):
                child1 = node1.get_child(i)
                child2 = node2.get_child(i)
                sim_children_product *= self._measure.get_sim(child1._vector, child2._vector)
                
            final_delta = (product_children_delta + 
                           (self._measure.get_sim(node1._vector, node2._vector) - 
                             self._lambda * sim_children_product))
             
            delta_matrix[node2id1[node1],node2id2[node2]] = final_delta
class NaiveCompositionalSemanticTreeKernel(SyntacticTreeKernel):
    """
    Mixed Salad Kernel 1
    """

    kernel_name = "mixed_salad_kernel1"

    NO_COMPATIBILITY = 0
    LABEL_COMPATIBILITY = 1

    def __init__(self, lambda_, compatibility_level=LABEL_COMPATIBILITY):
        """
        Constructor
        """
        self._lambda = lambda_
        self._compatibility_level = compatibility_level
        self._measure = CosSimilarity()

    def dot_product(self, tree1, tree2):
        assert_type(tree1, SemanticTree)
        assert_type(tree2, SemanticTree)
        return super(NaiveCompositionalSemanticTreeKernel, self).dot_product(tree1, tree2)

    def _delta(self, node1, node2, node2id1, node2id2, delta_matrix):
        delta = 0
        if self._compatibility_level == NaiveCompositionalSemanticTreeKernel.NO_COMPATIBILITY or (
            self._compatibility_level == NaiveCompositionalSemanticTreeKernel.LABEL_COMPATIBILITY
            and node1._label == node2._label
        ):
            delta = (self._lambda ** (node1.get_height() + node2.get_height())) * self._measure.get_sim(
                node1._vector, node2._vector
            )
        delta_matrix[node2id1[node1], node2id2[node2]] = delta
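
A hedged usage sketch for the two kernels above; tree1 and tree2 stand for already-built SemanticTree instances, whose construction depends on the surrounding project and is not shown here:

mixed_kernel = MixedCompositionalTreeKernel(lambda_=0.5)
naive_kernel = NaiveCompositionalSemanticTreeKernel(
    lambda_=0.5,
    compatibility_level=NaiveCompositionalSemanticTreeKernel.LABEL_COMPATIBILITY)

# kernel values between two semantic trees (placeholders)
print mixed_kernel.dot_product(tree1, tree2)
print naive_kernel.dot_product(tree1, tree2)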
Example #4
def compute_neighbours(in_file, no_neighbours, out_dir, sim_measure,
                       space_files):
    sim_dict = {
        "cos": CosSimilarity(),
        "lin": LinSimilarity(),
        "dot_prod": DotProdSimilarity(),
        "euclidean": EuclideanSimilarity()
    }

    if sim_measure not in sim_dict:
        raise ValueError("Similarity measure:%s not defined" % sim_measure)

    space = io_utils.load(space_files[0], Space)
    space2 = None
    space_descr = ".".join(space_files[0].split("/")[-1].split(".")[0:-1])
    if len(space_files) == 2:
        space2 = io_utils.load(space_files[1], Space)
        space_descr = ".".join([space_descr] +
                               space_files[1].split("/")[-1].split(".")[0:-1])

    sim = sim_dict[sim_measure]

    descr = ".".join(["NEIGHBOURS", in_file.split("/")[-1], space_descr])
    out_file = '%s/%s.%s' % (out_dir, descr, sim_measure)
    io_utils.create_parent_directories(out_file)

    data = io_utils.read_list(in_file)

    print("Computing neighbours: %s" % sim_measure)
    with open(out_file, "w") as out_stream:
        for word in data:
            out_stream.write("%s\n" % word)
            result = space.get_neighbours(word, no_neighbours, sim, space2)
            for neighbour, neighbour_sim in result:
                out_stream.write("\t%s %s\n" % (neighbour, neighbour_sim))
Example #6
def functionneighbours(words, number):
    # load a space; the variant is chosen by the script's second CLI argument
    if sys.argv[2] == 'full':
        my_space = io_utils.load("./data/out/thesisfull.pkl")
    elif sys.argv[2] == 'nonzero':
        my_space = io_utils.load("./data/out/thesis.pkl")
    else:
        raise ValueError("Expected 'full' or 'nonzero' as the space argument")

    return my_space.get_neighbours(words, number, CosSimilarity())
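
functionneighbours reads the space variant from sys.argv[2], so it only works inside a script invoked with 'full' or 'nonzero' as its second argument; within such a run, a hedged call would be:

# "car-n" is a placeholder target word
print functionneighbours("car-n", 5)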
Example #8
def main():
    """
    cosWeeds - as described in:
    A. Lenci and G. Benotto. 2012. Identifying hypernyms in distributional semantic spaces. In *SEM
    Weeds Precision - as described in:
    J. Weeds and D. Weir. 2003. A general framework for distributional similarity. In EMNLP.
    """

    # Get the arguments
    args = docopt(
        """Compute cosWeeds Precision for a list of (x, y) pairs and save their scores.

    Usage:
        cosWeeds.py <testset_file> <dsm_prefix> <output_file>

        <testset_file> = a file containing term-pairs, labels and relations, each line in the form 
                         of x\ty\tlabel\trelation
        <dsm_prefix> = the prefix for the pkl files for the vector space
        <output_file> = where to save the results: a tab separated file with x\ty\tlabel\trelation\tscore,
                        where the score is cosWeeds (for y as the hypernym of x).
    """)

    testset_file = args['<testset_file>']
    dsm_prefix = args['<dsm_prefix>']
    output_file = args['<output_file>']

    # Load the term-pairs
    with codecs.open(testset_file) as f_in:
        test_set = [tuple(line.strip().split('\t')) for line in f_in]

    # Load the vector space
    vector_space = load_pkl_files(dsm_prefix)

    target_index = {w: i for i, w in enumerate(vector_space.id2row)}

    cooc_mat = vector_space.cooccurrence_matrix

    # Compute the score for each term
    with codecs.open(output_file, 'w', 'utf-8') as f_out:

        for (x, y, label, relation) in test_set:

            x_index, y_index = target_index.get(x, -1), target_index.get(y, -1)
            cosWeeds = 0.0

            if x_index > -1 and y_index > -1:
                x_row, y_row = cooc_mat[x_index, :], cooc_mat[y_index, :]
                score = weeds_prec(x_row, y_row)
                cosine = vector_space.get_sim(x, y, CosSimilarity())
                cosWeeds = math.sqrt(cosine * score)

            print >> f_out, '\t'.join((x, y, label, '%.5f' % cosWeeds))
Example #9
def inspect_representations(path_composed_emb, output_path):
    print('Inspecting representations...')
    composed_space = Space.build(data=path_composed_emb, format='dm')
    f = codecs.open(output_path, 'w', 'utf8')
    # inspect only the first 1000 words
    word_list = list(composed_space.get_row2id())
    for w in word_list[:1000]:
        neighbours = composed_space.get_neighbours(w, 10, CosSimilarity())

        f.write('Neighbours for ' + w + '\n')
        f.write("\n".join('%s %.6f' % x for x in neighbours))
        f.write('\n----------------------------\n')
    f.close()
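
A hypothetical call; both paths are placeholders:

inspect_representations("./data/out/composed_vectors.dm",
                        "./data/out/neighbours.txt")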
Example #10
def compute_sim(in_file, columns, out_dir, sim_measures, space_files):

    sim_dict = {
        "cos": CosSimilarity(),
        "lin": LinSimilarity(),
        "dot_prod": DotProdSimilarity(),
        "euclidean": EuclideanSimilarity()
    }

    if len(columns) != 2:
        raise ValueError("Column description unrecognized!")
    col0 = int(columns[0]) - 1
    col1 = int(columns[1]) - 1

    try:
        space = io_utils.load(space_files[0], Space)
    except TypeError:
        warn("Not a Space instance in file: %s" % space_files[0])
        return

    space2 = None
    space_descr = ".".join(space_files[0].split("/")[-1].split(".")[0:-1])

    if len(space_files) == 2:
        space2 = io_utils.load(space_files[1], Space)
        space_descr = ".".join([space_descr] +
                               space_files[1].split("/")[-1].split(".")[0:-1])

    descr = ".".join(["SIMS", in_file.split("/")[-1], space_descr])

    for sim_measure in sim_measures:
        print("Computing similarities: %s" % sim_measure)
        if sim_measure not in sim_dict:
            warn("Similarity measure:%s not defined" % sim_measure)
            continue

        sim = sim_dict[sim_measure]
        out_file = '%s/%s.%s' % (out_dir, descr, sim_measure)
        io_utils.create_parent_directories(out_file)

        with open(in_file) as in_stream, open(out_file, "w") as out_stream:
            for line in in_stream:
                if not line.strip() == "":
                    elems = line.strip().split()
                    word1 = elems[col0]
                    word2 = elems[col1]

                    predicted_sim = space.get_sim(word1, word2, sim, space2)
                    out_stream.write("%s %s\n" %
                                     (line.strip(), str(predicted_sim)))
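
A hypothetical invocation of compute_sim; paths, columns and measures are placeholders:

compute_sim("./data/in/word_pairs.txt",        # input pair file
            ["1", "2"],                        # 1-based columns of the two words
            "./data/out",                      # output directory
            ["cos", "lin"],                    # measures to compute
            ["./data/out/core_space.pkl"])     # one or two pickled spaces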
Example #11
def main():
    """
    Cosine similarity
    """

    # Get the arguments
    args = docopt(
        """Compute cosine for a lis of (x, y) pairs and save their scores.

    Usage:
        cosine.py <testset_file> <dsm_prefix> <output_file>

        <testset_file> = a file containing term-pairs, labels and relations, each line in the form of 
                         x\ty\tlabel\trelation
        <dsm_prefix> = the prefix for the pkl files for the vector space
        <output_file> = where to save the results: a tab separated file with x\ty\tlabel\trelation\tscore,
                        where the score is cosine (symmetric measure).
    """)

    testset_file = args['<testset_file>']
    dsm_prefix = args['<dsm_prefix>']
    output_file = args['<output_file>']

    # Load the term-pairs
    with codecs.open(testset_file) as f_in:
        test_set = [tuple(line.strip().split('\t')) for line in f_in]

    # Load the vector space
    vector_space = load_pkl_files(dsm_prefix)

    target_index = {w: i for i, w in enumerate(vector_space.id2row)}

    # Compute the score for each term
    with codecs.open(output_file, 'w', 'utf-8') as f_out:

        for (x, y, label, relation) in test_set:

            x_index, y_index = target_index.get(x, -1), target_index.get(y, -1)
            cosine = 0.0

            if x_index > -1 and y_index > -1:
                cosine = vector_space.get_sim(x, y, CosSimilarity())

            print >> f_out, '\t'.join((x, y, label, '%.5f' % cosine))
Example #12
def getThesaurus(word):
    if isinstance(word, unicode):
        word = word.encode('utf-8')
    else:
        # make sure the byte string is valid UTF-8 before using it
        word.decode('utf-8')

    # find synonyms in chilin
    for line in open(THES_PATH + 'chilin-zh-TW.csv'):
        synonyms = line.split()
        if word in synonyms:
            break

    # calculate word similarity
    word_sim_dict = {}
    my_space = Space.build(data=THES_PATH + 'sm',
                           rows=THES_PATH + 'words.rows',
                           cols=THES_PATH + 'cols',
                           format='sm')
    for row in open(THES_PATH + 'words.rows'):
        word1 = row.strip()
        sim = my_space.get_sim(word1, word, CosSimilarity())
        if sim > .3:
            word_sim_dict[word1] = sim

    # rank first those overlapping with chilin synonyms
    word_sim_list = []
    if word in word_sim_dict:
        word_sim_dict.pop(word)
        for key in word_sim_dict.keys():
            if key in synonyms:
                word_sim_dict.pop(key)
                word_sim_list += [key]

        # sort the rest of words
        d = sorted(word_sim_dict.items(), key=lambda x: x[1], reverse=True)
        word_sim_list += [word for word, sim in d]

        word_sim_list = word_sim_list[:9]
    return word_sim_list
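
A minimal usage sketch; the query word is a placeholder and must occur in words.rows:

for synonym in getThesaurus(u'汽車'):
    print synonym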
class SentenceVectorKernel(Kernel):
    '''
    classdocs
    '''
    kernel_name = "sentence_vector_kernel"

    def __init__(self, similarity=None):
        '''
        Constructor
        '''
        if similarity is None:
            self._similarity = CosSimilarity()
        else:
            self._similarity = similarity
    
    def dot_product(self, tree1, tree2):
        assert_type(tree1, SemanticTree)
        assert_type(tree2, SemanticTree)
        sentence_vector1 = tree1._root._vector
        sentence_vector2 = tree2._root._vector
        if sentence_vector1.norm() == 0.0 or sentence_vector2.norm() == 0.0:
            return 0.0
        else:
            return self._similarity.get_sim(sentence_vector1, sentence_vector2)
# ---------------------------------------------------------------------------
# The remainder of this example is a fragment of a different function (a
# lexical-substitution routine); its opening lines, including the branch that
# matches the `else` below, are missing from the original listing.
# ---------------------------------------------------------------------------
        else:
            for x in right_context_words:
                left_unison = left_unison.multiply(final_model.get_row(x))

            base_unison = left_unison

    #print "Three"
    # Create a vector having context words and word to replace.
    if add:
        context_word_vector = base_unison + final_model.get_row(word)
    else:
        context_word_vector = base_unison.multiply(final_model.get_row(word)) if base_unison is not None else final_model.get_row(word)

    #print "Four"
    results = {}
    cos_sim = CosSimilarity()
    
    #############################################################################
    # If we simply get the nearest neighbours of the actual context word.
    #############################################################################
    if no_rerank:
        results = final_model.get_xneighbours(context_word_vector, 10, cos_sim)
        return (word, map(lambda x: x[0][:-2], results))

    #############################################################################
    # Get the list of the similar words to the given vector.
    #############################################################################
    antonyms = big_thesaurus.antonyms(word)
    replacements = []
    if thesaurus > 0.0:
        synonyms = big_thesaurus.replacements(word)
Example #15
def computeAnalogy(w1,w2,w3):
	composed_space = sub.compose([(w1,w2, "step1")], space)
	composed_space2 = add.compose([("step1", w3, "step2")], (composed_space,space))
	guess=composed_space2.get_neighbours("step2", 1, CosSimilarity(),space)
	return guess
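
An illustrative call, assuming space, add and sub are module-level objects defined as in the next example, and that all three words exist in the space (the word forms are placeholders):

print computeAnalogy("king", "man", "woman")  # expected nearest neighbour: "queen"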
Example #16
##########################################################################

from composes.utils import io_utils
from composes.composition.weighted_additive import WeightedAdditive
from composes.similarity.cos import CosSimilarity
import sys



pkl=sys.argv[1]
base=sys.argv[2]
minus=sys.argv[3]
plus=sys.argv[4]

space = io_utils.load(pkl)

# instantiate an additive and subtractive model
add = WeightedAdditive(alpha = 1, beta = 1)
sub = WeightedAdditive(alpha = 1, beta = -1)


#print space.get_neighbours(base, 10, CosSimilarity())

print "Subtracting",minus,"from",base
composed_space = sub.compose([(base, minus, "step1")], space)
#print composed_space.get_neighbours("step1", 10, CosSimilarity(),space)

print "Adding",plus,"..."
composed_space2 = add.compose([("step1", plus, "step2")], (composed_space,space))
print composed_space2.get_neighbours("step2", 10, CosSimilarity(),space)
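
A hypothetical invocation of this script (the script name and pickle path are placeholders):

# python analogy.py ./data/out/core_space.pkl king man woman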
Example #17
def main():
    """
    Compute k nearest neighbors for targets.
    """

    # Get the arguments
    args = docopt("""Compute  k nearest neighbors for targets.

    Usage:
        knn.py <spacePrefix1> <k> <outPath> [<testset> <co>]

        <spacePrefix1> = path to pickled space without suffix
        <testset> = path to file with tab-separated word pairs
        <co> = column index for targets
        <k> = parameter k (k nearest neighbors)
        <outPath> = output path for result file

    Note:
        ...
        
    """)
    
    spacePrefix1 = args['<spacePrefix1>']
    testset = args['<testset>']
    co = int(args['<co>']) if args['<co>'] is not None else None
    outPath = args['<outPath>']
    k = int(args['<k>'])
    
    logging.config.dictConfig({'version': 1, 'disable_existing_loggers': True,})
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()    

    # Load spaces
    space1 = load_pkl_files(spacePrefix1)

    if testset is not None:
        with codecs.open(testset, 'r', 'utf8') as f_in:
            targets = [line.strip().split('\t')[co] for line in f_in]
    else:
        # If no test set is provided, compute values for all targets occurring in both spaces
        targets = [target.decode('utf8') for target in space1.get_row2id()]
    
    target2neighbors = {}
    for i,t1 in enumerate(targets):
        
        try:
            neighbors1 = space1.get_neighbours(t1.encode('utf8'), k, CosSimilarity())
            # drop the first neighbour, which is the target word itself
            del neighbors1[0]
        except KeyError:
            neighbors1 = [('nan',float('nan'))]
            
        target2neighbors[t1] = neighbors1
               

    with codecs.open(outPath +'.csv', 'w', 'utf-8') as f_out:
        for t1 in targets:
            # Convert cosine similarity to cosine distance, export nearest neighbors
            print >> f_out, t1+'\t'+' '.join([str((n,1-v)) for (n,v) in target2neighbors[t1]])

    logging.info("--- %s seconds ---" % (time.time() - start_time))                   
Example #18
def main():
    """
    Compute local neighborhood distance for target pairs from two vector spaces.
    """

    # Get the arguments
    args = docopt(
        """Compute local neighborhood distance for target pairs from two vector spaces.

    Usage:
        lnd.py [(-f | -s)] <spacePrefix1> <spacePrefix2> <k> <outPath> [<testset>]

        <spacePrefix1> = path to pickled space without suffix
        <spacePrefix2> = path to pickled space without suffix
        <testset> = path to file with tab-separated word pairs
        <k> = parameter k (k nearest neighbors)
        <outPath> = output path for result file

    Options:
        -f, --fst   write only first target in output file
        -s, --scd   write only second target in output file
        
    """)

    is_fst = args['--fst']
    is_scd = args['--scd']
    spacePrefix1 = args['<spacePrefix1>']
    spacePrefix2 = args['<spacePrefix2>']
    testset = args['<testset>']
    outPath = args['<outPath>']
    k = int(args['<k>'])

    logging.config.dictConfig({
        'version': 1,
        'disable_existing_loggers': True,
    })
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Load spaces
    space1 = load_pkl_files(spacePrefix1)
    space2 = load_pkl_files(spacePrefix2)

    if testset is not None:
        # target vectors in first/second column are computed from space1/space2
        with codecs.open(testset, 'r', 'utf8') as f_in:
            targets = [(line.strip().split('\t')[0],
                        line.strip().split('\t')[1]) for line in f_in]
    else:
        # If no test set is provided, compute values for all targets occurring in both spaces
        target_intersection = set([
            target.decode('utf8') for target in space1.get_row2id()
        ]).intersection(
            [target.decode('utf8') for target in space2.get_row2id()])
        targets = zip(target_intersection, target_intersection)

    scores = {}
    neighborUnionSizes = {}
    for i, (t1, t2) in enumerate(targets):

        # Get nearest neighbors
        try:
            neighbors1 = space1.get_neighbours(t1.encode('utf8'), k,
                                               CosSimilarity())
            neighbors2 = space2.get_neighbours(t2.encode('utf8'), k,
                                               CosSimilarity())
        except KeyError:
            scores[(t1, t2)] = 'nan'
            neighborUnionSizes[(t1, t2)] = 'nan'
            continue

        neighborUnion = list(
            set([
                a for (a, b) in neighbors1 + neighbors2
                if (a in space1.row2id and a in space2.row2id and a not in
                    [t1.encode('utf8'), t2.encode('utf8')])
            ]))

        simVec1 = [
            space1.get_sim(t1.encode('utf8'), n, CosSimilarity())
            for n in neighborUnion
        ]
        simVec2 = [
            space2.get_sim(t2.encode('utf8'), n, CosSimilarity())
            for n in neighborUnion
        ]

        # Compute cosine distance of vectors
        distance = spatial.distance.cosine(simVec1, simVec2)
        scores[(t1, t2)] = distance
        neighborUnionSizes[(t1, t2)] = len(neighborUnion)

    with codecs.open(outPath + '.csv', 'w', 'utf-8') as f_out:
        for (t1, t2) in targets:
            if is_fst:  # output only first target string
                print >> f_out, '\t'.join((t1, str(float(scores[(t1, t2)])),
                                           str(neighborUnionSizes[(t1, t2)])))
            elif is_scd:  # output only second target string
                print >> f_out, '\t'.join((t2, str(float(scores[(t1, t2)])),
                                           str(neighborUnionSizes[(t1, t2)])))
            else:  # standard outputs both target strings
                print >> f_out, '\t'.join(
                    ('%s,%s' % (t1, t2), str(float(scores[(t1, t2)])),
                     str(neighborUnionSizes[(t1, t2)])))

    logging.info("--- %s seconds ---" % (time.time() - start_time))
Example #19
#similarity.py
#USAGE: python similarity [space file] [word1] [word2]
#EXAMPLE: python similarity.py ~/UkWac/dissect/ANs/ANs.pkl car_n dog_n
#-------
from composes.utils import io_utils
from composes.similarity.cos import CosSimilarity
import sys

#load a space
my_space = io_utils.load(sys.argv[1])

#print my_space.cooccurrence_matrix
#print my_space.id2row

#compute similarity between two words in the space
print "The similarity of", sys.argv[2], "and", sys.argv[
    3], "is:", my_space.get_sim(sys.argv[2], sys.argv[3], CosSimilarity())
Example #20
#ex08.py
#-------
from composes.utils import io_utils
from composes.similarity.cos import CosSimilarity

#load a space
my_space = io_utils.load("./data/out/ex01.pkl")

#get the top 2 neighbours of "car"
print my_space.get_neighbours("car", 2, CosSimilarity())
Example #21
#kneighbours.py
#USAGE: python kneighbours [space file] [word] [k]
#EXAMPLE: python2.7 kneighbours.py ~/UkWac/dissect-data/ANs/out/CORE_SS.ans.ppmi.row.pkl car-n 30
#-------
from composes.utils import io_utils
from composes.similarity.cos import CosSimilarity
import sys

#load a space
my_space = io_utils.load(sys.argv[1])

#get the top 2 neighbours of "car"
print my_space.get_neighbours(sys.argv[2], int(sys.argv[3]), CosSimilarity())
Example #23
word_pairs = io_utils.read_tuple_list(fname, fields=[0, 1])

lengths = []
for wp in word_pairs:
    # reset the flag for each pair so one missing pair does not skip the rest
    found = True
    try:
        v1 = my_space.get_row(wp[0])
        v2 = my_space.get_row(wp[1])
    except KeyError:
        #print wp[0],"or",wp[1],"not found"
        found = False
    if found:
        composed_space = add.compose([(wp[0], wp[1], "_composed_")], my_space)
        neighbours = composed_space.get_neighbours("_composed_",
                                                   10,
                                                   CosSimilarity(),
                                                   space2=my_space)
        print wp[0], wp[1]
        print neighbours
        density = 0
        for n in neighbours:
            density += n[1]
        density = density / 10
        print "Density", density
        c = composed_space.get_row("_composed_")
        print "Norm ", c.norm()
        cos = composed_space.get_sim("_composed_",
                                     wp[1],
                                     CosSimilarity(),
                                     space2=my_space)
        print "Cos ", cos
Example #24
#ex07.py
#-------
from composes.utils import io_utils
from composes.similarity.cos import CosSimilarity

#load two spaces
my_space = io_utils.load("./data/out/ex01.pkl")
my_per_space = io_utils.load("./data/out/PER_SS.ex05.pkl")

print(my_space.id2row)
print(my_per_space.id2row)

#compute similarity between a word and a phrase in the two spaces
print(
    my_space.get_sim("car", "sports_car", CosSimilarity(),
                     space2=my_per_space))
Example #26
print "Training Lexical Function composition model..."
comp_model = LexicalFunction(learner=RidgeRegressionLearner(param=2))
comp_model.train(train_data, space, per_space)

print "Composing phrases..."
test_phrases_file = data_path + "ML08nvs_test.txt"
test_phrases = io_utils.read_tuple_list(test_phrases_file, fields=[0, 1, 2])
composed_space = comp_model.compose(test_phrases, space)

print "Reading similarity test data..."
test_similarity_file = data_path + "ML08data_new.txt"
test_pairs = io_utils.read_tuple_list(test_similarity_file, fields=[0, 1])
gold = io_utils.read_list(test_similarity_file, field=2)

print "Computing similarity with lexical function..."
pred = composed_space.get_sims(test_pairs, CosSimilarity())

#use this composed space to assign similarities
print "Scoring lexical function..."
print scoring_utils.score(gold, pred, "spearman")

print "Training Full Additive composition model..."
comp_model = FullAdditive(learner=RidgeRegressionLearner(param=2))
comp_model.train(train_data, space, per_space)
composed_space = comp_model.compose(test_phrases, space)
pred = composed_space.get_sims(test_pairs, CosSimilarity())
print scoring_utils.score(gold, pred, "spearman")

print "Training Weighted Additive composition model..."
comp_model = WeightedAdditive()
comp_model.train(train_data, space, per_space)
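
The listing breaks off after training the Weighted Additive model; following the pattern of the Lexical Function and Full Additive blocks above, the scoring step would presumably continue as:

composed_space = comp_model.compose(test_phrases, space)
pred = composed_space.get_sims(test_pairs, CosSimilarity())
print scoring_utils.score(gold, pred, "spearman")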
Example #27
#ex09.py
#-------
from composes.utils import io_utils
from composes.similarity.cos import CosSimilarity

#load two spaces
my_space = io_utils.load("./data/out/ex01.pkl")
my_per_space = io_utils.load("./data/out/PER_SS.ex05.pkl")

print(my_space.id2row)
print(my_space.cooccurrence_matrix)
print(my_per_space.id2row)
print(my_per_space.cooccurrence_matrix)

#get the top two neighbours of "car" in a peripheral space
print(my_space.get_neighbours("car", 2, CosSimilarity(), space2=my_per_space))
Example #28
File: dissect.py, Project: DariaRyzhova/phd
        els_for_comp.append(element)
    return els_for_comp


typ_space = create_space(TypDmFile, TypRowsFile)
distr_space = create_space(DistrDmFile, DistrRowsFile)

#load a space from a pickle file
#my_space = io_utils.load("./sharp/lexfunc/lexfunc_Ridge_pract.pkl")

#distributional vectors processing
distr_space = distr_space.apply(PpmiWeighting())
distr_space = distr_space.apply(Svd(300))
#io_utils.save(distr_space, "./spaces/smooth_phrases_ppmi.pkl")

items = items_from_file(itemsFile)
els_for_comp = elements_for_composition(items)

my_comp = WeightedAdditive(alpha=1, beta=1)
distr_space = my_comp.compose(els_for_comp, distr_space)

# avoid shadowing the pairs() helper with its own result
word_pairs = pairs(items)

predicted = distr_space.get_sims(word_pairs, CosSimilarity())
gold = typ_space.get_sims(word_pairs, CosSimilarity())

#compute correlations
print "Spearman"
print scoring_utils.score(gold, predicted, "spearman")
print "Pearson"
print scoring_utils.score(gold, predicted, "pearson")
Example #29
File: ex20.py, Project: totonac/dissect
#ex20.py
#-------
from composes.utils import io_utils
from composes.utils import scoring_utils
from composes.similarity.cos import CosSimilarity

#read in a space
my_space = io_utils.load("data/out/ex01.pkl")

#compute similarities of a list of word pairs
fname = "data/in/word_sims.txt"
word_pairs = io_utils.read_tuple_list(fname, fields=[0, 1])
predicted = my_space.get_sims(word_pairs, CosSimilarity())

#compute correlations
gold = io_utils.read_list(fname, field=2)
print "Spearman"
print scoring_utils.score(gold, predicted, "spearman")
print "Pearson"
print scoring_utils.score(gold, predicted, "pearson")
Example #30
File: ex06.py, Project: totonac/dissect
#ex06.py
#-------
from composes.utils import io_utils
from composes.similarity.cos import CosSimilarity

#load a space
my_space = io_utils.load("./data/out/ex01.pkl")

print my_space.cooccurrence_matrix
print my_space.id2row

#compute similarity between two words in the space
print my_space.get_sim("car", "car", CosSimilarity())
print my_space.get_sim("car", "book", CosSimilarity())
Example #31
#ex16.py
#-------
from composes.utils import io_utils
from composes.composition.lexical_function import LexicalFunction
from composes.similarity.cos import CosSimilarity

#training data
#trying to learn a "good" function
train_data = [("good_function", "car", "good_car"),
              ("good_function", "book", "good_book")]

#load argument and phrase space
arg_space = io_utils.load("./data/out/ex10.pkl")
phrase_space = io_utils.load("data/out/PHRASE_SS.ex10.pkl")

#train a lexical function model on the data
my_comp = LexicalFunction()
my_comp.train(train_data, arg_space, phrase_space)

#print its parameters
print "\nLexical function space:"
print my_comp.function_space.id2row
cooc_mat = my_comp.function_space.cooccurrence_matrix
cooc_mat.reshape(my_comp.function_space.element_shape)
print cooc_mat

#similarity within the learned functional space
print "\nSimilarity between good and good in the function space:"
print my_comp.function_space.get_sim("good_function", "good_function",
                                     CosSimilarity())
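
As a short follow-up sketch, the trained model can also compose new phrases; the composed label is arbitrary and the argument word must be present in arg_space:

composed_space = my_comp.compose([("good_function", "car", "good_car_composed")],
                                 arg_space)
print composed_space.id2row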
Example #32
from composes.utils import io_utils
from composes.utils import scoring_utils
from composes.similarity.cos import CosSimilarity
import sys

#read in a space
my_space = io_utils.load(sys.argv[1])

#compute similarities of a list of word pairs
fname = sys.argv[2]
word_pairs = io_utils.read_tuple_list(fname, fields=[0, 1, 2])

predicted = []
gold = []
for wp in word_pairs:
    try:
        cos = my_space.get_sim(wp[0], wp[1], CosSimilarity())
        if cos > 0:
            #print wp[0],wp[1],cos
            predicted.append(cos)
            gold.append(wp[2])
    except Exception:
        print "Couldn't measure cosine..."

#compute correlations
print "Spearman"
print scoring_utils.score(gold, predicted, "spearman")
print "Pearson"
print scoring_utils.score(gold, predicted, "pearson")