def compute_neighbours(in_file, no_neighbours, out_dir, sim_measure, space_files):
    """
    Computes the neighbours of a list of words and writes them to a file.

    Args:
        in_file: path of the file holding the query words; it is read
            with io_utils.read_list.
        no_neighbours: number of neighbours requested per word, passed
            through to get_neighbours.
        out_dir: directory under which the output file is written.
        sim_measure: string, one of cos/lin/dot_prod/euclidean.
        space_files: list with one or two file names of saved Space
            objects. get_neighbours is called on the first space; the
            second one (or None when absent) is passed as its last argument.

    Prints:
        <out_dir>/NEIGHBOURS.<in_file name>.<space description>.<sim_measure>
        holding every word on its own line, followed by one tab-indented
        "<neighbour> <similarity>" line per neighbour returned.

    Raises:
        ValueError: if sim_measure is not one of the known measure names.
    """
    sim_dict = {
        "cos": CosSimilarity(),
        "lin": LinSimilarity(),
        "dot_prod": DotProdSimilarity(),
        "euclidean": EuclideanSimilarity(),
    }

    if sim_measure not in sim_dict:
        raise ValueError("Similarity measure:%s not defined" % sim_measure)

    space = io_utils.load(space_files[0], Space)
    space2 = None
    # Space description: file name without its directory part and without
    # its last dot-separated extension.
    space_descr = ".".join(space_files[0].split("/")[-1].split(".")[0:-1])
    if len(space_files) == 2:
        space2 = io_utils.load(space_files[1], Space)
        space_descr = ".".join(
            [space_descr] + space_files[1].split("/")[-1].split(".")[0:-1])

    sim = sim_dict[sim_measure]

    descr = ".".join(["NEIGHBOURS", in_file.split("/")[-1], space_descr])
    out_file = '%s/%s.%s' % (out_dir, descr, sim_measure)
    io_utils.create_parent_directories(out_file)

    data = io_utils.read_list(in_file)

    # Parenthesised single-argument form: valid as a Python 3 function call
    # and prints the same text under the Python 2 print statement.
    print("Computing neighbours: %s" % sim_measure)
    with open(out_file, "w") as out_stream:
        for word in data:
            out_stream.write("%s\n" % word)
            result = space.get_neighbours(word, no_neighbours, sim, space2)
            for neighbour, neighbour_sim in result:
                out_stream.write("\t%s %s\n" % (neighbour, neighbour_sim))
def compute_neighbours(in_file, no_neighbours, out_dir, sim_measure, space_files):
    """
    Writes, for every word listed in in_file, its neighbours in a space.

    Args:
        in_file: path of the word list, read through io_utils.read_list.
        no_neighbours: how many neighbours to request for each word.
        out_dir: directory receiving the result file.
        sim_measure: name of the similarity (cos, lin, dot_prod, euclidean).
        space_files: one or two paths of saved Space objects; the first is
            queried, the second (None if missing) is handed to
            get_neighbours as its final argument.

    The result file is named
    <out_dir>/NEIGHBOURS.<in_file name>.<space description>.<sim_measure>.

    Raises:
        ValueError: when sim_measure is not a known similarity name.
    """
    measures = {
        "cos": CosSimilarity(),
        "lin": LinSimilarity(),
        "dot_prod": DotProdSimilarity(),
        "euclidean": EuclideanSimilarity(),
    }
    if sim_measure not in measures:
        raise ValueError("Similarity measure:%s not defined" % sim_measure)

    # Primary space and its description (base name minus last extension).
    first_path = space_files[0]
    space = io_utils.load(first_path, Space)
    first_parts = first_path.split("/")[-1].split(".")[:-1]
    space_descr = ".".join(first_parts)

    # Optional second space extends the description.
    other_space = None
    if len(space_files) == 2:
        second_path = space_files[1]
        other_space = io_utils.load(second_path, Space)
        second_parts = second_path.split("/")[-1].split(".")[:-1]
        space_descr = ".".join([space_descr] + second_parts)

    similarity = measures[sim_measure]

    in_name = in_file.split("/")[-1]
    descr = ".".join(["NEIGHBOURS", in_name, space_descr])
    out_file = '%s/%s.%s' % (out_dir, descr, sim_measure)
    io_utils.create_parent_directories(out_file)

    words = io_utils.read_list(in_file)

    print("Computing neighbours: %s" % sim_measure)
    with open(out_file, "w") as out_stream:
        for word in words:
            out_stream.write("%s\n" % word)
            found = space.get_neighbours(word, no_neighbours, similarity,
                                         other_space)
            out_stream.writelines(
                "\t%s %s\n" % (neighbour, neighbour_sim)
                for neighbour, neighbour_sim in found)
def export(self, filename):
    """
    Writes the parameters of the composition model to a file.

    create_parent_directories is called on the target path first; the
    writing itself is delegated to the model's own _export method, so the
    output format is specific to each model.

    Args:
        filename: string, name of the output file
    """
    create_parent_directories(filename)
    self._export(filename)
def compute_sim(in_file, columns, out_dir, sim_measures, space_files):
    """
    Scores word pairs read from a file with one or more similarity measures.

    Args:
        in_file: path of the input file; every non-blank line is split on
            whitespace and two of its fields are taken as the word pair.
        columns: two 1-based column numbers selecting those fields.
        out_dir: directory receiving one output file per measure.
        sim_measures: iterable of measure names (cos, lin, dot_prod,
            euclidean); unknown names trigger a warning and are skipped.
        space_files: one or two paths of saved Space objects.

    For each measure the file
    <out_dir>/SIMS.<in_file name>.<space description>.<measure> receives
    every stripped input line followed by the predicted similarity.

    If loading the first space raises TypeError, a warning is issued and
    the function returns without writing anything.

    Raises:
        ValueError: if columns does not contain exactly two entries.
    """
    known_measures = {
        "cos": CosSimilarity(),
        "lin": LinSimilarity(),
        "dot_prod": DotProdSimilarity(),
        "euclidean": EuclideanSimilarity(),
    }

    if len(columns) != 2:
        raise ValueError("Column description unrecognized!")
    # Column numbers are 1-based on input; shift to list indices.
    first_idx = int(columns[0]) - 1
    second_idx = int(columns[1]) - 1

    first_path = space_files[0]
    try:
        space = io_utils.load(first_path, Space)
    except TypeError:
        warn("Not a Space instance in file: %s" % first_path)
        return

    space_descr = ".".join(first_path.split("/")[-1].split(".")[:-1])
    other_space = None
    if len(space_files) == 2:
        second_path = space_files[1]
        other_space = io_utils.load(second_path, Space)
        space_descr = ".".join(
            [space_descr] + second_path.split("/")[-1].split(".")[:-1])

    descr = ".".join(["SIMS", in_file.split("/")[-1], space_descr])

    for sim_measure in sim_measures:
        print("Computing similarities: %s" % sim_measure)
        if sim_measure not in known_measures:
            warn("Similarity measure:%s not defined" % sim_measure)
            continue

        similarity = known_measures[sim_measure]
        out_file = '%s/%s.%s' % (out_dir, descr, sim_measure)
        io_utils.create_parent_directories(out_file)

        with open(in_file) as in_stream, open(out_file, "w") as out_stream:
            for raw_line in in_stream:
                record = raw_line.strip()
                if record == "":
                    continue
                fields = record.split()
                left_word = fields[first_idx]
                right_word = fields[second_idx]
                predicted_sim = space.get_sim(left_word, right_word,
                                              similarity, other_space)
                out_stream.write("%s %s\n" % (record, str(predicted_sim)))
def compute_sim(in_file, columns, out_dir, sim_measures, space_files):
    """
    Computes similarities for the word pairs listed in a file.

    Args:
        in_file: path of the input file. Each non-empty line is split on
            whitespace; two of the resulting fields form the word pair.
        columns: sequence of two 1-based column numbers identifying the
            fields that hold the two words.
        out_dir: directory under which the output files are written.
        sim_measures: list of similarity names, each one of
            cos/lin/dot_prod/euclidean. A name that is not defined
            produces a warning and is skipped.
        space_files: list with one or two file names of saved Space
            objects. get_sim is called on the first space; the second one
            (or None when absent) is passed as its last argument.

    Prints:
        one file per similarity measure, named
        <out_dir>/SIMS.<in_file name>.<space description>.<sim_measure>,
        holding each stripped input line followed by the predicted
        similarity.

    If loading the first space raises TypeError, a warning is issued and
    nothing is computed.

    Raises:
        ValueError: if columns does not have exactly two elements.
    """
    sim_dict = {
        "cos": CosSimilarity(),
        "lin": LinSimilarity(),
        "dot_prod": DotProdSimilarity(),
        "euclidean": EuclideanSimilarity(),
    }

    if len(columns) != 2:
        raise ValueError("Column description unrecognized!")
    # Columns are given 1-based; convert to 0-based list indices.
    col0 = int(columns[0]) - 1
    col1 = int(columns[1]) - 1

    try:
        space = io_utils.load(space_files[0], Space)
    except TypeError:
        warn("Not a Space instance in file: %s" % space_files[0])
        return

    space2 = None
    # Space description: file name without its directory part and without
    # its last dot-separated extension.
    space_descr = ".".join(space_files[0].split("/")[-1].split(".")[0:-1])
    if len(space_files) == 2:
        space2 = io_utils.load(space_files[1], Space)
        space_descr = ".".join(
            [space_descr] + space_files[1].split("/")[-1].split(".")[0:-1])

    descr = ".".join(["SIMS", in_file.split("/")[-1], space_descr])

    for sim_measure in sim_measures:
        # Parenthesised single-argument form: valid as a Python 3 function
        # call and prints the same text under the Python 2 print statement.
        print("Computing similarities: %s" % sim_measure)
        if sim_measure not in sim_dict:
            warn("Similarity measure:%s not defined" % sim_measure)
            continue

        sim = sim_dict[sim_measure]
        out_file = '%s/%s.%s' % (out_dir, descr, sim_measure)
        io_utils.create_parent_directories(out_file)

        with open(in_file) as in_stream, open(out_file, "w") as out_stream:
            for line in in_stream:
                stripped = line.strip()
                if stripped == "":
                    continue
                elems = stripped.split()
                word1 = elems[col0]
                word2 = elems[col1]
                predicted_sim = space.get_sim(word1, word2, sim, space2)
                out_stream.write("%s %s\n" % (stripped, str(predicted_sim)))
def export(self, file_prefix, **kwargs):
    """
    Exports the current space to disk.

    If the space has no column information, it cannot be exported in
    sparse format (sm).

    The requested format is validated before anything is written, so an
    unrecognized format leaves the file system untouched.

    Args:
        file_prefix: string, prefix of the files to be exported
        format: string, one of dm/sm. Optional keyword argument,
            defaults to dm.

    Prints:
        - matrix in file_prefix.<format>
        - row elements in file_prefix.<row>
        - col elements in file_prefix.<col>

    Raises:
        ValueError: if the format is not one of dm/sm; also documented
            for the case where the space has no column info and "sm"
            exporting is attempted (raised outside this method)
        NotImplementedError: documented for the case where the space
            matrix is dense and "sm" exporting is attempted (raised
            outside this method)
    """
    format_ = kwargs.get("format", "dm")
    if format_ not in ("dm", "sm"):
        raise ValueError("Unrecognized format: %s" % format_)

    start = time.time()
    create_parent_directories(file_prefix)

    if format_ == "dm":
        print_cooc_mat_dense_format(self.cooccurrence_matrix,
                                    self.id2row, file_prefix)
    else:
        print_cooc_mat_sparse_format(self.cooccurrence_matrix,
                                     self.id2row,
                                     self.id2column, file_prefix)
    self._export_row_column(file_prefix)

    log.print_matrix_info(logger, self.cooccurrence_matrix, 1,
                          "Printed semantic space:")
    log.print_time_info(logger, time.time(), start, 2)
def export(self, file_prefix, **kwargs):
    """
    Exports the current space to disk.

    If the space has no column information, it cannot be exported in
    sparse format (sm).

    The requested format is checked up front, before any directory is
    created or any file is written.

    Args:
        file_prefix: string, prefix of the files to be exported
        format: string, one of dm/sm. Optional keyword argument,
            defaults to dm.

    Prints:
        - matrix in file_prefix.<format>
        - row elements in file_prefix.<row>
        - col elements in file_prefix.<col>

    Raises:
        ValueError: if the format is not one of dm/sm; also documented
            for the case where the space has no column info and "sm"
            exporting is attempted (raised outside this method)
        NotImplementedError: documented for the case where the space
            matrix is dense and "sm" exporting is attempted (raised
            outside this method)
    """
    format_ = kwargs.get("format", "dm")
    if format_ not in ("dm", "sm"):
        raise ValueError("Unrecognized format: %s" % format_)

    start = time.time()
    create_parent_directories(file_prefix)

    if format_ == "dm":
        print_cooc_mat_dense_format(self.cooccurrence_matrix,
                                    self.id2row, file_prefix)
    else:
        print_cooc_mat_sparse_format(self.cooccurrence_matrix,
                                     self.id2row,
                                     self.id2column, file_prefix)
    self._export_row_column(file_prefix)

    log.print_matrix_info(logger, self.cooccurrence_matrix, 1,
                          "Printed semantic space:")
    log.print_time_info(logger, time.time(), start, 2)
# Configures logging through logging.basicConfig with the given file name,
# level (default logging.INFO) and format string, and emits a "start logging"
# debug message. create_parent_directories(file_name) is called when
# file_name is not None.
# NOTE(review): the line breaks and indentation of this definition are
# missing in this copy, so it cannot be told from here whether the
# basicConfig and debug calls sit inside the `if` (and are skipped when
# file_name is None) or run unconditionally -- confirm against the upstream
# source before reformatting this function.
def config_logging(file_name, level = logging.INFO, format_ =""): if not file_name is None: create_parent_directories(file_name) logging.basicConfig(filename=file_name, level=level, format=format_) logging.debug("start logging")