示例#1
0
    def avg_contexts(self, ref_subvec, top, top_percent, top_inferences_number, exclude_ref, weights_factor):
        '''
        Performs a weighted average of the rows of self.subs_matrix, using
        self.sim_scores (raised to weights_factor) as weights, and optionally
        mixes in the reference subvec with weight 1 before normalization.

        :param ref_subvec: given subvec as a sparse column matrix, or None
        :param top: keep only this many highest-weighted entries; the reference
                    takes one of these slots unless it is excluded. Values
                    larger than the number of available entries are clipped.
        :param top_percent: keep this fraction of the entries; when both top
                            and top_percent are given, the larger cutoff wins.
                            If top and top_percent are both 0, all contexts
                            are averaged.
        :param top_inferences_number: passed through to __vec_to_sorted_list
        :param exclude_ref: the reference is included only when this equals False
        :param weights_factor: exponent applied to every similarity score
        :returns: parvec (the result of __vec_to_sorted_list), number of
                  contexts averaged; (None, 0) when there are no contexts
        '''

        if len(self.contexts) == 0:
            return None, 0

        # Deliberately an equality test: exclude_ref=None keeps excluding the
        # reference, exactly as before.
        include_ref = (exclude_ref == False)
        ref_weight = 1 if include_ref else 0
        available = len(self.contexts) + ref_weight

        if top > available:
            top = available

        if top > 0 or top_percent > 0:
            final_top = top - ref_weight  # leave one slot for the ref_subvec
            num_top_percent = int(math.ceil(top_percent * available)) - ref_weight
            final_top = max(final_top, num_top_percent)

            # items() exists on Python 2 and 3 (iteritems() is Python 2 only).
            all_scores = self.sim_scores.todok()
            cw_sorted = heapq.nlargest(final_top, all_scores.items(), key=lambda x: x[1])

            top_contexts_weights = dok_matrix((len(self.contexts), 1), dtype=np.float32)
            for (k, j), weight in cw_sorted:
                top_contexts_weights[k, j] = weight**weights_factor

            top_contexts_weights = top_contexts_weights.tocsr()
            contexts_num = len(cw_sorted)

        else:
            contexts_num = len(self.contexts)
            if weights_factor == 0.0:
                # x**0 == 1 for every context, including those with a zero score
                top_contexts_weights = dok_matrix([[1.0]*contexts_num]).tocsr().transpose()
            else:
                top_contexts_weights = self.sim_scores.copy()
                top_contexts_weights.data **= weights_factor

        # weight +1 is reserved for ref_subvec when it is included
        sum_weights = top_contexts_weights.sum() + ref_weight
        top_contexts_weights.data /= sum_weights

        # Row-wise scaling by broadcasting the weight column (NOT SUPPORTED IN SCIPY 0.7)
        weighted_subs_matrix = self.subs_matrix.multiply(top_contexts_weights)
        avg_subvec = weighted_subs_matrix.sum(axis=0)

        # 'is not None' avoids an element-wise comparison on the sparse matrix.
        if include_ref and ref_subvec is not None:
            # Scale a new matrix instead of rescaling ref_subvec.data in place,
            # so the caller's reference vector is left untouched.
            ref_part = ref_subvec.transpose() * (1.0 / sum_weights)
            avg_subvec = avg_subvec + ref_part

        result_vec = self.__vec_to_sorted_list(avg_subvec, top_inferences_number)
        return result_vec, contexts_num
示例#2
0
    def _compute_relations(dictionary):
        """Build every relation matrix for `dictionary`, keyed by relation name.

        The matrices come from the RelationsGraph._compute_* helpers. Aggregate
        relations (siblings, inclusion, father, child, etymology) are not
        built here.

        :param dictionary: passed to each helper; its len() sizes the identity
        :returns: dict mapping each name in RELATIONS to a csr_matrix
        :raises ValueError: if a name listed in RELATIONS was not computed
        """
        suffixes = ('_substance', '_attribute', '_mode')
        sibling_names = ('opposed', 'associated', 'crossed', 'twin')

        computed = {}

        # Containment and its transpose.
        computed['contains'] = csr_matrix(
            RelationsGraph._compute_contains(dictionary))
        computed['contained'] = csr_matrix(computed['contains'].transpose())

        # One father matrix per suffix.
        father_parts = RelationsGraph._compute_father(dictionary)
        for position, suffix in enumerate(suffixes):
            computed['father' + suffix] = dok_matrix(father_parts[position])

        # The four sibling relations, in the order the helper returns them.
        sibling_parts = RelationsGraph._compute_siblings(dictionary)
        for position, name in enumerate(sibling_names):
            computed[name] = dok_matrix(sibling_parts[position])

        # Child relations are the transposed father relations.
        for suffix in suffixes:
            computed['child' + suffix] = computed['father' + suffix].transpose()

        ranks = RelationsGraph._compute_table_rank(dictionary,
                                                   computed['contained'])
        for level in range(6):
            computed['table_%d' % level] = ranks[level]

        computed['identity'] = csr_matrix(np.eye(len(dictionary)))

        not_computed = {name for name in RELATIONS if name not in computed}
        if not_computed:
            raise ValueError("Missing relations : {%s}" % ", ".join(not_computed))

        # Normalize every stored matrix to CSR and drop anything not in RELATIONS.
        return {name: csr_matrix(computed[name]) for name in RELATIONS}
示例#3
0
    def _compute_relations(self):
        """Compute all relation matrices and store them in ``self.relations``.

        ``self.relations`` is first reset to an empty dict and filled in place,
        then replaced by a dict holding one csr_matrix per name in RELATIONS.

        :raises ValueError: if a name listed in RELATIONS was not computed
        """
        logger.log(logging.INFO, "Computing relations")

        suffixes = ('_substance', '_attribute', '_mode')
        sibling_names = ('opposed', 'associated', 'crossed', 'twin')

        # Same dict object under both names, so helpers reading
        # self.relations see every entry added below.
        relations = self.relations = {}

        # Containment and its transpose.
        relations['contains'] = csr_matrix(self._compute_contains())
        relations['contained'] = csr_matrix(relations['contains'].transpose())

        # One father matrix per suffix.
        father_parts = self._compute_father()
        for position, suffix in enumerate(suffixes):
            relations['father' + suffix] = dok_matrix(father_parts[position])

        # The four sibling relations, in the order the helper returns them.
        sibling_parts = self._compute_siblings()
        for position, name in enumerate(sibling_names):
            relations[name] = dok_matrix(sibling_parts[position])

        # Child relations are the transposed father relations.
        for suffix in suffixes:
            relations['child' + suffix] = relations['father' + suffix].transpose()

        ranks = self._compute_table_rank(relations['contained'])
        for level in range(6):
            relations['table_%d' % level] = ranks[level]

        relations['identity'] = csr_matrix(np.eye(len(self.dictionary)))

        not_computed = {name for name in RELATIONS if name not in relations}
        if not_computed:
            raise ValueError("Missing relations : {%s}" % ", ".join(not_computed))

        # Normalize every stored matrix to CSR and drop anything not in RELATIONS.
        self.relations = {name: csr_matrix(relations[name]) for name in RELATIONS}
示例#4
0
def read_mm_matrix(file):
    """Read a MatrixMarket format file to a matrix object

    The first line (the banner) is always skipped. Further comment lines
    (starting with ``%``) and blank lines are ignored. The first remaining
    line is the size line: its first two tokens are the row and column
    counts. Every following line is a 1-based ``row column value`` triple of
    integers.

    :param file: The file path
    :returns: the matrix in CSR format, dtype int16
    :raises ValueError: if the file has no size line or a data line does not
        consist of exactly three integers
    """
    matrix = None
    with open(file) as f:
        next(f, None)  # the banner line is skipped unconditionally
        for line in f:
            stripped = line.strip()

            # Extra comment lines and blank lines carry no data.
            if not stripped or stripped.startswith('%'):
                continue

            tokens = stripped.split()

            # The first meaningful line holds the dimensions.
            if matrix is None:
                dim_x, dim_y = int(tokens[0]), int(tokens[1])
                matrix = dok_matrix((dim_x, dim_y), dtype=np.int16)
                continue

            # The rest of the lines are the data (1-based indices).
            x, y, v = [int(t) for t in tokens]
            matrix[x - 1, y - 1] = v

    if matrix is None:
        raise ValueError("No size line found in MatrixMarket file: %s" % file)
    return matrix.tocsr()
def bitmap_to_graph(img, mask):
    """Build a sparse weighted adjacency matrix over the pixels of `img`.

    Every interior pixel enabled in `mask` is linked to each enabled pixel of
    its 3x3 neighbourhood (the pixel itself included). The edge weight is
    ``float(2 * max(img) - pix1 - pix2) ** 16 + 1``. Pixels on the image
    border are never used as edge sources.

    :param img: 2D image indexed as img[row, col]
    :param mask: same indexing as img; falsy entries are skipped
    :returns: dok_matrix of shape (rows * cols, rows * cols); indices come
        from to_index
    """
    # Largest value in the image.
    brightest = img[unravel_index(img.argmax(), img.shape)]

    n_rows, n_cols = img.shape[0], img.shape[1]
    pixel_count = n_rows * n_cols
    graph = dok_matrix((pixel_count, pixel_count))

    # All nine (dy, dx) offsets of the 3x3 neighbourhood, (0, 0) included.
    offsets = list(itertools.product([0, 1, -1], [0, 1, -1]))

    interior = itertools.product(range(1, n_rows - 1), range(1, n_cols - 1))
    for i, j in interior:
        if not mask[i, j]:
            continue
        here = img[i, j]
        for y_diff, x_diff in offsets:
            ni, nj = i + y_diff, j + x_diff
            if not mask[ni, nj]:
                continue
            there = img[ni, nj]
            weight = float(brightest * 2 - here - there)**16 + 1
            graph[to_index(img, i, j), to_index(img, ni, nj)] = weight
    return graph
示例#6
0
    def __init__(self, resource_mat_file, entity_map_file, property_map_file,
                 relations_file, whitelist):
        """
        Load the resource with the restricted set of edge types according to the whitelist

        :param resource_mat_file: not used by this constructor
        :param entity_map_file: passed to load_map to get the term <-> id maps
        :param property_map_file: passed to load_map to get the property <-> id maps
        :param relations_file: passed to load_edges
        :param whitelist: The list of allowed edge types
        """

        # Load all properties; only the name -> id direction is needed here.
        all_prop_ids, _ = load_map(property_map_file, None)

        # Keep only the properties whose cleaned name is in the whitelist.
        allowed = {clean(prop) for prop in whitelist}
        kept = [(prop, prop_id) for prop, prop_id in all_prop_ids.items()
                if prop in allowed]
        self.prop_to_id = dict(kept)
        self.id_to_prop = {prop_id: prop for prop, prop_id in kept}

        # Reversed edges are allowed if any kept property shows up as
        # '<-prop-' in the whitelist once '$' and '^' are stripped.
        edge_types = [edge_type.replace('$', '').replace('^', '')
                      for edge_type in whitelist]
        self.allow_reversed_edges = any(
            '<-' + prop + '-' in edge_types for prop in self.prop_to_id)

        # Load the edges for the specific properties
        self.l2r_edges, self.r2l_edges = load_edges(
            relations_file, None, self.prop_to_id.values())

        # Load the entities
        self.term_to_id, self.id_to_term = load_map(entity_map_file, None)

        # Build the restricted adjacency matrix from the left-to-right edges.
        n_terms = len(self.term_to_id)
        m = dok_matrix((n_terms, n_terms), dtype=np.int16)
        for x, targets in self.l2r_edges.items():
            for y in targets:
                m[x, y] = 1
        self.adjacency_matrix = m.tocsr()

        if self.allow_reversed_edges:
            self.adjacency_matrix = self.adjacency_matrix + self.adjacency_matrix.T
示例#7
0
def read_mm_matrix(file):
    """Read a MatrixMarket format file to a matrix object

    The first line (the banner) is always skipped. Further comment lines
    (starting with ``%``) and blank lines are ignored. The first remaining
    line is the size line: its first two tokens are the row and column
    counts. Every following line is a 1-based ``row column value`` triple of
    integers.

    :param file: The file path
    :returns: the matrix in CSR format, dtype int16
    :raises ValueError: if the file has no size line or a data line does not
        consist of exactly three integers
    """
    matrix = None
    with open(file) as f:
        next(f, None)  # the banner line is skipped unconditionally
        for line in f:
            stripped = line.strip()

            # Extra comment lines and blank lines carry no data.
            if not stripped or stripped.startswith('%'):
                continue

            tokens = stripped.split()

            # The first meaningful line holds the dimensions.
            if matrix is None:
                dim_x, dim_y = int(tokens[0]), int(tokens[1])
                matrix = dok_matrix((dim_x, dim_y), dtype=np.int16)
                continue

            # The rest of the lines are the data (1-based indices).
            x, y, v = [int(t) for t in tokens]
            matrix[x - 1, y - 1] = v

    if matrix is None:
        raise ValueError("No size line found in MatrixMarket file: %s" % file)
    return matrix.tocsr()
示例#8
0
    def _compute_relations(self):
        """Compute all relation matrices and store them in ``self.relations``.

        ``self.relations`` is first reset to an empty dict and filled in place,
        then replaced by a dict holding one csr_matrix per name in RELATIONS.

        :raises ValueError: if a name listed in RELATIONS was not computed
        """
        logger.log(logging.INFO, "Computing relations")

        etymology_suffixes = ('_substance', '_attribute', '_mode')
        sibling_keys = ('opposed', 'associated', 'crossed', 'twin')

        # Same dict object under both names, so helpers reading
        # self.relations see every entry added below.
        built = self.relations = {}

        # Containment and its transpose.
        built['contains'] = csr_matrix(self._compute_contains())
        built['contained'] = csr_matrix(built['contains'].transpose())

        # One father matrix per suffix.
        fathers = self._compute_father()
        for idx, suffix in enumerate(etymology_suffixes):
            built['father' + suffix] = dok_matrix(fathers[idx])

        # The four sibling relations, in the order the helper returns them.
        siblings = self._compute_siblings()
        for idx, key in enumerate(sibling_keys):
            built[key] = dok_matrix(siblings[idx])

        # Child relations are the transposed father relations.
        for suffix in etymology_suffixes:
            built['child' + suffix] = built['father' + suffix].transpose()

        table = self._compute_table_rank(built['contained'])
        for rank in range(6):
            built['table_%d'%rank] = table[rank]

        built['identity'] = csr_matrix(np.eye(len(self.dictionary)))

        absent = {s for s in RELATIONS if s not in built}
        if absent:
            raise ValueError("Missing relations : {%s}"%", ".join(absent))

        # Normalize every stored matrix to CSR and drop anything not in RELATIONS.
        self.relations = {reltype: csr_matrix(built[reltype]) for reltype in RELATIONS}
示例#9
0
 def __init__(self, args, i2w, w2i, subvecs_num, w2counts, sum_word_counts, stopwords, embeddings):
     '''
     Stores the vocabulary data and allocates the (mostly sparse) matrices
     that hold up to subvecs_num contexts.

     :param args: object whose bow_size attribute is read here
     :param i2w: stored as self.i2w
     :param w2i: stored as self.w2i; its length is the vocabulary dimension
     :param subvecs_num: number of rows allocated in every matrix
     :param w2counts: stored as self.w2counts
     :param sum_word_counts: stored as self.sum_word_counts
     :param stopwords: stored as self.stopwords
     :param embeddings: when not None the bow matrix is dense and sized by
                        embeddings.dimension()
     '''

     self.args = args
     self.w2i = w2i
     self.i2w = i2w
     self.w2counts = w2counts
     self.sum_word_counts = sum_word_counts
     self.stopwords = stopwords

     self.contexts = []
     self.sim_scores = None # points either to self.subvecs_sim_scores or to self.bow_sim_scores

     initial_sim_score = 1.0 if subvecs_num==0 else 1.0/subvecs_num

     def uniform_scores():
         # (subvecs_num x 1) CSR column with every entry set to initial_sim_score
         return dok_matrix([[initial_sim_score]*subvecs_num]).tocsr().transpose()

     self.embeddings = embeddings # when this is not None the bow representation is dense (todo: refactor this code)
     self.bow_size = args.bow_size
     # The bow_* attributes are only created when bow_size is non-negative.
     if self.bow_size >= 0:
         # Identity test: '!= None' would call a user-defined __ne__ on the embeddings object.
         if self.embeddings is not None:
             bow_dimensionality = self.embeddings.dimension()
             self.bow_matrix = np.zeros((subvecs_num, bow_dimensionality), dtype=np.float32) # estimate sim of contexts based on their BOW rep
             self.bow_L2_norms = None # we always keep them normalized
         else:
             bow_dimensionality = len(w2i)
             self.bow_matrix = dok_matrix((subvecs_num, bow_dimensionality), dtype=np.float32) # estimate sim of contexts based on their BOW rep
             self.bow_L2_norms = dok_matrix((subvecs_num, 1), dtype=np.float32)
         self.bow_sim_scores = uniform_scores()

     self.subs_matrix = dok_matrix((subvecs_num, len(w2i)), dtype=np.float32) #used for sim weights calculation, also for sub average only if no dual matrix
     self.subvecs_L2_norms = dok_matrix((subvecs_num, 1), dtype=np.float32)
     self.subvecs_sim_scores = uniform_scores()

     self.target_counts = {}
示例#10
0
def gen_random(nodes, k):
    """Build a random directed adjacency matrix without self-loops.

    For every node, `k` neighbour indices are drawn uniformly (with
    replacement) from ``[0, nodes - 1]``. The draw is repeated until it
    contains at least 3 distinct indices and does not contain the node itself.

    :param nodes: number of nodes (matrix is nodes x nodes)
    :param k: number of indices drawn per node
    :returns: CSR matrix with entry [i, j] = 1 for every drawn neighbour j of i
    :raises ValueError: if the constraints can never be met (k < 3 or fewer
        than 4 nodes), which previously resulted in an endless loop
    """
    # randint(low, high) excludes `high`; it replaces the deprecated
    # random_integers(low, high - 1) and draws from the same global generator.
    from numpy.random import randint

    if nodes > 0 and (k < 3 or nodes < 4):
        raise ValueError(
            "need k >= 3 and nodes >= 4 to pick 3 distinct neighbours "
            "(got nodes=%r, k=%r)" % (nodes, k))

    building_matrix = dok_matrix((nodes, nodes))
    for node in range(nodes):
        neighbours = set(randint(0, nodes, k).tolist())
        # Redraw until there are enough distinct neighbours and no self-loop.
        while node in neighbours or len(neighbours) < 3:
            neighbours = set(randint(0, nodes, k).tolist())
        for index in neighbours:
            building_matrix[node, index] = 1
    return building_matrix.tocsr()
示例#11
0
def create_one_hot_vector(x, dim):
    """Return the one-hot row vector for a node as a sparse matrix.

    The result is a 1 x dim CSR matrix of dtype int16 whose only stored
    entry is a 1 in column x.

    :param x -- the node
    :param dim -- the number of nodes (the adjacency matrix dimension)
    """
    # Fill in DOK format (cheap single-element write), then convert to CSR.
    one_hot = dok_matrix((1, dim), dtype=np.int16)
    one_hot[0, x] = 1
    return one_hot.tocsr()
示例#12
0
def create_one_hot_vector(x, dim):
    """Build a sparse one-hot row vector for the given node.

    A 1 x dim int16 matrix is created with a single 1 in column x and
    returned in CSR format.

    :param x -- the node
    :param dim -- the number of nodes (the adjacency matrix dimension)
    """
    shape = (1, dim)
    row_vector = dok_matrix(shape, dtype=np.int16)
    row_vector[0, x] = 1
    csr_vector = row_vector.tocsr()
    return csr_vector
示例#13
0
    def __init__(self,
                 total_feature_count,
                 version_count,
                 feature_list,
                 target_id,
                 start,
                 end,
                 ngram_sizes=None,
                 ngram_levels=None,
                 label="",
                 sparse=False,
                 dok=False):
        """Initialize an empty dataset.

        A dataset consists of two components:
        - The data attribute is a zero-filled matrix for all input data. Its
            shape is version_count x (total_feature_count + ngram_count), where
            ngram_count is len(ngram_sizes) * len(ngram_levels), or 0 when
            either list is missing or empty. Each row of the data matrix
            represents the feature vector of one version.
        - The target attribute is a zero-filled vector for the ground truth.
            Its size is version_count.

        Args:
            total_feature_count (int): Number of non-ngram feature columns of the data matrix.
            version_count (int): Number of versions. Equals the rows of the data matrix and the target size.
            feature_list (List[str]): A list of Feature IDs. Must be in the same order as they are in the dataset.
            target_id (str): ID of the target which is used in this dataset. E.g. 'month'
            start (datetime): Start of the date range contained in this dataset.
            end (datetime): End of the date range contained in this dataset.
            ngram_sizes (list[int]): Optional. The ngram-sizes in this dataset (e.g. [1, 2] for 1-grams and 2-grams)
            ngram_levels (list[int]): Optional. The ngram-levels in this dataset.
            label (str): An arbitrary label, e.g. "Test", for this dataset. Useful when caching!
            sparse (bool): If the data matrix should be sparse. Recommended in combination with ngrams.
            dok (bool): If a dok-type sparse matrix should be used (only with sparse=True).
                Dok is faster to update. Can be converted to CSR.
        """
        # One extra column per (ngram size, ngram level) combination.
        have_ngrams = ngram_sizes and ngram_levels
        ngram_count = len(ngram_sizes) * len(ngram_levels) if have_ngrams else 0

        message = "Initializing Dataset with  %i versions, %i features and %i ngram vectors."
        logging.debug(message % (version_count, total_feature_count, ngram_count))

        shape = (version_count, total_feature_count + ngram_count)
        if not sparse:
            self.data = np.zeros(shape)
        elif dok:
            self.data = dok_matrix(shape, dtype=np.float64)
        else:
            self.data = csr_matrix(shape, dtype=np.float64)

        # The target vector is always dense.
        self.target = np.zeros(version_count)

        self.label = label
        self.sparse = sparse
        self.feature_list = feature_list
        self.target_id = target_id
        self.start = start
        self.end = end
        self.ngram_sizes = ngram_sizes
        self.ngram_levels = ngram_levels
示例#14
0
 def reference_context(self, subvec, context, bow_interpolate):
     '''
     Weighs contexts in this collection according to similarity to the given reference context.
     The subvec is first turned into a sparse float32 column (one row per
     entry of self.w2i) and then handed to __reference_context_imp.
     :param subvec: subvec representation of given context, as (word, weight) pairs
     :param context: given context
     :param bow_interpolate: interpolation factor (between bow and subvec similarity)
     :returns: the value returned by __reference_context_imp (described in
               the original documentation as the subvec as a numpy matrix)
     '''
     vocab_size = len(self.w2i)
     column = dok_matrix((vocab_size, 1), dtype=np.float32)
     for word, weight in subvec:
         row = self.w2i[word]
         column[row, 0] = weight

     return self.__reference_context_imp(column.tocsr(), context, bow_interpolate)
示例#15
0
    def __init__(self, pset, h=None, alpha=None):
        """Store the kernel parameters and allocate the distance matrix.

        :param pset: object whose ``size`` attribute gives the matrix dimension
        :param h: stored as-is when given; defaults to 0.012
        :param alpha: stored as-is when given; defaults to 0.5
        """
        # Defaults are for liquid water, incompressible flow,
        # Alejandro Jacobo Cabrera Crespo (2008); h = 0.012 m.
        self.__h = 0.012 if h is None else h
        self.__alpha = 0.5 if alpha is None else alpha

        # Sparse matrix of distances between particles i and j
        shape = (pset.size, pset.size)
        self.__r = dok.dok_matrix(shape, dtype=np.float64)
示例#16
0
    def reference_context(self, subvec, context, bow_interpolate):
        '''
        Weighs contexts in this collection according to similarity to the given reference context.
        The subvec is converted to a sparse float32 column vector with one
        row per entry of self.w2i before being passed to __reference_context_imp.
        :param subvec: subvec representation of given context, as (word, weight) pairs
        :param context: given context
        :param bow_interpolate: interpolation factor (between bow and subvec similarity)
        :returns: the value returned by __reference_context_imp (described in
                  the original documentation as the subvec as a numpy matrix)
        '''
        sparse_column = dok_matrix((len(self.w2i), 1), dtype=np.float32)
        for word, weight in subvec:
            word_index = self.w2i[word]
            sparse_column[word_index, 0] = weight
        as_csr = sparse_column.tocsr()

        return self.__reference_context_imp(as_csr, context, bow_interpolate)
示例#17
0
    def map_kernel (self,  fn_kernel, pset):
        '''
        Applies fn_kernel to every stored distance and returns the results as
        a new sparse matrix with the same (i, j) layout.

        r is the distance between particles 'i' and 'j'

        :param fn_kernel: callable invoked as fn_kernel(r=r) for each stored distance
        :param pset: object whose ``size`` attribute gives the matrix dimension
        :returns: dok matrix (pset.size x pset.size, float64) of kernel values
        '''
        kernel = dok.dok_matrix((pset.size, pset.size), dtype=np.float64)
        # Each stored item is ((i, j), r). The placeholder initializations the
        # loop used to perform were overwritten immediately, so they are gone.
        for (i, j), r in self.__r.items():
            kernel[i, j] = fn_kernel(r=r)
        return kernel
示例#18
0
def main(original_img,pathimage,x1,x2,y1,y2):
    """Find and display a pixel path between two points of an image.

    The three colour channels are summed and every non-zero interior pixel
    is linked to the non-zero pixels of its 3x3 neighbourhood. An unweighted
    shortest path from (y1, x1) to (y2, x2) is then searched with dijkstra.

    :param original_img: image indexed as [row, col, channel]; channels 0-2 are read
    :param pathimage: image that gets channel 0 set to 255 along the path and is shown
    :param x1, y1: column / row of the source pixel
    :param x2, y2: column / row of the target pixel
    :returns: list of [row, col] pairs from the target back towards the
        source (source excluded), or 1 when no path exists
    """
    img = original_img[:, :, 0] + original_img[:, :, 1] + original_img[:, :, 2]
    height, width = img.shape[0], img.shape[1]
    n_pixels = height * width
    adjacency = dok_matrix((n_pixels, n_pixels), dtype=bool)

    # Link each painted interior pixel to the painted pixels of its 3x3
    # neighbourhood ((0, 0) included).
    offsets = list(itertools.product([0, 1, -1], [0, 1, -1]))
    for i in range(1, height - 1):
        for j in range(1, width - 1):
            if img[i, j]:
                for y_diff, x_diff in offsets:
                    ni, nj = i + y_diff, j + x_diff
                    if img[ni, nj]:
                        adjacency[to_index(img, i, j),
                                  to_index(img, ni, nj)] = True

    # End points of the search.
    source = to_index(img, y1, x1)
    target = to_index(img, y2, x2)

    # Shortest paths from the source to every other pixel.
    _, predecessors = dijkstra(adjacency, directed=False, indices=[source],
                               unweighted=True, return_predecessors=True)

    # Walk the predecessor chain from the target back to the source.
    pixels_path = []
    pixel_index = target
    while pixel_index != source:
        pixels_path.append(pixel_index)
        pixel_index = predecessors[0, pixel_index]
        # -9999 is how scipy marks "no predecessor": the points are not connected.
        if pixel_index == -9999:
            return 1

    # Debug visualization of the chosen path.
    path = []
    for pixel_index in pixels_path:
        i, j = to_coordinates(img, pixel_index)
        print(i,j)
        path.append([i, j])
        pathimage[i, j, 0] = 255

    plt.imshow(pathimage)
    plt.show()
    return path
def build_extra_features(noncat_matrix):
    """Derive (numerator, multiplier) feature pairs for fractional entries.

    For every non-zero, non-integer entry ``val`` at (x, y), each multiplier
    i in 0..19 is tried; whenever ``abs(val) * i`` lies less than 0.001 below
    (or exactly on) its ceiling, that ceiling is written to column 2*y and i
    to column 2*y + 1. Later matches overwrite earlier ones, so the largest
    matching i is what remains. Progress is printed every 1000 entries.

    :param noncat_matrix: 2D matrix supporting .shape, .nonzero() and [x, y]
    :returns: dok_matrix with the same number of rows and twice the columns
    """
    n_rows, n_cols = noncat_matrix.shape[0], noncat_matrix.shape[1]
    X = dok_matrix((n_rows, n_cols * 2))
    xs, ys = noncat_matrix.nonzero()
    print(len(xs), "nonzero elems")
    for count, (x, y) in enumerate(zip(xs, ys), 1):
        if count % 1000 == 0:
            print(count)
        val = noncat_matrix[x, y]
        # Whole numbers get no extra features.
        if val - math.floor(val) == 0.0:
            continue
        magnitude = abs(val)
        for i in range(20):
            rounded_up = math.ceil(magnitude * i)
            if abs(magnitude * i - rounded_up) < 0.001:
                X[x, 2 * y] = rounded_up
                X[x, 2 * y + 1] = i
    return X
示例#20
0
    def __init__(self, map):
        """ init shortest paths

        Stores the map, allocates an empty boolean adjacency matrix with one
        row and one column per map cell, and fills it via _fill_adjacency().

        input:
            map - a 2d numpy array representing all reachable areas
            of the map. (probably just the reachability map)
        """
        self.map = map

        # One graph node per map cell.
        cell_count = map.shape[0] * map.shape[1]
        self.adjacency = dok_matrix((cell_count, cell_count), dtype=bool)

        self._fill_adjacency()
def build_extra_features(noncat_matrix):
    """Build two extra feature columns per input column for fractional values.

    Only non-zero entries with a fractional part are processed. For such an
    entry ``val`` at (x, y), every i in 0..19 for which ``abs(val) * i`` is
    within 0.001 below (or equal to) its ceiling stores that ceiling in
    column 2*y and i in column 2*y + 1; the last matching i wins because
    there is no early exit. A progress counter is printed every 1000 entries.

    :param noncat_matrix: 2D matrix supporting .shape, .nonzero() and [x, y]
    :returns: dok_matrix of shape (rows, 2 * cols)
    """
    out_shape = (noncat_matrix.shape[0], noncat_matrix.shape[1] * 2)
    X = dok_matrix(out_shape)
    xs, ys = noncat_matrix.nonzero()
    print(len(xs), "nonzero elems")
    seen = 0
    for x, y in zip(xs, ys):
        seen += 1
        if seen % 1000 == 0:
            print(seen)
        val = noncat_matrix[x, y]
        has_fraction = val - math.floor(val) != 0.0
        if has_fraction:
            size = abs(val)
            for i in range(20):
                product = size * i
                ceiling = math.ceil(product)
                if abs(product - ceiling) < 0.001:
                    X[x, 2 * y] = ceiling
                    X[x, 2 * y + 1] = i
    return X
示例#22
0
def make_adjacency_matrix(img):
    """Build a boolean pixel adjacency matrix for a 2D image.

    Every truthy interior pixel is linked to each truthy pixel of its 3x3
    neighbourhood (itself included). Pixels on the image border are never
    used as edge sources.

    :param img: 2D array indexed as img[row, col]
    :returns: dok_matrix of shape (rows * cols, rows * cols), dtype bool;
        indices come from to_index(cols, row, col)
    """
    rows, cols = img.shape
    pixel_count = rows * cols
    adjacency = dok_matrix((pixel_count, pixel_count), dtype=bool)

    # All nine (dy, dx) offsets of the 3x3 neighbourhood.
    offsets = list(itertools.product([0, 1, -1], [0, 1, -1]))
    interior = itertools.product(range(1, rows - 1), range(1, cols - 1))
    for row, col in interior:
        if not img[row, col]:
            continue

        for y_diff, x_diff in offsets:
            n_row, n_col = row + y_diff, col + x_diff
            if img[n_row, n_col]:
                adjacency[to_index(cols, row, col),
                          to_index(cols, n_row, n_col)] = True
    return adjacency
示例#23
0
    def __init__(self, resource_mat_file, entity_map_file, property_map_file,
                 relations_file, whitelist):
        """
        Load the resource with the restricted set of edge types according to the whitelist

        :param resource_mat_file: not used by this constructor
        :param entity_map_file: passed to load_map to get the term <-> id maps
        :param property_map_file: passed to load_map to get the property <-> id maps
        :param relations_file: passed to load_edges
        :param whitelist: The list of allowed edge types
        """

        # Load the properties; the id -> name map is rebuilt after filtering.
        name_to_id, _ = load_map(property_map_file, None)

        # Filter according to the whitelist
        whitelisted = {clean(prop) for prop in whitelist}
        prop_to_id = {}
        id_to_prop = {}
        for prop, prop_id in name_to_id.items():
            if prop in whitelisted:
                prop_to_id[prop] = prop_id
                id_to_prop[prop_id] = prop

        self.prop_to_id, self.id_to_prop = prop_to_id, id_to_prop

        # Reversed edges are allowed if any kept property shows up as
        # '<-prop-' in the whitelist once '$' and '^' are stripped.
        edge_types = [
            edge_type.replace('$', '').replace('^', '')
            for edge_type in whitelist
        ]
        self.allow_reversed_edges = any(
            '<-' + prop + '-' in edge_types for prop in self.prop_to_id)

        # Load the edges for the specific properties
        self.l2r_edges, self.r2l_edges = load_edges(relations_file, None,
                                                    prop_to_id.values())

        # Load the entities
        self.term_to_id, self.id_to_term = load_map(entity_map_file, None)

        # Build the restricted adjacency matrix from the left-to-right edges.
        side = len(self.term_to_id)
        m = dok_matrix((side, side), dtype=np.int16)
        for x, targets in self.l2r_edges.items():
            for y in targets:
                m[x, y] = 1
        self.adjacency_matrix = m.tocsr()

        if self.allow_reversed_edges:
            self.adjacency_matrix = self.adjacency_matrix + self.adjacency_matrix.T
 def __init__( self , size , dim , m=np.array([]) , Consts=1.0 , f_inter=None ):
     """Set up the per-particle buffers of the constrained linear spring.

     All arguments are forwarded to the parent constructor first.

     :param size: number of rows of every buffer allocated here
     :param dim: number of columns of the A and F buffers
     :param m: masses; passed to set_masses() when it is not empty
     :param Consts: stored as the spring constant K
     :param f_inter: forwarded to the parent constructor
     """
     super( LinearSpringConstrained , self ).__init__( size , dim , m , Consts , f_inter=f_inter )

     self.__size = size
     self.__dim = dim
     self.__K = Consts

     # Dense (size x dim) buffers.
     per_particle = ( size , dim )
     self.__A = np.zeros( per_particle )
     self.__F = np.zeros( per_particle )

     # Sparse (size x size) buffers.
     pairwise = ( size , size )
     self.__Fm = dok.dok_matrix( pairwise )
     self.__Fm2 = csr.csr_matrix( pairwise )

     # Mass column, filled in only when masses were supplied.
     self.__M = np.zeros( ( size , 1 ) )
     if len(m) != 0 :
         self.set_masses( m )
示例#25
0
文件: util.py 项目: GRSEB9S/region
def scipy_sparse_matrix_from_dict(neighbors):
    """
    Convert a neighbors dict into a sparse adjacency matrix.

    Areas are numbered by the sorted order of the dict's keys; entry
    ``[i, j]`` is set to 1 when area ``j`` is listed as a neighbor of
    area ``i``.

    Parameters
    ----------
    neighbors : dict
        Each key represents an area. The corresponding value contains the
        area's neighbors.

    Returns
    -------
    adj : :class:`scipy.sparse.csr_matrix`
        Adjacency matrix representing the areas' contiguity relation.

    Examples
    --------
    >>> neighbors = {0: {1, 3}, 1: {0, 2, 4}, 2: {1, 5},
    ...              3: {0, 4}, 4: {1, 3, 5}, 5: {2, 4}}
    >>> obtained = scipy_sparse_matrix_from_dict(neighbors)
    >>> desired = np.array([[0, 1, 0, 1, 0, 0],
    ...                     [1, 0, 1, 0, 1, 0],
    ...                     [0, 1, 0, 0, 0, 1],
    ...                     [1, 0, 0, 0, 1, 0],
    ...                     [0, 1, 0, 1, 0, 1],
    ...                     [0, 0, 1, 0, 1, 0]])
    >>> (obtained.todense() == desired).all()
    True
    >>> neighbors = {"left": {"middle"},
    ...              "middle": {"left", "right"},
    ...              "right": {"middle"}}
    >>> obtained = scipy_sparse_matrix_from_dict(neighbors)
    >>> desired = np.array([[0, 1, 0],
    ...                     [1, 0, 1],
    ...                     [0, 1, 0]])
    >>> (obtained.todense() == desired).all()
    True
    """
    # Position of every area name in sorted order.
    index_of = {}
    for position, area_name in enumerate(sorted(neighbors)):
        index_of[area_name] = position

    size = len(neighbors)
    adj = dok_matrix((size, size))
    for area, adjacent_areas in neighbors.items():
        row = index_of[area]
        for other in adjacent_areas:
            adj[row, index_of[other]] = 1
    return adj.tocsr()
示例#26
0
文件: util.py 项目: knaaptime/region
def scipy_sparse_matrix_from_dict(neighbors):
    """
    Build the sparse adjacency matrix described by a neighbors dict.

    The row/column index of an area is its rank among the sorted dict keys.
    For every area ``a`` and each of its neighbors ``b`` the entry
    ``[index(a), index(b)]`` is set to 1.

    Parameters
    ----------
    neighbors : dict
        Each key represents an area. The corresponding value contains the
        area's neighbors.

    Returns
    -------
    adj : :class:`scipy.sparse.csr_matrix`
        Adjacency matrix representing the areas' contiguity relation.

    Examples
    --------
    >>> neighbors = {0: {1, 3}, 1: {0, 2, 4}, 2: {1, 5},
    ...              3: {0, 4}, 4: {1, 3, 5}, 5: {2, 4}}
    >>> obtained = scipy_sparse_matrix_from_dict(neighbors)
    >>> desired = np.array([[0, 1, 0, 1, 0, 0],
    ...                     [1, 0, 1, 0, 1, 0],
    ...                     [0, 1, 0, 0, 0, 1],
    ...                     [1, 0, 0, 0, 1, 0],
    ...                     [0, 1, 0, 1, 0, 1],
    ...                     [0, 0, 1, 0, 1, 0]])
    >>> (obtained.todense() == desired).all()
    True
    >>> neighbors = {"left": {"middle"},
    ...              "middle": {"left", "right"},
    ...              "right": {"middle"}}
    >>> obtained = scipy_sparse_matrix_from_dict(neighbors)
    >>> desired = np.array([[0, 1, 0],
    ...                     [1, 0, 1],
    ...                     [0, 1, 0]])
    >>> (obtained.todense() == desired).all()
    True
    """
    ordered_names = sorted(neighbors)
    rank = dict(zip(ordered_names, range(len(ordered_names))))

    n_areas = len(neighbors)
    adj = dok_matrix((n_areas, n_areas))
    for source, targets in neighbors.items():
        source_rank = rank[source]
        for target in targets:
            adj[source_rank, rank[target]] = 1
    return adj.tocsr()
示例#27
0
    def __init__(self, args, i2w, w2i, subvecs_num, w2counts, sum_word_counts,
                 stopwords, embeddings):
        '''
        Stores the vocabulary data and allocates the (mostly sparse) matrices
        that hold up to subvecs_num contexts.

        :param args: object whose bow_size attribute is read here
        :param i2w: stored as self.i2w
        :param w2i: stored as self.w2i; its length is the vocabulary dimension
        :param subvecs_num: number of rows allocated in every matrix
        :param w2counts: stored as self.w2counts
        :param sum_word_counts: stored as self.sum_word_counts
        :param stopwords: stored as self.stopwords
        :param embeddings: when not None the bow matrix is dense and sized by
                           embeddings.dimension()
        '''

        self.args = args
        self.w2i = w2i
        self.i2w = i2w
        self.w2counts = w2counts
        self.sum_word_counts = sum_word_counts
        self.stopwords = stopwords

        self.contexts = []
        self.sim_scores = None  # points either to self.subvecs_sim_scores or to self.bow_sim_scores

        initial_sim_score = 1.0 if subvecs_num == 0 else 1.0 / subvecs_num

        def uniform_scores():
            # (subvecs_num x 1) CSR column with every entry set to initial_sim_score
            return dok_matrix(
                [[initial_sim_score] * subvecs_num]).tocsr().transpose()

        self.embeddings = embeddings  # when this is not None the bow representation is dense (todo: refactor this code)
        self.bow_size = args.bow_size
        # The bow_* attributes are only created when bow_size is non-negative.
        if self.bow_size >= 0:
            # Identity test: '!= None' would call a user-defined __ne__ on the embeddings object.
            if self.embeddings is not None:
                bow_dimensionality = self.embeddings.dimension()
                # estimate sim of contexts based on their BOW rep
                self.bow_matrix = np.zeros(
                    (subvecs_num, bow_dimensionality), dtype=np.float32)
                self.bow_L2_norms = None  # we always keep them normalized
            else:
                bow_dimensionality = len(w2i)
                # estimate sim of contexts based on their BOW rep
                self.bow_matrix = dok_matrix(
                    (subvecs_num, bow_dimensionality), dtype=np.float32)
                self.bow_L2_norms = dok_matrix((subvecs_num, 1),
                                               dtype=np.float32)
            self.bow_sim_scores = uniform_scores()

        # used for sim weights calculation, also for sub average only if no dual matrix
        self.subs_matrix = dok_matrix(
            (subvecs_num, len(w2i)), dtype=np.float32)
        self.subvecs_L2_norms = dok_matrix((subvecs_num, 1), dtype=np.float32)
        self.subvecs_sim_scores = uniform_scores()

        self.target_counts = {}
示例#28
0
def read_matrix(path):
    """Read a MatrixMarket coordinate file into a sparse matrix.

    The first line (the ``%%MatrixMarket`` banner) is always skipped. After
    it, ``%`` comment lines and blank lines are ignored; the first remaining
    line is the size line (``rows cols [nnz]``) and every later line is a
    1-based ``row col value`` triple of integers.

    :param path: path of the file to read
    :returns: the matrix in CSR format, dtype int16
    :raises ValueError: if the file has no size line
    """
    m = None
    with open(path) as f:
        next(f, None)  # banner line carries no matrix data
        for line in f:
            tokens = line.split()
            # MatrixMarket allows any number of '%' comment lines
            if not tokens or tokens[0].startswith('%'):
                continue
            if m is None:
                dim_x, dim_y = int(tokens[0]), int(tokens[1])
                m = dok_matrix((dim_x, dim_y), dtype=np.int16)
                continue
            x, y, v = [int(t) for t in tokens]
            m[x - 1, y - 1] = v  # file indices are 1-based
    if m is None:
        raise ValueError("no size line found in MatrixMarket file %s" % path)
    return m.tocsr()
示例#29
0
def read_matrix(path):
    """Read a MatrixMarket coordinate file into a sparse matrix.

    Line 1 (the banner) is skipped unconditionally. Blank lines and lines
    starting with ``%`` are ignored. The first data line gives the
    dimensions (``rows cols [nnz]``); each following data line is an integer
    ``row col value`` triple with 1-based indices.

    :param path: path of the file to read
    :returns: the matrix in CSR format, dtype int16
    :raises ValueError: if no dimension line is present
    """
    matrix = None
    with open(path) as f:
        for line_no, line in enumerate(f):
            if line_no == 0:
                continue  # banner
            fields = line.split()
            if not fields or fields[0].startswith('%'):
                continue  # comment or empty line
            if matrix is None:
                # dimension line: only the first two numbers are needed
                dim_x, dim_y = int(fields[0]), int(fields[1])
                matrix = dok_matrix((dim_x, dim_y), dtype=np.int16)
            else:
                x, y, v = [int(t) for t in fields]
                matrix[x - 1, y - 1] = v  # convert to 0-based indices
    if matrix is None:
        raise ValueError("no size line found in MatrixMarket file %s" % path)
    return matrix.tocsr()
    def __init__(self, size, dim, m=np.array([]), Consts=1.0, f_inter=None):
        """Forward the arguments to the parent class and allocate work buffers.

        :param size: number of rows of every buffer (also both sides of the
            square sparse buffers)
        :param dim: number of columns of the dense ``__A`` / ``__F`` buffers
        :param m: masses; passed to ``set_masses`` when not ``None`` and
            non-empty
        :param Consts: stored as ``__K`` and forwarded to the parent
        :param f_inter: forwarded unchanged to the parent constructor
        """
        parent = super(LinearSpringConstrained, self)
        parent.__init__(size, dim, m, Consts, f_inter=f_inter)

        self.__dim = dim
        self.__size = size

        self.__K = Consts

        dense_shape = (size, dim)
        square_shape = (size, size)

        # dense, zero-initialised (size x dim) buffers
        self.__A = np.zeros(dense_shape)
        self.__F = np.zeros(dense_shape)

        # empty sparse (size x size) buffers
        self.__Fm = dok.dok_matrix(square_shape)
        self.__Fm2 = csr.csr_matrix(square_shape)

        self.__M = np.zeros((size, 1))
        if m is None or len(m) == 0:
            return
        self.set_masses(m)
示例#31
0
def cluster_subvec_file(w2i, cluster_prunning, K, ninit, maxiter, min_avg_cluster_size, subvec_filename, cluster_filename):
    '''
    kmeans clustering of subvecs given in an input file

    Does nothing when the output file already exists. Both files are managed
    by ``with`` blocks so they are closed even if loading or clustering fails.
    NOTE(review): ``i2w`` is not a parameter; it is resolved as a module-level
    name and must be defined by the surrounding module.

    :param w2i: word2index
    :param cluster_prunning: max size of a cluster centroid
    :param K: number of clusters
    :param ninit: number of repeating tries
    :param maxiter: number of clustering iterations
    :param min_avg_cluster_size: min size of clusters (on average)
    :param subvec_filename: input filename
    :param cluster_filename: output filename
    :returns: None
    '''

    if os.path.exists(cluster_filename):
        print("NOTICE: cluster file %s already exists. skipping." % cluster_filename)
        return

    with open(subvec_filename, 'r') as subvec_file:
        subvec_num = sum(1 for _ in subvec_file)/2 #subvec is on every second line
        subvec_file.seek(0)

        minK = min(subvec_num/min_avg_cluster_size, K)
        minK = max(1, minK)

        # the output file is still created before the input is loaded
        with open(cluster_filename, 'w') as cluster_file:
            print("Clustering subvecs in file %s. Using K=%d\n" % (cluster_filename, minK))

            target = subvec_filename[subvec_filename.rfind('/')+1:]
            subs_matrix = dok_matrix((subvec_num, len(w2i)), dtype=np.float32)

            line = 0
            try:
                # read_context signals the end of the input with EOFError
                while True:
                    context_inst, subvec = read_context(subvec_file)
                    normalize_subvec(subvec)
                    for word, weight in subvec:
                        if (weight != 0):
                            subs_matrix[line, w2i[word]] = weight
                    line += 1
                    if line % 10000 == 0:
                        sys.stderr.write("Read %d subvecs\n" % (line))
            except EOFError:
                sys.stderr.write("Finished loading %d context lines\n" % line)

            subs_matrix = subs_matrix.tocsr()

            best_centroids = None
            best_inertia = None

            for _ in xrange(0, ninit):
                # one k-means step at a time so the centroids can be
                # re-normalized after every iteration
                kmeans = KMeans(init='k-means++', n_clusters=minK, n_init=1, max_iter=1)
                kmeans.fit(subs_matrix)
                centroids = kmeans.cluster_centers_
                normalize_centroids(centroids)
                for _ in xrange(1, maxiter):
                    kmeans = KMeans(init=centroids, n_clusters=minK, n_init=1, max_iter=1)
                    kmeans.fit(subs_matrix)
                    centroids = kmeans.cluster_centers_
                    normalize_centroids(centroids)
                inertia = kmeans.inertia_

                # keep the try with the lowest inertia
                if best_centroids is None or inertia < best_inertia:
                    best_inertia = inertia
                    best_centroids = centroids

            for j in xrange(0, len(best_centroids)):
                cluster_vec = [(i2w[i], weight) for (i, weight) in enumerate(best_centroids[j,:]) if weight != 0]
                # keep only the cluster_prunning heaviest words, then re-normalize
                cluster_vec = sorted(cluster_vec, key=itemgetter(1), reverse=True)[:cluster_prunning]
                norm = sum([weight**2 for word, weight in cluster_vec])**0.5
                cluster_vec = [(word, weight/norm) for word, weight in cluster_vec]
                # recomputed only to report it in the header line
                norm = sum([weight**2 for word, weight in cluster_vec])**0.5
                cluster_file.write(target + "\t" + str(j) + "\t0\t" + target + "\tCLUSTER\t norm verified = " + '{0:1.8f}'.format(norm) + "\tpruning factor = " + str(cluster_prunning) +"\n")
                for (word, weight) in cluster_vec:
                    cluster_file.write(' '.join([word, '{0:1.8f}'.format(weight)])+'\t')
                cluster_file.write('\n')
示例#32
0
def findShortest(image, original_image):
    """Paint a weighted shortest path between two fixed pixels onto an image.

    A directed graph with one node per pixel index is built. Every pixel gets
    an edge to itself and to each in-bounds 8-neighbour; the weight depends
    only on the destination pixel of ``image``: 1 when it is truthy, 255 for
    the self-edge of a truthy pixel, 10 when it is falsy. The path runs from
    row 1 to row ``height - 1`` in the middle column.

    :param image: 2-D array indexed as ``image[row, col]``; truthiness picks
        the edge weight
    :param original_image: 3-D array ``(height, width, channels)``; it is
        modified in place (channels 0 and 1 of the chosen pixels are set to 5)
    :returns: ``original_image`` (the same object that was passed in)

    Side effects: prints one pixel index and the node list of the path.
    NOTE(review): node indices use ``image.shape[1]`` while the loop bounds
    come from ``original_image.shape`` - presumably both have the same height
    and width; confirm with the caller.
    """
    # Colour image that receives the painted path
    original_img = original_image
    # Flat image used for graph building
    img = image

    # Defines a translation from 2 coordinates to a single number, y = height x = width
    def to_index(y, x):
        return y * img.shape[1] + x

    # Defines a reversed translation from index to 2 coordinates
    # (true division: the row comes back as a float on Python 3, callers
    # wrap it in int())
    def to_coordinates(index):
        return index / img.shape[1], index % img.shape[1]

    # A sparse adjacency matrix.
    # NOTE(review): allocated but never written to or read below.

    adjacency = dok_matrix(
        (img.shape[0] * img.shape[1], img.shape[0] * img.shape[1]),
        dtype=np.uint8)
    # All 9 (y_diff, x_diff) offsets, including (0, 0) for the pixel itself
    directions = list(itertools.product([0, 1, -1], [0, 1, -1]))
    height, width, channels = original_img.shape

    # We create a graph the size of our image
    G2 = nx.DiGraph()
    for i in range(width * height):
        G2.add_node(i)

    # These loops create the edges. The weight is chosen from the destination
    # pixel only: truthy destination -> 1, falsy destination -> 10, and the
    # self-edge of a truthy pixel -> 255 to discourage loops.
    for i in range(0, height):
        for j in range(0, width):
            for y_diff, x_diff in directions:
                # skip neighbours that fall outside the image
                if i + y_diff < 0 or i + y_diff > height - 1 or j + x_diff < 0 or j + x_diff > width - 1:
                    continue
                if img[i + y_diff, j + x_diff]:
                    if to_index(i, j) == to_index(i + y_diff, j + x_diff):
                        # self-edge of a truthy pixel

                        G2.add_edge(to_index(i, j),
                                    to_index(i + y_diff, j + x_diff),
                                    weight=255)
                    else:
                        # edge into a truthy neighbour
                        G2.add_edge(to_index(i, j),
                                    to_index(i + y_diff, j + x_diff),
                                    weight=1)
                else:
                    # edge into a falsy pixel (this also covers the self-edge
                    # of a falsy pixel)
                    G2.add_edge(to_index(i, j),
                                to_index(i + y_diff, j + x_diff),
                                weight=10)
                # The string below is disabled code kept as a no-op expression.
                """if i == j:
                    #adjacency[to_index(i, j), to_index(i, j)] = 255
                    G2.add_edge(to_index(i, j), to_index(i + y_diff, j + x_diff), weight=255)"""

    # Fixed end points: middle column, row 1 and the last row
    source = to_index(1, int((width / 2)))
    target = to_index(height - 1, int((width / 2)))

    # debug output: index of a pixel in the last row, an eighth of the way across
    print(to_index(height - 1, int((width / 2) * 0.25)))
    # The string below is disabled code kept as a no-op expression.
    """for n, nbrsdict in G.adjacency_iter():
        for nbr,eattr in nbrsdict.items():
            if 'weight' in eattr:
                (n, nbr, eattr['weight'])"""

    # `path` is the list of node indices from source to target
    path = nx.shortest_path(G2, source, target, weight='weight')
    print(path)

    # The string below is disabled code kept as a no-op expression.
    """M, predecessors = dijkstra(m, directed=False,
                               unweighted=False, return_predecessors=True) # indices = source,
    """
    # Constructs the path between source and target
    pixel_index = int(target)
    pixels_path = []

    # NOTE(review): `path` is a node list, yet it is indexed here by a pixel
    # index as if it were a predecessor table. The walk stops at `source` or
    # at the first IndexError - looks like a leftover from the dijkstra
    # variant above; confirm before relying on it.
    while pixel_index != source:
        try:
            pixels_path.append(pixel_index)
            pixel_index = path[pixel_index]
        except IndexError:
            print(pixel_index)
            break

    # The real path nodes are appended after whatever the walk collected
    for i in path:
        pixels_path.append(i)
    pixels_path.append(target)
    # NOTE(review): overwrites the first entry, so pixel index 0 is painted
    # as well - intent unclear.
    pixels_path[0] = 0
    # The following code is just for debugging and it visualizes the chosen path
    for pixel_index in pixels_path:
        try:
            i, j = to_coordinates(pixel_index)
            i = int(i)
            j = int(j)
            # mark the pixel by setting channels 0 and 1 to 5
            original_img[i, j, 0] = original_img[i, j, 1] = 5
        except IndexError:
            break
    return original_img
 def __init__ ( self , pset=None ):
     """Create the 1x1 sparse byte matrix ``__S`` and optionally store *pset*.

     :param pset: when not ``None`` it is assigned to ``self.pset``
     """

     self.__S = dok.dok_matrix( (1,1) , dtype=np.byte )

     # Identity test: ``!= None`` would go through a user-defined __ne__ and
     # can yield a non-boolean result (e.g. for array-like objects).
     if pset is not None :
         self.pset = pset
def allpairsmaxminpath(imgPassed):
    """Return the length of the longest shortest path between hull points.

    Steps:
      1. Canny edges of the uint8 copy of ``imgPassed`` -> contours -> convex
         hull of every contour; duplicate hull points are dropped.
      2. The image is dilated with a 5x5 kernel and the hull points are
         written into it, so every hull point is a non-zero pixel.
      3. Non-zero interior pixels are linked to their non-zero neighbours
         (border rows/columns are never used as edge sources).
      4. For every pair of hull points the unweighted shortest path is
         walked back from the predecessor table; the longest one wins.

    Pairs with no connecting path are skipped.

    :param imgPassed: 2-D image array; it is copied, never modified
    :returns: number of pixels on the longest shortest path, target included
        and source excluded; 0 when no pair is connected

    Side effect: ``plt.imshow`` is called with the Canny output.
    NOTE(review): ``to_index`` is a module-level helper, not defined here -
    presumably it uses the same row width as this image; confirm.
    """
    img2 = np.uint8(np.copy(imgPassed))

    threshold = .5
    # Detect edges using Canny
    canny_output = cv2.Canny(img2, threshold, threshold * 2)

    plt.imshow(canny_output)
    # findContours returns (image, contours, hierarchy) on OpenCV 3 and
    # (contours, hierarchy) on other versions; the contours are always the
    # second-to-last element.
    contours = cv2.findContours(canny_output, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)[-2]

    # Collect the convex hull points of all contours, without duplicates
    hull_points = []
    for contour in contours:
        for point in cv2.convexHull(contour):
            if not any(np.array_equal(point, seen) for seen in hull_points):
                hull_points.append(point)
    hull_points = np.array(hull_points)

    img = np.uint8(np.copy(imgPassed))
    kernel = np.ones((5, 5), np.uint8)
    dilation = cv2.dilate(np.uint8(img), kernel, iterations=1)
    # each hull point is stored as [[x, y]]; force it to be a painted pixel
    for point in hull_points:
        dilation[point[0][1]][point[0][0]] = 3

    img = np.copy(dilation)
    # A sparse adjacency matrix.
    # Two pixels are adjacent in the graph if both are painted.
    adjacency = dok_matrix((img.shape[0] * img.shape[1], img.shape[0] * img.shape[1]), dtype=bool)

    # All 9 offsets, (0, 0) included, so painted pixels also get a self-edge
    directions = list(itertools.product([0, 1, -1], [0, 1, -1]))
    for i in range(1, img.shape[0] - 1):
        for j in range(1, img.shape[1] - 1):
            if not img[i, j]:
                continue

            for y_diff, x_diff in directions:
                if img[i + y_diff, j + x_diff]:
                    adjacency[to_index(i, j), to_index(i + y_diff, j + x_diff)] = True

    hull_indices = [to_index(point[0][1], point[0][0]) for point in hull_points]

    maxDist = 0
    for j, source in enumerate(hull_indices):
        # the predecessor table depends only on the source: compute it once
        # per source instead of once per pair
        _, predecessors = dijkstra(adjacency, directed=False, indices=[source], unweighted=True, return_predecessors=True)

        for target in hull_indices[j:]:
            # walk back from the target until the source is reached
            path_length = 0
            pixel_index = target
            while pixel_index != source and pixel_index >= 0:
                path_length += 1
                pixel_index = predecessors[0, pixel_index]

            # a negative predecessor (-9999) means the target is unreachable
            if pixel_index != source:
                continue

            # if this is now our longest shortest path, keep it
            if path_length > maxDist:
                maxDist = path_length

    return maxDist
示例#35
0
    def __context_text_to_vec(self, context_instance):
        '''
        Builds the bag-of-words vector of the words around the target word.
        The target itself, stopwords and words missing from w2i are ignored;
        when bow_size > 0 only bow_size tokens on each side are considered.
        Each word weighs 1, or its thresholded log-idf when args.tfidf is set.
        :param context_instance: provides get_context_tokens() and target_ind
        :returns: (vector, found_word). Without embeddings the vector is a
                  (len(w2i), 1) dok_matrix of summed weights; with embeddings
                  it is the L2-normalized weighted sum of the word vectors.
                  found_word is True when any word had a non-zero weight
                  (even if that word is missing from the embeddings).
        '''
        found_word = False
        # identity test: '!= None' would go through a user-defined __ne__
        use_embeddings = self.embeddings is not None
        w2ind = self.w2i

        if use_embeddings:
            dimensionality = self.embeddings.dimension()
            text_matrix = np.zeros((dimensionality,), dtype=np.float32)
        else:
            dimensionality = len(self.w2i)
            weight_dtype = np.float32 if self.args.tfidf else np.int8
            text_matrix = dok_matrix((dimensionality,1), dtype=weight_dtype)

        context_text_tokens = context_instance.get_context_tokens()
        target_pos = context_instance.target_ind

        if (self.bow_size > 0):
            # cut a window around the target and re-base the target position
            start_pos = max(target_pos-self.bow_size, 0)
            end_pos = min(target_pos+self.bow_size+1, len(context_text_tokens))
            context_text_tokens = context_text_tokens[start_pos:end_pos]
            target_pos = target_pos-start_pos

        stopwords = self.stopwords
        context_text_inds_left = [w2ind[word] for word in context_text_tokens[:target_pos] if word not in stopwords and word in w2ind]
        # slicing past the end yields an empty list, no bounds check needed
        context_text_inds_right = [w2ind[word] for word in context_text_tokens[target_pos+1:] if word not in stopwords and word in w2ind]

        all_words_inds = context_text_inds_left+context_text_inds_right
        total_weights = 0.0
        for word_ind in all_words_inds:
            w = self.i2w[word_ind]
            if self.args.tfidf:
                wcount = self.w2counts[w]
                log_idf = math.log(float(self.sum_word_counts)/wcount)
                log_idf -= self.args.tfidf_offset
                if (log_idf <= self.args.tfidf_threshold):
                    log_idf = 0.0
                weight = log_idf
            else:
                weight = 1

            if weight != 0:
                found_word = True
                if use_embeddings:
                    if w in self.embeddings:
                        wordvec = self.embeddings.represent(w).transpose()
                        text_matrix = text_matrix + (wordvec * weight)
                    else:
                        # words without an embedding do not count in the average
                        weight = 0.0
                else:
                    text_matrix[word_ind,0] += weight
                total_weights += weight

        # embeddings representations are always normalized
        if use_embeddings:
            if total_weights != 0:
                text_matrix /= total_weights
            norm = np.sqrt(np.sum(text_matrix*text_matrix))
            if norm != 0:
                text_matrix /= norm

        return text_matrix, found_word
示例#36
0
def estimate_ppc_roi(im, tissue_contours, glomeruli_centers, show=False):
    """Draw the ROI on a low magnification image of WSI given contours of the tissue and glomeruli centers in the image.
    Makes use of the Dijkstra approach and skeletonize on the tissue centers to draw the glomeruli. Note that for this
    method it is more beneficial to use lower magnification, such as 0.25, instead of the standard low magnification of
    range 1.25.

    For every tissue contour: the glomeruli inside it are snapped to their nearest skeleton pixel, the two snapped
    points that are farthest apart along the skeleton are connected by the shortest skeleton path, and that path is
    drawn as a 40 pixel wide line clipped to the tissue mask. Tissues with fewer than two glomeruli, or whose snapped
    points are not connected to each other through the skeleton graph, contribute no ROI.

    Source: https://stackoverflow.com/questions/43698577/calculating-the-shortest-path-between-two-points-in-a-bitmap-in
    -python

    Parameters
    ----------
    im : np.ndarray
        RGB image of tissue at low resolution
    tissue_contours : list
        opencv style contours of the tissue in the image
    glomeruli_centers : list
        list of glomeruli (x, y) centers
    show : bool (optional)
        set to True to plot some of the results

    Return
    ------
    roi_contours : list
        opencv style contours of the ROI in the image

    """
    roi_contours = []

    # run through each individual tissue
    for tissue_contour in tissue_contours:
        # draw the tissue contour
        tissue_mask = cv.drawContours(np.zeros(im.shape[:-1]),
                                      [tissue_contour], -1, 1, cv.FILLED)

        # blur the image but force values between 0 and 1
        tissue_mask = cv.GaussianBlur(tissue_mask, (5, 5), 0)
        tissue_mask = (tissue_mask > 0.).astype(np.uint8)

        # find the glomeruli centers that fall within this tissue contour
        tissue_glom_centers = [center for center in glomeruli_centers
                               if tissue_mask[center[1], center[0]]]

        # get the skeleton of the mask
        skeleton = skeletonize(tissue_mask).astype(np.uint8)

        # get the x, y coordinates of the skeleton
        rows, cols = np.where(skeleton)

        # find the closest skeleton point for each glomeruli in tissue
        closest_points = []
        for center in tissue_glom_centers:
            distances = [get_euclidean(center, (x, y))
                         for x, y in zip(cols, rows)]
            # find index of smallest distance
            i = distances.index(min(distances))

            # add the closest point as (x, y)
            closest_points.append([cols[i], rows[i]])

        # need at least 2 glomeruli to draw the roi in a tissue
        if len(closest_points) < 2:
            continue

        # Converting skeleton mask to graph problem to apply Dijkstra method to find shortest path
        def to_index(y, x):
            # translation from 2 coordinates to a single number
            return y * skeleton.shape[1] + x

        def to_coordinates(index):
            # define the reversed translation from index to 2 coordinates
            return index / skeleton.shape[1], index % skeleton.shape[1]

        # build sparse adjacency matrix - two pixels are adjacent in the graph if both are painted
        adjacency = dok_matrix((skeleton.shape[0] * skeleton.shape[1],
                                skeleton.shape[0] * skeleton.shape[1]),
                               dtype=bool)

        # link every interior skeleton pixel to the skeleton pixels around it
        # (the offsets include (0, 0), so each one also gets a self-edge)
        directions = list(itertools.product([0, 1, -1], [0, 1, -1]))
        for i in range(1, skeleton.shape[0] - 1):
            for j in range(1, skeleton.shape[1] - 1):
                if not skeleton[i, j]:
                    continue

                for y_diff, x_diff in directions:
                    if skeleton[i + y_diff, j + x_diff]:
                        adjacency[to_index(i, j),
                                  to_index(i + y_diff, j + x_diff)] = True

        # convert all the closest points (x, y) to single value, these are known as sources
        sources = [to_index(source[1], source[0]) for source in closest_points]

        # calculate the distant matrix from each source to all possible values in image
        dist_matrix, predecessors = dijkstra(adjacency,
                                             directed=False,
                                             indices=sources,
                                             unweighted=True,
                                             return_predecessors=True)

        # find the pair of sources that are farthest away from each other.
        # Unreachable pairs have distance inf (and predecessor -9999); they
        # must not win, or the walk below would never reach its source.
        max_combination = None
        max_distance = -1
        for c in combinations(range(len(closest_points)), 2):
            distance = dist_matrix[c[0], sources[c[1]]]
            # strict '>' keeps the first of several equally distant pairs
            if np.isfinite(distance) and distance > max_distance:
                max_combination = c
                max_distance = distance

        # no two glomeruli of this tissue are connected through the skeleton
        if max_combination is None:
            continue

        # constructs the path between source and target (the pair of sources that are farthest away from each other)
        source = sources[max_combination[0]]
        target = sources[max_combination[1]]
        pixel_index = target
        pixels_path = []
        while pixel_index != source:
            pixels_path.append(pixel_index)
            pixel_index = predecessors[max_combination[0], pixel_index]

        # create a blank mask to draw only the part of the skeleton connecting the source and target
        roi_mask = Image.new('L', (im.shape[1], im.shape[0]))

        # pillow expects (x, y) points
        skeleton_points = []
        for pixel_index in pixels_path:
            i, j = to_coordinates(pixel_index)
            skeleton_points.append((int(j), int(i)))

        # use pillow to draw the line with width
        draw = ImageDraw.ImageDraw(roi_mask)
        draw.line(skeleton_points, fill=255, width=40, joint='curve')
        roi_mask = np.array(roi_mask)

        # the width might be too large so do a bit and operation with the tissue mask to remove edges
        roi_mask = cv.bitwise_and(roi_mask, tissue_mask)

        # extract the roi contours
        roi_contour, _ = cv.findContours(roi_mask, cv.RETR_EXTERNAL,
                                         cv.CHAIN_APPROX_TC89_KCOS)

        # if the points are too close together at low resolution, then there will be no contours to draw, skip these
        if len(roi_contour) > 0:
            # append the first contours
            roi_contours.append(roi_contour[0])

    if show:
        tissue_mask = cv.drawContours(np.zeros(im.shape[:-1]), tissue_contours,
                                      -1, 1, cv.FILLED)
        roi_mask = cv.drawContours(np.zeros(im.shape[:-1]), roi_contours, -1,
                                   1, cv.FILLED)
        im_with_roi = cv.drawContours(im.copy(), roi_contours, -1, [255, 0, 0],
                                      2)

        # plot the original image, tissue mask, and roi_mask, draw the roi contours on original image
        fig, ax = plt.subplots(ncols=3, figsize=(10, 5))
        ax[0].imshow(im_with_roi)
        ax[0].set_title('Image with ROI contours', fontsize=14)
        ax[1].imshow(tissue_mask)
        ax[1].set_title('Tissue Mask', fontsize=14)
        ax[2].imshow(roi_mask)
        ax[2].set_title('ROI Mask', fontsize=14)
        plt.show()

    return roi_contours
示例#37
0
    def __context_text_to_vec(self, context_instance):
        '''
        Builds the bag-of-words vector of the words around the target word.
        The target itself, stopwords and words missing from w2i are ignored;
        when bow_size > 0 only bow_size tokens on each side are considered.
        Each word weighs 1, or its thresholded log-idf when args.tfidf is set.
        :param context_instance: provides get_context_tokens() and target_ind
        :returns: (vector, found_word). Without embeddings the vector is a
                  (len(w2i), 1) dok_matrix of summed weights; with embeddings
                  it is the L2-normalized weighted sum of the word vectors.
                  found_word is True when any word had a non-zero weight
                  (even if that word is missing from the embeddings).
        '''
        found_word = False
        # identity test: '!= None' would go through a user-defined __ne__
        dense = self.embeddings is not None
        w2ind = self.w2i

        if dense:
            dimensionality = self.embeddings.dimension()
            text_matrix = np.zeros((dimensionality, ), dtype=np.float32)
        else:
            dimensionality = len(self.w2i)
            weight_dtype = np.float32 if self.args.tfidf else np.int8
            text_matrix = dok_matrix((dimensionality, 1), dtype=weight_dtype)

        context_text_tokens = context_instance.get_context_tokens()
        target_pos = context_instance.target_ind

        if (self.bow_size > 0):
            # cut a window around the target and re-base the target position
            start_pos = max(target_pos - self.bow_size, 0)
            end_pos = min(target_pos + self.bow_size + 1,
                          len(context_text_tokens))
            context_text_tokens = context_text_tokens[start_pos:end_pos]
            target_pos = target_pos - start_pos

        stopwords = self.stopwords
        context_text_inds_left = [
            w2ind[word] for word in context_text_tokens[:target_pos]
            if word not in stopwords and word in w2ind
        ]
        # slicing past the end yields an empty list, no bounds check needed
        context_text_inds_right = [
            w2ind[word] for word in context_text_tokens[target_pos + 1:]
            if word not in stopwords and word in w2ind
        ]

        all_words_inds = context_text_inds_left + context_text_inds_right
        total_weights = 0.0
        for word_ind in all_words_inds:
            w = self.i2w[word_ind]
            if self.args.tfidf:
                wcount = self.w2counts[w]
                log_idf = math.log(float(self.sum_word_counts) / wcount)
                log_idf -= self.args.tfidf_offset
                if (log_idf <= self.args.tfidf_threshold):
                    log_idf = 0.0
                weight = log_idf
            else:
                weight = 1

            if weight != 0:
                found_word = True
                if dense:
                    if w in self.embeddings:
                        wordvec = self.embeddings.represent(w).transpose()
                        text_matrix = text_matrix + (wordvec * weight)
                    else:
                        # words without an embedding do not count in the average
                        weight = 0.0
                else:
                    text_matrix[word_ind, 0] += weight
                total_weights += weight

        # embeddings representations are always normalized
        if dense:
            if total_weights != 0:
                text_matrix /= total_weights
            norm = np.sqrt(np.sum(text_matrix * text_matrix))
            if norm != 0:
                text_matrix /= norm

        return text_matrix, found_word
示例#38
0
def cluster_subvec_file(w2i, cluster_prunning, K, ninit, maxiter,
                        min_avg_cluster_size, subvec_filename,
                        cluster_filename):
    '''
    kmeans clustering of subvecs given in an input file

    Does nothing when the output file already exists. Both files are managed
    by ``with`` blocks so they are closed even if loading or clustering fails.
    NOTE(review): ``i2w`` is not a parameter; it is resolved as a module-level
    name and must be defined by the surrounding module.

    :param w2i: word2index
    :param cluster_prunning: max size of a cluster centroid
    :param K: number of clusters
    :param ninit: number of repeating tries
    :param maxiter: number of clustering iterations
    :param min_avg_cluster_size: min size of clusters (on average)
    :param subvec_filename: input filename
    :param cluster_filename: output filename
    :returns: None
    '''

    if os.path.exists(cluster_filename):
        print("NOTICE: cluster file %s already exists. skipping." % cluster_filename)
        return

    with open(subvec_filename, 'r') as subvec_file:
        subvec_num = sum(
            1 for _ in subvec_file) / 2  #subvec is on every second line
        subvec_file.seek(0)

        minK = min(subvec_num / min_avg_cluster_size, K)
        minK = max(1, minK)

        # the output file is still created before the input is loaded
        with open(cluster_filename, 'w') as cluster_file:
            print("Clustering subvecs in file %s. Using K=%d\n" % (cluster_filename,
                                                                   minK))

            target = subvec_filename[subvec_filename.rfind('/') + 1:]
            subs_matrix = dok_matrix((subvec_num, len(w2i)), dtype=np.float32)

            line = 0
            try:
                # read_context signals the end of the input with EOFError
                while True:
                    context_inst, subvec = read_context(subvec_file)
                    normalize_subvec(subvec)
                    for word, weight in subvec:
                        if (weight != 0):
                            subs_matrix[line, w2i[word]] = weight
                    line += 1
                    if line % 10000 == 0:
                        sys.stderr.write("Read %d subvecs\n" % (line))
            except EOFError:
                sys.stderr.write("Finished loading %d context lines\n" % line)

            subs_matrix = subs_matrix.tocsr()

            best_centroids = None
            best_inertia = None

            for _ in xrange(0, ninit):
                # one k-means step at a time so the centroids can be
                # re-normalized after every iteration
                kmeans = KMeans(init='k-means++',
                                n_clusters=minK,
                                n_init=1,
                                max_iter=1)
                kmeans.fit(subs_matrix)
                centroids = kmeans.cluster_centers_
                normalize_centroids(centroids)
                for _ in xrange(1, maxiter):
                    kmeans = KMeans(init=centroids,
                                    n_clusters=minK,
                                    n_init=1,
                                    max_iter=1)
                    kmeans.fit(subs_matrix)
                    centroids = kmeans.cluster_centers_
                    normalize_centroids(centroids)
                inertia = kmeans.inertia_

                # keep the try with the lowest inertia
                if best_centroids is None or inertia < best_inertia:
                    best_inertia = inertia
                    best_centroids = centroids

            for j in xrange(0, len(best_centroids)):
                cluster_vec = [(i2w[i], weight)
                               for (i, weight) in enumerate(best_centroids[j, :])
                               if weight != 0]
                # keep only the cluster_prunning heaviest words, then re-normalize
                cluster_vec = sorted(cluster_vec, key=itemgetter(1),
                                     reverse=True)[:cluster_prunning]
                norm = sum([weight**2 for word, weight in cluster_vec])**0.5
                cluster_vec = [(word, weight / norm) for word, weight in cluster_vec]
                # recomputed only to report it in the header line
                norm = sum([weight**2 for word, weight in cluster_vec])**0.5
                cluster_file.write(target + "\t" + str(j) + "\t0\t" + target +
                                   "\tCLUSTER\t norm verified = " +
                                   '{0:1.8f}'.format(norm) + "\tpruning factor = " +
                                   str(cluster_prunning) + "\n")
                for (word, weight) in cluster_vec:
                    cluster_file.write(' '.join([word, '{0:1.8f}'.format(weight)]) +
                                       '\t')
                cluster_file.write('\n')
示例#39
0
# Collapse the first three channels into one plane by element-wise addition;
# the code below treats a pixel as painted when this sum is non-zero.
# NOTE(review): if original_img is uint8 the sum wraps modulo 256 - confirm
# the dtype.
img = original_img[:, :, 0] + original_img[:, :, 1] + original_img[:, :, 2]


# Defines a translation from 2 coordinates to a single number
def to_index(y, x):
    """Flatten a ``(y, x)`` pixel position into a single integer index.

    The index is ``y`` full rows plus ``x``, where the row length is read
    from the module-level ``img`` (``img.shape[1]``).
    """
    row_length = img.shape[1]
    return row_length * y + x


# Defines a reversed translation from index to 2 coordinates
def to_coordinates(index):
    """Convert a flat pixel index back into its ``(y, x)`` coordinates.

    This undoes the ``y * width + x`` flattening, where ``width`` is
    ``img.shape[1]`` of the module-level ``img``.

    Floor division is used on purpose: with the ``/`` operator the row
    would come back as a fraction on Python 3 (e.g. ``7 / 5 == 1.4``)
    instead of the integer row number.

    :param index: flat integer index of a pixel
    :returns: tuple ``(y, x)`` of integer row and column
    """
    return divmod(index, img.shape[1])


# Sparse boolean adjacency matrix indexed by flattened pixel position
# (see to_index). Entry [a, b] is set when pixels a and b are both painted
# and b is reachable from a by one of the offsets in `directions`.
adjacency = dok_matrix((img.shape[0] * img.shape[1],) * 2, dtype=bool)

# Every (row offset, column offset) combination of 0, 1 and -1. That is the
# eight surrounding pixels plus (0, 0), so a painted pixel is also linked to
# itself.
directions = list(itertools.product([0, 1, -1], repeat=2))

# Only interior pixels are used as starting points, which keeps every
# neighbour lookup inside the image bounds.
for i in range(1, img.shape[0] - 1):
    for j in range(1, img.shape[1] - 1):
        if img[i, j]:
            for y_diff, x_diff in directions:
                neighbour = (i + y_diff, j + x_diff)
                if img[neighbour]:
                    adjacency[to_index(i, j), to_index(*neighbour)] = True

# First of two hand-picked pixels that are known to be connected.
source = to_index(14, 47)
示例#40
0
    def avg_contexts(self, ref_subvec, top, top_percent, top_inferences_number,
                     exclude_ref, weights_factor):
        '''
        Performs a weighted average of the rows of self.subs_matrix (one row
        per entry of self.contexts), optionally blended with a reference
        vector.

        Every context is weighted by its entry in self.sim_scores raised to
        the power weights_factor. When the reference vector is included it
        gets a weight of 1. All weights are then divided by their total so
        that they sum to 1.

        :param ref_subvec: reference vector, or None to average the contexts
            only. The argument is not modified; a scaled copy is used.
            Presumably a scipy sparse column vector (its transpose is added
            to the averaged row) -- confirm against the callers.
        :param top: maximum number of vectors to average; the reference
            vector, when included, uses one of these slots. Clipped to the
            number of available vectors.
        :param top_percent: fraction of the available vectors to average.
            Selection is active when top > 0 or top_percent > 0, and the
            larger of the two resulting counts is used; otherwise every
            context is averaged.
        :param top_inferences_number: forwarded unchanged to
            __vec_to_sorted_list together with the averaged vector
        :param exclude_ref: the reference vector is included only when this
            compares equal to False
        :param weights_factor: exponent applied to each similarity score
        :returns: parvec, number of contexts averaged; (None, 0) when there
            are no contexts
        '''

        if len(self.contexts) == 0:
            return None, 0

        # Deliberately an equality test rather than ``not exclude_ref``: only
        # values equal to False include the reference vector, so e.g. None
        # keeps meaning "exclude", exactly as before.
        include_ref = (exclude_ref == False)
        ref_weight = 1 if include_ref else 0
        candidates = len(self.contexts) + ref_weight

        if top > candidates:
            top = candidates

        if top > 0 or top_percent > 0:
            # One slot of each budget is reserved for the reference vector.
            final_top = max(
                top - ref_weight,
                int(math.ceil(top_percent * candidates)) - ref_weight)

            # items() exists on Python 2 and 3; iteritems() is Python 2 only.
            cw_sorted = heapq.nlargest(final_top,
                                       self.sim_scores.todok().items(),
                                       key=lambda x: x[1])
            top_contexts_weights = dok_matrix((len(self.contexts), 1),
                                              dtype=np.float32)

            for (k, j), weight in cw_sorted:
                top_contexts_weights[k, j] = weight**weights_factor

            top_contexts_weights = top_contexts_weights.tocsr()
            contexts_num = len(cw_sorted)

        else:
            contexts_num = len(self.contexts)
            if weights_factor == 0.0:
                # Any score to the power 0 is 1, so use uniform weights.
                top_contexts_weights = dok_matrix([[1.0] * contexts_num
                                                   ]).tocsr().transpose()
            else:
                top_contexts_weights = self.sim_scores.copy()
                top_contexts_weights.data **= weights_factor

        # weight +1 reserved for ref_subvec
        sum_weights = top_contexts_weights.sum() + ref_weight
        top_contexts_weights.data /= sum_weights

        # NOT SUPPORTED IN SCIPY 0.7
        weighted_subs_matrix = self.subs_matrix.multiply(top_contexts_weights)
        avg_subvec = weighted_subs_matrix.sum(axis=0)

        # ``is not None`` instead of ``!= None``: the latter is an
        # element-wise / ambiguous comparison on matrix types.
        if include_ref and ref_subvec is not None:
            # Scale a copy so the caller's vector is left untouched and
            # repeated calls do not compound the scaling.
            scaled_ref = ref_subvec * float(1.0 / sum_weights)
            avg_subvec = avg_subvec + ref_weight * scaled_ref.transpose()

        result_vec = self.__vec_to_sorted_list(avg_subvec,
                                               top_inferences_number)
        return result_vec, contexts_num
    def __init__(self, pset=None):

        self.__S = dok.dok_matrix((1, 1), dtype=np.byte)

        if pset is not None:
            self.pset = pset