def write_sticker_list(elements, filename, pcb):
    """output bom as stickers for each type of component in pdf format"""
    elements_grouped = get_value_list(elements)
    mm_to_pt = 2.835
    file_pointer = filename and open(filename, 'wb') or sys.stdout
    pdf = cairo.PDFSurface(file_pointer,
                           PAGE_WIDTH * mm_to_pt,
                           PAGE_HEIGHT * mm_to_pt)
    gfx = cairo.Context(pdf)

    # Scale user units to millimetres
    gfx.scale(1 / 0.3528, 1 / 0.3528)

    labels = sheet_positions(gfx, LABEL_WIDTH, LABEL_HEIGHT,
                             LABELS_X, LABELS_Y, MARGIN_TOP, MARGIN_LEFT,
                             SPACING_X, SPACING_Y)

    bom = []
    for line in elements_grouped:
        if not ("DO_NOT_PLACE" in line and line['DO_NOT_PLACE'] == "yes"):
            bom_line = Line(line['NAME'], line['VALUE'], line['PACKAGE'],
                            "", "", line['__SIDE'])
            bom.append(bom_line)

    log.debug("number of labels: " + str(len(bom)))

    for line, label in _izip(bom, labels):
        line.render(gfx, (label[0] + 1, label[1]), LABEL_WIDTH - 2, 14)
        pcb.render(gfx, (label[0] + 1, label[1] + 14),
                   LABEL_WIDTH - 2, LABEL_HEIGHT - 14, line.side, line.refs)
        log.debug("adding label at " + str(label) + " for " + str(line.refs))

    log.debug("finishing page")
    gfx.show_page()

def sum(A, *dimtype):
    # accepts sum(A), sum(A, dim), sum(A, type) and sum(A, dim, type)
    restype = 'double'
    dim = 1
    if len(dimtype) == 2:
        dim = dimtype[0]
        dimtype = dimtype[1]
    elif len(dimtype) == 1:
        dimtype = dimtype[0]
        if isinstance(dimtype, str):
            if dimtype == 'native':
                restype = A.dtype
            else:
                restype = dimtype
        else:
            dim = dimtype
    # finally, our internal arrays are 0-based
    dim -= 1
    n = A.msize[dim]
    stride = 1
    for x in A.msize[:dim]:
        stride *= x
    nshp = list(A.msize)
    nshp[dim] = 1
    # the result is an nshp-shaped array, lin. index xrange(_prod(nshp))
    res = []
    # use a separate loop variable so `n` (the summed length) is not clobbered
    all = [slice(0, sz) for sz in A.msize]
    all[dim] = 0
    if dim > 1 and dim == len(nshp) - 1:
        nshp.pop()
    for res_i, el0 in _izip(xrange(_prod(nshp)), _ndilin(A.msize, *all)):
        res.append(_sum(A._a[i] for i in xrange(el0, el0 + n * stride, stride)))
    return _marray(A.dtype, nshp, res)

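The stride walk above is easiest to see in two dimensions. Below is a minimal, hypothetical sketch (plain Python, independent of `_marray`) of summing a flat column-major buffer over a 1-based dimension; `colsum2d` is not part of the library and only illustrates how the stride loop collapses one dimension.

# hypothetical helper, for illustration only: `data` is a flat, column-major
# list and `dim` is the 1-based MATLAB-style dimension to sum over
def colsum2d(data, shape, dim):
    rows, cols = shape
    if dim == 1:
        # sum down each column: contiguous runs of length `rows`
        return [sum(data[c * rows:(c + 1) * rows]) for c in range(cols)]
    else:
        # dim == 2: sum across each row, stepping by the column stride `rows`
        return [sum(data[r::rows]) for r in range(rows)]

# [[1, 3, 5], [2, 4, 6]] stored column-major:
print colsum2d([1, 2, 3, 4, 5, 6], (2, 3), 1)   # [3, 7, 11]
print colsum2d([1, 2, 3, 4, 5, 6], (2, 3), 2)   # [9, 12]
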
def __setitem1__(self, i, val):
    # determine the size of the new array
    nshp = _ndshape1(self.msize, *i)
    i = (isinstance(x, _marray) and iter(x._a) or x for x in i)
    ins = list(_ndilin1(self.msize, *i))
    if _isscalar(val):
        if hasattr(val, '__len__'):
            val = _cycle(val)
        else:
            val = _repeat(val)
    for j, v in _izip(ins, val):
        self._a[j] = v

def paths(in_snot, base_path="$"):
    def join_lists(a, b):
        return a + b
    if hasattr(in_snot, iteritems_attr):
        return reduce(join_lists,
                      [paths(v, base_path + "[" + k + "]")
                       for k, v in getattr(in_snot, iteritems_attr)()])
    elif isinstance(in_snot, list):
        return reduce(join_lists,
                      [paths(v, base_path + "[" + str(k) + "]")
                       for k, v in _izip(_count(), in_snot)])
    else:
        return [str(base_path)]

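A hedged note on what the traversal above produces, assuming the module-level `iteritems_attr` names the dict iterator method (e.g. 'iteritems' on Python 2 dicts); ordering follows dict iteration order:

#   paths({'a': [10, 20], 'b': 3})
#   -> ['$[a][0]', '$[a][1]', '$[b]']
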
def __ge__(self, other):
    from array import array
    if _isscalar(other):
        if hasattr(other, '__len__'):
            other = _cycle(other)
        else:
            other = _repeat(other)
    na = _marray('bool', self.msize,
                 array(_dtype2array['bool'],
                       (x >= y for x, y in _izip(self, other))))
    return na

def _splitter(stringiterable, name=_unknownname):
    if name is _unknownname and hasattr(stringiterable, 'name'):
        name = stringiterable.name
    return iter(tdtokens(name, line, *text__tokens)
                for line, text__tokens in _izip(
                    _count(1),
                    iter((text, _str_split(text),) for text in stringiterable))
                if text__tokens[1])

def __elmul__(self, him):
    from array import array
    if _isscalar(him):
        if hasattr(him, '__len__'):
            him = _cycle(him)
        else:
            him = _repeat(him)
    na = _marray(self.dtype, self.msize,
                 array(_dtype2array[self.dtype],
                       (x * y for x, y in _izip(self._a, him))))
    return na

def times(A, B):
    print 'times', A, B
    if hasattr(A, 'val'):
        A = A.val
    if hasattr(B, 'val'):
        B = B.val
    if not hasattr(A, '__len__'):
        if not hasattr(B, '__len__'):
            return A * B
        else:
            A = _repeat(A, len(B))
    elif not hasattr(B, '__len__'):
        B = _repeat(B, len(A))
    res = [a * b for a, b in _izip(A, B)]
    return res

def _dot(A, B):
    """Dot product."""
    M, N = A.msize[0], B.msize[1]
    MN = M * N
    na = _marray(_typegreater(A.dtype, B.dtype), (M, N))
    cols = (_islice(B._a, i * M, (i + 1) * M) for i in _cycle(xrange(N)))
    rows = _cycle(_islice(A._a, i, MN + i, M) for i in xrange(M))
    # fill in the result in the FORTRAN order
    for i in xrange(MN):
        col = cols.next()
        for j in xrange(N):
            s = sum(a * b for a, b in _izip(rows.next(), col))
            na._a[i] = s
    return na

def _ndilin1(shp, *i):
    """Generator of linear indices into an array of shape `shp`. Indices are
    specified by slices of indices in `i`. The input index is base 1, the
    linear indices returned are base 0. This function produces indices,
    therefore the output type is 'int'."""
    cp = [1]
    for x in shp[:-1]:
        cp += [cp[-1] * x]
    i = list(i)
    for j, x in enumerate(i):
        if isinstance(x, _mslice) and x.hasnoend():
            i[j] = x.evaluate_end(shp[j])
    for x in _ndi1(*i):
        yield int(_sum(c * (ind - 1) for c, ind in _izip(cp, x)))

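The cumulative-products formula used in the last line is worth seeing on its own. A hypothetical standalone helper (not part of the module) that maps base-1 subscripts to a base-0 column-major linear index with the same formula:

def lin_index1(shape, index1):
    # base-1 subscripts -> base-0 column-major linear index:
    #   lin = sum(cp[j] * (index1[j] - 1)),  cp = [1, m1, m1*m2, ...]
    cp = [1]
    for s in shape[:-1]:
        cp.append(cp[-1] * s)
    return sum(c * (i - 1) for c, i in zip(cp, index1))

print lin_index1((2, 3), (2, 3))   # 5, the last element of a 2x3 array
print lin_index1((2, 3), (1, 2))   # 2, i.e. row 1 of column 2
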
def __init__(self, name, time, colnames, data, embryospergene=None):
    self.name = name
    self.time = time
    self.column_names = colnames
    self.column_index = _OD(_izip(colnames, _count()))
    self.pos = None
    self.Npos = None
    self.pos_other = dict()
    if ("x" in self.column_index and "y" in self.column_index
            and "z" in self.column_index):
        self.pos = _np.vstack(
            [data[:, self.column_index[j]] for j in ["x", "y", "z"]]).T
    if ("Nx" in self.column_index and "Ny" in self.column_index
            and "Nz" in self.column_index):
        self.Npos = _np.vstack(
            [data[:, self.column_index[j]] for j in ["Nx", "Ny", "Nz"]]).T
    self.data = data
    self.embryospergene = embryospergene

def __setitem__(self, k, rval):
    """ x.__setitem__(k, rval) <==> x[k]=rval """
    if isinstance(k, slice):
        start, stop, step = k.indices(len(self._reflist))
        if k.step is None:
            # simple slice
            irval = iter(rval)
            injlim = start
            for p in xrange(start, stop):
                try:
                    el = irval.next()
                    self._reflist[p].value = el
                    injlim = p + 1
                except StopIteration:
                    # rval ran out early: detach and delete the leftover refs
                    for q in xrange(p, stop):
                        self._reflist[q]._parent = None
                    del self._reflist[p:stop]
                    break
            else:
                # rval was at least as long as the slice: inject the remainder
                inject = [_Ref(el, self) for el in irval]
                if inject:
                    self._reflist = (self._reflist[:injlim] + inject
                                     + self._reflist[injlim:])
        else:
            # extended slice
            if hasattr(rval, '__len__'):
                seq = rval
            else:
                seq = list(rval)
            if step > 0:
                n_elt = max((stop - start + step - 1) / step, 0)
            else:
                n_elt = max((start - stop - step - 1) / (-step), 0)
            if len(seq) != n_elt:
                raise ValueError("attempt to assign sequence of size %d to "
                                 "extended slice of size %d"
                                 % (len(seq), n_elt))
            for p, el in _izip(xrange(start, stop, step), seq):
                self._reflist[p].value = el
    else:
        self._reflist[k].value = rval

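A quick, illustrative check of the element-count formula used in the extended-slice branch above (Python 2 integer division); it agrees with the length of the corresponding range:

start, stop, step = 2, 11, 3
n_elt = max((stop - start + step - 1) / step, 0)
print n_elt, len(range(start, stop, step))   # 3 3
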
def mcat(i):
    """Concatenate a list of matrices into a single matrix using separators
    ',' and ';'. The ',' means horizontal concatenation and the ';' means
    vertical concatenation.
    """
    if i is None:
        return marray()
    # calculate the shape
    rows = [[]]
    final_rows = 0
    final_cols = 0
    crows = ccols = 0
    pos = []
    pos2 = []
    for x in i:
        #if x == ';':
        if x is Ellipsis:
            rows.append([])
            if final_cols > 0 and final_cols != ccols:
                error("Incompatible shapes!")
            else:
                final_cols = ccols
            final_rows += crows
            ccols = 0
            pos.append(Ellipsis)
        else:
            shp = x.msize
            if len(shp) < 1:
                shp = [0]
            if len(shp) < 2:
                shp += [0]
            rows[-1].append(shp[0])
            pos.append((slice(final_rows, final_rows + shp[0]),
                        slice(ccols, ccols + shp[1])))
            crows = shp[0]
            ccols += shp[1]
    if final_cols > 0 and final_cols != ccols:
        error("Incompatible shapes!")
    else:
        final_cols = ccols
    final_rows += crows

    out = empty((final_rows, final_cols), 'double')
    for sl, x in _izip(pos, i):
        if x is not Ellipsis:
            if isinstance(x, _marray):
                x = x._a.T
            out._a.reshape(final_cols, final_rows).T.__setitem__(sl, x)
    return out

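As the loop above shows, this port uses Python's `Ellipsis` object to stand in for the ';' row separator. A hedged, illustrative sketch of how a caller would build a block matrix, assuming `a`, `b`, `c`, `d` are existing 1x2 `_marray` values:

# illustrative only -- a, b, c, d are assumed to be 1x2 matrices:
#   mcat([a, b])                   -> 1x4 row      [a b]
#   mcat([a, b, Ellipsis, c, d])   -> 2x4 matrix   [a b; c d]
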
def _populate_py(in_snot, in_vector, begin=0):
    num_consumed = 0
    the_iter = None
    if hasattr(in_snot, iteritems_attr):
        the_iter = getattr(in_snot, iteritems_attr)()
    elif isinstance(in_snot, list):
        the_iter = _izip(_count(), in_snot)
    else:
        pass  # should not descend into scalars.
    for key, val in the_iter:
        if hasattr(val, iteritems_attr) or isinstance(val, list):
            num_consumed += _populate_py(val, in_vector, begin + num_consumed)
        else:
            in_snot[key] = in_vector[begin + num_consumed]
            num_consumed += 1
    return num_consumed

def resolve(self, arg_types, disable_caching=False):
    '''Resolve the overload for the given parameter types.

    The return value is the matching function, or None if the overload
    cannot be resolved; if the result is ambiguous, an AmbiguousFunctions
    exception is raised.
    '''
    result = None
    candidates = []
    try:
        for candidate in self._functions[len(arg_types)]:
            for arg, argtype in _izip(arg_types, candidate[1]):
                if not issubclass(arg, argtype):
                    break
            else:
                candidates.append(candidate)
    except KeyError:
        pass
    if len(candidates) == 1:
        result = candidates[0][0]
    elif len(candidates) > 1:
        # pick the candidate whose signature is closest in the class hierarchy
        best_match = (sys.maxint, None)
        for (function, signature) in candidates:
            ancestor_count_sum = sum(
                _imap(_calculate_number_of_ancestors, arg_types, signature))
            if best_match[0] > ancestor_count_sum:
                best_match = (ancestor_count_sum, function)
            elif best_match[0] == ancestor_count_sum:
                raise AmbiguousFunctions
        result = best_match[1]
    if (result is not None) and (not disable_caching):
        self.__cache[arg_types] = result
    return result

def _ndilin(shp, *i):
    """Generator of linear indices into an array of shape `shp`. Indices are
    specified by slices of indices in `i`."""
    cp = [1]
    for x in shp[:-1]:
        cp += [cp[-1] * x]
    i = list(i)
    for j, x in enumerate(i):
        if isinstance(x, slice):
            start, stop, step = x.start, x.stop, x.step
            if x.start is None:
                start = 0
            if x.stop == sys.maxint or x.stop is None:
                stop = shp[j]
            if x.step is None:
                step = 1
            i[j] = slice(start, stop, step)
    res = []
    for x in _ndi(*i):
        res.append(int(_sum(c * ind for c, ind in _izip(cp, x))))
        #yield int(_sum( x*y for x, y in _izip(cp, x) ))
    return res

def _worker(gen_func, args_list, q, e):
    """
    Worker function which loops over one or more generators provided by
    `gen_func` and returns the result via queue `q`. Waits for signal
    from `e` before continuing.
    """
    gens = []
    for arg in args_list:
        gens.append(gen_func(*arg))

    generator = _izip(*gens)

    for s in generator:
        e.clear()
        q.put(s)
        e.wait()

    q.close()

def _generate_parallel(n_process, n_iter, gen_func, args_list):
    """
    Generator which spawns processes to run generators, then uses a queue
    for each process to retrieve the results which it then yields.
    """
    n_items = len(args_list)

    # calculate how to distribute generators over processes.
    if n_items <= n_process and n_process > 0:
        n_process = n_items
        n_pp = 1
        n_left = 1
    elif n_items > n_process and n_process > 0:
        n_pp = n_items // n_process
        n_left = n_pp + n_items % n_process

    # if one process specified just do the generator without sub processes.
    if n_process <= 1:
        gens = []
        for arg in args_list:
            gens.append(gen_func(*arg))
        generator = _izip(*gens)
        for s in generator:
            yield s
        return

    # split up argument list
    sub_lists = [args_list[0:n_left]]
    sub_lists.extend([args_list[n_left + i * n_pp:n_left + (i + 1) * n_pp]
                      for i in range(n_process - 1)])

    # create lists of queues, events, and processes.
    es = []
    qs = []
    ps = []
    for i in range(n_process):
        e = _Event()
        q = _Queue(1)
        p = _Process(target=_worker, args=(gen_func, sub_lists[i], q, e))
        p.daemon = True
        es.append(e)
        qs.append(q)
        ps.append(p)

    # start processes
    for p in ps:
        p.start()

    # for number of iterations
    for i in range(n_iter):
        s = []
        # retrieve results for each sub-process and let the process know to
        # continue calculation.
        for q, e in _izip(qs, es):
            s.extend(q.get())
            e.set()  # free process to do next calculation

        # yield all results
        yield tuple(s)

    # end processes
    for p in ps:
        p.join()

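A hedged usage sketch for the generator above; `count_from` is a made-up generator factory and the exact multiprocessing behaviour depends on the platform's start method, so treat this as illustrative rather than definitive. Each generator yields exactly `n_iter` values so the workers exit cleanly before the final `join()`:

def count_from(start, n=3):
    # made-up example generator: yields exactly n consecutive integers
    for i in range(start, start + n):
        yield i

if __name__ == '__main__':
    # two generators, one per worker process, zipped three times
    for step in _generate_parallel(2, 3, count_from, [(0,), (100,)]):
        print step   # (0, 100), then (1, 101), then (2, 102)
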
def bin_diff(array, weighted=False):
    return [_diff(a, b, weighted) for a, b in _izip(array[:-1], array[1:])]

def __init__(self, instream, stream_type=None, stream_version=None,
             no_stream_options=False, header_only=False):
    if not isinstance(instream, YamldataGenerator):
        instream = YamldataGenerator(instream)
    have_filename = hasattr(instream, 'current_filename')
    if have_filename:
        self.current_filename = instream.current_filename
    stream_name = (self.current_filename if have_filename
                   else "Yamldata stream")
    doc = instream.next()
    if (not hasattr(doc, '__len__')) or len(doc) == 0:
        raise ValueError("no header found in %s; are you sure this is a "
                         "Yamldata source?" % stream_name)

    # attrdict of all the Onyx headers
    try:
        header = attrdict(doc[0])
    except:
        raise ValueError("bad header structure in %s: [%s] - are you sure "
                         "this is a Yamldata source?" % (stream_name, doc[0]))

    if len(doc) > 2:
        raise ValueError("bad document structure in %s, expected 1 or 2 "
                         "sub-parts, got %d" % (stream_name, len(doc)))
    # print doc
    if header_only:
        if len(doc) != 1:
            raise ValueError("bad document structure in %s, expected only "
                             "a header" % stream_name)
        data = None
    else:
        if len(doc) != 2:
            raise ValueError("bad document structure in %s, expected both "
                             "a header and a body" % stream_name)
        data = doc[1]

    missing_headers = (frozenset(self.prefix_header_names(self.required_headers))
                       - frozenset(header))
    if missing_headers:
        raise ValueError("missing the following required headers in %s: %s"
                         % (stream_name,
                            ' '.join(repr(header)
                                     for header in sorted(missing_headers))))
    invalid_headers = (frozenset(header)
                       - frozenset(self.prefix_header_names(self.valid_headers)))
    if invalid_headers:
        raise ValueError("unexpected headers in %s: %s"
                         % (stream_name,
                            ' '.join(repr(header)
                                     for header in sorted(invalid_headers))))

    for base_name, prefixed_name in _izip(
            self.valid_headers, self.prefix_header_names(self.valid_headers)):
        if prefixed_name in header:
            self[base_name] = header[prefixed_name]

    # check the fields
    if self.meta_version != self.VERSION:
        raise ValueError("unexpected meta_version in %s: expected %s, got %s"
                         % (stream_name, self.VERSION, self.meta_version))
    if stream_type is not None and self.stream_type != stream_type:
        raise ValueError("unexpected stream_type in %s: expected %r, got %r"
                         % (stream_name, stream_type, self.stream_type))
    if stream_version is not None and self.stream_version != stream_version:
        raise ValueError("unexpected stream_version in %s: expected %s, got %s"
                         % (stream_name, repr(stream_version),
                            repr(self.stream_version)))
    if no_stream_options and self.hasattr.stream_options:
        raise ValueError("unexpected presence of stream_options in header: %r"
                         % (self.stream_options,))

    self.current_line_number = 4 + (0 if no_stream_options else 1)

    def itr():
        for line in data:
            self.current_line_contents = line
            self.current_line_number += 1
            # Note: PyYAML will implicitly convert tokens which
            # match certain regexps to their "natural" types.  The
            # effect is that if a line has only a single token
            # which can be converted to float or int, it will be
            # so converted.  Here we detect that and convert back
            # to a tuple with one string to make our output consistent.
            if type(line) != str:
                parts = (str(line),)
            else:
                parts = line.split()
            if not parts:
                continue
            yield parts

    self._next = itr().next

def tdfilesitersizip(filenames, default=_stdin):
    """return izipped iterator for the sequence of files, or the default"""
    return _izip(*tdfilesiters(filenames, default))

def __setitem__(self, i, val):
    # use a separate loop variable so the index tuple `i` is not clobbered
    for j, v in _izip(_ndilin(self.msize, *i), val):
        self._a[j] = v

def bin_sums(array, less=None):
    return [(a.count + b.count) / 2.
            for a, b in _izip(array[:-1], array[1:])
            if less is None or b.value <= less]

def mtimes(A, B):
    print 'mtimes', A, B
    if hasattr(A, 'val'):
        A = A.val
    if hasattr(B, 'val'):
        B = B.val
    res = sum([a * b for a, b in _izip(A, B)])
    return res

def predict(self, dataset, output_type='assignment'):
    """
    Use the model to predict topics for each document. The provided
    `dataset` should be an SArray object where each element is a dict
    representing a single document in bag-of-words format, where keys
    are words and values are their corresponding counts. If `dataset` is
    an SFrame, then it must contain a single column of dict type.

    The current implementation will make inferences about each document
    given its estimates of the topics learned when creating the model.
    This is done via Gibbs sampling.

    Parameters
    ----------
    dataset : SArray, SFrame of type dict
        A set of documents to use for making predictions.

    output_type : str, optional
        The type of output desired. This can either be

        - assignment: the returned values are integers in [0, num_topics)
        - probability: each returned prediction is a vector with length
          num_topics, where element k represents the probability that
          the document belongs to topic k.

    Returns
    -------
    out : SArray

    See Also
    --------
    evaluate

    Examples
    --------
    Make predictions about which topic each document belongs to.

    >>> docs = graphlab.SArray('http://s3.amazonaws.com/GraphLab-Datasets/nips-text')
    >>> m = graphlab.topic_model.create(docs)
    >>> pred = m.predict(docs)

    If one is interested in the probability of each topic

    >>> pred = m.predict(docs, output_type='probability')

    Notes
    -----
    For each unique word w in a document d, we sample an assignment to
    topic k with probability proportional to

    .. math::
        p(z_{dw} = k) \propto (n_{d,k} + \\alpha) * \Phi_{w,k}

    where

    - :math:`W` is the size of the vocabulary,
    - :math:`n_{d,k}` is the number of other times we have assigned a word
      in document d to topic :math:`k`,
    - :math:`\Phi_{w,k}` is the probability under the model of choosing
      word :math:`w` given the word is of topic :math:`k`. This is the
      matrix returned by calling `m['topics']`.

    This represents a collapsed Gibbs sampler for the document assignments
    while we keep the topics learned during training fixed. This process
    is done in parallel across all documents, five times per document.
    """
    _mt._get_metric_tracker().track('toolkit.text.topic_model.predict')

    dataset = _check_input(dataset)

    opts = {'model': self.__proxy__,
            'data': dataset}
    response = _graphlab.toolkits._main.run("text_topicmodel_predict", opts)
    preds = _SArray(None, _proxy=response['predictions'])

    # Get most likely topic if probabilities are not requested
    if output_type not in ['probability', 'probabilities', 'prob']:
        # equivalent to numpy.argmax(x)
        preds = preds.apply(lambda x: max(_izip(x, xrange(len(x))))[1])

    return preds

def evolve(self, psi_0, t0, times, iterate=False, n_jobs=1, block_diag=False,
           stack_state=False, imag_time=False, solver_name="dop853",
           **solver_args):
    """Creates symmetry blocks of the Hamiltonian and then uses them to run
    `hamiltonian.evolve()` in parallel.

    **Arguments NOT described below can be found in the documentation for
    the `hamiltonian.evolve()` method.**

    Examples
    --------

    The example below builds on the code snippet shown in the description
    of the `block_ops` class.

    .. literalinclude:: ../../doc_examples/block_ops-example.py
        :linenos:
        :language: python
        :lines: 69-

    Parameters
    -----------
    psi_0 : numpy.ndarray, list, tuple
        Quantum state which is defined on the full Hilbert space of the
        problem. Does not need to obey any sort of symmetry.
    t0 : float
        Initial time to start the evolution at.
    times : numpy.ndarray, list
        Contains the times to compute the solution at. Must be an iterable
        object.
    iterate : bool, optional
        Flag to return generator when set to `True`. Otherwise the output
        is an array of states. Default is `False`.
    n_jobs : int, optional
        Number of processes requested for the computation time evolution
        dynamics. NOTE: one of those processes is used to gather results.
        For best performance, all blocks should be approximately the same
        size and `n_jobs-1` must be a common divisor of the number of
        blocks, such that there is roughly an equal workload for each
        process. Otherwise the computation will be as slow as the slowest
        process.
    block_diag : bool, optional
        When set to `True`, this flag puts the Hamiltonian matrices for the
        separate symmetry blocks into a list and then loops over it to do
        time evolution. When set to `False`, it puts all blocks in a single
        giant sparse block diagonal matrix. Default is `False`.

        This flag is useful if there are a lot of smaller-sized blocks.

    Returns
    --------
    obj
        if `iterate = True`, returns generator which generates the time
        dependent state in the full H-space basis.

        if `iterate = False`, returns `numpy.ndarray` which has the
        time-dependent states in the full H-space basis in the rows.

    Raises
    ------
    ValueError
        Variable `imag_time=True` option on `hamiltonian.evolve()` method
        not supported.
    ValueError
        `iterate=True` requires `times` to be an array or a list.
    RuntimeError
        Terminates when initial state has no projection onto the specified
        symmetry blocks.

    """
    if imag_time:
        raise ValueError("imaginary time not supported for block evolution.")
    P = []
    H_list = []
    psi_blocks = []
    for key, b in _iteritems(self._basis_dict):
        p = self._get_P(key)

        if _sp.issparse(psi_0):
            psi = p.H.dot(psi_0).toarray()
        else:
            psi = p.H.dot(psi_0)

        psi = _np.asarray(psi).ravel()

        if _np.linalg.norm(psi) > 1000 * _np.finfo(self.dtype).eps:
            psi_blocks.append(psi)
            P.append(p.tocoo())
            H_list.append(self._get_H(key))

    if block_diag and H_list:
        N_H = len(H_list)
        n_pp = N_H // n_jobs
        n_left = n_pp + N_H % n_jobs

        H_list_prime = []
        psi_blocks_prime = []
        if n_left != 0:
            H_list_prime.append(
                block_diag_hamiltonian(H_list[:n_left], None, None, None,
                                       None, self._dtype, get_proj=False,
                                       **self._no_checks))
            # note: append to psi_blocks_prime (the original snippet appended
            # to an undefined psi_list_prime)
            psi_blocks_prime.append(_np.hstack(psi_blocks[:n_left]))

        for i in range(n_jobs - 1):
            i1 = n_left + i * n_pp
            i2 = n_left + (i + 1) * n_pp
            H_list_prime.append(
                block_diag_hamiltonian(H_list[i1:i2], None, None, None,
                                       None, self._dtype, get_proj=False,
                                       **self._no_checks))
            psi_blocks_prime.append(_np.hstack(psi_blocks[i1:i2]))

        H_list = H_list_prime
        psi_blocks = psi_blocks_prime

    if len(H_list) > 0:
        P = _sp.hstack(P, format="csr")
        if iterate:
            if _np.isscalar(times):
                raise ValueError("If iterate=True times must be a list/array.")
            return _block_evolve_iter(psi_blocks, H_list, P, t0, times,
                                      stack_state, imag_time, solver_name,
                                      solver_args, n_jobs)
        else:
            psi_t = _Parallel(n_jobs=n_jobs)(
                _delayed(_block_evolve_helper)(H, psi, t0, times, stack_state,
                                               imag_time, solver_name,
                                               solver_args)
                for psi, H in _izip(psi_blocks, H_list))
            psi_t = _np.vstack(psi_t)
            psi_t = P.dot(psi_t)
            return psi_t
    else:
        raise RuntimeError(
            "initial state has no projection on to specified blocks.")

def predict(self, dataset, output_type='assignment', num_burnin=None):
    """
    Use the model to predict topics for each document. The provided
    `dataset` should be an SArray object where each element is a dict
    representing a single document in bag-of-words format, where keys
    are words and values are their corresponding counts. If `dataset` is
    an SFrame, then it must contain a single column of dict type.

    The current implementation will make inferences about each document
    given its estimates of the topics learned when creating the model.
    This is done via Gibbs sampling.

    Parameters
    ----------
    dataset : SArray, SFrame of type dict
        A set of documents to use for making predictions.

    output_type : str, optional
        The type of output desired. This can either be

        - assignment: the returned values are integers in [0, num_topics)
        - probability: each returned prediction is a vector with length
          num_topics, where element k represents the probability that
          the document belongs to topic k.

    num_burnin : int, optional
        The number of iterations of Gibbs sampling to perform when
        inferring the topics for documents at prediction time. If provided
        this will override the burnin value set during training.

    Returns
    -------
    out : SArray

    See Also
    --------
    evaluate

    Examples
    --------
    Make predictions about which topic each document belongs to.

    >>> docs = graphlab.SArray('https://static.turi.com/datasets/nips-text')
    >>> m = graphlab.topic_model.create(docs)
    >>> pred = m.predict(docs)

    If one is interested in the probability of each topic

    >>> pred = m.predict(docs, output_type='probability')

    Notes
    -----
    For each unique word w in a document d, we sample an assignment to
    topic k with probability proportional to

    .. math::
        p(z_{dw} = k) \propto (n_{d,k} + \\alpha) * \Phi_{w,k}

    where

    - :math:`W` is the size of the vocabulary,
    - :math:`n_{d,k}` is the number of other times we have assigned a word
      in document d to topic :math:`k`,
    - :math:`\Phi_{w,k}` is the probability under the model of choosing
      word :math:`w` given the word is of topic :math:`k`. This is the
      matrix returned by calling `m['topics']`.

    This represents a collapsed Gibbs sampler for the document assignments
    while we keep the topics learned during training fixed. This process
    is done in parallel across all documents, five times per document.
    """
    _mt._get_metric_tracker().track('toolkit.text.topic_model.predict')

    dataset = _check_input(dataset)

    if num_burnin is None:
        num_burnin = self.get('num_burnin')

    opts = {'model': self.__proxy__,
            'data': dataset,
            'num_burnin': num_burnin}
    response = _graphlab.toolkits._main.run("text_topicmodel_predict", opts)
    preds = _SArray(None, _proxy=response['predictions'])

    # Get most likely topic if probabilities are not requested
    if output_type not in ['probability', 'probabilities', 'prob']:
        # equivalent to numpy.argmax(x)
        preds = preds.apply(lambda x: max(_izip(x, _xrange(len(x))))[1])

    return preds

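The final `lambda` above relies on `max()` over (value, index) pairs to recover an argmax without numpy; a one-line illustration in plain Python:

x = [0.1, 0.7, 0.2]
print max(zip(x, range(len(x))))[1]   # 1, the index of the largest probability
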
def expm(self, psi_0, H_time_eval=0.0, iterate=False, n_jobs=1,
         block_diag=False, a=-1j, start=None, stop=None, endpoint=None,
         num=None, shift=None):
    """Creates symmetry blocks of the Hamiltonian and then uses them to run
    `_expm_multiply()` in parallel.

    **Arguments NOT described below can be found in the documentation for
    the `exp_op` class.**

    Examples
    --------

    The example below builds on the code snippet shown in the description
    of the `block_ops` class.

    .. literalinclude:: ../../doc_examples/block_ops-example.py
        :linenos:
        :language: python
        :lines: 60-67

    Parameters
    -----------
    psi_0 : numpy.ndarray, list, tuple
        Quantum state which is defined on the full Hilbert space of the
        problem. Does not need to obey any sort of symmetry.
    t0 : float
        Initial time to start the evolution at.
    H_time_eval : numpy.ndarray, list
        Times to evaluate the Hamiltonians at when doing the matrix
        exponentiation.
    iterate : bool, optional
        Flag to return generator when set to `True`. Otherwise the output
        is an array of states. Default is `False`.
    n_jobs : int, optional
        Number of processes requested for the computation time evolution
        dynamics. NOTE: one of those processes is used to gather results.
        For best performance, all blocks should be approximately the same
        size and `n_jobs-1` must be a common divisor of the number of
        blocks, such that there is roughly an equal workload for each
        process. Otherwise the computation will be as slow as the slowest
        process.
    block_diag : bool, optional
        When set to `True`, this flag puts the Hamiltonian matrices for the
        separate symmetry blocks into a list and then loops over it to do
        time evolution. When set to `False`, it puts all blocks in a single
        giant sparse block diagonal matrix. Default is `False`.

        This flag is useful if there are a lot of smaller-sized blocks.

    Returns
    --------
    obj
        if `iterate = True`, returns generator which generates the time
        dependent state in the full H-space basis.

        if `iterate = False`, returns `numpy.ndarray` which has the
        time-dependent states in the full H-space basis in the rows.

    Raises
    ------
    ValueError
        Various `ValueError`s of `exp_op` class.
    RuntimeError
        Terminates when initial state has no projection onto the specified
        symmetry blocks.

    """
    from ..operators import hamiltonian

    if iterate:
        if start is None and stop is None:
            raise ValueError("'iterate' can only be True with time "
                             "discretization. must specify 'start' and "
                             "'stop' points.")

        if num is not None:
            if type(num) is not int:
                raise ValueError("expecting integer for 'num'.")
        else:
            num = 50

        if endpoint is not None:
            if type(endpoint) is not bool:
                raise ValueError("expecting bool for 'endpoint'.")
        else:
            endpoint = True
    else:
        if start is None and stop is None:
            if num is not None:
                raise ValueError("unexpected argument 'num'.")
            if endpoint is not None:
                raise ValueError("unexpected argument 'endpoint'.")
        else:
            if not (_np.isscalar(start) and _np.isscalar(stop)):
                raise ValueError("expecting scalar values for 'start' and 'stop'")
            if not (_np.isreal(start) and _np.isreal(stop)):
                raise ValueError("expecting real values for 'start' and 'stop'")

            if num is not None:
                if type(num) is not int:
                    raise ValueError("expecting integer for 'num'.")
            else:
                num = 50

            if endpoint is not None:
                if type(endpoint) is not bool:
                    raise ValueError("expecting bool for 'endpoint'.")
            else:
                endpoint = True

    P = []
    H_list = []
    psi_blocks = []
    for key, b in _iteritems(self._basis_dict):
        p = self._get_P(key)

        if _sp.issparse(psi_0):
            psi = p.H.dot(psi_0).toarray()
        else:
            psi = p.H.dot(psi_0)

        psi = psi.ravel()

        if _np.linalg.norm(psi) > 1000 * _np.finfo(self.dtype).eps:
            psi_blocks.append(psi)
            P.append(p.tocoo())
            H = self._get_H(key)
            H = H(H_time_eval) * a
            if shift is not None:
                H += a * shift * _sp.identity(b.Ns, dtype=self.dtype)
            H_list.append(H)

    if block_diag and H_list:
        N_H = len(H_list)
        n_pp = N_H // n_jobs
        n_left = n_pp + N_H % n_jobs

        H_list_prime = []
        psi_blocks_prime = []

        psi_block = _np.hstack(psi_blocks[:n_left])
        H_block = _sp.block_diag(H_list[:n_left], format="csr")

        H_list_prime.append(H_block)
        psi_blocks_prime.append(psi_block)

        for i in range(n_jobs - 1):
            i1 = n_left + i * n_pp
            i2 = n_left + (i + 1) * n_pp

            psi_block = _np.hstack(psi_blocks[i1:i2])
            H_block = _sp.block_diag(H_list[i1:i2], format="csr")

            H_list_prime.append(H_block)
            psi_blocks_prime.append(psi_block)

        H_list = H_list_prime
        psi_blocks = psi_blocks_prime

    H_is_complex = _np.iscomplexobj(
        [_np.float32(1.0).astype(H.dtype) for H in H_list])

    if H_list:
        P = _sp.hstack(P, format="csr")
        if iterate:
            return _block_expm_iter(psi_blocks, H_list, P, start, stop, num,
                                    endpoint, n_jobs)
        else:
            ver = [int(v) for v in _scipy.__version__.split(".")]
            if (H_is_complex
                    and (start, stop, num, endpoint) != (None, None, None, None)
                    and ver[1] < 19):
                mats = _block_expm_iter(psi_blocks, H_list, P, start, stop,
                                        num, endpoint, n_jobs)
                return _np.array([mat for mat in mats]).T
            else:
                psi_t = _Parallel(n_jobs=n_jobs)(
                    _delayed(_expm_multiply)(H, psi, start=start, stop=stop,
                                             num=num, endpoint=endpoint)
                    for psi, H in _izip(psi_blocks, H_list))
                psi_t = _np.hstack(psi_t).T
                psi_t = P.dot(psi_t)
                return psi_t
    else:
        raise RuntimeError(
            "initial state has no projection on to specified blocks.")
