def first_pass_data_and_labels(notes):
    '''
    first_pass_data_and_labels()

    Purpose: Interface with notes object to get text data and labels

    @param notes. List of Note objects
    @return <tuple> whose elements are:
              0) list of tokenized sentences
              1) list of labels for tokenized sentences

    >>> import os
    >>> from notes.note import Note
    >>> base_dir = os.path.join(os.getenv('CLINER_DIR'), 'tests', 'data')
    >>> txt = os.path.join(base_dir, 'single.txt')
    >>> con = os.path.join(base_dir, 'single.con')
    >>> note_tmp = Note('i2b2')
    >>> note_tmp.read(txt, con)
    >>> notes = [note_tmp]
    >>> first_pass_data_and_labels(notes)
    ([['The', 'score', 'stood', 'four', 'to', 'two', ',', 'with', 'but', 'one', 'inning', 'more', 'to', 'play', ',']], [['B', 'I', 'I', 'I', 'I', 'I', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']])
    '''
    # Get the data and annotations from the Note objects
    l_tokenized_sentences = [note.getTokenizedSentences() for note in notes]
    l_iob_labels          = [note.getIOBLabels()          for note in notes]

    tokenized_sentences = flatten(l_tokenized_sentences)
    iob_labels          = flatten(l_iob_labels)

    return tokenized_sentences, iob_labels
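Most of the examples in this collection lean on a small `flatten` helper from each project's own tools/utility module. As a point of reference, a minimal sketch of the one-level list flattening these call sites assume (not any particular project's implementation) might look like:

def flatten(list_of_lists):
    # concatenate the sub-lists in order, removing exactly one level of nesting
    return [item for sublist in list_of_lists for item in sublist]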
def save(self, num_retries=3):
    # If the DB is not writable, the rsync won't happen
    # If the DB is up, but rsync fails, the status will be ERR_SYNC,
    # but self.state will not be updated in the database.
    session = self.db.session()
    try:
        # Test write access to DB
        # If it fails after num_retries trials, update_in_session will
        # raise an Exception, so save() will exit, before the rsync.
        self.dbstate.update_in_session({'jobman.status': self.ERR_SYNC},
                                       session, _recommit_times=num_retries)

        # save self.state in file current.state, and rsync
        # If the rsync fails after num_retries, an Exception will be
        # raised, and save() will exit before 'jobman.status' is
        # changed back.
        super(DBRSyncChannel, self).save(num_retries=num_retries)

        if self.sync_in_save:
            # update DB
            self.dbstate.update_in_session(flatten(self.state), session,
                                           _recommit_times=num_retries)
        else:
            # update only jobman.*
            state_jobman = flatten({'jobman': self.state.jobman})
            self.dbstate.update_in_session(state_jobman, session,
                                           _recommit_times=num_retries)
    finally:
        session.close()
def _child_compute(self, cr, uid, ids, name, args, context=None):
    obj_dept = self.pool.get('hr.department')
    obj_user = self.pool.get('res.users')
    result = {}
    for user_id in ids:
        child_ids = []
        # use a parameterized query instead of string concatenation
        cr.execute('''SELECT dept.id FROM hr_department AS dept
                      LEFT JOIN hr_employee AS emp ON dept.manager_id = emp.id
                      WHERE emp.id IN
                          (SELECT emp.id FROM hr_employee
                           JOIN resource_resource r ON r.id = emp.resource_id
                           WHERE r.user_id = %s)''', (user_id,))
        mgnt_dept_ids = [x[0] for x in cr.fetchall()]
        ids_dept = obj_dept.search(cr, uid, [('id', 'child_of', mgnt_dept_ids)], context=context)
        if ids_dept:
            data_dept = obj_dept.read(cr, uid, ids_dept, ['member_ids'], context=context)
            children = map(lambda x: x['member_ids'], data_dept)
            children = tools.flatten(children)
            children = obj_user.search(cr, uid, [('id', 'in', children), ('active', '=', True)], context=context)
            if user_id in children:
                children.remove(user_id)
            child_ids.extend(tools.flatten(children))
            # remove duplicates
            child_ids = list(set(child_ids))
        result[user_id] = child_ids
    return result
def __first_train(self, tokenized_sentences, Y, do_grid=False):
    """
    Model::__first_train()

    Purpose: Train the first pass classifiers (for IOB chunking)

    @param tokenized_sentences. <list> of tokenized sentences
    @param Y.                   <list-of-lists> of IOB labels for words
    @param do_grid.             <boolean> whether to perform a grid search

    @return None
    """
    if globals_cliner.verbosity > 0:
        print 'first pass'

    if globals_cliner.verbosity > 0:
        print '\textracting features (pass one)'

    # Separate into prose v nonprose
    nested_prose_data, nested_prose_Y = zip(*filter(
        lambda line_iob_tup: is_prose_sentence(line_iob_tup[0]),
        zip(tokenized_sentences, Y)))
    nested_nonprose_data, nested_nonprose_Y = zip(*filter(
        lambda line_iob_tup: not is_prose_sentence(line_iob_tup[0]),
        zip(tokenized_sentences, Y)))

    # extract features
    nested_prose_feats    = feat_obj.IOB_prose_features(nested_prose_data)
    nested_nonprose_feats = feat_obj.IOB_nonprose_features(nested_nonprose_data)

    # Flatten lists (because classifier will expect flat)
    prose_Y    = flatten(nested_prose_Y)
    nonprose_Y = flatten(nested_nonprose_Y)

    # rename because code uses it
    pchunks  = prose_Y
    nchunks  = nonprose_Y
    prose    = nested_prose_feats
    nonprose = nested_nonprose_feats

    # Train classifiers for prose and nonprose
    pvec, pclf = self.__generic_first_train('prose',    prose,    pchunks, do_grid)
    nvec, nclf = self.__generic_first_train('nonprose', nonprose, nchunks, do_grid)

    # Save vectorizers
    self._first_prose_vec    = pvec
    self._first_nonprose_vec = nvec

    # Save classifiers
    self._first_prose_clf    = pclf
    self._first_nonprose_clf = nclf
def _child_compute(self, cr, uid, ids):
    obj_dept = self.pool.get('hr.department')
    child_ids = []
    for id in ids:
        ids_dept = obj_dept.search(cr, uid, [('manager_id', '=', id)])
        if ids_dept:
            data_dept = obj_dept.read(cr, uid, ids_dept, ['member_ids'])
            childs = map(lambda x: x['member_ids'], data_dept)
            childs = tools.flatten(childs)
            if id in childs:
                childs.remove(id)
            child_ids.extend(tools.flatten(childs))
    return child_ids
def _child_compute(self, cr, uid, ids, name, args, context=None):
    obj_dept = self.pool.get("hr.department")
    obj_user = self.pool.get("res.users")
    result = {}
    for user_id in ids:
        child_ids = []
        cr.execute(
            """SELECT dept.id FROM hr_department AS dept
               LEFT JOIN hr_employee AS emp ON dept.manager_id = emp.id
               WHERE emp.id IN
                   (SELECT emp.id FROM hr_employee
                    JOIN resource_resource r ON r.id = emp.resource_id
                    WHERE r.user_id = %s)""",
            (user_id,),
        )
        mgnt_dept_ids = [x[0] for x in cr.fetchall()]
        ids_dept = obj_dept.search(cr, uid, [("id", "child_of", mgnt_dept_ids)], context=context)
        if ids_dept:
            data_dept = obj_dept.read(cr, uid, ids_dept, ["member_ids"], context=context)
            emp_children = map(lambda x: x["member_ids"], data_dept)
            emp_children = tools.flatten(emp_children)
            children = self.emp_to_users(cr, uid, emp_children, context=context)
            children = obj_user.search(cr, uid, [("id", "in", children), ("active", "=", True)], context=context)
            if user_id in children:
                children.remove(user_id)
            child_ids = list(set(child_ids + children))
        result[user_id] = child_ids
    return result
def generic_predict(p_or_n, tokenized_sents, vocab, clf):
    '''
    generic_predict()

    Predict labels with a model that works for both prose and nonprose

    @param p_or_n.          A string that indicates "prose", "nonprose", or "all"
    @param tokenized_sents. A list of sentences, where each sentence is tokenized
                            into words
    @param vocab.           A dictionary mapping word tokens to numeric indices.
    @param clf.             An encoding of the trained model.
    '''
    # If nothing to predict, skip actual prediction
    if len(tokenized_sents) == 0:
        print '\tnothing to predict ' + p_or_n
        return []

    print '\tvectorizing words ' + p_or_n

    # vectorize validation X
    text_features = extract_features(tokenized_sents)
    flat_X_feats = vocab.transform(flatten(text_features))
    X = reconstruct_list(flat_X_feats, save_list_structure(text_features))

    print '\tpredicting labels ' + p_or_n

    # Predict labels
    predictions = crf_ml.predict(clf, X)

    # Format labels from output
    return predictions
def to_sql(self):
    stack = []
    params = []
    for i, e in reverse_enumerate(self.__exp):
        if self._is_leaf(e, internal=True):
            table = self.__field_tables.get(i, self.__main_table)
            q, p = self.__leaf_to_sql(e, table)
            params.insert(0, p)
            stack.append(q)
        else:
            if e == '!':
                stack.append('(NOT (%s))' % (stack.pop(),))
            else:
                ops = {'&': ' AND ', '|': ' OR '}
                q1 = stack.pop()
                q2 = stack.pop()
                stack.append('(%s %s %s)' % (q1, ops[e], q2,))

    query = ' AND '.join(reversed(stack))
    joins = ' AND '.join(self.__joins)
    if joins:
        query = '(%s) AND (%s)' % (joins, query)
    return (query, flatten(params))
def remove_rooms(*room_names):
    """Remove multiple rooms from the building.

    Arguments can be a list of rooms or comma-separated values
    """
    room_name_list = tools.flatten(list(room_names))
    for room_name in room_name_list:
        Amity.remove_room(room_name)
def generic_predict(p_or_n, tokenized_sents, vocab, clf):
    '''
    generic_predict()

    Predict labels with a model that works for both prose and nonprose

    @param p_or_n.          A string that indicates "prose", "nonprose", or "all"
    @param tokenized_sents. A list of sentences, where each sentence is tokenized
                            into words
    @param vocab.           A dictionary mapping word tokens to numeric indices.
    @param clf.             An encoding of the trained model.
    '''
    # If nothing to predict, skip actual prediction
    if len(tokenized_sents) == 0:
        print '\tnothing to predict ' + p_or_n
        return []

    print '\tvectorizing words ' + p_or_n

    # vectorize validation X
    text_features = extract_features(tokenized_sents)
    flat_X_feats = vocab.transform(flatten(text_features))
    X = reconstruct_list(flat_X_feats, save_list_structure(text_features))

    print '\tpredicting labels ' + p_or_n

    # Predict labels
    predictions = crf_ml.predict(clf, X)

    # Format labels from output
    return predictions
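`generic_predict` relies on `save_list_structure` and `reconstruct_list` to flatten the per-sentence features for the vectorizer and then restore the sentence boundaries. A rough sketch, assuming the helpers simply record sub-list lengths and re-split the flat result (the project's real helpers may differ in detail), could be:

def save_list_structure(nested):
    # remember how many feature dicts each sentence contributed
    return [len(sub) for sub in nested]

def reconstruct_list(flat, lengths):
    # re-split the flat, vectorized sequence into per-sentence chunks
    out, start = [], 0
    for n in lengths:
        out.append(flat[start:start + n])
        start += n
    return out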
def fit_from_documents(self, documents):
    """
    ClinerModel::fit_from_documents()

    Train clinical concept extraction model using annotated data (files).

    @param documents. A list of Document objects (containing text and annotations)
    @return           None
    """
    # Extract formatted data
    tokenized_sents = flatten([d.getTokenizedSentences() for d in documents])
    labels          = flatten([d.getTokenLabels()        for d in documents])

    # Call the internal method
    self.fit(tokenized_sents, labels, dev_split=0.10)

    self._training_files = [d.getName() for d in documents]
def fit_from_documents(self, documents):
    """
    ClinerModel::fit_from_documents()

    Train clinical concept extraction model using annotated data (files).

    @param documents. A list of Document objects (containing text and annotations)
    @return           None
    """
    # Extract formatted data
    tokenized_sents = flatten([d.getTokenizedSentences() for d in documents])
    labels          = flatten([d.getTokenLabels()        for d in documents])

    # Call the internal method
    self.fit(tokenized_sents, labels, dev_split=0.10)

    self._training_files = [d.getName() for d in documents]
def train(self, notes):
    """
    ClinerModel::train()

    Purpose: Train a Machine Learning model on annotated data

    @param notes. A list of Note objects (containing text and annotations)
    @return       None
    """
    # Extract formatted data
    tokenized_sentences = flatten([n.getTokenizedSentences() for n in notes])
    labels              = flatten([n.getTokenLabels()        for n in notes])

    self.train_fit(tokenized_sentences, labels, dev_split=0.1)

    self._training_files = [n.getName() for n in notes]
def search(self, cr, uid, args, offset=0, limit=None, order=None,
           context=None, count=False):
    res = []
    log_ids = super(res_log, self).search(cr, uid, args, offset, limit,
                                          order, context, count)
    logs = {}
    for log in self.browse(cr, uid, log_ids, context=context):
        res_dict = logs.get(log.res_model, {})
        res_dict.update({log.res_id: log.id})
        logs.update({log.res_model: res_dict})
    res = map(lambda x: x.values(), logs.values())
    return tools.flatten(res)
def sel(input, indexes, splitf):
    """
    Performs the cutting and selecting

    input  : an iterable of lines
    indexes: an iterable of valid list indexes (int, slices)
    splitf : the function used to separate indexes in input
    """
    for line in input:
        fields = filter(is_blank, splitf(line))
        selected = (getitem(fields, i, default='') for i in indexes)
        yield flatten(selected)
def sel(input, indexes, splitf):
    """
    Performs the cutting and selecting

    input  : an iterable of lines
    indexes: an iterable of valid list indexes (int, slices)
    splitf : the function used to separate indexes in input
    """
    for line in input:
        fields = filter(is_blank, splitf(line))
        selected = (getitem(fields, i, default='') for i in indexes)
        yield flatten(selected)
def transformActiveColumnIndexes(activeColumns):
    pastIndexes = []
    yValues = []
    for activeColumn in flatten(activeColumns):
        if activeColumn not in pastIndexes:
            pastIndexes.append(activeColumn)
        yValue = pastIndexes.index(activeColumn)
        yValues.append(yValue)
    return yValues
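For example, with a hypothetical SDR input where columns 3 and 7 fire at t=0 and columns 7 and 9 fire at t=1, each active bit is mapped to the order in which its column index was first seen:

# flatten([[3, 7], [7, 9]]) yields 3, 7, 7, 9
transformActiveColumnIndexes([[3, 7], [7, 9]])   # -> [0, 1, 1, 2]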
def equilibrium(threshold, defender_costs, attacker_costs):
    # Iterate through each
    n = len(defender_costs)
    if n != len(attacker_costs):
        raise Exception("Unequal set of lists")
    else:
        def_equilibrium = []
        att_equilibrium = []
        for i in range(0, n):
            # print("Resource: ", i)
            # print("----------------")
            resources = list(range(0, n))
            resources.remove(i)
            # print("Resources: ", resources)
            subsets = []
            # for t in range(threshold-1, n):
            subsets.append(list(itertools.combinations(resources, threshold - 1)))
            # print("Before flatten: ", subsets)
            subsets = tools.flatten(subsets)
            # print("Subsets:", subsets)
            # subsets.append([])
            if () in subsets and len(defender_costs) != 1:
                subsets.remove(())
            # print("Subsets: ", subsets)
            s = 0
            for l in subsets:
                # creating the product
                p = 1
                for j in range(0, n):
                    if j == i:
                        continue
                    if j in l:
                        p *= defender_costs[j] / (defender_costs[j] + attacker_costs[j])
                    else:
                        p *= attacker_costs[j] / (defender_costs[j] + attacker_costs[j])
                s += p
            s /= ((attacker_costs[i] + defender_costs[i])**2)
            defender_point = s * attacker_costs[i]
            attacker_point = s * defender_costs[i]
            def_equilibrium.append(defender_point)
            att_equilibrium.append(attacker_point)
        return def_equilibrium, att_equilibrium
def __generic_first_predict(self, p_or_n, text_features, dvect, clf,
                            do_grid=False):
    '''
    Model::__generic_first_predict()

    Purpose: Predict IOB labels; works for both prose and nonprose

    @param p_or_n.        <string> either "prose" or "nonprose"
    @param text_features. <list-of-lists> of feature dictionaries
    @param dvect.         <DictVectorizer>
    @param clf.           scikit-learn classifier
    @param do_grid.       <boolean> indicating whether to perform grid search
    '''
    # If nothing to predict, skip actual prediction
    if len(text_features) == 0:
        print '\tnothing to predict (pass one) ' + p_or_n
        return []

    # Save list structure to reconstruct after vectorization
    offsets = save_list_structure(text_features)

    if globals_cliner.verbosity > 0:
        print '\tvectorizing features (pass one) ' + p_or_n

    # Vectorize features
    X_feats = dvect.transform(flatten(text_features))

    if globals_cliner.verbosity > 0:
        print '\tpredicting labels (pass one) ' + p_or_n

    # CRF requires reconstructed lists
    if self._crf_enabled:
        X_feats = reconstruct_list(list(X_feats), offsets)
        lib = crf
    else:
        lib = sci

    #for X in X_feats:
    #    for x in X:
    #        print x
    #    print
    #print '\n'

    # Predict IOB labels
    out = lib.predict(clf, X_feats)

    # Format labels from output
    predictions = reconstruct_list(out, offsets)

    return predictions
def timeOrderedSDRPlot(activeColumns):
    #Make sure inputs are numpy arrays
    activeColumns = numpy.asarray(activeColumns)

    ##Transform the input values into x and y coordinates
    #for each active bit, the x value is the time when it was active
    xValues = []
    for t in xrange(len(activeColumns)):
        xValues.append([t]*len(activeColumns[t]))
    xValues = numpy.fromiter(flatten(xValues), int) #not sure if int is the right thing to use here

    #Calculate y values
    #for each active bit the y value is the time when that bit was first active
    def transformActiveColumnIndexes(activeColumns):
        pastIndexes = []
        yValues = []
        for activeColumn in flatten(activeColumns):
            if activeColumn not in pastIndexes:
                pastIndexes.append(activeColumn)
            yValue = pastIndexes.index(activeColumn)
            yValues.append(yValue)
        return yValues

    yValues = numpy.asarray(transformActiveColumnIndexes(activeColumns))

    ##Plot the x and y coordinates
    #set up plot
    (figure, axes) = plt.subplots()

    #plot the active bits
    axes.plot(xValues, yValues, '.')

    #display the horizontal grid
    axes.minorticks_on()
    axes.grid(True, axis='both', which='both', linestyle='solid', color=(0.7, 0.7, 0.7))
    axes.set_axisbelow(True) #draw the lines beneath the points

    #Set the horizontal gridlines to have spacing of 1
    axes.yaxis.set_minor_locator(matplotlib.ticker.MultipleLocator(1))

    #Add a gap to the bottom and left edges so all points can be seen
    axes.set_xlim(left=-1)
    axes.set_ylim(bottom=-1)

    #set plot aspect ratio so the axes are even.
    axes.set_aspect(aspect='equal')

    #Make the plot bigger (the plot expands to box whilst retaining aspect ratio)
    figure.set_size_inches(20, 20)

    return (figure, figure.axes)
def __second_train(self, chunked_data, inds_list, con_labels, do_grid=False):
    """
    Model::__second_train()

    Purpose: Train the second pass classifier (for concept labels)

    @param data       <list> of tokenized sentences after collapsing chunks
    @param inds_list  <list-of-lists> of indices
                        - assertion: len(data) == len(inds_list)
                        - one line of 'inds_list' contains a list of indices
                          into the corresponding line for 'data'
    @param con_labels <list> of concept label strings
                        - assertion: there are sum(len(inds_list)) labels
                          AKA each index from inds_list maps to a label
    @param do_grid    <boolean> indicating whether to perform a grid search

    @return None
    """
    if globals_cliner.verbosity > 0:
        print 'second pass'

    # Extract features
    if globals_cliner.verbosity > 0:
        print '\textracting features (pass two)'

    text_features = [feat_obj.concept_features(s, inds)
                     for s, inds in zip(chunked_data, inds_list)]
    flattened_text_features = flatten(text_features)

    if globals_cliner.verbosity > 0:
        print '\tvectorizing features (pass two)'

    # Vectorize labels
    numeric_labels = [concept_labels[y] for y in con_labels]

    # Vectorize features
    self._second_vec = DictVectorizer()
    vectorized_features = self._second_vec.fit_transform(flattened_text_features)

    if globals_cliner.verbosity > 0:
        print '\ttraining classifier (pass two)'

    # Train the model
    self._second_clf = sci.train(vectorized_features, numeric_labels, do_grid)
def get_gradient(self):
    """ Get (optionally make each parameter's gradient) a reference to the flat gradient.
    Returns:
        Flat gradient (by reference: future calls to 'set_gradient' will modify it)
    """
    # Fast path
    if self._gradient is not None:
        return self._gradient
    # Flatten (make if necessary)
    gradient = tools.flatten(tools.grads_of(self._model.parameters()))
    self._gradient = gradient
    return gradient
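The `tools.flatten` call here packs every parameter's gradient into a single flat tensor. A simplified PyTorch sketch of that idea is below; note that `torch.cat` copies into a new tensor, whereas the project's own helper appears to return a buffer that later calls update in place, per the docstring.

import torch

def flatten_grads(parameters):
    # concatenate each parameter's gradient into one contiguous 1-D tensor
    return torch.cat([p.grad.view(-1) for p in parameters if p.grad is not None])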
def train(self, train_notes, val=[], test=[]):
    """
    ClinerModel::train()

    Purpose: Train a Machine Learning model on annotated data

    @param notes. A list of Note objects (containing text and annotations)
    @return       None
    """
    # Extract formatted data
    train_sents  = flatten([n.getTokenizedSentences() for n in train_notes])
    train_labels = flatten([n.getTokenLabels()        for n in train_notes])

    if test:
        test_sents  = flatten([n.getTokenizedSentences() for n in test])
        test_labels = flatten([n.getTokenLabels()        for n in test])
    else:
        test_sents  = []
        test_labels = []

    if val:
        print("VAL")
        val_sents  = flatten([n.getTokenizedSentences() for n in val])
        val_labels = flatten([n.getTokenLabels()        for n in val])
        self.train_fit(train_sents, train_labels,
                       val_sents=val_sents, val_labels=val_labels,
                       test_sents=test_sents, test_labels=test_labels)
    else:
        print("NO DEV")
        self.train_fit(train_sents, train_labels, dev_split=0.1,
                       test_sents=test_sents, test_labels=test_labels)

    self._train_files = [n.getName() for n in train_notes + val]
def _child_compute(self, cr, uid, ids, name, args, context=None):
    obj_dept = self.pool.get('hr.department')
    obj_user = self.pool.get('res.users')
    result = {}
    for user_id in ids:
        child_ids = []
        # use a parameterized query instead of string concatenation
        cr.execute('''SELECT dept.id FROM hr_department AS dept
                      LEFT JOIN hr_employee AS emp ON dept.manager_id = emp.id
                      WHERE emp.id IN
                          (SELECT emp.id FROM hr_employee
                           JOIN resource_resource r ON r.id = emp.resource_id
                           WHERE r.user_id = %s)''', (user_id,))
        mgnt_dept_ids = [x[0] for x in cr.fetchall()]
        ids_dept = obj_dept.search(cr, uid, [('id', 'child_of', mgnt_dept_ids)], context=context)
        if ids_dept:
            data_dept = obj_dept.read(cr, uid, ids_dept, ['member_ids'], context=context)
            emp_children = map(lambda x: x['member_ids'], data_dept)
            emp_children = tools.flatten(emp_children)
            children = self.emp_to_users(cr, uid, emp_children, context=context)
            children = obj_user.search(cr, uid, [('id', 'in', children), ('active', '=', True)], context=context)
            if user_id in children:
                children.remove(user_id)
            child_ids.extend(tools.flatten(children))
            # remove duplicates
            child_ids = list(set(child_ids))
        result[user_id] = child_ids
    return result
def second_pass_data_and_labels(notes):
    '''
    second_pass_data_and_labels()

    Purpose: Interface with notes object to get text data and labels

    @param notes. List of Note objects
    @return <tuple> whose elements are:
              0) list of chunked sentences
              1) list of list-of-indices designating chunks
              2) list of labels for chunks

    >>> import os
    >>> from notes.note import Note
    >>> base_dir = os.path.join(os.getenv('CLINER_DIR'), 'tests', 'data')
    >>> txt = os.path.join(base_dir, 'single.txt')
    >>> con = os.path.join(base_dir, 'single.con')
    >>> note_tmp = Note('i2b2')
    >>> note_tmp.read(txt, con)
    >>> notes = [note_tmp]
    >>> second_pass_data_and_labels(notes)
    ([['The score stood four to two', ',', 'with', 'but', 'one', 'inning', 'more', 'to', 'play', ',']], [[0]], ['problem'])
    '''
    # Get the data and annotations from the Note objects
    l_chunked_sentences = [note.getChunkedText()    for note in notes]
    l_inds_list         = [note.getConceptIndices() for note in notes]
    l_con_labels        = [note.getConceptLabels()  for note in notes]

    chunked_sentences = flatten(l_chunked_sentences)
    inds_list         = flatten(l_inds_list)
    con_labels        = flatten(l_con_labels)

    #print 'labels: ', len(con_labels)
    #print 'inds:   ', sum(map(len, inds_list))
    #exit()

    return chunked_sentences, inds_list, con_labels
def replay(self, mem, batchsize=None):
    # training area
    batchsize = len(mem) if batchsize is None else batchsize
    minibatch = random.sample(mem, min(len(mem), batchsize))
    for state, action, reward, next_state, done in minibatch:
        target = self.fPass(state)
        if done:
            target[action] = 0
        else:
            target[action] = reward + self.gamma * numpy.max(self.fPass(next_state))
        guess = self.forward(self.toTensor(tools.flatten(state)))
        loss = self.criterion(guess, self.toTensor(target))
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
def run_query(self, query):
    params = {
        "date1": super()._dateformat(self.num_days),
        "date2": super()._dateformat(),
        "version": super()._versioncond()
    }
    self.cursor.execute(query.format(**params))
    result = self.cursor.fetchall()
    if not self.totalusers:
        super()._totalusers(params, 'modules')
    return {
        'versions': tools.flatten(result),
        'count': self.totalusers
    }
def main():
    parser = argparse.ArgumentParser(description="Post-process dual ISO images with Magic Lantern.",
                                     parents=[logger.loggingParser])

    parser.add_argument("images", nargs="+",
                        help="Image files or directories.")
    parser.add_argument("--cr2hdr", default="$HOME/magic-lantern/modules/dual_iso/cr2hdr",
                        help="Executable cr2hdr. [Default: %(default)s]")
    parser.add_argument("--raw-ext", nargs="+", default=["cr2", "CR2"],
                        help="RAW file extensions. [Default: %(default)s]")

    args = parser.parse_args()
    logger.initLogger(args)
    args.cr2hdr = os.path.expandvars(args.cr2hdr)

    images = tools.flatten(
        [tools.flatten([glob.glob(os.path.join(arg, "*."+ext)) for ext in args.raw_ext])
         if os.path.isdir(arg) else [arg] for arg in args.images]
    )

    """
    images_isos = [
        (image, phototools.load_exif_field(image, "-ISO"), phototools.load_exif_field(image, "-AutoISO"))
        for image in progressiterator.ProgressIterator(images, description="Load EXIF ISO infos")
    ]
    dual_iso_images = [image for image, iso, auto_iso in images_isos if iso != auto_iso]
    """
    dual_iso_images = images

    for image in progressiterator.ProgressIterator(dual_iso_images,
                                                   description="Post-process dual ISO images"):
        command = "%s %s" % (args.cr2hdr, image)
        log.debug(command)
        logger.subprocessCall(command)
def _child_compute(self, cr, uid, ids, name, args, context={}):
    obj_dept = self.pool.get('hr.department')
    obj_user = self.pool.get('res.users')
    result = {}
    for manager_id in ids:
        child_ids = []
        mgnt_dept_ids = obj_dept.search(cr, uid, [('manager_id', '=', manager_id)])
        ids_dept = obj_dept.search(cr, uid, [('id', 'child_of', mgnt_dept_ids)])
        if ids_dept:
            data_dept = obj_dept.read(cr, uid, ids_dept, ['member_ids'])
            childs = map(lambda x: x['member_ids'], data_dept)
            childs = tools.flatten(childs)
            childs = obj_user.search(cr, uid, [('id', 'in', childs), ('active', '=', True)])
            if manager_id in childs:
                childs.remove(manager_id)
            child_ids.extend(tools.flatten(childs))
            # remove duplicates
            child_ids = list(set(child_ids))
        else:
            child_ids = []
        result[manager_id] = child_ids
    return result
def generic_predict(p_or_n, tokenized_sents, vocab, clf, use_lstm):
    '''
    generic_predict()

    Predict labels with a model that works for both prose and nonprose

    @param p_or_n.          A string that indicates "prose", "nonprose", or "all"
    @param tokenized_sents. A list of sentences, where each sentence is tokenized
                            into words
    @param vocab.           A dictionary mapping word tokens to numeric indices.
    @param clf.             An encoding of the trained keras model.
    @param use_lstm.        Bool indicating whether clf is a CRF or LSTM.
    '''
    # If nothing to predict, skip actual prediction
    if len(tokenized_sents) == 0:
        print '\tnothing to predict ' + p_or_n
        return []

    print '\tvectorizing words ' + p_or_n

    if use_lstm:
        # vectorize tokenized sentences
        X = []
        for sent in tokenized_sents:
            id_seq = []
            for w in sent:
                if w in vocab:
                    id_seq.append(vocab[w])
                else:
                    id_seq.append(vocab['oov'])
            X.append(id_seq)
    else:
        # vectorize validation X
        text_features = extract_features(tokenized_sents)
        flat_X_feats = vocab.transform(flatten(text_features))
        X = reconstruct_list(flat_X_feats, save_list_structure(text_features))

    print '\tpredicting labels ' + p_or_n

    # Predict labels
    if use_lstm:
        predictions = keras_ml.predict(clf, X)
    else:
        predictions = crf.predict(clf, X)

    # Format labels from output
    return predictions
def detect_alignments(alignment_pair, segment_length=10, overlap=5,
                      statistics=None, dist_func=token_match):
    logging.debug(str(alignment_pair))
    seg_dists = compute_distances(alignment_pair.susp_doc, alignment_pair.src_doc,
                                  segment_length=segment_length,
                                  overlap=overlap,
                                  dist_func=dist_func)
    detected = detect_segments(seg_dists)
    if statistics:
        statistics.add_detected_count(len(detected))
        statistics.add_susp_detected_count(len(set(map(itemgetter(0), detected))))
        statistics.add_src_detected_count(len(set(map(itemgetter(1), detected))))
    detected = flatten([zip(seg_to_sent(susp, segment_length, overlap),
                            seg_to_sent(src, segment_length, overlap))
                        for susp, src in detected])
    return detected
def __init__(self, name_build, config=Configuration(), *args, **kwargs):
    """ Model builder constructor.
    Args:
        name_build Model name or constructor function
        config     Configuration to use for the parameter tensors
        ...        Additional (keyword-)arguments forwarded to the constructor
    Notes:
        If possible, data parallelism is enabled automatically
    """
    # Recover name/constructor
    if callable(name_build):
        name = tools.fullqual(name_build)
        build = name_build
    else:
        models = type(self)._get_models()
        name = str(name_build)
        build = models.get(name, None)
        if build is None:
            raise tools.UnavailableException(models, name, what="model name")
    # Build model
    with torch.no_grad():
        model = build(*args, **kwargs)
        if not isinstance(model, torch.nn.Module):
            raise tools.UserException("Expected built model %r to be an instance of 'torch.nn.Module', found %r instead" % (name, getattr(type(model), "__name__", "<unknown>")))
        model = model.to(**config)
        device = config["device"]
        if device.type == "cuda" and device.index is None:
            # Model is on GPU and not explicitly restricted to one particular card => enable data parallelism
            model = torch.nn.DataParallel(model)
        params = tools.flatten(model.parameters())  # NOTE: Ordering across runs/nodes seems to be ensured (i.e. only dependent on the model constructor)
    # Finalization
    self._model    = model
    self._name     = name
    self._config   = config
    self._params   = params
    self._gradient = None
    self._defaults = {
        "trainset":  None,
        "testset":   None,
        "loss":      None,
        "criterion": None,
        "optimizer": None}
def read_gold_alignments(alignment_pair):
    doc = BeautifulStoneSoup(open(alignment_pair.plagiarism_xml_fn()).read())
    plag_spans = []
    for feature in doc.findAll('feature'):
        if feature['name'] == 'plagiarism':
            src_span = (int(feature['source_offset']), int(feature['source_length']))
            susp_span = (int(feature['this_offset']), int(feature['this_length']))
            plag_spans.append((susp_span, src_span))
    plag_segs = []
    for (susp_offset, susp_len), (src_offset, src_len) in plag_spans:
        susp_seg = match_seg(alignment_pair.susp_doc, susp_offset, susp_len)
        src_seg = match_seg(alignment_pair.src_doc, src_offset, src_len)
        plag_segs.append((susp_seg, src_seg))
    return flatten(plag_segs)
def reward(threshold, defender_rates, attacker_rates, defender_costs, attacker_costs):
    n = len(defender_rates)
    # all four rate/cost lists must have the same length
    if not (n == len(attacker_rates) == len(defender_costs) == len(attacker_costs)):
        raise Exception("Unequal set of lists")
    else:
        resources = list(range(0, n))
        subsets = []
        for t in range(threshold, n + 1):
            subsets.append(list(itertools.combinations(resources, t)))
        subsets = tools.flatten(subsets)
        if () in subsets and len(defender_costs) != 1:
            subsets.remove(())

        gain = 0
        for l in subsets:
            # creating the product
            p = 1
            for i in range(0, n):
                p *= 1 / (defender_rates[i] + attacker_rates[i])
                if i in l:
                    p *= attacker_rates[i]
                else:
                    p *= defender_rates[i]
            gain += p

        defender_move_cost = 0
        attacker_move_cost = 0
        for i in range(0, n):
            defender_move_cost += defender_rates[i] * defender_costs[i]
            attacker_move_cost += attacker_rates[i] * attacker_costs[i]

        defender_reward = 1 - gain - defender_move_cost
        attacker_reward = gain - attacker_move_cost
        return defender_reward, attacker_reward
def images_to_video(self, image_files, image_durations=[5.0], transition_times=[1.0], video_file=None):
    # determine video file
    video_file = self._check_output_video_file(video_file)

    # check parameter list lengths
    if len(image_durations) > 1 and len(image_durations) < len(image_files):
        log.warning("Not enough image durations specified.")
    image_durations = (image_durations*len(image_files))[:len(image_files)]

    if len(transition_times) > 1 and len(transition_times) < (len(image_files) - 1):
        log.warning("Not enough image transition times specified.")
    transition_times = (transition_times*(len(image_files)-1))[:(len(image_files)-1)]

    # create image videos
    image_videos = [
        Multivision.image_to_video(self, image_file, duration,
                                   output_file_extension=self.video_format)
        for image_file, duration in progressiterator.ProgressIterator(
            zip(image_files, image_durations), description="Process images")
    ]

    # create transition videos
    transition_videos = [
        Multivision.image_transition_to_video(self, image_file1, image_file2, transition_time,
                                              output_file_extension=self.video_format)
        for image_file1, image_file2, transition_time in progressiterator.ProgressIterator(
            zip(image_files[:-1], image_files[1:], transition_times),
            description="Process image transitions")
    ]

    # concatenate videos
    video_files = tools.flatten(zip(image_videos[:-1], transition_videos)) + [image_videos[-1]]
    video_files = [v for v in video_files if v != "None"]
    temporary_files = video_files
    self.concatenate_videos(video_files, concatenated_video_file=video_file)

    # remove temporary files
    #for temporary_file in temporary_files:
    #    os.remove(temporary_file)

    # return video file
    return video_file
def to_sql(self):
    stack = []
    params = []
    for i, e in reverse_enumerate(self.__exp):
        if self._is_leaf(e, internal=True):
            table = self.__tables.get(i, self.__main_table)
            q, p = self.__leaf_to_sql(e, table)
            params.insert(0, p)
            stack.append(q)
        else:
            if e == '!':
                stack.append('(NOT (%s))' % (stack.pop(),))
            else:
                ops = {'&': ' AND ', '|': ' OR '}
                q1 = stack.pop()
                q2 = stack.pop()
                stack.append('(%s %s %s)' % (q1, ops[e], q2,))

    query = ' AND '.join(reversed(stack))
    joins = ' AND '.join(map(lambda j: j[0], self.__joins))
    if joins:
        query = '(%s) AND (%s)' % (joins, query)
    return (query, flatten(params))
    elif stopatradeta and (self.pdgid == pdgid_eta and
                           pdgid_gamma in map(lambda x: x.pdgid, self.daughters)):
        return [self]
    elif stopatradomega and (self.pdgid == pdgid_omega and
                             pdgid_gamma in map(lambda x: x.pdgid, self.daughters)):
        #print 'here'
        return [self]
    else:
        return tools.flatten([dau.interestingDescendants(terminii=terminii,
                                                         stopatradrho=stopatradrho,
                                                         stopatraddecay=stopatraddecay,
                                                         stopatallphodecay=stopatallphodecay,
                                                         stopatradeta=stopatradeta,
                                                         stopatradomega=stopatradomega)
                              for dau in self.daughters])

def mcDmode(self):
    if abs(self.pdgid) not in (pdgid_Dp, pdgid_Dz, pdgid_Dsp):
        return None
    else:
        list = self.interestingDescendants()
        retval = 0
        for node in list:
            if node.pdgid == pdgid_Km:
                retval += 1
            elif node.pdgid == pdgid_Kp:
                retval += 10
def __second_predict(self, chunked_sentences, inds_list):
    # If first pass predicted no concepts, then skip
    # NOTE: Special case because SVM cannot have empty input
    if sum([len(inds) for inds in inds_list]) == 0:
        print "first pass predicted no concepts, skipping second pass"
        return []

    # Create object that is a wrapper for the features
    if globals_cliner.verbosity > 0:
        print '\textracting features (pass two)'

    # Extract features
    text_features = [feat_obj.concept_features(s, inds)
                     for s, inds in zip(chunked_sentences, inds_list)]
    flattened_text_features = flatten(text_features)

    if globals_cliner.verbosity > 0:
        print '\tvectorizing features (pass two)'

    # Vectorize features
    vectorized_features = self._second_vec.transform(flattened_text_features)

    if globals_cliner.verbosity > 0:
        print '\tpredicting labels (pass two)'

    # Predict concept labels
    out = sci.predict(self._second_clf, vectorized_features)

    # Line-by-line processing
    o = list(out)
    classifications = []
    for lineno, inds in enumerate(inds_list):

        # Skip empty line
        if not inds:
            continue

        # For each concept
        for ind in inds:

            # Get next concept
            concept = reverse_concept_labels[o.pop(0)]

            # Get start position (ex. 7th word of line)
            start = 0
            for i in range(ind):
                start += len(chunked_sentences[lineno][i].split())

            # Length of chunk
            length = len(chunked_sentences[lineno][ind].split())

            # Classification token
            classifications.append((concept, lineno + 1, start, start + length - 1))

    # Return classifications
    return classifications
def add_rooms(*rooms):
    """ Add multiple room objects to the building. """
    room_list = tools.flatten(rooms)
    for room in room_list:
        Amity.add_room(room)
def setup(self):
    # Extract a single experiment from the table that is not
    # already running. set self.experiment and self.state
    super(DBRSyncChannel, self).setup()

    self.state.jobman.sql.host_name = socket.gethostname()

    def state_del(state, keys):
        # Delete from the state the following key if present
        for key in keys:
            if hasattr(state, key):
                del state[key]

    # put jobs scheduler info into the state
    condor_slot = os.getenv("_CONDOR_SLOT")
    sge_task_id = os.getenv('SGE_TASK_ID')
    pbs_task_id = os.getenv('PBS_JOBID')
    if condor_slot:
        self.state.jobman.sql.condor_slot = condor_slot
        job_ad_file = os.getenv("_CONDOR_JOB_AD", None)
        if job_ad_file:
            f = open(job_ad_file)
            try:
                for line in f.readlines():
                    if line.startswith('GlobalJobId = '):
                        self.state.jobman.sql.condor_global_job_id = line.split('=')[1].strip()[1:-1]
                    elif line.startswith('Out = '):
                        self.state.jobman.sql.condor_stdout = line.split('=')[1].strip()[1:-1]
                    elif line.startswith('Err = '):
                        self.state.jobman.sql.condor_stderr = line.split('=')[1].strip()[1:-1]
                    elif line.startswith('OrigIwd = '):
                        self.state.jobman.sql.condor_origiwd = line.split('=')[1].strip()[1:-1]
            finally:
                f.close()
    elif sge_task_id:
        self.state.jobman.sql.sge_task_id = sge_task_id
        self.state.jobman.sql.job_id = os.getenv('JOB_ID')
        self.state.jobman.sql.sge_stdout = os.getenv('SGE_STDOUT_PATH')
        self.state.jobman.sql.sge_stderr = os.getenv('SGE_STDERR_PATH')
    elif pbs_task_id:
        self.state.jobman.sql.pbs_task_id = pbs_task_id
        self.state.jobman.sql.pbs_queue = os.getenv('PBS_QUEUE')
        self.state.jobman.sql.pbs_arrayid = os.getenv('PBS_ARRAYID')
        self.state.jobman.sql.pbs_num_ppn = os.getenv('PBS_NUM_PPN')

    # delete old jobs scheduler info from the state
    # this is needed in case we move a job to a different system,
    # to know where it is running now.
    key_to_del = []
    if not condor_slot:
        key_to_del.extend(['jobman.sql.condor_global_job_id',
                           'jobman.sql.condor_stdout',
                           'jobman.sql.condor_stderr',
                           'jobman.sql.condor_origiwd',
                           'jobman.sql.condor_slot'])
    if not sge_task_id:
        key_to_del.extend(['jobman.sql.sge_task_id',
                           'jobman.sql.job_id',
                           'jobman.sql.sge_stdout',
                           'jobman.sql.sge_stderr'])
    if not pbs_task_id:
        key_to_del.extend(['jobman.sql.pbs_task_id',
                           'jobman.sql.pbs_queue',
                           'jobman.sql.pbs_arrayid',
                           'jobman.sql.pbs_num_ppn'])
    flattened_state = flatten(self.state)
    deleted = False
    for k in key_to_del:
        if k in flattened_state:
            del flattened_state[k]
            deleted = True
    if deleted:
        self.state = expand(flattened_state)

    self.state.jobman.sql.start_time = time.time()
    self.state.jobman.sql.host_workdir = self.path
    self.dbstate.update(flatten(self.state))
def interestingDescendants(self, terminii=interesting_particles,
                           stopatradrho=True, stopatraddecay=True,
                           stopatallphodecay=True, stopatradeta=True,
                           stopatradomega=True):
    if self.pdgid == pdgid_gammaFSR:
        return []
    if (abs(self.pdgid) == pdgid_rhop and len(self.daughters) < 2):
        print [particle_data.findId(x.pdgid).name for x in self.daughters]
        return self.daughters
    if ((abs(self.pdgid) == pdgid_rhop and len(self.daughters) > 1 and
         abs(self.daughters[0].pdgid) == pdgid_pip and
         self.daughters[1].pdgid == pdgid_gamma)):
        print 'rho->pi gamma: hahahaha'
        print self
        for node in self.daughters:
            print node

    # we don't care about decays in flight or pi0 daughters
    # Dumb hack : we don't care right now for non-FSR photons, so
    # terminate if X -> Y gamma, else carry on
    # But hey, this kills radiative K* decays ... make an option
    if self.pdgid in terminii:
        return [self]
    elif (stopatallphodecay and
          len(self.daughters) == len(filter(lambda x: x.pdgid == pdgid_gamma, self.daughters))):
        return [self]
    elif stopatraddecay and (len(self.daughters) == 2 and
                             pdgid_gamma in (self.daughters[0].pdgid, self.daughters[1].pdgid)):
        ## if self.pdgid == pdgid_etaprime:
        ##     print 'HO'
        ##     print 'xxxxxxxxxxx'
        ##     print self
        ##     print self.daughters[0]
        ##     print self.daughters[1]
        ##     print 'xxxxxxxxxxx'
        ## if self.pdgid == 323:
        ##     print 'here', terminii, stopatraddecay
        return [self]
    elif stopatradrho and ((abs(self.pdgid) == pdgid_rhop and
                            abs(self.daughters[0].pdgid) == pdgid_pip and
                            self.daughters[1].pdgid == pdgid_gamma)):
        print 'rho check'
        return [self]
    elif stopatradeta and (self.pdgid == pdgid_eta and
                           pdgid_gamma in map(lambda x: x.pdgid, self.daughters)):
        return [self]
    elif stopatradomega and (self.pdgid == pdgid_omega and
                             pdgid_gamma in map(lambda x: x.pdgid, self.daughters)):
        #print 'here'
        return [self]
    else:
        return tools.flatten([dau.interestingDescendants(terminii=terminii,
                                                         stopatradrho=stopatradrho,
                                                         stopatraddecay=stopatraddecay,
                                                         stopatallphodecay=stopatallphodecay,
                                                         stopatradeta=stopatradeta,
                                                         stopatradomega=stopatradomega)
                              for dau in self.daughters])
def generic_train(p_or_n, tokenized_sents, iob_nested_labels,
                  val_sents=None, val_labels=None, dev_split=None):
    '''
    generic_train()

    Train a model that works for both prose and nonprose

    @param p_or_n.            A string that indicates "prose", "nonprose", or "all"
    @param tokenized_sents.   A list of sentences, where each sentence is tokenized
                              into words
    @param iob_nested_labels. Parallel to `tokenized_sents`, 7-way labels for
                              concept spans
    @param val_sents.         Validation data. Same format as tokenized_sents
    @param val_labels.        Validation data. Same format as iob_nested_labels
    @param dev_split.         A real number from 0 to 1
    '''
    # Must have data to train on
    if len(tokenized_sents) == 0:
        raise Exception('Training must have %s training examples' % p_or_n)

    # if you should split the data into train/dev yourself
    #if (not val_sents) and (dev_split > 0.0) and (len(tokenized_sents)>1000):
    if (not val_sents) and (dev_split > 0.0) and (len(tokenized_sents) > 10):

        p = int(dev_split*100)
        print '\tCreating %d/%d train/dev split' % (100-p, p)

        perm = range(len(tokenized_sents))
        random.shuffle(perm)

        tokenized_sents   = [tokenized_sents[i]   for i in perm]
        iob_nested_labels = [iob_nested_labels[i] for i in perm]

        ind = int(dev_split*len(tokenized_sents))

        val_sents   = tokenized_sents[:ind]
        train_sents = tokenized_sents[ind:]

        val_labels   = iob_nested_labels[:ind]
        train_labels = iob_nested_labels[ind:]

        tokenized_sents   = train_sents
        iob_nested_labels = train_labels

    print '\tvectorizing words', p_or_n

    #tokenized_sents   = train_sents[ :2]
    #iob_nested_labels = train_labels[:2]

    # count word frequencies to determine OOV
    freq = defaultdict(int)
    for sent in tokenized_sents:
        for w in sent:
            freq[w] += 1

    # determine OOV based on % of vocab or minimum word freq threshold
    oov = set()
    '''
    if len(freq) < 100:
        lo = len(freq)/20
        oov = set([ w for w,f in sorted(freq.items(), key=lambda t:t[1]) ][:lo])
    else:
        #lo = 2
        #oov = set([ w for w,f in freq.items() if (f <= lo) ])
        oov = set()
    '''

    '''
    val = None
    for w,f in sorted(freq.items(), key=lambda t:t[1]):
        if val != f:
            val = f
            print
        print '%8d %s' % (f,w)
    exit()
    '''

    ########
    # CRF
    ########

    # vectorize tokenized sentences
    '''
    def make_feature(ind):
        return {(ind,i):1 for i in range(10)}
    text_features = []
    for sent in tokenized_sents:
        fseq = [make_feature(vocab[w] if w in vocab else vocab['oov']) for w in sent]
        text_features.append(fseq)
    '''
    text_features = extract_features(tokenized_sents)

    # Collect list of feature types
    enabled_features = set()
    for sf in text_features:
        for wf in sf:
            for (feature_type, instance), value in wf.items():
                if feature_type.startswith('prev'):
                    feature_type = 'PREV*'
                if feature_type.startswith('next'):
                    feature_type = 'NEXT*'
                enabled_features.add(feature_type)
    enabled_features = sorted(enabled_features)

    # Vectorize features
    vocab = DictVectorizer()
    flat_X_feats = vocab.fit_transform(flatten(text_features))
    X_feats = reconstruct_list(flat_X_feats, save_list_structure(text_features))

    # vectorize IOB labels
    Y_labels = [[tag2id[y] for y in y_seq] for y_seq in iob_nested_labels]

    assert len(X_feats) == len(Y_labels)
    for i in range(len(X_feats)):
        assert X_feats[i].shape[0] == len(Y_labels[i])

    # if there is specified validation data, then vectorize it
    if val_sents:
        # vectorize validation X
        val_text_features = extract_features(val_sents)
        flat_val_X_feats = vocab.transform(flatten(val_text_features))
        val_X = reconstruct_list(flat_val_X_feats,
                                 save_list_structure(val_text_features))
        # vectorize validation Y
        val_Y = [[tag2id[y] for y in y_seq] for y_seq in val_labels]

    print '\ttraining classifiers', p_or_n

    #val_sents  = val_sents[ :5]
    #val_labels = val_labels[:5]

    # train using crf
    clf, dev_score = crf_ml.train(X_feats, Y_labels, val_X=val_X, val_Y=val_Y)

    return vocab, clf, dev_score, enabled_features
# meta verbs with synonyms
meta_verbs = (("q", "quit"), )

# direction verbs with synonyms
direction_verbs = (
    ("n", "north"),
    ("e", "east"),
    ("s", "south"),
    ("w", "west"),
)

# all the verbs!
verbs = tools.flatten((
    *meta_verbs,
    *direction_verbs,
))

########################################
# REPL
########################################

game = Game("Adventure", rooms, player)
game.start("outside")

while True:
    # get user input
    user_input = game.prompt_user()

    # check validity of user input
def get_files(directory, extensions):
    files = tools.flatten([glob.glob(os.path.join(directory, "*.%s" % ext))
                           for ext in extensions])
    return list(set(files))
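A quick usage example (with a hypothetical directory), collecting RAW and JPEG files in one deduplicated list:

files = get_files("/media/camera/DCIM", ["cr2", "jpg"])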
def extract_table_from_img(input_img_name, output_img_path=None,
                           show_tables=False, save_small_tables=False,
                           get_test_tables=False):
    """
    table extracted from img will be saved in table_info
    if you want to draw rectangles to directly show tables of the img,
    set show_tables=True
    """
    print('I am working on extracting table from image')
    os.path.isfile(input_img_name)
    img = cv2.imread(input_img_name)
    max_area = img.shape[0] * img.shape[1]
    max_area_condition = max_area * 3 / 4
    gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    edge_img = cv2.Canny(gray_img, 50, 150)
    dilate_kernel = cv2.getStructuringElement(cv2.MORPH_OPEN, (7, 3))
    dilate_image = cv2.dilate(edge_img, dilate_kernel, iterations=1)
    res, binary_img = cv2.threshold(dilate_image, 45, 255, cv2.THRESH_BINARY)
    horizontal_dilation = get_table_lines(binary_img, kernel_size=(50, 1))
    vertical_dilation = get_table_lines(binary_img, kernel_size=(1, 50))
    table_dilation = horizontal_dilation + vertical_dilation
    # table_dilation = cv2.dilate(table_dilation, dilate_kernel, iterations=1)
    table_dilation, contours, hierarchy = cv2.findContours(
        table_dilation, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
    # print(len(contours))
    rec_coo = []
    for i in range(len(contours)):
        contour_coordinates = contours[i]
        x_coordinates = flatten(contour_coordinates[:, :, 0].tolist())
        y_coordinates = flatten(contour_coordinates[:, :, 1].tolist())
        x_no_repeat_list = list(set(x_coordinates))
        y_no_repeat_list = list(set(y_coordinates))
        same_x_num = len(x_coordinates) - len(x_no_repeat_list)
        same_y_num = len(y_coordinates) - len(y_no_repeat_list)
        if same_x_num >= 2 and same_y_num >= 2:
            rec_x_min, rec_y_min, rec_x_max, rec_y_max = find_left_right_conner(
                x_coordinates, y_coordinates)
            find_area = (rec_x_max - rec_x_min) * (rec_y_max - rec_y_min)
            if find_area != 0 and find_area < max_area_condition:
                # print('find left right conner')
                rec_coo.append([rec_x_min, rec_y_min, rec_x_max, rec_y_max])
                if show_tables or get_test_tables:
                    cv2.rectangle(img, (rec_x_min, rec_y_min),
                                  (rec_x_max, rec_y_max), (0, 255, 0), 3)
                    f_name = get_file_name(input_img_name)
                    cv2.imwrite(os.path.join(f_name + '_Draw.png'), img)

    table_num = 0
    # extract table from img_name
    rec_list = []
    for x_y_coo in rec_coo:
        rec = img[x_y_coo[1]:x_y_coo[3] + 1, x_y_coo[0]:x_y_coo[2] + 1]
        rec_list.append(rec)
        if save_small_tables:
            table_label = os.path.join(
                output_img_path, 'g1_0_table_cut' + str(table_num) + '.png')
            cv2.imwrite(table_label, rec)
            table_num += 1
    return rec_list, rec_coo
def get_cases_ids_from_references(references):
    return list({
        int(x)
        for x in flatten([CASE_ID_RE.findall(ref) for ref in references])
    })
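Assuming, purely for illustration, that CASE_ID_RE captures the numeric part of a case reference (e.g. re.compile(r'case[- ](\d+)', re.I)), the function would behave like:

# get_cases_ids_from_references(["see case 12 and case 15", "case 12 again"])
# -> [12, 15]   (deduplicated by the set comprehension; order not guaranteed)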
def generic_train(p_or_n, tokenized_sents, iob_nested_labels, use_lstm,
                  val_sents=None, val_labels=None, dev_split=None):
    '''
    generic_train()

    Train a model that works for both prose and nonprose

    @param p_or_n.            A string that indicates "prose", "nonprose", or "all"
    @param tokenized_sents.   A list of sentences, where each sentence is tokenized
                              into words
    @param iob_nested_labels. Parallel to `tokenized_sents`, 7-way labels for
                              concept spans
    @param use_lstm           Bool indicating whether to train CRF or LSTM.
    @param val_sents.         Validation data. Same format as tokenized_sents
    @param val_labels.        Validation data. Same format as iob_nested_labels
    @param dev_split.         A real number from 0 to 1
    '''
    # Must have data to train on:
    if len(tokenized_sents) == 0:
        raise Exception('Training must have %s training examples' % p_or_n)

    # if you should split the data into train/dev yourself
    #if (not val_sents) and (dev_split > 0.0) and (len(tokenized_sents)>1000):
    if (not val_sents) and (dev_split > 0.0) and (len(tokenized_sents) > 10):

        p = int(dev_split * 100)
        print '\tCreating %d/%d train/dev split' % (100 - p, p)

        perm = range(len(tokenized_sents))
        random.shuffle(perm)

        tokenized_sents   = [tokenized_sents[i]   for i in perm]
        iob_nested_labels = [iob_nested_labels[i] for i in perm]

        ind = int(dev_split * len(tokenized_sents))

        val_sents   = tokenized_sents[:ind]
        train_sents = tokenized_sents[ind:]

        val_labels   = iob_nested_labels[:ind]
        train_labels = iob_nested_labels[ind:]

        tokenized_sents   = train_sents
        iob_nested_labels = train_labels

    print '\tvectorizing words', p_or_n

    #tokenized_sents   = train_sents[ :2]
    #iob_nested_labels = train_labels[:2]

    # count word frequencies to determine OOV
    freq = defaultdict(int)
    for sent in tokenized_sents:
        for w in sent:
            freq[w] += 1

    # determine OOV based on % of vocab or minimum word freq threshold
    oov = set()
    '''
    if len(freq) < 100:
        lo = len(freq)/20
        oov = set([ w for w,f in sorted(freq.items(), key=lambda t:t[1]) ][:lo])
    else:
        #lo = 2
        #oov = set([ w for w,f in freq.items() if (f <= lo) ])
        oov = set()
    '''

    '''
    val = None
    for w,f in sorted(freq.items(), key=lambda t:t[1]):
        if val != f:
            val = f
            print
        print '%8d %s' % (f,w)
    exit()
    '''

    if use_lstm:
        ########
        # LSTM
        ########

        # build vocabulary of words
        vocab = {}
        for sent in tokenized_sents:
            for w in sent:
                if (w not in vocab) and (w not in oov):
                    vocab[w] = len(vocab) + 1
        vocab['oov'] = len(vocab) + 1

        # vectorize tokenized sentences
        X_seq_ids = []
        for sent in tokenized_sents:
            id_seq = [(vocab[w] if w in vocab else vocab['oov']) for w in sent]
            X_seq_ids.append(id_seq)

        # vectorize IOB labels
        Y_labels = [[tag2id[y] for y in y_seq] for y_seq in iob_nested_labels]

        # if there is specified validation data, then vectorize it
        if val_sents:
            # vectorize validation X
            val_X = []
            for sent in val_sents:
                id_seq = [(vocab[w] if w in vocab else vocab['oov']) for w in sent]
                val_X.append(id_seq)
            # vectorize validation Y
            val_Y = [[tag2id[y] for y in y_seq] for y_seq in val_labels]

    else:
        ########
        # CRF
        ########

        # vectorize tokenized sentences
        '''
        def make_feature(ind):
            return {(ind,i):1 for i in range(10)}
        text_features = []
        fseq = [make_feature(vocab[w] if w in vocab else vocab['oov']) for w in sent]
        text_features.append(fseq)
        '''
        text_features = extract_features(tokenized_sents)
        # type(text_features): <type 'list'>

        # Collect list of feature types
        enabled_features = set()
        for sf in text_features:
            for wf in sf:
                for (feature_type, instance), value in wf.items():
                    if feature_type.startswith('prev'):
                        feature_type = 'PREV*'
                    if feature_type.startswith('next'):
                        feature_type = 'NEXT*'
                    enabled_features.add(feature_type)
        enabled_features = sorted(enabled_features)

        # Vectorize features
        vocab = DictVectorizer()
        flat_X_feats = vocab.fit_transform(flatten(text_features))
        X_feats = reconstruct_list(flat_X_feats, save_list_structure(text_features))

        # vectorize IOB labels
        Y_labels = [[tag2id[y] for y in y_seq] for y_seq in iob_nested_labels]

        assert len(X_feats) == len(Y_labels)
        for i in range(len(X_feats)):
            assert X_feats[i].shape[0] == len(Y_labels[i])

        # if there is specified validation data, then vectorize it
        if val_sents:
            # vectorize validation X
            val_text_features = extract_features(val_sents)
            flat_val_X_feats = vocab.transform(flatten(val_text_features))
            val_X = reconstruct_list(flat_val_X_feats,
                                     save_list_structure(val_text_features))
            # vectorize validation Y
            val_Y = [[tag2id[y] for y in y_seq] for y_seq in val_labels]

    print '\ttraining classifiers', p_or_n

    #val_sents  = val_sents[ :5]
    #val_labels = val_labels[:5]

    if use_lstm:
        # train using lstm
        clf, dev_score = keras_ml.train(X_seq_ids, Y_labels, tag2id, len(vocab),
                                        val_X_ids=val_X, val_Y_ids=val_Y)
    else:
        # train using crf
        clf, dev_score = crf.train(X_feats, Y_labels, val_X=val_X, val_Y=val_Y)

    return vocab, clf, dev_score, enabled_features
def token_match(susp_sents, src_sents):
    susp_tokens = set(flatten([sent.words() for sent in susp_sents]))
    src_tokens = set(flatten([sent.words() for sent in src_sents]))
    return len(susp_tokens.intersection(src_tokens)) / (len(src_tokens) * 1.0)
def plotload(plotname, work, specfile=None, choice_override=None, use_group=False):
    """
    Load postprocessing data for making a plot.
    Note that we currently do not use the specs items.
    """
    #---read plot specification
    if 0:
        if not specfile:
            specfile = work.paths['specs_file']
        #---load the yaml specifications file
        if type(specfile) == str:
            specfile = [specfile]
        raw_specs = ''
        for sfn in specfile:
            with open(sfn, 'r') as fp:
                raw_specs += '\n' + fp.read()
        specs = yaml.load(raw_specs)
    specs = work.load_specs()
    #---merge automatic plots here
    if 0:
        if 'autoplots' in specs:
            for key, val in specs['autoplots'].items():
                if key in specs['plots']:
                    raise Exception('\n[ERROR] redundant names in plots and autoplots: %s' % key +
                                    ", which is populated with django so check calculator.Calculation")
                else:
                    specs['plots'][key] = deepcopy(val)
    plotspecs = specs['plots'][plotname]
    #---load the calculation from the workspace
    calcnames = plotspecs['calculation']
    if type(calcnames) == str:
        calcnames = [calcnames]
    datasets = {name: [] for name in calcnames}
    calcsets = {name: [] for name in calcnames}
    #---loop over calcnames requested in the plot specs
    for calcname in calcnames:
        calcs = work.interpret_specs(work.calc[calcname])
        if len(calcs) == 0:
            raise Exception('[ERROR] failed to retrieve calculations')
        #---get the group from either plotspecs or the calculation or exception
        if 'group' in plotspecs:
            group = plotspecs['group']
        elif 'group' in work.calc[calcname]:
            group = work.calc[calcname]['group']
        else:
            group = None
        #---get the collection from either plotspecs or the upstream calculation
        if 'collections' in plotspecs:
            collections = plotspecs['collections']
            if type(collections) == str:
                collections = [collections]
        else:
            collections = calc['collections']
        sns = flatten([work.vars['collections'][c] for c in collections])
        #---compile all upstream data
        data = [{} for c in calcs]
        #---iterate over the loop over upstream calculations
        for calcnum, calcwhittle in enumerate(calcs):
            status('upstream data type: %s' % str(calcwhittle), tag='load')
            calc = deepcopy(work.calc[calcname])
            #---loop over simulations
            for snum, sn in enumerate(sns):
                status(sn.ljust(26), tag='load', i=snum, looplen=len(sns))
                #---slices in plotspecs or lookup from variables with plus-syntax
                #---! need to allow blank slices here so that the machine looks to calcs to get them
                if 'slices' in plotspecs and not re.match('^\+', plotspecs['slices']):
                    work.cursor = (work.c, work.trajectory_format)
                    sl = work.slice(sn)[plotspecs['slices']]['all' if not group else group]
                elif 'slices' in plotspecs:
                    sl = deepcopy(delve(work.vars, *plotspecs['slices'].strip('+').split('/')))
                    #---the slice might not have a filekey if its a combo
                    if 'filekey' not in sl:
                        #---! pbc and groups will usually be absent here
                        start, end, skip, pbc = [sl[i] for i in 'start,end,skip,pbc'.split(',')]
                        sl['filekey'] = '%s.%d-%d-%d' % (work.prefixer(sn), start, end, skip)
                else:
                    raise Exception('[ERROR] cannot infer slices')
                #---compute base filename
                if not group:
                    #---! deprecated
                    fn_base = re.findall('^v[0-9]+\.[0-9]+-[0-9]+-[0-9]+', sl['filekey'])[0] + '.%s' % calcname
                elif use_group:
                    #---special settings here for loading certain kinds of data eg protein_abstractor
                    fn_base = '%s.%s.pbc%s.%s' % (sl['filekey'], group, sl['pbc'], calcname)
                else:
                    fn_base = '%s.%s' % (sl['filekey'], calcname)
                #---fill in upstream details in our replicate of the calculation specs
                for route, val in [(i, j) for i, j in catalog(calcwhittle)]:
                    #---! the plot has to mimic the specs structure exactly otherwise error below
                    try:
                        endpoint = delve(work.calc[calcname], *route)
                    except:
                        import pdb; pdb.set_trace()
                    if type(endpoint) == dict and 'loop' in endpoint:
                        try:
                            penultimate = delve(calc, *route[:-1])
                            penultimate[route[-1]] = val
                        except:
                            pass
                #---get the dat file and package it
                fn = work.select_postdata(fn_base, calc, debug=True)
                if fn == None:
                    print '[ERROR] cannot locate a file necessary for plotting via work.select_postdata\n' + \
                        '[ERROR] you probably need to fix your meta.yaml file and run "make compute"\n' + \
                        '[ERROR] check the post directory at "%s" and the variables fn_base,calc\n'
                    import pdb; pdb.set_trace()
                dat_fn = os.path.basename(fn)[:-4] + 'dat'
                data[calcnum][sn] = {'data': load(dat_fn, work.postdir),
                                     'slice': sl, 'group': group, 'fn_base': fn_base}
        #---if only one calculation of this type then we elevate package
        if len(calcs) == 1:
            calcs, data = calcs[0], data[0]
        datasets[calcname], calcsets[calcname] = data, calcs
    #---if only one upstream calculation we return that directly
    if len(datasets) == 1:
        return datasets.values()[0], calcsets.values()[0]
    else:
        return datasets, calcsets
def __generic_first_train(self, p_or_n, text_features, iob_labels, do_grid=False):
    '''
    Model::__generic_first_train()

    Purpose: Train a classifier that works for both prose and nonprose

    @param p_or_n.        <string> either "prose" or "nonprose"
    @param text_features. <list-of-lists> of feature dictionaries
    @param iob_labels.    <list> of "I", "O", and "B" labels
    @param do_grid.       <boolean> indicating whether to perform grid search
    '''
    # Must have data to train on
    if len(text_features) == 0:
        raise Exception('Training must have %s training examples' % p_or_n)

    # Vectorize IOB labels
    Y_labels = [IOB_labels[y] for y in iob_labels]

    # Save list structure to reconstruct after vectorization
    offsets = save_list_structure(text_features)

    if globals_cliner.verbosity > 0:
        print '\tvectorizing features (pass one) ' + p_or_n

    #X = reconstruct_list(flatten(text_features), offsets)
    #Y = reconstruct_list(        Y_labels      , offsets)
    #for a,b in zip(X,Y):
    #    for x,y in zip(a,b):
    #        print y
    #        #print filter(lambda t:t[0]=='word', x.keys())
    #        print x.keys()
    #        print
    #    print '\n\n\n'

    # Vectorize features
    dvect = DictVectorizer()
    X_feats = dvect.fit_transform(flatten(text_features))

    # CRF needs reconstructed lists
    if self._crf_enabled:
        X_feats  = reconstruct_list(list(X_feats), offsets)
        Y_labels = reconstruct_list(Y_labels, offsets)
        lib = crf
    else:
        lib = sci

    if globals_cliner.verbosity > 0:
        print '\ttraining classifiers (pass one) ' + p_or_n

    #for i,X in enumerate(X_feats):
    #    for j,x in enumerate(X):
    #        print x, '\t', Y_labels[i][j]
    #    print
    #exit()

    # Train classifier
    clf = lib.train(X_feats, Y_labels, do_grid)

    return dvect, clf
_spex = _home + '/proj/mdwarfs/templates/spex/'
spex_fns = [line.strip() for line in os.popen('ls %s*.fits' % _spex)]
sptstr = [os.path.split(el)[1].split('_')[0] for el in spex_fns]
spts = np.array([10*int(el[0] == 'M') + int(el[1]) - 10 for el in sptstr])
spex_dats = []
for fn in spex_fns:
    temp = pyfits.getdata(fn)
    nir = (temp[0] > 0.9) * (temp[0] < 2.4)
    dw = np.diff(temp[0, nir]).mean()
    xkern = np.arange(-40, 40)
    kern = an.gaussian([1., 1./(res*dw), 0, 0], xkern)
    temp[1] = np.convolve(temp[1], kern, 'same')
    spex_dats.append(temp)

# Sort filenames:
final_fns = np.array(tools.flatten([[line.strip() for line in
                                     os.popen('ls %s*final.fits' % (proc))]
                                    for proc in procs]))
ai = np.argsort([os.path.split(fn)[1] for fn in final_fns])
final_fns = final_fns[ai]

w_ignore = [0, .91], [1.33, 1.45], [1.75, 2.04], [2.4, 999]
nobs = len(final_fns)
nmod = len(spex_fns)
chisq = np.zeros((nobs, nmod), dtype=float) + 9e99
for ii in range(nobs):
    obs0 = pyfits.getdata(final_fns[ii])
    obs0[2] = np.sqrt(obs0[2]**2 + (obs0[1]/maxsnr)**2)
    wbins = 0.5*(obs0[0, 1:] + obs0[0, 0:-1])
    wbins = np.concatenate((wbins[0] - np.diff(wbins[0:2]), wbins,
                            wbins[-1] + np.diff(wbins[-2:])))
    bestfit, bestmod = 0, 0
    for jj in range(nmod):
def densityOrderedSDRPlot(activeColumns):
    #Make sure inputs are numpy arrays
    activeColumns = numpy.asarray(activeColumns)

    ##Transform the input values into x and y coordinates
    #for each active bit, the x value is the time when it was active
    xValues = []
    for t in xrange(len(activeColumns)):
        xValues.append([t]*len(activeColumns[t]))
    xValues = numpy.fromiter(flatten(xValues),int) #not sure if int is the right thing to use here

    #Calculate y values
    #for each active bit the y value is the number of times that bit is active over the whole time range
    from collections import Counter
    from operator import itemgetter
    inputIndexes = list(flatten(activeColumns))
    #get a list of tuples - [(columnIndex,count),(columnIndex, count),...]
    c=Counter(inputIndexes).items()
    #sort this list by column index first
    s=sorted(c,key=itemgetter(0))
    #then sort the sorted list by count (so columns that have the same count will be sorted by index)
    sortedCounts = sorted(s,key=itemgetter(1),reverse=True)
    #reassign the count number in each tuple to be the position in the list
    d=[]
    for i in xrange(len(sortedCounts)):
        item = list(sortedCounts[i])
        item[1] = i
        d.append(item)
    columnRank = dict(d)
    yValues = numpy.asarray([columnRank[index] for index in inputIndexes])

    #Calculate horizontal grid lines
    #Each number next to a horizontal gridline shows the number of activations in the band above
    #split up plot into bands of the same activation count, i.e. group the 'columns' by activation count
    yTickPositions = []
    lastValue = None
    for i in xrange(len(sortedCounts)):
        currentValue = sortedCounts[i][1]
        if currentValue != lastValue:
            yTickPositions.append(i)
            lastValue = currentValue
    yTickPositions = numpy.asarray(yTickPositions) - 0.5 #the y ticks should divide the columns into groups, so they go between lines, hence the 0.5

    ##Plot the x and y coordinates
    #set up plot
    (figure,axes) = plt.subplots()
    #plot the values
    axes.plot(xValues,yValues,'.')
    #display the horizontal grid lines that go through each line of markers
    axes.minorticks_on()
    axes.grid(True,axis='both',which='minor',linestyle='solid',color=(0.7,0.7,0.7))
    axes.yaxis.set_minor_locator(matplotlib.ticker.MultipleLocator(1))
    #display the horizontal lines that separate the columns into activation amounts
    axes.grid(True,axis='y',which='major',linestyle='solid',color=(0.3,0.3,0.3))
    axes.set_yticks(yTickPositions)
    axes.set_yticklabels(numpy.unique([item[1] for item in sortedCounts])[::-1])
    #draw the grid lines beneath the points
    axes.set_axisbelow(True)
    #Add a gap to the bottom and left edges so all points can be seen
    axes.set_xlim(left=-1)
    axes.set_ylim(bottom=-1)
    #set plot aspect ratio so the axes are even. Make the plot bigger (the plot expands to the maximum size allowed by aspect ratio)
    axes.set_aspect(aspect='equal')
    figure.set_size_inches(20,20)

    return (figure, figure.axes)
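# NOTE: a quick, hypothetical way to exercise the plot above with a hand-made toy input (a
# stand-in for real SDR output). The function body assumes a Python 2 environment (xrange)
# with numpy, matplotlib, matplotlib.pyplot as plt, and a flatten helper in scope, so this
# usage sketch assumes the same.

#toy activity: column 3 fires at every timestep, column 7 twice, column 1 once
toyActiveColumns = [[3, 7], [3], [3, 7, 1]]
figure, axes = densityOrderedSDRPlot(toyActiveColumns)
figure.savefig('sdr_density_toy.png')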
def add_persons(*persons): """Add persons to the people collection """ persons_list = tools.flatten(list(persons)) for person in persons_list: Amity.add_person(person)
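# NOTE: because the arguments are run through tools.flatten, callers can mix individual persons
# and nested lists in one call. The names below are hypothetical person objects from the
# surrounding project, so this only illustrates the calling convention.
add_persons(fellow_a, [fellow_b, staff_c])   # scalars and lists flatten to one list
add_persons([fellow_d])                      # a single nested list works as well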
def generic_predict(p_or_n, tokenized_sents, vocab, clf, use_lstm, hyperparams): ''' generic_predict() Train a model that works for both prose and nonprose @param p_or_n. A string that indicates "prose", "nonprose", or "all" @param tokenized_sents. A list of sentences, where each sentence is tokenized into words @param vocab. A dictionary mapping word tokens to numeric indices. @param clf. An encoding of the trained keras model. @param use_lstm. Bool indicating whether clf is a CRF or LSTM. ''' # use_lstm=self._use_lstm if use_lstm: #parameters=hd.load_parameters_from_file("LSTM_parameters.txt") parameters['use_pretrained_model'] = True #model_folder="./models/NN_models" predictions = [] sys.stdout.write('\n use_lstm \n') dataset = Exp.Dataset() fictional_labels = copy.deepcopy(tokenized_sents) for idx, x in enumerate(fictional_labels): for val_id, value in enumerate(x): fictional_labels[idx][val_id] = 'O' Datasets_tokens = {} Datasets_labels = {} Datasets_tokens['deploy'] = tokenized_sents Datasets_labels['deploy'] = fictional_labels token_to_vector = dataset.load_dataset( Datasets_tokens, Datasets_labels, "", parameters, token_to_vector=tokens_to_vec, pretrained_dataset=pretrained_dataset) print(dataset.token_indices.keys()) parameters['Feature_vector_length'] = dataset.feature_vector_size parameters['use_features_before_final_lstm'] = False dataset.update_dataset("", ['deploy'], Datasets_tokens, Datasets_labels) del Datasets_tokens del Datasets_labels #model=current_model model = entity_model.EntityLSTM(dataset, parameters) os.mkdir(parameters['conll_like_result_folder']) test_temp = os.path.join(parameters['conll_like_result_folder'], 'test/') train_temp = os.path.join(parameters['conll_like_result_folder'], 'train/') valid_temp = os.path.join(parameters['conll_like_result_folder'], 'valid/') os.mkdir(test_temp) os.mkdir(train_temp) os.mkdir(valid_temp) sess = tf.Session() with sess.as_default(): #model=entity_model.EntityLSTM(dataset,parameters) transition_params_trained = model.restore_from_pretrained_model( parameters, dataset, sess, token_to_vector=token_to_vector, pretrained_dataset=pretrained_dataset) del token_to_vector predictions = training_predict_LSTM.prediction_step( sess, dataset, "deploy", model, 0, parameters['conll_like_result_folder'], transition_params_trained) sess.close() tf.reset_default_graph() shutil.rmtree(parameters['conll_like_result_folder']) return predictions, model # If nothing to predict, skip actual prediction if len(tokenized_sents) == 0: sys.stdout.write('\tnothing to predict %s\n' % p_or_n) return [] sys.stdout.write('\tvectorizing words %s\n' % p_or_n) if use_lstm: print('todo: incorporate lstm') # vectorize tokenized sentences #X = [] #for sent in tokenized_sents: # id_seq = [] # for w in sent: # if w in vocab: # id_seq.append(vocab[w]) # else: # id_seq.append(vocab['oov']) # X.append(id_seq) else: from feature_extraction.features import extract_features # vectorize validation X text_features = extract_features(tokenized_sents) flat_X_feats = vocab.transform(flatten(text_features)) X = reconstruct_list(flat_X_feats, save_list_structure(text_features)) sys.stdout.write('\tpredicting labels %s\n' % p_or_n) # Predict labels if use_lstm: print("TEST_PREDICT") exit() else: from machine_learning import crf predictions = crf.predict(clf, X) # Format labels from output return predictions
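# NOTE: the commented-out block in the LSTM branch above sketches how sentences would be mapped
# to id sequences with an out-of-vocabulary fallback. Written out as a standalone helper it
# would look roughly like this, assuming `vocab` is a plain dict with an 'oov' entry (only one
# of the vocabulary types generic_predict accepts).
def sents_to_id_seqs(tokenized_sents, vocab):
    """Map each word to its vocabulary index, falling back to the 'oov' id (sketch)."""
    X = []
    for sent in tokenized_sents:
        id_seq = [vocab[w] if w in vocab else vocab['oov'] for w in sent]
        X.append(id_seq)
    return X

# example with a hypothetical vocabulary
example_vocab = {'oov': 0, 'patient': 1, 'denies': 2, 'fever': 3}
assert sents_to_id_seqs([['patient', 'denies', 'chills']], example_vocab) == [[1, 2, 0]]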
def generic_train(p_or_n, train_sents, train_labels, use_lstm, val_sents=None, val_labels=None, test_sents=None, test_labels=None, dev_split=None): ''' generic_train() Train a model that works for both prose and nonprose @param p_or_n. A string that indicates "prose", "nonprose", or "all" @param train_sents. A list of sentences; each sentence is tokenized into words @param train_labels. Parallel to `train_sents`, 7-way labels for concept spans @param use_lstm Bool indicating whether to train CRF or LSTM. @param val_sents. Validation data. Same format as train_sents @param val_labels. Validation data. Same format as train_labels @param dev_split. A real number from 0 to 1 ''' # Must have data to train on: if len(train_sents) == 0: raise Exception('Training must have %s training examples' % p_or_n) # if you should split the data into train/dev yourself if (not val_sents) and (dev_split > 0.0) and (len(train_sents) > 10): p = int(dev_split * 100) sys.stdout.write('\tCreating %d/%d train/dev split\n' % (100 - p, p)) perm = list(range(len(train_sents))) random.shuffle(perm) train_sents = [train_sents[i] for i in perm] train_labels = [train_labels[i] for i in perm] ind = int(dev_split * len(train_sents)) val_sents = train_sents[:ind] train_sents = train_sents[ind:] val_labels = train_labels[:ind] train_labels = train_labels[ind:] else: sys.stdout.write('\tUsing existing validation data\n') sys.stdout.write('\tvectorizing words %s\n' % p_or_n) if use_lstm: print("TESTING NEW DATSET OBJECT") dataset = Exp.Dataset() parameters = hd.load_parameters_from_file("LSTM_parameters.txt") parameters['use_pretrained_model'] = False Datasets_tokens = {} Datasets_labels = {} Datasets_tokens['train'] = train_sents Datasets_labels['train'] = train_labels if val_sents != None: Datasets_tokens['valid'] = val_sents Datasets_labels['valid'] = val_labels if test_sents != None: Datasets_tokens['test'] = test_sents Datasets_labels['test'] = test_labels dataset.load_dataset(Datasets_tokens, Datasets_labels, "", parameters) pickle.dump( dataset, open(os.path.join(parameters['model_folder'], 'dataset.pickle'), 'wb')) print(Datasets_tokens['valid'][0]) print(Datasets_tokens['test'][0]) parameters['Feature_vector_length'] = dataset.feature_vector_size parameters['use_features_before_final_lstm'] = False parameters['learning_rate'] = 0.005 sess = tf.Session() number_of_sent = list(range(len(dataset.token_indices['train']))) with sess.as_default(): model = entity_model.EntityLSTM(dataset, parameters) sess.run(tf.global_variables_initializer()) model.load_pretrained_token_embeddings(sess, dataset, parameters) epoch_number = -1 transition_params_trained = np.random.rand(5 + 2, 5 + 2) values = {} values["best"] = 0 f1_dictionary = {} f1_dictionary['best'] = 0 model_saver = tf.train.Saver(max_to_keep=100) print("START TRAINING") eval_dir = os.path.join( tmo_dir, 'cliner_eval_%d' % random.randint(0, 256) + os.sep) parameters['conll_like_result_folder'] = eval_dir test_temp = os.path.join(parameters['conll_like_result_folder'], 'test/') train_temp = os.path.join(parameters['conll_like_result_folder'], 'train/') valid_temp = os.path.join(parameters['conll_like_result_folder'], 'valid/') os.mkdir(parameters['conll_like_result_folder']) os.mkdir(test_temp) os.mkdir(train_temp) os.mkdir(valid_temp) while epoch_number < 90: average_loss_per_phrase = 0 accuracy_per_phase = 0 step = 0 epoch_number += 1 if epoch_number != 0: sequence_numbers = list( range(len(dataset.token_indices['train']))) random.shuffle(sequence_numbers) for 
sequence_number in sequence_numbers: loss, accuracy, transition_params_trained = training_predict_LSTM.train_step( sess, dataset, sequence_number, model) average_loss_per_phrase += loss accuracy_per_phase += accuracy step += 1 if step % 10 == 0: print('Training {0:.2f}% done\n'.format( step / len(sequence_numbers) * 100)) model_saver.save( sess, os.path.join(parameters['model_folder'], 'model_{0:05d}.ckpt'.format(epoch_number))) total_loss = average_loss_per_phrase total_accuracy = accuracy_per_phase average_loss_per_phrase = average_loss_per_phrase / len( number_of_sent) accuracy_per_phase = accuracy_per_phase / len(number_of_sent) if epoch_number > 0: "" f1, predictions = training_predict_LSTM.prediction_step( sess, dataset, "test", model, epoch_number, parameters['conll_like_result_folder'], transition_params_trained) f1_train, _ = training_predict_LSTM.prediction_step( sess, dataset, "train", model, epoch_number, parameters['conll_like_result_folder'], transition_params_trained) f1_valid, _ = training_predict_LSTM.prediction_step( sess, dataset, "valid", model, epoch_number, parameters['conll_like_result_folder'], transition_params_trained) correctly_predicted_tokens = training_predict_LSTM.compute_train_accuracy( parameters['conll_like_result_folder'] + "valid" + os.sep + "epoche_" + str(epoch_number) + ".txt") if f1_dictionary['best'] < float(f1_valid): f1_dictionary['epoche'] = epoch_number f1_dictionary['best'] = float(f1_valid) if values["best"] < correctly_predicted_tokens: values["epoche"] = epoch_number values["best"] = correctly_predicted_tokens #print ("Number of correctly predicted tokens -test "+str(correctly_predicted_tokens)) print("NEW EPOCHE" + " " + str(epoch_number)) print("Current F1 on train" + " " + str(f1_train)) print("Current F1 on valid" + " " + str(f1_valid)) print("Current F1 on test" + " " + str(f1)) print("Current F1 best (validation): ") print(f1_dictionary) shutil.rmtree(parameters['conll_like_result_folder']) return parameters, dataset, f1_dictionary['best'] else: ######## # CRF ######## # vectorize tokenized sentences text_features = extract_features(train_sents) # type(text_features): <type 'list'> # Collect list of feature types enabled_features = set() for sf in text_features: for wf in sf: for (feature_type, instance), value in wf.items(): if feature_type.startswith('prev'): feature_type = 'PREV*' if feature_type.startswith('next'): feature_type = 'NEXT*' enabled_features.add(feature_type) enabled_features = sorted(enabled_features) # Vectorize features vocab = DictVectorizer() flat_X_feats = vocab.fit_transform(flatten(text_features)) X_feats = reconstruct_list(flat_X_feats, save_list_structure(text_features)) # vectorize IOB labels Y_labels = [[tag2id[y] for y in y_seq] for y_seq in train_labels] assert len(X_feats) == len(Y_labels) for i in range(len(X_feats)): assert X_feats[i].shape[0] == len(Y_labels[i]) # if there is specified validation data, then vectorize it if val_sents: # vectorize validation X val_text_features = extract_features(val_sents) flat_val_X_feats = vocab.transform(flatten(val_text_features)) val_X = reconstruct_list(flat_val_X_feats, save_list_structure(val_text_features)) # vectorize validation Y val_Y = [[tag2id[y] for y in y_seq] for y_seq in val_labels] # if there is specified test data, then vectorize it if test_sents: # vectorize test X test_text_features = extract_features(test_sents) flat_test_X_feats = vocab.transform(flatten(test_text_features)) test_X = reconstruct_list(flat_test_X_feats, 
save_list_structure(test_text_features)) # vectorize test Y test_Y = [[tag2id[y] for y in y_seq] for y_seq in test_labels] else: test_X = None test_Y = None sys.stdout.write('\ttraining classifiers %s\n' % p_or_n) if use_lstm: # train using lstm clf, dev_score = keras_ml.train(X_seq_ids, Y_labels, tag2id, len(vocab), val_X_ids=val_X, val_Y_ids=val_Y, test_X_ids=test_X, test_Y_ids=test_Y) else: # train using crf clf, dev_score = crf.train(X_feats, Y_labels, val_X=val_X, val_Y=val_Y, test_X=test_X, test_Y=test_Y) return vocab, clf, dev_score, enabled_features
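# NOTE: the CRF branch above converts every label sequence with tag2id before calling crf.train.
# The concrete tag inventory lives elsewhere in the project (the docstring describes a 7-way
# concept labeling), so the mapping below is purely illustrative of the shape the code expects.
example_tag2id = {'O': 0, 'B-problem': 1, 'I-problem': 2,
                  'B-treatment': 3, 'I-treatment': 4,
                  'B-test': 5, 'I-test': 6}
example_labels = [['B-problem', 'I-problem', 'O']]
assert [[example_tag2id[y] for y in y_seq] for y_seq in example_labels] == [[1, 2, 0]]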
def computer(function,**kwargs): """ Compute function figures out how to run a calculation over a simulation. """ work = kwargs['workspace'] calc = kwargs['calc'] #---perform a calculation over all collections if 'collections' in calc: cols = tuple([calc['collections']]) if type(calc['collections'])==str else calc['collections'] sns = unique(flatten([work.vars['collections'][i] for i in cols])) else: sns = work.sns() #---get slices (required) slice_name = calc['slice_name'] group = calc['group'] if 'group' in calc else None #---pass data to the function according to upstream data type incoming_type = calc['uptype'] jobs,data = [],dict([(sn,{}) for sn in sns]) combined_slices = [] for sn in sns: new_job = {'sn':sn,'slice_name':slice_name,'group':group} if incoming_type == 'simulation': #---prepare combinations in a dictionary if slice_name not in work.slice(sn): raise Exception( '\n[ERROR] the slices yaml file is missing a slice named "%s" for simulation "%s"'% (slice_name,sn)) try: mfp = work.slice(sn)[slice_name][group]['missing_frame_percent'] except: print "[WARNING] no missing frame percentage here" mfp = 0.0 if mfp>work.missing_frame_tolerance: status('upstream slice failure: %s,%s,%s missing_frame_percent=%.1f'%( sn,slice_name,group,mfp),tag='warning') continue #---defaulting to 'all' group if group is None new_job['grofile'] = work.postdir+\ work.slice(sn)[slice_name][group if group else 'all']['gro'] #---! xtc must become a flag. recommend 'xtc' becomes work.cursor[1] #---defaulting to 'all' group if group is None new_job['trajfile'] = work.postdir+work.slice(sn)[slice_name][group if group else 'all']['xtc'] if 'specs' not in calc: calc['specs'] = '' if 'upstream' in calc['specs']: #---if no loop on upstream you can use a list if type(calc['specs']['upstream'])==list: upstream_ask = dict([(key,None) for key in calc['specs']['upstream']]) elif type(calc['specs']['upstream'])==str: upstream_ask = {calc['specs']['upstream']:None} else: upstream_ask = calc['specs']['upstream'] for key,val in upstream_ask.items(): upspecs = deepcopy(work.calc[key]) #---identify the list of particular options along with the stubs options,stubs = work.interpret_specs(upspecs,return_stubs=True) #---identify paths and values over which we "whittle" the total list of specs whittles = [(i,j) for i,j in catalog(val)] #---if no loop on upstream pickles we interpret none and send blank specs if val in ['None','none',None]: specs = [options[ss] for r,v in whittles for ss,s in enumerate(stubs)] else: #---select the correct option by matching all catalogued routes from the incoming #---...key to the original calculation specs = [options[ss] for r,v in whittles for ss,s in enumerate(stubs) if delve(s['specs'],*r)==v] if len(specs)!=1 and 'loop' not in upspecs['slice_name']: import pdb;pdb.set_trace() raise Exception('[ERROR] redundant upstream selection %s'%str(select)) #---if there are multiple slices #---! note that we expect that if slice_names is a list it will be ordered here too for slicenum,spec in enumerate(specs): #---if the upstream calculation has a group then use it in the filename if not group: if 'group' in work.calc[key]: upgroup = work.calc[key]['group'] else: upgroup = None else: upgroup = group if not upgroup: sl = work.slice(sn)[spec['slice_name']] fn_base = re.findall('^v[0-9]+\.[0-9]+-[0-9]+-[0-9]+', work.slice(sn)[upspecs['slice_name']]['all']['filekey'] )[0]+'.%s'%key else: sl = work.slice(sn)[spec['slice_name']][upgroup] fn_base = '%s.%s'%(sl['filekey'],key) #---! 
moved the following block left recently fn = work.select_postdata(fn_base,spec) if not fn: print '[ERROR] missing %s'%fn import pdb;pdb.set_trace() outkey = key if len(specs)==1 else '%s%d'%(key,slicenum) #---before each calculation the master loop loads the filename stored here data[sn][outkey] = os.path.basename(fn)[:-4]+'dat' new_job['upstream'] = data[sn].keys() jobs.append(new_job) #---master loop for outgoing in jobs: sn,slice_name,group = outgoing['sn'],outgoing['slice_name'],outgoing['group'] #---if we combine slices for this calculation we use the whole time span in the base filename if type(slice_name)==list: #---! simple method for making the combination file key start = min([work.slice(sn)[s]['all' if not group else group]['start'] for s in slice_name]) end = max([work.slice(sn)[s]['all' if not group else group]['end'] for s in slice_name]) skip = work.slice(sn)[s]['all' if not group else group]['skip'] #---! this filekey construction means the user will have to anticipate the names of combos fn_base = '%s.%d-%d-%d.%s'%(work.prefixer(sn),start,end,skip,function.__name__) else: #---we index all calculations automatically in case we loop over specs later index,fn_key = -1,'' if not group: fn_base = re.findall('^v[0-9]+\.[0-9]+-[0-9]+-[0-9]+', work.slice(sn)[slice_name][ 'all' if not group else group]['filekey'])[0]+'.%s'%function.__name__ else: try: fn_base = work.slice(sn)[slice_name][ 'all' if not group else group]['filekey']+'.%s'%function.__name__ except: print "no group and cannot get base filename" import pdb;pdb.set_trace() prev = glob.glob(work.postdir+fn_base+'*.dat') if prev == []: index = 0 else: index = max(map(lambda x:int(re.findall('^.+\/%s\.n([0-9]+)\.dat'%fn_base,x)[0]),prev))+1 fn_key = '.n%d'%index fn = fn_base+fn_key+'.dat' #---safety check for file errors to prevent overwriting however this should be handled by indices if os.path.isfile(work.postdir+fn): raise Exception('[ERROR] %s exists'%(work.postdir+fn)) #---check for specs file with the exact same specifications exists = True if index != -1 and work.select_postdata(fn_base,calc) != None else False if not exists: import ipdb;ipdb.set_trace() status("%s %s"%(function.__name__,str(outgoing)),tag='compute') outgoing['workspace'] = work outgoing['calc'] = calc if 'upstream' in outgoing: sn = outgoing['sn'] outgoing['upstream'] = dict([(k, load(data[sn][k],work.postdir)) for k in outgoing['upstream']]) result,attrs = function(**outgoing) """ spec files are carefully constructed they prevent redundant calculations they allow us to loop over many parameters while saving files with a single index the calculation dictionary in the specs file contains meta-parameters for looping we are careful not to save meta parameters to the spec file we only save parameters which are relevant to the calculation itself the calculation dictionary in the spec file must therefore separate these parameters in a sub-dictionary called 'specs' we prefer attrs to be small and specific since attrs is also used to uniquely specify the data all big data should be stored as a result via numpy """ #---if any calculation specifications are not in attributes we warn the user here if 'specs' in calc: unaccounted = [i for i in calc['specs'] if i not in attrs] else: unaccounted = [] if 'upstream' in unaccounted and 'upstream' not in attrs: status('automatically appending upstream data',tag='status') unaccounted.remove('upstream') attrs['upstream'] = calc['specs']['upstream'] if any(unaccounted): print computer_error_attrs_passthrough+'\n\n' 
status('some calculation specs were not saved: %s'% str(unaccounted),tag='STATUS') import pdb;pdb.set_trace() store(result,fn,work.postdir,attrs=attrs) with open(work.postdir+fn_base+fn_key+'.spec','w') as fp: fp.write(json.dumps(attrs)+'\n') #---no modifications to work so no save return
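#---NOTE: the spec-whittling logic in computer relies on a catalog helper that walks a nested
#---...dictionary and yields (route,value) pairs which are then matched against candidate specs
#---...with delve. the helper is not shown in this excerpt; the following is a minimal sketch of
#---...the assumed behavior (the real helper may also descend into lists or order keys differently)
def catalog(tree,route=None):
	"""Yield (route,leaf_value) pairs for every leaf of a nested dictionary (sketch)."""
	route = route or []
	if type(tree)==dict:
		for key,val in tree.items():
			for pair in catalog(val,route+[key]): yield pair
	else: yield route,tree

#---example: each leaf of an upstream spec is addressed by its key path
example_spec = {'upstream':{'lipid_abstractor':{'selector':'lipid_com'}}}
assert list(catalog(example_spec)) == [(['upstream','lipid_abstractor','selector'],'lipid_com')]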
def link_objs(objs, output): # Link objects into the output program if tools.depends(output, objs): arr = tools.flatten([LD, '-r', objs, '-o', output]) tools.pprint('LD', output) tools.call(arr)
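# NOTE: tools.flatten is what lets the linker command mix scalars and the list of object files
# in a single literal. The sketch below shows the behavior that call relies on (flatten_args is
# a stand-in name; the real tools.flatten may recurse more deeply).
def flatten_args(items):
    """One-level flatten that passes scalars through and expands nested lists (sketch)."""
    out = []
    for item in items:
        if isinstance(item, (list, tuple)):
            out.extend(item)
        else:
            out.append(item)
    return out

# example: the argv handed to tools.call after the tools.depends staleness check
assert flatten_args(['ld', '-r', ['main.o', 'util.o'], '-o', 'prog']) == \
    ['ld', '-r', 'main.o', 'util.o', '-o', 'prog']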