Example #1
def first_pass_data_and_labels(notes):
    '''
    first_pass_data_and_labels()

    Purpose: Interface with notes object to get text data and labels

    @param notes. List of Note objects
    @return <tuple> whose elements are:
              0) list of tokenized sentences
              1) list of labels for tokenized sentences

    >>> import os
    >>> from notes.note import Note
    >>> base_dir = os.path.join(os.getenv('CLINER_DIR'), 'tests', 'data')
    >>> txt = os.path.join(base_dir, 'single.txt')
    >>> con = os.path.join(base_dir, 'single.con')
    >>> note_tmp = Note('i2b2')
    >>> note_tmp.read(txt, con)
    >>> notes = [note_tmp]
    >>> first_pass_data_and_labels(notes)
    ([['The', 'score', 'stood', 'four', 'to', 'two', ',', 'with', 'but', 'one', 'inning', 'more', 'to', 'play', ',']], [['B', 'I', 'I', 'I', 'I', 'I', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']])
    '''

    # Get the data and annotations from the Note objects
    l_tokenized_sentences = [note.getTokenizedSentences() for note in notes]
    l_iob_labels = [note.getIOBLabels() for note in notes]

    tokenized_sentences = flatten(l_tokenized_sentences)
    iob_labels = flatten(l_iob_labels)

    return tokenized_sentences, iob_labels
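Every example on this page relies on a project-specific flatten() helper. For reference, a minimal sketch of the list variant, assuming exactly one level of nesting (which is all these call sites need; each project's actual utility may behave differently):

from itertools import chain

def flatten(list_of_lists):
    # Collapse one level of nesting: [[1, 2], [3]] -> [1, 2, 3]
    return list(chain.from_iterable(list_of_lists))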
Example #2
    def save(self, num_retries=3):
        # If the DB is not writable, the rsync won't happen
        # If the DB is up, but rsync fails, the status will be ERR_SYNC,
        # but self.state will not be updated in the database.

        session = self.db.session()
        try:
            # Test write access to DB
            # If it fails after num_retries trials, update_in_session will
            # raise an Exception, so save() will exit, before the rsync.
            self.dbstate.update_in_session({'jobman.status': self.ERR_SYNC},
                                           session,
                                           _recommit_times=num_retries)

            # save self.state in file current.state, and rsync
            # If the rsync fails after num_retries, an Exception will be
            # raised, and save() will exit before 'jobman.status' is
            # changed back.
            super(DBRSyncChannel, self).save(num_retries=num_retries)

            if self.sync_in_save:
                # update DB
                self.dbstate.update_in_session(flatten(self.state), session,
                                               _recommit_times=num_retries)
            else:
                # update only jobman.*
                state_jobman = flatten({'jobman': self.state.jobman})
                self.dbstate.update_in_session(state_jobman, session,
                                               _recommit_times=num_retries)

        finally:
            session.close()
Example #3
    def save(self, num_retries=3):
        # If the DB is not writable, the rsync won't happen
        # If the DB is up, but rsync fails, the status will be ERR_SYNC,
        # but self.state will not be updated in the database.

        session = self.db.session()
        try:
            # Test write access to DB
            # If it fails after num_retries trials, update_in_session will
            # raise an Exception, so save() will exit, before the rsync.
            self.dbstate.update_in_session({'jobman.status': self.ERR_SYNC},
                                           session,
                                           _recommit_times=num_retries)

            # save self.state in file current.state, and rsync
            # If the rsync fails after num_retries, an Exception will be
            # raised, and save() will exit before 'jobman.status' is
            # changed back.
            super(DBRSyncChannel, self).save(num_retries=num_retries)

            if self.sync_in_save:
                # update DB
                self.dbstate.update_in_session(flatten(self.state),
                                               session,
                                               _recommit_times=num_retries)
            else:
                # update only jobman.*
                state_jobman = flatten({'jobman': self.state.jobman})
                self.dbstate.update_in_session(state_jobman,
                                               session,
                                               _recommit_times=num_retries)

        finally:
            session.close()
Example #4
 def _child_compute(self, cr, uid, ids, name, args, context=None):
     obj_dept = self.pool.get('hr.department')
     obj_user = self.pool.get('res.users')
     result = {}
     for user_id in ids:
         child_ids = []
         cr.execute('SELECT dept.id FROM hr_department AS dept \
                     LEFT JOIN hr_employee AS emp ON dept.manager_id = emp.id \
                     WHERE emp.id IN \
                         (SELECT emp.id FROM hr_employee \
                             JOIN resource_resource r ON r.id = emp.resource_id WHERE r.user_id=' + str(user_id) + ') ')
         mgnt_dept_ids = [x[0] for x in cr.fetchall()]
         ids_dept = obj_dept.search(cr, uid, [('id', 'child_of', mgnt_dept_ids)], context=context)
         if ids_dept:
             data_dept = obj_dept.read(cr, uid, ids_dept, ['member_ids'], context=context)
             children = map(lambda x: x['member_ids'], data_dept)
             children = tools.flatten(children)
             children = obj_user.search(cr, uid, [('id', 'in', children),('active', '=', True)], context=context)
             if user_id in children:
                 children.remove(user_id)
             child_ids.extend(tools.flatten(children))
             set = {}
             map(set.__setitem__, child_ids, [])
             child_ids = set.keys()
         result[user_id] = child_ids
     return result
Example #5
    def __first_train(self, tokenized_sentences, Y, do_grid=False):
        """
        Model::__first_train()

        Purpose: Train the first pass classifiers (for IOB chunking)

        @param tokenized_sentences. <list> of tokenized sentences
        @param Y.                   <list-of-lists> of IOB labels for words
        @param do_grid.             <boolean> whether to perform a grid search

        @return          None
        """

        if globals_cliner.verbosity > 0: print 'first pass'
        if globals_cliner.verbosity > 0:
            print '\textracting  features (pass one)'

        # Separate into prose vs. nonprose
        nested_prose_data, nested_prose_Y = zip(
            *filter(lambda line_iob_tup: is_prose_sentence(line_iob_tup[0]),
                    zip(tokenized_sentences, Y)))
        nested_nonprose_data, nested_nonprose_Y = zip(*filter(
            lambda line_iob_tup: not is_prose_sentence(line_iob_tup[0]),
            zip(tokenized_sentences, Y)))

        #extract features
        nested_prose_feats = feat_obj.IOB_prose_features(nested_prose_data)
        nested_nonprose_feats = feat_obj.IOB_nonprose_features(
            nested_nonprose_data)

        # Flatten lists (because classifier will expect flat)
        prose_Y = flatten(nested_prose_Y)
        nonprose_Y = flatten(nested_nonprose_Y)

        # rename because code uses it
        pchunks = prose_Y
        nchunks = nonprose_Y
        prose = nested_prose_feats
        nonprose = nested_nonprose_feats

        # Train classifiers for prose and nonprose
        pvec, pclf = self.__generic_first_train('prose', prose, pchunks,
                                                do_grid)
        nvec, nclf = self.__generic_first_train('nonprose', nonprose, nchunks,
                                                do_grid)

        # Save vectorizers
        self._first_prose_vec = pvec
        self._first_nonprose_vec = nvec

        # Save classifiers
        self._first_prose_clf = pclf
        self._first_nonprose_clf = nclf
Example #6
 def _child_compute(self, cr, uid, ids):
     obj_dept = self.pool.get('hr.department')
     child_ids = []
     for id in ids:
         ids_dept = obj_dept.search(cr, uid, [('manager_id', '=', id)])
         if ids_dept:
             data_dept = obj_dept.read(cr, uid, ids_dept, ['member_ids'])
             childs = map(lambda x: x['member_ids'], data_dept)
             childs = tools.flatten(childs)
             if id in childs:
                 childs.remove(id)
             child_ids.extend(tools.flatten(childs))
     return child_ids
Example #7
 def _child_compute(self, cr, uid, ids, name, args, context=None):
     obj_dept = self.pool.get("hr.department")
     obj_user = self.pool.get("res.users")
     result = {}
     for user_id in ids:
         child_ids = []
         cr.execute(
             """SELECT dept.id FROM hr_department AS dept
             LEFT JOIN hr_employee AS emp ON dept.manager_id = emp.id
             WHERE emp.id IN
                 (SELECT emp.id FROM hr_employee
                     JOIN resource_resource r ON r.id = emp.resource_id WHERE r.user_id = %s)
             """,
             (user_id,),
         )
         mgnt_dept_ids = [x[0] for x in cr.fetchall()]
         ids_dept = obj_dept.search(cr, uid, [("id", "child_of", mgnt_dept_ids)], context=context)
         if ids_dept:
             data_dept = obj_dept.read(cr, uid, ids_dept, ["member_ids"], context=context)
             emp_children = map(lambda x: x["member_ids"], data_dept)
             emp_children = tools.flatten(emp_children)
             children = self.emp_to_users(cr, uid, emp_children, context=context)
             children = obj_user.search(cr, uid, [("id", "in", children), ("active", "=", True)], context=context)
             if user_id in children:
                 children.remove(user_id)
             child_ids = list(set(child_ids + children))
         result[user_id] = child_ids
     return result
Example #8
def generic_predict(p_or_n, tokenized_sents, vocab, clf):
    '''
    generic_predict()

    Predict using a model that works for both prose and nonprose

    @param p_or_n.          A string that indicates "prose", "nonprose", or "all"
    @param tokenized_sents. A list of sentences, where each sentence is tokenized
                              into words
    @param vocab.           A dictionary mapping word tokens to numeric indices.
    @param clf.             An encoding of the trained model.
    '''

    # If nothing to predict, skip actual prediction
    if len(tokenized_sents) == 0:
        print '\tnothing to predict ' + p_or_n
        return []

    print '\tvectorizing words ' + p_or_n

    # vectorize validation X
    text_features = extract_features(tokenized_sents)
    flat_X_feats = vocab.transform(flatten(text_features))
    X = reconstruct_list(flat_X_feats, save_list_structure(text_features))

    print '\tpredicting  labels ' + p_or_n

    # Predict labels
    predictions = crf_ml.predict(clf, X)

    # Format labels from output
    return predictions
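The flatten / save_list_structure / reconstruct_list trio recurs throughout the CLiNER examples: features are flattened so the vectorizer sees one flat sequence, then split back into per-sentence lists. A hypothetical sketch of the two structure helpers (names are taken from the examples above, but these are not the project's actual implementations):

def save_list_structure(nested):
    # Record the cumulative end offset of each sub-list, e.g. [[a, b], [c]] -> [2, 3]
    offsets, end = [], 0
    for group in nested:
        end += len(group)
        offsets.append(end)
    return offsets

def reconstruct_list(flat, offsets):
    # Split a flat sequence (list or row-sliceable matrix) back into the saved groups
    return [flat[start:end] for start, end in zip([0] + offsets[:-1], offsets)]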
Example #9
    def to_sql(self):
        stack = []
        params = []
        for i, e in reverse_enumerate(self.__exp):
            if self._is_leaf(e, internal=True):
                table = self.__field_tables.get(i, self.__main_table)
                q, p = self.__leaf_to_sql(e, table)
                params.insert(0, p)
                stack.append(q)
            else:
                if e == '!':
                    stack.append('(NOT (%s))' % (stack.pop(), ))
                else:
                    ops = {'&': ' AND ', '|': ' OR '}
                    q1 = stack.pop()
                    q2 = stack.pop()
                    stack.append('(%s %s %s)' % (
                        q1,
                        ops[e],
                        q2,
                    ))

        query = ' AND '.join(reversed(stack))
        joins = ' AND '.join(self.__joins)
        if joins:
            query = '(%s) AND (%s)' % (joins, query)
        return (query, flatten(params))
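The closing flatten(params) is needed because __leaf_to_sql() returns one parameter list per leaf, so params accumulates as a list of lists. An illustrative, made-up case:

params = [['%foo%'], [1, 2]]   # one sub-list per leaf, in query order
flatten(params)                # -> ['%foo%', 1, 2], the flat list cr.execute() expects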
Example #10
 def remove_rooms(*room_names):
     """Remove multiple rooms from the buiding.
        Arguments can be a list of rooms or comma-separated values
     """
     room_name_list = tools.flatten(list(room_names))
     for room_name in room_name_list:
         Amity.remove_room(room_name)
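Because the arguments are flattened first, callers can mix single names and lists. A hypothetical call (room names made up, assuming remove_rooms is exposed on Amity as the body suggests):

Amity.remove_rooms("Krypton", ["Hogwarts", "Oculus"])   # removes all three rooms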
Example #11
def generic_predict(p_or_n, tokenized_sents, vocab, clf):
    '''
    generic_predict()

    Predict using a model that works for both prose and nonprose

    @param p_or_n.          A string that indicates "prose", "nonprose", or "all"
    @param tokenized_sents. A list of sentences, where each sentence is tokenized
                              into words
    @param vocab.           A dictionary mapping word tokens to numeric indices.
    @param clf.             An encoding of the trained model.
    '''

    # If nothing to predict, skip actual prediction
    if len(tokenized_sents) == 0:
        print '\tnothing to predict ' + p_or_n
        return []

    print '\tvectorizing words ' + p_or_n

    # vectorize validation X
    text_features = extract_features(tokenized_sents)
    flat_X_feats = vocab.transform( flatten(text_features) )
    X = reconstruct_list(flat_X_feats, save_list_structure(text_features))

    print '\tpredicting  labels ' + p_or_n

    # Predict labels
    predictions =   crf_ml.predict(clf, X)

    # Format labels from output
    return predictions
Example #12
    def fit_from_documents(self, documents):
        """
        ClinerModel::fit_from_documents()

        Train clinical concept extraction model using annotated data (files).

        @param documents. A list of Document objects (containing text and annotations)
        @return       None
        """
        # Extract formatted data
        tokenized_sents  = flatten([d.getTokenizedSentences() for d in documents])
        labels           = flatten([d.getTokenLabels()        for d in documents])

        # Call the internal method
        self.fit(tokenized_sents, labels, dev_split=0.10)

        self._training_files = [ d.getName() for d in documents ]
Example #13
    def fit_from_documents(self, documents):
        """
        ClinerModel::fit_from_documents()

        Train clinical concept extraction model using annotated data (files).

        @param documents. A list of Document objects (containing text and annotations)
        @return       None
        """
        # Extract formatted data
        tokenized_sents = flatten(
            [d.getTokenizedSentences() for d in documents])
        labels = flatten([d.getTokenLabels() for d in documents])

        # Call the internal method
        self.fit(tokenized_sents, labels, dev_split=0.10)

        self._training_files = [d.getName() for d in documents]
Example #14
    def train(self, notes):
        """
        ClinerModel::train()

        Purpose: Train a Machine Learning model on annotated data

        @param notes. A list of Note objects (containing text and annotations)
        @return       None
        """

        # Extract formatted data
        tokenized_sentences = flatten(
            [n.getTokenizedSentences() for n in notes])
        labels = flatten([n.getTokenLabels() for n in notes])

        self.train_fit(tokenized_sentences, labels, dev_split=0.1)

        self._training_files = [n.getName() for n in notes]
Example #15
 def search(self, cr, uid, args, offset=0, limit=None, order=None, context=None, count=False):
     res = []
     log_ids = super(res_log, self).search(cr, uid, args, offset, limit, order, context, count)
     logs = {}
     for log in self.browse(cr, uid, log_ids, context=context):
         res_dict = logs.get(log.res_model, {})
         res_dict.update({log.res_id: log.id})
         logs.update({log.res_model: res_dict})
     res = map(lambda x: x.values(), logs.values())
     return tools.flatten(res)
Example #16
def sel(input, indexes, splitf):
    """ Performs the cutting and selecting
        input  : an iterable of lines
        indexes: an iterable of valid list indexes (int, slices)
        splitf : the function used to separate indexes in input
    """

    for line in input:
        fields = filter(is_blank, splitf(line))
        selected = (getitem(fields, i, default='') for i in indexes)
        yield flatten(selected)
Example #17
def sel(input, indexes, splitf):
    """ Performs the cutting and selecting
        input  : an iterable of lines
        indexes: an iterable of valid list indexes (int, slices)
        splitf : the function used to separate indexes in input
    """ 

    for line in input:
        fields   = filter(is_blank, splitf(line))
        selected = (getitem(fields, i, default = '') for i in indexes)
        yield flatten(selected)
Example #18
    def transformActiveColumnIndexes(activeColumns):

        pastIndexes = []
        yValues = []
        for activeColumn in flatten(activeColumns):
            if activeColumn not in pastIndexes:
                pastIndexes.append(activeColumn)
            yValue = pastIndexes.index(activeColumn)
            yValues.append(yValue)

        return yValues
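In effect each active column index is mapped to the rank at which that column first appeared (assuming flatten yields the indices in time order). A made-up call:

# columns 3 and 7 fire at t=0, columns 7 and 2 at t=1
transformActiveColumnIndexes([[3, 7], [7, 2]])   # -> [0, 1, 1, 2]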
Example #19
def equilibrium(threshold, defender_costs, attacker_costs):

    # Iterate through each
    n = len(defender_costs)
    if n != len(attacker_costs):
        raise Exception("Unequal set of lists")
    else:
        def_equilibrium = []
        att_equilibrium = []

        for i in range(0, n):
            # print("Resource: ", i)
            # print("----------------")
            resources = list(range(0, n))
            resources.remove(i)
            #
            # print("Resources: ", resources)

            subsets = []
            # for t in range(threshold-1, n):
            subsets.append(
                list(itertools.combinations(resources, threshold - 1)))

            # print("Before flatten: ", subsets)
            subsets = tools.flatten(subsets)
            # print("Subsets:", subsets)
            # subsets.append([])
            if () in subsets and len(defender_costs) != 1:
                subsets.remove(())
            # print("Subsets: ", subsets)
            s = 0
            for l in subsets:
                # creating the product
                p = 1
                for j in range(0, n):
                    if j == i:
                        continue
                    if j in l:
                        p *= defender_costs[j] / (defender_costs[j] +
                                                  attacker_costs[j])
                    else:
                        p *= attacker_costs[j] / (defender_costs[j] +
                                                  attacker_costs[j])

                s += p

            s /= ((attacker_costs[i] + defender_costs[i])**2)
            defender_point = s * attacker_costs[i]
            attacker_point = s * defender_costs[i]

            def_equilibrium.append(defender_point)
            att_equilibrium.append(attacker_point)

        return def_equilibrium, att_equilibrium
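An illustrative call with made-up cost vectors (assumes itertools is imported and tools.flatten collapses one level of nesting, as in the other examples):

def_eq, att_eq = equilibrium(2, [1.0, 2.0, 3.0], [1.5, 1.0, 2.0])
# def_eq[i] and att_eq[i] are the equilibrium points for resource i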
Example #20
    def __generic_first_predict(self,
                                p_or_n,
                                text_features,
                                dvect,
                                clf,
                                do_grid=False):
        '''
        Model::__generic_first_predict()

        Purpose: Predict IOB labels (works for both prose and nonprose)

        @param p_or_n.        <string> either "prose" or "nonprose"
        @param text_features. <list-of-lists> of feature dictionaries
        @param dvect.         <DictVectorizer>
        @param clf.           scikit-learn classifier
        @param do_grid.       <boolean> indicating whether to perform grid search
        '''

        # If nothing to predict, skip actual prediction
        if len(text_features) == 0:
            print '\tnothing to predict (pass one) ' + p_or_n
            return []

        # Save list structure to reconstruct after vectorization
        offsets = save_list_structure(text_features)

        if globals_cliner.verbosity > 0:
            print '\tvectorizing features (pass one) ' + p_or_n

        # Vectorize features
        X_feats = dvect.transform(flatten(text_features))

        if globals_cliner.verbosity > 0:
            print '\tpredicting    labels (pass one) ' + p_or_n

        # CRF requires reconstruct lists
        if self._crf_enabled:
            X_feats = reconstruct_list(list(X_feats), offsets)
            lib = crf
        else:
            lib = sci

        #for X in X_feats:
        #    for x in X:
        #        print x
        #    print
        #print '\n'

        # Predict IOB labels
        out = lib.predict(clf, X_feats)

        # Format labels from output
        predictions = reconstruct_list(out, offsets)
        return predictions
Example #21
def timeOrderedSDRPlot(activeColumns):
    #Make sure inputs are numpy arrays
    activeColumns = numpy.asarray(activeColumns)
    
    
    ##Transform the input values into x and y coordinates
    
    #for each active bit, the x value is the time when it was active
    xValues = []
    for t in xrange(len(activeColumns)):
        xValues.append([t]*len(activeColumns[t]))
    xValues = numpy.fromiter(flatten(xValues),int) #not sure if int is the right thing to use here
    

    #Calculate y values 
    #for each active bit the y value is the time when that bit was first active
    def transformActiveColumnIndexes(activeColumns):

        pastIndexes = []
        yValues = []
        for activeColumn in flatten(activeColumns):
            if activeColumn not in pastIndexes:
                pastIndexes.append(activeColumn)
            yValue = pastIndexes.index(activeColumn)
            yValues.append(yValue)

        return yValues

    yValues = numpy.asarray(transformActiveColumnIndexes(activeColumns))
    
    ##Plot the x and y coordinates
    
    #set up plot
    (figure,axes) = plt.subplots()
    #plot the actiive bits
    axes.plot(xValues,yValues,'.')
    #display the horizontal grid
    axes.minorticks_on()
    axes.grid(True,axis='both',which='both',linestyle='solid',color=(0.7,0.7,0.7))
    axes.set_axisbelow(True) #draw the lines beneath the points
    #Set the horizontal gridlines to have spacing of 1
    axes.yaxis.set_minor_locator(matplotlib.ticker.MultipleLocator(1))

    #Add a gap to the bottom and left edges so all points can be seen
    axes.set_xlim(left=-1)
    axes.set_ylim(bottom=-1)

    #set plot aspect ratio so the axes are even.
    axes.set_aspect(aspect='equal')
    #Make the plot bigger (the plot expands to box whilst retaining aspect ratio)
    figure.set_size_inches(20,20)
    
    return (figure, figure.axes)
Example #22
    def __second_train(self,
                       chunked_data,
                       inds_list,
                       con_labels,
                       do_grid=False):
        """
        Model::__second_train()

        Purpose: Train the second pass classifier (concept classification)

        @param data      <list> of tokenized sentences after collapsing chunks
        @param inds_list <list-of-lists> of indices
                           - assertion: len(data) == len(inds_list)
                           - one line of 'inds_list' contains a list of indices
                               into the corresponding line for 'data'
        @param con_labels <list> of concept label strings
                           - assertion: there are sum(len(inds_list)) labels
                              AKA each index from inds_list maps to a label
        @param do_grid   <boolean> indicating whether to perform a grid search

        @return          None
        """

        if globals_cliner.verbosity > 0: print 'second pass'

        # Extract features
        if globals_cliner.verbosity > 0:
            print '\textracting  features (pass two)'

        text_features = [
            feat_obj.concept_features(s, inds)
            for s, inds in zip(chunked_data, inds_list)
        ]

        flattened_text_features = flatten(text_features)

        if globals_cliner.verbosity > 0:
            print '\tvectorizing features (pass two)'

        # Vectorize labels
        numeric_labels = [concept_labels[y] for y in con_labels]

        # Vectorize features
        self._second_vec = DictVectorizer()
        vectorized_features = self._second_vec.fit_transform(
            flattened_text_features)

        if globals_cliner.verbosity > 0:
            print '\ttraining  classifier (pass two)'

        # Train the model
        self._second_clf = sci.train(vectorized_features, numeric_labels,
                                     do_grid)
Example #23
 def get_gradient(self):
   """ Get (optionally make each parameter's gradient) a reference to the flat gradient.
   Returns:
     Flat gradient (by reference: future calls to 'set_gradient' will modify it)
   """
   # Fast path
   if self._gradient is not None:
     return self._gradient
   # Flatten (make if necessary)
   gradient = tools.flatten(tools.grads_of(self._model.parameters()))
   self._gradient = gradient
   return gradient
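Here tools.flatten works on tensors rather than lists: per-parameter gradients are packed into a single flat vector. A rough plain-PyTorch sketch of that idea (helper names are hypothetical, not the project's actual API):

import torch

def grads_of(params):
    # One gradient tensor per parameter, substituting zeros when .grad is unset
    return [p.grad if p.grad is not None else torch.zeros_like(p) for p in params]

def flatten_tensors(tensors):
    # Concatenate all gradients into one flat 1-D vector
    return torch.cat([t.reshape(-1) for t in tensors])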
Example #24
    def train(self, train_notes, val=[], test=[]):
        """
        ClinerModel::train()

        Purpose: Train a Machine Learning model on annotated data

        @param train_notes. A list of Note objects (containing text and annotations)
        @return       None
        """

        # Extract formatted data
        train_sents  = flatten([n.getTokenizedSentences() for n in train_notes])
        train_labels = flatten([n.getTokenLabels()        for n in train_notes])

        if test:
            test_sents  = flatten([n.getTokenizedSentences() for n in test])
            test_labels = flatten([n.getTokenLabels()        for n in test])
        else:
            test_sents  = []
            test_labels = []

        if val:
            print ("VAL")
            val_sents  = flatten([n.getTokenizedSentences() for n in val])
            val_labels = flatten([n.getTokenLabels()        for n in val])
            self.train_fit(train_sents,train_labels,val_sents=val_sents,val_labels=val_labels,test_sents=test_sents,test_labels=test_labels)

        else:
            print ("NO DEV")
            self.train_fit(train_sents, train_labels, dev_split=0.1,
                           test_sents=test_sents, test_labels=test_labels)

        self._train_files = [ n.getName() for n in train_notes+val ]
Example #25
 def _child_compute(self, cr, uid, ids, name, args, context=None):
     obj_dept = self.pool.get('hr.department')
     obj_user = self.pool.get('res.users')
     result = {}
     for user_id in ids:
         child_ids = []
         cr.execute('SELECT dept.id FROM hr_department AS dept \
                     LEFT JOIN hr_employee AS emp ON dept.manager_id = emp.id \
                     WHERE emp.id IN \
                         (SELECT emp.id FROM hr_employee \
                             JOIN resource_resource r ON r.id = emp.resource_id WHERE r.user_id='
                    + str(user_id) + ') ')
         mgnt_dept_ids = [x[0] for x in cr.fetchall()]
         ids_dept = obj_dept.search(cr,
                                    uid,
                                    [('id', 'child_of', mgnt_dept_ids)],
                                    context=context)
         if ids_dept:
             data_dept = obj_dept.read(cr,
                                       uid,
                                       ids_dept, ['member_ids'],
                                       context=context)
             emp_children = map(lambda x: x['member_ids'], data_dept)
             emp_children = tools.flatten(emp_children)
             children = self.emp_to_users(cr,
                                          uid,
                                          emp_children,
                                          context=context)
             children = obj_user.search(cr,
                                        uid, [('id', 'in', children),
                                              ('active', '=', True)],
                                        context=context)
             if user_id in children:
                 children.remove(user_id)
             child_ids.extend(tools.flatten(children))
             set = {}
             map(set.__setitem__, child_ids, [])
             child_ids = set.keys()
         result[user_id] = child_ids
     return result
Example #26
def second_pass_data_and_labels(notes):
    '''
    second_pass_data_and_labels()

    Purpose: Interface with notes object to get text data and labels

    @param notes. List of Note objects
    @return <tuple> whose elements are:
              0) list of chunked sentences
              1) list of list-of-indices designating chunks
              2) list of labels for chunks

    >>> import os
    >>> from notes.note import Note
    >>> base_dir = os.path.join(os.getenv('CLINER_DIR'), 'tests', 'data')
    >>> txt = os.path.join(base_dir, 'single.txt')
    >>> con = os.path.join(base_dir, 'single.con')
    >>> note_tmp = Note('i2b2')
    >>> note_tmp.read(txt, con)
    >>> notes = [note_tmp]
    >>> second_pass_data_and_labels(notes)
    ([['The score stood four to two', ',', 'with', 'but', 'one', 'inning', 'more', 'to', 'play', ',']], [[0]], ['problem'])
    '''

    # Get the data and annotations from the Note objects
    l_chunked_sentences = [note.getChunkedText() for note in notes]
    l_inds_list = [note.getConceptIndices() for note in notes]
    l_con_labels = [note.getConceptLabels() for note in notes]

    chunked_sentences = flatten(l_chunked_sentences)
    inds_list = flatten(l_inds_list)
    con_labels = flatten(l_con_labels)

    #print 'labels: ', len(con_labels)
    #print 'inds:   ', sum(map(len,inds_list))
    #exit()

    return chunked_sentences, inds_list, con_labels
Example #27
 def replay(self, mem, batchsize=None): #training area
     batchsize = len(mem) if batchsize is None else batchsize
     minibatch = random.sample(mem, min(len(mem), batchsize))
     for state, action, reward, next_state, done in minibatch:
         target = self.fPass(state)
         if done:
             target[action] = 0
         else:
             target[action] = reward + self.gamma * numpy.max(self.fPass(next_state))
         guess = self.forward(self.toTensor(tools.flatten(state)))
         loss = self.criterion(guess, self.toTensor(target))
         self.optimizer.zero_grad()
         loss.backward()
         self.optimizer.step()
Example #28
 def run_query(self, query):
     params = {
         "date1": super()._dateformat(self.num_days),
         "date2": super()._dateformat(),
         "version": super()._versioncond()
     }
     self.cursor.execute(query.format(**params))
     result = self.cursor.fetchall()
     if not self.totalusers:
         super()._totalusers(params, 'modules')
     return {
         'versions': tools.flatten(result),
         'count': self.totalusers
     }
Example #29
def main():
	parser = argparse.ArgumentParser(description="Post-process dual ISO images with Magic Lantern.", parents=[logger.loggingParser])
	
	parser.add_argument("images", nargs="+",
	                    help="Image files or directories.")
	parser.add_argument("--cr2hdr", default="$HOME/magic-lantern/modules/dual_iso/cr2hdr",
	                    help="Executable cr2hdr. [Default: %(default)s]")
	parser.add_argument("--raw-ext", nargs="+", default=["cr2", "CR2"],
	                    help="RAW file extensions. [Default: %(default)s]")
	
	args = parser.parse_args()
	logger.initLogger(args)
	
	args.cr2hdr = os.path.expandvars(args.cr2hdr)
	
	images = tools.flatten(
			[tools.flatten([glob.glob(os.path.join(arg, "*."+ext)) for ext in args.raw_ext])
			 if os.path.isdir(arg)
			 else [arg]
			 for arg in args.images]
	)
	
	"""
	images_isos = [
			(image, phototools.load_exif_field(image, "-ISO"), phototools.load_exif_field(image, "-AutoISO"))
			for image in progressiterator.ProgressIterator(images, description="Load EXIF ISO infos")
	]
	
	dual_iso_images = [image for image, iso, auto_iso in images_isos if iso != auto_iso]
	"""
	dual_iso_images = images
	
	for image in progressiterator.ProgressIterator(dual_iso_images, description="Post-process dual ISO images"):
		command = "%s %s" % (args.cr2hdr, image)
		log.debug(command)
		logger.subprocessCall(command)
Example #30
    def _child_compute(self, cr, uid, ids, name, args, context={}):
        obj_dept = self.pool.get('hr.department')
        obj_user = self.pool.get('res.users')
        result = {}
        for manager_id in ids:
            child_ids = []
            mgnt_dept_ids = obj_dept.search(cr, uid, [('manager_id', '=', manager_id)])
            ids_dept = obj_dept.search(cr, uid, [('id', 'child_of', mgnt_dept_ids)])
            if ids_dept:
                data_dept = obj_dept.read(cr, uid, ids_dept, ['member_ids'])
                childs = map(lambda x: x['member_ids'], data_dept)
                childs = tools.flatten(childs)
                childs = obj_user.search(cr, uid, [('id','in',childs),('active','=',True)])
                if manager_id in childs:
                    childs.remove(manager_id)

                child_ids.extend(tools.flatten(childs))
                set = {}
                map(set.__setitem__, child_ids, [])
                child_ids = set.keys()
            else:
                child_ids = []
            result[manager_id] = child_ids
        return result
Example #31
def generic_predict(p_or_n, tokenized_sents, vocab, clf, use_lstm):
    '''
    generic_predict()

    Predict using a model that works for both prose and nonprose

    @param p_or_n.          A string that indicates "prose", "nonprose", or "all"
    @param tokenized_sents. A list of sentences, where each sentence is tokenized
                              into words
    @param vocab.           A dictionary mapping word tokens to numeric indices.
    @param clf.             An encoding of the trained keras model.
    @param use_lstm.        Bool indicating whether clf is a CRF or LSTM.
    '''

    # If nothing to predict, skip actual prediction
    if len(tokenized_sents) == 0:
        print '\tnothing to predict ' + p_or_n
        return []

    print '\tvectorizing words ' + p_or_n

    if use_lstm:
        # vectorize tokenized sentences
        X = []
        for sent in tokenized_sents:
            id_seq = []
            for w in sent:
                if w in vocab:
                    id_seq.append(vocab[w])
                else:
                    id_seq.append(vocab['oov'])
            X.append(id_seq)
    else:
        # vectorize validation X
        text_features = extract_features(tokenized_sents)
        flat_X_feats = vocab.transform(flatten(text_features))
        X = reconstruct_list(flat_X_feats, save_list_structure(text_features))

    print '\tpredicting  labels ' + p_or_n

    # Predict labels
    if use_lstm:
        predictions = keras_ml.predict(clf, X)
    else:
        predictions = crf.predict(clf, X)

    # Format labels from output
    return predictions
Example #32
def detect_alignments(alignment_pair, segment_length=10, overlap=5, statistics=None, dist_func=token_match):
    logging.debug(str(alignment_pair))

    seg_dists = compute_distances(alignment_pair.susp_doc, alignment_pair.src_doc,
                                  segment_length=segment_length, overlap=overlap,
                                  dist_func=dist_func)
    detected = detect_segments(seg_dists)

    if statistics:
        statistics.add_detected_count(len(detected))
        statistics.add_susp_detected_count(len(set(map(itemgetter(0), detected))))
        statistics.add_src_detected_count(len(set(map(itemgetter(1), detected))))

    detected = flatten([zip(seg_to_sent(susp, segment_length, overlap), seg_to_sent(src, segment_length, overlap))
                        for susp, src in detected])

    return detected
Example #33
 def search(self,
            cr,
            uid,
            args,
            offset=0,
            limit=None,
            order=None,
            context=None,
            count=False):
     res = []
     log_ids = super(res_log, self).search(cr, uid, args, offset, limit,
                                           order, context, count)
     logs = {}
     for log in self.browse(cr, uid, log_ids, context=context):
         res_dict = logs.get(log.res_model, {})
         res_dict.update({log.res_id: log.id})
         logs.update({log.res_model: res_dict})
     res = map(lambda x: x.values(), logs.values())
     return tools.flatten(res)
Example #34
 def __init__(self, name_build, config=Configuration(), *args, **kwargs):
   """ Model builder constructor.
   Args:
     name_build Model name or constructor function
     config     Configuration to use for the parameter tensors
     ...        Additional (keyword-)arguments forwarded to the constructor
   Notes:
     If possible, data parallelism is enabled automatically
   """
   # Recover name/constructor
   if callable(name_build):
     name  = tools.fullqual(name_build)
     build = name_build
   else:
     models = type(self)._get_models()
     name  = str(name_build)
     build = models.get(name, None)
     if build is None:
       raise tools.UnavailableException(models, name, what="model name")
   # Build model
   with torch.no_grad():
     model = build(*args, **kwargs)
     if not isinstance(model, torch.nn.Module):
       raise tools.UserException("Expected built model %r to be an instance of 'torch.nn.Module', found %r instead" % (name, getattr(type(model), "__name__", "<unknown>")))
     model = model.to(**config)
     device = config["device"]
     if device.type == "cuda" and device.index is None: # Model is on GPU and not explicitly restricted to one particular card => enable data parallelism
       model = torch.nn.DataParallel(model)
   params = tools.flatten(model.parameters()) # NOTE: Ordering across runs/nodes seems to be ensured (i.e. only dependent on the model constructor)
   # Finalization
   self._model    = model
   self._name     = name
   self._config   = config
   self._params   = params
   self._gradient = None
   self._defaults = {
     "trainset":  None,
     "testset":   None,
     "loss":      None,
     "criterion": None,
     "optimizer": None }
Example #35
def read_gold_alignments(alignment_pair):
    doc = BeautifulStoneSoup(open(alignment_pair.plagiarism_xml_fn()).read())

    plag_spans = []

    for feature in doc.findAll('feature'):
        if feature['name'] == 'plagiarism':
            src_span = (int(feature['source_offset']), int(feature['source_length']))
            susp_span = (int(feature['this_offset']), int(feature['this_length']))

            plag_spans.append((susp_span, src_span))

    plag_segs = []

    for (susp_offset, susp_len), (src_offset, src_len) in plag_spans:
        susp_seg = match_seg(alignment_pair.susp_doc, susp_offset, susp_len)
        src_seg = match_seg(alignment_pair.src_doc, src_offset, src_len)

        plag_segs.append((susp_seg, src_seg))

    return flatten(plag_segs)
Example #36
def reward(threshold, defender_rates, attacker_rates, defender_costs,
           attacker_costs):
    n = len(defender_rates)
    if not (len(attacker_rates) == len(defender_costs) ==
            len(attacker_costs) == n):
        raise Exception("Unequal set of lists")
    else:
        resources = list(range(0, n))
        subsets = []
        for t in range(threshold, n + 1):
            subsets.append(list(itertools.combinations(resources, t)))

        subsets = tools.flatten(subsets)

        if () in subsets and len(defender_costs) != 1:
            subsets.remove(())

        gain = 0
        for l in subsets:
            # creating the product
            p = 1
            for i in range(0, n):
                p *= 1 / (defender_rates[i] + attacker_rates[i])

                if i in l:
                    p *= attacker_rates[i]
                else:
                    p *= defender_rates[i]
            gain += p

        defender_move_cost = 0
        attacker_move_cost = 0
        for i in range(0, n):
            defender_move_cost += defender_rates[i] * defender_costs[i]
            attacker_move_cost += attacker_rates[i] * attacker_costs[i]

        defender_reward = 1 - gain - defender_move_cost
        attacker_reward = gain - attacker_move_cost

        return defender_reward, attacker_reward
Example #37
	def images_to_video(self, image_files, image_durations=[5.0], transition_times=[1.0], video_file=None):
		# determine video file
		video_file = self._check_output_video_file(video_file)
		
		# check parameter list lengths
		if len(image_durations) > 1 and len(image_durations) < len(image_files):
			log.warning("Not enough image durations specified.")
		image_durations = (image_durations*len(image_files))[:len(image_files)]
		
		if len(transition_times) > 1 and len(transition_times) < (len(image_files) - 1):
			log.warning("Not enough image transition times specified.")
		transition_times = (transition_times*(len(image_files)-1))[:(len(image_files)-1)]
		
		# create image videos
		image_videos = [
				Multivision.image_to_video(self, image_file, duration, output_file_extension=self.video_format)
				for image_file, duration
				in progressiterator.ProgressIterator(zip(image_files, image_durations), description="Process images")
		]
		
		# create transition videos
		transition_videos = [
				Multivision.image_transition_to_video(self, image_file1, image_file2, transition_time, output_file_extension=self.video_format)
				for image_file1, image_file2, transition_time
				in progressiterator.ProgressIterator(zip(image_files[:-1], image_files[1:], transition_times), description="Process image transitions")
		]
		
		# concatenate videos
		video_files = tools.flatten(zip(image_videos[:-1], transition_videos))+[image_videos[-1]]
		video_files = [v for v in video_files if v != "None"]
		temporary_files = video_files
		self.concatenate_videos(video_files, concatenated_video_file=video_file)
		
		# remove temporary files
		#for temporary_file in temporary_files:
		#	os.remove(temporary_file)
		
		# return video file
		return video_file
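The tools.flatten(zip(...)) idiom interleaves the per-image clips with the transition clips. With made-up file names and a one-level flatten it behaves roughly like:

clips = ["img1.mp4", "img2.mp4", "img3.mp4"]
transitions = ["t12.mp4", "t23.mp4"]
# flatten(zip(clips[:-1], transitions)) + [clips[-1]]
# -> ["img1.mp4", "t12.mp4", "img2.mp4", "t23.mp4", "img3.mp4"]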
Example #38
    def to_sql(self):
        stack = []
        params = []
        for i, e in reverse_enumerate(self.__exp):
            if self._is_leaf(e, internal=True):
                table = self.__tables.get(i, self.__main_table)
                q, p = self.__leaf_to_sql(e, table)
                params.insert(0, p)
                stack.append(q)
            else:
                if e == '!':
                    stack.append('(NOT (%s))' % (stack.pop(),))
                else:
                    ops = {'&': ' AND ', '|': ' OR '}
                    q1 = stack.pop()
                    q2 = stack.pop()
                    stack.append('(%s %s %s)' % (q1, ops[e], q2,))

        query = ' AND '.join(reversed(stack))
        joins = ' AND '.join(map(lambda j: j[0], self.__joins))
        if joins:
            query = '(%s) AND (%s)' % (joins, query)
        return (query, flatten(params))
Example #39
File: ttree.py  Project: xshi/dhad
        elif stopatradeta and (self.pdgid == pdgid_eta and
                               pdgid_gamma in map(lambda x: x.pdgid,
                                                      self.daughters)):
            return [self]
        elif stopatradomega and (self.pdgid == pdgid_omega and
                                 pdgid_gamma in map(lambda x: x.pdgid,
                                                    self.daughters)):
            #print 'here'
            return [self]

        else:
            return tools.flatten([dau.interestingDescendants(terminii=terminii,
                                                             stopatradrho=stopatradrho,
                                                             stopatraddecay=stopatraddecay,
                                                             stopatallphodecay=stopatallphodecay,
                                                             stopatradeta=stopatradeta,
                                                             stopatradomega=stopatradomega)
                                  for dau in self.daughters])

    def mcDmode(self):
        if abs(self.pdgid) not in (pdgid_Dp, pdgid_Dz, pdgid_Dsp):
            return None
        else:
            list = self.interestingDescendants()
            retval = 0
            for node in list:
                if node.pdgid == pdgid_Km:
                    retval += 1
                elif node.pdgid == pdgid_Kp:
                    retval += 10
Example #40
    def __second_predict(self, chunked_sentences, inds_list):

        # If first pass predicted no concepts, then skip
        # NOTE: Special case because SVM cannot have empty input
        if sum([len(inds) for inds in inds_list]) == 0:
            print "first pass predicted no concepts, skipping second pass"
            return []

        # Create object that is a wrapper for the features
        if globals_cliner.verbosity > 0:
            print '\textracting  features (pass two)'

        print '\textracting  features (pass two)'

        # Extract features
        text_features = [
            feat_obj.concept_features(s, inds)
            for s, inds in zip(chunked_sentences, inds_list)
        ]
        flattened_text_features = flatten(text_features)

        print '\tvectorizing features (pass two)'

        if globals_cliner.verbosity > 0:
            print '\tvectorizing features (pass two)'

        # Vectorize features
        vectorized_features = self._second_vec.transform(
            flattened_text_features)

        if globals_cliner.verbosity > 0:
            print '\tpredicting    labels (pass two)'

        # Predict concept labels
        out = sci.predict(self._second_clf, vectorized_features)

        # Line-by-line processing
        o = list(out)
        classifications = []
        for lineno, inds in enumerate(inds_list):

            # Skip empty line
            if not inds: continue

            # For each concept
            for ind in inds:

                # Get next concept
                concept = reverse_concept_labels[o.pop(0)]

                # Get start position (ex. 7th word of line)
                start = 0
                for i in range(ind):
                    start += len(chunked_sentences[lineno][i].split())

                # Length of chunk
                length = len(chunked_sentences[lineno][ind].split())

                # Classification token
                classifications.append(
                    (concept, lineno + 1, start, start + length - 1))

        # Return classifications
        return classifications
Example #41
 def add_rooms(*rooms):
     """ Add multiple room objects to the building.
     """
     room_list = tools.flatten(rooms)
     for room in room_list:
         Amity.add_room(room)
Example #42
    def setup(self):
        # Extract a single experiment from the table that is not
        # already running.  set self.experiment and self.state
        super(DBRSyncChannel, self).setup()

        self.state.jobman.sql.host_name = socket.gethostname()

        def state_del(state, keys):
            # Delete from the state the following key if present
            for key in keys:
                if hasattr(state, key):
                    del state[key]

        # put jobs scheduler info into the state
        condor_slot = os.getenv("_CONDOR_SLOT")
        sge_task_id = os.getenv('SGE_TASK_ID')
        pbs_task_id = os.getenv('PBS_JOBID')
        if condor_slot:
            self.state.jobman.sql.condor_slot = condor_slot
            job_ad_file = os.getenv("_CONDOR_JOB_AD", None)
            if job_ad_file:
                f = open(job_ad_file)
                try:
                    for line in f.readlines():
                        if line.startswith('GlobalJobId = '):
                            self.state.jobman.sql.condor_global_job_id = line.split(
                                '=')[1].strip()[1:-1]
                        elif line.startswith('Out = '):
                            self.state.jobman.sql.condor_stdout = line.split(
                                '=')[1].strip()[1:-1]
                        elif line.startswith('Err = '):
                            self.state.jobman.sql.condor_stderr = line.split(
                                '=')[1].strip()[1:-1]
                        elif line.startswith('OrigIwd = '):
                            self.state.jobman.sql.condor_origiwd = line.split(
                                '=')[1].strip()[1:-1]
                finally:
                    f.close()
        elif sge_task_id:
            self.state.jobman.sql.sge_task_id = sge_task_id
            self.state.jobman.sql.job_id = os.getenv('JOB_ID')
            self.state.jobman.sql.sge_stdout = os.getenv('SGE_STDOUT_PATH')
            self.state.jobman.sql.sge_stderr = os.getenv('SGE_STDERR_PATH')
        elif pbs_task_id:
            self.state.jobman.sql.pbs_task_id = pbs_task_id
            self.state.jobman.sql.pbs_queue = os.getenv('PBS_QUEUE')
            self.state.jobman.sql.pbs_arrayid = os.getenv('PBS_ARRAYID')
            self.state.jobman.sql.pbs_num_ppn = os.getenv('PBS_NUM_PPN')

        # delete old jobs scheduler info into the state
        # this is needed in case we move a job to a different system.
        # to know where it is running now.
        key_to_del = []
        if not condor_slot:
            key_to_del.extend(['jobman.sql.condor_global_job_id',
                               'jobman.sql.condor_stdout',
                               'jobman.sql.condor_stderr',
                               'jobman.sql.condor_origiwd',
                               'jobman.sql.condor_slot'])
        if not sge_task_id:
            key_to_del.extend(['jobman.sql.sge_task_id',
                               'jobman.sql.job_id',
                               'jobman.sql.sge_stdout',
                               'jobman.sql.sge_stderr'])
        if not pbs_task_id:
            key_to_del.extend(['jobman.sql.pbs_task_id',
                               'jobman.sql.pbs_queue',
                               'jobman.sql.pbs_arrayid',
                               'jobman.sql.pbs_num_ppn'])

        flattened_state = flatten(self.state)
        deleted = False
        for k in key_to_del:
            if k in flattened_state:
                del flattened_state[k]
                deleted = True
        if deleted:
            self.state = expand(flattened_state)

        self.state.jobman.sql.start_time = time.time()
        self.state.jobman.sql.host_workdir = self.path
        self.dbstate.update(flatten(self.state))
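Unlike most examples on this page, jobman's flatten()/expand() operate on nested state objects with dotted keys rather than on lists. A rough sketch of that behavior, assuming the state acts like a plain nested dict (the real implementation differs):

def flatten_state(d, prefix=''):
    # {'jobman': {'status': 1}} -> {'jobman.status': 1}
    flat = {}
    for key, value in d.items():
        dotted = prefix + '.' + key if prefix else key
        if isinstance(value, dict):
            flat.update(flatten_state(value, dotted))
        else:
            flat[dotted] = value
    return flat

def expand_state(flat):
    # Inverse of flatten_state: rebuild the nested dict from dotted keys
    nested = {}
    for dotted, value in flat.items():
        parts = dotted.split('.')
        node = nested
        for part in parts[:-1]:
            node = node.setdefault(part, {})
        node[parts[-1]] = value
    return nested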
Example #43
    def interestingDescendants(self,
                               terminii=interesting_particles,
                               stopatradrho=True,
                               stopatraddecay=True,
                               stopatallphodecay=True,
                               stopatradeta=True,
                               stopatradomega=True):
        if self.pdgid == pdgid_gammaFSR:
            return []
        if (abs(self.pdgid) == pdgid_rhop and len(self.daughters) < 2):
            print[particle_data.findId(x.pdgid).name for x in self.daughters]
            return self.daughters
        if ((abs(self.pdgid) == pdgid_rhop and len(self.daughters) > 1
             and abs(self.daughters[0].pdgid) == pdgid_pip
             and self.daughters[1].pdgid == pdgid_gamma)):
            print 'rho->pi gamma: hahahaha'
            print self
            for node in self.daughters:
                print node
        # we don't care about decays in flight or pi0 daughters
        # Dumb hack : we don't care right now for non-FSR photons, so
        # terminate if X -> Y gamma, else carry on
        # But hey, this kills radiative K* decays ... make an option
        if self.pdgid in terminii:
            return [self]
        elif (stopatallphodecay and len(self.daughters) == len(
                filter(lambda x: x.pdgid == pdgid_gamma, self.daughters))):
            return [self]
        elif stopatraddecay and (len(self.daughters) == 2
                                 and pdgid_gamma in (self.daughters[0].pdgid,
                                                     self.daughters[1].pdgid)):
            ##             if self.pdgid == pdgid_etaprime:
            ##                 print 'HO'
            ##                 print 'xxxxxxxxxxx'
            ##                 print self
            ##                 print self.daughters[0]
            ##                 print self.daughters[1]
            ##                 print 'xxxxxxxxxxx'
            ##            if self.pdgid == 323:
            ##                print 'here', terminii, stopatraddecay
            return [self]
        elif stopatradrho and ((abs(self.pdgid) == pdgid_rhop
                                and abs(self.daughters[0].pdgid) == pdgid_pip
                                and self.daughters[1].pdgid == pdgid_gamma)):
            print 'rho check'
            return [self]

        elif stopatradeta and (self.pdgid == pdgid_eta and pdgid_gamma in map(
                lambda x: x.pdgid, self.daughters)):
            return [self]
        elif stopatradomega and (self.pdgid == pdgid_omega and pdgid_gamma
                                 in map(lambda x: x.pdgid, self.daughters)):
            #print 'here'
            return [self]

        else:
            return tools.flatten([
                dau.interestingDescendants(terminii=terminii,
                                           stopatradrho=stopatradrho,
                                           stopatraddecay=stopatraddecay,
                                           stopatallphodecay=stopatallphodecay,
                                           stopatradeta=stopatradeta,
                                           stopatradomega=stopatradomega)
                for dau in self.daughters
            ])
Example #44
def generic_train(p_or_n, tokenized_sents, iob_nested_labels,
                  val_sents=None, val_labels=None, dev_split=None):
    '''
    generic_train()

    Train a model that works for both prose and nonprose

    @param p_or_n.             A string that indicates "prose", "nonprose", or "all"
    @param tokenized_sents.    A list of sentences, where each sentence is tokenized
                                 into words
    @param iob_nested_labels.  Parallel to `tokenized_sents`, 7-way labels for 
                                 concept spans
    @param val_sents.          Validation data. Same format as tokenized_sents
    @param val_labels.         Validation data. Same format as iob_nested_labels
    @param dev_split.          A real number from 0 to 1
    '''
    # Must have data to train on
    if len(tokenized_sents) == 0:
        raise Exception('Training must have %s training examples' % p_or_n)

    # if you should split the data into train/dev yourself
    #if (not val_sents) and (dev_split > 0.0) and (len(tokenized_sents)>1000):
    if (not val_sents) and (dev_split > 0.0) and (len(tokenized_sents)>10):

        p = int(dev_split*100)
        print '\tCreating %d/%d train/dev split' % (100-p,p)

        perm = range(len(tokenized_sents))
        random.shuffle(perm)

        tokenized_sents   = [   tokenized_sents[i] for i in perm ]
        iob_nested_labels = [ iob_nested_labels[i] for i in perm ]

        ind = int(dev_split*len(tokenized_sents))

        val_sents   = tokenized_sents[:ind ]
        train_sents = tokenized_sents[ ind:]

        val_labels   = iob_nested_labels[:ind ]
        train_labels = iob_nested_labels[ ind:]

        tokenized_sents   = train_sents
        iob_nested_labels = train_labels


    print '\tvectorizing words', p_or_n

    #tokenized_sents   = train_sents[ :2]
    #iob_nested_labels = train_labels[:2]

    # count word frequencies to determine OOV
    freq = defaultdict(int)
    for sent in tokenized_sents:
        for w in sent:
            freq[w] += 1

    # determine OOV based on % of vocab or minimum word freq threshold
    oov = set()
    '''
    if len(freq) < 100:
        lo = len(freq)/20
        oov = set([ w for w,f in sorted(freq.items(), key=lambda t:t[1]) ][:lo])
    else:
        #lo = 2
        #oov = set([ w for w,f in freq.items() if (f <= lo) ])
        oov = set()
    '''

    '''
    val = None
    for w,f in sorted(freq.items(), key=lambda t:t[1]):
        if val != f:
            val = f
            print
        print '%8d  %s' % (f,w)
    exit()
    '''

    ########
    # CRF 
    ########

    # vectorize tokenized sentences
    '''
    def make_feature(ind):
        return {(ind,i):1 for i in range(10)}
    text_features = []
    for sent in tokenized_sents:
        fseq = [make_feature(vocab[w] if w in vocab else vocab['oov']) for w in sent]
        text_features.append(fseq)
    '''
    text_features = extract_features(tokenized_sents)

    # Collect list of feature types
    enabled_features = set()
    for sf in text_features:
        for wf in sf:
            for (feature_type,instance),value in wf.items():
                if feature_type.startswith('prev'):
                    feature_type = 'PREV*'
                if feature_type.startswith('next'):
                    feature_type = 'NEXT*'
                enabled_features.add(feature_type)
    enabled_features = sorted(enabled_features)

    # Vectorize features
    vocab = DictVectorizer()
    flat_X_feats = vocab.fit_transform( flatten(text_features) )
    X_feats = reconstruct_list(flat_X_feats, save_list_structure(text_features))

    # vectorize IOB labels
    Y_labels = [ [tag2id[y] for y in y_seq] for y_seq in iob_nested_labels ]

    assert len(X_feats) == len(Y_labels)
    for i in range(len(X_feats)):
        assert X_feats[i].shape[0] == len(Y_labels[i])


    # default to no validation data (crf_ml.train is assumed to accept None here)
    val_X, val_Y = None, None

    # if there is specified validation data, then vectorize it
    if val_sents:
        # vectorize validation X
        val_text_features = extract_features(val_sents)
        flat_val_X_feats = vocab.transform( flatten(val_text_features) )
        val_X = reconstruct_list(flat_val_X_feats, 
                                 save_list_structure(val_text_features))
        # vectorize validation Y
        val_Y = [ [tag2id[y] for y in y_seq] for y_seq in val_labels ]


    print '\ttraining classifiers', p_or_n

    #val_sents  = val_sents[ :5]
    #val_labels = val_labels[:5]

    # train using crf
    clf, dev_score  = crf_ml.train(X_feats, Y_labels, val_X=val_X, val_Y=val_Y)

    return vocab, clf, dev_score, enabled_features
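# A minimal, self-contained sketch of the flatten -> DictVectorizer -> reconstruct
# round trip used above; flatten, save_list_structure and reconstruct_list below are
# toy stand-ins, not the project's real helpers, and the feature keys are plain strings
# rather than the (feature_type, value) tuples used in the snippet.
from sklearn.feature_extraction import DictVectorizer

def flatten(nested):
    # collapse a list of per-sentence lists into one flat list of items
    return [item for sub in nested for item in sub]

def save_list_structure(nested):
    # remember how many tokens each sentence had
    return [len(sub) for sub in nested]

def reconstruct_list(flat_matrix, lengths):
    # slice the flat feature matrix back into one block per sentence
    chunks, start = [], 0
    for n in lengths:
        chunks.append(flat_matrix[start:start + n])
        start += n
    return chunks

# toy per-token feature dicts for two "sentences"
text_features = [
    [{'word=the': 1, 'pos=DT': 1}, {'word=score': 1}],
    [{'word=four': 1}],
]

vocab = DictVectorizer()
flat_X = vocab.fit_transform(flatten(text_features))        # 3 rows, one per token
X_feats = reconstruct_list(flat_X, save_list_structure(text_features))
assert [x.shape[0] for x in X_feats] == [2, 1]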

Пример #45
0
# meta verbs with synonyms
meta_verbs = (("q", "quit"), )

# direction verbs with synonyms
direction_verbs = (
    ("n", "north"),
    ("e", "east"),
    ("s", "south"),
    ("w", "west"),
)

# all the verbs!
verbs = tools.flatten((
    *meta_verbs,
    *direction_verbs,
))
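# Assuming tools.flatten collapses one level of nesting, the verbs collection ends up
# as a flat sequence of each verb and its synonym; a stand-in for illustration:
def flatten_once(nested):
    return [item for group in nested for item in group]

# flatten_once((*meta_verbs, *direction_verbs))
# -> ['q', 'quit', 'n', 'north', 'e', 'east', 's', 'south', 'w', 'west']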

########################################
#   REPL
########################################

game = Game("Adventure", rooms, player)
game.start("outside")

while True:

    # get user input
    user_input = game.prompt_user()

    # check validity of user input
Пример #46
0
def get_files(directory, extensions):
	files = tools.flatten([glob.glob(os.path.join(directory, "*.%s" % ext)) for ext in extensions])
	return list(set(files))
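# A quick usage sketch, assuming tools.flatten collapses the per-extension glob results
# into one list; extensions are given without the leading dot, and ordering is not
# preserved because of the set() de-duplication.
files = get_files('.', ['py', 'txt'])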
Пример #47
0
def extract_table_from_img(input_img_name,
                           output_img_path=None,
                           show_tables=False,
                           save_small_tables=False,
                           get_test_tables=False):
    """
    table extracted from img will be saved in table_info
    if want draw rectangles to directly show tables of the img set show_tables=True

    """
    print('Extracting tables from the image')
    assert os.path.isfile(input_img_name), 'input image not found: %s' % input_img_name
    img = cv2.imread(input_img_name)
    max_area = img.shape[0] * img.shape[1]
    max_area_condition = max_area * 3 / 4
    gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    edge_img = cv2.Canny(gray_img, 50, 150)
    dilate_kernel = cv2.getStructuringElement(cv2.MORPH_OPEN, (7, 3))
    dilate_image = cv2.dilate(edge_img, dilate_kernel, iterations=1)
    res, binary_img = cv2.threshold(dilate_image, 45, 255, cv2.THRESH_BINARY)

    horizontal_dilation = get_table_lines(binary_img, kernel_size=(50, 1))
    vertical_dilation = get_table_lines(binary_img, kernel_size=(1, 50))
    table_dilation = horizontal_dilation + vertical_dilation
    # table_dilation = cv2.dilate(table_dilation, dilate_kernel, iterations=1)
    table_dilation, contours, hierarchy = cv2.findContours(
        table_dilation, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
    # print(len(contours))
    rec_coo = []
    for i in range(len(contours)):
        contour_coordinates = contours[i]
        x_coordinates = flatten(contour_coordinates[:, :, 0].tolist())
        y_coordinates = flatten(contour_coordinates[:, :, 1].tolist())
        x_no_repeat_list = list(set(x_coordinates))
        y_no_repeat_list = list(set(y_coordinates))
        same_x_num = len(x_coordinates) - len(x_no_repeat_list)
        same_y_num = len(y_coordinates) - len(y_no_repeat_list)
        if same_x_num >= 2 and same_y_num >= 2:
            rec_x_min, rec_y_min, rec_x_max, rec_y_max = find_left_right_conner(
                x_coordinates, y_coordinates)
            find_area = (rec_x_max - rec_x_min) * (rec_y_max - rec_y_min)
            if find_area != 0 and find_area < max_area_condition:
                # print('find left right conner')
                rec_coo.append([rec_x_min, rec_y_min, rec_x_max, rec_y_max])
                if show_tables or get_test_tables:
                    cv2.rectangle(img, (rec_x_min, rec_y_min),
                                  (rec_x_max, rec_y_max), (0, 255, 0), 3)
                    f_name = get_file_name(input_img_name)
                    cv2.imwrite(f_name + '_Draw.png', img)
    table_num = 0
    # extract table from img_name
    rec_list = []
    for x_y_coo in rec_coo:
        rec = img[x_y_coo[1]:x_y_coo[3] + 1, x_y_coo[0]:x_y_coo[2] + 1]
        rec_list.append(rec)
        if save_small_tables:
            table_label = os.path.join(
                output_img_path, 'g1_0_table_cut' + str(table_num) + '.png')
            cv2.imwrite(table_label, rec)
            table_num += 1
    return rec_list, rec_coo
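# get_table_lines is not shown in this snippet; a plausible minimal version, assuming it
# isolates long horizontal or vertical strokes by eroding and then dilating the binary
# image with the given elongated kernel:
import cv2

def get_table_lines(binary_img, kernel_size):
    # erode removes strokes shorter than the kernel; dilate grows the survivors back
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, kernel_size)
    eroded = cv2.erode(binary_img, kernel, iterations=1)
    return cv2.dilate(eroded, kernel, iterations=1)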
Пример #48
0
def get_cases_ids_from_references(references):
    return list({
        int(x) for x in flatten([CASE_ID_RE.findall(ref) for ref in references])
    })
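# CASE_ID_RE is not shown here; assuming it captures numeric case ids, usage could look
# like this (the pattern below is only an illustration, not the project's):
import re

CASE_ID_RE = re.compile(r'#(\d+)')
references = ['fixes #12 and #7', 'see also #12']
# get_cases_ids_from_references(references) -> [12, 7] (set order is not guaranteed)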
Пример #49
0
def generic_train(p_or_n,
                  tokenized_sents,
                  iob_nested_labels,
                  use_lstm,
                  val_sents=None,
                  val_labels=None,
                  dev_split=None):
    '''
    generic_train()

    Train a model that works for both prose and nonprose

    @param p_or_n.             A string that indicates "prose", "nonprose", or "all"
    @param tokenized_sents.    A list of sentences, where each sentence is tokenized
                                 into words
    @param iob_nested_labels.  Parallel to `tokenized_sents`, 7-way labels for 
                                 concept spans
    @param use_lstm            Bool indicating whether to train CRF or LSTM.
    @param val_sents.          Validation data. Same format as tokenized_sents
    @param val_labels.         Validation data. Same format as iob_nested_labels
    @param dev_split.          A real number from 0 to 1
    '''

    # Must have data to train on:
    if len(tokenized_sents) == 0:
        raise Exception('Training must have %s training examples' % p_or_n)

    # if you should split the data into train/dev yourself
    #if (not val_sents) and (dev_split > 0.0) and (len(tokenized_sents)>1000):
    if (not val_sents) and (dev_split > 0.0) and (len(tokenized_sents) > 10):

        p = int(dev_split * 100)
        print '\tCreating %d/%d train/dev split' % (100 - p, p)

        perm = range(len(tokenized_sents))
        random.shuffle(perm)

        tokenized_sents = [tokenized_sents[i] for i in perm]
        iob_nested_labels = [iob_nested_labels[i] for i in perm]

        ind = int(dev_split * len(tokenized_sents))

        val_sents = tokenized_sents[:ind]
        train_sents = tokenized_sents[ind:]

        val_labels = iob_nested_labels[:ind]
        train_labels = iob_nested_labels[ind:]

        tokenized_sents = train_sents
        iob_nested_labels = train_labels

    print '\tvectorizing words', p_or_n

    #tokenized_sents   = train_sents[ :2]
    #iob_nested_labels = train_labels[:2]

    # count word frequencies to determine OOV
    freq = defaultdict(int)
    for sent in tokenized_sents:
        for w in sent:
            freq[w] += 1

    # determine OOV based on % of vocab or minimum word freq threshold
    oov = set()
    '''
    if len(freq) < 100:
        lo = len(freq)/20
        oov = set([ w for w,f in sorted(freq.items(), key=lambda t:t[1]) ][:lo])
    else:
        #lo = 2
        #oov = set([ w for w,f in freq.items() if (f <= lo) ])
        oov = set()
    '''
    '''
    val = None
    for w,f in sorted(freq.items(), key=lambda t:t[1]):
        if val != f:
            val = f
            print
        print '%8d  %s' % (f,w)
    exit()
    '''

    # default to no validation data (the training backends below are assumed to accept None)
    val_X, val_Y = None, None

    if use_lstm:
        ########
        # LSTM
        ########

        # build vocabulary of words
        vocab = {}
        for sent in tokenized_sents:
            for w in sent:
                if (w not in vocab) and (w not in oov):
                    vocab[w] = len(vocab) + 1
        vocab['oov'] = len(vocab) + 1

        # vectorize tokenized sentences
        X_seq_ids = []
        for sent in tokenized_sents:
            id_seq = [(vocab[w] if w in vocab else vocab['oov']) for w in sent]
            X_seq_ids.append(id_seq)

        # vectorize IOB labels
        Y_labels = [[tag2id[y] for y in y_seq] for y_seq in iob_nested_labels]

        # if there is specified validation data, then vectorize it
        if val_sents:
            # vectorize validation X
            val_X = []
            for sent in val_sents:
                id_seq = [(vocab[w] if w in vocab else vocab['oov'])
                          for w in sent]
                val_X.append(id_seq)
            # vectorize validation Y
            val_Y = [[tag2id[y] for y in y_seq] for y_seq in val_labels]

    else:
        ########
        # CRF
        ########

        # vectorize tokenized sentences
        '''
        def make_feature(ind):
            return {(ind,i):1 for i in range(10)}
        text_features = []
        for sent in tokenized_sents:
            fseq = [make_feature(vocab[w] if w in vocab else vocab['oov']) for w in sent]
            text_features.append(fseq)
        '''
        text_features = extract_features(tokenized_sents)
        # type(text_features): <type 'list'>

        # Collect list of feature types
        enabled_features = set()
        for sf in text_features:
            for wf in sf:
                for (feature_type, instance), value in wf.items():
                    if feature_type.startswith('prev'):
                        feature_type = 'PREV*'
                    if feature_type.startswith('next'):
                        feature_type = 'NEXT*'
                    enabled_features.add(feature_type)
        enabled_features = sorted(enabled_features)

        # Vectorize features
        vocab = DictVectorizer()
        flat_X_feats = vocab.fit_transform(flatten(text_features))
        X_feats = reconstruct_list(flat_X_feats,
                                   save_list_structure(text_features))

        # vectorize IOB labels
        Y_labels = [[tag2id[y] for y in y_seq] for y_seq in iob_nested_labels]

        assert len(X_feats) == len(Y_labels)
        for i in range(len(X_feats)):
            assert X_feats[i].shape[0] == len(Y_labels[i])

        # if there is specified validation data, then vectorize it
        if val_sents:
            # vectorize validation X
            val_text_features = extract_features(val_sents)
            flat_val_X_feats = vocab.transform(flatten(val_text_features))
            val_X = reconstruct_list(flat_val_X_feats,
                                     save_list_structure(val_text_features))
            # vectorize validation Y
            val_Y = [[tag2id[y] for y in y_seq] for y_seq in val_labels]

    print '\ttraining classifiers', p_or_n

    #val_sents  = val_sents[ :5]
    #val_labels = val_labels[:5]

    if use_lstm:
        # train using lstm
        clf, dev_score = keras_ml.train(X_seq_ids,
                                        Y_labels,
                                        tag2id,
                                        len(vocab),
                                        val_X_ids=val_X,
                                        val_Y_ids=val_Y)
    else:
        # train using crf
        clf, dev_score = crf.train(X_feats, Y_labels, val_X=val_X, val_Y=val_Y)

    return vocab, clf, dev_score, enabled_features
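# A minimal illustration of the word-to-id mapping with an 'oov' fallback, as in the
# LSTM branch above (ids are arbitrary here; a real model may reserve 0 for padding):
sents = [['pt', 'denies', 'pain'], ['pain', 'resolved']]

vocab = {}
for sent in sents:
    for w in sent:
        if w not in vocab:
            vocab[w] = len(vocab) + 1
vocab['oov'] = len(vocab) + 1

# an unseen word ('reports') falls back to the 'oov' id
ids = [[vocab[w] if w in vocab else vocab['oov'] for w in s]
       for s in [['pt', 'reports', 'pain']]]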
Пример #50
0
def token_match(susp_sents, src_sents):
    susp_tokens = set(flatten([sent.words() for sent in susp_sents]))
    src_tokens = set(flatten([sent.words() for sent in src_sents]))

    return len(susp_tokens.intersection(src_tokens)) / (len(src_tokens) * 1.0)
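# A toy illustration of token_match, with a minimal stand-in for the sentence objects
# (only a .words() method is assumed) and the same flatten helper in scope:
class Sent(object):
    def __init__(self, words):
        self._words = words
    def words(self):
        return self._words

susp = [Sent(['the', 'cat', 'sat'])]
src  = [Sent(['a', 'cat', 'sat', 'down'])]
# shared tokens {cat, sat} over 4 distinct source tokens:
# token_match(susp, src) == 0.5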
Пример #51
0
def plotload(plotname,work,specfile=None,choice_override=None,use_group=False):

	"""
	Load postprocessing data for making a plot.
	Note that we currently do not use the specs items.
	"""

	#---read plot specification
	if 0:
		if not specfile: specfile = work.paths['specs_file']
		#---load the yaml specifications file
		if type(specfile)==str: specfile = [specfile]
		raw_specs = ''
		for sfn in specfile: 
			with open(sfn,'r') as fp: raw_specs += '\n'+fp.read()
		specs = yaml.load(raw_specs)
	specs = work.load_specs()

	#---merge automatic plots here
	if 0:
		if 'autoplots' in specs:
			for key,val in specs['autoplots'].items():
				if key in specs['plots']: 
					raise Exception('\n[ERROR] redundant names in plots and autoplots: %s'%key+
						", which is populated with django so check calculator.Calculation")
				else: specs['plots'][key] = deepcopy(val)
	plotspecs = specs['plots'][plotname]

	#---load the calculation from the workspace
	calcnames = plotspecs['calculation']

	if type(calcnames)==str: calcnames = [calcnames]
	datasets = {name:[] for name in calcnames}
	calcsets = {name:[] for name in calcnames}
	
	#---loop over calcnames requested in the plot specs
	for calcname in calcnames:
		
		calcs = work.interpret_specs(work.calc[calcname])
		if len(calcs)==0: raise Exception('[ERROR] failed to retrieve calculations')
	
		#---get the group from either plotspecs or the calculation or exception
		if 'group' in plotspecs: group = plotspecs['group']
		elif 'group' in work.calc[calcname]: group = work.calc[calcname]['group']
		else: group = None
		#---get the collection from either plotspecs or the upstream calculation
		if 'collections' in plotspecs: 
			collections = plotspecs['collections']
			if type(collections)==str: collections = [collections]
		else: collections = work.calc[calcname]['collections']
		sns = flatten([work.vars['collections'][c] for c in collections])

		#---compile all upstream data
		data = [{} for c in calcs]
	
		#---iterate over the loop over upstream calculations
		for calcnum,calcwhittle in enumerate(calcs):

			status('upstream data type: %s'%str(calcwhittle),tag='load')
			calc = deepcopy(work.calc[calcname])
			#---loop over simulations 
			for snum,sn in enumerate(sns):
				status(sn.ljust(26),tag='load',i=snum,looplen=len(sns))
				#---slices in plotspecs or lookup from variables with plus-syntax
				#---! need to allow blank slices here so that the machine looks to calcs to get them
				if 'slices' in plotspecs and not re.match('^\+',plotspecs['slices']): 
					work.cursor = (work.c,work.trajectory_format)
					sl = work.slice(sn)[plotspecs['slices']]['all' if not group else group]
				elif 'slices' in plotspecs: 
					sl = deepcopy(delve(work.vars,*plotspecs['slices'].strip('+').split('/')))
					#---the slice might not have a filekey if its a combo
					if 'filekey' not in sl:
						#---! pbc and groups will usually be absent here
						start,end,skip,pbc = [sl[i] for i in 'start,end,skip,pbc'.split(',')]
						sl['filekey'] = '%s.%d-%d-%d'%(work.prefixer(sn),start,end,skip)
				else: raise Exception('[ERROR] cannot infer slices')
				#---compute base filename
				if not group: 
					#---! deprecated
					fn_base = re.findall('^v[0-9]+\.[0-9]+-[0-9]+-[0-9]+',sl['filekey'])[0]+'.%s'%calcname
				elif use_group:
					#---special settings here for loading certain kinds of data eg protein_abstractor
					fn_base = '%s.%s.pbc%s.%s'%(sl['filekey'],group,sl['pbc'],calcname)
				else: fn_base = '%s.%s'%(sl['filekey'],calcname)
				#---fill in upstream details in our replicate of the calculation specs
				for route,val in [(i,j) for i,j in catalog(calcwhittle)]:
					#---! the plot has to mimic the specs structure exactly otherwise error below
					try: endpoint = delve(work.calc[calcname],*route)
					except:
						import pdb;pdb.set_trace()
					if type(endpoint)==dict and 'loop' in endpoint: 
						try: 
							penultimate = delve(calc,*route[:-1])
							penultimate[route[-1]] = val
						except: pass
				#---get the dat file and package it
				fn = work.select_postdata(fn_base,calc,debug=True)
				if fn is None: 
					print '[ERROR] cannot locate a file necessary for plotting via work.select_postdata\n'+\
						'[ERROR] you probably need to fix your meta.yaml file and run "make compute"\n'+\
						'[ERROR] check the post directory at "%s" and the variables fn_base,calc\n'%work.postdir
					import pdb;pdb.set_trace()
				dat_fn = os.path.basename(fn)[:-4]+'dat'
				data[calcnum][sn] = {'data':load(dat_fn,work.postdir),
					'slice':sl,'group':group,'fn_base':fn_base}
		#---if only one calculation of this type then we elevate package
		if len(calcs)==1: calcs,data = calcs[0],data[0]
		datasets[calcname],calcsets[calcname] = data,calcs
	#---if only one upstream calculation we return that directly
	if len(datasets)==1: return datasets.values()[0],calcsets.values()[0]
	else: return datasets,calcsets
Пример #52
0
    def __generic_first_train(self,
                              p_or_n,
                              text_features,
                              iob_labels,
                              do_grid=False):
        '''
        Model::__generic_first_train()

        Purpose: Train that works for both prose and nonprose

        @param p_or_n.        <string> either "prose" or "nonprose"
        @param text_features. <list-of-lists> of feature dictionaries
        @param iob_labels.    <list> of "I", "O", and "B" labels
        @param do_grid.       <boolean> indicating whether to perform grid search
        '''

        # Must have data to train on
        if len(text_features) == 0:
            raise Exception('Training must have %s training examples' % p_or_n)

        # Vectorize IOB labels
        Y_labels = [IOB_labels[y] for y in iob_labels]

        # Save list structure to reconstruct after vectorization
        offsets = save_list_structure(text_features)

        if globals_cliner.verbosity > 0:
            print '\tvectorizing features (pass one) ' + p_or_n

        #X = reconstruct_list(flatten(text_features), offsets)
        #Y = reconstruct_list(        Y_labels      , offsets)
        #for a,b in zip(X,Y):
        #    for x,y in zip(a,b):
        #        print y
        #        #print filter(lambda t:t[0]=='word', x.keys())
        #        print x.keys()
        #        print
        #    print '\n\n\n'

        # Vectorize features
        dvect = DictVectorizer()
        X_feats = dvect.fit_transform(flatten(text_features))

        # CRF needs reconstructed lists
        if self._crf_enabled:
            X_feats = reconstruct_list(list(X_feats), offsets)
            Y_labels = reconstruct_list(Y_labels, offsets)
            lib = crf
        else:
            lib = sci

        if globals_cliner.verbosity > 0:
            print '\ttraining classifiers (pass one) ' + p_or_n

        #for i,X in enumerate(X_feats):
        #    for j,x in enumerate(X):
        #        print x, '\t', Y_labels[i][j]
        #    print
        #exit()

        # Train classifier
        clf = lib.train(X_feats, Y_labels, do_grid)

        return dvect, clf
Пример #53
0
_spex = _home + '/proj/mdwarfs/templates/spex/'
spex_fns = [line.strip() for line in os.popen('ls %s*.fits' % _spex)]
sptstr = [os.path.split(el)[1].split('_')[0] for el in spex_fns]
spts = np.array([10*int(el[0]=='M')+int(el[1])-10 for el in sptstr])
spex_dats = []
for fn in spex_fns:
    temp = pyfits.getdata(fn)
    nir = (temp[0] > 0.9) * (temp[0] < 2.4)
    dw = np.diff(temp[0,nir]).mean()
    xkern = np.arange(-40, 40)
    kern = an.gaussian([1., 1./(res*dw), 0, 0], xkern)
    temp[1] = np.convolve(temp[1], kern, 'same')
    spex_dats.append(temp)

# Sort filenames:
final_fns = np.array(tools.flatten([[line.strip() for line in os.popen('ls %s*final.fits' % (proc))] for proc in procs]))
ai = np.argsort([os.path.split(fn)[1] for fn in final_fns])
final_fns = final_fns[ai]

w_ignore = [0, .91], [1.33, 1.45], [1.75, 2.04], [2.4, 999]
nobs = len(final_fns)
nmod = len(spex_fns)
chisq = np.zeros((nobs, nmod), dtype=float) + 9e99

for ii in range(nobs):
    obs0 = pyfits.getdata(final_fns[ii])
    obs0[2] = np.sqrt(obs0[2]**2 + (obs0[1]/maxsnr)**2)
    wbins = 0.5*(obs0[0,1:] + obs0[0,0:-1])
    wbins = np.concatenate((wbins[0]-np.diff(wbins[0:2]), wbins, wbins[-1]+np.diff(wbins[-2:])))
    bestfit, bestmod = 0, 0
    for jj in range(nmod):
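# an.gaussian is project-specific; a rough numpy equivalent of the smoothing step above,
# assuming a unit-area Gaussian of width 1/(res*dw) sampled on the same +/-40-pixel window:
import numpy as np

def gaussian_kernel(sigma, half_width=40):
    x = np.arange(-half_width, half_width)
    k = np.exp(-0.5 * (x / float(sigma)) ** 2)
    return k / k.sum()          # normalize to unit area

# flux_smoothed = np.convolve(flux, gaussian_kernel(1.0 / (res * dw)), 'same')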
Пример #54
0
def densityOrderedSDRPlot(activeColumns):
    #Make sure inputs are numpy arrays
    activeColumns = numpy.asarray(activeColumns)
    
    
    ##Transform the input values into x and y coordinates
    
    #for each active bit, the x value is the time when it was active
    xValues = []
    for t in xrange(len(activeColumns)):
        xValues.append([t]*len(activeColumns[t]))
    xValues = numpy.fromiter(flatten(xValues),int) #not sure if int is the right thing to use here

    #Calculate y values 
    #for each active bit the y value is the number of times that bit is active over the whole time range
    from collections import Counter
    from operator import itemgetter

    inputIndexes = list(flatten(activeColumns))

    #get a list of tuples - [(columnIndex,count),(columnIndex, count),...]
    c=Counter(inputIndexes).items()
    #sort this list by column index first
    s=sorted(c,key=itemgetter(0))
    #then sort the sorted list by count (so columns that have the same count will be sorted by index)
    sortedCounts = sorted(s,key=itemgetter(1),reverse=True)
    #reassign the count number in each tuple to be the position in the list
    d=[]
    for i in xrange(len(sortedCounts)):
        item = list(sortedCounts[i])
        item[1] = i
        d.append(item)
    columnRank = dict(d)

    yValues = numpy.asarray([columnRank[index] for index in inputIndexes])
    
    
    
    #Calculate horizontal grid lines
    #Each number next to a horizontal gridline shows the number of activations in the band above
    #split up plot into bands of the same activation count ie group the 'columns' by activation count
    yTickPositions = []
    lastValue = None
    for i in xrange(len(sortedCounts)):
        currentValue = sortedCounts[i][1]
        if currentValue != lastValue:
            yTickPositions.append(i)
        lastValue = currentValue
    yTickPositions = numpy.asarray(yTickPositions) - 0.5 #the y ticks should divide the columns into groups, so they go between the lines, hence the -0.5 offset
    
    
    
    ##Plot the x and y coordinates
    
    #set up plot
    (figure,axes) = plt.subplots()
    #plot the values
    axes.plot(xValues,yValues,'.')

    #display the horizontal grid lines that go through each line of markers
    axes.minorticks_on()
    axes.grid(True,axis='both',which='minor',linestyle='solid',color=(0.7,0.7,0.7))
    axes.yaxis.set_minor_locator(matplotlib.ticker.MultipleLocator(1))

    #display the horizontal lines that separate the columns into activation amounts
    axes.grid(True,axis='y',which='major',linestyle='solid',color=(0.3,0.3,0.3))
    axes.set_yticks(yTickPositions)
    axes.set_yticklabels(numpy.unique([item[1] for item in sortedCounts])[::-1])
    
    #draw the grid lines beneath the points
    axes.set_axisbelow(True)

    #Add a gap to the bottom and left edges so all points can be seen
    axes.set_xlim(left=-1)
    axes.set_ylim(bottom=-1)

    #set plot aspect ratio so the axes are even. Make the plot bigger (the plot expands to the maximum size allowed by aspect ratio)
    axes.set_aspect(aspect='equal')
    figure.set_size_inches(20,20)

    return (figure, figure.axes)
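# A hypothetical call, assuming the snippet's numpy/matplotlib/flatten imports:
# activeColumns is a list of per-timestep lists of active column indices (equal-length
# lists here so numpy.asarray yields a regular 2-D array).
activeColumns = [[3, 7, 12], [3, 12, 20], [7, 12, 20]]
figure, axes = densityOrderedSDRPlot(activeColumns)
figure.savefig('sdr_density_plot.png')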
Пример #55
0
 def add_persons(*persons):
     """Add persons to the people collection
     """
     persons_list = tools.flatten(list(persons))
     for person in persons_list:
         Amity.add_person(person)
Пример #56
0
def generic_predict(p_or_n, tokenized_sents, vocab, clf, use_lstm,
                    hyperparams):
    '''
    generic_predict()

    Train a model that works for both prose and nonprose

    @param p_or_n.          A string that indicates "prose", "nonprose", or "all"
    @param tokenized_sents. A list of sentences, where each sentence is tokenized
                              into words
    @param vocab.           A dictionary mapping word tokens to numeric indices.
    @param clf.             An encoding of the trained keras model.
    @param use_lstm.        Bool indicating whether clf is a CRF or LSTM.
    @param hyperparams.     Additional hyperparameters for the model.
    '''
    # use_lstm=self._use_lstm
    if use_lstm:

        #parameters=hd.load_parameters_from_file("LSTM_parameters.txt")
        parameters['use_pretrained_model'] = True

        #model_folder="./models/NN_models"
        predictions = []
        sys.stdout.write('\n use_lstm \n')
        dataset = Exp.Dataset()

        fictional_labels = copy.deepcopy(tokenized_sents)
        for idx, x in enumerate(fictional_labels):
            for val_id, value in enumerate(x):
                fictional_labels[idx][val_id] = 'O'

        Datasets_tokens = {}
        Datasets_labels = {}

        Datasets_tokens['deploy'] = tokenized_sents
        Datasets_labels['deploy'] = fictional_labels

        token_to_vector = dataset.load_dataset(
            Datasets_tokens,
            Datasets_labels,
            "",
            parameters,
            token_to_vector=tokens_to_vec,
            pretrained_dataset=pretrained_dataset)

        print(dataset.token_indices.keys())

        parameters['Feature_vector_length'] = dataset.feature_vector_size
        parameters['use_features_before_final_lstm'] = False

        dataset.update_dataset("", ['deploy'], Datasets_tokens,
                               Datasets_labels)

        del Datasets_tokens
        del Datasets_labels

        #model=current_model
        model = entity_model.EntityLSTM(dataset, parameters)

        os.mkdir(parameters['conll_like_result_folder'])

        test_temp = os.path.join(parameters['conll_like_result_folder'],
                                 'test/')
        train_temp = os.path.join(parameters['conll_like_result_folder'],
                                  'train/')
        valid_temp = os.path.join(parameters['conll_like_result_folder'],
                                  'valid/')

        os.mkdir(test_temp)
        os.mkdir(train_temp)
        os.mkdir(valid_temp)

        sess = tf.Session()
        with sess.as_default():

            #model=entity_model.EntityLSTM(dataset,parameters)
            transition_params_trained = model.restore_from_pretrained_model(
                parameters,
                dataset,
                sess,
                token_to_vector=token_to_vector,
                pretrained_dataset=pretrained_dataset)
            del token_to_vector
            predictions = training_predict_LSTM.prediction_step(
                sess, dataset, "deploy", model, 0,
                parameters['conll_like_result_folder'],
                transition_params_trained)
            sess.close()

        tf.reset_default_graph()

        shutil.rmtree(parameters['conll_like_result_folder'])
        return predictions, model

    # If nothing to predict, skip actual prediction
    if len(tokenized_sents) == 0:
        sys.stdout.write('\tnothing to predict %s\n' % p_or_n)
        return []

    sys.stdout.write('\tvectorizing words %s\n' % p_or_n)

    if use_lstm:
        print('todo: incorporate lstm')
        # vectorize tokenized sentences
        #X = []
        #for sent in tokenized_sents:
        #   id_seq = []
        #   for w in sent:
        #      if w in vocab:
        #           id_seq.append(vocab[w])
        #       else:
        #        id_seq.append(vocab['oov'])
        #  X.append(id_seq)
    else:
        from feature_extraction.features import extract_features

        # vectorize validation X
        text_features = extract_features(tokenized_sents)
        flat_X_feats = vocab.transform(flatten(text_features))
        X = reconstruct_list(flat_X_feats, save_list_structure(text_features))

    sys.stdout.write('\tpredicting  labels %s\n' % p_or_n)

    # Predict labels
    if use_lstm:
        print("TEST_PREDICT")
        exit()

    else:
        from machine_learning import crf
        predictions = crf.predict(clf, X)

    # Format labels from output
    return predictions
Пример #57
0
def generic_train(p_or_n,
                  train_sents,
                  train_labels,
                  use_lstm,
                  val_sents=None,
                  val_labels=None,
                  test_sents=None,
                  test_labels=None,
                  dev_split=None):
    '''
    generic_train()

    Train a model that works for both prose and nonprose

    @param p_or_n.         A string that indicates "prose", "nonprose", or "all"
    @param train_sents.    A list of sentences; each sentence is tokenized into words
    @param train_labels.   Parallel to `train_sents`, 7-way labels for concept spans
    @param use_lstm        Bool indicating whether to train CRF or LSTM.
    @param val_sents.      Validation data. Same format as train_sents
    @param val_labels.     Validation data. Same format as train_labels
    @param test_sents.     Test data. Same format as train_sents
    @param test_labels.    Test data. Same format as train_labels
    @param dev_split.      A real number from 0 to 1
    '''

    # Must have data to train on:
    if len(train_sents) == 0:
        raise Exception('Training must have %s training examples' % p_or_n)

    # if you should split the data into train/dev yourself
    if (not val_sents) and (dev_split > 0.0) and (len(train_sents) > 10):

        p = int(dev_split * 100)
        sys.stdout.write('\tCreating %d/%d train/dev split\n' % (100 - p, p))

        perm = list(range(len(train_sents)))
        random.shuffle(perm)

        train_sents = [train_sents[i] for i in perm]
        train_labels = [train_labels[i] for i in perm]

        ind = int(dev_split * len(train_sents))

        val_sents = train_sents[:ind]
        train_sents = train_sents[ind:]

        val_labels = train_labels[:ind]
        train_labels = train_labels[ind:]
    else:
        sys.stdout.write('\tUsing existing validation data\n')

    sys.stdout.write('\tvectorizing words %s\n' % p_or_n)

    if use_lstm:
        print("TESTING NEW DATSET OBJECT")
        dataset = Exp.Dataset()

        parameters = hd.load_parameters_from_file("LSTM_parameters.txt")
        parameters['use_pretrained_model'] = False

        Datasets_tokens = {}
        Datasets_labels = {}

        Datasets_tokens['train'] = train_sents
        Datasets_labels['train'] = train_labels

        if val_sents != None:
            Datasets_tokens['valid'] = val_sents
            Datasets_labels['valid'] = val_labels

        if test_sents != None:
            Datasets_tokens['test'] = test_sents
            Datasets_labels['test'] = test_labels

        dataset.load_dataset(Datasets_tokens, Datasets_labels, "", parameters)
        pickle.dump(
            dataset,
            open(os.path.join(parameters['model_folder'], 'dataset.pickle'),
                 'wb'))

        print(Datasets_tokens['valid'][0])
        print(Datasets_tokens['test'][0])

        parameters['Feature_vector_length'] = dataset.feature_vector_size
        parameters['use_features_before_final_lstm'] = False
        parameters['learning_rate'] = 0.005

        sess = tf.Session()
        number_of_sent = list(range(len(dataset.token_indices['train'])))

        with sess.as_default():
            model = entity_model.EntityLSTM(dataset, parameters)
            sess.run(tf.global_variables_initializer())
            model.load_pretrained_token_embeddings(sess, dataset, parameters)
            epoch_number = -1
            transition_params_trained = np.random.rand(5 + 2, 5 + 2)
            values = {}
            values["best"] = 0

            f1_dictionary = {}
            f1_dictionary['best'] = 0

            model_saver = tf.train.Saver(max_to_keep=100)

        print("START TRAINING")

        eval_dir = os.path.join(
            tmo_dir, 'cliner_eval_%d' % random.randint(0, 256) + os.sep)
        parameters['conll_like_result_folder'] = eval_dir

        test_temp = os.path.join(parameters['conll_like_result_folder'],
                                 'test/')
        train_temp = os.path.join(parameters['conll_like_result_folder'],
                                  'train/')
        valid_temp = os.path.join(parameters['conll_like_result_folder'],
                                  'valid/')

        os.mkdir(parameters['conll_like_result_folder'])
        os.mkdir(test_temp)
        os.mkdir(train_temp)
        os.mkdir(valid_temp)

        while epoch_number < 90:
            average_loss_per_phrase = 0
            accuracy_per_phase = 0
            step = 0

            epoch_number += 1
            if epoch_number != 0:
                sequence_numbers = list(
                    range(len(dataset.token_indices['train'])))
                random.shuffle(sequence_numbers)
                for sequence_number in sequence_numbers:
                    loss, accuracy, transition_params_trained = training_predict_LSTM.train_step(
                        sess, dataset, sequence_number, model)
                    average_loss_per_phrase += loss
                    accuracy_per_phase += accuracy
                    step += 1
                    if step % 10 == 0:
                        print('Training {0:.2f}% done\n'.format(
                            step / len(sequence_numbers) * 100))

                model_saver.save(
                    sess,
                    os.path.join(parameters['model_folder'],
                                 'model_{0:05d}.ckpt'.format(epoch_number)))

                total_loss = average_loss_per_phrase
                total_accuracy = accuracy_per_phase

                average_loss_per_phrase = average_loss_per_phrase / len(
                    number_of_sent)
                accuracy_per_phase = accuracy_per_phase / len(number_of_sent)

            if epoch_number > 0:
                ""
                f1, predictions = training_predict_LSTM.prediction_step(
                    sess, dataset, "test", model, epoch_number,
                    parameters['conll_like_result_folder'],
                    transition_params_trained)
                f1_train, _ = training_predict_LSTM.prediction_step(
                    sess, dataset, "train", model, epoch_number,
                    parameters['conll_like_result_folder'],
                    transition_params_trained)
                f1_valid, _ = training_predict_LSTM.prediction_step(
                    sess, dataset, "valid", model, epoch_number,
                    parameters['conll_like_result_folder'],
                    transition_params_trained)

                correctly_predicted_tokens = training_predict_LSTM.compute_train_accuracy(
                    parameters['conll_like_result_folder'] + "valid" + os.sep +
                    "epoche_" + str(epoch_number) + ".txt")

                if f1_dictionary['best'] < float(f1_valid):
                    f1_dictionary['epoche'] = epoch_number
                    f1_dictionary['best'] = float(f1_valid)

                if values["best"] < correctly_predicted_tokens:
                    values["epoche"] = epoch_number
                    values["best"] = correctly_predicted_tokens

                #print ("Number of correctly predicted tokens -test "+str(correctly_predicted_tokens))

                print("NEW EPOCHE" + " " + str(epoch_number))

                print("Current F1 on train" + " " + str(f1_train))
                print("Current F1 on valid" + " " + str(f1_valid))
                print("Current F1 on test" + " " + str(f1))

                print("Current F1 best (validation): ")
                print(f1_dictionary)

        shutil.rmtree(parameters['conll_like_result_folder'])
        return parameters, dataset, f1_dictionary['best']

    else:
        ########
        # CRF
        ########

        # vectorize tokenized sentences
        text_features = extract_features(train_sents)
        # type(text_features): <type 'list'>

        # Collect list of feature types
        enabled_features = set()
        for sf in text_features:
            for wf in sf:
                for (feature_type, instance), value in wf.items():
                    if feature_type.startswith('prev'):
                        feature_type = 'PREV*'
                    if feature_type.startswith('next'):
                        feature_type = 'NEXT*'
                    enabled_features.add(feature_type)
        enabled_features = sorted(enabled_features)

        # Vectorize features
        vocab = DictVectorizer()
        flat_X_feats = vocab.fit_transform(flatten(text_features))
        X_feats = reconstruct_list(flat_X_feats,
                                   save_list_structure(text_features))

        # vectorize IOB labels
        Y_labels = [[tag2id[y] for y in y_seq] for y_seq in train_labels]

        assert len(X_feats) == len(Y_labels)
        for i in range(len(X_feats)):
            assert X_feats[i].shape[0] == len(Y_labels[i])

        # default to no validation data (crf.train is assumed to accept None here)
        val_X, val_Y = None, None

        # if there is specified validation data, then vectorize it
        if val_sents:
            # vectorize validation X
            val_text_features = extract_features(val_sents)
            flat_val_X_feats = vocab.transform(flatten(val_text_features))
            val_X = reconstruct_list(flat_val_X_feats,
                                     save_list_structure(val_text_features))
            # vectorize validation Y
            val_Y = [[tag2id[y] for y in y_seq] for y_seq in val_labels]

        # if there is specified test data, then vectorize it
        if test_sents:
            # vectorize test X
            test_text_features = extract_features(test_sents)
            flat_test_X_feats = vocab.transform(flatten(test_text_features))
            test_X = reconstruct_list(flat_test_X_feats,
                                      save_list_structure(test_text_features))
            # vectorize test Y
            test_Y = [[tag2id[y] for y in y_seq] for y_seq in test_labels]
        else:
            test_X = None
            test_Y = None

    sys.stdout.write('\ttraining classifiers %s\n' % p_or_n)

    if use_lstm:
        # train using lstm
        clf, dev_score = keras_ml.train(X_seq_ids,
                                        Y_labels,
                                        tag2id,
                                        len(vocab),
                                        val_X_ids=val_X,
                                        val_Y_ids=val_Y,
                                        test_X_ids=test_X,
                                        test_Y_ids=test_Y)
    else:
        # train using crf
        clf, dev_score = crf.train(X_feats,
                                   Y_labels,
                                   val_X=val_X,
                                   val_Y=val_Y,
                                   test_X=test_X,
                                   test_Y=test_Y)

    return vocab, clf, dev_score, enabled_features
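# A standalone restatement of the shuffle-and-split step used by the generic_train
# variants above (a sketch; the originals perform the split in place rather than
# through a helper function):
import random

def train_dev_split(sents, labels, dev_split, seed=None):
    # shuffle the parallel lists together, then carve off the first
    # dev_split fraction as the dev set
    rng = random.Random(seed)
    perm = list(range(len(sents)))
    rng.shuffle(perm)
    sents  = [sents[i]  for i in perm]
    labels = [labels[i] for i in perm]
    ind = int(dev_split * len(sents))
    return sents[ind:], labels[ind:], sents[:ind], labels[:ind]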
Пример #58
0
def computer(function,**kwargs):

	"""
	Compute function figures out how to run a calculation over a simulation.
	"""

	work = kwargs['workspace']
	calc = kwargs['calc']
	
	#---perform a calculation over all collections
	if 'collections' in calc: 
		cols = tuple([calc['collections']]) if type(calc['collections'])==str else calc['collections']
		sns = unique(flatten([work.vars['collections'][i] for i in cols]))
	else: sns = work.sns()
	
	#---get slices (required)
	slice_name = calc['slice_name']
	group = calc['group'] if 'group' in calc else None
	
	#---pass data to the function according to upstream data type
	incoming_type = calc['uptype']
	jobs,data = [],dict([(sn,{}) for sn in sns])
	combined_slices = []
	for sn in sns:
		new_job = {'sn':sn,'slice_name':slice_name,'group':group}
		if incoming_type == 'simulation':
			#---prepare combinations in a dictionary
			if slice_name not in work.slice(sn):
				raise Exception(
					'\n[ERROR] the slices yaml file is missing a slice named "%s" for simulation "%s"'%
					(slice_name,sn))
			try: mfp = work.slice(sn)[slice_name][group]['missing_frame_percent']
			except: 
				print "[WARNING] no missing frame percentage here"
				mfp = 0.0
			if mfp>work.missing_frame_tolerance:
				status('upstream slice failure: %s,%s,%s missing_frame_percent=%.1f'%(
					sn,slice_name,group,mfp),tag='warning')
				continue
			#---defaulting to 'all' group if group is None
			new_job['grofile'] = work.postdir+\
				work.slice(sn)[slice_name][group if group else 'all']['gro']
			#---! xtc must become a flag. recommend 'xtc' becomes work.cursor[1]
			#---defaulting to 'all' group if group is None
			new_job['trajfile'] = work.postdir+work.slice(sn)[slice_name][group if group else 'all']['xtc']
		if 'specs' not in calc: calc['specs'] = ''
		if 'upstream' in calc['specs']:
			#---if no loop on upstream you can use a list
			if type(calc['specs']['upstream'])==list: 
				upstream_ask = dict([(key,None) for key in calc['specs']['upstream']])
			elif type(calc['specs']['upstream'])==str: 
				upstream_ask = {calc['specs']['upstream']:None}
			else: upstream_ask = calc['specs']['upstream']
			for key,val in upstream_ask.items():
				upspecs = deepcopy(work.calc[key])
				#---identify the list of particular options along with the stubs
				options,stubs = work.interpret_specs(upspecs,return_stubs=True)
				#---identify paths and values over which we "whittle" the total list of specs
				whittles = [(i,j) for i,j in catalog(val)]
				#---if no loop on upstream pickles we interpret none and send blank specs
				if val in ['None','none',None]: 
					specs = [options[ss] for r,v in whittles for ss,s in enumerate(stubs)]
				else:
					#---select the correct option by matching all catalogued routes from the incoming
					#---...key to the original calculation
					specs = [options[ss] for r,v in whittles for ss,s in enumerate(stubs) 
						if delve(s['specs'],*r)==v]
				if len(specs)!=1 and 'loop' not in upspecs['slice_name']: 
					import pdb;pdb.set_trace()
					raise Exception('[ERROR] redundant upstream selection %s'%str(specs))
				#---if there are multiple slices
				#---! note that we expect that if slice_names is a list it will be ordered here too
				for slicenum,spec in enumerate(specs):
					#---if the upstream calculation has a group then use it in the filename
					if not group:
						if 'group' in work.calc[key]: upgroup = work.calc[key]['group']
						else: upgroup = None
					else: upgroup = group
					if not upgroup: 
						sl = work.slice(sn)[spec['slice_name']]
						fn_base = re.findall('^v[0-9]+\.[0-9]+-[0-9]+-[0-9]+',
							work.slice(sn)[upspecs['slice_name']]['all']['filekey']
							)[0]+'.%s'%key
					else: 
						sl = work.slice(sn)[spec['slice_name']][upgroup]
						fn_base = '%s.%s'%(sl['filekey'],key)
					#---! moved the following block left recently
					fn = work.select_postdata(fn_base,spec)
					if not fn: 
						print '[ERROR] missing %s'%fn
						import pdb;pdb.set_trace()
					outkey = key if len(specs)==1 else '%s%d'%(key,slicenum)
					#---before each calculation the master loop loads the filename stored here
					data[sn][outkey] = os.path.basename(fn)[:-4]+'dat'
			new_job['upstream'] = data[sn].keys()
		jobs.append(new_job)
	
	#---master loop
	for outgoing in jobs:
		sn,slice_name,group = outgoing['sn'],outgoing['slice_name'],outgoing['group']
		
		#---if we combine slices for this calculation we use the whole time span in the base filename
		if type(slice_name)==list:
			#---! simple method for making the combination file key
			start = min([work.slice(sn)[s]['all' if not group else group]['start'] for s in slice_name])
			end = max([work.slice(sn)[s]['all' if not group else group]['end'] for s in slice_name])
			skip = work.slice(sn)[s]['all' if not group else group]['skip']
			#---! this filekey construction means the user will have to anticipate the names of combos
			fn_base = '%s.%d-%d-%d.%s'%(work.prefixer(sn),start,end,skip,function.__name__)
		else:
			#---we index all calculations automatically in case we loop over specs later
			index,fn_key = -1,''
			if not group:
				fn_base = re.findall('^v[0-9]+\.[0-9]+-[0-9]+-[0-9]+',
					work.slice(sn)[slice_name][
					'all' if not group else group]['filekey'])[0]+'.%s'%function.__name__
			else:
				try: fn_base = work.slice(sn)[slice_name][
					'all' if not group else group]['filekey']+'.%s'%function.__name__
				except:
					print "no group and cannot get base filename"
					import pdb;pdb.set_trace()
		prev = glob.glob(work.postdir+fn_base+'*.dat')
		if prev == []: index = 0
		else: index = max(map(lambda x:int(re.findall('^.+\/%s\.n([0-9]+)\.dat'%fn_base,x)[0]),prev))+1
		fn_key = '.n%d'%index
		fn = fn_base+fn_key+'.dat'
		#---safety check for file errors to prevent overwriting however this should be handled by indices
		if os.path.isfile(work.postdir+fn): raise Exception('[ERROR] %s exists'%(work.postdir+fn))
		
		#---check for specs file with the exact same specifications
		exists = True if index != -1 and work.select_postdata(fn_base,calc) != None else False
		if not exists:
			#---! leftover debug breakpoint disabled: import ipdb;ipdb.set_trace()
			status("%s %s"%(function.__name__,str(outgoing)),tag='compute')
			outgoing['workspace'] = work
			outgoing['calc'] = calc
			if 'upstream' in outgoing:
				sn = outgoing['sn']
				outgoing['upstream'] = dict([(k,
					load(data[sn][k],work.postdir)) for k in outgoing['upstream']])
			result,attrs = function(**outgoing)
			"""
			spec files are carefully constructed
			they prevent redundant calculations
			they allow us to loop over many parameters while saving files with a single index
			the calculation dictionary in the specs file contains meta-parameters for looping
			we are careful not to save meta parameters to the spec file
			we only save parameters which are relevant to the calculation itself
			the calculation dictionary in the spec file must therefore separate these parameters
			in a sub-dictionary called 'specs'
			we prefer attrs to be small and specific
			since attrs is also used to uniquely specify the data
			all big data should be stored as a result via numpy
			"""
			#---if any calculation specifications are not in attributes we warn the user here
			if 'specs' in calc: unaccounted = [i for i in calc['specs'] if i not in attrs]
			else: unaccounted = []
			if 'upstream' in unaccounted and 'upstream' not in attrs: 
				status('automatically appending upstream data',tag='status')
				unaccounted.remove('upstream')
				attrs['upstream'] = calc['specs']['upstream']
			if any(unaccounted):
				print computer_error_attrs_passthrough+'\n\n'
				status('some calculation specs were not saved: %s'%
					str(unaccounted),tag='STATUS')
				import pdb;pdb.set_trace()
			store(result,fn,work.postdir,attrs=attrs)
			with open(work.postdir+fn_base+fn_key+'.spec','w') as fp: fp.write(json.dumps(attrs)+'\n')
	#---no modifications to work so no save
	return
Пример #59
0
def link_objs(objs, output):
    # Link objects into the output program
    if tools.depends(output, objs):
        arr = tools.flatten([LD, '-r', objs, '-o', output])
        tools.pprint('LD', output)
        tools.call(arr)
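# For illustration, assuming tools.flatten collapses one level of nesting, the argument
# vector handed to tools.call would look like this (LD and the object names are hypothetical):
def flatten_once(nested):
    out = []
    for item in nested:
        if isinstance(item, (list, tuple)):
            out.extend(item)
        else:
            out.append(item)
    return out

LD = 'ld'
arr = flatten_once([LD, '-r', ['a.o', 'b.o'], '-o', 'prog'])
# ['ld', '-r', 'a.o', 'b.o', '-o', 'prog']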