示例#1
0
def load_external_labels(session, candidate_class, annotator_name='gold'):
    """Insert gold labels from the TSV at ``FPATH`` as ``StableLabel`` rows.

    Every row contributes two keys, ``person1~~person2`` and
    ``person2~~person1``, both carrying the row's ``label`` value.  A key is
    only added when no ``StableLabel`` with the same key and annotator name
    is found, so re-running the function does not duplicate rows.  After the
    commit, ``reload_annotator_labels`` is called for splits 1 and 2.

    :param session: database session used to query, add and commit
    :param candidate_class: forwarded to ``reload_annotator_labels``
    :param annotator_name: annotator name stored on every label
    """
    table = pd.read_csv(FPATH, sep="\t")

    for _, record in table.iterrows():
        first, second = record['person1'], record['person2']

        # The relation is symmetric, so register both orderings of the pair.
        for ordered_pair in ((first, second), (second, first)):
            stable_key = "~~".join(ordered_pair)
            matches = (
                session.query(StableLabel)
                .filter(StableLabel.context_stable_ids == stable_key)
                .filter(StableLabel.annotator_name == annotator_name)
                .count()
            )
            # Already present (e.g. the cell was executed before): skip it.
            if matches != 0:
                continue
            session.add(StableLabel(
                context_stable_ids=stable_key,
                annotator_name=annotator_name,
                value=record['label']))

    session.commit()

    # Reload annotator labels for splits 1 and 2.
    for split_id in (1, 2):
        reload_annotator_labels(session, candidate_class, annotator_name,
                                split=split_id, filter_label_split=False)
示例#2
0
def load_external_labels(session,
                         candidate_class,
                         split,
                         annotator_name='gold'):
    """Insert gold labels from the TSV at ``FPATH`` and reload one split.

    Each row is keyed by ``virus~~host``.  A ``StableLabel`` is added only
    when no label with that key and annotator name is found, so the function
    can be executed repeatedly.  After the commit,
    ``reload_annotator_labels`` is called for ``split``.

    :param session: database session used to query, add and commit
    :param candidate_class: forwarded to ``reload_annotator_labels``
    :param split: split forwarded to ``reload_annotator_labels``
    :param annotator_name: annotator name stored on every label
    """
    gold_labels = pd.read_csv(FPATH, sep="\t")

    # NOTE: an earlier version loaded every candidate of ``split`` here but
    # never used the result; that query has been dropped.

    for _, row in gold_labels.iterrows():
        context_stable_ids = "~~".join([row['virus'], row['host']])

        # Check whether the label already exists, in case this was already
        # executed against the same database.
        query = session.query(StableLabel).filter(
            StableLabel.context_stable_ids == context_stable_ids)
        query = query.filter(StableLabel.annotator_name == annotator_name)

        # If the label doesn't exist, add it to the session.
        if query.count() == 0:
            session.add(
                StableLabel(context_stable_ids=context_stable_ids,
                            annotator_name=annotator_name,
                            value=row['label']))

    # Commit session
    session.commit()

    # Reload annotator labels
    reload_annotator_labels(session,
                            candidate_class,
                            annotator_name,
                            split=split,
                            filter_label_split=False)
示例#3
0
def reload_external_labels(session: SnorkelSession,
                           input_file: Union[str, Path],
                           annotator_name: str = "gold"):
    """Load labels from a JSON file into ``StableLabel`` and reload them.

    The file is parsed with ``ujson`` and iterated; each entry is read
    through its ``person``, ``organization`` and ``value`` keys.  A label
    keyed by ``person~~organization`` is added only if none with the same
    key and annotator name is found.  After the commit,
    ``reload_annotator_labels`` is called for splits 1 and 2 with the class
    returned by ``get_candidate_class()``.

    :param session: database session used to query, add and commit
    :param input_file: path of the JSON label file
    :param annotator_name: annotator name stored on every label
    """
    candidate_class = get_candidate_class()

    # JSON text is UTF-8 by specification; do not depend on the platform's
    # default locale encoding when decoding the file.
    with open(str(input_file), "r", encoding="utf-8") as f:
        lbls = ujson.load(f)

    for lbl in lbls:
        # Check whether the label already exists, in case this was already
        # executed against the same database.
        context_stable_ids = "~~".join((lbl['person'], lbl['organization']))
        query = session.query(StableLabel).filter(
            StableLabel.context_stable_ids == context_stable_ids)
        query = query.filter(StableLabel.annotator_name == annotator_name)
        if query.count() == 0:
            session.add(
                StableLabel(context_stable_ids=context_stable_ids,
                            annotator_name=annotator_name,
                            value=lbl['value']))

    # Commit session
    session.commit()

    # Reload annotator labels for splits 1 and 2.
    for split in (1, 2):
        reload_annotator_labels(session,
                                candidate_class,
                                annotator_name,
                                split=split,
                                filter_label_split=False)
示例#4
0
def load_external_labels(session, candidate_class, annotator_name='gold',file_path=None,isPrint=True):
    """Insert gold labels from a TSV file as ``StableLabel`` rows.

    Inherited from tutorial/intro/util.py.  Each row's ``segment`` value is
    used directly as the stable key and its ``label`` as the value.  A label
    is added only when none with the same key and annotator name is found.
    After the commit, ``reload_annotator_labels`` is called for splits 1
    and 2.

    :param session: database session used to query, add and commit
    :param candidate_class: forwarded to ``reload_annotator_labels``
    :param annotator_name: annotator name stored on every label
    :param file_path: path of the tab-separated label file
    :param isPrint: when true, print each stable key as it is processed
    """
    table = pd.read_csv(file_path, sep="\t")

    for _, record in table.iterrows():
        stable_key = record['segment']
        if isPrint:
            print(stable_key)

        matches = (
            session.query(StableLabel)
            .filter(StableLabel.context_stable_ids == stable_key)
            .filter(StableLabel.annotator_name == annotator_name)
            .count()
        )
        # Already stored (e.g. the cell was executed before): nothing to do.
        if matches != 0:
            continue
        session.add(StableLabel(
            context_stable_ids=stable_key,
            annotator_name=annotator_name,
            value=record['label']))

    session.commit()

    # Reload annotator labels for splits 1 and 2.
    for split_id in (1, 2):
        reload_annotator_labels(session, candidate_class, annotator_name,
                                split=split_id, filter_label_split=False)
示例#5
0
def load_external_labels(session, candidate_class, annotator_name='gold'):
    """Insert gold labels from the TSV at ``FPATH`` as ``StableLabel`` rows.

    Each row's ``cell`` value is the stable key and its ``label`` the value.
    A label is added only when none with the same key and annotator name is
    found.  The index of the last processed row is printed (nothing is
    printed for an empty file).  After the commit,
    ``reload_annotator_labels`` is called for splits 1 and 2.

    :param session: database session used to query, add and commit
    :param candidate_class: forwarded to ``reload_annotator_labels``
    :param annotator_name: annotator name stored on every label
    """
    gold_labels = pd.read_csv(FPATH, delimiter='\t', encoding='utf-8')

    saw_rows = False
    last_index = None
    for index, row in gold_labels.iterrows():
        saw_rows = True
        last_index = index

        # Single-context key; the join is kept so the key is built the same
        # way as before (it also rejects non-string cells).
        context_stable_ids = "~~".join([row['cell']])

        # Check whether the label already exists, in case this was already
        # executed against the same database.
        query = session.query(StableLabel).filter(
            StableLabel.context_stable_ids == context_stable_ids)
        query = query.filter(StableLabel.annotator_name == annotator_name)

        if query.count() == 0:
            session.add(
                StableLabel(context_stable_ids=context_stable_ids,
                            annotator_name=annotator_name,
                            value=row['label']))

    # Printing the loop variable directly raised NameError when the file had
    # no rows, so only report the last index when one exists.
    if saw_rows:
        print(last_index)

    # Commit session
    session.commit()

    # Reload annotator labels for splits 1 and 2.
    for split in (1, 2):
        reload_annotator_labels(session,
                                candidate_class,
                                annotator_name,
                                split=split,
                                filter_label_split=False)
示例#6
0
def load_external_trend_labels(session, candidate_class, annotator_name='gold'):
    """Insert trend gold labels from the TSV at ``FPATH``, logging each row.

    Each row's ``tr`` value is the stable key and its ``label`` the value.
    For every row a banner and the row itself are printed, stating whether
    the label was added or was already found.  After the commit,
    ``reload_annotator_labels`` is called for splits 1 and 2.

    :param session: database session used to query, add and commit
    :param candidate_class: forwarded to ``reload_annotator_labels``
    :param annotator_name: annotator name stored on every label
    """
    table = pd.read_csv(FPATH, sep="\t")

    for _, record in table.iterrows():
        stable_key = record['tr']
        matches = (
            session.query(StableLabel)
            .filter(StableLabel.context_stable_ids == stable_key)
            .filter(StableLabel.annotator_name == annotator_name)
            .count()
        )

        if matches != 0:
            # A label with this key and annotator is already stored.
            print('----------------------------')
            print('stable label is found for this one!: ')
            print(record)
            continue

        print('********************************')
        print('adding gold labels for this row')
        print(record)
        session.add(StableLabel(
            context_stable_ids=stable_key,
            annotator_name=annotator_name,
            value=record['label']))

    session.commit()

    # Reload annotator labels for splits 1 and 2.
    for split_id in (1, 2):
        reload_annotator_labels(session, candidate_class, annotator_name,
                                split=split_id, filter_label_split=False)
示例#7
0
文件: util.py 项目: oudalab/snorkel
def load_external_labels(session, candidate_class, annotator_name='gold'):
    """Insert one ``StableLabel`` per row of the TSV at ``FPATH``.

    Each label is built from the row's ``content`` (passed as ``tweet``) and
    ``label`` columns.  No existence check is made, so every call adds a
    label for every row.  After the commit, ``reload_annotator_labels`` is
    called for splits 1 and 2.

    :param session: database session used to add and commit
    :param candidate_class: forwarded to ``reload_annotator_labels``
    :param annotator_name: annotator name stored on every label
    """
    table = pd.read_csv(FPATH, sep="\t")

    for _, record in table.iterrows():
        new_label = StableLabel(
            tweet=record['content'],
            annotator_name=annotator_name,
            value=record['label'],
        )
        session.add(new_label)

    session.commit()

    # Reload annotator labels for splits 1 and 2.
    for split_id in (1, 2):
        reload_annotator_labels(session, candidate_class, annotator_name,
                                split=split_id, filter_label_split=False)
示例#8
0
def load_external_labels(session, candidate_class, tsv_path, annotator_name='gold', symmetric=False, reload = False, filter_label_split = False, debug=True):
    """
    Insert gold labels from a TSV file as ``StableLabel`` rows.

    Adapted from snorkel/tutorials/workshop/lib/load_external_annotations.py

    Rows are keyed by ``Chemical~~Gene``; with ``symmetric`` a second pass
    also adds ``Gene~~Chemical``.  A label is added only when none with the
    same key and annotator name is found.

    :param session: database session used to query, add and commit
    :param candidate_class: forwarded to ``reload_annotator_labels``
    :param tsv_path: path of the tab-separated label file
    :param annotator_name: annotator name stored on every label
    :param symmetric: also load the reversed (Gene, Chemical) direction
    :param reload: whether to call ``reload_annotator_labels`` for
        splits 0, 1 and 2 after committing
    :param filter_label_split: forwarded to ``reload_annotator_labels``
    :param debug: forwarded to ``reload_annotator_labels``
    """
    table = pd.read_csv(tsv_path, sep="\t")

    def _insert_missing(left_column, right_column):
        # One full pass over the file for a single key direction.
        for _, record in table.iterrows():
            stable_key = "~~".join([record[left_column], record[right_column]])
            matches = (
                session.query(StableLabel)
                .filter(StableLabel.context_stable_ids == stable_key)
                .filter(StableLabel.annotator_name == annotator_name)
                .count()
            )
            if matches != 0:
                continue
            session.add(StableLabel(
                context_stable_ids=stable_key,
                annotator_name=annotator_name,
                value=record['label']))

    _insert_missing('Chemical', 'Gene')

    # If it's a symmetric relation, load the other direction as well.
    if symmetric:
        _insert_missing('Gene', 'Chemical')

    session.commit()

    if not reload:
        return

    for split_id in (0, 1, 2):
        reload_annotator_labels(session, candidate_class, annotator_name,
                                split=split_id,
                                filter_label_split=filter_label_split,
                                debug=debug)
示例#9
0
def load_external_labels(session, candidate_class, annotator_name='gold'):
    """Insert gold labels from the TSV at ``FPATH`` as ``StableLabel`` rows.

    Each row's ``Features`` value is the stable key and its ``label`` the
    value.  A label is added only when none with the same key and annotator
    name is found.  After the commit, ``reload_annotator_labels`` is called
    for splits 1 and 2.

    :param session: database session used to query, add and commit
    :param candidate_class: forwarded to ``reload_annotator_labels``
    :param annotator_name: annotator name stored on every label
    """
    table = pd.read_csv(FPATH, sep="\t")

    for _, record in table.iterrows():
        stable_key = record['Features']
        matches = (
            session.query(StableLabel)
            .filter(StableLabel.context_stable_ids == stable_key)
            .filter(StableLabel.annotator_name == annotator_name)
            .count()
        )
        # Already stored (e.g. the cell was executed before): nothing to do.
        if matches != 0:
            continue
        session.add(
            StableLabel(context_stable_ids=stable_key,
                        annotator_name=annotator_name,
                        value=record['label']))

    session.commit()

    # Reload annotator labels for splits 1 and 2.
    for split_id in (1, 2):
        reload_annotator_labels(session,
                                candidate_class,
                                annotator_name,
                                split=split_id,
                                filter_label_split=False)
示例#10
0
    def handle_label_event(self, _, content, buffers):
        """
        Handles a label event from the widget by persisting the change.

        ``set_label`` events store a new label or update an existing one;
        ``delete_label`` events remove the stored label.  Any other event is
        ignored.

        :param _: unused
        :param content: message dict; read through its ``event``, ``cid``
            and ``value`` keys
        :param buffers: unused
        :raises ValueError: if a ``set_label`` event carries a value that is
            neither ``True`` nor ``False``
        """
        event = content.get('event', '')

        if event == 'set_label':
            cid = content.get('cid', None)
            value = content.get('value', None)
            # Map the widget's booleans onto the stored label values.
            if value is True:
                value = 1
            elif value is False:
                value = -1
            else:
                raise ValueError('Unexpected label returned from widget: ' + str(value) +
                                 '. Expected values are True and False.')

            # If label already exists, just update value (in both AnnotatorLabel and StableLabel)
            if self.annotations[cid] is not None:
                if self.annotations[cid].value != value:
                    self.annotations[cid].value = value
                    self.annotations_stable[cid].value = value
                    self.session.commit()

            # Otherwise, create an AnnotatorLabel *and a StableLabel*
            else:
                candidate = self.candidates[cid]

                # Create AnnotatorLabel
                self.annotations[cid] = GoldLabel(key=self.annotator, candidate=candidate, value=value)
                self.session.add(self.annotations[cid])

                # Create StableLabel
                context_stable_ids = '~~'.join([c.stable_id for c in candidate.get_contexts()])
                self.annotations_stable[cid] = StableLabel(context_stable_ids=context_stable_ids,
                                                           annotator_name=self.annotator.name,
                                                           value=value,
                                                           split=candidate.split)
                self.session.add(self.annotations_stable[cid])
                self.session.commit()

        elif event == 'delete_label':
            cid = content.get('cid', None)
            removed = False
            # A delete may arrive for a candidate that has no stored label;
            # passing None to session.delete() would raise, so skip those.
            for store in (self.annotations, self.annotations_stable):
                if store[cid] is not None:
                    self.session.delete(store[cid])
                    store[cid] = None
                    removed = True
            if removed:
                self.session.commit()
示例#11
0
def load_external_labels(session,
                         candidate_class,
                         split,
                         annotator='gold',
                         label_fname='data/cdr_relations_gold.pkl',
                         id_fname='data/doc_ids.pkl'):
    """Derive gold labels for one split from document-level annotations.

    The pickle at ``label_fname`` is loaded and looked up by document name.
    Every candidate of ``split`` is labelled ``1`` when ``c.get_cids()`` is
    contained in the relations found for its document
    (``c.get_parent().get_parent().name``) and ``-1`` otherwise.  A
    ``StableLabel`` is added only when none with the same key and annotator
    is found.  After the commit, ``reload_annotator_labels`` is called for
    ``split``.

    NOTE(review): the file is unpickled, so ``label_fname`` must come from a
    trusted source.

    :param session: database session used to query, add and commit
    :param candidate_class: candidate class to query and to forward to
        ``reload_annotator_labels``
    :param split: split whose candidates are labelled
    :param annotator: annotator name stored on every label
    :param label_fname: path of the pickled relation annotations
    :param id_fname: not used by this function
    """
    with open(label_fname, 'rb') as handle:
        relations = load(handle)

    split_candidates = (
        session.query(candidate_class)
        .filter(candidate_class.split == split)
        .all()
    )

    for cand in split_candidates:
        # Map the document-level annotations onto this mention pair.
        doc_name = cand.get_parent().get_parent().name
        doc_relations = relations.get(doc_name, set())
        label = 1 if cand.get_cids() in doc_relations else -1

        stable_key = '~~'.join(ctx.get_stable_id() for ctx in cand)
        matches = (
            session.query(StableLabel)
            .filter(StableLabel.context_stable_ids == stable_key)
            .filter(StableLabel.annotator_name == annotator)
            .count()
        )
        # Only add the label when it is not stored yet.
        if matches != 0:
            continue
        session.add(
            StableLabel(context_stable_ids=stable_key,
                        annotator_name=annotator,
                        value=label))

    session.commit()

    reload_annotator_labels(session,
                            candidate_class,
                            annotator,
                            split=split,
                            filter_label_split=False)
def load_external_labels(session,
                         candidate_class,
                         column1_title,
                         column2_title,
                         filepath,
                         candidates,
                         annotator_name='gold'):
    """Insert gold labels from a TSV file and default the rest to ``-1``.

    Rows are keyed by ``<column1>~~<column2>`` and carry the row's ``label``
    value.  Afterwards every candidate in ``candidates`` whose key
    (``c[0]`` and ``c[1]`` stable ids joined by ``~~``) has no label yet
    gets one with value ``-1``.  Labels are added only when none with the
    same key and annotator name is found.  After the commit,
    ``reload_annotator_labels`` is called for splits 1 and 2.

    :param session: database session used to query, add and commit
    :param candidate_class: forwarded to ``reload_annotator_labels``
    :param column1_title: TSV column holding the first stable id
    :param column2_title: TSV column holding the second stable id
    :param filepath: path of the tab-separated label file
    :param candidates: iterable of candidates to default to ``-1``
    :param annotator_name: annotator name stored on every label
    """
    gold_labels = pd.read_csv(filepath, sep="\t")
    for _, row in gold_labels.iterrows():

        # Check whether the label already exists, in case this was already
        # executed against the same database.
        context_stable_ids = "~~".join(
            [row[column1_title], row[column2_title]])
        query = session.query(StableLabel).filter(
            StableLabel.context_stable_ids == context_stable_ids)
        query = query.filter(StableLabel.annotator_name == annotator_name)
        if query.count() == 0:
            session.add(
                StableLabel(context_stable_ids=context_stable_ids,
                            annotator_name=annotator_name,
                            value=row['label']))

        # NOTE(review): a second "both directions" pass used to follow here,
        # but it rebuilt the identical (column1, column2) key and therefore
        # repeated the check above.  It has been removed; only this
        # direction is loaded, as before.

    for c in candidates:
        # Debug output; written as calls so the block parses on Python 3.
        print(c.biomarker.get_stable_id())
        print(c)
        candidate_label = c[0].get_stable_id() + "~~" + c[1].get_stable_id()
        query = session.query(StableLabel).filter(
            StableLabel.context_stable_ids == candidate_label)
        query = query.filter(StableLabel.annotator_name == annotator_name)
        # Candidates without a gold label default to a negative label.
        if query.count() == 0:
            session.add(
                StableLabel(context_stable_ids=candidate_label,
                            annotator_name=annotator_name,
                            value=-1))

    # Commit session
    session.commit()

    # Reload annotator labels for splits 1 and 2.
    for split in (1, 2):
        reload_annotator_labels(session,
                                candidate_class,
                                annotator_name,
                                split=split,
                                filter_label_split=False)
示例#13
0
    def __init__(self, candidates, session, gold=None, n_per_page=3, height=225, annotator_name=None):
        """
        Initializes a Viewer.

        The Viewer uses the keyword argument annotator_name to define an AnnotatorLabelKey with that name.

        :param candidates: A Python container of Candidates (e.g., not a CandidateSet, but candidate_set.candidates)
        :param session: The SnorkelSession for the database backend
        :param gold: Optional, Python container of Candidates that are known to have positive labels
        :param n_per_page: Optional, number of Contexts to display per page
        :param height: Optional, the height in pixels of the Viewer
        :param annotator_name: Name of the human using the Viewer, for saving their work. Defaults to system username.
        :raises ValueError: if an existing annotation has a value other than 1 or -1
        """
        super(Viewer, self).__init__()
        self.session = session

        # By default, use the username as annotator name
        name = annotator_name if annotator_name is not None else getpass.getuser()

        # Sets up the AnnotationKey to use, creating it on first use
        self.annotator = self.session.query(GoldLabelKey).filter(GoldLabelKey.name == name).first()
        if self.annotator is None:
            self.annotator = GoldLabelKey(name=name)
            session.add(self.annotator)
            session.commit()

        # Viewer display configs
        self.n_per_page = n_per_page
        self.height = height

        # Note that the candidates are not necessarily committed to the DB, so they *may not have* non-null ids
        # Hence, we index by their position in this list
        # We get the sorted candidates and all contexts required, either from unary or binary candidates
        self.gold = list(gold) if gold is not None else []
        self.candidates = sorted(list(candidates), key=lambda c: c[0].char_start)
        self.contexts = list(set(c[0].get_parent() for c in self.candidates + self.gold))

        # If committed, sort contexts by id. This is best-effort: keep the
        # unsorted order on failure, but let KeyboardInterrupt/SystemExit
        # propagate.
        try:
            self.contexts = sorted(self.contexts, key=lambda c: c.id)
        except Exception:
            pass

        # Loads existing annotations
        self.annotations = [None] * len(self.candidates)
        self.annotations_stable = [None] * len(self.candidates)
        init_labels_serialized = []
        for i, candidate in enumerate(self.candidates):

            # First look for the annotation in the primary annotations table
            existing_annotation = self.session.query(GoldLabel) \
                .filter(GoldLabel.key == self.annotator) \
                .filter(GoldLabel.candidate == candidate) \
                .first()
            if existing_annotation is not None:
                self.annotations[i] = existing_annotation
                if existing_annotation.value == 1:
                    value_string = 'true'
                elif existing_annotation.value == -1:
                    value_string = 'false'
                else:
                    raise ValueError(str(existing_annotation) +
                                     ' has value not in {1, -1}, which Viewer does not support.')
                init_labels_serialized.append(str(i) + '~~' + value_string)

                # If the annotator label is in the main table, also get its stable version
                context_stable_ids = '~~'.join([c.stable_id for c in candidate.get_contexts()])
                existing_annotation_stable = self.session.query(StableLabel) \
                    .filter(StableLabel.context_stable_ids == context_stable_ids) \
                    .filter(StableLabel.annotator_name == name).one_or_none()

                # If stable version is not available, create it here
                # NOTE: This is for versioning issues, should be removed?
                if existing_annotation_stable is None:
                    existing_annotation_stable = StableLabel(context_stable_ids=context_stable_ids,
                                                             annotator_name=self.annotator.name,
                                                             split=candidate.split,
                                                             value=existing_annotation.value)
                    self.session.add(existing_annotation_stable)
                    self.session.commit()

                self.annotations_stable[i] = existing_annotation_stable

        # Comma-separated "<position>~~<true|false>" entries
        self._labels_serialized = ','.join(init_labels_serialized)

        # Configures message handler
        self.on_msg(self.handle_label_event)

        # display js, construct html and pass on to widget model
        self.render()
示例#14
0
def load_external_labels(session, candidate_class, tsv_path, annotator_name='gold', symmetric=False, reload = False, filter_label_split=False, debug=True):
    """
    Insert gold labels from a TSV file as ``StableLabel`` rows.

    Adapted from snorkel/tutorials/workshop/lib/load_external_annotations.py

    Rows are keyed by ``Chemical~~Gene``; with ``symmetric`` a second pass
    also adds ``Gene~~Chemical``.  A label is added only when none with the
    same key and annotator name is found.

    :param session: database session used to query, add and commit
    :param candidate_class: forwarded to ``reload_annotator_labels``
    :param tsv_path: path of the tab-separated label file
    :param annotator_name: annotator name stored on every label
    :param symmetric: also load the reversed (Gene, Chemical) direction
    :param reload: whether to call ``reload_annotator_labels`` for
        splits 0, 1 and 2 after committing
    :param filter_label_split: forwarded to ``reload_annotator_labels``.
        Previously referenced without being a parameter, which raised
        NameError whenever ``reload`` was true.
    :param debug: forwarded to ``reload_annotator_labels``; same remark as
        ``filter_label_split``
    """
    gold_labels = pd.read_csv(tsv_path, sep="\t")
    for _, row in gold_labels.iterrows():

        # Check whether the label already exists, in case this was already
        # executed against the same database.
        context_stable_ids = "~~".join([row['Chemical'], row['Gene']])
        query = session.query(StableLabel).filter(StableLabel.context_stable_ids == context_stable_ids)
        query = query.filter(StableLabel.annotator_name == annotator_name)
        if query.count() == 0:
            session.add(StableLabel(
                context_stable_ids=context_stable_ids,
                annotator_name=annotator_name,
                value=row['label']))

    # If it's a symmetric relation, load both directions...
    if symmetric:
        for _, row in gold_labels.iterrows():
            context_stable_ids = "~~".join([row['Gene'], row['Chemical']])
            query = session.query(StableLabel).filter(StableLabel.context_stable_ids == context_stable_ids)
            query = query.filter(StableLabel.annotator_name == annotator_name)
            if query.count() == 0:
                session.add(StableLabel(
                    context_stable_ids=context_stable_ids,
                    annotator_name=annotator_name,
                    value=row['label']))

    # Commit session
    session.commit()

    # Reload annotator labels
    if reload:
        for split in (0, 1, 2):
            reload_annotator_labels(session, candidate_class, annotator_name, split=split,
                                    filter_label_split=filter_label_split, debug=debug)



#######################################################
### Load from pickle dictionary (on document level) ###
#######################################################
###  From snorkel/tutorials/cdr/load_external_annotations.py (v0.6.2)
# from six.moves.cPickle import load

# from snorkel.db_helpers import reload_annotator_labels
# from snorkel.models import StableLabel
# import bz2

# def load_external_labels(session, candidate_class, split, annotator='gold',
#     label_fname='data/cdr_relations_gold.pkl', id_fname='data/doc_ids.pkl'):
#     # Load document-level relation annotations
#     if label_fname.endswith('.bz2'):
#         with bz2.BZ2File(label_fname, 'rb') as f:
#             relations = load(f)
#     else:    
#         with open(label_fname, 'rb') as f:
#             relations = load(f)
#     # Get split candidates
#     candidates = session.query(candidate_class).filter(
#         candidate_class.split == split
#     ).all()
#     for c in candidates:
#         # Get the label by mapping document annotations to mentions
#         doc_relations = relations.get(c.get_parent().get_parent().name, set())
#         label = 2 * int(c.get_cids() in doc_relations) - 1        
#         # Get stable ids and check to see if label already exits
#         context_stable_ids = '~~'.join(x.get_stable_id() for x in c)
#         query = session.query(StableLabel).filter(
#             StableLabel.context_stable_ids == context_stable_ids
#         )
#         query = query.filter(StableLabel.annotator_name == annotator)
#         # If does not already exist, add label
#         if query.count() == 0:
#             session.add(StableLabel(
#                 context_stable_ids=context_stable_ids,
#                 annotator_name=annotator,
#                 value=label
#             ))

#     # Commit session
#     session.commit()

#     # Reload annotator labels
#     reload_annotator_labels(session, candidate_class, annotator, split=split, filter_label_split=False)