Example #1
    def setUpClass(cls):
        # This is a hack to create a session to a different DB after Snorkel has
        # already been imported. It does not work in general because, e.g., the UDF
        # constructor uses Snorkel's new_sessionmaker on different processes.
        # In general, the connection should still be set via the SNORKELDB
        # environment variable.
        dir_path = os.path.dirname(os.path.realpath(__file__))
        snorkel_engine = create_engine(os.path.join('sqlite:///' + dir_path, 'spouses.db'))
        SnorkelSession = sessionmaker(bind=snorkel_engine)
        cls.session = SnorkelSession()

        Spouse = candidate_subclass('Spouse', ['person1', 'person2'])

        cls.train_marginals = load_marginals(cls.session, split=0)

        cls.train_cands = cls.session.query(Spouse).filter(Spouse.split == 0).order_by(Spouse.id).all()
        cls.dev_cands   = cls.session.query(Spouse).filter(Spouse.split == 1).order_by(Spouse.id).all()
        cls.test_cands  = cls.session.query(Spouse).filter(Spouse.split == 2).order_by(Spouse.id).all()

        # Each candidate is featurized as 10 floats. The first five are between
        # -.25 and 1 if the class label is True and between -1 and .25 if False.
        # The remaining five are between -1 and 1.
        cls.F_train = load_feature_matrix(cls.session, split=0, coerce_int=False)
        cls.F_dev = load_feature_matrix(cls.session, split=1, coerce_int=False)
        cls.F_test = load_feature_matrix(cls.session, split=2, coerce_int=False)

        cls.L_gold_dev  = load_gold_labels(cls.session, annotator_name='gold', split=1)
        cls.L_gold_test = load_gold_labels(cls.session, annotator_name='gold', split=2)
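
The comment above spells out how this fixture's features are laid out: 10 floats per candidate, with the first five drawn from a range that depends on the class label and the last five uninformative. A minimal numpy sketch of that scheme (an illustration only, not the fixture-generation code from the Snorkel test suite; the function name is made up):

import numpy as np

def make_synthetic_features(labels, seed=0):
    """Build an (n, 10) float matrix following the ranges described above."""
    rng = np.random.RandomState(seed)
    labels = np.asarray(labels, dtype=bool)
    n = len(labels)
    F = np.empty((n, 10))
    # First five columns are informative: [-0.25, 1] if True, [-1, 0.25] if False.
    F[labels, :5] = rng.uniform(-0.25, 1.0, size=(labels.sum(), 5))
    F[~labels, :5] = rng.uniform(-1.0, 0.25, size=((~labels).sum(), 5))
    # Remaining five columns are uninformative noise in [-1, 1].
    F[:, 5:] = rng.uniform(-1.0, 1.0, size=(n, 5))
    return F
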
Example #2
    def __init__(self, name, data_path, lfs=None, verbose=True):
        super(CdrDataset, self).__init__(name, verbose)
        assert os.path.exists(data_path)

        os.environ['SNORKELDB'] = "sqlite:///{}".format(data_path)
        logger.info("SQL connection {}".format(os.environ['SNORKELDB']))

        # Hack to prevent snorkel.db creation
        from snorkel import SnorkelSession
        from snorkel.models import candidate_subclass
        from snorkel.annotations import load_gold_labels, load_marginals

        self.session = SnorkelSession()
        self.class_type = candidate_subclass('ChemicalDisease',
                                             ['chemical', 'disease'])

        self.X_train = self.session.query(
            self.class_type).filter(self.class_type.split == 0).all()
        self.X_dev = self.session.query(
            self.class_type).filter(self.class_type.split == 1).all()
        self.X_test = self.session.query(
            self.class_type).filter(self.class_type.split == 2).all()

        if self.verbose:
            splits = {
                "Train": self.X_train,
                "Dev": self.X_dev,
                "Test": self.X_test
            }
            self.print_summary(splits)
        if len(self.X_train) == 0:
            logger.error("Fatal error - no candidates found in database")
            sys.exit()

        self.y_train = load_marginals(self.session, split=0)

        self.y_gold_train = load_gold_labels(self.session,
                                             annotator_name='gold',
                                             split=0)
        self.y_gold_dev = load_gold_labels(self.session,
                                           annotator_name='gold',
                                           split=1)
        self.y_gold_test = load_gold_labels(self.session,
                                            annotator_name='gold',
                                            split=2)

        if not lfs:
            # convert to 0/1 marginals
            self.y_train = (np.ravel(self.y_gold_train.toarray()) + 1.) / 2
            self.y_dev = (np.ravel(self.y_gold_dev.toarray()) + 1.) / 2
            self.y_test = (np.ravel(self.y_gold_test.toarray()) + 1.) / 2

        else:
            # Keep the probabilistic training marginals loaded above.
            self.y_train = self.y_train
            self.y_dev = (np.ravel(self.y_gold_dev.toarray()) + 1.) / 2
            self.y_test = (np.ravel(self.y_gold_test.toarray()) + 1.) / 2
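
Both branches above rely on the same conversion from Snorkel's {-1, +1} gold-label convention to probabilities in [0, 1]. A self-contained sketch of that conversion on toy values (the data here is made up for illustration):

import numpy as np
from scipy.sparse import csr_matrix

# Toy gold labels in Snorkel's sparse {-1, +1} format.
y_gold = csr_matrix(np.array([[1], [-1], [1], [-1]]))
y_marginals = (np.ravel(y_gold.toarray()) + 1.) / 2
# y_marginals -> array([1., 0., 1., 0.])
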
Example #3
def Loading_sets(session, T1):
    # Generating and modeling noisy training labels using a labeled development set.
    # Load the gold labels for the dev, test, and train splits.
    missed = load_external_labels(session, T1, annotator_name='gold')

    L_gold_dev = load_gold_labels(session, annotator_name='gold', split=1)
    L_gold_test = load_gold_labels(session, annotator_name='gold', split=2)
    L_gold_train = load_gold_labels(session, annotator_name='gold', split=0)

    return L_gold_dev, L_gold_test, L_gold_train
Example #4
    def __init__(self, name, data_path, lfs=None):
        super(SpouseDataset, self).__init__(name)

        os.environ['SNORKELDB'] = "sqlite:///{}".format(data_path)
        logger.info("SQL connection {}".format(os.environ['SNORKELDB']))

        # Hack to prevent snorkel.db creation
        from snorkel import SnorkelSession
        from snorkel.models import candidate_subclass
        from snorkel.annotations import load_gold_labels, load_marginals

        self.session = SnorkelSession()
        self.class_type = candidate_subclass('Spouse', ['person1', 'person2'])

        self.X_train = self.session.query(
            self.class_type).filter(self.class_type.split == 0).all()
        self.X_dev = self.session.query(
            self.class_type).filter(self.class_type.split == 1).all()
        self.X_test = self.session.query(
            self.class_type).filter(self.class_type.split == 2).all()

        if self.verbose:
            splits = {
                "Train": self.X_train,
                "Dev": self.X_dev,
                "Test": self.X_test
            }
            self.print_summary(splits)

        self.y_train = load_marginals(self.session, split=0)

        self.y_gold_train = load_gold_labels(self.session,
                                             annotator_name='gold',
                                             split=0)
        self.y_gold_dev = load_gold_labels(self.session,
                                           annotator_name='gold',
                                           split=1)
        self.y_gold_test = load_gold_labels(self.session,
                                            annotator_name='gold',
                                            split=2)

        if not lfs:
            # convert to 0/1 marginals
            self.y_train = (np.ravel(self.y_gold_train.toarray()) + 1.) / 2
            self.y_dev = (np.ravel(self.y_gold_dev.toarray()) + 1.) / 2
            self.y_test = (np.ravel(self.y_gold_test.toarray()) + 1.) / 2

        else:
            # Keep the probabilistic training marginals loaded above.
            self.y_train = self.y_train
            self.y_dev = (np.ravel(self.y_gold_dev.toarray()) + 1.) / 2
            self.y_test = (np.ravel(self.y_gold_test.toarray()) + 1.) / 2
Example #5
def test_LF(session, lf, split, annotator_name):
    """
    Gets the accuracy of a single LF on a split of the candidates, w.r.t. annotator labels,
    and also returns the error buckets of the candidates.
    """
    test_candidates = session.query(Candidate).filter(Candidate.split == split).all()
    test_labels     = load_gold_labels(session, annotator_name=annotator_name, split=split)
    scorer          = MentionScorer(test_candidates, test_labels)
    test_marginals  = np.array([0.5 * (lf(c) + 1) for c in test_candidates])
    return scorer.score(test_marginals, set_unlabeled_as_neg=False, set_at_thresh_as_neg=False)
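
The expression 0.5 * (lf(c) + 1) maps a labeling function's output in {-1, 0, +1} (with 0 meaning abstain) onto marginals in {0.0, 0.5, 1.0}. A tiny standalone check of that mapping:

import numpy as np

lf_outputs = np.array([-1, 0, 1])      # -1 = negative, 0 = abstain, +1 = positive
marginals = 0.5 * (lf_outputs + 1)     # -> array([0. , 0.5, 1. ])

An abstain lands exactly on the 0.5 threshold, which is presumably why the scorer is called with set_unlabeled_as_neg=False and set_at_thresh_as_neg=False.
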
Example #6
    def setUpClass(cls):
        # This is a hack to create a session to a different DB after Snorkel has
        # already been imported. It does not work in general because, e.g., the UDF
        # constructor uses Snorkel's new_sessionmaker on different processes.
        # In general, the connection should still be set via the SNORKELDB
        # environment variable.
        dir_path = os.path.dirname(os.path.realpath(__file__))
        snorkel_engine = create_engine(
            os.path.join('sqlite:///' + dir_path, 'spouses.db'))
        SnorkelSession = sessionmaker(bind=snorkel_engine)
        cls.session = SnorkelSession()

        Spouse = candidate_subclass('Spouse', ['person1', 'person2'])

        cls.train_marginals = load_marginals(cls.session, split=0)

        cls.train_cands = cls.session.query(Spouse).filter(
            Spouse.split == 0).order_by(Spouse.id).all()
        cls.dev_cands = cls.session.query(Spouse).filter(
            Spouse.split == 1).order_by(Spouse.id).all()
        cls.test_cands = cls.session.query(Spouse).filter(
            Spouse.split == 2).order_by(Spouse.id).all()

        # Each candidate is featurized as 10 floats. The first five are between
        # -.25 and 1 if the class label is True and between -1 and .25 if False.
        # The remaining five are between -1 and 1.
        cls.F_train = load_feature_matrix(cls.session,
                                          split=0,
                                          coerce_int=False)
        cls.F_dev = load_feature_matrix(cls.session, split=1, coerce_int=False)
        cls.F_test = load_feature_matrix(cls.session,
                                         split=2,
                                         coerce_int=False)

        cls.L_gold_dev = load_gold_labels(cls.session,
                                          annotator_name='gold',
                                          split=1)
        cls.L_gold_test = load_gold_labels(cls.session,
                                           annotator_name='gold',
                                           split=2)
Example #7
def get_gold_test_matrix(predicate_resume, session):
    candidate_subclass = predicate_resume["candidate_subclass"]
    brat = BratAnnotator(session, candidate_subclass, encoding='utf-8')
    test_cids_query = get_test_cids_with_span(predicate_resume, session)
    test_cands = get_test_cands_with_span(predicate_resume, session).all()
    brat.import_gold_labels(session,
                            get_collection_name(predicate_resume, 2),
                            test_cands,
                            annotator_name="brat")
    L_gold_test = load_gold_labels(session,
                                   annotator_name="brat",
                                   cids_query=test_cids_query,
                                   split=2)
    return L_gold_test
Example #8
def test_LF(session, lf, split, annotator_name):
    """
    Gets the accuracy of a single LF on a split of the candidates, w.r.t. annotator labels,
    and also returns the error buckets of the candidates.
    """
    test_candidates = session.query(Candidate).filter(
        Candidate.split == split).all()
    test_labels = load_gold_labels(session,
                                   annotator_name=annotator_name,
                                   split=split)
    scorer = MentionScorer(test_candidates, test_labels)
    test_marginals = np.array([0.5 * (lf(c) + 1) for c in test_candidates])
    return scorer.score(test_marginals,
                        set_unlabeled_as_neg=False,
                        set_at_thresh_as_neg=False)
Example #9
def test_LF(session, lf_arr, label_val, split=1, annotator_name='gold'):

    test_candidates = session.query(Candidate).filter(
        Candidate.split == split).all()
    test_labels = load_gold_labels(session, annotator_name=annotator_name, split=split)
    test_marginals = np.array([(run_lf_arr_as_one(lf_arr, c))
                               for c in test_candidates])

    test_label_array = []
    tp = set()
    fp = set()
    tn = set()
    fn = set()
    b = 0.5

    for i, candidate in enumerate(test_candidates):
        try:
            test_label_index = test_labels.get_row_index(candidate)
            test_label = test_labels[test_label_index, 0]
        except AttributeError:
            test_label = test_labels[i]

        # Bucket the candidates for error analysis
        test_label_array.append(test_label)
        if test_label != 0:
            if test_marginals[i] > b:
                if test_label == label_val:
                    tp.add(candidate)
                else:
                    fp.add(candidate)
            elif test_marginals[i] < b:
                if test_label != label_val:
                    tn.add(candidate)
                else:
                    fn.add(candidate)

    print_scores(len(tp),
                 len(fp),
                 len(tn),
                 len(fn),
                 title="Scores (Un-adjusted)")
    return tp, fp, tn, fn
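
print_scores is a helper from the surrounding project; a hedged sketch of the kind of precision/recall/F1 computation those four bucket sizes feed into (not necessarily the exact implementation):

def prf1(tp, fp, tn, fn):
    """Precision, recall, and F1 from error-bucket counts (tn is unused by these metrics)."""
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    return precision, recall, f1

print(prf1(tp=8, fp=2, tn=5, fn=1))    # example counts -> (0.8, 0.888..., 0.842...)
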
Example #10
    dev_cands = session.query(VirusHost).filter(VirusHost.split == 1).order_by(
        VirusHost.id).all()
    test_cands = session.query(VirusHost).filter(
        VirusHost.split == 2).order_by(VirusHost.id).all()

    # Apply labeler to all sets
    L_train = labeler.apply(split=0)
    L_dev = labeler.apply(split=1)
    L_test = labeler.apply(split=2)

    # Load gold labels
    missed = load_external_labels(session,
                                  VirusHost,
                                  annotator_name='gold',
                                  split=1)
    L_gold_dev = load_gold_labels(session, annotator_name='gold', split=1)
    missed = load_external_labels(session,
                                  VirusHost,
                                  annotator_name='gold',
                                  split=2)
    L_gold_test = load_gold_labels(session, annotator_name='gold', split=2)

    # Generative model
    ds = DependencySelector()
    deps = ds.select(L_train, threshold=0.1)

    gen_model = GenerativeModel()
    gen_model.train(L_train,
                    epochs=100,
                    decay=0.95,
                    step_size=0.1 / L_train.shape[0],
Example #11
dev_sentences_df = pd.read_excel("data/sentence-labels-dev.xlsx")
dev_sentences_df = dev_sentences_df[dev_sentences_df.curated_dsh.notnull()]
dev_sentences_df = dev_sentences_df.sort_values("candidate_id")
dev_sentences_df.head(2)

# In[ ]:

sql = '''
SELECT candidate_id FROM gold_label
INNER JOIN Candidate ON Candidate.id=gold_label.candidate_id
WHERE Candidate.split=0;
'''
cids = session.query(Candidate.id).filter(
    Candidate.id.in_([x[0] for x in session.execute(sql)]))
L_train_labeled_gold = load_gold_labels(session,
                                        annotator_name='danich1',
                                        cids_query=cids)

# In[ ]:

train_candidate_ids = train_sentences_df.candidate_id.astype(int).tolist()
dev_candidate_ids = (dev_sentences_df[
    dev_sentences_df.curated_dsh.notnull()].candidate_id.astype(int).tolist())

# In[ ]:

train_cands = (session.query(Candidate).filter(
    Candidate.id.in_(train_candidate_ids)).all())

train_label_cands = (session.query(Candidate).filter(
    Candidate.id.in_(cids)).all())
Example #12
    def __init__(
        self,
        conn_str,
        candidate_def,
        split=0,
        use_lfs=False,
        word_dict=None,
        pretrained_word_dict=None,
        max_seq_len=125,
    ):
        """
        Assumes a Snorkel database that is fully instantiated with:
        - Candidates generated and assigned to train/dev/test splits
        - Labeling functions are applied and probabilistic labels are generated for train split(s)
        - Gold labels are stored in the database under 'annotator_name = gold'

        :param conn_str:
        :param candidate_def:
        :param split:
        :param use_lfs:
        :param word_dict:
        :param pretrained_word_dict:
        :param max_seq_len:

        """
        if os.path.exists(conn_str):
            os.environ["SNORKELDB"] = "sqlite:///{}".format(conn_str)
        else:
            os.environ["SNORKELDB"] = "postgresql:///{}".format(conn_str)
        print("Connected to {}".format(os.environ["SNORKELDB"]))

        # defer imports until SNORKELDB is defined to prevent initializing an empty sqlite instance
        from snorkel import SnorkelSession
        from snorkel.models import candidate_subclass, Candidate
        from snorkel.annotations import load_gold_labels, load_marginals

        # SQLite doesn't handle concurrent connections well, so use a singleton-style connection object
        if not SnorkelDataset.session:
            SnorkelDataset.session = SnorkelSession()
        self.session = SnorkelDataset.session

        self.class_type = candidate_subclass(*candidate_def)
        self.cardinality = len(candidate_def[-1])
        self.split = split
        self.max_seq_len = max_seq_len

        # create markup sequences and labels
        markers = [
            m.format(i) for i in range(self.cardinality)
            for m in ["~~[[{}", "{}]]~~"]
        ]
        self.X = (self.session.query(Candidate).filter(
            Candidate.split == split).order_by(Candidate.id).all())
        self.X = [self._mark_entities(x, markers) for x in self.X]

        # initialize vocabulary
        self.word_dict = (self._build_vocab(self.X, markers)
                          if not word_dict else word_dict)
        if pretrained_word_dict:
            # include pretrained embedding terms
            self._include_pretrained_vocab(pretrained_word_dict,
                                           self.session.query(Candidate).all())

        # initialize labels (from either LFs or gold labels)
        if use_lfs:
            self.Y = torch.tensor(
                load_marginals(self.session, split=split).todense())
        else:
            self.Y = load_gold_labels(self.session,
                                      annotator_name="gold",
                                      split=split)
            self.Y = [int(y) for y in np.nditer(self.Y.todense())]
            # remap class labels to not include 0 (reserved by MeTaL)
            labels = {
                y: i + 1
                for i, y in enumerate(sorted(np.unique(self.Y), reverse=True))
            }
            self.Y = torch.tensor([labels[y] for y in self.Y])
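
The remapping at the end is easy to misread: sorted(..., reverse=True) orders the unique labels in descending order, so with binary {-1, +1} gold labels, +1 becomes class 1 and -1 becomes class 2, leaving 0 free as MeTaL's abstain value. A standalone sketch with toy labels:

import numpy as np
import torch

Y = [1, -1, 1, -1, 1]   # toy gold labels in {-1, +1}
labels = {y: i + 1 for i, y in enumerate(sorted(np.unique(Y), reverse=True))}
# labels == {1: 1, -1: 2}
Y_remapped = torch.tensor([labels[y] for y in Y])   # tensor([1, 2, 1, 2, 1])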