示例#1
0
def load_hardware_labels(
    session, candidate_class, filename, attrib, annotator_name="gold"
):
    """Create gold labels for every candidate of ``candidate_class``.

    Looks up the ``GoldLabelKey`` named ``annotator_name`` (creating and
    committing it when missing), then adds a ``GoldLabel`` for each candidate
    that does not yet have one under that key. A candidate is labelled ``1``
    when its upper-cased ``(document, part, value)`` triple is found in the
    gold data returned by ``get_gold_dict`` and ``-1`` otherwise. Newly
    created labels are committed once, after the loop.

    :param session: The database session to use.
    :param candidate_class: The candidate class to load labels for.
    :param filename: Forwarded to ``get_gold_dict`` as the gold-label source.
    :param attrib: Forwarded to ``get_gold_dict`` as ``attribute``.
    :param annotator_name: Name of the ``GoldLabelKey`` the labels belong to.
    """
    ak = session.query(GoldLabelKey).filter(GoldLabelKey.name == annotator_name).first()
    if ak is None:
        ak = GoldLabelKey(name=annotator_name)
        session.add(ak)
        session.commit()

    candidates = session.query(candidate_class).all()
    gold_dict = get_gold_dict(filename, attribute=attrib)
    logger.info("Loading %s candidate labels", len(candidates))

    labels = []
    for c in tqdm(candidates):
        label = (
            session.query(GoldLabel)
            .filter(GoldLabel.key == ak)
            .filter(GoldLabel.candidate == c)
            .first()
        )
        if label is not None:
            # Already labelled under this key; leave the existing row alone.
            continue

        # Normalise the same way for every field: upper-case, and strip all
        # whitespace from the value span before the gold lookup.
        doc = (c[0].span.sentence.document.name).upper()
        part = (c[0].span.get_span()).upper()
        val = ("".join(c[1].span.get_span().split())).upper()

        value = 1 if (doc, part, val) in gold_dict else -1
        label = GoldLabel(candidate=c, key=ak, value=value)
        session.add(label)
        labels.append(label)

    # A single commit persists every label added above.
    session.commit()
    logger.info("AnnotatorLabels created: %s", len(labels))
示例#2
0
def load_hardware_labels(
    session, candidate_classes, filename, attrib, annotator_name="gold"
):
    """Bulk insert hardware GoldLabels.

    Ensures a ``GoldLabelKey`` named ``annotator_name`` exists, then inserts
    one ``GoldLabel`` row for every candidate that has no ``GoldLabel`` yet.
    The row's value is ``TRUE`` when the candidate's upper-cased
    ``(document, part, value)`` triple is in the gold data and ``FALSE``
    otherwise.

    :param session: The database session to use.
    :param candidate_classes: Which candidate_classes to load labels for;
        a single class is accepted and wrapped in a list.
    :param filename: Path to the CSV file containing gold labels.
    :param attrib: Which attributes to load labels for (e.g. "stg_temp_max").
    :param annotator_name: Name of the gold key the labels are stored under.
    """
    # Accept a bare class as well as a list/tuple of classes.
    if not isinstance(candidate_classes, (list, tuple)):
        candidate_classes = [candidate_classes]

    # Make sure the gold key exists before any label refers to it by name.
    gold_key = (
        session.query(GoldLabelKey)
        .filter(GoldLabelKey.name == annotator_name)
        .first()
    )
    if gold_key is None:
        table_names = [cls.__tablename__ for cls in candidate_classes]
        gold_key = GoldLabelKey(name=annotator_name, candidate_classes=table_names)
        session.add(gold_key)
        session.commit()

    # Gather the candidates of every requested class, in class order.
    all_candidates = []
    for cls in candidate_classes:
        all_candidates += session.query(cls).all()

    gold_dict = get_gold_dict(filename, attribute=attrib)
    logger.info(f"Loading {len(all_candidates)} candidate labels")

    # (candidate, value) pairs for candidates that still need a label.
    pending = []
    for cand in tqdm(all_candidates):
        part_mention = cand[0]
        value_mention = cand[1]
        gold_key_tuple = (
            part_mention.context.sentence.document.name.upper(),
            part_mention.context.get_span().upper(),
            "".join(value_mention.context.get_span().split()).upper(),
        )

        existing = session.query(GoldLabel).filter(GoldLabel.candidate == cand).first()
        if existing is not None:
            continue

        pending.append((cand, TRUE if gold_key_tuple in gold_dict else FALSE))

    # Only insert the labels which were not already present
    rows = [
        {"candidate_id": cand.id, "keys": [annotator_name], "values": [value]}
        for cand, value in pending
    ]
    session.bulk_insert_mappings(GoldLabel, rows)
    session.commit()

    logger.info(f"GoldLabels created: {len(pending)}")
示例#3
0
def load_president_gold_labels(session,
                               candidate_classes,
                               filename,
                               annotator_name="gold"):
    """Replace all gold labels with president/birthplace GoldLabels.

    WARNING: this first deletes every existing ``GoldLabel`` and
    ``GoldLabelKey`` row, regardless of annotator. It then (re)creates the
    key named ``annotator_name`` and bulk inserts one ``GoldLabel`` per
    candidate: ``TRUE`` when the upper-cased
    ``(document, president name, birthplace)`` triple is in the gold data
    returned by ``get_gold_dict``, ``FALSE`` otherwise.

    :param session: The database session to use.
    :param candidate_classes: Which candidate_classes to load labels for;
        a single class is accepted and wrapped in a list.
    :param filename: Path to the CSV file containing gold labels.
    :param annotator_name: Name of the gold key the labels are stored under.
    """
    # Accept a bare class as well as a list/tuple of classes.
    if not isinstance(candidate_classes, (list, tuple)):
        candidate_classes = [candidate_classes]

    print("Clearing ALL Gold labels")
    session.query(GoldLabel).delete()
    session.query(GoldLabelKey).delete()

    # After the wipe above this lookup is expected to find nothing; the guard
    # is kept so the function stays correct if the wipe is ever removed.
    ak = session.query(GoldLabelKey).filter(
        GoldLabelKey.name == annotator_name).first()
    if ak is None:
        ak = GoldLabelKey(
            name=annotator_name,
            candidate_classes=[_.__tablename__ for _ in candidate_classes],
        )
        session.add(ak)
        session.commit()

    # Gather the candidates of every requested class, in class order.
    candidates = []
    for candidate_class in candidate_classes:
        candidates.extend(session.query(candidate_class).all())

    gold_dict = get_gold_dict(filename)
    cand_total = len(candidates)
    print(f"Loading {cand_total} candidate labels")

    docs_in_gold_dict = {x[0] for x in gold_dict}
    print(f"{len(docs_in_gold_dict)} different docs in gold dict")

    # Every candidate gets a label: existing ones were deleted above.
    rows = []
    for c in tqdm(candidates):
        doc = (c[0].context.sentence.document.name).upper()
        president_name = (c[0].context.get_span()).upper()
        birthplace = (c[1].context.get_span()).upper()

        value = TRUE if (doc, president_name, birthplace) in gold_dict else FALSE
        rows.append({
            "candidate_id": c.id,
            "keys": [annotator_name],
            "values": [value]
        })

    session.bulk_insert_mappings(GoldLabel, rows)
    session.commit()

    labels = len(rows)
    print(f"GoldLabels created: {labels}")
示例#4
0
def load_section_heading_gold_labels(
    featurizer_output, annotator_name="gold"
):
    """Bulk insert section-heading GoldLabels.

    Ensures a ``GoldLabelKey`` named ``annotator_name`` exists, then inserts
    one ``GoldLabel`` row for every candidate that has no ``GoldLabel`` yet.
    The row's value is ``TRUE`` when the candidate's ``(document, span)``
    pair is in the gold data loaded from ``config.gold_file_path`` and
    ``FALSE`` otherwise.

    :param featurizer_output: Mapping read for two entries: ``'session'``
        (the database session to use) and ``'candidate_variable'`` (the
        candidate class, or list/tuple of classes, to load labels for).
    :param annotator_name: Name of the gold key the labels are stored under.
    """
    session = featurizer_output['session']
    candidate_classes = featurizer_output['candidate_variable']
    filename = config.gold_file_path

    # Accept a bare class as well as a list/tuple of classes.
    if not isinstance(candidate_classes, (list, tuple)):
        candidate_classes = [candidate_classes]

    # Make sure the gold key exists before any label refers to it by name.
    gold_key = (
        session.query(GoldLabelKey)
        .filter(GoldLabelKey.name == annotator_name)
        .first()
    )
    if gold_key is None:
        table_names = [cls.__tablename__ for cls in candidate_classes]
        gold_key = GoldLabelKey(name=annotator_name, candidate_classes=table_names)
        session.add(gold_key)
        session.commit()

    # Gather the candidates of every requested class, in class order.
    all_candidates = []
    for cls in candidate_classes:
        all_candidates += session.query(cls).all()

    gold_dict = get_gold_dict(filename)
    print(f"Loading {len(all_candidates)} candidate labels")

    # (candidate, value) pairs for candidates that still need a label.
    pending = []
    for cand in tqdm(all_candidates):
        heading = cand[0]
        gold_key_tuple = (
            heading.context.sentence.document.name,
            heading.context.get_span(),
        )

        existing = session.query(GoldLabel).filter(GoldLabel.candidate == cand).first()
        if existing is not None:
            continue

        pending.append((cand, TRUE if gold_key_tuple in gold_dict else FALSE))

    # Only insert the labels which were not already present
    rows = [
        {"candidate_id": cand.id, "keys": [annotator_name], "values": [value]}
        for cand, value in pending
    ]
    session.bulk_insert_mappings(GoldLabel, rows)
    session.commit()

    print(f"GoldLabels created: {len(pending)}")