Example #1
def process_gloss_pos_line(inst, word_tag_dict, outfile):
    """
    Process the gloss pos line.

    :param inst:
    :param word_tag_dict:
    """
    # Grab the gloss POS tier...
    gpos_tier = inst.find(alignment=GLOSS_WORD_ID, type=POS_TIER_TYPE)

    # If this tier exists, then let's process it.
    if gpos_tier is not None:

        # Iterate over each gloss POS tag...
        for gpos in gpos_tier:

            # Skip this tag if for some reason it doesn't align with
            # a gloss word.
            if ALIGNMENT not in gpos.attributes or not gpos.alignment:
                EXTRACT_LOG.debug("No alignment found for {} in tier {} igt {}".format(gpos.id, gpos.tier.id, gpos.igt.id))
                continue

            word = gpos.igt.find(id=gpos.alignment).value()
            tag  = gpos.value()

            # Write out the features...
            t = GoldTagPOSToken(word, goldlabel=tag)
            write_gram(t, feat_prev_gram=False, feat_next_gram=False, lowercase=True, output=outfile)
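The word/tag pairing above goes through the tier items' alignment attributes. A minimal stand-alone sketch of the same pattern, using hypothetical plain dicts in place of the tier and alignment objects:

def gloss_word_tags(pos_tags, gloss_words):
    """Pair each POS tag with the gloss word it aligns to, skipping tags
    whose alignment target is missing (plain-dict stand-in for the tier
    and alignment lookups above)."""
    pairs = []
    for tag_id, (aligned_id, tag) in pos_tags.items():
        if aligned_id not in gloss_words:
            continue  # no usable alignment -> skip, as in the tier loop above
        pairs.append((gloss_words[aligned_id], tag))
    return pairs

# "gp2" points at a gloss-word id that does not exist, so it is skipped.
words = {"gw1": "dog", "gw2": "runs"}
tags = {"gp1": ("gw1", "NOUN"), "gp2": ("gw3", "VERB")}
print(gloss_word_tags(tags, words))  # [('dog', 'NOUN')]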
Example #2
def process_dicts(class_path):
    """
    Load the pickled count dictionaries, write out the words that were seen
    often enough as classifier features, and train a classifier from them.

    :param class_path: path that the trained classifier is written to
    """
    # c_path, d_path and m_path are module-level paths to the pickled
    # count dictionaries.
    c = pickle.load(open(c_path, "rb"))
    d = pickle.load(open(d_path, "rb"))
    m = pickle.load(open(m_path, "rb"))

    print(len(c), len(d), len(m))

    # Threshold:
    thresh = 30

    # Now, we want to write out every word that we've seen at least `thresh` times.
    out_path = os.path.join(proj_root, "odin_feats.txt")
    out_f = open(out_path, "w", encoding="utf-8")
    for w in d.keys():

        if d[w].total() < thresh:
            LOG.debug("Skipping {}".format(w))
        else:
            LOG.debug("Testing {}".format(w))
            for tag in d[w].keys():

                LOG.debug("Writing out tag for {}-{}".format(w, tag))
                t = GoldTagPOSToken(w, goldlabel=tag)
                write_gram(t, output=out_f, feat_next_gram=False, feat_prev_gram=False, lowercase=True)
            out_f.flush()

    out_f.close()

    train_txt(out_path, class_path)
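The threshold check above relies on a word -> tag-count structure. A minimal stand-alone sketch of that pattern with a plain dict of collections.Counter objects and made-up counts (Counter.total() requires Python 3.10+):

from collections import Counter

# Hypothetical stand-in for the pickled word -> tag-count dictionary.
d = {
    "dog": Counter({"NOUN": 25, "VERB": 8}),
    "the": Counter({"DET": 120}),
    "zyx": Counter({"NOUN": 2}),
}

thresh = 30
for w, tag_counts in d.items():
    if tag_counts.total() < thresh:      # Counter.total() needs Python 3.10+
        continue                         # seen too rarely, skip it
    for tag in tag_counts:
        print("{}\t{}".format(w, tag))   # one feature line per word/tag pair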
Example #3
def chunk_to_features(chunk, tag_method=None, posdict=None, context_feats=False):
    """
    Method to extract the gloss-line classifier features from a subset of instances. (Useful for parallelizing)

    :param inst:
    :type inst: RGIgt
    :param tag_method:
    :param posdict:
    :param feat_path:
    :param context_feats:
    """
    out_string = StringIO()

    num_instances = 0
    # Look for the GLOSS_POS tier
    for inst in chunk:
        gpos_tier = inst.get_pos_tags(GLOSS_WORD_ID, tag_method=tag_method)
        if gpos_tier:
            num_instances += 1

            # For each token in the tier...
            for i, gp in enumerate(gpos_tier):

                if ALIGNMENT not in gp.attributes:
                    continue

                word = gp.igt.find(id=gp.attributes[ALIGNMENT]).value()
                tag = gp.value()

                prev_word = None
                next_word = None

                if context_feats:
                    if i > 0:
                        prev_word = gp.igt.find(id=gpos_tier[i - 1].attributes[ALIGNMENT]).value()

                    if i < len(gpos_tier) - 1:
                        next_word = gp.igt.find(id=gpos_tier[i + 1].attributes[ALIGNMENT]).value()

                # Write out features...
                t = GoldTagPOSToken(word, goldlabel=tag)
                write_gram(
                    t,
                    feat_prev_gram=context_feats,
                    feat_next_gram=context_feats,
                    prev_gram=prev_word,
                    next_gram=next_word,
                    lowercase=True,
                    output=out_string,
                    posdict=posdict,
                )

    return out_string.getvalue(), num_instances
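The context_feats branch above only needs to guard the previous/next lookups at the sequence boundaries. A minimal stand-alone sketch of that windowing pattern over a plain token list:

tokens = ["the", "dog", "barks"]

for i, word in enumerate(tokens):
    prev_word = tokens[i - 1] if i > 0 else None                 # None at the left edge
    next_word = tokens[i + 1] if i < len(tokens) - 1 else None   # None at the right edge
    print(word, prev_word, next_word)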
Example #4
def write_out_gram_dict(subword_dict, feat_path, feat_list, threshold=1):
    """
    Given the gram+tag dict, write out grams for those that have been seen
    often enough to meet our threshold.

    :param subword_dict: nested gram -> tag counts, with the contexts each pair was seen in
    :type subword_dict: TwoLevelCountDict
    :param feat_path: path to write the feature file to
    :param feat_list: which CLASS_FEATS_* features to enable
    :param threshold: minimum number of occurrences required
    """

    EXTRACT_LOG.log(NORM_LEVEL, 'Writing out svm-light style features to "{}"...'.format(feat_path))
    feat_file = open(feat_path, 'w', encoding='utf-8')

    # Load the posdict if needed...
    needs_posdict = (CLASS_FEATS_DICT in feat_list or
                     CLASS_FEATS_PDICT in feat_list or
                     CLASS_FEATS_NDICT in feat_list)
    pd = load_posdict() if needs_posdict else False

    for subword in subword_dict.keys():
        for tag in subword_dict[subword].keys():
            # Write out the gram with this tag as many times as it appears...
            for prev_word, next_word in subword_dict[subword][tag]['contexts']:
                gt = GoldTagPOSToken(subword, goldlabel=tag)

                # -------------------------------------------
                # Now, vary the features depending on what's in the list
                # -------------------------------------------

                write_gram(gt, lowercase=True,
                           feat_next_gram=CLASS_FEATS_NEXSW in feat_list,
                           feat_prev_gram=CLASS_FEATS_PRESW in feat_list,
                           feat_suffix=CLASS_FEATS_SUF in feat_list,
                           feat_prefix=CLASS_FEATS_PRE in feat_list,
                           feat_has_number=CLASS_FEATS_NUM in feat_list,
                           feat_morph_num=CLASS_FEATS_NUMSW in feat_list,
                           feat_prev_gram_dict=CLASS_FEATS_PDICT in feat_list,
                           feat_next_gram_dict=CLASS_FEATS_NDICT in feat_list,
                           feat_basic=CLASS_FEATS_SW in feat_list,
                           feat_dict=CLASS_FEATS_DICT in feat_list,
                           posdict=pd,
                           next_gram=next_word,
                           prev_gram=prev_word,
                           output=feat_file)

    feat_file.close()
    EXTRACT_LOG.log(NORM_LEVEL, 'Written')
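Each feat_* keyword above is just a membership test against feat_list. A minimal stand-alone sketch of that toggle pattern, with hypothetical feature names standing in for the CLASS_FEATS_* constants:

# Hypothetical feature-name constants (placeholders, not the real CLASS_FEATS_*).
FEAT_PREV = "prev-gram"
FEAT_NEXT = "next-gram"
FEAT_SUFFIX = "suffix"

def build_flags(feat_list):
    """Turn a list of enabled feature names into boolean keyword flags."""
    return {
        "feat_prev_gram": FEAT_PREV in feat_list,
        "feat_next_gram": FEAT_NEXT in feat_list,
        "feat_suffix": FEAT_SUFFIX in feat_list,
    }

print(build_flags([FEAT_PREV, FEAT_SUFFIX]))
# {'feat_prev_gram': True, 'feat_next_gram': False, 'feat_suffix': True}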