예제 #1
0
    def run(self):
        """Read the gold and prediction files, compare them sentence by
        sentence, and print MWE-based and token-based match statistics."""
        self.gold = collections.deque(
            tsvlib.iter_tsv_sentences(self.args.gold_file))
        self.pred = collections.deque(
            tsvlib.iter_tsv_sentences(self.args.prediction_file))
        counter_kwargs = dict(debug=self.args.debug,
                              tractable=not self.args.combinatorial)
        # Exact-match counter (whole MWEs) and fuzzy counter (tokens).
        exact_counter = MatchCounter("MWE-based", **counter_kwargs)
        fuzzy_counter = MatchCounter("Token-based", **counter_kwargs)

        while self.gold or self.pred:
            self.check_eof()
            gold_sent = self.gold.popleft()
            pred_sent = self.pred.popleft()
            self.filter_categories(gold_sent)
            self.filter_categories(pred_sent)
            self.compare_sentences(gold_sent, pred_sent)
            gold_mwes = self.to_mwes(gold_sent)
            pred_mwes = self.to_mwes(pred_sent)
            if self.args.debug:
                self.print_debug_pairing(gold_sent, pred_sent)
                self.print_debug_mwes("gold", gold_mwes)
                self.print_debug_mwes("pred", pred_mwes)
            exact_counter.increment_mwebased(gold_mwes, pred_mwes)
            fuzzy_counter.increment_tokbased(gold_mwes, pred_mwes)

            if self.args.debug:
                print("DEBUG:", file=sys.stderr)

        self.print_stats(exact_counter)
        self.print_stats(fuzzy_counter)
예제 #2
0
def convert(filename, targetfilename, rtl, lang_set_file):
    """Convert a .tsv corpus file into a FoLiA XML document and save it
    at `targetfilename`.  `rtl` marks the text direction as right-to-left;
    `lang_set_file` is the entity-set definition used for MWE annotations."""
    doc_id = os.path.basename(filename.replace('.tsv', ''))
    doc = folia.Document(id=doc_id)
    if rtl:
        doc.metadata['direction'] = 'rtl'
    doc.metadata['status'] = 'untouched'
    # Declare the annotation sets used below: MWE entities and POS tags.
    doc.declare(folia.Entity, lang_set_file)
    doc.declare(folia.AnnotationType.POS, set=POS_SET_URL)
    text = doc.append(folia.Text)

    with open(filename, 'r', encoding='utf-8') as f:
        for tsv_sentence in tsvlib.iter_tsv_sentences(f):
            folia_sentence = folia.Sentence(doc, generate_id_in=text)
            text.append(folia_sentence)

            for tsv_word in tsv_sentence:
                word = folia.Word(doc,
                                  text=tsv_word.surface,
                                  space=(not tsv_word.nsp),
                                  generate_id_in=folia_sentence)
                folia_sentence.append(word)
                if tsv_word.pos:
                    word.append(folia.PosAnnotation(
                        doc, cls=tsv_word.pos, annotator="auto",
                        annotatortype=folia.AnnotatorType.AUTO))

            mwe_infos = tsv_sentence.mwe_infos()
            if mwe_infos:
                entities = folia.EntitiesLayer(doc)
                folia_sentence.append(entities)
                for mweid, mweinfo in mwe_infos.items():
                    assert mweinfo.category, "Conversion to FoLiA requires all MWEs to have a category"  # checkme
                    members = [folia_sentence[i] for i in mweinfo.word_indexes]
                    entities.append(folia.Entity, *members,
                                    cls=mweinfo.category,
                                    annotatortype=folia.AnnotatorType.MANUAL)

    doc.save(targetfilename)
예제 #3
0
 def __init__(self, train_file):
     """Index the MWEs found in `train_file` (if given) under each field
     name in TRAIN_FIELD_NAMES; with no train file, no indexes are built."""
     self.train_file = train_file
     if self.train_file is None:
         return
     self.mwe_fieldindex_sets = {}
     self.mwe_field_sets = {}
     self.mwe_spans = {}
     sents = list(tsvlib.iter_tsv_sentences(self.train_file))
     for field_name in TRAIN_FIELD_NAMES:
         self.mwe_fieldindex_sets[field_name] = \
             self._calc_mwe_fieldindex_sets(field_name, sents)
         self.mwe_field_sets[field_name] = \
             self._calc_mwe_field_sets(field_name)
         self.mwe_spans[field_name] = frozenset(
             span
             for sent in sents
             for span in sent.iter_mwe_fields_including_span(field_name))
예제 #4
0
    def run(self):
        """Validate the input TSV file: check that every MWE code is
        well-formed and that token ranks are consecutive from 1."""
        sys.excepthook = tsvlib.excepthook
        for sentence in tsvlib.iter_tsv_sentences(self.args.tsv_file):
            for expected_rank, token in enumerate(sentence.words, start=1):
                for mwe_code in token.mwe_codes:
                    try:
                        tsvlib.mwe_code_to_id_categ(mwe_code)
                    except ValueError:
                        self.warn('MWE codes must look like "3:LVC" or "3"\n'
                                  'The MWE code {bad!r} is not well-formed',
                                  bad=mwe_code, warntype="ERROR")

                if token.token_id != str(expected_rank):
                    self.warn('Token has rank "{rank}", expected rank "{exp}"',
                              rank=token.token_id,
                              exp=expected_rank,
                              warntype="ERROR")

        if not self.warned:
            print('INFO: The file format looks fine!', file=sys.stderr)
예제 #5
0
 def tsv_(self):
     """Parse ``self.corpus`` and return its sentences as a list.

     The file is only read, so it is opened read-only: the previous
     ``"r+"`` mode demanded write permission for no reason and failed
     on read-only corpora.
     """
     with open(self.corpus, "r") as corpus_file:
         return list(tsvlib.iter_tsv_sentences(corpus_file))
예제 #6
0
    def run(self):
        """Compare gold vs. predicted MWEs sentence by sentence and print
        global statistics, followed by per-category, continuity,
        token-count and (when a train file is given) seen-in-train and
        variant-of-train breakdowns."""
        if self.args.debug:
            print("DEBUG:  LEGEND:  {} {} {} {}".format(
                GOLDPRED_FMT[(False, False)].format('normal-text'),
                GOLDPRED_FMT[(True, False)].format('gold-only'),
                GOLDPRED_FMT[(False, True)].format('pred-only'),
                GOLDPRED_FMT[(True, True)].format('gold-pred-matched')))
            print("DEBUG:")

        # Shared constructor arguments for every Statistics bucket below.
        mc_args = dict(debug=self.args.debug,
                       tractable=not self.args.combinatorial)
        self.gold = collections.deque(
            tsvlib.iter_tsv_sentences(self.args.gold_file))
        self.pred = collections.deque(
            tsvlib.iter_tsv_sentences(self.args.prediction_file))
        # Indexes MWEs from the training file, used by the seen/variant
        # slices; presumably a no-op when train_file is None — the
        # seen-based slices are only entered under `if self.args.train_file`.
        seen = SeenInfo(self.args.train_file)

        # One global bucket plus one defaultdict of buckets per slice.
        base_stats = Statistics(mc_args)
        categ2stats = collections.defaultdict(lambda: Statistics(mc_args))
        continuity2stats = collections.defaultdict(lambda: Statistics(mc_args))
        multitokenness2stats = collections.defaultdict(
            lambda: Statistics(mc_args))
        field_whetherseen2stats = collections.defaultdict(
            lambda: Statistics(mc_args))  # dict[(field, bool)] -> stats
        field_variantness2stats = collections.defaultdict(
            lambda: Statistics(mc_args))  # dict[(field, bool)] -> stats

        # Consume gold and prediction sentences in lockstep; check_eof
        # handles one deque running out before the other.
        while self.gold or self.pred:
            self.check_eof()
            sent_gold = self.gold.popleft()
            sent_pred = self.pred.popleft()
            sent_gold.absorb_mwes_from_contraction_ranges()
            sent_pred.absorb_mwes_from_contraction_ranges()
            if self.args.debug:
                self.print_debug_pairing(sent_gold, sent_pred)
            self.compare_sentences(sent_gold, sent_pred)
            # All categories present in either file for this sentence pair.
            categories = self.mwe_categs(sent_gold) | self.mwe_categs(
                sent_pred)
            mweinfos_gold = sent_gold.mwe_infos().values()
            mweinfos_pred = sent_pred.mwe_infos().values()

            # Global (unsliced) statistics.
            self.add_to_stats(sent_gold,
                              base_stats,
                              mweinfos_gold,
                              mweinfos_pred,
                              debug_header="Global:")

            # Per-category slice (sorted for deterministic debug output).
            for category in list(sorted(categories, key=str)):
                g = self.mweinfos_per_categ(mweinfos_gold, category)
                p = self.mweinfos_per_categ(mweinfos_pred, category)
                self.add_to_stats(sent_gold,
                                  categ2stats[category],
                                  g,
                                  p,
                                  debug_header="Category {}:".format(
                                      category or UNLABELED))

            # Continuous vs. discontinuous MWEs.
            for continuity in [True, False]:
                g = self.mweinfo_per_continuity(mweinfos_gold, continuity)
                p = self.mweinfo_per_continuity(mweinfos_pred, continuity)
                self.add_to_stats(sent_gold,
                                  continuity2stats[continuity],
                                  g,
                                  p,
                                  debug_header="Continuous:"
                                  if continuity else "Discontinuous:")

            # Multi-token vs. single-token MWEs.
            for multitokenness in [True, False]:
                g = self.mweinfo_per_multitokenness(mweinfos_gold,
                                                    multitokenness)
                p = self.mweinfo_per_multitokenness(mweinfos_pred,
                                                    multitokenness)
                self.add_to_stats(sent_gold,
                                  multitokenness2stats[multitokenness],
                                  g,
                                  p,
                                  debug_header="{}-token:".format(
                                      "Multi" if multitokenness else "Single"))

            # Train-file-dependent slices: seen-in-train (by LEMMA) and
            # variant-of-train (LEMMA seen but FORM differing).
            if self.args.train_file:
                for whetherseen in [True, False]:
                    g = seen.mweinfo_per_whetherseen(mweinfos_gold, "LEMMA",
                                                     whetherseen)
                    p = seen.mweinfo_per_whetherseen(mweinfos_pred, "LEMMA",
                                                     whetherseen)
                    self.add_to_stats(sent_gold,
                                      field_whetherseen2stats[("LEMMA",
                                                               whetherseen)],
                                      g,
                                      p,
                                      debug_header="{}-in-train:".format(
                                          "Seen" if whetherseen else "Unseen"))

                for variantness in [True, False]:
                    # We interpret variantness==False as "MWEs that were seen and are identical"
                    g = seen.mweinfo_per_variantness(mweinfos_gold, "LEMMA",
                                                     "FORM", variantness)
                    p = seen.mweinfo_per_variantness(mweinfos_pred, "LEMMA",
                                                     "FORM", variantness)
                    self.add_to_stats(
                        sent_gold,
                        field_variantness2stats[("LEMMA", "FORM",
                                                 variantness)],
                        g,
                        p,
                        debug_header="{}-train:".format(
                            "Variant-of" if variantness else "Identical-to"))

            if self.args.debug:
                print("DEBUG:")

        #------------------------------------------
        # Reporting: global first, then each slice as a partition of Global.
        print("## Global evaluation")
        base_stats.print_stats(prefix='')
        print()

        print("## Per-category evaluation (partition of Global)")
        for category in sorted(categ2stats, key=str):
            prefix = '{}: '.format(category or UNLABELED)
            categ2stats[category].print_mwebased_proportion(
                prefix, baseline=base_stats)
            categ2stats[category].print_stats(prefix)
        print()

        print("## MWE continuity (partition of Global)")
        for continuity in [True, False]:
            prefix = "Continuous: " if continuity else "Discontinuous: "
            continuity2stats[continuity].print_mwebased_proportion(
                prefix, baseline=base_stats)
            continuity2stats[continuity].c_mwebased.print_p_r_f(prefix)
        print()

        print("## Number of tokens (partition of Global)")
        for multitokenness in [True, False]:
            prefix = "{}-token: ".format(
                "Multi" if multitokenness else "Single")
            multitokenness2stats[multitokenness].print_mwebased_proportion(
                prefix, baseline=base_stats)
            multitokenness2stats[multitokenness].c_mwebased.print_p_r_f(prefix)
        print()

        if self.args.train_file:
            # Without any MWEs in the train file the seen/variant slices
            # are meaningless, so warn instead of printing them.
            if not seen.mwe_fieldindex_sets["LEMMA"]:
                tsvlib.warn(
                    "found no MWEs in training file (in field={field_name})",
                    field_name="LEMMA",
                    position='')

            else:
                print("## Whether seen in train (partition of Global)")
                for whetherseen in [True, False]:
                    prefix = "{}-in-train: ".format(
                        "Seen" if whetherseen else "Unseen")
                    field_whetherseen2stats[("LEMMA", whetherseen)] \
                        .print_mwebased_proportion(prefix, baseline=base_stats)
                    field_whetherseen2stats[(
                        "LEMMA", whetherseen)].c_mwebased.print_p_r_f(prefix)
                print()

                print(
                    "## Whether identical to train (partition of Seen-in-train)"
                )
                # Note: the baseline here is Seen-in-train, not Global.
                for variantness in [True, False]:
                    prefix = "{}-train: ".format(
                        "Variant-of" if variantness else "Identical-to")
                    field_variantness2stats[("LEMMA", "FORM", variantness)] \
                        .print_mwebased_proportion(prefix, baseline=field_whetherseen2stats[("LEMMA", True)])
                    field_variantness2stats[(
                        "LEMMA", "FORM",
                        variantness)].c_mwebased.print_p_r_f(prefix)
                print()
import argparse
import os
import sys

sys.path.append('.')
import tsvlib

# Command-line interface: a single required --input path, opened for
# reading by argparse itself (FileType('r')).
parser = argparse.ArgumentParser(
    description='Simple usage example of the tsvlib library')
parser.add_argument("--input",
                    type=argparse.FileType('r'),
                    required=True,
                    help="""Path to input file (in .cupt format)""")
args = parser.parse_args()
with args.input as f:
    sentences = list(tsvlib.iter_tsv_sentences(f))

    for sentence in sentences:
        print("\n-------------------------------")
        print("NEW SENTENCE")
        forms = " ".join(token['FORM'] for token in sentence.words)
        print("Text:", forms)

        first = sentence.words[0]
        first_LEMMA = first['LEMMA']
        first_UPOS = first.get(
            'UPOS', '??')  # UPOS not necessarily defined for every token...
        first_FEATS = first.get(
            'FEATS', '??')  # FEATS not necessarily defined for every token...
        first_HEAD = int(first.get('HEAD', 0))
        parent = sentence.words[first_HEAD -