def run(self):
    """Compare the gold and prediction files sentence-by-sentence and
    print MWE-based (exact) and token-based (fuzzy) match statistics."""
    self.gold = collections.deque(
        tsvlib.iter_tsv_sentences(self.args.gold_file))
    self.pred = collections.deque(
        tsvlib.iter_tsv_sentences(self.args.prediction_file))
    counter_kwargs = dict(debug=self.args.debug,
                          tractable=not self.args.combinatorial)
    # Two counters over the same sentence stream: exact MWE matches
    # and fuzzy per-token matches.
    exact_counter = MatchCounter("MWE-based", **counter_kwargs)
    fuzzy_counter = MatchCounter("Token-based", **counter_kwargs)

    while self.gold or self.pred:
        # Raises/warns if one file ends before the other.
        self.check_eof()
        gold_sent = self.gold.popleft()
        pred_sent = self.pred.popleft()
        self.filter_categories(gold_sent)
        self.filter_categories(pred_sent)
        self.compare_sentences(gold_sent, pred_sent)
        gold_mwes = self.to_mwes(gold_sent)
        pred_mwes = self.to_mwes(pred_sent)
        if self.args.debug:
            self.print_debug_pairing(gold_sent, pred_sent)
            self.print_debug_mwes("gold", gold_mwes)
            self.print_debug_mwes("pred", pred_mwes)
        exact_counter.increment_mwebased(gold_mwes, pred_mwes)
        fuzzy_counter.increment_tokbased(gold_mwes, pred_mwes)

    if self.args.debug:
        print("DEBUG:", file=sys.stderr)
    self.print_stats(exact_counter)
    self.print_stats(fuzzy_counter)
def convert(filename, targetfilename, rtl, lang_set_file):
    """Convert the TSV corpus `filename` into a FoLiA XML document and
    save it to `targetfilename`.

    `rtl` marks the document as right-to-left; `lang_set_file` is the
    entity-set definition declared for MWE annotations.
    """
    doc = folia.Document(id=os.path.basename(filename.replace('.tsv', '')))
    if rtl:
        doc.metadata['direction'] = 'rtl'
    doc.metadata['status'] = 'untouched'
    # Declare the annotation sets used by the elements appended below.
    doc.declare(folia.Entity, lang_set_file)                # ENTITY-SET definition
    doc.declare(folia.AnnotationType.POS, set=POS_SET_URL)  # POS-SET definition
    body = doc.append(folia.Text)

    with open(filename, 'r', encoding='utf-8') as tsv_input:
        for tsv_sentence in tsvlib.iter_tsv_sentences(tsv_input):
            sentence = folia.Sentence(doc, generate_id_in=body)
            body.append(sentence)
            for tsv_word in tsv_sentence:
                word = folia.Word(doc, text=tsv_word.surface,
                                  space=(not tsv_word.nsp),
                                  generate_id_in=sentence)
                sentence.append(word)
                if tsv_word.pos:
                    word.append(folia.PosAnnotation(
                        doc, cls=tsv_word.pos, annotator="auto",
                        annotatortype=folia.AnnotatorType.AUTO))
            mwe_infos = tsv_sentence.mwe_infos()
            if mwe_infos:
                entity_layer = folia.EntitiesLayer(doc)
                sentence.append(entity_layer)
                for mweid, mweinfo in mwe_infos.items():
                    assert mweinfo.category, "Conversion to FoLiA requires all MWEs to have a category"  # checkme
                    members = [sentence[i] for i in mweinfo.word_indexes]
                    entity_layer.append(folia.Entity, *members,
                                        cls=mweinfo.category,
                                        annotatortype=folia.AnnotatorType.MANUAL)
    doc.save(targetfilename)
def __init__(self, train_file):
    """Index the MWEs of `train_file` (if given), per training field,
    for later seen/unseen and variant/identical lookups."""
    self.train_file = train_file
    if self.train_file is None:
        return  # no training data: leave the instance without indexes
    self.mwe_fieldindex_sets = {}
    self.mwe_field_sets = {}
    self.mwe_spans = {}
    training_sents = list(tsvlib.iter_tsv_sentences(self.train_file))
    for field_name in TRAIN_FIELD_NAMES:
        self.mwe_fieldindex_sets[field_name] = \
            self._calc_mwe_fieldindex_sets(field_name, training_sents)
        self.mwe_field_sets[field_name] = \
            self._calc_mwe_field_sets(field_name)
        # All MWE spans (including gaps) seen in training, per field.
        self.mwe_spans[field_name] = frozenset(
            span
            for sent in training_sents
            for span in sent.iter_mwe_fields_including_span(field_name))
def run(self):
    """Validate the TSV file: check MWE-code syntax and token ranks,
    emitting warnings for every malformed entry."""
    sys.excepthook = tsvlib.excepthook
    for sentence in tsvlib.iter_tsv_sentences(self.args.tsv_file):
        for expected_rank, token in enumerate(sentence.words, 1):
            # Every MWE code must parse as "<id>" or "<id>:<category>".
            for mwe_code in token.mwe_codes:
                try:
                    tsvlib.mwe_code_to_id_categ(mwe_code)
                except ValueError:
                    self.warn('MWE codes must look like "3:LVC" or "3"\n'
                              'The MWE code {bad!r} is not well-formed',
                              bad=mwe_code, warntype="ERROR")
            # Token IDs must be consecutive ranks starting at 1.
            if token.token_id != str(expected_rank):
                self.warn('Token has rank "{rank}", expected rank "{exp}"',
                          rank=token.token_id, exp=expected_rank,
                          warntype="ERROR")
    if not self.warned:
        print('INFO: The file format looks fine!', file=sys.stderr)
def tsv_(self):
    """Parse the corpus file and return its sentences as a list.

    Returns:
        list: the tsvlib sentence objects parsed from ``self.corpus``.
    """
    # Open read-only: the original "r+" mode needlessly required write
    # permission (failing on read-only corpora) although nothing is
    # written.  Decode explicitly as UTF-8, consistent with how the
    # rest of this code base opens TSV files.
    with open(self.corpus, "r", encoding="utf-8") as corpus_file:
        return list(tsvlib.iter_tsv_sentences(corpus_file))
def run(self):
    """Evaluate prediction against gold, printing a global score plus
    partitioned scores (per category, continuity, token count, and —
    when a training file is given — seen/unseen and variant/identical).
    """
    if self.args.debug:
        print("DEBUG: LEGEND: {} {} {} {}".format(
            GOLDPRED_FMT[(False, False)].format('normal-text'),
            GOLDPRED_FMT[(True, False)].format('gold-only'),
            GOLDPRED_FMT[(False, True)].format('pred-only'),
            GOLDPRED_FMT[(True, True)].format('gold-pred-matched')))
        print("DEBUG:")
    mc_args = dict(debug=self.args.debug, tractable=not self.args.combinatorial)
    # Sentences are consumed in lockstep from both deques below.
    self.gold = collections.deque(
        tsvlib.iter_tsv_sentences(self.args.gold_file))
    self.pred = collections.deque(
        tsvlib.iter_tsv_sentences(self.args.prediction_file))
    # Index of MWEs seen in training (may wrap a train_file of None).
    seen = SeenInfo(self.args.train_file)
    base_stats = Statistics(mc_args)
    # Each partition gets its own lazily-created Statistics bucket.
    categ2stats = collections.defaultdict(lambda: Statistics(mc_args))
    continuity2stats = collections.defaultdict(lambda: Statistics(mc_args))
    multitokenness2stats = collections.defaultdict(
        lambda: Statistics(mc_args))
    field_whetherseen2stats = collections.defaultdict(
        lambda: Statistics(mc_args))  # dict[(field, bool)] -> stats
    field_variantness2stats = collections.defaultdict(
        lambda: Statistics(mc_args))  # dict[(field, bool)] -> stats
    while self.gold or self.pred:
        # Warns/fails if one file runs out of sentences before the other.
        self.check_eof()
        sent_gold = self.gold.popleft()
        sent_pred = self.pred.popleft()
        sent_gold.absorb_mwes_from_contraction_ranges()
        sent_pred.absorb_mwes_from_contraction_ranges()
        if self.args.debug:
            self.print_debug_pairing(sent_gold, sent_pred)
        self.compare_sentences(sent_gold, sent_pred)
        # Union of categories from both sides, so gold-only and
        # pred-only categories are both scored.
        categories = self.mwe_categs(sent_gold) | self.mwe_categs(
            sent_pred)
        mweinfos_gold = sent_gold.mwe_infos().values()
        mweinfos_pred = sent_pred.mwe_infos().values()
        self.add_to_stats(sent_gold, base_stats, mweinfos_gold,
                          mweinfos_pred, debug_header="Global:")
        # Partition 1: by MWE category.
        for category in list(sorted(categories, key=str)):
            g = self.mweinfos_per_categ(mweinfos_gold, category)
            p = self.mweinfos_per_categ(mweinfos_pred, category)
            self.add_to_stats(sent_gold, categ2stats[category], g, p,
                              debug_header="Category {}:".format(
                                  category or UNLABELED))
        # Partition 2: continuous vs discontinuous MWEs.
        for continuity in [True, False]:
            g = self.mweinfo_per_continuity(mweinfos_gold, continuity)
            p = self.mweinfo_per_continuity(mweinfos_pred, continuity)
            self.add_to_stats(sent_gold, continuity2stats[continuity], g, p,
                              debug_header="Continuous:"
                              if continuity else "Discontinuous:")
        # Partition 3: multi-token vs single-token MWEs.
        for multitokenness in [True, False]:
            g = self.mweinfo_per_multitokenness(mweinfos_gold,
                                                multitokenness)
            p = self.mweinfo_per_multitokenness(mweinfos_pred,
                                                multitokenness)
            self.add_to_stats(sent_gold,
                              multitokenness2stats[multitokenness], g, p,
                              debug_header="{}-token:".format(
                                  "Multi" if multitokenness else "Single"))
        if self.args.train_file:
            # Partition 4: seen vs unseen in training (by LEMMA).
            for whetherseen in [True, False]:
                g = seen.mweinfo_per_whetherseen(mweinfos_gold, "LEMMA",
                                                 whetherseen)
                p = seen.mweinfo_per_whetherseen(mweinfos_pred, "LEMMA",
                                                 whetherseen)
                self.add_to_stats(sent_gold,
                                  field_whetherseen2stats[("LEMMA",
                                                           whetherseen)],
                                  g, p,
                                  debug_header="{}-in-train:".format(
                                      "Seen" if whetherseen else "Unseen"))
            # Partition 5: variant-of-train vs identical-to-train.
            for variantness in [True, False]:
                # We interpret variantness==False as "MWEs that were seen and are identical"
                g = seen.mweinfo_per_variantness(mweinfos_gold, "LEMMA",
                                                 "FORM", variantness)
                p = seen.mweinfo_per_variantness(mweinfos_pred, "LEMMA",
                                                 "FORM", variantness)
                self.add_to_stats(
                    sent_gold,
                    field_variantness2stats[("LEMMA", "FORM", variantness)],
                    g, p,
                    debug_header="{}-train:".format(
                        "Variant-of" if variantness else "Identical-to"))
    if self.args.debug:
        print("DEBUG:")
    #------------------------------------------
    # Reporting: global first, then each partition of the global score.
    print("## Global evaluation")
    base_stats.print_stats(prefix='')
    print()
    print("## Per-category evaluation (partition of Global)")
    for category in sorted(categ2stats, key=str):
        prefix = '{}: '.format(category or UNLABELED)
        categ2stats[category].print_mwebased_proportion(
            prefix, baseline=base_stats)
        categ2stats[category].print_stats(prefix)
    print()
    print("## MWE continuity (partition of Global)")
    for continuity in [True, False]:
        prefix = "Continuous: " if continuity else "Discontinuous: "
        continuity2stats[continuity].print_mwebased_proportion(
            prefix, baseline=base_stats)
        continuity2stats[continuity].c_mwebased.print_p_r_f(prefix)
    print()
    print("## Number of tokens (partition of Global)")
    for multitokenness in [True, False]:
        prefix = "{}-token: ".format(
            "Multi" if multitokenness else "Single")
        multitokenness2stats[multitokenness].print_mwebased_proportion(
            prefix, baseline=base_stats)
        multitokenness2stats[multitokenness].c_mwebased.print_p_r_f(prefix)
    print()
    if self.args.train_file:
        if not seen.mwe_fieldindex_sets["LEMMA"]:
            # Seen/unseen partitions are meaningless without train MWEs.
            tsvlib.warn(
                "found no MWEs in training file (in field={field_name})",
                field_name="LEMMA", position='')
        else:
            print("## Whether seen in train (partition of Global)")
            for whetherseen in [True, False]:
                prefix = "{}-in-train: ".format(
                    "Seen" if whetherseen else "Unseen")
                field_whetherseen2stats[("LEMMA", whetherseen)] \
                    .print_mwebased_proportion(prefix, baseline=base_stats)
                field_whetherseen2stats[(
                    "LEMMA", whetherseen)].c_mwebased.print_p_r_f(prefix)
            print()
            print(
                "## Whether identical to train (partition of Seen-in-train)"
            )
            for variantness in [True, False]:
                prefix = "{}-train: ".format(
                    "Variant-of" if variantness else "Identical-to")
                # NOTE: proportions here are relative to the Seen-in-train
                # bucket, not to the global baseline.
                field_variantness2stats[("LEMMA", "FORM", variantness)] \
                    .print_mwebased_proportion(prefix,
                        baseline=field_whetherseen2stats[("LEMMA", True)])
                field_variantness2stats[(
                    "LEMMA", "FORM",
                    variantness)].c_mwebased.print_p_r_f(prefix)
            print()
import argparse import os import sys sys.path.append('.') import tsvlib parser = argparse.ArgumentParser( description='Simple usage example of the tsvlib library') parser.add_argument("--input", type=argparse.FileType('r'), required=True, help="""Path to input file (in .cupt format)""") args = parser.parse_args() with args.input as f: sentences = list(tsvlib.iter_tsv_sentences(f)) for sentence in sentences: print("\n-------------------------------") print("NEW SENTENCE") forms = " ".join(token['FORM'] for token in sentence.words) print("Text:", forms) first = sentence.words[0] first_LEMMA = first['LEMMA'] first_UPOS = first.get( 'UPOS', '??') # UPOS not necessarily defined for every token... first_FEATS = first.get( 'FEATS', '??') # FEATS not necessarily defined for every token... first_HEAD = int(first.get('HEAD', 0)) parent = sentence.words[first_HEAD -