def test_splits(lexan, uohd_refs):
    # Check if s is in splits
    def _in_splits(s, splits):
        return s in [list(map(str, ss)) for ss in splits]

    f = uohd_refs[0]
    s = uohd_refs[1]
    i = SanskritObject(f, encoding=SLP1)
    try:
        # for sss in s:
        #     if not lexan.forms.valid(sss):
        #         return "Skip"
        graph = lexan.getSandhiSplits(i)
        if graph is None:
            logger.error("FAIL: Empty split for {}".format(
                i.canonical().encode('utf-8')))
            return False
        # Reducing max_paths to 100
        splits = graph.findAllPaths(max_paths=100, sort=False)
        r = _in_splits(s, splits)
        if splits is None or (not r):
            logger.error("FAIL: {} not in {}".format(s, splits))
        return r
    except:  # noqa
        logger.warning("Split Exception: {}".format(
            i.canonical().encode('utf-8')))
        return "Error"
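
# A minimal sketch of driving test_splits by hand, assuming the repo's
# LexicalSandhiAnalyzer as the `lexan` fixture (import path per the current
# sanskrit_parser layout). The reference pair below is an illustrative
# example, not an entry from the UoHD reference data.
from sanskrit_parser.parser.sandhi_analyzer import LexicalSandhiAnalyzer

lexan = LexicalSandhiAnalyzer()
uohd_refs = ("tasminvane", ["tasmin", "vane"])  # (surface form, expected split)
print(test_splits(lexan, uohd_refs))  # True if the expected split is found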
def main():
    args = getArgs()
    print("Input Dhatu:", args.dhatu)
    if args.debug:
        logging.basicConfig(filename='DhatuWrapper.log', filemode='w',
                            level=logging.DEBUG)
    else:
        logging.basicConfig(filename='DhatuWrapper.log', filemode='w',
                            level=logging.INFO)
    logger = logging.getLogger(__name__)
    if args.input_encoding is None:
        ie = None
    else:
        ie = SCHEMES[args.input_encoding]
    i = SanskritObject(args.dhatu, encoding=ie)
    it = i.canonical()
    print("Input String in SLP1:", it)
    logger.info("Input String in SLP1: {}".format(it))
    w = DhatuWrapper(logger=logger)
    if args.tags == "all":
        res = w._get_dhatus(it)
    else:
        # Use a list comprehension rather than map(): under Python 3,
        # print(res) on a map object would show only its repr
        res = [x[args.tags] for x in w._get_dhatus(it)]
    print(res)
    print("Is {} sakarmaka?: {}".format(it, w.is_sakarmaka(it)))
    logger.info("Reported {}".format(res))
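
# getArgs() is not shown in this excerpt; the sketch below is a plausible
# reconstruction from the attributes main() reads (dhatu, debug,
# input_encoding, tags). Flag names and defaults are assumptions, not the
# repo's confirmed CLI.
import argparse

def getArgs():
    parser = argparse.ArgumentParser(description="DhatuWrapper CLI")
    parser.add_argument('dhatu', type=str, help="dhatu to look up")
    parser.add_argument('--input-encoding', type=str, default=None,
                        help="input encoding (default: auto-detect)")
    parser.add_argument('--tags', type=str, default="all",
                        help="tag field to extract, or 'all'")
    parser.add_argument('--debug', action='store_true')
    return parser.parse_args()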
def main(count=None, start=0):
    # Collect inputs
    with open(input_file) as fp:
        inputs = fp.readlines()
    num_inputs = len(inputs)
    stop = start + count if count else None
    inputs = itertools.islice(inputs, start, stop)
    max_value = count or num_inputs - start
    # Write the reference outputs, sliced from `start` so that the
    # references stay aligned with the selected inputs
    with open(ref_file) as fp, open(ref_output_file, "w") as out:
        refs = fp.readlines()
        refs = itertools.islice(refs, start, stop)
        out.write("".join(refs))
    # Create the objects for scoring
    inria = LexicalSplitMetrics("inria", 10)
    combined = LexicalSplitMetrics("combined", 10)
    bar = progressbar.ProgressBar(max_value=max_value)
    with outputctx(strict_io=False):
        for line in bar(inputs):
            s = SanskritObject(line.strip(), encoding=SLP1,
                               replace_ending_visarga=None)
            logger.debug("Input in SLP1 = %s", s.canonical())
            # Compute splits
            inria.update(s)
            combined.update(s)
        print("{:20s} | {:30s} | {:5s}".format("Name", "BLEU", "CHRF"))
        print("-" * 70)
        inria.print_metrics()
        combined.print_metrics()
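
# A hypothetical invocation of the benchmark above; input_file, ref_file
# and ref_output_file are module-level paths this script is assumed to
# configure elsewhere, and the slice arguments mirror main()'s signature.
if __name__ == "__main__":
    main(count=100, start=0)  # score the first 100 sentences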
def get(self, v): """ Parse a presegmented sentence """ strict_p = True if request.args.get("strict") == "false": strict_p = False vobj = SanskritObject(v, strict_io=strict_p, replace_ending_visarga=None) parser = Parser(input_encoding="SLP1", output_encoding="Devanagari", replace_ending_visarga='s') mres = [] print(v) for split in parser.split(vobj.canonical(), limit=10, pre_segmented=True): parses = list(split.parse(limit=10)) sdot = split.to_dot() mres = [x.serializable() for x in parses] pdots = [x.to_dot() for x in parses] r = { "input": v, "devanagari": vobj.devanagari(), "analysis": mres, "split_dot": sdot, "parse_dots": pdots } return r
def get(self, v): """ Presegmented Split """ vobj = SanskritObject(v, strict_io=True, replace_ending_visarga=None) parser = Parser(input_encoding="SLP1", output_encoding="Devanagari", replace_ending_visarga='s') splits = parser.split(vobj.canonical(), limit=10, pre_segmented=True) r = { "input": v, "devanagari": vobj.devanagari(), "splits": [x.serializable()['split'] for x in splits] } return r
def process(sentences, tag_mapper, ws):
    inria_metrics = WordLevelMetrics("inria")
    sdata_metrics = WordLevelMetrics("sanskrit_data")
    comb_metrics = WordLevelMetrics("combined")
    stats = {'inria': (0, 0, 0), 'sdata': (0, 0, 0), 'combo': (0, 0, 0)}
    missing = []
    for sent in sentences:
        if sent is None:
            continue
        text_obj = SanskritObject(sent.text, encoding=IAST, strict_io=False)
        words = text_obj.canonical().strip().split(" ")
        # Skip sentences whose word count does not match the DCS analysis
        if len(words) != len(sent.dcsAnalysisDecomposition):
            continue
        for w, analysis in zip(words, sent.dcsAnalysisDecomposition):
            # Only consider single-word analyses that carry a grammar hint
            if len(analysis) != 1:
                continue
            word_analysis = analysis[0]
            if word_analysis.dcsGrammarHint == []:
                continue
            word_slp = SanskritObject(w, encoding=IAST,
                                      strict_io=False).canonical()
            tags = tag_mapper(word_analysis.dcsGrammarHint)
            root = SanskritObject(word_analysis.root, encoding=IAST,
                                  strict_io=False).canonical()
            i_valid = inria_metrics.update(word_slp, root, tags)
            s_valid = sdata_metrics.update(word_slp, root, tags)
            comb_metrics.update(word_slp, root, tags)
            # Record words missed by at least one lexicon, with flags for
            # which lexicon(s) failed
            if not i_valid or not s_valid:
                missing.append([
                    word_slp, word_analysis.root,
                    word_analysis.dcsGrammarHint,
                    i_valid, s_valid,
                    not i_valid and not s_valid,
                    i_valid and not s_valid,
                    not i_valid and s_valid
                ])
    stats['inria'] = inria_metrics.metrics()
    stats['sdata'] = sdata_metrics.metrics()
    stats['combo'] = comb_metrics.metrics()
    stats['missing'] = missing
    return stats
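
# A hedged sketch of consuming process(); the sentence iterable would come
# from the DCS corpus loader this script pairs with, so `sentences` and
# `tag_mapper` here are placeholders. The metrics triples are printed as-is,
# since their exact fields mirror the (0, 0, 0) initializer above and are
# otherwise an assumption.
stats = process(sentences, tag_mapper, ws=None)
for name in ('inria', 'sdata', 'combo'):
    print(name, stats[name])
print("words missed by at least one lexicon:", len(stats['missing']))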