def main():
        """Command-line driver for the morphological analyzer.

        Reads the options from ``getArgs()``, builds the sandhi-split graph
        for the input string with tagging enabled, then prints every lexical
        split together with the morphologies ``constrainPath`` accepts for
        it, followed by timing figures.  Publishes ``args.need_lakara``
        through the module-level ``need_lakara`` global.
        """
        global need_lakara
        import time

        args = getArgs()
        print("Input String:", args.data)
        need_lakara = args.need_lakara

        # A log file is only configured when debugging was requested.
        if args.debug:
            logging.basicConfig(filename='SanskritMorphologicalAnalyzer.log',
                                filemode='w',
                                level=logging.DEBUG)

        analyzer = SanskritMorphologicalAnalyzer(args.lexical_lookup)
        encoding = (None if args.input_encoding is None
                    else SanskritBase.SCHEMES[args.input_encoding])
        sentence = SanskritBase.SanskritObject(args.data,
                                               encoding=encoding,
                                               strict_io=args.strict_io,
                                               replace_ending_visarga=None)
        print("Input String in SLP1:", sentence.canonical())

        print("Start Split")
        split_began = time.time()
        graph = analyzer.getSandhiSplits(sentence, tag=True)
        split_ended = time.time()
        print("End DAG generation")

        with SanskritBase.outputctx(args.strict_io):
            # Nothing to analyse when no split graph could be built.
            if not graph:
                print("No Valid Splits Found")
                return

            path_began = time.time()
            splits = graph.findAllPaths(max_paths=args.max_paths)
            path_ended = time.time()
            print("End pathfinding")

            print("Splits:")
            for split in splits:
                print("Lexical Split:", split)
                morphologies = analyzer.constrainPath(split)
                if not morphologies:
                    print("No valid morphologies for this split")
                    continue
                print("Valid Morphologies")
                for solution in morphologies:
                    # Each solution is indexed by the string form of a word.
                    print([(word, solution[str(word)]) for word in split])
            print("End Morphological Analysis")

            split_elapsed = split_ended - split_began
            path_elapsed = path_ended - path_began
            print("-----------")
            print("Performance")
            print("Time taken for split: {0:0.6f}s".format(split_elapsed))
            print("Time taken for path: {0:0.6f}s".format(path_elapsed))
    def _possible_splits(self, s):
        ''' private method to recursively enumerate the sandhi splits of s

            Used by getSandhiSplits.
            Every split whose left part is a valid word is recorded in the
            graph self.splits; results are memoized per substring in
            self.dynamic_scoreboard.
            Params:
              s(string): Input SLP1 encoded string
            Returns:
              roots : set of root nodes of the subgraph describing the
                      possible splits of s (empty set when there are none)
        '''
        logger.debug("Splitting " + s)

        # Dynamic programming: a substring seen before is answered from the
        # scoreboard instead of being split again.
        if s in self.dynamic_scoreboard:
            logger.debug("Found {} in scoreboard".format(s))
            return self.dynamic_scoreboard[s]

        @lru_cache(256)
        def _is_valid_word(ss):
            return self.forms.valid(ss)

        roots = set()
        # One node object per distinct left word, so the same word seen with
        # both a null and a non-null right side maps onto a single node.
        node_cache = {}

        def _root_node(word):
            # Fetch (or create) the node for word, register it as a root and
            # make sure the graph knows about it.
            if word not in node_cache:
                node_cache[word] = SanskritBase.SanskritObject(
                    word, encoding=SanskritBase.SLP1)
            node = node_cache[word]
            roots.add(node)
            if not self.splits.has_node(node):
                self.splits.add_node(node)
            return node

        # When the string contains a space, splitting stops at that space.
        space_at = s.find(" ")
        stop = space_at if space_at != -1 else None

        candidates = self.sandhi.split_all(
            SanskritBase.SanskritImmutableString(
                s, encoding=SanskritBase.SLP1),
            0, stop)
        logger.debug("s_c_list: " + str(candidates))
        if candidates is None:
            candidates = []

        for (left, right) in candidates:
            if not _is_valid_word(left):
                logger.debug("Invalid left word: " + left)
                continue
            logger.debug("Valid left word: " + left)

            if not right:
                # Null right part: the left word ends a path.
                node = _root_node(left)
                self.splits.add_end_edge(node)
                continue

            # The left word only counts if the remainder can be split too.
            logger.debug("Trying to split:" + right)
            right_roots = self._possible_splits(right.strip())
            if right_roots:
                # Make sure we got a set of roots back
                assert isinstance(right_roots, set)
                node = _root_node(left)
                self.splits.append_to_node(node, right_roots)

        # Remember the answer so this substring is never split again.
        self.dynamic_scoreboard[s] = roots
        if roots:
            logger.debug("Roots: %s", roots)
        else:
            logger.debug("No splits found, returning empty set")
        return roots
Пример #3
0
    def main():
        """Command-line driver for the lexical analyzer.

        Reads the options from ``getArgs()`` and logs to
        ``SanskritLexicalAnalyzer.log`` (DEBUG level when ``args.debug`` is
        set, INFO otherwise).

        Without ``args.split`` the lexical tags of the input are printed; in
        non-strict mode the lookup is repeated with a final visarga replaced
        by 'r' instead of 's'.  When a tag set and/or a base is supplied the
        result of ``hasTag`` is printed as well.

        With ``args.split`` the sandhi-split graph is built, the paths found
        in it are printed, and timing figures follow.
        """
        import time

        args = getArgs()
        if args.strict_io:
            print("Interpreting input strictly")
        else:
            print("Interpreting input loosely (strict_io set to false)")
        print("Input String:", args.data)

        logging.basicConfig(
            filename='SanskritLexicalAnalyzer.log',
            filemode='w',
            level=logging.DEBUG if args.debug else logging.INFO)

        s = SanskritLexicalAnalyzer(args.lexical_lookup)
        if args.input_encoding is None:
            ie = None
        else:
            ie = SanskritBase.SCHEMES[args.input_encoding]
        with SanskritBase.outputctx(args.strict_io):
            if not args.split:
                i = SanskritBase.SanskritObject(args.data,
                                                encoding=ie,
                                                strict_io=args.strict_io,
                                                replace_ending_visarga='s')
                print("Input String in SLP1:", i.canonical())
                ts = s.getLexicalTags(i)
                print(ts)
                # Possible rakaranta
                # Try by replacing end visarga with 'r' instead
                if not args.strict_io:
                    i = SanskritBase.SanskritObject(args.data,
                                                    encoding=ie,
                                                    strict_io=args.strict_io,
                                                    replace_ending_visarga='r')
                    ts = s.getLexicalTags(i)
                    if ts is not None:
                        print("Input String in SLP1:", i.canonical())
                        print(ts)
                if args.tag_set or args.base:
                    # The tag set used to be bound only when args.tag_set was
                    # given, so supplying just a base raised
                    # UnboundLocalError.  Always bind it; an empty set stands
                    # for "no tag constraint".
                    # NOTE(review): presumably hasTag does a subset test on
                    # this argument - confirm an empty set is accepted.
                    g = set(args.tag_set) if args.tag_set else set()
                    # NOTE(review): in non-strict mode `i` is the 'r'-visarga
                    # variant at this point - confirm that is intended.
                    print(
                        s.hasTag(i, SanskritBase.SanskritObject(args.base), g))
            else:
                i = SanskritBase.SanskritObject(args.data,
                                                encoding=ie,
                                                strict_io=args.strict_io,
                                                replace_ending_visarga=None)
                print("Input String in SLP1:", i.canonical())
                print("Start Split")
                start_split = time.time()
                graph = s.getSandhiSplits(i)
                end_graph = time.time()
                print("End DAG generation")
                if graph:
                    # Lazy %-arguments: the message is only formatted when
                    # DEBUG logging is enabled.
                    logger.debug("Graph has %d nodes and %d edges",
                                 len(graph.G.nodes()), len(graph.G.edges()))
                    splits = graph.findAllPaths(max_paths=args.max_paths,
                                                score=args.score)
                    print("End pathfinding", time.time())
                    print("Splits:")
                    if splits:
                        for split in splits:
                            print(split)
                    else:
                        print("None")
                else:
                    print("No Valid Splits Found")
                end_split = time.time()
                print("-----------")
                print("Performance")
                print("Time for graph generation = {0:0.6f}s".format(
                    end_graph - start_split))
                print(
                    "Total time for graph generation + find paths = {0:0.6f}s".
                    format(end_split - start_split))
Пример #4
0
 def _sandhi_splits_all(s, start=None, stop=None):
     """Return ``self.sandhi.split_all`` for the SLP1 string ``s``.

     ``s`` is wrapped in a ``SanskritObject`` first; ``start`` and ``stop``
     are forwarded unchanged.  ``self`` is not a parameter: it is expected
     to come from an enclosing scope that is not visible in this excerpt.
     """
     return self.sandhi.split_all(
         SanskritBase.SanskritObject(s, encoding=SanskritBase.SLP1),
         start,
         stop)