Example #1
    def add_rules_from_file(self, path):
        """
        Add sandhi rules from file.
        Each line of the input file should contain one rule. E.g. अ + अ = आ
        Lines starting with a # are treated as comments and skipped.
        Empty lines are ignored as well.

        :param path: file to read rules from

        See also add_rules_from_dir
        """
        filename = os.path.basename(path)
        with codecs.open(path, "rb", 'utf-8') as f:
            for linenum, line in enumerate(f):
                line = line.strip()
                if line.startswith('#') or line == '':
                    continue
                self.logger.debug("Processing rule %s", line)
                rule = SanskritObject(line).canonical()
                for r in self.expand_rule(rule):
                    self.add_rule(*r,
                                  annotation="%s:%d" % (filename, linenum + 1))
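For context, the rules file consumed by this method is plain UTF-8 text with one rule per line; comments and blank lines are skipped as described in the docstring. The snippet below is a minimal sketch of how such a file might be prepared and loaded, assuming a Sandhi-style object named `sandhi` that exposes `add_rules_from_file`; the object construction and the file name are hypothetical.

# Minimal sketch: write a tiny rules file and load it. The `sandhi` object and
# the file name are hypothetical; only add_rules_from_file itself comes from
# the code above.
import codecs

rules_text = u"""# vowel sandhi rules (comment lines start with '#')

अ + अ = आ
"""

with codecs.open("vowel_rules.txt", "wb", "utf-8") as f:
    f.write(rules_text)

# sandhi = Sandhi()                             # hypothetical construction
# sandhi.add_rules_from_file("vowel_rules.txt")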
Example #2
def test_medium_split(lexan):
    i = SanskritObject("budDaMSaraRaNgacCAmi", encoding=SLP1)
    graph = lexan.getSandhiSplits(i)
    splits = graph.findAllPaths()
    assert [u'budDam', u'SaraRam', u'gacCAmi'] in \
           [list(map(str, ss)) for ss in splits]
Example #3
def test_simple_split(lexan):
    # gaNeshannamAmi
    i = SanskritObject("gaReSannamAmi", encoding=SLP1)
    graph = lexan.getSandhiSplits(i)
    splits = graph.findAllPaths()
    assert [u'gaReSam', u'namAmi'] in [list(map(str, ss)) for ss in splits]
Example #4
def process_line(lnum, l):
    '''Process a single line'''
    logging.info("Processing Line {}: {}".format(lnum, l))
    r = None
    subsplitp = False
    line = l.strip()
    if line and line[0] == '#':
        logging.info("Skipping Comment")
        return None
    if line.find('=>') == -1:
        logging.info("Cannot find =>")
        return None
    full, split = line.split('=>')
    full = full.strip()
    full = full.replace(u'|', '')
    # Zero width joiner/nonjoiner
    full = full.replace(u"\u200c", "")
    full = full.replace(u"\u200d", "")
    ofull = full  # Save
    full = _dumpchars(SanskritObject(full).transcoded(SLP1))
    split = split.strip()
    split = split.replace(u'|', '')
    # Zero width joiner/nonjoiner
    split = split.replace(u"\u200c", "")
    split = split.replace(u"\u200d", "")
    osplit = split  # Save
    splits = list(
        map(lambda x: _dumpchars(SanskritObject(x).transcoded(SLP1).strip()),
            split.split('+')))
    if splits[-1] == '':
        splits.pop()
    # Empty full string - nothing to process
    if len(full) == 0:
        logger.info("Skipping: empty full string")
        return None

    # UOHD errors, final visarga is sometimes missing
    if len(splits[-1]) > 1 and splits[-1][-2:] == "AH" and \
       full[-1] == "A":
        full = full + "H"
    if len(splits[-1]) > 1 and splits[-1][-2:] == "aH" and \
       full[-1] == "a":
        full = full + "H"
    if splits[-1][-1] == "A" and len(full) > 1 and full[-2:] == "AH":
        splits[-1] = splits[-1] + "H"
    if splits[-1][-1] == "a" and len(full) > 1 and full[-2:] == "aH":
        splits[-1] = splits[-1] + "H"

    # FIXME - this creates problems, eg on 'aho', 'prabho'
    # UOHD stores sandhied final words!
    # This is not a full fix
    full = re.sub("o$", "aH", full)
    # Modified splits
    s = []

    for ss in splits:
        # Check if this word is in our db
        # Rakarantas: chain the replacements so that all four are applied
        sss = ss.replace('punaH', 'punar')
        sss = sss.replace('antaH', 'antar')
        sss = sss.replace('bahiH', 'bahir')
        sss = sss.replace('prAtaH', 'prAtar')
        # Sakarantas
        sss = re.sub('H$', 's', sss)
        if sss.find('punas') != -1:
            logger.error("ERROR: found {}".format(sss))
            # Is in our database
        if (subsplitp == "Skip") or lexan.forms.valid(sss):
            s.append(sss)
        else:
            # If not, treat it as a word to be split
            try:
                graph = lexan.getSandhiSplits(SanskritObject(ss,
                                                             encoding=SLP1))
                if graph is None:
                    # Catch stray unicode symbols with the encode
                    logger.warning("Skipping: {} is not in db".format(
                        ss.encode('utf-8')))
                    subsplitp = "Skip"
                    s.append(sss)
                    continue
                else:
                    subsplitp = True
            except:  # noqa
                logger.warning("Split Error: {}".format(ss.encode('utf-8')))
                s.append(sss)
                continue
            # First split
            ssp = list(map(str, graph.findAllPaths(max_paths=1)[0]))
            # Add it to split list
            s.extend(map(str, ssp))
    logger.info(u"{} => {}".format(full, " ".join(s)))
    r = [full, s, ofull, osplit, subsplitp]
    return r
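The UOHD final-visarga repairs above are easiest to follow on a concrete value. The following standalone sketch reproduces just that heuristic on a hypothetical SLP1 word pair; it does not depend on sanskrit_parser.

# Standalone sketch of the final-visarga repair from process_line, using a
# hypothetical SLP1 word pair; not part of the library.
full = "gajAnanaH"          # sandhied form keeps the final visarga ("aH")
splits = ["gajAnana"]       # split form in the corpus lost it
if splits[-1][-1] == "a" and len(full) > 1 and full[-2:] == "aH":
    splits[-1] = splits[-1] + "H"
print(splits)               # ['gajAnanaH']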
Example #5
    def expand_rule(self, rule):
        """
        Expands a given sandhi rule from the rules file to generate all possible combinations

        :param rule: Rule to expand
        :return: A generator of all possible expanded rules
        """
        self.logger.debug("Expanding rule %s", rule)

        ms = MaheshvaraSutras()

        b, afters = map(six.text_type.strip, rule.split("="))
        before = list(map(six.text_type.strip, b.split("+", 1)))
        left_classes = re.split(r'\[(.*?)\]', before[0])
        self.logger.debug("Left classes = %s", left_classes)

        # Split after forms into individual forms
        afters = map(six.text_type.strip, afters.split("/"))

        before_left = []
        for c in left_classes:
            if c != '':
                if c.startswith("*"):
                    # This is a mAheswara sUtra pratyAhAra
                    splits = list(map(six.text_type.strip, c.split('-')))
                    varnas = set(
                        ms.getPratyahara(SanskritObject(splits[0][1:],
                                                        encoding=SLP1),
                                         longp=False,
                                         remove_a=True,
                                         dirghas=True).canonical())
                    if len(splits) == 2:
                        varnas -= set(splits[1])
                    self.logger.debug("Found pratyAhAra %s = %s", c, varnas)
                    before_left.append(varnas)
                else:
                    before_left.append(map(six.text_type.strip, c.split(",")))
        self.logger.debug("before_left iterator = %s", before_left)

        right_classes = re.split(r'\[(.*?)\]', before[1])
        # Could have used list comprehension, but this is easier to read
        self.logger.debug("right_classes = %s", right_classes)
        if right_classes:
            before_right = []
            for c in right_classes:
                if c != '':
                    if c.startswith("*"):
                        # This is a mAheswara sUtra pratyAhAra
                        splits = list(
                            map(six.text_type.strip, re.split('([+-])', c)))
                        varnas = set(
                            ms.getPratyahara(SanskritObject(splits[0][1:],
                                                            encoding=SLP1),
                                             longp=False,
                                             remove_a=True,
                                             dirghas=True).canonical())
                        if len(splits) == 3:
                            if splits[1] == '-':
                                varnas -= set(splits[2])
                            elif splits[1] == '+':
                                varnas |= set(splits[2])
                        self.logger.debug("Found pratyAhAra %s (%s) = %s", c,
                                          splits[0][1:], varnas)
                        before_right.append(varnas)
                    else:
                        before_right.append(
                            map(six.text_type.strip, c.split(",")))
        else:
            before_right = [before[1].strip()]
        self.logger.debug("before_right iterator = %s", before_right)

        for after, before_l, before_r in itertools.product(
                afters, itertools.product(*before_left),
                itertools.product(*before_right)):
            left = ''.join(before_l)
            right = ''.join(before_r)
            list_before_r = list(before_r)
            left_right = (left, right)
            a = after.format(*(list(before_l) + list_before_r))
            # Logging every expanded rule is too verbose for normal runs, so it is left commented out:
            # self.logger.debug("Final rule = %s -> %s", left_right, a)
            yield (left_right, a)
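To make the expansion concrete, the sketch below reimplements only the explicit [..]-class part of the logic with itertools.product, leaving out the mAheshvara-sUtra pratyAhAra handling. The rule string and the `classes` helper are hypothetical and exist only for this illustration.

# Simplified, standalone illustration of the class expansion done by
# expand_rule; it handles only explicit [..] classes (no pratyAhAra support).
# The rule string and the `classes` helper are hypothetical.
import itertools
import re

rule = "[a, A] + [i, I] = e"
before, after = [x.strip() for x in rule.split("=")]
left, right = [x.strip() for x in before.split("+", 1)]

def classes(side):
    # "[a, A]" -> [['a', 'A']]; text outside brackets becomes a one-element class
    parts = [c for c in re.split(r'\[(.*?)\]', side) if c != '']
    return [[v.strip() for v in c.split(",")] for c in parts]

for before_l, before_r in itertools.product(itertools.product(*classes(left)),
                                            itertools.product(*classes(right))):
    print((''.join(before_l), ''.join(before_r)), "->", after)
# prints ('a', 'i'), ('a', 'I'), ('A', 'i'), ('A', 'I'), each mapping to 'e'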
Example #6
def jtag(tag):
    """ Helper to translate tag to serializable format"""
    return (SanskritObject(tag[0], encoding=SLP1).devanagari(strict_io=False),
            [t.devanagari(strict_io=False) for t in list(tag[1])])
Example #7
def jedge(pred, node, label):
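    """Helper to translate an edge (predecessor, node, label) into a serializable format."""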
    return (node.pada.devanagari(strict_io=False),
            jtag(node.getMorphologicalTags()),
            SanskritObject(label, encoding=SLP1).devanagari(strict_io=False),
            pred.pada.devanagari(strict_io=False))