示例#1
0
def numericTagger(instr):
    """
    Tag every token that contains a number with the tag "num".

    A token counts as numeric when the regex below finds a match anywhere in
    its string form (ASCII digits or Devanagari digits, optionally with sign,
    '.' and ',' separators).

    :param instr: The text to tag. May be a raw string (it is tokenized with
        ``tok.tokenize`` first), a list of tokens, or a list that already
        contains ``(token, tag)`` tuples.
    :type instr: string, list of strings, list of tuples

    :return: For list input, the same list object, modified in place: numeric
        tokens become ``(token, 'num')``; an existing ``(token, tag)`` tuple
        whose token is numeric has its tag replaced by ``'num'``; everything
        else is left untouched. For string input, the tagged token list.
        Any other input is returned unchanged after printing "not supported".
    :rtype: List of Tuples.
    """
    num_match = re.compile(
        r'([०१२३४५६७८९]+[\.\,]*)+[०१२३४५६७८९]+|([-+]*\d+[\.\,]*)+\d+|([०१२३४५६७८९]+|\d+)'
    )

    if isinstance(instr, str):
        # Tokenize, then let the list branch tag the tokens in place.
        instr = tok.tokenize(instr)
        numericTagger(instr)
        return instr

    if not isinstance(instr, list):
        print("not supported")
        return instr

    for index, item in enumerate(instr):
        if isinstance(item, tuple):
            # Pre-tagged entry: keep the token (element 0) and overwrite only
            # the tag. The previous code kept element 1, i.e. the old tag,
            # and dropped the token.
            if num_match.search(str(item[0])):
                instr[index] = (item[0], 'num')
        elif num_match.search(str(item)):
            instr[index] = (item, 'num')

    return instr
示例#2
0
def lookupTagger(instr):
    """
    Tag tokens that are found in the dictionary returned by
    ``gndrlist.drawlist()``.

    Each dictionary entry is split on tab characters: the first field is the
    word and the second field is its tag. Entries with more than two fields
    are tagged "any". When a word occurs more than once, the first entry
    wins. An entry without any tab raises ``IndexError``.

    :param instr: The text to tag. May be a raw string (it is tokenized with
        ``tok.tokenize`` first), a list of tokens, or a pre-tagged list
        containing ``(token, tag)`` tuples.
    :type instr: string, list of strings, list of tuples

    :return: For list input, the same list object, modified in place: tokens
        found in the dictionary become ``(token, tag)``; an existing
        ``(token, tag)`` tuple whose token is found has its tag replaced;
        everything else is left untouched. For string input, the tagged
        token list. Any other input is returned unchanged after printing
        "not supported".
    :rtype: List of Tuples.
    """
    # Build a single word -> tag mapping so each token lookup is O(1) instead
    # of a linear list.index() scan. setdefault keeps the first occurrence,
    # which is what list.index() returned.
    tag_by_word = {}
    for entry in gndrlist.drawlist():
        fields = entry.split("\t")
        tag = "any" if len(fields) > 2 else fields[1]
        tag_by_word.setdefault(fields[0], tag)

    if isinstance(instr, str):
        # Tokenize, then let the list branch tag the tokens in place.
        instr = tok.tokenize(instr)
        lookupTagger(instr)
        return instr

    if not isinstance(instr, list):
        print("not supported")
        return instr

    for index, item in enumerate(instr):
        # For pre-tagged entries the token is element 0 of the tuple.
        token = item[0] if isinstance(item, tuple) else item
        if token in tag_by_word:
            instr[index] = (token, tag_by_word[token])

    return instr
示例#3
0
def defaultTagger(instr):
    """
    Give every token that has no tag yet the fallback tag "any".

    :param instr: The text to tag. May be a raw string (it is tokenized with
        ``tok.tokenize`` first) or a list whose items are plain tokens and/or
        ``(token, tag)`` tuples.
    :type instr: string, list of strings, list of tuples

    :return: For list input, the same list object, modified in place: every
        item that is not a tuple becomes ``(item, 'any')``; tuples are kept
        as they are. For string input, the tagged token list. Any other
        input is returned unchanged after printing "not supported".
    :rtype: List of Tuples.
    """
    kind = type(instr)

    if kind is list:
        # Slice assignment rewrites the caller's list in place.
        instr[:] = [
            entry if type(entry) is tuple else (entry, 'any')
            for entry in instr
        ]
        return instr

    if kind is str:
        instr = tok.tokenize(instr)
        defaultTagger(instr)
        return instr

    print("not supported")
    return instr