def numericTagger(instr):
    """numericTagger is a regex based tagger that tags numbers with the tag "num".

    :param instr: Can be a string, a list of tokens, or a list of tuples.
                  It can be the string to be tagged, a tokenized string, or
                  a pre-tagged list of tokens.
    :type instr: string, list of strings, list of tuples
    :return: Returns a list of tuples of the form
             [(token1, tag), (token2, tag), ...]; items that do not match
             a number are left unchanged.
    :rtype: list of tuples
    """
    # Matches Devanagari digits, ASCII digits, optionally signed and with
    # '.'/',' group separators.
    num_match = re.compile(
        r'([०१२३४५६७८९]+[\.\,]*)+[०१२३४५६७८९]+|([-+]*\d+[\.\,]*)+\d+|([०१२३४५६७८९]+|\d+)'
    )
    if isinstance(instr, list):
        for index, item in enumerate(instr):
            if isinstance(item, tuple):
                if num_match.search(str(item[0])):
                    # Bug fix: keep the token (item[0]) and replace the tag;
                    # the old code stored the previous tag (item[1]) instead
                    # and lost the token.
                    instr[index] = (item[0], 'num')
            else:
                if num_match.search(str(item)):
                    instr[index] = (item, 'num')
    elif isinstance(instr, str):
        # Tokenize, then tag the resulting token list in place.
        instr = tok.tokenize(instr)
        numericTagger(instr)
    else:
        print("not supported")
    return instr
def lookupTagger(instr):
    """lookupTagger looks up the gender dictionary for matches and tags the
    token if a match is found.

    :param instr: Can be a string, a list of tokens, or a list of tuples.
                  It can be the string to be tagged, a tokenized string, or
                  a pre-tagged list of tokens.
    :type instr: string, list of strings, list of tuples
    :return: Returns a list of tuples of the form
             [(token1, genderTag), (token2, genderTag), ...]; tokens not in
             the dictionary are left unchanged.
    :rtype: list of tuples
    """
    # Build a word -> gender map once instead of two parallel lists with an
    # O(n) list.index() lookup per token. setdefault keeps the FIRST entry
    # for a word, matching the original .index() semantics.
    gender_map = {}
    for entry in gndrlist.drawlist():
        parts = entry.split("\t")
        # Entries with more than two tab-separated fields are treated as
        # ambiguous -> "any" (presumably multiple genders listed; behavior
        # kept from the original).
        gender = "any" if len(parts) > 2 else parts[1]
        gender_map.setdefault(parts[0], gender)
    if isinstance(instr, list):
        for index, item in enumerate(instr):
            if isinstance(item, tuple):
                if item[0] in gender_map:
                    # Keep the token, replace the tag with the looked-up gender.
                    instr[index] = (item[0], gender_map[item[0]])
            else:
                if item in gender_map:
                    instr[index] = (item, gender_map[item])
    elif isinstance(instr, str):
        # Tokenize, then tag the resulting token list in place.
        instr = tok.tokenize(instr)
        lookupTagger(instr)
    else:
        print("not supported")
    return instr
def defaultTagger(instr):
    """defaultTagger tags untagged tokens with "any".

    :param instr: Can be a string or a list of tokens. It can be the string
                  to be tagged, a tokenized string, or a partially tagged
                  list of tokens.
    :type instr: string, list of strings, list of tuples
    :return: Returns a list of tuples of the form
             [(token1, genderTag), (token2, genderTag), ...]; items that are
             already (token, tag) tuples are left unchanged.
    :rtype: list of tuples
    """
    if isinstance(instr, list):
        for index, item in enumerate(instr):
            # Only bare tokens get the fallback tag; tuples are already tagged.
            if not isinstance(item, tuple):
                instr[index] = (item, 'any')
    elif isinstance(instr, str):
        # Tokenize, then tag the resulting token list in place.
        instr = tok.tokenize(instr)
        defaultTagger(instr)
    else:
        print("not supported")
    return instr