import Tokenize  # project-local tokenizer module (assumed importable as-is)
import Ngrams    # project-local n-gram helpers (assumed importable as-is)


def avgWordLengthBucketed(text):
    """Bucket the mean token length of `text` as Short/Medium/Long."""
    tokens = Tokenize.byWordAlphaOnly(text)
    total = 0  # renamed from `sum` to avoid shadowing the builtin
    count = 0
    for token in tokens:
        total += len(token)
        count += 1
    if count == 0:
        return {"AVG word Length": "Short"}  # guard: no tokens at all
    numericValue = total // count
    bucketLabel = "Long" if numericValue > 6 else "Medium" if numericValue > 4 else "Short"
    return {"AVG word Length": bucketLabel}
def avgWordLength(text):
    """Mean length of the unique alphabetic words in a list of strings."""
    text = " ".join(text)  # input is a list of text chunks
    tokens = list(set(Tokenize.byWordAlphaOnly(text)))  # deduplicate tokens
    total = 0  # renamed from `sum` to avoid shadowing the builtin
    count = 0
    for token in tokens:
        if token.isalpha():  # defensive; byWordAlphaOnly should already ensure this
            total += len(token)
            count += 1
    if count == 0:
        return {"AVG word Length": 0}  # guard: no alphabetic tokens
    return {"AVG word Length": total // count}
def feature(text, n=1, m=50, stem=False):
    """Top-m most frequent word n-grams as a feature vector.

    `n`, `m`, and `stem` were free variables in the original source,
    presumably bound by an enclosing scope; they are surfaced here as
    parameters, and the default values are assumptions.
    """
    text = " ".join(text)
    if stem:
        words = Tokenize.byWordStem(text)
    else:
        words = Tokenize.byWordAlphaOnly(text)
    fd = Ngrams.getNgramFreqDist(words, n)
    topM = sorted(fd.items(), key=lambda x: x[1], reverse=True)[:m]
    vector = {}
    for i, (gram, _) in enumerate(topM):
        vector["word#" + str(i) + " " + str(n) + "gramW"] = gram
    return vector
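# Illustrative usage for feature (assumes Ngrams.getNgramFreqDist returns a
# dict-like mapping of n-gram tuples to counts, as nltk.FreqDist does; the
# ordering of equal-count n-grams depends on that helper):
#
#     >>> feature(["to be or not to be"], n=2, m=1, stem=False)
#     {'word#0 2gramW': ('to', 'be')}   # ('to', 'be') occurs twice, all others once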
def wordLengthDist(text):
    """Percentage distribution of unique-word lengths, bucketed 1..10+."""
    text = " ".join(text)  # input is a list of text chunks
    words = list(set(Tokenize.byWordAlphaOnly(text)))  # deduplicate words
    vector = {}
    for i in range(1, 11):
        vector["%ofwords" + str(i) + "long"] = 0
    total = 0
    for word in words:
        if len(word) < 10:
            vector["%ofwords" + str(len(word)) + "long"] += 1
        else:
            vector["%ofwords" + str(10) + "long"] += 1  # lengths >= 10 share one bucket
        total += 1
    if total == 0:
        return vector  # guard: no words; all buckets stay 0
    for i in range(1, 11):
        vector["%ofwords" + str(i) + "long"] = int(
            100 * vector["%ofwords" + str(i) + "long"] / total)
    return vector
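# Illustrative usage for wordLengthDist; words are deduplicated, and each
# bucket holds the integer percentage of unique words of that length:
#
#     >>> wordLengthDist(["a bb ccc"])["%ofwords2long"]
#     33   # one of three unique words ("bb") has length 2 -> int(100 / 3)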
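# The Tokenize and Ngrams modules are project-local and not shown here. Below
# is a minimal sketch of the interfaces this file assumes, sufficient to run
# the examples above; the real modules may behave differently (e.g. apply a
# real stemmer in byWordStem):
#
# Tokenize.py
#     import re
#
#     def byWord(text):
#         return text.split()
#
#     def byWordAlphaOnly(text):
#         return re.findall(r"[A-Za-z]+", text)
#
#     def byWordStem(text):
#         return byWordAlphaOnly(text)  # placeholder: no real stemming
#
# Ngrams.py
#     def getNgramFreqDist(words, n):
#         fd = {}
#         for i in range(len(words) - n + 1):
#             gram = tuple(words[i:i + n])
#             fd[gram] = fd.get(gram, 0) + 1
#         return fd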