def test_tokenize(self):
    text = \
"""("hi, I am Wenjing"
 #t  #f result
cond 123
"""
    results = list(Tokenize.tokenize(text, 0))
    self.assertEqual(len(results), 7)
    self.assertTokenEqual(results[0], Token(Tokens.LPAREN, 0))
    self.assertTokenEqual(results[1], Token(Tokens.STRING, 1, "hi, I am Wenjing"))
    self.assertTokenEqual(results[2], Token(Tokens.TRUE, 21))
    self.assertTokenEqual(results[3], Token(Tokens.FALSE, 25))
    self.assertTokenEqual(results[4], Token(Tokens.VARIABLE, 28, "result"))
    self.assertTokenEqual(results[5], Token(Tokens.COND, 35))
    self.assertTokenEqual(results[6], Token(Tokens.NUMBER, 40, 123))

    text = "(('))"
    results = list(Tokenize.tokenize(text, 0))
    self.assertEqual(len(results), 5)
    self.assertTokenEqual(results[0], Token(Tokens.LPAREN, 0))
    self.assertTokenEqual(results[1], Token(Tokens.LPAREN, 1))
    self.assertTokenEqual(results[2], Token(Tokens.QUOTE, 2))
    self.assertTokenEqual(results[3], Token(Tokens.RPAREN, 3))
    self.assertTokenEqual(results[4], Token(Tokens.RPAREN, 4))

    text = """(cond "hello" 12)"""
    results = list(Tokenize.tokenize(text, 0))
    self.assertEqual(len(results), 5)
    self.assertTokenEqual(results[0], Token(Tokens.LPAREN, 0))
    self.assertTokenEqual(results[1], Token(Tokens.COND, 1))
    self.assertTokenEqual(results[2], Token(Tokens.STRING, 6, "hello"))
    self.assertTokenEqual(results[3], Token(Tokens.NUMBER, 14, 12))
    self.assertTokenEqual(results[4], Token(Tokens.RPAREN, 16))

def test_skip(self):
    text = "\n ;This line is left blank\n ;code will begin\n (cons 1 2)"
    pos = Tokenize._skip(text, 0)
    self.assertEqual(text[pos:], "(cons 1 2)")

    # another test case
    text = "Nil"
    pos = Tokenize._skip(text, 0)
    self.assertEqual(text[pos:], "Nil")

def isCreditCard(text):
    credit = Tokenize.nGram(text, "credit card")
    ticket = Tokenize.nGram(text, "bank ticket")
    if credit > ticket:
        return True
    elif ticket > credit:
        return False
    else:
        return None

def test_skipToNewLine(self):
    textWith3Lines = "line1\nline2\nline3"
    pos = Tokenize._skipToNextLine(textWith3Lines, 0)
    self.assertEqual(textWith3Lines[pos:], "line2\nline3")
    pos = Tokenize._skipToNextLine(textWith3Lines, pos)
    self.assertEqual(textWith3Lines[pos:], "line3")
    pos = Tokenize._skipToNextLine(textWith3Lines, pos)
    self.assertEqual(textWith3Lines[pos:], "")

def test_skipWhiteSpaces(self):
    # text with leading spaces
    text = " \n\r\t \n\r\t text"
    pos = Tokenize._skipWhitespaces(text, 0)
    self.assertEqual("text", text[pos:])

    # text with no leading space
    text = "text"
    pos = Tokenize._skipWhitespaces(text, 0)
    self.assertEqual("text", text[pos:])

def lemmatize_sentence(sentence):
    stop_words = stopwords.words('english')
    tokens = Tokenize.tokenize_word(sentence)
    without_stopwords = [word.lower() for word in tokens if word.lower() not in stop_words]
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token, tag in nltk.pos_tag(without_stopwords):
        mapped_tag = Tokenize.tag_map(tag[0])
        lemma = lemmatizer.lemmatize(token, mapped_tag)
        lemmas.append(lemma)
    return lemmas

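# A self-contained sketch of the Penn-Treebank-to-WordNet mapping that Tokenize.tag_map
# presumably performs (an assumption; only the tag's first letter is passed above), written
# against plain nltk so it runs without the project's Tokenize module. The function name
# guess_wordnet_pos is illustrative only.
from nltk.corpus import wordnet

def guess_wordnet_pos(penn_tag_initial):
    # 'J' -> adjective, 'V' -> verb, 'R' -> adverb, anything else -> noun (the usual default)
    mapping = {"J": wordnet.ADJ, "V": wordnet.VERB, "R": wordnet.ADV}
    return mapping.get(penn_tag_initial, wordnet.NOUN)
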
def searchGame(text, corpus):
    text = Tokenize.tratarTexto(text)
    gameList = {}
    searchList = corpusTrat
    for key in searchList:
        searchList[key]["coe"] = Tokenize.coeficienteSimilaridade(
            text, searchList[key]["text"], searchList)
        if searchList[key]["coe"] > 0:
            gameList[key] = searchList[key]["coe"]
    return gameList

def test_literal_expression(self):
    tokens = Tokenize.tokenize("'(cons a b) 'c", 0)
    result = list(parse(tokens))
    partialTokens = Tokenize.tokenize("(cons a b)", 0)
    partialResult = list(parse(partialTokens))
    self.assertEqual(result[0][1].valueType, Values.LITERAL)
    self.assertEqual(result[0][1].val, partialResult[0][1])
    self.assertEqual(result[1][1].valueType, Values.LITERAL)
    self.assertEqual(result[1][1].val, makeSymbol("c"))

def feature(text):
    text = " ".join(text)
    tokens = Tokenize.byWord(text)
    words = []
    if stem:
        words = Tokenize.byWordStem(text)
    else:
        words = Tokenize.byWordAlphaOnly(text)
    fd = Ngrams.getNgramFreqDist(words, n)
    topM = sorted([item for item in fd.items()], key=lambda x: x[1], reverse=True)[:m]
    vector = {}
    for i in range(len(topM)):
        vector["word#" + str(i) + " " + str(n) + "gramW"] = topM[i][0]
    return vector

def answer(self, text):
    if not Tokenize.valid_exp_date(text):
        print("Enter a valid expiration date. Format mm/yy")
        return CardExpirationTimeState()
    order.creditCard.holder = text
    print("What is the name of the holder of the Credit Card?")
    return CardholderState()

def answer(self, text):
    if not Tokenize.valid_card_number(text):
        print("Enter a valid credit card number.")
        return CreditCardNumberState()
    order.creditCard.expirationDate = text
    print("What is the expiration date of the Credit Card? Format mm/yy")
    return CardExpirationTimeState()

def answer(self, text):
    if not Tokenize.valid_cpf(text):
        print("Invalid CPF. Please enter again with only numbers.")
        return CPFState()
    order.cpf = text
    print("How do you want to pay? Credit card or bank ticket?")
    return PaymentMethodState()

def answer(self, text):
    if not Tokenize.valid_email(text):
        print("Invalid email. Please enter a valid email.")
        return UserEmailState()
    order.email = text
    print("What's your CPF? Enter only numbers.")
    return CPFState()

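# A hypothetical driver for the conversational states above, assuming every state object
# exposes answer(text) and returns the next state, as these methods do. The start state
# and the stop condition are assumptions: the bot's real main loop is not shown in this
# collection.
def run_checkout(start_state):
    state = start_state
    while state is not None:
        state = state.answer(input("> "))
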
def execute(self, sourceCode):
    tokens = Tokenize.tokenize(sourceCode, 0)
    result = None
    for status, exp in parse(tokens):
        if status != ParseError.OK:
            print("Error occurred:", status)
            return
        result = Eval.eval(exp, self.glob)[1]
    return result

def test_TokenizeKeyWords(self):
    wrong = ["some", "of", "these", "words", "are", "not", "keywords"]
    keywords = ["make", "if", "else", "return", "class", "method"]
    input = wrong + keywords
    random.shuffle(input)
    output = [word for word in input if Tokenize.TokenizeKeywords(word)]
    self.assertItemsEqual(output, keywords, "find the keywords")

def feature(text):
    text = " ".join(text)
    tokens = Tokenize.byWord(text)
    if lengthFilter is not None:
        tokens = [token for token in tokens if len(token) >= lengthFilter]
    types = set(tokens)
    return {"vocabSize": "HIGH" if len(types) > 50 else "MEDIUM" if len(types) > 20 else "LOW"}

def feature(text):
    text = " ".join(text)
    tokens = Tokenize.byWord(text)
    if lengthFilter is not None:
        tokens = [token for token in tokens if len(token) >= lengthFilter]
    types = set(tokens)
    return {"vocabSizeb": len(types)}

def feature(text):
    text = " ".join(text)
    tokens = Tokenize.byWord(text)
    if lengthFilter is not None:
        tokens = [token for token in tokens if len(token) >= lengthFilter]
    types = set(tokens)
    return {"type/tokenb": "HIGH" if len(types) / len(tokens) > .5 else "MEDIUM" if len(types) / len(tokens) > .2 else "LOW"}

def test_TokenizeOperators(self):
    wrong = ["these", "are", "not", "operators", "#", "$", "_"]
    operators = ["+", "-", "*", "=", "+=", "-=", ">", "<", "!=", ">=", "<="]
    input = wrong + operators
    random.shuffle(input)
    output = [word for word in input if Tokenize.TokenizeOperators(word)]
    self.assertItemsEqual(output, operators, "find the operators")

def feature(text):
    text = " ".join(text)
    tokens = Tokenize.byWord(text)
    if lengthFilter is not None:
        tokens = [token for token in tokens if len(token) >= lengthFilter]
    types = set(tokens)
    return {"type/token": int(100 * len(types) / len(tokens))}

def test_Digits(self):
    wrong = ["these", "are", "not", "digits", "0.0.0"]
    digits = ["0", "-1", "3", "9.0", ".9", "100000000"]
    input = wrong + digits
    random.shuffle(input)
    output = [word for word in input if Tokenize.TokenizeDigits(word)]
    self.assertItemsEqual(output, digits, "find digits")

def avgWordLength(text):
    tokens = Tokenize.byWord(text)
    sum = 0
    count = 0
    for token in tokens:
        if token.isalpha():
            sum += len(token)
            count += 1
    return {"AVG word Length": int(sum / count)}

def feature(text):
    text = " ".join(text)
    tokens = Tokenize.byWord(text)
    fd = Ngrams.getNgramFreqDist(text, n)
    topM = sorted([item for item in fd.items()], key=lambda x: x[1], reverse=True)[:m]
    vector = {}
    for i in range(len(topM)):
        vector["char#" + str(i) + " " + str(n) + "gramC"] = topM[i][0]
    return vector

def test_TokenizeStrings(self):
    strings = ["\"string with spaces\"", "\"stringWithNoSpaces\""]
    operators = ["+", "-", "*", "=", "+=", "-=", ">", "<", "!=", ">=", "<="]
    keywords = ["make", "if", "else", "return", "class", "method"]
    invalids = ["2er4", ",sdf", "@sd"]
    input = strings + operators + keywords + invalids
    random.shuffle(input)
    output = [word for word in input if Tokenize.TokenizeStrings(word)]
    self.assertItemsEqual(output, strings, "find strings")

def vocabSizeBucketed(text, lengthFilter=None):
    tokens = Tokenize.byWord(text)
    if lengthFilter is not None:
        tokens = [token for token in tokens if len(token) >= lengthFilter]
    types = set(tokens)
    return {"vocabSize": "HIGH" if len(types) > 50 else "MEDIUM" if len(types) > 20 else "LOW"}

def test_getToken(self):
    text = "(cons 1 2)"
    result = Tokenize._extractToken(text, 0)
    self.assertIsNotNone(result)
    self.assertEqual(result[1], 1)
    self.assertTokenEqual(result[0], Token(Tokens.LPAREN, 0))

def test_TokenizeIdentifiers(self):
    operators = ["+", "-", "*", "=", "+=", "-=", ">", "<", "!=", ">=", "<="]
    keywords = ["make", "if", "else", "return", "class", "method"]
    invalids = ["2er4", ",sdf", "@sd"]
    identifiers = ["x", "y", "count", "total", "r3", "R2", "totalMoney", "i"]
    input = operators + keywords + invalids + identifiers
    random.shuffle(input)
    output = [word for word in input if Tokenize.TokenizeIdentifiers(word)]
    self.assertItemsEqual(output, identifiers, "find the identifiers")

def feature(text):
    text = " ".join(text)
    tokens = Tokenize.byWord(text)
    POStags = [tag for word, tag in TaggingTools.tagPOS(text)]
    fd = Ngrams.getNgramFreqDist(POStags, n)
    topM = sorted([item for item in fd.items()], key=lambda x: x[1], reverse=True)[:m]
    vector = {}
    for i in range(len(topM)):
        vector["pos#" + str(i) + " " + str(n) + "gram"] = topM[i][0]
    return vector

def avgWordLengthBucketed(text):
    tokens = Tokenize.byWordAlphaOnly(text)
    sum = 0
    count = 0
    for token in tokens:
        sum += len(token)
        count += 1
    numericValue = int(sum / count)
    bucketLabel = "Long" if numericValue > 6 else "Medium" if numericValue > 4 else "Short"
    return {"AVG word Length": bucketLabel}

def typeTokenRatioBucketed(text, lengthFilter=None):
    tokens = Tokenize.byWord(text)
    if lengthFilter is not None:
        tokens = [token for token in tokens if len(token) >= lengthFilter]
    types = set(tokens)
    return {"type/token": "HIGH" if len(types) / len(tokens) > .5 else "MEDIUM" if len(types) / len(tokens) > .2 else "LOW"}

def avgWordLength(text):
    text = " ".join(text)
    tokens = Tokenize.byWordAlphaOnly(text)
    sum = 0
    count = 0
    tokens = list(set(tokens))
    for token in tokens:
        if token.isalpha():
            sum += len(token)
            count += 1
    return {"AVG word Length": int(sum / count)}

def percentOfUpperLetters(text):
    text = " ".join(text)
    tokens = Tokenize.byWord(text)
    uppers = 0
    total = 0
    for c in text:
        if c.isupper():
            uppers += 1
        total += 1
    percent = int(100 * uppers / total)
    return {"percentUpperCase": percent}

def test_single_expression(self):
    tokens = Tokenize.tokenize("(define size 4)", 0)
    expList = [makeKeyword(Tokens.DEFINE), makeSymbol("size"), makeNumber(4)]
    expected = makeList(expList)
    result = list(parse(tokens))
    self.assertTrue(all(code == ParseError.OK for code, _ in result))
    actual = [item for _, item in result]
    self.assertEqual(1, len(actual))
    self.assertEqual(expected, actual[0])

def __init__(self, inputFile, alpha, beta, KTopics, vocab):
    self.corpus = self.loadCorpus(inputFile)
    self.vocab = vocab
    self.alpha = alpha
    self.beta = beta
    self.K = KTopics
    for i, doc in enumerate(self.corpus):
        docTerms = Tokenize.tokenizeText(doc)
        self.vocab = self.vocab + docTerms
        self.corpus[i] = docTerms
    self.vocab = list(set(self.vocab))
    self.D = len(self.corpus)
    self.theta = np.zeros((self.D, self.K))  # count of words assigned to topic k in doc D / n_m_z / n_dk
    self.N = len(self.vocab)
    self.phi = np.zeros((self.K, self.N))  # word count of each word in topic K / n_zt / n_kw
    self.z_dn = []  # topics of every word in doc D
    self.wordCount_k = np.zeros(self.K)  # word count of each topic K

    # Initialize a random topic for every word in every document and update the
    # phi and theta matrices accordingly.
    for i, doc in enumerate(self.corpus):
        z_d = []
        for word in doc:
            z = np.random.randint(0, self.K)  # a random topic drawn from the list of K topics
            z_d.append(z)
            # The word has been assigned to topic z in the current doc i:
            # update the doc-topic distribution ...
            self.theta[i][z] += 1
            # ... the topic-word distribution ...
            self.phi[z][(self.vocab).index(word)] += 1
            # ... and the word count of topic z.
            self.wordCount_k[z] += 1
        z_d = np.array(z_d)
        self.z_dn.append(z_d)
    self.Phi = np.zeros(self.phi.shape)
    self.Theta = np.zeros(self.theta.shape)

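# A sketch of the standard collapsed-Gibbs conditional that these count matrices support
# (an assumption about how the sampler would use them; the class's real sampling method is
# not shown here): p(z = k) is proportional to (n_dk + alpha) * (n_kw + beta) / (n_k + N * beta),
# where the counts for the word being resampled are assumed to have been decremented first.
import numpy as np

def sample_topic(doc_topic_counts, topic_word_counts, topic_totals, word_idx, alpha, beta, N):
    # doc_topic_counts: length-K counts for the current document (a row of theta)
    # topic_word_counts: K x N matrix (phi); topic_totals: length-K vector (wordCount_k)
    weights = (doc_topic_counts + alpha) * (topic_word_counts[:, word_idx] + beta) / (topic_totals + N * beta)
    weights = weights / weights.sum()  # normalize into a proper probability distribution
    return np.random.choice(len(weights), p=weights)
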
def test_keywords_expression(self):
    tokens = Tokenize.tokenize("define cond if else set! lambda", 0)
    expected = [makeKeyword(Tokens.DEFINE), makeKeyword(Tokens.COND),
                makeKeyword(Tokens.IF), makeKeyword(Tokens.ELSE),
                makeKeyword(Tokens.ASSIGNMENT), makeKeyword(Tokens.LAMBDA)]
    result = list(parse(tokens))
    self.assertTrue(all(code == ParseError.OK for code, _ in result))
    actual = [item for _, item in result]
    self.assertEqual(expected, actual)

def test_primitive_expression(self):
    tokens = Tokenize.tokenize('"I am a string" 1234 #t #f symbol null', 0)
    expected = [makeString('I am a string'), makeNumber(1234), makeBoolean(True),
                makeBoolean(False), makeSymbol("symbol"), makeNULL()]
    result = list(parse(tokens))
    self.assertTrue(all(code == ParseError.OK for code, _ in result))
    actual = [item for _, item in result]
    self.assertEqual(expected, actual)

def feature(text):
    text = " ".join(text)
    tokens = Tokenize.byWord(text)
    fd = Ngrams.getNgramFreqDist(text, n)
    topM = sorted([item for item in fd.items()], key=lambda x: x[1], reverse=True)[:m]
    # print(topM)
    total = 0
    for p in topM:
        total += p[1]
    PDF = []
    for p in topM:
        PDF.append((p[0], p[1] / total))
    return dict(PDF[:m])

def answer(self, text):
    if not Tokenize.valid_conf_code(text):
        print("Enter a valid verification code")
        return VerificationCodeState()
    print("We are verifying your Credit Card information. Wait a moment.")
    time.sleep(3)
    print("We are all set. Your Credit Card is valid.")
    print("Let's see your order: ")
    for game in order.games:
        print(game['name'] + " price: " + str(game['price']))
    print("Total: " + str(order.total))
    print("Do you want to proceed with the order?")
    return ConfirmCreditCardState()

def test_multiple_expression(self):
    tokens = Tokenize.tokenize("(define size 4)(+ size 5)", 0)
    expListOne = [makeKeyword(Tokens.DEFINE), makeSymbol("size"), makeNumber(4)]
    expListTwo = [makeSymbol("+"), makeSymbol("size"), makeNumber(5)]
    expectedOne = makeList(expListOne)
    expectedTwo = makeList(expListTwo)
    result = list(parse(tokens))
    self.assertTrue(all(code == ParseError.OK for code, _ in result))
    actual = [item for _, item in result]
    self.assertEqual(2, len(actual))
    self.assertEqual(expectedOne, actual[0])
    self.assertEqual(expectedTwo, actual[1])

def percentOfLetters(text):
    text = " ".join(text)
    tokens = Tokenize.byWord(text)
    vector = {}
    total = 0
    for i in range(26):
        vector["pL" + chr(i + ord('a'))] = 0
    for c in text.lower():
        if "pL" + c in vector.keys():
            vector["pL" + c] += 1
        total += 1
    for i in range(26):
        vector["pL" + chr(i + ord('a'))] = int(100 * (vector["pL" + chr(i + ord('a'))] / total))
    return vector

def train_models(languages, pretrainded_head=5000):
    pretrainded_models = {}
    for language in languages:
        corpus = ''
        for i in range(1, 4):
            with open('Texts/{}/{}{}.txt'.format(language, language, i), encoding='utf-8') as f:
                corpus += f.read().rstrip()
        ngrams = Tokenize.get_ngrams(corpus, ngram_size=3)
        pretrained = get_ngram_frequency(ngrams).head(pretrainded_head)
        pretrained.to_csv('{}_{}.csv'.format(language, pretrainded_head), header=False)
        pretrainded_models[language] = pretrained
    return pretrainded_models

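# A self-contained sketch of what the helpers used above presumably do (an assumption):
# build character 3-grams and rank them by frequency with pandas, which is consistent with
# the .head() and .to_csv() calls in train_models. The helper names here are illustrative,
# not the project's real Tokenize.get_ngrams / get_ngram_frequency.
import pandas as pd

def char_ngrams(text, ngram_size=3):
    return [text[i:i + ngram_size] for i in range(len(text) - ngram_size + 1)]

def ngram_frequency(ngrams):
    return pd.Series(ngrams).value_counts()  # most frequent n-grams first

# e.g. ngram_frequency(char_ngrams("the quick brown fox")).head(5)
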
def featureNumericScore(sample):
    words = Tokenize.byWord(sample)
    HSWords = loadHSWords()
    sentimentWordCount = 0
    score = 0
    for w in words:
        for s in HSWords:
            if w == s["word"]:
                score += s["score"]
                sentimentWordCount += 1
    # print("Raw score", score)
    score = int(score / (sentimentWordCount if sentimentWordCount > 0 else 1))
    # rating = 5 if score > 2 else 4 if score > 1 else 3 if score > -2 else 2 if score > -3 else 1
    # print("Ours:", rating, "Score", score)
    return {"HS raw score": score}

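# loadHSWords is not shown in this collection; judging from the loops above it presumably
# returns a list of {"word": ..., "score": ...} entries. A tiny stand-in of that assumed
# shape, handy for exercising featureNumericScore without the real lexicon; the words and
# scores below are illustrative only.
def loadHSWordsStub():
    return [
        {"word": "good", "score": 3},
        {"word": "great", "score": 4},
        {"word": "bad", "score": -3},
        {"word": "awful", "score": -4},
    ]
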
def posDist(text):
    text = " ".join(text)
    tokens = Tokenize.byWord(text)
    POStags = [tag for word, tag in TaggingTools.tagPOS(text)]
    possibleTags = PerceptronTagger().model.classes
    vector = {}
    total = 0
    for tag in possibleTags:
        vector[tag] = 0
    for tag in POStags:
        vector[tag] += 1
        total += 1
    for tag in possibleTags:
        vector[tag] = int(100 * vector[tag] / total)
    return vector

def featureBinaryScore(sample):
    words = Tokenize.byWord(sample)
    HSWords = loadHSWords()
    sentimentWordCount = 0
    score = 0
    for w in words:
        for s in HSWords:
            if w == s["word"]:
                score += s["score"]
                sentimentWordCount += 1
    # print("Raw score", score)
    score = int(score / (sentimentWordCount if sentimentWordCount > 0 else 1))
    rating = "+" if score > 0 else "-"
    # print("Ours:", rating, "Score", score)
    return {"HS rating": rating}

def featureHitCountBucketed(sample):
    words = Tokenize.byWord(sample)
    HSWords = loadHSWords()
    sentimentWordCount = 0
    score = 0
    for w in words:
        for s in HSWords:
            if w == s["word"]:
                score += s["score"]
                sentimentWordCount += 1
    # print("Raw score", score)
    score = int(score / (sentimentWordCount if sentimentWordCount > 0 else 1))
    # rating = 5 if score > 2 else 4 if score > 1 else 3 if score > -2 else 2 if score > -3 else 1
    # print("Ours:", rating, "Score", score)
    return {"HS hit count": "HIGH" if sentimentWordCount > 8 else "MEDIUM" if sentimentWordCount > 4 else "LOW"}

def wordLengthDist(text):
    text = " ".join(text)
    words = Tokenize.byWordAlphaOnly(text)
    vector = {}
    total = 0
    for i in range(1, 11):
        vector["%ofwords" + str(i) + "long"] = 0
    count = 0
    words = list(set(words))
    for word in words:
        if len(word) < 10:
            vector["%ofwords" + str(len(word)) + "long"] += 1
        else:
            vector["%ofwords" + str(10) + "long"] += 1
        total += 1
    for i in range(1, 11):
        vector["%ofwords" + str(i) + "long"] = int(100 * vector["%ofwords" + str(i) + "long"] / total)
    return vector

def textLength(text):
    return {"text Length": len(Tokenize.byWord(text))}

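# The extractors above all return plain dicts, which is the featureset shape that
# nltk.NaiveBayesClassifier.train expects. A hedged sketch of wiring a few of the
# string-based extractors together; the chosen extractors and the labeled-data format
# (a list of (text, label) pairs) are assumptions, not part of the original code.
import nltk

def combinedFeatures(sample):
    feats = {}
    feats.update(textLength(sample))
    feats.update(avgWordLengthBucketed(sample))
    feats.update(vocabSizeBucketed(sample))
    return feats

# labeled = [(combinedFeatures(text), label) for text, label in training_samples]
# classifier = nltk.NaiveBayesClassifier.train(labeled)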