def crossValidationLinearInterpolation(tweetList, k, maxNgram):
    """Run k-fold cross-validation of the linear-interpolation classifier.

    For every fold the tweets are split with ``divideDataset``, n-gram data
    is obtained from the training part via ``utils.obtainNgrams``, the
    interpolation coefficients are computed, and each tweet of the test part
    is classified.  Every prediction is handed to ``utils.printResultTXT``
    and a per-fold tally of correct predictions is printed.

    Args:
        tweetList: Tweets to cross-validate on.  Each tweet must expose
            ``text`` and ``language`` attributes.
        k: Number of folds.
        maxNgram: N-gram order forwarded to n-gram extraction, coefficient
            computation and prediction.

    Returns:
        None.  Results are only printed / written by the helpers.
    """
    for fold in xrange(k):
        trainSet, testSet = divideDataset(tweetList, k, fold)
        trainDist, arrayLanguages, languagesAll = utils.obtainNgrams(trainSet, maxNgram)
        coefficients = linear.getlinearcoefficientsForLanguageArray(
            arrayLanguages, maxNgram, trainDist)
        # Single-argument parenthesised print behaves exactly like the
        # Python 2 print statement.
        print(coefficients)

        hits = 0
        seen = 0
        for tweet in testSet:
            guess, probability = linear.getPredictedLanguageForTweet(
                coefficients, tweet.text, maxNgram, trainDist)
            utils.printResultTXT(guess, tweet)
            seen += 1
            if guess == tweet.language:
                hits += 1
            # Debug (disabled): print str(hits)+'/'+str(seen)

        print('correct tweets fold ' + str(fold) + ' = ' + str(hits) + '/' + str(seen))
# Generate linear coefficients: input (n-grams and language)
# Smooth data
# Alternative run (disabled):
# cv.crossValidationLinearInterpolation(tweetListPreProcessed_train, 3, maxNgram)
trainDist, arrayLanguages, languagesAll = utils.obtainNgrams(tweetListPreProcessed_train, maxNgram)

# One coefficient set per n-gram order: index 0 holds order 1, index 1 holds
# order 2, and so on up to maxNgram.
linearCoefficientsAll = [
    linear.getlinearcoefficientsForLanguageArray(arrayLanguages, gram, trainDist)
    for gram in xrange(1, maxNgram + 1)
]
print(linearCoefficientsAll)

# Alternative (disabled): load previously stored coefficients instead.
# linearCoefficientsALL = read.readLinearCoefficients(LI_Coefficients)

# Index of the first coefficient set to evaluate.  `count` is advanced to
# i + 1 before it is used, so it is the n-gram order matching
# linearCoefficientsAll[i].
count = 4
for i in xrange(count, maxNgram):
    count = i + 1
    t0 = time.time()  # only read by the disabled timing print below
    for tweet in tweetListPreProcessed_test:
        # t0 = time.time()
        predictedLanguage, probability = linear.getPredictedLanguageForTweet(
            linearCoefficientsAll[i], tweet.text, count, trainDist)
        utils.printResultTXT(predictedLanguage, tweet, count)
    # print "time = "+str(time.time()-t0)

# cv.nestedCrossValidation(tweetListPreProcessed,5,5,[0,0,0],arrayLanguagesFull)
# cv.crossValidation(tweetListPreProcessed, 3, maxNgram+1)

# 3.3-. Out-of-place Measure
# Sample tweet text.  NOTE(review): `c` is not read anywhere in this fragment;
# confirm whether it is still needed.
c = '"En Cada Lucha Aquel Que Va A Muerte Es El Que Gana" Goazen @PasaiaRegional!! #aupaekipo #aupapasaia pic.twitter.com/BQ1ikdE2Qt'
# `text` is only referenced by the disabled sys.stdout.write call further down.
text = preprocess.preprocessText(tweetEU)

# Alternative (disabled): compute the coefficients instead of loading them.
# linearCoefficients = linear.getlinearcoefficientsForLanguageArray(arrayLanguages, maxNgram, corpusNgrams)
linearCoefficientsALL = read.readLinearCoefficients(LI_Coefficients)
# Entry maxNgram-1 of the loaded list is used for prediction with order maxNgram.
linearCoefficients = linearCoefficientsALL[maxNgram - 1]

import time

t1 = time.time()  # start of the whole classification run
for tweet in tweetListPreProcessedtest:
    t0 = time.time()  # start of this tweet
    predictedLanguage, probability = linear.getPredictedLanguageForTweet(
        linearCoefficients, tweet.text, maxNgram, corpusNgrams)
    # NOTE(review): the literal 5 is passed here while prediction uses
    # maxNgram -- confirm that this is intended when maxNgram != 5.
    utils.printResultTXT(predictedLanguage, tweet, 5)
    print("time for tweet= " + str(time.time() - t0))
print("time total= " + str(time.time() - t1))

# sys.stdout.write("\n Tweet: "+str(text.encode("utf-8")))
# sys.stdout.write("\n Tweet language: "+str(predictedLanguage)+"\n Probability of: "+str(probability)+"\n")

# 3.3-. Algorithms: Ranking Methods
# cv.nestedCrossValidation(tweetListPreProcessed,5,5,[0,0,0],arrayLanguagesFull)
# cv.crossValidation(tweetListPreProcessed, 3, [0,0,0], arrayLanguagesFull, maxNgram+1)

# 3.4-. Out-of-place Measure