def crossValidationLinearInterpolation(tweetList, k, maxNgram):
    """Run a k-fold evaluation of the linear-interpolation language predictor.

    For every fold index in ``0 .. k-1`` the function:
      1. splits ``tweetList`` into a train and a test part via ``divideDataset``;
      2. builds the n-gram data for the train part with ``utils.obtainNgrams``;
      3. obtains the interpolation coefficients from
         ``linear.getlinearcoefficientsForLanguageArray`` and prints them;
      4. predicts a language for each test tweet, hands the prediction to
         ``utils.printResultTXT`` and tallies how many predictions equal
         ``tweet.language``;
      5. prints the ``correct/total`` tally for the fold.

    Args:
        tweetList: collection of tweets passed to ``divideDataset``; each test
            tweet must expose ``.text`` and ``.language``.
        k: number of folds.
        maxNgram: n-gram order forwarded to the n-gram, coefficient and
            prediction helpers.

    Returns:
        None. Results are only printed / written by the helpers.
    """
    for fold in xrange(k):
        trainSet, testSet = divideDataset(tweetList, k, fold)
        # languagesAll is returned by the helper but not needed here.
        trainDist, arrayLanguages, languagesAll = utils.obtainNgrams(trainSet, maxNgram)
        linearCoefficients = linear.getlinearcoefficientsForLanguageArray(
            arrayLanguages, maxNgram, trainDist)
        print(linearCoefficients)

        hits = 0
        seen = 0
        for tweet in testSet:
            # The returned probability is not used for scoring.
            predictedLanguage, probability = linear.getPredictedLanguageForTweet(
                linearCoefficients, tweet.text, maxNgram, trainDist)
            utils.printResultTXT(predictedLanguage, tweet)

            seen += 1
            if predictedLanguage == tweet.language:
                hits += 1

        print('correct tweets fold ' + str(fold) + ' = ' + str(hits) + '/' + str(seen))
# Example #2
# 0
#       Generate linear coefficients: input (n-grams and language)
#       Smooth data

# cv.crossValidationLinearInterpolation(tweetListPreProcessed_train, 3, maxNgram)

# Build the n-gram data from the training tweets, then collect one set of
# interpolation coefficients per n-gram order 1..maxNgram.  Because the
# orders are appended in ascending order, index g-1 holds the coefficients
# for order g.
trainDist, arrayLanguages, languagesAll = utils.obtainNgrams(tweetListPreProcessed_train, maxNgram)
linearCoefficientsAll = [
    linear.getlinearcoefficientsForLanguageArray(arrayLanguages, gram, trainDist)
    for gram in xrange(1, maxNgram + 1)
]

print(linearCoefficientsAll)

# linearCoefficientsALL = read.readLinearCoefficients(LI_Coefficients)


count = 4  # Starting point: the first n-gram order evaluated below is count + 1

for i in xrange(count, maxNgram):
    # Index i of linearCoefficientsAll holds the coefficients for order i + 1,
    # so the order used for prediction always matches the coefficients.
    count = i + 1
    t0 = time.time()  # only read by the commented-out timing print below

    for tweet in tweetListPreProcessed_test:
        # t0 = time.time()
        predictedLanguage, probability = linear.getPredictedLanguageForTweet(
            linearCoefficientsAll[i], tweet.text, count, trainDist)
        utils.printResultTXT(predictedLanguage, tweet, count)
    # print "time = "+str(time.time()-t0)
    # cv.nestedCrossValidation(tweetListPreProcessed,5,5,[0,0,0],arrayLanguagesFull)
# cv.crossValidation(tweetListPreProcessed, 3, maxNgram+1)

# 3.3-. Out-of-place Measure
# Example #3
# 0
import time

# Sample tweet text; it is not referenced again in the visible code.
c = '"En Cada Lucha Aquel Que Va A Muerte Es El Que Gana" Goazen @PasaiaRegional!! #aupaekipo #aupapasaia pic.twitter.com/BQ1ikdE2Qt'


# tweetEU is expected to be defined elsewhere; `text` is only used by the
# commented-out debug output further down the file.
text = preprocess.preprocessText(tweetEU)

# linearCoefficients = linear.getlinearcoefficientsForLanguageArray(arrayLanguages, maxNgram, corpusNgrams)
# Load the stored coefficients and pick the entry at index maxNgram - 1
# (presumably one entry per n-gram order, starting at order 1 -- confirm
# against read.readLinearCoefficients).
linearCoefficientsALL = read.readLinearCoefficients(LI_Coefficients)
linearCoefficients = linearCoefficientsALL[maxNgram - 1]

# Predict every test tweet, reporting the time spent per tweet and overall.
t1 = time.time()
for tweet in tweetListPreProcessedtest:
    t0 = time.time()

    predictedLanguage, probability = linear.getPredictedLanguageForTweet(
        linearCoefficients, tweet.text, maxNgram, corpusNgrams)
    # NOTE(review): the third argument is a literal 5 rather than maxNgram.
    utils.printResultTXT(predictedLanguage, tweet, 5)

    print("time for tweet= " + str(time.time() - t0))
print("time total= " + str(time.time() - t1))

# sys.stdout.write("\n    Tweet:  "+str(text.encode("utf-8")))
# sys.stdout.write("\n    Tweet language:   "+str(predictedLanguage)+"\n    Probability of:  "+str(probability)+"\n")


# 3.3-. Algorithms: Ranking Methods

# cv.nestedCrossValidation(tweetListPreProcessed,5,5,[0,0,0],arrayLanguagesFull)
# cv.crossValidation(tweetListPreProcessed, 3, [0,0,0], arrayLanguagesFull, maxNgram+1)

# 3.4-. Out-of-place Measure