Python UtilsTweetSafa.obtainNgrams примеры использования

Язык программирования: Python

Класс/Тип: UtilsTweetSafa

Метод/Функция: obtainNgrams

Примеров на hotexamples.com: 5

Python UtilsTweetSafa.obtainNgrams — найдено 5 примеров. Это лучшие примеры Python-кода для UtilsTweetSafa.obtainNgrams, полученные из проектов с открытым исходным кодом. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

obtainNgrams(4)

printResultTXT(2)

chooseLanguagesLin(1)

getBigramFreqForSingleLang(1)

returnNgramFreqSet(1)

printResults(1)

orderVector(1)

learnNgramConfidencefromData(1)

getAllLanguagesSet(1)

cleanDataset(1)

formatDataset(1)

evaluateNgramRakingSet(1)

crossValidationRanking(1)

crossValidationLidstone(1)

createDataSet(1)

cleanTweets(1)

returnNgramFreqSetRanking(1)

Пример #1

Показать файл

Файл: CrossValidation.py Проект: CarlosAndres12/SEPLN-TweetLID14

def crossValidation(tweetList, k,maxNgram):
    """Run the n-gram ranking pipeline once per fold and print each result.

    For every fold index in ``xrange(k)`` the tweets are split with
    ``divideDataset``, n-grams are obtained from the training part, a
    confidence dictionary is learnt from them, the test part is evaluated,
    and the predictions are handed to ``utils.printResults``.

    Args:
        tweetList: collection of tweets forwarded to ``divideDataset``.
        k: number of folds; also forwarded to ``divideDataset``.
        maxNgram: forwarded to ``utils.obtainNgrams``.

    Returns:
        None. Output is produced by ``utils.printResults``.
    """
    # Fixed values passed as the last two arguments of
    # utils.evaluateNgramRakingSet; their meaning is defined there.
    rankingM = 80
    rankingN = 50

    for fold in xrange(k):
        trainingTweets, heldOutTweets = divideDataset(tweetList, k, fold)

        ngramDist = utils.obtainNgrams(trainingTweets, maxNgram)
        confidences = utils.learnNgramConfidencefromData(ngramDist, trainingTweets)

        # The second returned value is not used in this function.
        predictions, _unused = utils.evaluateNgramRakingSet(
            heldOutTweets, ngramDist, confidences, rankingM, rankingN)

        # An alternative reporter, utils.printJeroni, is disabled.
        utils.printResults(heldOutTweets, predictions, fold)

Пример #2

Показать файл

Файл: CrossValidation.py Проект: CarlosAndres12/SEPLN-TweetLID14

def crossValidationLinearInterpolation(tweetList, k, maxNgram):
    """Run the linear-interpolation pipeline once per fold and print a tally.

    For every fold index in ``xrange(k)``: split the tweets with
    ``divideDataset``, obtain n-grams from the training part, compute the
    coefficients with ``linear.getlinearcoefficientsForLanguageArray`` and
    print them, then predict a language for each test tweet, report it via
    ``utils.printResultTXT`` and count how many predictions equal
    ``tweet.language``. The per-fold tally is printed at the end of the fold.

    Args:
        tweetList: collection of tweets forwarded to ``divideDataset``.
        k: number of folds; also forwarded to ``divideDataset``.
        maxNgram: forwarded to the n-gram, coefficient and prediction helpers.

    Returns:
        None. All results are printed.
    """
    for fold in xrange(k):
        trainingTweets, heldOutTweets = divideDataset(tweetList, k, fold)

        # The third returned value is not used in this function.
        ngramDist, languages, _allLanguages = utils.obtainNgrams(trainingTweets, maxNgram)
        coefficients = linear.getlinearcoefficientsForLanguageArray(languages, maxNgram, ngramDist)
        print(coefficients)

        hits = 0
        seen = 0
        for tweet in heldOutTweets:
            # The returned probability is not used in this function.
            guess, _probability = linear.getPredictedLanguageForTweet(
                coefficients, tweet.text, maxNgram, ngramDist)
            utils.printResultTXT(guess, tweet)

            seen += 1
            if guess == tweet.language:
                hits += 1

        print('correct tweets fold '+str(fold)+' = '+str(hits)+'/'+str(seen))

Пример #3

Показать файл

Файл: mainLID.py Проект: CarlosAndres12/SEPLN-TweetLID14

# Run the same pre-processing entry point over the training and test tweets.
tweetListPreProcessed_train = preprocess.main(tweetList_train)
tweetListPreProcessed_test = preprocess.main(tweetList_test)
# shuffle(tweetListPreProcessed)

# 3-. Algorithms

# 3.1-. Algorithms: Bayesian Networks
#   3.2.1-. Linear interpolation
#       Generate linear coefficients: input (n-grams and language)
#       Smooth data

# cv.crossValidationLinearInterpolation(tweetListPreProcessed_train, 3, maxNgram)

# Collects one getlinearcoefficientsForLanguageArray result per value of
# `gram`, in increasing order of `gram` (so index 0 holds the result for 1).
linearCoefficientsAll = list()

# obtainNgrams is called once on the training tweets; its three returned
# values are unpacked here, and languagesAll is not used in this fragment.
trainDist, arrayLanguages, languagesAll = utils.obtainNgrams(tweetListPreProcessed_train, maxNgram)
for gram in xrange(1, maxNgram+1):
    linearCoefficientsAll.append(linear.getlinearcoefficientsForLanguageArray(arrayLanguages, gram, trainDist))

print linearCoefficientsAll

# Disabled alternative: load the coefficients instead of computing them.
# linearCoefficientsALL = read.readLinearCoefficients(LI_Coefficients)


count = 4 # From which gram to start

for i in xrange(count, maxNgram):
    count = count + 1
    t0 = time.time()

    for tweet in tweetListPreProcessed_test:

Пример #4

Показать файл

Файл: Demo.py Проект: buhrmann/SEPLN-TweetLID14

# Read the test tweets from the dataset referenced by `test`.
tweetListtest = read.read_tweets_dataset(test)
# 2-. Pre-process state

# The same pre-processing entry point is applied to both tweet lists.
tweetListPreProcessed = preprocess.main(tweetList)
tweetListPreProcessedtest= preprocess.main(tweetListtest)
# Only the first list is shuffled; the return value is ignored, so this
# presumably reorders the list in place (random.shuffle?) -- TODO confirm
# against the file's imports.
shuffle(tweetListPreProcessed)
    # Raw data -> tweetList
    # Clean data -> tweetListPreProcessed

#utils.printTweets(tweetListPreProcessed)

# 3-. Algorithms
#
# 3.1-. OBTAIN N-GRAMS

# NOTE(review): obtainNgrams receives maxNgram+1 here, not maxNgram --
# confirm this matches how the callee interprets its second argument.
corpusNgrams, arrayLanguages, arrayLanguagesFull = utils.obtainNgrams(tweetListPreProcessed, maxNgram+1)
# Replace the third returned value with the result of utils.orderVector.
arrayLanguagesFull = utils.orderVector(arrayLanguagesFull)

# Example:  print(corpusNgrams.get(str(3)).get('pt'))


# 3.2-. Algorithms: Bayesian Networks
#   3.2.1-. Linear interpolation
#       Generate linear coefficients: input (n-grams and language)
#       Smooth data


# Sample sentences; the variable suffixes suggest English, Portuguese,
# Catalan and Basque. How they are used is outside this fragment.
tweetEN = "Tomorrow is going to be a good day to go to the beach."
tweetPT = "Amanhã será um dia muito bom, como ir para a praia."
tweetCA = "Demà farà un dia molt bo, com per anar a la platja."
tweetEU = "Bihar egun oso ona egingo du, hondartzara joateko modukoa."

Пример #5

Показать файл

Файл: CalculateLICoefficients.py Проект: CarlosAndres12/SEPLN-TweetLID14

# _____________________________________________________________________________


# 1-. Read the dataset and build tweetList from it
#     (presumably a list of Tweet objects -- confirm in read.read_tweets_dataset)

# Command-line arguments: argv[1] is the dataset path, argv[2] is converted
# to int and used as maxNgram.
dataset = sys.argv[1]
maxNgram = int(sys.argv[2])

# `filename` is the LIST of dot-separated parts of the dataset's base name;
# only filename[0] is used later, when the output file name is built.
filename = os.path.basename(dataset).split('.')

tweetList = read.read_tweets_dataset(dataset)

# 2-. Pre-process state
    # Raw data -> tweetList
    # Clean data -> tweetListPreProcessed
tweetListPreProcessed = preprocess.main(tweetList)

# 3-. OBTAIN N-GRAMS and Linear Coefficients
#
# For every i from 5 up to maxNgram, n-grams are obtained with i+1 as the
# second argument and the coefficients computed for i are appended to a
# tab-separated text file, one row per entry of linearCoefficients:
#     i <TAB> li[0] <TAB> li[1] ... <TAB> li[i]

# The output path does not depend on i, so it is built once.
coefficientsPath = '../Dataset/LICoefficients_'+str(maxNgram)+'gram_for-'+str(filename[0])+'.txt'

for i in xrange(5, maxNgram+1):
    corpusNgrams, arrayLanguages, arrayLanguagesFull = utils.obtainNgrams(tweetListPreProcessed, i+1)
    linearCoefficients = linear.getlinearcoefficientsForLanguageArray(arrayLanguages, i, corpusNgrams)
    # print linearCoefficients

    # Open inside a with-block so every handle is flushed and closed before
    # the next pass, and nothing is closed when the range is empty
    # (maxNgram < 5). The handle is not called `file`, which would shadow the
    # Python 2 builtin.
    with open(coefficientsPath, 'a+') as coefficientsFile:
        for li in linearCoefficients:
            # Row layout: i followed by li[0] .. li[i], all tab-separated.
            fields = [str(i)] + [str(li[co]) for co in xrange(i+1)]
            coefficientsFile.write("\t".join(fields) + "\n")