Example #1
import csv
import sys

import naiveBayes

def countWords(tweetColumn, classColumn):
    """count class frequencies and per-class document frequencies of the
    words in the CSV data on standard input (column numbers are 1-based)"""
    classes = {}
    wordFreq = {}
    csvreader = csv.reader(sys.stdin, delimiter=',', quotechar='"')
    for row in csvreader:
        if len(row) < tweetColumn or len(row) < classColumn:
            sys.exit(COMMAND + ": unexpected line in input: " +
                     (",".join(row)) + "\n")
        tokenized = naiveBayes.tokenize([row[tweetColumn - 1]])
        thisClass = row[classColumn - 1]
        if thisClass in classes: classes[thisClass] += 1
        else: classes[thisClass] = 1
        # register each word at most once per tweet (document frequency)
        words = {}
        for word in tokenized[0]:
            words[word] = True
        for word in words:
            if word not in wordFreq: wordFreq[word] = {}
            if thisClass in wordFreq[word]: wordFreq[word][thisClass] += 1
            else: wordFreq[word][thisClass] = 1
    return {"wordFreq": wordFreq, "classes": classes}
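The dictionary that countWords returns is all that is needed to estimate the Naive Bayes conditional probabilities. A minimal sketch of that step with add-one (Laplace) smoothing; the function name and the smoothing scheme are our assumptions, not necessarily what the naiveBayes module does:

def wordProbabilities(counts, smoothing=1.0):
    # hypothetical follow-up step: turn the counts from countWords()
    # into smoothed estimates of P(word | class)
    classes = counts["classes"]
    wordFreq = counts["wordFreq"]
    probs = {}
    for word in wordFreq:
        probs[word] = {}
        for thisClass in classes:
            seen = wordFreq[word].get(thisClass, 0)
            probs[word][thisClass] = (seen + smoothing) / \
                                     (classes[thisClass] + 2.0 * smoothing)
    return probs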
def readData(tweetColumn, fileHasHeading):
    """read CSV data from standard input, print the tokenized tweet texts
    and return them as a list of space-separated lines"""
    text = []
    csvreader = csv.reader(sys.stdin, delimiter=',', quotechar='"')
    lineNbr = 0
    for row in csvreader:
        lineNbr += 1
        # ignore the first line if it is a heading
        if lineNbr == 1 and fileHasHeading: continue
        # tokenize the tweet text (False: do not keep upper case)
        tokenized = naiveBayes.tokenize([row[tweetColumn]], False)
        # print the tokenized text, one space-separated line per token list
        for i in range(0, len(tokenized)):
            outLine = " ".join(tokenized[i])
            text.append(outLine)
            print(unicode(outLine).encode('utf8'))
            # flush stdout so downstream pipes see the line immediately
            sys.stdout.flush()
    return text
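Both functions lean on naiveBayes.tokenize, which is not shown here. Judging from the call sites, it takes a list of texts plus an optional flag for keeping upper case and returns one token list per text. A hypothetical stand-in under those assumptions (the regular expression is our guess):

import re

def tokenize(texts, keepUpperCase=False):
    # hypothetical stand-in for naiveBayes.tokenize: one token list per text
    tokenized = []
    for text in texts:
        if not keepUpperCase:
            text = text.lower()
        tokenized.append(re.findall(r"[\w']+", text, re.UNICODE))
    return tokenized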
Example #3
def words_per_user_mapper(status_update):
    """for each word in the status update emit (user, (word, 1))"""
    user = status_update["username"]
    for word in tokenize(status_update["text"]):
        yield (user, (word, 1))
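The (user, (word, 1)) pairs from this mapper are meant to be grouped by user and handed to a reducer. One plausible reducer, which picks each user's most frequent word (the function name and tie-breaking via Counter.most_common are our choices):

from collections import Counter

def most_popular_word_reducer(user, words_and_counts):
    """given (word, count) pairs for one user,
    yield the word with the highest total count"""
    word_counts = Counter()
    for word, count in words_and_counts:
        word_counts[word] += count
    word, count = word_counts.most_common(1)[0]
    yield (user, (word, count))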
Example #4
def wc_mapper(document):
    """ for each word in the document emit (word,1) """
    for word in tokenize(document):
        yield (word, 1)
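A mapper alone does nothing; it needs a matching reducer and something to group the emitted pairs by key. A minimal single-machine sketch of both (map_reduce below simulates the shuffle with a dictionary; it is our simplification, not a distributed implementation):

from collections import defaultdict

def wc_reducer(word, counts):
    """sum up the counts for a word"""
    yield (word, sum(counts))

def map_reduce(inputs, mapper, reducer):
    """run the mapper over all inputs, group the emitted values by key,
    then run the reducer over each group"""
    collector = defaultdict(list)
    for input in inputs:
        for key, value in mapper(input):
            collector[key].append(value)
    return [output
            for key, values in collector.items()
            for output in reducer(key, values)]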
Example #5
from collections import Counter

def word_count_old(documents):
    """word count not using MapReduce"""
    return Counter(word
                   for document in documents
                   for word in tokenize(document))
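Assuming the tokenize stand-in from Example #1 and the wc_mapper/wc_reducer/map_reduce sketch from Example #4, both versions yield the same counts:

documents = ["data science", "big data", "science fiction"]

word_count_old(documents)
# Counter({'data': 2, 'science': 2, 'big': 1, 'fiction': 1})

map_reduce(documents, wc_mapper, wc_reducer)
# [('data', 2), ('science', 2), ('big', 1), ('fiction', 1)]
# (pair order may differ)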
Example #6
    sys.exit(USAGE)
for option in options:
    if option[0] == "-c": keepUpperCase = True

csvreader = csv.reader(sys.stdin, delimiter=',', quotechar='"')
lineNbr = 0
for row in csvreader:
    lineNbr += 1
    # first line is a heading
    if lineNbr == 1:
        for i in range(0, len(row)):
            if row[i] == "tweet": tweetColumn = i
            elif row[i] == "class" or row[i] == naiveBayes.CLASSCOLUMNNAME:
                classColumn = i
    else:
        # sanity check
        if tweetColumn == NONE:
            sys.exit(COMMAND +
                     ": tweet column definition missing in heading: " +
                     str(row))
        # tokenize the tweet text (tokenize returns a list holding one token list)
        tokenized, = naiveBayes.tokenize([row[tweetColumn]], keepUpperCase)
        # print the class label (if present) followed by the tokenized text
        if classColumn != NONE:
            sys.stdout.write(LABELPREFIX + row[classColumn] + " ")
        sys.stdout.write(" ".join(tokenized).encode("utf8"))
        print
sys.exit()
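LABELPREFIX is defined elsewhere in the script. If it is the fastText-style __label__ prefix (an assumption on our part), a labeled row comes out like this:

LABELPREFIX = "__label__"       # assumption: fastText-style label prefix

row = ["1", "positive", "Loved the new movie!"]   # as csv.reader yields it
tokens = [u"loved", u"the", u"new", u"movie"]     # tokenized, lower-cased

print LABELPREFIX + row[1] + " " + " ".join(tokens)
# __label__positive loved the new movie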
Example #7

# main function starts here
checkOptions()

# get target classes from training data file
targetClasses = naiveBayes.getTargetClasses(trainFile)
if len(targetClasses) == 0:
    sys.exit(COMMAND + ": cannot find target classes\n")

# if required: train the word vector model and save it to file
if modelFile != "":
    # read the model data
    readDataResults = naiveBayes.readData(modelFile, targetClasses[0])
    # tokenize the model data
    tokenizeResults = naiveBayes.tokenize(readDataResults["text"])
    # build the word vectors (test sg=1,window=10)
    wordvecModel = gensim.models.Word2Vec(tokenizeResults,
                                          min_count=MINCOUNT,
                                          size=maxVector)
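    # note: "size" is the pre-4.0 gensim keyword; in gensim >= 4.0 the
    # corresponding Word2Vec argument is called "vector_size"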
    # save the word vectors
    wordvecModel.save(wordvectorFile)

# load the word vector model from file
patternNameVec = re.compile(r"\.vec$")
if not patternNameVec.search(wordvectorFile):
    print >> sys.stderr, "loading gensim vector model from file: %s" % (
        wordvectorFile)
    # read standard file format from gensim
    wordvecModel = gensim.models.Word2Vec.load(wordvectorFile)
else: