Example #1
import csv
import sys

import naiveBayes

def countWords(tweetColumn, classColumn):
    """count class frequencies and per-class document frequencies of the
    words in the CSV data on standard input (column numbers are 1-based)"""
    classes = {}
    wordFreq = {}
    csvreader = csv.reader(sys.stdin, delimiter=',', quotechar='"')
    for row in csvreader:
        if len(row) < tweetColumn or len(row) < classColumn:
            sys.exit(COMMAND + ": unexpected line in input: " +
                     (",".join(row)) + "\n")
        tokenized = naiveBayes.tokenize([row[tweetColumn - 1]])
        thisClass = row[classColumn - 1]
        if thisClass in classes: classes[thisClass] += 1
        else: classes[thisClass] = 1
        # register each word at most once per tweet (document frequency)
        words = {}
        for word in tokenized[0]:
            words[word] = True
        for word in words:
            if word not in wordFreq: wordFreq[word] = {}
            if thisClass in wordFreq[word]: wordFreq[word][thisClass] += 1
            else: wordFreq[word][thisClass] = 1
    return {"wordFreq": wordFreq, "classes": classes}
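The dictionary that countWords returns is all that is needed to estimate the Naive Bayes conditional probabilities. A minimal sketch of that step with add-one (Laplace) smoothing; the function name and the smoothing scheme are our assumptions, not necessarily what the naiveBayes module does:

def wordProbabilities(counts, smoothing=1.0):
    # hypothetical follow-up step: turn the counts from countWords()
    # into smoothed estimates of P(word | class)
    classes = counts["classes"]
    wordFreq = counts["wordFreq"]
    probs = {}
    for word in wordFreq:
        probs[word] = {}
        for thisClass in classes:
            seen = wordFreq[word].get(thisClass, 0)
            probs[word][thisClass] = (seen + smoothing) / \
                                     (classes[thisClass] + 2.0 * smoothing)
    return probs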
def readData(tweetColumn, fileHasHeading):
    """read CSV data from standard input, print the tokenized tweet texts
    and return them as a list of space-separated lines"""
    text = []
    csvreader = csv.reader(sys.stdin, delimiter=',', quotechar='"')
    lineNbr = 0
    for row in csvreader:
        lineNbr += 1
        # ignore the first line if it is a heading
        if lineNbr == 1 and fileHasHeading: continue
        # tokenize the tweet text (False: do not keep upper case)
        tokenized = naiveBayes.tokenize([row[tweetColumn]], False)
        # print the tokenized text, one space-separated line per token list
        for i in range(0, len(tokenized)):
            outLine = " ".join(tokenized[i])
            text.append(outLine)
            print(unicode(outLine).encode('utf8'))
            # flush stdout so downstream pipes see the line immediately
            sys.stdout.flush()
    return text
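Both functions lean on naiveBayes.tokenize, which is not shown here. Judging from the call sites, it takes a list of texts plus an optional flag for keeping upper case and returns one token list per text. A hypothetical stand-in under those assumptions (the regular expression is our guess):

import re

def tokenize(texts, keepUpperCase=False):
    # hypothetical stand-in for naiveBayes.tokenize: one token list per text
    tokenized = []
    for text in texts:
        if not keepUpperCase:
            text = text.lower()
        tokenized.append(re.findall(r"[\w']+", text, re.UNICODE))
    return tokenized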
Example #3
def words_per_user_mapper(status_update):
    """for each word in the status update emit (user, (word, 1))"""
    user = status_update["username"]
    for word in tokenize(status_update["text"]):
        yield (user, (word, 1))
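The (user, (word, 1)) pairs from this mapper are meant to be grouped by user and handed to a reducer. One plausible reducer, which picks each user's most frequent word (the function name and tie-breaking via Counter.most_common are our choices):

from collections import Counter

def most_popular_word_reducer(user, words_and_counts):
    """given (word, count) pairs for one user,
    yield the word with the highest total count"""
    word_counts = Counter()
    for word, count in words_and_counts:
        word_counts[word] += count
    word, count = word_counts.most_common(1)[0]
    yield (user, (word, count))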
Example #4
def wc_mapper(document):
    """ for each word in the document emit (word,1) """
    for word in tokenize(document):
        yield (word, 1)
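A mapper alone does nothing; it needs a matching reducer and something to group the emitted pairs by key. A minimal single-machine sketch of both (map_reduce below simulates the shuffle with a dictionary; it is our simplification, not a distributed implementation):

from collections import defaultdict

def wc_reducer(word, counts):
    """sum up the counts for a word"""
    yield (word, sum(counts))

def map_reduce(inputs, mapper, reducer):
    """run the mapper over all inputs, group the emitted values by key,
    then run the reducer over each group"""
    collector = defaultdict(list)
    for input in inputs:
        for key, value in mapper(input):
            collector[key].append(value)
    return [output
            for key, values in collector.items()
            for output in reducer(key, values)]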
Example #5
from collections import Counter

def word_count_old(documents):
    """word count not using MapReduce"""
    return Counter(word
                   for document in documents
                   for word in tokenize(document))
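Assuming the tokenize stand-in from Example #1 and the wc_mapper/wc_reducer/map_reduce sketch from Example #4, both versions yield the same counts:

documents = ["data science", "big data", "science fiction"]

word_count_old(documents)
# Counter({'data': 2, 'science': 2, 'big': 1, 'fiction': 1})

map_reduce(documents, wc_mapper, wc_reducer)
# [('data', 2), ('science', 2), ('big', 1), ('fiction', 1)]
# (pair order may differ)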
Example #6
    sys.exit(USAGE)
for option in options:
    if option[0] == "-c": keepUpperCase = True

csvreader = csv.reader(sys.stdin, delimiter=',', quotechar='"')
lineNbr = 0
for row in csvreader:
    lineNbr += 1
    # first line is a heading
    if lineNbr == 1:
        for i in range(0, len(row)):
            if row[i] == "tweet": tweetColumn = i
            elif row[i] == "class" or row[i] == naiveBayes.CLASSCOLUMNNAME:
                classColumn = i
    else:
        # sanity check
        if tweetColumn == NONE:
            sys.exit(COMMAND +
                     ": tweet column definition missing in heading: " +
                     str(row))
        # tokenize the tweet text (tokenize returns a list holding one token list)
        tokenized, = naiveBayes.tokenize([row[tweetColumn]], keepUpperCase)
        # print the class label (if present) followed by the tokenized text
        if classColumn != NONE:
            sys.stdout.write(LABELPREFIX + row[classColumn] + " ")
        sys.stdout.write(" ".join(tokenized).encode("utf8"))
        print
sys.exit()
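LABELPREFIX is defined elsewhere in the script. If it is the fastText-style __label__ prefix (an assumption on our part), a labeled row comes out like this:

LABELPREFIX = "__label__"       # assumption: fastText-style label prefix

row = ["1", "positive", "Loved the new movie!"]   # as csv.reader yields it
tokens = [u"loved", u"the", u"new", u"movie"]     # tokenized, lower-cased

print LABELPREFIX + row[1] + " " + " ".join(tokens)
# __label__positive loved the new movie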
Example #7

# main function starts here
checkOptions()

# get target classes from training data file
targetClasses = naiveBayes.getTargetClasses(trainFile)
if len(targetClasses) == 0:
    sys.exit(COMMAND + ": cannot find target classes\n")

# if required: train the word vector model and save it to file
if modelFile != "":
    # read the model data
    readDataResults = naiveBayes.readData(modelFile, targetClasses[0])
    # tokenize the model data
    tokenizeResults = naiveBayes.tokenize(readDataResults["text"])
    # build the word vectors (test sg=1,window=10)
    wordvecModel = gensim.models.Word2Vec(tokenizeResults,
                                          min_count=MINCOUNT,
                                          size=maxVector)
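    # note: "size" is the pre-4.0 gensim keyword; in gensim >= 4.0 the
    # corresponding Word2Vec argument is called "vector_size"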
    # save the word vectors
    wordvecModel.save(wordvectorFile)

# load the word vector model from file
patternNameVec = re.compile(r"\.vec$")
if not patternNameVec.search(wordvectorFile):
    print >> sys.stderr, "loading gensim vector model from file: %s" % (
        wordvectorFile)
    # read standard file format from gensim
    wordvecModel = gensim.models.Word2Vec.load(wordvectorFile)
else: