import csv
import sys

import naiveBayes


def countWords(tweetColumn, classColumn):
    """Count class sizes and per-class document frequencies of words in the CSV on stdin."""
    classes = {}
    wordFreq = {}
    csvreader = csv.reader(sys.stdin, delimiter=',', quotechar='"')
    for row in csvreader:
        if len(row) < tweetColumn or len(row) < classColumn:
            # COMMAND and fileName are module-level globals defined elsewhere
            sys.exit(COMMAND + ": unexpected line in file " + fileName + ": " +
                     ",".join(row) + "\n")
        tokenized = naiveBayes.tokenize([row[tweetColumn - 1]])
        thisClass = row[classColumn - 1]
        if thisClass in classes:
            classes[thisClass] += 1
        else:
            classes[thisClass] = 1
        # register each word at most once per tweet
        words = {}
        for word in tokenized[0]:
            words[word] = True
        for word in words:
            if word not in wordFreq:
                wordFreq[word] = {}
            if thisClass in wordFreq[word]:
                wordFreq[word][thisClass] += 1
            else:
                wordFreq[word][thisClass] = 1
    return {"wordFreq": wordFreq, "classes": classes}
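# Usage sketch (an assumption, not part of the original): column numbers are
# 1-based here, matching the row[... - 1] indexing above.
#
#   counts = countWords(1, 2)    # tweet text in column 1, class label in column 2
#   counts["classes"]            # class -> number of tweets
#   counts["wordFreq"]["good"]   # class -> number of tweets containing "good"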
def readData(tweetColumn, fileHasHeading):
    """Tokenize the tweet column of the CSV on stdin and print one tweet per line."""
    text = []
    csvreader = csv.reader(sys.stdin, delimiter=',', quotechar='"')
    lineNbr = 0
    for row in csvreader:
        lineNbr += 1
        # ignore first line if it is a heading
        if lineNbr == 1 and fileHasHeading:
            continue
        # tokenize text (tweetColumn is 0-based in this function)
        tokenized = naiveBayes.tokenize([row[tweetColumn]], False)
        # print tokenized text, tokens separated by single spaces
        for i in range(0, len(tokenized)):
            print(" ".join(tokenized[i]))
        # flush stdout so the output can be consumed by a pipe immediately
        sys.stdout.flush()
    # note: text is never filled; callers get an empty list back
    return text
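# Invocation sketch (the script name is hypothetical): skip the heading row
# and tokenize the first column of a CSV arriving on stdin.
#
#   readData(0, True)    # e.g. driven by: python tokenize_tweets.py < tweets.csv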
def words_per_user_mapper(status_update):
    """emit (user, (word, 1)) for each word in the user's status update"""
    user = status_update["username"]
    for word in tokenize(status_update["text"]):
        yield (user, (word, 1))
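# A possible companion reducer (a sketch, not part of the original): collect
# one user's (word, count) pairs and emit that user's most frequent word.
from collections import Counter

def most_common_word_reducer(user, words_and_counts):
    """emit (user, (word, count)) for the user's most frequent word"""
    word_counts = Counter()
    for word, count in words_and_counts:
        word_counts[word] += count
    word, count = word_counts.most_common(1)[0]
    yield (user, (word, count))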
def wc_mapper(document):
    """for each word in the document, emit (word, 1)"""
    for word in tokenize(document):
        yield (word, 1)
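# A minimal sketch of the rest of the word-count pipeline (wc_reducer and
# word_count are assumptions, not shown in the original): group the mapper's
# (word, 1) pairs by word, then let the reducer sum each group.
from collections import defaultdict

def wc_reducer(word, counts):
    """sum up the counts for a word"""
    yield (word, sum(counts))

def word_count(documents):
    """count the words in the input documents using wc_mapper and wc_reducer"""
    collector = defaultdict(list)  # word -> list of 1s emitted by the mapper
    for document in documents:
        for word, count in wc_mapper(document):
            collector[word].append(count)
    return [output
            for word, counts in collector.items()
            for output in wc_reducer(word, counts)]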
from collections import Counter


def word_count_old(documents):
    """word count not using MapReduce"""
    return Counter(word
                   for document in documents
                   for word in tokenize(document))
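# The mapper and word-count functions above all assume a tokenize() helper.
# A minimal sketch (an assumption; the original implementation is not shown):
# lower-case the text and keep runs of letters, digits, and apostrophes.
import re

def tokenize(text):
    return re.findall(r"[a-z0-9']+", text.lower())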
# tail of the option parsing (the getopt call and USAGE are defined above this excerpt)
sys.exit(USAGE)
for option in options:
    if option[0] == "-c":
        keepUpperCase = True

csvreader = csv.reader(sys.stdin, delimiter=',', quotechar='"')
lineNbr = 0
for row in csvreader:
    lineNbr += 1
    # first line is a heading: locate the tweet and class columns
    if lineNbr == 1:
        for i in range(0, len(row)):
            if row[i] == "tweet":
                tweetColumn = i
            elif row[i] == "class" or row[i] == naiveBayes.CLASSCOLUMNNAME:
                classColumn = i
    else:
        # sanity check
        if tweetColumn == NONE:
            sys.exit(COMMAND + ": tweet column definition missing in heading: " +
                     str(row))
        # tokenize tweet text (tokenize returns a one-element list here)
        tokenized, = naiveBayes.tokenize([row[tweetColumn]], keepUpperCase)
        # print tokenized text, prefixed with the class label when present
        if classColumn != NONE:
            sys.stdout.write(LABELPREFIX + row[classColumn] + " ")
        for i in range(0, len(tokenized)):
            if i > 0:
                sys.stdout.write(" ")
            sys.stdout.write(tokenized[i])
        print()
sys.exit()
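# Invocation sketch (script and file names hypothetical): the CSV heading must
# contain a "tweet" column; -c keeps upper case in the tokens.
#
#   python tokenizeCsv.py -c < labelled-tweets.csv > tokenized.txt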
# main function starts here
checkOptions()
# get target classes from training data file
targetClasses = naiveBayes.getTargetClasses(trainFile)
if len(targetClasses) == 0:
    sys.exit(COMMAND + ": cannot find target classes\n")
# if required: train the word vector model and save it to file
if modelFile != "":
    # read the model data
    readDataResults = naiveBayes.readData(modelFile, targetClasses[0])
    # tokenize the model data
    tokenizeResults = naiveBayes.tokenize(readDataResults["text"])
    # build the word vectors (to test: sg=1, window=10)
    wordvecModel = gensim.models.Word2Vec(tokenizeResults,
                                          min_count=MINCOUNT, size=maxVector)
    # save the word vectors
    wordvecModel.save(wordvectorFile)
# load the word vector model from file
patternNameVec = re.compile(r"\.vec$")
if not patternNameVec.search(wordvectorFile):
    print("loading gensim vector model from file: %s" % wordvectorFile,
          file=sys.stderr)
    # read standard file format from gensim
    wordvecModel = gensim.models.Word2Vec.load(wordvectorFile)
else:
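# Usage sketch once a model is loaded (an assumption, not from the original;
# gensim 4.x renamed size= to vector_size= and moved these queries onto the
# model's .wv attribute):
#
#   vector = wordvecModel["good"]                        # embedding for one token
#   similar = wordvecModel.most_similar("good", topn=5)  # nearest neighbours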