# Relies on module-level imports (numpy as np) and the project helpers
# buildWords, buildTweet, writeToCSV and the tweetprocess module.
def buildWD(file_name, writeCSV=False):
    print("Building word dictionary")
    wordRowDict, numTweets, rownames = buildWords(file_name)
    print("Word dictionary finished")
    # One row per vocabulary word, one column per tweet; trimmed below.
    mat = np.zeros((len(wordRowDict), numTweets))
    colnames = []
    subjects = []
    print("Building word document matrix")
    f = open(file_name)
    matCol = 0
    for line in f:
        words = line.strip('"').split(',')
        tweet = buildTweet(words[5:])
        tweetColumn = np.zeros(len(wordRowDict))
        for word in tweetprocess.tokenize(tweet):
            if word in wordRowDict:
                tweetColumn[wordRowDict[word]] += 1
        # Keep the tweet only if enough of its words made it into the vocabulary.
        if np.sum(tweetColumn) > 0.5 * (len(words) - 2):
            colnames.append(words[1])
            subjects.append(words[0])
            mat[:, matCol] = tweetColumn
            matCol += 1
    f.close()
    mat = mat[:, 0:matCol]  # drop columns reserved for filtered-out tweets
    print("Word document matrix finished")
    if writeCSV:
        print("Writing to CSV")
        writeToCSV(mat, colnames, wordRowDict, "trainWords.csv")
        print("Finished writing to CSV")

    return (mat, colnames, rownames, subjects)
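For orientation, a minimal hypothetical driver for this function; the module name sentiment_buildwd is taken from the later tfidf_logreg examples, and "tweets.csv" is a stand-in input path:

import sentiment_buildwd

# Hypothetical usage; "tweets.csv" is a stand-in input path.
mat, colnames, rownames, subjects = sentiment_buildwd.buildWD("tweets.csv")
print(mat.shape)  # (vocabulary size, number of tweets that passed the filter)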
Example #2
def buildWords(file_name):
    wordCountDict = defaultdict(int)
    wordRowDict = {}  # word -> row index in the word-document matrix
    rownames = []
    numTweets = 0
    f = open(file_name)
    # Lower the count threshold for the small sample files.
    if "_tiny" in file_name or "_micro" in file_name:
        thresh = 10
    else:
        thresh = 50
    row = 0
    for line in f:
        numTweets += 1
        words = line.split()
        tweet = buildTweet(words[2:])
        for word in tweetprocess.tokenize(tweet):
            wordCountDict[word] += 1
            # A word earns a row once it has been seen more than thresh times.
            if wordCountDict[word] > thresh and word not in wordRowDict:
                rownames.append(word)
                wordRowDict[word] = row
                row += 1
    f.close()
    print("Word dict length:", len(wordRowDict))
    return wordRowDict, numTweets, rownames
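Stripped of the project helpers, the promotion rule above is: keep a running count per token, and assign the token a row index the first time its count exceeds the threshold. A self-contained sketch of just that rule (names here are illustrative, not from the project):

from collections import defaultdict

def build_vocab(tokens, thresh=2):
    counts = defaultdict(int)
    vocab = {}  # token -> row index, in order of promotion
    for token in tokens:
        counts[token] += 1
        if counts[token] > thresh and token not in vocab:
            vocab[token] = len(vocab)
    return vocab

print(build_vocab("a b a c a b b a".split()))  # {'a': 0, 'b': 1}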
Example #4
def buildWords(file_name):
    wordCountDict = defaultdict(int)
    wordRowDict = {}  # word -> row index in the word-document matrix
    rownames = []
    numTweets = 0
    f = open(file_name)
    thresh = 5
    row = 0
    for line in f:
        numTweets += 1
        # CSV-style input: columns 0-4 are metadata, the tweet text starts at 5.
        words = line.strip('"').split(',')
        tweet = buildTweet(words[5:])
        for word in tweetprocess.tokenize(tweet):
            wordCountDict[word] += 1
            if wordCountDict[word] > thresh and word not in wordRowDict:
                rownames.append(word)
                wordRowDict[word] = row
                row += 1
    f.close()
    print("Word dict length:", len(wordRowDict))
    return wordRowDict, numTweets, rownames
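Note that splitting on ',' (and stripping only the outermost quotes) breaks on quoted fields that themselves contain commas, which tweet text often does. A more robust alternative, if the input is real CSV, is the standard csv module (a sketch, not the original code; "tweets.csv" is a stand-in path):

import csv

with open("tweets.csv") as f:
    for fields in csv.reader(f):
        text_fields = fields[5:]  # columns 0-4 are metadata in this layout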
Example #7
# Relies on numpy as np, random, sklearn's linear_model, and the project
# modules sentiment_buildwd and tweetprocess plus the tfidf helper.
def tfidf_logreg(train_file):
    wd = sentiment_buildwd.buildWD(train_file)
    colnames = wd[1]
    rownames = wd[2]
    subjects = wd[3]
    idf = tfidf(wd[0], rownames)

    trainMat = np.zeros((len(colnames), wd[0].shape[1]))
    f = open(train_file)
    matCol = 0
    for line in f:
        words = line.strip('"').split(',')
        if words[1] in colnames:
            trainRow = np.zeros(wd[0].shape[1])
            numWords = 0
            tweet = sentiment_buildwd.buildTweet(words[5:])
            for word in tweetprocess.tokenize(tweet):
                pword = sentiment_buildwd.processWord(word)
                if pword in rownames:
                    numWords += 1
                    trainRow = trainRow + idf[0][rownames.index(pword)]
            if numWords > 0:  # guard against dividing by zero for empty rows
                trainRow = trainRow / numWords
            trainMat[matCol, :] = trainRow
            matCol += 1
    f.close()

    trainVals = sentiment_buildwd.trainValsFromSubjects(subjects)

    # Shuffle with a fixed seed, then take a 70/30 train/test split.
    random.seed(17)
    shuffle = list(range(len(subjects)))
    random.shuffle(shuffle)
    train = []
    labels = []
    for i in shuffle:
        train.append(trainMat[i])
        labels.append(trainVals[i])
    cutoff = int(len(shuffle) * 0.7)

    logreg = linear_model.LogisticRegression()
    logreg.fit(train[0:cutoff], labels[0:cutoff])
    return logreg.score(train[cutoff:], labels[cutoff:])
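The seeded shuffle plus 70/30 cut above can also be done with scikit-learn's own utility; this is an equivalent alternative, not the project's code, shown on stand-in data:

import numpy as np
from sklearn import linear_model
from sklearn.model_selection import train_test_split

X = np.random.rand(100, 20)        # stand-in feature matrix
y = np.random.randint(0, 2, 100)   # stand-in labels
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y, train_size=0.7, random_state=17)
clf = linear_model.LogisticRegression().fit(X_tr, y_tr)
print(clf.score(X_te, y_te))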
Example #9
def buildWD(file_name, writeCSV=False, randomize=False, sentiment=False):
    print("Building word dictionary")
    wordRowDict, numTweets, rownames = buildWords(file_name)
    print("Word dictionary finished")
    # Reserve one extra feature row for the predicted sentiment label.
    extra_feats = 1 if sentiment else 0
    mat = np.zeros((len(wordRowDict) + extra_feats, numTweets))
    colnames = []
    subjects = []
    print("Building word document matrix")
    f = open(file_name)
    matCol = 0
    if sentiment:
        sentiment_model, sentiment_words = sent.tfidf_logreg(SENTIMENT_FILENAME)
    for line in f:
        words = line.split()
        tweet = buildTweet(words[2:])
        tweetColumn = np.zeros(len(wordRowDict) + extra_feats)
        if sentiment:
            # Reset the sentiment feature vector for each tweet.
            sentimentTweetColumn = np.zeros(len(sentiment_words))
        num_words = 0
        for word in tweetprocess.tokenize(tweet):
            if word in wordRowDict:
                tweetColumn[wordRowDict[word]] += 1
            if sentiment and word in sentiment_words:
                sentimentTweetColumn[sentiment_words.index(word)] += 1
                num_words += 1
        if sentiment and num_words > 0:
            sentimentTweetColumn *= 1.0 / num_words
            # scikit-learn expects a 2D array; predict one sample at a time.
            tweetColumn[-1] = sentiment_model.predict(
                sentimentTweetColumn.reshape(1, -1))[0]
        elif sentiment:
            tweetColumn[-1] = 2  # no known sentiment words: fall back to label 2
        # Keep the tweet only if at least half of its words are in the vocabulary.
        if np.sum(tweetColumn) > 0.5 * (len(words) - 2):
            colnames.append(words[0])
            subjects.append(words[1])
            mat[:, matCol] = tweetColumn
            matCol += 1
    f.close()
    mat = mat[:, 0:matCol]
    print("Word document matrix finished")
    if writeCSV:
        print("Writing to CSV")
        writeToCSV(mat, colnames, wordRowDict, "trainWords.csv")
        print("Finished writing to CSV")

    if randomize:
        # Shuffle the columns (tweets) with a fixed seed for reproducibility.
        random.seed(17)
        shuffle = list(range(len(subjects)))
        random.shuffle(shuffle)
        m = np.zeros(mat.shape)
        c = []
        s = []
        for index, i in enumerate(shuffle):
            m[:, index] = mat[:, i]
            c.append(colnames[i])
            s.append(subjects[i])
        return (m, c, rownames, s)

    return (mat, colnames, rownames, subjects)
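The column-permutation loop in the randomize branch is equivalent to NumPy fancy indexing, which does the same reordering in one step (a sketch on a stand-in matrix, not the original code):

import random
import numpy as np

random.seed(17)
mat = np.arange(12).reshape(3, 4)   # stand-in word-document matrix
order = list(range(mat.shape[1]))
random.shuffle(order)
m = mat[:, order]                   # columns permuted in one step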