Example #1
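The snippets below assume shared context that isn't shown. A minimal sketch of the imports they appear to rely on (buildwd and shallownn are project-local modules; tfidf, glove, glove_features, buildGloveCache, TRAIN_FILE, and WRITE_FILE are presumably defined elsewhere in the same file):

import random

import numpy as np
from sklearn import linear_model, neighbors

import buildwd    # buildWD() -> (matrix, colnames, rownames, subjects)
import shallownn  # provides ShallowNeuralNetwork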
def tfidf_knn(train_file):
    wd = buildwd.buildWD(train_file)
    colnames = wd[1]
    rownames = wd[2]
    subjects = wd[3]
    idf = tfidf(wd[0], rownames)

    # One feature row per tweet: the average of the tf-idf vectors of its
    # recognized words (words[0] looks like the tweet ID; words[1],
    # presumably the label, is skipped).
    trainMat = np.zeros((len(colnames), wd[0].shape[1]))
    f = open(train_file)
    matCol = 0
    for line in f:
        words = line.split()
        if words[0] in colnames:
            trainRow = np.zeros(wd[0].shape[1])
            numWords = 0
            for word in words[2:]:
                pword = buildwd.processWord(word)
                if pword in rownames:
                    numWords += 1
                    trainRow = trainRow + idf[0][rownames.index(pword)]
            if numWords > 0:
                trainRow = (trainRow * 1.0) / numWords
            trainMat[matCol, :] = trainRow
            matCol += 1
    f.close()

    trainVals = buildwd.trainValsFromSubjects(subjects)

    # numpy slice indices must be integers, so compute the 70% train/test
    # cutoff explicitly instead of slicing with a float.
    cutoff = int(trainMat.shape[0] * 0.7)
    knn = neighbors.KNeighborsClassifier(n_neighbors=10)
    knn.fit(trainMat[:cutoff, :], trainVals[:cutoff])
    return knn.score(trainMat[cutoff:, :], trainVals[cutoff:])
Example #2
def tfidf_knn(train_file):
    wd = buildwd.buildWD(train_file)
    colnames = wd[1]
    rownames = wd[2]
    subjects = wd[3]
    idf = tfidf(wd[0], rownames)

    trainMat = np.zeros((len(colnames), wd[0].shape[1]))
    f = open(train_file)
    matCol = 0
    for line in f:
        words = line.split()
        if words[0] in colnames:
            trainRow = np.zeros(wd[0].shape[1])
            numWords = 0
            for word in words[2:]:
                pword = buildwd.processWord(word)
                if pword in rownames:
                    numWords += 1
                    trainRow = trainRow + idf[0][rownames.index(pword)]
            if numWords > 0:
                trainRow = (trainRow * 1.0) / numWords
            trainMat[matCol, :] = trainRow
            matCol += 1
    f.close()

    trainVals = buildwd.trainValsFromSubjects(subjects)

    cutoff = int(trainMat.shape[0] * 0.7)  # slice indices must be integers
    knn = neighbors.KNeighborsClassifier(n_neighbors=10)
    knn.fit(trainMat[:cutoff, :], trainVals[:cutoff])
    return knn.score(trainMat[cutoff:, :], trainVals[cutoff:])
Example #3
def get_glove_logreg(train_file, trainMat=None):
    if trainMat is None:  # "== None" compares elementwise on numpy arrays
        trainMat = buildGloveTrainMat(train_file)

    wd = buildwd.buildWD(train_file, randomize=True)
    labels = wd[3]
    trainVals = buildwd.trainValsFromSubjects(labels)

    cutoff = int(trainMat.shape[0] * 0.7)
    logreg = linear_model.LogisticRegression()
    logreg.fit(trainMat[:cutoff, :], trainVals[:cutoff])
    return logreg, trainMat, trainVals
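
A minimal held-out evaluation sketch for the getters that return a fitted model plus data without a score (get_glove_logreg here; get_bag_logreg and get_bag_knn below follow the same pattern). The file name is hypothetical, and this assumes buildWD's randomize=True shuffle is deterministic, since the features and labels come from separate calls:

logreg, trainMat, trainVals = get_glove_logreg('tweets_train.txt')
cutoff = int(trainMat.shape[0] * 0.7)  # same 70% cutoff used in fit
print(logreg.score(trainMat[cutoff:, :], trainVals[cutoff:]))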
Example #4
def glove_knn(train_file, trainMat=None):
    if trainMat is None:
        trainMat = buildGloveTrainMat(train_file)

    wd = buildwd.buildWD(train_file, randomize=True)
    labels = wd[3]
    trainVals = buildwd.trainValsFromSubjects(labels)

    cutoff = int(trainMat.shape[0] * 0.7)
    knn = neighbors.KNeighborsClassifier(n_neighbors=5)
    knn.fit(trainMat[:cutoff, :], trainVals[:cutoff])
    return knn.score(trainMat[cutoff:, :], trainVals[cutoff:])
Example #5
def buildGloveTrainMat(train_file):
    wd = buildwd.buildWD(train_file, randomize=True)
    mat = wd[0]
    tweetIDs = wd[1]
    words = wd[2]
    labels = wd[3]
    buildGloveCache(words)
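    # Transpose so rows index tweets; glove_features presumably collapses
    # each tweet's word counts into a single GloVe-based feature vector.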
    mat = np.transpose(mat)
    print('Building GLOVE train matrix...')
    trainMat = np.array([glove_features(mat[i,:], words) for i in range(len(tweetIDs))])
    return trainMat
Example #6
def get_bag_logreg(train_file):
    wd = buildwd.buildWD(train_file, randomize=True, sentiment=True)
    colnames = wd[1]
    subjects = wd[3]

    trainMat = np.transpose(wd[0])
    row_sums = trainMat.sum(axis=1)
    row_sums[row_sums == 0] = 1  # avoid dividing empty rows by zero
    trainMat = trainMat / row_sums[:, np.newaxis]

    trainVals = buildwd.trainValsFromSubjects(subjects)

    print('Training bag_logreg...')
    cutoff = int(trainMat.shape[0] * 0.7)
    logreg = linear_model.LogisticRegression()
    logreg.fit(trainMat[:cutoff, :], trainVals[:cutoff])
    return logreg, trainMat, trainVals
Example #7
def get_bag_knn(train_file):
    wd = buildwd.buildWD(train_file)
    colnames = wd[1]
    rownames = wd[2]
    subjects = wd[3]

    trainMat = np.transpose(wd[0])
    row_sums = trainMat.sum(axis=1)
    row_sums[row_sums == 0] = 1  # avoid dividing empty rows by zero
    trainMat = trainMat / row_sums[:, np.newaxis]

    trainVals = buildwd.trainValsFromSubjects(subjects)

    print('Training bag_knn...')
    cutoff = int(trainMat.shape[0] * 0.7)
    knn = neighbors.KNeighborsClassifier(n_neighbors=5)
    knn.fit(trainMat[:cutoff, :], trainVals[:cutoff])
    return knn, trainMat, trainVals
Example #8
def get_tfidf_logreg(train_file):
    wd = buildwd.buildWD(train_file)
    colnames = wd[1]
    rownames = wd[2]
    subjects = wd[3]
    idf = tfidf(wd[0], rownames)

    trainMat = np.zeros((len(colnames), wd[0].shape[1]))
    f = open(train_file)
    matCol = 0
    for line in f:
        words = line.split()
        if words[0] in colnames:
            trainRow = np.zeros(wd[0].shape[1])
            numWords = 0
            for word in words[2:]:
                pword = buildwd.processWord(word)
                if pword in rownames:
                    numWords += 1
                    trainRow = trainRow + idf[0][rownames.index(pword)]
            if numWords > 0:
                trainRow = (trainRow * 1.0) / numWords
            trainMat[matCol, :] = trainRow
            matCol += 1
    f.close()

    trainVals = buildwd.trainValsFromSubjects(subjects)

    # RANDOMIZE
    random.seed(17)
    shuffle = list(range(len(subjects)))  # list() so random.shuffle can mutate it
    random.shuffle(shuffle)
    train = []
    labels = []
    for i in shuffle:
        train.append(trainMat[i])
        labels.append(trainVals[i])
    cutoff = int(len(train) * 0.7)

    logreg = linear_model.LogisticRegression()
    logreg.fit(train[0:cutoff], labels[0:cutoff])
    return logreg, train, labels, cutoff
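
Usage sketch: get_tfidf_logreg already returns the shuffled data and the 70% cutoff, so scoring the held-out 30% is direct (the file name is hypothetical):

logreg, train, labels, cutoff = get_tfidf_logreg('tweets_train.txt')
print(logreg.score(train[cutoff:], labels[cutoff:]))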
Example #9
def get_tfidf_logreg(train_file):
    wd = buildwd.buildWD(train_file)
    colnames = wd[1]
    rownames = wd[2]
    subjects = wd[3]
    idf = tfidf(wd[0], rownames)

    trainMat = np.zeros((len(colnames), wd[0].shape[1]))
    f = open(train_file)
    matCol = 0
    for line in f:
        words = line.split()
        if words[0] in colnames:
            trainRow = np.zeros(wd[0].shape[1])
            numWords = 0
            for word in words[2:]:
                pword = buildwd.processWord(word)
                if pword in rownames:
                    numWords += 1
                    trainRow = trainRow + idf[0][rownames.index(pword)]
            if numWords > 0:
                trainRow = (trainRow * 1.0) / numWords
            trainMat[matCol, :] = trainRow
            matCol += 1
    f.close()

    trainVals = buildwd.trainValsFromSubjects(subjects)

    # RANDOMIZE
    random.seed(17)
    shuffle = list(range(len(subjects)))
    random.shuffle(shuffle)
    train = []
    labels = []
    for i in shuffle:
        train.append(trainMat[i])
        labels.append(trainVals[i])
    cutoff = int(len(train) * 0.7)

    logreg = linear_model.LogisticRegression()
    logreg.fit(train[0:cutoff], labels[0:cutoff])
    return logreg, train, labels, cutoff
Example #10
def tfidf_shallownn(train_file):
    wd = buildwd.buildWD(train_file)
    colnames = wd[1]
    rownames = wd[2]
    subjects = wd[3]
    idf = tfidf(wd[0], rownames)

    trainMat = np.zeros((len(colnames), wd[0].shape[1]))
    f = open(train_file)
    matCol = 0
    for line in f:
        words = line.split()
        if words[0] in colnames:
            trainRow = np.zeros(wd[0].shape[1])
            numWords = 0
            for word in words[2:]:
                pword = buildwd.processWord(word)
                if pword in rownames:
                    numWords += 1
                    trainRow = trainRow + idf[0][rownames.index(pword)]
            if numWords > 0:
                trainRow = (trainRow * 1.0) / numWords
            trainMat[matCol, :] = trainRow
            matCol += 1
    f.close()

    # One-hot labels: column 0 = Sports, column 1 = Politics.
    trainVals = np.zeros((len(subjects), 2))
    for i, subject in enumerate(subjects):
        if subject == 'Sports':
            trainVals[i, 0] = 1
        elif subject == 'Politics':
            trainVals[i, 1] = 1

    snn = shallownn.ShallowNeuralNetwork(input_dim=trainMat.shape[1],
                                         hidden_dim=5,
                                         output_dim=2)
    cutoff = int(trainMat.shape[0] * 0.7)  # slice indices must be integers
    snn.train(trainMat[:cutoff, :],
              trainVals[:cutoff, :],
              display_progress=True,
              maxiter=10)
    return snn.score(trainMat[cutoff:, :],
                     trainVals[cutoff:, :])
Example #11
def tfidf_shallownn(train_file):
    wd = buildwd.buildWD(train_file)
    colnames = wd[1]
    rownames = wd[2]
    subjects = wd[3]
    idf = tfidf(wd[0], rownames)

    trainMat = np.zeros((len(colnames), wd[0].shape[1]))
    f = open(train_file)
    matCol = 0
    for line in f:
        words = line.split()
        if words[0] in colnames:
            trainRow = np.zeros(wd[0].shape[1])
            numWords = 0
            for word in words[2:]:
                pword = buildwd.processWord(word)
                if pword in rownames:
                    numWords += 1
                    trainRow = trainRow + idf[0][rownames.index(pword)]
            if numWords > 0:
                trainRow = (trainRow * 1.0) / numWords
            trainMat[matCol, :] = trainRow
            matCol += 1
    f.close()

    trainVals = np.zeros((len(subjects), 2))
    for i, subject in enumerate(subjects):
        if subject == "Sports":
            trainVals[i, 0] = 1
        elif subject == "Politics":
            trainVals[i, 1] = 1

    snn = shallownn.ShallowNeuralNetwork(input_dim=trainMat.shape[1], hidden_dim=5, output_dim=2)
    cutoff = int(trainMat.shape[0] * 0.7)  # slice indices must be integers
    snn.train(
        trainMat[:cutoff, :],
        trainVals[:cutoff, :],
        display_progress=True,
        maxiter=10,
    )
    return snn.score(trainMat[cutoff:, :], trainVals[cutoff:, :])
Example #12

"""
Format is word then glove vector values with spaces used as delimiters
"""


def writeToFile(mat, rownames):
    f = open(WRITE_FILE, "w")
    for i in range(len(rownames)):
        toWrite = rownames[i] + " "
        for j in range(mat.shape[1]):
            toWrite += str(mat[i, j])
            if j != mat.shape[1] - 1:
                toWrite += " "
        toWrite += "\n"
        f.write(toWrite)
    f.close()


def buildGloveFile(mat, rownames):
    glv = glove(mat=mat, rownames=rownames)
    writeToFile(glv[0], glv[1])


if __name__ == "__main__":
    wd = buildwd.buildWD(TRAIN_FILE)
    mat = wd[0]
    rownames = wd[2]
    buildGloveFile(mat, rownames)
Example #13
    # Return the sum of the word and context matrices W and C, per the
    # advice in section 4.2 of the GloVe paper:
    return (W + C, rownames)

"""
Format is word then glove vector values with spaces used as delimiters
"""
def writeToFile(mat, rownames):
    f = open(WRITE_FILE, 'w')
    for i in range(len(rownames)):
        toWrite = rownames[i] + " "
        for j in range(mat.shape[1]):
            toWrite += str(mat[i,j])
            if j != mat.shape[1]-1:
                toWrite += " "
        toWrite += "\n"
        f.write(toWrite)
    f.close()

def buildGloveFile(mat, rownames):
    glv = glove(mat=mat, rownames=rownames)
    writeToFile(glv[0], glv[1])

if __name__ == '__main__':
    wd = buildwd.buildWD(TRAIN_FILE)
    mat = wd[0]
    rownames = wd[2]
    buildGloveFile(mat, rownames)