Example #1
# Assumes: import numpy as np; from sklearn import neighbors; plus the
# project-local buildwd module and tfidf() helper used throughout these examples.
def tfidf_knn(train_file):
    wd = buildwd.buildWD(train_file)
    colnames = wd[1]   # document identifiers
    rownames = wd[2]   # vocabulary
    subjects = wd[3]   # subject label per document
    idf = tfidf(wd[0], rownames)

    # Build one feature vector per document: the mean of the tf-idf rows
    # of the words that appear in it.
    trainMat = np.zeros((len(colnames), wd[0].shape[1]))
    matCol = 0  # row index into trainMat (one row per document)
    with open(train_file) as f:
        for line in f:
            words = line.split()
            if words[0] in colnames:
                trainRow = np.zeros(wd[0].shape[1])
                numWords = 0
                for word in words[2:]:
                    pword = buildwd.processWord(word)
                    if pword in rownames:
                        numWords += 1
                        trainRow = trainRow + idf[0][rownames.index(pword)]
                if numWords > 0:  # guard against division by zero
                    trainRow = trainRow / numWords
                trainMat[matCol, :] = trainRow
                matCol += 1

    trainVals = buildwd.trainValsFromSubjects(subjects)

    # Train on the first 70% of the documents, score on the held-out 30%.
    # The split index must be an int: float slice bounds raise a TypeError.
    cutoff = int(trainMat.shape[0] * 0.7)
    knn = neighbors.KNeighborsClassifier(n_neighbors=10)
    knn.fit(trainMat[:cutoff, :], trainVals[:cutoff])
    return knn.score(trainMat[cutoff:, :], trainVals[cutoff:])
Example #2
def tfidf_knn(train_file):
    wd = buildwd.buildWD(train_file)
    colnames = wd[1]
    rownames = wd[2]
    subjects = wd[3]
    idf = tfidf(wd[0], rownames)

    # One averaged tf-idf feature vector per document.
    trainMat = np.zeros((len(colnames), wd[0].shape[1]))
    matCol = 0  # row index into trainMat
    with open(train_file) as f:
        for line in f:
            words = line.split()
            if words[0] in colnames:
                trainRow = np.zeros(wd[0].shape[1])
                numWords = 0
                for word in words[2:]:
                    pword = buildwd.processWord(word)
                    if pword in rownames:
                        numWords += 1
                        trainRow = trainRow + idf[0][rownames.index(pword)]
                if numWords > 0:  # guard against division by zero
                    trainRow = trainRow / numWords
                trainMat[matCol, :] = trainRow
                matCol += 1

    trainVals = buildwd.trainValsFromSubjects(subjects)

    # Slice bounds must be ints, so compute the 70/30 split index explicitly.
    cutoff = int(trainMat.shape[0] * 0.7)
    knn = neighbors.KNeighborsClassifier(n_neighbors=10)
    knn.fit(trainMat[:cutoff, :], trainVals[:cutoff])
    return knn.score(trainMat[cutoff:, :], trainVals[cutoff:])
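
All of these examples lean on helpers that are not shown: numpy as np, scikit-learn's neighbors and linear_model, the standard-library random module, a project-local buildwd module, a tfidf() function, and (for the last pair) a shallownn module. The stubs below are not the project's real code; they only record the interfaces the snippets appear to assume, inferred from how each helper is called above.

# Hypothetical stubs for buildwd.py and the tfidf() helper, reconstructed
# from usage in these examples; bodies are intentionally omitted.

def buildWD(train_file):
    """Assumed to return (matrix, colnames, rownames, subjects): a NumPy array whose
    rows line up with rownames and whose column count sets the feature width used
    above; the document identifiers that start each line of train_file; the processed
    vocabulary; and one subject label per document (e.g. 'Sports' or 'Politics')."""

def processWord(word):
    """Assumed to normalize a raw token into the form stored in rownames."""

def trainValsFromSubjects(subjects):
    """Assumed to map each subject label to a numeric class label, one per document."""

def tfidf(matrix, rownames):
    """Assumed to return a tuple whose first element is the tf-idf reweighted matrix,
    with rows in the same order as rownames."""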
Example #3
# Assumes: import numpy as np; import random; from sklearn import linear_model;
# plus the project-local buildwd module and tfidf() helper.
def get_tfidf_logreg(train_file):
    wd = buildwd.buildWD(train_file)
    colnames = wd[1]   # document identifiers
    rownames = wd[2]   # vocabulary
    subjects = wd[3]   # subject label per document
    idf = tfidf(wd[0], rownames)

    # Build one feature vector per document: the mean of the tf-idf rows
    # of the words that appear in it.
    trainMat = np.zeros((len(colnames), wd[0].shape[1]))
    matCol = 0  # row index into trainMat (one row per document)
    with open(train_file) as f:
        for line in f:
            words = line.split()
            if words[0] in colnames:
                trainRow = np.zeros(wd[0].shape[1])
                numWords = 0
                for word in words[2:]:
                    pword = buildwd.processWord(word)
                    if pword in rownames:
                        numWords += 1
                        trainRow = trainRow + idf[0][rownames.index(pword)]
                if numWords > 0:  # guard against division by zero
                    trainRow = trainRow / numWords
                trainMat[matCol, :] = trainRow
                matCol += 1

    trainVals = buildwd.trainValsFromSubjects(subjects)

    # RANDOMIZE: shuffle with a fixed seed so the 70/30 split is reproducible.
    random.seed(17)
    shuffle = list(range(len(subjects)))  # list() needed for random.shuffle in Python 3
    random.shuffle(shuffle)
    train = []
    labels = []
    for i in shuffle:
        train.append(trainMat[i])
        labels.append(trainVals[i])
    cutoff = int(len(train) * 0.7)

    logreg = linear_model.LogisticRegression()
    logreg.fit(train[0:cutoff], labels[0:cutoff])
    return logreg, train, labels, cutoff
Example #4
def get_tfidf_logreg(train_file):
    wd = buildwd.buildWD(train_file)
    colnames = wd[1]
    rownames = wd[2]
    subjects = wd[3]
    idf = tfidf(wd[0], rownames)

    # One averaged tf-idf feature vector per document.
    trainMat = np.zeros((len(colnames), wd[0].shape[1]))
    matCol = 0  # row index into trainMat
    with open(train_file) as f:
        for line in f:
            words = line.split()
            if words[0] in colnames:
                trainRow = np.zeros(wd[0].shape[1])
                numWords = 0
                for word in words[2:]:
                    pword = buildwd.processWord(word)
                    if pword in rownames:
                        numWords += 1
                        trainRow = trainRow + idf[0][rownames.index(pword)]
                if numWords > 0:
                    trainRow = trainRow / numWords
                trainMat[matCol, :] = trainRow
                matCol += 1

    trainVals = buildwd.trainValsFromSubjects(subjects)

    # RANDOMIZE: shuffle with a fixed seed for a reproducible 70/30 split.
    random.seed(17)
    shuffle = list(range(len(subjects)))  # list() needed for random.shuffle in Python 3
    random.shuffle(shuffle)
    train = []
    labels = []
    for i in shuffle:
        train.append(trainMat[i])
        labels.append(trainVals[i])
    cutoff = int(len(train) * 0.7)

    logreg = linear_model.LogisticRegression()
    logreg.fit(train[0:cutoff], labels[0:cutoff])
    return logreg, train, labels, cutoff
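
Unlike tfidf_knn, get_tfidf_logreg returns the fitted model together with the shuffled data and the split index instead of a score. A minimal way to evaluate it on the held-out 30%, assuming the same convention as the other examples (the file name is a placeholder):

logreg, train, labels, cutoff = get_tfidf_logreg('train.txt')  # placeholder path
test_acc = logreg.score(train[cutoff:], labels[cutoff:])       # accuracy on the held-out 30%
print(test_acc)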
Example #5
# Assumes: import numpy as np; plus the project-local buildwd and shallownn
# modules and the tfidf() helper.
def tfidf_shallownn(train_file):
    wd = buildwd.buildWD(train_file)
    colnames = wd[1]   # document identifiers
    rownames = wd[2]   # vocabulary
    subjects = wd[3]   # subject label per document
    idf = tfidf(wd[0], rownames)

    # Build one feature vector per document: the mean of the tf-idf rows
    # of the words that appear in it.
    trainMat = np.zeros((len(colnames), wd[0].shape[1]))
    matCol = 0  # row index into trainMat (one row per document)
    with open(train_file) as f:
        for line in f:
            words = line.split()
            if words[0] in colnames:
                trainRow = np.zeros(wd[0].shape[1])
                numWords = 0
                for word in words[2:]:
                    pword = buildwd.processWord(word)
                    if pword in rownames:
                        numWords += 1
                        trainRow = trainRow + idf[0][rownames.index(pword)]
                if numWords > 0:  # guard against division by zero
                    trainRow = trainRow / numWords
                trainMat[matCol, :] = trainRow
                matCol += 1

    # One-hot targets: column 0 = 'Sports', column 1 = 'Politics'.
    trainVals = np.zeros((len(subjects), 2))
    for i, subject in enumerate(subjects):
        if subject == 'Sports':
            trainVals[i, 0] = 1
        elif subject == 'Politics':
            trainVals[i, 1] = 1

    # Train on the first 70% of the documents, score on the held-out 30%.
    # Slice bounds must be ints: float bounds raise a TypeError.
    cutoff = int(trainMat.shape[0] * 0.7)
    snn = shallownn.ShallowNeuralNetwork(input_dim=trainMat.shape[1],
                                         hidden_dim=5,
                                         output_dim=2)
    snn.train(trainMat[:cutoff, :],
              trainVals[:cutoff, :],
              display_progress=True,
              maxiter=10)
    return snn.score(trainMat[cutoff:, :],
                     trainVals[cutoff:, :])
Example #6
def tfidf_shallownn(train_file):
    wd = buildwd.buildWD(train_file)
    colnames = wd[1]
    rownames = wd[2]
    subjects = wd[3]
    idf = tfidf(wd[0], rownames)

    # One averaged tf-idf feature vector per document.
    trainMat = np.zeros((len(colnames), wd[0].shape[1]))
    matCol = 0  # row index into trainMat
    with open(train_file) as f:
        for line in f:
            words = line.split()
            if words[0] in colnames:
                trainRow = np.zeros(wd[0].shape[1])
                numWords = 0
                for word in words[2:]:
                    pword = buildwd.processWord(word)
                    if pword in rownames:
                        numWords += 1
                        trainRow = trainRow + idf[0][rownames.index(pword)]
                if numWords > 0:  # guard against division by zero
                    trainRow = trainRow / numWords
                trainMat[matCol, :] = trainRow
                matCol += 1

    # One-hot targets: column 0 = "Sports", column 1 = "Politics".
    trainVals = np.zeros((len(subjects), 2))
    for i, subject in enumerate(subjects):
        if subject == "Sports":
            trainVals[i, 0] = 1
        elif subject == "Politics":
            trainVals[i, 1] = 1

    # Slice bounds must be ints, so compute the 70/30 split index explicitly.
    cutoff = int(trainMat.shape[0] * 0.7)
    snn = shallownn.ShallowNeuralNetwork(input_dim=trainMat.shape[1], hidden_dim=5, output_dim=2)
    snn.train(
        trainMat[:cutoff, :],
        trainVals[:cutoff, :],
        display_progress=True,
        maxiter=10,
    )
    return snn.score(trainMat[cutoff:, :], trainVals[cutoff:, :])
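
Unlike the earlier pairs, tfidf_shallownn builds its own one-hot targets (a two-column matrix, one column per subject) instead of calling buildwd.trainValsFromSubjects, because the shallow network expects vector-valued outputs. It performs the 70/30 split internally, so a call needs only the training-file path; the file name below is a placeholder, and the meaning of the returned value is whatever the project-local ShallowNeuralNetwork.score defines.

snn_result = tfidf_shallownn('train.txt')  # placeholder path; score on the held-out 30%
print(snn_result)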