Example #1
File: vec_cls.py Project: Jeky/thesis
def testClassifiers(dataset, out):
    names = ["Nearest Neighbors",
              "Decision Tree",
              "Random Forest", "AdaBoost", "Naive Bayes", "Linear Discriminant Analysis",
              "Quadratic Discriminant Analysis"]
    # names = ["Linear SVM", "RBF SVM"]
    classifiers = [
        KNeighborsClassifier(10),
        DecisionTreeClassifier(max_depth=5),
        RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
        AdaBoostClassifier(),
        GaussianNB(),
        LinearDiscriminantAnalysis(),
        QuadraticDiscriminantAnalysis(),
        #SVC(kernel = 'linear', cache_size = 1500),
        #SVC(kernel = 'rbf', cache_size = 1500)
    ]
    for clf, name in zip(classifiers, names):
        globe.getLogger().info('Testing Classifier: %s', name)
        out.write(name + '\n')
        scores = cross_validation.cross_val_predict(clf, dataset.instances, dataset.labels, cv = 10, verbose = 0)

        cm = confusion_matrix(dataset.labels, scores)
        out.write('%d\t%d\t%d\t%d\t%.10f\t%.10f\t%.10f\t%.10f\n' % (
                    cm[0][0], cm[0][1], cm[1][0], cm[1][1],
                    precision_score(dataset.labels, scores),
                    recall_score(dataset.labels, scores),
                    accuracy_score(dataset.labels, scores),
                    f1_score(dataset.labels, scores)
                    ))
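Note: this example and several below rely on scikit-learn's old cross_validation module, which was removed in release 0.20. A minimal sketch of the same 10-fold evaluation against the current sklearn.model_selection API (the evaluate helper below is illustrative, not part of the project):

from sklearn.model_selection import cross_val_predict
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import (confusion_matrix, precision_score,
                             recall_score, accuracy_score, f1_score)

def evaluate(clf, X, y):
    # cross-validated predictions for every instance, then binary metrics
    pred = cross_val_predict(clf, X, y, cv=10)
    return (confusion_matrix(y, pred),
            precision_score(y, pred), recall_score(y, pred),
            accuracy_score(y, pred), f1_score(y, pred))

# e.g. cm, p, r, acc, f1 = evaluate(GaussianNB(), dataset.instances, dataset.labels)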
Example #2
def testClassifiers(dataset, out):
    names = [
        "Nearest Neighbors", "Decision Tree", "Random Forest", "AdaBoost",
        "Naive Bayes", "Linear Discriminant Analysis",
        "Quadratic Discriminant Analysis"
    ]
    # names = ["Linear SVM", "RBF SVM"]
    classifiers = [
        KNeighborsClassifier(10),
        DecisionTreeClassifier(max_depth=5),
        RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
        AdaBoostClassifier(),
        GaussianNB(),
        LinearDiscriminantAnalysis(),
        QuadraticDiscriminantAnalysis(),
        #SVC(kernel = 'linear', cache_size = 1500),
        #SVC(kernel = 'rbf', cache_size = 1500)
    ]
    for clf, name in zip(classifiers, names):
        globe.getLogger().info('Testing Classifier: %s', name)
        out.write(name + '\n')
        scores = cross_validation.cross_val_predict(clf,
                                                    dataset.instances,
                                                    dataset.labels,
                                                    cv=10,
                                                    verbose=0)

        cm = confusion_matrix(dataset.labels, scores)
        out.write('%d\t%d\t%d\t%d\t%.10f\t%.10f\t%.10f\t%.10f\n' %
                  (cm[0][0], cm[0][1], cm[1][0], cm[1][1],
                   precision_score(dataset.labels, scores),
                   recall_score(dataset.labels, scores),
                   accuracy_score(dataset.labels,
                                  scores), f1_score(dataset.labels, scores)))
Example #3
def unigramDF():
    globe.getLogger().info('Counting Unigram')

    def splitter(instance):
        return set(instance.split())

    countDF(splitter, UNIGRAM_DF_OUTPUT)
Example #4
File: user2vec.py Project: lvjunmei/thesis
def user2vec(words, d, normalized=True, discardMissing=True):
    globe.getLogger().info(
        'Converting Users to Vectors (NORMALIZED = %r, DISCARD_MISSING_WORD = %r)'
        % (normalized, discardMissing))
    missingWords = {}
    dim = words[words.keys()[0]].shape[0]

    vectorInstances = []
    for i, instance in enumerate(d.instances):
        if i % 1000 == 0 and i != 0:
            globe.getLogger().info('processed %d instances' % i)

        vector = np.zeros(dim)
        wCount = 0
        for w in instance.split():
            if w in words:
                wCount += 1
                vector += words[w]
            else:
                if not discardMissing:
                    if w not in missingWords:
                        missingWords[w] = np.random.rand(dim)

                    vector += missingWords[w]

        if normalized:
            if wCount != 0:
                vector /= wCount

        vectorInstances.append(vector)

    # remove zero vectors
    vecDataset = Dataset()
    globe.getLogger().info('Removing zero vectors')
    zeroCount = 0
    for i, vector in enumerate(vectorInstances):
        if np.count_nonzero(vector) == 0:
            globe.getLogger().info('User %d is empty' % d.users[i])
            zeroCount += 1
        else:
            vecDataset.users.append(d.users[i])
            vecDataset.labels.append(d.labels[i])
            vecDataset.instances.append(vector)

    globe.getLogger().info('Total Found: %d' % zeroCount)
    if normalized:
        if discardMissing:
            fout = USER_VECTOR_NORM_DISMW_DATASET
        else:
            fout = USER_VECTOR_NORM_DATASET
    else:
        if discardMissing:
            fout = USER_VECTOR_DISMW_DATASET
        else:
            fout = USER_VECTOR_DATASET

    vecDataset.save(fout)
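The loop above sums the word2vec vectors of each user's tokens and optionally divides by the number of matched tokens, i.e. mean pooling. A compact sketch of the same per-user computation with numpy (user_vector is an illustrative helper; words is the token-to-vector dict used above):

import numpy as np

def user_vector(text, words, dim, normalized=True):
    # average (or sum) of the embeddings of the tokens that have one
    vecs = [words[w] for w in text.split() if w in words]
    if not vecs:
        return np.zeros(dim)
    v = np.sum(vecs, axis=0)
    return v / len(vecs) if normalized else v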
Example #5
def loadDFFeatures(featurePath):
    globe.getLogger().info('Loading Features from %s' % featurePath)
    with open(featurePath) as fin:
        features = []#{}
        for i, l in enumerate(fin.xreadlines()):
            if i % 100000 == 0 and i != 0:
                globe.getLogger().info('loaded %d features' % i)
            gram, df1, df2, score = l.strip().split()
            #features[gram] = [int(df1), int(df2), float(score)]
            features.append(gram)

    return features
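Note: fin.xreadlines(), used here and in several other examples, exists only in Python 2; under Python 3 the file object is iterated directly. A minimal equivalent of the loop above under that assumption:

features = []
with open(featurePath) as fin:
    for i, l in enumerate(fin):      # replaces fin.xreadlines()
        features.append(l.strip().split()[0])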
Example #6
def bigramDF():
    globe.getLogger().info('Counting Bigram')

    def splitter(instance):
        bigrams = set()
        for l in instance.split('\n'):
            words = l.split()
            for i in range(len(words) - 1):
                bigrams.add(words[i] + '_' + words[i + 1])

        return bigrams

    countDF(splitter, BIGRAM_DF_OUTPUT)
Example #7
File: text_cls.py Project: Jeky/thesis
def testClassifiers(dataset, out):
    names = ['MultinomialNB', 'BernoulliNB']

    classifiers = [
        MultinomialNB,
        BernoulliNB
    ]

    for clf, name in zip(classifiers, names):
        globe.getLogger().info('Testing Classifier: %s', name)
        out.write('Testing Classifier: %s\n' % name)
        cv.crossValidate(dataset, clf(), out)
        scores = cross_validation.cross_val_predict(clf(), dataset.instances, dataset.labels, cv = 10, verbose = 0)
        out.write('Total:\n'+ str(confusion_matrix(dataset.labels, scores)))
Example #8
File: text_cls.py Project: lvjunmei/thesis
def testClassifiers(dataset, out):
    names = ['MultinomialNB', 'BernoulliNB']

    classifiers = [MultinomialNB, BernoulliNB]

    for clf, name in zip(classifiers, names):
        globe.getLogger().info('Testing Classifier: %s', name)
        out.write('Testing Classifier: %s\n' % name)
        cv.crossValidate(dataset, clf(), out)
        scores = cross_validation.cross_val_predict(clf(),
                                                    dataset.instances,
                                                    dataset.labels,
                                                    cv=10,
                                                    verbose=0)
        out.write('Total:\n' + str(confusion_matrix(dataset.labels, scores)))
Example #9
File: para2vec.py Project: Jeky/thesis
def train(dataset, dim):
    globe.getLogger().info('Training Paragraph Vectors (Dimension = %d)' % dim)
    model = Doc2Vec(size = dim, window = 8, workers = 8, alpha=0.025, min_alpha=0.025, min_count = 2)

    model.build_vocab(dataset.instances)

    for epoch in range(10):
        globe.getLogger().info('Training %d time' % epoch)
        model.train(dataset.instances)
        model.alpha -= 0.002 # decrease the learning rate
        model.min_alpha = model.alpha # fix the learning rate, no decay
        model.train(dataset.instances)

    model.save(PARA_MODEL)
    return model
Example #10
File: para2vec.py Project: Jeky/thesis
def loadParaDataset():
    globe.getLogger().info('Loading Dataset')
    dataset = loadDataset(DOC_DATAEST)
    paraDataset = Dataset()
    paraDataset.users = dataset.users
    paraDataset.labels = dataset.labels

    for i, instance in enumerate(dataset.instances):
        if i % 100 == 0:
            globe.getLogger().info('Processed %d instances' % i)
        paraDataset.instances.append(LabeledSentence(words = instance.split(), tags = [u'T_%d' % i]))

    dataset.save(PRE_PARA_DOC)

    return paraDataset
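Note: LabeledSentence and the Doc2Vec(size=...) constructor used in Examples #9 and #10 (and their duplicates below) come from older gensim releases; in gensim 4.x the document class is TaggedDocument, the dimension parameter is vector_size, and train() needs total_examples and epochs. A minimal sketch under those assumptions (train_doc2vec is illustrative):

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

def train_doc2vec(texts, dim):
    # texts: list of raw documents, tagged the same way as above
    corpus = [TaggedDocument(words=t.split(), tags=['T_%d' % i])
              for i, t in enumerate(texts)]
    model = Doc2Vec(vector_size=dim, window=8, workers=8, min_count=2)
    model.build_vocab(corpus)
    model.train(corpus, total_examples=model.corpus_count, epochs=10)
    return model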
Example #11
def loadParaDataset():
    globe.getLogger().info('Loading Dataset')
    dataset = loadDataset(DOC_DATAEST)
    paraDataset = Dataset()
    paraDataset.users = dataset.users
    paraDataset.labels = dataset.labels

    for i, instance in enumerate(dataset.instances):
        if i % 100 == 0:
            globe.getLogger().info('Processed %d instances' % i)
        paraDataset.instances.append(
            LabeledSentence(words=instance.split(), tags=[u'T_%d' % i]))

    dataset.save(PRE_PARA_DOC)

    return paraDataset
Example #12
File: user2vec.py Project: lvjunmei/thesis
def loadWord2vec(dim):
    globe.getLogger().info('Start Loading word vector file')

    with open(PATH + '/tweets-%d.bin.txt' % dim) as fin:
        words = {}
        count, dim = fin.readline().strip().split()
        dim = int(dim)

        for i, l in enumerate(fin.xreadlines()):
            if i % 100000 == 0 and i != 0:
                globe.getLogger().info('read %d lines' % i)

            vector = l.strip().split()
            words[vector[0]] = np.array([float(v) for v in vector[1:]])

    return words
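Note: the word2vec text file parsed above (a header line with count and dimension, then one word and its vector per line) can also be loaded with gensim's KeyedVectors, assuming gensim is available:

from gensim.models import KeyedVectors

def loadWord2vecKV(dim):
    # load_word2vec_format parses the header and the per-word vectors
    kv = KeyedVectors.load_word2vec_format(PATH + '/tweets-%d.bin.txt' % dim,
                                           binary=False)
    return kv    # kv['word'] gives the numpy vector, like words['word'] above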
Example #13
File: para2vec.py Project: Jeky/thesis
def test(model, dim, dataset, out):
    instances = [model.docvecs[u'T_%d' % i] for i in range(len(dataset.instances))]

    d = Dataset()
    zeroCount = 0
    for i, ins in enumerate(instances):
        if np.isfinite(ins).all():
            d.labels.append(dataset.labels[i])
            d.instances.append(ins)
            d.users.append(dataset.users[i])
        else:
            zeroCount += 1

    d.save(PATH + 'paragraph-vector-s%d.obj' % dim)

    globe.getLogger().info('Zero Count: %d' % zeroCount)
Example #14
File: data.py Project: lvjunmei/thesis
    def read(self, fname):
        '''
        Read dataset from original dataset file.

        Format:
        !ID \t IS_SUSPENDED
        TWEET1
        TWEET2
        ...
        !ID \t IS_SUSPENDED
        ...

        '''

        users = []

        u = {}
        with open(fname) as fin:
            globe.getLogger().info('Start reading file: %s', fname)
            for i, l in enumerate(fin.xreadlines()):
                l = l.strip()

                if i != 0 and i % 10000 == 0:
                    globe.getLogger().info('Read %d lines', i)

                if l != '':
                    if l[0] == '!':
                        if 'id' in u:
                            users.append(u)
                            u = {}

                        uid, isSuspended = l.split()
                        u['id'] = int(uid[1:])
                        u['suspended'] = int(isSuspended)
                        u['tweets'] = []
                    else:
                        u['tweets'].append(l)

        users.append(u)

        for u in users:
            self.users.append(u['id'])
            self.labels.append(u['suspended'])
            self.instances.append('\n'.join(u['tweets']))
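The parser expects each user block to start with a '!' line followed by that user's tweets. A minimal round-trip sketch of the format (the file name, IDs, and tweet text below are made up purely for illustration):

sample = ('!123\t1\n'
          'first tweet of user 123\n'
          'second tweet of user 123\n'
          '!456\t0\n'
          'only tweet of user 456\n')

with open('sample_dataset.txt', 'w') as f:    # hypothetical file name
    f.write(sample)

d = Dataset()                                  # Dataset as defined in data.py
d.read('sample_dataset.txt')
# afterwards: d.users == [123, 456], d.labels == [1, 0]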
Example #15
File: data.py Project: Jeky/thesis
    def read(self, fname):
        '''
        Read dataset from original dataset file.

        Format:
        !ID \t IS_SUSPENDED
        TWEET1
        TWEET2
        ...
        !ID \t IS_SUSPENDED
        ...

        '''

        users = []

        u = {}
        with open(fname) as fin:
            globe.getLogger().info('Start reading file: %s', fname)
            for i, l in enumerate(fin.xreadlines()):
                l = l.strip()

                if i != 0 and i % 10000 == 0:
                    globe.getLogger().info('Read %d lines', i)

                if l != '':
                    if l[0] == '!':
                        if 'id' in u:
                            users.append(u)
                            u = {}

                        uid, isSuspended = l.split()
                        u['id'] = int(uid[1:])
                        u['suspended'] = int(isSuspended)
                        u['tweets'] = []
                    else:
                        u['tweets'].append(l)

        users.append(u)

        for u in users:
            self.users.append(u['id'])
            self.labels.append(u['suspended'])
            self.instances.append('\n'.join(u['tweets']))
Example #16
def test(model, dim, dataset, out):
    instances = [
        model.docvecs[u'T_%d' % i] for i in range(len(dataset.instances))
    ]

    d = Dataset()
    zeroCount = 0
    for i, ins in enumerate(instances):
        if np.isfinite(ins).all():
            d.labels.append(dataset.labels[i])
            d.instances.append(ins)
            d.users.append(dataset.users[i])
        else:
            zeroCount += 1

    d.save(PATH + 'paragraph-vector-s%d.obj' % dim)

    globe.getLogger().info('Zero Count: %d' % zeroCount)
Example #17
def train(dataset, dim):
    globe.getLogger().info('Training Paragraph Vectors (Dimension = %d)' % dim)
    model = Doc2Vec(size=dim,
                    window=8,
                    workers=8,
                    alpha=0.025,
                    min_alpha=0.025,
                    min_count=2)

    model.build_vocab(dataset.instances)

    for epoch in range(10):
        globe.getLogger().info('Training %d time' % epoch)
        model.train(dataset.instances)
        model.alpha -= 0.002  # decrease the learning rate
        model.min_alpha = model.alpha  # fix the learning rate, no decay
        model.train(dataset.instances)

    model.save(PARA_MODEL)
    return model
Example #18
def evaluateDF(dataset, features, top):
    globe.getLogger().info('Evaluating with feature count = %d' % top)
    # filter dataset
    filteredDS = Dataset()
    def addFeature(ins, fins, f):
        if f in ins:
            fins[f] = ins[f]
        else:
            fins[f] = 0

    globe.getLogger().info('Filtering Dataset')
    count = 0
    for uid, label, instance in zip(dataset.users, dataset.labels, dataset.instances):
        if count % 100 == 0 and count != 0:
            globe.getLogger().info('processed %d instances' % count)

        filteredInstance = {}
        for i in range(top / 2):
            addFeature(instance, filteredInstance, features[i])
            addFeature(instance, filteredInstance, features[-(i+1)])

        filteredDS.users.append(uid)
        filteredDS.labels.append(label)
        filteredDS.instances.append(filteredInstance)

        count += 1

    filteredDS.instances = DictVectorizer().fit_transform(filteredDS.instances)

    # evaluate
    text_cls.testClassifiers(filteredDS)
Example #19
def countDF(splitter, outputFilename):
    dataset = loadDataset(DOC_DATAEST)
    sGram = Counter()
    nsGram = Counter()

    i = 0
    for label, instance in zip(dataset.labels, dataset.instances):
        if i % 100 == 0 and i != 0:
            globe.getLogger().info('processed %d instances' % i)

        grams = splitter(instance)
        for g in grams:
            if label == SUSPENDED_LABEL:
                sGram[g] += 1
            else:
                nsGram[g] += 1

        i += 1

    globe.getLogger().info('Len(sGram) = %d, Len(nsGram) = %d' % (len(sGram), len(nsGram)))
    features = {}

    for g, c in sGram.items():
        if g not in features:
            features[g] = [c, 0, 0.0]
        else:
            features[g][0] = c

    for g, c in nsGram.items():
        if g not in features:
            features[g] = [0, c, 0.0]
        else:
            features[g][1] = c

    globe.getLogger().info('Sorting Grams by DF')
    features = [(k, v[0], v[1], float(v[0] + 1) / (v[1] + 1)) for k, v in features.items()]
    features.sort(key = lambda item : -item[-1])

    globe.getLogger().info('Saving Result')
    with open(outputFilename, 'w') as fout:
        for f in features:
            fout.write('%s\t%d\t%d\t%.10f\n' % f)
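countDF ranks every n-gram by the add-one-smoothed ratio of its document frequency in suspended vs. non-suspended users, (df_s + 1) / (df_ns + 1). A sketch of the same unigram counts via a binary CountVectorizer (recent scikit-learn assumed; the helper name is illustrative):

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

def df_ratio(texts, labels, suspended_label):
    vect = CountVectorizer(token_pattern=r'\S+', binary=True)
    X = vect.fit_transform(texts)              # 0/1 entries, so column sums are DFs
    y = np.asarray(labels) == suspended_label
    df_s = np.asarray(X[y].sum(axis=0)).ravel()
    df_ns = np.asarray(X[~y].sum(axis=0)).ravel()
    score = (df_s + 1.0) / (df_ns + 1.0)
    terms = vect.get_feature_names_out()
    order = np.argsort(-score)
    return [(terms[i], int(df_s[i]), int(df_ns[i]), float(score[i])) for i in order]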
Example #20
def testMI(d):
    globe.getLogger().info('length of vocabulary = %d', len(d.vocabulary))

    # print d.instances.shape
    c = biclass_mutual_info(d.instances, d.labels)
    scores = [(d.vocabulary[i], sf) for i, sf in enumerate(c)]
    scores.sort(key = lambda i: -i[1])
    with open(MI_OUTPUT, 'w') as fout:
        for s in scores:
            fout.write('%s\t%.10f\n' % s)

    size = 1
    while 10 ** size < len(d.vocabulary):
        clf = MultinomialNB()
        X = SelectKBest(biclass_mutual_info, k= 10 ** size).fit_transform(d.instances, d.labels)
        scores = cross_validation.cross_val_predict(clf, X, d.labels, cv = 10, verbose = 0)
        globe.getLogger().info('10^%d\t%.6f\t%.6f', size, accuracy_score(d.labels, scores), f1_score(d.labels, scores))
        globe.getLogger().info(confusion_matrix(d.labels, scores))
        size += 1

    clf = MultinomialNB()
    scores = cross_validation.cross_val_predict(clf, d.instances, d.labels, cv = 10, verbose = 0)
    globe.getLogger().info('%.1e\t%.6f\t%.6f', len(d.vocabulary), accuracy_score(d.labels, scores), f1_score(d.labels, scores))
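biclass_mutual_info is a project-specific scorer; scikit-learn's built-in mutual_info_classif can be plugged into SelectKBest the same way. A minimal sketch (recent scikit-learn assumed; test_mi_sklearn is illustrative):

from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.model_selection import cross_val_predict
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score

def test_mi_sklearn(X, y, k):
    # keep the k features with the highest estimated mutual information
    X_k = SelectKBest(mutual_info_classif, k=k).fit_transform(X, y)
    pred = cross_val_predict(MultinomialNB(), X_k, y, cv=10)
    return accuracy_score(y, pred), f1_score(y, pred)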
Example #21
File: cv.py Project: Jeky/thesis
def crossValidate(dataset, cls, out):
    kfold = KFold(len(dataset.users), 10, shuffle = True, random_state = 42)

    count = 1
    for trainIndex, testIndex in kfold:
        out.write('Cross Validation %d Time\n' % count)
        globe.getLogger().info('Cross Validation %d Time' % count)

        trainX = dataset.instances[trainIndex]
        trainY = dataset.labels[trainIndex]

        testX = dataset.instances[testIndex]
        testY = dataset.labels[testIndex]

        globe.getLogger().info('Training...')
        cls.fit(trainX, trainY)

        globe.getLogger().info('Testing...')
        predicted = cls.predict(testX)

        out.write(np.array_str(confusion_matrix(testY, predicted)) + '\n')

        count += 1
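Note: KFold(len(dataset.users), 10, ...) is the pre-0.18 scikit-learn signature; current releases take n_splits and yield the folds from split(). A sketch of the same loop under that API (crossValidateModern is illustrative):

import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix

def crossValidateModern(dataset, cls, out):
    labels = np.asarray(dataset.labels)
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    for count, (trainIndex, testIndex) in enumerate(kfold.split(dataset.instances), 1):
        out.write('Cross Validation %d Time\n' % count)
        cls.fit(dataset.instances[trainIndex], labels[trainIndex])
        predicted = cls.predict(dataset.instances[testIndex])
        out.write(np.array_str(confusion_matrix(labels[testIndex], predicted)) + '\n')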
Example #22
File: cv.py Project: lvjunmei/thesis
def crossValidate(dataset, cls, out):
    kfold = KFold(len(dataset.users), 10, shuffle=True, random_state=42)

    count = 1
    for trainIndex, testIndex in kfold:
        out.write('Cross Validation %d Time\n' % count)
        globe.getLogger().info('Cross Validation %d Time' % count)

        trainX = dataset.instances[trainIndex]
        trainY = dataset.labels[trainIndex]

        testX = dataset.instances[testIndex]
        testY = dataset.labels[testIndex]

        globe.getLogger().info('Training...')
        cls.fit(trainX, trainY)

        globe.getLogger().info('Testing...')
        predicted = cls.predict(testX)

        out.write(np.array_str(confusion_matrix(testY, predicted)) + '\n')

        count += 1
Example #23
File: data.py Project: Jeky/thesis
    d.read(ORIGINAL_DATASET)
    d.save(DOC_DATAEST)

    d1 = Dataset()
    d1.users = d.users
    d1.labels = np.array(d.labels)

    count_vect = CountVectorizer(token_pattern=r'\S+')
    X_train_counts = count_vect.fit_transform(d.instances)
    d1.instances = X_train_counts
    d1.save(TOKEN_COUNT_DATASET)
    d1.vocabulary = {v:k for k, v in count_vect.vocabulary_.items()}

    # tfidf_transformer = TfidfTransformer()
    # d1.instances = tfidf_transformer.fit_transform(X_train_counts)
    # d1.save(TOKEN_NORM_COUNT_DATASET)

    count_vect = CountVectorizer(ngram_range=(2, 2), token_pattern=r'\S+')
    X_train_counts = count_vect.fit_transform(d.instances)
    d1.instances = X_train_counts
    d1.vocabulary = {v:k for k, v in count_vect.vocabulary_.items()}
    d1.save(BIGRAM_TOKEN_COUNT_DATASET)

    # tfidf_transformer = TfidfTransformer()
    # d1.instances = tfidf_transformer.fit_transform(X_train_counts)
    # d1.save(BIGRAM_TOKEN_NORM_COUNT_DATASET)

if __name__ == '__main__':
    globe.getLogger().info('Initialize Dataset')
    initNGramDatasets()
Example #24
File: data.py Project: Jeky/thesis
    def save(self, fname):
        globe.getLogger().info('Save Dataset to %s', fname)
        with open(fname, 'wb') as fout:
            pickle.dump(self, fout)
Example #25
File: data.py Project: Jeky/thesis
def loadDataset(fname):
    globe.getLogger().info('Load Dataset from %s', fname)
    with open(fname, 'rb') as fin:  # binary mode, matching the 'wb' used by save()
        return pickle.load(fin)
Example #26
File: data.py Project: lvjunmei/thesis
    d.save(DOC_DATAEST)

    d1 = Dataset()
    d1.users = d.users
    d1.labels = np.array(d.labels)

    count_vect = CountVectorizer(token_pattern=r'\S+')
    X_train_counts = count_vect.fit_transform(d.instances)
    d1.instances = X_train_counts
    d1.save(TOKEN_COUNT_DATASET)
    d1.vocabulary = {v: k for k, v in count_vect.vocabulary_.items()}

    # tfidf_transformer = TfidfTransformer()
    # d1.instances = tfidf_transformer.fit_transform(X_train_counts)
    # d1.save(TOKEN_NORM_COUNT_DATASET)

    count_vect = CountVectorizer(ngram_range=(2, 2), token_pattern=r'\S+')
    X_train_counts = count_vect.fit_transform(d.instances)
    d1.instances = X_train_counts
    d1.vocabulary = {v: k for k, v in count_vect.vocabulary_.items()}
    d1.save(BIGRAM_TOKEN_COUNT_DATASET)

    # tfidf_transformer = TfidfTransformer()
    # d1.instances = tfidf_transformer.fit_transform(X_train_counts)
    # d1.save(BIGRAM_TOKEN_NORM_COUNT_DATASET)


if __name__ == '__main__':
    globe.getLogger().info('Initialize Dataset')
    initNGramDatasets()
Example #27
File: data.py Project: lvjunmei/thesis
    def save(self, fname):
        globe.getLogger().info('Save Dataset to %s', fname)
        with open(fname, 'wb') as fout:
            pickle.dump(self, fout)
Example #28
File: data.py Project: lvjunmei/thesis
def loadDataset(fname):
    globe.getLogger().info('Load Dataset from %s', fname)
    with open(fname, 'rb') as fin:  # binary mode, matching the 'wb' used by save()
        return pickle.load(fin)