Exemplo n.º 1
0
def m007(label, cut_bound=10, drop_bound=1, max_level=10):
    """  """
    
    training = []
    test = []
    features = Counter()
        
    stderr.write('constructing training set... ')
    training_size = 0
    for id, lab, history in iterdata(label):
        training.append((id, lab, []))
        for url in history.iterkeys():
            if url[-1] == '/':
                url = url[:-1]
            training[training_size][2].append((url, 1))
            features[url] += 1
        training_size += 1
    stderr.write('done\n')

    stderr.write('modifying training set... \n')
    for level in range(max_level, -1, -1):
        stderr.write('\titeration with prefix length ' + str(level) + '... ')
        for i in range(training_size):
            for j in range(len(training[i][2])):
                url = training[i][2][j][0]
                if features[url] < cut_bound:
                    url = url_prefix(url, level)
                    training[i][2][j] = (url, 1)
                    features[url] += 1
        stderr.write('done\n')
    stderr.write('... done\n')

    stderr.write('selecting most visited URLs as features... ')
    features = set([f for f, c in features.iteritems() if c >= drop_bound])
    features_map = dict([(f, c) for c, f in enumerate(features)])
    stderr.write('done\n')

    stderr.write('constructing test set... ')
    test_size = 0
    for id, lab, history in iterdata(label, 'test'):
        test.append((id, lab, []))
        for url in history.iterkeys():
            if url[-1] == '/':
                url = url[:-1]
            for url in url_prefixes(url):
                if url in features:
                    break
            if url in features:
                test[test_size][2].append((url, 1))
        test_size += 1
    stderr.write('done\n')
    
    stderr.write('converting datasets to sparse matrices... ')
    X, y = to_matrix(training, features_map)
    X_test, y_test = to_matrix(test, features_map)
    stderr.write('done\n')
    
    return X, y, X_test, y_test, features_map
Exemplo n.º 2
0
def m007(label, cut_bound=10, drop_bound=1, max_level=10):
    """  """

    training = []
    test = []
    features = Counter()

    stderr.write('constructing training set... ')
    training_size = 0
    for id, lab, history in iterdata(label):
        training.append((id, lab, []))
        for url in history.iterkeys():
            if url[-1] == '/':
                url = url[:-1]
            training[training_size][2].append((url, 1))
            features[url] += 1
        training_size += 1
    stderr.write('done\n')

    stderr.write('modifying training set... \n')
    for level in range(max_level, -1, -1):
        stderr.write('\titeration with prefix length ' + str(level) + '... ')
        for i in range(training_size):
            for j in range(len(training[i][2])):
                url = training[i][2][j][0]
                if features[url] < cut_bound:
                    url = url_prefix(url, level)
                    training[i][2][j] = (url, 1)
                    features[url] += 1
        stderr.write('done\n')
    stderr.write('... done\n')

    stderr.write('selecting most visited URLs as features... ')
    features = set([f for f, c in features.iteritems() if c >= drop_bound])
    features_map = dict([(f, c) for c, f in enumerate(features)])
    stderr.write('done\n')

    stderr.write('constructing test set... ')
    test_size = 0
    for id, lab, history in iterdata(label, 'test'):
        test.append((id, lab, []))
        for url in history.iterkeys():
            if url[-1] == '/':
                url = url[:-1]
            for url in url_prefixes(url):
                if url in features:
                    break
            if url in features:
                test[test_size][2].append((url, 1))
        test_size += 1
    stderr.write('done\n')

    stderr.write('converting datasets to sparse matrices... ')
    X, y = to_matrix(training, features_map)
    X_test, y_test = to_matrix(test, features_map)
    stderr.write('done\n')

    return X, y, X_test, y_test, features_map
Exemplo n.º 3
0
def m008(label, lower_pop_bound=5):
    """  """
    
    training = []
    test = []
    features = Counter()
    
    # Constructing training set.
    # If http(s)://<X>/<folder> is visited, we say that
    # http(s)://<X> is also visited, and apply this recursively.
    stderr.write('constructing training set... ')
    training_size = 0
    for id, lab, history in iterdata(label):
        training.append((id, lab, set()))
        for url in history.iterkeys():
            for x in url_prefixes(url):
                training[training_size][2].add((x, 1))
                features[x] += 1
        training_size += 1
    stderr.write('done\n')

    stderr.write('selecting most visited URLs as features... ')
    features = set([f for f, c in features.iteritems()
                                            if c >= lower_pop_bound])
    features_map = dict([(f, c) for c, f in enumerate(features)])
    stderr.write('done\n')

    stderr.write('constructing test set... ')
    test_size = 0
    for id, lab, history in iterdata(label, 'test'):
        test.append((id, lab, set()))
        for url in history.iterkeys():
            for x in url_prefixes(url):
                if x in features:
                    test[test_size][2].add((x, 1))
        test_size += 1
    stderr.write('done\n')
    
    stderr.write('converting datasets to sparse matrices... ')
    X, y = to_matrix(training, features_map)
    X_test, y_test = to_matrix(test, features_map)
    stderr.write('done\n')
    
    return X, y, X_test, y_test, features_map