Example No. 1
def generate_candidates(num_to_generate, out_path, pos, neg, exclude):

    # Open the file to which we will write candidates
    out_f = open(out_path, 'w')

    # Read in the extracted features; we need them both to build the classifier
    # and to enumerate candidate tokens.
    features = extract_features.FeatureAccumulator(
        load=BEST_WORDNET_ONLY_FEATURES_PATH)

    # Make the best performing classifier.  This is what we'll use to score the
    # "relationalness" of new words.
    clf = classifier.make_classifier(kind='svm',
                                     features=features,
                                     positives=pos,
                                     negatives=neg,
                                     **BEST_CLASSIFIER_CONFIG)

    # Now generate the candidates.  Only positives count toward num_to_generate,
    # because there are always more negatives than we need.
    num_generated = 0
    for token in features.dictionary.get_token_list():
        if token in exclude:
            print '\t\tx\t%s' % token
            continue
        score = clf.score(token)[0]
        if score > clf.threshold:
            print '%s\t+' % token
            out_f.write('%s\t+\t%f\n' % (token, score))
            num_generated += 1
            if num_generated == num_to_generate:
                break
        else:
            print '\t-\t%s' % token
            out_f.write('%s\t-\t%f\n' % (token, score))

    out_f.close()
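A hedged usage sketch for the generator above, reusing the train/test split helper seen in the other examples on this page. The output path, the candidate count, and the idea of excluding every already-annotated word are assumptions for illustration, not the project's actual driver code.

train, test = utils.get_train_test_split()

# Hypothetical: exclude everything that already has a gold label.
already_labeled = (set(train['pos']) | set(train['neg']) |
                   set(test['pos']) | set(test['neg']))

generate_candidates(
    num_to_generate=500,                                 # illustrative count
    out_path=os.path.join(DATA_DIR, 'candidates.tsv'),   # hypothetical location
    pos=train['pos'],
    neg=train['neg'],
    exclude=already_labeled
)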
Example No. 2
def prune_to_top_k_features(k):
    """
    Prunes the features to only the k features having highest mutual 
    information.  Only the features listed in
    extract_features.COUNT_BASED_FEATURES were subjected to this filtering.
    """

    in_path = os.path.join(
        RELATIONAL_NOUN_FEATURES_DIR, 'accumulated450-min_token_5-min_feat1000')
        
    out_path = os.path.join(
        RELATIONAL_NOUN_FEATURES_DIR, 'accumulated450-min_token_5-top_%d' % k)

    # Get the top k features to be kept
    print 'getting the top %d features' % k
    keep_features = get_top_k_features(k)

    # Load the base set of features that we'll be pruning
    features = ef.FeatureAccumulator(
        vocabulary=utils.read_wordnet_index(), load=in_path)

    # Do the pruning
    print 'pruning...'
    features.prune_features_keep_only(keep_features)

    # Save the pruned features
    print 'writing pruned features to disc...'
    features.write(out_path)
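get_top_k_features is not shown on this page. As a minimal sketch of the kind of selection it presumably performs, one can pick the k highest-scoring entries from a {feature: mutual information} mapping with heapq; the function name and the example scores below are illustrative assumptions, not the project's code.

import heapq

def top_k_by_score(scores, k):
    # scores: dict mapping feature name -> estimated mutual information.
    # Returns the k feature names with the highest scores.
    return set(heapq.nlargest(k, scores, key=scores.get))

# e.g. top_k_by_score({'child:nmod:NN': 0.40, 'suffix:er': 0.12, 'pos_unigram:DT': 0.03}, 2)
# -> {'child:nmod:NN', 'suffix:er'}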
Example No. 3
def optimize_syntactic_feature_sets2():

    # We'll write results for this hyperparameter optimization here:
    out_path = os.path.join(HYPERPARAMETER_TUNING_DIR,
                            'optimize_syntactic_feature_sets2.tsv')

    # Read in the training set splits and the features
    train, test = utils.get_train_test_split()
    features_path = os.path.join(DATA_DIR,
                                 'relational-noun-features-lexical-wordnet',
                                 '0ba')
    features = extract_features.FeatureAccumulator(
        vocabulary=utils.read_wordnet_index(), load=features_path)

    # Define the ranges over which parameters should be varied
    parameter_ranges = {
        'syntax_feature_types': [
            #[],
            #['baseline'],
            #['dependency'],
            #['hand_picked'],
            ['pos_unigram'],
            ['pos_unigram', 'pos_bigram'],
            ['lemma_unigram'],
            ['lemma_unigram', 'lemma_bigram'],
            ['surface_unigram', 'surface_bigram'],
            #['dependency', 'hand_picked'],
            #['baseline', 'hand_picked'],
            #['baseline', 'dependency'],
            #['baseline', 'dependency', 'hand_picked'],
        ]
    }

    # Define the values of parameters to be held constant
    constants = {
        'kind': 'svm',
        'on_unk': False,
        'C': 0.01,
        'semantic_similarity': 'res',
        'include_suffix': True,
        'syntactic_multiplier': 10.0,
        'semantic_multiplier': 2.0,
        'suffix_multiplier': 0.2
    }

    # Generate all combinations of the varied parameters, merging in the
    # constant parameters.
    classifier_definitions = test_classifier.generate_classifier_definitions(
        parameter_ranges, constants)

    # Evaluate the classifier when running for all classifier definitions
    test_classifier.optimize_classifier(classifier_definitions,
                                        features,
                                        train['pos'],
                                        train['neg'],
                                        test['pos'],
                                        test['neg'],
                                        out_path,
                                        num_procs=1)
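test_classifier.generate_classifier_definitions is used here (and again in optimize_pruning2 below) but not shown. A minimal sketch of the behaviour these callers rely on, namely the Cartesian product over parameter_ranges with the constants merged into every definition, might look like the following; the function name is illustrative and the real implementation may differ in details such as ordering.

import itertools

def generate_definitions(parameter_ranges, constants):
    # One definition per combination of the varied parameters, each merged
    # with the constant parameters.
    names = sorted(parameter_ranges)
    definitions = []
    for values in itertools.product(*(parameter_ranges[name] for name in names)):
        definition = dict(constants)
        definition.update(zip(names, values))
        definitions.append(definition)
    return definitions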
Example No. 4
def prune_features_more():
    in_path = os.path.join(
        RELATIONAL_NOUN_FEATURES_DIR, 'accumulated450-min_token_5-min_feat5000')
    out_path = os.path.join(
        RELATIONAL_NOUN_FEATURES_DIR, 'accumulated450-min_token_5-min_feat1000')
    features = ef.FeatureAccumulator(
        vocabulary=utils.read_wordnet_index(), load=in_path)
    features.prune_features(1000)
    features.write(out_path)
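Judging from the input and output directory names, prune_features(1000) appears to drop any feature observed fewer than 1000 times. An illustration of that kind of frequency cutoff applied to a plain counts dict (an assumption about what the method does, not its actual code):

def prune_by_min_frequency(feature_counts, min_frequency):
    # Keep only features whose accumulated count reaches the threshold.
    return {f: c for f, c in feature_counts.items() if c >= min_frequency}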
Example No. 5
    def test_get_dep_tree_features(self):
        # Make a mock (empty) dictionary (does not affect test, but needed to 
        # create the feature accumulator).
        dictionary = set()

        # Make a mock dependency tree
        F = {
            'parents':[],
            'children':[],
            'pos':'pos_F'
        }
        E = {
            'parents':[('rel_F', F)],
            'children':[],
            'pos':'pos_E'
        }
        D = {
            'parents':[],
            'children':[],
            'pos':'pos_D'
        }
        C = {
            'parents':[('rel_E', E)],
            'children':[('rel_D', D)],
            'pos':'pos_C'
        }
        B = {
            'parents':[],
            'children':[],
            'pos':'pos_B'
        }
        BB = {
            'parents':[],
            'children':[],
            'pos':'pos_BB'
        }
        A = {
            'parents':[('rel_C', C)],
            'children':[('rel_B', B), ('rel_BB', BB)],
            'pos':'pos_A'
        }

        accumulator = extract_features.FeatureAccumulator(dictionary)
        features = accumulator.get_dep_tree_features_recurse(A, depth=2)

        # Note that because we called it with depth=2, no feature is made for 
        # token F
        expected_features = [
            'parent:rel_C:pos_C', 'parent:rel_C:pos_C-parent:rel_E:pos_E',
            'parent:rel_C:pos_C-child:rel_D:pos_D', 'child:rel_B:pos_B',
            'child:rel_BB:pos_BB'
        ]

        self.assertItemsEqual(features, expected_features)
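The test above pins down the recursion that get_dep_tree_features_recurse is expected to perform. The following self-contained sketch reproduces the expected features for depth=2; the function name is illustrative, and the project's implementation lives on the feature accumulator.

def dep_tree_features(node, depth):
    # Collect 'parent:rel:pos' / 'child:rel:pos' features from a token's
    # dependency neighbourhood, chaining deeper hops with '-' until the
    # depth budget is used up.
    features = []
    if depth < 1:
        return features
    for direction, key in (('parent', 'parents'), ('child', 'children')):
        for relation, neighbor in node[key]:
            feature = '%s:%s:%s' % (direction, relation, neighbor['pos'])
            features.append(feature)
            for deeper in dep_tree_features(neighbor, depth - 1):
                features.append(feature + '-' + deeper)
    return features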
Example No. 6
def optimize_pruning2():

    # We'll write results for this hyperparameter optimization here:
    out_path = os.path.join(HYPERPARAMETER_TUNING_DIR, 'optimize_pruning2.tsv')

    # Read in the training set splits and the features
    train, test = utils.get_train_test_split()
    features = extract_features.FeatureAccumulator(load=os.path.join(
        DATA_DIR, 'relational-noun-features-wordnet-only', 'accumulated'))

    # Define the ranges over which parameters should be varied
    parameter_ranges = {
        'min_feature_frequency': [
            200,
            500,
            1000,
            2000,
            5000,
            10000,
            #20000, 50000, 100000, 200000, 500000, 1000000,
        ]
    }

    # Define the values of parameters to be held constant
    constants = {
        'kind': 'svm',
        'on_unk': False,
        'C': 0.01,
        'syntax_feature_types': ['baseline', 'dependency', 'hand_picked'],
        'semantic_similarity': 'res',
        'include_suffix': True,
        'syntactic_multiplier': 0.33,
        'semantic_multiplier': 0.33,
        'suffix_multiplier': 0.33,
    }

    # Generate all combinations of the varied parameters, merging in the
    # constant parameters.
    classifier_definitions = test_classifier.generate_classifier_definitions(
        parameter_ranges, constants)

    # Evaluate the classifier when running for all classifier definitions
    test_classifier.optimize_classifier(classifier_definitions,
                                        features,
                                        train['pos'],
                                        train['neg'],
                                        test['pos'],
                                        test['neg'],
                                        out_path,
                                        num_procs=12)
Example No. 7
def generate_candidates_ordinal(num_to_generate,
                                out_path,
                                pos,
                                neg,
                                neut,
                                exclude,
                                kernel=None,
                                features=None):

    # Open the file to which we will write candidates
    out_f = open(out_path, 'w')

    # Read in the extracted features (unless provided); we need them both to
    # build the classifier and to enumerate candidate tokens.
    if features is None:
        features = extract_features.FeatureAccumulator(
            load=BEST_WORDNET_ONLY_FEATURES_PATH)

    # Make the best performing classifier.  This is what we'll use to score the
    # "relationalness" of new words.
    clf = classifier.make_classifier(kind='osvm',
                                     kernel=kernel,
                                     features=features,
                                     positives=pos,
                                     negatives=neg,
                                     neutrals=neut,
                                     **BEST_CLASSIFIER_CONFIG)

    # Now generate the candidates.  Only positives count toward num_to_generate,
    # because there are always more negatives than we need.
    num_generated = 0
    filtered_tokens = [
        t for t in features.dictionary.get_token_list() if t not in exclude
    ]

    for token, score in clf.score_parallel(filtered_tokens):
        if score >= 1:
            print '%s\t+' % token
            out_f.write('%s\t+\t%f\n' % (token, score))
            num_generated += 1
            if num_generated == num_to_generate:
                break
        elif score > -1:
            print '\t0\t%s' % token
            out_f.write('%s\t0\t%f\n' % (token, score))
        else:
            print '\t-\t%s' % token
            out_f.write('%s\t-\t%f\n' % (token, score))

    out_f.close()
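The branches above bucket the ordinal classifier's raw score with fixed cut points at +1 and -1. Written as a small helper for clarity (illustrative only, not part of the project):

def ordinal_label(score):
    # Map an ordinal SVM score to the three annotation labels used above.
    if score >= 1:
        return '+'   # relational
    elif score > -1:
        return '0'   # neutral
    return '-'       # non-relational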
Example No. 8
def calculate_mutual_information(feature_sets, out_fname):

    # Tolerate providing a single feature set.  Make into a proper set.
    if isinstance(feature_sets, basestring):
        feature_sets = set([feature_sets])
    else:
        feature_sets = set(feature_sets)

    # Separate count based features and non-count features
    count_based_features = list(feature_sets & set(ef.COUNT_BASED_FEATURES))
    non_count_features = list(feature_sets & set(ef.NON_COUNT_FEATURES))

    # Validation: make sure no unexpected or misspelled features were provided
    unexpected_features = (feature_sets - set(ef.COUNT_BASED_FEATURES) -
                           set(ef.NON_COUNT_FEATURES))
    if unexpected_features:
        raise ValueError('Unexpected feature(s): %s' %
                         ', '.join(unexpected_features))

    # Define the path at which to write the mutual information results
    out_path = os.path.join(RELATIONAL_NOUN_FEATURES_DIR, out_fname)

    # Load the accumulated features
    wni = utils.read_wordnet_index()
    features_path = os.path.join(RELATIONAL_NOUN_FEATURES_DIR,
                                 'accumulated450-min_token_5-min_feat1000')
    start = time.time()
    features = ef.FeatureAccumulator(wni, load=features_path)
    print 'time to read features: %s seconds' % (time.time() - start)

    # Load relational noun annotations
    annots = annotations.Annotations(features.dictionary)

    features.calculate_mutual_information(annots, out_path,
                                          count_based_features,
                                          non_count_features)
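The computation itself happens inside FeatureAccumulator.calculate_mutual_information. For reference, the mutual information between a binarised feature and the relational-noun label is I(X;Y) = sum over x,y of p(x,y) * log(p(x,y) / (p(x) p(y))). A small estimator from co-occurrence counts (an illustrative sketch, not the project's code):

import math
from collections import Counter

def binary_mutual_information(pairs):
    # pairs: iterable of (feature_present, is_relational) observations, one per token.
    joint = Counter(pairs)
    total = float(sum(joint.values()))
    marginal_x = Counter(x for x, _ in joint.elements())
    marginal_y = Counter(y for _, y in joint.elements())
    mi = 0.0
    for (x, y), n in joint.items():
        p_xy = n / total
        p_x = marginal_x[x] / total
        p_y = marginal_y[y] / total
        mi += p_xy * math.log(p_xy / (p_x * p_y), 2)
    return mi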