# Example #1 (scraped page separator; score: 0)
def extract_and_serialize(txt_file,
                          xml_file,
                          out_file,
                          atom_type='paragraph',
                          cluster_method='kmeans',
                          k=2):
    '''
    Performs all of intrinsic (feature extraction, clustering etc.) and creates
    Passage objects for each passage in <txt_file>. Writes a CSV file out
    to <out_file> containing all the features of <txt_file>

    The CSV files can be read easily by R in order to create plots
    '''
    f = file(txt_file, 'r')
    text = f.read()
    f.close()

    util = IntrinsicUtility()

    feature_names = [
        'average_word_length', 'average_sentence_length',
        'stopword_percentage', 'punctuation_percentage',
        'syntactic_complexity', 'avg_internal_word_freq_class',
        'avg_external_word_freq_class'
    ]

    ext = FeatureExtractor(text)
    print 'Initialized extractor'
    # Note that passages don't know their ground truths yet
    passages = ext.get_passages(feature_names, atom_type)
    print 'Extracted passages'
    util.add_ground_truth_to_passages(passages, xml_file)

    feature_vecs = [p.features.values() for p in passages]

    # If just testing feature extraction, don't cluster passages
    if cluster_method != 'none':
        # Cluster the passages and set their confidences
        confidences = cluster(cluster_method, k, feature_vecs)
        for psg, conf in zip(passages, confidences):
            psg.set_plag_confidence(conf)

    f = file(out_file, 'wb')
    csv_writer = csv.writer(f)

    # Writes out the header for corresponding CSV
    csv_writer.writerow(IntrinsicPassage.serialization_header(feature_names))
    for p in passages:
        csv_writer.writerow(p.to_list(feature_names))
    f.close()
    print 'Finished writing', out_file
# Example #2 (scraped page separator; score: 0)
def extract_and_serialize(txt_file, xml_file, out_file, atom_type='paragraph',
                          cluster_method='kmeans', k=2):
    '''
    Performs all of intrinsic (feature extraction, clustering etc.) and creates
    Passage objects for each passage in <txt_file>. Writes a CSV file out
    to <out_file> containing all the features of <txt_file>

    The CSV files can be read easily by R in order to create plots
    '''
    f = file(txt_file, 'r')
    text = f.read()
    f.close()

    util = IntrinsicUtility() 

    feature_names = [
        'average_word_length',
        'average_sentence_length',
        'stopword_percentage',
        'punctuation_percentage',
        'syntactic_complexity',
        'avg_internal_word_freq_class',
        'avg_external_word_freq_class'
    ]
   

    ext = FeatureExtractor(text)
    print 'Initialized extractor'
    # Note that passages don't know their ground truths yet
    passages = ext.get_passages(feature_names, atom_type)
    print 'Extracted passages'
    util.add_ground_truth_to_passages(passages, xml_file)

    feature_vecs = [p.features.values() for p in passages]

    # If just testing feature extraction, don't cluster passages
    if cluster_method != 'none':
        # Cluster the passages and set their confidences
        confidences = cluster(cluster_method, k, feature_vecs)
        for psg, conf in zip(passages, confidences):
            psg.set_plag_confidence(conf)

    f = file(out_file, 'wb')
    csv_writer = csv.writer(f)

    # Writes out the header for corresponding CSV
    csv_writer.writerow(IntrinsicPassage.serialization_header(feature_names))
    for p in passages:
        csv_writer.writerow(p.to_list(feature_names))
    f.close()
    print 'Finished writing', out_file
def _default_stepwise_params():
    features = FeatureExtractor.get_all_feature_function_names()

    cluster_type = 'outlier'
    k = 2
    atom_type = 'nchars'
    n = 500
    first_doc_num = 0
    
    results = stepwise_feature_selection(features, cluster_type, k, atom_type, n, first_doc_num=first_doc_num)
    print results
    return results
def get_feature_sets():
    '''
    Returns a list containing every set of features we want to test. Since we want to test
    each feature individually, for example, we will return something like:
    [['feat1'], ['feat2'], ..., ['feat1', 'feat2']] 
    '''
    everything = FeatureExtractor.get_all_feature_function_names()
    # One singleton set per feature, then the full feature set as the last entry
    singletons = [[name] for name in everything]
    return singletons + [everything]
def compare_params():
    '''
    [('l1', 'auto', 0.59759576698869676, 'plagcomps/shared/../figures/roc1390881314.99.pdf'),
     ('l1', None, 0.60174204862821445, 'plagcomps/shared/../figures/roc1390881397.91.pdf'),
     ('l2', 'auto', 0.60095727893574291, 'plagcomps/shared/../figures/roc1390881480.62.pdf'),
     ('l2', None, 0.5977554082484301, 'plagcomps/shared/../figures/roc1390881563.36.pdf')
    ]

    '''
    features = FeatureExtractor.get_all_feature_function_names()
    features = [f for f in features if 'unigram' not in f and 'trigram' not in f]
    cluster_type = 'outlier'
    atom_type = 'paragraph' 
    start_doc = 0
    ntrain = 100
    ntest = 200

    # Process the test set once
    test_matrix, actuals = _get_feature_conf_and_actuals(features, cluster_type, atom_type, ntrain, ntest)

    # Options for Log regression
    regularization_options = ['l1', 'l2']
    class_weight_options = ['auto', None]

    results = []
    for regularization in regularization_options:
        for class_weight in class_weight_options:
            model = train(features, cluster_type, atom_type, ntrain, start_doc=start_doc, regularization=regularization, class_weight=class_weight)
            confidences = [x[1] for x in model.predict_proba(test_matrix)]
            path, auc = BaseUtility.draw_roc(actuals, confidences, combination='Using Combination')

            results.append((regularization, class_weight, auc, path))

            print results

    print results
    return results
# Example #6 (scraped page separator; score: 0)
# Make the local PyGene checkout importable before pulling from it.
sys.path.append("../PyGene/")
from pygene.prog import ProgOrganism
from pygene.population import Population

import sqlalchemy
from sqlalchemy.orm import sessionmaker
from sqlalchemy.ext.declarative import declarative_base

# NOTE(review): username, password and dbname are not defined in this chunk —
# presumably set earlier in the file or imported; verify before running.
url = "postgresql://%s:%s@%s" % (username, password, dbname)
engine = sqlalchemy.create_engine(url)
Base = declarative_base()
# NOTE(review): create_all() runs before any model classes subclass Base here,
# so at this point there are no tables for it to create — confirm intended.
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)
session = Session()

# Feature names and train/test document splits used as module-level globals.
features = FeatureExtractor.get_all_feature_function_names()
num_training = 50
num_testing = 500
starting_doc = 0
training_files = IntrinsicUtility().get_n_training_files(
    n=num_training, first_doc_num=starting_doc)
# Test documents start right after the training documents (no overlap).
test_files = IntrinsicUtility().get_n_training_files(
    n=num_testing, first_doc_num=starting_doc + num_training)
# Memoization caches keyed per document (populated elsewhere).
cached_reduced_docs = {}
cached_confidences = {}

# set base values for globals
atom_type, cluster_type = "paragraph", "kmeans"

# a tiny batch of functions
# Example #7 (scraped page separator; score: 0)
# Make the local PyGene checkout importable before pulling from it.
sys.path.append("../PyGene/")
from pygene.prog import ProgOrganism
from pygene.population import Population

import sqlalchemy
from sqlalchemy.orm import sessionmaker
from sqlalchemy.ext.declarative import declarative_base

# NOTE(review): username, password and dbname are not defined in this chunk —
# presumably set earlier in the file or imported; verify before running.
url = "postgresql://%s:%s@%s" % (username, password, dbname)
engine = sqlalchemy.create_engine(url)
Base = declarative_base()
# NOTE(review): create_all() runs before any model classes subclass Base here,
# so at this point there are no tables for it to create — confirm intended.
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)
session = Session()

# Feature names and train/test document splits used as module-level globals.
features = FeatureExtractor.get_all_feature_function_names()
num_training = 50 
num_testing = 500
starting_doc = 0
training_files = IntrinsicUtility().get_n_training_files(n=num_training, first_doc_num=starting_doc)
# Test documents start right after the training documents (no overlap).
test_files = IntrinsicUtility().get_n_training_files(n=num_testing, first_doc_num=starting_doc + num_training)
# Memoization caches keyed per document (populated elsewhere).
cached_reduced_docs = {}
cached_confidences = {}

# set base values for globals
atom_type, cluster_type = "paragraph", "kmeans"

# a tiny batch of functions
# NOTE(review): this definition is truncated — the try: below has no body or
# handler in this chunk; the rest of the function was lost in extraction and
# the block as shown is not syntactically valid. Do not rely on it.
def add(x,y):
    #print "add: x=%s y=%s" % (repr(x), repr(y))
    try:
def get_pairwise_results(atom_type, cluster_type, n, min_len, feature_set=None, cheating=False, write_output=False):
    '''
    Generates a table for the results of all feature pairs.
    '''
    all_features = FeatureExtractor.get_all_feature_function_names()
    if not feature_set:
        feature_set = list(itertools.combinations(all_features, 2))
        feature_set += [(x,x) for x in all_features]
    session = Session()

    values = []
    results = {}
    for feature_pair in feature_set:
        if feature_pair[0] == feature_pair[1]:
            feature_pair = [feature_pair[0]]
        trial = _get_latest_trial(atom_type, cluster_type, n, min_len, list(feature_pair), cheating, session)
        if trial:
            results[tuple(feature_pair)] = round(trial.auc, 4)
            values.append(trial.auc)
        else:
            results[tuple(feature_pair)] = "n/a"

    mean = numpy.array(values).mean()
    stdev = numpy.array(values).std()

    columns = all_features
    rows = all_features

    cells = []
    for feature_a in rows:
        row = []
        for feature_b in columns:
            if feature_a == feature_b:
                row.append(results[tuple([feature_a])])
            else:
                if (feature_a, feature_b) in results:
                    row.append(results[(feature_a, feature_b)])
                elif (feature_b, feature_a) in results:
                    row.append(results[(feature_b, feature_a)])
                else:
                    row.append('???')
        cells.append(row)

    # Is html table the best way to view it?
    html = '<html><head></head><body>'
    html += '<h1>Pairwise Feature Results</h1>'
    html += '<p>DASHBOARD_VERSION = ' + str(DASHBOARD_VERSION) + '</p>'
    html += '<p>cheating = ' + str(cheating) + '</p>'
    html += '<p>atom_type = ' + str(atom_type) + '</p>'
    html += '<p>cluster_type = ' + str(cluster_type) + '</p>'
    html += '<p>n >= ' + str(n) + '</p>'
    html += '<p>min_len = ' + str(min_len) + '</p>'
    html += '<p>auc mean = ' + str(round(mean, 4)) + ', stdev = ' + str(round(stdev, 4)) + '</p>'
    html += '<table border="1">'
    html += '<tr>'
    html += '<td></td>'
    for feature in columns:
        html += '<td style="font-size: 0.7em">' + feature + '</td>'
    html += '</tr>'
    for i, feature_a in enumerate(rows, 0):
        html += '<tr>'
        html += '<td>' + feature_a + '</td>'
        for j, feature_b in enumerate(columns, 0):
            # set bg color of table cell to help visualize good features
            if type(cells[i][j]) == float:
                val = cells[i][j]
                z_score = (val - mean) / stdev
                if z_score > 3:
                    bgcolor = '#00FF00'
                elif z_score > 2:
                    bgcolor = '#AAFFAA'
                elif z_score > 1:
                    bgcolor = '#DDFFDD'
                elif z_score > -1:
                    bgcolor = '#FFFFFF'
                elif z_score > -2:
                    bgcolor = '#FFDDDD'
                elif z_score > -3:
                    bgcolor = '#FFAAAA'
                else:
                    bgcolor = '#FF0000'
            else:
                bgcolor = '#888888'

            html += '<td style="background-color: ' + bgcolor + '">' + str(cells[i][j]) + '</td>'
        html += '</tr>'

    html += '</table></body></html>'
    
    if write_output:
        html_path = os.path.join(os.path.dirname(__file__), "../figures/dashboard_pairwise_table_"+str(DASHBOARD_VERSION)+"_"+str(time.time())+".html")
        with open(html_path, 'w') as f:
            f.write(html)
        print 'Saved pairwise feature table to ' + html_path

    return html
def all_k_sets_of_features(k=2):
    '''Return every size-k combination of feature names, each as a list.'''
    feature_names = FeatureExtractor.get_all_feature_function_names()
    return [list(group) for group in itertools.combinations(feature_names, k)]