def write_sample(sample_dict, outfile):
    '''
    Extracts features from a sample (or from the whole test set) and writes
    them out in .csv format under paths.PROCESSED.

    Args:
        sample_dict - None, or a dict like {filename: label} for every file
            in the sample. If None, runs the test set. Use a dict with the
            full training set to get training features.
        outfile - just the base, with no path or extension

    Writes:
        features in .csv format at paths.PROCESSED/<outfile>.csv

    Prints the elapsed wall-clock time in whole seconds when done.
    '''
    start = datetime.now()
    outpath = os.path.join(paths.PROCESSED, outfile + '.csv')

    if sample_dict is not None:
        sample = zip_io.generate_sample(sample_dict)
    else:
        sample = zip_io.generate_test()

    # (column name, CSS selector) pairs; each column holds the number of
    # elements on the page that match the selector.
    count_selectors = [
        ('tag_ct', '*'),
        ('head_tag_ct', 'head *'),
        ('body_tag_ct', 'body *'),
        ('head_script', 'head script'),
        ('body_script', 'body script'),
        ('head_style', 'head style'),
        ('body_style', 'body style'),
        ('head_link', 'head link'),
        ('body_link', 'body link'),
    ]

    # Column order in the .csv follows the order names are appended here.
    fieldnames = ['file', 'sponsored']
    fieldnames.extend(name for name, _ in count_selectors)
    fieldnames.extend(BARE_TAGS)
    # Multi-word feature specs become single underscore-joined column names.
    fieldnames.extend('_'.join(s.split()) for s in TAG_ATTR_VAL)
    fieldnames.extend('_'.join(s.split()) for s in TAG_ATTR)
    fieldnames.extend(TEXT_NAMES)
    fieldnames.extend(SCRIPT_FEATURES)
    fieldnames.extend('script_url_' + url for url in URL_FEATURES)
    fieldnames.extend('script_path_' + p for p in PATH_FEATURES)

    # NOTE(review): the Python 2 csv docs ask for binary mode ('wb') on
    # platforms where it matters; mode left unchanged here - confirm.
    with open(outpath, 'w') as f_out:
        writer = DictWriter(f_out, fieldnames=fieldnames)
        writer.writeheader()
        for page_tuple in sample:
            # Each sample item is indexed as (file, sponsored, page).
            page = page_tuple[2]
            row = {'file': page_tuple[0], 'sponsored': page_tuple[1]}
            for name, selector in count_selectors:
                row[name] = len(page.select(selector))
            # The helpers below fill in the remaining columns of row in place.
            add_bare_tags(row, page)
            add_tag_attr_vals(row, page)
            add_tag_attr(row, page)
            text_features(row, page)
            script_features(row, page)
            writer.writerow(row)

    # total_seconds() covers the whole interval; timedelta.seconds is only
    # the seconds component and silently drops whole days on long runs.
    elapsed = (datetime.now() - start).total_seconds()
    print('Elapsed time: %d sec.' % elapsed)
def test_features(outfile):
    '''
    Extracts features from the test set and hands them to write_features
    for output as .csv, suitable for loading as a Pandas data frame.

    Args:
        outfile - base name; '_test' is appended before it is passed on, so
            the features are expected at paths.PROCESSED/<outfile>_test.csv

    Writes:
        features in .csv format
    '''
    # NOTE(review): a later definition of test_features in this file rebinds
    # this name, so this version may be unreachable - confirm which is meant.
    test_pages = zip_io.generate_test()
    test_base = outfile + '_test'
    write_features(test_pages, test_base)
def test_features(outfile):
    '''
    Extracts features from the test set and hands them to write_features
    for output in LibSVM format.

    Args:
        outfile - base name; '.5' is appended before it is passed on, and
            the features are expected at paths.PROCESSED/<outfile>.libsvm

    Writes:
        features in LibSVM format
    '''
    # NOTE(review): an earlier definition of test_features exists in this
    # file; this later one replaces it at import time - confirm intended.
    test_pages = zip_io.generate_test()
    # The '.5' suffix lets the test set share a base name with the training
    # data: base.5 is the test set and base.0-4 are the training parts.
    test_base = outfile + '.5'
    write_features(test_pages, test_base)
def write_sample(sample_dict, outfile):
    '''
    Reads data from a sample, or all of the test set, extracts features, and
    writes the features out in .csv format at paths.PROCESSED.

    Args:
        sample_dict - None, or a dict like {filename: label} for every file
            in the sample. If None, runs the test set. Use a dict with the
            full training set to get training features.
        outfile - just the base, with no path or extension

    Writes:
        features in .csv format at paths.PROCESSED/<outfile>.csv

    Prints the elapsed wall-clock time in whole seconds when done.
    '''
    start = datetime.now()
    outpath = os.path.join(paths.PROCESSED, outfile + '.csv')

    if sample_dict is None:
        sample = zip_io.generate_test()
    else:
        sample = zip_io.generate_sample(sample_dict)

    # Column order in the .csv follows the order of this list.
    fieldnames = [
        'file', 'sponsored', 'tag_ct', 'head_tag_ct', 'body_tag_ct',
        'head_script', 'body_script', 'head_style', 'body_style',
        'head_link', 'body_link',
    ]
    fieldnames.extend(BARE_TAGS)
    # Multi-word feature specs become single underscore-joined column names.
    tag_attr_val_names = ['_'.join(s.split()) for s in TAG_ATTR_VAL]
    fieldnames.extend(tag_attr_val_names)
    tag_attr_names = ['_'.join(s.split()) for s in TAG_ATTR]
    fieldnames.extend(tag_attr_names)
    fieldnames.extend(TEXT_NAMES)
    fieldnames.extend(SCRIPT_FEATURES)
    script_url_names = ['script_url_' + url for url in URL_FEATURES]
    fieldnames.extend(script_url_names)
    script_path_names = ['script_path_' + p for p in PATH_FEATURES]
    fieldnames.extend(script_path_names)

    # NOTE(review): the Python 2 csv docs ask for binary mode ('wb') on
    # platforms where it matters; mode left unchanged here - confirm.
    with open(outpath, 'w') as f_out:
        writer = DictWriter(f_out, fieldnames=fieldnames)
        writer.writeheader()
        for page_tuple in sample:
            # Each sample item is indexed as (file, sponsored, page).
            page = page_tuple[2]
            row = {
                'file': page_tuple[0],
                'sponsored': page_tuple[1],
                # Structural counts: number of elements matching a selector.
                'tag_ct': len(page.select('*')),
                'head_tag_ct': len(page.select('head *')),
                'body_tag_ct': len(page.select('body *')),
                'head_script': len(page.select('head script')),
                'body_script': len(page.select('body script')),
                'head_style': len(page.select('head style')),
                'body_style': len(page.select('body style')),
                'head_link': len(page.select('head link')),
                'body_link': len(page.select('body link')),
            }
            # The helpers below fill in the remaining columns of row in place.
            add_bare_tags(row, page)
            add_tag_attr_vals(row, page)
            add_tag_attr(row, page)
            text_features(row, page)
            script_features(row, page)
            writer.writerow(row)

    finish = datetime.now()
    # timedelta.seconds is only the seconds component (days are dropped), so
    # use total_seconds() to report the full elapsed interval.
    print('Elapsed time: %d sec.' % (finish - start).total_seconds())