def predict_svm(self, example):
    '''
    Classify a single comment with the trained SVM pipeline.

    :param example: str (example comment)
    :return: str ('CONSTRUCTIVE' or 'NON-CONSTRUCTIVE')

    Description: The comment is wrapped in a one-row DataFrame, run through
    FeatureExtractor, and the resulting feature frame is passed to
    self.svm_pipeline. A predicted value of 0 maps to non-constructive; any
    other value maps to constructive.
    '''
    # One-row frame; '?' is a placeholder value in the 'constructive' column.
    frame = pd.DataFrame.from_dict({
        'pp_comment_text': [example],
        'constructive': ['?'],
    })
    print(frame)

    extractor = FeatureExtractor(frame)
    extractor.extract_features()
    features = extractor.get_features_df()

    # Only the first (and only) prediction is relevant.
    label = self.svm_pipeline.predict(features)[0]
    if label == 0:
        verdict = 'Non-constructive'
    else:
        verdict = 'Constructive'
    return verdict.upper()
def main():
    """Train the Siamese network on random subsets of the training data.

    Each epoch optionally checkpoints the model (when ``save_states`` is
    set), runs validation on every fifth epoch, draws a fresh random subset
    of ``batch_size * BATCHS_PER_ITER`` training samples and trains on it.
    A final checkpoint is written after the last epoch when ``save_states``
    is set.
    """
    if torch.cuda.is_available():
        device = torch.device('cuda:0')
    else:
        device = torch.device('cpu')

    if EXTRACT_FEATURES:
        FeatureExtractor(DATA_ROOT_DIR, SEQUENCE_SIZE).extract_features()

    # Training and validation datasets.
    train_data = TypeNetDataset(DATA_ROOT_DIR, ENROLLMENT_SEQUENCES, PARTS_TRAINING)
    val_data = TypeNetDataset(DATA_ROOT_DIR, ENROLLMENT_SEQUENCES, PARTS_VAL)
    val_loader = torch.utils.data.DataLoader(
        val_data, batch_size=batch_size, shuffle=True)

    # Test dataset (currently disabled):
    # test_dataset = TypeNetDatasetTest(DATA_ROOT_DIR, PARTS_TEST)
    # test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=1, shuffle=False)

    # Model and Adam optimizer.
    model = SiameseNet(input_size, hidden_size, num_layers).to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for epoch in range(num_epochs):
        if save_states:
            checkpoint_path = 'states/siamese_typenet_{:03}.pt'.format(epoch)
            torch.save(model.state_dict(), checkpoint_path)

        if epoch % 5 == 0:
            validation(model, device, val_loader, epoch)

        # Train on a freshly sampled subset each epoch.
        picked = sample(range(0, len(train_data)), batch_size * BATCHS_PER_ITER)
        epoch_loader = torch.utils.data.DataLoader(
            torch.utils.data.Subset(train_data, picked),
            batch_size=batch_size,
            shuffle=True)
        train(model, device, epoch_loader, epoch, optimizer)

    if save_states:
        torch.save(model.state_dict(), 'states/siamese_typenet_final.pt')
def extract_features(json_file):
    """Return ``(feature_list, class_id)`` for ``json_file``.

    The class id is the first character of the text that follows the last
    underscore in ``json_file``, converted to ``int``. The features come
    from ``FeatureExtractor(json_file).extract_features()``.
    """
    tail = json_file.rsplit('_', 1)[-1]
    class_id = int(tail[0])
    feature_list = FeatureExtractor(json_file).extract_features()
    return feature_list, class_id
class TestFeatureExtractor(unittest.TestCase):
    '''
    Unit tests for the FeatureExtractor class.

    Simple checks to ensure that the feature vector we get back has the
    right length and contains frequency data that makes sense. More tests
    should be added.
    '''

    def setUp(self):
        '''Builds the sequence record and the feature vector under test'''
        protein = Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF",
                      IUPAC.protein)
        self.record1 = SeqRecord(protein,
                                 id="YP_025292.1",
                                 name="HokC",
                                 description="toxic membrane protein, small")
        self.seq1 = self.record1.seq
        self.feature_extractor = FeatureExtractor()
        self.feature_vector1 = self.feature_extractor.extract_features(self.seq1)

    def test_feature_vector_length(self):
        '''The feature vector must contain exactly 400 elements'''
        actual_length = len(self.feature_vector1)
        self.assertEqual(actual_length, 400, msg="Feature vector not 400 long")

    def test_dipeptide_frequency_sum(self):
        '''The first 400 entries (dipeptide frequencies) must sum to 1'''
        total = sum((self.feature_vector1[k] for k in range(400)), 0.0)
        self.assertAlmostEqual(total, 1.0, places=5,
                               msg="Frequencies don't sum to 1")
def preprocessing(params):
    """Filter the logs, build the component template and extract features.

    ``params`` is handed to ``Filter``; its ``'filename'`` entry is used as
    the techdump filename for feature extraction.

    Returns a tuple ``(component_template, extracted_features)``.
    """
    # Filter the logs and keep the settings for the later stages.
    log_filter = Filter(params)
    settings = log_filter.get_filter_settings()
    log_filter.filter_logs()

    # If the template file is not available, generate the template.
    # The filter settings are needed to get the svn branch path for the
    # particular version. This currently accesses the svn server directly.
    # TODO: have a local back-up copy in case the svn server is down?
    component_template = Templatizer(filter_settings=settings).gen_template()

    # Extract features from the log, using the template.
    feature_extractor = FeatureExtractor(
        component_template=component_template,
        techdump_filename=params['filename'],
        filter_settings=settings,
    )
    features = feature_extractor.extract_features()
    return component_template, features
def extract_results(self):
    """Extract log features for every test-result directory in a numbered range.

    The last path component of ``self.start_path`` must have the form
    ``<prefix>-<start index>``; ``self.end_path`` supplies the end index in
    the same way. For each index in the inclusive range the result
    directory is parsed with ``self.parse_results`` and, for every test
    label that is a directory, the logs are filtered, a template is
    generated and features are extracted.

    Returns ``(component_template, features_template,
    overall_extracted_log_features, overall_test_case_labels)``. The two
    templates are the ones produced for the last processed test (empty
    dicts if none was processed); the two lists hold one entry per index.
    """
    overall_extracted_log_features = []
    overall_test_case_labels = []
    component_template = {}
    features_template = {}

    name_parts = os.path.basename(self.start_path).split("-")
    if len(name_parts) != 2:
        logger.error(
            "Invalid start path provided. Needs to contain '-' in last directory %s"
            % self.start_path)
        return (component_template, features_template,
                overall_extracted_log_features, overall_test_case_labels)

    first = int(name_parts[1])
    last = int(os.path.basename(self.end_path).split("-")[1])
    prefix = os.path.join(os.path.dirname(self.start_path), name_parts[0]) + "-"

    for index in range(first, last + 1):
        test_result = prefix + str(index)
        acp_version, test_case_labels = self.parse_results(test_result)
        overall_test_case_labels.append(test_case_labels)

        features_by_test = {}
        for test in test_case_labels:
            test_path = os.path.join(test_result, test)
            if not os.path.isdir(test_path):
                continue

            test_result_dir = os.path.join(self.mode,
                                           os.path.basename(test_result))
            techdump_rel_path = os.path.join(test_result_dir, test)

            log_filter = Filter(
                {
                    "channel_number": self.channel_number,
                    "src_txt_result_dir_path": test_path,
                    "techdump_name": test,
                    "test_result_dir": test_result_dir
                },
                filter_type="txt_result")
            filter_settings = log_filter.get_filter_settings()
            log_filter.filter_logs()

            component_template, features_template = Templatizer(
                acp_version=acp_version).gen_template()

            # Extract features from the log, using the template.
            extractor = FeatureExtractor(
                component_template=component_template,
                techdump_filename=techdump_rel_path,
                filter_settings=filter_settings)
            features_by_test[test] = extractor.extract_features()

        overall_extracted_log_features.append(features_by_test)

    return (component_template, features_template,
            overall_extracted_log_features, overall_test_case_labels)
def train_dataset_parser(train_path):
    """Extract features and multi-hot labels for every image in ``train_path``.

    Labels are parsed from the file names: every run of characters matched
    by ``[\\d']+`` is converted to ``int``; the first number is skipped and
    the remaining ones are the image's labels.
    NOTE(review): the pattern also matches apostrophes, which ``int`` would
    reject -- confirm file names never contain them.

    Side effect: writes ``../dataset/features_train.csv``. The first row is
    the label vocabulary; each following row is the image's features
    followed by its multi-hot label vector.

    :param train_path: directory containing the training images
    :return: ``(data, binary_labels, bag_of_words)`` -- the per-image
        feature vectors, the per-image multi-hot label vectors, and the
        label vocabulary (column order of the label vectors)
    """
    fe = FeatureExtractor()
    bag_of_words = set()
    images_labels = list()

    # Extract labels and create bag of words
    for image_name in sorted(os.listdir(train_path)):
        token_list = [int(l) for l in re.findall(r"[\d']+", image_name)]
        images_labels.append((image_name, token_list[1:]))
        bag_of_words = bag_of_words.union(set(token_list[1:]))

    bag_of_words = list(bag_of_words)
    # Label -> column lookup built once, instead of a linear list.index()
    # scan for every label of every image.
    label_column = {label: col for col, label in enumerate(bag_of_words)}

    binary_labels = list()
    data = list()

    with open('../dataset/features_train.csv', 'w') as f:
        # Write bag of words to the file
        f.write('{0}\n'.format(','.join([str(x) for x in bag_of_words])))

        for image_name, labels in images_labels:
            binary_vector = [0] * len(bag_of_words)
            for label in labels:
                binary_vector[label_column[label]] = 1
            binary_labels.append(binary_vector)

            # Extract features
            spatial_features = fe.extract_features(os.path.join(train_path, image_name))
            data.append(spatial_features)

            # Write the extracted features to the file
            f.write('{0},{1}\n'.format(
                ','.join([str(x) for x in spatial_features]),
                ','.join([str(x) for x in binary_vector])
            ))

    return data, binary_labels, bag_of_words
def test_dataset_parser(test_path):
    """Extract features for every image in ``test_path``.

    The image id is the part of the file name before its first dot.

    Side effect: writes ``../dataset/features_test.csv`` with one row per
    image: the image id followed by its features.

    :param test_path: directory containing the test images
    :return: ``(data, image_ids)`` -- per-image feature vectors and the
        matching image ids, in ``os.listdir`` order
    """
    fe = FeatureExtractor()
    data = []
    image_ids = []

    with open('../dataset/features_test.csv', 'w') as out:
        for file_name in os.listdir(test_path):
            # Everything before the first '.' identifies the image.
            image_id = file_name.partition('.')[0]
            image_ids.append(image_id)

            features = fe.extract_features(os.path.join(test_path, file_name))
            data.append(features)

            row = ','.join(map(str, features))
            out.write('{0},{1}\n'.format(image_id, row))

    return data, image_ids
class OdometryEstimator:
    """Estimate motion between consecutive point clouds from feature matches.

    Each call to ``append_pcd`` extracts sharp/flat feature points from the
    new cloud, matches them against the "less sharp"/"less flat" points kept
    from the previous call, and passes the correspondences to
    ``LOAMOptimizer``. Point arrays carry a scan id in column 3 (the code
    names it ``scan_id``); columns 0-2 are used as coordinates.
    """

    # Upper bound applied to squared distances when accepting a match.
    DISTANCE_SQ_THRESHOLD = 1
    # Maximum scan-id difference searched around the closest point.
    SCAN_VICINITY = 2.5

    def __init__(self):
        self.extractor = FeatureExtractor()
        # False until the first cloud has been processed.
        self.inited = False
        # Feature points of the previous cloud, used as matching targets.
        self.last_less_sharp_points = None
        self.last_less_flat_points = None
        # Accumulated pose as a 4x4 matrix, starting at identity.
        self.last_position = np.eye(4)

    def append_pcd(self, pcd):
        """Process a new cloud and return the estimated transform.

        ``pcd`` is indexed as ``pcd[0]``, ``pcd[1]``, ``pcd[2]``; all three
        are passed to the feature extractor and ``pcd[0]`` is used as the
        point array for the debug geometry.

        The first call only stores the features and uses a zero 6-vector
        as ``T``. Later calls build edge and surface correspondences
        against the stored features and optimize ``T``.

        Returns ``(SE3(T).T(), last_less_flat_points, last_less_flat_points)``.
        NOTE(review): the flat points are returned twice; the second element
        may have been meant to be ``last_less_sharp_points`` -- confirm
        against callers.
        """
        sharp_points, less_sharp_points, flat_points, less_flat_points = self.extractor.extract_features(
            pcd[0], pcd[1], pcd[2])
        T = None
        if not self.inited:
            self.inited = True
            T = np.zeros(6)
        else:
            edge_corresp = self.find_edge_correspondences(sharp_points)
            surface_corresp = self.find_surface_correspondences(
                flat_points, pcd)
            optimizer = LOAMOptimizer(edge_corresp, surface_corresp)
            T = optimizer.optimize()
            # The geometry built below is only consumed by the disabled
            # draw_geometries call; note that it rebinds the ``pcd`` argument.
            import utils
            surf = np.vstack(
                (surface_corresp[1], surface_corresp[2], surface_corresp[3]))
            keypoints = utils.get_pcd_from_numpy(surf)
            keypoints.paint_uniform_color([0, 1, 0])
            pcd = utils.get_pcd_from_numpy(
                mrob.geometry.SE3(T).transform_array(pcd[0]))
            pcd.paint_uniform_color([0, 0, 1])
            orig = utils.get_pcd_from_numpy(surface_corresp[0])
            orig.paint_uniform_color([1, 0, 0])
            # o3d.visualization.draw_geometries([pcd, keypoints, orig])
        self.last_less_sharp_points = np.vstack(less_sharp_points)
        # Column 3 is stored (scaled by 1/64) in the colour channels before
        # voxel down-sampling and read back from the first colour channel
        # afterwards -- presumably so it is carried through the down-sampling.
        x = get_pcd_from_numpy(np.vstack(less_flat_points))
        y = np.vstack(less_flat_points)[:, 3].reshape((-1, 1)) / 64
        x.colors = o3d.utility.Vector3dVector(np.hstack((y, y, y)))
        x = x.voxel_down_sample(0.1)
        self.last_less_flat_points = np.hstack(
            (np.asarray(x.points), 64 * np.asarray(x.colors)[:, 0].reshape(
                (-1, 1))))
        # Re-sort by column 3: the neighbour searches below walk the array
        # in index order and stop once that column leaves the vicinity.
        scan_ids = self.last_less_flat_points[:, 3]
        sorted_ind = np.argsort(scan_ids, kind='stable')
        self.last_less_flat_points = self.last_less_flat_points[sorted_ind]
        self.last_position = mrob.geometry.SE3(T).T() @ self.last_position
        return mrob.geometry.SE3(
            T).T(), self.last_less_flat_points, self.last_less_flat_points

    def find_edge_correspondences(self, sharp_points):
        """Match each sharp point to two stored "less sharp" points.

        For every sharp point the nearest stored point is looked up; if it
        is within ``DISTANCE_SQ_THRESHOLD``, a second point is searched on a
        different scan id within ``SCAN_VICINITY`` of the first one. Only
        points for which both were found are kept.

        Returns ``(edge_points, edge_1, edge_2)``, each restricted to the
        first three columns.
        NOTE(review): ``np.vstack`` raises ``ValueError`` on an empty list,
        so this fails when no correspondence is found -- confirm whether
        that can happen in practice.
        """
        corners_cnt = len(sharp_points)
        edge_points = []
        edge_1 = []
        edge_2 = []
        less_sharp_points_tree = o3d.geometry.KDTreeFlann(
            get_pcd_from_numpy(self.last_less_sharp_points))
        for i in range(corners_cnt):
            point_sel = sharp_points[i]
            _, idx, dist = less_sharp_points_tree.search_knn_vector_3d(
                point_sel[:3], 1)
            min_point_ind_2 = -1
            if dist[0] < self.DISTANCE_SQ_THRESHOLD:
                closest_point_ind = idx[0]
                min_point_sq_dist_2 = self.DISTANCE_SQ_THRESHOLD
                closest_point_scan_id = self.last_less_sharp_points[
                    closest_point_ind][3]
                # Per-row dot product of the offsets, used below as the
                # squared distance from point_sel to every stored point.
                dist_to_sel_point = matrix_dot_product(
                    (self.last_less_sharp_points[:, :3] - point_sel[:3]),
                    (self.last_less_sharp_points[:, :3] - point_sel[:3]))
                # Search towards higher indices: skip the same (or lower)
                # scan id, stop once outside the vicinity.
                for j in range(closest_point_ind + 1,
                               len(self.last_less_sharp_points)):
                    if self.last_less_sharp_points[j][
                            3] <= closest_point_scan_id:
                        continue
                    if self.last_less_sharp_points[j][
                            3] > closest_point_scan_id + self.SCAN_VICINITY:
                        break
                    point_sq_dist = dist_to_sel_point[j]
                    if point_sq_dist < min_point_sq_dist_2:
                        min_point_sq_dist_2 = point_sq_dist
                        min_point_ind_2 = j
                # Same search towards lower indices.
                for j in range(closest_point_ind - 1, -1, -1):
                    if self.last_less_sharp_points[j][
                            3] >= closest_point_scan_id:
                        continue
                    if self.last_less_sharp_points[j][
                            3] < closest_point_scan_id - self.SCAN_VICINITY:
                        break
                    point_sq_dist = dist_to_sel_point[j]
                    if point_sq_dist < min_point_sq_dist_2:
                        min_point_sq_dist_2 = point_sq_dist
                        min_point_ind_2 = j
            # min_point_ind_2 stays -1 unless a second point was found above.
            if min_point_ind_2 >= 0:
                edge_points.append(point_sel)
                edge_1.append(
                    self.last_less_sharp_points[closest_point_ind])
                edge_2.append(self.last_less_sharp_points[min_point_ind_2])
        edge_points = np.vstack(edge_points)[:, :3]
        edge_1 = np.vstack(edge_1)[:, :3]
        edge_2 = np.vstack(edge_2)[:, :3]
        return edge_points, edge_1, edge_2

    def find_surface_correspondences(self, flat_points, pcd):
        """Match each flat point to three stored "less flat" points.

        For every flat point the nearest stored point is looked up; if it is
        within ``DISTANCE_SQ_THRESHOLD``, two more points are searched within
        ``SCAN_VICINITY``: index 2 on the same-or-earlier side of the scan-id
        comparison and index 3 on the other side (see the loop conditions).
        Only points for which both were found are kept, and the result is
        further restricted to rows whose first match has column 3 > 0.

        Returns ``(surface_points, surface_1, surface_2, surface_3)``, each
        restricted to the first three columns.
        NOTE(review): ``o3d.visualization.draw_geometries`` is called on
        every invocation, which presumably blocks until the window is
        closed -- looks like leftover debugging; confirm.
        NOTE(review): ``np.vstack`` raises ``ValueError`` when no
        correspondence was found.
        """
        surface_cnt = len(flat_points)
        print('Surface count: ', surface_cnt)
        surface_points = []
        surface_1 = []
        surface_2 = []
        surface_3 = []
        less_flat_points_tree = o3d.geometry.KDTreeFlann(
            get_pcd_from_numpy(self.last_less_flat_points))
        for i in range(surface_cnt):
            point_sel = flat_points[i]
            _, idx, dist = less_flat_points_tree.search_knn_vector_3d(
                point_sel[:3], 1)
            min_point_ind_2 = -1
            min_point_ind_3 = -1
            dist_to_sel_point = matrix_dot_product(
                (self.last_less_flat_points[:, :3] - point_sel[:3]),
                (self.last_less_flat_points[:, :3] - point_sel[:3]))
            closest_point_ind = idx[0]
            # The distance returned by the tree is discarded and recomputed
            # from the stored array.
            v = self.last_less_flat_points[
                closest_point_ind][:3] - point_sel[:3]
            dist = np.dot(v, v)
            if dist < self.DISTANCE_SQ_THRESHOLD:
                closest_point_scan_id = self.last_less_flat_points[
                    closest_point_ind][3]
                min_point_sq_dist_2 = self.DISTANCE_SQ_THRESHOLD
                min_point_sq_dist_3 = self.DISTANCE_SQ_THRESHOLD
                # Search towards higher indices until outside the vicinity.
                for j in range(closest_point_ind + 1,
                               len(self.last_less_flat_points)):
                    if self.last_less_flat_points[j][
                            3] > closest_point_scan_id + self.SCAN_VICINITY:
                        break
                    point_sq_dist = dist_to_sel_point[j]
                    if self.last_less_flat_points[j][3] <= closest_point_scan_id \
                            and point_sq_dist < min_point_sq_dist_2:
                        min_point_sq_dist_2 = point_sq_dist
                        min_point_ind_2 = j
                    elif self.last_less_flat_points[j][3] > closest_point_scan_id \
                            and point_sq_dist < min_point_sq_dist_3:
                        min_point_sq_dist_3 = point_sq_dist
                        min_point_ind_3 = j
                # Search towards lower indices until outside the vicinity.
                for j in range(closest_point_ind - 1, -1, -1):
                    if self.last_less_flat_points[j][
                            3] < closest_point_scan_id - self.SCAN_VICINITY:
                        break
                    point_sq_dist = dist_to_sel_point[j]
                    if self.last_less_flat_points[j][3] >= closest_point_scan_id \
                            and point_sq_dist < min_point_sq_dist_2:
                        min_point_sq_dist_2 = point_sq_dist
                        min_point_ind_2 = j
                    elif self.last_less_flat_points[j][3] < closest_point_scan_id \
                            and point_sq_dist < min_point_sq_dist_3:
                        min_point_sq_dist_3 = point_sq_dist
                        min_point_ind_3 = j
            # Both indices stay -1 unless the searches above found a match.
            if min_point_ind_2 >= 0 and min_point_ind_3 >= 0:
                surface_points.append(point_sel)
                surface_1.append(
                    self.last_less_flat_points[closest_point_ind])
                surface_2.append(
                    self.last_less_flat_points[min_point_ind_2])
                surface_3.append(
                    self.last_less_flat_points[min_point_ind_3])
        surface_points = np.vstack(surface_points)
        surface_1 = np.vstack(surface_1)
        surface_2 = np.vstack(surface_2)
        surface_3 = np.vstack(surface_3)
        print('output: ', surface_points.shape[0])
        import utils
        # import open3d as 0o3d
        # Keep only rows whose first match has a positive value in column 3.
        ind = surface_1[:, 3] > 0
        surf = np.vstack((surface_1[ind], surface_2[ind], surface_3[ind]))
        keypoints = utils.get_pcd_from_numpy(surf)
        keypoints.paint_uniform_color([0, 1, 0])
        pcd = utils.get_pcd_from_numpy(pcd[0])
        pcd.paint_uniform_color([0, 0, 1])
        orig = utils.get_pcd_from_numpy(surface_points[ind])
        orig.paint_uniform_color([1, 0, 0])
        o3d.visualization.draw_geometries([pcd, keypoints, orig])
        return surface_points[ind][:, :3], surface_1[ind][:, :3], surface_2[
            ind][:, :3], surface_3[ind][:, :3]
print( 'Loaded ' + str(len(test_images_filenames)) + ' testing images filenames with classes ', set(test_labels)) # Load precomputed labels if avaliable precomp_label_filename = classifier + '_' + feature_method + '.npy' if os.path.isfile(precomp_label_filename) and not force_reload: print 'Loading previous predictions' predicted_classes = np.load(precomp_label_filename) else: start = time.time() print 'Extracting features' fe = FeatureExtractor(feature_method) (X, y) = fe.extract_features(train_images_filenames, train_labels, nimmax=30) print 'Training a classifier' c = Classifier(classifier) c.fit(X, y) print 'Predicting test set labels with the classifier' numtestimages = 0 predicted_classes = [] for i in range(len(test_images_filenames)): imfilename = test_images_filenames[i] des = fe.extract_single_image_features(imfilename) predictedclass = c.predict(des) predicted_classes.append(predictedclass) print('image ' + imfilename + ' was from class ' + test_labels[i] +
class MultiReader(DataLoader): def __init__(self, output_width=11, training_frac=70.0, validation_frac=15.0, debug=False): self.input_width = 400 self.output_width = output_width self.training_frac = training_frac self.validation_frac = validation_frac self.debug = debug # self.dir = "/home/jlawson/Dropbox/ProteinFunctionData/" # Where the files live. self.names = [ # Names of all of the files. "baseplate_3370", "collar_1385", "htj_2258_nofg", "major_tail_1512", "mcp_3589", "minor_capsid_1500_nofg", "minor_tail_2033", "portal_2141", "tail_fiber_3007", "tail_sheath_2350", ] self.feature_extractor = FeatureExtractor() def load_data(self, source): """Load the data from a directory with a collection of source files, one file for each kind of protein. Returns an array of pairs in the form: [(train_set_in, train_set_out), (validation_set_in, validation_set_out), (test_set_in, test_set_out)] :type source: String :param source: The directory where the source files are located. """ dir = source raw_data = list() unsupporteds = list() for i in range(0, len(self.names)): num_in_file = 0 if self.debug: print (dir + self.names[i] + ".faa") handle = open(dir + self.names[i] + ".faa", "rU") # Open a file. for record in SeqIO.parse(handle, "fasta"): num_in_file += 1 try: # print " " + record.id feature_vector = self.feature_extractor.extract_features(record) # Now we have to augment the feature vector with the output # vector. So we: # 1) Make a new array a bit longer than the feature vector, # 2) Copy the feature vector into the first cells of the new array, # 3) Find the appropriate cell in the tail of the new array # and set that one equal to 1. prepared_data_record = numpy.zeros(len(feature_vector) + self.output_width) for col in range(0, len(feature_vector)): # This surely could be done more efficiently. prepared_data_record[col] = feature_vector[col] # Doesn't matter for now. 
prepared_data_record[ len(feature_vector) + i ] = 1 # The class of the protein is taken from the order of the files in the list "names" raw_data.append(prepared_data_record) except KeyError: if self.debug: print " Unsupported sequence: " + record.id + " " + str(record.annotations) unsupporteds.append(record) pass handle.close() if self.debug: print "Total in file " + self.names[i] + " = " + str(num_in_file) # Now we are done reading all of the data in. In debug mode, print some # overall summary information. if self.debug: print "Supported Sequences = " + str(len(raw_data)) print "Unsupported Sequences = " + str(len(unsupporteds)) num_examples = len(raw_data) # But the labeled data we have is not randomly ordered. It is sorted # by class. We need to shuffle it up or we will only train on the first # classes. if self.debug: print "Shuffling data to randomize for training" shuffle = self.rand_perm(num_examples) data = numpy.ndarray((num_examples, self.input_width + self.output_width), float) for n in range(0, num_examples): for w in range(0, self.input_width + self.output_width): s = raw_data[shuffle[n]][w] data[n, w] = float(s) if self.debug: print "Finished shuffling data" print "Processing data to cull outliers" data = self.preprocess(self.cull(data)) num_examples = len(data) print "Data shape = ", data.shape, " num_examples=", num_examples inputs = numpy.array(data)[:, 0 : self.input_width] outputs_full = numpy.array(data)[:, self.input_width : self.input_width + self.output_width] if self.debug: print "Finished culling outliers" print inputs.shape print outputs_full.shape outputs = numpy.ndarray((num_examples,), int) for n in range(0, num_examples): found_class = False for w in range(0, self.output_width): if outputs_full[n, w] > 0.5: outputs[n] = w found_class = True break num_training_cases = self.num_training(num_examples) num_validation_cases = self.num_validation(num_examples) num_test_cases = self.num_test(num_examples) print num_training_cases, " ", 
num_validation_cases, " ", num_test_cases training_set = (inputs[0:num_training_cases, :], outputs[0:num_training_cases]) validation_set = ( inputs[num_training_cases : num_training_cases + num_validation_cases, :], outputs[num_training_cases : num_training_cases + num_validation_cases], ) test_set = ( inputs[num_training_cases + num_validation_cases :, :], outputs[num_training_cases + num_validation_cases :], ) training_set_x, training_set_y = theanoutil.shared_dataset(training_set) validation_set_x, validation_set_y = theanoutil.shared_dataset(validation_set) test_set_x, test_set_y = theanoutil.shared_dataset(test_set) if self.debug: print "TYPE of test_set_x =", type(test_set_x) print "TYPE of test_set=", type(test_set), " SIZE of test_set=", len(test_set) print "TYPE of test_set[0]=", type(test_set[0]), " SHAPE of test_set[0]=", test_set[0].shape print "TYPE of test_set[1]=", type(test_set[1]), " SHAPE of test_set[1]=", test_set[1].shape print "VALUE of training_set[0,0,0]=", training_set[0][0, 0] print "VALUE of training_set[1,0]=", training_set[1][0], " test_set[1,0]=", test_set[1][0] rval = [(training_set_x, training_set_y), (validation_set_x, validation_set_y), (test_set_x, test_set_y)] return rval # Everything from here down should be turned into a base class. def num_training(self, num_examples): return num_examples * (self.training_frac / 100.0) def num_validation(self, num_examples): return num_examples * (self.validation_frac / 100.0) def num_test(self, num_examples): return num_examples - (self.num_training(num_examples) + self.num_validation(num_examples)) def rand_perm(self, length): # In debug mode, we want to have a repeatable random number seed so # that we can have a repeatable shuffling. 
if self.debug: seed(1) shuffle = numpy.ndarray((length,), int) for n in range(0, length): shuffle[n] = n for n in range(0, length): swap_cell = randint(0, length - 1) temp = shuffle[swap_cell] shuffle[swap_cell] = shuffle[n] shuffle[n] = temp return shuffle def cull(self, data): # Make a list of all row numbers that need to get culled from the data. cull_list = [] for n in range(0, len(data)): if self.prune(data[n]): cull_list.append(n) cull_list.append(len(data)) # A sentinel at the end of the cull list. # Make a new array that doesn't have the culled items in it. # The 1+ is for the sentinel. new_data = numpy.ndarray((1 + len(data) - len(cull_list), self.input_width + self.output_width), float) next_cull_index = 0 next_data_index = 0 for n in range(0, len(data)): if n == cull_list[next_cull_index]: next_cull_index += 1 else: new_data[next_data_index] = data[n] next_data_index += 1 print "Number culled = ", len(cull_list) - 1 return new_data def prune(self, example): sum = 0.0 for n in range(0, self.input_width): if example[n] < 0.0: return True if example[n] > 1.0: return True sum += example[n] if sum > 1.01: return True if sum < 0.99: return True return False def preprocess(self, data): n = self.input_width for r in range(0, len(data)): sum_x = 0.0 sum_x2 = 0.0 for c in range(0, n): sum_x += data[r, c] sum_x2 += data[r, c] * data[r, c] mu = sum_x / n std = math.sqrt((sum_x2 - (sum_x * sum_x) / n) / n) # Population std for c in range(0, n): z = (data[r, c] - mu) / std # squashed_z = sigma(z) data[r, c] = z if r % 1000 == 0: print "Preprocessed row ", r return data
def main(args):
    """Build an XGBoost dataset of (query node, candidate parent) pairs.

    Reads the graph dataset at ``args.data``, selects the node split named
    by ``args.mode`` ("train", "validation", anything else means test),
    and for every non-root query node emits one positive row per true
    parent plus ``args.neg`` sampled negatives per positive. The resulting
    ``xgb.DMatrix`` is saved to ``args.output``.
    """
    graph_dataset = MAGDataset(name="", path=args.data, raw=False)
    node_features = graph_dataset.g_full.ndata['x']
    node_features = F.normalize(node_features, p=2, dim=1)
    vocab = graph_dataset.vocab
    full_graph = graph_dataset.g_full.to_networkx()

    # Embeddings are keyed by the node index rendered as a string.
    kv = KeyedVectors(vector_size=node_features.shape[1])
    keys = [str(i) for i in range(len(vocab))]
    kv.add(keys, node_features.numpy())

    # Choose the query nodes and the nodes kept in the working graph.
    mode = args.mode
    if mode == "train":
        node_list = graph_dataset.train_node_ids
        kept_nodes = graph_dataset.train_node_ids
    elif mode == "validation":
        node_list = graph_dataset.validation_node_ids
        kept_nodes = graph_dataset.train_node_ids + graph_dataset.validation_node_ids
    else:
        node_list = graph_dataset.test_node_ids
        kept_nodes = graph_dataset.train_node_ids + graph_dataset.test_node_ids
    graph = full_graph.subgraph(kept_nodes).copy()

    # Roots (no incoming edge) are never used as query nodes.
    roots = [node for node in graph.nodes() if graph.in_degree(node) == 0]
    interested_node_set = set(node_list) - set(roots)
    node_list = list(interested_node_set)
    nq = NegativeQueue(node_list.copy() * 5)

    node2parents = {}  # list of correct parent positions
    node2masks = {}  # list of positions that should not be chosen as negative positions
    for node in tqdm(graph.nodes(), desc="generating intermediate data ..."):
        parents = [edge[0] for edge in graph.in_edges(node)]
        node2parents[node] = parents
        if node not in interested_node_set:
            continue
        descendants = nx.descendants(graph, node)
        node2masks[node] = set(list(descendants) + parents + [node] + roots)

    edge_to_remove = []
    if mode == "validation":
        for node in graph_dataset.validation_node_ids:
            edge_to_remove.extend(list(graph.in_edges(node)))
        print(
            f"Remove {len(edge_to_remove)} edges between validation nodes and training nodes"
        )
    graph.remove_edges_from(edge_to_remove)
    print("=== Finish data loading ===\n")

    feature_extractor = FeatureExtractor(graph, kv)
    NEGATIVE_RATIO = args.neg
    featMat = []
    labels = []
    for query_node in tqdm(node_list):
        # One positive row per true parent.
        num_positives = 0
        for positive_parent in node2parents[query_node]:
            row = feature_extractor.extract_features(query_node, positive_parent)
            featMat.append(row)
            labels.append(1)
            num_positives += 1

        # NEGATIVE_RATIO negatives per positive, avoiding masked positions.
        negatives = nq.sample_avoid_positive_set(
            node2masks[query_node], NEGATIVE_RATIO * num_positives)
        for negative_parent in negatives:
            row = feature_extractor.extract_features(query_node, negative_parent)
            featMat.append(row)
            labels.append(0)

    data = xgb.DMatrix(np.array(featMat), label=np.array(labels), missing=-999.0)
    data.save_binary(args.output)
name + ".entity") link_path = os.path.join(DATA_ROOT, "train", "link", name + ".link") atts_path = os.path.join(DATA_ROOT, "train", "atts", name + ".atts") with open(ing_path) as ing, \ open(ins_path) as ins, \ open(link_path) as link, \ open(atts_path) as atts: recipe = PreprocessedRecipe(ing, ins, link, atts) samples_in_recipe = generate_samples(recipe) for label, pair in samples_in_recipe: sample_labels.append(label) sample_pairs.append(pair) sample_recipes.append(recipe) # Extract Features features = extractor.extract_features(sample_pairs, sample_recipes) # Training cls = LinearSVMClassifier() cls.train(sample_labels, features) # Dev Testing dev_labels = [] dev_pairs = [] dev_recipes = [] for name in dev_names: ing_path = os.path.join(DATA_ROOT, "dev", "ing_entity", name + ".ient") ins_path = os.path.join(DATA_ROOT, "dev", "instruct_entity", name + ".entity") link_path = os.path.join(DATA_ROOT, "dev", "link", name + ".link") atts_path = os.path.join(DATA_ROOT, "dev", "atts", name + ".atts")
def set_data(self, datafile, ratingsfile):
    """Load features, info and targets from the given files.

    Sets ``self.features`` and ``self.info`` from a ``FeatureExtractor``
    built on ``datafile``, rebuilds the lookup table, then sets
    ``self.targets`` from ``self.info`` and ``ratingsfile``.
    """
    extractor = FeatureExtractor(datafile)
    self.features = extractor.extract_features()
    self.info = extractor.extract_info()

    # The lookup table is rebuilt before the targets are created.
    self.make_lookup_table()

    targets = create_targets(self.info, ratingsfile)
    self.targets = targets
help='path to the positive corpus file') arg_parser.add_argument('corpus_file_neg', help='path to the negative corpus file') arg_parser.add_argument('output_file', help='path to the output file') args = arg_parser.parse_args() print('\n - Autosarkasmus Baseline Feature Extraction (Simplified) -\n') # feature setup print('setting up features...') features, feature_order = setup_features() print('setting up feature extractor...') feature_extractor = FeatureExtractor(features, feature_order) # ARFF document setup arff_doc = ARFFDocument('Sarkasmuserkennung', features, feature_order) # the magic tweets_ext = feature_extractor.extract_features(args.corpus_file_pos, args.corpus_file_neg, verbose=True) # generate final ARFF document print('generating ARFF document...') for tweet_ext in tweets_ext: arff_doc.add_data(tweet_ext) arff_doc.generate_document(args.output_file) print('\n - extracted features from ' + str(len(arff_doc.data)) + ' tweets -\n')
def start_processing(cls, path_to_base_folder):
    """Run the whole pipeline: data selection, both feature extractors, models.

    Each stage is skipped when its output CSV already exists in the current
    working directory. The function returns early (after printing a message)
    when the book descriptor file or the book folder is missing under
    ``path_to_base_folder``.

    The Java stage is launched through ``command.bat``; a non-zero exit
    status is reported but does not stop the pipeline.

    :param path_to_base_folder: folder containing the book descriptor file
        and the book repository
    """

    def prop_path(base, key):
        # Path of the file or folder named by a property, under ``base``.
        return os.path.join(base, Utilities.get_prop_value(key))

    start = time()
    Utilities.prepare_properties_dictionary()

    if not os.path.exists(
            prop_path(path_to_base_folder, Utilities.BOOK_DESCRIPTOR_KEY)):
        print("Please provide the book descriptor file")
        return
    if not os.path.exists(
            prop_path(path_to_base_folder, Utilities.BOOK_REPO_KEY)):
        print("Please provide the book folder")
        return

    # Stage 1: data point selection.
    if not os.path.exists(prop_path(os.getcwd(), Utilities.DATA_POINT_KEY)):
        data_start = time()
        DataPointSelector.select_datapoints(path_to_base_folder)
        data_end = time()
        print("Data Selection took : {} minutes".format(
            (data_end - data_start) / 60))
    else:
        print(
            "Data Point CSV found in directory, continuing to Feature Extraction"
        )

    # Stage 2: Python feature extraction.
    if not os.path.exists(
            prop_path(os.getcwd(), Utilities.PYTHON_FEATURE_CSV)):
        py_start = time()
        extractor = FeatureExtractor(
            base_folder_address=path_to_base_folder)
        extractor.extract_features()
        py_end = time()
        print("Python Extractor took : {} minutes".format(
            (py_end - py_start) / 60))
    else:
        print(
            "Python Feature Vector CSV found in directory, continuing to run Java project"
        )

    # Stage 3: Java feature extraction through the batch file.
    if not os.path.exists(prop_path(os.getcwd(), Utilities.JAVA_FEATURE_CSV)):
        bat_file_name = r'command.bat'
        folder_path = prop_path(path_to_base_folder, Utilities.BOOK_REPO_KEY)
        output_file_name = ".\\" + Utilities.get_prop_value(
            Utilities.JAVA_FEATURE_CSV)
        book_descriptor_file_name = ".\\" + Utilities.get_prop_value(
            Utilities.BOOK_DESCRIPTOR_KEY)
        data_points_file_name = ".\\" + Utilities.get_prop_value(
            Utilities.DATA_POINT_KEY)
        java_start = time()
        exit_code = subprocess.call([
            bat_file_name, folder_path, output_file_name,
            book_descriptor_file_name, data_points_file_name
        ])
        java_end = time()
        if exit_code != 0:
            # Previously the exit status was silently discarded.
            print("Java Project exited with code {}".format(exit_code))
        print("Java Project took : {} minutes".format(
            (java_end - java_start) / 60))
    else:
        print(
            "Java output Feature Vector CSV found in directory, continuing to Model Runner"
        )

    # Stage 4: model runner.
    runner = ModelRunner(path_to_base_folder)
    runner.drive_model_runner()
    end = time()
    print("Total Time for the whole process : {} minutes".format(
        (end - start) / 60))