def main():
    args = get_parser().parse_args()
    config = Config.from_file(args.config)
    logger = get_logger(config.output_path)
    logger.info(args)

    logger.info("=> Starting evaluation ...")

    logger.info("Load data")
    corpus = io.load_json(config.input_path, append_title=config.use_title)

    logger.info("Perform preprocessing")
    preprocessed_corpus = Preprocessing(
        corpus["keywords"],
        config=config.preprocessing,
        datatype="keywords",
        logger=logger,
    ).apply_preprocessing()
    preprocessed_corpus["token"] = preprocessed_corpus["token"].apply(flatten)
    preprocessed_corpus.drop("abstract", axis=1, inplace=True)

    logger.info("Start clustering")
    clustering = Clustering(
        preprocessed_corpus,
        clustering_config=config.clustering,
        dim_reduction_config=config.dim_reduction,
        logger=logger,
    )
    model = clustering.perform_clustering()

    logger.info(f"Save results to {config.output_path}")
    corpus["label"] = model.labels_
    io.write_json(config.input_path.split(".")[0] + "_labeled.json", corpus)
def __init__(self, root, dataset_partitions, past_frames, future_frames,
             input_dim, output_dim, blur_radius=3, buffer_memory=1e2,
             buffer_size=1e3, batch_size=16, caching=True,
             downsampling=False, dynamicity=1e-3):
    '''
    Data Generator
    Inputs:
        - Path containing folders of frames
        - List of the names of these folders
        - Partitions: [(ids_x(x, 10), ids_y(x, 4))]
    '''
    self.input_dim = input_dim
    self.output_dim = output_dim
    self.dataset_partitions = dataset_partitions
    # minimum number of sequences per area
    self.batch_size = np.min([len(x[1]) for x in self.dataset_partitions])
    self.past_frames = past_frames
    self.future_frames = future_frames
    self.caching = caching
    self.batch_size = batch_size  # overrides the minimum computed above
    self.blurry_filter_size = (blur_radius, blur_radius)
    self.downsampling_factor = 4
    self.downsampling = downsampling
    self.root = root
    self.buffer = []
    self.buffer_size = buffer_size
    self.buffer_memory = buffer_memory
    self.buffer_hit_ratio = 0
    self.preprocessing = Preprocessing()
    self.dynamicity = dynamicity
def main():
    dataset = Dataset(nrows=constants.Dataset.nrows,
                      augment_labels=constants.Dataset.augment_labels,
                      top_n=constants.Dataset.top_n)
    analyzer = Preprocessing(dataset.X_train).featurizer.build_analyzer()
    docs = [analyzer(doc) for doc in dataset.X_train]
    create_word_to_vec_embeddings(docs)
def preprocessing(app_config):
    logger.info("Start preprocessing files")
    with open(app_config["app"]["preprocessing"]["preprocess_config"],
              "r", encoding="utf-8") as preprocess_param_file:
        preprocess_config = yaml.safe_load(preprocess_param_file.read())

    output_dir = app_config["app"]["preprocessing"]["output_dir"]
    input_dir = app_config["app"]["preprocessing"]["input_dir"]
    file_pattern = app_config["app"]["preprocessing"]["input_file_pattern"]
    output_file_prefix = app_config["app"]["preprocessing"]["output_file_prefix"]

    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    for root, dirs, files in os.walk(input_dir):
        for f in files:
            if file_pattern in f:
                logger.info("Preprocessing file %s" % f)
                prep = Preprocessing(data_frame=pd.read_csv(os.path.join(root, f)),
                                     config=preprocess_config)
                df, seeds = prep.preprocessing()
                if df.shape[0] > 0:
                    df.to_csv(os.path.join(output_dir, output_file_prefix + f),
                              index=False,
                              header=app_config["app"]["preprocessing"]["keep_header"])
                if app_config["app"]["preprocessing"]["generate_seed"]:
                    logger.info("Exporting seed file %s" % (f[:-4] + ".yml"))
                    output_seed_file_prefix = app_config["app"]["preprocessing"][
                        "output_seed_file_prefix"]
                    with open(os.path.join(output_dir,
                                           output_seed_file_prefix + f[:-4] + ".yml"),
                              "w", encoding="utf-8") as sf:
                        yaml.dump(seeds, sf)

    logger.info("Preprocessing finished")
def get_vectorizer(current_params):
    f = lambda x: Preprocessing.preprocessing(x, current_params)
    Vectorizer = current_params.get("Vectorizer", CountVectorizer)
    vectorizer = Vectorizer(
        preprocessor=f,
        lowercase=False,
        token_pattern=Preprocessing.token_pattern,
        binary=current_params.get("binary", False),
        max_df=current_params.get("max_df", 1.),
        min_df=current_params.get("min_df", 1),
        ngram_range=current_params.get("ngram_range", (1, 1)),
        max_features=current_params.get("max_features", None))
    return vectorizer
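# --- Usage sketch (not part of the original code) ---
# A minimal, hedged example of driving get_vectorizer: it assumes the
# Preprocessing helper above is importable and swaps in scikit-learn's
# TfidfVectorizer purely for illustration; the parameter values are arbitrary.
from sklearn.feature_extraction.text import TfidfVectorizer

params = {
    "Vectorizer": TfidfVectorizer,   # omitted keys fall back to the defaults in get_vectorizer
    "ngram_range": (1, 2),
    "min_df": 2,
}
vectorizer = get_vectorizer(params)
X = vectorizer.fit_transform(["first raw document", "second raw document"])
print(X.shape)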
class DataGenerator():
    def __init__(self, root, dataset_partitions, past_frames, future_frames,
                 input_dim, output_dim, dynamicity, filtering=True,
                 buffer_memory=1e2, buffer_size=1e3, batch_size=16,
                 caching=True, downsampling=False):
        '''
        Data Generator
        Inputs:
            - Path containing folders of frames
            - List of the names of these folders
            - Partitions: [(ids_x(x, 10), ids_y(x, 4))]
        '''
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.dataset_partitions = dataset_partitions
        # minimum number of sequences per area
        self.batch_size = np.min([len(x[1]) for x in self.dataset_partitions])
        self.past_frames = past_frames
        self.future_frames = future_frames
        self.caching = caching
        self.batch_size = batch_size  # overrides the minimum computed above
        self.blurry_filter_size = (3, 3)
        self.downsampling_factor = 4
        self.downsampling = downsampling
        self.root = root
        self.buffer = []
        self.buffer_size = buffer_size
        self.buffer_memory = buffer_memory
        self.buffer_hit_ratio = 0
        self.preprocessing = Preprocessing()
        self.dynamicity = dynamicity
        self.filtering = filtering

    def get_data(self):
        'Generates batches of datapoints'
        X, Y = self.__data_generation()  # seq, t, h, w, c
        return X, Y

    def __data_generation(self):
        'Generates the raw sequence of datapoints (filtered)'
        # stats
        accesses = 0
        hits = 0

        # Initialization
        X = None
        Y = None

        print("[x] {} areas found".format(len(self.dataset_partitions)))

        # For each area
        for area_index, area in enumerate(self.dataset_partitions):
            # For each sequence
            loaded = 0
            print("Area {} - sequences: {}\n".format(area_index, len(area[1])),
                  end="", flush=True)
            for i, sequence in enumerate(area[1]):

                # --- BTM
                btm_filenames = [
                    x for x in os.listdir(
                        self.root + self.dataset_partitions[area_index][0])
                    if x.endswith(".BTM")
                ]
                if len(btm_filenames) == 0:
                    raise Exception("No BTM map found for the area {}".format(
                        self.dataset_partitions[area_index][0]))

                btm = iter_loadtxt(
                    self.root + self.dataset_partitions[area_index][0] + "/" +
                    btm_filenames[0],
                    delimiter=" ")

                # --- Outliers
                btm[np.isnan(btm)] = 0
                btm[btm > 10e5] = 0

                # --- Preprocessing
                if self.downsampling:
                    btm = cv.GaussianBlur(btm, self.blurry_filter_size, 0)
                    btm = cv.pyrDown(btm)
                    btm = cv.pyrDown(btm)

                # reduce values by subtracting the minimum
                min_btm = np.min(btm)
                btm = btm - min_btm

                btm.resize(btm.shape[0], btm.shape[1], 1)
                btm_x = np.tile(btm, (self.past_frames, 1, 1, 1))

                deps = None
                vvx_s = None
                vvy_s = None

                framestart = int(sequence.replace("id-", ""))

                # Starts from the right frame
                for k in range(framestart,
                               framestart + self.past_frames + self.future_frames):
                    # id area -> id frame
                    gid = "{}-{}-{}".format(area_index, sequence, k)

                    # Parameters
                    extensions = ["DEP", "VVX", "VVY"]
                    matrices = []

                    # Gets datapoint filename
                    dep_filenames = [
                        x for x in os.listdir(
                            self.root + self.dataset_partitions[area_index][0])
                        if x.endswith(".DEP")
                    ]
                    if len(dep_filenames) == 0:
                        raise Exception(
                            "No DEP maps found for the area {}".format(
                                self.dataset_partitions[area_index][0]))

                    # asserting that all maps are named with the same prefix
                    dep_filename = dep_filenames[0].split(".")[0][:-4]

                    # 1 frame -> 3 matrices (3 extensions)
                    for i, ext in enumerate(extensions):
                        accesses += 1
                        global_id = "{}-{}".format(i, gid)  # global linearized index

                        # ----- Cache
                        if self.caching:
                            cache_frame = self.buffer_lookup(global_id)
                            if cache_frame is False:
                                frame = iter_loadtxt(
                                    self.root +
                                    self.dataset_partitions[area_index][0] +
                                    "/{}{:04d}.{}".format(dep_filename, k, ext),
                                    delimiter=" ")
                                self.buffer_push(global_id, frame)
                            else:
                                frame = cache_frame
                                hits += 1
                        # ----- No cache
                        else:
                            frame = iter_loadtxt(
                                self.root +
                                self.dataset_partitions[area_index][0] +
                                "/{}{:04d}.{}".format(dep_filename, k, ext),
                                delimiter=" ")

                        # --- Outliers
                        frame[np.isnan(frame)] = 0
                        frame[frame > 10e5] = 0

                        # --- On-spot Gaussian Blurring
                        if self.downsampling:
                            frame = cv.GaussianBlur(frame, self.blurry_filter_size, 0)
                            frame = cv.pyrDown(frame)
                            frame = cv.pyrDown(frame)

                        matrices.append(frame)

                    frame, vvx, vvy = matrices

                    # ---
                    if deps is None:
                        deps = np.array([frame])
                    else:
                        deps = np.concatenate((deps, np.array([frame])))
                    if vvx_s is None:
                        vvx_s = np.array([vvx])
                    else:
                        vvx_s = np.concatenate((vvx_s, np.array([vvx])))
                    if vvy_s is None:
                        vvy_s = np.array([vvy])
                    else:
                        vvy_s = np.concatenate((vvy_s, np.array([vvy])))

                # ---------
                deps[deps > 10e5] = 0
                vvx_s[vvx_s > 10e5] = 0
                vvy_s[vvy_s > 10e5] = 0
                btm_x[btm_x > 10e5] = 0

                # --- X
                x_dep = deps[:self.past_frames]
                x_dep.resize((x_dep.shape[0], x_dep.shape[1], x_dep.shape[2], 1))
                x_vx = vvx_s[:self.past_frames]
                x_vx.resize((x_vx.shape[0], x_vx.shape[1], x_vx.shape[2], 1))
                x_vy = vvy_s[:self.past_frames]
                x_vy.resize((x_vy.shape[0], x_vy.shape[1], x_vy.shape[2], 1))

                x = np.concatenate((x_dep, x_vx, x_vy, btm_x), axis=3)

                # --- Y
                y_dep = deps[self.past_frames:]
                y_dep.resize((y_dep.shape[0], y_dep.shape[1], y_dep.shape[2], 1))
                y_vx = vvx_s[self.past_frames:]
                y_vx.resize((y_vx.shape[0], y_vx.shape[1], y_vx.shape[2], 1))
                y_vy = vvy_s[self.past_frames:]
                y_vy.resize((y_vy.shape[0], y_vy.shape[1], y_vy.shape[2], 1))

                y = np.concatenate((y_dep, y_vx, y_vy), axis=3)

                # filtering
                if self.filtering:
                    valid = self.preprocessing.eval_datapoint(
                        x[:, :, :, :3], y, self.dynamicity)
                else:
                    valid = True

                if valid:
                    loaded += 1
                    if X is None:
                        X = np.expand_dims(x, 0)
                    else:
                        X = np.concatenate((X, np.expand_dims(x, 0)))
                    if Y is None:
                        Y = np.expand_dims(y, 0)
                    else:
                        Y = np.concatenate((Y, np.expand_dims(y, 0)))
                    print("x ", end="", flush=True)
                else:
                    print("- ", end="", flush=True)

            print("\n[{}%] {} valid sequences loaded".format(
                round((area_index + 1) / len(self.dataset_partitions) * 100),
                loaded))

        # Buffer ratio calculation
        if accesses != 0:
            self.buffer_hit_ratio = self.buffer_hit_ratio * 0.5 + 0.5 * (hits / accesses)

        return X, Y

    # ------------------------------------

    def buffer_lookup(self, k):
        ''' Get sequence (datapoint) from cache given the start frame global id '''
        if self.caching:
            for i, x in enumerate(self.buffer):
                # Returns found record
                if x["global_id"] == k:
                    self.buffer[i]["fresh"] += 1
                    return x["value"]
                # Set any read record to 0 (second chance)
                elif self.buffer[i]["fresh"] != 0:
                    self.buffer[i]["fresh"] -= 1
        return False

    def buffer_push(self, k, x):
        ''' Add sequence (datapoint) to cache with start frame global id '''
        if self.caching:
            # Makes space
            if len(self.buffer) >= self.buffer_size:
                for i, j in enumerate(self.buffer):
                    if j["fresh"] == 0:
                        del self.buffer[i]
            # Push
            self.buffer.append({
                'fresh': self.buffer_memory,
                'global_id': k,
                'value': x
            })
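# --- Usage sketch (not part of the original code) ---
# Assumes a dataset root laid out the way the class expects: one folder per
# area containing the .BTM map plus numbered .DEP/.VVX/.VVY frames. The
# partition list, paths and dimensions below are illustrative placeholders.
partitions = [
    ("area_01", ["id-0001", "id-0011"]),   # (area folder, sequence start ids)
    ("area_02", ["id-0005"]),
]
gen = DataGenerator(root="dataset/",
                    dataset_partitions=partitions,
                    past_frames=4, future_frames=4,
                    input_dim=(4, 256, 256, 4),   # placeholder shapes
                    output_dim=(4, 256, 256, 3),
                    dynamicity=1e-3,
                    downsampling=True)
X, Y = gen.get_data()   # X: (seq, t, h, w, 4), Y: (seq, t, h, w, 3)
print(X.shape, Y.shape, "cache hit ratio:", gen.buffer_hit_ratio)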
def run():
    parser = get_arg_parser()
    cmd_args = parser.parse_args()

    if cmd_args.gpu is not None:
        os.environ['CUDA_VISIBLE_DEVICES'] = str(cmd_args.gpu)
        gpunum = os.getenv('CUDA_VISIBLE_DEVICES')
        logging.info("GPU has been set to {}".format(gpunum))

    logging.info("Model used for the regression network: {}"
                 .format(cmd_args.model_name))

    # 1. Dataset retrieval
    # --------------------
    tab_printer(constants.Dataset)
    dataset = Dataset(nrows=constants.Dataset.nrows,
                      augment_labels=constants.Dataset.augment_labels,
                      top_n=constants.Dataset.top_n)

    logging.info("Going to create vocabulary and fit a preprocessing pipeline "
                 "using {} samples. Settings will be listed below"
                 .format(len(dataset.X_train)))

    # 2. Preprocessing
    # ----------------
    tab_printer(constants.NLP)
    preprocessor = Preprocessing(dataset.X_train)

    # Preprocess documents
    X_train = preprocessor.transform_documents(dataset.X_train)
    X_test = preprocessor.transform_documents(dataset.X_test)

    # 3. Word embeddings with word2vec
    # --------------------------------
    # Train word2vec embeddings if the train_word2vec option is selected
    if cmd_args.train_word2vec:
        utils.embeddings.main()
    weights = get_embedding_tensor(preprocessor)

    # 4. Node embeddings with AttentionWalk
    # -------------------------------------
    args = _generate_deepwalk_parameters(dataset.y_train_graph)
    if cmd_args.train_attentionwalk:
        train_attention_walk(args)

    graph_embeddings = pd.read_csv(args.embedding_path).iloc[:, 1:].values

    # Get document representations using node embeddings
    y_embedded = _get_label_embeddings(dataset.y_train, graph_embeddings)
    y_test_embedded = _get_label_embeddings(dataset.y_test, graph_embeddings)

    # 5. Regressor Training
    # ---------------------
    device = 'cuda:' + str(os.getenv("CUDA_VISIBLE_DEVICES")) \
        if torch.cuda.is_available() else 'cpu'
    regressor_nn = NeuralNet(
        get_network_class(cmd_args.model_name),
        max_epochs=constants.NeuralNetworkTraining.epochs,
        lr=constants.NeuralNetworkTraining.learning_rate,
        batch_size=constants.NeuralNetworkTraining.batch_size,
        optimizer=torch.optim.Adam,
        criterion=torch.nn.MSELoss,
        module__output_dim=args.dimensions,
        module__embedding=weights,
        module__embedding_dim=constants.NLP.embedding_size,
        device=device,
        train_split=None,
    )

    # Train the regressor neural network
    regressor_nn.fit(X_train, y_embedded.astype(np.float32))

    # 6. Train Multi-label KNN algorithm
    # ----------------------------------
    tab_printer(constants.MLKNN)

    # Train multi-label KNN to turn label embeddings into label predictions
    classifier = MLkNN(k=constants.MLKNN.k, s=constants.MLKNN.s)
    classifier.fit(y_embedded, dataset.y_train)

    # 7. Evaluation
    # -------------
    # Label prediction with documents
    y_test_pred = regressor_nn.predict(X_test)
    preds = classifier.predict(y_test_pred)
    preds_raw = classifier.predict_proba(y_test_pred)

    # Label prediction with label embeddings
    preds_w_labels = classifier.predict(y_test_embedded)
    preds_w_labels_raw = classifier.predict_proba(y_test_embedded)

    # Log evaluation result with label embeddings
    eval_metrics_w_labels = evaluation.all_metrics(
        preds_w_labels.toarray(),
        dataset.y_test,
        yhat_raw=preds_w_labels_raw.toarray())
    logging.info(str(eval_metrics_w_labels))

    # Log evaluation result with documents
    report_evaluation(preds.toarray(),
                      dataset.y_test,
                      yhat_raw=preds_raw.toarray())
class DataGenerator():
    def __init__(self, root, dataset_partitions, past_frames, future_frames,
                 input_dim, output_dim, blur_radius=3, buffer_memory=1e2,
                 buffer_size=1e3, batch_size=16, caching=True,
                 downsampling=False, dynamicity=1e-3):
        '''
        Data Generator
        Inputs:
            - Path containing folders of frames
            - List of the names of these folders
            - Partitions: [(ids_x(x, 10), ids_y(x, 4))]
        '''
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.dataset_partitions = dataset_partitions
        # minimum number of sequences per area
        self.batch_size = np.min([len(x[1]) for x in self.dataset_partitions])
        self.past_frames = past_frames
        self.future_frames = future_frames
        self.caching = caching
        self.batch_size = batch_size  # overrides the minimum computed above
        self.blurry_filter_size = (blur_radius, blur_radius)
        self.downsampling_factor = 4
        self.downsampling = downsampling
        self.root = root
        self.buffer = []
        self.buffer_size = buffer_size
        self.buffer_memory = buffer_memory
        self.buffer_hit_ratio = 0
        self.preprocessing = Preprocessing()
        self.dynamicity = dynamicity

    def get_datapoint(self, area_index, sequence_index, check=True):
        '''
        Generates a single datapoint on the fly (cached)
        Inputs:
            - index of the area
            - index of the sequence
            - flag to check sequence validity
        Outputs:
            - case 1: valid sequence -> (X, Y)
            - case 2: non-valid sequence -> (None, None)
        '''
        # Initialization
        X = None
        Y = None

        area = self.dataset_partitions[area_index]
        sequence = self.dataset_partitions[area_index][1][sequence_index]

        # --- BTM
        btm_filenames = [
            x for x in os.listdir(self.root + self.dataset_partitions[area_index][0])
            if x.endswith(".BTM")
        ]
        if len(btm_filenames) == 0:
            raise Exception("No BTM map found for the area {}".format(
                self.dataset_partitions[area_index][0]))

        btm = pd.read_csv(self.root + self.dataset_partitions[area_index][0] +
                          "/" + btm_filenames[0],
                          sep=" ", header=None).values

        # --- Outliers
        btm[np.isnan(btm)] = 0
        btm[btm > 10e5] = 0

        # --- Preprocessing
        if self.downsampling:
            btm = cv.GaussianBlur(btm, self.blurry_filter_size, 0)
            btm = cv.pyrDown(btm)
            btm = cv.pyrDown(btm)

        # reduce values by subtracting the minimum
        min_btm = np.min(btm)
        btm = btm - min_btm

        btm.resize(btm.shape[0], btm.shape[1], 1)
        btm_x = np.tile(btm, (self.past_frames, 1, 1, 1))

        deps = None
        vvx_s = None
        vvy_s = None

        framestart = int(sequence.replace("id-", ""))

        # Starts from the right frame
        for k in range(framestart,
                       framestart + self.past_frames + self.future_frames):
            # id area -> id frame
            gid = "{}-{}-{}".format(area_index, sequence, k)

            # Parameters
            extensions = ["DEP", "VVX", "VVY"]
            matrices = []

            # Gets datapoint filename
            dep_filenames = [
                x for x in os.listdir(self.root + self.dataset_partitions[area_index][0])
                if x.endswith(".DEP")
            ]
            if len(dep_filenames) == 0:
                raise Exception("No DEP maps found for the area {}".format(
                    self.dataset_partitions[area_index][0]))

            # asserting that all maps are named with the same prefix
            dep_filename = dep_filenames[0].split(".")[0][:-4]

            # 1 frame -> 3 matrices (3 extensions)
            for i, ext in enumerate(extensions):
                global_id = "{}-{}".format(i, gid)  # global linearized index

                # ----- Cache
                if self.caching:
                    cache_frame = self.buffer_lookup(global_id)
                    if cache_frame is False:
                        frame = pd.read_csv(
                            self.root + self.dataset_partitions[area_index][0] +
                            "/{}{:04d}.{}".format(dep_filename, k, ext),
                            sep=" ", header=None).values
                        self.buffer_push(global_id, frame)
                    else:
                        frame = cache_frame
                # ----- No cache
                else:
                    frame = pd.read_csv(
                        self.root + self.dataset_partitions[area_index][0] +
                        "/{}{:04d}.{}".format(dep_filename, k, ext),
                        sep=" ", header=None).values

                # --- Filtering
                frame[np.isnan(frame)] = 0
                frame[frame > 10e5] = 0

                # --- On-spot Gaussian Blurring
                if self.downsampling:
                    frame = cv.GaussianBlur(frame, self.blurry_filter_size, 0)
                    frame = cv.pyrDown(frame)
                    frame = cv.pyrDown(frame)

                matrices.append(frame)

            frame, vvx, vvy = matrices

            # ---
            if deps is None:
                deps = np.array([frame])
            else:
                deps = np.concatenate((deps, np.array([frame])))
            if vvx_s is None:
                vvx_s = np.array([vvx])
            else:
                vvx_s = np.concatenate((vvx_s, np.array([vvx])))
            if vvy_s is None:
                vvy_s = np.array([vvy])
            else:
                vvy_s = np.concatenate((vvy_s, np.array([vvy])))

        # ---------
        deps[deps > 10e5] = 0
        vvx_s[vvx_s > 10e5] = 0
        vvy_s[vvy_s > 10e5] = 0
        btm_x[btm_x > 10e5] = 0

        # --- X
        x_dep = deps[:self.past_frames]
        x_dep.resize((x_dep.shape[0], x_dep.shape[1], x_dep.shape[2], 1))
        x_vx = vvx_s[:self.past_frames]
        x_vx.resize((x_vx.shape[0], x_vx.shape[1], x_vx.shape[2], 1))
        x_vy = vvy_s[:self.past_frames]
        x_vy.resize((x_vy.shape[0], x_vy.shape[1], x_vy.shape[2], 1))

        x = np.concatenate((x_dep, x_vx, x_vy, btm_x), axis=3)

        # --- Y
        y_dep = deps[self.past_frames:]
        y_dep.resize((y_dep.shape[0], y_dep.shape[1], y_dep.shape[2], 1))
        y_vx = vvx_s[self.past_frames:]
        y_vx.resize((y_vx.shape[0], y_vx.shape[1], y_vx.shape[2], 1))
        y_vy = vvy_s[self.past_frames:]
        y_vy.resize((y_vy.shape[0], y_vy.shape[1], y_vy.shape[2], 1))

        y = np.concatenate((y_dep, y_vx, y_vy), axis=3)

        # filtering
        if check:
            valid = self.preprocessing.eval_datapoint(x[:, :, :, :3], y, self.dynamicity)
            if valid:
                if X is None:
                    X = np.expand_dims(x, 0)
                else:
                    X = np.concatenate((X, np.expand_dims(x, 0)))
                if Y is None:
                    Y = np.expand_dims(y, 0)
                else:
                    Y = np.concatenate((Y, np.expand_dims(y, 0)))
                return X, Y
            else:
                return (None, None)
        else:
            if X is None:
                X = np.expand_dims(x, 0)
            else:
                X = np.concatenate((X, np.expand_dims(x, 0)))
            if Y is None:
                Y = np.expand_dims(y, 0)
            else:
                Y = np.concatenate((Y, np.expand_dims(y, 0)))
            return X, Y

    # ------------------------------------

    def buffer_lookup(self, k):
        ''' Get sequence (datapoint) from cache given the start frame global id '''
        if self.caching:
            for i, x in enumerate(self.buffer):
                # Returns found record
                if x["global_id"] == k:
                    self.buffer[i]["fresh"] += 1
                    return x["value"]
                # Set any read record to 0 (second chance)
                elif self.buffer[i]["fresh"] != 0:
                    self.buffer[i]["fresh"] -= 1
        return False

    def buffer_push(self, k, x):
        ''' Add sequence (datapoint) to cache with start frame global id '''
        if self.caching:
            # Makes space
            if len(self.buffer) >= self.buffer_size:
                for i, j in enumerate(self.buffer):
                    if j["fresh"] == 0:
                        del self.buffer[i]
            # Push
            self.buffer.append({'fresh': self.buffer_memory,
                                'global_id': k,
                                'value': x})
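# --- Usage sketch (not part of the original code) ---
# This variant builds one datapoint at a time; the setup below mirrors the
# eager generator above and uses placeholder paths and indices.
partitions = [("area_01", ["id-0001", "id-0011"])]
gen = DataGenerator(root="dataset/",
                    dataset_partitions=partitions,
                    past_frames=4, future_frames=4,
                    input_dim=None, output_dim=None)   # dims are only stored, not used here
x, y = gen.get_datapoint(area_index=0, sequence_index=1, check=True)
if x is None:
    print("sequence rejected by the dynamicity filter")
else:
    print(x.shape, y.shape)   # (1, past_frames, h, w, 4), (1, future_frames, h, w, 3)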
print(list(test.keys())[maxi])  # parameters of the best solution

stop = list(stopwords.words('french')) + ['cet', 'cette', 'là']
params = {
    "lowercase": True,
    "punct": True,
    "marker": True,
    "number": True,
    "stemming": Preprocessing.lem,  # stemmer.stem,
    "ligne": None,
    "strip_accents": True,
    "stopwords": set(stop)
}

f = lambda x: Preprocessing.preprocessing(x, params)
t = time()
data_x = list(map(f, alltxts))
print("time 1:", time() - t)

vectorizer = CountVectorizer(preprocessor=None,
                             lowercase=False,
                             token_pattern=Preprocessing.token_pattern)
t = time()
X = vectorizer.fit_transform(data_x)
print("time 2:", time() - t)

# train/test split without class rebalancing
t = time()
clf = svm.LinearSVC()
def gridSearch(datax, datay, params, stock=False):
    '''
    Parameters
    ----------
    datax
        List of datapoints (raw texts).
    datay
        List of datapoint labels.
    params
        Dictionary of parameter lists; may include "clf" (classifier class),
        "Vectorizer" and preprocessing options.
    stock
        If True, pickle the result dictionaries to disk.

    Returns
    -------
    res_train
        Dictionary of train F1-scores for each parameter combination.
    res_test
        Dictionary of test F1-scores for each parameter combination.
    '''
    el = params.keys()
    res_test = dict()
    res_train = dict()
    size = len(list(itertools.product(*params.values())))

    for i, v in enumerate(list(itertools.product(*params.values()))):
        print(i + 1, "on", size)
        tag = tuple(x if isinstance(x, collections.abc.Hashable) else "YES"
                    for x in v)
        print(tag)
        current_params = dict(zip(el, v))

        # classifier choice
        clf_class = current_params.get("clf", svm.LinearSVC)
        if clf_class == nb.MultinomialNB:
            class_prior = current_params.get("class_weight", None)
            if class_prior == "balanced":
                class_prior = len(datax) / (
                    2 * np.bincount(np.where(np.array(datay) == 1, 1, 0)))
                print(class_prior)
            clf = clf_class(class_prior=class_prior)
        else:
            clf = clf_class(class_weight=current_params.get("class_weight", None))

        # vectorizer choice
        Vectorizer = current_params.get("Vectorizer", CountVectorizer)

        # apply the current parameters to the preprocessing
        f = lambda x: Preprocessing.preprocessing(x, current_params)

        # Vectorization (max_df defaults to the float 1., i.e. no upper cut-off)
        print(current_params.get("max_df", 1.), current_params.get("min_df", 1))
        vectorizer = Vectorizer(
            preprocessor=f,
            lowercase=False,
            token_pattern=Preprocessing.token_pattern,
            binary=current_params.get("binary", False),
            max_df=current_params.get("max_df", 1.),
            min_df=current_params.get("min_df", 1),
            ngram_range=current_params.get("ngram_range", (1, 1)),
            max_features=current_params.get("max_features", None))

        X = vectorizer.fit_transform(datax)
        X_train, X_test, y_train, y_test = train_test_split(X, datay,
                                                            test_size=0.4,
                                                            random_state=0)
        clf.fit(X_train, y_train)

        # Application
        yhat_test = clf.predict(X_test)
        yhat_train = clf.predict(X_train)
        res_test[tag] = f1_score(y_test, yhat_test)
        res_train[tag] = f1_score(y_train, yhat_train)
        print(res_test[tag])

        if stock:
            pickle.dump(res_train, open("train", "wb"))
            pickle.dump(res_test, open("test", "wb"))

    return res_train, res_test
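# --- Usage sketch (not part of the original code) ---
# Assumes raw texts `alltxts` and binary labels `allys` already exist (both
# names are placeholders) and that svm / CountVectorizer are imported as in
# the surrounding script. Every value is a list so itertools.product can
# expand the grid of combinations.
param_grid = {
    "clf": [svm.LinearSVC],
    "Vectorizer": [CountVectorizer],
    "binary": [False, True],
    "ngram_range": [(1, 1), (1, 2)],
}
res_train, res_test = gridSearch(alltxts, allys, param_grid)

# Pick the parameter combination with the best test F1-score.
best_tag = max(res_test, key=res_test.get)
print(best_tag, res_test[best_tag])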
def run():
    parser = get_arg_parser(embedding_classifier=False)
    cmd_args = parser.parse_args()

    if cmd_args.gpu is not None:
        os.environ['CUDA_VISIBLE_DEVICES'] = str(cmd_args.gpu)
        gpunum = os.getenv('CUDA_VISIBLE_DEVICES')
        logging.info("GPU has been set to {}".format(gpunum))

    logging.info("Model used for the classification network: {}".format(
        cmd_args.model_name))

    # 1. Dataset retrieval
    # --------------------
    tab_printer(constants.Dataset)
    dataset = Dataset(nrows=constants.Dataset.nrows,
                      augment_labels=constants.Dataset.augment_labels,
                      top_n=constants.Dataset.top_n)

    logging.info("Going to create vocabulary and fit a preprocessing pipeline "
                 "using {} samples. Settings will be listed below".format(
                     len(dataset.X_train)))

    # 2. Preprocessing
    # ----------------
    tab_printer(constants.NLP)
    preprocessor = Preprocessing(dataset.X_train)

    # Preprocess documents
    X_train = preprocessor.transform_documents(dataset.X_train)
    X_test = preprocessor.transform_documents(dataset.X_test)

    # 3. Word embeddings with word2vec
    # --------------------------------
    # Train word2vec embeddings if the train_word2vec option is selected
    if cmd_args.train_word2vec:
        utils.embeddings.main()
    weights = get_embedding_tensor(preprocessor)
    logging.info("Word embeddings are loaded.")

    # 4. Label Network Optim
    # ----------------------
    device = 'cuda:' + str(os.getenv("CUDA_VISIBLE_DEVICES")) \
        if torch.cuda.is_available() else 'cpu'
    logging.info("Going to run on device: {}".format(device))

    args = _generate_deepwalk_parameters(dataset.y_train_graph)
    label_embeddings = np.array(
        pd.read_csv(args.embedding_path).iloc[:, 1:].values)
    label_embeddings_weights = torch.FloatTensor(label_embeddings)

    label_network = NeuralNet(
        CAML,
        max_epochs=50,
        lr=constants.NeuralNetworkTraining.learning_rate,
        batch_size=constants.NeuralNetworkTraining.batch_size,
        optimizer=torch.optim.Adam,
        criterion=torch.nn.BCEWithLogitsLoss,
        module__output_dim=dataset.y_train.shape[1],
        module__embedding=label_embeddings_weights,
        module__embedding_dim=args.dimensions,
        module__kernel_size=1,
        device=device,
        train_split=skorch.dataset.CVSplit(stratified=False),
    )

    label_network.fit(dataset.y_train, dataset.y_train.astype(np.float32))

    # 5. Evaluation
    # -------------
    yhat_test_raw_logits = label_network.predict_proba(dataset.y_test)
    yhat_test_raw = torch.sigmoid(torch.Tensor(yhat_test_raw_logits)).numpy()
    yhat_test = np.array(
        yhat_test_raw >= constants.NeuralNetworkTraining.threshold).astype(np.int64)

    report_evaluation(yhat_test, dataset.y_test, yhat_raw=yhat_test_raw)
def __init__(self):
    self.text = Preprocessing()
pchg = df.pop("PCHG") df.drop(["DATE", "CODE"], axis=1, inplace=True) pca = PCA(n_components=20) pca_data = pca.fit_transform(df.values) PCA_COLUMNS = [] for i in range(20): PCA_COLUMNS.append("PCA" + str(i + 1)) pca_df = pd.DataFrame(pca_data, columns=PCA_COLUMNS) pca_df = pd.concat([pchg, pca_df], axis=1) # print(pca_df.head()) # print(pca_data) # print(pca.explained_variance_ratio_) config_file_path = "../config/pca_preprocessing_config.yaml" config = yaml.safe_load(open(config_file_path, "r")) prep = Preprocessing(pca_df, config=config) p_df, p_config = prep.preprocessing() # print("*" * 70) # print(p_df.head()) # print(p_config) p_df.to_csv(os.path.join(out_dir, "pca_" + shortname + ".csv"), index=False, header=False) with open(os.path.join(out_dir, "seed_" + shortname + ".yml"), "w", encoding="utf-8") as sf: yaml.dump(p_config, sf)
parser = argparse.ArgumentParser()
parser.add_argument('--folder_to_save', help='Folder to save summaries')
args = parser.parse_args()
folder_to_save = args.folder_to_save

path_to_save = root_directory + "Data/DUC_2007/" + folder_to_save + "/"
if not os.path.exists(path_to_save):
    os.makedirs(path_to_save)

for folder in doc_folders:
    path = os.path.join(root_directory + "Data/DUC_2007/Documents/", '') + folder
    print(path)
    sentences, last_indexs = Preprocessing().openDirectory(path)

    text_sents = []
    for item in sentences:
        text_sents.append(item.getStemmedWords())

    clean_sents = []
    org_sents = []
    for item in sentences:
        org_sents.append(item.getOGwords())
        tmp = ""
        for word in item.getStemmedWords():
            tmp += word + " "
        # deduplicate on the full stemmed sentence (without the trailing space)
        if tmp[:-1] not in clean_sents:
            clean_sents.append(tmp[:-1])