def merge_geog():
    """
    Choose the best precision between the initial coordinates and the
    geocoded coordinates when geog has not been set from cadastre information
    """
    # Input dataset
    basol_geocoded = Dataset("etl", "basol_normalized")

    # Output dataset
    basol_geog_merged = Dataset("etl", "basol_geog_merged")

    basol_geog_merged.write_dtype([
        *basol_geocoded.read_dtype(),
        Column("geog", Geometry(srid=4326)),
        Column("geog_precision", String),
        Column("geog_source", String)
    ])

    BasolGeocoded = basol_geocoded.reflect()
    session = basol_geocoded.get_session()

    point_lambert2 = func.ST_Transform(
        func.ST_SetSRID(
            func.ST_MakePoint(
                BasolGeocoded.coordxlambertii,
                BasolGeocoded.coordylambertii),
            LAMBERT2),
        WGS84)

    point_geocoded = func.ST_SetSRID(
        func.ST_MakePoint(
            BasolGeocoded.geocoded_longitude,
            BasolGeocoded.geocoded_latitude),
        WGS84)

    q = session.query(BasolGeocoded, point_lambert2, point_geocoded).all()

    with basol_geog_merged.get_writer() as writer:
        for (row, point_lambert2, point_geocoded) in q:
            output_row = {
                **row2dict(row),
                "geog": None,
                "geog_precision": None,
                "geog_source": None
            }
            if row.l2e_precision == precisions.HOUSENUMBER:
                # Prefer the original Lambert II coordinates when they are
                # precise to the house number
                output_row["geog"] = point_lambert2
                output_row["geog_precision"] = row.l2e_precision
                output_row["geog_source"] = "lambert2"
            elif (row.geocoded_result_type == precisions.HOUSENUMBER) and \
                    (row.geocoded_result_score >= 0.6):
                # Fall back to the geocoded point when it is precise enough
                output_row["geog"] = point_geocoded
                output_row["geog_precision"] = row.geocoded_result_type
                output_row["geog_source"] = "geocodage"
            writer.write_row_dict(output_row)

    session.close()
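# For reference (an assumption, not from the source): LAMBERT2 and WGS84 above
# are SRID constants, most likely 27572 (NTF Lambert zone II, a.k.a. Lambert II
# etendu) and 4326 (WGS 84), so the expression reprojects the Lambert II point
# into WGS 84 before comparison.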
def generate_folds_for_dataset():
    dataset_names = Dataset.get_dataset_names() + Dataset.interesting_2d_datasets()
    for dataset_name in dataset_names:
        dataset = Dataset(dataset_name)
        print("making folds for dataset", dataset_name)
        os.makedirs(os.path.join(FOLD_PATH, dataset_name), exist_ok=True)
        for run_nb in range(10):
            skf = StratifiedKFold(n_splits=10, shuffle=True)
            labels = dataset.target
            for fold_nb, (train_indices, test_indices) in enumerate(
                    skf.split(np.zeros(len(labels)), labels)):
                to_write = dict()
                to_write["train_indices"] = train_indices.tolist()
                to_write["test_indices"] = test_indices.tolist()
                fold_path = os.path.join(
                    FOLD_PATH, dataset_name,
                    "run{}_fold{}.txt".format(run_nb, fold_nb))
                if os.path.isfile(fold_path):
                    print("fold file already exists! not overwriting!")
                    continue
                with open(fold_path, mode='w') as fold_file:
                    json.dump(to_write, fold_file)
def process_data(self, dataset: Dataset, stage: Optional[str] = None) -> Dataset:
    src_text_column_name, tgt_text_column_name = self.source_target_column_names
    convert_to_features = partial(
        self.convert_to_features,
        tokenizer=self.tokenizer,
        padding=self.cfg.padding,
        max_source_length=self.cfg.max_source_length,
        max_target_length=self.cfg.max_target_length,
        src_text_column_name=src_text_column_name,
        tgt_text_column_name=tgt_text_column_name,
    )
    dataset = dataset.map(
        convert_to_features,
        batched=True,
        num_proc=self.cfg.preprocessing_num_workers,
        load_from_cache_file=self.cfg.load_from_cache_file,
    )
    cols_to_keep = [
        x
        for x in ["input_ids", "attention_mask", "labels"]
        if x in dataset["train"].features
    ]
    dataset.set_format(columns=cols_to_keep)
    return dataset
def prepare_inputs(ds: Dataset, text_col: str, label_col: str) -> Dataset:
    ds = ds.remove_columns(column_names=[text_col, "__index_level_0__"])
    ds = ds.rename_column(label_col, "labels")
    ds = ds.with_format("torch")
    return ds
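# A minimal usage sketch for prepare_inputs (not from the source): the toy
# DataFrame and column names are hypothetical. preserve_index=True makes
# from_pandas keep the "__index_level_0__" column that prepare_inputs drops.
import pandas as pd
from datasets import Dataset

toy_df = pd.DataFrame({"text": ["good movie", "bad movie"], "sentiment": [1, 0]})
toy_ds = Dataset.from_pandas(toy_df, preserve_index=True)
toy_ds = prepare_inputs(toy_ds, text_col="text", label_col="sentiment")
print(toy_ds.column_names)  # ['labels']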
def load_bottleneck_data(training_file, validation_file, breadth):
    """
    Utility function to load bottleneck features.

    Arguments:
        training_file - path to the training pickle (String)
        validation_file - path to the validation pickle (String)
        breadth - number of output classes
    """
    print("Training file", training_file)
    print("Validation file", validation_file)
    print("Output breadth", breadth)

    with open(training_file, 'rb') as f:
        train_data = pickle.load(f)
    with open(validation_file, 'rb') as f:
        validation_data = pickle.load(f)

    X_train = train_data['features']
    y_train = train_data['labels']
    X_val = validation_data['features']
    y_val = validation_data['labels']

    D_train = Dataset('Training', Data(X_train), Likelihoods(y_train, breadth))
    D_val = Dataset('Validation', Data(X_val), Likelihoods(y_val, breadth))

    return (D_train, D_val)
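# A minimal usage sketch (the file names and breadth value are hypothetical):
# each pickle is expected to hold a dict with "features" and "labels" arrays.
D_train, D_val = load_bottleneck_data(
    "train_bottleneck.p", "validation_bottleneck.p", breadth=10)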
def process_data(self, dataset: Dataset, stage: Optional[str] = None) -> Dataset:
    input_feature_fields = [
        k for k, v in dataset["train"].features.items()
        if k not in ["label", "idx"]
    ]
    dataset = TextClassificationDataModule.preprocess(
        dataset,
        tokenizer=self.tokenizer,
        input_feature_fields=input_feature_fields,
        padding=self.cfg.padding,
        truncation=self.cfg.truncation,
        max_length=self.cfg.max_length,
    )
    cols_to_keep = [
        x
        for x in ["input_ids", "attention_mask", "token_type_ids", "labels"]
        if x in dataset["train"].features
    ]
    if not isinstance(dataset["train"].features["labels"], ClassLabel):
        dataset = dataset.class_encode_column("labels")
    dataset.set_format("torch", columns=cols_to_keep)
    self.labels = dataset["train"].features["labels"]
    return dataset
def __init__(self, generator, document):
    Dataset.__init__(self, data=[])
    self.generator = generator
    self.document = document
    self.linked = None
    self._invalidpoints = None
    self.changeset = -1
def concatenate_datasets_with_ratio(args, train_dataset):
    concatenate_list = []

    for sub_dataset_name, ratio in zip(
            args.data.sub_datasets.split(","),
            args.data.sub_datasets_ratio.split(",")):
        ratio = float(ratio)
        sub_dataset_path = p.join(args.path.train_data_dir, sub_dataset_name)
        assert p.exists(sub_dataset_path), f"{sub_dataset_name} does not exist."

        sub_dataset = load_from_disk(sub_dataset_path)
        sub_dataset_len = int(len(sub_dataset["train"]) * ratio)
        print(f"ADD SUB DATASET {sub_dataset_name}, LENGTH: {sub_dataset_len}")

        # Sub datasets must share the same features:
        # ['id', 'title', 'context', 'question', 'answers']
        features = sub_dataset["train"].features
        new_sub_dataset = sub_dataset["train"].select(range(sub_dataset_len))
        new_sub_dataset = Dataset.from_pandas(new_sub_dataset.to_pandas(), features=features)

        concatenate_list.append(new_sub_dataset.flatten_indices())

    train_dataset = Dataset.from_pandas(train_dataset.to_pandas(), features=features)
    train_dataset = concatenate_datasets([train_dataset.flatten_indices()] + concatenate_list)

    return train_dataset
def process_data(self, dataset: Dataset, stage: Optional[str] = None) -> Dataset:
    features, label_column_name, text_column_name = self._setup_input_fields(dataset, stage)
    self._prepare_labels(dataset, features, label_column_name)

    convert_to_features = partial(
        TokenClassificationDataModule.convert_to_features,
        tokenizer=self.tokenizer,
        padding=self.cfg.padding,
        label_all_tokens=self.cfg.label_all_tokens,
        label_to_id=self.label_to_id,
        text_column_name=text_column_name,
        label_column_name=label_column_name,
    )
    dataset = dataset.map(
        convert_to_features,
        batched=True,
        num_proc=self.cfg.preprocessing_num_workers,
        load_from_cache_file=self.cfg.load_from_cache_file,
    )
    cols_to_keep = [
        x
        for x in ["input_ids", "attention_mask", "token_type_ids", "labels", "idx"]
        if x in dataset["train"].features
    ]
    dataset.set_format(columns=cols_to_keep)
    return dataset
def load_data(
    self,
    hf_dataset: Dataset,
    input_key: str,
    target_keys: Optional[Union[str, List[str]]] = None,
    target_formatter: Optional[TargetFormatter] = None,
) -> Dataset:
    """Loads data into HuggingFace datasets.Dataset."""
    if not self.predicting:
        hf_dataset = hf_dataset.map(partial(self._resolve_target, target_keys))
        targets = hf_dataset.to_dict()[DataKeys.TARGET]
        self.load_target_metadata(targets, target_formatter=target_formatter)

        # If we had binary multi-class targets then we also know the labels
        # (column names)
        if isinstance(self.target_formatter, MultiBinaryTargetFormatter) and isinstance(
                target_keys, List):
            self.labels = target_keys

    # Remove extra columns
    extra_columns = set(hf_dataset.column_names) - {input_key, DataKeys.TARGET}
    hf_dataset = hf_dataset.remove_columns(extra_columns)

    if input_key != DataKeys.INPUT:
        hf_dataset = hf_dataset.rename_column(input_key, DataKeys.INPUT)

    return hf_dataset
def matlab_test():
    dataset = Dataset("iris")
    clusterer = MyCOSCMatlab()
    clusterer.signal_start(dataset.data)
    result = clusterer.fit(
        dataset.data,
        [(1, 2), (2, 3), (3, dataset.number_of_instances())],
        [(10, 12), (23, 16)],
        dataset.number_of_classes())
    print(result)
    clusterer.signal_end()
def load_datasets(lang="es", random_state=2021, preprocessing_args={}):
    """
    Load emotion recognition datasets
    """
    train_df = load_df(paths[lang]["train"])
    test_df = load_df(paths[lang]["test"])
    train_df, dev_df = train_test_split(
        train_df, stratify=train_df["label"], random_state=random_state)

    for df in [train_df, dev_df, test_df]:
        for label, idx in label2id.items():
            df.loc[df["label"] == label, "label"] = idx
        df["label"] = df["label"].astype(int)

    preprocess = lambda x: preprocess_tweet(x, lang=lang, **preprocessing_args)

    train_df.loc[:, "text"] = train_df["text"].apply(preprocess)
    dev_df.loc[:, "text"] = dev_df["text"].apply(preprocess)
    test_df.loc[:, "text"] = test_df["text"].apply(preprocess)

    features = Features({
        'text': Value('string'),
        'label': ClassLabel(
            num_classes=len(id2label),
            names=[id2label[k] for k in sorted(id2label.keys())])
    })

    train_dataset = Dataset.from_pandas(train_df, features=features)
    dev_dataset = Dataset.from_pandas(dev_df, features=features)
    test_dataset = Dataset.from_pandas(test_df, features=features)

    return train_dataset, dev_dataset, test_dataset
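# Example call (assumes the module-level `paths`, `label2id`/`id2label` and
# `load_df` used above, and pysentimiento's preprocess_tweet):
train_dataset, dev_dataset, test_dataset = load_datasets(lang="es")
print(train_dataset.features["label"].names)  # class names in id order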
def run(self):
    algorithm = algorithm_info_to_object(self.algorithm_name, self.algorithm_parameters)
    querier_builder = querier_info_to_object(self.querier_name, self.querier_parameters)
    dataset = Dataset(self.dataset_name)
    train_indices = fold_path_to_train_indices(self.fold_path)
    querier = querier_builder.build_querier(dataset)
    result = None
    # COSC does not always produce a result and can end with an exception,
    # so catch it and skip writing a result file in that case
    try:
        result = algorithm.fit(dataset.data, dataset.number_of_classes(), train_indices, querier)
    except Exception:
        print("An exception occurred during calculation of {} (this is silently ignored):"
              .format(self.result_path), file=sys.stderr)
        traceback.print_exc()
    if result is None:
        return
    # None is not JSON serializable, so use the string "None" instead
    train_indices = train_indices if train_indices is not None else "None"
    full_result = result + (train_indices,)
    os.makedirs(os.path.dirname(self.result_path), exist_ok=True)
    with open(self.result_path, mode="w") as result_file:
        json.dump(full_result, result_file)
def main():
    """
    Main process.
    """
    args = parse_cli_args()
    config = TrainConfig()
    train_ds = Dataset(args.train_path)
    valid_ds = Dataset(args.valid_path)
    model = make_model()
    optimizer = getattr(optim, config.optimizer_name)(
        model.parameters(), lr=config.learning_rate)
    training = Training(
        train_ds,
        valid_ds,
        model,
        optimizer,
        config.batch_size,
        config.epochs,
    )
    training.train()
def pretrain_RNADE(self):
    print('Pre-training the RNADE')
    l2 = 2.
    rnade = RNADE(self.n_visible, self.n_hidden, self.n_components,
                  hidden_act=self.hidden_act, l2=l2)
    batch_size = 100
    num_examples = 100
    filename = 'pre_train_params.pickle'
    learning_rate = self.learning_rate_pretrain
    train_data = mocap_data.sample_train_seq(batch_size)
    for i in range(1, num_examples):
        train_data = numpy.vstack((train_data, mocap_data.sample_train_seq(batch_size)))
    numpy.random.shuffle(train_data)
    total_num = train_data.shape[0]
    train_frac = 0.8
    train_dataset = Dataset([train_data[0:int(train_frac * total_num)]], 100)
    valid_dataset = Dataset([train_data[int(train_frac * total_num):]], 100)
    optimiser = SGD_Optimiser(rnade.params, [rnade.v],
                              [rnade.cost, rnade.ll_cost, rnade.l2_cost],
                              momentum=True, patience=20,
                              clip_gradients=self.clip_gradients)
    optimiser.train(train_dataset, valid_set=valid_dataset,
                    learning_rate=learning_rate, num_epochs=5, save=True,
                    lr_update=True, update_type='linear', start=2,
                    output_folder=self.output_folder, filename=filename)
    self.plot_costs(optimiser, fig_title='Pretraining cost', filename='pretraining.png')
    print('Done pre-training.')

    # Load the best parameters found during pre-training into the model
    print('Loading best RNADE parameters')
    rnade = RNADE(self.n_visible, self.n_hidden, self.n_components,
                  hidden_act=self.hidden_act, l2=l2)
    rnade.load_model(self.output_folder, filename=filename)
    for param in rnade.params:
        value = param.get_value()
        self.model.params_dict[param.name].set_value(value)

    # Save pre-training costs to the results dict
    self.results['pretraining_train_costs'] = optimiser.train_costs
    self.results['pretraining_valid_costs'] = optimiser.valid_costs
def initialize(self, source, target, batch_size1, batch_size2, scale=32, shuffle_=False):
    transform = transforms.Compose([
        transforms.Resize(scale),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])
    dataset_source = Dataset(source['imgs'], source['labels'], transform=transform)
    dataset_target = Dataset(target['imgs'], target['labels'], transform=transform)
    data_loader_s = torch.utils.data.DataLoader(
        dataset_source,
        batch_size=batch_size1,
        shuffle=shuffle_,
        num_workers=4
    )
    data_loader_t = torch.utils.data.DataLoader(
        dataset_target,
        batch_size=batch_size2,
        shuffle=shuffle_,
        num_workers=4
    )
    self.dataset_s = dataset_source
    self.dataset_t = dataset_target
    self.paired_data = PairedData(data_loader_s, data_loader_t, float("inf"))
def save_data(train_df, val_df):
    train_f = Features({
        'answers': Sequence(feature={
            'text': Value(dtype='string', id=None),
            'answer_start': Value(dtype='int32', id=None)
        }, length=-1, id=None),
        'context': Value(dtype='string', id=None),
        'id': Value(dtype='string', id=None),
        'question': Value(dtype='string', id=None),
        'question_type': Value(dtype='int32', id=None)
    })

    train_datasets = DatasetDict({
        'train': Dataset.from_pandas(train_df, features=train_f),
        'validation': Dataset.from_pandas(val_df, features=train_f)
    })

    with open("../../data/question_type.pkl", "wb") as file:
        pickle.dump(train_datasets, file)
def test_dataset_with_image_feature_with_none():
    data = {"image": [None]}
    features = Features({"image": Image()})
    dset = Dataset.from_dict(data, features=features)
    item = dset[0]
    assert item.keys() == {"image"}
    assert item["image"] is None
    batch = dset[:1]
    assert len(batch) == 1
    assert batch.keys() == {"image"}
    assert isinstance(batch["image"], list) and all(item is None for item in batch["image"])
    column = dset["image"]
    assert len(column) == 1
    assert isinstance(column, list) and all(item is None for item in column)

    # nested tests
    data = {"images": [[None]]}
    features = Features({"images": Sequence(Image())})
    dset = Dataset.from_dict(data, features=features)
    item = dset[0]
    assert item.keys() == {"images"}
    assert all(i is None for i in item["images"])

    data = {"nested": [{"image": None}]}
    features = Features({"nested": {"image": Image()}})
    dset = Dataset.from_dict(data, features=features)
    item = dset[0]
    assert item.keys() == {"nested"}
    assert item["nested"].keys() == {"image"}
    assert item["nested"]["image"] is None
def process_data(self, dataset: Dataset, stage: Optional[str] = None) -> Dataset:
    convert_to_features = partial(
        self.convert_to_features,
        tokenizer=self.tokenizer,
        context_name=self.context_name,
        choices=self.choices,
        question_header_name=self.question_header_name,
        answer_column_name=self.answer_column_name,
        options_column_name=self.options_column_name,
        max_length=self.cfg.max_length,
        padding=self.cfg.padding,
    )
    dataset = dataset.map(
        convert_to_features,
        batched=True,
        num_proc=self.cfg.preprocessing_num_workers,
        load_from_cache_file=self.cfg.load_from_cache_file,
    )
    cols_to_keep = [
        x
        for x in ["input_ids", "attention_mask", "token_type_ids", "label", "idx"]
        if x in dataset["train"].features
    ]
    dataset.set_format(columns=cols_to_keep)
    return dataset
def process_data(self, dataset: Dataset, stage: Optional[str] = None) -> Dataset:
    column_names = dataset["train" if stage == "fit" else "validation"].column_names
    text_column_name = "text" if "text" in column_names else column_names[0]

    tokenize_function = partial(
        self.tokenize_function,
        tokenizer=self.tokenizer,
        text_column_name=text_column_name)

    dataset = dataset.map(
        tokenize_function,
        batched=True,
        num_proc=self.cfg.preprocessing_num_workers,
        remove_columns=column_names,
        load_from_cache_file=self.cfg.load_from_cache_file,
    )

    # Pass in our additional condition term when converting to features
    convert_to_features = partial(
        self.convert_to_features,
        block_size=self.effective_block_size,
        tokenized_condition_term=self.tokenized_condition_term,
    )

    dataset = dataset.map(
        convert_to_features,
        batched=True,
        num_proc=self.cfg.preprocessing_num_workers,
        load_from_cache_file=self.cfg.load_from_cache_file,
    )

    return dataset
def _preprocess_dataset(
    dataset_name, data, sentence_col, tokenizer, cache_dir="", short_test=False
):
    preprocess_function = dataset_preprocess.get(dataset_name, lambda x: x)
    data = concate(dataset_name, data, cache_dir)
    data = data.map(lambda x: {"input_text": preprocess_function(x[sentence_col])})
    data["train"] = data["train"].remove_columns(
        set(data["train"].features) - set(["input_text"])
    )

    logging.info("NP concatenate")
    if dataset_name == "air_dialogue":
        data["train"] = Dataset.from_dict(
            {"input_text": np.concatenate(data["train"]["input_text"]).ravel().tolist()}
        )

    if short_test:
        data["train"] = Dataset.from_dict(
            {"input_text": data["train"]["input_text"][:30]}
        )

    if dataset_name == "air_dialogue" or dataset_name == "yahoo_answers_topics":
        data["train"] = Dataset.from_dict(
            {"input_text": data["train"]["input_text"][:100000]}
        )
    elif dataset_name == "wikipedia" or dataset_name == "yelp_review_full":
        data["train"] = Dataset.from_dict(
            {"input_text": data["train"]["input_text"][:200000]}
        )

    if dataset_name in split_para:
        logging.info("Splitting paragraphs")
        data["train"] = Dataset.from_dict(
            {"input_text": split_long_text(data["train"]["input_text"])}
        )

    logging.info("Normalize")
    data = data.map(lambda x: {"input_text": normalize_raw(x["input_text"])})

    logging.info("Keep sentence")
    data = data.filter(lambda x: keep_sentence(x["input_text"]))

    filename = f"{dataset_name}-full-text.out"
    logging.info(f"Opening file {filename} to write results")
    with open(filename, "w") as outfile:
        outfile.write("FULL TEXT BELOW\n")
        for i, text in enumerate(data["train"]["input_text"]):
            outfile.write(f"{i} | {text}\n")

    logging.info("Join")
    data = data.map(lambda x: {"input_text": " ".join(x["input_text"])})

    logging.info("Tokenizer")
    data = data.map(
        lambda x: tokenizer(x["input_text"], padding="max_length", truncation=True),
        batched=True,
    )
    return data
def process(self, dataset: datasets.Dataset):
    dataset = dataset.filter(self._filter_cmrc_data)
    if self.task == 'hl_ag':
        dataset = dataset.filter(self._filter_task_hl)
    else:
        dataset = dataset.filter(self._filter_task_qa)
    dataset = dataset.map(self._convert_to_features)
    return dataset
def remove_unused_columns(model: nn.Module, dataset: hf_datasets.Dataset) -> None:
    # Adapted from the same-named method in transformers' Trainer.
    # Inspect the model's forward signature to keep only the arguments it accepts.
    signature = inspect.signature(model.forward)
    signature_columns = list(signature.parameters.keys())
    # Labels may be named label or label_ids; the default data collator handles that.
    signature_columns += ["label", "label_ids"]
    columns = [k for k in signature_columns if k in dataset.column_names]
    dataset.set_format(type=dataset.format["type"], columns=columns)
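# Hypothetical usage (model and dataset names are assumptions): the call
# mutates the dataset's format in place, so indexing it afterwards yields
# only columns that model.forward accepts.
remove_unused_columns(model, train_dataset)
batch = train_dataset[:8]  # e.g. input_ids, attention_mask, labels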
def preprocess(ds: Dataset, **fn_kwargs) -> Dataset:
    ds = ds.map(
        # todo: change this to self.convert_to_features for users to override
        TextClassificationDataModule.convert_to_features,
        batched=True,
        with_indices=True,
        fn_kwargs=fn_kwargs,
    )
    ds = ds.rename_column("label", "labels")
    return ds
def train(self, training_data, target_station, verbose=1):
    print(self.session_name)
    # print(training_data, target_station, self.vol_size, self.strides)
    dataset = Dataset(self._builder_class, training_data, target_station,
                      self.vol_size, self.strides)
    (X_train, Y_train) = dataset.construct()
    model = self.model(summary=False)
    model.fit(X_train, Y_train,
              epochs=self.epochs,
              batch_size=self.batch_size,
              callbacks=self.callbacks(),
              validation_split=0.05,
              verbose=verbose,
              shuffle=True)
def geocode():
    """
    Geocode Basol addresses
    """
    # Input dataset
    basol_filtered = Dataset("etl", "basol_filtered")

    # Output dataset
    basol_geocoded = Dataset("etl", "basol_geocoded")

    # Write output schema
    dtype = basol_filtered.read_dtype(primary_key="numerobasol")
    output_dtype = [
        Column("id", BigInteger(), primary_key=True, autoincrement=True),
        *dtype,
        Column("geocoded_latitude", Float(precision=10)),
        Column("geocoded_longitude", Float(precision=10)),
        Column("geocoded_result_score", Float()),
        Column("geocoded_result_type", String()),
        Column("adresse_id", String())
    ]
    basol_geocoded.write_dtype(output_dtype)

    with basol_geocoded.get_writer() as writer:
        for df in basol_filtered.get_dataframes(chunksize=100):
            df = df.replace({np.nan: None})
            rows = df.to_dict(orient="records")
            payload = [{
                "adresse": row["adresse"],
                "code_insee": row["code_insee"]
            } for row in rows]

            geocoded = bulk_geocode(payload, columns=["adresse"], citycode="code_insee")

            zipped = list(zip(rows, geocoded))

            for (row, geocodage) in zipped:
                latitude = geocodage["latitude"]
                row["geocoded_latitude"] = float(latitude) if latitude else None

                longitude = geocodage["longitude"]
                row["geocoded_longitude"] = float(longitude) if longitude else None

                result_score = geocodage["result_score"]
                row["geocoded_result_score"] = float(result_score) if result_score else None

                row["geocoded_result_type"] = geocodage["result_type"]

                # Keep the address id only when the geocoding result is
                # precise to the house number with a good enough score
                if row["geocoded_result_type"] == precisions.HOUSENUMBER and \
                        row["geocoded_result_score"] > 0.6:
                    row["adresse_id"] = geocodage["result_id"]
                else:
                    row["adresse_id"] = None

                writer.write_row_dict(row)
def filter_departements():
    """
    Filter the data to keep only the records located in the
    departements selected in the config
    """
    # Input dataset
    sis_source = Dataset("etl", "sis_source")

    # Output dataset
    sis_filtered = Dataset("etl", "sis_filtered")

    sis_filtered.write_dtype(sis_source.read_dtype())

    with sis_filtered.get_writer() as writer:
        for row in sis_source.iter_rows():
            code_insee = row["code_insee"]
            keep_row = any(
                code_insee.startswith(departement)
                for departement in DEPARTEMENTS)
            if keep_row:
                writer.write_row_dict(row)
def prepare_sites():
    """
    This recipe adds a primary key and keeps only selected columns
    """
    # Input dataset
    basias_sites_filtered = Dataset("etl", "basias_sites_filtered")

    # Output dataset
    basias_sites_prepared = Dataset("etl", "basias_sites_prepared")

    # Columns to keep
    keep = ["indice_departemental", "nom_usuel", "raison_sociale"]

    dtype = basias_sites_filtered.read_dtype()

    # Transform schema
    output_dtype = [column for column in dtype if column.name in keep]
    id_column = Column("id", BigInteger, primary_key=True, autoincrement=True)
    output_dtype = [id_column, *output_dtype]
    basias_sites_prepared.write_dtype(output_dtype)

    # Transform data
    with basias_sites_prepared.get_writer() as writer:
        for row in basias_sites_filtered.iter_rows():
            output_row = dict((key, row[key]) for key in row if key in keep)
            writer.write_row_dict(output_row)
def normalize_precision():
    """
    This recipe normalizes the values of the lib_precis column
    into the PARCEL, HOUSENUMBER, MUNICIPALITY nomenclature
    """
    # Input dataset
    s3ic_geocoded = Dataset("etl", "s3ic_geocoded")

    # Output dataset
    s3ic_normalized = Dataset("etl", "s3ic_normalized")

    dtype = s3ic_geocoded.read_dtype()
    s3ic_normalized.write_dtype(dtype)

    mapping = {
        "Coordonnées précises": precisions.PARCEL,
        "Coordonnée précise": precisions.PARCEL,
        "Valeur Initiale": precisions.PARCEL,
        "Adresse postale": precisions.HOUSENUMBER,
        "Centroïde Commune": precisions.MUNICIPALITY,
        "Inconnu": precisions.MUNICIPALITY
    }

    with s3ic_normalized.get_writer() as writer:
        for row in s3ic_geocoded.iter_rows():
            precision = row.get("precision")
            if precision:
                row["precision"] = mapping.get(precision)
            else:
                row["precision"] = precisions.MUNICIPALITY
            writer.write_row_dict(row)
def merge_cadastre():
    """
    Merge the different parcelles into a MultiPolygon
    """
    # Input dataset
    basol_cadastre_joined = Dataset("etl", "basol_cadastre_joined")

    # Output dataset
    basol_cadastre_merged = Dataset("etl", "basol_cadastre_merged")

    dtype = [
        Column("id", BigInteger, primary_key=True, autoincrement=True),
        Column("numerobasol", String),
        Column("geog", Geometry(srid=4326))
    ]
    basol_cadastre_merged.write_dtype(dtype)

    BasolCadastreJoined = basol_cadastre_joined.reflect()
    session = basol_cadastre_joined.get_session()

    select = [
        BasolCadastreJoined.numerobasol,
        func.st_multi(func.st_union(BasolCadastreJoined.geog))
    ]

    q = session.query(*select) \
        .group_by(BasolCadastreJoined.numerobasol) \
        .all()

    with basol_cadastre_merged.get_writer() as writer:
        for (numerobasol, geog) in q:
            row = {"numerobasol": numerobasol, "geog": geog}
            writer.write_row_dict(row)

    session.close()
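# For reference, the query built above corresponds roughly to this SQL
# (table name assumed from the Dataset("etl", "basol_cadastre_joined") call):
# SELECT numerobasol, ST_Multi(ST_Union(geog))
# FROM etl.basol_cadastre_joined
# GROUP BY numerobasol;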
def prepare_solution():
    train = Dataset.from_train()
    X = train.get_features()
    Y = train.get_labels()
    rf = RandomForestRegressor(n_jobs=-1)
    model = rf.fit(X, Y)
    print('Train score: %f' % loss(Y, model.predict(X)))
    test = Dataset.from_test()
    X2 = test.get_features()
    Y2 = model.predict(X2)
    save_predictions(Y2, test)
def predict(self, seq):
    result = {}
    self.predict_data_ = seq
    for label, value in self.gmmhmms.items():
        gmmhmm = value['gmmhmm']
        status_set = value['status_set']
        d = Dataset(motion_type=status_set['motion'],
                    sound_type=status_set['sound'],
                    location_type=status_set['location'])
        seq_converted = np.array(d._convetNumericalSequence(seq))
        result[label] = gmmhmm.score(seq_converted)
    return result
def classifyByGMMHMM(seq, models, configs):
    Y = []
    for config in configs:
        _rawdata_type = config["logType"]
        _event_type = config["eventType"]
        _motion_type = config["motionType"]
        _sound_type = config["soundType"]
        _location_type = config["locationType"]
        d = Dataset(
            rawdata_type=_rawdata_type,
            event_type=_event_type,
            motion_type=_motion_type,
            sound_type=_sound_type,
            location_type=_location_type
        )
        # Initialize the data that needs prediction.
        y = np.array(d._convetNumericalSequence(seq))
        Y.append(y)

    _GMMHMMs = []
    for model in models:
        _GMMs = []
        for gmm in model["gmmParams"]["params"]:
            _GMM = GMM(
                n_components=model["nMix"],
                covariance_type=model["covarianceType"]
            )
            _GMM.covars_ = np.array(gmm["covars"])
            _GMM.means_ = np.array(gmm["means"])
            _GMM.weights_ = np.array(gmm["weights"])
            _GMMs.append(_GMM)
        _GMMHMM = GMMHMM(
            n_components=model["nComponent"],
            n_mix=model["nMix"],
            startprob=np.array(model["hmmParams"]["startProb"]),
            transmat=np.array(model["hmmParams"]["transMat"]),
            gmms=_GMMs,
            covariance_type=model["covarianceType"]
        )
        _GMMHMMs.append(_GMMHMM)

    # Score each converted sequence against its corresponding model
    results = []
    for i in range(len(models)):
        res = _GMMHMMs[i].score(Y[i])
        results.append(res)
    return results
def submission():
    print('Cross validate K-Means model')
    train = Dataset.from_train()
    test = Dataset.from_test()
    X = train.get_features()
    Y = train.get_labels()
    X2 = test.get_features()
    kmeans = KMeans(n_clusters=8)
    clf = kmeans.fit(X, train.get_multi_labels())
    score = check_score(Y, to_labels(clf.predict(X)))
    print("Train dataset score %f" % (score / len(X)))
    Y2 = to_labels(clf.predict(X2))
    save_predictions(Y2, test.df)
def submission():
    print('Cross validate Bayes model')
    train = Dataset.from_train()
    test = Dataset.from_test()
    X = train.get_features()
    Y = train.get_labels()
    X2 = test.get_features()
    gnb = bayes.MultinomialNB()
    clf = gnb.fit(X, train.get_multi_labels())
    score = check_score(Y, to_labels(clf.predict(X)))
    print("Train dataset score %f" % (score / len(X)))
    Y2 = to_labels(clf.predict(X2))
    save_predictions(Y2, test.df)
def main():
    print('Explore dataset')
    train = Dataset.from_train()
    u = train.pca()
    print('U shape: ' + str(u.shape))
    X = train.get_pca_features(u)
    print(X.shape)
def make_submission(network, params, u):
    print('Prepare submission')
    test = Dataset.from_test()
    if params.pca:
        X2 = test.get_pca_features(u)
    else:
        X2 = test.get_features()
    predictions = network.predict(X2)
    save_predictions(predictions, test.df)
def train_nn(restore):
    print('Training neural net')
    encoder = AutoEncoder()
    encoder.restore_session()
    train_data = Dataset.from_train()
    X = encoder.encode(train_data.get_features())
    y = train_data.get_labels()
    nn = NeuralNet()
    nn.fit(X, y)
def prepare_submission(params):
    print('Prepare submission with params')
    print(params)
    network = NeuralNetwork(params)
    train = Dataset.from_train()
    u = train.pca()
    if params.pca:
        X = train.get_pca_features(u)
    else:
        X = train.get_features()
    Y = train.get_labels()
    network.fit(X, Y)
    score = network.check_score(X, Y)
    print('Train dataset score %f' % (score / len(X)))
    make_submission(network, params, u)
trans_mat_prior = np.array([[0.2, 0.1, 0.3, 0.4],
                            [0.3, 0.2, 0.2, 0.3],
                            [0.1, 0.1, 0.1, 0.7],
                            [0.1, 0.3, 0.4, 0.2]])

# Build one GMMHMM instance per activity and set its parameters
model_dining = GMMHMM(startprob_prior=start_prob_prior,
                      transmat_prior=trans_mat_prior,
                      startprob=start_prob,
                      transmat=trans_mat,
                      n_components=4,
                      n_mix=4,
                      covariance_type='spherical',
                      n_iter=50)
model_fitness = GMMHMM(startprob_prior=start_prob_prior,
                       transmat_prior=trans_mat_prior,
                       startprob=start_prob,
                       transmat=trans_mat,
                       n_components=4,
                       n_mix=10,
                       covariance_type='spherical',
                       n_iter=50)
model_work = GMMHMM(startprob_prior=start_prob_prior,
                    transmat_prior=trans_mat_prior,
                    startprob=start_prob,
                    transmat=trans_mat,
                    n_components=4,
                    n_mix=8,
                    covariance_type='spherical',
                    n_iter=50)
model_shop = GMMHMM(startprob_prior=start_prob_prior,
                    transmat_prior=trans_mat_prior,
                    startprob=start_prob,
                    transmat=trans_mat,
                    n_components=4,
                    n_mix=4,
                    covariance_type='spherical',
                    n_iter=50)

# Generate random observation sequences for each activity
dataset_dining = Dataset()
dataset_fitness = Dataset()
dataset_work = Dataset()
dataset_shop = Dataset()

D = dataset_dining.randomObservations('dining.chineseRestaurant', 10, 300).getDataset()
F = dataset_fitness.randomObservations('fitness.running', 10, 300).getDataset()
W = dataset_work.randomObservations('work.office', 10, 300).getDataset()
S = dataset_shop.randomObservations('shopping.mall', 10, 300).getDataset()
            'transMat': self.gmmhmm.transmat_.tolist(),
            'transMatPrior': self.gmmhmm.transmat_prior.tolist(),
            'startProb': self.gmmhmm.startprob_.tolist(),
            'startProbPrior': self.gmmhmm.startprob_prior.tolist(),
        },
        'gmmParams': {
            'nMix': self.gmmhmm.n_mix,
            'covarianceType': self.gmmhmm.covariance_type,
            'gmms': gmms_,
        }
    }


if __name__ == '__main__':
    from datasets import Dataset

    d = Dataset()
    d.randomObservations("dining#chineseRestaurant", 10, 10)
    _model = {
        "hmmParams": {
            "transMat": [
                [0.2, 0.1, 0.3, 0.4],
                [0.3, 0.2, 0.2,
def train_auto_encoder(restore):
    print('Training auto encoder')
    network = AutoEncoder()
    train_data = Dataset.from_train()
    test_data = Dataset.from_test()
    network.fit_encoder(train_data, test_data, restore=restore)