Example #1
def main():
    args = get_parser().parse_args()

    config = Config.from_file(args.config)

    logger = get_logger(config.output_path)
    logger.info(args)
    logger.info("=> Starting evaluation ...")

    logger.info("Load data")
    corpus = io.load_json(config.input_path, append_title=config.use_title)

    logger.info("Perform preprocessing")
    preprocessed_corpus = Preprocessing(
        corpus["keywords"],
        config=config.preprocessing,
        datatype="keywords",
        logger=logger,
    ).apply_preprocessing()

    preprocessed_corpus["token"] = preprocessed_corpus["token"].apply(flatten)
    preprocessed_corpus.drop("abstract", axis=1, inplace=True)

    logger.info("Start clustering")
    clustering = Clustering(
        preprocessed_corpus,
        clustering_config=config.clustering,
        dim_reduction_config=config.dim_reduction,
        logger=logger,
    )
    model = clustering.perform_clustering()

    logger.info(f"Save results to {config.output_path}")
    corpus["label"] = model.labels_
    io.write_json(config.input_path.split(".")[0] + "_labeled.json", corpus)
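For reference, these are the config fields main() actually reads; a minimal sketch written as a plain dict with placeholder values (the real format consumed by Config.from_file is not shown in this example):

example_config = {
    "input_path": "corpus.json",   # placeholder path passed to io.load_json
    "use_title": True,             # forwarded as append_title
    "output_path": "runs/eval",    # used for the logger output
    "preprocessing": {},           # options handed to Preprocessing(...)
    "clustering": {},              # options handed to Clustering(...)
    "dim_reduction": {},           # options handed to Clustering(...)
}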
Example #2
    def __init__(self, root, dataset_partitions, past_frames, future_frames, input_dim, output_dim,
                 blur_radius=3, buffer_memory=1e2, buffer_size=1e3, batch_size=16, caching=True, downsampling=False, dynamicity=1e-3):
        '''
            Data Generator
            Inputs:

                - Path containing folders of frames
                - List of the names of these folders
                - Partitions: [(ids_x(x, 10), ids_y(x, 4))]
        '''

        self.input_dim = input_dim
        self.output_dim = output_dim

        self.dataset_partitions = dataset_partitions
        self.batch_size = np.min([len(x[1]) for x in self.dataset_partitions])  # minimum number of sequences per area

        self.past_frames = past_frames
        self.future_frames = future_frames
        self.caching = caching

        self.batch_size = batch_size
        self.blurry_filter_size = (blur_radius, blur_radius)
        self.downsampling_factor = 4
        self.downsampling = downsampling

        self.root = root

        self.buffer = []
        self.buffer_size = buffer_size
        self.buffer_memory = buffer_memory
        self.buffer_hit_ratio = 0

        self.preprocessing = Preprocessing()
        self.dynamicity = dynamicity
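A minimal construction sketch for this generator, assuming the enclosing class is the DataGenerator shown in the later examples; the root path, area folder names, sequence ids and shapes below are placeholders, not values from the original project:

partitions = [("area_0", ["id-0001", "id-0002"]),  # (area folder, sequence ids)
              ("area_1", ["id-0001"])]
gen = DataGenerator(root="./frames/",              # placeholder dataset root
                    dataset_partitions=partitions,
                    past_frames=4, future_frames=4,
                    input_dim=(4, 256, 256, 4),    # placeholder shapes
                    output_dim=(4, 256, 256, 3),
                    blur_radius=3, caching=True, downsampling=False,
                    dynamicity=1e-3)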
Example #3
def main():
    dataset = Dataset(nrows=constants.Dataset.nrows,
                      augment_labels=constants.Dataset.augment_labels,
                      top_n=constants.Dataset.top_n)
    analyzer = Preprocessing(dataset.X_train) \
        .featurizer.build_analyzer()

    docs = [analyzer(doc) for doc in dataset.X_train]
    create_word_to_vec_embeddings(docs)
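create_word_to_vec_embeddings is not shown here; as a rough sketch of what such a step could look like, this trains word2vec on the tokenised documents with gensim (the library choice and every parameter value are assumptions, not the project's actual implementation):

from gensim.models import Word2Vec

def train_word2vec(docs, vector_size=100, window=5, min_count=2):
    # docs is a list of token lists, exactly what the analyzer above produces
    model = Word2Vec(sentences=docs, vector_size=vector_size,
                     window=window, min_count=min_count, workers=4)
    model.save("word2vec.model")  # placeholder output path
    return model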
Example #4
def preprocessing(app_config):
    logger.info("Start preprocessing files")
    with open(app_config["app"]["preprocessing"]["preprocess_config"],
              "r",
              encoding="utf-8") as preprocess_param_file:
        preprocess_config = yaml.safe_load(preprocess_param_file)

    output_dir = app_config["app"]["preprocessing"]["output_dir"]
    input_dir = app_config["app"]["preprocessing"]["input_dir"]

    file_pattern = app_config["app"]["preprocessing"]["input_file_pattern"]
    output_file_prefix = app_config["app"]["preprocessing"][
        "output_file_prefix"]

    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    for root, dirs, files in os.walk(input_dir):
        for f in files:
            if file_pattern in f:
                logger.info("预处理文件%s" % f)
                prep = Preprocessing(data_frame=pd.read_csv(
                    os.path.join(root, f)),
                                     config=preprocess_config)
                df, seeds = prep.preprocessing()
                if df.shape[0] > 0:
                    df.to_csv(os.path.join(output_dir, output_file_prefix + f),
                              index=False,
                              header=app_config["app"]["preprocessing"]
                              ["keep_header"])

                    if app_config["app"]["preprocessing"]["generate_seed"]:
                        logger.info("导出种子文件%s" % f[:-4] + ".yml")
                        output_seed_file_prefix = app_config["app"][
                            "preprocessing"]["output_seed_file_prefix"]
                        with open(os.path.join(
                                output_dir,
                                output_seed_file_prefix + f[:-4] + ".yml"),
                                  "w",
                                  encoding="utf-8") as sf:
                            yaml.dump(seeds, sf)

    logger.info("预处理完成")
Example #5
def get_vectorizer(current_params):
    f = lambda x: Preprocessing.preprocessing(x, current_params)
    Vectorizer = current_params.get("Vectorizer", CountVectorizer)
    vectorizer = Vectorizer(
        preprocessor=f,
        lowercase=False,
        token_pattern=Preprocessing.token_pattern,
        binary=current_params.get("binary", False),
        max_df=current_params.get("max_df", 1.),
        min_df=current_params.get("min_df", 1),
        ngram_range=current_params.get("ngram_range", (1, 1)),
        max_features=current_params.get("max_features", None))
    return vectorizer
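A standalone sketch of the same defaults-from-params pattern; a trivial str.lower stands in for Preprocessing.preprocessing and the default token pattern is used in place of Preprocessing.token_pattern, so the snippet runs on its own:

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

def build_vectorizer(current_params):
    preprocess = current_params.get("preprocessor", str.lower)  # placeholder preprocessor
    Vectorizer = current_params.get("Vectorizer", CountVectorizer)
    return Vectorizer(preprocessor=preprocess,
                      lowercase=False,
                      binary=current_params.get("binary", False),
                      max_df=current_params.get("max_df", 1.0),
                      min_df=current_params.get("min_df", 1),
                      ngram_range=current_params.get("ngram_range", (1, 1)),
                      max_features=current_params.get("max_features", None))

vec = build_vectorizer({"Vectorizer": TfidfVectorizer, "ngram_range": (1, 2)})
X = vec.fit_transform(["a first toy document", "a second toy document"])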
Example #6
class DataGenerator():
    def __init__(self,
                 root,
                 dataset_partitions,
                 past_frames,
                 future_frames,
                 input_dim,
                 output_dim,
                 dynamicity,
                 filtering=True,
                 buffer_memory=1e2,
                 buffer_size=1e3,
                 batch_size=16,
                 caching=True,
                 downsampling=False):
        '''
            Data Generator
            Inputs:

                - Path containing folders of frames
                - List of the names of these folders
                - Partitions: [(ids_x(x, 10), ids_y(x, 4))]
        '''

        self.input_dim = input_dim
        self.output_dim = output_dim

        self.dataset_partitions = dataset_partitions
        self.batch_size = np.min([len(x[1]) for x in self.dataset_partitions])  # minimum number of sequences per area

        self.past_frames = past_frames
        self.future_frames = future_frames
        self.caching = caching

        self.batch_size = batch_size
        self.blurry_filter_size = (3, 3)
        self.downsampling_factor = 4
        self.downsampling = downsampling

        self.root = root

        self.buffer = []
        self.buffer_size = buffer_size
        self.buffer_memory = buffer_memory
        self.buffer_hit_ratio = 0

        self.preprocessing = Preprocessing()
        self.dynamicity = dynamicity
        self.filtering = filtering

    def get_data(self):
        'Generates batches of datapoints'
        X, Y = self.__data_generation()  # seq, t, h, w, c
        return X, Y

    def __data_generation(self):
        'Generates the raw sequence of datapoints (filtered)'

        # stats
        accesses = 0
        hits = 0

        # Initialization
        X = None
        Y = None

        print("[x] {} areas found".format(len(self.dataset_partitions)))

        # For each area
        for area_index, area in enumerate(self.dataset_partitions):
            # For each sequence
            loaded = 0

            print("Area {} - sequences: {}\n".format(area_index, len(area[1])),
                  end="",
                  flush=True)
            for i, sequence in enumerate(area[1]):

                # --- BTM
                btm_filenames = [
                    x
                    for x in os.listdir(self.root +
                                        self.dataset_partitions[area_index][0])
                    if x.endswith(".BTM")
                ]
                if len(btm_filenames) == 0:
                    raise Exception("No BTM map found for the area {}".format(
                        self.dataset_partitions[area_index][0]))
                btm = iter_loadtxt(
                    self.root + self.dataset_partitions[area_index][0] + "/" +
                    btm_filenames[0],
                    delimiter=" ")

                # --- Outliers
                btm[np.isnan(btm)] = 0
                btm[btm > 10e5] = 0

                # --- Preprocessing
                if self.downsampling:
                    btm = cv.GaussianBlur(btm, self.blurry_filter_size, 0)
                    btm = cv.pyrDown(btm)
                    btm = cv.pyrDown(btm)

                # reduce values by subtracting the minimum
                min_btm = np.min(btm)
                btm = btm - min_btm

                btm.resize(btm.shape[0], btm.shape[1], 1)
                btm_x = np.tile(btm, (self.past_frames, 1, 1, 1))

                deps = None
                vvx_s = None
                vvy_s = None

                framestart = int(sequence.replace("id-", ""))

                # Starts from the right frame
                for k in range(
                        framestart,
                        framestart + self.past_frames + self.future_frames):

                    # id area -> id frame
                    gid = "{}-{}-{}".format(area_index, sequence, k)

                    # Parameters
                    extensions = ["DEP", "VVX", "VVY"]
                    matrices = []

                    # Gets datapoint filename
                    dep_filenames = [
                        x for x in os.listdir(
                            self.root + self.dataset_partitions[area_index][0])
                        if x.endswith(".DEP")
                    ]

                    if len(dep_filenames) == 0:
                        raise Exception(
                            "No DEP maps found for the area {}".format(
                                self.dataset_partitions[area_index][0]))

                    # asserting that all maps are named with the same prefix
                    dep_filename = dep_filenames[0].split(".")[0][:-4]

                    # 1 frame -> 3 matrices (3 extensions)
                    for i, ext in enumerate(extensions):
                        accesses += 1
                        global_id = "{}-{}".format(i, gid)  # global linearized index

                        # ----- Cache
                        if self.caching:
                            cache_frame = self.buffer_lookup(global_id)
                            if cache_frame is False:
                                frame = iter_loadtxt(
                                    self.root +
                                    self.dataset_partitions[area_index][0] +
                                    "/{}{:04d}.{}".format(
                                        dep_filename, k, ext),
                                    delimiter=" ")
                                self.buffer_push(global_id, frame)
                            else:
                                frame = cache_frame
                                hits += 1

                        # ----- No cache
                        else:
                            frame = iter_loadtxt(
                                self.root +
                                self.dataset_partitions[area_index][0] +
                                "/{}{:04d}.{}".format(dep_filename, k, ext),
                                delimiter=" ")

                        # --- Outliers
                        frame[np.isnan(frame)] = 0
                        frame[frame > 10e5] = 0

                        # --- On-spot Gaussian Blurring
                        if self.downsampling:
                            frame = cv.GaussianBlur(frame,
                                                    self.blurry_filter_size, 0)
                            frame = cv.pyrDown(frame)
                            frame = cv.pyrDown(frame)

                        matrices.append(frame)

                    frame, vvx, vvy = matrices

                    # ---

                    if deps is None:
                        deps = np.array([frame])
                    else:
                        deps = np.concatenate((deps, np.array([frame])))

                    if vvx_s is None:
                        vvx_s = np.array([vvx])
                    else:
                        vvx_s = np.concatenate((vvx_s, np.array([vvx])))

                    if vvy_s is None:
                        vvy_s = np.array([vvy])
                    else:
                        vvy_s = np.concatenate((vvy_s, np.array([vvy])))

                # ---------

                deps[deps > 10e5] = 0
                vvx_s[vvx_s > 10e5] = 0
                vvy_s[vvy_s > 10e5] = 0
                btm_x[btm_x > 10e5] = 0

                # --- X
                x_dep = deps[:self.past_frames]
                x_dep.resize(
                    (x_dep.shape[0], x_dep.shape[1], x_dep.shape[2], 1))

                x_vx = vvx_s[:self.past_frames]
                x_vx.resize((x_vx.shape[0], x_vx.shape[1], x_vx.shape[2], 1))

                x_vy = vvy_s[:self.past_frames]
                x_vy.resize((x_vy.shape[0], x_vy.shape[1], x_vy.shape[2], 1))

                x = np.concatenate((x_dep, x_vx, x_vy, btm_x), axis=3)

                # --- Y
                y_dep = deps[self.past_frames:]
                y_dep.resize(
                    (y_dep.shape[0], y_dep.shape[1], y_dep.shape[2], 1))

                y_vx = vvx_s[self.past_frames:]
                y_vx.resize((y_vx.shape[0], y_vx.shape[1], y_vx.shape[2], 1))

                y_vy = vvy_s[self.past_frames:]
                y_vy.resize((y_vy.shape[0], y_vy.shape[1], y_vy.shape[2], 1))

                y = np.concatenate((y_dep, y_vx, y_vy), axis=3)

                # filtering
                if self.filtering:
                    valid = self.preprocessing.eval_datapoint(
                        x[:, :, :, :3], y, self.dynamicity)
                else:
                    valid = True

                if valid:
                    loaded += 1

                    if X is None:
                        X = np.expand_dims(x, 0)
                    else:
                        X = np.concatenate((X, np.expand_dims(x, 0)))

                    if Y is None:
                        Y = np.expand_dims(y, 0)
                    else:
                        Y = np.concatenate((Y, np.expand_dims(y, 0)))

                    print("x ", end="", flush=True)
                else:
                    print("- ", end="", flush=True)

            print("\n[{}%] {} valid sequences loaded".format(
                round((area_index + 1) / len(self.dataset_partitions) * 100),
                loaded))

        # Buffer ratio calculation
        if accesses != 0:
            self.buffer_hit_ratio = self.buffer_hit_ratio * 0.5 + 0.5 * (
                hits / accesses)

        return X, Y

    # ------------------------------------

    def buffer_lookup(self, k):
        ''' Get sequence (datapoint) from cache given the start frame global id '''

        if self.caching:
            for i, x in enumerate(self.buffer):
                # Returns found record
                if x["global_id"] == k:
                    self.buffer[i]["fresh"] += 1
                    return x["value"]

                # Set any read record to 0 (second chance)
                elif self.buffer[i]["fresh"] != 0:
                    self.buffer[i]["fresh"] -= 1

        return False

    def buffer_push(self, k, x):
        ''' Add sequence (datapoint) to cache with start frame global id '''

        if self.caching:
            # Makes space
            if len(self.buffer) >= self.buffer_size:
                # evict stale records (fresh == 0) without mutating the list mid-iteration
                self.buffer = [j for j in self.buffer if j["fresh"] != 0]
            # Push
            self.buffer.append({
                'fresh': self.buffer_memory,
                'global_id': k,
                'value': x
            })
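The buffer above is a small second-chance cache keyed by a global frame id: a hit refreshes a record, every record skipped on the way to a hit (or during a miss) is aged, and eviction drops records whose freshness has reached zero. A self-contained sketch of that policy, independent of the generator:

class SecondChanceCache:
    ''' Minimal stand-in for the buffer_lookup/buffer_push pair above '''

    def __init__(self, size=1000, memory=100):
        self.size, self.memory, self.records = size, memory, []

    def lookup(self, key):
        for rec in self.records:
            if rec["global_id"] == key:
                rec["fresh"] += 1        # reward the hit
                return rec["value"]
            if rec["fresh"] != 0:
                rec["fresh"] -= 1        # age records passed over
        return False

    def push(self, key, value):
        if len(self.records) >= self.size:
            # evict stale records before appending
            self.records = [r for r in self.records if r["fresh"] != 0]
        self.records.append({"global_id": key, "fresh": self.memory, "value": value})

cache = SecondChanceCache(size=4, memory=2)
cache.push("0-id-0001-0", "frame")
assert cache.lookup("0-id-0001-0") == "frame"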
Example #7
def run():
    parser = get_arg_parser()
    cmd_args = parser.parse_args()

    if cmd_args.gpu is not None:
        os.environ['CUDA_VISIBLE_DEVICES'] = str(cmd_args.gpu)
        gpunum = os.getenv('CUDA_VISIBLE_DEVICES')
        logging.info("GPU has been set to {}".format(gpunum))

    logging.info("Model used for the regression network: {}"
                 .format(cmd_args.model_name))

    # 1. Dataset retrieval
    # --------------------

    tab_printer(constants.Dataset)
    dataset = Dataset(nrows=constants.Dataset.nrows,
                      augment_labels=constants.Dataset.augment_labels,
                      top_n=constants.Dataset.top_n)

    logging.info("Going to create vocabulary and fit a preprocessing pipeline"
                 "using {} samples. Settings will be listed below"
                 .format(len(dataset.X_train)))

    # 2. Preprocessing
    # -----------------

    tab_printer(constants.NLP)
    preprocessor = Preprocessing(dataset.X_train)

    # Preprocess documents
    X_train = preprocessor.transform_documents(dataset.X_train)
    X_test = preprocessor.transform_documents(dataset.X_test)

    # 3. Word embeddings with word2vec
    # --------------------------------

    # Train word2vec embeddings if train_word2vec option is selected
    if cmd_args.train_word2vec: utils.embeddings.main()
    weights = get_embedding_tensor(preprocessor)

    # 4. Node embeddings with AttentionWalk
    # -------------------------------------
    args = _generate_deepwalk_parameters(dataset.y_train_graph)
    if cmd_args.train_attentionwalk: train_attention_walk(args)

    graph_embeddings = pd.read_csv(args.embedding_path).iloc[:, 1:].values

    # Get document representations using node embeddings
    y_embedded = _get_label_embeddings(dataset.y_train, graph_embeddings)
    y_test_embedded = _get_label_embeddings(dataset.y_test, graph_embeddings)

    # 5. Regressor Training
    # ---------------------

    device = 'cuda:' + str(os.getenv("CUDA_VISIBLE_DEVICES")) \
        if torch.cuda.is_available() else 'cpu'

    regressor_nn = NeuralNet(
        get_network_class(cmd_args.model_name),
        max_epochs=constants.NeuralNetworkTraining.epochs,
        lr=constants.NeuralNetworkTraining.learning_rate,
        batch_size=constants.NeuralNetworkTraining.batch_size,
        optimizer=torch.optim.Adam,
        criterion=torch.nn.MSELoss,

        module__output_dim=args.dimensions,
        module__embedding=weights,
        module__embedding_dim=constants.NLP.embedding_size,

        device=device,
        train_split=None,
    )

    # Train the regressor neural network
    regressor_nn.fit(X_train, y_embedded.astype(np.float32))

    # 6. Train Multi-label KNN algorithm
    # ----------------------------------

    tab_printer(constants.MLKNN)

    # Train multi-label KNN to turn label embeddings into label predictions
    classifier = MLkNN(k=constants.MLKNN.k, s=constants.MLKNN.s)
    classifier.fit(y_embedded, dataset.y_train)

    # 7. Evaluation
    # -------------

    # Label prediction with documents
    y_test_pred = regressor_nn.predict(X_test)
    preds = classifier.predict(y_test_pred)
    preds_raw = classifier.predict_proba(y_test_pred)

    # Label prediction with label embeddings
    preds_w_labels = classifier.predict(y_test_embedded)
    preds_w_labels_raw = classifier.predict_proba(y_test_embedded)

    # Log evaluation result with label embeddings
    eval_metrics_w_labels = evaluation \
        .all_metrics(preds_w_labels.toarray(),
                     dataset.y_test,
                     yhat_raw=preds_w_labels_raw.toarray())

    logging.info(str(eval_metrics_w_labels))

    # Log evaluation result with documents
    report_evaluation(preds.toarray(),
                      dataset.y_test,
                      yhat_raw=preds_raw.toarray())
Example #8
class DataGenerator():
    def __init__(self, root, dataset_partitions, past_frames, future_frames, input_dim, output_dim,
                 blur_radius=3, buffer_memory=1e2, buffer_size=1e3, batch_size=16, caching=True, downsampling=False, dynamicity=1e-3):
        '''
            Data Generator
            Inputs:

                - Path containing folders of frames
                - List of the names of these folders
                - Partitions: [(ids_x(x, 10), ids_y(x, 4))]
        '''

        self.input_dim = input_dim
        self.output_dim = output_dim

        self.dataset_partitions = dataset_partitions
        self.batch_size = np.min([len(x[1]) for x in self.dataset_partitions])  # minimum number of sequences per area

        self.past_frames = past_frames
        self.future_frames = future_frames
        self.caching = caching

        self.batch_size = batch_size
        self.blurry_filter_size = (blur_radius, blur_radius)
        self.downsampling_factor = 4
        self.downsampling = downsampling

        self.root = root

        self.buffer = []
        self.buffer_size = buffer_size
        self.buffer_memory = buffer_memory
        self.buffer_hit_ratio = 0

        self.preprocessing = Preprocessing()
        self.dynamicity = dynamicity


    def get_datapoint(self, area_index, sequence_index, check=True):  
        '''
            Generates a single datapoint on the fly (cached)
            Inputs:
                - index of the area
                - index of the sequence
                - flag to check sequence validity
            Outputs:
                - case 1: valid sequence        ->  (X, Y)
                - case 2: non-valid sequence    ->  None
        '''

        # Initialization
        X = None
        Y = None

        area = self.dataset_partitions[area_index]
        sequence = self.dataset_partitions[area_index][1][sequence_index]

        # --- BTM
        btm_filenames = [x for x in os.listdir(self.root + self.dataset_partitions[area_index][0]) if x.endswith(".BTM")]
        if len(btm_filenames) == 0:
            raise Exception("No BTM map found for the area {}".format(self.dataset_partitions[area_index][0]))
        btm = pd.read_csv(self.root + self.dataset_partitions[area_index][0] + "/" + btm_filenames[0], sep=" ", header=None).values

        # --- Outliers
        btm[np.isnan(btm)] = 0
        btm[btm > 10e5] = 0

        # --- Preprocessing
        if self.downsampling:
            btm = cv.GaussianBlur(btm, self.blurry_filter_size, 0)
            btm = cv.pyrDown(btm)
            btm = cv.pyrDown(btm)

        # reduce values by subtracting the minimum
        min_btm = np.min(btm)
        btm = btm - min_btm

        btm.resize(btm.shape[0], btm.shape[1], 1)
        btm_x = np.tile(btm, (self.past_frames, 1, 1, 1))

        deps = None
        vvx_s = None
        vvy_s = None

        framestart = int(sequence.replace("id-", ""))

        # Starts from the right frame
        for k in range(framestart, framestart + self.past_frames + self.future_frames):

            # id area -> id frame
            gid = "{}-{}-{}".format(area_index, sequence, k)

            # Parameters
            extensions = ["DEP", "VVX", "VVY"]
            matrices = []

            # Gets datapoint filename
            dep_filenames = [x for x in os.listdir(self.root + self.dataset_partitions[area_index][0]) if
                            x.endswith(".DEP")]

            if len(dep_filenames) == 0:
                raise Exception("No DEP maps found for the area {}".format(self.dataset_partitions[area_index][0]))

            # asserting that all maps are named with the same prefix
            dep_filename = dep_filenames[0].split(".")[0][:-4]

            # 1 frame -> 3 matrices (3 extensions)
            for i, ext in enumerate(extensions):
                
                global_id = "{}-{}".format(i, gid)  # global linearized index

                # ----- Cache
                if self.caching:
                    cache_frame = self.buffer_lookup(
                        global_id
                    )
                    if cache_frame is False:
                        frame = pd.read_csv(self.root + self.dataset_partitions[area_index][0] + "/{}{:04d}.{}".format(dep_filename, k, ext), sep=" ", header=None).values
                        self.buffer_push(global_id, frame)
                    else:
                        frame = cache_frame

                # ----- No cache
                else:
                    frame = pd.read_csv(self.root + self.dataset_partitions[area_index][0] + "/{}{:04d}.{}".format(dep_filename, k, ext), sep=" ", header=None).values

                # --- Outliers
                frame[np.isnan(frame)] = 0
                frame[frame > 10e5] = 0

                # --- On-spot Gaussian Blurring
                if self.downsampling:
                    frame = cv.GaussianBlur(frame, self.blurry_filter_size, 0)
                    frame = cv.pyrDown(frame)
                    frame = cv.pyrDown(frame)

                matrices.append(frame)

            frame, vvx, vvy = matrices

            # ---

            if deps is None:
                deps = np.array([frame])
            else:
                deps = np.concatenate((deps, np.array([frame])))

            if vvx_s is None:
                vvx_s = np.array([vvx])
            else:
                vvx_s = np.concatenate((vvx_s, np.array([vvx])))

            if vvy_s is None:
                vvy_s = np.array([vvy])
            else:
                vvy_s = np.concatenate((vvy_s, np.array([vvy])))

        # ---------

        deps[deps > 10e5] = 0
        vvx_s[vvx_s > 10e5] = 0
        vvy_s[vvy_s > 10e5] = 0
        btm_x[btm_x > 10e5] = 0

        # --- X
        x_dep = deps[:self.past_frames]
        x_dep.resize((x_dep.shape[0], x_dep.shape[1], x_dep.shape[2], 1))

        x_vx = vvx_s[:self.past_frames]
        x_vx.resize((x_vx.shape[0], x_vx.shape[1], x_vx.shape[2], 1))

        x_vy = vvy_s[:self.past_frames]
        x_vy.resize((x_vy.shape[0], x_vy.shape[1], x_vy.shape[2], 1))

        x = np.concatenate((x_dep, x_vx, x_vy, btm_x), axis=3)

        # --- Y
        y_dep = deps[self.past_frames:]
        y_dep.resize((y_dep.shape[0], y_dep.shape[1], y_dep.shape[2], 1))

        y_vx = vvx_s[self.past_frames:]
        y_vx.resize((y_vx.shape[0], y_vx.shape[1], y_vx.shape[2], 1))

        y_vy = vvy_s[self.past_frames:]
        y_vy.resize((y_vy.shape[0], y_vy.shape[1], y_vy.shape[2], 1))

        y = np.concatenate((y_dep, y_vx, y_vy), axis=3)

        # filtering
        if check:
            valid = self.preprocessing.eval_datapoint(x[:,:,:,:3], y, self.dynamicity)

            if valid:

                if X is None: X = np.expand_dims(x,0)
                else: X = np.concatenate((X, np.expand_dims(x,0)))

                if Y is None: Y = np.expand_dims(y,0)
                else: Y = np.concatenate((Y, np.expand_dims(y,0)))

                return X, Y
            else:
                return (None, None)

        else:
            if X is None: X = np.expand_dims(x,0)
            else: X = np.concatenate((X, np.expand_dims(x,0)))

            if Y is None: Y = np.expand_dims(y,0)
            else: Y = np.concatenate((Y, np.expand_dims(y,0)))

            return X, Y


    # ------------------------------------

    def buffer_lookup(self, k):
        ''' Get sequence (datapoint) from cache given the start frame global id '''

        if self.caching:
            for i, x in enumerate(self.buffer):
                # Returns found record
                if x["global_id"] == k:
                    self.buffer[i]["fresh"] += 1
                    return x["value"]

                # Set any read record to 0 (second chance)
                elif self.buffer[i]["fresh"] != 0:
                    self.buffer[i]["fresh"] -= 1

        return False

    def buffer_push(self, k, x):
        ''' Add sequence (datapoint) to cache with start frame global id '''

        if self.caching:
            # Makes space
            if len(self.buffer) >= self.buffer_size:
                # evict stale records (fresh == 0) without mutating the list mid-iteration
                self.buffer = [j for j in self.buffer if j["fresh"] != 0]
            # Push
            self.buffer.append({'fresh': self.buffer_memory, 'global_id': k, 'value': x})
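A minimal call sketch for the on-the-fly variant above; the root path, partition names and shapes are placeholders, and the data layout on disk is assumed rather than shown:

gen = DataGenerator(root="./frames/",                      # placeholder root
                    dataset_partitions=[("area_0", ["id-0001"])],
                    past_frames=4, future_frames=4,
                    input_dim=(4, 256, 256, 4), output_dim=(4, 256, 256, 3))
X, Y = gen.get_datapoint(area_index=0, sequence_index=0, check=True)
if X is not None:  # (None, None) means the sequence was rejected by the filter
    print(X.shape, Y.shape)  # (1, past, H, W, 4) and (1, future, H, W, 3)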
Example #9
print(list(test.keys())[maxi])

# parameters of the best solution

stop = list(stopwords.words('french')) + ['cet', 'cette', 'là']
params = {
    "lowercase": True,
    "punct": True,
    "marker": True,
    "number": True,
    "stemming": Preprocessing.lem,  #stemmer.stem,
    "ligne": None,
    "strip_accents": True,
    "stopwords": set(stop)
}
f = lambda x: Preprocessing.preprocessing(x, params)
t = time()
data_x = list(map(f, alltxts))
print("temps 1 :",
      time() - t)
vectorizer = CountVectorizer(preprocessor=None,
                             lowercase=False,
                             token_pattern=Preprocessing.token_pattern)
t = time()
X = vectorizer.fit_transform(data_x)
print("temps 2 :",
      time() - t)

# train/test split without class rebalancing
t = time()
clf = svm.LinearSVC()
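The snippet stops right after the classifier is created; a sketch of how the split, fit and F1 evaluation could continue, mirroring the 60/40 split used in gridSearch below (alllabs is a placeholder for the label list aligned with alltxts):

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

X_train, X_test, y_train, y_test = train_test_split(X, alllabs, test_size=0.4,
                                                    random_state=0)
clf.fit(X_train, y_train)
print("train F1:", f1_score(y_train, clf.predict(X_train)))
print("test F1:", f1_score(y_test, clf.predict(X_test)))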
Example #10
def gridSearch(datax, datay, params, stock=False):
    '''
    Parameters
    ----------
    datax
        List of documents.
    datay
        List of document labels.
    params
        Dictionary of parameter grids; may include "clf" for the classifier
        class and "Vectorizer" for the vectorizer class.
    stock
        If True, pickle the result dictionaries to the files "train" and "test".

    Returns
    -------
    res_train
        Dictionary of train F1-scores for each parameter combination.
    res_test
        Dictionary of test F1-scores for each parameter combination.
    '''
    el = params.keys()

    res_test = dict()
    res_train = dict()
    size = len(list(itertools.product(*params.values())))
    for i, v in enumerate(list(itertools.product(*params.values()))):
        print(i + 1, "on", size)
        tag = tuple(x if isinstance(x, collections.abc.Hashable) else "YES"
                    for x in v)
        print(tag)
        current_params = dict(zip(el, v))

        # classifier choice
        clf_class = current_params.get("clf", svm.LinearSVC)
        if clf_class == nb.MultinomialNB:
            class_prior = current_params.get("class_weight", None)
            if class_prior == "balanced":
                class_prior = len(datax) / (
                    2 * np.bincount(np.where(np.array(datay) == 1, 1, 0)))
                print(class_prior)
            clf = clf_class(class_prior=class_prior)
        else:
            clf = clf_class(
                class_weight=current_params.get("class_weight", None))
        # vectorizer choice
        Vectorizer = current_params.get("Vectorizer", CountVectorizer)
        # apply the parameters to the preprocessing
        f = lambda x: Preprocessing.preprocessing(x, current_params)

        # Vectorization
        print(current_params.get("max_df", 1), current_params.get("min_df", 1))
        vectorizer = Vectorizer(
            preprocessor=f,
            lowercase=False,
            token_pattern=Preprocessing.token_pattern,
            binary=current_params.get("binary", False),
            max_df=current_params.get("max_df", 1.0),
            min_df=current_params.get("min_df", 1),
            ngram_range=current_params.get("ngram_range", (1, 1)),
            max_features=current_params.get("max_features", None))
        X = vectorizer.fit_transform(datax)
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            datay,
                                                            test_size=0.4,
                                                            random_state=0)
        clf.fit(X_train, y_train)
        # Application
        yhat_test = clf.predict(X_test)
        yhat_train = clf.predict(X_train)

        res_test[tag] = f1_score(y_test, yhat_test)
        res_train[tag] = f1_score(y_train, yhat_train)
        print(res_test[tag])
    if stock:
        pickle.dump(res_train, open("train", "wb"))
        pickle.dump(res_test, open("test", "wb"))
    return res_train, res_test
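A usage sketch for gridSearch; alltxts and alllabs are placeholders for the document and label lists, the grid only sets keys the function reads via .get, and whatever keys Preprocessing.preprocessing expects would have to be added as well:

param_grid = {
    "clf": [svm.LinearSVC, nb.MultinomialNB],
    "class_weight": [None, "balanced"],
    "Vectorizer": [CountVectorizer],
    "binary": [False, True],
    "ngram_range": [(1, 1), (1, 2)],
}
res_train, res_test = gridSearch(alltxts, alllabs, param_grid, stock=False)
best = max(res_test, key=res_test.get)
print(best, res_test[best])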
Example #11
def run():
    parser = get_arg_parser(embedding_classifier=False)
    cmd_args = parser.parse_args()

    if cmd_args.gpu is not None:
        os.environ['CUDA_VISIBLE_DEVICES'] = str(cmd_args.gpu)
        gpunum = os.getenv('CUDA_VISIBLE_DEVICES')
        logging.info("GPU has been set to {}".format(gpunum))

    logging.info("Model used for the classification network: {}".format(
        cmd_args.model_name))

    # 1. Dataset retrieval
    # --------------------

    tab_printer(constants.Dataset)
    dataset = Dataset(nrows=constants.Dataset.nrows,
                      augment_labels=constants.Dataset.augment_labels,
                      top_n=constants.Dataset.top_n)

    logging.info("Going to create vocabulary and fit a preprocessing pipeline"
                 "using {} samples. Settings will be listed below".format(
                     len(dataset.X_train)))

    # 2. Preprocessing
    # -----------------

    tab_printer(constants.NLP)
    preprocessor = Preprocessing(dataset.X_train)

    # Preprocess documents
    X_train = preprocessor.transform_documents(dataset.X_train)
    X_test = preprocessor.transform_documents(dataset.X_test)

    # 3. Word embeddings with word2vec
    # --------------------------------

    # Train word2vec embeddings if train_word2vec option
    # is selected
    if cmd_args.train_word2vec: utils.embeddings.main()
    weights = get_embedding_tensor(preprocessor)

    logging.info("Word embeddings are loaded.")

    # 4. Label Network Optim
    # -----------------------

    device = 'cuda:' + str(os.getenv("CUDA_VISIBLE_DEVICES")) \
        if torch.cuda.is_available() else 'cpu'
    logging.info("Going to run on device: {}".format(device))

    args = _generate_deepwalk_parameters(dataset.y_train_graph)
    label_embeddings = np.array(
        pd.read_csv(args.embedding_path).iloc[:, 1:].values)
    label_embeddings_weights = torch.FloatTensor(label_embeddings)

    label_network = NeuralNet(
        CAML,
        max_epochs=50,
        lr=constants.NeuralNetworkTraining.learning_rate,
        batch_size=constants.NeuralNetworkTraining.batch_size,
        optimizer=torch.optim.Adam,
        criterion=torch.nn.BCEWithLogitsLoss,
        module__output_dim=dataset.y_train.shape[1],
        module__embedding=label_embeddings_weights,
        module__embedding_dim=args.dimensions,
        module__kernel_size=1,
        device=device,
        train_split=skorch.dataset.CVSplit(stratified=False),
    )

    label_network.fit(dataset.y_train, dataset.y_train.astype(np.float32))

    # 5. Evaluation
    # -------------

    yhat_test_raw_logits = label_network.predict_proba(dataset.y_test)
    yhat_test_raw = torch.sigmoid(torch.Tensor(yhat_test_raw_logits)).numpy()
    yhat_test = np.array(yhat_test_raw >=
                         constants.NeuralNetworkTraining.threshold) \
        .astype(np.int64)

    report_evaluation(yhat_test, dataset.y_test, yhat_raw=yhat_test_raw)
Example #12
    def __init__(self):
        self.text = Preprocessing()
Example #13
        pchg = df.pop("PCHG")
        df.drop(["DATE", "CODE"], axis=1, inplace=True)
        pca = PCA(n_components=20)
        pca_data = pca.fit_transform(df.values)

        PCA_COLUMNS = []
        for i in range(20):
            PCA_COLUMNS.append("PCA" + str(i + 1))

        pca_df = pd.DataFrame(pca_data, columns=PCA_COLUMNS)
        pca_df = pd.concat([pchg, pca_df], axis=1)
        # print(pca_df.head())
        # print(pca_data)
        # print(pca.explained_variance_ratio_)
        config_file_path = "../config/pca_preprocessing_config.yaml"
        with open(config_file_path, "r") as config_file:
            config = yaml.safe_load(config_file)
        prep = Preprocessing(pca_df, config=config)
        p_df, p_config = prep.preprocessing()
        # print("*" * 70)
        # print(p_df.head())
        # print(p_config)

        p_df.to_csv(os.path.join(out_dir, "pca_" + shortname + ".csv"),
                    index=False,
                    header=False)
        with open(os.path.join(out_dir, "seed_" + shortname + ".yml"),
                  "w",
                  encoding="utf-8") as sf:
            yaml.dump(p_config, sf)
Example #14
    parser = argparse.ArgumentParser()

    parser.add_argument('--folder_to_save', help='Folder to save summaries')
    args = parser.parse_args()

    folder_to_save = args.folder_to_save
    path_to_save = root_directory + "Data/DUC_2007/" + folder_to_save + "/"

    if not os.path.exists(path_to_save):
        os.makedirs(path_to_save)

    for folder in doc_folders:
        path = os.path.join(root_directory + "Data/DUC_2007/Documents/", '') + folder
        print(path)

        sentences, last_indexs = Preprocessing().openDirectory(path)
        text_sents = []
        for item in sentences:
            text_sents.append(item.getStemmedWords())

        clean_sents = []
        org_sents = []
        for item in sentences:
            org_sents.append(item.getOGwords())

            tmp = ""
            for word in item.getStemmedWords():
                tmp += word + " "

            if tmp[:-1] not in clean_sents:
                clean_sents.append(tmp[:-1])