Example #1
    def __iter__(self):
        # custom iterator function that defines how to iterate over
        # records according to the configuration specified
        # INTERFACE DEFINITION: this iterator should always yield a string
        if self.msg_flag:
            print('\n\n\n\nRunning the following preprocessing actions:\n\n')
            print(utilities.get_config('./config/preprocessing.yaml'))
            self.msg_flag = 0

        if self.grouping == 'doc':
            for PDFObj, f in zip(self.data_map, self.files):
                pdf_reader = PDFR(PDFObj)
                text_file = ""
                for pg_num in range(pdf_reader.numPages):
                    page_text = pdf_reader.getPage(pg_num).extractText()
                    text_file = text_file + ' ' + page_text

                self.doc_ids.append(os.path.splitext(ntpath.basename(f))[0])
                yield Preprocessor(text_file,
                                   './config/preprocessing.yaml').run()

            self.__read_data(self.files)  # get data

        elif self.grouping == 'page':
            for PDFObj, f in zip(self.data_map, self.files):
                pdf_reader = PDFR(PDFObj)
                for pg_num in range(pdf_reader.numPages):
                    page_text = pdf_reader.getPage(pg_num).extractText()
                    self.doc_ids.append(
                        os.path.splitext(ntpath.basename(f))[0] + ' Page ' +
                        str(pg_num + 1) + ' of ' + str(pdf_reader.numPages))
                    yield Preprocessor(page_text,
                                       './config/preprocessing.yaml').run()

            self.__read_data(self.files)  # get data
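A hedged usage sketch of the iterator contract noted in the comments above (one preprocessed string yielded per document); the corpus class name `PDFCorpus` and its constructor arguments are assumptions, only the `__iter__` behaviour comes from the snippet:

# hypothetical driver; class/constructor names are assumptions
corpus = PDFCorpus(files=['a.pdf', 'b.pdf'], grouping='doc')
for doc_text in corpus:                    # runs the preprocessing pipeline lazily
    assert isinstance(doc_text, str)       # interface: every item yielded is a string
    print(corpus.doc_ids[-1], len(doc_text.split()))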
Example #2
    def __iter__(self):
        if self.msg_flag:
            print('\n\n\n\nRunning the following preprocessing actions on group of files:\n\n')
            print(utilities.get_config('./config/preprocessing.yaml'))
            self.msg_flag = 0

        for csv_file, f in zip(self.data_map, self.files):
            reader = csv.reader(csv_file, delimiter=',')
            if self.grouping == "row":
                for row in reader:
                    # print('ROW: \n', row)
                    row_cells = ""
                    for cell in row:
                        # print('cell: \n', cell)
                        row_cells += ' ' + cell + ' '
                    # print('row_cells:\n', row_cells)

                    yield Preprocessor(row_cells, './config/preprocessing.yaml',
                                       self.files).run()

            elif self.grouping == "col":
                columns = zip(*reader)
                for column in columns:
                    # print('COLUMN:\n\n', column)
                    col_text = ""  # reset the accumulated text for each column
                    for cell in column:
                        col_text += ' ' + cell + ' '
                    yield Preprocessor(col_text, './config/preprocessing.yaml').run()
        self.__read_data(self.files) # get data   
Example #3
def get_means_sigmas(args, x):
    if args.pre == 'kmeans':
        return Preprocessor().compute_gaussian_basis(x,
                                                     deg=int(args.d),
                                                     scale=args.scale)
    elif args.pre == 'grid':
        return Preprocessor().grid2d_means(np.min(x[:, 0]),
                                           np.max(x[:, 0]),
                                           np.min(x[:, 1]),
                                           np.max(x[:, 1]),
                                           step=args.gsize,
                                           scale=args.scale)
Example #4
def preprocess(args, X, T):
    d = args.d
    X_normal, std = Preprocessor().normalize(X)
    if args.pre == 'pca':
        X_phi, phi = Preprocessor().pca(X_normal, k=d)
    elif args.pre == 'lda':
        X_phi, phi = Preprocessor().lda(X_normal, T, d=d)
    else:
        X_phi = X
        phi = np.ones(d)
    bias = np.ones(len(X))[:, np.newaxis]
    X_phi = np.hstack((bias, X_phi))

    return X_phi, phi, std
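For context, a minimal sketch of applying the same transform to held-out data. It assumes `normalize` divides by the returned `std` and that `phi` acts as a column-projection matrix; neither detail is shown in the snippet, so treat this only as an illustration:

import numpy as np

def preprocess_test(args, X_test, phi, std):
    # reuse the statistics/projection learned on the training split (assumed semantics)
    X_normal = X_test / std
    if args.pre in ('pca', 'lda'):
        X_phi = X_normal @ phi
    else:
        X_phi = X_test
    bias = np.ones(len(X_test))[:, np.newaxis]
    return np.hstack((bias, X_phi))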
Example #5
def main():
    # initialize preprocessor
    preprocessor = Preprocessor()

    # serialize preprocessor
    with open('preprocessor.pkl', 'wb') as f:
        pickle.dump(preprocessor, f)
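A minimal counterpart sketch for reading the file back later; it uses only the standard library and assumes the `Preprocessor` class is importable when unpickling:

import pickle

def load_preprocessor(path='preprocessor.pkl'):
    # deserialize the Preprocessor written by main() above;
    # the Preprocessor class must be importable at load time
    with open(path, 'rb') as f:
        return pickle.load(f)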
Example #6
def print_document(X, Y, T, cams):
    doc, tag, text = Doc().tagtext()

    prepro = Preprocessor(cache_path=cache_dir / 'train_text.json')
    X_text = prepro.to_text(X)

    with tag('html'):
        with tag('body', style="width: 900px;"):
            for i, p in enumerate(X_text):
                cam = cams[i]
                # normalize cam
                heatmap = cam / np.ptp(cam)
                color_map = cv2.applyColorMap(
                    np.uint8(255 * heatmap), cv2.COLORMAP_AUTUMN)
                with tag('div'):
                    with tag('p'):
                        words = p.split(' ')
                        for j, word in enumerate(words):
                            color = color_map[j][0]
                            with tag('span',
                                     style=f'background: rgba({color[2]}, {color[1]}, {color[0]}, {heatmap[j]});',
                                     title=int(X[i, j])):
                                text(word + ' ')
                    with tag('p'):
                        text(
                            f'Pred: {Y[i]}, Label: {T[i]}')
                    doc.stag('hr')

    with open(out_dir / 'out.html', 'w') as f:
        f.write(doc.getvalue())
Example #7
    def __init__(self,
                 world,
                 filename=None,
                 simulator=None,
                 once=False,
                 headless=False):
        logging.info('Initialising vision')
        if simulator:
            self.capture = SimCapture(simulator)
        else:
            self.capture = Capture(self.rawSize, filename, once)

        self.headless = headless

        self.threshold = threshold.AltRaw()
        self.pre = Preprocessor(self.rawSize, self.threshold, simulator)
        self.featureEx = FeatureExtraction(self.pre.cropSize)
        self.interpreter = Interpreter()
        self.world = world
        self.gui = GUI(world, self.pre.cropSize, self.threshold)
        self.histogram = Histogram(self.pre.cropSize)

        self.times = []
        self.N = 0

        #debug.thresholdValues(self.threshold.Tblue, self.gui)

        logging.debug('Vision initialised')
Example #8
def train():
    print('Preprocessing raw data')
    preprocessor = Preprocessor()
    preprocessor.preprocess()

    dataset = Dataset(preprocessor)

    print('Training MF')
    mf = MF(preprocessor, dataset)
    mf.train_or_load_if_exists()

    print('Building I2I')
    i2i = Item2Item(dataset)

    print('Generating candidates')
    candidate_generator = CandidateGenerator(preprocessor, dataset, mf, i2i)
    X_train, y_train, q_train, q_train_reader = candidate_generator.generate_train()
    X_val, y_val, q_val, q_val_reader = candidate_generator.generate_val()

    import pickle
    try:
        with open('puke.pkl', 'wb') as f:
            pickle.dump((X_train, y_train, q_train, q_train_reader,
                         X_val, y_val, q_val, q_val_reader), f)
    except:
        print("Couldn't save puke")

    print('Training ranker')
    ranker = Ranker()
    ranker.train(X_train, y_train, q_train, X_val, y_val, q_val)
    ranker.save()

    print('Validating ranker')
    rank_scores = ranker.rank(X_val)
    print('ndcg', dataset.validate_ndcg(y_val, q_val, q_val_reader, rank_scores))
Example #9
def ngram_model(Xtrain, ytrain, Xval, yval):
    model_name = "mnb_ngram"
    preprocessor = Preprocessor(vectorizer_mode=TFIDF_MODE, verbose=True)

    # apply tf-idf vectorization on the text
    Xtrain = preprocessor.vectorize_fit_transform_text(Xtrain)
    Xval = preprocessor.vectorize_transform_text(Xval)

    # define classifier and train
    model = MultinomialNB(alpha=0.4)
    model.fit(Xtrain, ytrain)

    # make predictions
    ytrain_pred = model.predict(Xtrain)
    yval_pred = model.predict(Xval)

    prediction_probs = model.predict_proba(Xval).argsort(axis=1)
    best_preds = prediction_probs[:, -1]
    second_best_preds = prediction_probs[:, -2]

    label_in_top_two_preds = np.full((len(yval), ), True)
    for i in range(len(yval)):
        if yval[i] != best_preds[i] and yval[i] != second_best_preds[i]:
            label_in_top_two_preds[i] = False

    val_acc_top_two = label_in_top_two_preds.sum() / len(yval)

    train_acc = accuracy_score(ytrain, ytrain_pred)
    val_acc = accuracy_score(yval, yval_pred)

    print("Validation accuracy (label in top 2 predictions): {}".format(
        val_acc_top_two))

    return model, model_name, preprocessor, ytrain_pred, yval_pred, "accuracy", train_acc, val_acc
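As an aside, the top-two membership loop above can be written in vectorized form; a sketch using only names already present in the snippet:

# vectorized equivalent of the top-two check (sketch)
top_two = prediction_probs[:, -2:]                              # indices of the two most probable classes
label_in_top_two_preds = (np.asarray(yval)[:, None] == top_two).any(axis=1)
val_acc_top_two = label_in_top_two_preds.mean()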
Example #10
    def get_next(self, set):
        """
        Get next preprocessed batch
        """
        raw_data = tf.placeholder(tf.float32, shape=[None, 3, 32, 32])
        preprocessor = Preprocessor(raw_data,
                                    centered=self.centered,
                                    rescaled=self.rescaled,
                                    grayscale=self.grayscale,
                                    shaped=self.shaped)

        if set == "train":
            batch = self.train_batch
            size = self.batch_size
            raw_x, y_batch = batch.get_next()
        elif set == "train_acc":
            batch = self.train_batch
            size = self.accuracy_size
            raw_x, y_batch = batch.get_first_size(self.accuracy_size)
        elif set == "test_acc":
            batch = self.test_batch
            size = self.validation_size
            raw_x, y_batch = batch.get_first_size(self.validation_size)

        with tf.Session() as sess:
            raw_x = raw_x.reshape(-1, 3, 32, 32)
            x_batch = sess.run(preprocessor.apply(raw_x, size),
                               feed_dict={raw_data: raw_x})
        return x_batch, y_batch
Example #11
File: agent.py  Project: jsyoo61/DQN
    def __init__(self, n_action_space, n_training_frames = 50 * 1000000, replay_memory_size = 1000000, k = 4, m = 4):

        # Hyperparameters - dynamic
        self.n_action_space = n_action_space
        self.n_training_frames = n_training_frames
        self.replay_memory_size = replay_memory_size
        self.replay_memory = deque(maxlen = self.replay_memory_size)
        self.m = m
        self.k = k

        # Hyperparameters - static
        self.epsilon = 1.0
        self.minibatch_size = 32
        self.C = 10000
        self.gamma = 0.99
        self.update_frequency = 4  # assumed: one SGD update every 4 selected actions (standard DQN setting)

        self.epsilon_initial = 1.0
        self.epsilon_final = 0.1
        self.exploration_frame = 1000000 # 1 million
        self.epsilon_decay = (self.epsilon_initial - self.epsilon_final) / self.exploration_frame

        self.replay_start_frame = 50000

        # Parameters - etc
        self.action = None
        self.timestep = 0

        # Modules
        self.preprocessor = Preprocessor(m = self.m)
        self.Q = DQN()
        self.Q_hat = copy.deepcopy(self.Q)

        # Operations
        self.mode('train')
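The `epsilon_decay` above implies a per-frame linear annealing schedule; a hedged sketch of that update (the method name is an assumption, not from the source):

    def anneal_epsilon(self):
        # linear annealing from epsilon_initial to epsilon_final over exploration_frame frames
        if self.timestep < self.exploration_frame:
            self.epsilon = max(self.epsilon_final, self.epsilon - self.epsilon_decay)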
Example #12
def main(args):
    preprocessor = Preprocessor(args.model_type, args.max_len)
    train_dataloader, val_dataloader, test_dataloader = get_dataloader(
        args, preprocessor)
    bert_finetuner = BertModel(args, train_dataloader, val_dataloader,
                               test_dataloader)

    logger = TensorBoardLogger(save_dir=args.log_dir,
                               version=1,
                               name="nsmc-bert")

    early_stop_callback = EarlyStopping(monitor='val_acc',
                                        min_delta=0.00,
                                        patience=5,
                                        verbose=False,
                                        mode='max')

    checkpoint_callback = ModelCheckpoint(filepath=args.checkpoint_path,
                                          verbose=True,
                                          monitor='val_acc',
                                          mode='max',
                                          save_top_k=3,
                                          prefix='')

    trainer = pl.Trainer(
        gpus=1,
        # distributed_backend='ddp'
        checkpoint_callback=checkpoint_callback,
        early_stop_callback=early_stop_callback,
        logger=logger)

    trainer.fit(bert_finetuner)

    trainer.test()
Example #13
def classify(model):
    """ Function that fits a model using the entire training set and stores its predictions on the held out test set in a csv file. """

    # Read datasets
    df = pd.read_csv("data/preprocessed_reddit_train_SnowballStemmer.csv")

    # Using preprocessor to transform data into tf-idf representation
    preprocessor = Preprocessor("stemmer")

    # Transform training data to tf_idf representation
    x_train = preprocessor.tf_idf_vectorizer.fit_transform(df["cleaned"])
    y_train = df["label"]

    # Preprocess test data and transform to tf_idf representation
    x_test_df = pd.read_csv(
        "data/preprocessed_reddit_test_SnowballStemmer.csv")
    x_test = preprocessor.tf_idf_vectorizer.transform(
        x_test_df["cleaned"].values.astype('U'))

    # Train model using whole training set
    model.fit(x_train, y_train)

    # Predict on test set
    predictions = model.predict(x_test)

    # Turn predictions back to original labels
    preprocessor.label_encoder.fit(df["subreddits"])
    predictions = preprocessor.label_encoder.inverse_transform(predictions)

    # save predictions
    pred_df = pd.DataFrame({"Id": x_test_df.id, "Category": predictions})
    pred_df.to_csv("predictions/predictions{}.csv".format(
        datetime.datetime.now()),
                   index=False)
Example #14
    def preprocess(self,
            data_dir,
            re_seg=True,
            to_file=False,
            mid_data_paths=None,
            split_train_test=True,
            test_ratio=0.2,
            vec_method="count",
            feature_select=True,
            is_percent=True,
            feature_keep_percent=90,
            feature_keep_num=10,
            min_df=3):
        """
        """
        preprocessor = Preprocessor(
                feature_gen_func=self.feature_label_gen,
                vec_method=vec_method,
                feature_keep_percent=feature_keep_percent,
                feature_keep_num=feature_keep_num,
                is_percent=is_percent, 
                test_ratio=test_ratio,
                min_df=min_df)

        _, train_data, train_label, val_data, val_label = preprocessor.gen_data_vec(
                data_dir,
                self.feature_id_path,
                split_train_test=split_train_test,
                feature_select=feature_select,
                to_file=to_file,
                re_seg=re_seg,
                process_file_path=mid_data_paths)
Example #15
def train_and_validate_viterbi2(_inputFile, _outputFile, _devFile,
                                _devOutputFile, _validateFile):
    """
    Create the Preprocessor object
    Train using the SG, EN, CN, FR datasets
    Generate the representer, vocabulary and states and feed it into an Emission object
    """
    preprocessor = Preprocessor(_inputFile)
    representer = preprocessor.get_representer()
    vocabulary = preprocessor.get_vocabulary()
    states = preprocessor.get_states()

    listOfWords = getAllTokens(_devFile)
    """
    Create the Emission and Transition objects
    Validate using the dev datasets
    Label the input sequence and output the file as dev.p3.out
    """
    emission = Emission(representer, vocabulary, states, listOfWords)
    transition = Transition2()
    transition.compute_params(preprocessor)

    label_viterbi(_devFile, _devOutputFile, emission, transition)
    """
    Calculate Validation Error
    """
    evaluate(_validateFile, _devOutputFile)
Example #16
    def __init__(self):
        self.preprocessor = Preprocessor()
        self.feature_extractor = FeatureExtractor()
        self.crf_analyzer = CRFAnalyzer()
        self.sentiment_analyzer = SentimentAnalyzer()

        print("\nAll module instantiated and ready to go....\n")
Example #17
def generate_summary(text):
    preprocessor = Preprocessor()
    postprocessor = Postprocessor()
    SummaRise = SummaRiser('./path/to/data/vocab', './')

    sentences = sent_tokenize(text)
    totalSentences = len(sentences)
    tokens = 0  # count of tokens
    tokenized = []
    summarys = ''
    for id, sentence in enumerate(sentences):
        tokenized += preprocessor.tokenize(sentence)
        tokens += len(tokenized)
        if tokens >= MAX_TOKENS or (id == (totalSentences - 1)
                                    and tokens >= MIN_TOKENS):
            tokenized = (' '.join(tokenized))
            preprocessed_text = preprocessor.preprocess_text(
                tokenized.split('*N*'))
            print(preprocessed_text)
            summary = SummaRise.summarize([preprocessed_text])
            summarys += postprocessor.postprocess_text(summary[0])
            summarys += ' '
            tokens = 0
            tokenized = []

    return summarys
Example #18
def generate_plots(path):
    """ Generates plots for all videos in a directory

    :param path: the directory to search for videos

    """
    videos = glob(path + '/*.mkv')
    print(path, len(videos), videos)

    if len(videos) == 0:
        return
    else:
        videos = videos[0]

    metadata_list = glob(path + '/metadata.txt')
    #print(path, len(metadata_list), metadata_list)

    if len(metadata_list) == 0:
        return

    P = Preprocessor()
    P.import_video(str(videos))
    P.read_metadata(path)
    P.preprocess()
    Im = P.frames_processed
    if len(Im) == 0:
        print(len(Im))
        return

    z_start = P.z_start
    z_end = P.z_end

    mean, cov = analyze_image(Im)

    window_size = 10
    mean_smoothed = smoothing.mean_moving_average(mean, window_size)
    cov_smoothed = smoothing.cov_moving_average(cov, window_size)

    c = CubicFitRotated()
    c.fit(mean=mean_smoothed, cov=cov_smoothed, z_start=z_start, z_end=z_end)

    try:
        os.mkdir(path + '/analysis')
        path += '/analysis'
    except OSError:
        pass

    plots.plot_mean(mean, z_start, z_end).savefig(path + '/beam_center.png')
    plots.plot_beta(cov, z_start, z_end).savefig(path + '/sigma_squared.png')

    export.export_mean(mean=mean,
                       filename=path + '/center.csv',
                       z_start=z_start,
                       z_end=z_end)
    export.export_cov(cov=cov,
                      filename=path + '/cov.csv',
                      z_start=z_start,
                      z_end=z_end)

    plt.close('all')
Example #19
def get_data_generator(args, model_args, schema, test=False):
    from cocoa.core.scenario_db import ScenarioDB
    from cocoa.core.dataset import read_dataset
    from cocoa.core.util import read_json

    from core.scenario import Scenario
    from core.lexicon import Lexicon
    from preprocess import DataGenerator, Preprocessor
    import os.path

    # TODO: move this to dataset
    dataset = read_dataset(args, Scenario)

    mappings_path = model_args.mappings

    lexicon = Lexicon(schema.values['item'])
    preprocessor = Preprocessor(schema, lexicon, model_args.entity_encoding_form,
        model_args.entity_decoding_form, model_args.entity_target_form,
        model=model_args.model)

    if test:
        model_args.dropout = 0
        train, dev, test = None, None, dataset.test_examples
    else:
        train, dev, test = dataset.train_examples, dataset.test_examples, None
    data_generator = DataGenerator(train, dev, test, preprocessor, args, schema, mappings_path,
        cache=args.cache, ignore_cache=args.ignore_cache,
        num_context=model_args.num_context,
        batch_size=args.batch_size,
        model=model_args.model)

    return data_generator
Example #20
    def preprocess(self,
            data_dir,
            re_seg=True,
            to_file=False,
            mid_data_paths=None,
            split_train_test=True,
            test_ratio=0.2,
            vec_method="count",
            feature_select=True,
            is_percent=True,
            feature_keep_percent=90,
            feature_keep_num=10,
            min_df=3):
        """根据指定目录 获得数据特征
        [out] train_data_vec: matrix, 数据集特征
        """
        preprocessor = Preprocessor(
                feature_gen_func=self.feature_label_gen,
                vec_method=vec_method,
                feature_keep_percent=feature_keep_percent,
                feature_keep_num=feature_keep_num,
                is_percent=is_percent, 
                test_ratio=test_ratio,
                min_df=min_df)

        # generate feature vectors from the data
        _, self.train_data_vec, _, _, _ = preprocessor.gen_data_vec(
                data_dir,
                self.feature_id_path,
                split_train_test=split_train_test,
                feature_select=feature_select,
                to_file=to_file,
                re_seg=re_seg,
                process_file_path=mid_data_paths)
Example #21
def create_model_instance():
    from models import ADEM
    from preprocess import Preprocessor

    logger.info('loading model from %s', ADEM_MODEL)
    model = ADEM(Preprocessor(), None, ADEM_MODEL)
    logger.info('model loaded. config: %r', model.config)
    return model
Example #22
    def test_get_feature_names(self):
        feature_names = ["f1", "f2", "f3"]
        data_set = np.array([feature_names, ["1", "2", "3"], ["", "4", "5"]])
        preprocessor = Preprocessor(data_set)
        names = preprocessor.get_feature_names()
        self.assertTrue(feature_names[0] == names[0])
        self.assertTrue(feature_names[1] == names[1])
        self.assertTrue(feature_names[2] == names[2])
Example #23
    def initComponents(self, crop=None):
        undistort = False
        self.pre = Preprocessor(self.rawSize,
                                self.threshold,
                                undistort,
                                crop=crop)
        self.featureEx = FeatureExtraction(self.pre.cropSize)
        self.gui = GUI(self.world, self.pre.cropSize, self.threshold, self)
        self.world.setResolution(self.pre.cropSize)
Example #24
def inference():
    preprocessor = Preprocessor(first_time=False)
    preprocessor.preprocess()
    dataset = Dataset(preprocessor)
    mf = MF(preprocessor, dataset)
    mf.load()
    i2i = Item2Item(dataset)
    candidate_generator = CandidateGenerator(preprocessor, dataset, mf, i2i)
    ranker = Ranker()
    ranker.load()

    X_submit, X_article_nums, q_submit, q_reader = candidate_generator.generate_submit()
    try:
        with open('submit_puke.pkl', 'wb') as f:
            pickle.dump((X_submit, X_article_nums, q_submit, q_reader), f)
    except:
        print("Couldn't save submit_puke")

    # X_submit, X_article_nums, q_submit, q_reader = pickle.load(open('submit_puke.pkl', 'rb'))

    rank_scores = ranker.rank(X_submit)
    base = 0
    entire_articles = []
    not_heavy_items = set(range(1, article_count+1)) - set(preprocessor.heavy_items)
    not_heavy_items = sorted(not_heavy_items)
    cut = 50

    random.seed(0)
    with result_path.open('w') as fout:
        for group_size, reader in tqdm(zip(q_submit, q_reader), total=len(q_submit)):
            articles = X_article_nums[base:base+group_size]
            scores = rank_scores[base:base+group_size]

            articles = [a for _, a in sorted(zip(scores, articles), key=lambda x: x[0], reverse=True)]
            articles = articles[:cut]
            from_followable = candidate_generator.get_readers_followable_articles(reader)
            # from_keywords = candidate_generator.get_readers_keyword_articles(reader)
            for item in from_followable:
                if len(articles) >= cut + 15:
                    break
                if item in articles:
                    continue
                articles.append(item)
            while len(articles) < 100:
                item = random.choice(not_heavy_items)
                if item not in articles:
                    articles.append(item)
            entire_articles.extend(articles)

            reader_str = preprocessor.num2reader[reader]
            article_strs = map(preprocessor.num2article.get, articles)

            fout.write('%s %s\n' % (reader_str, ' '.join(article_strs)))

            base += group_size
    print('Entropy of candidates = ', entropy(entire_articles))
Example #25
def main(matcher_path, test_path):
    m_trackers_paths = glob.glob(matcher_path + '/*')
    t_trackers_paths = glob.glob(test_path + '/*')
    tracker_manager = TrackerManager('test')
    matcher = FaissMatcher()
    preprocessor = Preprocessor()
    align_preprocessor = Preprocessor(algs=align_and_crop)
    face_rec_graph_face = FaceGraph()
    face_extractor = FacenetExtractor(face_rec_graph_face,
                                      model_path=Config.FACENET_DIR)
    detector = MTCNNDetector(face_rec_graph_face)

    # create matcher
    print('Creating matcher ...')
    for m_dir in m_trackers_paths:
        print('Processing ' + m_dir)
        face_id = m_dir.split('/')[-1]
        embs, labels = extract_embs(m_dir, preprocessor, face_extractor, None)
        face_id_labels = [face_id for i in range(len(labels))]
        matcher.update(embs, face_id_labels)

    # create tracker
    print('Creating trackers')
    for t_dir in t_trackers_paths:
        print('Processing ' + t_dir)
        embs, _ = extract_embs(t_dir, preprocessor, face_extractor, None)
        track_id = int(t_dir.split('/')[-1])

        first_emb = embs.pop()
        face_info = FaceInfo(None, first_emb, None, None, None, None)
        tracker_manager.current_trackers[track_id] = Tracker(
            track_id, face_info, None)
        for emb in embs:
            face_info = FaceInfo(None, emb, None, None, None, None)
            tracker_manager.current_trackers[track_id].update(face_info, None)
        len(tracker_manager.current_trackers)

    # test matching
    print('Test matching ...')
    for fid in tracker_manager.current_trackers:
        print('Processing: ' + str(fid))
        tops = tracker_manager.recognize_current_tracker(fid, matcher, None)
        print('Track_id {}, recognize: {}'.format(fid, tops))
Example #26
def preprocess(args, X, T):
    pre = Preprocessor()
    X_normal = X
    if args.pre == 'pca':
        logging.info('Preprocess with PCA(d = %d)' % args.deg)
        X_phi = pre.pca(X_normal, args.deg)
    elif args.pre == 'lda':
        logging.info('Preprocess with LDA(d = %d)' % args.deg)
        X_phi = pre.lda(X_normal, T, args.deg)
    else:
        X_phi = X_normal  # fall back to the unreduced features
    return X_phi, pre
Example #27
    def __init__(cls):
        cls.face_rec_graph_face = FaceGraph()
        cls.coeff_graph = FaceGraph()
        cls.face_extractor = FacenetExtractor(
            cls.face_rec_graph_face, model_path=Config.Model.FACENET_DIR)
        cls.coeff_extractor = FacenetExtractor(
            cls.coeff_graph, model_path=Config.Model.COEFF_DIR)
        cls.detector = MTCNNDetector(
            cls.face_rec_graph_face, scale_factor=Config.MTCNN.SCALE_FACTOR)
        cls.preprocessor = Preprocessor()
Example #28
    def test_encode_and_no_categorical(self):
        dask_data = dd.read_csv('data_encode.csv')
        x = Preprocessor(['feat1', 'feat2', 'feat3'], 'target', dask_data,
                         ['o', 'p', 'n'])
        x.execute(duplicates_invalid=True,
                  missing=True,
                  scale=True,
                  transform=True,
                  encode_target=True,
                  train=True)
        expected_output_dict = {
            'target': {
                0: 2,
                1: 2,
                2: 0,
                3: 0,
                6: 1,
                7: 2,
                8: 0,
                9: 2
            },
            'feat1': {
                0: -0.928,
                1: -0.093,
                2: -0.928,
                3: -0.928,
                6: 0.743,
                7: -0.093,
                8: -0.093,
                9: -0.093
            },
            'feat2': {
                0: -0.844,
                1: 0.998,
                2: -0.844,
                3: -0.844,
                6: -0.23,
                7: -0.844,
                8: 0.384,
                9: 0.0
            },
            'feat3': {
                0: -0.548,
                1: 0.0,
                2: 0.0,
                3: -0.548,
                6: 2.739,
                7: -0.548,
                8: 0.0,
                9: -0.548
            }
        }

        self.assertEqual(expected_output_dict, x.df.round(3).head(8).to_dict())
Example #29
def main(argv):
    if len(argv) < 2:
        print "Invalid arguments. Format is \'python nlp.py [file name]\'"
    else:
        pp = Preprocessor()
        print "Starting NLP..."
        texts = pp.prepDoc(argv[1])
        c = Corpus(texts)
        c.createCorpus()
        scores = c.calc_tfidf()
        export(scores)
Example #30
def create_dataset(filenames, batch_size, num_heatmap, is_train):
    preprocess = Preprocessor(IMAGE_SHAPE, (HEATMAP_SIZE[0], HEATMAP_SIZE[1], num_heatmap), is_train)
    dataset = tf.data.TFRecordDataset(filenames)
    dataset = dataset.map(preprocess, num_parallel_calls=tf.data.experimental.AUTOTUNE)
   
    if is_train:
        dataset = dataset.shuffle(batch_size)

    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

    return dataset
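A hedged usage sketch for the dataset builder above; the TFRecord filename and sizes are placeholders, and the element structure of each batch depends on what the `Preprocessor` instance returns, which is not shown here:

# hypothetical call site (filename and sizes are assumptions)
train_ds = create_dataset(['train.tfrecord'], batch_size=32,
                          num_heatmap=16, is_train=True)
for batch in train_ds.take(1):
    print(batch)  # one preprocessed, shuffled, batched element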