Example #1
    def __init__(self, policy_cls, env_id, args):
        logging.getLogger("tensorflow").setLevel(logging.ERROR)
        self.args = args
        self.env = gym.make(env_id, **args2envkwargs(args))
        self.policy_with_value = policy_cls(self.args)
        self.iteration = 0
        if self.args.mode == 'training':
            self.log_dir = self.args.log_dir + '/evaluator'
        else:
            self.log_dir = self.args.test_log_dir
        if not os.path.exists(self.log_dir):
            os.makedirs(self.log_dir)

        self.preprocessor = Preprocessor((self.args.obs_dim, ),
                                         self.args.obs_preprocess_type,
                                         self.args.reward_preprocess_type,
                                         self.args.obs_scale,
                                         self.args.reward_scale,
                                         self.args.reward_shift,
                                         gamma=self.args.gamma)

        self.writer = self.tf.summary.create_file_writer(self.log_dir)
        self.stats = {}
        self.eval_timer = TimerStat()
        self.eval_times = 0
Example #2
    def read(self, filename=None, preprocess=True, **defines):
        """Preprocess, read and parse itp file *filename*.

        Any keywords in *defines* are used to modify the default preprocessor
        variables (see
        :meth:`gromacs.fileformats.preprocessor.Preprocessor.parse` for
        details). Setting *preprocess* = ``False`` skips the preprocessing
        step.
        """
        self._init_filename(filename)

        if preprocess:
            kwargs = self.defines.copy()
            kwargs['commentchar'] = self.commentchar
            kwargs['clean'] = True
            ppitp = Preprocessor(self.real_filename, **kwargs)
            ppitp.parse(**defines)
            itp = ppitp.StringIO()
        else:
            itp = open(self.real_filename)

        try:
            stream = OneLineBuffer(itp.next)
            self.parse(stream)
        finally:
            itp.close()
Example #3
    def processData(self):
        """
        The purpose of this method is to process both train/test raw data
        """
        # Load the preprocessor
        preprocessor = Preprocessor()

        if self.train:
            filename = self.parameters['data-path'] + self.parameters[
                'train-data-filename']
        else:
            filename = self.parameters['data-path'] + self.parameters[
                'test-data-filename']

        # read the required file
        data_df = pd.read_json(path_or_buf=filename, lines=True)

        # concatenate response and last 'n' contexts together
        data_df['CONTEXT'] = data_df['context'].apply(
            lambda x: ' '.join(x[-self.n_last_context:]))
        data_df['text'] = data_df['CONTEXT'] + ' ' + data_df['response']
        data_df['text'] = data_df['text'].apply(
            lambda x: preprocessor.process_text_bert(x))

        # save the processed data
        if self.train:
            filename = self.parameters['processed-data-path'] + self.parameters[
                'processed-train-data-filename']
            data_df[['text', 'label']].to_csv(filename)
        else:
            filename = self.parameters['processed-data-path'] + self.parameters[
                'processed-test-data-filename']
            data_df[['text']].to_csv(filename)
        return
Example #4
def load_data(batch_size):
    '''
    Loads training, validation and test data from resources.
    '''
    data_path = os.path.abspath(
        os.path.join(os.path.dirname(__file__), '../resources'))
    data_config = {
        "training": {
            "size": 0.8
        },
        "test": {
            "size": 0.1
        },
        "validation": {
            "size": 0.1
        }
    }

    p = Preprocessor(base_path=data_path, datasets=data_config)

    train_data = p.generate_images('training',
                                   shuffle=True,
                                   batch_size=batch_size)
    valid_data = p.generate_images('validation',
                                   shuffle=False,
                                   batch_size=batch_size)

    class_weights = p.get_class_weights()

    return train_data, valid_data, class_weights
def run_imputation(df):
    """
    Fill in missing numeric values using Kalman filtering,
    and fill in missing string values by drawing randomly from each
    column's distribution of observed values.
    """

    # Create the dateIndex
    time_cols = ['year', 'month', 'day', 'hour']
    df["timestamp"] = pd.to_datetime(df[time_cols])
    df.set_index("timestamp", inplace=True)
    df.drop(columns=time_cols, inplace=True)
    # Check if there are null values in the dataset and get the columns
    nulls = df.isnull().sum()
    null_cols = nulls[nulls > 0].index.values
    numeric_null_cols = get_numeric_null_cols(df, null_cols)
    obj_null_cols = get_string_null_cols(df, null_cols)

    # Use Kalman Filtering to impute missing numeric
    # values
    for col in numeric_null_cols:
        prep = Preprocessor()
        arr = prep.kalman_impute(df[col])
        df[col] = arr

        # Backfill any missing data at the beginning of the array
        if df[col].isnull().sum():
            df[col].fillna(method="bfill", inplace=True)

    # Random draw based on distribution of
    # unique vals in each column
    for col in obj_null_cols:
        arr = fill_missing_strings(df[col])
        df[col] = arr

    return df
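
fill_missing_strings() is called above but not shown. Below is a minimal sketch of what it might look like, based on the "random draw based on distribution of unique vals" comment; the name, signature and body are assumptions, not the project's actual helper.

import numpy as np
import pandas as pd

def fill_missing_strings(col: pd.Series) -> pd.Series:
    # Hypothetical helper: replace NaNs by sampling from the column's observed
    # value distribution (mirrors the comment in run_imputation above).
    col = col.copy()
    probs = col.dropna().value_counts(normalize=True)
    n_missing = col.isnull().sum()
    if n_missing and not probs.empty:
        col[col.isnull()] = np.random.choice(probs.index, size=n_missing, p=probs.values)
    return col
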
Example #6
def testTennisOrIris(trainDataFile, testDataFile, attrDataFile):
    data = Preprocessor(trainDataFile, testDataFile, attrDataFile)
    data.loadData()
    trainData = data.getMatrix(data.getTrainData())
    testData = data.getMatrix(data.getTestData())
 
    numInput = data.getNumInput()
    numOutput = len(data.getClasses())
    numHidden = 3
    seed = 4 
    learningRate = 0.1
    maxEpochs = 5000
    momentum = 0.0

    print("Generating neural network: %d-%d-%d" % (numInput, numHidden,numOutput))
    nn = NeuralNetwork(numInput, numHidden, numOutput, seed)
    nn.train(trainData, maxEpochs, learningRate, momentum)
    print("Training complete")

 #   accTrain = nn.accuracy(trainData)
    accTest = nn.accuracy(testData)

 #   print("\nAccuracy on train data = %0.4f " % accTrain)
   
    print("Accuracy on test data   = %0.4f " % accTest)
Example #7
 def __init__(self):
     self._rows = 0
     self._cols = 0
     self._params = {}
     self._model = LogisticRegression(max_iter=20)
     self._preprocessor = Preprocessor()
     self.init_params()
    def test_search_lines(self):
        """Various test cases for preprocessor.search_lines."""
        preprocessor = Preprocessor(self.empty_dataframe, self.config,
                                    '_INFO_')

        # These lines should get deleted
        no_matches = ['not matching text', 'should get deleted']
        self.assertFalse(preprocessor.search_lines(no_matches))

        # The first line should be kept since it explicitly matches
        # to all RE in USEFUL_INFORMATION
        # The second line is removed since although it matches to error,
        # it does not match to USEFUL_INFORMATION
        single_match = [
            'this error is USEFUL_INFORMATION', 'but this error is not'
        ]
        self.assertEqual(preprocessor.search_lines(single_match),
                         ['this error is USEFUL_INFORMATION'])

        # Both lines should be kept since both are explicitly matched to
        # by both regular expressions in search_lines
        multi_match = [
            'this error is USEFUL_INFORMATION',
            'of course that error is USEFUL_INFORMATION!'
        ]
        self.assertEqual(preprocessor.search_lines(multi_match), multi_match)
Example #9
    def __init__(self, left_filename, right_filename, directory, config):
        super(TestPredictionCallback, self).__init__()

        self.directory = directory

        # Crop and expand dims to batch = 1
        crop_start_row = config['crop_start_row']
        crop_start_col = config['crop_start_col']
        crop_stop_row = crop_start_row + config['crop_height']
        crop_stop_col = crop_start_col + config['crop_width']

        preprocessor = Preprocessor()

        img = img_to_array(load_img(left_filename),
                           data_format='channels_first')
        img = img[:, crop_start_row:crop_stop_row,
                  crop_start_col:crop_stop_col]
        left_img = preprocessor.resize_img(img, [
            config['channels'], config['resized_height'],
            config['resized_width']
        ])

        img = img_to_array(load_img(right_filename),
                           data_format='channels_first')
        img = img[:, crop_start_row:crop_stop_row,
                  crop_start_col:crop_stop_col]
        right_img = preprocessor.resize_img(img, [
            config['channels'], config['resized_height'],
            config['resized_width']
        ])

        self.left_img = np.expand_dims(left_img, axis=0)
        self.right_img = np.expand_dims(right_img, axis=0)
Example #10
    def test_restore_matrix_2(self):

        missing_value = -999999
        pre = Preprocessor(missing_value=missing_value)
        threshold = 1e-5

        header = ['col1', 'col2']
        x = np.random.randint(low=0, high=2, size=(5, 2)).astype(str)
        v = np.full(shape=x.shape, fill_value=False)

        m = pre.get_metadata(arr=x, header=header)
        obj_d = pre.get_discretized_matrix(arr=x,
                                           meta=m,
                                           header=header,
                                           require_missing=True)
        obj_r = pre.restore_matrix(arr=obj_d['x'],
                                   meta=m,
                                   header=obj_d['header'])

        for i in range(x.shape[0]):
            for j in range(x.shape[1]):
                if m[j]['type'] == 'count' or m[j]['type'] == 'continuous':
                    if abs(float(x[i, j]) -
                           float(obj_r['x'][i, j])) < threshold:
                        v[i, j] = True
                else:
                    if x[i, j] == obj_r['x'][i, j]:
                        v[i, j] = True

        assert v.all()
Example #11
    def test_restore_matrix_4(self):

        missing_value = -999999
        pre = Preprocessor(missing_value=missing_value)
        threshold = 1e-5

        obj_f = self.create_multimodal_object(n=1000)
        v = np.full(shape=obj_f['x'].shape, fill_value=False)

        m = pre.get_metadata(obj_f['x'], obj_f['header'])
        obj_d = pre.get_discretized_matrix(arr=obj_f['x'],
                                           meta=m,
                                           header=obj_f['header'],
                                           require_missing=True)
        obj_r = pre.restore_matrix(arr=obj_d['x'],
                                   meta=m,
                                   header=obj_d['header'])

        for i in range(obj_f['x'].shape[0]):
            for j in range(obj_f['x'].shape[1]):
                if m[j]['type'] == 'count' or m[j]['type'] == 'continuous':
                    if abs(float(obj_f['x'][i, j]) -
                           float(obj_r['x'][i, j])) < threshold:
                        v[i, j] = True
                else:
                    if obj_f['x'][i, j] == obj_r['x'][i, j]:
                        v[i, j] = True

        assert v.all()
Example #12
    def test_get_variable_type_constant_str(self):

        pre = Preprocessor(missing_value=-999999)
        x = np.full(fill_value='hello world', shape=10000)
        var_type = pre.get_variable_type(arr=x, label='my_feature')

        assert var_type == 'constant'
Example #13
    def test_get_variable_type_continuous(self):

        pre = Preprocessor(missing_value=-999999)
        x = np.random.random(1000)
        var_type = pre.get_variable_type(arr=x, label='my_feature')

        assert var_type == 'continuous'
Example #14
    def test_get_variable_type_constant_num(self):

        pre = Preprocessor(missing_value=-999999)
        x = np.zeros(shape=1000)
        var_type = pre.get_variable_type(arr=x, label='my_feature')

        assert var_type == 'constant'
Example #15
def main(_):
    pre_processor = Preprocessor()
    pre_processor.set_train_test_data(0.8)

    model = Model('winner_predict_model')
    model.learning_rate = 0.01
    model.sess = tf.Session()

    model.builder(team_input_size=pre_processor.team_input_size,
                  player_input_size=pre_processor.player_input_size,
                  output_size=pre_processor.output_size,
                  model_name='model_builder')
    model.run_train(train_epoch=5000,
                    train_x_home_team=pre_processor.train_x_home_team,
                    train_x_away_team=pre_processor.train_x_away_team,
                    train_x_home_player=pre_processor.train_x_home_player,
                    train_x_away_player=pre_processor.train_x_away_player,
                    train_y=pre_processor.train_y,
                    keep_prob=0.7,
                    print_num=500)
    model.run_test(test_x_home_team=pre_processor.test_x_home_team,
                   test_x_away_team=pre_processor.test_x_away_team,
                   test_x_home_player=pre_processor.test_x_home_player,
                   test_x_away_player=pre_processor.test_x_away_player,
                   test_y=pre_processor.test_y)
    model.closer()
Example #16
    def __init__(self, conf_path, template_path):
        configuration = Configuration(conf_path)
        self.preprocessor = Preprocessor(configuration)
        self.poco_processor = PocoProcessor(configuration)

        self.ros_mapper_processor = RosMapperProcessor(configuration)
        self.ros_msg_processor = RosMsgProcessor(configuration)

        self.dds_mapper_processor = DdsMapperProcessor(configuration)
        self.dds_idl_processor = DdsIdlProcessor(configuration)

        self.zmq_serializer_processor = ZmqSerializerProcessor(configuration)

        self.node_handler_processor = NodeHandlerProcessor(configuration)

        env = Environment(loader=FileSystemLoader(template_path))
        self.poco_template = env.get_template('poco_template.h')
        self.ros_mapper_template = env.get_template('ros_mapper_template.h')
        self.ros_msg_template = env.get_template('ros_template.msg')
        self.dds_mapper_template = env.get_template('dds_mapper_template.h')
        self.dds_idl_template = env.get_template('dds_template.idl')
        self.zmq_serializer_template = env.get_template(
            'zmq_serializer_template.h')
        self.node_handler_template = env.get_template(
            'node_handler_template.js')
    def test_parent_class(self):
        configuration = Configuration(CONF_PATH)
        preprocessor = Preprocessor(configuration)
        ros_msg_processor = RosMsgProcessor(configuration)

        class_definition_dict = {}

        kidl_file = "class_with_ros_mdlw_and_parent_class.yaml"

        with open("%s%s" % (INCLUDE_PATH, kidl_file), 'r') as stream:
            try:
                class_definition_data = yaml.load(stream,
                                                  Loader=yaml.FullLoader)
            except yaml.YAMLError as exc:
                print(exc)

        class_definition = preprocessor.process(class_definition_data, False)
        class_definition_dict[class_definition.class_name] = class_definition

        kidl_file = "basic_class_with_ros_mdlw.yaml"

        with open("%s%s" % (INCLUDE_PATH, kidl_file), 'r') as stream:
            try:
                class_definition_data = yaml.load(stream,
                                                  Loader=yaml.FullLoader)
            except yaml.YAMLError as exc:
                print(exc)

        class_definition = preprocessor.process(class_definition_data, False)
        class_definition_dict[class_definition.class_name] = class_definition

        ros_msg_definition = ros_msg_processor.process(
            'kpsr::codegen::ClassWithParentClass', class_definition_dict)

        print(ros_msg_definition)
Example #18
 def preprocessData(self, train_data, test_data):
     # Preprocessor
     preprocessor = Preprocessor()
     # Make the preprocessing path if it doesn't exist
     if not os.path.exists(self.preprocessing_path):
         os.mkdir(self.preprocessing_path)
     # Check if preprocessing training artifact exists
     if os.path.exists(os.path.join(self.preprocessing_path, 'train_data.txt')):
         # Load train data if it does
         train_data = open(os.path.join(self.preprocessing_path, 'train_data.txt')).read().splitlines()
     else:
         # Preprocess the data as specified in the config file
         for step in self.config['preprocessing']:
             train_data = preprocessor.process(step, train_data)
         # Save the training data artifact
         with open(os.path.join(self.preprocessing_path, 'train_data.txt'), 'w+') as f:
             # Write the array with each datapoint on a new line
             f.write('\n'.join(train_data))
             f.close()
     # Check if preprocessing testing artifact exists
     if os.path.exists(os.path.join(self.preprocessing_path, 'test_data.txt')):
         # Load test data if it does
         test_data = open(os.path.join(self.preprocessing_path, 'test_data.txt')).read().splitlines()
     else:
         # Preprocess the data as specified in the config file
         for step in self.config['preprocessing']:
             test_data = preprocessor.process(step, test_data)
         # Save the testing data artifact
         with open(os.path.join(self.preprocessing_path, 'test_data.txt'), 'w+') as f:
             # Write the array with each datapoint on a new line
             f.write('\n'.join(test_data))
             f.close()
     return train_data, test_data
 def test_word_filter(self):
     """Tests pertaining to preprocessor.filter_words."""
     preprocessor = Preprocessor(self.empty_dataframe, self.config,
                                 '_INFO_')
     sample_string = 'Some error information here, testIgnoreWord'
     self.assertEqual(preprocessor.filter_words(sample_string),
                      'Some error information here, ')
Example #20
def main(mode, other_args):
    if mode != 'build' and mode != 'detect':
        raise Exception('Unknown execution mode: {}'.format(mode))

    with open("config/config.yml", 'r') as ymlfile:
        cfg = yaml.load(ymlfile, Loader=yaml.FullLoader)

    td = TrendDetector(cfg)

    if mode == 'build':
        # Build model
        sl = SearchLoader(cfg)
        df = sl.load()
        pp = Preprocessor(df)
        agg_df = pp.run()
        td.build(agg_df)

        # Detect trending for all queries on the last day
        max_date = agg_df['date'].max()
        for _, row in agg_df[agg_df['date'] == max_date].iterrows():
            query = row['query']
            count = row['count']
            td.is_trending(query, count)

    else:  # 'detect' mode
        # Load model
        td.load_model()

        # Detect trending for the given query and search count
        query = other_args.query
        obs = other_args.obs
        td.is_trending(query, obs, verbose=True)
    def test_basic(self):
        configuration = Configuration(CONF_PATH)
        preprocessor = Preprocessor(configuration)
        ros_msg_processor = RosMsgProcessor(configuration)

        class_definition_dict = {}

        kidl_file = "basic_class_with_ros_mdlw.yaml"

        with open("%s%s" % (INCLUDE_PATH, kidl_file), 'r') as stream:
            try:
                class_definition_data = yaml.load(stream, Loader=yaml.FullLoader)
            except yaml.YAMLError as exc:
                print(exc)

        class_definition = preprocessor.process(class_definition_data, False)
        class_definition_dict[class_definition.class_name] = class_definition

        ros_msg_definition = ros_msg_processor.process('BasicClass', class_definition_dict)

        env = Environment(
            loader=FileSystemLoader(TEMPLATE_PATH)
        )
        template = env.get_template('ros_template.msg')

        print(template.render(definition=ros_msg_definition))
Example #22
def externalVoodoo(input,
                   output,
                   linkTo,
                   pathToRemoveFromIdentifier="",
                   trace=False):
    inputLines = _readLinesOfFile(input)
    perFileSettings = PerFileSettings(inputLines)
    preprocessor = Preprocessor(linkTo, output, inputLines,
                                pathToRemoveFromIdentifier)

    out = preprocessor.externalHeader()
    out += '#include "VoodooConfiguration.h"\n'
    out += '#include <VoodooCommon/Common.h>\n\n'
    out += "namespace External\n{\n\n"
    iterator = VoodooMultiplexerIterator(perFileSettings)
    iterator.process(input)
    out += iterator.iter()
    out += "\n}\n\n"
    out += preprocessor.externalSwitchToExpectation()
    out += '#include "VoodooCommon/All.h"\n\n'
    out += "namespace External\n{\n\n"
    out += iterator.expect()
    out += "\n}\n\n"
    out += preprocessor.externalFooter()
    return out
Example #23
def main(testpath, path_to_result):
    with open('config.json') as f:#load config
        config = json.load(f)

    logging.info('loading embedding...')
    with open('embedding.pkl', 'rb') as f:
        embedding = pickle.load(f)
        config['model_parameters']['embedding'] = embedding.vectors#load embedding

    preprocessor = Preprocessor(None)
    preprocessor.embedding = embedding#update embedding used by preprocessor

    logging.info('Processing test from test.pkl')
    test = preprocessor.get_dataset(testpath, 6, {'n_positive': -1, 'n_negative': -1, 'shuffle': False})#get dataset
    test.shuffle = False

    PredictorClass = ExamplePredictor
    predictor = PredictorClass(metrics=[], **config['model_parameters'])#make model

    logging.info('loading model from {}'.format('model.pkl.4'))#load model
    predictor.load('model.pkl.4')

    logging.info('predicting...')
    predicts = predictor.predict_dataset(test, test.collate_fn)#predicting

    write_predict_csv(predicts, test, path_to_result)#save csv
Example #24
 def __init__(self, environment, agent, train, action_freq=1):
     self.env = environment
     self.agent = agent
     self.prep = Preprocessor(
         self.env.get_dim(Preprocessor.NB_STATE_HISTORY))
     self.trainer = agent.get_trainer() if train else None
     self.action_freq = action_freq
Example #25
def validate(model: Model, loader: DataLoaderIAM, line_mode: bool) -> Tuple[float, float]:
    """Validates NN."""
    print('Validate NN')
    loader.validation_set()
    preprocessor = Preprocessor(get_img_size(line_mode), line_mode=line_mode)
    num_char_err = 0
    num_char_total = 0
    num_word_ok = 0
    num_word_total = 0
    while loader.has_next():
        iter_info = loader.get_iterator_info()
        print(f'Batch: {iter_info[0]} / {iter_info[1]}')
        batch = loader.get_next()
        batch = preprocessor.process_batch(batch)
        recognized, _ = model.infer_batch(batch)

        print('Ground truth -> Recognized')
        for i in range(len(recognized)):
            num_word_ok += 1 if batch.gt_texts[i] == recognized[i] else 0
            num_word_total += 1
            dist = editdistance.eval(recognized[i], batch.gt_texts[i])
            num_char_err += dist
            num_char_total += len(batch.gt_texts[i])
            print('[OK]' if dist == 0 else '[ERR:%d]' % dist, '"' + batch.gt_texts[i] + '"', '->',
                  '"' + recognized[i] + '"')

    # print validation result
    char_error_rate = num_char_err / num_char_total
    word_accuracy = num_word_ok / num_word_total
    print(f'Character error rate: {char_error_rate * 100.0}%. Word accuracy: {word_accuracy * 100.0}%.')
    return char_error_rate, word_accuracy
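
For reference, the character error rate printed above is the summed edit distance divided by the number of ground-truth characters. Below is a small self-contained illustration using the same editdistance package the snippet imports (the example strings are made up):

import editdistance

gt, recognized = "preprocessing", "preprocesing"
dist = editdistance.eval(recognized, gt)   # one dropped character -> distance 1
cer = dist / len(gt)                       # 1 / 13 is roughly 0.077
print('[OK]' if dist == 0 else '[ERR:%d]' % dist, f'CER={cer:.3f}')
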
Example #26
def voodoo(input,
           output,
           pathToRemoveFromIdentifier,
           voodooDBFile,
           includes,
           defines,
           preIncludes,
           trace=False):
    inputLines = _readLinesOfFile(input)
    perFileSettings = PerFileSettings(inputLines)
    preprocessor = Preprocessor(input, output, inputLines,
                                pathToRemoveFromIdentifier)

    out = preprocessor.header()
    out += '#include <VoodooCommon/Common.h>\n\n'
    iterator = VoodooMultiplexerIterator(perFileSettings, voodooDBFile)
    iterator.process(input,
                     includes=includes,
                     defines=defines,
                     preIncludes=preIncludes)
    out += iterator.iter()
    out += preprocessor.switchToExpectation()
    out += '#include "VoodooCommon/All.h"\n\n'
    out += iterator.expect()
    out += preprocessor.footer()
    return out
    def __init__(self,
                 data_dir,
                 coord,
                 symbol_list,
                 year_range,
                 symbol_first,
                 data_win_len,
                 receptive_field,
                 queue_size=500):
        # system initialize
        self.db_manager = DBManager(data_dir)
        self.preprocessor = Preprocessor()

        self.coord = coord
        self.threads = []

        # processing params
        self.data_dir = data_dir
        self.symbol_list = symbol_list
        self.year_range = year_range
        self.symbol_first = symbol_first
        self.data_win_len = data_win_len
        self.receptive_field = receptive_field

        # queue setup
        self.trans_placeholder = tf.placeholder(dtype=tf.float32, shape=None)
        self.trans_queue = tf.PaddingFIFOQueue(queue_size, ['float32'],
                                               shapes=[(None, 1)])
        self.trans = self.trans_queue.enqueue([self.trans_placeholder])
        # for multithreading:
        self.yield_list = itertools.product(
            self.symbol_list,
            self.year_range) if self.symbol_first else itertools.product(
                self.year_range, self.symbol_list)
Example #28
    def load(self, filename):
        param_dict = dict()
        with open(filename, 'rb+') as f:
            param_dict = pickle.load(f)

        self.min_word_counts = param_dict['min_word_counts']
        self.dtype = param_dict['dtype']
        self.max_df = param_dict['max_df']
        self.min_df = param_dict['min_df']
        self.vocabulary = param_dict['vocabulary']
        self.word_to_ind = param_dict['word_to_ind']
        self.ngram_range = param_dict['ngram_range']

        self.doc_cleaner_pattern = param_dict['doc_cleaner_pattern']
        self.token_pattern = param_dict['token_pattern']
        self.stop_words = param_dict['stop_words']
        self.document_cleaner_func = param_dict['document_cleaner_func']
        self.tokenizer_func = param_dict['tokenizer_func']
        self.token_cleaner_func = param_dict['token_cleaner_func']

        self.preprocessor = Preprocessor(
            doc_cleaner_pattern=self.doc_cleaner_pattern,
            token_pattern=self.token_pattern,
            document_cleaner_func=self.document_cleaner_func,
            tokenizer_func=self.tokenizer_func,
            token_cleaner_func=self.token_cleaner_func,
            stop_words=self.stop_words)
        self.preprocessor.fit()
Example #29
 def __init__(self, model_name="test.hdf5"):
     print('Starting test of {}'.format(model_name))
     models_path = path.abspath(path.join(
         __file__, "../../..")) + "/models/" + model_name
     print(models_path)
     self.model = load_model(models_path)
     self.preprocessor = Preprocessor()
Example #30
	def setUpClass(self):
		self.DEBUG = False
		self.METRICS = False

		self.data_api_impl = DataApi('../../../data/')
		self.cross_validator_impl = CrossValidator()
		self.preprocessor_impl = Preprocessor()
Example #31
 def process(self,file):
     Preprocessor.process(self,file)
     ir = InputReader(file)
     ir.read()
     cqpf = CQPFormat(ir.getText())
     pos = cqpf.getColumn(self.column)
     for i in range(2,len(pos)): # ignore first two pos ...
         uni =  (pos[i])[0:3]
         bi = (pos[i-1])[0:3] + "_" + uni
         tri = (pos[i-2])[0:3] + "_" + bi
               
         if uni not in self.unilexicon:
             self.unilexicon[uni] = 0
         self.unilexicon[uni] += 1
         
         if bi not in self.bilexicon:
             self.bilexicon[bi] = 0
         self.bilexicon[bi] += 1
         
         if tri not in self.trilexicon:
             self.trilexicon[tri] = 0
         self.trilexicon[tri] += 1
         
         
         self.count += 1
Example #32
    def test_with_builder(self):
        configuration = Configuration(CONF_PATH)
        preprocessor = Preprocessor(configuration)
        poco_processor = PocoProcessor(configuration)

        class_definition_dict = {}

        kidl_file = "basic_class_with_builder.yaml"

        with open("%s%s" % (INCLUDE_PATH, kidl_file), 'r') as stream:
            try:
                class_definition_data = yaml.load(stream,
                                                  Loader=yaml.FullLoader)
            except yaml.YAMLError as exc:
                print(exc)

        class_definition = preprocessor.process(class_definition_data, False)
        class_definition_dict[class_definition.class_name] = class_definition

        poco_definition = poco_processor.process('BasicClassWithBuilder',
                                                 class_definition_dict, '')

        env = Environment(loader=FileSystemLoader(TEMPLATE_PATH))
        template = env.get_template('poco_template.h')

        print(template.render(definition=poco_definition))
Example #33
 def __init__(self):
     self.unilexicon = {}
     self.bilexicon = {}
     self.trilexicon = {}
     
     self.count = 0
     self.column = 1
     Preprocessor.__init__(self)
Example #34
 def process(self,file):
     Preprocessor.process(self,file)
     ir = InputReader(file)
     ir.read()
     cqpf = CQPFormat(ir.getText())
     for word in cqpf.getColumn(self.column):
         if word not in self.lexicon:
             self.lexicon[word] = 0
         self.lexicon[word] += 1
         self.count += 1
Example #35
    def _test_bg_subtraction2(self):
        p = Preprocessor(10)
        s = p.load_npy('./test.npy')
        generator = DataPreparator("", "", 512)
        samples1=len(s[0,:])
        
        snew,sbg = generator.bg_subtraction(s)
        samples2=len(snew[0,:])

        self.assertGreater(samples1, samples2)
Example #36
def voodooExpectHeader( input, output, pathToRemoveFromIdentifier, voodooDBFile, includes, defines, preIncludes, trace = False ):
	inputLines = _readLinesOfFile( input )
	perFileSettings = PerFileSettings( inputLines )
	preprocessor = Preprocessor( input, output, inputLines, pathToRemoveFromIdentifier )

	iterator = VoodooMultiplexerIterator( perFileSettings, voodooDBFile )
	iterator.process( input, includes = includes, defines = defines, preIncludes = preIncludes )

	out = preprocessor.headerOfHeader() + '\n'
        
	return out
Example #37
def main(args):
    print("Athene Preprocessor v. 0.1")
    if (len(args) > 2):        
        source_filename, target_filename, include_filenames = parse_args(args)
        if ("" == source_filename or "" == target_filename):
            print("arguments error")
            return
        preprocessor = Preprocessor(source_filename, target_filename, include_filenames)
        preprocessor.run()
        print("ok!")
    else:
        print("Usage:")
        print("\tathp -s source -t target [-i file file file]")    
    def track(self):
        print sys.argv
        cam = cv2.VideoCapture(int(sys.argv[1]))
        cam.set(cv2.cv.CV_CAP_PROP_FRAME_WIDTH, 640)
        cam.set(cv2.cv.CV_CAP_PROP_FRAME_HEIGHT, 480)
        self._initialize_windows()

        p = Preprocessor()
        hs = HandSegment()

        positions = []
        self.count = 0
        self.skip_frames = 0
        x, y, w, h = 0, 0, 0, 0
        prev_x, prev_y, prev_w, prev_h = 0, 0, 0, 0
        while True:
            frame = self.get_frame(cam)
            if type(frame) == type(None):
                continue
            p.process(frame)


            hand = self.get_biggest_hand(frame, prev_x, prev_y, prev_w, prev_h)
            # print hand
            if not hand == []:
                x, y, w, h = hand
                prev_x, prev_y, prev_w, prev_h = hand
                centerx = x + w / 2
                centery = y + h / 2
                # Drawing rectangle around the hand
                cv2.rectangle(frame, (x, y), (x+w, y+h), (0, 0, 0), 1)

                # pointerx, pointery = hs.get_pointer(frame, x, y, w, h)

                # cv2.imshow("pointer", frame[max(y-h, 0):y+h, x:x+w+w/4])
            else:
                x, y, w, h = -1, -1, prev_w, prev_h
                centerx = -1
                centery = -1
            positions.append([centerx, centery])
            # Action
            skip_frames = self.motion(positions, w, h)
            # Drawing line of motion
            self._draw_motion(frame, positions)
            cv2.imshow("display", frame)


            ch = 0xFF & cv2.waitKey(1)
            if ch == 27:
                break

        cv2.destroyAllWindows()
Example #39
def voodoo( input, output, pathToRemoveFromIdentifier, voodooDBFile, includes, defines, preIncludes, trace = False ):
	inputLines = _readLinesOfFile( input )
	perFileSettings = PerFileSettings( inputLines )
	preprocessor = Preprocessor( input, output, inputLines, pathToRemoveFromIdentifier )

	out = preprocessor.header()
	out += '#include <VoodooCommon/Common.h>\n\n'
	iterator = VoodooMultiplexerIterator( perFileSettings, voodooDBFile )
	iterator.process( input, includes = includes, defines = defines, preIncludes = preIncludes )
	out += iterator.iter()
	out += preprocessor.switchToExpectation()
	out += '#include "VoodooCommon/All.h"\n\n'
	out += iterator.expect()
	out += preprocessor.footer()
	return out
Example #40
def main():

    # create lexer and parser instances:
    lexicon_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), "lexicon")
    grammar_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), "grammar")
    lexer = Lexer(lexicon_file, False)
    parser = Parser(grammar_file, lexer.lexicon_dict.keys())

    # run tests:
    for test in tests:

        # create preprocessor instance
        preprocessor_instance = Preprocessor(prefix + test)
        chunks = preprocessor_instance.get_chunks()
        ok = try_parse_program(chunks, lexer, parser)
        print("test " + test + " " + ("PASSED" if ok else "FAILED"))
Example #41
 def __init__(self, data, label, verbose=False, verbosity_level = 5):
     print('Preprocessing data...')
     self.__prep = Prep()
     self.__data = []
     for item in data:
         self.__data.append(self.__prep.process(item))
     self.__label = label
     self.__precision = []
     self.__verbose = verbose
     self.__level = verbosity_level
Example #42
 def process(self,file):
     Preprocessor.process(self,file)
     ir = InputReader(file)
     ir.read()
     cqpf = CQPFormat(ir.getText())
     pos = cqpf.getColumn(self.column)
     for i in range(2,len(pos)): # ignore first two pos ...
         uni =  (pos[i])[0:3]
         bi = (pos[i-1])[0:3] + "_" + uni
         tri = (pos[i-2])[0:3] + "_" + bi
         self.counts[self.posdict[uni]][self.filecount] += 1
         self.counts[self.posdict[bi]][self.filecount] += 1
         self.counts[self.posdict[tri]][self.filecount] += 1
         
         self.count += 1
     for x in self.posnames:
         self.counts[self.posdict[x]][self.filecount] /= float(len(pos)-3)
     self.filecount += 1
Example #43
    def contains_preprocessor_constructs(self):
        """Check if file makes use of any preprocessor constructs.

        The test is done by running the file through the
        :class:`~gromacs.fileformats.preprocessor.Preprocessor` (while
        stripping all empty and lines starting with a comment character. This
        is compared to the original file, stripped in the same manner. If the
        two stripped files differ from each other then the preprocessor altered
        the file and preprocessor directives must have been involved and this
        function returns ``True``.

        .. versionadded: 0.3.1
        """
        from itertools import izip

        kwargs = self.defines.copy()
        kwargs['commentchar'] = self.commentchar
        kwargs['clean'] = True
        kwargs['strip'] = True
        ppitp = Preprocessor(self.real_filename, **kwargs)
        ppitp.parse()
        pp_lines = ppitp.StringIO().readlines()

        def strip_line(line):
            s = line.strip()
            return len(s) == 0 or s.startswith(self.commentchar)
        raw_lines = [line for line in open(self.real_filename) if not strip_line(line)]

        if len(pp_lines) != len(raw_lines):
            self.logger.debug("File %r is preprocessed (pp: %d vs raw %d lines (stripped))",
                              self.real_filename, len(pp_lines), len(raw_lines))
            return True
        for linenum, (raw, pp) in enumerate(izip(raw_lines, pp_lines)):
            if raw != pp:
                self.logger.debug("File %r is preprocessed. Difference at (stripped) line %d",
                                  self.real_filename, linenum)
                self.logger.debug("preprocessed: %s", pp)
                self.logger.debug("original:     %s", raw)
                return True
        self.logger.debug("File %r does not appear to contain recognized preprocessing directives",
                          self.real_filename)
        return False
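
The docstring above describes the check as stripping blank and comment lines from both the preprocessed and the raw file and comparing the results. A tiny self-contained illustration of that comparison idea follows; the file contents are made up, and the real method runs the Preprocessor class rather than a hand-written list:

def stripped(lines, commentchar=';'):
    # keep only lines that are neither blank nor comments
    return [l for l in lines if l.strip() and not l.strip().startswith(commentchar)]

raw = ['; a comment', '#ifdef FLEXIBLE', '[ bonds ]', '#endif', '']
preprocessed = ['[ bonds ]']   # what the preprocessor might emit with FLEXIBLE unset
print(stripped(raw) != stripped(preprocessed))   # True -> directives were involved
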
Example #44
    def get_important_vars(cfg, dat):
        '''
        This method does Feature Selection.
        '''

        # Balances the dataset
        idxs_pos = dat[cfg['target']] == 1
        pos = dat[idxs_pos]
        neg = dat[dat[cfg['target']] == 0][1:sum(idxs_pos)]

        # Concatenates pos and neg, it's already shuffled
        sub_dat = pos.append(neg, ignore_index = True)

        # Imputes the data and fills in the missing values
        sub_dat = Preprocessor.fill_nans(sub_dat)

        # Changes categorical vars to a numerical form
        X = pd.get_dummies(sub_dat)

        #### Correlation-based Feature Selection ####

        # Computes correlation between cfg['target'] and the predictors
        target_corr = X.corr()[cfg['target']].copy()
        target_corr.sort(ascending = False)

        # Sorts and picks the first x features
        # TODO: get optimal x value automatically
        tmp = abs(target_corr).copy()
        tmp.sort(ascending = False)
        important_vars = [tmp.index[0]]
        important_vars.extend(list(tmp.index[2:52])) # removes other target

        #### Variance-based Feature Selection ####

        #sel = VarianceThreshold(threshold = 0.005)
        #X_new = sel.fit_transform(X)

        #### Univariate Feature Selection ####

        #y = X.TARGET_B
        #X = X.drop("TARGET_B", axis = 1)

        #X_new = SelectKBest(chi2, k = 10).fit_transform(X.values, y.values)

        #### Tree-based Feature Selection ####

        #clf = ExtraTreesClassifier()
        #X_new = clf.fit(X.values, y.values).transform(X.values)

        #aux = dict(zip(X.columns, clf.feature_importances_))
        #important_vars = [i[0] for i in sorted(
        #    aux.items(), key = operator.itemgetter(0))]

        return important_vars
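
The correlation ranking above uses the old Series.sort(ascending=False) API, which has since been removed from pandas. Below is a small self-contained sketch of the same correlation-based selection written against current pandas; the column names and data are made up:

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame(rng.random((100, 4)), columns=['f1', 'f2', 'f3', 'f4'])
df['target'] = (df['f1'] + 0.1 * rng.random(100) > 0.5).astype(int)

# Rank predictors by absolute correlation with the target and keep the strongest ones.
corr = df.corr()['target'].drop('target').abs()
important_vars = corr.sort_values(ascending=False).head(2).index.tolist()
print(important_vars)
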
Example #45
def externalVoodoo( input, output, linkTo, pathToRemoveFromIdentifier = "", trace = False ):
	inputLines = _readLinesOfFile( input )
	perFileSettings = PerFileSettings( inputLines )
	preprocessor = Preprocessor( linkTo, output, inputLines, pathToRemoveFromIdentifier )

	out = preprocessor.externalHeader()
	out += '#include "VoodooConfiguration.h"\n'
	out += '#include <VoodooCommon/Common.h>\n\n'
	out += "namespace External\n{\n\n"
	iterator = VoodooMultiplexerIterator( perFileSettings )
	iterator.process( input )
	out += iterator.iter()
	out += "\n}\n\n"
	out += preprocessor.externalSwitchToExpectation()
	out += '#include "VoodooCommon/All.h"\n\n'
	out += "namespace External\n{\n\n"
	out += iterator.expect()
	out += "\n}\n\n"
	out += preprocessor.externalFooter()
	return out
Example #46
    def _createWidgets(self):
        self.SetBackgroundColour((60,60,60))
        self.SetForegroundColour((230,230,230))

        self.processSysIncCb = wx.CheckBox(self, -1, u"Process #include <...> files")
        self.processSysIncCb.SetBackgroundColour((100,100,100))

        sysIncDirs, appIncDirs = Preprocessor.getDefaultIncDirs()
        self._createSysIncWidgets(sysIncDirs)
        self._createAppIncWidgets(appIncDirs)
        self._createPredefMacroWidgets()
        self._createSaveOptionWidgets()
Example #47
	def __init__(self, files_path='', classes={}, out_file='output.csv'):
		# self.mi_terms looks like this {'term1': {'d': 3, 't': 4, 'mi': 2, },}
		self.mi_terms = {}
		# self.mi_classes looks like this {'d': 3, 't': 4}
		self.mi_classes = {}
		self.total_terms_count = 0
		# Some configuration
		self.files_path = files_path
		self.out_file = out_file
		self.classes = classes
		self.files_prefixes = classes.keys()
		self.class_names = [classes[prefix] for prefix in classes]
		# For tokenizing, stemming, etc.
		self.prep = Preprocessor(pattern='\W+', lower=True, stem=False, stemmer_name='porter', pos=False, ngram=1)
    def __init__(self, valid_actions, run_id, display_screen, skip_frames, game_ROM):
        """
        Initialize ALE class. Creates the FIFO pipes, launches ./ale and does the "handshake" phase of communication

        @param display_screen: bool, whether to show the game on screen or not
        @param skip_frames: int, number of frames to skip in the game emulator
        @param game_ROM: location of the game binary to launch with ./ale
        """

        self.display_screen = display_screen
        self.skip_frames = skip_frames
        self.game_ROM = game_ROM
        self.run_id = run_id

        #: create FIFO pipes
        os.mkfifo("ale_fifo_out_%i" % self.run_id)
        os.mkfifo("ale_fifo_in_%i" % self.run_id)

        #: launch ALE with appropriate commands in the background
        command='./ale/ale -max_num_episodes 0 -game_controller fifo_named -disable_colour_averaging true -run_length_encoding false -frame_skip '+str(self.skip_frames) + ' -run_id ' + str(self.run_id) + ' -display_screen '+self.display_screen+" "+self.game_ROM+" &"
        os.system(command)

        os.system('ls -l ale_fifo_out_%i' % self.run_id)
        os.system('ls -l ale_fifo_in_%i' % self.run_id)

        #: open communication with pipes

        self.fin = open('ale_fifo_out_%i' % self.run_id)
        self.fout = open('ale_fifo_in_%i' % self.run_id, 'w')

        input = self.fin.readline()[:-1]
        size = input.split("-")  # saves the image sizes (160*210) for breakout

        #: first thing we send to ALE is the output options- we want to get only image data
        # and episode info(hence the zeros)
        self.fout.write("1,0,0,1\n")
        self.fout.flush()  # send the lines written to pipe

        #: initialize the variables that we will start receiving from ./ale
        self.next_image = []
        self.game_over = True
        self.current_points = 0
        self.actions = [self.all_actions[i] for i in valid_actions]

        #: initialise preprocessor
        self.preprocessor = Preprocessor()
Example #49
    def _createPredefMacroWidgets(self):
        style = wx.LC_REPORT#|wx.LC_VRULES #|wx.LC_HRULES
        self.predefMacroLc = wx.ListCtrl(self, -1, style=style)
        self.predefMacroLc.InsertColumn(0, 'Name')
        self.predefMacroLc.InsertColumn(1, 'Value')
        self.predefMacroLc.SetBackgroundColour((30,30,30))
        self.predefMacroLc.SetForegroundColour((30,30,30))
        f = self.predefMacroLc.GetFont()
        f.SetFaceName("Monospace")
        self.predefMacroLc.SetFont(f)
        for name, val in sorted(Preprocessor.getPredefMacros().items(), key=lambda i: i[0]):
            idx = self.predefMacroLc.InsertStringItem(sys.maxint, name)
            self.predefMacroLc.SetStringItem(idx, 1, val)
            self.predefMacroLc.SetItemTextColour(idx, (255, 255, 255))

        self.predefAddBtn = wx.Button(self, -1, u"Add")
        self.predefEditBtn = wx.Button(self, -1, u"Edit")
        self.predefDelBtn = wx.Button(self, -1, u"Delete")
Example #50
    def __init__(self, label_path,label_bg_path, meta_path, training_description):
        self.label_path = label_path
        self.label_bg_path = label_bg_path
        self.meta_path = meta_path
        self.training_description = training_description
        
        if not os.path.isdir(self.training_description):
            os.mkdir(self.training_description)

        self.batch_size = 64
        self.queue_size = 2048

        self.nr_epoch = 10
        self.preprocessor = Preprocessor(10)
        self.augmenter = AugmentTransform(10, 10)
        self.inverse_labels = {}
        self.inverse_labels_bg = {}

        self.train_val_ratio = 0.1
Example #51
    def __init__(self,  memory, display_screen="true", skip_frames=4, game_ROM='../libraries/ale/roms/breakout.bin'):
        """
        Initialize ALE class. Creates the FIFO pipes, launches ./ale and does the "handshake" phase of communication

        @param memory: memoryD, reference to the instance of class memoryD that collects all transitions in the game
        @param display_screen: bool, whether to show the game on screen or not
        @param skip_frames: int, number of frames to skip in the game emulator
        @param game_ROM: location of the game binary to launch with ./ale
        """

        self.display_screen = display_screen
        self.skip_frames = skip_frames
        self.memory = memory
        self.game_ROM = game_ROM

        #: create FIFO pipes
        os.system("mkfifo ale_fifo_out")
        os.system("mkfifo ale_fifo_in")

        #: launch ALE with appropriate commands in the background
        command='./../libraries/ale/ale -max_num_episodes 0 -game_controller fifo_named -disable_colour_averaging true -run_length_encoding false -frame_skip '+str(self.skip_frames)+' -display_screen '+self.display_screen+" "+self.game_ROM+" &"
        os.system(command)

        #: open communication with pipes
        self.fin = open('ale_fifo_out')
        self.fout = open('ale_fifo_in', 'w')
        
        input = self.fin.readline()[:-1]
        size = input.split("-")  # saves the image sizes (160*210) for breakout

        #: first thing we send to ALE is the output options- we want to get only image data
        # and episode info(hence the zeros)
        self.fout.write("1,0,0,1\n")
        self.fout.flush()  # send the lines written to pipe

        #: initialize the variables that we will start receiving from ./ale
        self.next_image = []
        self.game_over = True
        self.current_reward = 0

        #: initialise preprocessor
        self.preprocessor = Preprocessor()
Example #52
    def setUp(self):
        self.mock_metadata_helper = MagicMock(spec=MetadataHelper)

        self.mock_image_open_patcher = patch('preprocessor.Image.open')
        self.mock_image_open = self.mock_image_open_patcher.start()

        self.mock_image = MagicMock()
        self.mock_exif_data = 'a bunch of exif data'
        self.mock_image.info = {
            'exif': self.mock_exif_data
        }
        self.mock_image_open.return_value = self.mock_image

        self.mock_first_transposed_image = MagicMock()
        self.mock_image.transpose.return_value = self.mock_first_transposed_image

        self.mock_second_transposed_image = MagicMock()
        self.mock_first_transposed_image.transpose.return_value = self.mock_second_transposed_image

        self.test_model = Preprocessor(self.mock_metadata_helper)
Example #53
    def _load_text_preprocessor(self, args):
        """ Load the preprocessor for the context """
        self._update_status(6)
        self.text_preprocessor = Preprocessor()

        # Load preprocessors based on type of model
        ## TF-IDF:
        if "tfidf" in self.options["preproc_type"]:
            print "Loading TF-IDF model..."
            with open(args.tfidfmodel, "rb") as f_tfidf, open(args.svdmodel, "rb") as f_svd:
                tfidf_model = pkl.load(f_tfidf)
                svd_model = pkl.load(f_svd) if "with_svd" in self.options["preproc_params"] else None
                self.text_preprocessor.set_tfidf(tfidf_model, svd_model)
        ## Word2Vec:
        if "w2v" in self.options["preproc_type"]:
            print "Loading Word2Vec model..."
            w2v_model = Word2Vec.load_word2vec_format(args.w2vmodel, binary=True)
            self.text_preprocessor.set_w2v(w2v_model)
        ## Raw:
        if "raw" in self.options["preproc_type"]:
            print "Loading counter model..."
            with open(args.rawmodel, "rb") as f_raw:
                raw_model = pkl.load(f_raw)
                self.text_preprocessor.set_raw(raw_model)
Example #54
                if( check == 9 ):
                        return True
                else:
                        return False
		

if __name__ == '__main__':
    import sys, os
    logging.basicConfig(format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s', level=logging.INFO)
    logging.info("running %s" % " ".join(sys.argv))

    # check and process cmdline input
    program = os.path.basename(sys.argv[0])
    if len(sys.argv) < 5:
        print "Usage: python preprocessor.py -infile -synset_list -vocab_filename -outputfilename "
        sys.exit(1)
    infile = sys.argv[1]
    #synset filename
    S = sys.argv[2]
    #vocab filename
    F = sys.argv[3]
    outfile = sys.argv[4]

    from preprocessor import Preprocessor  # for pickle
    #from gensim.models import Preprocessor  # for pickle
    from gensim.models.word2vec import Text8Corpus
    sentences = Text8Corpus(infile)

    prep = Preprocessor(sentences,F,S)
    prep.prep_text(8,sentences,outfile)
Example #55
class ALE:
    actions = [np.uint8(0), np.uint8(1), np.uint8(3), np.uint8(4), np.uint8(11), np.uint8(12)]
    current_points = 0
    next_screen = ""
    game_over = False
    skip_frames = None
    display_screen = "true"
    game_ROM = None
    fin = ""
    fout = ""
    preprocessor = None
    
    def __init__(self, display_screen, skip_frames, game_ROM):
        """
        Initialize ALE class. Creates the FIFO pipes, launches ./ale and does the "handshake" phase of communication

        @param display_screen: bool, whether to show the game on screen or not
        @param skip_frames: int, number of frames to skip in the game emulator
        @param game_ROM: location of the game binary to launch with ./ale
        """

        self.display_screen = display_screen
        self.skip_frames = skip_frames
        self.game_ROM = game_ROM

        #: create FIFO pipes
        os.system("mkfifo ale_fifo_out")
        os.system("mkfifo ale_fifo_in")

        #: launch ALE with appropriate commands in the background
        command='./../libraries/ale/ale -max_num_episodes 0 -game_controller fifo_named -disable_colour_averaging true -run_length_encoding false -frame_skip '+str(self.skip_frames)+' -display_screen '+self.display_screen+" "+self.game_ROM+" &"
        os.system(command)

        #: open communication with pipes
        self.fin = open('ale_fifo_out')
        self.fout = open('ale_fifo_in', 'w')
        
        input = self.fin.readline()[:-1]
        size = input.split("-")  # saves the image sizes (160*210) for breakout

        #: first thing we send to ALE is the output options- we want to get only image data
        # and episode info(hence the zeros)
        self.fout.write("1,0,0,1\n")
        self.fout.flush()  # send the lines written to pipe

        #: initialize the variables that we will start receiving from ./ale
        self.next_image = []
        self.game_over = True
        self.current_points = 0

        #: initialise preprocessor
        self.preprocessor = Preprocessor()

    def new_game(self):
        """
        Start a new game when all lives are lost.
        """

        #: read from ALE:  game screen + episode info
        self.next_image, episode_info = self.fin.readline()[:-2].split(":")
        self.game_over = bool(int(episode_info.split(",")[0]))
        self.current_points = int(episode_info.split(",")[1])

        #: send the first command
        #  first command has to be 1,0 or 1,1, because the game starts when you press "fire!",
        self.fout.write("1,0\n")
        self.fout.flush()
        self.fin.readline()

        #: preprocess the image and add the image to memory D using a special add function
        #self.memory.add_first(self.preprocessor.process(self.next_image))
        return self.preprocessor.process(self.next_image)

    def end_game(self):
        """
        When all lives are lost, end_game adds last frame to memory resets the system
        """
        #: tell the memory that we lost
        # self.memory.add_last() # this will be done in Main.py
        
        #: send reset command to ALE
        self.fout.write("45,45\n")
        self.fout.flush()
        self.game_over = False  # just in case, but new_game should do it anyway

    
    def move(self, action_index):
        """
        Sends action to ALE and reads responds
        @param action_index: int, the index of the chosen action in the list of available actions
        """
        #: Convert index to action
        action = self.actions[action_index]

        #: Generate a random number for the action of player B
        action_b = random.choice(range(255))


        #: Write and send to ALE stuff
        self.fout.write(str(action)+","+str(action_b)+"\n")
        #print "sent action to ALE: ",  str(action)+",0"
        self.fout.flush()

        #: Read from ALE
        line = self.fin.readline()
        try:
            self.next_image, episode_info = line[:-2].split(":")
            #print "got correct info from ALE: image + ", episode_info
        except:
            print "got an error in reading stuff from ALE"
            traceback.print_exc()
            print line
            exit()
        self.game_over = bool(int(episode_info.split(",")[0]))
        self.current_points = int(episode_info.split(",")[1])
        return self.current_points, self.preprocessor.process(self.next_image)
Example #56
from scipy.cluster.hierarchy import linkage, fcluster
import scipy.spatial.distance as ssd
from dionysus import PairwiseDistances, ExplicitDistances
from sklearn.decomposition import PCA
from sklearn.cluster import SpectralClustering, KMeans
from sklearn.metrics import adjusted_rand_score, mutual_info_score
import numpy as np
# Note: project-specific helpers used below (Preprocessor, Prepreprocessor and the
# *_funcs feature functions) are imported elsewhere and not shown in this snippet.

def bench_cluster(X, y, pca_n_comp):
    n = len(np.unique(y))
    pca = PCA(pca_n_comp)
    X_ = pca.fit_transform(X)
    sc = SpectralClustering(n)
    km = KMeans(n)
    sc_pred = sc.fit_predict(X_)
    km_pred = km.fit_predict(X_)
    distances = PairwiseDistances(X_.tolist())
    distances = ExplicitDistances(distances)
    singlel_pred = fcluster(linkage(ssd.squareform(distances.distances)), n, criterion='maxclust')
    print "single-linkage clustering prediction:", singlel_pred
    print "single-linkage clustering score:", adjusted_rand_score(y, singlel_pred), mutual_info_score(y, singlel_pred)
    print "spectral clustering prediction:", sc_pred
    print "spectral clustering score:", adjusted_rand_score(y, sc_pred), mutual_info_score(y, sc_pred)
    print "kmeans clustering prediction", km_pred
    print "kmeans clustering score:", adjusted_rand_score(y, km_pred), mutual_info_score(y, km_pred)
    print "ground truth labels", y


if __name__ == "__main__":
    funcs = [word_lengths_funcs, sentence_lengths_funcs, ratio_most_n_common_words, ratio_length_of_words_texts,
            lambda text: ratio_length_of_words_texts(text, 8, ge)]
    pp = Preprocessor(Prepreprocessor, funcs, use_tfidf=20)
    X, y = pp.process(['../data/abstracts/', '../data/sports', '../data/reviews'])
    bench_cluster(X, y, 3)
Example #57
 def classify(self) :
     t1 = time.time()
     
     # Schedule a crawl job with the query
     try :        
         crawler = Search(self.search_query)
         crawler.googleSearch()
     except Exception as e :
         print e            
         print "Error in initializing Google search"
     
     t2 = time.time()
     print "Google search done in " + str(t2-t1) + " secs"
     
     # Extract data crawled 
     try :
         crawler.get_crawled_urls()
     except Exception as e :
         print e            
         print "Error in extracting crawl data"
     
     t3 = time.time()
     print "Test data extraction done in " + str(t3-t2) + " secs"
     
     # Preprocess test data
     try :
         preproc_test = Preprocessor(crawler.all_urls)
         preproc_test.preprocessor_main()
     except Exception as e :
         print e
         print "Error in preprocessing crawl data"
         
     t4 = time.time()
     print "Test data preprocessing done in " + str(t4-t3) + " secs"
     
     # Send a search request to Dig server with the query
     dig_search = Dig_Search(self.search_query)
     dig_search.search_request()
     t5 = time.time()
     print "Dig Search done in " + str(t5-t4) + " secs"
     
     # Extract results returned by search query
     dig_search.dig_extraction()
     t6 = time.time()
     print "Dig extraction done in " + str(t6-t5) + " secs"
     
     # Preprocess the search results
     try :        
         preproc_train = Preprocessor(dig_search.urls_dig)
         preproc_train.preprocessor_main()
         dig_search.filter_dig_result(preproc_train.data)
     except Exception as e :
         print e
         print "Error in preprocessing training data"
         
     t7 = time.time()
     print "Training data preprocessing done in " + str(t7-t6) + " secs"
     
     # Compute tfidf vectors of data
     try :        
         tfidf_train = Tfidf_Vectorize(dig_search.urls_dig)
         tfidf_train.tfidf_vectorize_train()
         tfidf_train.tfidf_vectorize_test(preproc_test.data)
     except Exception as e :
         print e
         print "Error in computing tfidf vectorization"
     
     t9 = time.time()
     print "Tfidf computation done in " + str(t9-t7) + " secs"
     
     # Compute similarity of training data with its centroid vector
     try :        
         sim_train = Similarity(tfidf_train.tfidf_centroid_train, tfidf_train.features_train, tfidf_train.tfidf_train)
         similarity_train = sim_train.similarity_main()
     except Exception as e :
         print e
         print "Error in computing cosine similarity"
         
     t10 = time.time()
     print "Training data similarity computation done in " + str(t10-t9) + " secs"
     
     # Compute similarity of test data with training data
     try :        
         sim_test = Similarity(tfidf_train.tfidf_centroid_train, tfidf_train.features_train, tfidf_train.tfidf_test)
         similarity_test = sim_test.similarity_main()
     except Exception as e :
         print e
         print "Error in computing cosine similarity"
         
     t11 = time.time()
     print "Similarity computation done in " + str(t11-t10) + " secs"
     
     print "Total time = " + str(t11-t1)
     
     evaluator = Evaluation(similarity_train, similarity_test)
     urls_classified = evaluator.compare_similarity(preproc_test)
     
     classified_output = self.formatOutput(urls_classified)
     
     return classified_output
Example #58
0
 def classify(self) :
     t1 = time.time()
     
     # Schedule a crawl job with the query
     try :        
         crawler = Search(self.search_query)
         crawler.googleSearch()
     except Exception as e :
         print "Error in initializing Google search"
     
     t2 = time.time()
     print "Google search done in " + str(t2-t1) + " secs"
     
     # Extract data crawled 
     try :
         crawler.get_crawled_urls()
     except Exception as e :
         print "Error in extracting crawl data"
     
     t3 = time.time()
     print "Test data extraction done in " + str(t3-t2) + " secs"
     
     # Preprocess test data
     try :
         preproc_test = Preprocessor(crawler.all_urls)
         preproc_test.preprocessor_main()
     except Exception as e :
         print e
         print "Error in preprocessing crawl data"
         
     t4 = time.time()
     print "Test data preprocessing done in " + str(t4-t3) + " secs"
     
     # Send a search request to Dig server with the query
     dig_search = Dig_Search(self.search_query)
     dig_search.search_request()
     t5 = time.time()
     print "Dig Search done in " + str(t5-t4) + " secs"
     
     # Extract results returned by search query
     dig_search.dig_extraction()
     t6 = time.time()
     print "Dig extraction done in " + str(t6-t5) + " secs"
     
     # Preprocess the search results
     try :        
         preproc_train = Preprocessor(dig_search.urls_dig)
         preproc_train.preprocessor_main()
         dig_search.filter_dig_result(preproc_train.data)
     except Exception as e :
         print e
         print "Error in preprocessing training data"
         
     t7 = time.time()
     print "Training data preprocessing done in " + str(t7-t6) + " secs"
     
     # Compute tfidf vectors of data
     try :        
         tfidf_train = Tfidf_Vectorize(dig_search.urls_dig)
         tfidf_train.tfidf_vectorize_train()
         tfidf_train.tfidf_vectorize_test(preproc_test.data)
     except Exception as e :
         print e
         print "Error in computing tfidf vectorization"
     
     t9 = time.time()
     print "Tfidf computation done in " + str(t9-t7) + " secs"
     
     # Compute similarity of training data with its centroid vector
     try :        
         sim_train = Similarity(tfidf_train.tfidf_centroid_train, tfidf_train.features_train, tfidf_train.tfidf_train)
         similarity_train = sim_train.similarity_main()
     except Exception as e :
         print e
         print "Error in computing cosine similarity"
         
     t10 = time.time()
     print "Training data similarity computation done in " + str(t10-t9) + " secs"
     
     # Compute similarity of test data with training data
     try :        
         sim_test = Similarity(tfidf_train.tfidf_centroid_train, tfidf_train.features_train, tfidf_train.tfidf_test)
         similarity_test = sim_test.similarity_main()
     except Exception as e :
         print e
         print "Error in computing cosine similarity"
         
     t11 = time.time()
     print "Similarity computation done in " + str(t11-t10) + " secs"
     
     print "Total time = " + str(t11-t1)
     
     evaluator = Evaluation(similarity_train, similarity_test)
     similarity_count = evaluator.compare_similarity(preproc_test)
     
     avg_train_similarity = numpy.mean(similarity_train)
     epsilon = 0.4 * avg_train_similarity
     classifier_output = open("output/" + self.search_query.replace(' ','_') + "2.html","w")
     urls_classified = []
     
     tfidf_tr = tfidf_train.tfidf_centroid_train
     tfidf_tr = sorted(tfidf_tr, key= lambda tfidf : tfidf[1], reverse=True)
     
     for sim in similarity_count :
         url_desc = {}
          url_desc['Test_url'] = "<a href='"+preproc_test.data[sim[0]]['url']+"'>"+preproc_test.data[sim[0]]['url']+"</a>"
         if sim[1] >= (avg_train_similarity-epsilon) :
             url_desc['Classifier Output'] = True
         else :
             url_desc['Classifier Output'] = False
         
         url_desc['Similarity Score'] = sim[1]
         url_desc['Average Training Similarity'] = avg_train_similarity
         
         tfidf_url = tfidf_train.tfidf_test[sim[0]]
         tfidf_url = sorted(tfidf_url, key= lambda tfidf : tfidf[1], reverse=True)
         
         url_desc['Top Test Keywords'] = ", ".join([tfidf[0] for tfidf in tfidf_url[0:20]])
         urls_classified.append(url_desc)
         
     _json2conv = {"" : urls_classified}
      classifier_output.write("<html><h2 align='center' style='text-decoration:underline'>Classifier Output</h2><h2 align='center'>Query : "+self.search_query+"</h2><h2 align='center'>Top Train Keywords : "+", ".join([tfidf[0] for tfidf in tfidf_tr[0:20]])+"</h2><body>"+ json2html.convert(json=_json2conv, table_attributes="border=2, cellspacing=0, cellpadding=5, text-align='center'") + "</body></html>")
     
     classifier_output.close()
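For clarity, the acceptance rule in the loop above reduces to comparing each test similarity against 0.6 times the average training similarity, since epsilon is defined as 0.4 * avg_train_similarity; a worked example with made-up numbers:

avg_train_similarity = 0.5                  # made-up value for illustration
epsilon = 0.4 * avg_train_similarity        # 0.2
threshold = avg_train_similarity - epsilon  # 0.3, i.e. 0.6 * avg_train_similarity
assert 0.35 >= threshold                    # similarity 0.35 -> classified as relevant
assert not (0.25 >= threshold)              # similarity 0.25 -> rejected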
Example #59
0
def analyze(snd_pipe, db_path, pp_cfg, parser_cfg, srcFiles, use_pipeline=False, analyzer_process=1, pp_process=1, parser_process=1):
    db = DatabaseManager()
    pp_list = [Preprocessor(**pp_cfg) for i in range(pp_process if use_pipeline else analyzer_process)]
    parser_list = [Parser(**parser_cfg) for i in range(parser_process if use_pipeline else analyzer_process)]
    numFiles = len(srcFiles)

    t_0 = datetime.datetime.now()

    projInfo = {}
    projInfo['predefined'] = pp_list[0].preprocess_predef()

    task_queue = Queue()
    done_queue = Queue()

    for i, srcFile in enumerate(srcFiles):
        task_queue.put(srcFile)
    for i in range(len(pp_list)):
        task_queue.put('STOP')

    if not use_pipeline:
        analyzer_p_list = [Process(target=analyzer_worker, args=(pp, parser, task_queue, done_queue)) for pp, parser in zip(pp_list, parser_list)]
        for analyzer_p in analyzer_p_list:
            analyzer_p.start()

        for i, srcFile in enumerate(srcFiles):
            #print 'analyze: [%d/%d]' % (i,numFiles), srcFile
            projInfo[srcFile] = done_queue.get()
            snd_pipe.send((i, numFiles, srcFile))
            if snd_pipe.poll():
                for analyzer_p in analyzer_p_list:
                    analyzer_p.terminate()
                for analyzer_p in analyzer_p_list:
                    analyzer_p.join()
                Preprocessor.clearTokenCache()
                snd_pipe.send('STOPPED')
                print 'analyze: canceled'
                return
        for analyzer_p in analyzer_p_list:
            analyzer_p.join()
    else:
        pp_queue = Queue()

        pp_p_list = [Process(target=preprocessor_worker, args=(pp, task_queue, pp_queue)) for pp in pp_list]
        for pp_p in pp_p_list:
            pp_p.start()

        parser_p_list = [Process(target=parser_worker, args=(parser, pp_queue, done_queue)) for parser in parser_list]
        for parser_p in parser_p_list:
            parser_p.start()

        for i, srcFile in enumerate(srcFiles):
            #print 'analyze: [%d/%d]' % (i,numFiles), srcFile
            projInfo[srcFile] = done_queue.get()
            snd_pipe.send((i, numFiles, srcFile))
            if snd_pipe.poll():
                for pp_p in pp_p_list:
                    pp_p.terminate()
                for parser_p in parser_p_list:
                    parser_p.terminate()
                for pp_p in pp_p_list:
                    pp_p.join()
                for parser_p in parser_p_list:
                    parser_p.join()
                Preprocessor.clearTokenCache()
                snd_pipe.send('STOPPED')
                print 'analyze: canceled'
                return

        for i in range(len(parser_p_list)):
            pp_queue.put('STOP')
        for pp_p in pp_p_list:
            pp_p.join()
        for parser_p in parser_p_list:
            parser_p.join()

    t_1 = datetime.datetime.now()

    db.createDB(db_path)
    db.addData(projInfo)
    db.saveDB()

    db.closeDB()

    print 'analyze: done', t_1 - t_0
    snd_pipe.send((numFiles, numFiles, 'Generating Database ... done'))
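The worker functions referenced above (analyzer_worker, preprocessor_worker, parser_worker) are not shown in this example; the following is a minimal sketch of what they might look like, assuming `pp.preprocess(path)` and `parser.parse(tokens)` as the per-file entry points (hypothetical method names). Note that a real implementation would likely tag each result with its source file, since results can arrive on the queue out of order.

def analyzer_worker(pp, parser, task_queue, done_queue):
    # Non-pipelined worker: preprocess and parse each file in one process.
    for src_file in iter(task_queue.get, 'STOP'):
        done_queue.put(parser.parse(pp.preprocess(src_file)))

def preprocessor_worker(pp, task_queue, pp_queue):
    # First pipeline stage: preprocess files and forward the token stream.
    for src_file in iter(task_queue.get, 'STOP'):
        pp_queue.put(pp.preprocess(src_file))

def parser_worker(parser, pp_queue, done_queue):
    # Second pipeline stage: parse preprocessed output until 'STOP' arrives.
    for tokens in iter(pp_queue.get, 'STOP'):
        done_queue.put(parser.parse(tokens))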
Example #60
0
 def __init__(self):
     self.readNames()
     self.count = 0
     self.column = 1
     Preprocessor.__init__(self)