def normalizeDatabase(self):
    """
    If a collection has a field which contains several fields, we extract
    those fields and make a new collection out of it.
    """
    n = Normalizer(self.metadata_db, self.dataset_db)
    # Bombs away!
    LOG.info("Processing mongodb dataset normalization")
    n.process()
    LOG.info("Finishing normalization")
def __init__(self, n_states, n_actions, n_goals, action_bounds, capacity, env,
             k_future, batch_size, action_size=1, tau=0.05, actor_lr=1e-3,
             critic_lr=1e-3, gamma=0.98):
    self.device = device("cpu")
    self.n_states = n_states
    self.n_actions = n_actions
    self.n_goals = n_goals
    self.k_future = k_future
    self.action_bounds = action_bounds
    self.action_size = action_size
    self.env = env

    self.actor = Actor(self.n_states, n_actions=self.n_actions,
                       n_goals=self.n_goals).to(self.device)
    self.critic = Critic(self.n_states, action_size=self.action_size,
                         n_goals=self.n_goals).to(self.device)
    self.sync_networks(self.actor)
    self.sync_networks(self.critic)
    self.actor_target = Actor(self.n_states, n_actions=self.n_actions,
                              n_goals=self.n_goals).to(self.device)
    self.critic_target = Critic(self.n_states, action_size=self.action_size,
                                n_goals=self.n_goals).to(self.device)
    self.init_target_networks()
    self.tau = tau
    self.gamma = gamma

    self.capacity = capacity
    self.memory = Memory(self.capacity, self.k_future, self.env)

    self.batch_size = batch_size
    self.actor_lr = actor_lr
    self.critic_lr = critic_lr
    self.actor_optim = Adam(self.actor.parameters(), self.actor_lr)
    self.critic_optim = Adam(self.critic.parameters(), self.critic_lr)

    self.state_normalizer = Normalizer(self.n_states[0], default_clip_range=5)
    self.goal_normalizer = Normalizer(self.n_goals, default_clip_range=5)
def stoi(self, filepath, clean_filepath=None):
    # filepath = path to mashup
    # Needs octave and octave-signal installed
    # Use "pip install oct2py" to install the python-octave bridge
    # STOI assumes
    # * a sampling rate of 10kHz, resamples otherwise
    # * window length of 384ms
    # * 15 third octave bands over full frequency range
    # * overlapping segments with hanning window
    # * removes silent frames
    import librosa
    from oct2py import octave

    if clean_filepath is None:
        # No clean file given.
        # Get processed and clean file from mashup.
        vocal_isolation = VocalIsolation(config)
        vocal_isolation.loadWeights(config.weights)
        audio, sampleRate = conversion.load_audio_file(filepath)
        spectrogram = conversion.audio_file_to_spectrogram(
            audio, fftWindowSize=config.fft,
            learn_phase=self.config.learn_phase)

        normalizer = Normalizer()
        normalize = normalizer.get(both=False)
        denormalize = normalizer.get_reverse()

        # normalize
        spectrogram, norm = normalize(spectrogram)

        info = vocal_isolation.process_spectrogram(spectrogram,
                                                   config.get_channels())
        spectrogram, new_spectrogram = info

        # de-normalize
        new_spectrogram = denormalize(new_spectrogram, norm)

        processed = conversion.spectrogram_to_audio_file(
            new_spectrogram, config.fft, config.phase_iterations)

        clean_filepath = filepath.replace("_all.wav", "_vocal.wav")
        clean, sampling_rate = librosa.load(clean_filepath)
    else:
        # A clean file is given.
        # Compare it with the processed audio.
        processed, sampling_rate = librosa.load(filepath)
        clean, sampling_rate = librosa.load(clean_filepath)

    # Make sure the original and processed audio have the same length
    clean = clean[:processed.shape[0]]

    octave.eval("pkg load signal")
    d = octave.stoi(clean, processed, sampling_rate)
    self._write("stoi: %f" % d)
def __init__(self, **params):
    self.num_epochs = params["num_epochs"]
    self.early_stop_tolerance = params["early_stop_tolerance"]
    self.norm_method = params["norm_method"]
    self.loss_type = params["loss_type"]
    self.learning_rate = params["learning_rate"]
    self.l2_reg = params["l2_reg"]
    self.clip = params['clip']
    self.device = params['device']

    self.input_normalizer = Normalizer(self.norm_method)
    self.output_normalizer = Normalizer(self.norm_method)
def test_extract_section_features(self):
    normalizer = Normalizer()
    self.assertEqual(normalizer.extract_section_features('136'),
                     generate_feature(d='136'))
    self.assertEqual(normalizer.extract_section_features('Reserve 40 '),
                     generate_feature(pp="reserve", d='40'))
    self.assertEqual(normalizer.extract_section_features('Top Deck 6 '),
                     generate_feature(pp="top deck", d='6'))
    self.assertEqual(normalizer.extract_section_features('31RS '),
                     generate_feature(d='31', s='rs'))
    self.assertEqual(normalizer.extract_section_features('Left Field Pavilion 311'),
                     generate_feature(pp='left field pavilion', d='311'))
    self.assertEqual(normalizer.extract_section_features('Infield Reserve IFR7 '),
                     generate_feature(pp='infield reserve', p='ifr', d='7'))
    self.assertEqual(normalizer.extract_section_features('311PL'),
                     generate_feature(d='311', s='pl'))
    self.assertEqual(normalizer.extract_section_features('F9'),
                     generate_feature(p='f', d='9'))
    self.assertEqual(normalizer.extract_section_features('L36'),
                     generate_feature(p='l', d='36'))
def _prepare_normalizers(self):
    normalizer_tf = dict()
    with tf.variable_scope('o_stats'):
        normalizer_tf['o'] = Normalizer(self._env_spec['o_dim'],
                                        **self._normalizer_params)
    with tf.variable_scope('u_stats'):
        normalizer_tf['a'] = Normalizer(self._env_spec['a_dim'],
                                        **self._normalizer_params)
    with tf.variable_scope('abs_pred_err_stats'):
        normalizer_tf['abs_pred_err'] = Normalizer(
            self._env_spec['o_dim'], **self._normalizer_params)
    return normalizer_tf
def clean(self, tweets):
    for tw in tweets:
        count = 0
        for t in tweets[tw]:
            norm = Normalizer()
            stp = StpRemoval()
            t['text_clean'] = t['text'].encode('utf-8', errors='ignore')
            t['text_clean'] = t['text_clean'].translate(
                string.maketrans(string.punctuation, ' ' * len(string.punctuation)))
            # normalize first, then strip stop words from the normalized text
            text = norm.normalize(t['text_clean'])
            text = stp.removeStp(text)
            tweets[tw][count]['text_clean'] = text.lower()
            count = count + 1
    return tweets
def reload(self):
    """Refreshes this instance's normalizers pool."""
    self.normalizers = {'raw': [], 'body': []}
    for path in self.iter_normalizer():
        norm = parse(open(path))
        if not self.dtd.validate(norm):
            warnings.warn('Skipping %s : invalid DTD' % path)
            print 'invalid normalizer ', path
        else:
            normalizer = Normalizer(norm, self.ctt)
            normalizer.uuid = self._compute_norm_uuid(normalizer)
            self.normalizers.setdefault(normalizer.appliedTo, [])
            self.normalizers[normalizer.appliedTo].append(normalizer)
    self.activate_normalizers()
def reload(self):
    """Refreshes this instance's normalizers pool."""
    self.normalizers = {'raw': [], 'body': []}
    for path in self.iter_normalizer():
        norm = parse(open(path))
        if not self.dtd.validate(norm):
            warnings.warn('Skipping %s : invalid DTD' % path)
            print 'invalid normalizer ', path
        else:
            normalizer = Normalizer(norm, self.ctt, self.ccb)
            normalizer.uuid = self._compute_norm_uuid(normalizer)
            self.normalizers.setdefault(normalizer.appliedTo, [])
            self.normalizers[normalizer.appliedTo].append(normalizer)
    self.activate_normalizers()
def __init__(self, env, act_dim, state_dim, goal_dim, act_range,
             buffer_size=int(1e6), gamma=0.98, lr=0.001, tau=0.95):
    """ Initialization """
    # Environment and A2C parameters
    self.act_dim = act_dim
    self.act_range = act_range
    self.env_dim = state_dim + goal_dim
    self.gamma = gamma
    self.lr = lr
    self.tau = tau
    self.env = env

    # Create actor and critic networks
    self.actor_network = Actor(self.env_dim, act_dim, act_range)
    self.actor_target_network = Actor(self.env_dim, act_dim, act_range)
    self.actor_target_network.load_state_dict(self.actor_network.state_dict())

    self.critic_network = Critic(self.env_dim, act_dim, act_range)
    self.critic_target_network = Critic(self.env_dim, act_dim, act_range)
    self.critic_target_network.load_state_dict(self.critic_network.state_dict())

    sync_networks(self.actor_network)
    sync_networks(self.critic_network)

    # Optimizer
    self.actor_optim = torch.optim.Adam(self.actor_network.parameters(), lr=lr)
    self.critic_optim = torch.optim.Adam(self.critic_network.parameters(), lr=lr)

    # Replay buffer
    # self.buffer = MemoryBuffer(buffer_size)
    self.buffer = ReplayMemory(buffer_size)

    # Normalizers
    self.goal_normalizer = Normalizer(goal_dim, default_clip_range=5)  # Clip between [-5, 5]
    self.state_normalizer = Normalizer(state_dim, default_clip_range=5)
def train(self, batch_size, fold_idx, normalize=True, augment=False):
    """ Dataset for model training

    Args:
        batch_size: int, number of images in a batch
        fold_idx: int, index of fold, from 0 to n_folds - 1
        normalize: bool, whether to normalize training data with
            Welford's online algorithm.
        augment: bool, whether to use augmentation or not

    Returns:
        data: TensorFlow dataset
        steps: int, number of steps in a train epoch
    """
    if not (fold_idx >= 0 and fold_idx < self.n_folds):
        raise Exception(('Fold index {} is out of expected range:' +
                         ' [0, {}]').format(fold_idx, self.n_folds - 1))
    if normalize and augment:
        raise Exception('Combining augmentation and normalization ' +
                        'with the Welford algo is not supported')

    print(' ... Generating Training Dataset ... ')
    if self.n_folds == 1:
        train_idx = range(0, len(self.filenames))
    else:
        train_idx, _ = list(self.kf.split(self.filenames))[fold_idx]
    filenames = np.array(self.filenames)[train_idx]
    labels = np.array(self.labels)[train_idx]
    steps = math.ceil(len(filenames) / batch_size)

    if normalize:
        mean, std = Normalizer.calc_mean_and_std(filenames, self.img_size)
        mean = np.array([mean['red'], mean['green'], mean['blue']])
        std = np.array([std['red'], std['green'], std['blue']])
    else:
        # values taken from ImageNet Dataset
        mean = np.array([0.485, 0.456, 0.406])
        std = np.array([0.229, 0.224, 0.225])
    self.normalizer = Normalizer(mean, std)

    data = tf.data.Dataset.from_tensor_slices(
        (tf.constant(filenames), tf.constant(labels)))
    data = data.map(self.parse_fn)
    if augment:
        augs = [self.flip, self.color, self.rotate, self.zoom]
        for f in augs:
            data = data.map(f, num_parallel_calls=4)
        data = data.map(self.drop, num_parallel_calls=4)
    data = data.shuffle(buffer_size=len(filenames))
    data = data.batch(batch_size)
    data = data.prefetch(1)
    return data, steps
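# A minimal sketch of the Welford update the docstring above refers to. The
# actual Normalizer.calc_mean_and_std is not shown in this snippet, so the
# helper below (welford_mean_std) and its streaming interface are illustrative
# assumptions, not the original implementation.
import numpy as np

def welford_mean_std(pixel_batches):
    """Accumulate per-channel mean/std over an iterable of (N, 3) RGB pixel arrays."""
    count = 0
    mean = np.zeros(3)
    m2 = np.zeros(3)  # running sum of squared deviations from the current mean
    for batch in pixel_batches:
        for x in batch:          # x is a single RGB pixel
            count += 1
            delta = x - mean
            mean += delta / count
            m2 += delta * (x - mean)
    return mean, np.sqrt(m2 / count)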
def __init__(self, hp=None, input_size=None, output_size=None,
             normalizer=None, policy=None, monitor_dir=None):
    self.hp = hp or Hp()
    np.random.seed(self.hp.seed)
    self.env = gym.make(self.hp.env_name)
    if monitor_dir is not None:
        # Record periodic videos
        should_record = lambda i: self.record_video
        self.env = wrappers.Monitor(self.env, monitor_dir,
                                    video_callable=should_record, force=True)
    self.hp.episode_length = self.env.spec.timestep_limit or self.hp.episode_length
    self.input_size = input_size or self.env.observation_space.shape[0]
    self.output_size = output_size or self.env.action_space.shape[0]
    self.normalizer = normalizer or Normalizer(self.input_size)
    self.policy = policy or Policy(self.input_size, self.output_size, self.hp)
    self.record_video = False
class Preprocessor():
    def __init__(self, data_dir, start_index=0):
        self.normalizer = Normalizer()
        self.data_dir = data_dir
        self.docs_number = len(glob(os.path.join(self.data_dir, '**', '*.nxml')))
        self.inserter = Inserter()
        self.start_index = start_index

    def preprocess(self):
        print('Start time: {0}'.format(datetime.now()))
        for index, file_name in enumerate(sorted(glob(os.path.join(self.data_dir, '**', '*.nxml')))):
            if index >= self.start_index:
                terms = self.normalizer.normalize(file_name)
                for term in set(terms):
                    self.inserter.insert(term, self.doc_id(file_name), terms.count(term))
                if index % 100 == 0:
                    print('processing doc {0}/{1}'.format(index + 1, self.docs_number))
        print('Finished at: {0}'.format(datetime.now()))

    def doc_id(self, file_name):
        return os.path.splitext(os.path.basename(file_name))[0]
def __init__(self):
    self.__vocab: t.List[Replacement] = self.__make_vocab()
    self.__normalizer: Normalizer = Normalizer(self.__vocab)
    self.__string: str = ''
    self.__word: Word = Word()
    self.__normalized_word: Word = Word()
class DataSource(object):
    def __init__(self, mode, data_path):
        self.normalizer = Normalizer()
        self.data_path = data_path
        self.mode = mode
        self.filenames = []
        self.contents = []
        self.labels = []

    def load_data(self):
        # load raw sample
        raw_data = []
        raw_sample = ""
        with open(self.data_path, "r") as f:
            for row in f:
                if (self.mode + "_") in row:
                    raw_data.append(raw_sample)
                    raw_sample = row
                else:
                    raw_sample += row
            raw_data.append(raw_sample)
        raw_data = raw_data[1:]

        # process raw data
        for i, raw_sample in enumerate(raw_data):
            self.filenames.append(raw_sample.split("\n")[0])
            print(i, end="\r")
            content = re.search("\"(.*)\"", raw_sample, re.DOTALL).group(1)
            self.contents.append(self.normalizer.transform(content))
            if self.mode == "train":
                self.labels.append(raw_sample.split("\n")[-3])
def prepare_data(self, chop, tracks, post_process=False):
    normalize = Normalizer().get()
    x = []
    y = []
    for track in tracks:
        x.append(self.mashup[track])
        if self.is_instrumental:
            y.append(self.instrumental[track])
        else:
            y.append(self.vocal[track])
    x = [self.prepare_spectrogram(s) for s in x]
    y = [self.prepare_spectrogram(s) for s in y]
    x, y = normalize(x, y)

    mashup_slices = []
    output_slices = []
    for mashup, output in zip(x, y):
        x_slices, y_slices = chop(mashup, output)
        x_slices = np.array(x_slices)[:]
        y_slices = np.array(y_slices)[:]
        mashup_slices.append(x_slices)
        output_slices.append(y_slices)
    return mashup_slices, output_slices
def main():
    numpy.random.seed(3)
    if not os.path.exists("cache"):
        os.makedirs("cache")

    if os.path.exists("cache/training_data.npy"):
        training_data = numpy.load('cache/training_data.npy')
    else:
        training_data = read_images(TRAIN['data'], DATA_FOLDER)
        numpy.save('cache/training_data', training_data)
    if os.path.exists("cache/training_labels.npy"):
        training_labels = numpy.load('cache/training_labels.npy')
    else:
        training_labels = read_labels(TRAIN['labels'], DATA_FOLDER)
        numpy.save('cache/training_labels', training_labels)

    training_data = training_data.reshape(training_data.shape[0], -1)
    number_of_inputs = training_data.shape[1]
    number_of_labels = numpy.unique(training_labels).size
    topology = [number_of_inputs, 50, 30, number_of_labels]

    if os.path.exists("cache/test_data.npy"):
        test_data = numpy.load('cache/test_data.npy')
    else:
        test_data = read_images(TEST['data'], DATA_FOLDER)
        numpy.save('cache/test_data', test_data)
    if os.path.exists("cache/test_labels.npy"):
        test_labels = numpy.load('cache/test_labels.npy')
    else:
        test_labels = read_labels(TEST['labels'], DATA_FOLDER)
        numpy.save('cache/test_labels', test_labels)

    test_data = test_data.reshape(test_data.shape[0], -1)
    normalized = Normalizer(training_data=training_data, test_data=test_data)

    for i in range(1):
        print("Running {}th time...".format(i))
        neural_network = NeuralNetwork(topology)
        neural_network.train(normalized.training_data(), training_labels,
                             test_data=normalized.test_data(),
                             test_labels=test_labels)
def is_signature_real(cls, signature_image_name: str) -> bool:
    person_id = signature_image_name[-7:-4]
    standards_specifier = SignatureStandardsSpecifier(person_id=person_id)
    standards = standards_specifier.get_standards()
    pic = Normalizer.resize(standards.width, standards.height,
                            cls.__get_normalized_image(signature_image_name))
    number_of_pixels = Detector.__count_black_pixels(pic)
    return standards.pixel_low_bound < number_of_pixels < standards.pixel_high_bound
def normalize_training_inputs(training_inputs):
    m_training_inputs_only = np.array(training_inputs)
    m_dimensions = m_training_inputs_only.transpose()
    m_dimensions_normalized = np.array([
        Normalizer.normalize_to_stdev(dimension_set)
        for dimension_set in m_dimensions
    ])
    m_training_inputs_normalized = m_dimensions_normalized.transpose()
    return m_training_inputs_normalized
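# Illustrative usage only (the inputs below are made up): transposing makes each
# row of m_dimensions hold every sample's value for one feature, so
# Normalizer.normalize_to_stdev, which is assumed to rescale a 1-D array, is
# applied per feature before transposing back to the samples-by-features layout.
import numpy as np

training_inputs = [[1.0, 100.0],
                   [2.0, 200.0],
                   [3.0, 300.0]]   # 3 samples, 2 features on very different scales

normalized = normalize_training_inputs(training_inputs)
print(normalized.shape)  # (3, 2): same layout as the input, features scaled independently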
def reset(self):
    self.action_space = self.env.action_space
    obs_space = self.env.observation_space.spaces
    obs_len = obs_space['observation'].shape[0]
    goal_len = obs_space['desired_goal'].shape[0]
    self.state_size = obs_len + goal_len
    self.actions_size = self.action_space.shape[0]
    max_action = float(self.env.action_space.high[0])

    self.actor = ActorNet(self.state_size, *self.config['net_sizes'],
                          self.actions_size, max_action)
    self.critic = CriticNet(self.state_size, *self.config['net_sizes'],
                            self.actions_size)
    self.actor_target = ActorNet(self.state_size, *self.config['net_sizes'],
                                 self.actions_size, max_action)
    self.critic_target = CriticNet(self.state_size, *self.config['net_sizes'],
                                   self.actions_size)
    self.actor_optim = Adam(self.actor.parameters(), lr=self.config['learning_rate'])
    self.critic_optim = Adam(self.critic.parameters(), lr=self.config['learning_rate'])
    self.update(self.critic_target, self.critic, 1)
    self.update(self.actor_target, self.actor, 1)

    self.epsilon = self.config['epsilon']
    self.epsilon_decay = self.config['epsilon_decay']
    self.gamma = self.config['gamma']

    if self.config['PER']:
        self.memory = PrioritizedMemory(
            self.config['memory_size'],
            self.config["memory_alpha"],
            self.config["memory_epsilon"],
            self.config["memory_beta"],
            self.config["memory_beta_increment"])
    else:
        self.memory = ReplayBuffer(self.config['memory_size'])
    self.batch_size = self.config['batch_size']

    self.normalizer = Normalizer(obs_len, goal_len)
    # warm up the normalizer
    self.normalizer.observe(self.env.reset())
class classifier(AffineModel):
    def __init__(self, **kwargs):
        self.normalizer = None
        AffineModel.__init__(self, **kwargs)

    def fit(self, data, *args, **kwargs):
        self.normalizer = Normalizer(data)
        return AffineModel.fit(self, data, *args, **kwargs)

    def score(self, data, *args, **kwargs):
        d = self.normalizer.normalize(data)
        return AffineModel.score(self, d, *args, **kwargs)
def __init__(self, parent=None, show=True):
    Qt.QMainWindow.__init__(self, parent)
    self.ds = reader.DataSet("")
    self.meshes = []
    self.plotter = BackgroundPlotter(shape=(1, 2), border_color='white',
                                     title="MMR Visualization")
    self.setWindowTitle('MMR UI')
    self.frame = Qt.QFrame()
    vlayout = Qt.QVBoxLayout()
    self.normalizer = Normalizer()
    self.frame.setLayout(vlayout)
    self.setCentralWidget(self.frame)

    mainMenu = self.menuBar()
    fileMenu = mainMenu.addMenu('File')
    exitButton = Qt.QAction('Exit', self)
    exitButton.setShortcut('Ctrl+Q')
    exitButton.triggered.connect(self.close)
    fileMenu.addAction(exitButton)

    meshMenu = mainMenu.addMenu('Mesh')
    self.load_mesh = Qt.QAction('Load mesh', self)
    self.load_mesh.triggered.connect(
        lambda: self.add_mesh(self.open_file_name_dialog()))
    meshMenu.addAction(self.load_mesh)

    self.show_norm_pipeline = Qt.QAction('Show norm pipeline', self)
    self.show_norm_pipeline.triggered.connect(
        lambda: self.show_processing(self.open_file_name_dialog()))
    meshMenu.addAction(self.show_norm_pipeline)

    self.extract_features = Qt.QAction('Extract features', self)
    self.extract_features.triggered.connect(lambda: print(
        FeatureExtractor.mono_run_pipeline(self.open_file_name_dialog())))
    meshMenu.addAction(self.extract_features)

    if show:
        self.show()
def create_network(self):
    # for actor network
    self.o_stats = Normalizer(size=self.dimo, eps=self.norm_eps,
                              default_clip_range=self.norm_clip)
    if self.use_goal:
        self.g_stats = Normalizer(size=self.dimg, eps=self.norm_eps,
                                  default_clip_range=self.norm_clip)
    else:
        self.g_stats = None

    self.main = ActorCritic(self.o_stats, self.g_stats, self.input_dims,
                            self.use_goal).to(self.device)
    self.target = ActorCritic(self.o_stats, self.g_stats, self.input_dims,
                              self.use_goal).to(self.device)
    self.target.actor = copy.deepcopy(self.main.actor)
    self.target.critic = copy.deepcopy(self.main.critic)

    self.actor_optimizer = optim.Adam(self.main.actor.parameters(), lr=self.pi_lr)
    self.critic_optimizer = optim.Adam(self.main.critic.parameters(), lr=self.Q_lr)
def main(in_subt, out_subt):
    assert in_subt != ""
    assert out_subt != ""

    parser = Parser()
    normalizer = Normalizer()
    lemma_filter = Filter()

    try:
        f = codecs.open(in_subt, 'r', encoding='utf8')
        text = f.read()
        f.close()
    except IOError:
        sys.exit("The subtitle could not be found in the path you provided.")

    parser.parse(text)
    normalizer.normalize(parser.get_text())
    lemma_filter.clean_lemmas(normalizer.get_lemmas())

    new_sub = Subtitle(parser.get_indexes(), parser.get_times(),
                       parser.get_text(), lemma_filter.get_final_lemmas(),
                       lemma_filter.get_dict(), out_subt)
    new_sub.create_subtitle()
def check_data(manifest, sample):
    normalizer = Normalizer()
    normalizer.read_manifest(manifest)
    with open(sample, 'r') as f:
        sample_file = f.read().strip()
    if len(sample_file) > 1:
        for line in sample_file.split('\n')[1:]:
            # line in the manifest: section_id,section_name,row_id,row_name
            line = line.strip()
            elements = line.split(',')
            section_name = elements[0]
            row_name = elements[1]
            # [(section_id, row_id, bool), number of matches,
            #  different section ids we have seen, different row_ids we have seen]
            # data example: [(None, None, False), 3, set([170, 11, 118]), set([7])]
            data = normalizer.normalize_raw(section_name, row_name)
            # We are looking for matches that result in `False` but are not caused by
            # the database returning multiple entries or a row_name passed as a range
            count = 0
            if data[0][2] == False and data[1] <= 1 and '-' not in row_name:
                count += 1
                print data, section_name, row_name, row_name.isdigit()
            print 'BAD MATCH: ', count
            assert count == 0
def process_spectrogram(self, spectrogram, channels=1):
    chopper = Chopper()
    chopper.name = "infer"
    chopper.params = "{'scale': %d}" % self.config.inference_slice
    chop = chopper.get(both=False)

    slices = chop(spectrogram)

    normalizer = Normalizer()
    normalize = normalizer.get(both=False)
    denormalize = normalizer.get_reverse()

    new_spectrogram = np.zeros((spectrogram.shape[0], 0, channels))
    for slice in slices:
        # normalize
        slice, norm = normalize(slice)

        expanded_spectrogram = conversion.expand_to_grid(
            slice, self.peakDownscaleFactor, channels)
        expanded_spectrogram_with_batch_and_channels = \
            expanded_spectrogram[np.newaxis, :, :]

        predicted_spectrogram_with_batch_and_channels = self.model.predict(
            expanded_spectrogram_with_batch_and_channels)
        predicted_spectrogram = \
            predicted_spectrogram_with_batch_and_channels[0, :, :, :]
        local_spectrogram = predicted_spectrogram[:slice.shape[0], :slice.shape[1], :]

        # de-normalize
        local_spectrogram = denormalize(local_spectrogram, norm)

        new_spectrogram = np.concatenate(
            (new_spectrogram, local_spectrogram), axis=1)
    console.log("Processed spectrogram")
    return spectrogram, new_spectrogram
def __init__(self, hparams):
    super(HER, self).__init__()
    self.hparams = hparams
    self.test_env = make_env(hparams, render=self.hparams.render_test)
    sample_obs = self.test_env.observation_space['observation'].sample()
    sample_goal = self.test_env.observation_space['achieved_goal'].sample()

    # HARD CODED VALUES FOR Bullet-HRL
    action_limits, state_limits = get_env_boundaries()
    action_offset, action_bounds, action_clip_low, action_clip_high = action_limits

    state_shape = sample_obs.shape[0]
    action_shape = self.test_env.action_space.shape[0]
    goal_shape = sample_goal.shape[0]
    self.action_clips = (action_clip_low, action_clip_high)

    self.model = DDPG(params=self.hparams,
                      obs_size=state_shape,
                      goal_size=goal_shape,
                      act_size=action_shape,
                      action_clips=(action_clip_low, action_clip_high),
                      action_bounds=action_bounds,
                      action_offset=action_offset)
    self.model.actor.share_memory()
    self.model.critic.share_memory()

    self.state_normalizer = Normalizer(
        state_shape, default_clip_range=self.hparams.clip_range)
    self.goal_normalizer = Normalizer(
        goal_shape, default_clip_range=self.hparams.clip_range)

    self.replay_buffer = SharedReplayBuffer(self.hparams.buffer_size,
                                            state_shape, action_shape,
                                            goal_shape)
def test_compare_to_scikit_learn_changing_test_size(self):
    normalizer = Normalizer(self.data)
    data = normalizer.normalize()
    for i in range(50, 130, 10):
        with self.subTest(i=i):
            testSize = i
            trainSize = len(data.data) - testSize
            print("test size: ", i)
            neighbours = 5
            trainData = {}
            testData = {}
            trainData['data'] = data.data[:trainSize]
            trainData['target'] = data.target[:trainSize]
            testData['data'] = data.data[trainSize:]
            testData['target'] = data.target[trainSize:]
            knn = KNN(trainData)

            # scikit-learn model:
            model = KNeighborsClassifier(n_neighbors=neighbours)
            model.fit(trainData['data'], trainData['target'])

            ourCounter = 0
            sciCounter = 0
            for j, e in enumerate(testData['data']):
                if knn.makeGuess(e, neighbours) == testData['target'][j]:
                    ourCounter += 1
                if model.predict([e]) == testData['target'][j]:
                    sciCounter += 1

            self.assertAlmostEqual(ourCounter / testSize, sciCounter / testSize, 3)
def create_app(test_config=None):
    app = Flask(__name__)
    app.config['JSON_AS_ASCII'] = False  # retrieve UTF-8 messages
    norm = Normalizer()

    @app.route('/reply', methods=['POST'])
    def reply():
        params = request.json
        if not params:
            return jsonify({
                "status": "error",
                "error": "Request must be of the application/json type!",
            })

        message = params.get("message")
        method = params.get("method")

        # Make sure the required params are present.
        if message is None or method is None:
            return jsonify({
                "status": "error",
                "error": "message and method are required keys"
            })

        methods = {
            'token': norm.tokenizer,
            'spell': norm.speller,
            'acronym': norm.acronym_searcher,
            'textese': norm.untextese,
            'proper_noun': norm.proper_noun_normalizer
        }

        try:
            reply = methods[method](message)
        except KeyError:
            return jsonify({
                "status": "error",
                "error": "method not valid, try one of the following: "
                         "token, spell, acronym, textese or proper_noun"
            })

        # Send the response.
        return jsonify({"status": "ok", "reply": reply})

    return app
def prepare_random_data(self, tracks, post_process=False):
    normalize = Normalizer().get()
    x = []
    y = []
    for track in tracks:
        x.append(self.mashup[track])
        if self.is_instrumental:
            y.append(self.instrumental[track])
        else:
            y.append(self.vocal[track])
    x = [self.prepare_spectrogram(s) for s in x]
    y = [self.prepare_spectrogram(s) for s in y]
    x, y = normalize(x, y)
    return x, y
def trainRobotToWalk():
    from normalizer import Normalizer
    options = parse_config('teachingARobotToWalk.config')
    env = gym.make(options['ENV_NAME'])
    num_states, num_actions = env.observation_space.shape[0], env.action_space.shape[0]
    print('options:{}'.format(options))
    agent = AugmentedRandomSearch(num_states, num_actions, options)
    env = gym.wrappers.Monitor(env, options['MONITOR_DIR'],
                               video_callable=agent.should_record_step,
                               force=True)
    normalizer = Normalizer(num_states)
    agent.train(env, normalizer, options['RECORD_EVERY'])  # training for our agent
def __init__(self, url, names, label_tag, drop_tags=None, encode_tags=None,
             normalizer=Normalizer(), normal_tags=None, test_size=0.2):
    self.url = url
    self.names = names
    self.drop_tags = drop_tags
    self.encode_tags = encode_tags
    self.data = None
    self.label_tag = label_tag
    self.test_size = test_size
    self.enc = ohe(categories='auto')
    self.normal_tags = normal_tags
    self.normalizer = normalizer
# print(res.url)
# print(res.links)

pr = PageRank(resources)
print(pr.getCurrentRankRow())
print(pr.ranks)
pr.calculate()
print("THE RANKS")
print(pr.getCurrentRankRow())

for index, source in enumerate(resources):
    source.rank = pr.getCurrentRankRow()[index]

resources[0].extractText()
n = Normalizer(resources[0].extractText())
print(n.getTokens())

# for en in pr.end_nodes():
#     print(en.url)
# p = pr.backlinks_for(c.web_resources[0].url)
# print(c.web_resources[0].url)
# for i in p:
#     print(i.url)
# print("####")
# p = pr.parent_for(c.web_resources[1].url)
# print(c.web_resources[1].url)
# for i in p:
#     print(i.url)
class NLPNET:
    def __init__(self, *args, **kwargs):
        """
        Constructor method, initializes variables.
        """
        # Initializing variables
        self.nlpnet_normalizer = Normalizer()

    def tokenize(self, tokenize_string):
        """
        Returns the tokenized version of tokenize_string, which is just a
        normal English sentence.
        """
        # Setting up the nlpnet parser
        nlpnet.set_data_dir(self.get_data_dir_path())
        pos_parser = nlpnet.POSTagger()
        return pos_parser.tag(tokenize_string)

    def get_dependencies(self, dependency_string):
        """
        Returns dependency_string with sentence dependencies included.
        """
        nlpnet.set_data_dir(self.get_data_dir_path())
        dependency_parser = nlpnet.DependencyParser()
        return dependency_parser.parse(dependency_string)

    def get_data_dir_path(self):
        """
        Returns the directory of the nlpnet corpora.
        """
        # Getting nltk data path
        running = Popen(['python -c "import nltk;print nltk.data.path"'],
                        stdin=PIPE, stdout=PIPE, stderr=PIPE, shell=True)
        stdin, stdout = running.communicate()

        # Setting the path that the nlpnet dependency was downloaded to
        path = re.sub(r"\'", "", re.sub(r"\[", '', str(stdin.split('\n')[0].split(',')[0])))
        path = path.split(r"/")
        path = '/'.join(path[: len(path) - 1]) + '/nlpnet_dependency/dependency'
        return path

    def use_nlpnet(self, base_string, test_string, pattern_arg):
        """
        Main interface method from the NLPNET class to the rest of the program.
        """
        # Setting up the nlpnet parser
        nlpnet.set_data_dir(self.get_data_dir_path())
        dependency_parser = nlpnet.DependencyParser()
        pos_parser = nlpnet.POSTagger()

        # Getting the passed patterns
        patterns = pattern_arg

        # Parsing the base_string
        base_parse = dependency_parser.parse(base_string)
        base_blob = TextBlob(base_string)
        base_sentences = base_blob.sentences
        base_sentence_info = []

        for index in range(0, len(base_parse)):
            # Grabbing sentence information
            raw_data = str(base_sentences[index])
            pos_sentence = pos_parser.tag(str(base_sentences[index]))
            subject, verb, object, prepositional_phrases = self.identify_sentence_parts_nlpnet(
                base_parse[index].tokens, base_parse[index].labels)

            """
            # Displaying information for debugging purposes
            #print "***BASE***"
            #print "Raw Sentence     : " + raw_data
            #print "POS Sentence     : " + str( pos_sentence )
            #print "[ Tokens ]       : " + str( base_parse[ index ].tokens )
            #print "[ Labels ]       : " + str( base_parse[ index ].labels )
            #print "[ Subject ]      : " + subject
            #print "[ Verb ]         : " + verb
            #print "[ Object ]       : " + object
            #print "[ Prep Phrases ] : " + str( prepositional_phrases )
            """

            # Deciding whether the sentence/pattern should be added
            add_sentence = True
            for sentence in base_sentence_info:
                if sentence != []:
                    if sentence[len(sentence) - 1] == raw_data:
                        add_sentence = False
                        break

            # If the sentence should be added to the possible patterns, add it
            if add_sentence:
                base_sentence_info.append([subject, verb, object, [], raw_data])

        # Parsing the test_string
        test_parse = dependency_parser.parse(test_string)
        test_blob = TextBlob(test_string)
        test_sentences = test_blob.sentences
        test_sentence_info = []

        for index in range(0, len(test_parse)):
            # Grabbing sentence information
            raw_data = str(test_sentences[index])
            pos_sentence = pos_parser.tag(str(test_sentences[index]))
            subject, verb, object, prepositional_phrases = self.identify_sentence_parts_nlpnet(
                test_parse[index].tokens, test_parse[index].labels)

            """
            #print "***TEST***"
            #print "Raw Sentence     : " + raw_data
            #print "POS Sentence     : " + str( pos_sentence )
            #print "[ Tokens ]       : " + str( test_parse[ index ].tokens )
            #print "[ Labels ]       : " + str( test_parse[ index ].labels )
            #print "[ Subject ]      : " + subject
            #print "[ Verb ]         : " + verb
            #print "[ Object ]       : " + object
            #print "[ Prep Phrases ] : " + str( prepositional_phrases )
            """

            # Deciding whether the sentence/pattern should be added
            add_sentence = True
            for sentence in test_sentence_info:
                if sentence != []:
                    if sentence[len(sentence) - 1] == raw_data:
                        add_sentence = False
                        break

            # If the sentence should be added to the possible patterns, add it
            if add_sentence:
                test_sentence_info.append([subject, verb, object, [], raw_data])

        # Returning the patterns found in the text
        return self.identify_common_patterns(base_sentence_info, test_sentence_info, patterns)

    def identify_sentence_parts_nlpnet(self, tokens, labels):
        subject = ""
        verb = ""
        object = ""
        prepositional_phrases = ""
        for index in range(0, len(labels)):
            if "SBJ" in labels[index] and verb == "":
                subject += tokens[index] + " "
            elif "ROOT" in labels[index]:
                verb += tokens[index]
            elif "PRD" in labels[index] or "OBJ" in labels[index]:
                object += tokens[index] + " "
            elif "LOC" in labels[index]:
                for prep_index in range(index, len(labels)):
                    if "PMOD" in labels[prep_index] and \
                            ' '.join(tokens[index: prep_index + 1]) not in prepositional_phrases:
                        prepositional_phrases += ' '.join(tokens[index: prep_index + 1]) + "..."
                        break
        return subject, verb, object, prepositional_phrases.split("...")

    def normalize_sentence_info(self, sentence_info):
        """
        Normalizes all of the incoming text to a standard.
        """
        # Normalizing text
        sentence_info = self.nlpnet_normalizer.normalize_sentence_info(sentence_info)

        # Return normalized information
        return sentence_info

    def identify_common_patterns(self, base_sentence_info, test_sentence_info, patterns):
        # Creating variables
        sentence_information = {}

        # Comparing the two sets of strings together & finding patterns
        for base_sentence in base_sentence_info:
            for test_sentence in test_sentence_info:
                # If there are two sentences/patterns to compare
                if base_sentence != [] and test_sentence != []:
                    # Normalize the pattern
                    normalized_base_sentence = self.normalize_sentence_info(base_sentence)
                    normalized_test_sentence = self.normalize_sentence_info(test_sentence)

                    # If the patterns' semantic "value" is the same
                    if normalized_base_sentence[0] == normalized_test_sentence[0] and \
                            normalized_base_sentence[1] == normalized_test_sentence[1] and \
                            normalized_base_sentence[2] == normalized_test_sentence[2]:
                        # If one sentence/pattern is longer than the other, use that pattern
                        if len(base_sentence[len(base_sentence) - 1].split()) > len(test_sentence[len(test_sentence) - 1].split()):
                            # If other patterns have been detected
                            if patterns != []:
                                sentence_information[base_sentence[len(base_sentence) - 1]] = base_sentence[: len(base_sentence) - 1]
                                sentence_information[base_sentence[len(base_sentence) - 1]].append(2)
                                sentence_information[base_sentence[len(base_sentence) - 1]].append(
                                    fuzz.ratio(base_sentence[len(base_sentence) - 1], test_sentence[len(test_sentence) - 1]))

                                # If the current test patterns are not in patterns
                                if test_sentence[len(test_sentence) - 1] not in patterns and base_sentence[len(base_sentence) - 1] not in patterns:
                                    patterns += [base_sentence[len(base_sentence) - 1]]
                                elif base_sentence[len(base_sentence) - 1] in patterns:
                                    # Updating reliability score
                                    try:
                                        sentence_information[base_sentence[len(base_sentence) - 1]][4] += 1
                                    except:
                                        sentence_information[base_sentence[len(base_sentence) - 1]].append(2)
                            # If there are no patterns currently found, add this pattern
                            elif patterns == []:
                                patterns += [base_sentence[len(base_sentence) - 1]]
                                sentence_information[base_sentence[len(base_sentence) - 1]] = base_sentence[0: len(base_sentence) - 1]

                                # Updating reliability score
                                try:
                                    sentence_information[base_sentence[len(base_sentence) - 1]][4] += 1
                                except:
                                    sentence_information[base_sentence[len(base_sentence) - 1]].append(2)

                                # Adding applicability score
                                try:
                                    sentence_information[base_sentence[len(base_sentence) - 1]][5] = fuzz.ratio(
                                        base_sentence[len(base_sentence) - 1], test_sentence[len(test_sentence) - 1])
                                except:
                                    sentence_information[base_sentence[len(base_sentence) - 1]].append(
                                        fuzz.ratio(base_sentence[len(base_sentence) - 1], test_sentence[len(test_sentence) - 1]))
                        else:
                            # If there are patterns already found
                            if patterns != []:
                                sentence_information[test_sentence[len(test_sentence) - 1]] = test_sentence[0: len(test_sentence) - 1]
                                sentence_information[test_sentence[len(test_sentence) - 1]].append(2)
                                sentence_information[test_sentence[len(test_sentence) - 1]].append(
                                    fuzz.ratio(base_sentence[len(base_sentence) - 1], test_sentence[len(test_sentence) - 1]))

                                # If the test patterns are not in the already found patterns
                                if test_sentence[len(test_sentence) - 1] not in patterns and base_sentence[len(base_sentence) - 1] not in patterns:
                                    patterns += [test_sentence[len(test_sentence) - 1]]
                                    # sentence_information[ test_sentence[ len( test_sentence ) - 1 ] ] = test_sentence[ 0 : len( test_sentence ) - 1 ]
                                elif test_sentence[len(test_sentence) - 1] in patterns:
                                    # Updating reliability score
                                    try:
                                        sentence_information[test_sentence[len(test_sentence) - 1]][4] += 1
                                    except:
                                        sentence_information[test_sentence[len(test_sentence) - 1]].append(2)
                            # If there are no patterns currently found
                            elif patterns == []:
                                patterns += [test_sentence[len(test_sentence) - 1]]
                                sentence_information[test_sentence[len(test_sentence) - 1]] = test_sentence[: len(test_sentence) - 1]

                                # Updating reliability score
                                try:
                                    sentence_information[test_sentence[len(test_sentence) - 1]][4] += 1
                                except:
                                    sentence_information[test_sentence[len(test_sentence) - 1]].append(2)

                                # Adding applicability score
                                try:
                                    sentence_information[test_sentence[len(test_sentence) - 1]][5] = fuzz.ratio(
                                        base_sentence[len(base_sentence) - 1], test_sentence[len(test_sentence) - 1])
                                except:
                                    sentence_information[test_sentence[len(test_sentence) - 1]].append(
                                        fuzz.ratio(base_sentence[len(base_sentence) - 1], test_sentence[len(test_sentence) - 1]))

        return patterns, sentence_information
class Client():
    def __init__(self, host='192.168.0.107', port=7777,
                 list_file='inputfiles-full.txt', freqs_file='wordlist',
                 dataset_dir='/home/aelphy/Desktop/ir_project_dataset'):
        self.normalizer = Normalizer()
        self.socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self.socket.connect((host, port))
        self.document_freqs_list_filename = freqs_file
        self.document_list_filename = list_file
        self.dataset_dir = dataset_dir
        self.documents_freqs = {}
        self.document_identificators = {}
        self.identificator_documents = {}

        with open(self.document_list_filename) as f:
            for line in f:
                data = line.strip().split()
                index = int(data[0])
                identifier = data[1]
                self.document_identificators[index] = identifier
                self.identificator_documents[identifier] = index
        self.documents_number = index

        with open(self.document_freqs_list_filename) as f:
            for line in f:
                data = line.strip().split()
                self.documents_freqs[self.identificator_documents[data[1]]] = int(data[0])

        self.avgdl = sum(self.documents_freqs.values()) / float(self.documents_number)
        self.k1 = 2.0
        self.b = 0.75

    def process_query(self, query):
        terms = self.normalizer.normalize_line(query)
        if not terms:
            return set()

        inverted_index = {}
        term_doc_tf = {}
        for term in terms:
            self.send_message(term)
            term_doc_tf[term] = self.parse_message_array(self.recieve_message().split())
            inverted_index[term] = term_doc_tf[term].keys()

        documents = self.merge(inverted_index, terms)
        if not documents:
            return set()
        return self.rank(documents, term_doc_tf, terms)

    def recieve_message(self):
        message = ''
        while not message.endswith('\n'):
            message += self.socket.recv(1024).decode('utf-8')
        return message.strip()

    def send_message(self, message):
        self.socket.send((message + '\n').encode('utf-8'))

    def parse_message_array(self, message_array):
        result = {}
        i = 0
        while i <= len(message_array) - 1:
            result[int(message_array[i])] = int(message_array[i + 1])
            i = i + 2
        return result

    def merge(self, inverted_index, terms):
        result = set(inverted_index[terms[0]])
        for i in range(1, len(terms)):
            result = result.intersection(set(inverted_index[terms[i]]))
        return result

    def rank(self, documents, term_doc_tf, terms):
        document_scores = {}
        document_freqs = {}
        for document in documents:
            document_scores[document] = self.score_document(document, term_doc_tf, terms)

        result = sorted(document_scores.items(), key=lambda x: x[1], reverse=True)
        best_match = result[0]
        best_match_file = open(os.path.join(self.dataset_dir,
                                            self.document_identificators[best_match[0]]), 'r')
        best_match_data = ' '.join(best_match_file.readlines())

        for term in terms:
            best_match_data = best_match_data.replace(term, color.RED + term + color.END)
        print(best_match_data)
        return result

    def score_document(self, document, term_doc_tf, terms):
        result = 0
        for term in terms:
            IDF = math.log((self.documents_number - len(term_doc_tf[term]) + 0.5) /
                           (len(term_doc_tf[term]) + 0.5))
            f = term_doc_tf[term][document]
            result += IDF * (f * (self.k1 + 1)) / \
                (f + self.k1 * (1 - self.b + self.b * self.documents_freqs[document] / self.avgdl))
        return result
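# Reference note (not part of the original client): score_document above is the
# Okapi BM25 weighting with k1 = 2.0 and b = 0.75, i.e.
#
#   score(D, Q) = sum over terms t in Q of
#       log((N - n_t + 0.5) / (n_t + 0.5))
#           * f(t, D) * (k1 + 1) / (f(t, D) + k1 * (1 - b + b * |D| / avgdl))
#
# where N is self.documents_number, n_t is len(term_doc_tf[term]) (the number
# of documents containing t), f(t, D) is term_doc_tf[term][document], |D| is
# self.documents_freqs[document], and avgdl is self.avgdl.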
for line in fin:
    line = line.strip()
    tweet.append(line)

# store in dataframe
prab_data = DataFrame(tweet)
# rename the column
prab_data.columns = ['tweet']

# score the tweet
score = []
# create normalizer object
norm = Normalizer()
# create stop word removal object
st = StpRemoval()
# create sentiment analysis object
s = Sentianal()

for i in range(0, len(prab_data)):
    # normalize
    line = norm.normalize(prab_data['tweet'][i])
    # remove stopword
    line = st.removeStp(line)
    # score sentiment
class PATTERN:
    def __init__(self, *args, **kwargs):
        """
        Constructor method, initializes variables.
        """
        # Initializing variables
        self.pattern_normalizer = Normalizer()

    def tokenize(self, tokenize_string):
        """
        Returns the tokenized version of tokenize_string, which is just a
        normal English sentence.
        """
        return parse(tokenize_string,
                     tokenize=True,      # Split punctuation marks from words?
                     tags=True,          # Parse part-of-speech tags? (NN, JJ, ...)
                     chunks=False,       # Parse chunks? (NP, VP, PNP, ...)
                     relations=False,    # Parse chunk relations? (-SBJ, -OBJ, ...)
                     lemmata=False,      # Parse lemmata? (ate => eat)
                     encoding='utf-8',   # Input string encoding.
                     tagset=None)

    def find_dependencies(self, dependency_string):
        """
        Returns dependency_string with sentence dependencies included.
        """
        return parse(dependency_string, relations=True)

    def use_pattern(self, base_string, test_string, pattern_arg):
        patterns = pattern_arg

        # Creating string textblob for analysis & analyzing the base_string's sentences
        base_blob = TextBlob(base_string)
        base_sentence_info = []

        for base_sentence in base_blob.sentences:
            subject = ""
            verb = ""
            object = ""
            prepositional_phrases = ""
            raw_data = parse(str(base_sentence), relations=True)

            for word in parse(str(base_sentence), relations=True).split():
                if "SBJ-" in word:
                    subject += re.sub(r'/.*', '', word) + " "
                elif "OBJ-" in word:
                    object += re.sub(r'/.*', '', word) + " "
                elif "VP-" in word:
                    verb += re.sub(r'/.*', '', word) + " "
                elif "PNP" in word:
                    prepositional_phrases += re.sub(r'/.*', '', word) + " "
                elif "PNP" not in word and prepositional_phrases[len(prepositional_phrases) - 3:] != "...":
                    prepositional_phrases += "..."

            """
            #print "[ Subject ]: " + subject
            #print "[ Object ]: " + object
            #print "[ Verb ]: " + verb
            #print "[ Prepositional Phrases ]: " + str( prepositional_phrases.split( '...' )[ 1:len(prepositional_phrases.split( '...' )) ] )
            #print "[ Raw Data ]: " + raw_data
            """

            add_sentence = True
            for sentence in base_sentence_info:
                if sentence != []:
                    if sentence[len(sentence) - 1] == str(base_sentence):
                        add_sentence = False
                        break

            if add_sentence:
                base_sentence_info.append(
                    [subject, verb, object,
                     prepositional_phrases.split('...')[1: len(prepositional_phrases.split('...'))],
                     str(base_sentence)])

        # Creating string textblob for analysis & analyzing the test_string's sentences
        test_blob = TextBlob(test_string)
        test_sentence_info = []

        for test_sentence in test_blob.sentences:
            subject = ""
            verb = ""
            object = ""
            prepositional_phrases = ""
            raw_data = parse(str(test_sentence), relations=True)

            for word in parse(str(test_sentence), relations=True).split():
                if "SBJ-" in word:
                    subject += re.sub(r'/.*', '', word) + " "
                elif "OBJ-" in word:
                    object += re.sub(r'/.*', '', word) + " "
                elif "VP-" in word:
                    verb += re.sub(r'/.*', '', word) + " "
                elif "PNP" in word:
                    prepositional_phrases += re.sub(r'/.*', '', word) + " "
                elif "PNP" not in word and prepositional_phrases[len(prepositional_phrases) - 3:] != "...":
                    prepositional_phrases += "..."

            """
            #print "[ Subject ]: " + subject
            #print "[ Object ]: " + object
            #print "[ Verb ]: " + verb
            #print "[ Prepositional Phrases ]: " + str( prepositional_phrases.split( '...' )[ 1:len(prepositional_phrases.split( '...' )) ] )
            #print "[ Raw Data ]: " + raw_data
            """

            add_sentence = True
            for sentence in test_sentence_info:
                if sentence != []:
                    if sentence[len(sentence) - 1] == str(test_sentence):
                        add_sentence = False
                        break

            if add_sentence:
                test_sentence_info.append(
                    [subject, verb, object,
                     prepositional_phrases.split('...')[1: len(prepositional_phrases.split('...'))],
                     str(test_sentence)])

        return self.identify_common_patterns(base_sentence_info, test_sentence_info, patterns)

    def normalize_sentence_info(self, sentence_info):
        """
        Normalizes all of the incoming text to a standard.
        """
        # Normalizing text
        sentence_info = self.pattern_normalizer.normalize_sentence_info(sentence_info)

        # Return normalized information
        return sentence_info

    def identify_common_patterns(self, base_sentence_info, test_sentence_info, patterns):
        # Creating variables
        sentence_information = {}

        # Comparing the two sets of strings together & finding patterns
        for base_sentence in base_sentence_info:
            for test_sentence in test_sentence_info:
                # If there are two sentences/patterns to compare
                if base_sentence != [] and test_sentence != []:
                    # Normalize the pattern
                    normalized_base_sentence = self.normalize_sentence_info(base_sentence)
                    normalized_test_sentence = self.normalize_sentence_info(test_sentence)

                    # If the patterns' semantic "value" is the same
                    if normalized_base_sentence[0] == normalized_test_sentence[0] and \
                            normalized_base_sentence[1] == normalized_test_sentence[1] and \
                            normalized_base_sentence[2] == normalized_test_sentence[2]:
                        # If one sentence/pattern is longer than the other, use that pattern
                        if len(base_sentence[len(base_sentence) - 1].split()) > len(test_sentence[len(test_sentence) - 1].split()):
                            # If other patterns have been detected
                            if patterns != []:
                                sentence_information[base_sentence[len(base_sentence) - 1]] = base_sentence[: len(base_sentence) - 1]
                                sentence_information[base_sentence[len(base_sentence) - 1]].append(2)
                                sentence_information[base_sentence[len(base_sentence) - 1]].append(
                                    fuzz.ratio(base_sentence[len(base_sentence) - 1], test_sentence[len(test_sentence) - 1]))

                                # If the current test patterns are not in patterns
                                if test_sentence[len(test_sentence) - 1] not in patterns and base_sentence[len(base_sentence) - 1] not in patterns:
                                    patterns += [base_sentence[len(base_sentence) - 1]]
                                elif base_sentence[len(base_sentence) - 1] in patterns:
                                    # Updating reliability score
                                    try:
                                        sentence_information[base_sentence[len(base_sentence) - 1]][4] += 1
                                    except:
                                        sentence_information[base_sentence[len(base_sentence) - 1]].append(2)
                            # If there are no patterns currently found, add this pattern
                            elif patterns == []:
                                patterns += [base_sentence[len(base_sentence) - 1]]
                                sentence_information[base_sentence[len(base_sentence) - 1]] = base_sentence[0: len(base_sentence) - 1]

                                # Updating reliability score
                                try:
                                    sentence_information[base_sentence[len(base_sentence) - 1]][4] += 1
                                except:
                                    sentence_information[base_sentence[len(base_sentence) - 1]].append(2)

                                # Adding applicability score
                                try:
                                    sentence_information[base_sentence[len(base_sentence) - 1]][5] = fuzz.ratio(
                                        base_sentence[len(base_sentence) - 1], test_sentence[len(test_sentence) - 1])
                                except:
                                    sentence_information[base_sentence[len(base_sentence) - 1]].append(
                                        fuzz.ratio(base_sentence[len(base_sentence) - 1], test_sentence[len(test_sentence) - 1]))
                        else:
                            # If there are patterns already found
                            if patterns != []:
                                sentence_information[test_sentence[len(test_sentence) - 1]] = test_sentence[0: len(test_sentence) - 1]
                                sentence_information[test_sentence[len(test_sentence) - 1]].append(2)
                                sentence_information[test_sentence[len(test_sentence) - 1]].append(
                                    fuzz.ratio(base_sentence[len(base_sentence) - 1], test_sentence[len(test_sentence) - 1]))

                                # If the test patterns are not in the already found patterns
                                if test_sentence[len(test_sentence) - 1] not in patterns and base_sentence[len(base_sentence) - 1] not in patterns:
                                    patterns += [test_sentence[len(test_sentence) - 1]]
                                    # sentence_information[ test_sentence[ len( test_sentence ) - 1 ] ] = test_sentence[ 0 : len( test_sentence ) - 1 ]
                                elif test_sentence[len(test_sentence) - 1] in patterns:
                                    # Updating reliability score
                                    try:
                                        sentence_information[test_sentence[len(test_sentence) - 1]][4] += 1
                                    except:
                                        sentence_information[test_sentence[len(test_sentence) - 1]].append(2)
                            # If there are no patterns currently found
                            elif patterns == []:
                                patterns += [test_sentence[len(test_sentence) - 1]]
                                sentence_information[test_sentence[len(test_sentence) - 1]] = test_sentence[: len(test_sentence) - 1]

                                # Updating reliability score
                                try:
                                    sentence_information[test_sentence[len(test_sentence) - 1]][4] += 1
                                except:
                                    sentence_information[test_sentence[len(test_sentence) - 1]].append(2)

                                # Adding applicability score
                                try:
                                    sentence_information[test_sentence[len(test_sentence) - 1]][5] = fuzz.ratio(
                                        base_sentence[len(base_sentence) - 1], test_sentence[len(test_sentence) - 1])
                                except:
                                    sentence_information[test_sentence[len(test_sentence) - 1]].append(
                                        fuzz.ratio(base_sentence[len(base_sentence) - 1], test_sentence[len(test_sentence) - 1]))

        return patterns, sentence_information
for line in fin:
    line = line.strip()
    tweet.append(line)

# store in dataframe
jkw_data = DataFrame(tweet)
# rename the column
jkw_data.columns = ['tweet']

# score the tweet
score = []
# create normalizer object
norm = Normalizer()
# create stop word removal object
st = StpRemoval()
# create sentiment analysis object
s = Sentianal()

for i in range(0, len(jkw_data)):
    # normalize
    line = norm.normalize(jkw_data['tweet'][i])
    # remove stopword
    line = st.removeStp(line)
    # score sentiment
# Author : Alfan F. Wicaksono
# IR Lab, FASILKOM, UI

# Script for pre-processing twitter corpus

from normalizer import Normalizer
from stpremoval import StpRemoval

##################### you can modify this part ######################

corpusFile = "debatcapres_2014_sesi1.txt"
preprocessedFile = "debatcapres_2014_sesi1_processed.txt"

######################################################################

nm = Normalizer()
sw = StpRemoval()

fin = open(corpusFile, "r")
fout = open(preprocessedFile, "w")

for line in fin:
    line = line.strip()        # remove carriage return
    line = nm.normalize(line)  # normalization
    line = sw.removeStp(line)  # remove stop word
    fout.write(line)           # put preprocessed tweet on the new file
    fout.write("\n")

fin.close()
fout.close()