def preprocess_tokens(docs, nlp):
    """Filter out noisy tokens and lemmatize the remaining ones.

    :param docs: list
        list of documents to be parsed
    :param nlp: spacy.lang.<code>.<language>
        spacy language, e.g. spacy.lang.es.Spanish
    :return filtered_tokens: list
        list of lists of lemmatized and filtered tokens
    """
    try:
        docs = list(docs)
    except TypeError:
        print("Input can't be cast to type 'list'")
        raise
    n = len(docs)  # used for progress bar only
    filtered_tokens = []
    for i, doc in enumerate(nlp.pipe(docs)):
        tokens = [
            token.lemma_.lower() for token in doc
            if remove_noise(token) and token.lemma_ != '-PRON-'
        ]
        filtered_tokens.append(tokens)
        print_progressbar(i, n)
    return filtered_tokens
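
# `remove_noise` is referenced above but not defined in this file. A minimal
# sketch of what such a filter might look like, assuming standard spaCy token
# attributes; the exact criteria are an assumption, not taken from the
# original code.
def remove_noise(token):
    """Return True if the token should be kept (hypothetical helper)."""
    return not (token.is_stop or token.is_punct or token.is_space
                or len(token.text) < 2)
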
def map_imgs_to_classes(filenames, dataset_name):
    """Load every slice image and map its filename prefix to a class label."""
    X = []
    y = []
    for index, filename in enumerate(filenames):
        X.append(get_img_data('{}/{}'.format(slices_path, filename)))
        # The first three characters of the filename encode the class.
        y.append(classes[filename[:3]])
        print_progressbar(index / len(filenames),
                          'Building {} dataset'.format(dataset_name))
    return np.array(X), pd.get_dummies(y).values
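
# `get_img_data`, `slices_path` and `classes` are assumed to be defined at
# module level. A plausible sketch of `get_img_data`, assuming the slices are
# greyscale PNGs loaded with Pillow and normalised to [0, 1]; the details are
# an assumption, not taken from the original code.
from PIL import Image
import numpy as np

def get_img_data(filepath):
    """Load one image slice as a normalised float array (hypothetical helper)."""
    with Image.open(filepath) as img:
        return np.asarray(img.convert('L'), dtype=np.float32) / 255.0
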
def slice_spectrograms():
    """Cut every generated spectrogram PNG into fixed-size slices."""
    all_spectrograms = listdir(spectrograms_path)
    if not path.exists(slices_path):
        makedirs(slices_path)
    for index, filename in enumerate(all_spectrograms):
        if filename.endswith('.png'):
            slice_spectrogram(filename)
        print_progressbar(index / len(all_spectrograms), 'Slicing')
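
# `slice_spectrogram` is not shown here. A rough sketch of how one wide
# spectrogram could be cut into square tiles of `slice_size` pixels, assuming
# Pillow and the module-level paths used above; the tiling scheme and output
# naming are assumptions.
from PIL import Image

def slice_spectrogram(filename):
    """Cut one spectrogram PNG into square slices (hypothetical helper)."""
    img = Image.open('{}/{}'.format(spectrograms_path, filename))
    width, _ = img.size
    for i in range(width // slice_size):
        box = (i * slice_size, 0, (i + 1) * slice_size, slice_size)
        img.crop(box).save('{}/{}_{}.png'.format(slices_path, filename[:-4], i))
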
def run(self):
    """Run ffmpeg and report progress parsed from its status output."""
    # Flatten the nested argument list returned by _get_command().
    cmd_args = [
        item for sublist in self._get_command() for item in sublist
    ]
    proc = subprocess.Popen(['ffmpeg', '-hide_banner', '-y', *cmd_args],
                            stdout=subprocess.PIPE,
                            stderr=subprocess.STDOUT,
                            universal_newlines=True)
    while True:
        line = proc.stdout.readline()
        if not line:
            break
        line = line.rstrip()
        if self._duration is None:
            # The total duration is printed once in the input header.
            match = re.search(r'Duration: ([^,]+),', line)
            if match:
                self._duration = parse_time(match[1])
        # if self._frames is None:
        #     match = re.search(r', ([\d.]+) tbr', line, flags=re.IGNORECASE)
        #     if match:
        #         self._frames = float(match[1])
        if line.startswith('frame'):
            status = re.search(
                r'frame=([\d\s]+).*size=([\d\skmB]+).*time=([\d:.]+).*speed=([\s\d.]+)x',
                line,
                flags=re.IGNORECASE)
            time = self._get_fixed_time_on_run(parse_time(status[3]))
            speed = float(status[4].strip())
            print_progressbar(time, self._duration,
                              suffix=f"(speed: {speed}x)")
    print()
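
# `parse_time` is assumed to turn an ffmpeg time string such as "00:01:23.45"
# into a number of seconds. A minimal sketch of such a helper:
def parse_time(time_str):
    """Convert an "HH:MM:SS.ms" string into seconds (hypothetical helper)."""
    hours, minutes, seconds = time_str.strip().split(':')
    return int(hours) * 3600 + int(minutes) * 60 + float(seconds)
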
def render_hourly(session):
    """Render one Graphviz .dot file per STEP-sized time window."""
    date = START_DATE
    total_steps = (END_DATE - START_DATE) / STEP
    i = 0
    while date < END_DATE:
        i += 1
        print_progressbar(i / total_steps)
        graph = nx.MultiDiGraph()
        start = date
        end = date + STEP
        result = session.run(
            """
            MATCH (a:Station)-[r:BIKE_MOVED]->(b:Station)
            WHERE {start} <= r.timestamp_start < {end}
            RETURN a, r, b""", {
                'start': start.timestamp(),
                'end': end.timestamp()
            })
        for record in result:
            station_a = record['a']['name'].replace('/', ' /\n')
            station_b = record['b']['name'].replace('/', ' /\n')
            bike_id = record['r']['bike_id']
            # Colons are escaped so Graphviz does not treat them as port separators.
            start_time = datetime.fromtimestamp(
                record['r']['timestamp_start']).strftime(r'%H\:%M')
            end_time = datetime.fromtimestamp(
                record['r']['timestamp_end']).strftime(r'%H\:%M')
            label = f'{start_time} -\n{end_time}'
            # Trips flagged as transporter movements are drawn red and thicker.
            color = 'red' if record['r']['transporter'] else '#aaaaaa'
            penwidth = 2 if record['r']['transporter'] else 1
            graph.add_edge(station_a, station_b,
                           label=label,
                           color=color,
                           penwidth=penwidth)
            # graph.add_edge(station_a, station_b, label=bike_id)
        filename = f"{start.strftime('%Y-%m-%d_%H_%M')} - {end.strftime('%Y-%m-%d_%H_%M')}.dot"
        write_dot(graph, os.path.join(OUTPUT_DIRECTORY, filename))
        date = end
    clear_progressbar()
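
# A possible way to obtain the `session` argument above, assuming the `neo4j`
# Python driver; the bolt URI and credentials are placeholders, not values
# from the original project (the `{param}` placeholder syntax in the query
# suggests an older Neo4j 3.x setup).
from neo4j import GraphDatabase

driver = GraphDatabase.driver('bolt://localhost:7687',
                              auth=('neo4j', 'password'))
with driver.session() as session:
    render_hourly(session)
driver.close()
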
def generate_spectrograms():
    """Render a spectrogram PNG for every .wav file in the dataset."""
    current_path = path.dirname(path.realpath(__file__))
    # recursive=True is required for the '**' pattern to descend into subdirectories.
    filenames = glob.glob('{}/**/*.wav'.format(dataset_path), recursive=True)
    for index, filename in enumerate(filenames):
        # Category tags are encoded in the filename as bracketed three-letter codes.
        categories = re.compile(r'(?:\[((?:[a-z]{3}(?:_)?)+)\])').findall(
            filename)
        newname = '{}/{}_{}.png'.format(spectrograms_path,
                                        '_'.join(categories), index)
        # sox: -n discards audio output, -Y sets the image height, -m renders
        # monochrome, -r omits axes and legend, -o names the output file.
        cmd = 'sox {} -n spectrogram -Y {} -m -r -o {}'.format(
            filename, slice_size, newname)
        # stderr is captured separately so sox errors can be reported below.
        p = Popen(cmd,
                  shell=True,
                  stdin=PIPE,
                  stdout=PIPE,
                  stderr=PIPE,
                  close_fds=True,
                  cwd=current_path)
        output, errors = p.communicate()
        if errors:
            print(errors)
        print_progressbar(index / len(filenames), 'Generating')
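
# A quick, self-contained illustration of the category regex above; the
# filename is made up for demonstration purposes only.
import re

assert re.findall(r'(?:\[((?:[a-z]{3}(?:_)?)+)\])',
                  'dataset/[roc_pop]_track01.wav') == ['roc_pop']
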
def create_bikes(number_of_samples=1):
    """Collect all bikes seen in the XML dumps and write them to bikes.csv."""
    track_a_bike = TrackABike()
    fieldnames = ['number', 'version', 'marke_id', 'marke_name', 'is_pedelec']
    # Mapping from XML field names to bulk-import CSV headers.
    headernames = {
        'number': 'bike_id:ID(Bike)',
        'version': 'version:INT',
        'marke_id': 'marke_id:INT',
        'is_pedelec': 'is_pedelec:BOOLEAN'
    }
    bikes = {}
    i = 0
    for timestamp, data in read_xml_dumps():
        i += 1
        # We want to build a list of all bikes. Since some bikes may be rented
        # or even in maintenance, a single snapshot is not enough. Scanning
        # every minute would be wasteful, so only one dump per hour is
        # processed.
        if i % 60:
            continue
        print_progressbar(i / number_of_samples)
        track_a_bike.load_xml(data)
        for station in track_a_bike.stations.values():
            # print(station['free_bikes'])
            update = {}
            for bike in station['free_bikes']:
                update[bike['number']] = {
                    headernames.get(key, key): bike[key]
                    for key in fieldnames
                }
            bikes.update(update)
            # bikes.update({free_bikes['number']: free_bikes[key] for key in fieldnames})
    with open(os.path.join(CSV_DIRECTORY, 'bikes.csv'), 'w') as f:
        writer = csv.DictWriter(f,
                                [headernames.get(x, x) for x in fieldnames])
        writer.writeheader()
        bikes_list = list(bikes.values())
        bikes_list.sort(key=lambda x: x['bike_id:ID(Bike)'])
        writer.writerows(bikes_list)
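
# `read_xml_dumps` is assumed to yield `(timestamp, xml_string)` pairs, one
# per recorded minute, in chronological order. A rough sketch, assuming the
# dumps are stored as files named like "2017-06-01_12-30.xml" in a
# DUMP_DIRECTORY (both the directory name and the filename pattern are
# assumptions, not taken from the original code):
from datetime import datetime
import glob
import os

def read_xml_dumps():
    """Yield (timestamp, xml data) pairs in chronological order (hypothetical)."""
    for filepath in sorted(glob.glob(os.path.join(DUMP_DIRECTORY, '*.xml'))):
        name = os.path.splitext(os.path.basename(filepath))[0]
        timestamp = datetime.strptime(name, '%Y-%m-%d_%H-%M')
        with open(filepath) as f:
            yield timestamp, f.read()
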
def create_bike_positions_and_movement(number_of_samples=0):
    """Write bike position and movement relationship CSVs for bulk import."""
    fieldnames_position = [
        'number', 'timestamp', 'can_be_rented', 'can_be_returned', 'station_id'
    ]
    headernames_position = {
        'number': ':START_ID(Bike)',
        'station_id': ':END_ID(Station)',
        'timestamp': 'timestamp:INT',
        'can_be_rented': 'can_be_rented:BOOLEAN',
    }
    fieldnames_movement = [
        ':START_ID(Station)', ':END_ID(Station)', 'timestamp_start:INT',
        'timestamp_end:INT', 'duration:INT', 'bike_id:INT'
    ]
    with open(os.path.join(CSV_DIRECTORY, 'bike_positions.csv'), 'w') as f:
        with open(os.path.join(CSV_DIRECTORY, 'bike_movements.csv'), 'w') as f2:
            position_writer = csv.DictWriter(
                f,
                [headernames_position.get(x, x) for x in fieldnames_position])
            movement_writer = csv.DictWriter(f2, fieldnames_movement)
            position_writer.writeheader()
            movement_writer.writeheader()
            i = 0
            track_a_bike = TrackABike()
            # Last known station per bike, used to detect movements between dumps.
            current_bike_positions = {}
            for timestamp, data in read_xml_dumps():
                i += 1
                print_progressbar(i / number_of_samples)
                track_a_bike.load_xml(data)
                for station in track_a_bike.stations.values():
                    bike_positions = []
                    for bike in station['free_bikes']:
                        bike_id = bike['number']
                        prev_station = current_bike_positions.get(bike_id, None)
                        if prev_station is not None and prev_station['id'] != station['id']:
                            # The bike turned up at a different station than
                            # last time, so record a movement between the two.
                            duration = timestamp - prev_station['timestamp']
                            movement_writer.writerow({
                                ':START_ID(Station)': prev_station['id'],
                                ':END_ID(Station)': station['id'],
                                'timestamp_start:INT':
                                int(prev_station['timestamp'].timestamp()),
                                'timestamp_end:INT': int(timestamp.timestamp()),
                                'duration:INT': int(duration.total_seconds()),
                                'bike_id:INT': bike_id,
                            })
                        current_bike_positions[bike_id] = {
                            'id': station['id'],
                            'timestamp': timestamp
                        }
                        bike_position = {
                            headernames_position.get(key, key): bike.get(key, None)
                            for key in fieldnames_position
                        }
                        bike_position[headernames_position['station_id']] = station['id']
                        bike_position[headernames_position['timestamp']] = int(
                            timestamp.timestamp())
                        bike_positions.append(bike_position)
                    position_writer.writerows(bike_positions)
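
# The CSV headers written above (e.g. "bike_id:ID(Bike)", ":START_ID(Station)")
# follow the neo4j-admin bulk import header format, so the files are presumably
# loaded with something along these lines. The exact flag syntax depends on the
# Neo4j version; the "csv/" prefix stands in for CSV_DIRECTORY, and the
# stations CSV and the BIKE_AT relationship name are assumptions (only
# BIKE_MOVED appears in the queries above):
#
#   neo4j-admin import \
#       --nodes:Bike csv/bikes.csv \
#       --nodes:Station csv/stations.csv \
#       --relationships:BIKE_AT csv/bike_positions.csv \
#       --relationships:BIKE_MOVED csv/bike_movements.csv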