def mismas_features_distinto_humor(corpus):
    print("Buscando tweets con mismos valores de features pero distinto de humor...")
    humoristicos = [tweet for tweet in corpus if tweet.es_humor]
    no_humoristicos = [tweet for tweet in corpus if not tweet.es_humor]
    res = []
    bar = IncrementalBar("Buscando en tweets\t\t", max=len(humoristicos) * len(no_humoristicos),
                         suffix=SUFIJO_PROGRESS_BAR)
    bar.next(0)
    for tweet_humor in humoristicos:
        for tweet_no_humor in no_humoristicos:
            if tweet_humor.features == tweet_no_humor.features:
                res.append((tweet_humor, tweet_no_humor))
                if tweet_humor.texto_original == tweet_no_humor.texto_original:
                    print("-----MISMO TEXTO ORIGINAL------")
                if tweet_humor.texto == tweet_no_humor.texto:
                    print("----------MISMO TEXTO----------")
                if tweet_humor.id == tweet_no_humor.id:
                    print("-----------MISMO ID------------")
                if tweet_humor.cuenta == tweet_no_humor.cuenta:
                    print("----------MISMA CUENTA---------")
                print('')
                print(tweet_humor.id)
                print(tweet_humor.texto)
                print("------------")
                print(tweet_no_humor.id)
                print(tweet_no_humor.texto)
                print("------------")
                print('')
            bar.next()
    bar.finish()
    return res
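# Note: every snippet in this section follows the same progress.bar pattern. The
# short sketch below is illustrative only (it is not taken from any project above);
# it assumes the third-party "progress" package and uses a made-up suffix string in
# the style of the SUFIJO_PROGRESS_BAR constant referenced elsewhere.
from progress.bar import IncrementalBar

items = range(200)
bar = IncrementalBar('Processing', max=len(items),
                     suffix='%(percent)d%% - eta %(eta_td)s')  # built-in format tokens
bar.next(0)      # draw the empty bar immediately, as several snippets do
for _ in items:
    bar.next()   # advance by one step
bar.finish()     # close the bar and move to the next line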
def crawl(self):
    n = MIN_NUMBER_OF_DOCS
    startingURL = START_PAGES
    os.makedirs(AFTER_CRAWL_BASE_DIR, exist_ok=True)
    self.n = n
    for sURL in startingURL:
        try:
            self.queue.extend(self.parseProfilePage(sURL))
        except Exception:
            print('cannot parse profile page')
            with open(os.path.join(AFTER_CRAWL_BASE_DIR, ERRORS_FILE_NAME), "a") as ErrorFile:
                # file.write() takes a single string argument
                ErrorFile.write('cannot parse profile page ' + sURL + '\n')
    from progress.bar import IncrementalBar
    progress_bar = IncrementalBar('Crawling', max=MIN_NUMBER_OF_DOCS,
                                  suffix='%(percent)d%% %(remaining)s remaining - eta %(eta_td)s')
    threads = [CrawlThread(self, progress_bar) for t in range(NUMBER_OF_THREADS)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    with open(os.path.join(AFTER_CRAWL_BASE_DIR, MAP_FILE_NAME), 'w') as outfile:
        json.dump(self.URLIDMap, outfile)
    progress_bar.finish()
def run():
    for model, num_to_create in to_create.items():
        model_name = model._meta.model_name
        bar = Bar('Creating {}'.format(model_name), max=num_to_create)
        model_count = model.objects.count()
        create_f = globals()['populate_{}'.format(model_name)]
        for i in range(num_to_create):
            ident = '{}{}'.format(model_name, i)
            if i < model_count:
                unit = model.objects.all()[i]
            else:
                unit = create_f(model, i)
            globals()[ident] = unit
            bar.next()
        bar.finish()

    # This bit is special: Associate all rpms with the first repo,
    # for maximum relational query fun
    num_units = platform.ContentUnit.objects.count()
    repo = globals()['repository0']
    bar = Bar('Adding all units to {} repo'.format(repo.slug))
    bar.max = num_units
    for unit in platform.ContentUnit.objects.all():
        repo.add_units(unit)
        bar.next()
    bar.finish()
def save_frames(source, vertices, images_dir):
    print('Saving frames...')
    if not os.path.isdir(images_dir):
        os.makedirs(images_dir)
    bar = IncrementalBar(max=len(vertices))
    angle_change = 360 // len(vertices)
    for i, v in enumerate(vertices):
        update(source, v, angle_change=angle_change)
        mlab.savefig(filename=os.path.join(images_dir, frame_fn(i)))
        bar.next()
    bar.finish()
    mlab.close()
def calcular_feature_thread(self, tweets, nombre_feature, identificador):
    if len(tweets) > 0:
        bar = IncrementalBar("Calculando feature " + nombre_feature + ' - ' + unicode(identificador),
                             max=len(tweets), suffix=SUFIJO_PROGRESS_BAR)
        bar.next(0)
        feature = self.features[nombre_feature]
        self.abortar_si_feature_no_es_thread_safe(feature)
        for tweet in tweets:
            tweet.features[feature.nombre] = feature.calcular_feature(tweet)
            bar.next()
        bar.finish()
def calcular_features_thread(self, tweets, identificador):
    if len(tweets) > 0:
        bar = IncrementalBar("Calculando features - " + unicode(identificador),
                             max=len(tweets) * len(self.features), suffix=SUFIJO_PROGRESS_BAR)
        bar.next(0)
        for tweet in tweets:
            for feature in list(self.features.values()):
                self.abortar_si_feature_no_es_thread_safe(feature)
                tweet.features[feature.nombre] = feature.calcular_feature(tweet)
                bar.next()
        bar.finish()
def guardar_parecidos_con_distinto_humor(pares_parecidos_distinto_humor):
    with closing(open_db()) as conexion:
        with closing(conexion.cursor()) as cursor:
            consulta = "INSERT INTO tweets_parecidos_distinto_humor VALUES (%s, %s)" \
                       + " ON DUPLICATE KEY UPDATE id_tweet_no_humor = %s"
            bar = IncrementalBar("Guardando tweets parecidos\t", max=len(pares_parecidos_distinto_humor),
                                 suffix=SUFIJO_PROGRESS_BAR)
            bar.next(0)
            for tweet_humor, tweet_no_humor in pares_parecidos_distinto_humor:
                cursor.execute(consulta, (tweet_humor.id, tweet_no_humor.id, tweet_no_humor.id))
                bar.next()
            conexion.commit()
            bar.finish()
def cross_validation_y_reportar(clasificador, features, clases, numero_particiones):
    skf = cross_validation.StratifiedKFold(clases, n_folds=numero_particiones)
    features = np.array(features)
    clases = np.array(clases)
    matrices = []
    medidas = defaultdict(list)

    bar = IncrementalBar("Realizando cross-validation\t", max=numero_particiones,
                         suffix=SUFIJO_PROGRESS_BAR)
    bar.next(0)
    for entrenamiento, evaluacion in skf:
        clasificador.fit(features[entrenamiento], clases[entrenamiento])
        clases_predecidas = clasificador.predict(features[evaluacion])
        matriz_de_confusion = metrics.confusion_matrix(clases[evaluacion], clases_predecidas).flatten()
        matrices.append(matriz_de_confusion)
        for medida, valor_medida in calcular_medidas(*matriz_de_confusion).items():
            medidas[medida].append(valor_medida)
        bar.next()
    bar.finish()

    promedios = {}

    print('')
    print("Resultados de cross-validation:")
    print('')
    for medida, valor_medida in medidas.items():
        print("\t{medida: >18s}:\t{valor_medida}".format(medida=medida, valor_medida=valor_medida))
        promedio = np.mean(valor_medida)
        promedios[medida] = promedio
        delta = np.std(valor_medida) * 1.96 / math.sqrt(numero_particiones)
        print("Intervalo de confianza 95%:\t{promedio:0.4f} ± {delta:0.4f} --- [{inf:0.4f}, {sup:0.4f}]".format(
            promedio=promedio, delta=delta, inf=promedio - delta, sup=promedio + delta))

    print('')

    imprimir_matriz_metricas(
        promedios['Precision No humor'],
        promedios['Recall No humor'],
        promedios['F1-score No humor'],
        promedios['Precision Humor'],
        promedios['Recall Humor'],
        promedios['F1-score Humor'],
    )

    print('')
    print('')
    print('')
def render(self, ctx, invert=False, filename=None, pbar=False):
    """ Generate image of layer.

    Parameters
    ----------
    ctx : :class:`GerberContext`
        GerberContext subclass used for rendering the image

    filename : string <optional>
        If provided, save the rendered image to `filename`

    pbar : bool <optional>
        If true, render a progress bar
    """
    ctx.set_bounds(self.bounds)
    ctx._paint_background()

    if invert:
        ctx.invert = True
        ctx._clear_mask()
        for p in self.primitives:
            ctx.render(p)
    if invert:
        ctx.invert = False
        ctx._render_mask()

    _pbar = None
    if pbar:
        try:
            from progress.bar import IncrementalBar
            _pbar = IncrementalBar(self.filename, max=len(self.primitives))
        except ImportError:
            pbar = False

    for p in self.primitives:
        ctx.render(p)
        if pbar:
            _pbar.next()
    if pbar:
        _pbar.finish()

    if filename is not None:
        ctx.dump(filename)
def guardar_parecidos_con_distinto_humor(pares_parecidos_distinto_humor):
    with closing(mysql.connector.connect(user=DB_USER, password=DB_PASS, host=DB_HOST,
                                         database=DB_NAME)) as conexion:
        with closing(conexion.cursor()) as cursor:
            consulta = (
                "INSERT INTO tweets_parecidos_distinto_humor VALUES (%s, %s)"
                + " ON DUPLICATE KEY UPDATE id_tweet_no_humor = %s"
            )
            bar = IncrementalBar(
                "Guardando tweets parecidos\t",
                max=len(pares_parecidos_distinto_humor),
                suffix=SUFIJO_PROGRESS_BAR
            )
            bar.next(0)
            for tweet_humor, tweet_no_humor in pares_parecidos_distinto_humor:
                cursor.execute(consulta, (tweet_humor.id, tweet_no_humor.id, tweet_no_humor.id))
                bar.next()
            conexion.commit()
            bar.finish()
def _create_unfilled_voxel_data(
        model_id, edge_length_threshold=0.1, voxel_config=None,
        overwrite=False, example_ids=None):
    from template_ffd.data.ids import get_example_ids
    from shapenet.core import cat_desc_to_id
    from template_ffd.model import load_params
    import numpy as np
    from progress.bar import IncrementalBar
    if voxel_config is None:
        voxel_config = _default_config
    cat_id = cat_desc_to_id(load_params(model_id)['cat_desc'])
    if example_ids is None:
        example_ids = get_example_ids(cat_id, 'eval')
    mesh_dataset = get_inferred_mesh_dataset(model_id, edge_length_threshold)
    voxel_dataset = get_voxel_dataset(
        model_id, edge_length_threshold, voxel_config, filled=False,
        auto_save=False)
    if not overwrite:
        example_ids = [i for i in example_ids if i not in voxel_dataset]
    if len(example_ids) == 0:
        return
    print('Creating %d voxels for model %s' % (len(example_ids), model_id))
    kwargs = dict(
        voxel_dim=voxel_config.voxel_dim,
        exact=voxel_config.exact,
        dc=voxel_config.dc,
        aw=voxel_config.aw)
    with mesh_dataset:
        bar = IncrementalBar(max=len(example_ids))
        for example_id in example_ids:
            bar.next()
            mesh = mesh_dataset[example_id]
            vertices, faces = (
                np.array(mesh[k]) for k in ('vertices', 'faces'))
            binvox_path = voxel_dataset.path(example_id)
            # x, z, y = vertices.T
            # vertices = np.stack([x, y, z], axis=1)
            bio.mesh_to_binvox(
                vertices, faces, binvox_path, **kwargs)
        bar.finish()
def render_deferred(self):
    if not len(self._deferred):
        return

    print("Optimizing deferred elements")
    paths = self._optimize_deferred().paths

    print("Rendering Paths")
    try:
        from progress.bar import IncrementalBar
        _pbar = IncrementalBar(max=len(paths))
    except ImportError:
        _pbar = None

    for path in paths:
        self._render_path(path)
        if _pbar:
            _pbar.next()
    if _pbar:
        _pbar.finish()
def cargar_parecidos_con_distinto_humor():
    with closing(open_db()) as conexion:
        # buffered=True so the row count is known before iterating.
        with closing(conexion.cursor() if DB_ENGINE == 'sqlite3' else conexion.cursor(buffered=True)) as cursor:
            consulta = """
            SELECT id_tweet_humor,
                   id_tweet_no_humor
            FROM   tweets_parecidos_distinto_humor
            """
            cursor.execute(consulta)

            pares_ids_parecidos_con_distinto_humor = []

            bar = IncrementalBar("Cargando tweets parecidos\t", max=cursor.rowcount,
                                 suffix=SUFIJO_PROGRESS_BAR)
            bar.next(0)
            for par_ids in cursor:
                pares_ids_parecidos_con_distinto_humor.append(par_ids)
                bar.next()
            bar.finish()

            return pares_ids_parecidos_con_distinto_humor
def guardar_features(tweets, **opciones):
    nombre_feature = opciones.pop('nombre_feature', None)
    conexion = open_db()
    cursor = conexion.cursor()

    consulta = "INSERT INTO features VALUES (%s, %s, %s) ON DUPLICATE KEY UPDATE valor_feature = %s"

    if nombre_feature:
        mensaje = 'Guardando feature ' + nombre_feature
    else:
        mensaje = 'Guardando features'

    bar = IncrementalBar(mensaje, max=len(tweets), suffix=SUFIJO_PROGRESS_BAR)
    bar.next(0)
    for tweet in tweets:
        if nombre_feature:
            cursor.execute(
                consulta,
                (
                    tweet.id,
                    nombre_feature,
                    unicode(tweet.features[nombre_feature]),
                    unicode(tweet.features[nombre_feature])
                )
            )
        else:
            for nombre_feature, valor_feature in tweet.features.items():
                cursor.execute(consulta, (tweet.id, nombre_feature, unicode(valor_feature), unicode(valor_feature)))
        bar.next()
    conexion.commit()
    bar.finish()
    cursor.close()
    conexion.close()
def guardar_features(tweets, **opciones):
    nombre_feature = opciones.pop("nombre_feature", None)
    conexion = mysql.connector.connect(user=DB_USER, password=DB_PASS, host=DB_HOST, database=DB_NAME)
    cursor = conexion.cursor()

    consulta = "INSERT INTO features VALUES (%s, %s, %s) ON DUPLICATE KEY UPDATE valor_feature = %s"

    if nombre_feature:
        mensaje = "Guardando feature " + nombre_feature
    else:
        mensaje = "Guardando features"

    bar = IncrementalBar(mensaje, max=len(tweets), suffix=SUFIJO_PROGRESS_BAR)
    bar.next(0)
    for tweet in tweets:
        if nombre_feature:
            cursor.execute(
                consulta,
                (
                    tweet.id,
                    nombre_feature,
                    unicode(tweet.features[nombre_feature]),
                    unicode(tweet.features[nombre_feature]),
                ),
            )
        else:
            for nombre_feature, valor_feature in tweet.features.items():
                cursor.execute(consulta, (tweet.id, nombre_feature, unicode(valor_feature), unicode(valor_feature)))
        bar.next()
    conexion.commit()
    bar.finish()
    cursor.close()
    conexion.close()
def cargar_parecidos_con_distinto_humor():
    with closing(mysql.connector.connect(user=DB_USER, password=DB_PASS, host=DB_HOST,
                                         database=DB_NAME)) as conexion:
        # buffered=True so the row count is known before iterating.
        with closing(conexion.cursor(buffered=True)) as cursor:
            consulta = """
            SELECT id_tweet_humor,
                   id_tweet_no_humor
            FROM   tweets_parecidos_distinto_humor
            """
            cursor.execute(consulta)

            pares_ids_parecidos_con_distinto_humor = []

            bar = IncrementalBar("Cargando tweets parecidos\t", max=cursor.rowcount,
                                 suffix=SUFIJO_PROGRESS_BAR)
            bar.next(0)
            for par_ids in cursor:
                pares_ids_parecidos_con_distinto_humor.append(par_ids)
                bar.next()
            bar.finish()

            return pares_ids_parecidos_con_distinto_humor
def cargar_tweets(limite=None, agregar_sexuales=False, cargar_features=True):
    """Load every tweet, including those reserved for evaluation (even if no evaluation is wanted)
    and those with bad votes, so that features are computed for all of them. Filtering happens later."""
    conexion = open_db()
    if DB_ENGINE == 'sqlite3':
        cursor = conexion.cursor()
    else:
        cursor = conexion.cursor(buffered=True)  # buffered so the row count is known before iterating

    if agregar_sexuales:
        consulta_sexuales_tweets = ""
        consulta_limite_sexuales = ""
    else:
        consulta_sexuales_tweets = "censurado_tweet = 0"
        consulta_limite_sexuales = "AND " + consulta_sexuales_tweets
    consulta_sexuales_features = consulta_sexuales_tweets

    if limite:
        consulta = "SELECT id_tweet FROM tweets WHERE evaluacion = 0 " + consulta_limite_sexuales \
                   + " ORDER BY RAND() LIMIT " + unicode(limite)
        cursor.execute(consulta)

        bar = IncrementalBar("Eligiendo tweets aleatorios\t", max=cursor.rowcount,
                             suffix=SUFIJO_PROGRESS_BAR)
        bar.next(0)
        ids = []
        for (tweet_id,) in cursor:
            ids.append(tweet_id)
            bar.next()
        bar.finish()

        str_ids = '(' + unicode(ids).strip('[]L') + ')'
        consulta_prueba_tweets = "T.id_tweet IN {ids}".format(ids=str_ids)
        consulta_prueba_features = "id_tweet IN {ids}".format(ids=str_ids)
    else:
        consulta_prueba_features = ""
        consulta_prueba_tweets = ""

    if not agregar_sexuales and limite:
        restricciones_tweets = "WHERE " + consulta_sexuales_tweets + " AND " + consulta_prueba_tweets
        restricciones_features = "WHERE " + consulta_sexuales_features + " AND " + consulta_prueba_features
    elif not agregar_sexuales:
        restricciones_tweets = "WHERE " + consulta_sexuales_tweets
        restricciones_features = "WHERE " + consulta_sexuales_features
    elif limite:
        restricciones_tweets = "WHERE " + consulta_prueba_tweets
        restricciones_features = "WHERE " + consulta_prueba_features
    else:
        restricciones_tweets = ""
        restricciones_features = ""

    if DB_ENGINE == 'sqlite3':
        consulta = """
            SELECT id_account, T.id_tweet, text_tweet, favorite_count_tweet, retweet_count_tweet,
                   eschiste_tweet, censurado_tweet, name_account, followers_count_account, evaluacion,
                   votos, votos_humor, promedio_votos, categoria_tweet
            FROM   tweets AS T NATURAL JOIN twitter_accounts
                   LEFT JOIN (SELECT id_tweet,
                                     Avg(voto) AS promedio_votos,
                                     Count(*) AS votos,
                                     Count(case when voto <> 'x' then 1 else NULL end) AS votos_humor
                              FROM   votos
                              WHERE  voto <> 'n'
                              GROUP  BY id_tweet) V
                          ON ( V.id_tweet = T.id_tweet )
            {restricciones}
            """.format(restricciones=restricciones_tweets)
    else:
        consulta = """
            SELECT id_account, T.id_tweet, text_tweet, favorite_count_tweet, retweet_count_tweet,
                   eschiste_tweet, censurado_tweet, name_account, followers_count_account, evaluacion,
                   votos, votos_humor, promedio_votos, categoria_tweet
            FROM   tweets AS T NATURAL JOIN twitter_accounts
                   LEFT JOIN (SELECT id_tweet,
                                     Avg(voto) AS promedio_votos,
                                     Count(*) AS votos,
                                     Count(If(voto <> 'x', 1, NULL)) AS votos_humor
                              FROM   votos
                              WHERE  voto <> 'n'
                              GROUP  BY id_tweet) V
                          ON ( V.id_tweet = T.id_tweet )
            {restricciones}
            """.format(restricciones=restricciones_tweets)

    cursor.execute(consulta)

    bar = IncrementalBar("Cargando tweets\t\t\t",
                         max=(999999 if DB_ENGINE == 'sqlite3' else cursor.rowcount),
                         suffix=SUFIJO_PROGRESS_BAR)
    bar.next(0)

    resultado = {}

    for (id_account, tweet_id, texto, favoritos, retweets, es_humor, censurado, cuenta, seguidores,
         evaluacion, votos, votos_humor, promedio_votos, categoria) in cursor:
        tweet = Tweet()
        tweet.id = tweet_id
        tweet.texto_original = texto
        tweet.texto = texto
        tweet.favoritos = favoritos
        tweet.retweets = retweets
        tweet.es_humor = es_humor
        tweet.es_chiste = es_humor
        tweet.censurado = censurado
        tweet.cuenta = cuenta
        tweet.seguidores = seguidores
        tweet.evaluacion = evaluacion
        tweet.categoria = categoria
        if votos:
            tweet.votos = int(votos)  # This one and the next come from COUNT/AVG, so they are Decimal.
        if votos_humor:
            tweet.votos_humor = int(votos_humor)
        if promedio_votos:
            tweet.promedio_de_humor = promedio_votos
        resultado[tweet.id] = tweet
        bar.next()

    bar.finish()

    if cargar_features:
        consulta = """
            SELECT id_tweet, nombre_feature, valor_feature
            FROM   features NATURAL JOIN tweets
            {restricciones}
            """.format(restricciones=restricciones_features)

        cursor.execute(consulta)

        bar = IncrementalBar("Cargando features\t\t",
                             max=(9999999 if DB_ENGINE == 'sqlite3' else cursor.rowcount),
                             suffix=SUFIJO_PROGRESS_BAR)
        bar.next(0)

        for (id_tweet, nombre_feature, valor_feature) in cursor:
            if id_tweet in resultado:
                resultado[id_tweet].features[nombre_feature] = valor_feature
            bar.next()

        bar.finish()

    cursor.close()
    conexion.close()

    return list(resultado.values())
def fat_experiment(self):
    it_list = [100, 200, 300]
    c1_list = [0.2, 0.4, 0.6]
    c2_list = [0.2, 0.4, 0.6]
    w_list = [0.3, 0.6, 0.9]
    pop_list = [100, 200, 300]
    data = {
        'id': [],
        'it_number': [],
        'size': [],
        'c1': [],
        'c2': [],
        'w': [],
        'best_fo': [],
        'best_s_it': []
    }
    id = 0
    data_100 = {'id': [], 'min_fo': [], 'max_fo': [], 'mean_fo': []}
    data_200 = {'id': [], 'min_fo': [], 'max_fo': [], 'mean_fo': []}
    data_300 = {'id': [], 'min_fo': [], 'max_fo': [], 'mean_fo': []}
    bar = IncrementalBar('Processing', max=(3**5) * 10)
    for it in it_list:
        for c1 in c1_list:
            for c2 in c2_list:
                for w in w_list:
                    for size in pop_list:
                        for i in range(10):
                            best_s, best_fo, results = self.solve(
                                it_number=it, pop_size=size, c1=c1, c2=c2, w=w)
                            data['id'].append(id)
                            data['it_number'].append(it)
                            data['size'].append(size)
                            data['c1'].append(c1)
                            data['c2'].append(c2)
                            data['w'].append(w)
                            data['best_s_it'].append(results['best_s_it'])
                            data['best_fo'].append(best_fo)
                            if it == 100:
                                for i in range(len(results['min_fo'])):
                                    data_100['id'].append(id)
                                    data_100['min_fo'].append(results['min_fo'][i])
                                    data_100['max_fo'].append(results['max_fo'][i])
                                    data_100['mean_fo'].append(results['mean_fo'][i])
                                # print('data', data_100)
                            if it == 200:
                                for i in range(len(results['min_fo'])):
                                    data_200['id'].append(id)
                                    data_200['min_fo'].append(results['min_fo'][i])
                                    data_200['max_fo'].append(results['max_fo'][i])
                                    data_200['mean_fo'].append(results['mean_fo'][i])
                            if it == 300:
                                for i in range(len(results['min_fo'])):
                                    data_300['id'].append(id)
                                    data_300['min_fo'].append(results['min_fo'][i])
                                    data_300['max_fo'].append(results['max_fo'][i])
                                    data_300['mean_fo'].append(results['mean_fo'][i])
                            bar.next()
                            id += 1
    pd.DataFrame.from_dict(data).to_csv('results/header.csv', index=False)
    pd.DataFrame.from_dict(data_100).to_csv('results/history_100.csv', index=False)
    pd.DataFrame.from_dict(data_200).to_csv('results/history_200.csv', index=False)
    pd.DataFrame.from_dict(data_300).to_csv('results/history_300.csv', index=False)
    bar.finish()
def run(self):
    # print("starting file thread")
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.bind(('127.0.0.1', 10001))
    sock.settimeout(10.0)
    sock.listen(5)
    # print("Entering while loop")
    block_size = 4096
    while not self.file_terminate.is_set():
        # print("In while loop")
        c, addr = sock.accept()  # Establish connection with client.
        # print('Got connection from', addr)
        # print("Receiving...")
        data = str(c.recv(5).decode("utf-8"))
        # print("data:", data)
        num_files = int(data)
        for i in range(num_files):
            data = str(c.recv(10).decode('utf-8'))
            # print("filebytes: " + data)
            fileb = int(data.split(" ")[0])  # endianness may be affecting data transfer
            filenamebytes = int(data.split(" ")[1])
            data = str(c.recv(filenamebytes).decode('utf-8'))
            # print("filename received: " + data)
            filename = data
            curb = 0
            print('Receiving ' + filename + "...")
            bar = IncrementalBar('Percentage of file transferred', max=100)
            percentage = 0
            path = "/".join(filename.split("/")[:-1])
            destination = self.id + "/" + path
            file_name = filename.split("/")[-1]
            if not os.path.exists(destination + "/" + file_name):
                if not os.path.exists(destination):
                    os.makedirs(destination)
                writeto = open(destination + '/' + file_name, 'w+')
                while curb < fileb:
                    sys.stdout.flush()
                    if curb + block_size > fileb:
                        data = str(c.recv(fileb - curb).decode('utf-8'))
                    else:
                        data = str(c.recv(block_size).decode('utf-8'))
                    writeto.write(data)
                    next_val = int(100 * (curb / fileb))
                    for i in range(next_val - percentage):
                        bar.next()
                    curb += len(data)
                    percentage = next_val
                    # print('\r' + str(curb) + "/" + str(fileb))
                # print("")
                # print("Last text:", data)
                next_val = int(100 * (curb / fileb))
                for i in range(next_val - percentage):
                    bar.next()
                curb += len(data)
                percentage = next_val
                writeto.close()
                bar.finish()
                print('Transfer of ' + filename + ' successful.')
            else:
                lines = ""
                while curb < fileb:
                    sys.stdout.flush()
                    if curb + block_size > fileb:
                        data = str(c.recv(fileb - curb).decode('utf-8'))
                    else:
                        data = str(c.recv(block_size).decode('utf-8'))
                    # data = str(data.decode('utf-8'))
                    if data:
                        lines += data
                    next_val = int(100 * (curb / fileb))
                    for i in range(next_val - percentage):
                        bar.next()
                    curb += len(data)
                    percentage = next_val
                    # print('\r' + str(curb) + "/" + str(fileb))
                # print("")
                lines = lines.split("\n")
                next_val = int(100 * (curb / fileb))
                for i in range(next_val - percentage):
                    bar.next()
                curb += len(data)
                percentage = next_val
                # print("lines: ", lines)
                # print("end")
                for i in range(len(lines) - 1):
                    lines[i] = lines[i] + "\n"
                self.writeToFile(lines, destination + "/" + file_name, destination + "/" + file_name)
                bar.finish()
                print('Transfer of ' + filename + ' successful.')
        self.file_terminate.set()
    print("File reading client closed")
    sock.close()
if __name__ == "__main__":
    scriptPath = os.path.realpath(__file__)
    scriptDir = os.path.dirname(scriptPath)
    subDirs = next(os.walk(scriptDir))[1]

    # Upload all of the top-level files
    for fileName in os.listdir(scriptDir):
        if not os.path.isdir(os.path.join(scriptDir, fileName)):
            if fileName in IGNORE_FILES:
                continue
            upload_file(os.path.join(scriptDir, fileName), AWS_WEBSITE_BUCKET_NAME, fileName,
                        determine_content_type(fileName))

    # For each subDir within the top-level directory, upload all files using an ObjectName of
    # subdir/fileName to recreate the folder structure within S3
    progress_bar = IncrementalBar(" Uploading website assets to S3",
                                  max=return_total_num_of_asset_files(scriptDir),
                                  suffix='%(percent)d%% ')
    for subDir in subDirs:
        if subDir in IGNORE_DIRS:
            continue
        for fileName in os.listdir(os.path.join(scriptDir, subDir)):
            if fileName in IGNORE_FILES:
                continue
            upload_file(os.path.join(scriptDir, subDir, fileName), AWS_WEBSITE_BUCKET_NAME,
                        os.path.join(subDir, fileName), determine_content_type(fileName))
            progress_bar.next()
    progress_bar.finish()
def create_temp_frustrum_voxels(view_manager, voxel_config, out_dim, cat_id,
                                compression='lzf'):
    from progress.bar import IncrementalBar
    view_params = view_manager.get_view_params()
    n_views = view_params['n_views']
    f = view_params['f']
    in_dims = (voxel_config.voxel_dim, ) * 3
    ray_shape = (out_dim, ) * 3
    example_ids = tuple(view_manager.get_example_ids(cat_id))
    n0 = len(example_ids)
    temp_path = _get_frustrum_voxels_path(voxel_config, view_manager.view_id,
                                          out_dim, cat_id, code='temp')
    _make_dir(temp_path)
    with h5py.File(temp_path, 'a') as vox_dst:
        attrs = vox_dst.attrs
        prog = attrs.get('prog', 0)
        if prog == n0:
            return temp_path
        attrs.setdefault('n_views', n_views)
        max_len = attrs.setdefault('max_len', 0)

        vox_manager = get_voxel_manager(voxel_config, cat_id, key='rle',
                                        compression=compression, shape_key='pad')
        vox_manager.get_dataset()  # ensure data exists
        assert (vox_manager.has_dataset())
        with h5py.File(vox_manager.path, 'r') as vox_src:
            rle_src = vox_src[GROUP_KEY]
            n, m = rle_src.shape
            max_max_len = m * 3
            assert (n == n0)

            print('Creating temp rle frustrum voxel data at %s' % temp_path)
            rle_dst = vox_dst.require_dataset(GROUP_KEY, shape=(n, n_views, max_max_len),
                                              dtype=np.uint8, compression=compression)
            bar = IncrementalBar(max=n - prog)
            for i in range(prog, n):
                bar.next()
                voxels = RleVoxels(np.array(rle_src[i]), in_dims)
                eye = view_manager.get_camera_positions(cat_id, example_ids[i])
                for j in range(n_views):
                    out = convert(voxels, eye[j], ray_shape, f)
                    data = out.rle_data()
                    dlen = len(data)
                    if dlen > max_len:
                        attrs['max_len'] = dlen
                        max_len = dlen
                        if dlen > max_max_len:
                            raise ValueError('max_max_len exceeded. %d > %d'
                                             % (dlen, max_max_len))
                    rle_dst[i, j, :dlen] = data
                attrs['prog'] = i + 1
            bar.finish()
    return temp_path
print(f"{Fore.GREEN}All settings valid, proceeding...")
print(f"Downloading {filename[0]}")
chunkSize = 10240
try:
    r = requests.get(url + filename[0], stream=True)
    with open(tempDir + filename[0], "wb") as f:
        pbar = IncrementalBar(
            "Downloading",
            max=int(r.headers["Content-Length"]) / chunkSize,
            suffix="%(percent)d%%",
        )
        for chunk in r.iter_content(chunk_size=chunkSize):
            if chunk:  # filter out keep-alive new chunks
                pbar.next()
                f.write(chunk)
        pbar.finish()
except Exception:
    print(f"Download {Fore.RED}failed, please try again. Exiting.")
    sys.exit()
print(f"Download {Fore.GREEN}done")

# Extraction
spinnerExtract = Spinner("Extracting... ")
spinnerExtract.start()
try:
    shutil.unpack_archive(tempDir + filename[0], tempDir)
except Exception:
    print(f"Extraction {Fore.RED}failed, please try again. Exiting.")
    sys.exit()
spinnerExtract.stop()
print(f"Extraction {Fore.GREEN}done")
def initiateTraining(self):
    bar = IncrementalBar('Training', max=self.epochs)
    self.d_x = []
    self.d_x2 = []
    for e in range(self.epochs):
        bar.next()
        self.epoch = e
        noise, noise_hyperparams = self.get_noise()
        batch_DS, batch_hyperparams = self.get_train_data()
        self.generated_ds = self.layout.g.predict(
            [noise] + noise_hyperparams, batch_size=self.batch_size)
        real_label = np.array(
            [[1., 0.] for i in range(len(self.energies) * self.batch_size)])
        fake_label = np.array(
            [[0., 1.] for i in range(len(self.energies) * self.batch_size)])
        train_label = np.array(
            [[1., 0.] for i in range(len(self.energies) * self.batch_size)])
        X = np.concatenate([batch_DS, self.generated_ds])
        all_Xh = [X]
        for num in range(len(noise_hyperparams)):
            all_Xh.append(
                np.concatenate(
                    [batch_hyperparams[num], noise_hyperparams[num]]))
        Y = np.concatenate([real_label, fake_label])
        W = np.concatenate([
            np.ones(shape=(len(self.energies) * self.batch_size, )),
            np.full(fill_value=1, shape=(len(self.energies) * self.batch_size, ))
        ])
        self.layout.d.trainable = True
        d_loss, d_acc = self.layout.d.train_on_batch(all_Xh, Y, sample_weight=W)
        d_loss, d_acc = self.layout.d.train_on_batch(all_Xh, Y, sample_weight=W)
        d_loss, d_acc = self.layout.d.train_on_batch(all_Xh, Y, sample_weight=W)
        d_loss, d_acc = self.layout.d.train_on_batch(all_Xh, Y, sample_weight=W)
        d_loss, d_acc = self.layout.d.train_on_batch(all_Xh, Y, sample_weight=W)
        self.layout.d.trainable = False
        logs = self.gan.train_on_batch([noise] + noise_hyperparams, train_label)
        self.layout.tensorboard.on_epoch_end(
            self.epoch, self.named_logs(self.gan, logs))
        if e == 0 or (e + 1) % self.epochCheck == 0:
            self.generated_ds = {}
            noise, noise_hyperparams = self.get_noise()
            temp_generated = self.layout.g.predict([noise] + noise_hyperparams)
            gen_class_length = int(temp_generated.shape[0] / (len(self.energies)))
            for en in range(1, len(self.energies) + 1):
                self.generated_ds[self.energies[en - 1]] = {}
                for num, var in enumerate(self.variables_of_interest):
                    current_var = np.asarray(temp_generated)[:, num]
                    gen_energies_var = current_var[(en - 1) * gen_class_length:en * gen_class_length]
                    self.generated_ds[self.energies[en - 1]][var] = gen_energies_var
            multiples = int(
                len(self.training_ds[self.energies[0]][self.variables_of_interest[0]])
                / self.batch_size)
            for i in range(1, multiples):
                noise, noise_hyperparams = self.get_noise()
                temp_generated = self.layout.g.predict([noise] + noise_hyperparams)
                gen_class_length = int(temp_generated.shape[0] / (len(self.energies)))
                for en in range(1, len(self.energies) + 1):
                    for num, var in enumerate(self.variables_of_interest):
                        current_var = np.asarray(temp_generated)[:, num]
                        gen_energies_var = current_var[(en - 1) * gen_class_length:en * gen_class_length]
                        self.generated_ds[self.energies[en - 1]][var] = np.concatenate([
                            self.generated_ds[self.energies[en - 1]][var],
                            gen_energies_var
                        ])
            if (self.epoch + self.epochCheck >= self.epochs):
                self.last = True
            # Set instead of 3 the number of candidates to show
            indexes = np.round(np.linspace(0, len(self.energies) - 1, 3)).astype(int)
            selected_energies = [
                self.energies[i] for i in range(len(self.energies)) if i in indexes
            ]
            selected_training_ds = dict(
                filter(lambda elem: elem[0] in selected_energies, self.training_ds.items()))
            selected_generated_ds = dict(
                filter(lambda elem: elem[0] in selected_energies, self.generated_ds.items()))
            for num, var in enumerate(self.variables_of_interest):
                random_energy = randint(0, len(selected_energies) - 1)
                true_d = selected_training_ds[selected_energies[random_energy]][var] * \
                    self.normalisation[selected_energies[random_energy]][var]
                false_d = selected_generated_ds[selected_energies[random_energy]][var] * \
                    self.normalisation[selected_energies[random_energy]][var]
                self.d_x.append(self.getMoment1(true_d, false_d))
                self.d_x2.append(self.getMoment2(true_d, false_d))
            self.visualiseCurrentEpoch(selected_training_ds, selected_generated_ds,
                                       selected_energies, self.generated_ds)
        self.all_epochs.append(e)
        self.d_loss.append(d_loss)
        self.d_acc.append(d_acc)
    self.layout.tensorboard.on_train_end(None)
    bar.finish()
    return self.final_produced
def assemble_collage():
    print('start assembling collage')

    # load all from downsized path
    files = os.listdir(downsized_path)
    files = [
        file for file in files
        if os.path.isfile(os.path.join(downsized_path, file))
    ]
    images = []
    bar = IncrementalBar('Loading', max=len(files))
    for file in files:
        im = Image.open(os.path.join(downsized_path, file))
        im = np.asarray(im)
        images.append(im)
        bar.next()
    bar.finish()

    # compute total amount of light in each image and only keep the N brightest
    images = [(np.sum(image), image) for image in images]
    images.sort(key=lambda x: x[0], reverse=True)
    images = images[:N]
    images = [x[1] for x in images]

    # compute the average color in each quadrant
    Cx = int(target_height / 2)
    Cy = int(target_width / 2)
    U = [np.mean(image[:Cx, :, :], axis=(1, 2)) for image in images]
    D = [np.mean(image[Cx:, :, :], axis=(1, 2)) for image in images]
    R = [np.mean(image[:, :Cy, :], axis=(1, 2)) for image in images]
    L = [np.mean(image[:, Cy:, :], axis=(1, 2)) for image in images]

    # initially just sort them in randomly
    map = np.random.permutation(N).reshape((Nx, Ny))

    # optimize neighbors with a stochastic metropolis algorithm
    Ni = 500000
    T = np.linspace(150, 2, Ni)
    A = np.zeros((Ni, 1))
    u = lambda x: (x + 1) % Nx
    d = lambda x: (x - 1) % Nx
    r = lambda x: (x + 1) % Ny
    l = lambda x: (x - 1) % Ny
    score = lambda i1, j1, i2, j2: (
        np.linalg.norm(U[map[i1, j1]] - D[map[u(i2), j2]])
        + np.linalg.norm(D[map[i1, j1]] - U[map[d(i2), j2]])
        + np.linalg.norm(L[map[i1, j1]] - R[map[i2, l(j2)]])
        + np.linalg.norm(R[map[i1, j1]] - L[map[i2, r(j2)]]))
    bar = IncrementalBar('Optimization', max=Ni)
    for ai in range(Ni):
        # get two non-equal random locations
        i1 = np.random.randint(Nx)
        j1 = np.random.randint(Ny)
        while True:
            i2 = np.random.randint(Nx)
            j2 = np.random.randint(Ny)
            if i1 != i2 or j1 != j2:
                break
        # compute score
        x = score(i1, j1, i1, j1) - score(i1, j1, i2, j2) \
            + score(i2, j2, i2, j2) - score(i2, j2, i1, j1)
        # exchange
        # if x < 0:
        # if x > 0:
        if x > 0 or np.exp(x / T[ai]) > np.random.uniform():
            map[i1, j1], map[i2, j2] = map[i2, j2], map[i1, j1]
            A[ai] = 1
        bar.next()
    bar.finish()

    # time evolution of acceptance rate
    Nc = int(np.floor(Ni / 20))
    for ai in range(20):
        print('{}: {}'.format(ai, np.mean(A[ai * Nc:(ai + 1) * Nc])))

    # shift brightest to center
    B = np.zeros((Nx, Ny))
    for i in range(Nx):
        for j in range(Ny):
            B[i, j] = np.sum(images[map[i, j]])
    sk = np.array([0.25, 0.5, 1, 0.5, 0.25])
    # convolve in 1D along all rows and all columns
    for i in range(Nx):
        B[i, :] = np.convolve(B[i, :], sk, mode='same')
    for j in range(Ny):
        B[:, j] = np.convolve(B[:, j], sk, mode='same')
    cx, cy = np.unravel_index(np.argmax(B), B.shape)
    map = np.roll(map, (int(Nx / 2 - cx), int(Ny / 2 - cy)), axis=(0, 1))

    # assemble image
    final = np.zeros((Nx * target_height, Ny * target_width, 3), dtype=np.uint8)
    for i in range(Nx):
        for j in range(Ny):
            final[i * target_height:(i + 1) * target_height,
                  j * target_width:(j + 1) * target_width] = images[map[i, j]]

    # convert back to pillow image and save
    im = Image.fromarray(final)
    im.save(output_file)
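# Note: an illustrative restatement (not part of the collage script above) of the
# Metropolis acceptance rule used in its optimization loop: a swap that improves the
# neighbor-match score is always taken, while a worse one is taken with probability
# exp(delta / T), so the layout can escape local optima as T anneals downward.
import numpy as np


def accept_swap(delta, temperature):
    # delta > 0 means the swap improves the score (as defined in assemble_collage)
    return delta > 0 or np.exp(delta / temperature) > np.random.uniform()


# A slightly worse swap is accepted often at high temperature, rarely at low temperature.
print(sum(accept_swap(-5.0, 150.0) for _ in range(1000)))  # roughly 960 of 1000
print(sum(accept_swap(-5.0, 2.0) for _ in range(1000)))    # roughly 80 of 1000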
def lc2SDS():
    """
    Convert fixed LCHEAPO data to SeisComp Data Structure

    SIMPLE drift and leapsecond correction:
        - offset is constant within each daily file
        - offset information is not written in header
        - data quality field is not modified
        - leapsecond flag is not raised (causes apparent 1-s gap/overlap).

    Writes to a directory named SDS/ in the output directory.
    """
    print(lc2SDS.__doc__)
    parser = argparse.ArgumentParser(
        description=inspect.cleandoc(lc2SDS.__doc__),
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument("infiles", nargs='+',
                        help="Input filename(s). If there are captured "
                             "wildcards (put in '' so that they aren't "
                             "interpreted by the shell), will expand them "
                             "in the input directory")
    parser.add_argument("-t", "--obs_type", default='SPOBS2',
                        help="obs type. Controls channel and location codes",
                        choices=[s for s in chan_maps])
    parser.add_argument("--station", default='SSSSS',
                        help="station code for this instrument")
    parser.add_argument("--network", default='XX',
                        help="network code for this instrument")
    parser.add_argument("-s", "--start_times", nargs='+',
                        metavar=("REF_START", "INST_START"),
                        help="Start datetimes for the reference (usually GPS) "
                             "and instrument. If only one value is provided, "
                             "it will be used for both")
    parser.add_argument("-e", "--end_times", nargs=2,
                        metavar=("REF_END", "INST_END"),
                        help="End datetimes for the reference and instrument")
    parser.add_argument("--leapsecond_times", nargs='+',
                        help="leapsecond times")
    parser.add_argument("--leapsecond_types", default='+',
                        help="'+' for extra second, '-' for removed second. "
                             "If there is one character it is applied to all "
                             "leapseconds, if there is more than one the "
                             "length of the string must match "
                             "the number of leapsecond_times")
    parser.add_argument("-d", dest="base_dir", metavar="BASE_DIR", default='.',
                        help="base directory for files")
    parser.add_argument("-i", dest="in_dir", metavar="IN_DIR", default='.',
                        help="input file directory (absolute, " +
                             "or relative to base_dir)")
    parser.add_argument("-o", dest="out_dir", metavar="OUT_DIR", default='.',
                        help="output file directory (absolute, " +
                             "or relative to base_dir)")
    parser.add_argument("-v", "--verbose", action='store_true',
                        help="verbose output")
    parser.add_argument("--version", action='store_true',
                        help="Print version number and quit")
    args = parser.parse_args()
    parameters = vars(args).copy()
    if args.version is True:
        print(f"Version {__version__}")
        sys.exit(0)

    # ADJUST INPUT PARAMETERS
    if args.start_times is not None:
        args.start_times = [UTCDateTime(x) for x in args.start_times]
    if args.end_times is not None:
        args.end_times = [UTCDateTime(x) for x in args.end_times]
    ls_times, ls_types = _adjust_leapseconds(args.leapsecond_times, args.leapsecond_types)

    # SETUP FOR PROCESS-STEPS
    process_step = ProcessStep('lc2SDS',
                               " ".join(sys.argv),
                               app_description=__doc__,
                               app_version=__version__,
                               parameters=parameters)
    args.in_dir, args.out_dir, args.infiles = ProcessStep.setup_paths(args)
    # Expand captured wildcards
    # args.infiles = [x.name for f in args.infiles
    #                 for x in Path(args.in_dir).glob(f)]

    for infile in args.infiles:
        lc_start, lc_end = get_data_timelimits(Path(args.in_dir) / infile)

        if args.start_times and args.end_times:
            ref_start = args.start_times[0]
            if len(args.start_times) > 1:
                inst_start = args.start_times[1]
            else:
                inst_start = ref_start
            ref_end, inst_end = args.end_times
            if inst_start == 0:
                inst_start = ref_start
            inst_start_offset = inst_start - ref_start
            inst_drift = ((inst_end - ref_end) - inst_start_offset)\
                / (ref_end - inst_start)
            print('instrument start offset = {:g}s, drift rate = {:.4g}'.format(
                inst_start_offset, inst_drift))
            # quality_flag = 'Q'  # Don't know how to put this in miniSEED
        else:
            ref_start, inst_start = lc_start, lc_start
            inst_start_offset = 0
            inst_drift = 0
            warnings.warn('Could not calculate clock drift, assuming zero!')
            # quality_flag = 'D'  # Don't know how to put this in miniSEED

        lc_start_day = lc_start.replace(hour=0, minute=0, second=0, microsecond=0)
        lc_end_day = lc_end.replace(hour=0, minute=0, second=0, microsecond=0)
        stime = lc_start_day
        bar = IncrementalBar(f'Processing {infile}',
                             max=(lc_end_day - lc_start_day) / 86400 + 1)
        while stime <= lc_end_day:
            inst_offset = inst_start_offset + inst_drift * (stime - ref_start)
            _write_daily(inst_offset, stime, infile, args, ls_times, ls_types)
            bar.next()
            stime += 86400
        bar.finish()

    return_code = 0
    process_step.exit_code = return_code
    process_step.write(args.in_dir, args.out_dir)
    sys.exit(return_code)
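# Note: a minimal, self-contained sketch of the linear clock-drift correction used in
# lc2SDS() above, with made-up reference/instrument times expressed in plain seconds so
# it runs without obspy. ref_* are the "true" (GPS) times, inst_* the instrument times.
ref_start, ref_end = 0.0, 86400.0 * 30          # hypothetical 30-day deployment
inst_start, inst_end = 0.0, 86400.0 * 30 + 1.5  # instrument clock ends 1.5 s late

inst_start_offset = inst_start - ref_start
inst_drift = ((inst_end - ref_end) - inst_start_offset) / (ref_end - inst_start)

for day in range(0, 31, 10):
    stime = ref_start + day * 86400.0
    # one constant offset applied to the whole daily file, as in _write_daily()
    inst_offset = inst_start_offset + inst_drift * (stime - ref_start)
    print('day %2d: offset = %.4f s' % (day, inst_offset))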
def _seed_districts(self, graph, districts):
    """
    A simple procedure that selects n random seed nodes (n = number of districts)
    and then selects neighbors of those seeds and claims them to be of the same
    district.

    Performance Notes:
    o(n^3), but operations are cheap.
    """
    bar = IncrementalBar("Seeding Districts", max=len(graph.nodes))
    graph_pool = [_ for _ in graph.nodes]
    random.shuffle(graph_pool)

    district_sizes = [[1, district] for district in range(districts)]

    # Start the district with some seeds
    for district in range(districts):
        bar.next()
        seed = graph_pool.pop()
        graph.nodes.get(seed)['dis'] = district

    # While there are unclaimed nodes
    while graph_pool:
        last_run = len(graph_pool)
        # Let each district claim a new node
        district_sizes = sorted(district_sizes)
        for i, (size, district) in enumerate(district_sizes):
            round_complete = False
            # Find the nodes that belong to a district
            for node, props in graph.nodes(data=True):
                if props.get('dis') == district:
                    # Iterate through edges and find an unclaimed neighbor
                    for _, neighbor in graph.edges(node):
                        if neighbor in graph_pool:
                            graph_pool.remove(neighbor)
                            district_sizes[i][0] += 1
                            bar.next()
                            graph.nodes.get(neighbor)['dis'] = district
                            round_complete = True
                            break
                if round_complete:
                    break  # Quicker breaking
            # if round_complete: break  # Quicker breaking

        if len(graph_pool) == last_run:
            for candidate in graph_pool:
                for _, neighbor in graph.edges(candidate):
                    district = graph.nodes[neighbor].get('dis', -1)
                    if district != -1:
                        graph_pool.remove(candidate)
                        district_sizes[district][0] += 1
                        bar.next()
                        graph.nodes[candidate]['dis'] = district
                        round_complete = True
                        break
                if round_complete:
                    break
            if round_complete:
                break

        if len(graph_pool) == last_run:
            # PANIC
            import pdb
            pdb.set_trace()

    bar.finish()
    return graph
def random_sampling_program(literal, bn, samples):
    global inconsistent_program, repeated_program, uniquePrograms, uniqueOnlyPrograms, uniqueEvidences, repeated_evidence
    all_possible_programs = pow(2, dimension)
    bar = IncrementalBar("Random sampling programs...", max=samples)
    initial_time = time.time()
    for i in range(samples):
        int_program = np.random.choice(all_possible_programs + 1, 1)
        program = int_to_bin_with_format(int_program, dimension)[0]
        tuple_program = tuple(program)
        if tuple_program not in uniqueOnlyPrograms:
            uniqueOnlyPrograms.add(tuple_program)
            delp = build_delp_from_binaries(program)
            evidence = get_evidence(program)
            if evidence != 'incorrect_program':
                tuple_evidence = tuple(evidence.items())
                if tuple_evidence not in uniqueEvidences:
                    uniqueEvidences.add(tuple_evidence)
                    status = queryToProgram([delp, program], literal, uniquePrograms)
                    prWorld = bn.get_sampling_prob(evidence)
                    if status[1] == 'yes':
                        results['yes']['total'] += 1
                        results['yes']['prob'] = results['yes']['prob'] + prWorld
                    elif status[1] == 'no':
                        results['no']['total'] += 1
                        results['no']['prob'] = results['no']['prob'] + prWorld
                    elif status[1] == 'undecided':
                        results['und']['total'] += 1
                        results['und']['prob'] = results['und']['prob'] + prWorld
                    elif status[1] == 'unknown':
                        results['unk']['total'] += 1
                        results['unk']['prob'] = results['unk']['prob'] + prWorld
                else:
                    repeated_evidence += 1
            else:
                inconsistent_program += 1
        else:
            repeated_program += 1
        bar.next()
    bar.finish()
    time_execution = time.time() - initial_time
    results['execution_time'] = time_execution
    results['unique_programs'] = len(uniqueOnlyPrograms)
    results['inconsistent']['total'] = inconsistent_program
    results['inconsistent']['perc'] = "{:.2f}".format(
        (results['inconsistent']['total'] * 100) / samples)
    results['repeated']['total'] = repeated_program
    results['repeated']['perc'] = "{:.2f}".format(
        (results['repeated']['total'] * 100) / samples)
    results['domain'] = 'programs'
    results['yes']['perc'] = "{:.2f}".format((results['yes']['total'] * 100) / samples)
    results['no']['perc'] = "{:.2f}".format((results['no']['total'] * 100) / samples)
    results['und']['perc'] = "{:.2f}".format((results['und']['total'] * 100) / samples)
    results['unk']['perc'] = "{:.2f}".format((results['unk']['total'] * 100) / samples)
    results['l'] = results['yes']['prob']
    results['u'] = results['u'] - results['no']['prob']
    results['total_sampling'] = samples
    print("Unique programs: ", end='')
    print_ok_ops("%s" % len(uniqueOnlyPrograms))
    print("Unique evidence: ", end='')
    print_ok_ops("%s" % len(uniqueEvidences))
    print("Inconsistent programs: ", end='')
    print_ok_ops("%s" % inconsistent_program)
    print("repeated evidence: ", repeated_evidence)
    with open('/home/mario/results/umda/UMDARandomPrograms.json', 'w') as outfile:
        json.dump(results, outfile, indent=4)
def umda_brute_force_programs(literal, bn):
    global uniqueEvidence, uniquePrograms, repeated_evidence, inconsistent_program
    all_possible_programs = pow(2, dimension)
    bar = IncrementalBar("Analyzing programs...", max=all_possible_programs)
    initial_time = time.time()
    for int_value in range(all_possible_programs):
        program = int_to_bin_with_format(int_value, dimension)[0]  # Return [program, evidence] REVISAR
        evidence = get_evidence(program)
        if evidence != 'incorrect_program':
            tuple_evidence = tuple(evidence.items())
            if tuple_evidence not in uniqueEvidences:
                uniqueEvidences.add(tuple_evidence)
                delp = build_delp_from_binaries(program)
                status = queryToProgram([delp, program], literal, uniquePrograms)
                prWorld = bn.get_sampling_prob(evidence)
                if status[1] == 'yes':
                    results['yes']['total'] += 1
                    results['yes']['prob'] = results['yes']['prob'] + prWorld
                elif status[1] == 'no':
                    results['no']['total'] += 1
                    results['no']['prob'] = results['no']['prob'] + prWorld
                elif status[1] == 'undecided':
                    results['und']['total'] += 1
                    results['und']['prob'] = results['und']['prob'] + prWorld
                elif status[1] == 'unknown':
                    results['unk']['total'] += 1
                    results['unk']['prob'] = results['unk']['prob'] + prWorld
            else:
                repeated_evidence += 1
        else:
            inconsistent_program += 1
        bar.next()
    bar.finish()
    time_execution = time.time() - initial_time
    results['execution_time'] = time_execution
    results['yes']['perc'] = "{:.2f}".format(
        (results['yes']['total'] * 100) / all_possible_programs)
    results['no']['perc'] = "{:.2f}".format(
        (results['no']['total'] * 100) / all_possible_programs)
    results['und']['perc'] = "{:.2f}".format(
        (results['und']['total'] * 100) / all_possible_programs)
    results['unk']['perc'] = "{:.2f}".format(
        (results['unk']['total'] * 100) / all_possible_programs)
    results['inconsistent']['total'] = inconsistent_program
    results['inconsistent']['perc'] = "{:.2f}".format(
        (results['inconsistent']['total'] * 100) / all_possible_programs)
    results['programsAnalyzed'] = all_possible_programs
    results['l'] = results['yes']['prob']
    results['u'] = results['u'] - results['no']['prob']
    print("Unique programs: ", end='')
    print_ok_ops("%s" % (int_value + 1))
    print("Unique evidence: ", end='')
    print_ok_ops("%s" % len(uniqueEvidences))
    print("Inconsistent programs: ", end='')
    print_ok_ops("%s" % inconsistent_program)
    with open('/home/mario/results/umda/UMDAForceBrutePrograms.json', 'w') as outfile:
        json.dump(results, outfile, indent=4)
# filename = "tuchan.html"
Manga_name = "VuLuyenDienPhong"
Path(Manga_name).mkdir(parents=True, exist_ok=True)
f = open(Manga_name + "/" + Manga_name + ".html", 'a+')  # Write to only one file
chapter_count = 1
for chapter in all_chapter:
    if int(args.square) > 1 and chapter_count < int(args.square):
        chapter_count = chapter_count + 1
        # _IncrementalBar.next()
        continue
    chapter_count = chapter_count + 1
    # print(chapter['href'])
    Content = Get_Chapter_Content(chapter['href'])
    # print(Content)
    # exit()
    _IncrementalBar.next()
    # print(str(Content[0]))
    # xit()
    f.write(str(Content[0]))
    content_final = Remove_unwanted_string(str(Content[1]))
    f.write(content_final)
    f.write("\n")
f.close()
_IncrementalBar.finish()
end_time = time.time()
print(end_time - star_time)
def texttype_freqs(database, folder, prop_names):
    """
    Used to collect lemmas by the types of text they appear in and sort them by
    frequency. Filters the RMH in order to retrieve the desired results. The script
    can be modified according to the user's need and to fit another corpus.
    """
    dci = SQLDatabase(db_name='gagnagrunnar/nmo.db')
    dim = SQLDatabase(db_name='gagnagrunnar/bin_lemmur_ordmyndir.db')
    filters = SQLDatabase(db_name='gagnagrunnar/IGC_filters.db')  # Predefined stop-word list based on the RMH

    print("""
    ============================================================
    Les skjöl úr málheildinni.
    ============================================================
    """)
    xml_files = glob.glob(folder + '/**/*.xml', recursive=True)

    alltexttypes = []
    freqdic1 = {}
    freqdic2 = {}

    filebar = IncrementalBar('Framvinda', max=len(xml_files))
    for file in xml_files:
        with open(file, 'r', encoding='utf-8') as content:
            try:
                tree = ET.parse(content)
                root = tree.getroot()
                textClass = root[0][2][0][0][0][0]  # Retrieve the texttype tag from the XML file
                texttype = textClass.text
                if texttype not in alltexttypes:
                    alltexttypes.append(texttype)  # Collect all unique texttypes
                pos_to_ignore = ['e', 'c', 'v', 'as', 'to', 'tp', 'ta', 'au']  # The POS tags that should not be displayed in the results
                for word in tree.iter():
                    pos = word.attrib.get('type')
                    if pos is not None:
                        if prop_names == False:
                            if pos.startswith('n') and pos.endswith('s'):  # Ignore proper names
                                continue
                        if pos in pos_to_ignore:
                            continue
                        if (not all(i.isalpha() or i == '-' for i in word.text)):  # Ignore all that are not alphabetic letters or hyphen
                            continue
                        if len(word.text) < 3:  # Ignore very short words, likely to be particles
                            continue
                        if word.text[-1] == '-':  # Ignore words starting or ending with a hyphen (likely OCR errors)
                            continue
                        if word.text[0] == '-':
                            continue
                        if word.attrib.get('lemma') is not None:
                            lemma = word.attrib.get('lemma')
                            filter_query = SQLiteQuery(lemma, 'filter', 'FILTER_WORD_FORMS',
                                                       cursor=filters.cursor)  # Ignore stop words
                            if filter_query.exists:
                                continue
                            else:
                                if database == 'NMO':
                                    query = SQLiteQuery(lemma, 'lemma', 'DCI_ELEMENT',
                                                        cursor=dci.cursor)  # Capitalized words included
                                    query_lower = SQLiteQuery(lemma.lower(), 'lemma', 'DCI_ELEMENT',
                                                              cursor=dci.cursor)
                                elif database == 'BIN':
                                    query = SQLiteQuery(lemma, 'lemma', 'DIM_ELEMENT',
                                                        cursor=dim.cursor)  # Capitalized words included
                                    query_lower = SQLiteQuery(lemma.lower(), 'lemma', 'DIM_ELEMENT',
                                                              cursor=dim.cursor)
                                if not query.exists and not query_lower.exists:  # If the word is not found in the DIM or the stopwords
                                    if lemma not in freqdic1:  # Collect total freqs
                                        freqdic1[lemma] = 1
                                    else:
                                        freqdic1[lemma] += 1
                                    if (lemma, texttype) not in freqdic2:  # Collect texttype freqs
                                        freqdic2[(lemma, texttype)] = 1
                                    else:
                                        freqdic2[(lemma, texttype)] += 1
            except IndexError:
                continue
            except ET.ParseError:
                continue
        filebar.next()
        sys.stdout.flush()
    filebar.finish()

    print("""
    ============================================================
    Flokkar tíðni eftir textagerðum.
    ============================================================
    """)
    tempfinal = []
    bar1 = IncrementalBar('Framvinda', max=len(freqdic1))
    for key, value in sorted(freqdic1.items()):  # Lemma, total freq
        tempf = []
        tempf.append(key)
        temp = []
        for k, v in freqdic2.items():
            if k[0] == key:
                temp.append((k[1], v))  # A list of all possible texttypes that appear with the lemma
        for tt in alltexttypes:
            if tt in [item[0] for item in temp]:
                continue
            else:
                temp.append((tt, 0))
        tempf.append(value)
        for tup in sorted(temp):
            tempf.append(tup[1])
        tempfinal.append(tempf)  # The format of this list is [lemma, totalfreq, texttype_a freq, texttype_b freq...]
        bar1.next()
        sys.stdout.flush()
    bar1.finish()

    header = ['Lemma', 'Heildartíðni'] + sorted(alltexttypes)

    if folder == "malheildir/RMH/":
        with open(f"uttak/{database}/RMH_textagerdir.csv", mode='w+') as outputfile:
            csvwriter = csv.writer(outputfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            csvwriter.writerow(header)
            for i in tempfinal:
                csvwriter.writerow(i)
        print(f"""
    ============================================================
    Úttaksskjalið RMH_textagerdir.freq er tilbúið og er að finna í
    undirmöppunni uttak/{database}/
    ============================================================
    """)
    elif folder == "malheildir/RMH/CC_BY/":
        with open(f'uttak/{database}/CC_BY_textagerdir.csv', mode='w+') as outputfile:
            csvwriter = csv.writer(outputfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            csvwriter.writerow(header)
            for i in tempfinal:
                csvwriter.writerow(i)
        print(f"""
    ============================================================
    Úttaksskjalið CC_BY_textagerdir.freq er tilbúið og er að finna í
    undirmöppunni uttak/{database}/
    ============================================================
    """)
    elif folder == "malheildir/RMH/MIM/":
        with open(f'uttak/{database}/MIM_textagerdir.csv', mode='w+') as outputfile:
            csvwriter = csv.writer(outputfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            csvwriter.writerow(header)
            for i in tempfinal:
                csvwriter.writerow(i)
        print(f"""
    ============================================================
    Úttaksskjalið MIM_textagerdir.freq er tilbúið og er að finna í
    undirmöppunni uttak/{database}/
    ============================================================
    """)
    else:
        namefolder = folder.split("/")[3]
        with open(f'uttak/{database}/' + namefolder + "_textagerdir.csv", mode='w+') as outputfile:
            csvwriter = csv.writer(outputfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            csvwriter.writerow(header)
            for i in tempfinal:
                csvwriter.writerow(i)
        print(f"""
    ============================================================
    Úttaksskjalið {namefolder}_textagerdir.freq er tilbúið og er að
    finna í undirmöppunni uttak/{database}/
    ============================================================
    """)
def migrate(callback):
    connection = op.get_bind()

    s = sa.select([n.c.node, n.c.path])
    nodes = connection.execute(s).fetchall()
    bar = IncrementalBar('Migrating node paths...', max=len(nodes))
    for node, path in nodes:
        account, sep, rest = path.partition('/')
        match = callback(account)
        if not match:
            bar.next()
            continue
        path = sep.join([match, rest])
        u = n.update().where(n.c.node == node).values({'path': path})
        connection.execute(u)
        bar.next()
    bar.finish()

    s = sa.select([v.c.muser]).distinct()
    musers = connection.execute(s).fetchall()
    bar = IncrementalBar('Migrating version modification users...', max=len(musers))
    for muser, in musers:
        match = callback(muser)
        if not match:
            bar.next()
            continue
        u = v.update().where(v.c.muser == muser).values({'muser': match})
        connection.execute(u)
        bar.next()
    bar.finish()

    s = sa.select([p.c.public_id, p.c.path])
    public = connection.execute(s).fetchall()
    bar = IncrementalBar('Migrating public paths...', max=len(public))
    for id, path in public:
        account, sep, rest = path.partition('/')
        match = callback(account)
        if not match:
            bar.next()
            continue
        path = sep.join([match, rest])
        u = p.update().where(p.c.public_id == id).values({'path': path})
        connection.execute(u)
        bar.next()
    bar.finish()

    s = sa.select([x.c.feature_id, x.c.path])
    xfeatures = connection.execute(s).fetchall()
    bar = IncrementalBar('Migrating permission paths...', max=len(xfeatures))
    for id, path in xfeatures:
        account, sep, rest = path.partition('/')
        match = callback(account)
        if not match:
            bar.next()
            continue
        path = sep.join([match, rest])
        u = x.update().where(x.c.feature_id == id).values({'path': path})
        connection.execute(u)
        bar.next()
    bar.finish()

    s = sa.select([xvals.c.feature_id, xvals.c.key, xvals.c.value])
    s = s.where(xvals.c.value != '*')
    xfeaturevals = connection.execute(s).fetchall()
    bar = IncrementalBar('Migrating permission holders...', max=len(xfeaturevals))
    for feature_id, key, value in xfeaturevals:
        account, sep, group = value.partition(':')
        match = callback(account)
        if not match:
            bar.next()
            continue
        new_value = sep.join([match, group])
        u = xvals.update()
        u = u.where(and_(
            xvals.c.feature_id == feature_id,
            xvals.c.key == key,
            xvals.c.value == value))
        u = u.values({'value': new_value})
        connection.execute(u)
        bar.next()
    bar.finish()

    s = sa.select([g.c.owner, g.c.name, g.c.member])
    groups = connection.execute(s).fetchall()
    bar = IncrementalBar('Migrating group owners & members...', max=len(groups))
    for owner, name, member in groups:
        owner_match = callback(owner)
        member_match = callback(member)
        if owner_match or member_match:
            u = g.update()
            u = u.where(and_(
                g.c.owner == owner,
                g.c.name == name,
                g.c.member == member))
            values = {}
            if owner_match:
                values['owner'] = owner_match
            if member_match:
                values['member'] = member_match
            u = u.values(values)
            connection.execute(u)
        bar.next()
    bar.finish()
def find_solutions(self, graph_setting_groups):
    results = {}
    # check for solutions for a specific set of interaction settings
    logging.info("Number of interaction settings groups being processed: "
                 + str(len(graph_setting_groups)))
    for strength, graph_setting_group in sorted(
            graph_setting_groups.items(), reverse=True):
        logging.info("processing interaction settings group with "
                     "strength " + str(strength))
        logging.info(str(len(graph_setting_group)) + " entries in this group")
        logging.info("running with " + str(self.number_of_threads) + " threads...")

        temp_results = []
        bar = IncrementalBar('Propagating quantum numbers...',
                             max=len(graph_setting_group))
        bar.update()
        if self.number_of_threads > 1:
            with Pool(self.number_of_threads) as p:
                for result in p.imap_unordered(
                        self.propagate_quantum_numbers, graph_setting_group, 1):
                    temp_results.append(result)
                    bar.next()
        else:
            for graph_setting_pair in graph_setting_group:
                temp_results.append(
                    self.propagate_quantum_numbers(graph_setting_pair))
                bar.next()
        bar.finish()
        logging.info('Finished!')
        if strength not in results:
            results[strength] = []
        results[strength].extend(temp_results)

    for k, v in results.items():
        logging.info("number of solutions for strength ("
                     + str(k) + ") after qn propagation: "
                     + str(sum([len(x[0]) for x in v])))

    # remove duplicate solutions, which only differ in the interaction qn S
    results = remove_duplicate_solutions(results, self.filter_remove_qns,
                                         self.filter_ignore_qns)

    node_non_satisfied_rules = []
    solutions = []
    for result in results.values():
        for (tempsolutions, non_satisfied_laws) in result:
            solutions.extend(tempsolutions)
            node_non_satisfied_rules.append(non_satisfied_laws)
    logging.info("total number of found solutions: " + str(len(solutions)))
    violated_laws = []
    if len(solutions) == 0:
        violated_laws = analyse_solution_failure(node_non_satisfied_rules)
        logging.info("violated rules: " + str(violated_laws))

    # finally perform combinatorics of identical external edges
    # (initial or final state edges) and prepare graphs for
    # amplitude generation
    match_external_edges(solutions)
    final_solutions = []
    for sol in solutions:
        final_solutions.extend(
            perform_external_edge_identical_particle_combinatorics(sol))

    return (final_solutions, violated_laws)
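# Note: an illustrative, standalone sketch of the pattern find_solutions() uses to
# drive an IncrementalBar from multiprocessing.Pool.imap_unordered; the work()
# function and inputs below are hypothetical.
from multiprocessing import Pool
from progress.bar import IncrementalBar


def work(x):
    return x * x


if __name__ == '__main__':
    inputs = list(range(100))
    bar = IncrementalBar('Propagating quantum numbers...', max=len(inputs))
    results = []
    with Pool(4) as p:
        # imap_unordered yields results as workers finish, so the bar advances
        # as soon as each task completes rather than only at the end.
        for result in p.imap_unordered(work, inputs, 1):
            results.append(result)
            bar.next()
    bar.finish()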
class ReportCompile(object): def __init__(self, job_name, template, **kwargs): self.job_name = job_name self.template = template self.no_artifacts = kwargs.get('no_artifacts', True) self.num_builds = int(kwargs.get('num_builds', composite['num_builds'])) self.minimum_build = int(kwargs.get('minimum_build', composite['min_build'])) self.exclude_builds = [int(xb) for xb in kwargs.get('exclude_builds', [])] try: self.work_dir = local(kwargs.get('work_dir', composite['work_dir'])) self.work_dir.ensure(dir=True) except KeyError: self.work_dir = local.mkdtemp() print('Writing composite report to {}'.format(self.work_dir.strpath)) self._progress = None self._queue = Queue() num_workers = 4 for __ in xrange(num_workers): worker = Thread(target=_queue_worker, args=(self,)) worker.daemon = True worker.start() @property def ssh_client(self): c = SSHClient() return c @staticmethod def _best_result(*results): # results should be a list of (result_id, result_value) tuples # result ranking, best to worst results_ranking = ('passed', 'xfailed', 'failed', 'xpassed', 'skipped', 'error') # Go through all the results, returning the best outcome based on results_ranking for result in results_ranking: for result_id, result_value in reversed(sorted(results, key=lambda r: r[0])): if result_value == result: return (result_id, result_value) @staticmethod def _streak(*results): sorted_results = sorted(results, key=lambda r: r[0]) # the value of the highest numbered (and therefore more recent) build latest_result = sorted_results[-1][1] streak = 0 for __, result_value in reversed(sorted_results): if result_value == latest_result: streak += 1 else: break return {'latest_result': latest_result, 'count': streak} def _progress_update(self, item, items_done): if self._progress is None: self._progress = Bar() self._progress.message = '%(index)d/%(max)d' self._progress.suffix = '' if item: items_done[item] = True self._progress.max = len(items_done) self._progress.index = len(filter(None, items_done.values())) with lock: try: self._progress.update() except ZeroDivisionError: pass def _progress_finish(self): self._progress.finish() self._progress = None def compile(self): return self.composite_report() def build_numbers(self): api = trackerbot.api() builds = trackerbot.depaginate(api, api.build.get(job_name=self.job_name, template=self.template) ) build_numbers = [] # XXX relying on trackerbot giving us the most recent builds first, should be explicit for build in builds.get('objects', []): if (build['number'] not in self.exclude_builds and build['number'] >= self.minimum_build): build_numbers.append(build['number']) if self.num_builds and len(build_numbers) == self.num_builds: break if build_numbers: print('Pulling reports from builds {}'.format( ', '.join([str(n) for n in build_numbers]))) return build_numbers def template_log_dirs(self): log_dir_tpl = composite['log_dir_tpl'] log_dirs = [] for build_number in self.build_numbers(): log_dirs.append((build_number, log_dir_tpl.format(self.job_name, build_number))) return log_dirs def test_reports(self): print('Collecting test reports to determine best build nodes') log_dirs = self.template_log_dirs() reports = {} c = self.ssh_client jenkins_host = composite['jenkins_host'] c.connect(jenkins_host, username=credentials['jenkins-result']['username'], password=credentials['jenkins-result']['password'], timeout=10, allow_agent=False, look_for_keys=False, gss_auth=False) builds_done = {} self._progress_update(None, builds_done) for build_number, log_dir in log_dirs: build_work_dir 
= local(self.work_dir.join(str(build_number))) build_work_dir.ensure(dir=True) _remote = local(log_dir).join('test-report.json').strpath _local = build_work_dir.join('test-report.json').strpath builds_done[build_number] = False self._progress_update(None, builds_done) self._queue.put((_remote, _local, build_number, builds_done)) self._queue.join() self._progress_finish() for build_number, __ in log_dirs: build_work_dir = local(self.work_dir.join(str(build_number))) for path in build_work_dir.visit('*/test-report.json'): try: report = json.load(path.open()) reports[build_number] = report except: # invalid json, skip this report pass return reports def composite_status(self, reports=None): jenkins_host = composite['jenkins_host'] reports = reports or self.test_reports() results = {} # results dict structure: # { # nodeid: { # 'build_results': {build_id_1: build_id_1_result, build_id_2: ...} # 'best_result': (best_build_id, best_build_result) # 'result_url': http://jenkins/path/to/build # 'streak': (latest_build_result, number_of_results_in_a_row) # }, # nodeid: { # ... # } # } for build_number, report in reports: for nodeid, nodedata in report.get('tests', {}).items(): try: # Try to pull the build statuses, skip the node if we can't node_results_temp = nodedata['statuses']['overall'] node_results = results.setdefault(nodeid, {'build_results': {}}) node_results['build_results'][build_number] = node_results_temp except KeyError: continue for nodeid, nodedata in results.items(): node_results = nodedata['build_results'].items() nodedata['best_result'] = self._best_result(*node_results) nodedata['result_url'] = 'https://{}/job/{}/{}/'.format( jenkins_host, self.job_name, nodedata['best_result'][0] ) nodedata['streak'] = self._streak(*node_results) test_counts[nodedata['best_result'][1]] += 1 return results def composite_report(self): reports = self.test_reports() composite_status = self.composite_status(reports.iteritems()) composite_report = { 'test_counts': test_counts, 'tests': OrderedDict() } print('Collecting artifacts from best build nodes') # tracking dict for file pull progress remotes_done = {} self._progress_update(None, remotes_done) for nodeid, nodedata in sorted(composite_status.items(), key=lambda s: s[1]['streak']['count'], reverse=True): best_build_number = nodedata['best_result'][0] best_build_test = reports[best_build_number]['tests'][nodeid] composite_report['tests'][nodeid] = best_build_test composite_report['tests'][nodeid]['composite'] = nodedata reports[best_build_number]['tests'][nodeid]['files'] = [] # wait for all the files to arrive before building the report self._queue.join() self._progress_finish() json.dump(composite_report, self.work_dir.join('composite-report.json').open('w'), indent=1) try: passing_percent = (100. * (test_counts['passed'] + test_counts['skipped'] + test_counts['xfailed'])) / sum(test_counts.values()) print('Passing percent:', passing_percent) # XXX: Terrible artifactor spoofing happens here. 
print('Running artifactor reports') r = reporter.ReporterBase() reports_done = {'composite': False, 'provider': False} self._progress_update(None, reports_done) r._run_report(composite_report['tests'], self.work_dir.strpath) self._progress_update('composite', reports_done) r._run_provider_report(composite_report['tests'], self.work_dir.strpath) self._progress_update('provider', reports_done) self._progress_finish() except ZeroDivisionError: print('No tests collected from test reports (?!)') return composite_report def _translate_artifacts_path(self, artifact_path, build_number): preamble = composite['preamble'].format(self.job_name) replacement = composite['replacement'].format(self.job_name, build_number) artifact_remote = artifact_path.replace(preamble, replacement) artifact_local = self.work_dir.join(str(build_number), artifact_path[len(preamble):]) try: assert artifact_remote.startswith(composite['remote_sw']) assert artifact_local.strpath.startswith(self.work_dir.strpath) except AssertionError: print('wat?') print('path', artifact_path) print('remote', artifact_remote) print('local', artifact_local.strpath) return artifact_remote, artifact_local.strpath
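# A minimal, self-contained sketch of the result-ranking idea used by ReportCompile._best_result
# above: given (build_id, result) pairs, return the pair whose result ranks best in a fixed
# ordering, preferring the most recent build on ties. The function and sample data below are
# illustrative, not part of the original module.
def best_result(results, ranking=('passed', 'xfailed', 'failed', 'xpassed', 'skipped', 'error')):
    for wanted in ranking:
        # Highest build number first, so the newest build wins among equal results.
        for build_id, result in sorted(results, key=lambda r: r[0], reverse=True):
            if result == wanted:
                return build_id, result

print(best_result([(50, 'failed'), (52, 'passed'), (51, 'passed')]))  # -> (52, 'passed')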
class Converter:
    def __init__(self, color_count_method=None):
        if color_count_method:
            self.color_count_method = color_count_method
        else:
            self.color_count_method = self.color_count_all
        self.__img = None
        self.__output_img = None
        self.__progress = 0
        self.__progress_bar = None

    def set_image(self, img: np.ndarray):
        self.__img = img.copy()

    def quadify_image(self, max_colors):
        self.__progress_bar = IncrementalBar('Render Progress', suffix='%(percent)d%%')
        self.__output_img = self.__img.copy()
        width, height, *_ = self.__output_img.shape
        self._quad(0, 0, width, height, max_colors, 0)
        self.__progress_bar.finish()
        return self.__output_img

    def _update_progress(self, new_progress):
        temp = self.__progress
        self.__progress += new_progress
        self.__progress_bar.goto(self.__progress * 100)
        if temp // 0.05 < self.__progress // 0.05:
            print('#', end='')

    def _quad(self, x, y, nx, ny, max_colors, depth):
        width = nx - x
        height = ny - y
        num_of_colors = self.color_count_method(self.__output_img, x, y, nx, ny, max_colors)
        if num_of_colors <= max_colors:
            # pixel_to_color_ratio = width * height / num_of_colors
            self.__output_img[x:nx, y:ny, :] = np.mean(self.__output_img[x:nx, y:ny, :], axis=(0, 1))  # * pixel_to_color_ratio
            self._update_progress(0.25 ** depth)
        else:
            mx, my = width // 2 + x, height // 2 + y
            self._quad(x, y, mx, my, max_colors, depth + 1)
            self._quad(mx, y, nx, my, max_colors, depth + 1)
            self._quad(x, my, mx, ny, max_colors, depth + 1)
            self._quad(mx, my, nx, ny, max_colors, depth + 1)

    @staticmethod
    def color_count_all(img, x, y, nx, ny, max_colors):
        colors = set()
        for i, j in iterate_cartesian(range(x, nx), range(y, ny)):
            colors.add(str(img[i, j, :]))
            if len(colors) > max_colors:
                return len(colors)
        return len(colors)

    @staticmethod
    def color_count_differing(img, x, y, nx, ny, max_colors):
        colors = []
        for i, j in iterate_in_steps(x, y, nx, ny, step=(ny - y) // 8):
            pixel_color = img[i, j, :]
            for color in colors:
                if ((color - pixel_color) ** 2).sum() < 256:
                    break
            else:
                colors.append(pixel_color)
            if len(colors) > max_colors:
                return len(colors)
        return len(colors)
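# A minimal sketch of the progress-accounting scheme behind Converter._quad: every region either
# terminates (contributing 0.25 ** depth of the total work) or splits into four children, so the
# contributions always sum to 1.0 and the bar can jump straight to the accumulated fraction with
# goto(). The image and the "few enough colours" test are stand-ins, not the original logic.
import numpy as np
from progress.bar import IncrementalBar

def quadify_progress(img, bar):
    done = 0.0

    def quad(x, y, nx, ny, depth):
        nonlocal done
        region = img[x:nx, y:ny]
        if region.std() < 10 or min(nx - x, ny - y) <= 1:  # stand-in for the colour-count test
            done += 0.25 ** depth
            bar.goto(int(done * 100))
            return
        mx, my = (x + nx) // 2, (y + ny) // 2
        quad(x, y, mx, my, depth + 1)
        quad(mx, y, nx, my, depth + 1)
        quad(x, my, mx, ny, depth + 1)
        quad(mx, my, nx, ny, depth + 1)

    quad(0, 0, img.shape[0], img.shape[1], 0)

bar = IncrementalBar('Render Progress', max=100, suffix='%(percent)d%%')
quadify_progress(np.random.randint(0, 255, (64, 64)), bar)
bar.finish()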
class World: class WorldMetric: def __init__(self): self.lost_demand = [] self.average_deviation_ideal_state = [] self.deficient_battery = [] self.time = [] def add_analysis_metrics(self, world): """ Add data to analysis :param world: world object to record state from """ self.lost_demand.append( sum([1 for reward in world.rewards if reward == LOST_TRIP_REWARD]) if len(world.rewards) > 0 else 0 ) self.average_deviation_ideal_state.append( sum( [ abs( (sum([1 for _ in cluster.get_available_scooters()])) - cluster.ideal_state ) for cluster in world.state.clusters ] ) / len(world.state.clusters) ) self.deficient_battery.append( sum( [ cluster.ideal_state * 100 - ( sum( [ scooter.battery for scooter in cluster.get_available_scooters() ] ) ) for cluster in world.state.clusters if len(cluster.scooters) < cluster.ideal_state ] ) ) self.time.append(world.time) def get_lost_demand(self): """ Returns list of all lost demand """ return self.lost_demand def get_deviation_ideal_state(self): """ Returns list of average deviation from ideal state during the time analysed """ return self.average_deviation_ideal_state def get_deficient_battery(self): """ Returns list of total deficient battery in the system during the analysed time """ return self.deficient_battery def get_time_array(self): """ Returns a list of all timestamps when when data used for analysis is recorded """ return self.time def get_all_metrics(self): """ Returns all metrics recorded for analysis """ return ( self.lost_demand, self.average_deviation_ideal_state, self.deficient_battery, ) def __init__( self, shift_duration: int, sample_size=100, number_of_clusters=20, initial_state=None, policy="RandomRolloutPolicy", initial_location_depot=True, verbose=False, ): self.shift_duration = shift_duration if initial_state: self.state = initial_state else: self.state = clustering_scripts.get_initial_state( sample_size=sample_size, number_of_clusters=number_of_clusters, initial_location_depot=initial_location_depot, ) self.stack = [] self.time = 0 self.rewards = [] self.cluster_flow = { (start, end): 0 for start in np.arange(len(self.state.clusters)) for end in np.arange(len(self.state.clusters)) if start != end } self.policy = get_policy(policy) self.metrics = World.WorldMetric() self.verbose = verbose if verbose: self.progress_bar = IncrementalBar( "Running World", check_tty=False, max=round(shift_duration / ITERATION_LENGTH_MINUTES) + 1, color=WHITE, suffix="%(percent)d%% - ETA %(eta)ds", ) def run(self): while self.time < self.shift_duration: event = self.stack.pop(0) event.perform(self) if isinstance(event, classes.GenerateScooterTrips) and self.verbose: self.progress_bar.next() if self.verbose: self.progress_bar.finish() def get_remaining_time(self) -> int: """ Computes the remaining time by taking the difference between the shift duration and the current time of the world object. :return: the remaining time as a float """ return self.shift_duration - self.time def add_reward(self, reward: float, discount=False) -> None: """ Adds the input reward to the rewards list of the world object :param discount: boolean if the reward is to be discounted :param reward: reward given """ self.rewards.append(reward * self.get_discount() if discount else reward) def get_total_reward(self) -> float: """ Get total accumulated reward at current point of time :return: """ return sum(self.rewards) def add_event(self, event) -> None: """ Adds event to the sorted stack. 
Avoids calling sort on every iteration by using the bisect package :param event: event to insert """ insert_index = bisect.bisect([event.time for event in self.stack], event.time) self.stack.insert(insert_index, event) def add_trip_to_flow(self, start: int, end: int) -> None: """ Adds a trip from start to end for cluster flow :param start: departure cluster :param end: arrival cluster """ self.cluster_flow[(start, end)] += 1 def get_cluster_flow(self) -> [(int, int, int)]: """ Get all flows between cluster since last vehicle arrival :return: list: tuple (start, end, flow) flow from departure cluster to arrival cluster """ return [(start, end, flow) for (start, end), flow in self.cluster_flow.items()] def clear_flow_dict(self) -> None: """ Clears the cluster flow dict """ for key in self.cluster_flow.keys(): self.cluster_flow[key] = 0 def get_scooters_on_trip(self) -> [(int, int, int)]: """ Get all scooters that are currently out on a trip :return: list of all scooters that are out on a trip """ return [ (event.departure_cluster_id, event.arrival_cluster_id, event.scooter.id) for event in self.stack if isinstance(event, classes.ScooterArrival) ] def get_discount(self): # Divide by 60 as there is 60 minutes in an hour. We want this number in hours to avoid big numbers is the power return DISCOUNT_RATE ** (self.time / 60)
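# A minimal sketch of the sorted-insert trick used by World.add_event: rather than re-sorting the
# whole event stack on every insert, bisect locates the insertion point against the already-sorted
# event times. The Event class here is illustrative only; on Python 3.10+ the same effect can be
# had with bisect.insort(stack, event, key=lambda e: e.time).
import bisect

class Event:
    def __init__(self, time):
        self.time = time

stack = []
for t in (30, 10, 20, 15):
    insert_index = bisect.bisect([e.time for e in stack], t)
    stack.insert(insert_index, Event(t))

print([e.time for e in stack])  # -> [10, 15, 20, 30]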
def downloadImage(image, iteration, outage=False, oNr=0, oTime=0): doc = open( './measurements/%s/%s/results/setup.txt' % (currentInstance, currentTest), 'w+') doc.write( 'Server:%s\nHosts:%s\nSeeders:%s\nImage:%s\nServer outage:%s\nOutage number:%s\nOutage start:%s' % (str(len(set.servers)), str(len(set.name)), str(len( set.seeder)), image, outage, oNr, oTime)) doc.close() image = image.strip() for node in set.name: subprocess.call(['docker exec mn.%s sh -c "rm -rf times/*"' % (node)], stdout=FNULL, stderr=subprocess.STDOUT, shell=True) subprocess.call(['docker exec mn.%s sh -c "mkdir times/"' % (node)], stdout=FNULL, stderr=subprocess.STDOUT, shell=True) for i in range(int(iteration)): print('\n###\nTest #%s\n###' % (i + 1)) print datetime.now() image = image.strip() #prepare downloads subprocess.call([ 'mkdir measurements/%s/%s/%s/' % (currentInstance, currentTest, i) ], stdout=FNULL, stderr=subprocess.STDOUT, shell=True) subprocess.call([ 'mkdir measurements/%s/%s/%s/time/' % (currentInstance, currentTest, i) ], stdout=FNULL, stderr=subprocess.STDOUT, shell=True) subprocess.call([ 'mkdir measurements/%s/%s/%s/traffic/' % (currentInstance, currentTest, i) ], stdout=FNULL, stderr=subprocess.STDOUT, shell=True) #deleting and restarting deleted = [False] * len(set.name) restarted = [False] * len(set.name) sum = 0 print 'Deleting images and restarting container' bar_restart = IncrementalBar('Finished cleanup(s)', max=len(set.name)) for node in set.name: subprocess.call([ 'docker exec -it mn.%s docker image rm -f %s' % (node, image) ], stdout=FNULL, stderr=subprocess.STDOUT, shell=True) if node in set.servers: subprocess.call([ 'docker exec -it mn.%s sh -c "(docker stop dfclient supernode && docker rm dfclient supernode)"&' % (node) ], stdout=FNULL, stderr=subprocess.STDOUT, shell=True) else: subprocess.call([ 'docker exec -it mn.%s sh -c "(docker stop dfclient && docker rm dfclient)"&' % (node) ], stdout=FNULL, stderr=subprocess.STDOUT, shell=True) subprocess.call( ['docker exec -it mn.%s sh -c "iptables -Z"' % (node)], stdout=FNULL, stderr=subprocess.STDOUT, shell=True) subprocess.call([ "docker exec mn.%s sh -c 'rm -f root/.small-dragonfly/logs/dfdaemon.log'" % node ], stdout=FNULL, stderr=subprocess.STDOUT, shell=True) #root/.small-dragonfly/logs/* subprocess.call([ "docker exec mn.%s sh -c 'rm -f root/.small-dragonfly/logs/dfclient.log'" % node ], stdout=FNULL, stderr=subprocess.STDOUT, shell=True) #root/.small-dragonfly/logs/* subprocess.call([ "docker exec mn.%s sh -c 'rm -f root/.small-dragonfly/logs/dfserver.log'" % node ], stdout=FNULL, stderr=subprocess.STDOUT, shell=True) #root/.small-dragonfly/logs/* subprocess.call([ "docker exec mn.%s sh -c 'rm -rf root/.small-dragonfly/data/*'" % node ], stdout=FNULL, stderr=subprocess.STDOUT, shell=True) #root/.small-dragonfly/logs/* subprocess.call([ "docker exec mn.%s sh -c 'rm -rf root/.small-dragonfly/meta/*'" % node ], stdout=FNULL, stderr=subprocess.STDOUT, shell=True) #root/.small-dragonfly/logs/* subprocess.call([ "docker exec mn.%s sh -c 'rm -rf root/.small-dragonfly/dfdaemon/data/*'" % node ], stdout=FNULL, stderr=subprocess.STDOUT, shell=True) #root/.small-dragonfly/logs/* while sum < len(set.name): for node in set.name: if 'localhost:16000/%s' % image in subprocess.check_output( ['docker exec mn.%s docker image ls' % node], shell=True): subprocess.call([ 'docker exec mn.%s docker image rm -f %s' % (node, image) ], stdout=FNULL, stderr=subprocess.STDOUT, shell=True) if deleted[set.name.index(node)] == False: #delete if node in 
set.servers: if not ('dfclient' and 'supernode') in subprocess.check_output( ['docker exec mn.%s docker ps' % node], shell=True): deleted[set.name.index(node)] = True else: if not ( 'docker rm' or 'docker stop' ) in subprocess.check_output( ['docker exec mn.%s sh -c "ps -a"' % node], shell=True): subprocess.call([ 'docker exec mn.%s sh -c "(docker stop dfclient supernode && docker rm dfclient supernode )"&' % (node) ], stdout=FNULL, stderr=subprocess.STDOUT, shell=True) else: if not 'dfclient' in subprocess.check_output( ['docker exec mn.%s docker ps' % node], shell=True): deleted[set.name.index(node)] = True else: if not ( 'docker rm' or 'docker stop' ) in subprocess.check_output( ['docker exec mn.%s sh -c "ps -a"' % node], shell=True): subprocess.call([ 'docker exec mn.%s sh -c "(docker stop dfclient && docker rm dfclient)"&' % node ], stdout=FNULL, stderr=subprocess.STDOUT, shell=True) else: if restarted[set.name.index(node)] == False: if node in set.servers: if ('dfclient' and 'supernode') in subprocess.check_output( ['docker exec mn.%s docker ps' % node], shell=True): sum = sum + 1 restarted[set.name.index(node)] = True bar_restart.next() else: if not ('compose') in subprocess.check_output( ['docker exec mn.%s sh -c "ps -a"' % node], shell=True): subprocess.call([ 'docker exec mn.%s sh -c "(export IP=%s && docker-compose -f stack_server.yml up -d)"&' % (node, set.ip[set.name.index(node)]) ], stdout=FNULL, stderr=subprocess.STDOUT, shell=True) #print node else: if 'dfclient' in subprocess.check_output( ['docker exec mn.%s docker ps' % node], shell=True): sum = sum + 1 restarted[set.name.index(node)] = True bar_restart.next() else: if not ('compose') in subprocess.check_output( ['docker exec mn.%s sh -c "ps -a"' % node], shell=True): subprocess.call([ 'docker exec mn.%s sh -c "(export IP=%s && docker-compose -f stack_client.yml up -d)"&' % (node, set.ip[set.name.index(node)]) ], stdout=FNULL, stderr=subprocess.STDOUT, shell=True) time.sleep(5) print '' check.check() while check.repeat == True: check.check() bar_restart.finish() #prepare seeder print('Preparing seeder(s)') for node in set.seeder: subprocess.call( ['docker exec mn.%s docker pull %s' % (node, image)], stdout=FNULL, stderr=subprocess.STDOUT, shell=True) subprocess.call( ['docker exec -it mn.%s sh -c "iptables -Z"' % (node)], stdout=FNULL, stderr=subprocess.STDOUT, shell=True) #start download sum = 0 complete = [False] * len(set.name) print('Starting download(s)') iStart = datetime.now() print iStart bar_download = IncrementalBar('Waiting for download(s)', max=len(set.name)) for node in set.name: if not node in set.seeder: subprocess.call([ 'docker exec mn.%s sh -c "(date +"%%Y-%%m-%%dT%%T.%%6N" > times/%s_%s_start.txt && docker pull %s && date +"%%Y-%%m-%%dT%%T.%%6N" > times/%s_%s_end.txt)"&' % (node, node, i, image, node, i) ], stdout=FNULL, stderr=subprocess.STDOUT, shell=True) else: complete[set.name.index(node)] = True bar_download.next() sum = sum + 1 iPrev = datetime.now() #server outage if outage == True: print('\nWaiting %s seconds for outage...' 
% oTime) time.sleep(int(oTime)) for j in range(1, int(oNr) + 1): print set.servers[j] subprocess.call([ 'docker exec mn.%s docker stop supernode &' % (set.servers[-j]) ], stdout=FNULL, stderr=subprocess.STDOUT, shell=True) #check download while sum < len(set.name): for node in set.name: if complete[set.name.index(node)] == False: if image in subprocess.check_output( ['docker exec mn.%s docker image ls' % node], shell=True): #print ('Docker pull successful for mn.%s' % node) #remove first comment for info on successful pull sum = sum + 1 complete[set.name.index(node)] = True bar_download.next() else: if not image in subprocess.check_output( ['docker exec mn.%s sh -c "ps -a"' % node], shell=True): subprocess.call([ 'docker exec mn.%s sh -c "(docker pull %s && date +"%%Y-%%m-%%dT%%T.%%6N" > times/%s_%s_end.txt)"&' % (node, image, node, i) ], stdout=FNULL, stderr=subprocess.STDOUT, shell=True) #print ('Docker pull restarted for mn.%s' % node) #remove first comment for info on failed pull time.sleep(1) bar_download.finish() print 'Download(s) successful' print 'Grabbing data after download(s)' for node in set.name: subprocess.call([ 'docker cp mn.%s:times/%s_%s_start.txt measurements/%s/%s/%s/time/%s_start.txt' % (node, node, i, currentInstance, currentTest, i, node) ], stdout=FNULL, stderr=subprocess.STDOUT, shell=True) subprocess.call([ 'docker cp mn.%s:times/%s_%s_end.txt measurements/%s/%s/%s/time/%s_end.txt' % (node, node, i, currentInstance, currentTest, i, node) ], stdout=FNULL, stderr=subprocess.STDOUT, shell=True) subprocess.call([ "docker exec mn.%s sh -c 'iptables -L INPUT -n -v -x > tmp_IN.txt'" % node ], stdout=FNULL, stderr=subprocess.STDOUT, shell=True) subprocess.call([ 'docker cp mn.%s:tmp_IN.txt measurements/%s/%s/%s/traffic/%s_IN.txt' % (node, currentInstance, currentTest, i, node) ], stdout=FNULL, stderr=subprocess.STDOUT, shell=True) subprocess.call([ "docker exec mn.%s sh -c 'iptables -L OUTPUT -n -v -x > tmp_OUT.txt'" % node ], stdout=FNULL, stderr=subprocess.STDOUT, shell=True) subprocess.call([ 'docker cp mn.%s:tmp_OUT.txt measurements/%s/%s/%s/traffic/%s_OUT.txt' % (node, currentInstance, currentTest, i, node) ], stdout=FNULL, stderr=subprocess.STDOUT, shell=True) set.measureTime(False, currentInstance, currentTest, iteration) set.measureTraffic(False, currentInstance, currentTest, iteration)
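# A minimal sketch of the polling pattern used throughout downloadImage: a fixed set of nodes is
# polled in a loop, and the IncrementalBar advances only when a node flips to "complete", so the
# bar counts finished nodes rather than poll iterations. check_node() is a hypothetical stand-in
# for the docker/subprocess checks above.
import random
import time
from progress.bar import IncrementalBar

def check_node(node):
    return random.random() < 0.3  # pretend the node finished with 30% probability

nodes = ['h1', 'h2', 'h3', 'h4']
complete = {node: False for node in nodes}
bar_download = IncrementalBar('Waiting for download(s)', max=len(nodes))
while not all(complete.values()):
    for node in nodes:
        if not complete[node] and check_node(node):
            complete[node] = True
            bar_download.next()
    time.sleep(0.1)
bar_download.finish()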
class SysExParser(object): def __init__(self,send_func,debug=False): super(SysExParser,self).__init__() self.send_func = send_func self.debug = debug self.dump_file = None self.dump_on = False self.dump_ram = False self.printer = MessagePrinter(debug=self.debug) self.handlers = { # FILE FUNCTIONS FILE_F "F_DHDR": self.handleFileDumpHeader, "F_DPKT": self.handleFileDumpDataBlock, "DIR_HDR": self.handleFileDumpHeader, "F_WAIT" : noop, "F_CANCEL" : cancel, "F_ERR" : cancel, # DEVICE COMMAND DEVICE_CMD "STAT_ANSWER": self.handleStatusAnswer, "DATA_HEADER": self.handleDirectoryAnswer, "DATA_DUMP" : self.handleDataDump, "DIR_ANSWER" : self.handleDirectoryAnswer, "D_WAIT" : noop, "D_ACK" : noop, "D_CANCEL" : cancel, "D_ERR" : cancel, } self.dump_start = [ "F_DREQ", "DATA_REQUEST" ] self.dump_stop = [ "F_CANCEL", "D_CANCEL"] def __del__(self): self.closeDumpFile() def createDumpFile(self,filename=None): if not filename: timestamp = time.strftime("%Y%m%d%H%M%S") filename="dump_%s.bin" % mktimestamp() self.dump_file = open(filename,"wb") def closeDumpFile(self): if not self.dump_file: return self.dump_file.close() self.dump_file = None def startDump(self,filename,size): if not self.dump_on: return self.dump_written = 0 self.dump_size = size self.closeDumpFile() self.createDumpFile(filename) print "Dumping '%s'" % filename showsize = ' 0x%(index)06x' if self.dump_ram else '' self.bar = IncrementalBar( max=size, suffix = '%(percent)d%% [%(elapsed_td)s / %(eta_td)s]' + showsize) def stopDump(self): if not self.dump_on: return self.bar.finish() self.closeDumpFile() self.dump_on = False def dump(self,data,filename=None): if not self.dump_on: return if not self.dump_file: self.createDumpFile() if self.dump_written == self.dump_size: print "Discarding", len(data), "bytes, dump has ended" elif len(data) + self.dump_written > self.dump_size: discard = len(data) + self.dump_written - self.dump_size self.dump_file.write(bytearray(data[:-discard])) self.bar.next(self.dump_size-self.dump_written) self.dump_written = self.dump_size self.bar.finish() leftover = data[-discard:] for i in leftover: if i != 0: print "Discarding non-NUL data:", hexdump(leftover) break else: self.dump_file.write(bytearray(data)) self.dump_written += len(data) self.bar.next(len(data)) # FILE FUNCTIONS FILE_F def handleFileDumpHeader(self,msg,timestamp): self.sendSysEx( MSCEIMessage(fromName="F_WAIT"),timestamp=timestamp+1) offset=17 data = [] for i in xrange(2): data += conv7_8(msg[offset:offset+8]) offset += 8 location = '' while msg[offset] != 0: location += chr(msg[offset]) offset += 1 offset+=1 cc = msg[offset] cc_calc = checksum(msg[1:offset]) if cc == cc_calc: filename = str(bytearray(msg[5:16])).strip() length = struct.unpack('>I',list2str(data[4:8]))[0] self.startDump(filename,length) self.dump(data[8:]) self.sendSysEx( MSCEIMessage(fromName="F_ACK"), timestamp=timestamp+2) else: self.sendSysEx( MSCEIMessage(fromName="F_NACK"), timestamp=timestamp+2) return True def handleFileDumpDataBlock(self,msg,timestamp): self.sendSysEx( MSCEIMessage(fromName="F_WAIT"),timestamp=timestamp+1) noctets = msg[5] offset=6 data = [] for i in xrange(noctets): data += conv7_8(msg[offset:offset+8]) offset += 8 cc = msg[offset] cc_calc = checksum(msg[1:offset]) if cc == cc_calc: self.dump(data) self.sendSysEx( MSCEIMessage(fromName="F_ACK"), timestamp=timestamp+2) else: self.sendSysEx( MSCEIMessage(fromName="F_NACK"), timestamp=timestamp+2) return True # DEVICE COMMAND DEVICE_CMD def handleStatusAnswer(self,msg,timestamp): self.sendSysEx( 
MSCEIMessage(fromName="D_WAIT"),timestamp=timestamp+1) offset= 5 + 3*8 cc = msg[offset] cc_calc = checksum(msg[1:offset]) if cc == cc_calc: self.sendSysEx( MSCEIMessage(fromName="D_ACK"), timestamp=timestamp+2) if self.dump_ram: self.dump_on = True self.startDump("ramdump_%s.bin" % mktimestamp(), 2097060) time.sleep(0.1) self.sendSysEx( MSCEIMessage(fromName="F_ACK"), timestamp=timestamp+3) return True else: self.sendSysEx( MSCEIMessage(fromName="D_NACK"), timestamp=timestamp+2) return False def handleDataDump(self,msg,timestamp): self.sendSysEx( MSCEIMessage(fromName="D_WAIT")) noctets = msg[5] offset=6 data = [] for i in xrange(noctets): data += conv7_8(msg[offset:offset+8]) offset += 8 cc = msg[offset] cc_calc = checksum(msg[1:offset]) if cc == cc_calc: self.dump(data) self.sendSysEx( MSCEIMessage(fromName="D_ACK"), timestamp=timestamp+2) else: self.sendSysEx( MSCEIMessage(fromName="D_NACK"), timestamp=timestamp+2) return True def handleDirectoryAnswer(self,msg,timestamp): #time.sleep(0.1) self.sendSysEx( MSCEIMessage(fromName="D_WAIT"),timestamp=timestamp+1) offset = 8 + 11 + 1 data = [] for i in xrange(2): data += conv7_8(msg[offset:offset+8]) offset += 8 offset += 11 cc = msg[offset] cc_calc = checksum(msg[1:offset]) if cc == cc_calc: filename = str(bytearray(msg[8:19])).strip() length = struct.unpack('>I',list2str(data[4:8]))[0] self.startDump(filename,length) #time.sleep(0.1) self.sendSysEx( MSCEIMessage(fromName="D_ACK"), timestamp=timestamp+2) else: self.sendSysEx( MSCEIMessage(fromName="D_NACK"), timestamp=timestamp+2) return True def parse(self, msg, timestamp, acceptUnhandled=True): if msg[0] != 0xF0: print 'Non-sysex message' print [ hex(b) for b in msg ] print return acceptUnhandled
def render_all(self, cat_ids=None, verbose=True, blender_path='blender'): import subprocess from progress.bar import IncrementalBar import tempfile from .path import renderings_format from ..objs import try_extract_models for cat_id in cat_ids: try_extract_models(cat_id) _FNULL = open(os.devnull, 'w') call_kwargs = dict() if not verbose: call_kwargs['stdout'] = _FNULL call_kwargs['stderr'] = subprocess.STDOUT root_dir = os.path.realpath(os.path.dirname(__file__)) script_path = os.path.join(root_dir, 'scripts', 'blender_render.py') render_params_path = None camera_positions_path = None def clean_up(): for path in (render_params_path, camera_positions_path): if path is not None and os.path.isfile(path): os.remove(path) render_params_fp, render_params_path = tempfile.mkstemp(suffix='.json') try: view_params = self.get_view_params() view_params.update(**self.get_image_params()) os.write(render_params_fp, json.dumps(view_params)) os.close(render_params_fp) args = [ blender_path, '--background', '--python', script_path, '--', '--render_params', render_params_path ] keys = tuple(self.needs_rendering_keys(cat_ids)) n = len(keys) if n == 0: print('No keys to render.') return print('Rendering %d examples' % n) bar = IncrementalBar(max=n) for cat_id, example_id in keys: bar.next() camera_positions_fp, camera_positions_path = tempfile.mkstemp( suffix='.npy') os.close(camera_positions_fp) np.save( camera_positions_path, self.view_manager.get_camera_positions(cat_id, example_id)) out_dir = self.get_renderings_dir(cat_id, example_id) proc = subprocess.Popen( args + [ '--obj', self.get_obj_path(cat_id, example_id), '--out_dir', out_dir, '--filename_format', renderings_format, '--camera_positions', camera_positions_path, ], **call_kwargs) try: proc.wait() except KeyboardInterrupt: proc.kill() raise if os.path.isfile(camera_positions_path): os.remove(camera_positions_path) bar.finish() except (Exception, KeyboardInterrupt): clean_up() raise clean_up()
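# A minimal sketch of the per-item subprocess pattern in render_all: one external process per key,
# output optionally silenced, KeyboardInterrupt forwarded by killing the child, and the bar
# advancing once per key. The command is a harmless placeholder, not the original blender call.
import subprocess
import sys
from progress.bar import IncrementalBar

def run_all(keys, verbose=False):
    call_kwargs = {}
    if not verbose:
        call_kwargs['stdout'] = subprocess.DEVNULL
        call_kwargs['stderr'] = subprocess.STDOUT
    bar = IncrementalBar('Rendering', max=len(keys))
    for key in keys:
        proc = subprocess.Popen([sys.executable, '-c', 'pass'], **call_kwargs)
        try:
            proc.wait()
        except KeyboardInterrupt:
            proc.kill()
            raise
        bar.next()
    bar.finish()

run_all(['a', 'b', 'c'])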
def lemmas_collocations(database, IGC_folder, prop_names): dci = SQLDatabase(db_name='gagnagrunnar/nmo.db') dim = SQLDatabase(db_name='gagnagrunnar/bin_lemmur_ordmyndir.db') filters = SQLDatabase(db_name='gagnagrunnar/IGC_filters.db' ) # Predefined stop-word list based on the RMH pos_to_ignore = [ 'e', 'c', 'v', 'as', 'to', 'tp', 'ta', 'au' ] # The POS tags that should not be displayed in the results outdict = {} print(""" ============================================================ Les skjöl úr málheildinni. ============================================================ """) xml_files = glob.glob(IGC_folder + '/**/*.xml', recursive=True) filebar = IncrementalBar('Framvinda', max=len(xml_files)) for file in xml_files: colloc = [] with open(file, 'r', encoding='utf-8') as content: try: tree = ET.parse(content) for word in tree.iter(): if word.text is not None: if word.attrib.get('lemma') is not None: pos = word.attrib.get('type') lemma = word.attrib.get('lemma') word_form = word.text colloc.append((word_form, lemma, pos)) elif word.text in punctuation: colloc.append((word.text, ' ', ' ')) for i, w in enumerate(colloc): if prop_names == False: if w[2].startswith('n') and w[2].endswith( 's'): # Ignore proper names continue if w[2] in pos_to_ignore: continue if w[1][-1] == '-': # if a word starts or ends in an hyphen, ignore it (likely OCR error) continue if w[1][0] == '-': continue if ( not all(i.isalpha() or i == '-' for i in w[1]) ): # if a word contains anything but an alphabetic letter or hyphen, ignore it continue filter_query = SQLiteQuery(w[1], 'filter', 'FILTER_WORD_FORMS', cursor=filters.cursor) if filter_query.exists: continue else: if database == 'NMO': query = SQLiteQuery(w[1], 'lemma', 'DCI_ELEMENT', cursor=dci.cursor ) # Capitalized words included query_lower = SQLiteQuery(w[1].lower(), 'lemma', 'DCI_ELEMENT', cursor=dci.cursor) elif database == 'BIN': query = SQLiteQuery(w[1], 'lemma', 'DIM_ELEMENT', cursor=dim.cursor ) # Capitalized words included query_lower = SQLiteQuery(w[1].lower(), 'lemma', 'DIM_ELEMENT', cursor=dim.cursor) if not query.exists and not query_lower.exists: # If the word is not found in the database nor the filters if len(w[1]) > 1: if i - 2 < 0: # collects 2 words before and after the candidate w1 = "" else: w1 = str(colloc[i - 2][0]) if i - 1 < 0: w2 = "" else: w2 = str(colloc[i - 1][0]) if i + 1 > len(colloc) - 1: w4 = "" else: w4 = str(colloc[i + 1][0]) if i + 2 > len(colloc) - 1: w5 = "" else: w5 = str(colloc[i + 2][0]) if w[1] in outdict: if str(w1 + ' ' + w2 + ' ' + w[0] + ' ' + w4 + ' ' + w5) not in outdict[ w[1]]['orðstaða']: outdict[w[1]]['orðstaða'][ str(w1 + ' ' + w2 + ' ' + w[0] + ' ' + w4 + ' ' + w5)] = 1 else: outdict[w[1]]['orðstaða'][ str(w1 + ' ' + w2 + ' ' + w[0] + ' ' + w4 + ' ' + w5)] += 1 outdict[w[1]]['tíðni'] += 1 else: outdict[w[1]] = {} outdict[w[1]]['tíðni'] = 1 outdict[w[1]]['orðstaða'] = { str(w1 + ' ' + w2 + ' ' + w[0] + ' ' + w4 + ' ' + w5): 1 } except sqlite3.OperationalError: pass filebar.next() sys.stdout.flush() filebar.finish() if IGC_folder == "malheildir/RMH/": with open(f'uttak/{database}/RMH_lemmur_med_orstodulyklum.freq', mode='w+') as outputfile: candidates = { k: v for k, v in sorted(outdict.items(), key=lambda item: item[1]['tíðni'], reverse=True) } # Sort the candidates by their total frequencies for key, item in candidates.items(): for counter, dictitem in enumerate(item.items()): if counter % 2 == 0: freq = dictitem[1] elif counter % 2 != 0: sorted_sents = { k: v for k, v in sorted( dictitem[1].items( ), # Sort 
the sentence examples by their frequencies key=lambda item: item[1], reverse=True) } if len( sorted_sents ) > 5: # This limit the examples to the 5 most frequent ones, can be changed sents = list(sorted_sents)[:5] else: sents = list(sorted_sents) outputfile.write( key + ' : ' + str(freq) + '. ' + str(sents) + '\n' ) # word: freq. [sent example 1, sent example 2...] print(f""" ============================================================ Úttaksskjalið RMH_lemmur_med_orstodulyklum.freq er tilbúið og er að finna í undirmöppunni uttak/{database}/ ============================================================ """) elif IGC_folder == "malheildir/RMH/CC_BY/": with open(f'uttak/{database}/CC_BY_lemmur_med_orstodulyklum.freq', mode='w+') as outputfile: candidates = { k: v for k, v in sorted(outdict.items(), key=lambda item: item[1]['tíðni'], reverse=True) } # Sort the candidates by their total frequencies for key, item in candidates.items(): for counter, dictitem in enumerate(item.items()): if counter % 2 == 0: freq = dictitem[1] elif counter % 2 != 0: sorted_sents = { k: v for k, v in sorted( dictitem[1].items( ), # Sort the sentence examples by their frequencies key=lambda item: item[1], reverse=True) } if len( sorted_sents ) > 5: # This limit the examples to the 5 most frequent ones, can be changed sents = list(sorted_sents)[:5] else: sents = list(sorted_sents) outputfile.write( key + ' : ' + str(freq) + '. ' + str(sents) + '\n' ) # word: freq. [sent example 1, sent example 2...] print(f""" ============================================================ Úttaksskjalið CC_BY_lemmur_med_orstodulyklum.freq er tilbúið og er að finna í undirmöppunni uttak/{database}/ ============================================================ """) elif IGC_folder == "malheildir/RMH/MIM/": with open(f'uttak/{database}/MIM_lemmur_med_orstodulyklum.freq', mode='w+') as outputfile: candidates = { k: v for k, v in sorted(outdict.items(), key=lambda item: item[1]['tíðni'], reverse=True) } # Sort the candidates by their total frequencies for key, item in candidates.items(): for counter, dictitem in enumerate(item.items()): if counter % 2 == 0: freq = dictitem[1] elif counter % 2 != 0: sorted_sents = { k: v for k, v in sorted( dictitem[1].items( ), # Sort the sentence examples by their frequencies key=lambda item: item[1], reverse=True) } if len( sorted_sents ) > 5: # This limit the examples to the 5 most frequent ones, can be changed sents = list(sorted_sents)[:5] else: sents = list(sorted_sents) outputfile.write( key + ' : ' + str(freq) + '. ' + str(sents) + '\n' ) # word: freq. [sent example 1, sent example 2...] 
print(f""" ============================================================ Úttaksskjalið MIM_lemmur_med_orstodulyklum.freq er tilbúið og er að finna í undirmöppunni uttak/{database}/ ============================================================ """) else: namefolder = IGC_folder.split("/")[3] with open(f'uttak/{database}/' + namefolder + '_lemmur_med_orstodulyklum.freq', mode='w+') as outputfile: candidates = { k: v for k, v in sorted(outdict.items(), key=lambda item: item[1]['tíðni'], reverse=True) } # Sort the candidates by their total frequencies for key, item in candidates.items(): for counter, dictitem in enumerate(item.items()): if counter % 2 == 0: freq = dictitem[1] elif counter % 2 != 0: sorted_sents = { k: v for k, v in sorted( dictitem[1].items( ), # Sort the sentence examples by their frequencies key=lambda item: item[1], reverse=True) } if len( sorted_sents ) > 5: # This limit the examples to the 5 most frequent ones, can be changed sents = list(sorted_sents)[:5] else: sents = list(sorted_sents) outputfile.write( key + ' : ' + str(freq) + '. ' + str(sents) + '\n' ) # word: freq. [sent example 1, sent example 2...] print(f""" ============================================================ Úttaksskjalið {namefolder}_lemmur_med_orstodulyklum.freq er tilbúið og er að finna í undirmöppunni uttak/{database}/ ============================================================ """)
def getFeatureVectorAndLabels(data_dir): allDrugEvents = [] samplesList = [] labelsList = [] allFiles = [] featuresDict = defaultdict(list) featureNames = ['sections', 'containsFutureWord', 'prevSentContainsFutureWord',\ 'current_tense', 'prev_tense', 'temporalType',\ 'polarity', 'position', 'modality', 'proximity', 'futureCount'] tfIdfFeatureVectorList = [] wordEmbeddingsFeatureVectorList = [] with open('drugClassification.csv', 'w') as csvfile: filewriter = csv.writer(csvfile) filewriter.writerow(['Drug', 'Predicted Label', 'Correct Label']) coreNLPClient = CoreNLPClient(annotators=[ 'tokenize', 'ssplit', 'pos', 'lemma', 'ner', 'parse', 'depparse', 'coref' ], timeout=100000, memory='8G') filesToProcess = [ file for file in os.listdir(data_dir) if (file.endswith('.txt')) ] bar = IncrementalBar('Processing', max=len(filesToProcess)) for file in filesToProcess: f = open(os.path.join(data_dir, file), 'r') raw = f.read() CLAMPdrugs = getAllDrugsFromCLAMP(file, data_dir, raw) drugEvents, drugEventsStartIndices, drugEventPolarityFeatureVector, drugEventModalityFeatureVector = getDrugEvents( file, data_dir, CLAMPdrugs) correctLabels = getLabels(file, drugEvents, data_dir) allFiles += [file] * len(correctLabels) sectionsFeatureVector = getSectionFeature(file, data_dir, drugEventsStartIndices) containsFutureWordsVector, prevSentContainsFutureWordsFeatureVector, proximityToFutureWordFeatureVector, futureWordsCountFeatureVector = getContainsFutureWordsFeature( raw, drugEvents, allDrugEvents) currentTenseFeatureVector = getCurrentTenseFeatureVector( file, coreNLPClient, drugEvents, raw) prevTenseFeatureVector = getPrevTenseFeatureVector( file, coreNLPClient, drugEvents, raw) temporalTypeFeatureVector = getTemporalCluesFeatureVectors( file, drugEvents, raw, data_dir) positionInTextFeatureVector = getPositionInTextFeatureVector( raw, drugEvents) wordEmbeddingsFeatureVector = getWordEmbeddingsFeatureVector( raw, drugEvents) tfIdfFeatureVector = getTfIdfVectors(drugEvents, raw, drugEventsStartIndices) wordEmbeddingsFeatureVectorList += wordEmbeddingsFeatureVector tfIdfFeatureVectorList += tfIdfFeatureVector features = [ sectionsFeatureVector, containsFutureWordsVector, prevSentContainsFutureWordsFeatureVector, currentTenseFeatureVector, prevTenseFeatureVector, temporalTypeFeatureVector, drugEventPolarityFeatureVector, positionInTextFeatureVector, drugEventModalityFeatureVector, proximityToFutureWordFeatureVector, futureWordsCountFeatureVector ] for i in range(len(features)): featuresDict[featureNames[i]] += features[i] for i in range(len(sectionsFeatureVector)): sampleList = [feature[i] for feature in features] samplesList.append(sampleList) for label in correctLabels: labelsList.append(label) bar.next() bar.finish() ordinalEncoder = OrdinalEncoder() featuresVector = ordinalEncoder.fit_transform(samplesList) wordEmbeddingsFeatureVector = np.array(wordEmbeddingsFeatureVectorList) featuresVector = np.hstack((featuresVector, wordEmbeddingsFeatureVector)) tfIdfFeatureVector = np.array(tfIdfFeatureVectorList) featuresVector = np.hstack((featuresVector, tfIdfFeatureVector)) labelsVector = np.array(labelsList) return allFiles, allDrugEvents, featuresDict, featuresVector, labelsVector
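# A minimal sketch of the feature assembly at the end of getFeatureVectorAndLabels: categorical
# per-sample features are ordinal-encoded, then horizontally stacked with dense numeric vectors
# (word embeddings, tf-idf) so each sample becomes one flat row. All data here is toy data.
import numpy as np
from sklearn.preprocessing import OrdinalEncoder

categorical = [['history', 'present'], ['plan', 'past'], ['history', 'past']]
embeddings = np.random.rand(3, 4)  # stand-in for averaged word embeddings
tfidf = np.random.rand(3, 2)       # stand-in for tf-idf scores

encoded = OrdinalEncoder().fit_transform(categorical)  # shape (3, 2)
features = np.hstack((encoded, embeddings, tfidf))     # shape (3, 8)
print(features.shape)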
def user_defined_collocations(database, filterbase, corpus): """ Iterates through the corpus and retrieves the words that do not appear in the database. Collects 5 word collocations on every word, two words before and after the candidate word. """ db = SQLDatabase(db_name=database) txt_files = glob.glob(corpus + '/**/*.txt', recursive=True) if filterbase not in ['n', 'N']: filters = SQLDatabase(db_name=filterbase) else: pass # if there is no filterbase, ignore this step outdict = {} print(""" ============================================================ Les skjöl úr málheildinni. ============================================================ """) filebar = IncrementalBar('Framvinda', max=len(txt_files)) for file in txt_files: with open(file, 'r', encoding='utf-8') as content: f = content.read() words = f.split() for i, w in enumerate(words): if w[-1] == '-': # if a word starts or ends in an hyphen, ignore it (likely OCR error) continue if w[0] == '-': continue if ( not all(i.isalpha() or i == '-' for i in w) ): # if a word contains anything but an alphabetic letter or hyphen, ignore it continue if filterbase not in [ 'n', 'N' ]: # if a stopword database has been defined, filter the results filter_query = SQLiteQuery(w, 'filter', 'FILTER_WORD_FORMS', cursor=filters.cursor) if filter_query.exists: continue else: query = SQLiteQuery( w, 'word', 'LEXICON_WORD', cursor=db.cursor ) # parameters must be updated if the database format is changed query_lower = SQLiteQuery(w.lower(), 'word', 'LEXICON_WORD', cursor=db.cursor) if not query.exists and not query_lower.exists: # If the word is not found in the database nor the filters if len(w) > 1: if i - 2 < 0: # collects 2 words before and after the candidate w1 = "" else: w1 = str(words[i - 2]) if i - 1 < 0: w2 = "" else: w2 = str(words[i - 1]) if i + 1 > len(words) - 1: w4 = "" else: w4 = str(words[i + 1]) if i + 2 > len(words) - 1: w5 = "" else: w5 = str(words[i + 2]) if w in outdict: if str(w1 + ' ' + w2 + ' ' + w + ' ' + w4 + ' ' + w5) not in outdict[w]['orðstaða']: outdict[w]['orðstaða'][str(w1 + ' ' + w2 + ' ' + w + ' ' + w4 + ' ' + w5)] = 1 else: outdict[w]['orðstaða'][str(w1 + ' ' + w2 + ' ' + w + ' ' + w4 + ' ' + w5)] += 1 outdict[w]['tíðni'] += 1 else: outdict[w] = {} outdict[w]['tíðni'] = 1 outdict[w]['orðstaða'] = { str(w1 + ' ' + w2 + ' ' + w + ' ' + w4 + ' ' + w5): 1 } else: query = SQLiteQuery(w, 'word', 'LEXICON_WORD', cursor=db.cursor) query_lower = SQLiteQuery(w.lower(), 'word', 'LEXICON_WORD', cursor=db.cursor) if not query.exists and not query_lower.exists: if len(w) > 1: if i - 2 < 0: w1 = "" else: w1 = str(words[i - 2]) if i - 1 < 0: w2 = "" else: w2 = str(words[i - 1]) if i + 1 > len(words) - 1: w4 = "" else: w4 = str(words[i + 1]) if i + 2 > len(words) - 1: w5 = "" else: w5 = str(words[i + 2]) if w in outdict: if str(w1 + ' ' + w2 + ' ' + w + ' ' + w4 + ' ' + w5) not in outdict[w]['orðstaða']: outdict[w]['orðstaða'][str(w1 + ' ' + w2 + ' ' + w + ' ' + w4 + ' ' + w5)] = 1 else: outdict[w]['orðstaða'][str(w1 + ' ' + w2 + ' ' + w + ' ' + w4 + ' ' + w5)] += 1 outdict[w]['tíðni'] += 1 else: outdict[w] = {} outdict[w]['tíðni'] = 1 outdict[w]['orðstaða'] = { str(w1 + ' ' + w2 + ' ' + w + ' ' + w4 + ' ' + w5): 1 } filebar.next() sys.stdout.flush() filebar.finish() output_file = input(""" ============================================================ Skrifaðu það sem þú vilt að úttaksskjalið heiti með endingunni .freq Dæmi: ordasafn_ordstodulyklar.freq ============================================================ """) with 
open('uttak/notendagogn/' + output_file, mode='w+') as outputfile: candidates = { k: v for k, v in sorted(outdict.items(), key=lambda item: item[1]['tíðni'], reverse=True) } # Sort the candidates by their total frequencies for key, item in candidates.items(): for counter, dictitem in enumerate(item.items()): if counter % 2 == 0: freq = dictitem[1] elif counter % 2 != 0: sorted_sents = { k: v for k, v in sorted( dictitem[1].items( ), # Sort the sentence examples by their frequencies key=lambda item: item[1], reverse=True) } if len( sorted_sents ) > 5: # This limit the examples to the 5 most frequent ones, can be changed sents = list(sorted_sents)[:5] else: sents = list(sorted_sents) outputfile.write( key + ' : ' + str(freq) + '. ' + str(sents) + '\n' ) # word: freq. [sent example 1, sent example 2...] print(f""" ============================================================ Úttaksskjalið {output_file} er tilbúið og má finna í undirmöppunni uttak/notendagogn/ ============================================================ """)
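# A minimal sketch of the output step shared by the collocation functions: candidates are sorted
# by total frequency, each candidate's contexts are sorted by their own counts, and only the five
# most frequent contexts are written per line. Indexing the nested dict directly avoids the
# fragile enumerate/"counter % 2" pattern used above; the data and file name are illustrative.
outdict = {
    'orðA': {'tíðni': 3, 'orðstaða': {'x y orðA z w': 2, 'a b orðA c d': 1}},
    'orðB': {'tíðni': 7, 'orðstaða': {'e f orðB g h': 7}},
}

candidates = dict(sorted(outdict.items(), key=lambda item: item[1]['tíðni'], reverse=True))
with open('ordasafn_ordstodulyklar.freq', mode='w+') as outputfile:
    for key, item in candidates.items():
        sorted_sents = sorted(item['orðstaða'], key=item['orðstaða'].get, reverse=True)
        outputfile.write(key + ' : ' + str(item['tíðni']) + '. ' + str(sorted_sents[:5]) + '\n')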
class scraping(): """ classe pour le scrapping du site BooksToScrap """ def __init__(self): # nom de l'image du livre qui est sauvegardé self.nomImage = '' #variable qui stocke les urls de chaque catégorie à scraper dans le site self.urlsCategories = [] #variable qui stocke pour une catégorie les paramètres nécessaires au scraping self.categorie = { 'nbLivres': '', 'nbPages': '', 'urlsLivres': [], 'urlsPages': [] } # variable de nom de CSV sauvegardé (valeur par défaut donnée) self.fichierCSV = 'resultats.csv' # url de base du site à scrapper self.urlBase = 'http://books.toscrape.com/index.html' # url de base pour scrapper une image self.urlBaseImage = 'http://books.toscrape.com/' # url de base pour scrapper une catégorie self.urlCatalogue = 'http://books.toscrape.com/catalogue/' # stocke tous les paramètres à scrapper pour un livre self.livre = { 'product_page_url': '', 'upc': '', 'title': '', 'price_including_tax': '', 'price_excluding_tax': '', 'number_available': '', 'product_description': '', 'category': '', 'review_rating': '', 'image_url': '' } def initialiseLivre(self): """ lors du scrapping d'une catégorie, permet de remettre les valeurs par défaut de self.livre pour passer d'un livre à l'autre""" self.livre = { 'product_page_url': '', 'upc': '', 'title': '', 'price_including_tax': '', 'price_excluding_tax': '', 'number_available': '', 'product_description': '', 'category': '', 'review_rating': '', 'image_url': '' } def initialiseCategorie(self): """ pour le scraping du site entier, remet les valeurs par defaut pour self.categorie quand on change de catégorie à scrapper""" self.categorie = { 'nbLivres': '', 'nbPages': '', 'urlsLivres': [], 'urlsPages': [] } def creerObjetSoup(self, url): """ créé self.soup avec l'url en paramètre """ self.reponse = requests.get(url) self.reponse.encoding = 'utf-8' self.soup = BeautifulSoup(self.reponse.text, features="html.parser") def recupereCategorieEtTitreLivre(self): """ recupère et sauvegarde la catégorie et le titre du livre dans self.livre le titre est le content de la balise <li class ='active'> la catégorie est le content de la balise <li> qui contient <a href= urlPageCategorie> et dont le content est different de 'Books' Je transforme les "'" en '"' pour éviter des problèmes de segmentation lors de la création du CSV les chaînes de caractères sont identifiées avec des "'" """ lis = self.soup.findAll('li') for elem in lis: # boucle pour trouver le titre et la catégorie try: # pour trouver le titre if (elem['class'][0] == 'active'): self.livre['title'] = elem.contents[0].replace("'", '"') break except: # pour catégorie if ('category' in elem.find('a')['href']): cat = elem.find('a') if (cat.contents[0] != 'Books'): self.livre['category'] = cat.contents[0] def recupereUrlImageLivre(self): """ recupere et sauvegarde l'url de l'image du livre dans self.livre l'url (chemin relatif) de l'image se trouve dans une balise <div class='item active' il faut compléter le chemin l'url de base pour une image pour avoir le chemin absolu """ divs = self.soup.findAll('div') for elem in divs: try: if (elem['class'] == ['item', 'active']): cheminRelatif = elem.find('img')['src'][6:] self.livre['image_url'] = self.urlBaseImage + cheminRelatif except: pass def recupererReviewRating(self): """ récupère le nombre d'étoiles du livre et le stocke dans self.livre le rating se trouve dans une balise <p class='star-rating RATING' Cependant d'autres livres sont suggérés en lecture à la fin de la page produit avec également du rating. 
Pour éviter la confusion, il faut choisir la balise <div class="col-sm-6 product_main"> qui correspond à celle du produit voulu """ filtreStar = 'star-rating' filtreProduit = 'product_main' divs = self.soup.findAll('div') for d in divs: try: if (filtreProduit in d['class']): ps = d.findAll('p') for p in ps: try: if (filtreStar in p['class']): self.livre[ 'review_rating'] = p['class'][1] + '/Five' break except: pass except: pass def recupereDescriptionLivre(self): """ recupere et sauvegarde la description du livre dans self.livre la description est contenue dans une balise <p>, il y en a plusieurs dans une page produit mais la seule qui corresponde à la description a un content qui fait au moins 50 caractères Deplus dans la description des points-virgule peuvent apparaîtrent. Ce sont les séparateurs du fichier CSV. Aussi pour éviter des confusions lors de la création du fichier une solution est d'entourer cette string de guillemets """ ps = self.soup.findAll('p') for p in ps: if (len(p.contents[0]) >= 50): self.livre['product_description'] = p.contents[0].replace( "'", '"') def recupereAutresParametresLivre(self): """ recupere et sauvegarde les autres caracteristiques du livre dans self.livre Tous les autres paramètres cherchés se trouvent dans un tableau les balises <tr> incluent une balise <th> dont le content est le nom du paramètre cherché et dont la balise <td> contient la valeur du paramètre cherché Pour la disponibilité des livres on cherche seulement le nombre dans la chaine de caractère qui est du type 'In stock (19 available)' d'où le travail sur ce paramètre""" trs = self.soup.findAll('tr') for elem in trs: if (elem.find('th').contents[0] == 'UPC'): self.livre['upc'] = elem.find('td').contents[0] elif (elem.find('th').contents[0] == 'Price (excl. tax)'): self.livre['price_excluding_tax'] = elem.find( 'td').contents[0][1:] elif (elem.find('th').contents[0] == 'Price (incl. 
tax)'): self.livre['price_including_tax'] = elem.find( 'td').contents[0][1:] elif (elem.find('th').contents[0] == 'Availability'): filtre = ' available' res = elem.find('td').contents[0] res = res[res.find('(') + 1:res.find(filtre)] self.livre['number_available'] = res def creationDossiersSauvegarde(self): """ créé les dossiers nécessaires pour sauvegarder à partir du fichier config.py les livres scrappés""" if (not os.path.isdir(cf.dossierSauvegarde)): os.mkdir(cf.dossierSauvegarde) if (not os.path.isdir(cf.dossierImages)): os.mkdir(cf.dossierImages) def ecrireHeadersCSV(self, fichierCSV): """ créé les headers du fichier CSV entré en paramètre à partir de la variable self.livre""" headers = '' for k in self.livre.keys(): headers += k + cf.delimiteurCSV headers = headers[:-1] + '\n' with open(fichierCSV, 'w') as f: f.write(headers) def ajouterUneLigneCSV(self, fichierCSV): """ ajoute une ligne (qui correspond à un livre) à un fichier CSV déjà existant et de la variable self.livre""" ligne = '' for v in self.livre.values(): ligne += v + cf.delimiteurCSV ligne = ligne[:-1] + '\n' with open(fichierCSV, 'a') as f: f.write(ligne) def creeCSVunLivre(self, fichierCSV): """ crée un CSV avec séparateur défini dans le fichier config et sauvegarde les caracteristiques présentes dans self.livre dans fichierCSV""" if (not os.path.exists(fichierCSV)): self.ecrireHeadersCSV(fichierCSV) self.ajouterUneLigneCSV(fichierCSV) def scrapUnLivre(self, urlLivre, unLivre=False): """ scrap les paramètres pour un livre donné,cette méthode est utilisée pour scrapper un livre mais également tout une catégorie aussi il est nécessaire de faire cette distinction pour pouvoir créer le bon nom de fichier CSV pour la sauvegarde si unLivre == True alors le nom du fichier CSV est le nom du livre, sinon c'est celui de sa catégorie""" self.creerObjetSoup(urlLivre) self.initialiseLivre() self.livre['product_page_url'] = urlLivre self.recupereCategorieEtTitreLivre() self.recupereUrlImageLivre() self.recupereDescriptionLivre() self.recupererReviewRating() self.recupereAutresParametresLivre() self.creationDossiersSauvegarde() if (unLivre): self.fichierCSV = self.livre['title'] + '.csv' else: self.fichierCSV = self.livre['category'] + '.csv' self.creeCSVunLivre(os.path.join(cf.dossierSauvegarde, self.fichierCSV)) self.sauvegardeImageUnLivre(self.livre) def recupereUrlsUnePageCategorie(self): """ recupere toutes les urls de livre pour une seule page d'une catégorie les urls (chemin relatif) sont incuses dans une balise <div> qui contient une balise <a> avec un href Il faut compléter l'url pour en faire un chemin absolu """ ol = self.soup.find('ol') divs = ol.findAll('div') self.categorie['urlsLivres'].append([]) for elem in divs: try: self.categorie['urlsLivres'][-1].append( self.urlCatalogue + elem.find('a')['href'][9:]) except: pass def recupereInfosUneCategorie(self, urlCategorie): """ recupere toutes les infos necessaires à une catégorie dans self.categorie La première information à avoir est le nombre de pages à scrapper. Je cherche donc le nombre de livres, chaque page contenant 20 livres je connais le nombre de pages. Ensuite si il y a une seule page à scrapper alors j'ajoute simplement l'url de la catégorie à self.categorie['urlsPages']. 
Sinon je créé les urls des pages à scrapper à partir du modèle suivant : http://books.toscrape.com/catalogue/category/books/<NOMCATEGORIE>_<VALEUR>/page-1.html et je modifie l'url avec le bon numéro de page puis je les stocke dans self.categorie['urlsPages'] """ self.initialiseCategorie() self.creerObjetSoup(urlCategorie) strongs = self.soup.findAll('strong') self.categorie['nbLivres'] = int(strongs[1].contents[0]) self.categorie['nbPages'] = int(self.categorie['nbLivres'] / 20) + 1 if (self.categorie['nbPages'] > 1): urlUtilisee = urlCategorie[:-10] + 'page-1.html' for elem in range( self.categorie['nbPages'] ): # urls de toutes les pages pour obtenir tous les livres self.categorie['urlsPages'].append(urlUtilisee[:-6] + str(elem + 1) + '.html') else: self.categorie['urlsPages'].append(urlCategorie) for elem in self.categorie[ 'urlsPages']: # recupere toutes les urls des livres dans toutes les pages de la catégorie self.creerObjetSoup(elem) self.recupereUrlsUnePageCategorie() def scrapUneCategorie(self, url): """ scrap une catégorie entière de livres, cette méthode est également utilisée pour scrapper le site en entier""" # Prépare le scraping self.recupereInfosUneCategorie(urlCategorie=url) # Optimise l'affichage pour l'utilisateur print('Il y a {} page(s) dans cette catégorie'.format( len(self.categorie['urlsLivres']))) self.barre = IncrementalBar('pages scrapées : ', max=len(self.categorie['urlsLivres'])) # recupere toutes les infos de livre de chaque url for index, elem in enumerate(self.categorie['urlsLivres']): for el in elem: self.initialiseLivre() self.scrapUnLivre( el ) # enregistre le CSV et l'image de chaque livre à la volée self.barre.next() self.barre.finish() print('CSV de la catégorie sauvegardée dans {}'.format( cf.dossierSauvegarde)) print('Images de la categorie sauvegardees dans {}'.format( cf.dossierImages)) def scrapSiteInternet(self): # en test """ scrap tout le site internet et génère un csv par catégorie de livres ces csv sont stockés dans le dossier indiqué dans config.py et les images dans un sous dossier indiqué également dans config.py ATTENTION : cette méthode nécessite environ 15 minutes pour s'exécuter intégralement""" self.recupereInfosPourToutesCategories() print('Il y a {} categories'.format(len(self.urlsCategories))) print('Le scrapping va prendre environ 15 minutes...') for index, elem in enumerate(self.urlsCategories): print('Catégorie {}/{} : {}'.format(index + 1, len(self.urlsCategories), elem['csv'][:-4])) self.scrapUneCategorie(url=elem['url']) print('\nSite Web scrapé intégralement dans {}'.format( cf.dossierSauvegarde)) def trouverNomCategorie(self, url): """ Cette méthode permet de récupérer le nom des catégories quand on scrappe tout le site internet. Cette information est utilisée uniquement pour l'affichage dans la console lors de l'éxecution du programme pour le confort de l'utilisateur """ csv = url[::-1] csv = csv[csv.find('/') + 1:] csv = csv[:csv.find('/')] csv = csv[csv.find('_') + 1:] csv = csv[::-1] return (csv) def recupereInfosPourToutesCategories(self): """ recupere les urls de chaque categorie du site et les stocke dans self.urlsCategories. 
Elles sont incluses dans la balise <ul class='nav nav-list'> Elles sont contenus esuite dans la balise <ul> puis les balises <li> puis <a href=...>""" self.creerObjetSoup(self.urlBase) uls = self.soup.findAll('ul') for ul in uls: try: if (ul['class'] == ['nav', 'nav-list']): lis = ul.find('ul').findAll('li') break except: pass for li in lis: try: url = self.urlBase[:-10] + li.find('a')['href'] csv = self.trouverNomCategorie(url) + '.csv' self.urlsCategories.append({'url': url, 'csv': csv}) except: pass def sauvegardeImageUnLivre(self, livre): """ enregistre l'image du livre depuis son url dans self.livre dans le dossier self.dossierImages pour les images de livre """ reponse = requests.get(livre['image_url']) #le nom du livre peut poser des problemes lors de l'enregistrement du fichier #les deux problèmes rencontrés ont été: 1. avoir des slashs dans les titres ( donc remplacés par des underscores) # 2. avoir des noms avec trop de caractères (donc limités à 30) self.nomImage = livre['title'].replace('/', '_')[:30] + '.png' # création des dossiers de sauvegardes self.creationDossiersSauvegarde() dossierImage = os.path.join(cf.dossierImages, self.livre['category']) if (not os.path.isdir(dossierImage)): os.mkdir(dossierImage) #écriture du fichier image with open(os.path.join(dossierImage, self.nomImage), 'wb') as f: f.write(reponse.content) def choisirLaCibleDuScraping(self): """ Cette méthode sert à laisser le choix à l'utilisateur de ce qu'il veut scrapper sans avoir à toucher au code du programme, il peut scrapper: - un livre seul - une catégorie entière - tout le site (environ 15 minutes) """ print( 'Le programme va scrapper le site http://books.toscrape.com/index.html\nPlusieurs choix sont possibles' ) print( 'Choisir parmi les 3 options de scraping:\n1- scraper un seul livre\n' '2- scraper une seule catégorie\n3- scraper tout le site (environ 15 minutes)' ) entree = input('Taper 1, 2 ou 3 puis <Entree>: ') if (entree == '3'): # scraping de tout le site self.scrapSiteInternet() exit() elif (entree == '1'): # scraping d'un seul livre url = input("Entrer l'url du livre à utiliser :\n") self.scrapUnLivre(urlLivre=url, unLivre=True) exit() elif (entree == '2'): #scraping d'une seule catégorie url = input("Entrer l'url de la catégorie à utiliser :\n") self.scrapUneCategorie(url) exit() else: print('Votre choix doit être 1, 2 ou 3') exit()
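# A minimal sketch of an alternative to the hand-rolled CSV writing in ecrireHeadersCSV /
# ajouterUneLigneCSV: csv.DictWriter escapes the ';' delimiter and embedded quotes automatically,
# so descriptions no longer need manual replacement of "'" by '"'. The sample dict mirrors a few
# self.livre fields; the URL and values are illustrative only.
import csv

livre = {
    'product_page_url': 'http://books.toscrape.com/catalogue/example-book_1/index.html',
    'title': "An Example Title with 'quotes'",
    'product_description': 'A description; with a semicolon and "quotes".',
}

with open('resultats.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=list(livre.keys()), delimiter=';')
    writer.writeheader()
    writer.writerow(livre)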
def train_model(self, input_data):
    """
    An implementation of the CD-n algorithm.

    inputs: training data should be a vector V of the same shape as v = np.zeros(nvisible)
    """
    self.data_matrix = input_data
    N = self.nvisible  # Scaling factor used in the learning process.
    # Create empty arrays to store the "differentials".
    dW = np.zeros((self.nvisible, self.nhidden))
    dvb = np.zeros(self.nvisible)
    dhb = np.zeros(self.nhidden)
    bar = IncrementalBar("Progress", max=self.nepochs)  # Sets up the progress bar.
    # Trains the RBM using the CD-n algorithm on a single datapoint at a time.
    for epoch in range(self.nepochs):
        bar.next()
        shuffled_indices = np.random.permutation(self.batch_size)
        training_data = self.data_matrix[shuffled_indices]
        error = 0
        for k in range(self.batch_size):
            visible = training_data[k]
            # Sample the hidden variables.
            self.compute_hidden(visible)
            # Compute <vh>_0.
            CDpos = np.tensordot(visible, self.hiddenprob, axes=0)  # Tensor product gives a matrix of shape (nvisible x nhidden).
            CDpos_vb = visible  # Simply the initial state of the visible nodes.
            CDpos_hb = self.hiddenprob  # The first computed state of the hidden nodes.
            # CD-n: if nCDsteps = 1, this is essentially just a reconstruction of the input.
            # Choosing nCDsteps = 1 works alright and is computationally effective.
            for j in range(self.nCDsteps):
                self.compute_visible(self.hiddenact)
                self.compute_hidden(self.visibleact)
                # self.compute_visible(self.hiddenprob)
                # self.compute_hidden(self.visibleact)
            # Compute <vh>_n.
            CDneg = np.tensordot(self.visibleact, self.hiddenprob, axes=0)
            CDneg_vb = self.visibleact
            CDneg_hb = self.hiddenprob
            # This is where the learning happens. You can skip the momentum if you want, but it
            # speeds up initial learning. You can modify the class to add decay, that is, add
            # -self.decay*dW to the learning rule, or reduce the momentum towards the end of learning.
            # Reconstruction error: measures how well the RBM reconstructs the data it is shown.
            visible = training_data[k]
            error += np.sum((self.data_matrix[k] - self.visibleact) ** 2)
            dW = self.eta * (CDpos - CDneg) / N + self.momentum * dW
            self.weights += dW
            dvb = self.eta * (CDpos_vb - CDneg_vb) / N + self.momentum * dvb
            self.visiblebias += dvb
            dhb = self.eta * (CDpos_hb - CDneg_hb) / N + self.momentum * dhb
            self.hiddenbias += dhb
        error /= self.batch_size
        self.loss[epoch] = error
    bar.finish()
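# A minimal sketch of the CD-1 weight update implemented in train_model: the positive-phase
# statistics <vh>_0 come from the data, the negative-phase <vh>_n from the reconstruction, and
# the update mixes the scaled difference with a momentum term,
# dW = eta * (<vh>_0 - <vh>_n) / N + momentum * dW. Shapes and values below are illustrative.
import numpy as np

nvisible, nhidden = 6, 4
eta, momentum = 0.1, 0.5

visible0 = np.random.rand(nvisible)   # data vector
hidden0 = np.random.rand(nhidden)     # p(h|v) computed from the data
visible_n = np.random.rand(nvisible)  # reconstruction after n Gibbs steps
hidden_n = np.random.rand(nhidden)    # p(h|v) computed from the reconstruction

CDpos = np.tensordot(visible0, hidden0, axes=0)    # <vh>_0, shape (nvisible, nhidden)
CDneg = np.tensordot(visible_n, hidden_n, axes=0)  # <vh>_n

dW = np.zeros((nvisible, nhidden))
dW = eta * (CDpos - CDneg) / nvisible + momentum * dW
print(dW.shape)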
for bar_cls in (Bar, ChargingBar, FillingSquaresBar, FillingCirclesBar): suffix = '%(index)d/%(max)d [%(elapsed)d / %(eta)d / %(eta_td)s]' bar = bar_cls(bar_cls.__name__, suffix=suffix) for i in bar.iter(range(200)): sleep() for bar_cls in (IncrementalBar, PixelBar, ShadyBar): suffix = '%(percent)d%% [%(elapsed_td)s / %(eta)d / %(eta_td)s]' with bar_cls(bar_cls.__name__, suffix=suffix, max=200) as bar: for i in range(200): bar.next() sleep() for spin in (Spinner, PieSpinner, MoonSpinner, LineSpinner, PixelSpinner): for i in spin(spin.__name__ + ' ').iter(range(100)): sleep() print() for singleton in (Counter, Countdown, Stack, Pie): for i in singleton(singleton.__name__ + ' ').iter(range(100)): sleep() print() bar = IncrementalBar('Random', suffix='%(index)d') for i in range(100): bar.goto(random.randint(0, 100)) sleep() bar.finish()
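The demo above cycles through every bar, spinner and counter class in the progress package; stripped down, the three calling patterns it exercises are the ones below. work() stands in for the sleep() used in the demo.

from progress.bar import IncrementalBar

def work():
    pass

# Manual control: call next() per item and finish() at the end.
bar = IncrementalBar('Processing', max=100, suffix='%(percent)d%% - eta %(eta_td)s')
for _ in range(100):
    work()
    bar.next()
bar.finish()

# Context-manager form: finish() is called automatically on exit.
with IncrementalBar('Processing', max=100) as bar:
    for _ in range(100):
        work()
        bar.next()

# Iterator form: iter() advances the bar as the iterable is consumed.
for _ in IncrementalBar('Processing').iter(range(100)):
    work()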
def downloadFile(image, iterations, outage = False, oNr = 0, oTime = 0): for node in set.name: subprocess.call(['docker cp mn.%s:var/log/transmission/transmission.log measurements/%s/%s/0/time/%s.txt&' % (node, currentInstance, currentTest, node)],stdout=FNULL, stderr=subprocess.STDOUT,shell=True) image = image.strip() milestone = [0] * len(set.name) for iteration in range(int(iterations)): print ('\n###\nTest #%s\n###' % (iteration + 1)) iStart = datetime.now() print iStart #checkTransmissionContainer() subprocess.call(['mkdir measurements/%s/%s/%s/' % (currentInstance,currentTest,(iteration + 1))],stdout=FNULL, stderr=subprocess.STDOUT,shell=True) subprocess.call(['mkdir measurements/%s/%s/%s/time/' % (currentInstance,currentTest,(iteration + 1))],stdout=FNULL, stderr=subprocess.STDOUT,shell=True) subprocess.call(['mkdir measurements/%s/%s/%s/traffic/' % (currentInstance,currentTest,(iteration + 1))],stdout=FNULL, stderr=subprocess.STDOUT,shell=True) with open('measurements/%s/torrentsNr.txt' % currentInstance,'r+') as current: lines = current.readlines() torrentsNr = int(lines[-1]) print 'Torrent #%s' % torrentsNr doc = open('measurements/%s/torrentsNr.txt' % currentInstance,'w+') doc.write(str(torrentsNr + 1)+'\n') doc.close() #delete existing file and log files on hosts sum = 0 seederPrep = [False] * len(set.seeder) complete = [False] * len(set.name) bar_restart = IncrementalBar('Deleting existing files ', max = len(set.name)) for node in set.name: if not node in set.servers: subprocess.call(['docker exec -it mn.%s docker image rm -f %s' %(node, image)],stdout=FNULL, stderr=subprocess.STDOUT,shell=True) else: subprocess.call(['docker exec -it mn.%s sh -c "(docker stop opentracker && docker rm opentracker && export IP=%s && docker-compose -f stack_server.yml up -d)"' % (node, set.ip[set.name.index(node)])],stdout=FNULL, stderr=subprocess.STDOUT,shell=True) subprocess.call(["docker exec -it mn.%s sh -c 'rm -rf downloads/*'" % node],stdout=FNULL, stderr=subprocess.STDOUT,shell=True) subprocess.call(["docker exec -it mn.%s sh -c 'rm -rf torrents/*'" % node],stdout=FNULL, stderr=subprocess.STDOUT,shell=True) subprocess.call(["docker exec -it mn.%s sh -c 'rm -rf root/.config/transmission-daemon/resume/*'" % node],stdout=FNULL, stderr=subprocess.STDOUT,shell=True) subprocess.call(['docker exec -it mn.%s transmission-remote -t %s -r' % (node, str(torrentsNr))],stdout=FNULL, stderr=subprocess.STDOUT,shell=True) subprocess.call(["docker exec -it mn.%s sh -c 'rm -rf root/.config/transmission-daemon/torrents/*'" % node],stdout=FNULL, stderr=subprocess.STDOUT,shell=True) #root/.small-dragonfly/logs/* subprocess.call(["docker exec mn.%s sh -c 'iptables -Z'" % node ],stdout=FNULL, stderr=subprocess.STDOUT,shell=True) bar_restart.next() bar_restart.finish() check.check() while check.repeat == True: check.check() print ('%s deleted on every host' % image) #Prepare seeder for node in set.seeder: if iteration == 0: subprocess.call(['docker exec mn.%s docker pull %s' %(node, image)],stdout=FNULL, stderr=subprocess.STDOUT,shell=True) subprocess.call(['docker exec mn.%s docker save -o downloads/%s%s.tar %s' %(node, image, torrentsNr, image)],stdout=FNULL, stderr=subprocess.STDOUT,shell=True) subprocess.call(["'docker exec mn.%s sh -c 'iptables -Z'" % node ],stdout=FNULL, stderr=subprocess.STDOUT,shell=True) subprocess.call(['docker exec mn.%s transmission-remote -a torrents/%s%s.torrent &' % (node, image, torrentsNr)],stdout=FNULL, stderr=subprocess.STDOUT,shell=True) #Creating torrent and sharing torrent 
bar_sharing = IncrementalBar('Creating and sharing torrent', max = len(set.name)) trackerAdr = '' for node in set.servers: trackerAdr = '%s -t udp://%s:6969' % (trackerAdr, set.ip[set.name.index(node)]) subprocess.call(['docker exec mn.%s transmission-create -o torrents/%s%s.torrent%s downloads/%s%s.tar' % (set.seeder[0], image, torrentsNr, trackerAdr, image, torrentsNr)],stdout=FNULL, stderr=subprocess.STDOUT,shell=True) subprocess.call(['docker cp mn.%s:torrents/%s%s.torrent measurements/%s/%s/torrents/%s%s.torrent' % (set.seeder[0], image, torrentsNr, currentInstance, currentTest, image, torrentsNr)],stdout=FNULL, stderr=subprocess.STDOUT,shell=True) for node in set.name: subprocess.call(['docker cp measurements/%s/%s/torrents/%s%s.torrent mn.%s:torrents/%s%s.torrent' % (currentInstance, currentTest, image, torrentsNr, node, image, torrentsNr)],stdout=FNULL, stderr=subprocess.STDOUT,shell=True) bar_sharing.next() bar_sharing.finish() #Start download print datetime.now() sum = 0 bar_download = IncrementalBar('Waiting for download(s)', max = len(set.name)) for node in set.name: subprocess.call(['docker exec mn.%s transmission-remote -a torrents/%s%s.torrent &' % (node, image, torrentsNr)],stdout=FNULL, stderr=subprocess.STDOUT,shell=True) if node in set.seeder: complete[set.name.index(node)] = True bar_download.next() sum = sum + 1 #Server outage if outage == True: print ('\nWaiting %s seconds for outage...' % oTime) time.sleep(int(oTime)) for j in range(1,int(oNr)+1): print set.servers[j] subprocess.call(['docker exec mn.%s docker stop opentracker &' % (set.servers[-j])],stdout=FNULL, stderr=subprocess.STDOUT,shell=True) while sum < len(set.name): time.sleep(120) for node in set.name: if complete[set.name.index(node)] == False: if ('%s%s.tar' % (image, str(torrentsNr) ) in subprocess.check_output(['docker exec mn.%s ls downloads/' % node],shell=True)): #and not (file + '.part' in subprocess.check_output(['docker exec mn.' 
+ node + ' ls downloads/'],shell=True)): subprocess.call(['docker cp mn.%s:var/log/transmission/transmission.log measurements/%s/%s/%s/time/%s.txt' % (node, currentInstance, currentTest, (iteration + 1), node)],stdout=FNULL, stderr=subprocess.STDOUT,shell=True) with open('measurements/%s/%s/%s/time/%s.txt' % (currentInstance, currentTest, (iteration + 1), node)) as tmp: lines = tmp.readlines() for i in range(milestone[set.name.index(node)],len(lines)): if '%s%s.tar State changed from "Incomplete" to "Complete"' % (image, torrentsNr) in lines[i]: sum = sum + 1 complete[set.name.index(node)] = True milestone[set.name.index(node)] = i + 1 bar_download.next() break bar_download.finish() print 'Download(s) successful' print 'Grabbing data after download(s)' for node in set.name: subprocess.call(["docker exec mn.%s sh -c 'iptables -L INPUT -n -v -x > tmp_IN.txt'" % node ],stdout=FNULL, stderr=subprocess.STDOUT,shell=True) subprocess.call(['docker cp mn.%s:tmp_IN.txt measurements/%s/%s/%s/traffic/%s_IN.txt' % (node, currentInstance, currentTest, (iteration + 1), node)],stdout=FNULL, stderr=subprocess.STDOUT,shell=True) subprocess.call(["docker exec mn.%s sh -c 'iptables -L OUTPUT -n -v -x > tmp_OUT.txt'" % node ],stdout=FNULL, stderr=subprocess.STDOUT,shell=True) subprocess.call(['docker cp mn.%s:tmp_OUT.txt measurements/%s/%s/%s/traffic/%s_OUT.txt' % (node, currentInstance, currentTest, (iteration + 1), node)],stdout=FNULL, stderr=subprocess.STDOUT,shell=True) subprocess.call(["docker exec mn.%s sh -c 'iptables -L FORWARD -n -v -x > tmp_OUT.txt'" % node ],stdout=FNULL, stderr=subprocess.STDOUT,shell=True) subprocess.call(['docker cp mn.%s:tmp_OUT.txt measurements/%s/%s/%s/traffic/%s_FOR.txt' % (node, currentInstance, currentTest, (iteration + 1), node)],stdout=FNULL, stderr=subprocess.STDOUT,shell=True) subprocess.call(['docker cp mn.%s:downloads/%s%s.tar measurements/%s/%s/results/%s%s.tar' % (set.seeder[0], image, torrentsNr, currentInstance, currentTest, image, torrentsNr)],stdout=FNULL, stderr=subprocess.STDOUT,shell=True) set.measureTime(image, False, currentInstance, currentTest, iterations, torrentsNr) set.measureTraffic(image, False, currentInstance, currentTest, iterations) doc = open('./measurements/%s/%s/results/setup.txt' % (currentInstance, currentTest), 'w+') doc.write('Server:%s\nHosts:%s\nSeeders:%s\nImage:%s\nServer outage:%s\nOutage number:%s\nOutage start:%s' % (str(len(set.servers)), str(len(set.name)), str(len(set.seeder)), image, outage, oNr, oTime)) doc.close() set.imageTime(image, '%s%s.tar' % (image, torrentsNr), currentInstance, currentTest)
class SampleDumpHandler(object): def __init__(self,debug=False,samplelist=None): super(SampleDumpHandler,self).__init__() self.debug=debug self.samplelist = samplelist self.reset() def __del__(self): if len(self.data): self.saveFile() def reset(self): self.header = {} self.data = [] self.lastpacket = 0 self.raw = [] self.packetcounter = 0 self.dump_start = 0 self.exppacket = 0 self.starttime = 0 def parse(self,msg): status = None if msg[3] == 0x1: status = self.parseHeader(msg) elif msg[3] == 0x2: status = self.parsePacket(msg) elif msg[3] == 0x3: status = self.parseRequest(msg) elif msg[3] == 0x7F and self.dump_start > 0: status = self.continueDump() return status def parseHeader(self, msg): self.reset() if len(msg) != 21: print "Size mismatch, is", len(msg) return HandshakeMessage.NAK(packetnumber=self.lastpacket) speriod = int(msg[9] << 14 | msg[8] << 7 | msg[7]) srate = 1./(speriod *1e-9) self.header = { "target_id" : msg[2], "sample_number" : msg[5] << 7 | msg[4], "sample_format" : msg[6], "sample_period" : speriod, "sample_rate" : srate, "sample_length" : msg[12] << 14 | msg[11] << 7 | msg[10], "sample_loop_start": msg[15] << 14 | msg[14] << 7 | msg[13], "sample_loop_end" : msg[18] << 14 | msg[17] << 7 | msg[16], "loop_type" : msg[19], } if self.debug: print "Sample Dump Header" print " Data:" for k,v in self.header.iteritems(): print " %s:" % k, v self.raw += msg format = int(self.header["sample_format"]) length = int(self.header["sample_length"]) self.exppacket = (format+6)/7*length/120+1 self.starttime = time.time() self.bar = IncrementalBar( "Receiving sample dump", max=self.exppacket, suffix = '%(percent)d%% [%(elapsed_td)s / %(eta_td)s]') return HandshakeMessage.ACK(packetnumber=self.lastpacket) def parsePacket(self, msg): if not 0xF7 in msg: print "printSampleDumpDataPacket: could not find EOX" return HandshakeMessage.NAK(packetnumber=self.lastpacket) cs = msg.index(0xF7)-1 calced_cs = checksum(msg[1:cs]) if self.debug: print "Sample Dump Data Packet" print " Data:" print " Packet count", msg[4] print " checksum:", hex(msg[cs]), \ "(calculated 0x%x)" % calced_cs if msg[cs] != calced_cs: print "Checksum mismatch:", hex(msg[cs]), "should be", hex(calced_cs) return HandshakeMessage.NAK(packetnumber=self.lastpacket) offset = 5 format = int(self.header['sample_format']) if format == 14: self.data += msg[offset:offset+120] else: print format, "bit samples are not supported" self.lastpacket = msg[4] self.raw += msg self.packetcounter += 1 self.bar.next() return HandshakeMessage.ACK(packetnumber=self.lastpacket) def parseRequest(self,msg): self.reset() if not 0xF7 in msg: print "printSampleDumpDataPacket: could not find EOX" return HandshakeMessage.NAK(packetnumber=self.lastpacket) samplenumber = int(msg[5] << 7 | msg[4]) print "Received Sample Dump Request for sample", samplenumber if self.debug: print " Data:" print " targetid:", msg[2] print " samplenumber:", samplenumber samplefile = None if self.samplelist and samplenumber < len(self.samplelist): samplefile = self.samplelist[samplenumber] print "Selected list index", samplenumber, repr(samplefile) if not samplefile or not os.path.exists(samplefile): samplefile = "sample.sds" print "Selected fallback", repr(samplefile) if not os.path.exists(samplefile): print "No sample to send" return HandshakeMessage.Cancel(packetnumber=self.lastpacket) f = open(samplefile, "rb") self.raw = [ ord(i) for i in f.read() ] f.close() n = self.raw.count(0xF7) if n > 0: print "Sending", n, "Sample Dump Packets (+ header)" self.starttime = time.time() 
self.dump_start = self.raw.index(0xF7)+1 self.packetcounter += 1 return self.raw[:self.dump_start] return HandshakeMessage.Cancel(packetnumber=self.lastpacket) def continueDump(self): n = self.raw[self.dump_start:].count(0xF7) if n == 0: elapsed = time.time()-self.starttime print "Sent %d packets in %.1f seconds (%.1f bytes/sec)" % ( self.packetcounter, elapsed, len(self.raw)/elapsed) self.reset() return HandshakeMessage.EOF(packetnumber=self.lastpacket) ds = self.dump_start self.dump_start = self.raw.index(0xF7,self.dump_start)+1 if self.packetcounter % 100 == 0: print "Sent %d packets" % self.packetcounter self.packetcounter += 1 return self.raw[ds:self.dump_start] def saveFile(self, filename=None): self.bar.finish() if not filename: timestamp = time.strftime("%Y%m%d%H%M%S") filename = "sample_%s" % timestamp rate = self.packetcounter*120/(time.time()-self.starttime) print "Packets received: %d/%d" % (self.packetcounter, self.exppacket) print "Average rate: %.1f bytes/sec" % rate print "Saving to", filename # concatenation of sysex messages with open(filename+".sds", "wb") as f: f.write(bytearray(self.raw)) # adjust data size to sample length nsamples = int(self.header.get('sample_length',len(self.data)/2)) self.data = self.data[:nsamples*2] # sample data only (7-in-8-bit chunks, big-endian: .dcba987 .6543210) with open(filename+".dmp", "wb") as f: f.write(bytearray(self.data)) # decoded sample data format = int(self.header['sample_format']) out = [] if format == 14: pos = 0 while pos < len(self.data): # assume big-endian tmp = self.data[pos] << 7 | self.data[pos+1] # convert to s16le tmp = u2s(tmp<<2) out.append(tmp & 0xFF) out.append((tmp >> 8) & 0xFF) pos += 2 print else: print format, "bit samples are not supported" if len(out): # write raw file with open(filename+".raw", "wb") as f: f.write(bytearray(out)) # write WAV file writeWAV(filename+".wav",int(self.header.get("sample_rate", 22050)), bytearray(out)) # sample properties with open(filename+".txt", "w") as f: f.writelines( [ "%s: %s\n" % i for i in self.header.iteritems() ] ) f.writelines( [ "file_%s: %s.%s\n" % (suffix,filename,suffix) for suffix in [ 'sds', 'raw', 'dmp', 'wav' ] ]) self.reset()
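The decoding loop in saveFile() packs two 7-bit bytes into one signed 16-bit little-endian sample. As a standalone sketch: u2s() is not shown above, and it is assumed here to be the plain unsigned-to-signed reinterpretation written out explicitly.

def decode_14bit(data):
    # data holds 7-bit bytes, most significant first, two bytes per sample.
    out = bytearray()
    for pos in range(0, len(data) - 1, 2):
        tmp = (data[pos] << 7 | data[pos + 1]) << 2   # 14 bits left-shifted to 16 bits
        if tmp >= 0x8000:                             # reinterpret as signed 16 bit
            tmp -= 0x10000
        out.append(tmp & 0xFF)                        # low byte first (little-endian)
        out.append((tmp >> 8) & 0xFF)
    return out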
def convert_jds_wf_to_wf32(source_directory, result_directory, no_of_bunches_per_file): """ function converts jds waveform data to wf32 waveform data for further processing (coherent dedispersion) and saves txt files with time data Input parameters: source_directory - directory where initial jds waveform data are stored result_directory - directory where new wf32 files will be stored no_of_bunches_per_file - number of data bunches per file to process (depends on RAM volume on the PC) Output parameters: result_wf32_files - list of results files """ file_list = find_and_check_files_in_current_folder(source_directory, '.jds') # To print in console the header of first file print('\n First file header parameters: \n') # *** Data file header read *** [df_filename, df_filesize, df_system_name, df_obs_place, df_description, clock_freq, df_creation_timeUTC, channel, receiver_mode, Mode, Navr, time_res, fmin, fmax, df, frequency, freq_points_num, data_block_size] = FileHeaderReaderJDS(source_directory + file_list[0], 0, 1) if Mode > 0: sys.exit(' ERROR!!! Data recorded in wrong mode! Waveform mode needed.\n\n Program stopped!') result_wf32_files = [] # Main loop by files start for file_no in range(len(file_list)): # loop by files fname = source_directory + file_list[file_no] # Create long data files and copy first data file header to them if file_no == 0: with open(fname, 'rb') as file: # *** Data file header read *** file_header = file.read(1024) # *** Creating a name for long timeline TXT file *** tl_file_name = df_filename + '_Timeline.wtxt' tl_file = open(tl_file_name, 'w') # Open and close to delete the file with the same name tl_file.close() # *** Creating a binary file with data for long data storage *** file_data_A_name = df_filename + '_Data_chA.wf32' result_wf32_files.append(file_data_A_name) file_data_A = open(file_data_A_name, 'wb') file_data_A.write(file_header) file_data_A.close() if channel == 2: file_data_B_name = df_filename + '_Data_chB.wf32' result_wf32_files.append(file_data_B_name) file_data_B = open(file_data_B_name, 'wb') file_data_B.write(file_header) file_data_B.close() del file_header # Calculation of number of blocks and number of spectra in the file if channel == 0 or channel == 1: # Single channel mode no_of_spectra_in_bunch = int((df_filesize - 1024) / (no_of_bunches_per_file * 2 * data_block_size)) else: # Two channels mode no_of_spectra_in_bunch = int((df_filesize - 1024) / (no_of_bunches_per_file * 4 * data_block_size)) no_of_blocks_in_file = (df_filesize - 1024) / data_block_size if file_no == 0: print(' Number of blocks in file: ', no_of_blocks_in_file) print(' Number of bunches to read in file: ', no_of_bunches_per_file) print('\n *** Reading data from file *** \n') # ******************************************************************************* # R E A D I N G D A T A * # ******************************************************************************* with open(fname, 'rb') as file: file.seek(1024) # Jumping to 1024 byte from file beginning # !!! Fake timing. Real timing to be done!!! 
TimeFigureScaleFig = np.linspace(0, no_of_bunches_per_file, no_of_bunches_per_file + 1) for i in range(no_of_bunches_per_file): TimeFigureScaleFig[i] = str(TimeFigureScaleFig[i]) time_scale_bunch = [] bar = IncrementalBar(' File ' + str(file_no + 1) + ' of ' + str(len(file_list)) + ' reading: ', max=no_of_bunches_per_file, suffix='%(percent)d%%') bar.start() for bunch in range(no_of_bunches_per_file): # bar.next() # Reading and reshaping all data with time data if channel == 0 or channel == 1: # Single channel mode wf_data = np.fromfile(file, dtype='i2', count=no_of_spectra_in_bunch * data_block_size) wf_data = np.reshape(wf_data, [data_block_size, no_of_spectra_in_bunch], order='F') if channel == 2: # Two channels mode wf_data = np.fromfile(file, dtype='i2', count=2 * no_of_spectra_in_bunch * data_block_size) wf_data = np.reshape(wf_data, [data_block_size, 2 * no_of_spectra_in_bunch], order='F') # Timing timeline_block_str = jds_waveform_time(wf_data, clock_freq, data_block_size) if channel == 2: # Two channels mode # Cut the timeline of second channel timeline_block_str = timeline_block_str[0:int(len(timeline_block_str) / 2)] for i in range(len(timeline_block_str)): time_scale_bunch.append(df_creation_timeUTC[0:10] + ' ' + timeline_block_str[i]) # [0:12] # Deleting the time blocks from waveform data real_data_block_size = data_block_size - 4 wf_data = wf_data[0: real_data_block_size, :] # Separation data into channels if channel == 0 or channel == 1: # Single channel mode wf_data_chA = np.reshape(wf_data, [real_data_block_size * no_of_spectra_in_bunch, 1], order='F') del wf_data # Deleting unnecessary array name just in case if channel == 2: # Two channels mode # Separating the data into two channels wf_data = np.reshape(wf_data, [2 * real_data_block_size * no_of_spectra_in_bunch, 1], order='F') wf_data_chA = wf_data[0: (2 * real_data_block_size * no_of_spectra_in_bunch): 2] # A wf_data_chB = wf_data[1: (2 * real_data_block_size * no_of_spectra_in_bunch): 2] # B del wf_data # Saving WF data to dat file file_data_A = open(file_data_A_name, 'ab') file_data_A.write(np.float32(wf_data_chA).transpose().copy(order='C')) file_data_A.close() if channel == 2: file_data_B = open(file_data_B_name, 'ab') file_data_B.write(np.float32(wf_data_chB).transpose().copy(order='C')) file_data_B.close() # Saving time data to ling timeline file with open(tl_file_name, 'a') as tl_file: for i in range(no_of_spectra_in_bunch): tl_file.write((str(time_scale_bunch[i][:])) + ' \n') # str bar.next() bar.finish() file.close() # Close the data file del file_data_A if channel == 2: del file_data_B return result_wf32_files
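In the two-channel branch above the A and B samples arrive interleaved, and the separation boils down to the slicing in this small sketch (array names are illustrative; the reshape with order='F' mirrors the code above).

import numpy as np

def split_channels(wf_data):
    # Flatten column-wise, then take every other sample:
    # even indices are channel A, odd indices are channel B.
    flat = np.reshape(wf_data, (-1, 1), order='F')
    wf_data_chA = flat[0::2]
    wf_data_chB = flat[1::2]
    return wf_data_chA, wf_data_chB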
def Alliance(): if not os.path.exists(f"./scrap_essentials/data/{date}.json"): with webdriver.Chrome(executable_path=path, options=options) as driver: def connect(): driver.find_element_by_id('username').send_keys(email) driver.find_element_by_id('password').send_keys(password) driver.find_element_by_id('loginSubmit').click() driver.get(URL) wait = WebDriverWait(driver, 60) connect() driver.get("https://www.airlines-manager.com/home") AllResult = {"Alliance": []} if (wait.until( presence_of_element_located( (By.XPATH, '//*[@id="mainHeader"]/div[2]')))): for id in ALLIANCE_LIST: result = { "Name": None, "ID": None, "Classement": None, "Profile": { "General": { "Created": None, "nbCompanies": None, "Solde": None, "BeneficeHebdo": None, "TaxeHebdo": None }, "Hub": { "HubsDispo": None, "KmPartage": None, "TaxeLigne": None, "TaxeCompanies": None }, "AG": { "nbAvionProposer": None, "ReducMax": None, "Reduc30j": None, "nbAvionAcheter": None, "AideAchatMax": None, "AideAchat30j": None }, "R&D": None }, "Members": [ # PatternMembers Here ], "Networks": { "Statistique": { "NbrHub": None, "NbrLigne": None, "KmLigne": None, }, "Hubs": [] } } bar = Bar( f'Tabs ({id["Name"]}): ', max=(len(ALLIANCE_TABS)), suffix= '%(percent).1f%% (%(index)d/%(max)d) - [%(elapsed_td)s / %(eta_td)s]' ) for tabs in ALLIANCE_TABS: driver.get(f"{URL_ALLIANCE_PROFIL}/{tabs}/{id['ID']}") if tabs == "profile": if (wait.until( presence_of_element_located( (By.CSS_SELECTOR, 'div#alliance_profile_statistiques' )))): result = Profile(driver, result) bar.next() elif tabs == "members": if (wait.until( presence_of_element_located(( By.CSS_SELECTOR, '#allianceMembersList > tbody > tr:nth-child(1) > th:nth-child(2) > span' )))): result = Member(driver, result) bar.next() elif tabs == "network": if (wait.until( presence_of_element_located( (By.CSS_SELECTOR, 'div#map_canvas')))): result = Network(driver, result) bar.next() AllResult["Alliance"].append(result) bar.finish() with open(os.getcwd() + date_str + date + ".json", "w", encoding='utf8') as f: f.write(json.dumps(AllResult, indent=4)) print(os.getcwd() + date_str + date + ".json") return AllResult # Back previous page : driver.back() driver.Quit() else: print(os.getcwd() + date_str + date + ".json") return json.load( open(os.getcwd() + date_str + date + ".json", "r", encoding='utf8')) driver.Quit()
def collect(self, topic_list): data_num = np.zeros(4, dtype=np.int) for bag_file in self.bag_filelist: print "loading bag file: ", bag_file bag_temp = rosbag.Bag( os.path.join(self.bag_path, bag_file), 'r') info_dict = yaml.load(bag_temp._get_yaml_info()) for topic_item in info_dict["topics"]: for i in range(len(topic_list)): if topic_item["topic"] == topic_list[i]: data_num[i] = topic_item["messages"] data_pose = np.zeros((self.batch_size, 6), dtype=np.float) data_status = np.zeros((self.batch_size, 2), dtype=np.int) if self.greyscale: data_image = np.zeros((self.batch_size, np.prod(self.img_size)/3), dtype=np.uint8) else: data_image = np.zeros((self.batch_size, np.prod(self.img_size)), dtype=np.uint8) # data_lidar = np.zeros((self.batch_size, 1000000), dtype=np.uint8) bar = IncrementalBar('Processing messages', max=np.sum(data_num)) cout = np.zeros(4, dtype=np.int) cout_tot = np.zeros(4, dtype=np.int) for topic, msg, t in bag_temp.read_messages(topics=topic_list): if topic == "/nav/fix": data_pose[cout[0], 0] = msg.pose.position.x data_pose[cout[0], 1] = msg.pose.position.y data_pose[cout[0], 2] = msg.pose.orientation.x data_pose[cout[0], 3] = msg.pose.orientation.y data_pose[cout[0], 4] = msg.pose.orientation.z data_pose[cout[0], 5] = msg.pose.orientation.w cout[0] += 1 if topic == "/nav/status": data_status[cout[1], 0] = msg.gps data_status[cout[1], 1] = msg.satellite cout[1] += 1 if topic == "/usb_cam/image_raw": img_array = self.bridge.imgmsg_to_cv2(msg, "rgb8") if self.greyscale: img_array = cv2.cvtColor(np.reshape(img_array, self.img_size), cv2.COLOR_RGB2GRAY).flatten() else: img_array = img_array.flatten() data_image[cout[2], :] = img_array cout[2] += 1 bar.next() if (bar.index+1)%self.batch_size == 0: self.save(bag_file, topic_list, \ data_pose[:cout[0], :],\ data_status[:cout[1],:],\ data_image[:cout[2], :]) cout_tot += cout cout = np.zeros(4, dtype=np.int) # if topic == "/velodyne_points": # point_tuple = struct.unpack("B"*len(msg.data), msg.data) # point_array = np.asarray(point_tuple) # data_lidar[cout[3], :len(msg.data)] = point_array # cout[3] += 1 bar.finish() del bag_temp self.save(bag_file, topic_list, \ data_pose[:cout[0], :],\ data_status[:cout[1],:],\ data_image[:cout[2], :])
elif isinstance(fault.data, types.ListType): missing = fault.data if '' in missing: del missing[missing.index(''):] bar = IncrementalBar('Uploading', max=len(missing)) bar.suffix = '%(percent).1f%% - %(eta)ds' with open(path) as fp: for hash in missing: offset = hashes.index(unhexlify(hash)) * blocksize fp.seek(offset) block = fp.read(blocksize) client.update_container_data(container, StringIO(block)) bar.next() bar.finish() return client.create_object_by_hashmap(container, object, map, **kwargs) def download(client, container, object, path): res = client.retrieve_object_hashmap(container, object) blocksize = int(res['block_size']) blockhash = res['block_hash'] bytes = res['bytes'] map = res['hashes'] if os.path.exists(path): h = HashMap(blocksize, blockhash) h.load(open(path))
class AuthorCrawler: visitedProfileURL = [] queueProfileURL = [] visitedArticleURL = [] queueArticleURL = [] numberOfCrawlerProfile = 0 def __init__(self): self.baseURL = 'https://www.researchgate.net/' from progress.bar import IncrementalBar self.progress_bar = IncrementalBar('Crawling', max=MIN_NUMBER_OF_PROFILE, suffix='%(percent)d%% %(remaining)s remaining - eta %(eta_td)s') def crawl(self): self.queueProfileURL.extend(START_PAGES) os.makedirs(AFTER_CRAWL_AUTHOR_DIR, exist_ok=True) while self.numberOfCrawlerProfile < MIN_NUMBER_OF_PROFILE: while len(self.queueProfileURL) == 0: if len(self.queueArticleURL) == 0: self.progress_bar.finish() return try: self.queueProfileURL.extend(filter(lambda x: x not in self.visitedProfileURL and x not in self.queueProfileURL,self.getAuthorFromArticle(self.queueArticleURL.pop(0)))) except: pass try: self.progress_bar.next() self.crawlProfile(self.queueProfileURL.pop(0)) except: pass self.progress_bar.finish() def getAuthorFromArticle(self, url): r = requests.get(url) s = BeautifulSoup(r.text, 'html.parser') authors = s.findAll('a', class_='display-name') authorsList = [] for author in authors: authorsList.append(self.baseURL +author['href']) return authorsList def getArticleIDFromURL(self, url): return re.findall(r'publication/(?P<id>\d+)_', url)[0] def crawlProfile(self, profURL): if not profURL.endswith('publications'): profURL += '/publications' r = requests.get(profURL) s = BeautifulSoup(r.text, 'html.parser') name = s.find('h1', class_='profile-header-name') name = name.text n = 1 articles = [] while True: url = profURL+'/'+n.__str__() n+=1 res = self.parseProfilePage(url) if res is None or len(res) == 0: break articles.extend(res) self.queueArticleURL.extend(filter(lambda x: x not in self.visitedArticleURL and x not in self.queueArticleURL,map(lambda x : x[0],articles))) js = {} js['Name'] = name js['Article'] = articles file_name = '{}.json'.format(name) with open(os.path.join(AFTER_CRAWL_AUTHOR_DIR , file_name), 'w') as outfile: json.dump(js, outfile) self.numberOfCrawlerProfile +=1 print(self.numberOfCrawlerProfile) def parseProfilePage(self, url): # return top 10 article url r = requests.get(url) s = BeautifulSoup(r.text, 'html.parser') articles = s.findAll('a', class_='ga-publication-item') result = [] for article in articles: result.append((self.baseURL + article['href'], self.getArticleIDFromURL(article['href']))) return result
def find_solutions(self, graph_setting_groups): results = {} # check for solutions for a specific set of interaction settings logging.info("Number of interaction settings groups being processed: " + str(len(graph_setting_groups))) for strength, graph_setting_group in sorted( graph_setting_groups.items(), reverse=True): logging.info("processing interaction settings group with " "strength " + str(strength)) logging.info(str(len(graph_setting_group)) + " entries in this group") logging.info("running with " + str(self.number_of_threads) + " threads...") temp_results = [] bar = IncrementalBar('Propagating quantum numbers...', max=len(graph_setting_group)) bar.update() if self.number_of_threads > 1: with Pool(self.number_of_threads) as p: for result in p.imap_unordered( self.propagate_quantum_numbers, graph_setting_group, 1): temp_results.append(result) bar.next() else: for graph_setting_pair in graph_setting_group: temp_results.append(self.propagate_quantum_numbers( graph_setting_pair)) bar.next() bar.finish() logging.info('Finished!') if strength not in results: results[strength] = [] results[strength].extend(temp_results) for k, v in results.items(): logging.info( "number of solutions for strength (" + str(k) + ") after qn propagation: " + str(sum([len(x[0]) for x in v]))) # remove duplicate solutions, which only differ in the interaction qn S results = remove_duplicate_solutions(results, self.filter_remove_qns, self.filter_ignore_qns) node_non_satisfied_rules = [] solutions = [] for result in results.values(): for (tempsolutions, non_satisfied_laws) in result: solutions.extend(tempsolutions) node_non_satisfied_rules.append(non_satisfied_laws) logging.info("total number of found solutions: " + str(len(solutions))) violated_laws = [] if len(solutions) == 0: violated_laws = analyse_solution_failure(node_non_satisfied_rules) logging.info("violated rules: " + str(violated_laws)) # finally perform combinatorics of identical external edges # (initial or final state edges) and prepare graphs for # amplitude generation match_external_edges(solutions) final_solutions = [] for sol in solutions: final_solutions.extend( perform_external_edge_identical_particle_combinatorics(sol) ) return (final_solutions, violated_laws)
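The multithreaded branch above follows a common pattern: feed the work items through Pool.imap_unordered and tick the bar as each result arrives, so the bar advances out of order but accurately. A minimal sketch, with work() standing in for propagate_quantum_numbers:

from multiprocessing import Pool
from progress.bar import IncrementalBar

def work(item):
    return item * item

if __name__ == '__main__':
    items = list(range(100))
    results = []
    bar = IncrementalBar('Propagating quantum numbers...', max=len(items))
    with Pool(4) as pool:
        # Results are yielded as they complete, not in submission order.
        for result in pool.imap_unordered(work, items, 1):
            results.append(result)
            bar.next()
    bar.finish()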
for (w, weight) in m.show_topic(topic[0], topn=50): d = {} d['word'] = w d['doc_page'] = counter d['year'] = years[i] d['word_weight'] = weight d['topic_id'] = topic[0] d['topic_weight'] = topic[1] dl = pd.DataFrame().append(d, ignore_index=True) if os.path.exists(output_path): dl.to_csv(output_path, mode='a', header=False, index=False) else: dl.to_csv(output_path, header=True, index=False) counter = counter + 1 processing.finish() # lda_1960 = pd.read_csv('../scripts/1960_all_lda_words.csv') # lda_1961 = pd.read_csv('../scripts/1961_all_lda_words.csv') # lda_1962 = pd.read_csv('../scripts/1962_all_lda_words.csv') # lda_1963 = pd.read_csv('../scripts/1963_all_lda_words.csv') # lda_1964 = pd.read_csv('../scripts/1964_all_lda_words.csv') # lda_1965 = pd.read_csv('../scripts/1965_all_lda_words.csv') # lda_1966 = pd.read_csv('../scripts/1966_all_lda_words.csv') # print('1960',list(lda_1960.word.unique())) # print('1961',list(lda_1961.word.unique())) # print('1962',list(lda_1962.word.unique())) # print('1963',list(lda_1963.word.unique())) # print('1964',list(lda_1964.word.unique())) # print('1965',list(lda_1965.word.unique()))
def tweets_parecidos_con_distinto_humor(corpus): print("Buscando tweets muy parecidos pero con distinto valor de humor...") parecidos_con_distinto_humor = set() ids_parecidos_con_distinto_humor = cargar_parecidos_con_distinto_humor() if ids_parecidos_con_distinto_humor: corpus_por_id = {tweet.id: tweet for tweet in corpus} for id_tweet_humor, id_tweet_no_humor in ids_parecidos_con_distinto_humor: parecidos_con_distinto_humor.add((corpus_por_id[id_tweet_humor], corpus_por_id[id_tweet_no_humor])) else: subcorpus_cuentas_de_humor = [] subsubcorpus_cuentas_de_humor_humor = [] subsubcorpus_cuentas_de_humor_no_humor = [] for tweet in corpus: if tweet.es_chiste: subcorpus_cuentas_de_humor.append(tweet) if tweet.es_humor: subsubcorpus_cuentas_de_humor_humor.append(tweet) else: subsubcorpus_cuentas_de_humor_no_humor.append(tweet) subsubcorpus_cuentas_de_humor_no_humor_por_largo = defaultdict(list) bar = IncrementalBar("Tokenizando\t\t\t", max=len(subcorpus_cuentas_de_humor), suffix=SUFIJO_PROGRESS_BAR) bar.next(0) for tweet_cuenta_humor in subcorpus_cuentas_de_humor: tweet_cuenta_humor.oraciones = Freeling.procesar_texto(tweet_cuenta_humor.texto_original) tweet_cuenta_humor.tokens = list(itertools.chain(*tweet_cuenta_humor.oraciones)) bar.next() bar.finish() for tweet_no_humor in subsubcorpus_cuentas_de_humor_no_humor: subsubcorpus_cuentas_de_humor_no_humor_por_largo[len(tweet_no_humor.tokens)].append(tweet_no_humor) bar = IncrementalBar("Buscando en tweets\t\t", max=len(subsubcorpus_cuentas_de_humor_humor), suffix=SUFIJO_PROGRESS_BAR) bar.next(0) for tweet_humor in subsubcorpus_cuentas_de_humor_humor: margen = int(round(len(tweet_humor.tokens) / 5)) largo_min = len(tweet_humor.tokens) - margen largo_max = len(tweet_humor.tokens) + margen for largo in range(largo_min, largo_max + 1): for tweet_no_humor in subsubcorpus_cuentas_de_humor_no_humor_por_largo[largo]: if distancia_edicion(tweet_humor.tokens, tweet_no_humor.tokens)\ <= max(len(tweet_humor.tokens), len(tweet_no_humor.tokens)) / 5: parecidos_con_distinto_humor.add((tweet_humor, tweet_no_humor)) print('') print(tweet_humor.id) print(tweet_humor.texto_original) print("------------") print(tweet_no_humor.id) print(tweet_no_humor.texto_original) print("------------") print('') bar.next() bar.finish() guardar_parecidos_con_distinto_humor(parecidos_con_distinto_humor) return parecidos_con_distinto_humor
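distancia_edicion() is not defined in this snippet; given how it is called above (two token lists in, an edit count out, compared against a fifth of the longer length), a standard token-level Levenshtein distance such as this sketch would fit.

def distancia_edicion(tokens_a, tokens_b):
    # Iterative dynamic-programming Levenshtein distance over token sequences.
    previous = list(range(len(tokens_b) + 1))
    for i, token_a in enumerate(tokens_a, start=1):
        current = [i]
        for j, token_b in enumerate(tokens_b, start=1):
            cost = 0 if token_a == token_b else 1
            current.append(min(previous[j] + 1,          # deletion
                               current[j - 1] + 1,       # insertion
                               previous[j - 1] + cost))  # substitution
        previous = current
    return previous[-1]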
def dump_image_point(): data_prefix = '/media/tree/data1/projects/AttentionBased/data' train_output_folder = '/media/tree/backup/projects/AttentionBased/data/train' test_output_folder = '/media/tree/backup/projects/AttentionBased/data/test' image_input_folder = 'image_256_256_12' point_input_folder = 'pointcloud_12/16384' image_output_folder = 'image_256_256_12' image_192_output_folder = 'image_192_256_12' point_output_folder = 'point_16384_12' image_number = 12 with open('/media/tree/backup/projects/AttentionBased/data/train_models.json', 'r') as f: train_models_dict = json.load(f) with open('/media/tree/backup/projects/AttentionBased/data/test_models.json', 'r') as f: test_models_dict = json.load(f) cats = shapenet_category_to_id.values() for cat in cats: print(cat, 'starts at ', time.strftime("%m-%d %H:%M:%S", time.localtime())) print(cat, 'loading train_split!') train_image_models = [] train_point_models = [] train_img_path = [] train_pt_path = [] train_image_models.extend([os.path.join(data_prefix, image_input_folder, model) for model in train_models_dict[cat]]) for each in train_image_models: for index in range(image_number): train_img_path.append(os.path.join(each, '{0:02d}.png'.format(int(index)))) train_point_models.extend([os.path.join(data_prefix, point_input_folder, model) for model in train_models_dict[cat]]) for each in train_point_models: for index in range(image_number): train_pt_path.append(os.path.join(each, '{0:02d}.npy'.format(int(index)))) print(cat, 'train_split loaded!') train_image_save = h5py.File(os.path.join(train_output_folder, image_output_folder, '{}.h5'.format(cat)), mode = 'w') train_image_192_save = h5py.File(os.path.join(train_output_folder, image_192_output_folder, '{}.h5'.format(cat)), mode = 'w') # train_point_save = h5py.File(os.path.join(train_output_folder, point_output_folder, '{}.h5'.format(cat)), mode = 'w') train_img_shape = (len(train_img_path), 256, 256, 3) train_img_192_shape = (len(train_img_path), 192, 256, 3) train_pt_shape = (len(train_pt_path), 16384, 3) train_image_save.create_dataset('image', train_img_shape, np.uint8) train_image_192_save.create_dataset('image', train_img_192_shape, np.uint8) # train_point_save.create_dataset('point', train_pt_shape, np.float32) print(cat, 'saving train data at', time.strftime("%m-%d %H:%M:%S", time.localtime())) train_bar = IncrementalBar(max=len(train_img_path)) for i in range(len(train_img_path)): image_array, point_array , image_192_array = load_data(train_img_path[i], train_pt_path[i]) train_image_save['image'][i, ...] = image_array train_image_192_save['image'][i, ...] = image_192_array # train_point_save['point'][i, ...] 
= point_array train_bar.next() train_bar.finish() print(cat, 'train data saved!') train_image_save.close() train_image_192_save.close() # train_point_save.close() print(cat, 'loading test_split!') test_image_models = [] test_point_models = [] test_img_path = [] test_pt_path = [] test_image_models.extend([os.path.join(data_prefix, image_input_folder, model) for model in test_models_dict[cat]]) for each in test_image_models: for index in range(image_number): test_img_path.append(os.path.join(each, '{0:02d}.png'.format(int(index)))) test_point_models.extend([os.path.join(data_prefix, point_input_folder, model) for model in test_models_dict[cat]]) for each in test_point_models: for index in range(image_number): test_pt_path.append(os.path.join(each, '{0:02d}.npy'.format(int(index)))) print(cat, 'test_split loaded!') test_image_save = h5py.File(os.path.join(test_output_folder, image_output_folder, '{}.h5'.format(cat)), mode = 'w') test_image_192_save = h5py.File(os.path.join(test_output_folder, image_192_output_folder, '{}.h5'.format(cat)), mode = 'w') # test_point_save = h5py.File(os.path.join(test_output_folder, point_output_folder, '{}.h5'.format(cat)), mode = 'w') test_img_shape = (len(test_img_path), 256, 256, 3) test_img_192_shape = (len(test_img_path), 192, 256, 3) test_pt_shape = (len(test_pt_path), 16384, 3) test_image_save.create_dataset('image', test_img_shape, np.uint8) test_image_192_save.create_dataset('image', test_img_192_shape, np.uint8) # test_point_save.create_dataset('point', test_pt_shape, np.float32) print(cat, 'saving test data at ', time.strftime("%m-%d %H:%M:%S", time.localtime())) test_bar = IncrementalBar(max=len(test_img_path)) for i in range(len(test_img_path)): image_array, point_array , image_192_array = load_data(test_img_path[i], test_pt_path[i]) test_image_save['image'][i, ...] = image_array test_image_192_save['image'][i, ...] = image_192_array # test_point_save['point'][i, ...] = point_array test_bar.next() test_bar.finish() print(cat, 'test data saved!') print(cat, 'finished at ', time.strftime("%m-%d %H:%M:%S", time.localtime())) test_image_save.close() test_image_192_save.close()
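A quick way to sanity-check the .h5 files written above is to read the 'image' dataset back. A minimal sketch, with the file path as a placeholder:

import h5py
import numpy as np

def load_images(h5_path):
    # Returns the full dataset as an array of shape (N, H, W, 3), dtype uint8.
    with h5py.File(h5_path, 'r') as f:
        return np.array(f['image'])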