def rename_countries(output_every=262144):
    """ Populate the alternate names of countries """
    # Row layout: [0:alternate_id, 1:geonames_id, 2:lang, 3:name, 4:preferred, 5:short, 6:slang, 7:historic]
    try:
        if not settings.TEST:
            default_path = join(Paths.get_root_dir('files', 'geonames'), 'alternateNames.zip')
        else:
            default_path = join(Paths.get_root_dir('files', 'geonames', 'tests'), 'alternateNames.zip')
        filename = default_path if os.path.exists(default_path) else download_url_resource('http://download.geonames.org/export/dump/alternateNames.zip')
        reader = load_geoname_alternate_table_raw(filename, 'alternateNames')
        LANGUAGES = frozenset(item[0] for item in settings.LANGUAGES)
        COUNTRY_IDS = frozenset(Country.objects.all().values_list('id', flat=True))
        # Get the table size in rows
        rows, affected = ALTERNATES_COUNT, 0
        # Process the file immediately
        CountryName.objects.all().delete()
        for idx, row in enumerate(reader, start=1):
            # Keep only rows in a configured language that are neither slang nor historic
            if row[2] and row[1] and not (row[6] or row[7]) and row[2] in LANGUAGES:
                geonames_id, alternate_id = int(row[1]), int(row[0])
                if geonames_id in COUNTRY_IDS:
                    CountryName.objects.create(id=alternate_id, country_id=geonames_id, language=row[2], name=row[3],
                                               preferred=(row[4] == '1'), short=(row[5] == '1'))
                    affected += 1
            if idx % output_every == 0 or idx == rows:
                output_progress("Renaming: {pc:>5.1f}% ({idx:>10}/{rows:>10}, {affected:>10} updated)", idx, rows, output_every, {'affected': affected})
        sys.stdout.write("\n")
        reader.close()
        return True
    except Exception:
        return False
def rename_cities(output_every=262144):
    """ Populate the alternate names of cities """
    # Row layout: [0:alternate_id, 1:geonames_id, 2:lang, 3:name, 4:preferred, 5:short, 6:slang, 7:historic]
    if not settings.DEBUG:
        try:
            default_path = join(Paths.get_root_dir('files', 'geonames'), 'alternateNames.zip')
            filename = default_path if os.path.exists(default_path) else download_url_resource('http://download.geonames.org/export/dump/alternateNames.zip')
            reader = load_geoname_alternate_table_raw(filename, 'alternateNames')
            languages = frozenset([language[0] for language in settings.LANGUAGES] + ['post', ''])
            affected, existing = 0, frozenset(City.objects.values_list('id', flat=True))
            citynames = []
            appender = citynames.append
            # Process the file immediately
            CityName.objects.all().delete()
            for idx, row in enumerate(reader, start=1):
                if (row[2] in languages and row[1]) and not (row[6] or row[7]):
                    if int(row[1]) in existing:
                        ascii = unidecode(row[3].decode('utf-8') if not isinstance(row[3], str) else row[3]).lower()
                        cityname = CityName(id=int(row[0]), city_id=int(row[1]), language=row[2], name=row[3], ascii=ascii,
                                            preferred=(row[4] == '1'), short=(row[5] == '1'))
                        appender(cityname)
                        affected += 1
                if idx % output_every == 0 or idx == ALTERNATES_COUNT:
                    output_progress("Renaming: {pc:>5.1f}% ({idx:>10}/{rows:>10}, {affected:>10} updated)", idx, ALTERNATES_COUNT, output_every, {'affected': affected})
            CityName.objects.bulk_create(citynames, batch_size=settings.BATCH_SIZE)
            return True
        except Exception:
            print_exc()
            return False
    return False
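# The two functions above keep only alternate names that are in a configured language,
# attached to a known country/city, and flagged neither as slang nor as historic.
# A minimal, standalone sketch of that predicate on hypothetical rows (the sample
# values below are invented for illustration; the real file has millions of rows):
def _is_usable_alternate(row, languages):
    """ Return True when an alternateNames row should be kept (sketch). """
    # row layout: [alternate_id, geonames_id, lang, name, preferred, short, slang, historic]
    return bool(row[1]) and row[2] in languages and not (row[6] or row[7])

# Example: a French alternate name is kept, a historic one is dropped
assert _is_usable_alternate(['123', '2988507', 'fr', 'Paris', '1', '', '', ''], {'fr', 'en'})
assert not _is_usable_alternate(['124', '2988507', 'fr', 'Lutèce', '', '', '', '1'], {'fr', 'en'})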
def get_corpus(self):
    """ Read and populate the corpus """
    if self.corpus is None:
        self.corpus = Dictionary()
        self.corpus.updated = time.time()
        try:
            directory = Paths.get_root_dir(*CORPUS_PATH)
            infile = '{name}.csv'.format(name=self.pathname)
            path = join(directory, '{name}.csv.zip'.format(name=self.pathname))
            # Read the CSV stored inside the zip file
            with ZipFile(open(path, 'rb')) as zipfile:
                buffer = StringIO(zipfile.read(infile))
                reader = csv.reader(buffer)
                for row in reader:
                    # 0: category, 1: document, 2: hash
                    self.corpus[row[2]] = (row[0], row[1])
        except IOError:
            pass
    if self.corpus_shadow is None or self.corpus_shadow.updated < self.corpus.updated:
        self.corpus_shadow = List(self.corpus.values())
        self.corpus_shadow.updated = time.time()
        self.classifier = MaxEntClassifier(self.corpus_shadow, feature_extractor=extractor_base)  # or NaiveBayesClassifier
    return self.corpus_shadow
def exports(self, name=None, debug=False):
    """
    Export the data to a file

    :param name: name of the output file
    :param debug: also write a human-readable output file
    """
    # Build the output data
    start = time.time()
    data = [self.get_object_export_data(item) for item in self.get_export_list() if item is not None]
    elapsed = time.time() - start
    # Write the file
    path = join(Paths.get_root_dir('files', 'legacy'), '{}.pickle'.format(name or self.name))
    with open(path, 'wb') as f:
        pickle.dump(data, f, protocol=3)
    # Optionally write a second, pretty-printed file
    if debug:
        with io.open("{}.txt".format(path), "w", encoding="utf-8") as f:
            pp = pprint.PrettyPrinter(stream=f)
            output = pp.pformat(data)
            f.write(str(output))
    # Summary of the operation
    print("- {exporter} successfully exported {count} items in {elapsed:.03f}s".format(exporter=type(self).__name__, count=len(data), elapsed=elapsed))
def get_data(self, name=None):
    """ Return the data from the exported file """
    path = join(Paths.get_root_dir('files', 'legacy'), '{}.pickle'.format(name or self.name))
    if not self.__class__.data_bits:
        with io.open(path, 'rb') as f:
            self.__class__.data_bits = pickle.load(f)
    return self.__class__.data_bits
def dump(queryset):
    """
    Log accesses to a CSV file

    :param queryset: queryset to dump to the CSV file
    """
    if queryset.model == Access:
        fmt = timezone.now().strftime
        filename_info = {'year': fmt("%Y"), 'month': fmt("%m"), 'week': fmt("%W"), 'day': fmt("%d"),
                         'hour': fmt("%H"), 'minute': fmt("%M"), 'second': fmt("%S"), 'rows': queryset.count()}
        path = join(Paths.get_root_dir('isolated', 'var', 'log'),
                    "access-log-{year}-{month}-{day}-{hour}-{minute}-{rows}.csv".format(**filename_info))
        return csv_dump(queryset, path, compress=True)
    return False
def handle(self, *args, **options):
    """ Run the command """
    project_paths = getattr(settings, 'MAKEMESSAGES_DIRS', [Paths.get_root_dir()])
    for project_path in project_paths:
        print("Update locales for project at {path}".format(path=project_path))
        parsable_paths = []
        for root, dirs, _ in os.walk(project_path, topdown=True):
            for directory in dirs:
                new_directory = os.path.join(root, directory)
                parsable_paths.append(new_directory)
        for directory in settings.TEMPLATES[0]['DIRS']:
            parsable_paths.append(directory)
        for subdir in parsable_paths:
            os.chdir(subdir)
            if 'locale' in os.listdir(subdir):
                print("Updating locale messages for {dir}".format(dir=subdir))
                call_command('makemessages', *args, **options)
    print("Finished updating locale messages.")
def dump(self, queryset):
    """ Dump the data to a CSV file """
    if queryset.model == Record:
        fmt = timezone.now().strftime
        filename_info = {'year': fmt("%Y"), 'month': fmt("%m"), 'week': fmt("%W"), 'day': fmt("%d"),
                         'hour': fmt("%H"), 'minute': fmt("%M"), 'second': fmt("%S"), 'rows': queryset.count()}
        path = join(Paths.get_root_dir('isolated', 'var', 'log'),
                    "record-log-{year}-{month}-{day}-{hour}-{minute}-{rows}.csv.gz".format(**filename_info))
        csv_dump(queryset, path, compress=True)
def detect_features(self, name, cascades=None):
    """
    Record the 'features' found in the image

    Stores in the image's 'data' attribute the coordinates of the searched
    object types, as detected by OpenCV. Typically used to detect faces.

    :param name: name of the feature to detect. If no matching Haar cascade file exists, nothing is searched.
    :param cascades: list of cascade names, e.g. 'face_front', 'face_profile'
    """
    if 'features' not in self.data:
        self.data['features'] = dict()
    self.data['features'][name] = list()
    image = cv2.cvtColor(cv2.imread(self.image.path), cv2.COLOR_BGR2GRAY)  # grayscale image
    for cascade in (cascades or []):
        cascade_file = join(Paths.get_root_dir('isolated', 'database', 'opencv', 'haar'), 'haarcascade_{0}.xml'.format(cascade))
        classifier = cv2.CascadeClassifier(cascade_file)  # build a classifier from the cascade data
        features = classifier.detectMultiScale(image, scaleFactor=1.1, minNeighbors=3, minSize=(32, 32), maxSize=(2048, 2048))
        rectangles = [(x, y, x + w, y + h) for (x, y, w, h) in features]
        self.data['features'][name] += rectangles
    self.save(update_fields=['data'])
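# The rectangles stored above are (x1, y1, x2, y2) tuples in pixel coordinates.
# A minimal, hypothetical sketch of how they could be visualised with OpenCV
# (the 'picture' argument and output path are assumptions, not part of the model API):
import cv2

def draw_detected_features(picture, name, output_path='/tmp/features.png'):
    """ Draw the stored feature rectangles onto a copy of the image (sketch). """
    image = cv2.imread(picture.image.path)
    for (x1, y1, x2, y2) in picture.data.get('features', {}).get(name, []):
        cv2.rectangle(image, (x1, y1), (x2, y2), (0, 255, 0), 2)  # green 2px box
    cv2.imwrite(output_path, image)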
def save(self):
    """
    Save the corpus to disk

    :rtype: bool
    :returns: True if the save succeeded, False otherwise
    """
    directory = Paths.get_root_dir(*CORPUS_PATH)
    infile = '{name}.csv'.format(name=self.pathname)
    path = join(directory, '{name}.csv.zip'.format(name=self.pathname))
    # Write the CSV inside the zip file
    try:
        with ZipFile(path, 'w', ZIP_DEFLATED) as zipfile:
            buffer = StringIO()
            writer = csv.writer(buffer, delimiter=",", encoding='utf-8')
            for row in self.corpus_shadow:
                writer.writerow(row)
            zipfile.writestr(infile, buffer.getvalue())
        return True
    except IOError:
        return False
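# save() and get_corpus() above share one storage convention: a single CSV member
# embedded in a zip archive. A self-contained sketch of that round trip using only
# the standard library (the file names below are illustrative, not the real paths):
import csv
import io
import zipfile

def corpus_roundtrip_demo(rows, path='/tmp/demo.csv.zip', member='demo.csv'):
    """ Write rows as a CSV inside a zip archive, then read them back (sketch). """
    buffer = io.StringIO()
    csv.writer(buffer).writerows(rows)
    with zipfile.ZipFile(path, 'w', zipfile.ZIP_DEFLATED) as archive:
        archive.writestr(member, buffer.getvalue())
    with zipfile.ZipFile(path) as archive:
        text = archive.read(member).decode('utf-8')
    return list(csv.reader(io.StringIO(text)))

# Example: a (category, document, hash) row comes back unchanged
assert corpus_roundtrip_demo([['spam', 'doc text', 'hash']]) == [['spam', 'doc text', 'hash']]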
def project_path(*args):
    """ Return the path of the project directory """
    return Paths.get_root_dir(*args)
# coding: utf-8
from django.conf import settings
from django.test.runner import DiscoverRunner

from scoop.core.util.stream.directory import Paths


class CeleryTestSuiteRunner(DiscoverRunner):
    """ Test runner configured to run Celery tasks immediately """

    def setup_test_environment(self, **kwargs):
        super(CeleryTestSuiteRunner, self).setup_test_environment(**kwargs)
        # Bypass the Celery broker and run tasks eagerly, in-process
        settings.CELERY_ALWAYS_EAGER = True


# Settings overrides for the tests
TEST_CONFIGURATION = {
    'EMAIL_BACKEND': 'django.core.mail.backends.filebased.EmailBackend',
    'EMAIL_FILE_PATH': Paths.get_root_dir('files', 'tests', 'mail'),
    'DEFAULT_FROM_EMAIL': '*****@*****.**',
    'MESSAGING_DEFAULT_THREAD_QUOTA': 32,
}
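# To activate this runner, the test settings point Django's TEST_RUNNER at it, e.g.:
# (the dotted path below is an assumption about where this module lives in the project)
TEST_RUNNER = 'scoop.core.util.django.testing.CeleryTestSuiteRunner'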
def populate_cities(country, output_every=8192):
    r"""
    Populate the database with administrative subdivisions

    These subdivisions are: ^ADM\d$ and ^PPL.{0,2}$
    ADM: administrative area: region, department, etc. down to the commune
    PPL: city or named populated place (contains a lot of spam)
    """
    # Row layout: [0:id, 1:name, 2:ascii, 3:altname, 4:lat, 5:lon, 6:f, 7:type, 8:country, 9:c1,
    #              10:a1, 11:a2, 12:a3, 13:a4, 14:population, 15:elevation, 16:gtopo, 17:tz, 18:updated]
    if not settings.DEBUG:
        try:
            used_features = {'ADM', 'PPL'}
            unused_features = {'PCLH', 'PCLI', 'PCLIX', 'PCLS', 'ADM1H', 'ADM2H', 'ADM3H', 'ADM4H', 'PPLCH', 'PPLF', 'PPLH', 'PPLQ', 'PPLR', 'PPLW'}
            country_name = country.get_name()
            # Fill a dictionary with the list of rows
            default_path = join(Paths.get_root_dir('files', 'geonames'), '{country}.zip'.format(country=country.code2.upper()))
            if os.path.exists(default_path):
                filename = default_path
            else:
                filename = download_url_resource('http://download.geonames.org/export/dump/{country}.zip'.format(country=country.code2.upper()),
                                                 '{path}/geonames-country-{country}.zip'.format(path=tempfile.gettempdir(), country=country.code2.upper()))
            reader, table = load_geoname_table_raw(filename, unidecode(country.code2)), dict()
            for row in reader:
                if len(row) in {18, 19} and row[7][:3] in used_features and row[7] not in unused_features:
                    table[int(row[0])] = row
            # Get the table size in rows
            timezones = Timezone.get_dict()
            rows, updated_count = len(table), 0
            # Update the country's city table
            if country.has_entries():
                db_ids = frozenset(City.objects.filter(country=country).values_list('id', flat=True))
                # Delete from the database the items no longer present in the new table (e.g. a PPL that became a MNT)
                table_ids = frozenset(table.keys())
                removed_ids = db_ids.difference(table_ids)
                City.objects.filter(id__in=removed_ids).delete()
                sys.stdout.write("{} items were removed from the database.\n".format(len(removed_ids)))
                sys.stdout.flush()
                # Process the rest
                for idx, row in enumerate(table.values(), start=1):
                    geoid = int(row[0])
                    updateable = datetime.datetime.strptime(row[18], '%Y-%m-%d').replace(tzinfo=pytz.utc) >= country.updated
                    if updateable or geoid not in db_ids:
                        # The acode is a hash of all the A1, A2, A3 and A4 codes. 'AAAA' is the hash of the empty string (or 0)
                        # Note: in base64, 4 characters encode 24 bits
                        acode = ''.join([base64.b64encode((hash(code) & 0xffffff).to_bytes(3, 'big')).decode('ascii') for code in row[10:14]])
                        latitude, longitude = float(row[4]), float(row[5])
                        city = City(id=geoid, level=0, country=country, timezone=timezones[row[17]], name=row[1], ascii=row[2].lower(), acode=acode,
                                    type=row[7], feature=row[6], city=(row[6] == 'P'), population=int(row[14]), position=Point(longitude, latitude))
                        city.save()
                        updated_count += int(updateable)
                    if idx % output_every == 0 or idx == rows - 1:
                        output_progress("Updating {country:>15}: {pc:>5.1f}% ({idx:>10}/{rows:>10}, {updated:>10} updated)", idx, rows, output_every,
                                        {'country': country_name, 'updated': updated_count})
            # Populate the list of cities when none exists yet for the country
            else:
                bulk = list()
                append = bulk.append
                for idx, row in enumerate(table.values(), start=1):
                    latitude, longitude = float(row[4]), float(row[5])
                    acode = ''.join([base64.b64encode((hash(code) & 0xffffff).to_bytes(3, 'big')).decode('ascii') for code in row[10:14]])
                    city = City(id=int(row[0]), level=0, country=country, timezone=timezones[row[17]], name=row[1], ascii=row[2], acode=acode,
                                type=row[7], feature=row[6], city=(row[6] == 'P'), population=int(row[14]), position=Point(longitude, latitude, srid=4326))
                    append(city)
                    if idx % output_every == 0 or idx == rows - 1:
                        output_progress("Filling {country:>15}: {pc:>5.1f}% ({idx:>10}/{rows:>10})", idx, rows, output_every, {'country': country_name})
                City.objects.bulk_create(bulk, batch_size=settings.BATCH_SIZE)
            # City subdivisions are then marked as not being cities themselves
            City.objects.filter(type='PPLX').update(city=False)
            country.update(updated=timezone.now(), public=True, save=True)
            return True
        except Exception:
            traceback.print_exc()
            return False
    else:
        print("Operation not launched, disable DEBUG first.")
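# The acode built above packs four 24-bit hashes (one per admin code A1..A4) into a
# 16-character base64 string. A standalone sketch of that encoding; note that it relies
# on Python's built-in hash(), which is randomised across processes for non-empty
# strings unless PYTHONHASHSEED is fixed (the helper below is illustrative only):
import base64

def encode_acode(admin_codes):
    """ Encode four admin codes into a 16-character base64 string (sketch). """
    parts = []
    for code in admin_codes:
        value = hash(code) & 0xffffff  # keep the low 24 bits of the hash
        parts.append(base64.b64encode(value.to_bytes(3, 'big')).decode('ascii'))  # 3 bytes -> 4 base64 chars
    return ''.join(parts)

# Example: empty codes hash to 0, which encodes as 'AAAA'
assert encode_acode(['', '', '', '']) == 'AAAAAAAAAAAAAAAA'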
# coding: utf-8
import re
from os import listdir
from os.path import isfile, join

import requests
from requests.exceptions import HTTPError

from django.core.cache import cache
from scoop.core.util.stream.directory import Paths

PROXY_LIST_DIRECTORY = Paths.get_root_dir('isolated', 'database', 'rogue', 'proxies')


def get_tor_nodes(path='http://torstatus.blutmagie.de/ip_list_exit.php/Tor_ip_list_EXIT.csv'):
    """
    Load the list of TOR exit nodes, or return the cached one

    The list URL may change over time:
    - 30.01.2013: https://www.dan.me.uk/torlist/ (not reachable via urllib)
    - 07.06.2014: http://torstatus.blutmagie.de/ip_list_exit.php/Tor_ip_list_EXIT.csv
    """
    # Return the cached list if it exists
    cached = cache.get('rogue.torlist', None)
    if cached is not None:
        return cached
    # Otherwise, cache the remote file
    try:
        data = requests.get(path).text
        results = frozenset([row.strip() for row in data.split('\n') if row.strip()])
        cache.set('rogue.torlist', results, timeout=86400 * 3)
        return results
    except HTTPError:
        # Assumption: on download failure, fall back to an empty set
        return frozenset()