Example #1
def rename_countries(output_every=262144):
    """ Peupler les noms alternatifs des pays """
    # [0:alternate_id, 1:geonames_id, 2:lang, 3:name, 4:preferred, 5:short, 6:slang, 7:historic]
    try:
        if not settings.TEST:
            default_path = join(Paths.get_root_dir('files', 'geonames'), 'alternateNames.zip')
        else:
            default_path = join(Paths.get_root_dir('files', 'geonames', 'tests'), 'alternateNames.zip')
        filename = default_path if os.path.exists(default_path) else download_url_resource('http://download.geonames.org/export/dump/alternateNames.zip')
        reader = load_geoname_alternate_table_raw(filename, 'alternateNames')
        LANGUAGES = frozenset([item[0] for item in settings.LANGUAGES])
        COUNTRY_IDS = frozenset(Country.objects.all().values_list('id', flat=True))
        # Get the table size in rows
        rows, affected = ALTERNATES_COUNT, 0
        # Process the file immediately
        CountryName.objects.all().delete()
        for idx, row in enumerate(reader, start=1):
            if row[2] and row[1] and not (row[6] or row[7]) and row[2] in LANGUAGES:
                geonames_id, alternate_id = int(row[1]), int(row[0])
                if geonames_id in COUNTRY_IDS:
                    CountryName.objects.create(id=alternate_id, country_id=geonames_id, language=row[2], name=row[3], preferred=row[4] == '1',
                                               short=row[5] == '1')
                    affected += 1
            if idx % output_every == 0 or idx == rows:
                output_progress("Renaming: {pc:>5.1f}% ({idx:>10}/{rows:>10}, {affected:>10} updated)", idx, rows, output_every, {'affected': affected})
        sys.stdout.write("\n")
        reader.close()
        return True
    except Exception:
        return False
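A design note on the example above: it saves one row at a time with objects.create(), which issues one INSERT per match, while Example #2 below accumulates instances and flushes them with bulk_create. A minimal sketch of the bulk variant, reusing the CountryName fields from this listing (matching_rows is a hypothetical, pre-filtered iterable of parsed rows):

# Sketch only: bulk-insert variant of the loop above.
names = [CountryName(id=int(r[0]), country_id=int(r[1]), language=r[2], name=r[3],
                     preferred=r[4] == '1', short=r[5] == '1')
         for r in matching_rows]  # matching_rows: hypothetical, rows already filtered as above
CountryName.objects.bulk_create(names, batch_size=settings.BATCH_SIZE)  # one INSERT per batch instead of per row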
Example #2
def rename_cities(output_every=262144):
    """ Peupler les noms alternatifs des villes """
    # [0:altid,1:geoid,2:lang,3:name,4:preferred,5:short,6:slang,7:historic]
    if not settings.DEBUG:
        try:
            default_path = join(Paths.get_root_dir('files', 'geonames'), 'alternateNames.zip')
            filename = default_path if os.path.exists(default_path) else download_url_resource('http://download.geonames.org/export/dump/alternateNames.zip')
            reader = load_geoname_alternate_table_raw(filename, 'alternateNames')
            languages = frozenset([language[0] for language in settings.LANGUAGES] + ['post', ''])
            affected, existing = 0, frozenset(City.objects.values_list('id', flat=True))
            citynames = []
            appender = citynames.append
            # Process the file immediately
            CityName.objects.all().delete()
            for idx, row in enumerate(reader, start=1):
                if (row[2] in languages and row[1]) and not (row[6] or row[7]):
                    if int(row[1]) in existing:
                        ascii = unidecode(row[3].decode('utf-8') if isinstance(row[3], bytes) else row[3]).lower()
                        cityname = CityName(id=int(row[0]), city_id=int(row[1]), language=row[2], name=row[3], ascii=ascii, preferred=(row[4] == '1'),
                                            short=(row[5] == '1'))
                        appender(cityname)
                        affected += 1
                if idx % output_every == 0 or idx == ALTERNATES_COUNT:
                    output_progress("Renaming: {pc:>5.1f}% ({idx:>10}/{rows:>10}, {affected:>10} updated)", idx, ALTERNATES_COUNT, output_every,
                                    {'affected': affected})
            CityName.objects.bulk_create(citynames, batch_size=settings.BATCH_SIZE)
            return True
        except Exception:
            print_exc()
            return False
    return False
Example #3
File: file.py Project: artscoop/scoop
 def get_corpus(self):
     """ Lire et peupler le corpus """
     if self.corpus is None:
         self.corpus = Dictionary()
         self.corpus.updated = time.time()
         try:
             directory = Paths.get_root_dir(*CORPUS_PATH)
             infile = '{name}.csv'.format(name=self.pathname)
             path = join(directory,
                         '{name}.csv.zip'.format(name=self.pathname))
             # Read the CSV from inside the zip file
             with ZipFile(path) as zipfile:
                 buffer = StringIO(zipfile.read(infile).decode('utf-8'))  # read() returns bytes; csv needs text
                 reader = csv.reader(buffer)
                 for row in reader:
                     # 0: category, 1: doc, 2: hash
                     self.corpus[row[2]] = (row[0], row[1])
         except IOError:
             pass
     if self.corpus_shadow is None or self.corpus_shadow.updated < self.corpus.updated:
         self.corpus_shadow = List(self.corpus.values())
         self.corpus_shadow.updated = time.time()
         self.classifier = MaxEntClassifier(
             self.corpus_shadow,
             feature_extractor=extractor_base)  # or NaiveBayesClassifier
     return self.corpus_shadow
Example #4
    def exports(self, name=None, debug=False):
        """
        Export the data to a file

        :param name: name of the output file
        :param debug: also write a human-readable output file
        """
        # Build the output data
        start = time.time()
        data = [
            self.get_object_export_data(item)
            for item in self.get_export_list() if item is not None
        ]
        elapsed = time.time() - start
        # Write the file
        path = join(Paths.get_root_dir('files', 'legacy'),
                    '{}.pickle'.format(name or self.name))
        with open(path, 'wb') as f:
            pickle.dump(data, f, protocol=3)
        # Also write a pretty-printed file
        if debug:
            with io.open("{}.txt".format(path), "w", encoding="utf-8") as f:
                pp = pprint.PrettyPrinter(stream=f)
                output = pp.pformat(data)
                f.write(str(output))
        # Summary of the operation
        print(
            "- {exporter} successfully exported {count} items in {elapsed:.03f}s"
            .format(exporter=type(self).__name__,
                    count=len(data),
                    elapsed=elapsed))
Example #5
 def get_data(self, name=None):
     """ Renvoyer les données du fichier exporté """
     path = join(Paths.get_root_dir('files', 'legacy'),
                 '{}.pickle'.format(name or self.name))
     if not self.__class__.data_bits:
         with io.open(path, 'rb') as f:
             self.__class__.data_bits = pickle.load(f)
     return self.__class__.data_bits
Example #6
    def dump(queryset):
        """
        Log accesses to a CSV file

        :param queryset: queryset to write to the CSV file
        """
        if queryset.model == Access:
            fmt = timezone.now().strftime
            filename_info = {'year': fmt("%Y"), 'month': fmt("%m"), 'week': fmt("%W"), 'day': fmt("%d"), 'hour': fmt("%H"), 'minute': fmt("%M"),
                             'second': fmt("%S"), 'rows': queryset.count()}
            path = join(Paths.get_root_dir('isolated', 'var', 'log'), "access-log-{year}-{month}-{day}-{hour}-{minute}-{rows}.csv".format(**filename_info))
            return csv_dump(queryset, path, compress=True)
        return False
Example #7
 def handle(self, *args, **options):
     """ Exécuter la commande """
     project_paths = getattr(settings, 'MAKEMESSAGES_DIRS', [Paths.get_root_dir()])
     for project_path in project_paths:
         print("Update locales for project at {path}".format(path=project_path))
         parsable_paths = []
         for root, dirs, _ in os.walk(project_path, topdown=True):
             for directory in dirs:
                 new_directory = os.path.join(root, directory)
                 parsable_paths.append(new_directory)
         for directory in settings.TEMPLATES[0]['DIRS']:
             parsable_paths.append(directory)
         for subdir in parsable_paths:
             os.chdir(subdir)
             if 'locale' in os.listdir(subdir):
                 print("Updating locale messages for {dir}".format(dir=subdir))
                 call_command('makemessages', *args, **options)
     print("Finished updating locale messages.")
Example #8
File: recorder.py Project: artscoop/scoop
 def dump(self, queryset):
     """ Consigner les données dans un fichier CSV """
     if queryset.model == Record:
         fmt = timezone.now().strftime
         filename_info = {
             'year': fmt("%Y"),
             'month': fmt("%m"),
             'week': fmt("%W"),
             'day': fmt("%d"),
             'hour': fmt("%H"),
             'minute': fmt("%M"),
             'second': fmt("%S"),
             'rows': queryset.count()
         }
         path = join(
             Paths.get_root_dir('isolated', 'var', 'log'),
             "record-log-{year}-{month}-{day}-{hour}-{minute}-{rows}.csv.gz"
             .format(**filename_info))
         csv_dump(queryset, path, compress=True)
Example #9
    def detect_features(self, name, cascades=None):
        """
        Record the detected 'features' in the image

        Writes into the image's 'data' the coordinates of the
        searched object types, via OpenCV.
        Typically, faces can be detected with this method.
        :param name: Name of the feature to look up. If no Haar cascade file exists, nothing is searched.
        :param cascades: List of cascade names, e.g. 'face_front', 'face_profile'
        """
        if 'features' not in self.data:
            self.data['features'] = dict()
        self.data['features'][name] = list()
        image = cv2.cvtColor(cv2.imread(self.image.path), cv2.COLOR_BGR2GRAY)  # Grayscale image
        for cascade in cascades or ():  # guard against the default cascades=None
            cascade_file = join(Paths.get_root_dir('isolated', 'database', 'opencv', 'haar'), 'haarcascade_{0}.xml'.format(cascade))
            classifier = cv2.CascadeClassifier(cascade_file)  # Créer un classifieur avec les données de cascade
            features = classifier.detectMultiScale(image, scaleFactor=1.1, minNeighbors=3, minSize=(32, 32), maxSize=(2048, 2048))
            rectangles = [(x, y, x + w, y + h) for (x, y, w, h) in features]
            self.data['features'][name] += rectangles
        self.save(update_fields=['data'])
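A hedged usage sketch for detect_features: the cascade names come from the docstring above, and picture is assumed to be a saved instance of the model owning this method.

# Hypothetical call; stores (x1, y1, x2, y2) rectangles under picture.data['features']['face']
picture.detect_features('face', cascades=['face_front', 'face_profile'])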
Example #10
File: file.py Project: artscoop/scoop
    def save(self):
        """
        Save the corpus to disk

        :rtype: bool
        :returns: True if the save took place, False otherwise
        """
        directory = Paths.get_root_dir(*CORPUS_PATH)
        infile = '{name}.csv'.format(name=self.pathname)
        path = join(directory, '{name}.csv.zip'.format(name=self.pathname))
        # Write the CSV into the zip file
        try:
            with ZipFile(path, 'w', ZIP_DEFLATED) as zipfile:
                buffer = StringIO()
                writer = csv.writer(buffer, delimiter=",")  # stdlib csv writes text; it takes no encoding argument
                for row in self.corpus_shadow:
                    writer.writerow(row)
                zipfile.writestr(infile, buffer.getvalue())
            return True
        except IOError:
            return False
Example #11
def project_path(*args):
    """ Renvoyer le chemin du répertoire du projet """
    return Paths.get_root_dir(*args)
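A minimal usage sketch, assuming (consistently with the other examples) that each argument is a path segment joined under the project root:

# Assumed behaviour: project_path('a', 'b') -> '<project root>/a/b'
log_directory = project_path('isolated', 'var', 'log')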
Example #12
File: testing.py Project: artscoop/scoop
# coding: utf-8
from django.conf import settings
from django.test.runner import DiscoverRunner
from scoop.core.util.stream.directory import Paths


class CeleryTestSuiteRunner(DiscoverRunner):
    """ Test runner configuré pour exécuter les tâches Celery immédiatement """
    def setup_test_environment(self, **kwargs):
        super(CeleryTestSuiteRunner, self).setup_test_environment(**kwargs)
        # Disable communication with celery, run tasks directly
        settings.CELERY_ALWAYS_EAGER = True


# Settings configuration for tests
TEST_CONFIGURATION = {
    'EMAIL_BACKEND': 'django.core.mail.backends.filebased.EmailBackend',
    'EMAIL_FILE_PATH': Paths.get_root_dir('files', 'tests', 'mail'),
    'DEFAULT_FROM_EMAIL': '*****@*****.**',
    'MESSAGING_DEFAULT_THREAD_QUOTA': 32,
}
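A sketch of how TEST_CONFIGURATION might be consumed, using Django's standard override_settings decorator (whether the project actually wires it this way is an assumption):

from django.test import TestCase, override_settings

@override_settings(**TEST_CONFIGURATION)
class MailBackendTest(TestCase):
    """ Runs with the file-based email backend declared above """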
Example #13
def populate_cities(country, output_every=8192):
    """
    Populate the database with administrative subdivisions

    These subdivisions are: ^ADM\d$ and ^PPL.{0,2}$
    ADM: administrative area: region, department, etc., down to the commune level
    PPL: city or named populated place (contains a lot of spam)
    """
    # [0:id,1:name,2:ascii,3:altname,4:lat,5:lon,6:f,7:type,8:country,9:c1,10:a1,11:a2,12:a3,13:a4,14:population,15:elevation,16:gtopo,17:tz,18:updated]
    if not settings.DEBUG:
        try:
            used_features = {'ADM', 'PPL'}
            unused_features = {'PCLH', 'PCLI', 'PCLIX', 'PCLS', 'ADM1H', 'ADM2H', 'ADM3H', 'ADM4H', 'PPLCH', 'PPLF', 'PPLH', 'PPLQ', 'PPLR', 'PPLW'}
            country_name = country.get_name()
            # Fill a dictionary with the list of rows
            default_path = join(Paths.get_root_dir('files', 'geonames'), '{country}.zip'.format(country=country.code2.upper()))
            if os.path.exists(default_path):
                filename = default_path
            else:
                filename = download_url_resource('http://download.geonames.org/export/dump/{country}.zip'.format(country=country.code2.upper()),
                                                 '{path}/geonames-country-{country}.zip'.format(path=tempfile.gettempdir(), country=country.code2.upper()))
            reader, table = load_geoname_table_raw(filename, unidecode(country.code2)), dict()
            for row in reader:
                if len(row) in {18, 19} and row[7][:3] in used_features and row[7] not in unused_features:
                    table[int(row[0])] = row
            # Get the table size in rows
            timezones = Timezone.get_dict()
            rows, updated_count = len(table), 0
            # Update the country's city table
            if country.has_entries():
                db_ids = frozenset(City.objects.filter(country=country).values_list('id', flat=True))
                # Remove from the database the items that are no longer in the new table (e.g. a PPL that became a MNT)
                table_ids = frozenset(table.keys())
                removed_ids = db_ids.difference(table_ids)
                City.objects.filter(id__in=removed_ids).delete()
                sys.stdout.write("{} items were removed from the database.\n".format(len(removed_ids)))
                sys.stdout.flush()
                # Process the rest
                for idx, row in enumerate(table.values(), start=1):
                    geoid = int(row[0])
                    updateable = datetime.datetime.strptime(row[18], '%Y-%m-%d').replace(tzinfo=pytz.utc) >= country.updated
                    if updateable or geoid not in db_ids:
                        # The acode is a hash of the A1, A2, A3 and A4 codes. 'AAAA' is the hash of the empty string (or 0)
                        # Note: in base64, 4 characters encode 24 bits
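                        # Worked example: in CPython hash('') == 0, (0).to_bytes(3, 'big') == b'\x00\x00\x00',
                        # and base64.b64encode(b'\x00\x00\x00') == b'AAAA', hence the 'AAAA' noted above.
                        # Caveat: hash() of non-empty strings is randomized per process unless PYTHONHASHSEED is fixed.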
                        acode = ''.join([base64.b64encode((hash(code) & 0xffffff).to_bytes(3, 'big')).decode('ascii') for code in row[10:14]])
                        latitude, longitude = float(row[4]), float(row[5])
                        city = City(id=geoid, level=0, country=country, timezone=timezones[row[17]], name=row[1], ascii=row[2].lower(), acode=acode,
                                    type=row[7], feature=row[6], city=(row[6] == 'P'), population=int(row[14]), position=Point(longitude, latitude))
                        city.save()
                        updated_count += int(updateable)
                    if idx % output_every == 0 or idx == rows:  # enumerate starts at 1, so the last idx equals rows
                        output_progress("Updating {country:>15}: {pc:>5.1f}% ({idx:>10}/{rows:>10}, {updated:>10} updated)", idx, rows, output_every,
                                        {'country': country_name, 'updated': updated_count})
            # Populate the city list if none exists for the country
            else:
                bulk = list()
                append = bulk.append
                for idx, row in enumerate(table.values(), start=1):
                    latitude, longitude = float(row[4]), float(row[5])
                    acode = ''.join([base64.b64encode((hash(code) & 0xffffff).to_bytes(3, 'big')).decode('ascii') for code in row[10:14]])
                    city = City(id=int(row[0]), level=0, country=country, timezone=timezones[row[17]], name=row[1], ascii=row[2], acode=acode, type=row[7],
                                feature=row[6], city=row[6] == 'P', population=int(row[14]), position=Point(longitude, latitude, srid=4326))
                    append(city)
                    if idx % output_every == 0 or idx == rows:
                        output_progress("Filling {country:>15}: {pc:>5.1f}% ({idx:>10}/{rows:>10})", idx, rows, output_every, {'country': country_name})
                City.objects.bulk_create(bulk, batch_size=settings.BATCH_SIZE)
            # City subsections (PPLX) are then marked as non-cities
            City.objects.filter(type='PPLX').update(city=False)
            country.update(updated=timezone.now(), public=True, save=True)
            return True
        except Exception:
            traceback.print_exc()
            return False
    else:
        print("Operation not launched, disable DEBUG first.")
Example #14
# coding: utf-8
import re
from os import listdir
from os.path import isfile, join

import requests
from requests.exceptions import HTTPError

from django.core.cache import cache
from scoop.core.util.stream.directory import Paths

PROXY_LIST_DIRECTORY = Paths.get_root_dir('isolated', 'database', 'rogue', 'proxies')


def get_tor_nodes(path='http://torstatus.blutmagie.de/ip_list_exit.php/Tor_ip_list_EXIT.csv'):
    """
    Load the list of TOR exit nodes, or return the cached one

    The list URL may change over time:
    - 30.01.2013: https://www.dan.me.uk/torlist/ (not reachable via urllib)
    - 07.06.2014: http://torstatus.blutmagie.de/ip_list_exit.php/Tor_ip_list_EXIT.csv
    """
    # Return the cached list if one exists
    cached = cache.get('rogue.torlist', None)
    if cached is not None:
        return cached
    # Otherwise, fetch and cache the remote file
    try:
        data = requests.get(path).text
        results = frozenset([row.strip() for row in data.split('\n') if row.strip()])
        cache.set('rogue.torlist', results, timeout=86400 * 3)
        return results
    except HTTPError:
        # Assumed tail (HTTPError is imported above): fail soft with an empty set
        return frozenset()