示例#1
0
 def handle(self, *args, **options):
     total_orphans_count = 0
     for topic in Topic.objects.all():
         # escape common & energy as usual
         if topic.slug in ["common", "energy"]: continue
         self.stdout.write("Topic: %s" % (topic))
         orphans_count = 0
         try:
             for Model in topic.get_models():
                 try:
                     fields = utils.get_model_fields(Model)
                     for field in fields:
                         if field["related_model"] and field["direction"] == "out" and "through" in field["rules"]:
                             ids= []
                             for Model in Model.objects.all():
                                 ids.extend([_.id for _ in Model.node.relationships.all()])
                             Properties = field["rules"]["through"]
                             for info in Properties.objects.all():
                                 if info._relationship not in ids:
                                     self.stdout.write("\t%s is an orphelin property of the model %s. The relation doesn't exist no more." % (info._NodeModel__node, Model.__class__.__name__))
                                     orphans_count += 1
                                     total_orphans_count += 1
                                     if options["fix"]:
                                         self.stdout.write("\tremoving %s" % (info))
                                         info.delete()
                 except Exception as e:
                     self.stderr.write("\tError with model %s (%s)" % (Model.__class__.__name__, e))
             self.stdout.write("\tfound %d orphans" % (orphans_count))
         except Exception as e:
             self.stderr.write("\tError with model %s (%s)" % (Model.__class__.__name__, e))
     self.stdout.write("TOTAL: found %d orphans" % (total_orphans_count))
示例#2
0
 def get_literal_search_output(self):
     output = lambda m: {'name': m.name, 'label': m.label, 'subject': m.subject}
     terms  = self.get_literal_search()
     _out = []
     for model in self.get_models():
         for field in [f for f in utils.get_model_fields(model) if f['type'].lower() != 'relationship']:
             if "search_terms" in field["rules"]:
                 _out += [{'name': field['name'], 'label': st, 'subject': model._meta.object_name} for st in field["rules"]["search_terms"]]
     return _out + [ output(rs) for rs in terms ]
示例#3
0
 def get_columns(model):
     edges   = dict()
     columns = []
     fields  = utils.get_model_fields(model)
     for field in fields:
         if field['type'] != 'Relationship':
             if field['name'] not in ['id']:
                 columns.append(field['name'])
         else:
             edges[field['rel_type']] = [field['model'], field['name'], field['related_model']]
     return (columns, edges)
示例#4
0
 def field(self):
     cache_key = "%s__field" % (self.name)
     field     = utils.topic_cache.get(self.topic, cache_key)
     if field is None and self.name:
         topic_models = self.topic.get_models()
         for model in topic_models:
             # Retreive every relationship field for this model
             for f in utils.get_model_fields(model):
                 if f["name"] == self.name:
                     field = f
         field["rules"]["through"] = None # Yes, this is ugly but this field is creating Pickling errors.
         utils.topic_cache.set(self.topic, cache_key, field)
     return field
示例#5
0
    def rdf_search(self, subject, predicate, obj):
        obj = obj["name"] if "name" in obj else obj
        # retrieve all models in current topic
        all_models = dict((model.__name__, model) for model in self.topic.get_models())
        # If the received obj describe a literal value
        if self.is_registered_literal(predicate["name"]):
            # Get the field name into the database
            field_name = predicate["name"]
            # Build the request
            query = """
                START root=node(*)
                MATCH (root)<-[:`<<INSTANCE>>`]-(type)
                WHERE HAS(root.name)
                AND HAS(root.{field})
                AND root.{field} = {value}
                AND type.model_name = {model}
                AND type.app_label = {app}
                RETURN DISTINCT ID(root) as id, root.name as name, type.model_name as model
            """.format(
                field=field_name,
                value=adapt(obj),
                model=adapt(subject["name"]),
                app=adapt(self.topic.app_label())
            )
        # If the received obj describe a literal value
        elif self.is_registered_relationship(predicate["name"]):
            fields        = utils.get_model_fields( all_models[predicate["subject"]] )
            # Get the field name into the database
            relationships = [ field for field in fields if field["name"] == predicate["name"] ]
            # We didn't find the predicate
            if not len(relationships): return {'errors': 'Unkown predicate type'}
            relationship  = relationships[0]["rel_type"]
            # Query to get every result
            query = """
                START st=node(*)
                MATCH (st)<-[:`{relationship}`]-(root)<-[:`<<INSTANCE>>`]-(type)
                WHERE HAS(root.name)
                AND HAS(st.name)
                AND st.name = {name}
                AND type.app_label = {app}
                RETURN DISTINCT ID(root) as id, root.name as name, type.model_name as model
            """.format(
                relationship=relationship,
                name=adapt(obj),
                app=adapt(self.topic.app_label())
            )
        else:
            return {'errors': 'Unkown predicate type'}

        return connection.cypher(query).to_dicts()
示例#6
0
    def summary_forms(self, bundle, request):
        available_resources = {}
        # Get the model's rules manager
        rulesManager = request.current_topic.get_rules()
        # Fetch every registered model
        # to print out its rules
        for model in self.topic.get_models():
            name                = model.__name__.lower()
            rules               = rulesManager.model(model).all()
            fields              = utils.get_model_fields(model)
            verbose_name        = getattr(model._meta, "verbose_name", name)
            verbose_name_plural = getattr(model._meta, "verbose_name_plural", verbose_name + "s")

            for key in rules:
                # Filter rules to keep only Neomatch
                if isinstance(rules[key], Neomatch):
                    fields.append({
                        "name"         : key,
                        "type"         : "ExtendedRelationship",
                        "verbose_name" : rules[key].title,
                        "rules"        : {},
                        "related_model": rules[key].target_model.__name__
                    })

            for field in fields:
                # Create a copy of the rule to avoid compromize the rules singleton
                field["rules"] = field["rules"].copy()
                for key, rule in field["rules"].items():
                    # Convert class to model name
                    if inspect.isclass(rule):
                        field["rules"][key] = getattr(rule, "__name__", rule)

            try:
                idx = model.__idx__
            except AttributeError:
                idx = 0
            available_resources[name] = {
                'description'         : getattr(model, "_description", None),
                'topic'               : getattr(model, "_topic", self.topic.slug) or self.topic.slug,
                'model'               : getattr(model, "__name__", ""),
                'verbose_name'        : verbose_name,
                'verbose_name_plural' : verbose_name_plural,
                'name'                : name,
                'fields'              : fields,
                'rules'               : rules,
                'index'               : idx
            }

        return available_resources
示例#7
0
 def field(self):
     field = None
     if self.name:
         # Build a cache key with the topic token
         cache_key = "%s__%s__field" % ( self.topic.module, self.name )
         # Try to use the cache value
         if cache.get(cache_key) is not None:
             field = cache.get(cache_key)
         else:
             topic_models = self.topic.get_models()
             for model in topic_models:
                 # Retreive every relationship field for this model
                 for f in utils.get_model_fields(model):
                     if f["name"] == self.name:
                         field = f
     # Very small cache to optimize recording
     cache.set(cache_key, field, 10)
     return field
示例#8
0
    def summary_forms(self, bundle):
        available_resources = {}
        # Get the model's rules manager
        rulesManager = register_model_rules()
        # Fetch every registered model
        # to print out its rules
        for model in get_registered_models():
            # Do this ressource has a model?
            # Do this ressource is a part of apps?
            if model != None and model.__module__.startswith("app.detective.apps"):
                name                = model.__name__.lower()
                rules               = rulesManager.model(model).all()
                fields              = get_model_fields(model)
                verbose_name        = getattr(model._meta, "verbose_name", name).title()
                verbose_name_plural = getattr(model._meta, "verbose_name_plural", verbose_name + "s").title()
                # Extract the model parent to find its scope
                scope               = model.__module__.split(".")[-2]

                for key in rules:
                    # Filter rules to keep only Neomatch
                    if isinstance(rules[key], Neomatch):
                        fields.append({
                            "name"         : key,
                            "type"         : "ExtendedRelationship",
                            "verbose_name" : rules[key].title,
                            "rules"        : {},
                            "related_model": rules[key].target_model.__name__
                        })

                available_resources[name] = {
                    'description'         : getattr(model, "_description", None),
                    'scope'               : getattr(model, "_scope", scope),
                    'model'               : getattr(model, "__name_", ""),
                    'verbose_name'        : verbose_name,
                    'verbose_name_plural' : verbose_name_plural,
                    'name'                : name,
                    'fields'              : fields,
                    'rules'               : rules
                }

        return available_resources
示例#9
0
 def formfield_for_choice_field(self, db_field, request, **kwargs):
     if db_field.name == 'name' and hasattr(request, "topic_id"):
         # We add choices for this field using the current topic's models
         kwargs["choices"] = []
         # Get the current topic with the ID set into the parent form
         topic  = Topic.objects.get(id=request.topic_id)
         # Get the topic's models
         models = topic.get_models()
         for model in models:
             model_name    = getattr(model._meta, "verbose_name").title()
             subset        = []
             # Retreive every relationship field for this model
             for field in utils.get_model_fields(model):
                 if field["type"] != 'AutoField':
                     choice   = [ field["name"], field["verbose_name"].title(), ]
                     # Add ... at the end ot the relationship field
                     if field["type"] == 'Relationship': choice[1] += "..."
                     subset.append(choice)
             # Add the choice subset only if it contains elements
             if len(subset): kwargs["choices"].append( (model_name, subset,) )
     return super(SearchTermInline, self).formfield_for_choice_field(db_field, request,**kwargs)
示例#10
0
    def summary_forms(self, bundle, request):
        available_resources = {}
        # Get the model's rules manager
        rulesManager = topics_rules()
        # Fetch every registered model
        # to print out its rules
        for model in self.topic.get_models():
            name                = model.__name__.lower()
            rules               = rulesManager.model(model).all()
            fields              = utils.get_model_fields(model)
            verbose_name        = getattr(model._meta, "verbose_name", name).title()
            verbose_name_plural = getattr(model._meta, "verbose_name_plural", verbose_name + "s").title()

            for key in rules:
                # Filter rules to keep only Neomatch
                if isinstance(rules[key], Neomatch):
                    fields.append({
                        "name"         : key,
                        "type"         : "ExtendedRelationship",
                        "verbose_name" : rules[key].title,
                        "rules"        : {},
                        "related_model": rules[key].target_model.__name__
                    })

            available_resources[name] = {
                'description'         : getattr(model, "_description", None),
                'topic'               : getattr(model, "_topic", self.topic.slug) or self.topic.slug,
                'model'               : getattr(model, "__name_", ""),
                'verbose_name'        : verbose_name,
                'verbose_name_plural' : verbose_name_plural,
                'name'                : name,
                'fields'              : fields,
                'rules'               : rules
            }

        return available_resources
示例#11
0
def process_bulk_parsing_and_save_as_model(topic, files, start_time=None):
    """
    Job which parses uploaded content, validates and saves them as model
    """

    start_time               = start_time != None and start_time or time.time()
    entities                 = {}
    relations                = []
    errors                   = []
    id_mapping               = {}
    nb_lines                 = 0
    file_reading_progression = 0
    job                      = get_current_job()

    # Define Exceptions
    class Error (Exception):
        """
        Generic Custom Exception for this endpoint.
        Include the topic.
        """
        def __init__(self, **kwargs):
            """ set the topic and add all the parameters as attributes """
            self.topic = topic.title
            for key, value in kwargs.items():
                setattr(self, key, value)
        def __str__(self):
            return self.__dict__

    class WarningCastingValueFail     (Error): pass
    class WarningValidationError      (Error): pass
    class WarningKeyUnknown           (Error): pass
    class WarningInformationIsMissing (Error): pass
    class AttributeDoesntExist        (Error): pass
    class WrongCSVSyntax              (Error): pass
    class ColumnUnknow                (Error): pass
    class ModelDoesntExist            (Error): pass
    class RelationDoesntExist         (Error): pass

    try:
        assert type(files) in (tuple, list), type(files)
        assert len(files) > 0, "You need to upload at least one file."
        assert type(files[0]) in (tuple, list)
        assert len(files[0]) == 2

        # retrieve all models in current topic
        all_models = dict((model.__name__, model) for model in topic.get_models())
        # iterate over all files and dissociate entities .csv from relations .csv
        for file in files:
            if type(file) is tuple:
                file_name = file[0]
                file      = file[1]
            else:
                raise Exception()
            csv_reader = utils.open_csv(file)
            header     = csv_reader.next()
            assert len(header) > 1, "{file_name} header should have at least 2 columns"
            assert header[0].endswith("_id"), "{file_name} : First column should begin with a header like <model_name>_id. Actually {first_col}".format(file_name=file_name, first_col=header[0])
            if len(header) >=3 and header[0].endswith("_id") and header[2].endswith("_id"):
                # this is a relationship file
                relations.append((file_name, file))
            else:
                # this is an entities file
                model_name = utils.to_class_name(header[0].replace("_id", ""))
                if model_name in all_models.keys():
                    entities[model_name] = (file_name, file)
                else:
                    raise ModelDoesntExist(model=model_name, file=file_name, models_availables=all_models.keys())
            nb_lines += len(file) - 1 # -1 removes headers

        # first iterate over entities
        logger.debug("BulkUpload: creating entities")
        for entity, (file_name, file) in entities.items():
            csv_reader = utils.open_csv(file)
            header     = csv_reader.next()
            # must check that all columns map to an existing model field
            fields       = utils.get_model_fields(all_models[entity])
            fields_types = {}
            for field in fields:
                fields_types[field['name']] = field['type']
            field_names = [field['name'] for field in fields]
            columns        = []
            for column in header[1:]:
                column = utils.to_underscores(column)
                if not column in field_names and not column.endswith("__sources__"):
                    raise ColumnUnknow(file=file_name, column=column, model=entity, attributes_available=field_names)
                    break
                if column.endswith("__sources__"):
                    column_type = "__sources__"
                    column = column[:-len("__sources__")]
                    if not column in field_names:
                        raise ColumnUnknow(file=file_name, column=column, model=entity, attributes_available=field_names)
                        break
                else:
                    column_type = fields_types.get(column, None)
                columns.append((column, column_type))
            else:
                # here, we know that all columns are valid
                for row in csv_reader:
                    data      = {}
                    sources   = {}
                    entity_id = row[0]
                    for i, (column, column_type) in enumerate(columns):
                        value = str(row[i+1]).decode('utf-8')
                        # cast value if needed
                        if value:
                            try:
                                if "Integer" in column_type:
                                    value = int(value)
                                # TODO: cast float
                                if "Date" in column_type:
                                    value = datetime.datetime(*map(int, re.split('[^\d]', value)[:3])).replace(tzinfo=utc)

                            except Exception as e:
                                e = WarningCastingValueFail(
                                    column_name = column,
                                    value       = value,
                                    type        = column_type,
                                    data        = data, model=entity,
                                    file        = file_name,
                                    line        = csv_reader.line_num,
                                    error       = str(e)
                                )
                                errors.append(e)
                                break
                            if column_type == "__sources__":
                                sources[column] = value
                            else:
                                data[column] = value
                    else:
                        # instanciate a model
                        try:
                            item = all_models[entity].objects.create(**data)
                            # map the object with the ID defined in the .csv
                            id_mapping[(entity, entity_id)] = item
                            # create sources
                            for sourced_field, reference in sources.items():
                                for ref in reference.split("||"):
                                    FieldSource.objects.create(individual=item.id, field=sourced_field, reference=ref)
                            # FIXME: job can be accessed somewhere else (i.e detective/topics/common/jobs.py:JobResource)
                            # Concurrent access are not secure here.
                            # For now we refresh the job just before saving it.
                            file_reading_progression += 1
                            if job:
                                job.refresh()
                                job.meta["file_reading_progression"] = (float(file_reading_progression) / float(nb_lines)) * 100
                                job.meta["file_reading"] = file_name
                                job.save()
                        except Exception as e:
                            errors.append(
                                WarningValidationError(
                                    data  = data,
                                    model = entity,
                                    file  = file_name,
                                    line  = csv_reader.line_num,
                                    error = str(e)
                                )
                            )

        inserted_relations = 0
        # then iterate over relations
        logger.debug("BulkUpload: creating relations")
        for file_name, file in relations:
            # create a csv reader
            csv_reader      = utils.open_csv(file)
            csv_header      = csv_reader.next()
            relation_name   = utils.to_underscores(csv_header[1])
            model_from      = utils.to_class_name(csv_header[0].replace("_id", ""))
            model_to        = utils.to_class_name(csv_header[2].replace("_id", ""))
            properties_name = csv_header[3:]
            # retrieve ModelProperties from related model
            ModelProperties = topic.get_rules().model(all_models[model_from]).field(relation_name).get("through")
            # check that the relation actually exists between the two objects
            try:
                getattr(all_models[model_from], relation_name)
            except Exception as e:
                raise RelationDoesntExist(
                    file             = file_name,
                    model_from       = model_from,
                    model_to         = model_to,
                    relation_name    = relation_name,
                    fields_available = [field['name'] for field in utils.iterate_model_fields(all_models[model_from])],
                    error            = str(e))
            for row in csv_reader:
                id_from    = row[0]
                id_to      = row[2]
                properties = [p.decode('utf-8') for p in row[3:]]
                if id_to and id_from:
                    try:
                        instance_from = id_mapping[(model_from, id_from)]
                        instance_to   = id_mapping[(model_to, id_to)]
                        getattr(instance_from, relation_name).add(instance_to)
                        # add properties if needed
                        if ModelProperties and properties_name and properties:
                            # save the relationship to create an id
                            instance_from.save()
                            # retrieve this id
                            relation_id = next(rel.id for rel in instance_from.node.relationships.outgoing() if rel.end.id == instance_to.id)
                            # properties of the relationship
                            relation_args = {
                                "_endnodes"     : [id_mapping[(model_from, id_from)].id, instance_to.id],
                                "_relationship" : relation_id,
                            }
                            # Pairwise the properties with their names 
                            relation_args.update(zip(properties_name, properties))
                            try:
                                ModelProperties.objects.create(**relation_args)
                            except TypeError as e:
                                errors.append(
                                    AttributeDoesntExist(
                                        file             = file_name,
                                        line             = csv_reader.line_num,
                                        model_from       = model_from,
                                        id_from          = id_from,
                                        model_to         = model_to,
                                        id_to            = id_to,
                                        relation_args    = relation_args,
                                        error            = str(e)
                                    )
                        )
                        # update the job
                        inserted_relations += 1
                        file_reading_progression += 1
                        if job:
                            job.refresh()
                            job.meta["file_reading_progression"] = (float(file_reading_progression) / float(nb_lines)) * 100
                            job.meta["file_reading"] = file_name
                            job.save()
                    except KeyError as e:
                        errors.append(
                            WarningKeyUnknown(
                                file             = file_name,
                                line             = csv_reader.line_num,
                                model_from       = model_from,
                                id_from          = id_from,
                                model_to         = model_to,
                                id_to            = id_to,
                                relation_name    = relation_name,
                                error            = str(e)
                            )
                        )
                    except Exception as e:
                        # Error unknown, we break the process to alert the user
                        raise Error(
                            file             = file_name,
                            line             = csv_reader.line_num,
                            model_from       = model_from,
                            id_from          = id_from,
                            model_to         = model_to,
                            id_to            = id_to,
                            relation_name    = relation_name,
                            error            = str(e))
                else:
                    # A key is missing (id_from or id_to) but we don't want to stop the parsing.
                    # Then we store the wrong line to return it to the user.
                    errors.append(
                        WarningInformationIsMissing(
                            file=file_name, row=row, line=csv_reader.line_num, id_to=id_to, id_from=id_from
                        )
                    )

        # Save everything
        saved = 0
        logger.debug("BulkUpload: saving %d objects" % (len(id_mapping)))
        if job:
            job.refresh()
            job.meta["objects_to_save"] = len(id_mapping)
            job.save()
        for item in id_mapping.values():
            item.save()
            saved += 1
            if job:
                job.refresh()
                job.meta["saving_progression"] = saved
                job.save()
        if job: job.refresh()
        if job and "track" in job.meta:
            from django.core.mail import send_mail
            user = User.objects.get(pk=job.meta["user"])
            send_mail("upload finished", "your upload just finished", settings.DEFAULT_FROM_EMAIL, (user.email,))
        return {
            'duration' : (time.time() - start_time),
            'inserted' : {
                'objects' : saved,
                'links'   : inserted_relations
            },
            "errors" : sorted([dict([(e.__class__.__name__, str(e.__dict__))]) for e in errors])
        }

    except Exception as e:
        import traceback
        logger.error(traceback.format_exc())
        if e.__dict__:
            message = str(e.__dict__)
        else:
            message = e.message
        return {
            "errors" : [{e.__class__.__name__ : message}]
        }
示例#12
0
def process_parsing(topic, files):
    """
    Job which reads the uploaded files, validate and saves them as model
    """

    entities   = {}
    relations  = []
    errors     = []
    id_mapping = {}

    assert type(files) in (tuple, list)
    assert len(files) > 0
    assert type(files[0]) in (tuple, list)
    assert len(files[0]) == 2

    # Define Exceptions
    class Error (Exception):
        """
        Generic Custom Exception for this endpoint.
        Include the topic.
        """
        def __init__(self, **kwargs):
            """ set the topic and add all the parameters as attributes """
            self.topic = topic.title
            for key, value in kwargs.items():
                setattr(self, key, value)
        def __str__(self):
            return self.__dict__

    class WarningCastingValueFail     (Error): pass
    class WarningValidationError      (Error): pass
    class WarningKeyUnknown           (Error): pass
    class WarningInformationIsMissing (Error): pass
    class AttributeDoesntExist        (Error): pass
    class WrongCSVSyntax              (Error): pass
    class ColumnUnknow                (Error): pass
    class ModelDoesntExist            (Error): pass
    class RelationDoesntExist         (Error): pass

    try:
        # retrieve all models in current topic
        all_models = dict((model.__name__, model) for model in topic.get_models())
        # iterate over all files and dissociate entities .csv from relations .csv
        for file in files:
            if type(file) is tuple:
                file_name = file[0]
                file      = file[1]
            elif hasattr(file, "read"):
                file_name = file.name
            else:
                raise Exception("ERROR")
            csv_reader = utils.open_csv(file)
            header = csv_reader.next()
            assert len(header) > 1, "header should have at least 2 columns"
            assert header[0].endswith("_id"), "First column should begin with a header like <model_name>_id"
            if len(header) >=3 and header[0].endswith("_id") and header[2].endswith("_id"):
                # this is a relationship file
                relations.append((file_name, file))
            else:
                # this is an entities file
                model_name = utils.to_class_name(header[0].replace("_id", ""))
                if model_name in all_models.keys():
                    entities[model_name] = (file_name, file)
                else:
                    raise ModelDoesntExist(model=model_name, file=file_name, models_availables=all_models.keys())

        # first iterate over entities
        logger.debug("BulkUpload: creating entities")
        for entity, (file_name, file) in entities.items():
            csv_reader = utils.open_csv(file)
            header     = csv_reader.next()
            # must check that all columns map to an existing model field
            fields      = utils.get_model_fields(all_models[entity])
            fields_types = {}
            for field in fields:
                fields_types[field['name']] = field['type']
            field_names = [field['name'] for field in fields]
            columns = []
            for column in header[1:]:
                column = utils.to_underscores(column)
                if column is not '':
                    if not column in field_names:
                        raise ColumnUnknow(file=file_name, column=column, model=entity, attributes_available=field_names)
                        break
                    column_type = fields_types[column]
                    columns.append((column, column_type))
            else:
                # here, we know that all columns are valid
                for row in csv_reader:
                    data = {}
                    id   = row[0]
                    for i, (column, column_type) in enumerate(columns):
                        value = str(row[i+1]).decode('utf-8')
                        # cast value if needed
                        if value:
                            try:
                                if "Integer" in column_type:
                                    value = int(value)
                                # TODO: cast float
                                if "Date" in column_type:
                                    value = datetime.datetime(*map(int, re.split('[^\d]', value)[:-1])).replace(tzinfo=utc)
                            except Exception as e:
                                e = WarningCastingValueFail(
                                    column_name = column,
                                    value       = value,
                                    type        = column_type,
                                    data        = data, model=entity,
                                    file        = file_name,
                                    line        = csv_reader.line_num,
                                    error       = str(e)
                                )
                                errors.append(e)
                                break
                            data[column] = value
                    else:
                        # instanciate a model
                        try:
                            item = all_models[entity].objects.create(**data)
                            # map the object with the ID defined in the .csv
                            id_mapping[(entity, id)] = item
                        except Exception as e:
                            errors.append(
                                WarningValidationError(
                                    data  = data,
                                    model = entity,
                                    file  = file_name,
                                    line  = csv_reader.line_num,
                                    error = str(e)
                                )
                            )

        inserted_relations = 0
        # then iterate over relations
        logger.debug("BulkUpload: creating relations")
        for file_name, file in relations:
            # create a csv reader
            csv_reader    = utils.open_csv(file)
            csv_header    = csv_reader.next()
            relation_name = utils.to_underscores(csv_header[1])
            model_from    = utils.to_class_name(csv_header[0].replace("_id", ""))
            model_to      = utils.to_class_name(csv_header[2].replace("_id", ""))
            # check that the relation actually exists between the two objects
            try:
                getattr(all_models[model_from], relation_name)
            except Exception as e:
                raise RelationDoesntExist(
                    file             = file_name,
                    model_from       = model_from,
                    model_to         = model_to,
                    relation_name    = relation_name,
                    fields_available = [field['name'] for field in utils.get_model_fields(all_models[model_from])],
                    error            = str(e))
            for row in csv_reader:
                id_from = row[0]
                id_to   = row[2]
                if id_to and id_from:
                    try:
                        getattr(id_mapping[(model_from, id_from)], relation_name).add(id_mapping[(model_to, id_to)])
                        inserted_relations += 1
                    except KeyError as e:
                        errors.append(
                            WarningKeyUnknown(
                                file             = file_name,
                                line             = csv_reader.line_num,
                                model_from       = model_from,
                                id_from          = id_from,
                                model_to         = model_to,
                                id_to            = id_to,
                                relation_name    = relation_name,
                                error            = str(e)
                            )
                        )
                    except Exception as e:
                        # Error unknown, we break the process to alert the user
                        raise Error(
                            file             = file_name,
                            line             = csv_reader.line_num,
                            model_from       = model_from,
                            id_from          = id_from,
                            model_to         = model_to,
                            id_to            = id_to,
                            relation_name    = relation_name,
                            error            = str(e))
                else:
                    # A key is missing (id_from or id_to) but we don't want to stop the parsing.
                    # Then we store the wrong line to return it to the user.
                    errors.append(
                        WarningInformationIsMissing(
                            file=file_name, row=row, line=csv_reader.line_num, id_to=id_to, id_from=id_from
                        )
                    )

        # Save everything
        saved = 0
        logger.debug("BulkUpload: saving %d objects" % (len(id_mapping)))
        for item in id_mapping.values():
            item.save()
            saved += 1

        return {
            'inserted' : {
                'objects' : saved,
                'links'   : inserted_relations
            },
            "errors" : sorted([dict([(e.__class__.__name__, str(e.__dict__))]) for e in errors])
        }
    except Exception as e:
        import traceback
        logger.error(traceback.format_exc())
        return {
            "errors" : [{e.__class__.__name__ : str(e.__dict__)}]
        }
示例#13
0
def process_bulk_parsing_and_save_as_model(topic, files, start_time=None):
    """
    Job which parses uploaded content, validates and saves them as model
    """

    start_time = start_time != None and start_time or time.time()
    entities = {}
    relations = []
    errors = []
    id_mapping = {}
    nb_lines = 0
    file_reading_progression = 0
    job = get_current_job()

    # Define Exceptions
    class Error(Exception):
        """
        Generic Custom Exception for this endpoint.
        Include the topic.
        """
        def __init__(self, **kwargs):
            """ set the topic and add all the parameters as attributes """
            self.topic = topic.title
            for key, value in kwargs.items():
                setattr(self, key, value)

        def __str__(self):
            return self.__dict__

    class WarningCastingValueFail(Error):
        pass

    class WarningValidationError(Error):
        pass

    class WarningKeyUnknown(Error):
        pass

    class WarningInformationIsMissing(Error):
        pass

    class AttributeDoesntExist(Error):
        pass

    class WrongCSVSyntax(Error):
        pass

    class ColumnUnknow(Error):
        pass

    class ModelDoesntExist(Error):
        pass

    class RelationDoesntExist(Error):
        pass

    try:
        assert type(files) in (tuple, list), type(files)
        assert len(files) > 0, "You need to upload at least one file."
        assert type(files[0]) in (tuple, list)
        assert len(files[0]) == 2

        # retrieve all models in current topic
        all_models = dict(
            (model.__name__, model) for model in topic.get_models())
        # iterate over all files and dissociate entities .csv from relations .csv
        for file in files:
            if type(file) is tuple:
                file_name = file[0]
                file = file[1]
            else:
                raise Exception()
            csv_reader = utils.open_csv(file)
            header = csv_reader.next()
            assert len(
                header
            ) > 1, "{file_name} header should have at least 2 columns"
            assert header[0].endswith(
                "_id"
            ), "{file_name} : First column should begin with a header like <model_name>_id. Actually {first_col}".format(
                file_name=file_name, first_col=header[0])
            if len(header) >= 3 and header[0].endswith(
                    "_id") and header[2].endswith("_id"):
                # this is a relationship file
                relations.append((file_name, file))
            else:
                # this is an entities file
                model_name = utils.to_class_name(header[0].replace("_id", ""))
                if model_name in all_models.keys():
                    entities[model_name] = (file_name, file)
                else:
                    raise ModelDoesntExist(model=model_name,
                                           file=file_name,
                                           models_availables=all_models.keys())
            nb_lines += len(file) - 1  # -1 removes headers

        # first iterate over entities
        logger.debug("BulkUpload: creating entities")
        for entity, (file_name, file) in entities.items():
            csv_reader = utils.open_csv(file)
            header = csv_reader.next()
            # must check that all columns map to an existing model field
            fields = utils.get_model_fields(all_models[entity])
            fields_types = {}
            for field in fields:
                fields_types[field['name']] = field['type']
            field_names = [field['name'] for field in fields]
            columns = []
            for column in header[1:]:
                column = utils.to_underscores(column)
                if not column in field_names and not column.endswith(
                        "__sources__"):
                    raise ColumnUnknow(file=file_name,
                                       column=column,
                                       model=entity,
                                       attributes_available=field_names)
                    break
                if column.endswith("__sources__"):
                    column_type = "__sources__"
                    column = column[:-len("__sources__")]
                    if not column in field_names:
                        raise ColumnUnknow(file=file_name,
                                           column=column,
                                           model=entity,
                                           attributes_available=field_names)
                        break
                else:
                    column_type = fields_types.get(column, None)
                columns.append((column, column_type))
            else:
                # here, we know that all columns are valid
                for row in csv_reader:
                    data = {}
                    sources = {}
                    entity_id = row[0]
                    for i, (column, column_type) in enumerate(columns):
                        value = str(row[i + 1]).decode('utf-8')
                        # cast value if needed
                        if value:
                            try:
                                if "Integer" in column_type:
                                    value = int(value)
                                # TODO: cast float
                                if "Date" in column_type:
                                    value = datetime.datetime(*map(
                                        int,
                                        re.split('[^\d]', value)[:3])).replace(
                                            tzinfo=utc)

                            except Exception as e:
                                e = WarningCastingValueFail(
                                    column_name=column,
                                    value=value,
                                    type=column_type,
                                    data=data,
                                    model=entity,
                                    file=file_name,
                                    line=csv_reader.line_num,
                                    error=str(e))
                                errors.append(e)
                                break
                            if column_type == "__sources__":
                                sources[column] = value
                            else:
                                data[column] = value
                    else:
                        # instanciate a model
                        try:
                            item = all_models[entity].objects.create(**data)
                            # map the object with the ID defined in the .csv
                            id_mapping[(entity, entity_id)] = item
                            # create sources
                            for sourced_field, reference in sources.items():
                                for ref in reference.split("||"):
                                    FieldSource.objects.create(
                                        individual=item.id,
                                        field=sourced_field,
                                        reference=ref)
                            # FIXME: job can be accessed somewhere else (i.e detective/topics/common/jobs.py:JobResource)
                            # Concurrent access are not secure here.
                            # For now we refresh the job just before saving it.
                            file_reading_progression += 1
                            if job:
                                job.refresh()
                                job.meta["file_reading_progression"] = (
                                    float(file_reading_progression) /
                                    float(nb_lines)) * 100
                                job.meta["file_reading"] = file_name
                                job.save()
                        except Exception as e:
                            errors.append(
                                WarningValidationError(
                                    data=data,
                                    model=entity,
                                    file=file_name,
                                    line=csv_reader.line_num,
                                    error=str(e)))

        inserted_relations = 0
        # then iterate over relations
        logger.debug("BulkUpload: creating relations")
        for file_name, file in relations:
            # create a csv reader
            csv_reader = utils.open_csv(file)
            csv_header = csv_reader.next()
            relation_name = utils.to_underscores(csv_header[1])
            model_from = utils.to_class_name(csv_header[0].replace("_id", ""))
            model_to = utils.to_class_name(csv_header[2].replace("_id", ""))
            properties_name = csv_header[3:]
            # retrieve ModelProperties from related model
            ModelProperties = topic.get_rules().model(
                all_models[model_from]).field(relation_name).get("through")
            # check that the relation actually exists between the two objects
            try:
                getattr(all_models[model_from], relation_name)
            except Exception as e:
                raise RelationDoesntExist(
                    file=file_name,
                    model_from=model_from,
                    model_to=model_to,
                    relation_name=relation_name,
                    fields_available=[
                        field['name'] for field in utils.iterate_model_fields(
                            all_models[model_from])
                    ],
                    error=str(e))
            for row in csv_reader:
                id_from = row[0]
                id_to = row[2]
                properties = [p.decode('utf-8') for p in row[3:]]
                if id_to and id_from:
                    try:
                        instance_from = id_mapping[(model_from, id_from)]
                        instance_to = id_mapping[(model_to, id_to)]
                        getattr(instance_from, relation_name).add(instance_to)
                        # add properties if needed
                        if ModelProperties and properties_name and properties:
                            # save the relationship to create an id
                            instance_from.save()
                            # retrieve this id
                            relation_id = next(
                                rel.id for rel in
                                instance_from.node.relationships.outgoing()
                                if rel.end.id == instance_to.id)
                            # properties of the relationship
                            relation_args = {
                                "_endnodes": [
                                    id_mapping[(model_from, id_from)].id,
                                    instance_to.id
                                ],
                                "_relationship":
                                relation_id,
                            }
                            # Pairwise the properties with their names
                            relation_args.update(
                                zip(properties_name, properties))
                            try:
                                ModelProperties.objects.create(**relation_args)
                            except TypeError as e:
                                errors.append(
                                    AttributeDoesntExist(
                                        file=file_name,
                                        line=csv_reader.line_num,
                                        model_from=model_from,
                                        id_from=id_from,
                                        model_to=model_to,
                                        id_to=id_to,
                                        relation_args=relation_args,
                                        error=str(e)))
                        # update the job
                        inserted_relations += 1
                        file_reading_progression += 1
                        if job:
                            job.refresh()
                            job.meta["file_reading_progression"] = (
                                float(file_reading_progression) /
                                float(nb_lines)) * 100
                            job.meta["file_reading"] = file_name
                            job.save()
                    except KeyError as e:
                        errors.append(
                            WarningKeyUnknown(file=file_name,
                                              line=csv_reader.line_num,
                                              model_from=model_from,
                                              id_from=id_from,
                                              model_to=model_to,
                                              id_to=id_to,
                                              relation_name=relation_name,
                                              error=str(e)))
                    except Exception as e:
                        # Error unknown, we break the process to alert the user
                        raise Error(file=file_name,
                                    line=csv_reader.line_num,
                                    model_from=model_from,
                                    id_from=id_from,
                                    model_to=model_to,
                                    id_to=id_to,
                                    relation_name=relation_name,
                                    error=str(e))
                else:
                    # A key is missing (id_from or id_to) but we don't want to stop the parsing.
                    # Then we store the wrong line to return it to the user.
                    errors.append(
                        WarningInformationIsMissing(file=file_name,
                                                    row=row,
                                                    line=csv_reader.line_num,
                                                    id_to=id_to,
                                                    id_from=id_from))

        # Save everything
        saved = 0
        logger.debug("BulkUpload: saving %d objects" % (len(id_mapping)))
        if job:
            job.refresh()
            job.meta["objects_to_save"] = len(id_mapping)
            job.save()
        for item in id_mapping.values():
            item.save()
            saved += 1
            if job:
                job.refresh()
                job.meta["saving_progression"] = saved
                job.save()
        if job: job.refresh()
        if job and "track" in job.meta:
            from django.core.mail import send_mail
            user = User.objects.get(pk=job.meta["user"])
            send_mail("upload finished", "your upload just finished",
                      settings.DEFAULT_FROM_EMAIL, (user.email, ))
        return {
            'duration': (time.time() - start_time),
            'inserted': {
                'objects': saved,
                'links': inserted_relations
            },
            "errors":
            sorted([
                dict([(e.__class__.__name__, str(e.__dict__))]) for e in errors
            ])
        }

    except Exception as e:
        import traceback
        logger.error(traceback.format_exc())
        if e.__dict__:
            message = str(e.__dict__)
        else:
            message = e.message
        return {"errors": [{e.__class__.__name__: message}]}