Example #1
    def read_nml(self,
                 f: IO,
                 attrs: Optional[Dict[str, Any]] = None) -> 'core.TreeNeuron':
        """Read .nml buffer into a TreeNeuron.

        NML files are XML files containing a single neuron.

        Parameters
        ----------
        f :         IO
                    Readable buffer.
        attrs :     dict | None
                    Arbitrary attributes to include in the TreeNeuron.

        Returns
        -------
        core.TreeNeuron
        """
        # Accept a readable buffer (as documented) or the raw str/bytes content
        if hasattr(f, 'read'):
            f = f.read()
        if isinstance(f, bytes):
            f = f.decode()

        f = io.StringIO(f)
        root = ET.parse(f).getroot()

        # The <thing> element holds the neuron: its first child lists the
        # nodes, its second child the edges connecting them
        for element in root:
            if element.tag == 'thing':
                nodes = pd.DataFrame.from_records(
                    [n.attrib for n in element[0]])
                edges = pd.DataFrame.from_records(
                    [n.attrib for n in element[1]])
                edges = edges.astype(self._dtypes['node_id'])

                nodes.rename({'id': 'node_id'}, axis=1, inplace=True)
                nodes = nodes.astype({
                    k: v
                    for k, v in self._dtypes.items() if k in nodes.columns
                })

        # The NML edge list is undirected: build a graph and root it via a
        # BFS tree so that every edge is oriented parent -> child
        G = nx.Graph()
        G.add_edges_from(edges.values)
        tree = nx.bfs_tree(G, list(G.nodes)[0])
        edges = pd.DataFrame(list(tree.edges), columns=['source', 'target'])
        # A node's parent is its BFS predecessor; the root gets parent_id -1
        nodes['parent_id'] = edges.set_index('target').reindex(
            nodes.node_id.values).source.values
        nodes['parent_id'] = nodes.parent_id.fillna(-1).astype(
            self._dtypes['node_id'])
        nodes.sort_values('node_id', inplace=True)

        return core.TreeNeuron(
            nodes,
            **(self._make_attributes({
                'name': 'NML',
                'origin': 'nml'
            }, attrs)))
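The core step in read_nml is the parent reconstruction: NML stores an undirected edge list, so the reader roots the graph with nx.bfs_tree and reads each node's parent off the oriented edges. Below is a minimal standalone sketch of that step; the toy node IDs and the -1 root sentinel are illustrative, and no navis code is involved.

import networkx as nx
import pandas as pd

# Undirected edges, as they would come out of the <thing> element
edges = pd.DataFrame([(1, 2), (2, 3), (2, 4)], columns=['source', 'target'])
nodes = pd.DataFrame({'node_id': [1, 2, 3, 4]})

G = nx.Graph()
G.add_edges_from(edges.values)

# Root the graph at an arbitrary node; bfs_tree orients edges parent -> child
tree = nx.bfs_tree(G, list(G.nodes)[0])
oriented = pd.DataFrame(list(tree.edges), columns=['source', 'target'])

# A node's parent is its BFS predecessor; the root has none and becomes -1
nodes['parent_id'] = (oriented.set_index('target')
                              .reindex(nodes.node_id.values)
                              .source.values)
nodes['parent_id'] = nodes.parent_id.fillna(-1).astype(int)
print(nodes)
#    node_id  parent_id
# 0        1         -1
# 1        2          1
# 2        3          2
# 3        4          2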
Example #2
def import_dataset(
    name: str,
    sample_class: Type[Sample],
    features: IO,
    content: IO,
    user: User = None,
    ensure_incomplete: bool = True,
) -> Tuple[Dataset, int]:
    """Import a dataset from a feature CSV and a ZIP archive of raw samples.

    ``features`` and ``content`` may each be a Path or an uploaded
    SpooledTemporaryFile. The function creates the Dataset, one Label per
    unique value in the CSV's LABEL column, one Sample per row (reading its
    raw content from ``<ID>.raw`` inside the archive) and, for every labelled
    sample, an Association to ``user`` (or to the first user in the database).
    If ``ensure_incomplete`` is set and every sample is already labelled, a
    third of the associations are removed again so the dataset stays
    incomplete. Returns the dataset and the number of imported samples.
    """
    if not user:
        # Query.first() returns None instead of raising, so check explicitly
        user = db.query(User).first()
        if user is None:
            raise ValueError(
                "To import a dataset, there must be at least one user in the system "
                "(for creating the association between samples and labels)"
            )

    if isinstance(features, Path):
        feature_file = features.open("r")
        df = pd.read_csv(feature_file).set_index("ID")
        feature_file.close()
    elif isinstance(features, SpooledTemporaryFile):
        # Force the spooled upload onto disk so pandas gets a real file object
        features.rollover()
        df = pd.read_csv(features._file).set_index("ID")
    else:
        raise ValueError(
            "The features argument must be either a Path or a SpooledTemporaryFile")

    try:
        if isinstance(content, SpooledTemporaryFile):
            content.rollover()
            zip_file = ZipFile(content._file, "r")
        else:
            zip_file = ZipFile(content, "r")
    except AttributeError:
        raise ValueError(
            "The content argument must be either a Path or a SpooledTemporaryFile")

    feature_df = df.drop(["LABEL"], axis=1)

    dataset = Dataset(
        name=name,
        features=feature_df.to_csv(),
        feature_names=",".join(feature_df.columns),
    )

    all_labels = {
        label_name: Label(name=label_name, dataset=dataset)
        for label_name in df["LABEL"].unique()
    }
    db.add_all(all_labels.values())
    db.commit()

    total, samples, associations = len(df.index), [], []
    for index, (identifier, label_name) in enumerate(df["LABEL"].items()):
        # Pull the raw payload for this sample out of the uploaded ZIP archive
        raw_content = zip_file.read(f"{int(identifier)}.raw")

        sample = sample_class()
        sample.dataset = dataset

        # Check the type of the model's content column and decode to text if needed
        content_type = sample_class.content.property.columns[0].type
        if isinstance(content_type, Text):
            raw_content = raw_content.decode("utf-8")

        sample.content = raw_content
        samples.append(sample)

        if label_name and label_name in all_labels:
            associations.append(
                Association(sample=sample,
                            label=all_labels[label_name], user=user)
            )

        if index % 1000 == 0:
            logger.info(
                "{:.2f}% imported ({}/{})".format(index / total * 100, index, total)
            )
            db.add_all(samples)
            db.commit()
            db.add_all(associations)
            db.commit()
            samples = []
            associations = []

    logger.info("Done importing dataset {}".format(dataset))
    db.add_all(samples)
    db.commit()
    db.add_all(associations)
    db.commit()

    number_of_samples = db.query(Sample).filter(
        Sample.dataset == dataset).count()
    if ensure_incomplete:
        number_of_associations = (
            db.query(Association.sample_id)
            .join(Association.sample)
            .filter(Sample.dataset == dataset)
            .count()
        )

        if number_of_samples == number_of_associations:
            logger.info(f"{dataset} is already complete. Thinning it out!")
            sample_ids = (
                db.query(Association.sample_id)
                .join(Association.sample)
                .filter(Sample.dataset == dataset)
                .all()
            )
            flat_sample_ids = list(map(int, np.array(sample_ids)[:, 0]))
            to_delete = flat_sample_ids[::3]

            # Dirty: use a raw query, because SQLAlchemy otherwise tries (and
            # fails) to synchronise the deleted rows with the current session
            db.execute(
                f'DELETE FROM association WHERE sample_id IN ({",".join(map(str, to_delete))})'
            )
            db.commit()

    return dataset, number_of_samples
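Two patterns in import_dataset are worth noting: rows are flushed in batches of 1000 so the session never accumulates the whole dataset, and whatever remains after the loop is committed once more at the end. The sketch below shows the same batching idea with plain SQLAlchemy; the Item model and the in-memory SQLite engine are placeholders, not part of the import code.

from sqlalchemy import Column, Integer, String, create_engine
from sqlalchemy.orm import Session, declarative_base

Base = declarative_base()

class Item(Base):
    # Placeholder model, not from the import code
    __tablename__ = "item"
    id = Column(Integer, primary_key=True)
    payload = Column(String)

engine = create_engine("sqlite://")
Base.metadata.create_all(engine)

BATCH_SIZE = 1000

with Session(engine) as db:
    pending = []
    for i in range(2500):
        pending.append(Item(payload=f"row {i}"))
        # Flush a full batch to the database and start a fresh list
        if len(pending) >= BATCH_SIZE:
            db.add_all(pending)
            db.commit()
            pending = []
    # Commit whatever is left over after the loop
    db.add_all(pending)
    db.commit()
    print(db.query(Item).count())  # 2500

Committing in batches also means that a crash halfway through leaves the earlier batches persisted, which pairs naturally with the progress logging in the import loop above.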