def read_nml(self, f: IO, attrs: Optional[Dict[str, Any]] = None) -> 'core.TreeNeuron':
    """Read .nml buffer into a TreeNeuron.

    NML files are XML files containing a single neuron.

    Parameters
    ----------
    f :         IO
                Readable buffer.
    attrs :     dict | None
                Arbitrary attributes to include in the TreeNeuron.

    Returns
    -------
    core.TreeNeuron

    """
    if isinstance(f, bytes):
        f = f.decode()

    f = io.StringIO(f)
    root = ET.parse(f).getroot()

    # The <thing> element holds the skeleton: its first child contains the
    # nodes, the second the edges
    for element in root:
        if element.tag == 'thing':
            nodes = pd.DataFrame.from_records([n.attrib for n in element[0]])
            edges = pd.DataFrame.from_records([n.attrib for n in element[1]])
            edges = edges.astype(self._dtypes['node_id'])

            nodes.rename({'id': 'node_id'}, axis=1, inplace=True)
            nodes = nodes.astype({k: v for k, v in self._dtypes.items()
                                  if k in nodes.columns})

    # Derive parent/child relationships: direct the undirected edge list via
    # BFS from an arbitrary root, then map each node to its predecessor
    G = nx.Graph()
    G.add_edges_from(edges.values)
    tree = nx.bfs_tree(G, list(G.nodes)[0])
    edges = pd.DataFrame(list(tree.edges), columns=['source', 'target'])
    nodes['parent_id'] = edges.set_index('target').reindex(nodes.node_id.values).source.values
    nodes['parent_id'] = nodes.parent_id.fillna(-1).astype(self._dtypes['node_id'])
    nodes.sort_values('node_id', inplace=True)

    return core.TreeNeuron(nodes,
                           **(self._make_attributes({'name': 'NML',
                                                     'origin': 'nml'}, attrs)))
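
# --- Illustrative sketch (not part of the reader above) ---------------------
# The parent-assignment step in read_nml can be hard to follow: an undirected
# edge list is turned into a directed tree via BFS, and each node's BFS
# predecessor becomes its parent_id (the root gets -1). The minimal,
# self-contained example below reproduces that idea in isolation; the helper
# name and node IDs are hypothetical and only use plain pandas/networkx.
def _example_parent_ids():
    import networkx as nx
    import pandas as pd

    # Undirected skeleton edges: 1-2, 2-3, 2-4
    edges = pd.DataFrame([[1, 2], [2, 3], [2, 4]], columns=['a', 'b'])
    nodes = pd.DataFrame({'node_id': [1, 2, 3, 4]})

    G = nx.Graph()
    G.add_edges_from(edges.values)

    # Direct the edges by BFS from an arbitrary root
    tree = nx.bfs_tree(G, list(G.nodes)[0])
    directed = pd.DataFrame(list(tree.edges), columns=['source', 'target'])

    # Each node's parent is its BFS predecessor; the root has no entry -> -1
    nodes['parent_id'] = (directed.set_index('target')
                                  .reindex(nodes.node_id.values)
                                  .source.values)
    nodes['parent_id'] = nodes.parent_id.fillna(-1).astype(int)
    return nodes
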
def import_dataset(
    name: str,
    sample_class: Type[Sample],
    features: IO,
    content: IO,
    user: User = None,
    ensure_incomplete=True,
) -> Tuple[Dataset, int]:
    if not user:
        try:
            user = db.query(User).first()
        except NoResultFound:
            raise ValueError(
                "To import a dataset, there must be at least one user in the system "
                "(for creating the association between samples and labels)"
            )

    # Load the feature table, either from disk or from an uploaded spooled file
    if isinstance(features, Path):
        feature_file = features.open("r")
        df = pd.read_csv(feature_file).set_index("ID")
        feature_file.close()
    elif isinstance(features, SpooledTemporaryFile):
        features.rollover()
        df = pd.read_csv(features._file).set_index("ID")
    else:
        raise ValueError(
            "The features argument must be either a Path or a SpooledTemporaryFile")

    # Open the zip archive that holds the raw sample contents
    try:
        if isinstance(content, SpooledTemporaryFile):
            content.rollover()
            zip_file = ZipFile(content._file, "r")
        else:
            zip_file = ZipFile(content, "r")
    except AttributeError:
        raise ValueError(
            "The content argument must be either a Path or a SpooledTemporaryFile")

    feature_df = df.drop(["LABEL"], axis=1)
    dataset = Dataset(
        name=name,
        features=feature_df.to_csv(),
        feature_names=",".join(feature_df.columns),
    )

    all_labels = {
        label_name: Label(name=label_name, dataset=dataset)
        for label_name in df["LABEL"].unique()
    }
    db.add_all(all_labels.values())
    db.commit()

    total, samples, associations = len(df.index), [], []
    for index, (identifier, label_name) in enumerate(df["LABEL"].iteritems()):
        content = zip_file.read(f"{int(identifier)}.raw")

        sample = sample_class()
        sample.dataset = dataset

        # get type of content and convert to text if needed
        content_type = sample_class.content.property.columns[0].type
        if isinstance(content_type, Text):
            content = content.decode('utf-8')

        sample.content = content
        samples.append(sample)

        if label_name and label_name in all_labels:
            associations.append(
                Association(sample=sample, label=all_labels[label_name], user=user)
            )

        # Flush to the database in chunks of 1000 samples
        if index % 1000 == 0:
            logger.info(
                "{:.2f}% imported ({}/{})".format((index / total) * 100, index, total)
            )
            db.add_all(samples)
            db.commit()
            db.add_all(associations)
            db.commit()
            samples = []
            associations = []

    logger.info("Done importing dataset {}".format(dataset))

    # Persist whatever is left over from the last (partial) chunk
    db.add_all(samples)
    db.commit()
    db.add_all(associations)
    db.commit()

    number_of_samples = db.query(Sample).filter(Sample.dataset == dataset).count()

    if ensure_incomplete:
        number_of_associations = (
            db.query(Association.sample_id)
            .join(Association.sample)
            .filter(Sample.dataset == dataset)
            .count()
        )
        if number_of_samples == number_of_associations:
            logger.info(f"{dataset} is already complete. Thinning it out!")
            sample_ids = (
                db.query(Association.sample_id)
                .join(Association.sample)
                .filter(Sample.dataset == dataset)
                .all()
            )
            flat_sample_ids = list(map(int, np.array(sample_ids)[:, 0]))
            to_delete = flat_sample_ids[::3]

            # Dirty; use a raw query because otherwise SQLAlchemy unsuccessfully
            # tries to synchronise the current session
            db.execute(
                f'DELETE FROM association WHERE sample_id IN ({",".join(map(str, to_delete))})'
            )
            db.commit()

    return dataset, number_of_samples
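
# --- Illustrative sketch (not part of import_dataset above) -----------------
# import_dataset pairs every row of the feature CSV with a raw file named
# "<ID>.raw" inside the uploaded zip archive. The snippet below reproduces
# that lookup convention with in-memory stand-ins (hypothetical IDs, labels
# and helper name) so it can be verified without a database or web upload.
def _example_pairing():
    import io
    from zipfile import ZipFile

    import pandas as pd

    # Fake feature table with the expected ID/LABEL columns
    csv = io.StringIO("ID,LABEL,f1\n1,spam,0.3\n2,ham,0.7\n")
    df = pd.read_csv(csv).set_index("ID")

    # Fake zip archive holding one "<ID>.raw" member per sample
    buffer = io.BytesIO()
    with ZipFile(buffer, "w") as zf:
        zf.writestr("1.raw", b"first sample")
        zf.writestr("2.raw", b"second sample")

    zip_file = ZipFile(buffer, "r")
    for identifier, label_name in df["LABEL"].items():
        raw = zip_file.read(f"{int(identifier)}.raw")
        print(identifier, label_name, raw)

    return df
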