def _generate_corpus(records, required_text_fields):
    """
    Generates a corpus from the input NYT records.

    :param records: The input NYT records.
    :type records: list
    :param required_text_fields: A list of the available NYT text fields.
    :type required_text_fields: list
    :return: :class:`orangecontrib.text.corpus.Corpus`
    """
    metas, class_values = _parse_record_json(records, required_text_fields)

    # Create domain.
    meta_vars = [StringVariable.make(field) for field in required_text_fields]
    meta_vars += [
        StringVariable.make("pub_date"),
        StringVariable.make("country"),
    ]
    class_vars = [
        DiscreteVariable("section_name", values=list(set(class_values)))
    ]
    domain = Domain([], class_vars=class_vars, metas=meta_vars)

    Y = np.array([class_vars[0].to_val(cv) for cv in class_values])[:, None]

    return Corpus(None, Y, metas, domain, meta_vars)    # used all features

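# A minimal sketch (not taken from any of the projects below) of the pattern the
# following examples share: StringVariable.make() creates string meta variables,
# which go into a Domain's metas and are filled from an object array. The column
# names and rows here are made up for illustration.
import numpy as np
from Orange.data import Domain, StringVariable, Table

title_var = StringVariable.make("title")     # hypothetical meta column
author_var = StringVariable.make("author")   # hypothetical meta column
demo_domain = Domain([], metas=[title_var, author_var])
demo_metas = np.array([["A study of X", "Doe"],
                       ["Notes on Y", "Roe"]], dtype=object)
demo_table = Table.from_numpy(demo_domain, X=np.empty((2, 0)), metas=demo_metas)
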
def concatenate_data(tables, filenames, label):
    domain, xs = domain_union_for_spectra(tables)
    ntables = [(table if isinstance(table, Table) else table[2]).transform(domain)
               for table in tables]
    data = type(ntables[0]).concatenate(ntables, axis=0)
    source_var = StringVariable.make("Filename")
    label_var = StringVariable.make("Label")

    # add other variables
    xs_atts = tuple([ContinuousVariable.make("%f" % f) for f in xs])
    domain = Domain(xs_atts + domain.attributes, domain.class_vars,
                    domain.metas + (source_var, label_var))
    data = data.transform(domain)

    # fill in spectral data
    xs_sind = np.argsort(xs)
    xs_sorted = xs[xs_sind]
    pos = 0
    for table in tables:
        t = table if isinstance(table, Table) else table[2]
        if not isinstance(table, Table):
            indices = xs_sind[np.searchsorted(xs_sorted, table[0])]
            data.X[pos:pos + len(t), indices] = table[1]
        pos += len(t)

    data[:, source_var] = np.array(list(
        chain(*(repeat(fn, len(table)) for fn, table in zip(filenames, ntables)))
    )).reshape(-1, 1)
    data[:, label_var] = np.array(list(
        chain(*(repeat(label, len(table)) for fn, table in zip(filenames, ntables)))
    )).reshape(-1, 1)
    return data

def parse_record_json(record, includes_metadata):
    """
    Parses the JSON representation of the record returned by the New York
    Times Article API.

    :param record: The JSON representation of the query's results.
    :param includes_metadata: The flags that determine which fields to include.
    :return: A list of articles parsed into documents and a list of the
        corresponding metadata, joined in a tuple.
    """
    text_fields = [
        "headline", "lead_paragraph", "snippet", "abstract", "keywords"
    ]
    documents = []
    class_values = []
    meta_vars = [
        StringVariable.make(field)
        for field, flag in zip(text_fields, includes_metadata) if flag
    ]
    # Also add pub_date and glocation.
    meta_vars += [
        StringVariable.make("pub_date"),
        StringVariable.make("country"),
    ]
    metadata = np.empty((0, len(meta_vars)), dtype=object)

    for doc in record["response"]["docs"]:
        string_document = ""
        metas_row = []
        for field, flag in zip(text_fields, includes_metadata):
            if flag and field in doc:
                field_value = ""
                if isinstance(doc[field], dict):
                    field_value = " ".join(
                        [val for val in doc[field].values() if val])
                elif isinstance(doc[field], list):
                    field_value = " ".join(
                        [kw["value"] for kw in doc[field] if kw])
                else:
                    if doc[field]:
                        field_value = doc[field]
                string_document += field_value
                metas_row.append(field_value)
        # Add the pub_date.
        field_value = ""
        if "pub_date" in doc and doc["pub_date"]:
            field_value = doc["pub_date"]
        metas_row.append(field_value)
        # Add the glocation.
        metas_row.append(",".join([
            kw["value"] for kw in doc["keywords"] if kw["name"] == "glocations"
        ]))
        # Add the section_name.
        class_val = ""
        if "section_name" in doc and doc["section_name"]:
            class_val = doc["section_name"]

        documents.append(string_document)
        class_values.append(class_val)
        metadata = np.vstack((metadata, np.array(metas_row)))

    return documents, metadata, meta_vars, class_values

def test_nyt_corpus_domain_generation(self):
    corpus = self.nyt.run_query('slovenia')

    meta_vars = [StringVariable.make(field) for field in NYT_TEXT_FIELDS] + \
                [StringVariable.make('pub_date'), StringVariable.make('country')]

    self.assertEqual(len(meta_vars), len(corpus.domain.metas))
    self.assertEqual(len(corpus.Y), 10)

def corpusDomain(mails):
    return Domain(
        [TimeVariable.make(FIELDDATE),
         DiscreteVariable.make(FIELDFROM, set([x[1] for x in mails])),
         DiscreteVariable.make(FIELDTO, set([x[2] for x in mails]))],
        metas=[StringVariable.make(FIELDTEXT),
               StringVariable.make(FIELDFILE),
               StringVariable.make(FIELDCOUNSELOR),
               StringVariable.make(FIELDSUBJECT)])

def corpusDomain(self, mails):
    return Domain(
        [TimeVariable.make("date"),
         DiscreteVariable.make("from", set([x[1] for x in mails])),
         DiscreteVariable.make("to", set([x[2] for x in mails])),
         DiscreteVariable.make("duplicate", set([x[3] for x in mails]))],
        metas=[StringVariable.make("file"),
               StringVariable.make("subject"),
               StringVariable.make("extra"),
               StringVariable.make("text")])

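# Side sketch (not from either widget above): DiscreteVariable.make needs its
# value list up front, so it is typically collected from the data first. The
# addresses below are made up; sorting the set gives a stable value order.
from Orange.data import DiscreteVariable

senders = ["alice@example.com", "bob@example.com", "alice@example.com"]
from_var = DiscreteVariable.make("from", values=sorted(set(senders)))
row_value = from_var.to_val("bob@example.com")   # index into from_var.values
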
def capture_image(self):
    cap = self.cap
    for i in range(3):  # Need some warmup time; use the last frame
        success, frame = cap.read()
    if success:
        self.Error.no_webcam.clear()
    else:
        self.Error.no_webcam()
        return

    def normalize(name):
        return ''.join(
            ch for ch in unicodedata.normalize('NFD', name.replace(' ', '_'))
            if unicodedata.category(ch) in 'LuLlPcPd')

    timestamp = datetime.now().strftime('%Y%m%d%H%M%S.%f')
    image_title, self.image_title = self.image_title or self.DEFAULT_TITLE, ''
    normed_name = normalize(image_title)

    for image, suffix, output in ((frame, '', self.Output.SNAPSHOT),
                                  (self.clip_aspect_frame(frame), '_aspect',
                                   self.Output.SNAPSHOT_ASPECT)):
        path = os.path.join(
            self.IMAGE_DIR,
            '{normed_name}_{timestamp}{suffix}.png'.format(**locals()))
        cv2.imwrite(
            path,
            # imwrite expects original bgr image, so this is reversed
            self.bgr2rgb(image) if self.avatar_filter else image)
        size = ContinuousVariable.make('size')
        width = ContinuousVariable.make('width')
        height = ContinuousVariable.make('height')
        s, w, h = self.image_meta_data(path)
        image_var = StringVariable.make('image')
        image_var.attributes['type'] = 'image'
        metas = np.array([[image_title, path, s, w, h]], dtype=object)
        table = Table.from_numpy(
            Domain([], metas=[StringVariable.make('image name'), image_var,
                              size, width, height]),
            np.empty((1, 0)), metas=metas)
        self.send(output, table)
    self.snapshot_flash = 80

def make_orange_table(self, data, Y, file_names, domain_metas):
    """
    Make Orange table with min, max and avg classification probabilities

    :param data: min, max and avg classification probabilities
    :param Y: category values
    :param file_names: name of files
    :param domain_metas: all meta attributes from domain
    :return: orange table with category, file name, min, max and avg
        classification probabilities
    """
    category = DiscreteVariable(
        "Target class", values=self.discrete_atributes)
    attributes = []
    for i in range(5, len(domain_metas)):
        if str(type(domain_metas[i])) == "ContinuousVariable":
            attributes.append(ContinuousVariable.make(
                "MIN_" + str(domain_metas[i].name)))
            attributes.append(ContinuousVariable.make(
                "MAX_" + str(domain_metas[i].name)))
            attributes.append(ContinuousVariable.make(
                "AVG_" + str(domain_metas[i].name)))

    METAS = [('File name', 'file_name')]
    meta_attr = [StringVariable.make(meta[0]) for meta in METAS]
    self.domain = Domain(attributes, class_vars=category, metas=meta_attr)
    orange_table = Table(self.domain, data, Y, file_names)
    return orange_table

def _corpus_from_records(records, includes_metadata):
    """Receives PubMed records and transforms them into a corpus.

    Args:
        records (list): A list of PubMed entries.
        includes_metadata (list): A list of text fields to include.

    Returns:
        corpus: The output Corpus.
    """
    meta_values, class_values = _records_to_corpus_entries(
        records,
        includes_metadata=includes_metadata
    )
    meta_vars = []
    for field_name, _ in includes_metadata:
        if field_name == 'pub_date':
            meta_vars.append(TimeVariable(field_name))
        else:
            meta_vars.append(StringVariable.make(field_name))

    class_vars = [
        DiscreteVariable('section_name', values=list(set(class_values)))
    ]
    domain = Domain([], class_vars=class_vars, metas=meta_vars)

    Y = np.array([class_vars[0].to_val(cv) for cv in class_values])[:, None]

    return Corpus(domain=domain, Y=Y, metas=meta_values)

def _guess_variable(self, field_name, field_metadata, inspect_table):
    type_code = field_metadata[0]

    NUMERIC_TYPES = ("FLOAT", "DOUBLE", "DECIMAL")  # real, float8, numeric
    INT_TYPES = ("INT", "TINYINT", "SMALLINT", "MEDIUMINT", "BIGINT")
    DATE_TYPES = ("DATE", "DATETIME", "YEAR")
    TIME_TYPES = ("TIMESTAMP", "TIME")
    CHAR_TYPES = ("CHAR", "ENUM")

    if type_code in NUMERIC_TYPES:
        return ContinuousVariable.make(field_name)

    if type_code in TIME_TYPES + DATE_TYPES:
        tv = TimeVariable.make(field_name)
        tv.have_date |= type_code in DATE_TYPES
        tv.have_time |= type_code in TIME_TYPES
        return tv

    if type_code in INT_TYPES:  # bigint, int, smallint
        if inspect_table:
            values = self.get_distinct_values(field_name, inspect_table)
            if values:
                return DiscreteVariable.make(field_name, values)
        return ContinuousVariable.make(field_name)

    if type_code in CHAR_TYPES:
        if inspect_table:
            values = self.get_distinct_values(field_name, inspect_table)
            # remove trailing spaces
            values = [v.rstrip() for v in values]
            if values:
                return DiscreteVariable.make(field_name, values)

    return StringVariable.make(field_name)

def _create_corpus(self):
    corpus = None
    names = ["name", "path", "content"]
    data = []
    category_data = []
    text_categories = list(set(t.category for t in self._text_data))
    values = list(set(text_categories))
    category_var = DiscreteVariable.make("category", values=values)
    for textdata in self._text_data:
        data.append(
            [textdata.name, textdata.path, textdata.content]
        )
        category_data.append(category_var.to_val(textdata.category))
    if len(text_categories) > 1:
        category_data = np.array(category_data)
    else:
        category_var = []
        category_data = np.empty((len(data), 0))
    domain = Domain(
        [], category_var, [StringVariable.make(name) for name in names]
    )
    domain["name"].attributes["title"] = True
    data = np.array(data, dtype=object)
    if len(data):
        corpus = Corpus(domain, Y=category_data, metas=data,
                        text_features=[domain.metas[2]])
    return corpus

def generate_corpus(self, url_list):
    """
    generate new corpus with values requested by user
    :param url_list:
    :return: corpus
    """
    new_table = None
    text_includes_params = [self.includes_article, self.includes_author,
                            self.includes_date, self.includes_title,
                            self.includes_web_url]
    if True not in text_includes_params:
        self.warning(1, "You must select at least one text field.")
        return

    required_text_fields = [
        incl_field for yes, incl_field
        in zip(text_includes_params, ARTICLE_TEXT_FIELDS) if yes
    ]
    meta_vars = [StringVariable.make(field) for field in required_text_fields]

    metadata = []
    for url in url_list:
        info, is_cached = _get_info(url)
        final_fields = [
            incl_field for yes, incl_field
            in zip(text_includes_params, info) if yes
        ]
        metadata.append(final_fields)
    metadata = np.array(metadata, dtype=object)
    metas = metadata

    domain = Domain([], class_vars=None, metas=meta_vars)
    new_table = Corpus(None, None, metadata, domain, meta_vars)

    self.output_corpus = new_table
    self.send("Corpus", self.output_corpus)

def __into_orange_table(self, attrs, X, meta_parts):
    if not attrs and X.shape[1]:
        attrs = Domain.from_numpy(X).attributes
    try:
        metas = None
        M = None
        if meta_parts:
            meta_parts = [
                df_.reset_index() if not df_.index.is_integer() else df_
                for df_ in meta_parts
            ]
            metas = [
                StringVariable.make(name)
                for name in chain(*(_.columns for _ in meta_parts))
            ]
            M = np.hstack(tuple(df_.values for df_ in meta_parts))
        domain = Domain(attrs, metas=metas)
        table = Table.from_numpy(domain, X, None, M)
    except ValueError:
        table = None
        rows = self.leading_cols if self.transposed else self.leading_rows
        cols = self.leading_rows if self.transposed else self.leading_cols
        self.errors["inadequate_headers"] = (rows, cols)
    return table

def etc_to_table(etc_json, time_var=False):
    """ Converts data from Json to :obj:`Orange.data.table`

    Args:
        etc_json (dict): Data in json like format from genesis
        time_var (bool): Create column of time points. Default is set to False.

    Returns:
        :obj:`Orange.data.Table`
    """
    variables = []
    time_point = 1
    for time in etc_json['etc']['timePoints']:
        var = ContinuousVariable('TP ' + str(time_point))
        var.attributes['Time'] = str(time)
        variables.append(var)
        time_point += 1

    meta_attr = StringVariable.make('Gene')
    domain = Domain(variables, metas=[meta_attr])

    table = []
    for row in etc_json['etc']['genes']:
        gene_expression = [exp for exp in etc_json['etc']['genes'][row]]
        gene_expression.append(row)
        table.append(gene_expression)

    orange_table = Table(domain, table)

    if time_var:
        orange_table = transpose_table(orange_table)

    return orange_table

def transpose_table(table):
    """ Transpose the rows and columns of the table.

    Args:
        table: Data in :obj:`Orange.data.Table`

    Returns:
        Transposed :obj:`Orange.data.Table`. (Genes as columns)
    """
    # TODO: remove this and use Orange.data.Table.transpose
    attrs = table.domain.attributes
    attr = [ContinuousVariable.make(ex['Gene'].value) for ex in table]

    # Set metas
    new_metas = [
        StringVariable.make(name) if name != 'Time' else TimeVariable.make(name)
        for name in sorted(table.domain.variables[0].attributes.keys())
    ]
    domain = Domain(attr, metas=new_metas)
    meta_values = [
        [exp.attributes[var.name] for var in domain.metas] for exp in attrs
    ]

    return Table(domain, table.X.transpose(), metas=meta_values)

def _guess_variable(self, field_name, field_metadata, inspect_table):
    type_code = field_metadata[0]

    FLOATISH_TYPES = (700, 701, 1700)  # real, float8, numeric
    INT_TYPES = (20, 21, 23)  # bigint, int, smallint
    CHAR_TYPES = (25, 1042, 1043,)  # text, char, varchar
    BOOLEAN_TYPES = (16,)  # bool
    DATE_TYPES = (1082, 1114, 1184,)  # date, timestamp, timestamptz
    # time, timestamp, timestamptz, timetz
    TIME_TYPES = (1083, 1114, 1184, 1266,)

    if type_code in FLOATISH_TYPES:
        return ContinuousVariable.make(field_name)

    if type_code in TIME_TYPES + DATE_TYPES:
        tv = TimeVariable.make(field_name)
        tv.have_date |= type_code in DATE_TYPES
        tv.have_time |= type_code in TIME_TYPES
        return tv

    if type_code in INT_TYPES:  # bigint, int, smallint
        if inspect_table:
            values = self.get_distinct_values(field_name, inspect_table)
            if values:
                return DiscreteVariable.make(field_name, values)
        return ContinuousVariable.make(field_name)

    if type_code in BOOLEAN_TYPES:
        return DiscreteVariable.make(field_name, ['false', 'true'])

    if type_code in CHAR_TYPES:
        if inspect_table:
            values = self.get_distinct_values(field_name, inspect_table)
            # remove trailing spaces
            values = [v.rstrip() for v in values]
            if values:
                return DiscreteVariable.make(field_name, values)

    return StringVariable.make(field_name)

def _generate_corpus(records, required_text_fields):
    """
    Generates a corpus from the input NYT records.

    :param records: The input NYT records.
    :type records: list
    :param required_text_fields: A list of the available NYT text fields.
    :type required_text_fields: list
    :return: :class:`orangecontrib.text.corpus.Corpus`
    """
    metas, class_values = _parse_record_json(records, required_text_fields)

    documents = []
    for doc in metas:
        documents.append(" ".join([d for d in doc if d is not None]).strip())

    # Create domain.
    meta_vars = [StringVariable.make(field) for field in required_text_fields]
    meta_vars += [StringVariable.make("pub_date"), StringVariable.make("country")]
    class_vars = [DiscreteVariable("section_name", values=list(set(class_values)))]
    domain = Domain([], class_vars=class_vars, metas=meta_vars)

    Y = np.array([class_vars[0].to_val(cv) for cv in class_values])[:, None]

    return Corpus(documents, None, Y, metas, domain)

def commit(self):
    table = None
    if self.data is not None:
        if self.correlations_type == 2 and self.target_variable and \
                self.target_variable.is_continuous:
            pearson = ContinuousVariable.make("Pearson")
            spearman = ContinuousVariable.make("Spearman")
            row_name = StringVariable.make("Variable")
            domain = Orange.data.Domain([pearson, spearman], metas=[row_name])
            table = Orange.data.Table(domain, self.target_correlations)
            for inst, name in zip(table, self.var_names):
                inst[row_name] = name
    self.send("Correlations", table)

def create_data_from_states(example_states, example_traces):
    data_desc = example_states[0].domain
    attributes = data_desc.get_attributes()
    domain = Domain(
        attributes, ContinuousVariable.make("complexity"),
        metas=[StringVariable.make("id"), ContinuousVariable("trace")])
    data = Table.from_domain(domain)
    for si, s in enumerate(example_states):
        e = Instance(domain)
        for f in attributes:
            e[f] = s.get_attribute(f)
        e["id"] = s.get_id()
        e["trace"] = example_traces[si]
        data.append(e)
    return data

def transpose_table(table):
    """ Transpose the rows and columns of the table.

    Args:
        table: Data in :obj:`Orange.data.Table`

    Returns:
        Transposed :obj:`Orange.data.Table`. (Genes as columns)
    """
    attrs = table.domain.attributes
    attr = [ContinuousVariable.make(ex['Gene'].value) for ex in table]

    # Set metas
    # Note: compare with != rather than "is not"; identity comparison with a
    # string literal is unreliable and raises a SyntaxWarning in newer Python.
    new_metas = [StringVariable.make(name) if name != 'Time'
                 else TimeVariable.make(name)
                 for name in sorted(table.domain.variables[0].attributes.keys())]
    domain = Domain(attr, metas=new_metas)
    meta_values = [[exp.attributes[var.name] for var in domain.metas]
                   for exp in attrs]

    return Table(domain, table.X.transpose(), metas=meta_values)

def test_domaineditor_makes_variables(self):
    # Variables created with domain editor should be interchangeable
    # with variables read from file.
    dat = """V0\tV1\nc\td\n\n1.0\t2"""
    v0 = StringVariable.make("V0")
    v1 = ContinuousVariable.make("V1")

    with named_file(dat, suffix=".tab") as filename:
        self.open_dataset(filename)

    model = self.widget.domain_editor.model()
    model.setData(model.createIndex(0, 1), "text", Qt.EditRole)
    model.setData(model.createIndex(1, 1), "numeric", Qt.EditRole)
    self.widget.apply_button.click()
    data = self.get_output(self.widget.Outputs.data)
    self.assertEqual(data.domain["V0"], v0)
    self.assertEqual(data.domain["V1"], v1)

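# Hedged aside (not part of the test suite): the reason variables created here
# can be compared with ones produced by the widget is that Variable.make()
# yields a variable that compares equal to any other variable of the same name
# and type.
from Orange.data import StringVariable

a = StringVariable.make("V0")
b = StringVariable.make("V0")
assert a == b
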
def _create_corpus(self) -> Corpus:
    corpus = None
    names = ["name", "path", "content"] if not self.is_conllu else [
        "name", "path", "utterance", "content"
    ]
    data = []
    category_data = []
    text_categories = list(set(t.category for t in self._text_data))
    values = list(set(text_categories))
    category_var = DiscreteVariable.make("category", values=values)
    for textdata in self._text_data:
        datum = [
            # some characters are written as decomposed (č is char c
            # and separate char for caron), with NFC normalization we
            # normalize them to be written as precomposed (č is one
            # unicode char - 0x10D)
            # https://docs.python.org/3/library/unicodedata.html#unicodedata.normalize
            normalize('NFC', textdata.name),
            normalize('NFC', textdata.path),
            normalize('NFC', textdata.content)
        ]
        if self.is_conllu:
            datum.insert(2, normalize('NFC', textdata.doc_id))
        data.append(datum)
        category_data.append(category_var.to_val(textdata.category))
    if len(text_categories) > 1:
        category_data = np.array(category_data)
    else:
        category_var = []
        category_data = np.empty((len(data), 0))
    domain = Domain([], category_var,
                    [StringVariable.make(name) for name in names])
    domain["name"].attributes["title"] = True
    data = np.array(data, dtype=object)
    if len(data):
        corpus = Corpus.from_numpy(
            domain,
            X=np.empty((len(category_data), 0)),
            Y=category_data,
            metas=data,
            text_features=[domain.metas[-1]]
        )
    return corpus

def _corpus_from_records(records, includes_metadata):
    """Receives PubMed records and transforms them into a corpus.

    Args:
        records (list): A list of PubMed entries.
        includes_metadata (list): A list of text fields to include.

    Returns:
        corpus: The output Corpus.
    """
    meta_vars = []
    time_var = None
    for field_name, _ in includes_metadata:
        if field_name == PUBMED_FIELD_DATE:
            time_var = TimeVariable(field_name)
            meta_vars.append(time_var)
        else:
            meta_vars.append(StringVariable.make(field_name))
            if field_name == PUBMED_FIELD_TITLE:
                meta_vars[-1].attributes["title"] = True

    meta_values, class_values = _records_to_corpus_entries(
        records,
        includes_metadata=includes_metadata,
        time_var=time_var,
    )

    class_vars = [
        DiscreteVariable('section',
                         values=list(map(str, set(filter(None, class_values)))))
    ]
    domain = Domain([], class_vars=class_vars, metas=meta_vars)

    Y = np.array([class_vars[0].to_val(cv) for cv in class_values])[:, None]

    return Corpus.from_numpy(domain=domain, X=np.empty((len(Y), 0)), Y=Y,
                             metas=meta_values)

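# Hedged aside (not from the PubMed module): a TimeVariable meta column stores
# parsed timestamps as floats, which is why the publication date gets its own
# variable type above. A minimal illustration:
from Orange.data import TimeVariable

pub_date = TimeVariable("pub_date")
ts = pub_date.parse("2014-03-01")   # seconds since the Unix epoch, as a float
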
def etc_to_table(self, etc_json, time_var=False, callback=lambda: None):
    """ Converts data from Json to :obj:`Orange.data.table`

    Args:
        etc_json (dict): Data in json like format
        time_var (bool): Create column of time points. Default is set to False.

    Returns:
        :obj:`Orange.data.Table`
    """
    cbc = CallBack(2, callback, callbacks=30)

    variables = []
    time_point = 1
    for time in etc_json['etc']['timePoints']:
        var = ContinuousVariable('TP ' + str(time_point))
        var.attributes['Time'] = str(time)
        variables.append(var)
        time_point += 1

    meta_attr = StringVariable.make('Gene')
    domain = Domain(variables, metas=[meta_attr])
    cbc()

    table = []
    for row in etc_json['etc']['genes']:
        gene_expression = [exp for exp in etc_json['etc']['genes'][row]]
        gene_expression.append(row)
        table.append(gene_expression)

    orange_table = Table(domain, table)

    if time_var:
        orange_table = transpose_table(orange_table)

    cbc()
    cbc.end()
    return orange_table

def _corpus_from_records(records, includes_metadata):
    """Receives PubMed records and transforms them into a corpus.

    Args:
        records (list): A list of PubMed entries.
        includes_metadata (list): A list of text fields to include.

    Returns:
        corpus: The output Corpus.
    """
    meta_vars = []
    time_var = None
    for field_name, _ in includes_metadata:
        if field_name == PUBMED_FIELD_DATE:
            time_var = TimeVariable(field_name)
            meta_vars.append(time_var)
        else:
            meta_vars.append(StringVariable.make(field_name))
            if field_name == PUBMED_FIELD_TITLE:
                meta_vars[-1].attributes["title"] = True

    meta_values, class_values = _records_to_corpus_entries(
        records,
        includes_metadata=includes_metadata,
        time_var=time_var,
    )

    class_vars = [
        DiscreteVariable('section',
                         values=list(map(str, set(filter(None, class_values)))))
    ]
    domain = Domain([], class_vars=class_vars, metas=meta_vars)

    Y = np.array([class_vars[0].to_val(cv) for cv in class_values])[:, None]

    return Corpus(domain=domain, Y=Y, metas=meta_values)

def read(self):
    who = matlab.whosmat(self.filename)
    if not who:
        raise IOError("Couldn't load matlab file " + self.filename)
    else:
        ml = matlab.loadmat(self.filename, chars_as_strings=True)
        ml = {a: b for a, b in ml.items() if isinstance(b, np.ndarray)}

        # X is the biggest numeric array
        numarrays = []
        for name, con in ml.items():
            if issubclass(con.dtype.type, numbers.Number):
                numarrays.append((name, reduce(lambda x, y: x*y, con.shape, 1)))
        X = None
        if numarrays:
            nameX = max(numarrays, key=lambda x: x[1])[0]
            X = ml.pop(nameX)

        # find an array with compatible shapes
        attributes = []
        if X is not None:
            nameattributes = None
            for name, con in ml.items():
                if con.shape in [(X.shape[1],), (1, X.shape[1])]:
                    nameattributes = name
                    break
            attributenames = ml.pop(nameattributes).ravel() if nameattributes \
                else range(X.shape[1])
            attributenames = [str(a).strip() for a in attributenames]  # strip because of numpy char array
            attributes = [ContinuousVariable.make(a) for a in attributenames]

        metas = []
        metaattributes = []

        sizemetas = None
        if X is None:
            counts = defaultdict(list)
            for name, con in ml.items():
                counts[len(con)].append(name)
            if counts:
                sizemetas = max(counts.keys(), key=lambda x: len(counts[x]))
        else:
            sizemetas = len(X)
        if sizemetas:
            for name, con in ml.items():
                if len(con) == sizemetas:
                    metas.append(name)

        metadata = []
        for m in sorted(metas):
            f = ml[m]
            metaattributes.append(StringVariable.make(m))
            f.resize(sizemetas, 1)
            metadata.append(f)

        metadata = np.hstack(tuple(metadata))

        domain = Domain(attributes, metas=metaattributes)

        if X is None:
            X = np.zeros((sizemetas, 0))

        return Orange.data.Table.from_numpy(domain, X, Y=None, metas=metadata)

def commit(self):
    path = self._current_path
    if not path:
        return

    transpose = not self._cells_in_rows
    row_annot = self.row_annotations_combo.currentData(Qt.UserRole)
    col_annot = self.col_annotations_combo.currentData(Qt.UserRole)

    if self._row_annotations_enabled and \
            isinstance(row_annot, RecentPath) and \
            os.path.exists(row_annot.abspath):
        row_annot = row_annot.abspath  # type: str
    else:
        row_annot = None

    if self._col_annotations_enabled and \
            isinstance(col_annot, RecentPath) and \
            os.path.exists(col_annot.abspath):
        col_annot = col_annot.abspath  # type: str
    else:
        col_annot = None

    meta_parts = []  # type: List[pd.DataFrame]
    attrs = []  # type: List[ContinuousVariable]
    metas = []  # type: List[StringVariable]

    rstate = np.random.RandomState(0x667)

    skip_row = skip_col = None
    if self._sample_cols_enabled:
        p = self._sample_cols_p
        if p < 100:
            def skip_col(i, p=p):
                return i > 3 and rstate.uniform(0, 100) > p

    if self._sample_rows_enabled:
        p = self._sample_rows_p
        if p < 100:
            def skip_row(i, p=p):
                return i > 3 and rstate.uniform(0, 100) > p

    header_rows = self._header_rows_count
    header_rows_indices = []
    if header_rows == 0:
        header_rows = None
    elif header_rows == 1:
        header_rows = 0
        header_rows_indices = [0]
    else:
        header_rows = list(range(header_rows))
        header_rows_indices = header_rows

    header_cols = self._header_cols_count
    header_cols_indices = []
    if header_cols == 0:
        header_cols = None
    elif header_cols == 1:
        header_cols = 0
        header_cols_indices = [0]
    else:
        header_cols = list(range(header_cols))
        header_cols_indices = header_cols

    if transpose:
        _skip_row, _skip_col = skip_col, skip_row
    else:
        _skip_col, _skip_row = skip_col, skip_row

    _userows = _usecols = None
    userows_mask = usecols_mask = None

    if _skip_col is not None:
        ncols = pd.read_csv(path, sep=separator_from_filename(path),
                            index_col=None, nrows=1).shape[1]
        usecols_mask = np.array([
            not _skip_col(i) or i in header_cols_indices
            for i in range(ncols)
        ], dtype=bool)
        _usecols = np.flatnonzero(usecols_mask)

    if _skip_row is not None:
        userows_mask = []  # record the used rows

        def _skip_row(i, test=_skip_row):
            r = test(i)
            userows_mask.append(r)
            return r

    meta_df_index = None
    row_annot_header = 0
    row_annot_columns = None
    col_annot_header = 0
    col_annot_columns = None

    if os.path.splitext(path)[1] == ".mtx":
        # 10x cellranger output
        X = scipy.io.mmread(path)
        assert isinstance(X, scipy.sparse.coo_matrix)
        if transpose:
            X = X.T
        if _skip_row is not None:
            userows_mask = np.array(
                [not _skip_row(i) for i in range(X.shape[0])])
            X = X.tocsr()[np.flatnonzero(userows_mask)]
        if _skip_col is not None:
            usecols_mask = np.array(
                [not _skip_col(i) for i in range(X.shape[1])])
            X = X.tocsc()[:, np.flatnonzero(usecols_mask)]
        X = X.todense(order="F")
        if userows_mask is not None:
            meta_df = pd.DataFrame({}, index=np.flatnonzero(userows_mask))
        else:
            meta_df = pd.DataFrame({}, index=pd.RangeIndex(X.shape[0]))
        meta_df_index = meta_df.index
        row_annot_header = None
        row_annot_columns = ["Barcodes"]
        col_annot_header = None
        col_annot_columns = ["Id", "Gene"]
        leading_cols = leading_rows = 0
    else:
        df = pd.read_csv(path, sep=separator_from_filename(path),
                         index_col=header_cols, header=header_rows,
                         skiprows=_skip_row, usecols=_usecols)
        if _skip_row is not None:
            userows_mask = np.array(userows_mask, dtype=bool)
        if transpose:
            df = df.transpose()
            userows_mask, usecols_mask = usecols_mask, userows_mask
            leading_rows = len(header_cols_indices)
            leading_cols = len(header_rows_indices)
        else:
            leading_rows = len(header_rows_indices)
            leading_cols = len(header_cols_indices)
        X = df.values
        attrs = [ContinuousVariable.make(str(g)) for g in df.columns]
        meta_df = df.iloc[:, :0]  # Take the index  # type: pd.DataFrame
        meta_df_index = df.index
        meta_parts = (meta_df,)

    self.Error.row_annotation_mismatch.clear()
    self.Error.col_annotation_mismatch.clear()

    if row_annot is not None:
        row_annot_df = pd.read_csv(row_annot,
                                   sep=separator_from_filename(row_annot),
                                   header=row_annot_header,
                                   names=row_annot_columns,
                                   index_col=None)
        if userows_mask is not None:
            # NOTE: we account for column header/ row index
            expected = len(userows_mask) - leading_rows
        else:
            expected = X.shape[0]
        if len(row_annot_df) != expected:
            self.Error.row_annotation_mismatch(expected, len(row_annot_df))
            row_annot_df = None

        if row_annot_df is not None and userows_mask is not None:
            # use the same sample indices
            indices = np.flatnonzero(userows_mask[leading_rows:])
            row_annot_df = row_annot_df.iloc[indices]
            # if path.endswith(".count") and row_annot.endswith('.meta'):
            #     assert np.all(row_annot_df.iloc[:, 0] == df.index)

        if row_annot_df is not None and meta_df_index is not None:
            # Try to match the leading columns with the meta_df_index.
            # If found then drop the columns (or index if the level does
            # not have a name but the annotation col does)
            drop_cols = []
            drop_index_level = []
            for i in range(meta_df_index.nlevels):
                meta_df_level = meta_df_index.get_level_values(i)
                if np.all(row_annot_df.iloc[:, i] == meta_df_level):
                    if meta_df_level.name is None:
                        drop_index_level.append(i)
                    elif meta_df_level.name == row_annot_df.columns[i].name:
                        drop_cols.append(i)

            if drop_cols:
                row_annot_df = row_annot_df.drop(columns=drop_cols)

            if drop_index_level:
                for i in reversed(drop_index_level):
                    if isinstance(meta_df.index, pd.MultiIndex):
                        meta_df_index = meta_df_index.droplevel(i)
                    else:
                        assert i == 0
                        meta_df_index = pd.RangeIndex(meta_df_index.size)
                meta_df = pd.DataFrame({}, index=meta_df_index)

        if row_annot_df is not None:
            meta_parts = (meta_df, row_annot_df)

    if col_annot is not None:
        col_annot_df = pd.read_csv(col_annot,
                                   sep=separator_from_filename(col_annot),
                                   header=col_annot_header,
                                   names=col_annot_columns,
                                   index_col=None)
        if usecols_mask is not None:
            expected = len(usecols_mask) - leading_cols
        else:
            expected = X.shape[1]
        if len(col_annot_df) != expected:
            self.Error.col_annotation_mismatch(expected, len(col_annot_df))
            col_annot_df = None

        if col_annot_df is not None and usecols_mask is not None:
            indices = np.flatnonzero(usecols_mask[leading_cols:])
            col_annot_df = col_annot_df.iloc[indices]

        if col_annot_df is not None:
            assert len(col_annot_df) == X.shape[1]
            if not attrs and X.shape[1]:  # No column names yet
                attrs = [ContinuousVariable.make(str(v))
                         for v in col_annot_df.iloc[:, 0]]
            names = [str(c) for c in col_annot_df.columns]
            for var, values in zip(attrs, col_annot_df.values):
                var.attributes.update(
                    {n: v for n, v in zip(names, values)})

    if meta_parts:
        meta_parts = [
            df_.reset_index() if not df_.index.is_integer() else df_
            for df_ in meta_parts
        ]
        metas = [
            StringVariable.make(name)
            for name in chain(*(_.columns for _ in meta_parts))
        ]
        M = np.hstack(tuple(df_.values for df_ in meta_parts))
    else:
        metas = None
        M = None

    if not attrs and X.shape[1]:
        attrs = Orange.data.Domain.from_numpy(X).attributes

    domain = Orange.data.Domain(attrs, metas=metas)
    d = Orange.data.Table.from_numpy(domain, X, None, M)

    self.Outputs.data.send(d)
    self.set_modified(False)

from orangecontrib.text.corpus import Corpus
from nltk import word_tokenize
import re
import sys
import datetime
from Orange.data import Table, Domain
from Orange.data import StringVariable

N = 20
EMPTYLIST = []
EMPTYSTRING = ""
FIELDNAMEDATE = "date"
FIELDNAMETEXT = "text"
FIELDNAMECOORDINATES = "coordinates"
COLUMNDOMAIN = StringVariable.make(FIELDNAMECOORDINATES)


def makeRefId(msgId, index):
    return " ".join([str(msgId + 1), str(index)])


def getDateFromRefId(refId):
    return " ".join(refId.split()[0:2])


def makePhrase(wordList, index):
    return " ".join(wordList[index:index + N])


def addPhraseToRefs(phraseRefs, phrase, msgId, index):
    phraseRefs[phrase] = makeRefId(msgId, index)

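# A small usage sketch (made-up inputs, not from the original module) for the
# helpers above: makePhrase joins up to N consecutive tokens into one phrase,
# and addPhraseToRefs records which message and position it came from.
words = "the quick brown fox jumps over the lazy dog".split()
phrase = makePhrase(words, 0)                 # first N (here: all nine) words
phrase_refs = {}
addPhraseToRefs(phrase_refs, phrase, 41, 0)   # stores {phrase: "42 0"} (msgId is 1-based)
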
def read(self):
    import opusFC

    if self.sheet:
        db = self.sheet
    else:
        db = self.sheets[0]

    db = tuple(db.split(" "))
    dim = db[1]

    try:
        data = opusFC.getOpusData(self.filename, db)
    except Exception:
        raise IOError("Couldn't load spectrum from " + self.filename)

    attrs, clses, metas = [], [], []

    attrs = [ContinuousVariable.make(repr(data.x[i]))
             for i in range(data.x.shape[0])]

    y_data = None
    meta_data = None

    if type(data) == opusFC.MultiRegionDataReturn:
        y_data = []
        meta_data = []
        metas.extend([ContinuousVariable.make('map_x'),
                      ContinuousVariable.make('map_y'),
                      StringVariable.make('map_region'),
                      TimeVariable.make('start_time')])
        for region in data.regions:
            y_data.append(region.spectra)
            mapX = region.mapX
            mapY = region.mapY
            map_region = np.full_like(mapX, region.title, dtype=object)
            start_time = region.start_time
            meta_region = np.column_stack((mapX, mapY,
                                           map_region, start_time))
            meta_data.append(meta_region.astype(object))
        y_data = np.vstack(y_data)
        meta_data = np.vstack(meta_data)
    elif type(data) == opusFC.MultiRegionTRCDataReturn:
        y_data = []
        meta_data = []
        metas.extend([ContinuousVariable.make('map_x'),
                      ContinuousVariable.make('map_y'),
                      StringVariable.make('map_region')])
        attrs = [ContinuousVariable.make(repr(data.labels[i]))
                 for i in range(len(data.labels))]
        for region in data.regions:
            y_data.append(region.spectra)
            mapX = region.mapX
            mapY = region.mapY
            map_region = np.full_like(mapX, region.title, dtype=object)
            meta_region = np.column_stack((mapX, mapY, map_region))
            meta_data.append(meta_region.astype(object))
        y_data = np.vstack(y_data)
        meta_data = np.vstack(meta_data)
    elif type(data) == opusFC.ImageDataReturn:
        metas.extend([ContinuousVariable.make('map_x'),
                      ContinuousVariable.make('map_y')])
        data_3D = data.spectra
        for i in np.ndindex(data_3D.shape[:1]):
            map_y = np.full_like(data.mapX, data.mapY[i])
            coord = np.column_stack((data.mapX, map_y))
            if y_data is None:
                y_data = data_3D[i]
                meta_data = coord.astype(object)
            else:
                y_data = np.vstack((y_data, data_3D[i]))
                meta_data = np.vstack((meta_data, coord))
    elif type(data) == opusFC.ImageTRCDataReturn:
        metas.extend([ContinuousVariable.make('map_x'),
                      ContinuousVariable.make('map_y')])
        attrs = [ContinuousVariable.make(repr(data.labels[i]))
                 for i in range(len(data.labels))]
        data_3D = data.traces
        for i in np.ndindex(data_3D.shape[:1]):
            map_y = np.full_like(data.mapX, data.mapY[i])
            coord = np.column_stack((data.mapX, map_y))
            if y_data is None:
                y_data = data_3D[i]
                meta_data = coord.astype(object)
            else:
                y_data = np.vstack((y_data, data_3D[i]))
                meta_data = np.vstack((meta_data, coord))
    elif type(data) == opusFC.TimeResolvedTRCDataReturn:
        y_data = data.traces
    elif type(data) == opusFC.TimeResolvedDataReturn:
        metas.extend([ContinuousVariable.make('z')])
        y_data = data.spectra
        meta_data = data.z
    elif type(data) == opusFC.SingleDataReturn:
        y_data = data.y[None, :]
    else:
        raise ValueError("Empty or unsupported opusFC DataReturn object: "
                         + type(data))

    import_params = ['SRT', 'SNM']

    for param_key in import_params:
        try:
            param = data.parameters[param_key]
        except KeyError:
            pass  # TODO should notify user?
        else:
            try:
                param_name = opusFC.paramDict[param_key]
            except KeyError:
                param_name = param_key
            if param_key == 'SRT':
                var = TimeVariable.make(param_name)
            elif type(param) is float:
                var = ContinuousVariable.make(param_name)
            elif type(param) is str:
                var = StringVariable.make(param_name)
            else:
                raise ValueError  # Found a type to handle
            metas.extend([var])
            params = np.full((y_data.shape[0],), param, np.array(param).dtype)
            if meta_data is not None:
                # NB dtype default will be np.array(fill_value).dtype in future
                meta_data = np.column_stack((meta_data, params.astype(object)))
            else:
                meta_data = params

    domain = Orange.data.Domain(attrs, clses, metas)
    meta_data = np.atleast_2d(meta_data)
    table = Orange.data.Table.from_numpy(domain,
                                         y_data.astype(float, order='C'),
                                         metas=meta_data)
    return table

def read(self):
    who = matlab.whosmat(self.filename)
    if not who:
        raise IOError("Couldn't load matlab file " + self.filename)
    else:
        ml = matlab.loadmat(self.filename, chars_as_strings=True)
        ml = {a: b for a, b in ml.items() if isinstance(b, np.ndarray)}

        def num_elements(array):
            return reduce(lambda x, y: x * y, array.shape, 1)

        def find_biggest(arrays):
            sizes = []
            for n, c in arrays.items():
                sizes.append((num_elements(c), n))
            return max(sizes)[1]

        def is_string_array(array):
            return issubclass(array.dtype.type, np.str_)

        def is_number_array(array):
            return issubclass(array.dtype.type, numbers.Number)

        numeric = {n: a for n, a in ml.items() if is_number_array(a)}

        # X is the biggest numeric array
        X = ml.pop(find_biggest(numeric)) if numeric else None

        # find an array with compatible shapes
        attributes = []
        if X is not None:
            name_array = None
            for name in sorted(ml):
                con = ml[name]
                if con.shape in [(X.shape[1],), (1, X.shape[1])]:
                    name_array = name
                    break
            names = ml.pop(name_array).ravel() if name_array \
                else range(X.shape[1])
            names = [str(a).rstrip() for a in names]  # remove matlab char padding
            attributes = [ContinuousVariable.make(a) for a in names]

        meta_names = []
        metas = []

        meta_size = None
        if X is None:
            counts = defaultdict(list)
            for name, con in ml.items():
                counts[len(con)].append(name)
            if counts:
                meta_size = max(counts.keys(), key=lambda x: len(counts[x]))
        else:
            meta_size = len(X)
        if meta_size:
            for name, con in ml.items():
                if len(con) == meta_size:
                    meta_names.append(name)

        meta_data = []
        for m in sorted(meta_names):
            f = ml[m]
            if is_string_array(f) and len(f.shape) == 1:  # 1D string arrays
                metas.append(StringVariable.make(m))
                f = np.array([a.rstrip() for a in f])  # remove matlab char padding
                f.resize(meta_size, 1)
                meta_data.append(f)
            elif is_number_array(f) and len(f.shape) == 2:
                if f.shape[1] == 1:
                    names = [m]
                else:
                    names = [m + "_" + str(i + 1) for i in range(f.shape[1])]
                for n in names:
                    metas.append(ContinuousVariable.make(n))
                meta_data.append(f)

        meta_data = np.hstack(tuple(meta_data)) if meta_data else None

        domain = Domain(attributes, metas=metas)

        if X is None:
            X = np.zeros((meta_size, 0))

        return Orange.data.Table.from_numpy(domain, X, Y=None, metas=meta_data)

def read(self):
    try:
        import opusFC
    except ImportError:
        raise RuntimeError(self._OPUS_WARNING)

    if self.sheet:
        db = self.sheet
    else:
        db = self.sheets[0]

    db = tuple(db.split(" "))
    dim = db[1]

    try:
        data = opusFC.getOpusData(self.filename, db)
    except Exception:
        raise IOError("Couldn't load spectrum from " + self.filename)

    attrs, clses, metas = [], [], []

    attrs = [ContinuousVariable.make(repr(data.x[i]))
             for i in range(data.x.shape[0])]

    y_data = None
    meta_data = None

    if type(data) == opusFC.MultiRegionDataReturn:
        y_data = []
        meta_data = []
        metas.extend([ContinuousVariable.make('map_x'),
                      ContinuousVariable.make('map_y'),
                      StringVariable.make('map_region'),
                      TimeVariable.make('start_time')])
        for region in data.regions:
            y_data.append(region.spectra)
            mapX = region.mapX
            mapY = region.mapY
            map_region = np.full_like(mapX, region.title, dtype=object)
            start_time = region.start_time
            meta_region = np.column_stack((mapX, mapY,
                                           map_region, start_time))
            meta_data.append(meta_region.astype(object))
        y_data = np.vstack(y_data)
        meta_data = np.vstack(meta_data)
    elif type(data) == opusFC.MultiRegionTRCDataReturn:
        y_data = []
        meta_data = []
        metas.extend([ContinuousVariable.make('map_x'),
                      ContinuousVariable.make('map_y'),
                      StringVariable.make('map_region')])
        attrs = [ContinuousVariable.make(repr(data.labels[i]))
                 for i in range(len(data.labels))]
        for region in data.regions:
            y_data.append(region.spectra)
            mapX = region.mapX
            mapY = region.mapY
            map_region = np.full_like(mapX, region.title, dtype=object)
            meta_region = np.column_stack((mapX, mapY, map_region))
            meta_data.append(meta_region.astype(object))
        y_data = np.vstack(y_data)
        meta_data = np.vstack(meta_data)
    elif type(data) == opusFC.ImageDataReturn:
        metas.extend([ContinuousVariable.make('map_x'),
                      ContinuousVariable.make('map_y')])
        data_3D = data.spectra
        for i in np.ndindex(data_3D.shape[:1]):
            map_y = np.full_like(data.mapX, data.mapY[i])
            coord = np.column_stack((data.mapX, map_y))
            if y_data is None:
                y_data = data_3D[i]
                meta_data = coord.astype(object)
            else:
                y_data = np.vstack((y_data, data_3D[i]))
                meta_data = np.vstack((meta_data, coord))
    elif type(data) == opusFC.ImageTRCDataReturn:
        metas.extend([ContinuousVariable.make('map_x'),
                      ContinuousVariable.make('map_y')])
        attrs = [ContinuousVariable.make(repr(data.labels[i]))
                 for i in range(len(data.labels))]
        data_3D = data.traces
        for i in np.ndindex(data_3D.shape[:1]):
            map_y = np.full_like(data.mapX, data.mapY[i])
            coord = np.column_stack((data.mapX, map_y))
            if y_data is None:
                y_data = data_3D[i]
                meta_data = coord.astype(object)
            else:
                y_data = np.vstack((y_data, data_3D[i]))
                meta_data = np.vstack((meta_data, coord))
    elif type(data) == opusFC.TimeResolvedTRCDataReturn:
        y_data = data.traces
    elif type(data) == opusFC.TimeResolvedDataReturn:
        metas.extend([ContinuousVariable.make('z')])
        y_data = data.spectra
        meta_data = data.z
    elif type(data) == opusFC.SingleDataReturn:
        y_data = data.y[None, :]
    else:
        raise ValueError("Empty or unsupported opusFC DataReturn object: "
                         + type(data))

    import_params = ['SRT', 'SNM']

    for param_key in import_params:
        try:
            param = data.parameters[param_key]
        except KeyError:
            pass  # TODO should notify user?
        else:
            try:
                param_name = opusFC.paramDict[param_key]
            except KeyError:
                param_name = param_key
            if param_key == 'SRT':
                var = TimeVariable.make(param_name)
            elif type(param) is float:
                var = ContinuousVariable.make(param_name)
            elif type(param) is str:
                var = StringVariable.make(param_name)
            else:
                raise ValueError  # Found a type to handle
            metas.extend([var])
            params = np.full((y_data.shape[0],), param, np.array(param).dtype)
            if meta_data is not None:
                # NB dtype default will be np.array(fill_value).dtype in future
                meta_data = np.column_stack((meta_data, params.astype(object)))
            else:
                meta_data = params

    domain = Orange.data.Domain(attrs, clses, metas)
    meta_data = np.atleast_2d(meta_data)
    table = Orange.data.Table.from_numpy(domain,
                                         y_data.astype(float, order='C'),
                                         metas=meta_data)
    return table

def read(self):
    import opusFC

    if self.sheet:
        db = self.sheet
    else:
        db = self.sheets[0]

    db = tuple(db.split(" "))
    dim = db[1]

    try:
        data = opusFC.getOpusData(self.filename, db)
    except Exception:
        raise IOError("Couldn't load spectrum from " + self.filename)

    attrs, clses, metas = [], [], []

    attrs = [ContinuousVariable.make(repr(data.x[i]))
             for i in range(data.x.shape[0])]

    y_data = None
    meta_data = None

    if dim == '3D':
        metas.extend([ContinuousVariable.make('map_x'),
                      ContinuousVariable.make('map_y')])

        if db[0] == 'TRC':
            attrs = [ContinuousVariable.make(repr(data.labels[i]))
                     for i in range(len(data.labels))]
            data_3D = data.traces
        else:
            data_3D = data.spectra

        for i in np.ndindex(data_3D.shape[:1]):
            map_y = np.full_like(data.mapX, data.mapY[i])
            coord = np.column_stack((data.mapX, map_y))
            if y_data is None:
                y_data = data_3D[i]
                meta_data = coord.astype(object)
            else:
                y_data = np.vstack((y_data, data_3D[i]))
                meta_data = np.vstack((meta_data, coord))

    elif dim == '2D':
        y_data = data.y[None, :]

    try:
        stime = data.parameters['SRT']
    except KeyError:
        pass  # TODO notify user?
    else:
        metas.extend([TimeVariable.make(opusFC.paramDict['SRT'])])
        if meta_data is not None:
            dates = np.full(meta_data[:, 0].shape, stime,
                            np.array(stime).dtype)
            meta_data = np.column_stack((meta_data, dates.astype(object)))
        else:
            meta_data = np.array([stime])[None, :]

    import_params = ['SNM']

    for param_key in import_params:
        try:
            param = data.parameters[param_key]
        except Exception:
            pass  # TODO should notify user?
        else:
            try:
                param_name = opusFC.paramDict[param_key]
            except KeyError:
                param_name = param_key
            if type(param) is float:
                var = ContinuousVariable.make(param_name)
            elif type(param) is str:
                var = StringVariable.make(param_name)
            else:
                raise ValueError  # Found a type to handle
            metas.extend([var])
            if meta_data is not None:
                # NB dtype default will be np.array(fill_value).dtype in future
                params = np.full(meta_data[:, 0].shape, param,
                                 np.array(param).dtype)
                meta_data = np.column_stack((meta_data, params.astype(object)))
            else:
                meta_data = np.array([param])[None, :]

    domain = Orange.data.Domain(attrs, clses, metas)

    table = Orange.data.Table.from_numpy(domain,
                                         y_data.astype(float, order='C'),
                                         metas=meta_data)
    return table