Example #1
def _generate_corpus(records, required_text_fields):
    """
    Generates a corpus from the input NYT records.
    :param records: The input NYT records.
    :type records: list
    :param required_text_fields: A list of the NYT text fields to include.
    :type required_text_fields: list
    :return: :class:`orangecontrib.text.corpus.Corpus`
    """
    metas, class_values = _parse_record_json(records, required_text_fields)

    # Create domain.
    meta_vars = [StringVariable.make(field) for field in required_text_fields]
    meta_vars += [
        StringVariable.make("pub_date"),
        StringVariable.make("country")
    ]
    class_vars = [
        DiscreteVariable("section_name", values=list(set(class_values)))
    ]
    domain = Domain([], class_vars=class_vars, metas=meta_vars)

    Y = np.array([class_vars[0].to_val(cv) for cv in class_values])[:, None]

    return Corpus(None, Y, metas, domain, meta_vars)  # used all features
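A minimal sketch of the pattern above, assuming Orange3 is installed: a domain with no attributes, one discrete class variable, and string metas. A plain Table stands in for Corpus so the sketch carries no orangecontrib dependency; all values are illustrative.

import numpy as np
from Orange.data import Domain, StringVariable, DiscreteVariable, Table

section = DiscreteVariable("section_name", values=["Arts", "Sports"])
meta_vars = [StringVariable.make("headline"), StringVariable.make("pub_date")]
domain = Domain([], class_vars=[section], metas=meta_vars)

# one class value and one metas row per document
Y = np.array([[section.to_val("Arts")], [section.to_val("Sports")]])
metas = np.array([["A headline", "2016-01-01"],
                  ["Another headline", "2016-01-02"]], dtype=object)
table = Table.from_numpy(domain, np.empty((2, 0)), Y, metas=metas)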
Example #2
def concatenate_data(tables, filenames, label):
    domain, xs = domain_union_for_spectra(tables)
    ntables = [(table if isinstance(table, Table) else table[2]).transform(domain)
               for table in tables]
    data = type(ntables[0]).concatenate(ntables, axis=0)
    source_var = StringVariable.make("Filename")
    label_var = StringVariable.make("Label")

    # add other variables
    xs_atts = tuple([ContinuousVariable.make("%f" % f) for f in xs])
    domain = Domain(xs_atts + domain.attributes, domain.class_vars,
                    domain.metas + (source_var, label_var))
    data = data.transform(domain)

    # fill in spectral data
    xs_sind = np.argsort(xs)
    xs_sorted = xs[xs_sind]
    pos = 0
    for table in tables:
        t = table if isinstance(table, Table) else table[2]
        if not isinstance(table, Table):
            indices = xs_sind[np.searchsorted(xs_sorted, table[0])]
            data.X[pos:pos+len(t), indices] = table[1]
        pos += len(t)

    data[:, source_var] = np.array(list(
        chain(*(repeat(fn, len(table))
                for fn, table in zip(filenames, ntables)))
    )).reshape(-1, 1)
    data[:, label_var] = np.array(list(
        chain(*(repeat(label, len(table))
                for fn, table in zip(filenames, ntables)))
    )).reshape(-1, 1)
    return data
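The argsort/searchsorted pair above maps each input table's wavenumbers onto columns of the (unsorted) union xs. A pure-numpy sketch of that index-mapping trick, with illustrative values:

import numpy as np

xs = np.array([3.0, 1.0, 2.0])   # union of wavenumbers, in domain order
part_xs = np.array([1.0, 3.0])   # wavenumbers of one input table
xs_sind = np.argsort(xs)         # [1, 2, 0]
indices = xs_sind[np.searchsorted(xs[xs_sind], part_xs)]
print(indices)                   # [1, 0]: columns of xs holding 1.0 and 3.0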
Example #3
def parse_record_json(record, includes_metadata):
    """
    Parses the JSON representation of the record returned by the New York Times Article API.
    :param record: The JSON representation of the query's results.
    :param includes_metadata: The flags that determine which fields to include.
    :return: A list of articles parsed into documents and a list of the
        corresponding metadata, joined in a tuple.
    """
    text_fields = [
        "headline", "lead_paragraph", "snippet", "abstract", "keywords"
    ]

    documents = []
    class_values = []
    meta_vars = [
        StringVariable.make(field)
        for field, flag in zip(text_fields, includes_metadata) if flag
    ]
    # Also add pub_date and glocation.
    meta_vars += [
        StringVariable.make("pub_date"),
        StringVariable.make("country")
    ]
    metadata = np.empty((0, len(meta_vars)), dtype=object)
    for doc in record["response"]["docs"]:
        string_document = ""
        metas_row = []
        for field, flag in zip(text_fields, includes_metadata):
            if flag and field in doc:
                field_value = ""
                if isinstance(doc[field], dict):
                    field_value = " ".join(
                        [val for val in doc[field].values() if val])
                elif isinstance(doc[field], list):
                    field_value = " ".join(
                        [kw["value"] for kw in doc[field] if kw])
                else:
                    if doc[field]:
                        field_value = doc[field]
                string_document += field_value
                metas_row.append(field_value)
        # Add the pub_date.
        field_value = ""
        if "pub_date" in doc and doc["pub_date"]:
            field_value = doc["pub_date"]
        metas_row.append(field_value)
        # Add the glocation.
        metas_row.append(",".join([
            kw["value"] for kw in doc["keywords"] if kw["name"] == "glocations"
        ]))

        # Add the section_name.
        class_val = ""
        if "section_name" in doc and doc["section_name"]:
            class_val = doc["section_name"]

        documents.append(string_document)
        class_values.append(class_val)
        metadata = np.vstack((metadata, np.array(metas_row)))
    return documents, metadata, meta_vars, class_values
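Note that np.vstack inside the document loop copies the whole metadata array on every iteration, which is quadratic in the number of documents; collecting rows in a list and converting once is linear. A minimal sketch with illustrative rows:

import numpy as np

rows = [["headline a", "2016-01-01"],
        ["headline b", "2016-01-02"]]
metadata = np.array(rows, dtype=object)  # one allocation instead of n vstacks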
Example #5
    def test_nyt_corpus_domain_generation(self):
        corpus = self.nyt.run_query('slovenia')
        meta_vars = [StringVariable.make(field) for field in NYT_TEXT_FIELDS] + \
                    [StringVariable.make('pub_date'), StringVariable.make('country')]

        self.assertEqual(len(meta_vars), len(corpus.domain.metas))
        self.assertEqual(len(corpus.Y), 10)
Example #7
def corpusDomain(mails):
    return Domain(
        [TimeVariable.make(FIELDDATE),
         DiscreteVariable.make(FIELDFROM, set([x[1] for x in mails])),
         DiscreteVariable.make(FIELDTO, set([x[2] for x in mails]))],
        metas=[StringVariable.make(FIELDTEXT),
               StringVariable.make(FIELDFILE),
               StringVariable.make(FIELDCOUNSELOR),
               StringVariable.make(FIELDSUBJECT)])
Example #8
    def corpusDomain(self, mails):
        return Domain(
            [TimeVariable.make("date"),
             DiscreteVariable.make("from", set([x[1] for x in mails])),
             DiscreteVariable.make("to", set([x[2] for x in mails])),
             DiscreteVariable.make("duplicate", set([x[3] for x in mails]))],
            metas=[StringVariable.make("file"),
                   StringVariable.make("subject"),
                   StringVariable.make("extra"),
                   StringVariable.make("text")])
Example #9
    def capture_image(self):
        cap = self.cap
        for i in range(3):  # Need some warmup time; use the last frame
            success, frame = cap.read()
            if success:
                self.Error.no_webcam.clear()
            else:
                self.Error.no_webcam()
                return

        def normalize(name):
            return ''.join(
                ch
                for ch in unicodedata.normalize('NFD', name.replace(' ', '_'))
                if unicodedata.category(ch) in 'LuLlPcPd')

        timestamp = datetime.now().strftime('%Y%m%d%H%M%S.%f')
        image_title, self.image_title = self.image_title or self.DEFAULT_TITLE, ''
        normed_name = normalize(image_title)

        for image, suffix, output in ((frame, '', self.Output.SNAPSHOT),
                                      (self.clip_aspect_frame(frame),
                                       '_aspect',
                                       self.Output.SNAPSHOT_ASPECT)):
            path = os.path.join(
                self.IMAGE_DIR,
                '{normed_name}_{timestamp}{suffix}.png'.format(**locals()))
            cv2.imwrite(
                path,
                # imwrite expects original bgr image, so this is reversed
                self.bgr2rgb(image) if self.avatar_filter else image)

            size = ContinuousVariable.make('size')
            width = ContinuousVariable.make('width')
            height = ContinuousVariable.make('height')
            s, w, h = self.image_meta_data(path)
            image_var = StringVariable.make('image')
            image_var.attributes['type'] = 'image'
            metas = np.array([[image_title, path, s, w, h]], dtype=object)
            table = Table.from_numpy(Domain(
                [],
                metas=[
                    StringVariable.make('image name'), image_var, size, width,
                    height
                ]),
                                     np.empty((1, 0)),
                                     metas=metas)
            self.send(output, table)

        self.snapshot_flash = 80
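The nested normalize helper above keeps only letters and connector/dash punctuation after NFD decomposition, which strips diacritics from file names. A standalone check using only the standard library:

import unicodedata

def normalize(name):
    return ''.join(
        ch
        for ch in unicodedata.normalize('NFD', name.replace(' ', '_'))
        if unicodedata.category(ch) in 'LuLlPcPd')

print(normalize('Žiga photo'))  # -> 'Ziga_photo'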
Example #10
def parse_record_json(record, includes_metadata):
    """
    Parses the JSON representation of the record returned by the New York Times Article API.
    :param record: The JSON representation of the query's results.
    :param includes_metadata: The flags that determine which fields to include.
    :return: A list of articles parsed into documents and a list of the
        corresponding metadata, joined in a tuple.
    """
    text_fields = ["headline", "lead_paragraph", "snippet", "abstract", "keywords"]

    documents = []
    class_values = []
    meta_vars = [StringVariable.make(field) for field, flag in zip(text_fields, includes_metadata) if flag]
    # Also add pub_date and glocation.
    meta_vars += [StringVariable.make("pub_date"), StringVariable.make("country")]
    metadata = np.empty((0, len(meta_vars)), dtype=object)
    for doc in record["response"]["docs"]:
        string_document = ""
        metas_row = []
        for field, flag in zip(text_fields, includes_metadata):
            if flag and field in doc:
                field_value = ""
                if isinstance(doc[field], dict):
                    field_value = " ".join([val for val in doc[field].values() if val])
                elif isinstance(doc[field], list):
                    field_value = " ".join([kw["value"] for kw in doc[field] if kw])
                else:
                    if doc[field]:
                        field_value = doc[field]
                string_document += field_value
                metas_row.append(field_value)
        # Add the pub_date.
        field_value = ""
        if "pub_date" in doc and doc["pub_date"]:
            field_value = doc["pub_date"]
        metas_row.append(field_value)
        # Add the glocation.
        metas_row.append(",".join([kw["value"] for kw in doc["keywords"] if kw["name"] == "glocations"]))

        # Add the section_name.
        class_val = ""
        if "section_name" in doc and doc["section_name"]:
            class_val = doc["section_name"]

        documents.append(string_document)
        class_values.append(class_val)
        metadata = np.vstack((metadata, np.array(metas_row)))
    return documents, metadata, meta_vars, class_values
Example #11
    def make_orange_table(self, data, Y, file_names, domain_metas):
        """
        Make Orange table with min, max and avg classification probabilities

        :param data: min, max and avg classification probabilities
        :param Y: category values
        :param file_names: name of files
        :param domain_metas: all meta attributes from domain
        :return: Orange table with category, file name, min, max and avg classification probabilities
        """

        category = DiscreteVariable(
            "Target class", values=self.discrete_atributes)

        attributes = []

        for i in range(5, len(domain_metas)):
            if isinstance(domain_metas[i], ContinuousVariable):
                attributes.append(ContinuousVariable.make(
                    "MIN_" + str(domain_metas[i].name)))
                attributes.append(ContinuousVariable.make(
                    "MAX_" + str(domain_metas[i].name)))
                attributes.append(ContinuousVariable.make(
                    "AVG_" + str(domain_metas[i].name)))

        METAS = [('File name', 'file_name')]
        meta_attr = [StringVariable.make(meta[0]) for meta in METAS]

        self.domain = Domain(attributes, class_vars=category, metas=meta_attr)

        orange_table = Table(self.domain, data, Y, file_names)

        return orange_table
Example #12
def _corpus_from_records(records, includes_metadata):
    """Receives PubMed records and transforms them into a corpus.

    Args:
        records (list): A list of PubMed entries.
        includes_metadata (list): A list of text fields to include.

    Returns:
        corpus: The output Corpus.
    """
    meta_values, class_values = _records_to_corpus_entries(
            records,
            includes_metadata=includes_metadata
    )
    meta_vars = []
    for field_name, _ in includes_metadata:
        if field_name == 'pub_date':
            meta_vars.append(TimeVariable(field_name))
        else:
            meta_vars.append(StringVariable.make(field_name))

    class_vars = [
        DiscreteVariable('section_name', values=list(set(class_values)))
    ]
    domain = Domain([], class_vars=class_vars, metas=meta_vars)

    Y = np.array([class_vars[0].to_val(cv) for cv in class_values])[:, None]

    return Corpus(domain=domain, Y=Y, metas=meta_values)
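pub_date gets a TimeVariable rather than a StringVariable because TimeVariable parses date strings into floats that Orange can sort and plot. A small sketch, assuming Orange3:

from Orange.data import TimeVariable

tv = TimeVariable("pub_date")
val = tv.parse("2016-01-19")  # float timestamp (seconds since the epoch)
print(tv.repr_val(val))       # '2016-01-19'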
Example #13
    def _guess_variable(self, field_name, field_metadata, inspect_table):
        type_code = field_metadata[0]

        NUMERIC_TYPES = ("FLOAT", "DOUBLE", "DECIMAL")  # real, float8, numeric
        INT_TYPES = ("INT", "TINYINT", "SMALLINT", "MEDIUMINT", "BIGINT")
        DATE_TYPES = ("DATE", "DATETIME", "YEAR")
        TIME_TYPES = ("TIMESTAMP", "TIME")
        CHAR_TYPES = ("CHAR", "ENUM")

        if type_code in NUMERIC_TYPES:
            return ContinuousVariable.make(field_name)

        if type_code in TIME_TYPES + DATE_TYPES:
            tv = TimeVariable.make(field_name)
            tv.have_date |= type_code in DATE_TYPES
            tv.have_time |= type_code in TIME_TYPES
            return tv

        if type_code in INT_TYPES:  # bigint, int, smallint
            if inspect_table:
                values = self.get_distinct_values(field_name, inspect_table)
                if values:
                    return DiscreteVariable.make(field_name, values)
            return ContinuousVariable.make(field_name)

        if type_code in CHAR_TYPES:
            if inspect_table:
                values = self.get_distinct_values(field_name, inspect_table)
                # remove trailing spaces
                values = [v.rstrip() for v in values]
                if values:
                    return DiscreteVariable.make(field_name, values)

        return StringVariable.make(field_name)
Example #14
    def _create_corpus(self):
        corpus = None
        names = ["name", "path", "content"]
        data = []
        category_data = []
        text_categories = list(set(t.category for t in self._text_data))
        values = list(set(text_categories))
        category_var = DiscreteVariable.make("category", values=values)
        for textdata in self._text_data:
            data.append(
                [textdata.name,
                 textdata.path,
                 textdata.content]
            )
            category_data.append(category_var.to_val(textdata.category))
        if len(text_categories) > 1:
            category_data = np.array(category_data)
        else:
            category_var = []
            category_data = np.empty((len(data), 0))
        domain = Domain(
            [], category_var, [StringVariable.make(name) for name in names]
        )
        domain["name"].attributes["title"] = True
        data = np.array(data, dtype=object)
        if len(data):
            corpus = Corpus(domain,
                            Y=category_data,
                            metas=data,
                            text_features=[domain.metas[2]])

        return corpus
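category_var.to_val above maps each category string to its index in values, which is what ends up in Y. A tiny sketch, assuming Orange3:

from Orange.data import DiscreteVariable

cat = DiscreteVariable.make("category", values=["news", "sport"])
print(cat.to_val("sport"))  # 1 (the index of 'sport' in values)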
Example #15
    def generate_corpus(self, url_list):
        """
        Generate a new corpus with the values requested by the user.
        :param url_list: The list of article URLs to fetch.
        :return: corpus
        """
        text_includes_params = [self.includes_article, self.includes_author,
                                self.includes_date, self.includes_title,
                                self.includes_web_url]
        if True not in text_includes_params:
            self.warning(1, "You must select at least one text field.")
            return
        required_text_fields = [
            incl_field
            for yes, incl_field in zip(text_includes_params, ARTICLE_TEXT_FIELDS)
            if yes
        ]
        meta_vars = [StringVariable.make(field) for field in required_text_fields]
        metadata = []
        for url in url_list:
            info, is_cached = _get_info(url)
            final_fields = [incl_field for yes, incl_field
                            in zip(text_includes_params, info) if yes]
            metadata.append(final_fields)
        metadata = np.array(metadata, dtype=object)
        domain = Domain([], class_vars=None, metas=meta_vars)
        self.output_corpus = Corpus(None, None, metadata, domain, meta_vars)
        self.send("Corpus", self.output_corpus)
Example #16
    def __into_orange_table(self, attrs, X, meta_parts):
        if not attrs and X.shape[1]:
            attrs = Domain.from_numpy(X).attributes

        try:
            metas = None
            M = None
            if meta_parts:
                meta_parts = [
                    df_.reset_index() if not df_.index.is_integer() else df_
                    for df_ in meta_parts
                ]
                metas = [
                    StringVariable.make(name)
                    for name in chain(*(_.columns for _ in meta_parts))
                ]
                M = np.hstack(tuple(df_.values for df_ in meta_parts))

            domain = Domain(attrs, metas=metas)
            table = Table.from_numpy(domain, X, None, M)
        except ValueError:
            table = None
            rows = self.leading_cols if self.transposed else self.leading_rows
            cols = self.leading_rows if self.transposed else self.leading_cols
            self.errors["inadequate_headers"] = (rows, cols)
        return table
Example #17
def etc_to_table(etc_json, time_var=False):
    """ Converts data from Json to :obj:`Orange.data.table`

    Args:
        etc_json (dict): Data in json like format from genesis
        time_var (bool): Create column of time points. Default is set to False.

    Returns:
        :obj:`Orange.data.Table`
    """

    variables = []
    time_point = 1
    for time in etc_json['etc']['timePoints']:
        var = ContinuousVariable('TP ' + str(time_point))
        var.attributes['Time'] = str(time)
        variables.append(var)
        time_point += 1

    meta_attr = StringVariable.make('Gene')
    domain = Domain(variables, metas=[meta_attr])

    table = []
    for row in etc_json['etc']['genes']:
        gene_expression = [exp for exp in etc_json['etc']['genes'][row]]
        gene_expression.append(row)
        table.append(gene_expression)

    orange_table = Table(domain, table)

    if time_var:
        orange_table = transpose_table(orange_table)

    return orange_table
Example #18
def transpose_table(table):
    """ Transpose the rows and columns of the table.

    Args:
        table: Data in :obj:`Orange.data.Table`

    Returns:
         Transposed :obj:`Orange.data.Table`. (Genes as columns)
    """

    # TODO: remove this and use Orange.data.Table.transpose

    attrs = table.domain.attributes
    attr = [ContinuousVariable.make(ex['Gene'].value) for ex in table]
    #  Set metas
    new_metas = [
        StringVariable.make(name)
        if name != 'Time' else TimeVariable.make(name)
        for name in sorted(table.domain.variables[0].attributes.keys())
    ]
    domain = Domain(attr, metas=new_metas)
    meta_values = [[exp.attributes[var.name] for var in domain.metas]
                   for exp in attrs]

    return Table(domain, table.X.transpose(), metas=meta_values)
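The TODO above refers to Orange.data.Table.transpose, which later Orange3 releases provide. A hedged sketch of the replacement call; the feature_names_column keyword is an assumption and may differ between versions, so check your installation's signature:

from Orange.data import Table

iris = Table("iris")
transposed = Table.transpose(iris, feature_names_column="")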
Example #19
def _corpus_from_records(records, includes_metadata):
    """Receives PubMed records and transforms them into a corpus.

    Args:
        records (list): A list of PubMed entries.
        includes_metadata (list): A list of text fields to include.

    Returns:
        corpus: The output Corpus.
    """
    meta_values, class_values = _records_to_corpus_entries(
        records, includes_metadata=includes_metadata)
    meta_vars = []
    for field_name, _ in includes_metadata:
        if field_name == 'pub_date':
            meta_vars.append(TimeVariable(field_name))
        else:
            meta_vars.append(StringVariable.make(field_name))

    class_vars = [
        DiscreteVariable('section_name', values=list(set(class_values)))
    ]
    domain = Domain([], class_vars=class_vars, metas=meta_vars)

    Y = np.array([class_vars[0].to_val(cv) for cv in class_values])[:, None]

    return Corpus(domain=domain, Y=Y, metas=meta_values)
Example #20
    def _create_corpus(self):
        corpus = None
        names = ["name", "path", "content"]
        data = []
        category_data = []
        text_categories = list(set(t.category for t in self._text_data))
        values = list(set(text_categories))
        category_var = DiscreteVariable.make("category", values=values)
        for textdata in self._text_data:
            data.append([textdata.name, textdata.path, textdata.content])
            category_data.append(category_var.to_val(textdata.category))
        if len(text_categories) > 1:
            category_data = np.array(category_data)
        else:
            category_var = []
            category_data = np.empty((len(data), 0))
        domain = Domain([], category_var,
                        [StringVariable.make(name) for name in names])
        domain["name"].attributes["title"] = True
        data = np.array(data, dtype=object)
        if len(data):
            corpus = Corpus(domain,
                            Y=category_data,
                            metas=data,
                            text_features=[domain.metas[2]])

        return corpus
Example #21
    def _guess_variable(self, field_name, field_metadata, inspect_table):
        type_code = field_metadata[0]

        FLOATISH_TYPES = (700, 701, 1700)  # real, float8, numeric
        INT_TYPES = (20, 21, 23)  # bigint, int, smallint
        CHAR_TYPES = (
            25,
            1042,
            1043,
        )  # text, char, varchar
        BOOLEAN_TYPES = (16, )  # bool
        DATE_TYPES = (
            1082,
            1114,
            1184,
        )  # date, timestamp, timestamptz
        # time, timestamp, timestamptz, timetz
        TIME_TYPES = (
            1083,
            1114,
            1184,
            1266,
        )

        if type_code in FLOATISH_TYPES:
            return ContinuousVariable.make(field_name)

        if type_code in TIME_TYPES + DATE_TYPES:
            tv = TimeVariable.make(field_name)
            tv.have_date |= type_code in DATE_TYPES
            tv.have_time |= type_code in TIME_TYPES
            return tv

        if type_code in INT_TYPES:  # bigint, int, smallint
            if inspect_table:
                values = self.get_distinct_values(field_name, inspect_table)
                if values:
                    return DiscreteVariable.make(field_name, values)
            return ContinuousVariable.make(field_name)

        if type_code in BOOLEAN_TYPES:
            return DiscreteVariable.make(field_name, ['false', 'true'])

        if type_code in CHAR_TYPES:
            if inspect_table:
                values = self.get_distinct_values(field_name, inspect_table)
                # remove trailing spaces
                values = [v.rstrip() for v in values]
                if values:
                    return DiscreteVariable.make(field_name, values)

        return StringVariable.make(field_name)
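The overlap between DATE_TYPES and TIME_TYPES above (OIDs 1114 and 1184: timestamp and timestamptz) is deliberate, so that a timestamp column sets both flags on the resulting TimeVariable. A pure-Python sketch:

DATE_TYPES = (1082, 1114, 1184)
TIME_TYPES = (1083, 1114, 1184, 1266)

type_code = 1114                # timestamp
print(type_code in DATE_TYPES)  # True -> have_date
print(type_code in TIME_TYPES)  # True -> have_time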
Example #22
def _generate_corpus(records, required_text_fields):
    """
    Generates a corpus from the input NYT records.
    :param records: The input NYT records.
    :type records: list
    :param required_text_fields: A list of the NYT text fields to include.
    :type required_text_fields: list
    :return: :class:`orangecontrib.text.corpus.Corpus`
    """
    metas, class_values = _parse_record_json(records, required_text_fields)
    documents = []
    for doc in metas:
        documents.append(" ".join([d for d in doc if d is not None]).strip())

    # Create domain.
    meta_vars = [StringVariable.make(field) for field in required_text_fields]
    meta_vars += [StringVariable.make("pub_date"), StringVariable.make("country")]
    class_vars = [DiscreteVariable("section_name", values=list(set(class_values)))]
    domain = Domain([], class_vars=class_vars, metas=meta_vars)

    Y = np.array([class_vars[0].to_val(cv) for cv in class_values])[:, None]

    return Corpus(documents, None, Y, metas, domain)
Example #23
    def commit(self):
        table = None
        if self.data is not None:
            if self.correlations_type == 2 and self.target_variable and \
                    self.target_variable.is_continuous:
                pearson = ContinuousVariable.make("Pearson")
                spearman = ContinuousVariable.make("Spearman")
                row_name = StringVariable.make("Variable")

                domain = Orange.data.Domain([pearson, spearman],
                                            metas=[row_name])
                table = Orange.data.Table(domain, self.target_correlations)
                for inst, name in zip(table, self.var_names):
                    inst[row_name] = name
        self.send("Correlations", table)
Example #24
def create_data_from_states(example_states, example_traces):
    data_desc = example_states[0].domain
    attributes = data_desc.get_attributes()
    domain = Domain(
        attributes,
        ContinuousVariable.make("complexity"),
        metas=[StringVariable.make("id"),
               ContinuousVariable("trace")])
    data = Table.from_domain(domain)
    for si, s in enumerate(example_states):
        e = Instance(domain)
        for f in attributes:
            e[f] = s.get_attribute(f)
        e["id"] = s.get_id()
        e["trace"] = example_traces[si]
        data.append(e)
    return data
Example #25
def transpose_table(table):
    """
    Transpose the rows and columns of the table.

    Args:
        table: Data in :obj:`Orange.data.Table`

    Returns:
         Transposed :obj:`Orange.data.Table`. (Genes as columns)
    """
    attrs = table.domain.attributes
    attr = [ContinuousVariable.make(ex['Gene'].value) for ex in table]
    #  Set metas
    new_metas = [StringVariable.make(name) if name != 'Time' else TimeVariable.make(name)
                 for name in sorted(table.domain.variables[0].attributes.keys())]
    domain = Domain(attr, metas=new_metas)
    meta_values = [[exp.attributes[var.name] for var in domain.metas] for exp in attrs]

    return Table(domain, table.X.transpose(), metas=meta_values)
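Comparing the name with != (rather than identity) matters here: is tests object identity, which is implementation-defined for equal strings. A stdlib-only sketch:

a = "".join(["Ti", "me"])  # built at runtime, so not interned
print(a == "Time")  # True: equality compares contents
print(a is "Time")  # False on CPython; 3.8+ even warns about 'is' with a literal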
Example #26
    def test_domaineditor_makes_variables(self):
        # Variables created with domain editor should be interchangeable
        # with variables read from file.

        dat = """V0\tV1\nc\td\n\n1.0\t2"""
        v0 = StringVariable.make("V0")
        v1 = ContinuousVariable.make("V1")

        with named_file(dat, suffix=".tab") as filename:
            self.open_dataset(filename)

            model = self.widget.domain_editor.model()
            model.setData(model.createIndex(0, 1), "text", Qt.EditRole)
            model.setData(model.createIndex(1, 1), "numeric", Qt.EditRole)
            self.widget.apply_button.click()

            data = self.get_output(self.widget.Outputs.data)
            self.assertEqual(data.domain["V0"], v0)
            self.assertEqual(data.domain["V1"], v1)
Example #28
 def _create_corpus(self) -> Corpus:
     corpus = None
     names = ["name", "path", "content"] if not self.is_conllu else [
         "name", "path", "utterance", "content"
     ]
     data = []
     category_data = []
     text_categories = list(set(t.category for t in self._text_data))
     values = list(set(text_categories))
     category_var = DiscreteVariable.make("category", values=values)
     for textdata in self._text_data:
         datum = [
             # some characters are written as decomposed (č is char c
             # and separate char for caron), with NFC normalization we
             # normalize them to be written as precomposed (č is one
             # unicode char - 0x10D)
             # https://docs.python.org/3/library/unicodedata.html#unicodedata.normalize
             normalize('NFC', textdata.name),
             normalize('NFC', textdata.path),
             normalize('NFC', textdata.content)
         ]
         if self.is_conllu:
             datum.insert(2, normalize('NFC', textdata.doc_id))
         data.append(datum)
         category_data.append(category_var.to_val(textdata.category))
     if len(text_categories) > 1:
         category_data = np.array(category_data)
     else:
         category_var = []
         category_data = np.empty((len(data), 0))
     domain = Domain([], category_var,
                     [StringVariable.make(name) for name in names])
     domain["name"].attributes["title"] = True
     data = np.array(data, dtype=object)
     if len(data):
         corpus = Corpus.from_numpy(domain,
                                    X=np.empty((len(category_data), 0)),
                                    Y=category_data,
                                    metas=data,
                                    text_features=[domain.metas[-1]])
     return corpus
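A stdlib-only check of the NFC note in the comments above: a decomposed 'c' plus combining caron collapses into the single precomposed code point U+010D:

from unicodedata import normalize

decomposed = 'c\u030c'                           # 'c' + combining caron
print(normalize('NFC', decomposed))              # 'č'
print(normalize('NFC', decomposed) == '\u010d')  # True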
Example #29
def _corpus_from_records(records, includes_metadata):
    """Receives PubMed records and transforms them into a corpus.

    Args:
        records (list): A list of PubMed entries.
        includes_metadata (list): A list of text fields to include.

    Returns:
        corpus: The output Corpus.
    """
    meta_vars = []
    time_var = None
    for field_name, _ in includes_metadata:
        if field_name == PUBMED_FIELD_DATE:
            time_var = TimeVariable(field_name)
            meta_vars.append(time_var)
        else:
            meta_vars.append(StringVariable.make(field_name))
            if field_name == PUBMED_FIELD_TITLE:
                meta_vars[-1].attributes["title"] = True

    meta_values, class_values = _records_to_corpus_entries(
        records,
        includes_metadata=includes_metadata,
        time_var=time_var,
    )

    class_vars = [
        DiscreteVariable('section',
                         values=list(map(str, set(filter(None,
                                                         class_values)))))
    ]
    domain = Domain([], class_vars=class_vars, metas=meta_vars)

    Y = np.array([class_vars[0].to_val(cv) for cv in class_values])[:, None]

    return Corpus.from_numpy(domain=domain,
                             X=np.empty((len(Y), 0)),
                             Y=Y,
                             metas=meta_values)
Example #30
    def _guess_variable(self, field_name, field_metadata, inspect_table):
        type_code = field_metadata[0]

        FLOATISH_TYPES = (700, 701, 1700)  # real, float8, numeric
        INT_TYPES = (20, 21, 23)  # bigint, int, smallint
        CHAR_TYPES = (25, 1042, 1043,)  # text, char, varchar
        BOOLEAN_TYPES = (16,)  # bool
        DATE_TYPES = (1082, 1114, 1184, )  # date, timestamp, timestamptz
        # time, timestamp, timestamptz, timetz
        TIME_TYPES = (1083, 1114, 1184, 1266,)

        if type_code in FLOATISH_TYPES:
            return ContinuousVariable.make(field_name)

        if type_code in TIME_TYPES + DATE_TYPES:
            tv = TimeVariable.make(field_name)
            tv.have_date |= type_code in DATE_TYPES
            tv.have_time |= type_code in TIME_TYPES
            return tv

        if type_code in INT_TYPES:  # bigint, int, smallint
            if inspect_table:
                values = self.get_distinct_values(field_name, inspect_table)
                if values:
                    return DiscreteVariable.make(field_name, values)
            return ContinuousVariable.make(field_name)

        if type_code in BOOLEAN_TYPES:
            return DiscreteVariable.make(field_name, ['false', 'true'])

        if type_code in CHAR_TYPES:
            if inspect_table:
                values = self.get_distinct_values(field_name, inspect_table)
                # remove trailing spaces
                values = [v.rstrip() for v in values]
                if values:
                    return DiscreteVariable.make(field_name, values)

        return StringVariable.make(field_name)
Example #31
    def etc_to_table(self, etc_json, time_var=False, callback=lambda: None):
        """ Converts data from Json to :obj:`Orange.data.table`

        Args:
            etc_json (dict): Data in json like format
            time_var (bool): Create column of time points. Default is set to False.
        Returns:
            :obj:`Orange.data.Table`
        """
        cbc = CallBack(2, callback, callbacks=30)

        variables = []
        time_point = 1
        for time in etc_json['etc']['timePoints']:
            var = ContinuousVariable('TP ' + str(time_point))
            var.attributes['Time'] = str(time)
            variables.append(var)
            time_point += 1

        meta_attr = StringVariable.make('Gene')
        domain = Domain(variables, metas=[meta_attr])
        cbc()

        table = []
        for row in etc_json['etc']['genes']:
            gene_expression = [exp for exp in etc_json['etc']['genes'][row]]
            gene_expression.append(row)
            table.append(gene_expression)

        orange_table = Table(domain, table)

        if time_var:
            orange_table = transpose_table(orange_table)
            cbc()

        cbc.end()
        return orange_table
Example #32
def _corpus_from_records(records, includes_metadata):
    """Receives PubMed records and transforms them into a corpus.

    Args:
        records (list): A list of PubMed entries.
        includes_metadata (list): A list of text fields to include.

    Returns:
        corpus: The output Corpus.
    """
    meta_vars = []
    time_var = None
    for field_name, _ in includes_metadata:
        if field_name == PUBMED_FIELD_DATE:
            time_var = TimeVariable(field_name)
            meta_vars.append(time_var)
        else:
            meta_vars.append(StringVariable.make(field_name))
            if field_name == PUBMED_FIELD_TITLE:
                meta_vars[-1].attributes["title"] = True

    meta_values, class_values = _records_to_corpus_entries(
        records,
        includes_metadata=includes_metadata,
        time_var=time_var,
    )

    class_vars = [
        DiscreteVariable('section',
                         values=list(map(str, set(filter(None, class_values)))))
    ]
    domain = Domain([], class_vars=class_vars, metas=meta_vars)

    Y = np.array([class_vars[0].to_val(cv) for cv in class_values])[:, None]

    return Corpus(domain=domain, Y=Y, metas=meta_values)
Example #33
    def read(self):
        who = matlab.whosmat(self.filename)
        if not who:
            raise IOError("Couldn't load matlab file " + self.filename)
        else:
            ml = matlab.loadmat(self.filename, chars_as_strings=True)

            ml = {a: b for a, b in ml.items() if isinstance(b, np.ndarray)}

            # X is the biggest numeric array
            numarrays = []
            for name, con in ml.items():
                if issubclass(con.dtype.type, numbers.Number):
                    numarrays.append((name, reduce(lambda x, y: x*y, con.shape, 1)))
            X = None
            if numarrays:
                nameX = max(numarrays, key=lambda x: x[1])[0]
                X = ml.pop(nameX)

            # find an array with compatible shapes
            attributes = []
            if X is not None:
                nameattributes = None
                for name, con in ml.items():
                    if con.shape in [(X.shape[1],), (1, X.shape[1])]:
                        nameattributes = name
                        break
                attributenames = ml.pop(nameattributes).ravel() if nameattributes else range(X.shape[1])
                attributenames = [str(a).strip() for a in attributenames]  # strip because of numpy char array
                attributes = [ContinuousVariable.make(a) for a in attributenames]

            metas = []
            metaattributes = []

            sizemetas = None
            if X is None:
                counts = defaultdict(list)
                for name, con in ml.items():
                    counts[len(con)].append(name)
                if counts:
                    sizemetas = max(counts.keys(), key=lambda x: len(counts[x]))
            else:
                sizemetas = len(X)
            if sizemetas:
                for name, con in ml.items():
                    if len(con) == sizemetas:
                        metas.append(name)

            metadata = []
            for m in sorted(metas):
                f = ml[m]
                metaattributes.append(StringVariable.make(m))
                f.resize(sizemetas, 1)
                metadata.append(f)

            metadata = np.hstack(tuple(metadata))

            domain = Domain(attributes, metas=metaattributes)
            if X is None:
                X = np.zeros((sizemetas, 0))
            return Orange.data.Table.from_numpy(domain, X, Y=None, metas=metadata)
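The reduce(lambda x, y: x*y, con.shape, 1) expression above is just the array's element count, which numpy exposes directly. A tiny sketch:

import numpy as np

con = np.zeros((3, 4))
print(con.size)  # 12, same as reduce(lambda x, y: x * y, con.shape, 1)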
Example #34
    def commit(self):
        path = self._current_path
        if not path:
            return

        transpose = not self._cells_in_rows
        row_annot = self.row_annotations_combo.currentData(Qt.UserRole)
        col_annot = self.col_annotations_combo.currentData(Qt.UserRole)

        if self._row_annotations_enabled and \
                isinstance(row_annot, RecentPath) and \
                os.path.exists(row_annot.abspath):
            row_annot = row_annot.abspath  # type: str
        else:
            row_annot = None

        if self._col_annotations_enabled and \
                isinstance(col_annot, RecentPath) and \
                os.path.exists(col_annot.abspath):
            col_annot = col_annot.abspath  # type: str
        else:
            col_annot = None

        meta_parts = []  # type: List[pd.DataFrame]
        attrs = []  # type: List[ContinuousVariable]
        metas = []  # type: List[StringVariable]

        rstate = np.random.RandomState(0x667)

        skip_row = skip_col = None
        if self._sample_cols_enabled:
            p = self._sample_cols_p
            if p < 100:

                def skip_col(i, p=p):
                    return i > 3 and rstate.uniform(0, 100) > p

        if self._sample_rows_enabled:
            p = self._sample_rows_p
            if p < 100:

                def skip_row(i, p=p):
                    return i > 3 and rstate.uniform(0, 100) > p

        header_rows = self._header_rows_count
        header_rows_indices = []
        if header_rows == 0:
            header_rows = None
        elif header_rows == 1:
            header_rows = 0
            header_rows_indices = [0]
        else:
            header_rows = list(range(header_rows))
            header_rows_indices = header_rows

        header_cols = self._header_cols_count
        header_cols_indices = []
        if header_cols == 0:
            header_cols = None
        elif header_cols == 1:
            header_cols = 0
            header_cols_indices = [0]
        else:
            header_cols = list(range(header_cols))
            header_cols_indices = header_cols

        if transpose:
            _skip_row, _skip_col = skip_col, skip_row
        else:
            _skip_col, _skip_row = skip_col, skip_row

        _userows = _usecols = None
        userows_mask = usecols_mask = None

        if _skip_col is not None:
            ncols = pd.read_csv(path,
                                sep=separator_from_filename(path),
                                index_col=None,
                                nrows=1).shape[1]
            usecols_mask = np.array(
                [not _skip_col(i) or i in header_cols_indices
                 for i in range(ncols)],
                dtype=bool)
            _usecols = np.flatnonzero(usecols_mask)

        if _skip_row is not None:
            userows_mask = []  # record the used rows

            def _skip_row(i, test=_skip_row):
                r = test(i)
                userows_mask.append(r)
                return r

        meta_df_index = None
        row_annot_header = 0
        row_annot_columns = None
        col_annot_header = 0
        col_annot_columns = None

        if os.path.splitext(path)[1] == ".mtx":
            # 10x cellranger output
            X = scipy.io.mmread(path)
            assert isinstance(X, scipy.sparse.coo_matrix)
            if transpose:
                X = X.T
            if _skip_row is not None:
                userows_mask = np.array(
                    [not _skip_row(i) for i in range(X.shape[0])])
                X = X.tocsr()[np.flatnonzero(userows_mask)]
            if _skip_col is not None:
                usecols_mask = np.array(
                    [not _skip_col(i) for i in range(X.shape[1])])
                X = X.tocsc()[:, np.flatnonzero(usecols_mask)]
            X = X.todense(order="F")
            if userows_mask is not None:
                meta_df = pd.DataFrame({}, index=np.flatnonzero(userows_mask))
            else:
                meta_df = pd.DataFrame({}, index=pd.RangeIndex(X.shape[0]))

            meta_df_index = meta_df.index

            row_annot_header = None
            row_annot_columns = ["Barcodes"]
            col_annot_header = None
            col_annot_columns = ["Id", "Gene"]
            leading_cols = leading_rows = 0
        else:
            df = pd.read_csv(path,
                             sep=separator_from_filename(path),
                             index_col=header_cols,
                             header=header_rows,
                             skiprows=_skip_row,
                             usecols=_usecols)

            if _skip_row is not None:
                userows_mask = np.array(userows_mask, dtype=bool)

            if transpose:
                df = df.transpose()
                userows_mask, usecols_mask = usecols_mask, userows_mask
                leading_rows = len(header_cols_indices)
                leading_cols = len(header_rows_indices)
            else:
                leading_rows = len(header_rows_indices)
                leading_cols = len(header_cols_indices)

            X = df.values
            attrs = [ContinuousVariable.make(str(g)) for g in df.columns]

            meta_df = df.iloc[:, :0]  # Take the index # type: pd.DataFrame
            meta_df_index = df.index
            meta_parts = (meta_df, )

        self.Error.row_annotation_mismatch.clear()
        self.Error.col_annotation_mismatch.clear()

        if row_annot is not None:
            row_annot_df = pd.read_csv(row_annot,
                                       sep=separator_from_filename(row_annot),
                                       header=row_annot_header,
                                       names=row_annot_columns,
                                       index_col=None)
            if userows_mask is not None:
                # NOTE: we account for column header/ row index
                expected = len(userows_mask) - leading_rows
            else:
                expected = X.shape[0]
            if len(row_annot_df) != expected:
                self.Error.row_annotation_mismatch(expected, len(row_annot_df))
                row_annot_df = None

            if row_annot_df is not None and userows_mask is not None:
                # use the same sample indices
                indices = np.flatnonzero(userows_mask[leading_rows:])
                row_annot_df = row_annot_df.iloc[indices]
                # if path.endswith(".count") and row_annot.endswith('.meta'):
                #     assert np.all(row_annot_df.iloc[:, 0] == df.index)

            if row_annot_df is not None and meta_df_index is not None:
                # Try to match the leading columns with the meta_df_index.
                # If found then drop the columns (or index if the level does
                # not have a name but the annotation col does)
                drop_cols = []
                drop_index_level = []
                for i in range(meta_df_index.nlevels):
                    meta_df_level = meta_df_index.get_level_values(i)
                    if np.all(row_annot_df.iloc[:, i] == meta_df_level):
                        if meta_df_level.name is None:
                            drop_index_level.append(i)
                        elif meta_df_level.name == row_annot_df.columns[i].name:
                            drop_cols.append(i)

                if drop_cols:
                    row_annot_df = row_annot_df.drop(columns=drop_cols)

                if drop_index_level:
                    for i in reversed(drop_index_level):
                        if isinstance(meta_df.index, pd.MultiIndex):
                            meta_df_index = meta_df_index.droplevel(i)
                        else:
                            assert i == 0
                            meta_df_index = pd.RangeIndex(meta_df_index.size)
                    meta_df = pd.DataFrame({}, index=meta_df_index)

            if row_annot_df is not None:
                meta_parts = (meta_df, row_annot_df)

        if col_annot is not None:
            col_annot_df = pd.read_csv(col_annot,
                                       sep=separator_from_filename(col_annot),
                                       header=col_annot_header,
                                       names=col_annot_columns,
                                       index_col=None)
            if usecols_mask is not None:
                expected = len(usecols_mask) - leading_cols
            else:
                expected = X.shape[1]
            if len(col_annot_df) != expected:
                self.Error.col_annotation_mismatch(expected, len(col_annot_df))
                col_annot_df = None
            if col_annot_df is not None and usecols_mask is not None:
                indices = np.flatnonzero(usecols_mask[leading_cols:])
                col_annot_df = col_annot_df.iloc[indices]

            if col_annot_df is not None:
                assert len(col_annot_df) == X.shape[1]
                if not attrs and X.shape[1]:  # No column names yet
                    attrs = [
                        ContinuousVariable.make(str(v))
                        for v in col_annot_df.iloc[:, 0]
                    ]
                names = [str(c) for c in col_annot_df.columns]
                for var, values in zip(attrs, col_annot_df.values):
                    var.attributes.update(
                        {n: v
                         for n, v in zip(names, values)})

        if meta_parts:
            meta_parts = [
                df_.reset_index() if not df_.index.is_integer() else df_
                for df_ in meta_parts
            ]
            metas = [
                StringVariable.make(name)
                for name in chain(*(_.columns for _ in meta_parts))
            ]
            M = np.hstack(tuple(df_.values for df_ in meta_parts))
        else:
            metas = None
            M = None

        if not attrs and X.shape[1]:
            attrs = Orange.data.Domain.from_numpy(X).attributes

        domain = Orange.data.Domain(attrs, metas=metas)
        d = Orange.data.Table.from_numpy(domain, X, None, M)
        self.Outputs.data.send(d)

        self.set_modified(False)
Example #35
from orangecontrib.text.corpus import Corpus
from nltk import word_tokenize
import re
import sys
import datetime
from Orange.data import Table, Domain
from Orange.data import StringVariable

N = 20
EMPTYLIST = []
EMPTYSTRING = ""
FIELDNAMEDATE = "date"
FIELDNAMETEXT = "text"
FIELDNAMECOORDINATES = "coordinates"
COLUMNDOMAIN = StringVariable.make(FIELDNAMECOORDINATES)


def makeRefId(msgId, index):
    return (" ".join([str(msgId + 1), str(index)]))


def getDateFromRefId(refId):
    return (" ".join(refId.split()[0:2]))


def makePhrase(wordList, index):
    return (" ".join(wordList[index:index + N]))


def addPhraseToRefs(phraseRefs, phrase, msgId, index):
    phraseRefs[phrase] = makeRefId(msgId, index)
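A quick usage check of the helpers above, run in the same module:

print(makeRefId(41, 7))                    # '42 7'
print(getDateFromRefId(makeRefId(41, 7)))  # '42 7'
print(makePhrase("a b c d".split(), 1))    # 'b c d' (an N-word window, N = 20)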
Example #36
    def read(self):
        import opusFC

        if self.sheet:
            db = self.sheet
        else:
            db = self.sheets[0]

        db = tuple(db.split(" "))
        dim = db[1]

        try:
            data = opusFC.getOpusData(self.filename, db)
        except Exception:
            raise IOError("Couldn't load spectrum from " + self.filename)

        attrs, clses, metas = [], [], []

        attrs = [
            ContinuousVariable.make(repr(data.x[i]))
            for i in range(data.x.shape[0])
        ]

        y_data = None
        meta_data = None

        if type(data) == opusFC.MultiRegionDataReturn:
            y_data = []
            meta_data = []
            metas.extend([
                ContinuousVariable.make('map_x'),
                ContinuousVariable.make('map_y'),
                StringVariable.make('map_region'),
                TimeVariable.make('start_time')
            ])
            for region in data.regions:
                y_data.append(region.spectra)
                mapX = region.mapX
                mapY = region.mapY
                map_region = np.full_like(mapX, region.title, dtype=object)
                start_time = region.start_time
                meta_region = np.column_stack(
                    (mapX, mapY, map_region, start_time))
                meta_data.append(meta_region.astype(object))
            y_data = np.vstack(y_data)
            meta_data = np.vstack(meta_data)

        elif type(data) == opusFC.MultiRegionTRCDataReturn:
            y_data = []
            meta_data = []
            metas.extend([
                ContinuousVariable.make('map_x'),
                ContinuousVariable.make('map_y'),
                StringVariable.make('map_region')
            ])
            attrs = [
                ContinuousVariable.make(repr(data.labels[i]))
                for i in range(len(data.labels))
            ]
            for region in data.regions:
                y_data.append(region.spectra)
                mapX = region.mapX
                mapY = region.mapY
                map_region = np.full_like(mapX, region.title, dtype=object)
                meta_region = np.column_stack((mapX, mapY, map_region))
                meta_data.append(meta_region.astype(object))
            y_data = np.vstack(y_data)
            meta_data = np.vstack(meta_data)

        elif type(data) == opusFC.ImageDataReturn:
            metas.extend([
                ContinuousVariable.make('map_x'),
                ContinuousVariable.make('map_y')
            ])

            data_3D = data.spectra

            for i in np.ndindex(data_3D.shape[:1]):
                map_y = np.full_like(data.mapX, data.mapY[i])
                coord = np.column_stack((data.mapX, map_y))
                if y_data is None:
                    y_data = data_3D[i]
                    meta_data = coord.astype(object)
                else:
                    y_data = np.vstack((y_data, data_3D[i]))
                    meta_data = np.vstack((meta_data, coord))

        elif type(data) == opusFC.ImageTRCDataReturn:
            metas.extend([
                ContinuousVariable.make('map_x'),
                ContinuousVariable.make('map_y')
            ])

            attrs = [
                ContinuousVariable.make(repr(data.labels[i]))
                for i in range(len(data.labels))
            ]
            data_3D = data.traces

            for i in np.ndindex(data_3D.shape[:1]):
                map_y = np.full_like(data.mapX, data.mapY[i])
                coord = np.column_stack((data.mapX, map_y))
                if y_data is None:
                    y_data = data_3D[i]
                    meta_data = coord.astype(object)
                else:
                    y_data = np.vstack((y_data, data_3D[i]))
                    meta_data = np.vstack((meta_data, coord))

        elif type(data) == opusFC.TimeResolvedTRCDataReturn:
            y_data = data.traces

        elif type(data) == opusFC.TimeResolvedDataReturn:
            metas.extend([ContinuousVariable.make('z')])

            y_data = data.spectra
            meta_data = data.z

        elif type(data) == opusFC.SingleDataReturn:
            y_data = data.y[None, :]

        else:
            raise ValueError(
                "Empty or unsupported opusFC DataReturn object: " +
                str(type(data)))

        import_params = ['SRT', 'SNM']

        for param_key in import_params:
            try:
                param = data.parameters[param_key]
            except KeyError:
                pass  # TODO should notify user?
            else:
                try:
                    param_name = opusFC.paramDict[param_key]
                except KeyError:
                    param_name = param_key
                if param_key == 'SRT':
                    var = TimeVariable.make(param_name)
                elif type(param) is float:
                    var = ContinuousVariable.make(param_name)
                elif type(param) is str:
                    var = StringVariable.make(param_name)
                else:
                    raise ValueError(
                        "Unhandled parameter type: " + str(type(param)))
                metas.extend([var])
                params = np.full((y_data.shape[0], ), param,
                                 np.array(param).dtype)
                if meta_data is not None:
                    # NB dtype default will be np.array(fill_value).dtype in future
                    meta_data = np.column_stack(
                        (meta_data, params.astype(object)))
                else:
                    meta_data = params

        domain = Orange.data.Domain(attrs, clses, metas)

        meta_data = np.atleast_2d(meta_data)

        table = Orange.data.Table.from_numpy(domain,
                                             y_data.astype(float, order='C'),
                                             metas=meta_data)

        return table
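The np.full_like(mapX, region.title, dtype=object) calls above broadcast one region label across a column the same length as the coordinate arrays. A pure-numpy sketch with an illustrative label:

import numpy as np

mapX = np.array([1.0, 2.0, 3.0])
map_region = np.full_like(mapX, "region A", dtype=object)
print(map_region)  # ['region A' 'region A' 'region A']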
Example #37
    def read(self):
        who = matlab.whosmat(self.filename)
        if not who:
            raise IOError("Couldn't load matlab file " + self.filename)
        else:
            ml = matlab.loadmat(self.filename, chars_as_strings=True)

            ml = {a: b for a, b in ml.items() if isinstance(b, np.ndarray)}

            # X is the biggest numeric array
            numarrays = []
            for name, con in ml.items():
                if issubclass(con.dtype.type, numbers.Number):
                    numarrays.append(
                        (name, reduce(lambda x, y: x * y, con.shape, 1)))
            X = None
            if numarrays:
                nameX = max(numarrays, key=lambda x: x[1])[0]
                X = ml.pop(nameX)

            # find an array with compatible shapes
            attributes = []
            if X is not None:
                nameattributes = None
                for name, con in ml.items():
                    if con.shape in [(X.shape[1], ), (1, X.shape[1])]:
                        nameattributes = name
                        break
                attributenames = (ml.pop(nameattributes).ravel()
                                  if nameattributes else range(X.shape[1]))
                # strip because of numpy char array
                attributenames = [str(a).strip() for a in attributenames]
                attributes = [
                    ContinuousVariable.make(a) for a in attributenames
                ]

            metas = []
            metaattributes = []

            sizemetas = None
            if X is None:
                counts = defaultdict(list)
                for name, con in ml.items():
                    counts[len(con)].append(name)
                if counts:
                    sizemetas = max(counts.keys(),
                                    key=lambda x: len(counts[x]))
            else:
                sizemetas = len(X)
            if sizemetas:
                for name, con in ml.items():
                    if len(con) == sizemetas:
                        metas.append(name)

            metadata = []
            for m in sorted(metas):
                f = ml[m]
                metaattributes.append(StringVariable.make(m))
                f.resize(sizemetas, 1)
                metadata.append(f)

            metadata = np.hstack(tuple(metadata)) if metadata else None

            domain = Domain(attributes, metas=metaattributes)
            if X is None:
                X = np.zeros((sizemetas, 0))
            return Orange.data.Table.from_numpy(domain,
                                                X,
                                                Y=None,
                                                metas=metadata)
Example #38
    def read(self):
        who = matlab.whosmat(self.filename)
        if not who:
            raise IOError("Couldn't load matlab file " + self.filename)
        else:
            ml = matlab.loadmat(self.filename, chars_as_strings=True)
            ml = {a: b for a, b in ml.items() if isinstance(b, np.ndarray)}

            def num_elements(array):
                return reduce(lambda x, y: x * y, array.shape, 1)

            def find_biggest(arrays):
                sizes = []
                for n, c in arrays.items():
                    sizes.append((num_elements(c), n))
                return max(sizes)[1]

            def is_string_array(array):
                return issubclass(array.dtype.type, np.str_)

            def is_number_array(array):
                return issubclass(array.dtype.type, numbers.Number)

            numeric = {n: a for n, a in ml.items() if is_number_array(a)}

            # X is the biggest numeric array
            X = ml.pop(find_biggest(numeric)) if numeric else None

            # find an array with compatible shapes
            attributes = []
            if X is not None:
                name_array = None
                for name in sorted(ml):
                    con = ml[name]
                    if con.shape in [(X.shape[1],), (1, X.shape[1])]:
                        name_array = name
                        break
                names = ml.pop(name_array).ravel() if name_array else range(X.shape[1])
                names = [str(a).rstrip() for a in names]  # remove matlab char padding
                attributes = [ContinuousVariable.make(a) for a in names]

            meta_names = []
            metas = []

            meta_size = None
            if X is None:
                counts = defaultdict(list)
                for name, con in ml.items():
                    counts[len(con)].append(name)
                if counts:
                    meta_size = max(counts.keys(), key=lambda x: len(counts[x]))
            else:
                meta_size = len(X)
            if meta_size:
                for name, con in ml.items():
                    if len(con) == meta_size:
                        meta_names.append(name)

            meta_data = []
            for m in sorted(meta_names):
                f = ml[m]
                if is_string_array(f) and len(f.shape) == 1:  # 1D string arrays
                    metas.append(StringVariable.make(m))
                    f = np.array([a.rstrip() for a in f])  # remove matlab char padding
                    f.resize(meta_size, 1)
                    meta_data.append(f)
                elif is_number_array(f) and len(f.shape) == 2:
                    if f.shape[1] == 1:
                        names = [m]
                    else:
                        names = [m + "_" + str(i+1) for i in range(f.shape[1])]
                    for n in names:
                        metas.append(ContinuousVariable.make(n))
                    meta_data.append(f)

            meta_data = np.hstack(tuple(meta_data)) if meta_data else None

            domain = Domain(attributes, metas=metas)
            if X is None:
                X = np.zeros((meta_size, 0))
            return Orange.data.Table.from_numpy(domain, X, Y=None, metas=meta_data)
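Example #38 widens each 2-D numeric meta array into suffixed ContinuousVariable columns. A short sketch of just that naming rule, with a hypothetical array name:

import numpy as np

m = "position"                                 # hypothetical meta array name
f = np.arange(6, dtype=float).reshape(3, 2)    # 3 rows, 2 meta columns

names = ([m] if f.shape[1] == 1
         else [m + "_" + str(i + 1) for i in range(f.shape[1])])
print(names)  # ['position_1', 'position_2']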
Example #39
    def read(self):
        try:
            import opusFC
        except ImportError:
            raise RuntimeError(self._OPUS_WARNING)

        if self.sheet:
            db = self.sheet
        else:
            db = self.sheets[0]

        db = tuple(db.split(" "))
        dim = db[1]

        try:
            data = opusFC.getOpusData(self.filename, db)
        except Exception:
            raise IOError("Couldn't load spectrum from " + self.filename)

        attrs, clses, metas = [], [], []

        attrs = [ContinuousVariable.make(repr(data.x[i]))
                 for i in range(data.x.shape[0])]

        y_data = None
        meta_data = None

        if type(data) == opusFC.MultiRegionDataReturn:
            y_data = []
            meta_data = []
            metas.extend([ContinuousVariable.make('map_x'),
                          ContinuousVariable.make('map_y'),
                          StringVariable.make('map_region'),
                          TimeVariable.make('start_time')])
            for region in data.regions:
                y_data.append(region.spectra)
                mapX = region.mapX
                mapY = region.mapY
                map_region = np.full_like(mapX, region.title, dtype=object)
                start_time = region.start_time
                meta_region = np.column_stack((mapX, mapY,
                                               map_region, start_time))
                meta_data.append(meta_region.astype(object))
            y_data = np.vstack(y_data)
            meta_data = np.vstack(meta_data)

        elif type(data) == opusFC.MultiRegionTRCDataReturn:
            y_data = []
            meta_data = []
            metas.extend([ContinuousVariable.make('map_x'),
                          ContinuousVariable.make('map_y'),
                          StringVariable.make('map_region')])
            attrs = [ContinuousVariable.make(repr(data.labels[i]))
                     for i in range(len(data.labels))]
            for region in data.regions:
                y_data.append(region.spectra)
                mapX = region.mapX
                mapY = region.mapY
                map_region = np.full_like(mapX, region.title, dtype=object)
                meta_region = np.column_stack((mapX, mapY, map_region))
                meta_data.append(meta_region.astype(object))
            y_data = np.vstack(y_data)
            meta_data = np.vstack(meta_data)

        elif type(data) == opusFC.ImageDataReturn:
            metas.extend([ContinuousVariable.make('map_x'),
                          ContinuousVariable.make('map_y')])

            data_3D = data.spectra

            for i in np.ndindex(data_3D.shape[:1]):
                map_y = np.full_like(data.mapX, data.mapY[i])
                coord = np.column_stack((data.mapX, map_y))
                if y_data is None:
                    y_data = data_3D[i]
                    meta_data = coord.astype(object)
                else:
                    y_data = np.vstack((y_data, data_3D[i]))
                    meta_data = np.vstack((meta_data, coord))

        elif type(data) == opusFC.ImageTRCDataReturn:
            metas.extend([ContinuousVariable.make('map_x'),
                          ContinuousVariable.make('map_y')])

            attrs = [ContinuousVariable.make(repr(data.labels[i]))
                     for i in range(len(data.labels))]
            data_3D = data.traces

            for i in np.ndindex(data_3D.shape[:1]):
                map_y = np.full_like(data.mapX, data.mapY[i])
                coord = np.column_stack((data.mapX, map_y))
                if y_data is None:
                    y_data = data_3D[i]
                    meta_data = coord.astype(object)
                else:
                    y_data = np.vstack((y_data, data_3D[i]))
                    meta_data = np.vstack((meta_data, coord))

        elif type(data) == opusFC.TimeResolvedTRCDataReturn:
            y_data = data.traces

        elif type(data) == opusFC.TimeResolvedDataReturn:
            metas.extend([ContinuousVariable.make('z')])

            y_data = data.spectra
            meta_data = data.z

        elif type(data) == opusFC.SingleDataReturn:
            y_data = data.y[None, :]

        else:
            raise ValueError("Empty or unsupported opusFC DataReturn object: " + type(data))

        import_params = ['SRT', 'SNM']

        for param_key in import_params:
            try:
                param = data.parameters[param_key]
            except KeyError:
                pass  # TODO should notify user?
            else:
                try:
                    param_name = opusFC.paramDict[param_key]
                except KeyError:
                    param_name = param_key
                if param_key == 'SRT':
                    var = TimeVariable.make(param_name)
                elif type(param) is float:
                    var = ContinuousVariable.make(param_name)
                elif type(param) is str:
                    var = StringVariable.make(param_name)
                else:
                    raise ValueError("Found a parameter type to handle: "
                                     + repr(type(param)))
                metas.extend([var])
                params = np.full((y_data.shape[0],), param, np.array(param).dtype)
                if meta_data is not None:
                    # NB dtype default will be np.array(fill_value).dtype in future
                    meta_data = np.column_stack((meta_data, params.astype(object)))
                else:
                    meta_data = params

        domain = Orange.data.Domain(attrs, clses, metas)

        meta_data = np.atleast_2d(meta_data)

        table = Orange.data.Table.from_numpy(domain,
                                             y_data.astype(float, order='C'),
                                             metas=meta_data)

        return table
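The ImageDataReturn branch flattens a (map_y, map_x, wavenumber) cube into one spectrum per pixel plus coordinate metas. A numpy-only sketch of that accumulation with a made-up cube:

import numpy as np

# Hypothetical image cube: 3 map-y rows, 4 map-x columns, 5 wavenumbers.
data_3D = np.random.rand(3, 4, 5)
mapX = np.array([0.0, 1.0, 2.0, 3.0])
mapY = np.array([10.0, 20.0, 30.0])

y_data, meta_data = None, None
for i in np.ndindex(data_3D.shape[:1]):     # iterate over map-y rows
    map_y = np.full_like(mapX, mapY[i])     # repeat this row's y coordinate
    coord = np.column_stack((mapX, map_y))  # one (map_x, map_y) pair per pixel
    if y_data is None:
        y_data, meta_data = data_3D[i], coord.astype(object)
    else:
        y_data = np.vstack((y_data, data_3D[i]))
        meta_data = np.vstack((meta_data, coord))

print(y_data.shape, meta_data.shape)  # (12, 5) (12, 2)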
Example #41
    def read(self):
        import opusFC

        if self.sheet:
            db = self.sheet
        else:
            db = self.sheets[0]

        db = tuple(db.split(" "))
        dim = db[1]

        try:
            data = opusFC.getOpusData(self.filename, db)
        except Exception:
            raise IOError("Couldn't load spectrum from " + self.filename)

        attrs, clses, metas = [], [], []

        attrs = [ContinuousVariable.make(repr(data.x[i]))
                 for i in range(data.x.shape[0])]

        y_data = None
        meta_data = None

        if dim == '3D':
            metas.extend([ContinuousVariable.make('map_x'),
                          ContinuousVariable.make('map_y')])

            if db[0] == 'TRC':
                attrs = [ContinuousVariable.make(repr(data.labels[i]))
                         for i in range(len(data.labels))]
                data_3D = data.traces
            else:
                data_3D = data.spectra

            for i in np.ndindex(data_3D.shape[:1]):
                map_y = np.full_like(data.mapX, data.mapY[i])
                coord = np.column_stack((data.mapX, map_y))
                if y_data is None:
                    y_data = data_3D[i]
                    meta_data = coord.astype(object)
                else:
                    y_data = np.vstack((y_data, data_3D[i]))
                    meta_data = np.vstack((meta_data, coord))
        elif dim == '2D':
            y_data = data.y[None, :]

        try:
            stime = data.parameters['SRT']
        except KeyError:
            pass # TODO notify user?
        else:
            metas.extend([TimeVariable.make(opusFC.paramDict['SRT'])])
            if meta_data is not None:
                dates = np.full(meta_data[:, 0].shape, stime,
                                np.array(stime).dtype)
                meta_data = np.column_stack((meta_data, dates.astype(object)))
            else:
                meta_data = np.array([stime])[None, :]

        import_params = ['SNM']

        for param_key in import_params:
            try:
                param = data.parameters[param_key]
            except KeyError:
                pass # TODO should notify user?
            else:
                try:
                    param_name = opusFC.paramDict[param_key]
                except KeyError:
                    param_name = param_key
                if type(param) is float:
                    var = ContinuousVariable.make(param_name)
                elif type(param) is str:
                    var = StringVariable.make(param_name)
                else:
                    raise ValueError("Found a parameter type to handle: "
                                     + repr(type(param)))
                metas.extend([var])
                if meta_data is not None:
                    # NB dtype default will be np.array(fill_value).dtype in future
                    params = np.full(meta_data[:, 0].shape, param,
                                     np.array(param).dtype)
                    meta_data = np.column_stack((meta_data, params.astype(object)))
                else:
                    meta_data = np.array([param])[None, :]

        domain = Orange.data.Domain(attrs, clses, metas)

        table = Orange.data.Table.from_numpy(domain,
                                             y_data.astype(float, order='C'),
                                             metas=meta_data)

        return table
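All of the readers above finish the same way: wrap the attribute and meta variables in a Domain and hand the numeric block to Table.from_numpy. A minimal standalone sketch with tiny made-up data:

import numpy as np
import Orange.data
from Orange.data import ContinuousVariable, StringVariable

attrs = [ContinuousVariable.make(str(w)) for w in (1000.0, 1001.0)]
metas = [ContinuousVariable.make("map_x"), StringVariable.make("sample")]
domain = Orange.data.Domain(attrs, None, metas=metas)

y_data = np.array([[0.1, 0.2],
                   [0.3, 0.4]])
meta_data = np.array([[0.0, "a"],
                      [1.0, "b"]], dtype=object)  # metas must be object-typed

table = Orange.data.Table.from_numpy(domain, y_data.astype(float, order='C'),
                                     metas=meta_data)
print(len(table), table.domain)  # 2 rows; two attributes, two metas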