Exemplo n.º 1
0
def run(mpfile, **kwargs):
    """Load XAS/XMCD scans from a zip archive of CSV files into `mpfile`.

    The archive name is popped from ``mpfile.document['_hdata']['input_file']``
    and resolved relative to ``$HOME/work``. For every archive entry, the
    position is parsed from the file name, the composition is evaluated with
    the functions built from the ``composition_table`` section, and the
    ``Energy``/``XAS``/``XMCD`` columns are added as a data table.

    Args:
        mpfile: MPFile-like object exposing ``document``,
            ``add_hierarchical_data`` and ``add_data_table``.
        **kwargs: accepted for interface compatibility; not used here.

    Returns:
        The tuple ``('Please upload', zip_path)`` if the archive does not
        exist, otherwise None.
    """
    input_file = mpfile.document['_hdata'].pop('input_file')
    zip_path = os.path.join(os.environ['HOME'], 'work', input_file)
    if not os.path.exists(zip_path):
        return 'Please upload', zip_path

    # context manager guarantees the archive is closed, also on errors
    with ZipFile(zip_path, 'r') as zip_file:
        composition_table_dict = mpfile.document['_hdata']['composition_table']
        conc_funcs = get_concentration_functions(composition_table_dict)

        for info in zip_file.infolist():
            print(info.filename)
            d = RecursiveDict()

            # positions.x/y from filename, <scan-id>_<meas-element>_<X>_<Y>.csv
            # Only the last three fields are used; slicing keeps the unpacking
            # valid whether or not a scan-id prefix is present.
            stem = os.path.splitext(info.filename)[0]
            element, x, y = stem.rsplit('_', 3)[-3:]
            d['position'] = RecursiveDict(
                (k, clean_value(v, 'mm'))
                for k, v in zip(['x', 'y'], [x, y])
            )

            # composition
            d['composition'] = RecursiveDict(
                (el, clean_value(f(x, y), convert_to_percent=True))
                for el, f in conc_funcs.items()
            )

            # identifier
            identifier = get_composition_from_string(''.join([
                '{}{}'.format(el, int(round(Decimal(comp.split()[0]))))
                for el, comp in d['composition'].items()
            ]))

            # load csv file
            try:
                csv = zip_file.read(info.filename)
            except KeyError:
                print('ERROR: Did not find %s in zip file' % info.filename)
                # skip this entry instead of parsing stale/undefined content
                continue

            # read csv to pandas DataFrame and add to MPFile
            df = read_csv(csv)
            df = df[['Energy', 'XAS', 'XMCD']]

            # min and max
            d.rec_update(RecursiveDict(
                (col, RecursiveDict([
                    ('min', df[col].min()), ('max', df[col].max())
                ])) for col in ['XAS', 'XMCD']
            ))

            # add data to MPFile
            mpfile.add_hierarchical_data(
                nest_dict(d, ['data']), identifier=identifier)
            mpfile.add_data_table(identifier, df, name=element)
Exemplo n.º 2
0
def run(mpfile, **kwargs):
    """Load XAS/XMCD scans from a zip archive of CSV files into `mpfile`.

    The archive name is popped from ``mpfile.document["_hdata"]["input_file"]``
    and resolved relative to ``$HOME/work``. For every archive entry, the
    position is parsed from the file name, the composition is evaluated with
    the functions built from the ``composition_table`` section, and the
    ``Energy``/``XAS``/``XMCD`` columns are added as a data table.

    Args:
        mpfile: MPFile-like object exposing ``document``,
            ``add_hierarchical_data`` and ``add_data_table``.
        **kwargs: accepted for interface compatibility; not used here.

    Returns:
        The tuple ``("Please upload", zip_path)`` if the archive does not
        exist, otherwise None.
    """
    input_file = mpfile.document["_hdata"].pop("input_file")
    zip_path = os.path.join(os.environ["HOME"], "work", input_file)
    if not os.path.exists(zip_path):
        return "Please upload", zip_path

    # context manager guarantees the archive is closed, also on errors
    with ZipFile(zip_path, "r") as zip_file:
        composition_table_dict = mpfile.document["_hdata"]["composition_table"]
        conc_funcs = get_concentration_functions(composition_table_dict)

        for info in zip_file.infolist():
            print(info.filename)
            d = RecursiveDict()

            # positions.x/y from filename, <scan-id>_<meas-element>_<X>_<Y>.csv
            # Only the last three fields are used; slicing keeps the unpacking
            # valid whether or not a scan-id prefix is present.
            stem = os.path.splitext(info.filename)[0]
            element, x, y = stem.rsplit("_", 3)[-3:]
            d["position"] = RecursiveDict(
                (k, clean_value(v, "mm")) for k, v in zip(["x", "y"], [x, y]))

            # composition
            d["composition"] = RecursiveDict(
                (el, clean_value(f(x, y), convert_to_percent=True))
                for el, f in conc_funcs.items())

            # identifier
            identifier = get_composition_from_string("".join([
                "{}{}".format(el, int(round(Decimal(comp.split()[0]))))
                for el, comp in d["composition"].items()
            ]))

            # load csv file
            try:
                csv = zip_file.read(info.filename)
            except KeyError:
                print("ERROR: Did not find %s in zip file" % info.filename)
                # skip this entry instead of parsing stale/undefined content
                continue

            # read csv to pandas DataFrame and add to MPFile
            df = read_csv(csv)
            df = df[["Energy", "XAS", "XMCD"]]

            # min and max
            d.rec_update(
                RecursiveDict(
                    (col, RecursiveDict([("min", df[col].min()),
                                         ("max", df[col].max())]))
                    for col in ["XAS", "XMCD"]))

            # add data to MPFile
            mpfile.add_hierarchical_data(nest_dict(d, ["data"]),
                                         identifier=identifier)
            mpfile.add_data_table(identifier, df, name=element)
Exemplo n.º 3
0
def run(mpfile, **kwargs):
    """Load XAS/XMCD scans from a zip archive of CSV files into `mpfile`.

    The archive name is popped from ``mpfile.document['_hdata']['input_file']``
    and resolved relative to ``$HOME/work``. For every archive entry, the
    position is parsed from the file name, the composition is evaluated with
    the functions built from the ``composition_table`` section, and the
    ``Energy``/``XAS``/``XMCD`` columns are added as a data table.

    Args:
        mpfile: MPFile-like object exposing ``document``,
            ``add_hierarchical_data`` and ``add_data_table``.
        **kwargs: accepted for interface compatibility; not used here.

    Returns:
        The tuple ``('Please upload', zip_path)`` if the archive does not
        exist, otherwise None.
    """
    input_file = mpfile.document['_hdata'].pop('input_file')
    zip_path = os.path.join(os.environ['HOME'], 'work', input_file)
    if not os.path.exists(zip_path):
        return 'Please upload', zip_path

    # context manager guarantees the archive is closed, also on errors
    with ZipFile(zip_path, 'r') as zip_file:
        composition_table_dict = mpfile.document['_hdata']['composition_table']
        conc_funcs = get_concentration_functions(composition_table_dict)

        for info in zip_file.infolist():
            print(info.filename)
            d = RecursiveDict()

            # positions.x/y from filename, <scan-id>_<meas-element>_<X>_<Y>.csv
            # Only the last three fields are used; slicing keeps the unpacking
            # valid whether or not a scan-id prefix is present.
            stem = os.path.splitext(info.filename)[0]
            element, x, y = stem.rsplit('_', 3)[-3:]
            d['position'] = RecursiveDict(
                (k, clean_value(v, 'mm')) for k, v in zip(['x', 'y'], [x, y]))

            # composition
            d['composition'] = RecursiveDict(
                (el, clean_value(f(x, y), convert_to_percent=True))
                for el, f in conc_funcs.items())

            # identifier
            identifier = get_composition_from_string(''.join([
                '{}{}'.format(el, int(round(Decimal(comp.split()[0]))))
                for el, comp in d['composition'].items()
            ]))

            # load csv file
            try:
                csv = zip_file.read(info.filename)
            except KeyError:
                print('ERROR: Did not find %s in zip file' % info.filename)
                # skip this entry instead of parsing stale/undefined content
                continue

            # read csv to pandas DataFrame and add to MPFile
            df = read_csv(csv)
            df = df[['Energy', 'XAS', 'XMCD']]

            # min and max
            d.rec_update(
                RecursiveDict(
                    (col, RecursiveDict([('min', df[col].min()),
                                         ('max', df[col].max())]))
                    for col in ['XAS', 'XMCD']))

            # add data to MPFile
            mpfile.add_hierarchical_data(nest_dict(d, ['data']),
                                         identifier=identifier)
            mpfile.add_data_table(identifier, df, name=element)
Exemplo n.º 4
0
 def from_string(data):
     """Build an MPFile from an ArchieML-formatted string.

     The string is parsed with ``archieml.loads`` and the resulting root-level
     keys are normalized: general (shared) sections are nested under
     ``mp_level01_titles[0]``, identifier sections are re-inserted under their
     normalized key, CSV strings are parsed into tables and CIF strings are
     converted into structure dictionaries.

     Args:
         data (str): ArchieML-formatted contribution data.

     Returns:
         The object returned by ``MPFile.from_dict`` for the processed data.
     """
     # use archieml-python parse to import data
     rdct = RecursiveDict(archieml.loads(data))
     rdct.rec_update()
     # post-process internal representation of file contents
     # (loop over a snapshot of the keys: entries are popped and re-inserted
     # inside the loop, which is not allowed while iterating a live view)
     for key in list(rdct.keys()):
         is_general, root_key = normalize_root_level(key)
         if is_general:
             # make part of shared (meta-)data, i.e. nest under `general` at
             # the beginning of the MPFile
             if mp_level01_titles[0] not in rdct:
                 # list(...) keeps the first-key lookup working when keys()
                 # returns a non-indexable view
                 rdct.insert_before(list(rdct.keys())[0],
                                    (mp_level01_titles[0], RecursiveDict()))
             rdct.rec_update(
                 nest_dict(rdct.pop(key), [mp_level01_titles[0], root_key]))
         else:
             # normalize identifier key (pop & insert)
             # using rec_update since we're looping over all entries
             # also: support data in bare tables (marked-up only by
             #       root-level identifier) by nesting under 'data'
             value = rdct.pop(key)
             keys = [root_key]
             if isinstance(value, list):
                 keys.append('table')
             rdct.rec_update(nest_dict(value, keys))
             # Note: CSV section is marked with 'data ' prefix during iterate()
             for k, v in rdct[root_key].iterate():
                 if isinstance(k, six.string_types) and \
                    k.startswith(mp_level01_titles[1]):
                     # k = table name (incl. data prefix)
                     # v = csv string from ArchieML free-form arrays
                     table_name = k[len(mp_level01_titles[1] + '_'):]
                     pd_obj = read_csv(v)
                     rdct[root_key].pop(table_name)
                     rdct[root_key].rec_update(
                         nest_dict(pd_obj.to_dict(), [k]))
                     rdct[root_key].insert_default_plot_options(pd_obj, k)
             # convert CIF strings into pymatgen structures
             if mp_level01_titles[3] in rdct[root_key]:
                 from pymatgen.io.cif import CifParser
                 structures = rdct[root_key][mp_level01_titles[3]]
                 # snapshot the names: each entry is popped and re-inserted
                 for name in list(structures.keys()):
                     cif = structures.pop(name)
                     parser = CifParser.from_string(cif)
                     structure = parser.get_structures(primitive=False)[0]
                     rdct[root_key][mp_level01_titles[3]].rec_update(
                         nest_dict(structure.as_dict(), [name]))
     return MPFile.from_dict(rdct)
Exemplo n.º 5
0
class MPFileCore(six.with_metaclass(ABCMeta, object)):
    """Abstract Base Class for representing a MP Contribution File"""
    def __init__(self, data=RecursiveDict()):
        """Initialize the MPFile from a dict (or dict subclass).

        Args:
            data (dict): contents of the MPFile; copied into a RecursiveDict.

        Raises:
            ValueError: if `data` is not a dict.
        """
        if isinstance(data, dict):
            self.document = RecursiveDict(data)
        else:
            raise ValueError("Need dict (or inherited class) to init MPFile.")
        self.document.rec_update(
        )  # convert (most) OrderedDict's to RecursiveDict's
        self.unique_mp_cat_ids = True
        self.max_contribs = 10

    def __getitem__(self, key):
        """Return a new MPFile for section `key`, incl. the general section."""
        item = self.from_dict({key: self.document[key]})
        general = self.document.get(mp_level01_titles[0])
        if general:
            item.insert_general_section(
                self.from_dict({mp_level01_titles[0]: general}))
        return item

    @property
    def ids(self):
        """list of root-level keys other than the general section title"""
        return [
            k for k in self.document.keys()
            if k.lower() != mp_level01_titles[0]
        ]

    @property
    def hdata(self):
        """HierarchicalData view of the document"""
        return HierarchicalData(self.document)

    @property
    def tdata(self):
        """TabularData view of the document"""
        return TabularData(self.document)

    @property
    def gdata(self):
        """GraphicalData view of the document"""
        return GraphicalData(self.document)

    @property
    def sdata(self):
        """StructuralData view of the document"""
        return StructuralData(self.document)

    @classmethod
    def from_file(cls,
                  filename_or_file=default_mpfile_path.replace(
                      ".txt", "_in.txt")):
        """Reads a MPFile from a file.

        A file opened here (i.e. when a name is given) is closed again after
        reading; a file object passed in by the caller is left open.

        Args:
            filename_or_file (str or file): name of file or file containing contribution data.

        Returns:
            MPFile object.
        """
        if isinstance(filename_or_file, six.string_types):
            with open(filename_or_file) as f:
                return cls.from_string(f.read())
        return cls.from_string(filename_or_file.read())

    @classmethod
    def from_dict(cls, data=RecursiveDict()):
        """construct MPFile from a dict"""
        return cls(data=data)

    @classmethod
    def from_contribution(cls, contrib):
        """construct MPFile from contribution (see rest.adapter.submit_contribution)"""
        if "identifier" not in contrib or "content" not in contrib:
            raise ValueError("Dict not in contribution-style format")
        recdict = RecursiveDict({contrib["identifier"]: contrib["content"]})
        return cls.from_dict(recdict)

    def write_file(self,
                   filename=default_mpfile_path.replace(".txt", "_out.txt"),
                   **kwargs):
        """Writes MPFile to a file. The supported kwargs are the same as those
        for the MPFile.get_string method and are passed through directly."""
        with codecs.open(filename, encoding="utf-8", mode="w") as f:
            file_str = self.get_string(**kwargs) + "\n"
            f.write(file_str)
        # report the size only after the file is closed, i.e. once all
        # buffered content has actually been written to disk
        print("{} ({:.3f}MB) written".format(
            filename,
            os.path.getsize(filename) / 1024.0 / 1024.0))

    def get_number_of_lines(self, **kwargs):
        """number of lines in the string returned by `get_string(**kwargs)`"""
        return len(self.get_string(**kwargs).split("\n"))

    def split(self):
        """Generator yielding one single-section MPFile per contribution.

        Sections are popped off this MPFile's document while iterating. A
        ``--<suffix>`` part in an identifier is stripped, and the general
        section (if any) is inserted into every yielded MPFile.

        Raises:
            ValueError: if the document holds no sections besides the general
                section.
        """
        general_mpfile = (self.pop_first_section() if mp_level01_titles[0]
                          in self.document.keys() else None)
        if not self.document:
            raise ValueError("No contributions in MPFile! Either the file is"
                             " empty or only contains shared (meta-)data not"
                             " correlated to core identifier.")
        while True:
            try:
                mpfile_single = self.pop_first_section()
                mpid_orig = mpfile_single.ids[0]
                if "--" in mpid_orig:
                    mpid = mpid_orig.split("--")[0]
                    mpfile_single.document.rec_update(
                        nest_dict(mpfile_single.document.pop(mpid_orig),
                                  [mpid]))
                if general_mpfile is not None:
                    mpfile_single.insert_general_section(general_mpfile)
                yield mpfile_single
            except KeyError:
                # raised by popitem() once the document is exhausted
                break

    def get_identifiers(self):
        """list of materials/composition identifiers as tuples w/ contribution IDs"""
        return [(k, self.document[k].get("cid", None)) for k in self.document
                if k.lower() != mp_level01_titles[0]]

    def pop_first_section(self):
        """remove the first section from the document and return it as MPFile"""
        item = self.document.popitem(last=False)
        return self.from_dict(RecursiveDict([item]))

    def insert_general_section(self, general_mpfile):
        """insert general section from `general_mpfile` into this MPFile"""
        if general_mpfile is None:
            return
        general_title = mp_level01_titles[0]
        general_data = general_mpfile.document[general_title]
        root_key = list(self.document.keys())[0]
        for key, value in general_data.items():
            if key in self.document[root_key]:
                self.document.rec_update(nest_dict(value, [root_key, key]))
            else:
                self.document[root_key][key] = value
        # move general keys to the top, preserving their relative order
        for key in reversed(general_data.keys()):
            self.document[root_key].move_to_end(key, last=False)

    def get_unique_mp_cat_id(self, mp_cat_id):
        """Return `mp_cat_id`, suffixed with ``--<n>`` if it is already in use.

        `n` is the number of existing identifiers starting with `mp_cat_id`.
        The ID is returned unchanged if `unique_mp_cat_ids` is disabled or the
        ID is one of `mp_level01_titles`.
        """
        if not self.unique_mp_cat_ids or mp_cat_id in mp_level01_titles:
            return mp_cat_id
        mp_cat_id_idx = len([i for i in self.ids if i.startswith(mp_cat_id)])
        if mp_cat_id_idx == 0:
            return mp_cat_id
        return "{}--{}".format(mp_cat_id, mp_cat_id_idx)

    def concat(self, mpfile):
        """concatenate single-section MPFile with this MPFile"""
        try:
            if len(mpfile.document) > 1:
                raise ValueError(
                    "concatenation only possible with single section files")
        except AttributeError:
            raise ValueError("Provide a MPFile to concatenate")
        mp_cat_id = list(mpfile.document.keys())[0]
        general_title = mp_level01_titles[0]
        if general_title in mpfile.document[mp_cat_id]:
            general_data = mpfile.document[mp_cat_id].pop(general_title)
            # only adopt the general section if this MPFile has none yet
            if general_title not in self.document:
                self.document.rec_update(
                    nest_dict(general_data, [general_title]))
        self.document.rec_update(
            nest_dict(mpfile.document.pop(mp_cat_id),
                      [self.get_unique_mp_cat_id(mp_cat_id)]))

    def insert_top(self, mp_cat_id, key, value):
        """insert value for `mp_cat_id` as `key: <value>` at top"""
        self.document[mp_cat_id][key] = str(value)
        self.document[mp_cat_id].move_to_end(key, last=False)

    def add_data_table(self, identifier, dataframe, name, plot_options=None):
        """add a datatable to the root-level section

        Args:
            identifier (str): MP category ID (`mp_cat_id`)
            dataframe (pandas.DataFrame): tabular data as Pandas DataFrame
            name (str): table name, optional if only one table in section
            plot_options (dict): options for according plotly graph
        """
        # TODO: optional table name, required if multiple tables per root-level section
        name = "".join([replacements.get(c, c) for c in name])
        self.document.rec_update(
            nest_dict(Table(dataframe).to_dict(), [identifier, name]))
        self.document[identifier].insert_default_plot_options(
            dataframe, name, update_plot_options=plot_options)

    def add_hierarchical_data(self, dct, identifier=mp_level01_titles[0]):
        """Merge `dct` into the section for `identifier`.

        Args:
            dct (dict): hierarchical data to add.
            identifier (str): root-level section to nest `dct` under.

        Raises:
            StopIteration: if the number of contributions already reached
                `max_contribs`.
        """
        if len(self.ids) >= self.max_contribs:
            raise StopIteration(
                "Reached max. number of contributions in MPFile")
        self.document.rec_update(nest_dict(RecursiveDict(dct), [identifier]))

    def add_structure(self, source, name=None, identifier=None, fmt=None):
        """add a structure to the mpfile

        Args:
            source: pymatgen Structure, its dict representation, a path to a
                structure file, or a structure string (requires `fmt`).
            name (str): optional sub-key for the structure; must not contain dots.
            identifier (str): optional identifier to file the structure under.
            fmt (str): format of `source` if it is a structure string.

        Returns:
            The identifier the structure was added under.

        Raises:
            ValueError: for unsupported sources, a missing `fmt`, an invalid
                `name`, a missing API key, or if `identifier` is not among
                the structures matched in MP.
        """
        from pymatgen import Structure, MPRester

        if isinstance(source, Structure):
            structure = source
        elif isinstance(source, dict):
            structure = Structure.from_dict(source)
        elif os.path.exists(source):
            structure = Structure.from_file(source, sort=True)
        elif isinstance(source, six.string_types):
            if fmt is None:
                raise ValueError("Need fmt to get structure from string!")
            structure = Structure.from_str(source, fmt, sort=True)
        else:
            raise ValueError(source, "not supported!")

        if name is not None:
            if not isinstance(name, six.string_types):
                raise ValueError("structure name needs to be a string")
            elif "." in name:
                raise ValueError("structure name cannot contain dots (.)")

        mpr = MPRester()
        if not mpr.api_key:
            raise ValueError(
                "API key not set. Run `pmg config --add PMG_MAPI_KEY <USER_API_KEY>`."
            )
        matched_mpids = mpr.find_structure(structure)
        formula = get_composition_from_string(structure.composition.formula)
        if not matched_mpids:
            if identifier is None:
                identifier = formula
                print(
                    "Structure not found in MP! Please submit via MPComplete to "
                    "obtain mp-id or manually choose an anchor mp-id! Continuing "
                    "with {} as identifier!".format(identifier))
            else:
                print("Structure not found in MP! Forcing {} as identifier!".
                      format(identifier))
        elif identifier is None:
            identifier = matched_mpids[0]
            if len(matched_mpids) > 1:
                print("Multiple matching structures found in MP. Using",
                      identifier)
        elif identifier not in matched_mpids:
            msg = "Structure does not match {} but instead {}!".format(
                identifier, matched_mpids)
            raise ValueError(msg)

        # number of structures already stored for this identifier; used as
        # suffix to disambiguate a sub-key that is already taken
        idx = len(
            self.document.get(identifier, {}).get(mp_level01_titles[3], {}))
        sub_key = formula if name is None else name
        if sub_key in self.document.get(identifier,
                                        {}).get(mp_level01_titles[3], {}):
            sub_key += "_{}".format(idx)
        self.document.rec_update(
            nest_dict(structure.as_dict(),
                      [identifier, mp_level01_titles[3], sub_key]))
        return identifier

    def __repr__(self):
        return self.get_string(df_head_only=True)

    def __str__(self):
        return self.get_string(df_head_only=True)

    def _ipython_display_(self):
        """display hierarchical, tabular, graphical and structural data as HTML"""
        from IPython.display import display_html

        display_html(self.hdata)
        display_html(self.tdata)
        display_html(self.gdata)
        display_html(self.sdata)

    # ----------------------------
    # Override these in subclasses
    # ----------------------------

    @staticmethod
    def from_string(data):
        """Reads a MPFile from a string containing contribution data."""
        return MPFileCore()

    def get_string(self, df_head_only=False):
        """Returns a string to be written as a file"""
        return repr(self.document)
Exemplo n.º 6
0
    def from_string(data):
        """Build an MPFile from an ArchieML-formatted string.

        The string is parsed with ``loads`` and every root-level entry is
        re-inserted under its normalized key (general entries nested under
        ``mp_level01_titles[0]``). Within each section, CSV rows are parsed
        into tables, space-containing string values are converted into
        quantities if ``Quantity`` is available, and CIF strings are converted
        into structure dictionaries.

        Args:
            data (str): ArchieML-formatted contribution data.

        Returns:
            The object returned by ``MPFile.from_dict`` for the processed data.
        """
        # use archieml-python parse to import data
        rdct = RecursiveDict(loads(data))
        rdct.rec_update()

        # post-process internal representation of file contents
        for key in list(rdct.keys()):
            is_general, root_key = normalize_root_level(key)

            if is_general:
                # make part of shared (meta-)data, i.e. nest under `general` at
                # the beginning of the MPFile
                if mp_level01_titles[0] not in rdct:
                    rdct[mp_level01_titles[0]] = RecursiveDict()
                    rdct.move_to_end(mp_level01_titles[0], last=False)

            # normalize identifier key (pop & insert)
            # using rec_update since we're looping over all entries
            # also: support data in bare tables (marked-up only by
            #       root-level identifier) by nesting under 'data'
            value = rdct.pop(key)
            keys = [mp_level01_titles[0]] if is_general else []
            keys.append(root_key)
            if isinstance(value, list):
                keys.append("table")
            rdct.rec_update(nest_dict(value, keys))

            # reference to section to iterate or parse as CIF
            section = (rdct[mp_level01_titles[0]][root_key]
                       if is_general else rdct[root_key])

            # iterate to find CSV sections to parse
            # also parse propnet quantities
            if isinstance(section, dict):
                scope = []
                for k, v in section.iterate():
                    # separate name so the outer loop's `key` is not rebound
                    level, sub_key = k
                    sub_key = "".join(
                        [replacements.get(c, c) for c in sub_key])
                    if level < len(scope):
                        del scope[level:]
                    if v is None:
                        scope.append(sub_key)
                    elif (isinstance(v, list) and v
                          and isinstance(v[0], dict)):
                        # the non-empty check avoids an IndexError on `v[0]`;
                        # a single join yields the same text as joining row
                        # by row (each row preceded by a newline)
                        table = "\n".join(
                            [""] + [row_dct["value"] for row_dct in v])
                        pd_obj = read_csv(table)
                        d = nest_dict(pd_obj.to_dict(), scope + [sub_key])
                        section.rec_update(d, overwrite=True)
                        if not is_general and level == 0:
                            section.insert_default_plot_options(
                                pd_obj, sub_key)
                    elif (Quantity is not None
                          and isinstance(v, six.string_types) and " " in v):
                        quantity = Quantity.from_key_value(sub_key, v)
                        d = nest_dict(quantity.as_dict(), scope +
                                      [sub_key])  # TODO quantity.symbol.name
                        section.rec_update(d, overwrite=True)

            # convert CIF strings into pymatgen structures
            if mp_level01_titles[3] in section:
                from pymatgen.io.cif import CifParser

                # snapshot the names: entries are popped and re-inserted,
                # which is not allowed while iterating a live keys() view
                for name in list(section[mp_level01_titles[3]].keys()):
                    cif = section[mp_level01_titles[3]].pop(name)
                    parser = CifParser.from_string(cif)
                    structure = parser.get_structures(primitive=False)[0]
                    section[mp_level01_titles[3]].rec_update(
                        nest_dict(structure.as_dict(), [name]))

        return MPFile.from_dict(rdct)
Exemplo n.º 7
0
class DataGenerator(object):
    """generate MP-like data from baseball database

    database: http://seanlahman.com/files/database/lahman-csv_2014-02-14.zip
    """
    def __init__(self):
        """set up the (optional) faker instance and empty player state"""
        try:
            from faker import Faker
            self.fake = Faker()
        except Exception:
            # faker is optional; set_player falls back to the stdlib
            self.fake = None
        self.master = os.path.join(csv_database, 'Master.csv')
        self.player = None
        self.player_id = None
        self.player_info = None
        self.player_data = None

    def set_player(self):
        """retrieve player from master file as pandas.Series"""
        df = read_csv(self.master, index_col=0)
        if self.fake is not None:
            self.player_id = self.fake.random_element(elements=df.index)
        else:
            # faker unavailable: pick a random player with the stdlib instead
            import random
            self.player_id = random.choice(list(df.index))
        self.player = df.xs(self.player_id).dropna()

    def _split_string_at_caps(self, string):
        """split `string` at capitalized words, dropping the last split item"""
        return re.split(r'([A-Z][a-z]*)', string)[:-1]

    def organize_player_info(self):
        """organize player info into nested dict"""
        splits = [
            self._split_string_at_caps(label) for label in self.player.index
        ]
        counter = Counter(parts[0] for parts in splits if parts)
        # prefixes shared by more than one field become sub-sections
        subsecs = [prefix for prefix, cnt in counter.items() if cnt > 1]
        self.player_info = RecursiveDict({})
        for k, v in self.player.items():
            keys = self._split_string_at_caps(k)
            if keys and keys[0] in subsecs:
                nested = {keys[0]: {keys[1]: v}}
            else:
                nested = {'other': {k: v}}
            self.player_info.rec_update(nested)

    def generate_dataset_for_player(self):
        """generate a dataset for a player

        Sets `player_data` to the first suitable table found in the CSV
        database directory, or to None if there is none for this player.
        """
        # reset first so a dataset of a previous player is never kept
        self.player_data = None
        for file_name in os.listdir(csv_database):
            if file_name == 'Master.csv':
                continue
            try:
                df = read_csv(os.path.join(csv_database, file_name))
            except Exception:
                # skip files that cannot be parsed
                continue
            if 'playerID' not in df.columns:
                continue
            dataset = df[df['playerID'] == self.player_id].dropna()
            if dataset.empty or dataset.shape[0] < 2:
                continue
            # drop columns whose sum is falsy
            cols = [
                col for col in dataset.columns
                if not dataset[col].sum()
            ]
            self.player_data = dataset.drop(cols + ['playerID'], axis=1)
            break

    def init(self, keep_dataset=False):
        """call all setters for a player

        Args:
            keep_dataset (bool): if True, the current player is kept for the
                first attempt instead of selecting a new one.
        """
        pick_new_player = not keep_dataset
        # loop (instead of recursing) until a player with a dataset is found
        while True:
            if pick_new_player:
                self.set_player()
                self.organize_player_info()
            self.generate_dataset_for_player()
            if self.player_data is not None:
                return
            # try different player if no dataset found
            pick_new_player = True