예제 #1
0
파일: io.py 프로젝트: cipoll4/pymake
    def get_atoms(cls):
        """Describe every class exposed by the modules listed in the ``_spec`` setting.

        For each module name returned by ``get_pymake_settings('_spec')`` (a
        single string or an iterable of strings), a loader ``cls(module,
        class_filter=cls.module)`` is built and each entry of its ``packages``
        mapping is instantiated to read its ``_specs()``.

        Returns:
            dict: keyed by class ``__name__``; each value is a dict with
            ``script_name`` (the key used in ``packages``), ``module_name``
            (dotted ``module.ClassName``), ``_module`` (the matching
            ``_cls_browse`` entry) and ``exp`` (the result of ``_specs()``).
            When two modules expose the same class name, the later one wins.
        """
        atoms = dict()

        spec_modules = get_pymake_settings('_spec')
        if isinstance(spec_modules, str):
            spec_modules = [spec_modules]

        for spec_module in spec_modules:
            loader = cls(spec_module, class_filter=cls.module)

            for surname, _module in loader.packages.items():
                name = _module.__name__
                # Kept in its own name so the outer loop variable is not rebound.
                browsed = loader._cls_browse[name]

                # Import the defining module and instantiate the class by name.
                expd = getattr(import_module(browsed.module), name)()

                atoms[name] = {
                    'script_name': surname,
                    'module_name': '.'.join(
                        (_module.__module__, _module.__name__)),
                    '_module': browsed,
                    'exp': expd._specs(),
                }

        return atoms
예제 #2
0
    def fetch(self, *args):
        """Copy the remote ``.inf`` file matching ``self.output_path`` with ``scp``.

        The part of ``self.output_path`` starting at ``'.pmk/'`` is appended to
        the ``ssh_remote`` setting (plus the ``.inf`` extension) to build the
        remote file name; the file is copied into the local directory of
        ``self.output_path``, which is created if needed.

        Args:
            *args: accepted but ignored.

        Raises:
            ValueError: if ``self.output_path`` does not contain ``'.pmk/'``.
        """
        path = self.output_path
        marker = path.find('.pmk/')
        if marker < 0:
            # str.find returns -1 when the marker is absent; slicing with it
            # would keep only the last character and build a bogus remote path.
            raise ValueError("output_path does not contain '.pmk/': %r" % (path,))
        local_path = path[marker:]

        user = get_pymake_settings('ssh_user')
        machine = get_pymake_settings('ssh_machine')
        remote_loc = get_pymake_settings('ssh_remote')

        local_dir = os.path.dirname(path)
        remote_file = os.path.join(remote_loc, local_path + '.inf')

        cmd = ['scp', '%s@%s:%s' % (user, machine, remote_file), local_dir + '/']

        os.makedirs(local_dir, exist_ok=True)
        subprocess.call(cmd)
예제 #3
0
def load_stirling(style='npy'):
    """Return the stirling data, preferring the cached ``stirling.npy`` file.

    The file is looked up in the directory given by the ``project_stirling``
    setting. It is loaded with ``np.load`` only when ``style`` is ``'npy'``
    and the file exists; in every other case the result of
    ``lookup_stirling().load()`` is returned.
    """
    npy_file = os.path.join(get_pymake_settings('project_stirling'),
                            'stirling.npy')
    has_npy = os.path.isfile(npy_file)

    if style != 'npy' or not has_npy:
        # Fallback path: no usable .npy cache (or another style was requested).
        return lookup_stirling().load()

    return np.load(npy_file)
예제 #4
0
파일: io.py 프로젝트: cipoll4/pymake
    def get_packages(cls, **kwargs):
        """Return the ``packages`` of the loader(s) built from the ``_spec`` setting.

        ``class_filter`` defaults to ``cls.module`` when the caller does not
        supply it. When the setting is a list, one loader is built per entry
        and their ``packages`` are merged into a single dict (later entries
        override earlier ones); otherwise the ``packages`` attribute of the
        single loader is returned as is.
        """
        module_name = get_pymake_settings('_spec')
        if 'class_filter' not in kwargs:
            kwargs['class_filter'] = cls.module

        if not isinstance(module_name, list):
            return cls(module_name, **kwargs).packages

        merged = {}
        for single_module in module_name:
            loader = cls(single_module, **kwargs)
            merged.update(loader.packages)
        return merged
예제 #5
0
파일: io.py 프로젝트: cipoll4/pymake
def get_conf_from_file(target, mp):
    """Return the dictionary of properties encoded in an expe file path.

    The path is split on ``'/'``; the extension of the last component is
    dropped and its stem is split on ``'_'`` (format ``model_K_hyper_N``).
    The resulting tokens are matched, in order, against the keys of
    ``_MASTERKEYS``, so the order of ``_MASTERKEYS`` must be aligned with the
    layout of the path.

    Args:
        target: path of the expe file. A leading ``project_data`` prefix and
            leading slashes are ignored.
        mp: map parameters; only the master keys present in ``mp`` are read
            from the path, the others are skipped without consuming a token.

    Returns:
        dict: master key -> token found at the corresponding position.

    Raises:
        IndexError: if the path holds fewer tokens than expected.
    """
    template_file = list(_MASTERKEYS)

    data_path = get_pymake_settings('project_data')
    # Ignore the data root so only the relative path is parsed. Strings are
    # immutable: the stripped value has to be bound back to ``target``.
    if target.startswith(data_path):
        target = target[len(data_path):]

    parts = target.lstrip('/').split('/')
    stem = os.path.splitext(parts.pop())[0]
    _prop = parts + stem.split('_')

    prop = {}
    skipped = 0  # master keys absent from mp: they consume no token
    hooked = 0   # extra tokens consumed because tree_hook() fired

    # @Debug/Improve the nasty Hook here
    for i, k in enumerate(template_file):
        if k not in mp:
            skipped += 1
            continue
        pt = i - skipped + hooked
        if tree_hook(k, _prop[pt]):
            # The hook shifts this key, and every following one, by one token.
            hooked += 1
            pt += 1
        prop[k] = _prop[pt]

    return prop
예제 #6
0
파일: io.py 프로젝트: cipoll4/pymake
    def get_atoms(cls):
        """Describe every class exposed by the modules listed in the ``_script`` setting.

        Returns a dict keyed by class ``__name__``. Each value holds the class
        name, its key in the loader ``packages``, file/module information taken
        from the loader ``_cls_browse`` entry, and the set of retained method
        names. A method is retained when it is a plain function, does not start
        with an underscore and is not an attribute name of ``cls.module``;
        ``__call__`` is always replaced by the lower-cased class name.
        """
        atoms = dict()
        modules = get_pymake_settings('_script')
        if type(modules) is str:
            modules = [modules]

        for script_module in modules:
            loader = cls(script_module, class_filter=cls.module)

            for surname, _module in loader.packages.items():
                name = _module.__name__
                lowered = name.lower()
                browsed = loader._cls_browse[name]

                kept = set()
                for meth in list(browsed.methods.keys()):
                    attr = getattr(loader.packages[lowered], meth)
                    if meth == '__call__':
                        # A callable script is exposed under its own name.
                        kept.add(lowered)
                        continue
                    if not inspect.isfunction(attr):
                        continue
                    if meth.startswith('_') or meth in dir(cls.module):
                        continue
                    kept.add(meth)

                atoms[name] = {
                    'scriptname': name,
                    'scriptsurname': surname,
                    'module_file': browsed.file,
                    'module': _module.__module__,
                    '_module': _module,
                    'module_super': browsed.super,
                    'methods': kept,
                }

        return atoms
예제 #7
0
파일: io.py 프로젝트: cipoll4/pymake
    def get_atoms(cls, _type='short'):
        """Collect the packages of every entry of the ``_model`` setting.

        Args:
            _type: ``'short'`` asks ``ModelsLoader.get_packages`` to shrink
                module names, ``'topos'`` keeps them unshrunk.

        Returns:
            OrderedDict: the merge, in setting order, of what
            ``ModelsLoader.get_packages`` returns for each package.

        Raises:
            ValueError: if ``_type`` is neither ``'short'`` nor ``'topos'``.
        """
        if _type == 'short':
            shrink_module_name = True
        elif _type == 'topos':
            shrink_module_name = False
        else:
            # Previously this left ``shrink_module_name`` unbound and failed
            # later with an UnboundLocalError.
            raise ValueError(
                "unknown _type %r (expected 'short' or 'topos')" % (_type,))

        packages = get_pymake_settings('_model')
        if isinstance(packages, str):
            # A bare string would otherwise be iterated character by character.
            packages = [packages]

        atoms = OrderedDict()
        for pkg in packages:
            if len(pkg) > 8:
                # Long names: first three characters, plus the initial of each
                # dotted sub-package.
                prefix = pkg[:3]
                if '.' in pkg:
                    prefix += ''.join(part[0] for part in pkg.split('.')[1:])
            else:
                prefix = True
            atoms.update(
                ModelsLoader.get_packages(
                    pkg,
                    prefix=prefix,
                    max_depth=3,
                    shrink_module_name=shrink_module_name))
        return atoms
예제 #8
0
파일: format.py 프로젝트: cipoll4/pymake
    def load_data(self, fn):
        ''' Load the file `fn` located under the `project_data` setting.

            The loader is selected from the file extension:
            `.csv`/`.txt` -> `pandas.read_csv`, `.npy` -> `sparse.load`,
            `.npz` -> `sparse.load_npz`. Any other extension raises
            NotImplementedError. The loaded object is returned after its
            `shape` has been logged.
        '''
        path = os.path.join(get_pymake_settings('project_data'), fn)
        ext = os.path.splitext(path)[1]

        kwargs = {}
        if ext == '.csv' or ext == '.txt':
            # pandas is only imported when a text table is actually requested
            import pandas as pd
            loader = pd.read_csv
        elif ext == '.npy':
            # NOTE(review): `sparse` comes from outside this block; if it is
            # scipy.sparse, confirm that it really exposes `load` -- TODO confirm
            loader = sparse.load
        elif ext == '.npz':
            loader = sparse.load_npz
        else:
            raise NotImplementedError('extension not known: %s' % ext)

        self.log.info('Loading data: %s(%s, **%s)' %
                      (loader.__name__, path, kwargs))
        data = loader(path, **kwargs)
        self.log.info('%s data shape: %s' % (fn, str(data.shape)))

        return data
예제 #9
0
class tfidf(IndexManager):
    ''' Index documents.
        * Whoosh based.
        * format supported :
            * pdf
    '''

    # Index location: the `tfidf` folder under the `project_data` setting.
    _DATA_PATH = os.path.join(get_pymake_settings('project_data'), 'tfidf')

    _SCHEMA = {
        'document': ws.fields.Schema(
            hash=ws.fields.ID(stored=True, unique=True),
            shortpath=ws.fields.ID(stored=True, unique=True),
            fullpath=ws.fields.ID(stored=True, unique=True),
            title=ws.fields.KEYWORD(stored=True),
            authors=ws.fields.KEYWORD(stored=True),  # names of the authors '||' separated
            references=ws.fields.KEYWORD(stored=True),  # names of the references '||' separated
            date=ws.fields.KEYWORD(stored=True),  # date of publication (@todo: find it by cross reference !)
            content=ws.fields.TEXT),
        #source  = '', # name of the journal/conf etc
        #type = '', # journal/conf etc
    }

    def __init__(self, expe):
        ''' Keep a reference to `expe` and use `document` as default index. '''
        self.expe = expe
        super().__init__(default_index='document')

    def doc_yielder(self, path):
        ''' Yield the path of every pdf found under `path`.

            * If `path` is a file, `self.expe.path` is set to its directory
              and the file itself is the only item yielded.
            * If `path` does not exist, an error is logged and the
              interpreter exits.
            * Otherwise the tree is walked; files not ending with
              `.pdf`/`.PDF` and paths matching `expe.exclude_path` are skipped.
        '''
        path = os.path.expanduser(path)

        if os.path.isfile(path):
            self.expe.path = path.rpartition('/')[0] + '/'
            yield path
            return
        elif not os.path.exists(path):
            self.log.error('path error: %s' % path)
            exit()

        for root, dirnames, filenames in os.walk(path):
            for filename in filenames:
                if not filename.endswith(('.pdf', '.PDF')):
                    continue

                fullpath = os.path.join(root, filename)
                if match_pattern(fullpath, self.expe.get('exclude_path')):
                    continue

                yield fullpath

    def doc2xml(self, hit):
        ''' Run Cermine on the pdf `hit['fullpath']` and return the produced xml.

            The pdf is copied to `pdf_temp/<stem>/` inside
            `<PWD>/data/lib/cermine/`, Cermine is run on that folder and the
            resulting `.cermxml` file is read back.

            Returns the xml as a string, or `{}` when Cermine fails or
            produces no file. The working directory is restored and the
            copied pdf removed on every exit path.
        '''
        import shutil

        # 0. Init cermine usage. (one path/pdf at a time).
        fullpath = hit['fullpath']
        shortpath = hit['shortpath']
        filename = os.path.basename(fullpath)
        stem = filename.rpartition('.')[0]

        # Fall back on the real cwd when $PWD is not exported.
        pwd = os.getenv('PWD') or os.getcwd()
        os.chdir(os.path.join(pwd, 'data/lib/cermine/'))
        try:
            cermine_tar_dir = 'pdf_temp/' + stem + '/'
            if not os.path.exists(cermine_tar_dir):
                os.makedirs(cermine_tar_dir)
            shutil.copy(fullpath, cermine_tar_dir)

            try:
                # 1. run Cermine
                jar = 'cermine-impl-1.14-SNAPSHOT-jar-with-dependencies.jar'
                classes = 'pl.edu.icm.cermine.ContentExtractor'
                try:
                    self.log.info('extracting content of: %s' % (shortpath))
                    subprocess.check_output(
                        ['java', '-cp', jar, classes, '-path', cermine_tar_dir])
                except Exception as e:
                    self.log.error('Cermine Error %s : ' % e)
                    self.log.error('Please try install/upgrade Cermine for pdf data extraction.')
                    return {}

                # 2. get the xml information
                cermine_file = cermine_tar_dir + stem + '.cermxml'
                if not os.path.isfile(cermine_file):
                    self.log.error('Cermine failed...')
                    return {}
                with open(cermine_file) as xml_file:
                    return xml_file.read()
            finally:
                # remove the copied pdf, whatever the outcome
                os.remove(cermine_tar_dir + filename)
        finally:
            # never leave the process inside the cermine folder
            os.chdir(pwd)

    # Two assumptions :
    #    * string is a pdf,
    #    * it is structured as a scientific paper (journal ?).
    def extract_structured_kw(self, hit):
        ''' Extract title, authors and references from the Cermine xml of `hit`.

            Returns a dict with the keys `title`, `authors` and `references`
            (the last two are ' || ' separated strings), or `{}` when the xml
            could not be produced or parsed.
        '''
        xml_strings = self.doc2xml(hit)
        if not xml_strings:
            # doc2xml already logged the failure; there is nothing to parse.
            return {}

        try:
            from bs4 import BeautifulSoup
        except ImportError:
            self.log.error('Please install BeautifulSoup4 to parse xml doc.')
            return {}

        try:
            soup = BeautifulSoup(xml_strings, 'lxml')
        except Exception as e:
            self.log.error('BeautifulSoup fail to parse a file:  %s : ' % e)
            return {}

        structured = {}

        # Main title
        # max probable title from cermine
        front = soup.front
        # `front` is None when the document has no <front> element.
        front_titles = front.findAll(re.compile(".*title.*")) if front is not None else []
        structured['title'] = ' '.join([o.string or '' for o in front_titles]).strip()

        authors = []
        for contrib in soup.findAll(attrs={'contrib-type': 'author'}):
            authors.extend(contrib.findAll('string-name'))
        # `.string` is None for tags with nested markup; skip those names.
        structured['authors'] = ' || '.join(
            [o.string for o in authors if o.string is not None])

        # Institution, Journal, Year etc...: not extracted yet.

        # References
        references = [' '.join(str(r).split()) for r in soup.findAll('mixed-citation')]
        structured['references'] = ' || '.join(references)

        return structured

    def fit(self):
        ''' Index every pdf yielded by `doc_yielder(self.expe.path)`.

            * A document whose shortpath is already indexed is skipped.
            * A document with no extracted text is skipped.
            * A document whose content hash is already indexed is treated as
              a moved/duplicated file: the stored entry is updated with the
              new shortpath.
            * Otherwise the document is added, enriched with the structured
              fields when `expe.extract_structure` is set.

            The writer is stored in `self.writer` so that `close()` can
            release it.
        '''
        voca = Vocabulary(exclude_stopwords=True)
        writer = self.get_writer(reset=self.expe.reset, online=True)
        self.writer = writer

        for path in self.doc_yielder(self.expe.path):

            fullpath = path
            # `self.expe.path` is read here (not before the loop) because
            # doc_yielder may rewrite it when given a single file.
            shortpath = '/' + fullpath[len(os.path.expanduser(self.expe.path)):].rstrip('/').lstrip('/')

            if self.getfirst(shortpath, 'shortpath'):
                # don't update document: assume already indexed
                # could compute a diff here...
                continue

            text = extract_pdf(fullpath)
            text = voca.remove_stopwords(text)
            if text in (None, ''):
                # do nothing
                continue

            doc = dict(shortpath=shortpath, fullpath=fullpath)
            doc['content'] = text
            doc['hash'] = hash_objects(text)

            first_m = self.getfirst(doc['hash'], 'hash')
            if first_m:
                # Same content already indexed under another path.
                self.log.warning("Duplicate file detected: %s renaming to %s" % (first_m['shortpath'], shortpath))
                first_m['shortpath'] = shortpath
                writer.update_document(**first_m)
                continue

            if self.expe.extract_structure:
                # structured content
                doc.update(self.extract_structured_kw(doc))

            print("indexing `%s'" % (path))
            try:
                writer.add_document(**doc)
            except Exception:
                # best effort: one bad document must not stop the indexing
                print('indexing doc %s failed!' % fullpath)

        return

    def close(self):
        ''' Close the writer opened by `fit()`, if any; errors are printed. '''
        if hasattr(self, 'writer'):
            try:
                self.writer.close()
            except Exception as e:
                print('Whoosh error: %s' % e)
예제 #10
0
파일: io.py 프로젝트: cipoll4/pymake
def forest_tensor(target_files, map_parameters):
    """Build the results tensor from a collection of expe files.

    One axis is created per key of ``map_parameters`` (in its iteration
    order), followed by one axis per entry of ``_New_Dims``. For each target
    file, the position along the parameter axes is the index of the file's
    property values (see ``get_conf_from_file``) in ``map_parameters``; the
    last axis is indexed by the position of the measure in ``_Key_measures``.

    Args:
        target_files: paths relative to the ``project_data`` setting. Must be
            ordered consistently with the file properties.
        map_parameters: ordered mapping ``key -> list of admissible values``.

    Returns:
        numpy masked array with NaN cells masked, or ``None`` when
        ``target_files`` is empty.

    Raises:
        ValueError: if a file property is not listed in ``map_parameters``.
    """
    # res shape ([expe], [model], [measure]
    # =================================================================================
    # Expe: [debug, corpus] -- from the dirname
    # Model: [name, K, hyper, h**o] -- from the expe filename
    # measure:
    #   * 0: global precision,
    #   * 1: local precision,
    #   * 2: recall

    if not target_files:
        lgg.info('Target Files empty')
        return None

    dim = {k: len(v) if isinstance(v, (list, tuple)) else 1
           for k, v in map_parameters.items()}

    # A real list is required: dict views cannot be appended to or indexed.
    rez_map = list(map_parameters)  # order !
    # Expert knowledge value
    new_dims = _New_Dims
    # Update Mapping
    for d in new_dims:
        dim.update(d)
        rez_map.append(next(iter(d)))

    # Shape of the results tensor, one axis per mapped key
    shape = [dim[n] for n in rez_map]

    # Array storing every experiment value; NaN means "no value"
    rez = np.full(shape, np.nan)

    not_finished = []
    for _f in target_files:
        prop = get_conf_from_file(_f, map_parameters)
        # Zero-initialised so that axes not set below hold a defined index.
        pt = np.zeros(rez.ndim, dtype=int)

        assert (len(pt) - len(new_dims) == len(prop))
        for k, v in prop.items():
            try:
                v = int(v)
            except (TypeError, ValueError):
                # not a number: keep the raw token
                pass
            try:
                idx = map_parameters[k].index(v)
            except Exception as e:
                lgg.error(prop)
                lgg.error('key:value error --  %s, %s' % (k, v))
                raise ValueError('key:value error --  %s, %s' % (k, v)) from e
            pt[rez_map.index(k)] = idx

        f = os.path.join(get_pymake_settings('project_data'), _f)
        d = load(f)
        if not d:
            not_finished.append('%s not finish...\n' % _f)
            continue

        try:
            pt = pt.tolist()
            for i, v in enumerate(_Key_measures):
                pt[-1] = i
                ### HOOK
                # v:  is the measure name
                # json_v: the value of the measure
                if v == 'homo_model_e':
                    try:
                        json_v = d.get('homo_model_o') - d.get(v)
                    except TypeError:
                        # missing operand: leave the cell as NaN instead of
                        # writing the value of the previous measure
                        continue
                elif v == 'f1':
                    precision = d.get('Precision')
                    recall = d.get('Recall')
                    if recall is None:
                        # future remove: older key name
                        recall = d.get('Rappel')
                    json_v = 2 * precision * recall / (precision + recall)
                else:
                    json_v = d.get(v)
                    if json_v is None and v == 'Recall':
                        # future remove: older key name
                        json_v = d.get('Rappel')
                # Index with a tuple to address a single cell.
                rez[tuple(pt)] = json_v

        except IndexError as e:
            lgg.error(e)
            lgg.error(
                'Index Error: Files are probably missing here to complete the results...\n'
            )

    lgg.debug(''.join(not_finished))
    rez = np.ma.masked_array(rez, np.isnan(rez))
    return rez
예제 #11
0
    def __init__(self, default_index='model'):
        """Set up the index store.

        The data path is the ``.pmk`` folder under the ``PWD`` setting;
        ``default_index`` is recorded as the default index name and the
        per-key index cache starts empty.
        """
        pwd = get_pymake_settings('PWD')
        self._DATA_PATH = os.path.join(pwd, '.pmk')

        self._ix = {}  # Index store by key
        self._default_index = default_index
        self._index_basename = 'ir_index'
예제 #12
0
파일: format.py 프로젝트: cipoll4/pymake
 def full_fig_path(self, fn):
     """Return the figure path for `fn`, after passing it to `make_path`.

     The path is `<project_figs setting>/<expe['_refdir'] or ''>/<specname(fn)>`.
     """
     figs_root = get_pymake_settings('project_figs')
     refdir = self.expe.get('_refdir', '')
     spec_name = self.specname(fn)

     path = os.path.join(figs_root, refdir, spec_name)
     make_path(path)
     return path
예제 #13
0
파일: format.py 프로젝트: cipoll4/pymake
 def get_data_path(self):
     """Return the `project_data` setting with a trailing path separator."""
     data_root = get_pymake_settings('project_data')
     # Joining with an empty component appends the separator when missing.
     return os.path.join(data_root, '')