Example #1
File: io.py Project: gitter-badger/pymake
    def get_packages(cls, **kwargs):
        module_name = get_pymake_settings('_spec')
        if 'class_filter' not in kwargs:
            kwargs['class_filter'] = cls.module

        if isinstance(module_name, list):
            packs = {}
            for m in module_name:
                packs.update(cls(m, **kwargs).packages)
            return packs
        else:
            return cls(module_name, **kwargs).packages
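A minimal usage sketch for `get_packages`, assuming this is the classmethod variant that reads the `_spec` setting (the `SpecLoader` import seen in example #7 suggests the loader name) and that the pymake settings are configured:

from pymake.io import SpecLoader  # assumed loader; the '_spec' setting suggests it

# 'class_filter' defaults to cls.module when omitted; a list-valued
# '_spec' setting merges the packages of every listed module.
packs = SpecLoader.get_packages()
for surname, package in packs.items():
    print(surname, package)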
Example #2
    def check_model_typo(self):
        ''' Assume default module is pymake '''
        for tensor in self._tensors:
            models = tensor.get('model', [])
            for i, m in enumerate(models):

                if '.' not in m:
                    # Set the model ref name
                    pkg = get_pymake_settings('default_model')
                    if len(pkg) > 8:
                        prefix = pkg[:3]
                        if '.' in pkg:
                            prefix += ''.join(map(lambda x: x[0], pkg.split('.')[1:]))
                    else:
                        prefix = pkg.split('.')[0]

                    models[i] = '%s.%s'%(prefix, m)
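To make the prefixing rule concrete, a small worked illustration (the package name is hypothetical):

# Hypothetical package name to illustrate the prefix rule above.
pkg = 'pymake.model.deep'                             # len(pkg) > 8, contains '.'
prefix = pkg[:3]                                      # 'pym'
prefix += ''.join(x[0] for x in pkg.split('.')[1:])   # + 'md' -> 'pymmd'
# A bare model name 'mmsb' is then rewritten to 'pymmd.mmsb'.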
Example #3
File: io.py Project: gitter-badger/pymake
    def get_atoms(cls, _type='short'):
        if _type == 'short':
            shrink_module_name = True
        elif _type == 'topos':
            shrink_module_name = False
        else:
            # any other value would leave shrink_module_name undefined below
            raise ValueError('unknown _type: %s' % _type)

        packages = get_pymake_settings('_model')
        atoms = OrderedDict()
        for pkg in packages:
            if len(pkg) > 8:
                prefix = pkg[:3]
                if '.' in pkg:
                    prefix += ''.join(map(lambda x: x[0], pkg.split('.')[1:]))
            else:
                prefix = True
            atoms.update(ModelsLoader.get_packages(pkg, prefix=prefix, max_depth=3, shrink_module_name=shrink_module_name))
        return atoms
Example #4
File: io.py Project: gitter-badger/pymake
    def get_atoms(cls):
        atoms = dict()
        modules = get_pymake_settings('_script')
        modules = [modules] if isinstance(modules, str) else modules
        for module in modules:

            s = cls(module, class_filter=cls.module)

            ## get the decorators for each class
            #class2met2dec = {}
            #for method, _class in s.packages.items():
            #    append decorator information to filter @atpymake

            for surname, _module in s.packages.items():
                name = _module.__name__
                module = s._cls_browse[name]
                methods = list(module.methods.keys())
                for m in methods.copy():
                    _m = getattr(s.packages[name.lower()], m)
                    if not inspect.isfunction(_m) and m != '__call__':
                        methods.remove(m)
                    elif '__call__' == m:
                        methods.remove('__call__')
                        methods.append(name.lower())
                    elif m.startswith('_'):
                        methods.remove(m)
                    elif m in dir(cls.module):
                        methods.remove(m)

                content = {}
                content['scriptname'] = name
                content['scriptsurname'] = surname
                content['module_file'] = module.file
                content['module'] = _module.__module__
                content['_module'] = _module
                #content['module_name'] = '.'.join((module.name, module.module))
                content['module_super'] = module.super
                content['methods'] = methods
                atoms[name] = content

        return atoms
Example #5
File: io.py Project: gitter-badger/pymake
def get_conf_from_file(target, mp):
    """ Return dictionary of property for an expe file.
        @mp: map parameters
        format model_K_hyper_N
        @template_file order important to align the dictionnary.
        """
    masterkeys = _MASTERKEYS.copy()
    template_file = masterkeys.keys()
    ##template_file = 'networks/generator/Graph13/debug11/immsb_10_auto_0_all.*'

    data_path = get_pymake_settings('project_data')
    # Ignore the relative data path prefix
    if target.startswith(data_path):
        target = target.replace(data_path, '')  # str.replace returns a new string; rebind it

    path = target.lstrip('/').split('/')

    _prop = os.path.splitext(path.pop())[0]
    _prop = path + _prop.split('_')

    prop = {}
    cpt_hook_master = 0
    cpt_hook_user = 0
    # @Debug/Improve the nasty Hook here
    def update_pt(cur, master, user):
        return cur - master + user

    #prop = {k: _prop[i] for i, k in enumerate(template_file) if k in mp}
    for i, k in enumerate(template_file):
        if k not in mp:
            cpt_hook_master += 1
            continue
        pt = update_pt(i, cpt_hook_master, cpt_hook_user)
        hook = tree_hook(k, _prop[pt])
        if hook:
            cpt_hook_user += 1
            pt = update_pt(i, cpt_hook_master, cpt_hook_user)
        prop[k] = _prop[pt]

    return prop
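For intuition, the core of the filename alignment in isolation (a minimal sketch; the real function also strips the data path and applies the `tree_hook` corrections):

import os

target = 'debug11/immsb_10_auto_0_all.pk'    # hypothetical expe file
path = target.lstrip('/').split('/')         # ['debug11', 'immsb_10_auto_0_all.pk']
_prop = os.path.splitext(path.pop())[0]      # 'immsb_10_auto_0_all'
_prop = path + _prop.split('_')              # ['debug11', 'immsb', '10', 'auto', '0', 'all']
# Each entry is then matched, in order, against the keys of _MASTERKEYS.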
Example #6
File: io.py Project: gitter-badger/pymake
    def get_atoms(cls):
        atoms = dict()

        modules = get_pymake_settings('_spec')
        modules = [modules] if isinstance(modules, str) else modules
        for module in modules:

            s = cls(module, class_filter=cls.module)

            for surname, _module in s.packages.items():
                name = _module.__name__
                module = s._cls_browse[name]

                expd = getattr(import_module(module.module), name)()

                content = {}
                content['script_name'] = surname
                content['module_name'] = '.'.join((_module.__module__, _module.__name__))
                content['_module'] = module
                content['exp'] = expd._specs()
                atoms[name] = content

        return atoms
Example #7
class IndexManager(object):

    _DATA_PATH = os.path.join(get_pymake_settings('project_data'), '.pmk')

    _SCHEMA = {
        'model':
        ws.fields.Schema(name=ws.fields.ID(stored=True),
                         surname=ws.fields.ID(stored=True),
                         module=ws.fields.ID(stored=True),
                         category=ws.fields.KEYWORD(stored=True),
                         content=ws.fields.TEXT),
        'script':
        ws.fields.Schema(scriptname=ws.fields.ID(stored=True),
                         scriptsurname=ws.fields.ID(stored=True),
                         module=ws.fields.ID(stored=True),
                         method=ws.fields.KEYWORD(stored=True),
                         signature=ws.fields.TEXT(stored=True),
                         content=ws.fields.TEXT),
        'spec':
        ws.fields.Schema(module_name=ws.fields.ID(stored=True),
                         script_name=ws.fields.ID(stored=True),
                         expe_name=ws.fields.ID(stored=True),
                         content=ws.fields.TEXT),
    }

    log = logging.getLogger('root')

    def __init__(self, default_index='model'):
        self._index_basename = 'ir_index'
        self._default_index = default_index
        self._ix = {}  # Index store by key

    def get_index_path(self, name=None):
        name = name or self._default_index
        return os.path.join(self._DATA_PATH, self._index_basename, name + '/')

    def clean_index(self, name=None, schema=None, **kwargs):
        ''' Create the index `name' according to its `schema' '''
        name = name or self._default_index
        index_path = self.get_index_path(name)
        if os.path.exists(index_path):
            shutil.rmtree(index_path)
        os.makedirs(index_path)

        if name in self._SCHEMA:
            schema = self._SCHEMA[name]
        else:
            raise NotImplementedError(
                "Don't know what to do: no schema defined for %r" % name)

        self._ix[name] = ws.index.create_in(index_path, schema)

        return self._ix[name]

    def load_index(self, name=None):
        name = name or self._default_index
        index_path = self.get_index_path(name)
        return ws.index.open_dir(index_path)

    def get_index(self, name=None):
        name = name or self._default_index
        if name in self._ix:
            return self._ix[name]
        elif os.path.exists(self.get_index_path(name)):
            try:
                return self.load_index(name)
            except Exception as e:
                print('Whoosh index i/o error: %s' % e)
                print('Remove the data/ir_index folder to clean the index.')
                exit(2)
        else:
            return self.clean_index(name)

    def get_writer(self, reset=False, online=None, index=None):
        index = index or self._default_index
        if reset:
            ix = self.clean_index(index)
        else:
            ix = self.get_index(index)

        if online:
            import whoosh.writing
            if online is True:
                online = {}
            period = online.get('period', 600)
            limit = online.get('limit', 2)
            return ws.writing.BufferedWriter(ix, period=period, limit=limit)
        else:
            return ix.writer()

    def get_reader(self, index=None):
        index = index or self._default_index
        ix = self.get_index(index)
        return ix.searcher()

    @classmethod
    def build_indexes(cls, index_name=None):
        ''' Update the system index '''
        idx = cls()

        if index_name is None:
            schemas = idx._SCHEMA
        else:
            schemas = [index_name]

        for name in schemas:
            func = 'update_' + name + '_index'
            builder = getattr(idx, func)
            builder()

    def update_corpus_index(self):
        raise NotImplementedError

    def update_spec_index(self):
        ''' Update the schema of the Spec index '''
        from pymake.io import SpecLoader
        model = 'spec'
        self.log.info('Building %s index...' % model)
        Specs = SpecLoader.get_atoms()
        writer = self.clean_index(model).writer()
        for scriptname, _content in Specs.items():
            self.log.info('\tindexing %s' %
                          (str(scriptname) + str(_content['_module'])))

            for expe in _content['exp']:

                content = ''
                writer.add_document(script_name=_content['script_name'],
                                    module_name=_content['module_name'],
                                    expe_name=expe,
                                    content=content)
        writer.commit()

    def update_script_index(self):
        ''' Update the schema of the Scripts index '''
        from pymake.io import ScriptsLoader
        model = 'script'
        self.log.info('Building %s index...' % model)
        Scripts = ScriptsLoader.get_atoms()
        writer = self.clean_index(model).writer()
        for scriptname, _content in Scripts.items():
            self.log.info('\tindexing %s' %
                          (str(scriptname) + str(_content['_module'])))

            # The loop is context/model dependent
            for method in _content['methods']:

                content = ''
                writer.add_document(scriptname=_content['scriptname'],
                                    scriptsurname=_content['scriptsurname'],
                                    module=_content['module'],
                                    method=method,
                                    content=content)
        writer.commit()

    def update_model_index(self):
        ''' Update the schema of the Models index '''
        from pymake.io import ModelsLoader
        model = 'model'
        self.log.info('Building %s index...' % model)
        models = ModelsLoader.get_atoms()
        writer = self.clean_index(model).writer()
        for surname, module in models.items():
            self.log.info('\tindexing %s' % (str(surname) + str(module)))

            # The loop is context/model dependent
            topos = ' '.join(set(module.__module__.split('.')[1:]))
            content = ' '.join((surname, module.__name__, module.__module__))

            writer.add_document(surname=surname,
                                name=module.__name__,
                                category=topos,
                                module=module.__module__,
                                content=content)
        writer.commit()

    # @debug : online searcher
    def _search(self,
                query='',
                field=None,
                index=None,
                terms=False,
                limit=None):
        ''' query (exact match) search '''
        index = index or self._default_index
        ix = self.get_index(index)
        fieldin = field or 'content'

        qp = QueryParser(fieldin, ix.schema)
        qp.add_plugin(ws.qparser.SingleQuotePlugin())
        query = qp.parse(query, normalize=False)
        with ix.searcher() as searcher:
            if terms is True:
                results = searcher.search(query, terms=True,
                                          limit=limit).matched_terms()
            else:
                results = list(searcher.search(query, limit=limit).items())

        return results

    def search(self, query='', field=None, index=None, limit=None):
        ''' Text search '''
        index = index or self._default_index
        limit = None if limit == 'all' else limit
        ix = self.get_index(index)
        fieldin = field or 'content'

        fuzzy = '~' in query
        wildcard = '*' in query
        plusminus = '+' in query or '-' in query
        multifield = ':' in query
        boost = '^' in query

        qp = QueryParser(fieldin, ix.schema, group=OrGroup)

        # @todo: check the difference between these two plugins
        qp.add_plugin(ws.qparser.SingleQuotePlugin())
        qp.add_plugin(ws.qparser.PhrasePlugin())

        if wildcard:
            qp.add_plugin(ws.qparser.WildcardPlugin())
        if fuzzy:
            qp.add_plugin(ws.qparser.FuzzyTermPlugin())
        if plusminus:
            qp.add_plugin(ws.qparser.PlusMinusPlugin())
        if multifield:
            qp.add_plugin(ws.qparser.MultifieldPlugin([fieldin]))
        if boost:
            qp.add_plugin(ws.qparser.BoostPlugin())

        query = qp.parse(query)
        with ix.searcher() as searcher:
            results = searcher.search(query, limit=limit)

            results.fragmenter = ws.highlight.SentenceFragmenter(
                maxchars=200, charlimit=100042)
            #results.fragmenter = ws.highlight.ContextFragmenter(maxchars=200, surround=43)
            results.formatter = TerminalFormatter()

            for r in results:
                yield r

    def getbydocid(self, docid, index=None):
        ''' Return a document's stored fields from the index, given its docid '''
        index = index or self._default_index
        ix = self.get_index(index)
        with ix.searcher() as searcher:
            doc = searcher.stored_fields(docid)
        return doc

    def getfirst(self, query='', field=None, index=None):
        query = "'" + query + "'"
        results = self._search(query, field, index, limit=1)

        if not results:
            return None
        else:
            return self.getbydocid(results[0][0])

    # no need to commit?! / conflict forward...
    #def delete_by_term(self, term, field):
    #    writer.delete_by_term('hash', doc['hash'])

    def getbydocids(self, docids, index=None):
        ''' Return a list of documents' stored fields from the index, given docids '''
        index = index or self._default_index
        ix = self.get_index(index)
        docs = []
        with ix.searcher() as searcher:
            for docid in docids:
                docs.append(searcher.stored_fields(docid))
        return docs

    # @debug : online searcher
    # @debug : get a list of terms (mongo projection equivalent ?!)
    def query(self, field=None, index=None, terms=False):
        ''' Return all objects that have the `field` entry set '''
        index = index or self._default_index
        ix = self.get_index(index)
        field = field or ix.schema.stored_names()[0]

        query = ws.query.Every(field)
        with ix.searcher() as searcher:
            results = searcher.search(query, limit=None)
            if terms is False:
                results = [r[field] for r in results]
            elif isinstance(terms, str):
                results = dict((o[field], o[terms]) for o in results)
            else:
                results = [dict(r) for r in results]

        return results
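A short usage sketch for IndexManager, assuming the pymake settings are in place and `ws` aliases whoosh as above (the query strings are hypothetical):

# Build all system indexes (model, script, spec), then query them.
IndexManager.build_indexes()

im = IndexManager(default_index='model')
for hit in im.search('mmsb*'):                 # '*' triggers the WildcardPlugin
    print(hit)

doc = im.getfirst('immsb', field='surname')    # exact-match lookup, first hit
surnames = im.query(field='surname')           # every stored 'surname' value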
Example #8
class tfidf(IndexManager):
    ''' Index documents.
        * Whoosh based.
        * formats supported:
            * pdf
    '''

    _DATA_PATH = os.path.join(get_pymake_settings('project_data'), 'tfidf')

    _SCHEMA = {
        'document':
        ws.fields.Schema(hash=ws.fields.ID(stored=True, unique=True),
                         shortpath=ws.fields.ID(stored=True, unique=True),
                         fullpath=ws.fields.ID(stored=True, unique=True),
                         title=ws.fields.KEYWORD(stored=True),
                         authors=ws.fields.KEYWORD(stored=True),  # author names, '||' separated
                         references=ws.fields.KEYWORD(stored=True),  # reference names, '||' separated
                         date=ws.fields.KEYWORD(stored=True),  # date of publication (@todo: find it by cross reference!)
                         content=ws.fields.TEXT),
        #source = '',  # name of the journal/conf etc
        #type = '',    # journal/conf etc
    }

    def __init__(self, expe):
        self.expe = expe
        super().__init__(default_index='document')

    def doc_yielder(self, path):
        ''' Find all pdfs under `path` and yield their full paths '''

        path = os.path.expanduser(path)

        if os.path.isfile(path):
            self.expe.path = path.rpartition('/')[0] + '/'
            yield path
            return
        elif not os.path.exists(path):
            self.log.error('path error: %s' % path)
            exit()

        for root, dirnames, filenames in os.walk(path):
            for filename in filenames:
                if not filename.endswith(('.pdf','.PDF')):
                    continue

                fullpath = os.path.join(root, filename)
                if match_pattern(fullpath, self.expe.get('exclude_path')):
                    continue

                yield fullpath


    def doc2xml(self, hit):
        import shutil

        # 0. Init cermine usage. (one path/pdf at a time).
        filename = os.path.basename(hit['fullpath'])
        fullpath = hit['fullpath']
        shortpath = hit['shortpath']
        pwd = os.getenv('PWD')
        os.chdir(os.path.join(pwd, 'data/lib/cermine/'))
        cermine_tar_dir = 'pdf_temp/'+filename.rpartition('.')[0] + '/'
        if not os.path.exists(cermine_tar_dir):
            os.makedirs(cermine_tar_dir)
        shutil.copy(hit['fullpath'], cermine_tar_dir)


        # 1. run Cermine
        jar = 'cermine-impl-1.14-SNAPSHOT-jar-with-dependencies.jar'
        classes = 'pl.edu.icm.cermine.ContentExtractor'
        try:
            self.log.info('extracting content of: %s' % (shortpath))
            output = subprocess.check_output(['java', '-cp', jar, classes, '-path', cermine_tar_dir])
        except Exception as e:
            self.log.error('Cermine Error %s : ' % e)
            self.log.error('Please try install/upgrade Cermine for pdf data extraction.')
            os.remove(cermine_tar_dir + filename) # remove the copied pdf
            os.chdir(pwd)
            return {}

        # 2. get the xml information
        cermine_file = cermine_tar_dir+ filename.rpartition('.')[0] + '.cermxml'
        if not os.path.isfile(cermine_file):
            self.log.error('Cermine failed...')
            return {}
        xml_strings = open(cermine_file).read()

        os.remove(cermine_tar_dir + filename) # remove the copied pdf
        os.chdir(pwd)
        return xml_strings

    # Two assumptions:
    #    * the string is a pdf,
    #    * it is structured as a scientific paper (journal?).
    def extract_structured_kw(self, hit):
        structured = {}

        xml_strings = self.doc2xml(hit)

        try:
            from bs4 import BeautifulSoup
        except ImportError:
            self.log.error('Please install BeautifulSoup4 to parse xml doc.')
            return {}

        try:
            soup = BeautifulSoup(xml_strings, 'lxml')
        except Exception as e:
            self.log.error('BeautifulSoup fail to parse a file:  %s : ' % e)
            return {}

        #titles = soup.findAll(re.compile(".*title.*"))

        # Main title
        # max probable title from cermine
        front = soup.front
        front_titles = front.findAll(re.compile(".*title.*"))
        #print(front_titles)
        main_title = ' '.join([o.string or '' for o in front_titles]).strip()
        structured['title'] = main_title

        authors = soup.findAll(attrs={'contrib-type':'author'})
        authors = [o.findAll('string-name') for o in authors]
        authors = sum(authors, [])
        authors = ' || '.join([o.string for o in authors])
        structured['authors'] = authors

        # Institution, Journal, Year, etc... (@todo)

        # References
        references = [ ' '.join(str(r).split()) for r in soup.findAll('mixed-citation')]
        structured['references'] = ' || '.join(references)

        return structured


    def fit(self):
        voca = Vocabulary(exclude_stopwords=True)
        writer = self.get_writer(reset=self.expe.reset, online=True)
        setattr(self, 'writer', writer)

        for _it, path in enumerate(self.doc_yielder(self.expe.path)):

            fullpath = path
            shortpath = '/' + fullpath[len(os.path.expanduser(self.expe.path)):].strip('/')

            is_known = False
            is_duplicated = False

            if self.getfirst(shortpath, 'shortpath'):
                # don't update document
                # could compute a diff here...
                is_known = True # assume already indexed
            else:
                text = extract_pdf(fullpath)
                text = voca.remove_stopwords(text)
                #bow = voca.doc2bow(text)
                if text in (None, ''):
                    # do nothing
                    continue

                doc = dict(shortpath=shortpath, fullpath=fullpath)
                doc['content'] = text
                doc['hash'] = hash_objects(text)

                first_m = self.getfirst(doc['hash'], 'hash')
                if first_m:
                    #if not 'content' in first_m:
                    #    writer.delete_by_term('hash', doc['hash'])
                    #    continue
                    # don't update document
                    self.log.warning("Duplicate file detected: %s renaming to %s" % (first_m['shortpath'], shortpath))
                    first_m['shortpath'] = shortpath
                    writer.update_document(**first_m)
                    is_duplicated = True
                else:
                    if self.expe.extract_structure:
                        # structured content
                        structured = self.extract_structured_kw(doc)
                        doc.update(structured)

            if not (is_known or is_duplicated):
                print("indexing `%s'" % (path))
                try:
                    writer.add_document(**doc)
                except Exception as e:
                    print('indexing doc %s failed! %s' % (fullpath, e))

        return

    def close(self):
        if hasattr(self, 'writer'):
            try:
                self.writer.close()
            except Exception as e:
                print('Whoosh error: %s' % e)
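And a sketch of driving the tfidf indexer; `expe` only needs to be a dict-like object exposing the attributes the class reads (`path`, `reset`, `extract_structure`, `exclude_path`), so a stand-in is assumed here:

# Hypothetical stand-in for the expe settings object.
class Expe(dict):
    __getattr__ = dict.get

expe = Expe(path='~/papers/', reset=False,
            extract_structure=False, exclude_path=None)

indexer = tfidf(expe)
indexer.fit()     # walk the path, extract pdf text, index new documents
indexer.close()   # flush the BufferedWriter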
Example #9
File: io.py Project: gitter-badger/pymake
def forest_tensor(target_files, map_parameters):
    """ It has to be ordered the same way than the file properties.
        Fuze directory to find available files then construct the tensor
        according the set space fomed by object found.
        @in target_files has to be orderedDict to align the the tensor access.
    """
    # Expe analyser / Tabulyze It

    # res shape ([expe], [model], [measure])
    # =================================================================================
    # Expe: [debug, corpus] -- from the dirname
    # Model: [name, K, hyper, h**o] -- from the expe filename
    # measure:
    #   * 0: global precision,
    #   * 1: local precision,
    #   * 2: recall

    ### Output: rez.shape rez_map_l rez_map
    if not target_files:
        lgg.info('Target Files empty')
        return None

    #dim = get_conf_dim_from_files(target_files, map_parameters) # Rely on Expe...
    dim = dict( (k, len(v)) if isinstance(v, (list, tuple)) else (k, len([v])) for k, v in map_parameters.items() )

    rez_map = list(map_parameters.keys())  # order matters!
    # Expert knowledge values
    new_dims = _New_Dims
    # Update the mapping
    for d in new_dims:
        dim.update(d)
    for n in new_dims:
        rez_map.append(next(iter(n)))

    # Create the shape of the Analysis/Results tensor
    #rez_map = dict(zip(rez_map_l, range(len(rez_map_l))))
    shape = []
    for n in rez_map:
        shape.append(dim[n])

    # Create the numpy array that stores all experiment values, with various settings
    rez = np.full(shape, np.nan)

    not_finished = []
    info_file = []
    for _f in target_files:
        prop = get_conf_from_file(_f, map_parameters)
        pt = np.empty(rez.ndim)

        assert(len(pt) - len(new_dims) == len(prop))
        for k, v in prop.items():
            try:
                v = int(v)
            except (ValueError, TypeError):
                pass
            try:
                idx = map_parameters[k].index(v)
            except Exception as e:
                lgg.error(prop)
                lgg.error('key:value error --  %s, %s'% (k, v))
                raise ValueError
            pt[rez_map.index(k)] = idx

        f = os.path.join(get_pymake_settings('project_data'), _f)
        d = load(f)
        if not d:
            not_finished.append('%s not finished...\n' % _f)
            continue

        try:
            pt = list(pt.astype(int))
            for i, v in enumerate(_Key_measures):
                pt[-1] = i
                ### HOOK
                # v:  is the measure name
                # json_v: the value of the measure
                if v == 'homo_model_e':
                    try:
                        json_v = d.get('homo_model_o') - d.get(v)
                    except TypeError:
                        # one of the measures is missing
                        pass
                elif v == 'f1':
                    precision = d.get('Precision')
                    try:
                        recall = d.get('Recall')
                        recall * 2  # raises TypeError if recall is None
                    except TypeError:
                        # legacy key, to be removed
                        recall = d.get('Rappel')
                    json_v = 2*precision*recall / (precision+recall)
                else:
                    if v == 'Recall' and d.get(v) is None:
                        # fall back to the legacy key ('Rappel', to be removed)
                        v = 'Rappel'

                    json_v = d.get(v)
                rez[tuple(pt)] = json_v

        except IndexError as e:
            lgg.error(e)
            lgg.error('Index Error: some result files are probably missing to complete the results...\n')

        #info_file.append( '%s %s; \t K=%s\n' % (corpus_type, f, K) )

    lgg.debug(''.join(not_finished))
    #lgg.debug(''.join(info_file))
    rez = np.ma.masked_array(rez, np.isnan(rez))
    return rez
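For orientation, a hypothetical call, assuming `map_parameters` is an OrderedDict of the varying experiment settings (its key order fixes the tensor axes) and `_New_Dims`/`_Key_measures` add the trailing measure axis:

from collections import OrderedDict

# Hypothetical parameter space; keys must match the filename convention.
map_parameters = OrderedDict([
    ('corpus', ['Graph13', 'Graph7']),
    ('model', ['immsb', 'ibp']),
    ('K', [5, 10]),
])
target_files = ['debug11/immsb_10_auto_0_all.pk']   # hypothetical result files
rez = forest_tensor(target_files, map_parameters)
# rez is a masked numpy array of shape (2, 2, 2, ...), with one extra
# axis per entry of _New_Dims (the last one indexing _Key_measures).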
Example #10
File: format.py Project: oboder/pymake
    def full_fig_path(self, fn):
        figs_path = get_pymake_settings('project_figs')
        path = os.path.join(figs_path, self.expe.get('_refdir', ''),
                            self.specname(fn))
        make_path(path)
        return path
Example #11
File: format.py Project: oboder/pymake
    def get_data_path(self):
        path = get_pymake_settings('project_data')
        path = os.path.join(path, '')  # ensure a trailing path separator
        return path