def get_packages(cls, **kwargs):
    """Return the packages found in the module(s) named by the `_spec` setting.

    ``class_filter`` is set to ``cls.module`` when the caller does not provide
    it. The `_spec` setting is either a single module name or a list of module
    names; for a list, the ``packages`` mapping of each loader is merged into
    one dict, in list order (later modules win on key clashes).
    """
    spec_modules = get_pymake_settings('_spec')

    if 'class_filter' not in kwargs:
        kwargs['class_filter'] = cls.module

    # Single module: nothing to merge.
    if not isinstance(spec_modules, list):
        return cls(spec_modules, **kwargs).packages

    merged = {}
    for spec_module in spec_modules:
        merged.update(cls(spec_module, **kwargs).packages)
    return merged
def check_model_typo(self):
    """Prefix unqualified model names with a short name of the default package.

    Every entry of ``tensor['model']`` (for each tensor in ``self._tensors``)
    that contains no ``'.'`` is rewritten in place as ``'<prefix>.<model>'``.
    The prefix is built from the `default_model` setting:

    * names longer than 8 characters: the first 3 characters, followed by the
      first letter of each dotted component after the first one;
    * shorter names: the first dotted component.
    """
    for tensor in self._tensors:
        model_names = tensor.get('model', [])
        for pos, model_name in enumerate(model_names):
            if '.' in model_name:
                # Already qualified, leave untouched.
                continue

            # Set the model ref name
            default_pkg = get_pymake_settings('default_model')
            if len(default_pkg) <= 8:
                prefix = default_pkg.split('.')[0]
            else:
                prefix = default_pkg[:3]
                if '.' in default_pkg:
                    initials = [part[0] for part in default_pkg.split('.')[1:]]
                    prefix += ''.join(initials)

            model_names[pos] = '%s.%s' % (prefix, model_name)
def get_atoms(cls, _type='short'):
    """Return an ordered mapping of the models found in the `_model` packages.

    Parameters
    ----------
    _type : str
        ``'short'`` asks the loader to shrink module names
        (``shrink_module_name=True``); ``'topos'`` keeps them
        (``shrink_module_name=False``).

    Returns
    -------
    OrderedDict
        Merge of ``ModelsLoader.get_packages(...)`` over every package listed
        in the `_model` setting, in setting order.

    Raises
    ------
    ValueError
        If ``_type`` is neither ``'short'`` nor ``'topos'``.
    """
    if _type == 'short':
        shrink_module_name = True
    elif _type == 'topos':
        shrink_module_name = False
    else:
        # Previously this fell through and failed later with an
        # UnboundLocalError on `shrink_module_name`; fail early and clearly.
        raise ValueError(
            "unknown _type %r: expected 'short' or 'topos'" % (_type,))

    packages = get_pymake_settings('_model')

    atoms = OrderedDict()
    for pkg in packages:
        if len(pkg) > 8:
            # Long package name: abbreviate to 3 letters plus the initial of
            # every sub-package.
            prefix = pkg[:3]
            if '.' in pkg:
                prefix += ''.join(part[0] for part in pkg.split('.')[1:])
        else:
            prefix = True

        atoms.update(ModelsLoader.get_packages(pkg,
                                               prefix=prefix,
                                               max_depth=3,
                                               shrink_module_name=shrink_module_name))
    return atoms
def get_atoms(cls):
    """Describe every script class found in the module(s) of the `_script` setting.

    Returns a dict keyed by class name. Each value is a dict with the keys
    ``scriptname``, ``scriptsurname``, ``module_file``, ``module``,
    ``_module``, ``module_super`` and ``methods``.

    ``methods`` lists the names from the browsed class that are plain
    functions, do not start with ``'_'`` and are not attributes of
    ``cls.module``. A ``__call__`` method is not listed under its own name:
    the lowercased class name is appended at the end of the list instead.
    """
    atoms = {}

    modules = get_pymake_settings('_script')
    if type(modules) is str:
        modules = [modules]

    for module_name in modules:
        loader = cls(module_name, class_filter=cls.module)

        for surname, klass in loader.packages.items():
            name = klass.__name__
            lowered = name.lower()
            browsed = loader._cls_browse[name]

            kept = []
            has_call = False
            for meth_name in browsed.methods.keys():
                attr = getattr(loader.packages[lowered], meth_name)
                if meth_name == '__call__':
                    # Exposed under the class name, appended after the loop.
                    has_call = True
                    continue
                if not inspect.isfunction(attr):
                    continue
                if meth_name.startswith('_'):
                    continue
                if meth_name in dir(cls.module):
                    # Inherited from the base script class, not user-defined.
                    continue
                kept.append(meth_name)
            if has_call:
                kept.append(lowered)

            atoms[name] = {
                'scriptname': name,
                'scriptsurname': surname,
                'module_file': browsed.file,
                'module': klass.__module__,
                '_module': klass,
                'module_super': browsed.super,
                'methods': kept,
            }

    return atoms
def get_conf_from_file(target, mp):
    """Return the dictionary of properties encoded in an expe file path.

    The path components and the ``'_'``-separated parts of the file base name
    (extension dropped) are aligned, in order, with the keys of
    ``_MASTERKEYS``; only the keys present in ``mp`` are kept.

    @target: path of the expe file; a leading `project_data` prefix is ignored.
    @mp: map parameters, format model_K_hyper_N.

    The order of ``_MASTERKEYS`` is important to align the dictionary.
    """
    masterkeys = _MASTERKEYS.copy()
    template_file = masterkeys.keys()
    ##template_file = 'networks/generator/Graph13/debug11/immsb_10_auto_0_all.*'

    data_path = get_pymake_settings('project_data')
    # Ignore the data path prefix so that only the relative path is parsed.
    # (str methods return a new string: the result has to be assigned.)
    if target.startswith(data_path):
        target = target[len(data_path):]

    path = target.lstrip('/').split('/')

    # Directory components followed by the underscore-separated file fields.
    basename = os.path.splitext(path.pop())[0]
    _prop = path + basename.split('_')

    prop = {}
    cpt_hook_master = 0  # master keys skipped because absent from `mp`
    cpt_hook_user = 0    # extra shifts requested by `tree_hook`

    # @Debug/Improve the nasty Hook here
    def update_pt(cur, master, user):
        return cur - master + user

    for i, k in enumerate(template_file):
        if k not in mp:
            cpt_hook_master += 1
            continue

        pt = update_pt(i, cpt_hook_master, cpt_hook_user)
        hook = tree_hook(k, _prop[pt])
        if hook:
            # The hook consumed one field: shift to the next one.
            cpt_hook_user += 1
            pt = update_pt(i, cpt_hook_master, cpt_hook_user)
        prop[k] = _prop[pt]

    return prop
def get_atoms(cls):
    """Describe every design class found in the module(s) of the `_spec` setting.

    Returns a dict keyed by class name. Each value holds ``script_name`` (the
    key under which the loader exposes the class), ``module_name`` (dotted
    module path plus class name), ``_module`` (the loader's browse entry for
    the class) and ``exp`` (the result of ``_specs()`` on a fresh instance).
    """
    atoms = {}

    modules = get_pymake_settings('_spec')
    if type(modules) is str:
        modules = [modules]

    for module_name in modules:
        loader = cls(module_name, class_filter=cls.module)

        for surname, klass in loader.packages.items():
            name = klass.__name__
            browsed = loader._cls_browse[name]
            # Instantiate the design to be able to query its specs.
            design = getattr(import_module(browsed.module), name)()

            atoms[name] = {
                'script_name': surname,
                'module_name': '.'.join((klass.__module__, klass.__name__)),
                '_module': browsed,
                'exp': design._specs(),
            }

    return atoms
class IndexManager(object):
    """Create, load, fill and query the Whoosh indexes of the project.

    One index exists per key of ``_SCHEMA``; each is stored in its own
    directory under ``_DATA_PATH/ir_index/<name>/``. Most methods accept an
    index name and fall back on the default index given at construction.
    """

    _DATA_PATH = os.path.join(get_pymake_settings('project_data'), '.pmk')

    _SCHEMA = {
        'model': ws.fields.Schema(name=ws.fields.ID(stored=True),
                                  surname=ws.fields.ID(stored=True),
                                  module=ws.fields.ID(stored=True),
                                  category=ws.fields.KEYWORD(stored=True),
                                  content=ws.fields.TEXT),

        'script': ws.fields.Schema(scriptname=ws.fields.ID(stored=True),
                                   scriptsurname=ws.fields.ID(stored=True),
                                   module=ws.fields.ID(stored=True),
                                   method=ws.fields.KEYWORD(stored=True),
                                   signature=ws.fields.TEXT(stored=True),
                                   content=ws.fields.TEXT),

        'spec': ws.fields.Schema(module_name=ws.fields.ID(stored=True),
                                 script_name=ws.fields.ID(stored=True),
                                 expe_name=ws.fields.ID(stored=True),
                                 content=ws.fields.TEXT),
    }

    log = logging.getLogger('root')

    def __init__(self, default_index='model'):
        self._index_basename = 'ir_index'
        self._default_index = default_index
        self._ix = {}  # Index store by key

    def get_index_path(self, name=None):
        """Return the directory (with trailing slash) of the index `name`."""
        name = name or self._default_index
        return os.path.join(self._DATA_PATH, self._index_basename, name + '/')

    def clean_index(self, name=None, schema=None, **kwargs):
        """(Re)create an empty index `name` according to its schema.

        Any existing index directory is deleted first. The schema is always
        taken from ``_SCHEMA``; the `schema` argument is currently ignored.

        Raises NotImplementedError if `name` has no entry in ``_SCHEMA``
        (the index directory has already been reset at that point).
        """
        name = name or self._default_index
        index_path = self.get_index_path(name)

        if os.path.exists(index_path):
            shutil.rmtree(index_path)
        os.makedirs(index_path)

        if name in self._SCHEMA:
            schema = self._SCHEMA[name]
        else:
            raise NotImplementedError(
                'Dont know what to do, no schema defined ...?')

        self._ix[name] = ws.index.create_in(index_path, schema)
        return self._ix[name]

    def load_index(self, name=None):
        """Open the on-disk index `name` (the result is not cached)."""
        name = name or self._default_index
        index_path = self.get_index_path(name)
        return ws.index.open_dir(index_path)

    def get_index(self, name=None):
        """Return the index `name`: cached, else loaded from disk, else created.

        If the index directory exists but cannot be opened, a message is
        printed and the process exits with status 2.
        """
        name = name or self._default_index
        if name in self._ix:
            return self._ix[name]
        elif os.path.exists(self.get_index_path(name)):
            try:
                return self.load_index(name)
            except Exception as e:
                print('Whoos index i/o error %s' % e)
                print(
                    'removing to clean the index (removing data/ir_index folder)'
                )
                exit(2)
        else:
            return self.clean_index(name)

    def get_writer(self, reset=False, online=None, index=None):
        """Return a writer on `index`.

        @reset: recreate the index from scratch before writing.
        @online: if truthy, return a ``BufferedWriter``; it may be ``True``
                 (defaults: period=600, limit=2) or a dict providing
                 ``period`` and/or ``limit``.
        """
        index = index or self._default_index
        if reset:
            ix = self.clean_index(index)
        else:
            ix = self.get_index(index)

        if online:
            import whoosh.writing
            if online is True:
                online = {}
            period = online.get('period', 600)
            limit = online.get('limit', 2)
            return ws.writing.BufferedWriter(ix, period=period, limit=limit)
        else:
            return ix.writer()

    def get_reader(self, index=None):
        """Return a searcher on `index`."""
        index = index or self._default_index
        ix = self.get_index(index)
        return ix.searcher()

    @classmethod
    def build_indexes(cls, index_name=None):
        """Update the system index.

        Calls ``update_<name>_index`` for `index_name`, or for every key of
        ``_SCHEMA`` when `index_name` is None.
        """
        idx = cls()

        if index_name is None:
            schemas = idx._SCHEMA
        else:
            schemas = [index_name]

        for name in schemas:
            func = 'update_' + name + '_index'
            builder = getattr(idx, func)
            builder()

    def update_corpus_index(self):
        raise NotImplementedError

    def update_spec_index(self):
        """Rebuild the Spec index from ``SpecLoader.get_atoms()``."""
        from pymake.io import SpecLoader
        model = 'spec'
        self.log.info('Building %s index...' % model)
        Specs = SpecLoader.get_atoms()
        writer = self.clean_index(model).writer()

        for scriptname, _content in Specs.items():
            self.log.info('\tindexing %s' %
                          (str(scriptname) + str(_content['_module'])))

            # One document per expe exposed by the design.
            for expe in _content['exp']:
                content = ''
                writer.add_document(script_name=_content['script_name'],
                                    module_name=_content['module_name'],
                                    expe_name=expe,
                                    content=content)

        writer.commit()

    def update_script_index(self):
        """Rebuild the Scripts index from ``ScriptsLoader.get_atoms()``."""
        from pymake.io import ScriptsLoader
        model = 'script'
        self.log.info('Building %s index...' % model)
        Scripts = ScriptsLoader.get_atoms()
        writer = self.clean_index(model).writer()

        for scriptname, _content in Scripts.items():
            self.log.info('\tindexing %s' %
                          (str(scriptname) + str(_content['_module'])))

            # Loop is context/model dependant
            # One document per method of the script.
            for method in _content['methods']:
                content = ''
                writer.add_document(scriptname=_content['scriptname'],
                                    scriptsurname=_content['scriptsurname'],
                                    module=_content['module'],
                                    method=method,
                                    content=content)

        writer.commit()

    def update_model_index(self):
        """Rebuild the Models index from ``ModelsLoader.get_atoms()``."""
        from pymake.io import ModelsLoader
        model = 'model'
        self.log.info('Building %s index...' % model)
        models = ModelsLoader.get_atoms()
        writer = self.clean_index(model).writer()

        for surname, module in models.items():
            self.log.info('\tindexing %s' % (str(surname) + str(module)))

            # Loop is context/model dependant
            # Category: the sub-package names of the model's module path.
            topos = ' '.join(set(module.__module__.split('.')[1:]))
            content = ' '.join((surname, module.__name__, module.__module__))

            writer.add_document(surname=surname,
                                name=module.__name__,
                                category=topos,
                                module=module.__module__,
                                content=content)

        writer.commit()

    # @debug : online searcher
    def _search(self, query='', field=None, index=None, terms=False, limit=None):
        """Query (exact match) search.

        Returns the matched terms when `terms` is True, otherwise a list of
        the ``items()`` of the search results.
        """
        index = index or self._default_index
        ix = self.get_index(index)
        fieldin = field or 'content'

        qp = QueryParser(fieldin, ix.schema)
        qp.add_plugin(ws.qparser.SingleQuotePlugin())
        query = qp.parse(query, normalize=False)

        with ix.searcher() as searcher:
            if terms is True:
                results = searcher.search(query, terms=True,
                                          limit=limit).matched_terms()
            else:
                results = list(searcher.search(query, limit=limit).items())

        return results

    def search(self, query='', field=None, index=None, limit=None):
        """Text search; generator over the hits.

        Parser plugins (wildcard, fuzzy, plus/minus, multifield, boost) are
        enabled only when their marker character appears in `query`.
        ``limit='all'`` means no limit. The searcher stays open while the
        generator is being consumed.
        """
        index = index or self._default_index
        limit = None if limit == 'all' else limit
        ix = self.get_index(index)
        fieldin = field or 'content'

        fuzzy = '~' in query
        wildcard = '*' in query
        plusminus = '+' in query or '-' in query
        multifield = ':' in query
        boost = '^' in query

        qp = QueryParser(fieldin, ix.schema, group=OrGroup)

        # Check the difference between the both, not sure ?
        qp.add_plugin(ws.qparser.SingleQuotePlugin())
        qp.add_plugin(ws.qparser.PhrasePlugin())

        if wildcard:
            qp.add_plugin(ws.qparser.WildcardPlugin())
        if fuzzy:
            qp.add_plugin(ws.qparser.FuzzyTermPlugin())
        if plusminus:
            qp.add_plugin(ws.qparser.PlusMinusPlugin())
        if multifield:
            qp.add_plugin(ws.qparser.MultifieldPlugin([fieldin]))
        if boost:
            qp.add_plugin(ws.qparser.BoostPlugin())

        query = qp.parse(query)

        with ix.searcher() as searcher:
            results = searcher.search(query, limit=limit)
            results.fragmenter = ws.highlight.SentenceFragmenter(
                maxchars=200, charlimit=100042)
            #results.fragmenter = ws.highlight.ContextFragmenter(maxchars=200, surround=43)
            results.formatter = TerminalFormatter()

            for r in results:
                yield r

    def getbydocid(self, docid, index=None):
        """Return a document's stored fields in the index from its docid."""
        index = index or self._default_index
        ix = self.get_index(index)
        with ix.searcher() as searcher:
            doc = searcher.stored_fields(docid)
        return doc

    def getfirst(self, query='', field=None, index=None):
        """Return the stored fields of the first exact match, or None."""
        query = "'" + query + "'"
        results = self._search(query, field, index, limit=1)
        if not results:
            return None
        else:
            # The docid is only meaningful in the index that was searched:
            # forward `index` instead of silently using the default one.
            return self.getbydocid(results[0][0], index)

    # not need to commit ?! /conflict forward...
    #def delete_by_term(self, term, field):
    #    writer.delete_by_term('hash', doc['hash'])

    def getbydocids(self, docids, index=None):
        """Return the list of documents' stored fields in the index from docids."""
        index = index or self._default_index
        ix = self.get_index(index)
        docs = []
        with ix.searcher() as searcher:
            for docid in docids:
                docs.append(searcher.stored_fields(docid))
        return docs

    # @debug : online searcher
    # @debug : get a list of terms (mongo projection equivalent ?!)
    def query(self, field=None, index=None, terms=False):
        """Return all objects that have the `field` entry set.

        @field: field to match; defaults to the first stored field of the schema.
        @terms: False -> list of the `field` values;
                a field name -> dict mapping `field` value to that field's value;
                anything else -> list of dicts of the stored fields.
        """
        index = index or self._default_index
        ix = self.get_index(index)

        field = field or ix.schema.stored_names()[0]

        query = ws.query.Every(field)
        with ix.searcher() as searcher:
            results = searcher.search(query, limit=None)

            # Materialize while the searcher is still open.
            if terms is False:
                results = [r[field] for r in results]
            elif isinstance(terms, str):
                results = dict((o[field], o[terms]) for o in results)
            else:
                results = [dict(r) for r in results]

        return results
class tfidf(IndexManager):
    """Index documents.

    * Whoosh based.
    * format supported :
        * pdf
    """

    _DATA_PATH = os.path.join(get_pymake_settings('project_data'), 'tfidf')

    _SCHEMA = {
        'document': ws.fields.Schema(
            hash=ws.fields.ID(stored=True, unique=True),
            shortpath=ws.fields.ID(stored=True, unique=True),
            fullpath=ws.fields.ID(stored=True, unique=True),
            title=ws.fields.KEYWORD(stored=True),
            authors=ws.fields.KEYWORD(stored=True),  # names of the authors '||' separated
            references=ws.fields.KEYWORD(stored=True),  # names of the references '||' separated
            date=ws.fields.KEYWORD(stored=True),  # date of publication (@todo: find it by cross reference !)
            content=ws.fields.TEXT),
        #source = '', # name of the journal/conf etc
        #type = '', # journal/conf etc
    }

    def __init__(self, expe):
        self.expe = expe
        super().__init__(default_index='document')

    def doc_yielder(self, path):
        """Yield the path of every pdf found at or under `path`.

        If `path` is a file it is yielded as is and ``self.expe.path`` is set
        to its parent directory. If `path` does not exist, an error is logged
        and the process exits. Files matching the expe ``exclude_path``
        pattern are skipped.
        """
        path = os.path.expanduser(path)

        if os.path.isfile(path):
            self.expe.path = path.rpartition('/')[0] + '/'
            yield path
        elif not os.path.exists(path):
            self.log.error('path error: %s' % path)
            exit()

        # os.walk yields nothing when `path` is a regular file.
        for root, dirnames, filenames in os.walk(path):
            for filename in filenames:
                if not filename.endswith(('.pdf', '.PDF')):
                    continue

                fullpath = os.path.join(root, filename)
                if match_pattern(fullpath, self.expe.get('exclude_path')):
                    continue

                yield fullpath

    def doc2xml(self, hit):
        """Run Cermine on the pdf of `hit` and return the extracted xml string.

        @hit: dict with at least the ``fullpath`` and ``shortpath`` keys.

        Returns ``{}`` when the extraction fails. The working directory is
        changed to ``$PWD/data/lib/cermine/`` for the duration of the call and
        is always restored to ``$PWD`` afterwards; the temporary copy of the
        pdf is always removed.
        """
        import shutil

        # 0. Init cermine usage. (one path/pdf at a time).
        filename = os.path.basename(hit['fullpath'])
        shortpath = hit['shortpath']
        pwd = os.getenv('PWD')
        os.chdir(os.path.join(pwd, 'data/lib/cermine/'))

        cermine_tar_dir = 'pdf_temp/' + filename.rpartition('.')[0] + '/'
        copied_pdf = cermine_tar_dir + filename

        try:
            if not os.path.exists(cermine_tar_dir):
                os.makedirs(cermine_tar_dir)
            shutil.copy(hit['fullpath'], cermine_tar_dir)

            # 1. run Cermine
            jar = 'cermine-impl-1.14-SNAPSHOT-jar-with-dependencies.jar'
            classes = 'pl.edu.icm.cermine.ContentExtractor'
            try:
                self.log.info('extracting content of: %s' % (shortpath))
                subprocess.check_output(['java', '-cp', jar, classes,
                                         '-path', cermine_tar_dir])
            except Exception as e:
                self.log.error('Cermine Error %s : ' % e)
                self.log.error('Please try install/upgrade Cermine for pdf data extraction.')
                return {}

            # 2. get the xml information
            cermine_file = cermine_tar_dir + filename.rpartition('.')[0] + '.cermxml'
            if not os.path.isfile(cermine_file):
                self.log.error('Cermine failed...')
                return {}

            with open(cermine_file) as _f:
                return _f.read()
        finally:
            # Whatever happened: remove the copied pdf and restore the
            # working directory, so a failure does not leak into the caller.
            if os.path.exists(copied_pdf):
                os.remove(copied_pdf)
            os.chdir(pwd)

    # Two assumptions :
    #    * string is a pdf,
    #    * it is structured as a scientific paper (journal ?).
    def extract_structured_kw(self, hit):
        """Return the ``title``, ``authors`` and ``references`` parsed from the pdf of `hit`.

        Returns ``{}`` if the xml extraction failed, if BeautifulSoup is not
        installed or if the xml cannot be parsed. Authors and references are
        joined with ``' || '``.
        """
        structured = {}

        xml_strings = self.doc2xml(hit)
        if not xml_strings:
            # Extraction failed (already logged): nothing to parse.
            return {}

        try:
            from bs4 import BeautifulSoup
        except ImportError:
            self.log.error('Please install BeautifulSoup4 to parse xml doc.')
            return {}

        try:
            soup = BeautifulSoup(xml_strings, 'lxml')
        except Exception as e:
            self.log.error('BeautifulSoup fail to parse a file: %s : ' % e)
            return {}

        #titles = soup.findAll(re.compile(".*title.*"))

        # Main title
        # max probable title from cermine
        front = soup.front
        # `front` is None when the document has no <front> element.
        front_titles = front.findAll(re.compile(".*title.*")) if front is not None else []
        main_title = ' '.join([o.string or '' for o in front_titles]).strip()
        structured['title'] = main_title

        contribs = soup.findAll(attrs={'contrib-type': 'author'})
        names = [name for contrib in contribs
                 for name in contrib.findAll('string-name')]
        # `.string` is None for tags with nested markup.
        structured['authors'] = ' || '.join([o.string or '' for o in names])

        # Institution, Journal, Year etc...
        pass

        # References
        references = [' '.join(str(r).split())
                      for r in soup.findAll('mixed-citation')]
        structured['references'] = ' || '.join(references)

        return structured

    def fit(self):
        """Index every pdf found under ``self.expe.path``.

        A document whose shortpath is already indexed is skipped. A document
        whose content hash is already indexed is treated as a moved/duplicated
        file: the existing entry is updated with the new shortpath. The writer
        is kept in ``self.writer`` (see `close`).
        """
        voca = Vocabulary(exclude_stopwords=True)
        writer = self.get_writer(reset=self.expe.reset, online=True)
        setattr(self, 'writer', writer)

        for path in self.doc_yielder(self.expe.path):
            fullpath = path
            shortpath = '/' + fullpath[len(os.path.expanduser(self.expe.path)):].rstrip('/').lstrip('/')

            is_known = False
            is_duplicated = False

            if self.getfirst(shortpath, 'shortpath'):
                # don't update document
                # could compute a diff here...
                is_known = True  # assume already indexed
            else:
                text = extract_pdf(fullpath)
                text = voca.remove_stopwords(text)
                #bow = voca.doc2bow(text)
                if text in (None, ''):
                    # do nothing
                    continue

                doc = dict(shortpath=shortpath, fullpath=fullpath)
                doc['content'] = text
                doc['hash'] = hash_objects(text)

                first_m = self.getfirst(doc['hash'], 'hash')
                if first_m:
                    #if not 'content' in first_m:
                    #    writer.delete_by_term('hash', doc['hash'])
                    #    continue
                    # don't update document
                    self.log.warning("Duplicate file detected: %s renaming to %s" % (first_m['shortpath'], shortpath))
                    first_m['shortpath'] = shortpath
                    writer.update_document(**first_m)
                    is_duplicated = True
                else:
                    if self.expe.extract_structure:
                        # structured content
                        structured = self.extract_structured_kw(doc)
                        doc.update(structured)

            if not (is_known or is_duplicated):
                print("indexing `%s'" % (path))
                try:
                    writer.add_document(**doc)
                except Exception:
                    print('indexing doc %s failed!' % fullpath)

        return

    def close(self):
        """Close the writer opened by `fit`, if any."""
        if hasattr(self, 'writer'):
            try:
                self.writer.close()
            except Exception as e:
                print('Whoosh error: %s' % e)
def forest_tensor(target_files, map_parameters):
    """Build the results tensor from a set of expe result files.

    Each axis of the tensor corresponds to one key of `map_parameters`
    (in iteration order), followed by one axis per entry of ``_New_Dims``.
    For every file, the position along the parameter axes is deduced from the
    properties parsed out of its path; the last axis is then filled with one
    value per measure of ``_Key_measures``.

    @target_files: iterable of file paths relative to `project_data`. It has
        to be ordered the same way as the file properties.
    @map_parameters: ordered mapping ``name -> list of possible values`` (or a
        single value); its order defines the tensor axes.

    Returns a masked array (cells never filled are masked), or None when
    `target_files` is empty.

    Raises ValueError when a property value parsed from a file name is not
    listed in `map_parameters`.
    """
    # Expe analyser / Tabulyze It

    # res shape ([expe], [model], [measure]
    # =================================================================================
    # Expe: [debug, corpus] -- from the dirname
    # Model: [name, K, hyper, h**o] -- from the expe filename
    # measure:
    #   * 0: global precision,
    #   * 1: local precision,
    #   * 2: recall

    ### Output: rez.shape rez_map_l rez_map
    if not target_files:
        lgg.info('Target Files empty')
        return None

    #dim = get_conf_dim_from_files(target_files, map_parameters) # Rely on Expe...
    dim = dict(
        (k, len(v)) if isinstance(v, (list, tuple)) else (k, len([v]))
        for k, v in map_parameters.items()
    )

    # Axis names, in order. A real list is needed (dict views cannot grow).
    rez_map = list(map_parameters.keys())  # order !

    # Expert knowledge value
    new_dims = _New_Dims
    # Update Mapping
    for new_dim in new_dims:
        dim.update(new_dim)
    for new_dim in new_dims:
        rez_map.append(list(new_dim)[0])

    # Create the shape of the Analysis/Results Tensor
    #rez_map = dict(zip(rez_map_l, range(len(rez_map_l))))
    shape = [dim[n] for n in rez_map]

    # Create the numpy array to store all experience values, with various settings
    rez = np.full(shape, np.nan)

    not_finished = []
    for _f in target_files:
        prop = get_conf_from_file(_f, map_parameters)
        pt = np.empty(rez.ndim)

        assert(len(pt) - len(new_dims) == len(prop))
        for k, v in prop.items():
            # Properties are parsed as strings; parameters may be integers.
            try:
                v = int(v)
            except (TypeError, ValueError):
                pass
            try:
                idx = map_parameters[k].index(v)
            except Exception as e:
                lgg.error(prop)
                lgg.error('key:value error -- %s, %s'% (k, v))
                raise ValueError('key:value error -- %s, %s' % (k, v)) from e
            pt[rez_map.index(k)] = idx

        f = os.path.join(get_pymake_settings('project_data'), _f)
        d = load(f)
        if not d:
            not_finished.append('%s not finish...\n' % _f)
            continue

        try:
            pt = list(pt.astype(int))
            for i, v in enumerate(_Key_measures):
                # The measure index is stored on the last axis.
                pt[-1] = i

                ### HOOK
                # v: is the measure name
                # json_v: the value of the measure
                json_v = np.nan  # never reuse the value of the previous measure
                if v == 'homo_model_e':
                    try:
                        json_v = d.get('homo_model_o') - d.get(v)
                    except TypeError:
                        # One of the two values is missing: leave the cell as NaN.
                        pass
                elif v == 'f1':
                    precision = d.get('Precision')
                    recall = d.get('Recall')
                    if recall is None:
                        # future remove: legacy key
                        recall = d.get('Rappel')
                    json_v = 2*precision*recall / (precision+recall)
                else:
                    json_v = d.get(v)
                    if json_v is None and v == 'Recall':
                        # future remove: legacy key
                        json_v = d.get('Rappel')

                # A tuple of integers addresses a single cell of the tensor.
                rez[tuple(pt)] = json_v

        except IndexError as e:
            lgg.error(e)
            lgg.error('Index Error: Files are probably missing here to complete the results...\n')

        #info_file.append( '%s %s; \t K=%s\n' % (corpus_type, f, K) )

    lgg.debug(''.join(not_finished))
    #lgg.debug(''.join(info_file))
    rez = np.ma.masked_array(rez, np.isnan(rez))
    return rez
def full_fig_path(self, fn):
    """Return the path of figure `fn` under the `project_figs` directory.

    The path is ``<project_figs>/<expe '_refdir', '' if unset>/<specname(fn)>``;
    it is passed to ``make_path`` before being returned.
    """
    figs_root = get_pymake_settings('project_figs')
    refdir = self.expe.get('_refdir', '')
    fig_path = os.path.join(figs_root, refdir, self.specname(fn))
    make_path(fig_path)
    return fig_path
def get_data_path(self):
    """Return the `project_data` setting as a directory path.

    Joining with an empty component guarantees a trailing path separator.
    """
    return os.path.join(get_pymake_settings('project_data'), '')