def make_set(corpus_name, from_table=None, size=None, constraint=None, debug=0):
    """
    Initialize a basic corpus and fill in its fulltexts.

    Three ways to call it:
      - either provide a metadata table infos.tab (filesystem path)
      - or provide a size (sampling directly via the API)
      - or provide nothing, and it samples 10 docs

    Metadata, stored under CORPUS_HOME/<corpus_name>/meta/
      - basenames.ls
      - infos.tab

    Data: 3 formats, stored under CORPUS_HOME/<corpus_name>/data/
      - .pdf
      - .xml (native)
      - .tei.xml (pub2tei)

    Filesystem location: see lib/global_conf.ini
       -> variable CORPUS_HOME
       -> defaults to ./corpora/
    """
    # basic check before putting anything into the folder
    # (the only folder we never overwrite)
    future_dir = path.join(MY_CORPUS_HOME, corpus_name)
    if path.exists(future_dir):
        print("ERR:'%s'\nThe name '%s' is already taken in the corpora folder."
              % (future_dir, corpus_name), file=stderr)
        exit(1)

    # (1/4) initial sample (just the table) -------------------------------
    # either we already have a table
    if from_table and size:
        print("ERR bako.make_set: provide either 'from_table' or 'size', not both.",
              file=stderr)
        exit(1)

    if from_table:
        if path.exists(from_table):
            fic = open(from_table)
            my_tab = fic.readlines()
            fic.close()
        else:
            print("ERR bako.make_set: cannot find the file '%s'" % from_table,
                  file=stderr)
            exit(1)

    # otherwise sampling
    else:
        if not size:
            size = 10
        if not constraint:
            ok_corpora = CONF['workshop']['PREPROC_READY_CORPORA']
            corpora_constraint = " OR ".join(['corpusName:' + corpus
                                              for corpus in ok_corpora.split(",")])
            constraint = "qualityIndicators.refBibsNative:true AND (" + corpora_constraint + ")"
        if isinstance(size, int):
            my_tab = sampler.full_run(
                ['-n', str(size),
                 '--outmode', 'tab',
                 '--with', constraint,
                 '-v']
            )
        else:
            print("ERR bako.make_set: 'size' must be an integer, got '%s'" % size,
                  file=stderr)
            exit(1)

    # (2/4) our corpus class ----------------------------------------------
    # Corpus initialization
    #  - tab-only mode => creates a meta/ folder and an empty data/ folder
    #  - corpus_type is hard-coded to 'gold', which simply signals
    #    that we keep the default shelves
    cobj = Corpus(corpus_name,
                  new_infos=my_tab,
                  new_home=MY_CORPUS_HOME,
                  verbose=(debug > 0),
                  corpus_type='gold')

    # (3/4) fulltext download ---------------------------------------------
    my_ids = cobj.cols['istex_id']
    my_basenames = cobj.bnames

    for the_shelf in ['PDF0', 'XMLN']:
        the_api_type = cobj.origin(the_shelf)
        the_ext = cobj.filext(the_shelf)
        tgt_dir = cobj.shelf_path(the_shelf)

        print("mkdir -p: %s" % tgt_dir, file=stderr)
        makedirs(tgt_dir)

        api.write_fulltexts_loop_interact(
            my_ids, my_basenames,
            api_conf=CONF['istex-api'],
            tgt_dir=tgt_dir,
            api_types=[the_api_type]
        )
        print("MAKE_SET: saved docs into CORPUS_HOME:%s" % cobj.name)
        if debug > 0:
            print(" (=> target dir:%s)" % tgt_dir)

    # NB: cobj.filext(the_shelf) must use the same extension as the API side,
    #     or api.write_fulltexts must allow changing (renaming) extensions
    cobj.assert_docs('PDF0')
    cobj.assert_docs('XMLN')

    # persist the status of the 2 newly created folders
    cobj.save_shelves_status()

    # (4/4) tei conversion (gold biblStruct type) -------------------------
    # copy while updating the dtd pointers
    print("***DTD LINKING***")
    cobj.dtd_repair(debug_lvl=debug)

    print("***XML => TEI.XML CONVERSION***")
    # will create the C-goldxmltei folder
    cobj.pub2goldtei(debug_lvl=debug)  # conversion
    cobj.assert_docs('GTEI')

    # persist the status of the newly created folder
    cobj.save_shelves_status()

    # we return the new filled corpus for further work or display
    return cobj
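
# Usage sketch for make_set, mirroring the three calling modes described in
# its docstring (illustrative only: the corpus names and table path below are
# hypothetical, and the calls assume a writable CORPUS_HOME plus reachable
# ISTEX API credentials):
#
#   corpus_a = make_set("geo_gold", from_table="meta/infos.tab")  # 1) from an existing table
#   corpus_b = make_set("geo_sample", size=50)                    # 2) API sampling of 50 docs
#   corpus_c = make_set("geo_default")                            # 3) default sampling of 10 docs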
def full_run(arglist=None):
    global LOG
    global LISSAGE

    # output lines for direct use, or printed to STDOUT if __main__
    output_array = []

    # cli arguments
    args = my_parse_args(arglist)

    # do we need to change smoothing?
    if args.smoothing_init and float(args.smoothing_init) > 0:
        print("Setting initial smoothing to %.2f" % args.smoothing_init,
              file=stderr)
        # global var change in main
        LISSAGE = args.smoothing_init

    # event log lines
    LOG = ['INIT: sampling %i' % args.sample_size]
    LOG.append('CRIT: fields(%s)' % ", ".join(args.criteria_list))
    if args.with_constraint_query:
        LOG.append('WITH: constraint query "%s"' % args.with_constraint_query)

    run_counter = 0

    # initial sampler run
    got_ids_idx = sample(
        args.sample_size,
        args.criteria_list,
        constraint_query=args.with_constraint_query,
        verbose=args.verbose,
        run_count=run_counter
    )
    run_counter += 1

    # how much is there?
    n_ids = len(got_ids_idx)

    # info
    print('-'*27 + " initial result: %i docs " % n_ids + '-'*27, file=stderr)
    LOG.append("XGOT: picked %i" % n_ids)

    # check combopools status
    insufficient_pool_flag = False
    for sig in LOG:
        if search("^LESS:", sig):
            insufficient_pool_flag = True
            break

    # --------- a posteriori corrections -------------
    # the initial quotas can take neither the "with_constraint" arg
    # nor "multiple choice fields" into account (unless using N_reponse?)
    # for that reason, at this point in the process we may have more or
    # fewer docs than the requested sample_size

    # IF not enough => new sample run with lighter criteria
    if n_ids < args.sample_size:
        actual_criteria = args.criteria_list

        # keep trying...
        while (n_ids < args.sample_size and run_counter < MAX_RUNS):
            # => over "delta" (missing docs)
            remainder = args.sample_size - n_ids
            LOG.append("REDO: re-sampling %i docs" % remainder)

            # => with more help to small categories
            LISSAGE += 0.2
            LOG.append("SMOO: smoothing up to %.02f" % LISSAGE)

            # => and with fewer criteria if necessary
            # (if the criteria pool is insufficient under some constraints, we
            #  do need to relax at least one criterion, but which one?)
            if len(actual_criteria) > 1 and insufficient_pool_flag:
                # simplify criteria by removing the last one
                new_criteria = actual_criteria[0:-1]
                LOG.append("RLAX: dropped balancing on field '%s'"
                           % actual_criteria[-1])
                # reset flag (£TODO recalculate after run?)
                insufficient_pool_flag = False
            else:
                new_criteria = actual_criteria

            # -------- RE-RUN ---------
            previous_ids = got_ids_idx
            got_ids_idx = sample(
                remainder,
                new_criteria,
                constraint_query=args.with_constraint_query,
                index=previous_ids,
                verbose=args.verbose
            )

            # recount
            apport = len(got_ids_idx) - n_ids

            # update
            n_ids += apport
            run_counter += 1

            # warn
            LOG.append("XGOT: picked %i" % apport)
            print('-'*22 + " result after run %i: %i docs "
                  % (run_counter, n_ids) + '-'*22, file=stderr)

    # IF overflow => random pruning
    if n_ids > args.sample_size:
        deck = [did for did in got_ids_idx.keys()]
        # random removal of excess documents
        shuffle(deck)
        nd = n_ids - args.sample_size
        sacrificed = deck[0:nd]
        for did in sacrificed:
            del got_ids_idx[did]
        LOG.append("XDEL: sacrificing %i random docs" % nd)

    # last recount
    n_ids = len(got_ids_idx)
    print('-'*29 + " final result: %i docs " % n_ids + '-'*29, file=stderr)

    # -------------- OUTPUT --------------------------------------------
    # ***(ids)***
    if args.out_type == 'ids':
        for did, info in sorted(got_ids_idx.items(), key=lambda x: x[1]['_q']):
            output_array.append("%s" % did)

    # ***(tab)***
    elif args.out_type == 'tab':
        # header line   £TODO STD_MAP
        output_array.append("\t".join(['istex_id', 'corpus', 'pub_year',
                                       'pub_period', 'pdfver', 'pdfwc', 'bibnat',
                                       'author_1', 'lang', 'doctype_1',
                                       'cat_sci', 'title']))
        # contents
        for did, info in sorted(got_ids_idx.items(), key=lambda x: x[1]['_q']):
            # provenance: sample() => loop over hits (l.500 ++)
            period = year_to_range(info['yr'])
            output_array.append("\t".join([did,
                                           info['co'],
                                           info['yr'],
                                           period,
                                           info['ver'],
                                           str(info['wcp']),
                                           str(info['bibnat']),
                                           info['au'],
                                           info['lg'],
                                           info['typ'],
                                           info['cat'],
                                           info['ti'],
                                           #~ info['_q']
                                           ]))

    # ***(docs)***
    # no output lines but writes a dir
    elif args.out_type == 'docs':
        my_dir = path.join(getcwd(), my_name)
        mkdir(my_dir)

        # two "parallel" lists
        ids = list(got_ids_idx.keys())
        basenames = [std_filename(one_id, got_ids_idx[one_id])
                     for one_id in ids]

        # loop with interactive authentication prompt if needed
        api.write_fulltexts_loop_interact(
            ids, basenames,
            tgt_dir=my_dir,
            api_types=['metadata/xml', 'fulltext/pdf', 'fulltext/tei']
        )
        LOG.append("SAVE: saved docs in %s/" % my_dir)

    if args.log:
        # separate logging lines
        logfile = open(my_name + '.log', 'w')
        for lline in LOG:
            print(lline, file=logfile)
        logfile.close()

    return output_array
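
# Usage sketch for full_run (illustrative only: the arglist mirrors the CLI
# flags parsed by my_parse_args, as used by bako.make_set above; the constraint
# query is just an example value):
#
#   tab_lines = full_run(['-n', '100',
#                         '--outmode', 'tab',
#                         '--with', 'qualityIndicators.refBibsNative:true',
#                         '-v'])
#   for line in tab_lines:
#       print(line)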