Example #1
# Imports assumed by this example; helper modules (lepref_util, bm25, aol_parser)
# and functions such as logger, load_config_file, set_evaluate, eaCheckpoint, etc.
# are defined elsewhere in the enclosing module.
import os
import pwd
import random
import sys
from datetime import datetime

def main():
    if len(sys.argv) != 2:
        print('Python LePrEF 2015')
        print('Usage: python3 %s argv[1]' % sys.argv[0])
        print('argv[1]: Execution configuration file')
        print('\tJson Execution configuration file must have the attributes below:')
        print('\t  {')
        print('\t  ftrainame:\t\ttrain file name,')
        print('\t  fvaliname:\t\tvalidation file name,')
        print('\t  ftestname:\t\ttest file name,')
        print('\t  ffiltersresults:\tfilters result file name,')
        print('\t  foutputname:\t\tAOL output file name,')
        print('\t  resultdir:\t\tOut directory for results,')
        print('\t  randomseed:\t\tRandom Seed,')
        print('\t  popsize:\t\tPopulation size,')
        print('\t  generations:\t\tNumber of generations,')
        print('\t  cxprob:\t\tCrossover Probability,')
        print('\t  mutprob:\t\tMutation Probability,')
        print('\t  hoflen:\t\tHall of Fame length')
        print('\t  }')
        return 1

    #Load config file
    fexecname = sys.argv[1]
    exec_config = load_config_file(fexecname)

    ftrainame = exec_config['ftrainame']
    fvaliname = exec_config['fvaliname']
    ftestname = exec_config['ftestname']
    ffiltersresults = exec_config['ffiltersresults']
    foutputname = exec_config['foutputname']
    resultdir = exec_config['resultdir']
    randomseed = int(exec_config['randomseed'])
    popsize = int(exec_config['popsize'])
    generations = int(exec_config['generations'])
    cxprob = float(exec_config['cxprob'])
    mutprob = float(exec_config['mutprob'])
    hoflen = int(exec_config['hoflen'])
    #end load config file
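    # For reference, a config file with the attributes listed in the usage text
    # might look like this (all file names and values below are hypothetical):
    # {
    #   "ftrainame": "train.txt",
    #   "fvaliname": "vali.txt",
    #   "ftestname": "test.txt",
    #   "ffiltersresults": "filters_results.json",
    #   "foutputname": "aol_output.txt",
    #   "resultdir": "results",
    #   "randomseed": 42,
    #   "popsize": 100,
    #   "generations": 50,
    #   "cxprob": 0.9,
    #   "mutprob": 0.1,
    #   "hoflen": 10
    # }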

    #Start up print
    print('Bm25GP [%s] - %s [%s]' %
      (fexecname, pwd.getpwuid(os.getuid())[4],os.getlogin()),
      '\nRunning on Python',sys.version)
    logger(resultdir, 'Bm25GP [%s] - %s [%s]' %
      (fexecname, pwd.getpwuid(os.getuid())[4],os.getlogin()),
      '\nRunning on Python',sys.version)

    #Set pset and toolbox
    pset = create_primitiveset()
    toolbox = create_toolbox(pset)
    set_gpoperator(toolbox, pset)
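    # create_primitiveset/create_toolbox/set_gpoperator are assumed to follow the
    # usual DEAP GP setup, roughly (a sketch, not the actual registrations):
    #   pset = gp.PrimitiveSet('MAIN', arity)
    #   toolbox.register('expr', gp.genHalfAndHalf, pset=pset, min_=1, max_=3)
    #   toolbox.register('mate', gp.cxOnePoint)
    #   toolbox.register('mutate', gp.mutUniform, expr=toolbox.expr, pset=pset)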
    #set_evaluate(toolbox, evalQuery, queries, index, aolstats, results)

    #results
    print('Reading results from "' + ffiltersresults + '"...')
    logger(resultdir, 'Reading results from "' + ffiltersresults + '"...')
    filtersresults_data = load_results(ffiltersresults)['results']

    print('Processing results...')
    logger(resultdir, 'Processing results...')
    results = create_result_dict(filtersresults_data)

    print('Results ready!')
    logger(resultdir, 'Results ready!')

    #Get train data
    print('Reading train queries from "' + ftrainame + '"...')
    logger(resultdir, 'Reading train queries from "' + ftrainame + '"...')
    queries = lepref_util.carregar_queries(ftrainame)[0]
    #lepref_util.configurar_idcg_medio(queries, topN = MAXEVAL)

    print(len(queries), 'queries read!')
    logger(resultdir, len(queries), 'queries read!')

    #Create Bm25 Index
    print('Creating index from queries...')
    logger(resultdir, 'Creating index from queries...')
    index = bm25.Bm25Index()
    index.generate_from_queries(queries)

    print('Index created with', len(index), 'terms!')
    logger(resultdir, 'Index created with', len(index), 'terms!')
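    # Bm25Index presumably scores with the standard Okapi BM25 function:
    #   score(D, Q) = sum over q in Q of
    #     IDF(q) * f(q, D) * (k1 + 1) / (f(q, D) + k1 * (1 - b + b * |D| / avgdl))
    # where f(q, D) is the frequency of term q in document D and avgdl is the
    # average document length over the index.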

    #Set evaluate function and data
    print('Reading AOL output file from "' + foutputname + '"...')
    logger(resultdir, 'Reading AOL output file from "' + foutputname + '"...')
    aoldata, featurelist = aol_parser.output_read(foutputname)

    print('Precomputing AOL statistics...')
    logger(resultdir, 'Precomputing AOL statistics...')
    aolstats = process_aol_stats(aoldata)

    print(len(aolstats['stats']), 'AOL statistics created!')
    logger(resultdir, len(aolstats['stats']), 'AOL statistics created!')

    set_evaluate(toolbox, evalQuery, queries, index, aolstats, results)

    print('All data is ready!')
    logger(resultdir, 'All data is ready!')

    if os.path.exists(resultdir + os.sep + 'checkpoint.data'):
        #Load Checkpoint
        print('Checkpoint found! Loading...')
        logger(resultdir, 'Checkpoint found! Loading...')

        pop, igen, stats, hof, logbook, randomstate = load_checkpoint(resultdir)
        random.setstate(randomstate)
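        # load_checkpoint presumably unpickles the state saved by eaCheckpoint,
        # along the lines of (a sketch; the key names are hypothetical):
        #   with open(resultdir + os.sep + 'checkpoint.data', 'rb') as cpfile:
        #       cp = pickle.load(cpfile)
        #   return (cp['population'], cp['generation'], cp['stats'],
        #           cp['halloffame'], cp['logbook'], cp['rndstate'])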

    else:
        igen = 0
        logbook = None
        pop, hof, stats = prepare_gp(toolbox, randomseed, popsize, hoflen)

    starttime = datetime.now()
    if igen == 0:
        print('Starting new evolution at ', starttime,'!', sep = '')
        logger(resultdir, 'Starting new evolution at ', starttime,'!', sep = '')
    else:
        print('Resuming evolution in generation ', igen, ' at ', starttime ,'!', sep = '')
        logger(resultdir, 'Resuming evolution in generation ', igen, ' at ', starttime ,'!', sep = '')

    eaCheckpoint(pop, toolbox, cxprob, mutprob, igen, generations, resultdir,
                 stats, halloffame=hof, logbook=logbook)

    endtime = datetime.now()
    print('Evolution training finished at ', endtime, '!', sep = '')
    print('Total time: ', endtime - starttime)
    logger(resultdir, 'Evolution training finished at ', endtime, '!', sep = '')
    logger(resultdir, 'Total time: ', endtime - starttime)

    #validating
    #Get validation data
    print('Preparing for validation and test!')
    logger(resultdir, 'Preparing for validation and test!')
    queries = lepref_util.carregar_queries(fvaliname)[0]
    #Set validation data to toolbox.evaluate
    set_evaluate(toolbox, evalQuery, queries, index, aolstats, results)

    #run validation
    global tobedone
    tobedone = len(hof)
    valiresults = [evaluation[0] for evaluation in map(toolbox.evaluate, hof)]

    #output validation
    print(':validation:')
    logger(resultdir, ':validation:')
    for vali in valiresults:
        print(vali)
        logger(resultdir, vali)
    bestvali = max(zip(hof, valiresults), key = lambda z : z[1])
    print(':bestvalidation:')
    print(bestvali[1])
    logger(resultdir, ':bestvalidation:')
    logger(resultdir, bestvali[1])

    #Testing
    #Get test data
    queries = lepref_util.carregar_queries(ftestname)[0]
    #Set test data to toolbox.evaluate
    set_evaluate(toolbox, evalQuery, queries, index, aolstats, results)

    #run test
    tobedone = 1
    resulttest = toolbox.evaluate(bestvali[0])

    #output test
    print(':test:')
    print(resulttest[0])
    logger(resultdir, ':test:')
    logger(resultdir, resulttest[0])
Example #2
# Imports assumed by this example; lepref_util, Bm25Index, load_data/save_data,
# evaluate_filters, and the module-level defaults TOPN, EXECSTHRESHOLD and
# NOSTOPWORDS are defined elsewhere in the enclosing module.
import json
import os
import sys

def main():

    if len(sys.argv) != 5:
        print('Python 2015')
        print('Usage: python3 bm25.py argv[1] argv[2] argv[3] argv[4]')
        print('argv[1]: database file')
        print('argv[2]: file name where the processed data will be saved')
        print('argv[3]: directory name where the result files will be saved')
        print('argv[4]: range file name')
        sys.exit(-1)
    else:
        dbname = sys.argv[1]
        fdataname = sys.argv[2]
        dirresultname = sys.argv[3]
        rangefname = sys.argv[4]
        with open(rangefname, 'r') as rangefile:
            rangestring = rangefile.read()

        rangedict = json.loads(rangestring)
        qiini = rangedict['qiini']
        fiini = rangedict['fiini']
        qiend = rangedict['qiend']
        fiend = rangedict['fiend']
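        # A range file covering the keys above might look like this
        # (values are hypothetical; bkplen and nostopwords are optional, see below):
        # {
        #   "qiini": 0, "fiini": 0, "qiend": 3, "fiend": 8,
        #   "bkplen": 1000,
        #   "nostopwords": true
        # }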

        global EXECSTHRESHOLD
        try:
            EXECSTHRESHOLD = rangedict['bkplen']
        except KeyError:
            print('Using bkplen default =', EXECSTHRESHOLD)

        global NOSTOPWORDS
        try:
            NOSTOPWORDS = rangedict['nostopwords']
        except KeyError:
            print('Using nostopwords default =', NOSTOPWORDS)

    #Filters
    global filters
    filters = {
        1: {'name': 'default(or)', 'query': True,  'posproc': False},
        2: {'name': 'and',         'query': True,  'posproc': True},
        3: {'name': 'filter',      'query': False, 'posproc': True}
    }

    if os.path.exists(fdataname):
        print("Carregando dados do arquivo ", fdataname, '...',sep = '')
        queries, index = load_data(fdataname)

    else:
        #Load training data
        print('Reading queries...')
        queries, lista_invertida = lepref_util.carregar_queries(dbname)
        print('Queries read:', len(queries))

        index = Bm25Index()

        print("Adicionando queries ao índice")
        index.generate_from_queries(queries)

        save_data(queries, index, fdataname)

    lepref_util.configurar_idcg_medio(queries, topN = TOPN)
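    # configurar_idcg_medio ("configure average IDCG") presumably precomputes the
    # ideal DCG used to normalize NDCG@topN:
    #   DCG@N = sum_{i=1..N} (2^rel_i - 1) / log2(i + 1),  NDCG@N = DCG@N / IDCG@N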

###Evaluations
##Simple
#    print('MeanNDCG: ', evaluate(queries, index, topN = TOPN))
##Filters permutations
    doit = False
    if os.path.exists(dirresultname):
        print("Diretório ", dirresultname, " de resultado encontrado!", sep = '')

        while True:
            inputstring = input('Recompute results? (N,y): ')
            if inputstring in ['Y','y']:
                doit = True
                break

            elif inputstring in ['N','n', '']:
                print('Exiting...')
                sys.exit()
            else:
                print('Invalid option <', inputstring, '>!', sep='')
    else:
        doit = True

#    print_idf(index)
    if doit:
        #print(count_execs(queries, qiini, fiini, qiend, fiend))
        #results = evaluate_filters(queries, index, topN = TOPN, qiini = 1, fiini = 0, qiend = 2, fiend = 8)
        evaluate_filters(queries, index, dirresultname, topN = TOPN, qiini = qiini, fiini = fiini, qiend = qiend, fiend = fiend)
        #evaluate_filters(queries, index, dirresultname, topN = TOPN, qiini = 0, fiini = 0, qiend = 3, fiend = None)

        #save_results(results, dirresultname)
        print('Docs read:', len(index.doc))
        print('AVGDL:', index.avgdl)
        print('Vocabulary Length:', len(index))