示例#1
0
    sentsPerProc = int(math.floor(len(sentences)*1.0/numOfProcesses))
    processes = []
    lock = Lock()
    test_json_file = open(test_json, 'w', 0)
    for i in range(numOfProcesses):
        if i == numOfProcesses - 1:
            p = Process(target=parse, args=(sentences[i*sentsPerProc:], test_json_file, lock))
        else:
            p = Process(target=parse, args=(sentences[i*sentsPerProc:(i+1)*sentsPerProc], test_json_file, lock))
        p.start()
        processes.append(p)
    for proc in processes:
        proc.join()
    test_json_file.close()
    print 'Start feature extraction'
    pipeline(train_json, indir + '/brown', outdir)
    filter(outdir+'/feature.map', outdir+'/train_x.txt', outdir+'/feature.txt', outdir+'/train_x_new.txt')
    pipeline_test(test_json, indir + '/brown', outdir+'/feature.txt',outdir+'/type.txt', outdir)
    supertype(outdir)
    distribution(outdir)

    # Perform no pruning to generate training data
    print 'Start training and test data generation'
    feature_number = get_number(outdir+'/feature.txt')
    type_number = get_number(outdir+'/type.txt')
    prune(outdir, outdir, 'no', feature_number, type_number)

    # Generate type-type correlation
    print 'Start type correlation calculation'
    share_entity(indir + '/type_entities.txt', outdir + '/type.txt', outdir + '/type_type_kb.txt')
    print 'Test set parsing done'

    print 'Start em feature extraction'
    pipeline(train_json,
             indir + '/brown',
             outdir_em,
             requireEmType=requireEmType,
             isEntityMention=True)

    filter(outdir_em + '/feature.map', outdir_em + '/train_x.txt',
           outdir_em + '/feature.txt', outdir_em + '/train_x_new.txt')

    pipeline_test(test_json,
                  indir + '/brown',
                  outdir_em + '/feature.txt',
                  outdir_em + '/type.txt',
                  outdir_em,
                  requireEmType=requireEmType,
                  isEntityMention=True)
    supertype(outdir_em)

    ### Perform no pruning to generate training data
    print 'Start em training and test data generation'
    feature_number = get_number(outdir_em + '/feature.txt')
    type_number = get_number(outdir_em + '/type.txt')
    prune(outdir_em,
          outdir_em,
          'no',
          feature_number,
          type_number,
          neg_label_weight=float(sys.argv[4]),