Пример #1
0
def experiment(config, scorer_class, *args, detailed_log=None):
    """Evaluate a scorer on the enabled benchmark datasets and log the results.

    For each enabled dataset the scorer is run through ``assignScores`` and
    the resulting metrics (``rho``, ``evaluated``, ``total``) are logged, once
    right after scoring and once more in a final summary.

    Args:
        config: Passed to ``WordSim353.load`` and ``SimLex999.load``.
        scorer_class: Called as ``scorer_class(*args)`` to build the scorer.
        *args: Positional arguments forwarded to ``scorer_class``.
        detailed_log: Forwarded unchanged to ``assignScores``.
    """
    # WordSim353.load yields three datasets; only the first is used here.
    wordsim_full, _wordsim_sim, _wordsim_rel = WordSim353.load(config)
    simlex = SimLex999.load(config)

    scorer = scorer_class(*args)

    # Enabled (dataset, label) pairs. The WS353_SIM, WS353_REL and RW
    # datasets are currently disabled.
    benchmarks = [(wordsim_full, WS353), (simlex, SL999)]

    metrics_by_label = {}
    for dataset, label in benchmarks:
        log.writeln('\nChecking %s...' % label)
        metrics = assignScores(
            dataset, scorer, detailed_log=detailed_log, ds_name=label)
        metrics_by_label[label] = metrics

        summary = (metrics.rho, metrics.evaluated, metrics.total)
        log.writeln('Rho: %.2f (Evaluated %d/%d)' % summary)

    log.writeln('\n\n-- Final report --\n')
    for _, label in benchmarks:
        final = metrics_by_label[label]
        report_line = '%s --> Rho: %.3f (%d/%d samples)' % (
            label, final.rho, final.evaluated, final.total)
        log.writeln(report_line)
def KNearestNeighbors(emb_arr, top_k, neighbor_file, threads=2, batch_size=5):
    '''Compute nearest neighbors for every row of emb_arr in parallel.

    One process runs ``_nn_writer`` and consumes results from a shared queue;
    the remaining processes run ``_threadedNeighbors``, each over its own
    subset of row indices produced by ``util.prepareForParallel``.

    Args:
        emb_arr: Sized, indexable collection of embeddings; its length
            determines the set of row indices that are processed.
        top_k: Forwarded to ``_threadedNeighbors`` (presumably the number of
            neighbors kept per row -- confirm against that helper).
        neighbor_file: Forwarded to ``_nn_writer`` (presumably the output
            path -- confirm against that helper).
        threads: Total process budget, including the writer. Values below 2
            are treated as 2 so that at least one compute process exists.
        batch_size: Forwarded to ``_threadedNeighbors``.
    '''
    # One process is reserved for the writer. Clamp so that threads < 2 does
    # not yield zero (or a negative number of) compute processes.
    n_computers = max(1, threads - 1)

    # set up threads
    log.writeln('1 | Thread initialization')
    index_subsets = util.prepareForParallel(list(range(len(emb_arr))),
                                            n_computers,
                                            data_only=True)
    nn_q = mp.Queue()
    nn_writer = mp.Process(target=_nn_writer,
                           args=(neighbor_file, len(emb_arr), nn_q))
    computers = [
        mp.Process(target=_threadedNeighbors,
                   args=(index_subsets[i], emb_arr, batch_size, top_k, nn_q))
        for i in range(n_computers)
    ]
    nn_writer.start()
    log.writeln('2 | Neighbor computation')
    try:
        util.parallelExecute(computers)
    finally:
        # Always tell the writer to stop and wait for it, even if the
        # computation failed; otherwise the writer process is left blocked
        # on the queue forever.
        nn_q.put(_SIGNALS.HALT)
        nn_writer.join()
Пример #3
0
def buildGraph(neighbor_files, k):
    """Build a weighted neighborhood graph from several neighbor files.

    Each file is read with ``readNeighbors(neighbor_file, k)``. For every
    (source, neighbor) pair, the number of times the neighbor is listed for
    that source across all files is counted and then divided by the number
    of files.

    Args:
        neighbor_files: Sized iterable of inputs accepted by ``readNeighbors``.
        k: Forwarded to ``readNeighbors``.

    Returns:
        dict mapping source -> {neighbor: count / number_of_files}.
    """
    log.writeln('Building neighborhood graph...')
    n_files = len(neighbor_files)
    graph = {}

    # Accumulate raw co-occurrence counts, one file at a time.
    log.track(message='  >> Loaded {0}/%d neighborhood files' % n_files,
              writeInterval=1)
    for path in neighbor_files:
        for source, neighbors in readNeighbors(path, k).items():
            edge_counts = graph.setdefault(source, {})
            for nbr in neighbors:
                edge_counts[nbr] = edge_counts.get(nbr, 0) + 1
        log.tick()
    log.flushTracker()

    # Rescale counts by the number of files. Only existing keys are
    # reassigned, so updating while iterating is safe.
    log.writeln('  >> Normalizing edge weights...')
    denominator = float(n_files)
    for edge_counts in graph.values():
        for nbr, count in edge_counts.items():
            edge_counts[nbr] = count / denominator

    log.writeln('Graph complete!')
    return graph
            ('Input embedding file mode', options.embedding_mode),
            ('Output neighbor file', options.outputf),
            ('Ordered vocabulary file', options.vocabf),
            ('Number of nearest neighbors', options.k),
            ('Batch size', options.batch_size),
            ('Number of threads', options.threads),
        ], 'k Nearest Neighbor calculation with cosine similarity')

    t_sub = log.startTimer('Reading embeddings from %s...' % embf)
    emb = pyemblib.read(embf, mode=options.embedding_mode)
    log.stopTimer(t_sub,
                  message='Read {0:,} embeddings in {1}s.\n'.format(
                      len(emb), '{0:.2f}'))

    if not os.path.isfile(options.vocabf):
        log.writeln('Writing ordered vocabulary to %s...\n' % options.vocabf)
        pyemblib.listVocab(emb, options.vocabf)
    else:
        log.writeln('Reading ordered vocabulary from %s...\n' % options.vocabf)
    ordered_vocab = util.readList(options.vocabf, encoding='utf-8')

    emb_arr = np.array([emb[v] for v in ordered_vocab])

    log.writeln('Calculating k nearest neighbors.')
    KNearestNeighbors(emb_arr,
                      options.k,
                      options.outputf,
                      threads=options.threads,
                      batch_size=options.batch_size)
    log.writeln('Done!\n')
Пример #5
0
 def __init__(self, embf, embmode, log=log):
     """Read embeddings from ``embf`` and store them on the instance.

     Args:
         embf: Embedding file handed to ``pyemblib.read``.
         embmode: Passed to ``pyemblib.read`` as ``mode``.
         log: Logger used for progress messages; defaults to the
             module-level ``log``.
     """
     log.writeln('Reading embeddings from %s...' % embf)
     embeddings = pyemblib.read(embf, mode=embmode, replace_errors=True)
     n_read = len(embeddings)
     log.writeln('  Read %d embeddings.' % n_read)
     self._embs = embeddings
Пример #6
0
            default=10)
        parser.add_option(
            '-l',
            '--logfile',
            dest='logfile',
            help='name of file to write log contents to (empty for stdout)',
            default=None)
        (options, args) = parser.parse_args()
        if len(args) == 0:
            parser.print_help()
            exit()
        neighbor_files = args
        return neighbor_files, options

    neighbor_files, options = _cli()
    log.start(logfile=options.logfile)
    configlogger.writeConfig(log, [
        *[('Neighborhood sample file %d' % (i + 1), neighbor_files[i])
          for i in range(len(neighbor_files))],
        ('Output file', options.outputf),
        ('Number of neighbors to include in edge construction', options.k),
    ], 'Nearest neighborhood graph generation')

    graph = buildGraph(neighbor_files, options.k)

    log.write('Writing graph to %s...' % options.outputf)
    writeGraph(graph, options.outputf)
    log.writeln('Done!')

    log.stop()