Exemplo n.º 1
0
    def __init__(self, filename, labeler=None):
        """Load an evidence file into a dense, L2-normalized document matrix.

        Args:
            filename: Path handed to ``load_evidence_file``.
            labeler: Optional callable mapping a document name to its label.
                When omitted (or falsy), every document gets the label ``None``.

        Attributes set:
            filename: The path that was loaded.
            num_docs: Number of documents read.
            raw_labels: One label per document, in sorted document-name order.
            class_names: Sorted list of the distinct labels.
            doc_matrix: ``(num_docs, dim)`` array; row ``i`` is the normalized
                feature vector of the ``i``-th document name in sorted order.
            dim: Number of distinct feature ids (the vocabulary size).
        """
        self.filename = filename

        instance_dict = load_evidence_file(filename)
        num_docs = len(instance_dict)
        doc_names = sorted(instance_dict.keys())
        feature_ids = sorted(set(chain.from_iterable(
            each.iterkeys() for each in instance_dict.values())))
        vocab_size = len(feature_ids)

        # Create a map of feature_id => dense feature index
        feature_index = {k: i for i, k in enumerate(feature_ids)}

        doc_matrix = np.zeros((num_docs, vocab_size))
        doc_labels = []

        # For each document, convert sparse features to dense L2-normalized feature vector and write it into the
        # document matrix
        for i, name in enumerate(doc_names):
            doc_data = np.zeros(vocab_size)
            for feature_id, count in instance_dict[name].iteritems():
                doc_data[feature_index[feature_id]] = count
            doc_matrix[i, :] = l2_normalize(doc_data)

            # Append rather than index-assign: the list starts empty, so
            # ``doc_labels[i] = ...`` would raise IndexError on the first document.
            doc_labels.append(labeler(name) if labeler else None)

        self.num_docs = num_docs
        self.raw_labels = doc_labels
        self.class_names = sorted(set(doc_labels))
        self.doc_matrix = doc_matrix
        self.dim = vocab_size
Exemplo n.º 2
0
    def run(self, options):
        """Write L2-normalized GIST descriptors of the listed images as a SAM corpus.

        Args:
            options: Parsed options object. Reads ``labeler`` (a key into
                ``labelers.registry`` or ``None``), ``dest_corpus`` (output
                path), ``file_list`` (text file with one image path per line)
                and ``color`` (use ``color_gist`` instead of ``grayscale_gist``).

        Raises:
            KeyError: If ``options.labeler`` is not in ``labelers.registry``.
        """
        labeler = None if options.labeler is None else labelers.registry[
            options.labeler]

        log.info('Writing SAM corpus to %s' % options.dest_corpus)

        # Read the whole list up front and release the file handle immediately.
        with open(options.file_list) as list_file:
            filenames = [line.strip() for line in list_file]

        # Wait to instantiate the corpus writer until we know the dimensionality of the descriptors we'll be writing
        writer = None
        try:
            for i, filename in enumerate(filenames):
                log.info('Processing image %d/%d' % (i + 1, len(filenames)))

                descriptor = color_gist(
                    filename) if options.color else grayscale_gist(filename)
                if writer is None:
                    writer = CorpusWriter(options.dest_corpus,
                                          data_series='sam',
                                          dim=descriptor.size)

                normalized_descriptor = l2_normalize(descriptor)
                doc_label = labeler(filename) if labeler else None
                writer.write_doc(ascolvector(normalized_descriptor),
                                 name=filename,
                                 label=doc_label)
        finally:
            # The writer is still None when the file list is empty (or the
            # first image fails), so only close it if it was created.
            if writer is not None:
                writer.close()
Exemplo n.º 3
0
    def __init__(self, filename, labeler=None):
        """Build a dense, L2-normalized document matrix from an evidence file.

        Args:
            filename: Path handed to ``load_evidence_file``.
            labeler: Optional callable taking a document name and returning
                its label. If not given, all labels are ``None``.

        After construction the instance exposes ``filename``, ``num_docs``,
        ``raw_labels`` (labels in sorted document-name order), ``class_names``
        (sorted distinct labels), ``doc_matrix`` (one normalized row per
        document, columns in sorted feature-id order) and ``dim`` (number of
        distinct feature ids).
        """
        self.filename = filename

        instance_dict = load_evidence_file(filename)
        num_docs = len(instance_dict)
        doc_names = sorted(instance_dict.keys())
        feature_ids = sorted(
            set(chain.from_iterable(
                each.iterkeys() for each in instance_dict.values())))
        vocab_size = len(feature_ids)

        # Create a map of feature_id => dense feature index
        feature_index = {k: i for i, k in enumerate(feature_ids)}

        doc_matrix = np.zeros((num_docs, vocab_size))
        doc_labels = []

        # For each document, convert sparse features to dense L2-normalized feature vector and write it into the
        # document matrix
        for i, name in enumerate(doc_names):
            sparse_features = instance_dict[name]

            doc_data = np.zeros(vocab_size)
            for feature_id, count in sparse_features.iteritems():
                doc_data[feature_index[feature_id]] = count
            doc_matrix[i, :] = l2_normalize(doc_data)

            # ``doc_labels`` starts empty, so index assignment would raise
            # IndexError; grow the list instead.
            doc_labels.append(labeler(name) if labeler else None)

        self.num_docs = num_docs
        self.raw_labels = doc_labels
        self.class_names = sorted(set(doc_labels))
        self.doc_matrix = doc_matrix
        self.dim = vocab_size
Exemplo n.º 4
0
def main(argv=None):
    """Convert an evidence file into an L2-normalized 'sam' corpus plus a wordlist.

    Args:
        argv: Full argument vector including the program name; defaults to
            ``sys.argv``. Expects ``input_file``, ``output_file`` and an
            optional ``--labeler`` naming an entry in ``labelers.registry``.

    Side effects:
        Writes the corpus to ``output_file`` and the sorted feature ids, one
        per line, to ``output_file + '.wordlist'``. An unknown labeler name is
        reported through ``parser.error``.
    """
    if argv is None:
        argv = sys.argv

    parser = ArgumentParser()
    parser.add_argument('input_file',
                        type=str,
                        help='Input file in evidence format')
    parser.add_argument('output_file',
                        type=str,
                        help='Path to destination corpus file')
    parser.add_argument('--labeler', type=str, help='Labeler to apply')
    options = parser.parse_args(argv[1:])

    labeler = None
    if options.labeler is None:
        log.warning('no labeler provided')
    elif options.labeler not in labelers.registry:
        labeler_names = ', '.join(sorted(labelers.registry.keys()))
        parser.error('Invalid labeler "%s"; available options are %s' %
                     (options.labeler, labeler_names))
    else:
        labeler = labelers.registry[options.labeler]

    instance_dict = load_evidence_file(options.input_file)
    num_docs = len(instance_dict)
    feature_ids = sorted(
        set(chain.from_iterable(
            each.iterkeys() for each in instance_dict.values())))
    vocab_size = len(feature_ids)
    log.info('Read %d docs (vocabulary size %d) from %s' %
             (num_docs, vocab_size, options.input_file))

    # Create a map of feature_id => dense feature index
    feature_index = {k: i for i, k in enumerate(feature_ids)}

    log.info('Writing L2-normalized corpus to %s' % options.output_file)
    writer = CorpusWriter(options.output_file,
                          data_series='sam',
                          dim=vocab_size)
    try:
        # For each document, convert sparse features to dense L2-normalized feature vector and write it to the corpus
        for name, sparse_features in instance_dict.iteritems():
            doc_data = np.zeros((vocab_size, 1))
            for feature_id, count in sparse_features.iteritems():
                doc_data[feature_index[feature_id]] = count
            doc_data = l2_normalize(doc_data)
            doc_label = labeler(name) if labeler else None

            writer.write_doc(doc_data, name=name, label=doc_label)
    finally:
        # Close the corpus even if a labeler or write call raises.
        writer.close()

    wordlist_path = options.output_file + '.wordlist'
    log.info('Writing wordlist to %s' % wordlist_path)
    with open(wordlist_path, 'w') as f:
        f.writelines([s + '\n' for s in feature_ids])
Exemplo n.º 5
0
def main(argv=None):
    """Read an evidence file and emit an L2-normalized 'sam' corpus and its wordlist.

    Args:
        argv: Argument vector with the program name in position 0; falls back
            to ``sys.argv``. Positional arguments are ``input_file`` and
            ``output_file``; ``--labeler`` optionally selects a labeler from
            ``labelers.registry``.

    Side effects:
        Creates the corpus at ``output_file`` and a ``.wordlist`` file next to
        it containing the sorted feature ids, one per line. An invalid labeler
        name is reported via ``parser.error``.
    """
    if argv is None:
        argv = sys.argv

    parser = ArgumentParser()
    parser.add_argument('input_file', type=str, help='Input file in evidence format')
    parser.add_argument('output_file', type=str, help='Path to destination corpus file')
    parser.add_argument('--labeler', type=str, help='Labeler to apply')
    options = parser.parse_args(argv[1:])

    labeler = None
    if options.labeler is None:
        log.warning('no labeler provided')
    elif options.labeler not in labelers.registry:
        labeler_names = ', '.join(sorted(labelers.registry.keys()))
        parser.error('Invalid labeler "%s"; available options are %s' % (options.labeler, labeler_names))
    else:
        labeler = labelers.registry[options.labeler]

    instance_dict = load_evidence_file(options.input_file)
    num_docs = len(instance_dict)
    feature_ids = sorted(set(chain.from_iterable(each.iterkeys() for each in instance_dict.values())))
    vocab_size = len(feature_ids)
    log.info('Read %d docs (vocabulary size %d) from %s' % (num_docs, vocab_size, options.input_file))

    # Create a map of feature_id => dense feature index
    feature_index = {k: i for i, k in enumerate(feature_ids)}

    log.info('Writing L2-normalized corpus to %s' % options.output_file)
    writer = CorpusWriter(options.output_file, data_series='sam', dim=vocab_size)
    try:
        # For each document, convert sparse features to dense L2-normalized feature vector and write it to the corpus
        for name, sparse_features in instance_dict.iteritems():
            doc_data = np.zeros((vocab_size, 1))
            for feature_id, count in sparse_features.iteritems():
                doc_data[feature_index[feature_id]] = count
            doc_data = l2_normalize(doc_data)
            doc_label = labeler(name) if labeler else None

            writer.write_doc(doc_data, name=name, label=doc_label)
    finally:
        # Make sure the corpus is closed even when a document fails mid-way.
        writer.close()

    wordlist_path = options.output_file + '.wordlist'
    log.info('Writing wordlist to %s' % wordlist_path)
    with open(wordlist_path, 'w') as f:
        f.writelines([s + '\n' for s in feature_ids])
Exemplo n.º 6
0
    def run(self, options):
        """Write GIST descriptors of the listed images, with labels, via ``ArffWriter``.

        Args:
            options: Parsed options object. Reads ``labeler`` (key into
                ``labelers.registry``), ``file_list`` (text file with one image
                path per line), ``dest`` (output path), ``color`` (use
                ``color_gist`` instead of ``grayscale_gist``) and ``normalize``
                (L2-normalize each descriptor before writing).

        Raises:
            KeyError: If ``options.labeler`` is not in ``labelers.registry``.
        """
        labeler = labelers.registry[options.labeler]

        # Strip each line once, before labeling, so the labeler sees the same
        # clean path that is later passed to the GIST extractor (no trailing
        # newline), and close the list file as soon as it has been read.
        with open(options.file_list) as list_file:
            filenames = [line.strip() for line in list_file]

        # All labels are computed up front because the writer is handed the
        # full class list when it is constructed.
        labels = [labeler(each) for each in filenames]
        class_list = sorted(set(labels))

        writer = ArffWriter(options.dest, class_list=class_list)
        log.info('Writing GIST data to %s' % options.dest)

        try:
            for i, (filename, label) in enumerate(izip(filenames, labels)):
                log.info('Processing image %d/%d' % (i+1, len(filenames)))

                descriptor = color_gist(filename) if options.color else grayscale_gist(filename)

                if options.normalize:
                    descriptor = l2_normalize(descriptor)
                writer.write_example(descriptor, label)
        finally:
            # Close the output even if an image fails to process.
            writer.close()
Exemplo n.º 7
0
    def run(self, options):
        """Extract a GIST descriptor per listed image and write labeled examples via ``ArffWriter``.

        Args:
            options: Parsed options object. Reads ``labeler`` (key into
                ``labelers.registry``), ``file_list`` (text file with one image
                path per line), ``dest`` (output path), ``color`` (choose
                ``color_gist`` over ``grayscale_gist``) and ``normalize``
                (apply ``l2_normalize`` to each descriptor).

        Raises:
            KeyError: If ``options.labeler`` is not in ``labelers.registry``.
        """
        labeler = labelers.registry[options.labeler]

        # Read and strip the paths in one pass; the labeler must see the same
        # newline-free path that the GIST extractor receives below.
        with open(options.file_list) as list_file:
            filenames = [line.strip() for line in list_file]

        # The writer takes the complete class list at construction time, so
        # every label has to be known before any example is written.
        labels = [labeler(each) for each in filenames]
        class_list = sorted(set(labels))

        writer = ArffWriter(options.dest, class_list=class_list)
        log.info('Writing GIST data to %s' % options.dest)

        try:
            for i, (filename, label) in enumerate(izip(filenames, labels)):
                log.info('Processing image %d/%d' % (i + 1, len(filenames)))

                descriptor = color_gist(
                    filename) if options.color else grayscale_gist(filename)

                if options.normalize:
                    descriptor = l2_normalize(descriptor)
                writer.write_example(descriptor, label)
        finally:
            # Always release the output, including on a failed image.
            writer.close()
Exemplo n.º 8
0
    def run(self, options):
        """Compute an L2-normalized GIST descriptor per listed image and store them in a SAM corpus.

        Args:
            options: Parsed options object. Reads ``labeler`` (key into
                ``labelers.registry`` or ``None`` for unlabeled output),
                ``dest_corpus`` (output path), ``file_list`` (text file with
                one image path per line) and ``color`` (choose ``color_gist``
                over ``grayscale_gist``).

        Raises:
            KeyError: If ``options.labeler`` is not in ``labelers.registry``.
        """
        labeler = None if options.labeler is None else labelers.registry[options.labeler]

        log.info('Writing SAM corpus to %s' % options.dest_corpus)

        # Load all paths first so the list file is closed before the long-running loop.
        with open(options.file_list) as list_file:
            filenames = [line.strip() for line in list_file]

        # Wait to instantiate the corpus writer until we know the dimensionality of the descriptors we'll be writing
        writer = None
        try:
            for i, filename in enumerate(filenames):
                log.info('Processing image %d/%d' % (i+1, len(filenames)))

                descriptor = color_gist(filename) if options.color else grayscale_gist(filename)
                if writer is None:
                    writer = CorpusWriter(options.dest_corpus, data_series='sam', dim=descriptor.size)

                normalized_descriptor = l2_normalize(descriptor)
                doc_label = labeler(filename) if labeler else None
                writer.write_doc(ascolvector(normalized_descriptor), name=filename, label=doc_label)
        finally:
            # With an empty file list no writer is ever created; guard the
            # close so that case does not fail on ``None.close()``.
            if writer is not None:
                writer.close()