Example #1 (score: 0)
File: main.py  Project: kldcr/USN
def train(logger, args, example_idx):
    """Run generator MLE pre-training on the loaded dataset.

    Loads the embeddings, vocabulary and data splits, records their sizes
    on ``args``, builds batch iterators, instantiates the GAN generator
    (optionally on GPU), and performs generator MLE training.
    """
    # Load embeddings, vocabulary and the three data splits.
    embed, vocab, train_data, val_data, test_data = load_data(logger, args)

    # Record dataset dimensions on args so downstream code can read them.
    args.embed_num = len(embed)
    args.embed_dim = len(embed[0])
    args.user_num = vocab.user_num
    args.product_num = vocab.product_num

    # Batch iterators; both drop the last partial batch.
    train_iter = DataLoader(
        dataset=Dataset(train_data),
        batch_size=args.batch_size,
        shuffle=True,
        drop_last=True,
    )
    val_iter = DataLoader(
        dataset=Dataset(val_data),
        batch_size=args.batch_size,
        shuffle=False,
        drop_last=True,
    )

    # GAN generator (the discriminator is defined elsewhere).
    gen = Generator(args, embed)

    # Move to GPU and wrap for multi-GPU execution when requested.
    if args.use_cuda:
        gen = nn.DataParallel(gen.cuda(), device_ids=args.gpu_id)

    # Generator MLE pre-training.
    logger.info('Starting Generator MLE Training...')
    gen_optimizer = optim.Adam(gen.parameters(), lr=args.lr)
    train_generator_MLE(logger, args, gen, gen_optimizer,
                        args.MLE_TRAIN_EPOCHS, vocab, train_data, example_idx,
                        train_iter, val_iter)
Example #2 (score: 0)

if __name__ == '__main__':
    # Entry point: extract visual and/or textual features from a dataset
    # directory and store them as JSON.
    # Fixed: the original used backslash continuation *inside* help string
    # literals, embedding the next line's indentation into the help text;
    # implicit concatenation matches the sibling scripts in this project.
    parser = ArgumentParser(description='Extract features of a dataset.',
        formatter_class=ArgumentDefaultsHelpFormatter)
    parser.add_argument('dataset',
        help='Path to the directory containing folders for each class that '
        'contain the images and metadata')
    parser.add_argument('-i', '--image', action='store_true',
        help='Whether to extract visual features from the image')
    parser.add_argument('-t', '--textual', action='store_true',
        help='Whether to extract textual features; requires JSON metadata '
        'files for all the images')
    parser.add_argument('-v', '--vocabulary', default='<dataset>/vocabulary.json',
        help='Filename of the JSON vocabulary used for bag of words')
    parser.add_argument('-o', '--output', default='<dataset>/features.json',
        help='Where to store the extracted features')
    args = parser.parse_args()

    # Expand the <dataset> placeholder in the derived paths.
    args.output = args.output.replace('<dataset>', args.dataset)
    args.vocabulary = args.vocabulary.replace('<dataset>', args.dataset)

    # NOTE(review): assert is stripped under `python -O`; parser.error()
    # would be a more robust way to validate required options.
    assert args.image or args.textual, 'Need at least one feature source'
    if args.textual:
        assert args.vocabulary, 'Vocabulary is needed for textual features'
        WordsFeature.load_vocabulary(args.vocabulary)

    # Read the dataset with the requested feature kinds and save the result.
    dataset = Dataset()
    dataset.read(args.dataset, args.image, args.textual)
    dataset.save(args.output)
    best = max(scores)
    return worst, average, best


if __name__ == '__main__':
    # Entry point: train a random forest on extracted features and report
    # its predictions on the held-out split.
    parser = ArgumentParser(
        description='Learning algorithm used to classify images.',
        formatter_class=ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        'features',
        help='Path to the JSON file containing extracted features of the '
             'dataset')
    parser.add_argument(
        '-s', '--split', type=float, default=0.25,
        help='Fraction of data used for validation')
    parser.add_argument(
        '-c', '--copy-predicted',
        default='<folder>/../<folder>-predicted/',
        help='Folder to copy predicted images into; sub directories for all '
             'labels are created; <folder> is the directory of the features '
             'file')
    args = parser.parse_args()

    # Substitute the <folder> placeholder with the features file's base path.
    if '<folder>' in args.copy_predicted:
        base_path = os.path.splitext(args.features)[0]
        args.copy_predicted = args.copy_predicted.replace('<folder>', base_path)

    dataset = Dataset()
    dataset.load(args.features)

    # 300 trees; train_and_predict handles the validation split internally.
    classifier = RandomForestClassifier(n_estimators=300)
    prediction = train_and_predict(classifier, dataset, args.split)
    prediction.print_scores()
    prediction.plot_confusion_matrix()
Example #4 (score: 0)
    assert len(captions) == len(data)
    assert all(len(x) == len(data[0]) for x in data)
    with open(filename, 'w') as csv:
        csv.write(','.join(captions) + '\n')
        for row in range(len(data[0])):
            csv.write(','.join(str(column[row]) for column in data) + '\n')


if __name__ == '__main__':
    # Entry point: measure per-feature statistics and write p-values to CSV.
    # Fixed: backslash continuation inside string literals embedded the next
    # line's indentation into the runtime help text; implicit concatenation
    # matches the duplicate of this script elsewhere in the project.
    parser = ArgumentParser(description='Measure statistics of features '
        'within the images of the same class to evaluate features.',
        formatter_class=ArgumentDefaultsHelpFormatter)
    parser.add_argument('features',
        help='Path to the JSON file containing extracted features of the '
        'dataset')
    parser.add_argument('-o', '--output', default='<folder>/evaluation.csv',
        help='Filename of the CSV file where p-values will be written to; '
        '<folder> is the directory of the features file')
    args = parser.parse_args()

    # Replace the <folder> placeholder with the features file's directory.
    # NOTE(review): splitext of a directory path only matters when the
    # directory name contains a dot; os.path.dirname alone may be intended.
    folder = os.path.splitext(os.path.split(args.features)[0])[0]
    args.output = args.output.replace('<folder>', folder)

    dataset = Dataset()
    dataset.load(args.features)

    print_chi(dataset)
    print('Write CSV table to', args.output)
    write_chi(args.output, dataset)
    print('Done')
def iterate_overall_texts(root):
    """Yield the preprocessed text of every image below *root*, class by class."""
    for class_dir in Dataset()._walk_directories(root):
        yield from iterate_texts(os.path.join(root, class_dir))
def iterate_texts(directory):
    """Yield the preprocessed metadata text of each image in *directory*."""
    for image_name in Dataset()._walk_images(directory):
        metadata = get_metadata(os.path.join(directory, image_name))
        # Combine URL, title and description into one text blob.
        yield preprocess_text(metadata['url'], metadata['title'],
                              metadata['description'])
        '--limit',
        type=int,
        default=20,
        help='Maximal amount of words to display for each class')
    parser.add_argument(
        '-o',
        '--output',
        default='<dataset>/vocabulary.json',
        help='Filename of the JSON vocabulary that will be written')
    args = parser.parse_args()

    # Expand the <dataset> placeholder in the output path.
    args.output = args.output.replace('<dataset>', args.dataset)

    # Overall word frequencies across the whole dataset (tf-idf baseline).
    text = iterate_overall_texts(args.dataset)
    overall = get_frequencies(text)

    # Per-class tf-idf: the top words of each class become its vocabulary.
    vocabulary = {}
    for directory in Dataset()._walk_directories(args.dataset):
        print_headline(directory)
        texts = iterate_texts(os.path.join(args.dataset, directory))
        frequencies = get_frequencies(texts)
        frequencies = compute_tfidf(frequencies, overall)
        synonyms = list(get_top_frequencies(frequencies, args.limit).keys())
        vocabulary[directory] = synonyms
        print_frequencies(frequencies, args.limit)

    print('')
    print('Write vocabulary to', args.output)
    # Fixed: the original passed a bare open() into json.dump and leaked
    # the file handle; the context manager closes (and flushes) it.
    with open(args.output, 'w') as vocabulary_file:
        json.dump(vocabulary, vocabulary_file)
    print('Done')
def classify_images(self, keywords=(), limit=25):
    """
    Worker task to retrieve a list of automatically categorized images from
    Wikimedia Commons from a given list of keywords.

    Args:
        keywords: Non-empty sequence of search terms. The default is now an
            immutable tuple (the original mutable ``[]`` default is a
            classic Python pitfall).
        limit: Maximum number of images to fetch; capped by QUERY_LIMIT.

    Returns:
        Dict with ``current``/``total`` progress fields and a ``result``
        list of one entry per classified image.

    Raises:
        AssertionError: If no keywords were supplied.
    """
    if not keywords:
        raise AssertionError

    with app.app_context():

        def supported_extractors():
            # Feature extractors applied to every downloaded image.
            extractors = []
            extractors.append(SizeFeature())
            extractors.append(ColorFeature())
            extractors.append(HistogramFeature())
            extractors.append(GradientFeature())
            extractors.append(FaceFeature(app.config['FACE_CLASSIFIER']))
            extractors.append(GeoFeature())
            extractors.append(FormatFeature())
            extractors.append(
                WordsFeature.create_from(app.config['WORDS_CONFIG']))
            return extractors

        def create_response_entry(label, sample):
            # One result row for the client; title mirrors the image URL.
            return {
                'thumbnail': sample.thumbnail,
                'image': sample.url,
                'label': label,
                'title': sample.url
            }

        def create_response(entries):
            return {'current': 100, 'total': 100, 'result': entries}

        # keep track of progress
        progress_observer = ProgressObserver(self)
        progress_observer.update(5)

        # query dbpedia for related images based on given keywords
        if limit > app.config['QUERY_LIMIT']:
            limit = app.config['QUERY_LIMIT']
        searchterm = ' '.join(keywords)
        uris = fetch_uris_from_metadata(searchterm, limit, multiple=False)
        progress_observer.update(20)

        # download images and metadata into temp folder with unique task id
        temp_folder = os.path.join(app.config['DOWNLOAD_DIRECTORY'],
                                   classify_images.request.id)
        images_and_metadata(uris,
                            temp_folder,
                            False,
                            observer=progress_observer)
        progress_observer.update(80)

        # load dataset, extract features, and normalize with the stored
        # training-set statistics so predictions match the classifier
        dataset = Dataset(logging=True)
        dataset.read(root=temp_folder,
                     extractors=supported_extractors(),
                     unlabeled_data=True)
        # Fixed: close the config file deterministically (was a bare open()).
        with open(app.config['DATASET_CONFIG']) as config_file:
            dataset_config = json.load(config_file)
        dataset.means = dataset_config['means']
        dataset.stds = dataset_config['stds']
        dataset.normalize()
        progress_observer.update(90)

        # predict labels using the trained classifier
        classifier = joblib.load(app.config['WIKIMEDIA_CLASSIFIER'])
        predictions = classifier.predict(dataset.data)
        progress_observer.update(95)

        # build response
        suggestions = []
        for index, sample in enumerate(dataset.samples):
            # .item() replaces np.asscalar, which was removed in NumPy 1.23.
            label = predictions[index].item()
            entry = create_response_entry(label, sample)
            suggestions.append(entry)
        result = create_response(suggestions)

        # cleanup temporary directory
        delete_directory(temp_folder)

        progress_observer.update(100)

        return result
Example #9 (score: 0)
def read_samples(root):
    """Return the raw samples found under *root* via the Dataset walker."""
    return Dataset()._read_samples(root)
Example #10 (score: 0)
    # Parse CLI options; the parser itself is built above this fragment.
    args = parser.parse_args()

    # Expand the <dataset> placeholder in the output path.
    args.output = args.output.replace('<dataset>', args.dataset)
    # NOTE(review): assert is stripped under `python -O`; parser.error()
    # would be a more robust way to validate required options.
    assert args.visual or args.textual, 'Need at least one feature source'

    # Assemble the feature extractors requested on the command line.
    extractors = []
    if args.visual:
        extractors.append(SizeFeature())
        extractors.append(ColorFeature())
        extractors.append(HistogramFeature())
        extractors.append(GradientFeature())
        # extractors.append(BlobFeature())
        # extractors.append(BriefFeature())
        # The face detector file is optional; skip gracefully when absent.
        if os.path.isfile(args.trained_faces):
            extractors.append(FaceFeature(args.trained_faces))
        else:
            print('Skip face feature since the trained face detector was not '
                  'found at {}.'.format(args.trained_faces))
    if args.textual:
        samples = read_samples(args.dataset)
        # NOTE(review): args.stopwords is rebound to an open file handle
        # that is never explicitly closed — presumably WordsFeature reads
        # it; confirm and consider a context manager.
        if os.path.isfile(args.stopwords):
            args.stopwords = open(args.stopwords)
        extractors.append(GeoFeature())
        extractors.append(FormatFeature())
        extractors.append(WordsFeature(samples, args.stopwords))
        # extractors.append(RandomFeature())

    # Read the dataset with the chosen extractors and save the features.
    dataset = Dataset(logging=True)
    dataset.read(args.dataset, extractors)
    dataset.save(args.output)
def classify_images(self, keywords=(), limit=25):
    """
    Worker task to retrieve a list of automatically categorized images from
    Wikimedia Commons from a given list of keywords.

    ``keywords`` must be a non-empty sequence of search terms; the default
    is now an immutable tuple (the original mutable ``[]`` default is a
    classic Python pitfall). Raises AssertionError when empty. Returns a
    dict with ``current``/``total`` progress fields and a ``result`` list.
    """
    if not keywords:
        raise AssertionError

    with app.app_context():

        def supported_extractors():
            # Feature extractors applied to every downloaded image.
            extractors = []
            extractors.append(SizeFeature())
            extractors.append(ColorFeature())
            extractors.append(HistogramFeature())
            extractors.append(GradientFeature())
            extractors.append(FaceFeature(app.config["FACE_CLASSIFIER"]))
            extractors.append(GeoFeature())
            extractors.append(FormatFeature())
            extractors.append(WordsFeature.create_from(app.config["WORDS_CONFIG"]))
            return extractors

        def create_response_entry(label, sample):
            # One result row for the client; title mirrors the image URL.
            return {"thumbnail": sample.thumbnail, "image": sample.url, "label": label, "title": sample.url}

        def create_response(entries):
            return {"current": 100, "total": 100, "result": entries}

        # keep track of progress
        progress_observer = ProgressObserver(self)
        progress_observer.update(5)

        # query dbpedia for related images based on given keywords
        if limit > app.config["QUERY_LIMIT"]:
            limit = app.config["QUERY_LIMIT"]
        searchterm = " ".join(keywords)
        uris = fetch_uris_from_metadata(searchterm, limit, multiple=False)
        progress_observer.update(20)

        # download images and metadata into temp folder with unique task id
        temp_folder = os.path.join(app.config["DOWNLOAD_DIRECTORY"], classify_images.request.id)
        images_and_metadata(uris, temp_folder, False, observer=progress_observer)
        progress_observer.update(80)

        # load dataset, extract features, and normalize with the stored
        # training-set statistics so predictions match the classifier
        dataset = Dataset(logging=True)
        dataset.read(root=temp_folder, extractors=supported_extractors(), unlabeled_data=True)
        # Fixed: close the config file deterministically (was a bare open()).
        with open(app.config["DATASET_CONFIG"]) as config_file:
            dataset_config = json.load(config_file)
        dataset.means = dataset_config["means"]
        dataset.stds = dataset_config["stds"]
        dataset.normalize()
        progress_observer.update(90)

        # predict labels using the trained classifier
        classifier = joblib.load(app.config["WIKIMEDIA_CLASSIFIER"])
        predictions = classifier.predict(dataset.data)
        progress_observer.update(95)

        # build response
        suggestions = []
        for index, sample in enumerate(dataset.samples):
            # .item() replaces np.asscalar, which was removed in NumPy 1.23.
            label = predictions[index].item()
            entry = create_response_entry(label, sample)
            suggestions.append(entry)
        result = create_response(suggestions)

        # cleanup temporary directory
        delete_directory(temp_folder)

        progress_observer.update(100)

        return result
Example #12 (score: 0)
    args = parser.parse_args()

    # Resolve the <dataset> placeholder and check the feature selection.
    args.output = args.output.replace('<dataset>', args.dataset)
    assert args.visual or args.textual, 'Need at least one feature source'

    extractors = []
    if args.visual:
        # Visual extractors operate directly on the image pixels.
        extractors += [SizeFeature(), ColorFeature(),
                       HistogramFeature(), GradientFeature()]
        # extractors.append(BlobFeature())
        # extractors.append(BriefFeature())
        if os.path.isfile(args.trained_faces):
            extractors.append(FaceFeature(args.trained_faces))
        else:
            message = ('Skip face feature since the trained face detector '
                       'was not found at {}.')
            print(message.format(args.trained_faces))
    if args.textual:
        # Textual extractors rely on each image's JSON metadata.
        samples = read_samples(args.dataset)
        if os.path.isfile(args.stopwords):
            args.stopwords = open(args.stopwords)
        extractors += [GeoFeature(), FormatFeature(),
                       WordsFeature(samples, args.stopwords)]
        # extractors.append(RandomFeature())

    # Read the dataset with the chosen extractors and save the features.
    dataset = Dataset(logging=True)
    dataset.read(args.dataset, extractors)
    dataset.save(args.output)
    assert all(len(x) == len(data[0]) for x in data)
    with open(filename, 'w') as csv:
        csv.write(','.join(captions) + '\n')
        for row in range(len(data[0])):
            csv.write(','.join(str(column[row]) for column in data) + '\n')


if __name__ == '__main__':
    # Entry point: evaluate features via per-class statistics, export CSV.
    parser = ArgumentParser(
        description='Measure statistics of features within the images of '
                    'the same class to evaluate features.',
        formatter_class=ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        'features',
        help='Path to the JSON file containing extracted features of the '
             'dataset')
    parser.add_argument(
        '-o', '--output',
        default='<folder>/evaluation.csv',
        help='Filename of the CSV file where p-values will be written to; '
             '<folder> is the directory of the features file')
    args = parser.parse_args()

    # Replace the <folder> placeholder with the features file's directory.
    base_dir = os.path.splitext(os.path.split(args.features)[0])[0]
    args.output = args.output.replace('<folder>', base_dir)

    # Load and normalize before computing the statistics.
    dataset = Dataset()
    dataset.load(args.features)
    dataset.normalize()

    print_chi(dataset)
    print('Write CSV table to', args.output)
    write_chi(args.output, dataset)
    print('Done')