Example No. 1
from taxonomy import Problem
from scales import *
import datetime

date = datetime.date

vision = Problem("Vision", ["agi", "vision", "world-modelling"])

image_comprehension = Problem("Image comprehension", ["agi", "vision", "language", "world-modelling"])
image_classification = Problem("Image classification", ["vision", "agi"])
image_classification.add_subproblem(image_comprehension)
vision.add_subproblem(image_classification)
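
# For orientation: the snippets in this file lean on a small API from
# taxonomy.py. The sketch below is a hypothetical reconstruction from usage
# alone (names, signatures, and storage are assumptions, not the real
# implementation); it is only here to make the calls that follow readable.
class _ProblemSketch:
    def __init__(self, name, attributes=(), solved=False):
        self.name = name
        self.attributes = list(attributes)   # tags like "agi", "vision"
        self.solved = solved
        self.subproblems = []                # narrower problems nest under broader ones

    def add_subproblem(self, problem):
        self.subproblems.append(problem)

    def metric(self, name, url=None, scale=None, target=None, **kwargs):
        # A metric tracks dated measurements against a target (often human level).
        return _MetricSketch(name, url, scale, target, **kwargs)

class _MetricSketch:
    def __init__(self, name, url=None, scale=None, target=None, **kwargs):
        self.name, self.url, self.scale, self.target = name, url, scale, target
        self.notes = ""
        self.measures = []

    def measure(self, measured_on, value, name, url=None, **kwargs):
        # One data point: when, how well, by whom, with a source URL.
        self.measures.append((measured_on, value, name, url, kwargs))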

imagenet = image_classification.metric("Imagenet Image Recognition", "http://image-net.org", scale=error_rate, target=0.051)
imagenet.notes = """
Correctly label images from the Imagenet dataset. As of 2016, this includes:
 - Object localization for 1000 categories.
 - Object detection for 200 fully labeled categories.
 - Object detection from video for 30 fully labeled categories.
 - Scene classification for 365 scene categories (Joint with MIT Places team) on Places2 Database http://places2.csail.mit.edu.
 - Scene parsing for 150 stuff and discrete object categories (Joint with MIT Places team).
"""
imagenet.measure(date(2010,8,31), 0.28191, "NEC UIUC", "http://image-net.org/challenges/LSVRC/2010/results")
imagenet.measure(date(2011,10,26), 0.2577, "XRCE","http://image-net.org/challenges/LSVRC/2011/results")
imagenet.measure(date(2012,10,13), 0.16422, "SuperVision", "http://image-net.org/challenges/LSVRC/2012/results.html")
imagenet.measure(date(2013,11,14), 0.11743, "Clarifai","http://www.image-net.org/challenges/LSVRC/2013/results.php")
imagenet.measure(date(2014,8,18), 0.07405, "VGG", "http://image-net.org/challenges/LSVRC/2014/index")
imagenet.measure(date(2015,12,10), 0.03567, "MSRA", "http://image-net.org/challenges/LSVRC/2015/results", algorithms=["residual-networks"])
imagenet.measure(date(2016,9,26), 0.02991, "Trimps-Soushen", "http://image-net.org/challenges/LSVRC/2016/results")
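
# A quick sanity check against the target above (0.051, commonly cited as an
# estimate of human top-5 error on Imagenet): find the first measurement at or
# below it. Self-contained illustration; the (date, error) pairs are copied
# from the measure() calls above rather than read back from the metric object.
_imagenet_points = [
    (date(2010, 8, 31), 0.28191), (date(2011, 10, 26), 0.2577),
    (date(2012, 10, 13), 0.16422), (date(2013, 11, 14), 0.11743),
    (date(2014, 8, 18), 0.07405), (date(2015, 12, 10), 0.03567),
    (date(2016, 9, 26), 0.02991),
]
_first_below_target = next((d for d, err in _imagenet_points if err <= 0.051), None)
# -> date(2015, 12, 10): the residual-networks entry was the first below target.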

# Test automatic detection of withdrawn papers
Example No. 2
# We need to match/double-check the release date of the specific cmix version that achieved this performance:
# hp_compression.measure(date(2014,4,13), 1.245, "cmix", "http://www.byronknoll.com/cmix.html")
hp_compression.measure(date(2013,8,4), 1.67, "RNN, LSTM", "https://arxiv.org/abs/1308.0850")
hp_compression.measure(date(2011,6,28), 1.60, "RNN", "http://www.cs.utoronto.ca/~ilya/pubs/2011/LANG-RNN.pdf")
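
# Context for the hp_compression numbers above: this metric appears to be
# scored in bits per character on the Hutter Prize text, so lower is better.
# A model's per-character cross-entropy converts directly to this scale; a
# hedged helper (illustration only, not part of the original file):
import math

def nats_to_bits_per_char(cross_entropy_nats):
    """Per-character cross-entropy in nats -> bits per character."""
    return cross_entropy_nats / math.log(2)

# e.g. a cross-entropy of ~1.109 nats/char is ~1.60 bits/char, matching the
# 2011 RNN result above.
assert abs(nats_to_bits_per_char(1.109) - 1.60) < 0.01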

lambada = modelling_english.metric("LAMBADA prediction of words in discourse", url="https://arxiv.org/abs/1606.06031",
                                   scale=correct_percent, target=86, target_source="https://arxiv.org/abs/1610.08431v3")
lambada.measure(None, 21.7, "Stanford Reader", url="https://arxiv.org/abs/1610.08431v3", algorithm_src_url="https://arxiv.org/abs/1606.02858")
lambada.measure(None, 32.1, "Modified Stanford", url="https://arxiv.org/abs/1610.08431v3", algorithm_src_url="https://arxiv.org/abs/1606.02858")
lambada.measure(None, 49.0, "GA + feat.", url="https://arxiv.org/abs/1610.08431v3", algorithm_src_url="https://arxiv.org/abs/1606.01549v2")
lambada.measure(None, 44.5, "AS + feat.", url="https://arxiv.org/abs/1610.08431v3", algorithm_src_url="https://arxiv.org/abs/1603.01547")
lambada.measure(None, 51.6, "GA+MAGE (48)", url="https://arxiv.org/abs/1703.02620v1")

turing_test = Problem("Conduct arbitrary sustained, probing conversation", ["agi", "language", "world-modelling", "communication"])
easy_turing_test = Problem("Turing test for casual conversation", ["agi", "language", "world-modelling", "communication"])
turing_test.add_subproblem(easy_turing_test)

loebner = easy_turing_test.metric("The Loebner Prize scored selection answers", url="http://www.aisb.org.uk/events/loebner-prize",
                                  scale=correct_percent, changeable=True, target=100, target_label="Completely plausible answers",
                                  axis_label='Percentage of answers rated plausible\n(each year is a different test)')
# XXX humans probably don't get 100% on the Loebner Prize selection questions; we should ask the organizers to score
# some humans


loebner.notes = """
The Loebner Prize is an actual enactment of the Turing Test. Importantly, judges are instructed to engage in casual, natural
conversation rather than deliberately probing to determine if participants are "intelligent" (Brian Christian, The Most Human Human).
This makes it considerably easier than a probing Turing Test, and it is close to being solved. 

However, these aren't scores for the full Loebner Turing Test; since 2014 the Loebner Prize has scored its entrants by
giving them a corpus of conversation and scoring their answers. We use these numbers because they remove variability
Example No. 3
hp_compression.measure(date(2011,6,28), 1.60, "RNN", "http://www.cs.utoronto.ca/~ilya/pubs/2011/LANG-RNN.pdf")

hp_compression.measure(None, 1.42, "RHN", "https://arxiv.org/abs/1607.03474v2")
hp_compression.measure(None, 1.27, "Large RHN depth 10", "https://arxiv.org/abs/1607.03474v4")

lambada = modelling_english.metric("LAMBADA prediction of words in discourse", url="https://arxiv.org/abs/1606.06031",
                                   scale=correct_percent, target=86, target_source="https://arxiv.org/abs/1610.08431v3")
lambada.measure(None, 21.7, "Stanford Reader", url="https://arxiv.org/abs/1610.08431v3", algorithm_src_url="https://arxiv.org/abs/1606.02858")
lambada.measure(None, 32.1, "Modified Stanford", url="https://arxiv.org/abs/1610.08431v3", algorithm_src_url="https://arxiv.org/abs/1606.02858")
lambada.measure(None, 49.0, "GA + feat.", url="https://arxiv.org/abs/1610.08431v3", algorithm_src_url="https://arxiv.org/abs/1606.01549v2")
lambada.measure(None, 44.5, "AS + feat.", url="https://arxiv.org/abs/1610.08431v3", algorithm_src_url="https://arxiv.org/abs/1603.01547")
lambada.measure(None, 51.6, "GA+MAGE (48)", url="https://arxiv.org/abs/1703.02620v1")

turing_test = Problem("Conduct arbitrary sustained, probing conversation", ["agi", "language", "world-modelling", "communication"])
easy_turing_test = Problem("Turing test for casual conversation", ["agi", "language", "world-modelling", "communication"])
turing_test.add_subproblem(easy_turing_test)

loebner = easy_turing_test.metric("The Loebner Prize scored selection answers", url="http://www.aisb.org.uk/events/loebner-prize",
                                  scale=correct_percent, changeable=True, target=100, target_label="Completely plausible answers",
                                  axis_label='Percentage of answers rated plausible\n(each year is a different test)')
# XXX humans probably don't get 100% on the Loebner Prize selection questions; we should ask the organizers to score
# some humans


loebner.notes = """
The Loebner Prize is an actual enactment of the Turing Test. Importantly, judges are instructed to engage in casual, natural
conversation rather than deliberately probing to determine if participants are "intelligent" (Brian Christian, The Most Human Human).
This makes it considerably easier than a probing Turing Test, and it is close to being solved. 

However, these aren't scores for the full Loebner Turing Test; since 2014 the Loebner Prize has scored its entrants by
giving them a corpus of conversation and scoring their answers. We use these numbers because they remove variability
Example No. 4
"""
* * *
**Generative models of CIFAR-10 Natural Images** [Year: bits-per-subpixel, method]. Compiled by Durk Kingma.

**Why we care:**
(1) The compression=prediction=understanding=intelligence view (see the Hutter Prize, etc.). (Note that perplexity, log-likelihood, and #bits are all equivalent measurements.)
(2) Learning a generative model is a prominent auxiliary task towards semi-supervised learning. Current SOTA semi-supervised classification results utilize generative models.
(3) You're finding the patterns in the data that let you compress it more efficiently. This is the ultimate pattern-recognition benchmark, because you're trying to find all the patterns in the data.

"""

image_generation = Problem("Drawing pictures", ["vision", "agi"])
# note: this section is not on scene generation, but making the distinction seemed like a good idea.
scene_generation = Problem(
    "Be able to generate complex scene e.g. a baboon receiving their degree at convocatoin.",
    ["vision", "world-modelling", "agi"])
scene_generation.add_subproblem(image_generation)

# NOTE: scale and target need to be checked
image_generation_metric = image_generation.metric(
    "Generative models of CIFAR-10 images",
    scale=bits_per_x,
    axis_label="Model entropy (bits per pixel)")

image_generation_metric.measure(date(2014, 10, 30), 4.48, "NICE",
                                "https://arxiv.org/abs/1410.8516")
image_generation_metric.measure(date(2015, 2, 16), 4.13, "DRAW",
                                "https://arxiv.org/abs/1502.04623")
image_generation_metric.measure(date(2016, 5, 27), 3.49, "Real NVP",
                                "https://arxiv.org/abs/1605.08803")
image_generation_metric.measure(
    date(2016, 6, 15), 3.11, "VAE with IAF",
    "https://papers.nips.cc/paper/6581-improved-variational-inference-with-inverse-autoregressive-flow")
Example No. 5
from taxonomy import Problem
from scales import *
import datetime
date = datetime.date

abstract_strategy_games = Problem("Abstract strategy games", ["agi", "abstract-games"])

playing_with_hints = Problem("Playing abstract games with extensive hints", ["abstract-games"], solved=True)
abstract_strategy_games.add_subproblem(playing_with_hints)
playing_with_hints.notes = """
  Complex abstract strategy games have been solved to super-human levels
  by computer systems with extensive rule-hinting and heuristics,
  in some cases combined with machine learning techniques.
"""
computer_chess = playing_with_hints.metric("Computer Chess", scale=elo, target=2882, target_label="Best human play", target_source="https://en.wikipedia.org/w/index.php?title=Comparison_of_top_chess_players_throughout_history&oldid=777500496#Elo_system")
computer_go = playing_with_hints.metric("Computer Go", scale=elo, target=3632, target_label="Best human play", target_source="https://www.goratings.org/en/history/")
computer_go.solved = True # until we get proper data

# For some caveats, see https://en.wikipedia.org/w/index.php?title=Chess_engine&oldid=764341963#Ratings
# We could script ingestion of data from CCRL, or get data from Katja
computer_chess.measure(date(1997,5,11), 2725, "Deep Blue", uncertainty=25, url="https://www.quora.com/What-was-Deep-Blues-Elo-rating")
computer_chess.measure(date(2006,5,27), 2995, "Rybka 1.1 64bit", uncertainty=25, url="https://web.archive.org/web/20060531091049/http://www.computerchess.org.uk/ccrl/4040/rating_list_all.html")
computer_chess.measure(date(2010,8,7), 3269, "Rybka 4 64bit", uncertainty=22, url="https://web.archive.org/web/20100923131123/http://www.computerchess.org.uk/ccrl/4040/rating_list_all.html")
computer_chess.measure(date(2013,7,20), 3248, "Houdini 3 64bit", uncertainty=16, url="https://web.archive.org/web/20130415000000*/http://www.computerchess.org.uk/ccrl/4040/rating_list_all.html")
computer_chess.measure(date(2015,7,4), 3332, "Komodo 9", uncertainty=24, url="https://web.archive.org/web/20150708104805/http://www.computerchess.org.uk/ccrl/4040/rating_list_all.html")
computer_chess.measure(date(2017,2,27), 3393, "Stockfish", uncertainty=50, url="https://web.archive.org/web/20170227044521/http://www.computerchess.org.uk/ccrl/4040/")
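
# For reading the Elo figures above: a rating gap maps to an expected score via
# the standard logistic Elo formula (general rating math, not project code):
def elo_expected_score(rating_a, rating_b):
    """Expected score (0..1) for player A against player B."""
    return 1.0 / (1.0 + 10 ** ((rating_b - rating_a) / 400.0))

# e.g. 2017 Stockfish (~3393) against the best-human target of 2882 has an
# expected score of about 0.95 -- roughly 19 points out of every 20.
# elo_expected_score(3393, 2882) ≈ 0.9499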
# Wikipedia has some nice data here:
computer_chess.measure(date(1984,12,31), 1631, "Novag Super Constellation 6502 4 MHz", url="https://en.wikipedia.org/wiki/Swedish_Chess_Computer_Association#Rating_list_year-end_leaders")
computer_chess.measure(date(1985,12,31), 1827, "Mephisto Amsterdam 68000 12 MHz", url="https://en.wikipedia.org/wiki/Swedish_Chess_Computer_Association#Rating_list_year-end_leaders")
computer_chess.measure(date(1986,12,31), 1827, "Mephisto Amsterdam 68000 12 MHz", url="https://en.wikipedia.org/wiki/Swedish_Chess_Computer_Association#Rating_list_year-end_leaders")
computer_chess.measure(date(1987,12,31), 1923, "Mephisto Dallas 68020 14 MHz", url="https://en.wikipedia.org/wiki/Swedish_Chess_Computer_Association#Rating_list_year-end_leaders")
Example No. 6
from taxonomy import Problem
from scales import *
import datetime
date = datetime.date

read_stem_papers = Problem(
    "Read a scientific or technical paper, and comprehend its contents",
    ["language", "world-modelling", "super"])

# Getting some major results from an abstract, tables, or conclusion is much easier than understanding the entire paper, its assumptions, robustness, support for its claims, etc.
extract_results = Problem(
    "Extract major numerical results or progress claims from a STEM paper",
    ["language", "world-modelling", "agi"])
read_stem_papers.add_subproblem(extract_results)

extract_results.metric("Automatically find new relevant ML results on arXiv")
extract_results.notes = """
This metric is the ability to automatically update the IPython notebook you are reading by spotting results in PDFs uploaded to arxiv.org.
Pull requests demonstrating solutions are welcome :)
"""

solve_technical_problems = Problem(
    "Given an arbitrary technical problem, solve it as well as a typical professional in that field",
    ["language", "world-modelling"])

program_induction = Problem("Writing software from specifications")
solve_technical_problems.add_subproblem(program_induction)
program_induction.metric("Card2Code",
                         url="https://github.com/deepmind/card2code",
                         scale=correct_percent)
Example No. 7
from taxonomy import Problem
from scales import *
import datetime
date = datetime.date

""" 
* * *
**Generative models of CIFAR-10 Natural Images** [Year: bits-per-subpixel, method]. Compiled by Durk Kingma.

**Why we care:**
(1) The compression=prediction=understanding=intelligence view (see the Hutter Prize, etc.). (Note that perplexity, log-likelihood, and #bits are all equivalent measurements.)
(2) Learning a generative model is a prominent auxiliary task towards semi-supervised learning. Current SOTA semi-supervised classification results utilize generative models.
(3) You're finding the patterns in the data that let you compress it more efficiently. This is the ultimate pattern-recognition benchmark, because you're trying to find all the patterns in the data.

"""

image_generation = Problem("Drawing pictures", ["vision", "agi"])
# note: this section is not on scene generation, but making the distinction seemed like a good idea.
scene_generation = Problem("Be able to generate complex scene e.g. a baboon receiving their degree at convocatoin.", ["vision", "world-modelling", "agi"])
scene_generation.add_subproblem(image_generation)

# NOTE: scale and target need to be checked
image_generation_metric = image_generation.metric("Generative models of CIFAR-10 images", scale=bits_per_x, axis_label="Model entropy (bits per pixel)")

image_generation_metric.measure(date(2014,10,30), 4.48, "NICE", "https://arxiv.org/abs/1410.8516")
image_generation_metric.measure(date(2015,2,16), 4.13, "DRAW", "https://arxiv.org/abs/1502.04623")
image_generation_metric.measure(date(2016,5,27), 3.49, "Real NVP", "https://arxiv.org/abs/1605.08803")
image_generation_metric.measure(date(2016,6,15), 3.11, "VAE with IAF", "https://papers.nips.cc/paper/6581-improved-variational-inference-with-inverse-autoregressive-flow")
image_generation_metric.measure(date(2016,5,27), 3.0, "PixelRNN", "https://arxiv.org/abs/1605.08803")
image_generation_metric.measure(date(2016,11,4), 2.92, "PixelCNN++","https://openreview.net/forum?id=BJrFC6ceg", replicated="https://github.com/openai/pixel-cnn")
Example No. 8
from taxonomy import Problem
from scales import *
import datetime

date = datetime.date

vision = Problem("Vision", ["agi", "vision", "world-modelling"])

# note that there is also a lot of vision data in awty.py, which was
# originally created by the Are We There Yet? scraper. Probably it should just
# be merged into this file...

image_comprehension = Problem("Image comprehension", ["agi", "vision", "language", "world-modelling"])
image_classification = Problem("Image classification", ["vision", "agi"])
image_classification.add_subproblem(image_comprehension)
vision.add_subproblem(image_classification)

imagenet = image_classification.metric("Imagenet Image Recognition", "http://image-net.org", scale=error_rate, target=0.051)
imagenet.notes = """
Correctly label images from the Imagenet dataset. As of 2016, this includes:
 - Object localization for 1000 categories.
 - Object detection for 200 fully labeled categories.
 - Object detection from video for 30 fully labeled categories.
 - Scene classification for 365 scene categories (Joint with MIT Places team) on Places2 Database http://places2.csail.mit.edu.
 - Scene parsing for 150 stuff and discrete object categories (Joint with MIT Places team).
"""
imagenet.measure(date(2010,8,31), 0.28191, "NEC UIUC", "http://image-net.org/challenges/LSVRC/2010/results")
imagenet.measure(date(2011,10,26), 0.2577, "XRCE","http://image-net.org/challenges/LSVRC/2011/results")
imagenet.measure(date(2012,10,13), 0.16422, "AlexNet / SuperVision",
                 "http://image-net.org/challenges/LSVRC/2012/results.html",
                 algorithm_src_url="https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks")
imagenet.measure(date(2013,11,14), 0.11743, "Clarifai","http://www.image-net.org/challenges/LSVRC/2013/results.php")
Example No. 9
computer_games = Problem(
    "Play real-time computer & video games",
    ["world-modelling", "realtime-games", "agi", "language"])

games_requiring_novel_language = Problem(
    "Games that require inventing novel language, forms of speech, or communication"
)
games_requiring_speech = Problem(
    "Games that require both understanding and speaking a language")
games_requiring_speech.metric("Starcraft")

games_requiring_language_comprehension = Problem(
    "Games that require language comprehension", ["agi", "languge"])

computer_games.add_subproblem(games_requiring_novel_language)
games_requiring_novel_language.add_subproblem(games_requiring_speech)
games_requiring_speech.add_subproblem(games_requiring_language_comprehension)

# Atari 2600 Games: Breakout, Enduro, Pong, Q*Bert, Seaquest, S. Invaders. Each game has its own metric.
# We previously used data hand-compiled by Yomna Nasser and Miles Brundage; this is
# now mostly obsolete, and the data is scraped in scrapers/atari.py

simple_games = Problem("Simple video games",
                       ["world-modelling", "realtime-games", "agi"])
computer_games.add_subproblem(simple_games)

# Alien
alien_metric = simple_games.metric(
    "Atari 2600 Alien",
    scale=atari_linear,
    target=6875,
    target_source="https://www.semanticscholar.org/paper/Human-level-control-through-deep-reinforcement-Mnih-Kavukcuoglu/340f48901f72278f6bf78a04ee5b01df208cc508")
Example No. 10
from taxonomy import Problem
from scales import *
import datetime
date = datetime.date

computer_games = Problem("Play real-time computer & video games", ["world-modelling", "realtime-games", "agi", "language"])

games_requiring_novel_language = Problem("Games that require inventing novel language, forms of speech, or communication")
games_requiring_speech = Problem("Games that require both understanding and speaking a language")
games_requiring_speech.metric("Starcraft")

games_requiring_language_comprehension = Problem("Games that require language comprehension", ["agi", "language"])

computer_games.add_subproblem(games_requiring_novel_language)
games_requiring_novel_language.add_subproblem(games_requiring_speech)
games_requiring_speech.add_subproblem(games_requiring_language_comprehension)


# Atari 2600 Games: Breakout, Enduro, Pong, Q*Bert, Seaquest, S. Invaders. Each game has its own metric.
# We previously used data hand-compiled by Yomna Nasser and Miles Brundage; this is
# now mostly obsolete, and the data is scraped in scrapers/atari.py

simple_games = Problem("Simple video games", ["world-modelling", "realtime-games", "agi"]) 
computer_games.add_subproblem(simple_games)

# Alien
alien_metric = simple_games.metric("Atari 2600 Alien", scale=atari_linear, target=6875, target_source="https://www.semanticscholar.org/paper/Human-level-control-through-deep-reinforcement-Mnih-Kavukcuoglu/340f48901f72278f6bf78a04ee5b01df208cc508")
# alien_metric.measure(date(2015, 2, 26), 3069, "DQN", "https://www.semanticscholar.org/paper/Human-level-control-through-deep-reinforcement-Mnih-Kavukcuoglu/340f48901f72278f6bf78a04ee5b01df208cc508")
# alien_metric.measure(date(2015,11,20), 1620, "DQN","https://arxiv.org/abs/1511.06581v1")
# alien_metric.measure(date(2015,11,20), 3747.7, "DDQN","https://arxiv.org/abs/1511.06581v1")
# alien_metric.measure(date(2015,11,20), 4461.4, "Duel","https://arxiv.org/abs/1511.06581v1")
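
# Context for the atari_linear scale and target above: 6875 is the professional
# human tester's Alien score from the DQN paper. Atari results are commonly
# compared via human-normalized score; a hedged illustration (the random-play
# baseline of 227.8 for Alien is our reading of the DQN paper's tables, so
# treat it as an assumption):
def human_normalized_score(agent, human, random_play):
    """0.0 = random play, 1.0 = human level, >1.0 = superhuman."""
    return (agent - random_play) / (human - random_play)

# e.g. the commented-out DQN score of 3069 above comes out at roughly 0.43 of
# human level: human_normalized_score(3069, 6875, 227.8) ≈ 0.43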
Example No. 11
from taxonomy import Problem
from scales import *
import datetime
date = datetime.date

read_stem_papers = Problem("Read a scientific or technical paper, and comprehend its contents", ["language", "world-modelling", "super"])

# Getting some major results from an abstract, tables, or conclusion is much easier than understanding the entire paper, its assumptions, robustness, support for its claims, etc.
extract_results = Problem("Extract major numerical results or progress claims from a STEM paper", ["language", "world-modelling", "agi"])
read_stem_papers.add_subproblem(extract_results)

extract_results.metric("Automatically find new relevant ML results on arXiv")
extract_results.notes = """
This metric is the ability to automatically update the IPython notebook you are reading by spotting results in PDFs uploaded to arxiv.org.
Pull requests demonstrating solutions are welcome :)
"""

solve_technical_problems = Problem("Given an arbitrary technical problem, solve it as well as a typical professional in that field", ["language", "world-modelling"])

program_induction = Problem("Writing software from specifications")
solve_technical_problems.add_subproblem(program_induction)
program_induction.metric("Card2Code", url="https://github.com/deepmind/card2code", scale=correct_percent)

vaguely_constrained_technical_problems = Problem("Solve vaguely or under-constrained technical problems")
solve_technical_problems.add_subproblem(vaguely_constrained_technical_problems)

# This subset of technical problems is much easier; here we assume that a human / worldly problem has been reduced to something that can be
# subjected to clear computational evaluation ("is this purported proof of theorem X correct?", "does this circuit perform task Y efficiently?",
# "will this airframe fly with reasonable characteristics?")
solve_constrained_technical_problems = Problem("Solve technical problems with clear constraints (proofs, circuit design, aerofoil design, etc)")
solve_technical_problems.add_subproblem(solve_constrained_technical_problems)