Example #1
import redis

def _get_stories():
    # Build a Redis client from the project configuration (get_environment and
    # the REDIS_* keys are defined elsewhere in this project) and hand it to
    # get_stories.
    environment = get_environment()
    r = redis.StrictRedis(
        host=environment[REDIS_HOST],
        port=environment[REDIS_PORT],
        password=environment[REDIS_PASS]
    )
    return get_stories(r)
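A minimal sketch of what get_environment and the REDIS_* keys might look like, assuming they are plain environment-variable lookups; the names and defaults below are assumptions, not this project's actual definitions.

import os

# Hypothetical configuration keys -- the real project defines its own.
REDIS_HOST = 'REDIS_HOST'
REDIS_PORT = 'REDIS_PORT'
REDIS_PASS = 'REDIS_PASS'

def get_environment():
    # Read Redis connection settings from the process environment, falling back
    # to local-development defaults.
    return {
        REDIS_HOST: os.environ.get(REDIS_HOST, 'localhost'),
        REDIS_PORT: int(os.environ.get(REDIS_PORT, 6379)),
        REDIS_PASS: os.environ.get(REDIS_PASS, None),
    }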
Example #2
def test_get_stories(fake_redis_store):
    expected_response = [
        {HACKER_NEWS_ID: 8712349,
         BODY: {TEXT: 'one\ntwo\nthree', SENTENCES: ['one', 'two', 'three']},
         URL: 'http://totalurl.com',
         TITLE: 'I am a title!',
         DATE_FOUND: '2014-12-08 02:04:36.143372'},
        {HACKER_NEWS_ID: 8712417,
         BODY: {TEXT: 'four\nfive\nsix',
                SENTENCES: ['four', 'five', 'six']},
         URL: 'http://real.com',
         TITLE: 'I am a second title!',
         DATE_FOUND: '2014-12-08 02:05:18.078519'},
        {HACKER_NEWS_ID: 8712277, BODY: {TEXT: 'seven', SENTENCES: []},
         URL: 'http://imathingy.com',
         TITLE: 'I am a third title!',
         DATE_FOUND: '2014-12-08 02:05:28.326434'}
    ]
    assert get_stories(fake_redis_store) == expected_response
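The fake_redis_store fixture is not shown on this page. A plausible sketch, assuming get_stories only needs a dict-like store whose values are JSON-encoded bytes; the TOP_STORIES_KEY name and the per-id key scheme are assumptions:

import json

import pytest

# Hypothetical key constant -- the project defines its own.
TOP_STORIES_KEY = 'top_stories'

@pytest.fixture
def fake_redis_store():
    # A plain dict standing in for the Redis client; values are JSON-encoded
    # bytes, mirroring what redis-py returns for GET.
    story_ids = [8712349, 8712417, 8712277]
    store = {TOP_STORIES_KEY: json.dumps(story_ids).encode('utf-8')}
    for story_id in story_ids:
        store[str(story_id)] = json.dumps({'id': story_id}).encode('utf-8')
    return store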
Example #3
from functools import reduce
from os import listdir
from os.path import isfile, join
import tarfile

print('Reading babi files')
tar = tarfile.open('data/babi/babi-tasks-v1-2.tar.gz')

mypath = 'data/babi/tasks_1-20_v1-2/en-10k'
challenge_files = [f for f in listdir(mypath) if isfile(join(mypath, f))]
challenge_files = ['tasks_1-20_v1-2/en-10k/' + f.replace('train', '{}') for f 
                   in challenge_files if 'train.txt' == f[-9:]]

# Read all files
train_facts_split = []
test_facts_split = []
train_facts = []
test_facts = []
for challenge in challenge_files:
    train_facts_split.append(get_stories(tar.extractfile(challenge.format('train'))))
    test_facts_split.append(get_stories(tar.extractfile(challenge.format('test'))))
    train_facts += train_facts_split[-1]
    test_facts += test_facts_split[-1]

print('Processing input data')
# Flatten each story's list of tokenized facts into a single token sequence.
train_stories = [(reduce(lambda x, y: x + y, map(list, fact)), q, a)
                 for fact, q, a in train_facts]
test_stories = [(reduce(lambda x, y: x + y, map(list, fact)), q, a)
                for fact, q, a in test_facts]

vocab = sorted(reduce(lambda x, y: x | y,
                      (set(story + q + [answer])
                       for story, q, answer in train_stories + test_stories)))
# Reserve 0 for masking via pad_sequences
vocab_size = len(vocab) + 1

# Length of the longest single fact (sentence) across all stories.
facts_maxlen = max(map(len, (x for h, _, _ in train_facts + test_facts for x in h)))
if enable_time:  # enable_time is a flag set earlier in the original script
    facts_maxlen += 1
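The get_stories variant used here is not shown; judging by how the comprehensions above flatten it, it returns one (facts, question, answer) tuple per question, with the facts kept as separate tokenized sentences. A made-up illustration of that shape:

# Shape assumed by the code above (tokens are illustrative, not real data):
example_story = (
    [['Mary', 'moved', 'to', 'the', 'bathroom', '.'],   # fact 1
     ['John', 'went', 'to', 'the', 'hallway', '.']],    # fact 2
    ['Where', 'is', 'Mary', '?'],                        # question
    'bathroom'                                           # answer
)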
Example #4
def test_failed_to_find_story():
    # Perhaps I should do something more than print....
    unfilled_redis = {TOP_STORIES_KEY: json.dumps([1]).encode('utf-8')}
    with patch('builtins.print') as print_fn:
        get_stories(unfilled_redis)
        print_fn.assert_called_with(FAILED_TO_FIND.format(1))
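A minimal sketch of the dict-backed lookup the tests above exercise, assuming get_stories reads the id list stored under TOP_STORIES_KEY and prints FAILED_TO_FIND for any id with no stored payload; the key scheme and message format are assumptions:

import json

TOP_STORIES_KEY = 'top_stories'             # assumed key name
FAILED_TO_FIND = 'Failed to find story {}'  # assumed message format

def get_stories(store):
    # `store` may be a redis client or a plain dict -- both expose .get().
    top = store.get(TOP_STORIES_KEY)
    story_ids = json.loads(top.decode('utf-8')) if top else []
    stories = []
    for story_id in story_ids:
        payload = store.get(str(story_id))
        if payload is None:
            print(FAILED_TO_FIND.format(story_id))
            continue
        stories.append(json.loads(payload.decode('utf-8')))
    return stories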
Example #5

file_names = ['facts_list_all.txt']

training_set = []
test_set = []
data = []

for file_name in file_names:
    print('Reading {0} .....'.format(file_name))
    read_file = open('data/{0}'.format(file_name), 'r')
    # Data in format:
    # [([[fact1], [fact2], ...], [answer]), ...]
    # where each fact is a list of words
    
    data += get_stories(read_file)

    read_file.close()


# Data augmentation strategies
# 1. Randomly replacing nouns with synonyms
print('Data augmentation: synonyms')
data_dict = synonyms(data)
print('Number of diseases: {0}'.format(len(data_dict)))
# 2. Randomly removing facts
print('Data augmentation: removing')
data_dict = remove(data, data_dict)
# 3. Changing the order of facts
print('Data augmentation: permutation')
data_dict = permute(data, data_dict)
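The synonyms, remove, and permute helpers are defined elsewhere in the original project. A minimal sketch of the permutation step, assuming data_dict maps a key (e.g. a disease) to a list of (facts, answer) pairs; the signature is kept but the structure is an assumption:

import random

def permute(data, data_dict, copies=2):
    # For each stored example, add `copies` variants with the fact order
    # shuffled and the answer unchanged (`data` is unused in this sketch;
    # the real helper may consult it).
    for examples in data_dict.values():
        shuffled_examples = []
        for facts, answer in examples:
            for _ in range(copies):
                facts_copy = list(facts)
                random.shuffle(facts_copy)
                shuffled_examples.append((facts_copy, answer))
        examples.extend(shuffled_examples)
    return data_dict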
Example #6
from os import listdir
from os.path import isfile, join

import numpy as np

# Reading all file names
mypath = 'tasks_1-20_v1-2/en'
challenge_files = [f for f in listdir(mypath) if isfile(join(mypath, f))]
challenge_files = [
    'tasks_1-20_v1-2/en-10k/' + f.replace('train', '{}')
    for f in challenge_files if 'train.txt' == f[-9:]
]

# Read all files
train_facts_split = []
test_facts_split = []
train_facts = []
test_facts = []
for challenge in challenge_files:
    train_facts_split.append(
        get_stories(tar.extractfile(challenge.format('train'))))
    test_facts_split.append(
        get_stories(tar.extractfile(challenge.format('test'))))
    train_facts += train_facts_split[-1]
    test_facts += test_facts_split[-1]

# Shuffle both splits; drawing all indices without replacement is a permutation.
test_facts = np.array(test_facts)
train_facts = np.array(train_facts)
test_facts = list(test_facts[np.random.choice(len(test_facts),
                                              len(test_facts),
                                              replace=False)])
train_facts = list(train_facts[np.random.choice(len(train_facts),
                                                len(train_facts),
                                                replace=False)])
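# Note: random.shuffle(train_facts) / random.shuffle(test_facts) would do the
# same job while keeping the (facts, question, answer) tuples as plain Python
# objects instead of routing them through an object ndarray; the version above
# is the original author's choice.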

# Parameters definition
Example #7
    parser.add_argument('-n',
                        '--num_words',
                        required=False,
                        type=int,
                        help='number of words to generate')
    args = vars(parser.parse_args())
    return args


###
# main function
if __name__ == '__main__':
    # parse arguments
    args = parse_args()

    train = utils.get_stories(
        open('tasks_1-20_v1-2/en/qa1_single-supporting-fact_train.txt', 'r'))
    test = utils.get_stories(
        open('tasks_1-20_v1-2/en/qa1_single-supporting-fact_test.txt', 'r'))

    vocab = set()
    for story, q, answer in train + test:
        vocab |= set(story + q + [answer])
    vocab = sorted(vocab)

    # Reserve 0 for masking via pad_sequences
    vocab_size = len(vocab) + 1
    word_idx = dict((c, i + 1) for i, c in enumerate(vocab))
    story_maxlen = max(map(len, (x for x, _, _ in train + test)))
    query_maxlen = max(map(len, (x for _, x, _ in train + test)))

    x, xq, y = utils.vectorize_stories(train, word_idx, story_maxlen,
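The snippet breaks off inside the utils.vectorize_stories call, and that helper is not shown on this page. A sketch of the classic Keras bAbI vectorization it presumably follows; the exact signature and padding behaviour are assumptions:

import numpy as np
from keras.preprocessing.sequence import pad_sequences

def vectorize_stories(data, word_idx, story_maxlen, query_maxlen):
    # Map tokens to integer ids, pad stories/queries to fixed lengths, and turn
    # each answer into a one-hot vector (index 0 stays reserved for padding).
    xs, xqs, ys = [], [], []
    for story, query, answer in data:
        xs.append([word_idx[w] for w in story])
        xqs.append([word_idx[w] for w in query])
        y = np.zeros(len(word_idx) + 1)
        y[word_idx[answer]] = 1
        ys.append(y)
    return (pad_sequences(xs, maxlen=story_maxlen),
            pad_sequences(xqs, maxlen=query_maxlen),
            np.array(ys))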