def _get_stories():
    environment = get_environment()
    r = redis.StrictRedis(
        host=environment[REDIS_HOST],
        port=environment[REDIS_PORT],
        password=environment[REDIS_PASS]
    )
    return get_stories(r)
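# get_environment() and the REDIS_HOST / REDIS_PORT / REDIS_PASS constants are not
# defined in this snippet. A minimal sketch, assuming the constants are plain string
# keys and the connection settings come from process environment variables; the
# helper below is hypothetical, not the project's actual implementation.
import os

REDIS_HOST = 'REDIS_HOST'
REDIS_PORT = 'REDIS_PORT'
REDIS_PASS = 'REDIS_PASS'

def get_environment():
    # Hypothetical helper: collect Redis connection settings from os.environ.
    return {
        REDIS_HOST: os.environ.get(REDIS_HOST, 'localhost'),
        REDIS_PORT: int(os.environ.get(REDIS_PORT, 6379)),
        REDIS_PASS: os.environ.get(REDIS_PASS),
    }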
def test_get_stories(fake_redis_store):
    expected_response = [
        {HACKER_NEWS_ID: 8712349,
         BODY: {TEXT: 'one\ntwo\nthree', SENTENCES: ['one', 'two', 'three']},
         URL: 'http://totalurl.com',
         TITLE: 'I am a title!',
         DATE_FOUND: '2014-12-08 02:04:36.143372'},
        {HACKER_NEWS_ID: 8712417,
         BODY: {TEXT: 'four\nfive\nsix', SENTENCES: ['four', 'five', 'six']},
         URL: 'http://real.com',
         TITLE: 'I am a second title!',
         DATE_FOUND: '2014-12-08 02:05:18.078519'},
        {HACKER_NEWS_ID: 8712277,
         BODY: {TEXT: 'seven', SENTENCES: []},
         URL: 'http://imathingy.com',
         TITLE: 'I am a third title!',
         DATE_FOUND: '2014-12-08 02:05:28.326434'}
    ]
    assert get_stories(fake_redis_store) == expected_response
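# The fake_redis_store fixture itself is not shown. A minimal sketch of a shape
# that could back the test above, assuming (as in the plain-dict test further
# down) that get_stories() reads the store like a mapping: a JSON list of ids
# under TOP_STORIES_KEY and one JSON-encoded story per id. The per-id key format
# is hypothetical, and only the first story is spelled out here.
import json
import pytest

@pytest.fixture
def fake_redis_store():
    first_story = {
        HACKER_NEWS_ID: 8712349,
        BODY: {TEXT: 'one\ntwo\nthree', SENTENCES: ['one', 'two', 'three']},
        URL: 'http://totalurl.com',
        TITLE: 'I am a title!',
        DATE_FOUND: '2014-12-08 02:04:36.143372',
    }
    return {
        TOP_STORIES_KEY: json.dumps([8712349, 8712417, 8712277]).encode('utf-8'),
        '8712349': json.dumps(first_story).encode('utf-8'),
        # ... entries for 8712417 and 8712277 would follow the same pattern
    }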
print('Reading babi files')
tar = tarfile.open('data/babi/babi-tasks-v1-2.tar.gz')
mypath = 'data/babi/tasks_1-20_v1-2/en-10k'
challenge_files = [f for f in listdir(mypath) if isfile(join(mypath, f))]
challenge_files = ['tasks_1-20_v1-2/en-10k/' + f.replace('train', '{}')
                   for f in challenge_files if 'train.txt' == f[-9:]]

# Read all files
train_facts_split = []
test_facts_split = []
train_facts = []
test_facts = []
for challenge in challenge_files:
    train_facts_split.append(get_stories(tar.extractfile(challenge.format('train'))))
    test_facts_split.append(get_stories(tar.extractfile(challenge.format('test'))))
    train_facts += train_facts_split[-1]
    test_facts += test_facts_split[-1]

print('Processing input data')
train_stories = [(reduce(lambda x, y: x + y, map(list, fact)), q, a)
                 for fact, q, a in train_facts]
test_stories = [(reduce(lambda x, y: x + y, map(list, fact)), q, a)
                for fact, q, a in test_facts]

vocab = sorted(reduce(lambda x, y: x | y,
                      (set(story + q + [answer])
                       for story, q, answer in train_stories + test_stories)))
# Reserve 0 for masking via pad_sequences
vocab_size = len(vocab) + 1
facts_maxlen = max(map(len, (x for h, _, _ in train_facts + test_facts for x in h)))
if enable_time:
    facts_maxlen += 1
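# The get_stories() applied to the bAbI archive above is not shown in this
# snippet. A minimal sketch, modelled on the widely used Keras bAbI example, of
# what such a parser typically does: reset the story when the line id restarts
# at 1, collect tokenized facts, and emit (facts, question, answer) triples.
# tokenize(), the byte-decoding, and the exact return shape are assumptions.
import re

def tokenize(sent):
    # Split a sentence into word and punctuation tokens.
    return [t.strip() for t in re.split(r'(\W+)', sent) if t.strip()]

def get_stories(f):
    # Assumes f yields byte lines, as tar.extractfile() does.
    data = []
    story = []
    for line in f.readlines():
        line = line.decode('utf-8').strip()
        nid, line = line.split(' ', 1)
        if int(nid) == 1:
            story = []                      # a new story starts when the id resets
        if '\t' in line:                    # question lines carry answer + support ids
            q, a, _ = line.split('\t')
            data.append((list(story), tokenize(q), a))
        else:
            story.append(tokenize(line))
    return data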
def test_failed_to_find_story():
    # Perhaps I should do something more than print....
    unfilled_redis = {TOP_STORIES_KEY: json.dumps([1]).encode('utf-8')}
    with patch('builtins.print') as print_fn:
        get_stories(unfilled_redis)
        # Verify the missing-story warning was actually printed.
        print_fn.assert_called_with(FAILED_TO_FIND.format(1))
file_names = ['facts_list_all.txt']
training_set = []
test_set = []
data = []
for file_name in file_names:
    print('Reading {0} .....'.format(file_name))
    read_file = open('data/{0}'.format(file_name), 'r')
    # Data in format:
    # [([[fact1], [fact2], ..], [answer]), ...]
    # where each fact is a list of words
    data += get_stories(read_file)
    read_file.close()

# Data augmentation strategies
# 1. Randomly replacing nouns with synonyms
print('Data augmentation: synonyms')
data_dict = synonyms(data)
print('Number of diseases: {0}'.format(len(data_dict)))

# 2. Removing facts randomly
print('Data augmentation: removing')
data_dict = remove(data, data_dict)

# 3. Changing facts order
print('Data augmentation: permutation')
data_dict = permute(data, data_dict)
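# None of the augmentation helpers (synonyms, remove, permute) are shown here,
# and the structure of data_dict is not visible. Purely as an illustration of
# the third strategy, a hypothetical fact-order permutation over (facts, answer)
# pairs might look like the sketch below; the real permute() also takes and
# returns data_dict.
import random

def permute_facts(data, copies=2, seed=0):
    rng = random.Random(seed)
    augmented = []
    for facts, answer in data:
        augmented.append((facts, answer))      # keep the original ordering
        for _ in range(copies):
            shuffled = list(facts)
            rng.shuffle(shuffled)              # reorder the facts, keep the label
            augmented.append((shuffled, answer))
    return augmented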
# Reading all file names
mypath = 'tasks_1-20_v1-2/en'
challenge_files = [f for f in listdir(mypath) if isfile(join(mypath, f))]
challenge_files = [
    'tasks_1-20_v1-2/en-10k/' + f.replace('train', '{}')
    for f in challenge_files if 'train.txt' == f[-9:]
]

# Read all files
train_facts_split = []
test_facts_split = []
train_facts = []
test_facts = []
for challenge in challenge_files:
    train_facts_split.append(
        get_stories(tar.extractfile(challenge.format('train'))))
    test_facts_split.append(
        get_stories(tar.extractfile(challenge.format('test'))))
    train_facts += train_facts_split[-1]
    test_facts += test_facts_split[-1]

# Shuffle both splits (sampling all indices without replacement is a permutation)
test_facts = np.array(test_facts)
train_facts = np.array(train_facts)
test_facts = list(test_facts[np.random.choice(len(test_facts), len(test_facts), replace=False)])
train_facts = list(train_facts[np.random.choice(len(train_facts), len(train_facts), replace=False)])

# Parameters definition
parser.add_argument('-n', '--num_words', required=False, type=int,
                    help='number of words to generate')
args = vars(parser.parse_args())
return args

###
# main function
if __name__ == '__main__':
    # parse arguments
    args = parse_args()

    train = utils.get_stories(
        open('tasks_1-20_v1-2/en/qa1_single-supporting-fact_train.txt', 'r'))
    test = utils.get_stories(
        open('tasks_1-20_v1-2/en/qa1_single-supporting-fact_test.txt', 'r'))

    vocab = set()
    for story, q, answer in train + test:
        vocab |= set(story + q + [answer])
    vocab = sorted(vocab)

    # Reserve 0 for masking via pad_sequences
    vocab_size = len(vocab) + 1
    word_idx = dict((c, i + 1) for i, c in enumerate(vocab))
    story_maxlen = max(map(len, (x for x, _, _ in train + test)))
    query_maxlen = max(map(len, (x for _, x, _ in train + test)))

    x, xq, y = utils.vectorize_stories(train, word_idx, story_maxlen,
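# utils.vectorize_stories() is not shown above and the call is cut off. A minimal
# sketch of the usual bAbI vectorizer (as in the Keras memory-network examples):
# map tokens to indices, pad stories and queries to fixed lengths, and one-hot
# encode the answer. The exact signature used by utils is an assumption.
import numpy as np
from keras.preprocessing.sequence import pad_sequences

def vectorize_stories(data, word_idx, story_maxlen, query_maxlen):
    xs, xqs, ys = [], [], []
    for story, query, answer in data:
        xs.append([word_idx[w] for w in story])
        xqs.append([word_idx[w] for w in query])
        y = np.zeros(len(word_idx) + 1)      # index 0 is reserved for padding
        y[word_idx[answer]] = 1
        ys.append(y)
    return (pad_sequences(xs, maxlen=story_maxlen),
            pad_sequences(xqs, maxlen=query_maxlen),
            np.array(ys))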