Example #1
def get_encoded_text(line: str,
                     article_length: int = 1000,
                     silent: bool = True):
    '''Get the encoded word-index sequence of one tagged article line.

    :param line: one line of article
    :param article_length: pad or cut the sequence to this length (0 disables)
    :param silent: if False, report words missing from word_index
    :return: list of word indexes
    '''
    mt = tag_pattern.search(line)
    assert mt
    line = line[mt.end():].strip()
    # get words
    words = filter_words(line.split(' '))
    encoded_list = []
    # encoding
    for word in words:
        index = word_index.get(word)
        if index is None:
            if not silent: print('word %s is unknown' % word)
        else:
            encoded_list.append(index)
    # padding
    if article_length > 0:
        padding = article_length - len(encoded_list)
        if padding < 0:
            # article too long: keep a random contiguous slice of article_length words
            # (+1 because np.random.randint excludes the upper bound)
            cut = np.random.randint(0, len(encoded_list) - article_length + 1)
            encoded_list = encoded_list[cut:cut + article_length]
        else:
            # pad with zeros up to article_length
            encoded_list.extend([0] * padding)

    return encoded_list
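All of these snippets rely on module-level objects that this page does not show: numpy as np, the compiled regex tag_pattern that matches the label prefix of each line, the vocabulary dict word_index, the token filter filter_words, and (in Example #2) get_tag. The sketch below is a minimal, hypothetical stand-in for that context so the examples can be tried in isolation; the concrete tag format, vocabulary, and filtering rule are assumptions, not the project's real definitions.

import re
import numpy as np

# Assumed tag prefix: eight bracketed, comma-separated class counts,
# e.g. '[1,0,0,2,0,0,0,0] word word ...'. The real pattern may differ.
tag_pattern = re.compile(r'\[(\d+),(\d+),(\d+),(\d+),(\d+),(\d+),(\d+),(\d+)\]')

# Assumed toy vocabulary: word -> index, with 0 reserved for padding.
word_index = {'stock': 1, 'market': 2, 'economy': 3, 'growth': 4}

def filter_words(words):
    # Placeholder filter: drop empty tokens; the real version likely also
    # removes stop words or punctuation.
    return [w for w in words if w]

line = '[1,0,0,2,0,0,0,0] stock market economy rally'
print(get_encoded_text(line, article_length=8, silent=False))
# word rally is unknown
# [1, 2, 3, 0, 0, 0, 0, 0]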
Example #2
def generator_from_file_debug(raw_path: str,
                              batch_size: int = 10,
                              shuffle: bool = True):
    '''Generate batches endlessly from a tagged news file (debug variant that also
    yields the raw article text).

    :param raw_path: path to the news file, one tagged article per line
    :param batch_size: number of articles per batch
    :param shuffle: shuffle the lines once after loading
    :return: yields (X, Y, articles); X has shape (batch_size, article_length), Y has
        shape (batch_size, 8), articles is a list of batch_size strings
    '''
    with open(raw_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    if shuffle: np.random.shuffle(lines)

    cur_idx = 0
    while True:
        X, Y, articles = [], [], []
        for _ in range(batch_size):
            line = lines[cur_idx]
            cur_idx += 1
            if cur_idx == len(lines): cur_idx = 0  # start over
            Y.append(get_tag(line))
            X.append(get_encoded_text(line))
            mt = tag_pattern.search(line)
            assert mt
            # article text without the tag prefix, the word spaces, or the trailing newline
            articles.append(''.join(line[mt.end():].strip().split(' ')))

        yield np.array(X), np.array(Y), articles
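A hedged usage sketch for the generator, assuming the module context sketched after Example #1, a placeholder file news_train.txt with one tagged article per line, and that get_tag (not shown on this page) behaves like one of the label encoders below; get_tag_norm from Example #4 is used as a stand-in here.

get_tag = get_tag_norm  # assumption: the real module defines get_tag elsewhere

gen = generator_from_file_debug('news_train.txt', batch_size=4)
X, Y, articles = next(gen)
print(X.shape)      # (4, 1000) with get_encoded_text's default article_length
print(Y.shape)      # (4, 8): one row of tag values per article
print(articles[0])  # the first article's text with the tag prefix and spaces removed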
Example #3
def get_tag_one_hot(line: str):
    '''Get the one-hot label of one tagged article line.

    :param line: one line of article
    :return: one-hot tag as an ndarray (1 at the argmax of the tag counts)
    '''
    mt = tag_pattern.search(line)
    assert mt
    tag = list(map(int, mt.groups()))
    y = np.zeros_like(tag)
    y[np.argmax(tag)] = 1
    return y
Example #4
def get_tag_norm(line: str):
    '''Get the normalized label distribution of one tagged article line.

    :param line: one line of article
    :return: tag counts divided by their sum, as an ndarray
    '''
    mt = tag_pattern.search(line)
    assert mt
    tag = list(map(int, mt.groups()))
    sum_up = sum(tag)
    tag = list(map(lambda x: x / sum_up, tag))  # normalize
    return np.array(tag)
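A small illustration of the two label encodings on the same line, again assuming the toy tag_pattern sketched after Example #1; the tagged line below is made up.

line = '[1,0,0,2,0,0,0,0] stock market economy growth'
print(get_tag_one_hot(line))  # [0 0 0 1 0 0 0 0]: 1 at the argmax of the counts
print(get_tag_norm(line))     # counts / sum -> [1/3, 0, 0, 2/3, 0, 0, 0, 0]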
Example #5
def get_article_max_len(file: str):
    '''Find the length of the longest article in a dataset file.

    :param file: path to the dataset file
    :return: int, the maximum number of words in a single article
    '''
    with open(file, 'r', encoding='utf-8') as f:
        max_len = 0
        for line in f:
            mt = tag_pattern.search(line)
            assert mt
            line = line[mt.end():].strip()
            words = line.split(' ')
            max_len = max(max_len, len(words))

    return max_len
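A short usage sketch: measure the longest article in a (placeholder) dataset file; the result is a natural choice for the article_length argument of get_encoded_text so that no article is cut.

max_len = get_article_max_len('news_train.txt')  # placeholder path
print('longest article: %d words' % max_len)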