import random
import re

# NOTE: load_training_data / load_testing_data / clean_training_data are helper
# functions assumed to be defined elsewhere in this project.


def encode_data(data_path, word_dict_path, encoded_data_path):
    data_lines = load_training_data(data_path)

    # load the word dictionary (one "<index>\t<word>" pair per line) into a word -> index map
    word_dict = dict()
    with open(word_dict_path, 'r', encoding='utf8') as word_dict_in:
        lines = [line.strip().split('\t') for line in word_dict_in]
    for index, word in lines:
        word_dict[word] = index

    output_lines = list()
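    # each encoded line: id<TAB>sentiment<TAB>space-separated word indices,
    # with the [__PAD__] index inserted between sentences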
    for id, sentiment, review in data_lines:
        line = id + '\t' + sentiment + '\t'
        assert len(review) > 0, 'len(review) is {}'.format(len(review))
        review_sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|!)\s', review)  # split review into sentences
        for sentence in review_sentences:
            for word in sentence.split(' '):
                if word not in word_dict:
                    word = '[__OOV__]'
                line += word_dict[word] + ' '
            # padding between two sentences
            line += word_dict['[__PAD__]'] + ' '
        line = line.strip()
        line += '\n'
        output_lines.append(line)

    with open(encoded_data_path, 'w', encoding='utf8', newline='\n') as encode_out:
        encode_out.writelines(output_lines)


def generate_validation_set(data_path, split_ratio, output_dir, output_file_prefix):
    # shuffle the labeled data and split it into training/validation TSV files,
    # keeping split_ratio of the reviews for training
    lines = load_training_data(file_path=data_path)

    num_reviews = len(lines)
    random.shuffle(lines)

    training_list = lines[:int(num_reviews * split_ratio)]
    validation_list = lines[int(num_reviews * split_ratio):]

    with open(output_dir + output_file_prefix + '_training.tsv', 'w', encoding='utf8', newline='\n') as training_out:
        for id, sentiment, review in training_list:
            training_out.write(id + '\t' + sentiment + '\t' + review + '\n')

    with open(output_dir + output_file_prefix + '_validation.tsv', 'w', encoding='utf8', newline='\n') as validation_out:
        for id, sentiment, review in validation_list:
            validation_out.write(id + '\t' + sentiment + '\t' + review + '\n')
Example no. 3
def count_label(file_path):
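    # count how many reviews are labeled positive ('1') and negative ('0')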
    lines = load_training_data(file_path=file_path)

    pos_count = 0
    neg_count = 0

    for id, sentiment, review in lines:
        # skip first line
        if id == 'id' and sentiment == 'sentiment' and review == 'review':
            continue

        if sentiment == '1':
            pos_count += 1
        elif sentiment == '0':
            neg_count += 1

    print(pos_count)  # 12500
    print(neg_count)  # 12500
Example no. 4

def max_min_length(review_lines):
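    # report the longest and shortest review length, in whitespace-separated words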
    max_length = -1
    min_length = 999999

    for review in review_lines:
        review_word_length = len(review.split(' '))
        max_length = max(max_length, review_word_length)
        min_length = min(min_length, review_word_length)

    print('max length of review is {}'.format(max_length))
    print('min length of review is {}'.format(min_length))
    # dataOutput/labeledTrainData_clean.tsv
    # max length of review is 2738 (words)
    # min length of review is 11 (words)

    # dataOutput/testData_clean.tsv
    # WARNING: the figures below are for the test set
    # max length of review is 2595 (words)
    # min length of review is 8 (words)


if __name__ == '__main__':
    # count_label('data/labeledTrainData.tsv')

    file_path = 'dataOutput/labeledTrainData_clean.tsv'
    lines = load_training_data(file_path=file_path)
    # lines = load_testing_data(file_path=file_path)
    max_min_length(review_lines=[line[-1] for line in lines])

import nltk


def generate_w2v_data(sentences):
    # NOTE: the original def line of this function was truncated; the signature is
    # inferred from the generate_w2v_data(...) call in __main__ below, and
    # nltk's WordNetLemmatizer is assumed for the otherwise-undefined `lemmatizer`.
    lemmatizer = nltk.stem.WordNetLemmatizer()
    token_lemma_sentences = list()
    for sentence in sentences:
        tokens = nltk.word_tokenize(sentence)
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
        token_lemma_sentences.append(' '.join(tokens) + '\n')

    return token_lemma_sentences


def output_file(output_path, lines):
    with open(output_path, 'w', encoding='utf8', newline='\n') as out:
        out.writelines(lines)


if __name__ == '__main__':
    # # clean training data
    # lines = load_training_data('data/labeledTrainData.tsv')
    # output_lines = clean_training_data(lines)
    # output_file(output_path='dataOutput/labeledTrainData_clean.tsv', lines=output_lines)

    # # clean testing data
    # lines = load_testing_data('data/testData.tsv')
    # output_lines = clean_training_data(lines)
    # output_file(output_path='dataOutput/testData_clean.tsv', lines=output_lines)

    # generate w2v training file
    lines = load_training_data('data/labeledTrainData.tsv')
    sentences = generate_w2v_data([line[2].strip('"') for line in lines])
    output_file('dataOutput/labeledTrainData_sentences.txt', lines=sentences)