Example #1
def build_vocabulary(lower=1, n=MAX_VOCAB_SIZE):
    """
    1. Get word frequency distribution
    2. Sort words by frequency
    3. Build a vocab dict from the most frequent words
    4. Store the vocab dict in a file in <word, identifier> format

    :param lower: Identifiers below this are reserved
    :param n: Maximum number of unique words expected
    :return: A dict of vocabulary words and an assigned identifier
    """

    try:
        vocab_to_code = read_binary(VOCAB_TO_CODE_FILE)
        code_to_vocab = read_binary(CODE_TO_VOCAB_FILE)
        print('vocabulary loaded')
        return vocab_to_code, code_to_vocab
    except IOError:
        print('building vocabulary')
    freq = build_word_frequency_distribution()

    # sort words in descending order of frequency, then keep the top n words
    top_words = list(sorted(freq.items(), key=lambda x: -x[1]))[:n - lower + 1]
    # create optimum vocab size
    print('Vocab count : ' + str(len(top_words)))
    # global MAX_VOCAB_SIZE
    # global UNKNOWN
    max_vocab_size = len(top_words) + 2
    unknown = max_vocab_size - 1
    vocab_to_code = {}
    code_to_vocab = {}

    vocab_to_code['<UNK>'] = unknown
    code_to_vocab[unknown] = '<UNK>'
    vocab_to_code['<PAD>'] = PAD
    code_to_vocab[PAD] = '<PAD>'

    # lower vocab indexes are reserved for padding and unknown words
    i = lower
    for w, _ in top_words:
        vocab_to_code[w] = i
        code_to_vocab[i] = w
        i += 1
    write_binary(vocab_to_code, VOCAB_TO_CODE_FILE)
    write_binary(code_to_vocab, CODE_TO_VOCAB_FILE)
    return vocab_to_code, code_to_vocab
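A minimal usage sketch of the resulting index layout (illustrative only; it assumes the constants and file helpers above, and the word 'battery' is just an example): <PAD> takes the reserved low index, real words take codes starting at lower, and <UNK> takes the highest code.

# Illustrative sketch only; relies on the files written by build_vocabulary() above.
vocab_to_code, code_to_vocab = build_vocabulary()
unk_code = vocab_to_code['<UNK>']              # highest code, len(vocab_to_code) - 1
print(vocab_to_code['<PAD>'], unk_code)        # the two reserved codes
code = vocab_to_code.get('battery', unk_code)  # unseen words fall back to <UNK>
print(code, code_to_vocab[code])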
Example #2
def combine_processed_data():
    """Merge the processed restaurant and laptop datasets into one binary file."""
    combined_dataset = []

    restaurant = read_binary(filename=PROCESSED_RESTAURANT_FILE_NAME)
    print('Restaurant-' + str(len(restaurant)))
    combined_dataset.extend(restaurant)
    print(len(combined_dataset))

    laptops = read_binary(filename=PROCESSED_LAPTOPS_FILE_NAME)
    print('Laptops-' + str(len(laptops)))
    combined_dataset.extend(laptops)
    print(len(combined_dataset))

    # organic = read_binary(filename = PROCESSED_ORGANIC_FILE_NAME)
    # print('Organic-' + str(len(organic)))
    # combined_dataset.extend(organic)
    # print(len(combined_dataset))

    write_binary(combined_dataset, OUTPUT_FILE_NAME)
Example #3
def build_word_frequency_distribution():
    """
    1. Extract tokens from the review text
    2. Calculate frequency of each token
    3. Create a freq dict and store it in a file

    :return: A dict of <token, freq>
    """
    try:
        freq_dist_f = read_binary(WORD_FREQ_FILE)
        print('frequency distribution loaded')
        return freq_dist_f
    except IOError:
        pass

    print('building frequency distribution')
    freq = defaultdict(int)
    if FILE_NAME == 'restaurant':
        for aspect_word in RESTAURANT_ASPECT_WORDS:
            freq[aspect_word] += 1
    elif FILE_NAME == 'laptops':
        for aspect_word in LAPTOPS_ASPECT_WORDS:
            freq[aspect_word] += 1

    files = [FORMATTED_FILE_NAME]
    if EMBEDDING_TYPE == 'fasttext':
        files.append(FORMATTED_FILE_NAME.replace('train', 'test'))
        files.append(FORMATTED_FILE_NAME.replace('train', 'val'))

    for file_path in files:
        print('building vocab from file - ' + file_path)
        for i, review in enumerate(read_binary(file_path)):
            sentences = review[1]

            for sent in sentences:
                tokens = NLP.tokenizer(sent[0])
                for token in tokens:
                    freq[token.orth_] += 1
            # checkpoint the frequency dict every 100 reviews
            if i % 100 == 0:
                write_binary(freq, WORD_FREQ_FILE)
                print('dump at {}'.format(i))
        # final dump after each file has been processed
        write_binary(freq, WORD_FREQ_FILE)
    return freq
Example #4
def process_data():
    """Tokenize the formatted reviews and write all sentences to a text file for ELMo."""
    vocab_to_code, code_to_vocab = build_vocabulary()
    max_vocab_size = len(vocab_to_code)
    print('Final Vocab Size : ' + str(max_vocab_size))
    try:
        tokenized_dataset = []
        all_sentences = []
        for i, review in enumerate(read_binary(FORMATTED_FILE_NAME)):
            tokenized_aspect = []
            tokenized_sentences = []

            if i == 0:
                print(review)

            sentences = review[1]
            aspect_words = review[0]
            polarities = review[2]

            for aspect_word in aspect_words:
                tokenized_aspect.append(aspect_word)
                all_sentences.append([aspect_word])

            for sent in sentences:
                tokenized_sentence = []

                # collapse repeated spaces in the sentence; they cause problems for ELMo.
                s = re.sub(' +', ' ', sent[0])

                tokens = NLP.tokenizer(s)
                for token in tokens:
                    tokenized_sentence.append(token.orth_)
                tokenized_sentences.append(tokenized_sentence)

                # all these sentences will be written to a separate txt file at the end of the process.
                all_sentences.append(tokenized_sentence)

            tokenized_review = [
                tokenized_aspect, tokenized_sentences, polarities
            ]

            # dataset
            tokenized_dataset.append(tokenized_review)
            write_binary(tokenized_dataset, PROCESSED_FILE_NAME)
            print('dump at {}'.format(i))

        all_sentences = space_separated_token_string(all_sentences)
        save_sentences_to_text(all_sentences)
        # hack for ELMo: remove duplicate sentences
        remove_duplicate_sentences()
    except KeyboardInterrupt:
        pass
Example #5
def process_data():
    """Encode aspect words and sentence tokens as vocabulary codes and checkpoint the coded dataset."""
    vocab_to_code, code_to_vocab = build_vocabulary()
    max_vocab_size = len(vocab_to_code)
    unknown = max_vocab_size - 1
    print('Final Vocab Size : ' + str(max_vocab_size))
    try:
        coded_dataset = []
        for i, review in enumerate(read_binary(FORMATTED_FILE_NAME)):
            coded_aspect = []
            coded_sentences = []

            if i == 0:
                print(review)

            sentences = review[1]
            aspect_words = review[0]
            polarities = review[2]

            for aspect_word in aspect_words:
                coded_aspect.append(vocab_to_code.get(aspect_word, unknown))

            for sent in sentences:
                coded_sentence = []
                tokens = NLP.tokenizer(sent[0])
                for token in tokens:
                    coded_sentence.append(
                        vocab_to_code.get(token.orth_, unknown))
                coded_sentences.append(coded_sentence)

            coded_review = [coded_aspect, coded_sentences, polarities]

            # dataset
            coded_dataset.append(coded_review)
            write_binary(coded_dataset, PROCESSED_FILE_NAME)
            print('dump at {}'.format(i))

        datapoint = coded_dataset[0]
        print(datapoint)
        print(get_uncoded_data(code_to_vocab, datapoint))
    except KeyboardInterrupt:
        pass
Example #6
def process_data():
    """Encode aspect words and the flat word list of each review as vocabulary codes."""
    vocab_to_code, code_to_vocab = build_vocabulary()
    vocab_size = len(vocab_to_code)
    unknown = vocab_size - 1
    print('Final Vocab Size : ' + str(vocab_size))
    coded_dataset = []
    for i, review in enumerate(read_binary(FORMATTED_FILE_NAME)):
        coded_aspect = []
        coded_text = []

        if i == 0:
            print(review)

        text = review[1]
        aspect_words = review[0]
        polarity = review[2]

        for aspect_word in aspect_words:
            a = vocab_to_code.get(aspect_word, unknown)
            if a == unknown:
                print('STOP')
                print(aspect_word)
            coded_aspect.append(a)

        for word in text:
            word_code = vocab_to_code.get(word, unknown)
            coded_text.append(word_code)

        coded_review = [coded_aspect, [coded_text], [polarity]]
        coded_dataset.append(coded_review)
        write_binary(coded_dataset, PROCESSED_FILE_NAME)
        print('dump at {}'.format(i))

    datapoint = coded_dataset[0]
    print(datapoint)
    print(get_uncoded_data(code_to_vocab, datapoint))
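get_uncoded_data itself is not shown in these examples; a rough, hypothetical sketch of what such a decoder could look like for the [coded_aspect, [coded_text], [polarity]] layout used above:

# Hypothetical decoder sketch, not the project's actual implementation.
def get_uncoded_data_sketch(code_to_vocab_map, datapoint):
    coded_aspect, coded_sentences, polarities = datapoint
    aspect = [code_to_vocab_map.get(c) for c in coded_aspect]
    sentences = [[code_to_vocab_map.get(c) for c in sent] for sent in coded_sentences]
    return [aspect, sentences, polarities]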
Example #7
def build_vocabulary(lower=1, n=MAX_VOCAB_SIZE):
    """
    1. Get word frequency distribution
    2. Sort words by frequency
    3. Build a vocab dict from the most frequent words
    4. Store the vocab dict in a file in <word, identifier> format

    :param lower: Identifiers below this are reserved
    :param n: Maximum number of unique words expected
    :return: A dict of vocabulary words and an assigned identifier
    """

    try:
        vocab_to_code = read_binary(VOCAB_TO_CODE_FILE)
        code_to_vocab = read_binary(CODE_TO_VOCAB_FILE)
        print('vocabulary loaded')
        return vocab_to_code, code_to_vocab
    except IOError:
        print('building vocabulary')
    freq = build_word_frequency_distribution()

    # load pretrained embeddings (GloVe or fastText)
    print('loading embeddings')
    if EMBEDDING_TYPE == 'glove':
        word_to_embeddings = load_glove_embeddings()
    elif EMBEDDING_TYPE == 'fasttext':
        word_to_embeddings = load_oov_fastText_embeddings()
    else:
        word_to_embeddings = {}

    # sort words in descending order of frequency, then keep the top n words
    top_words = list(sorted(freq.items(), key=lambda x: -x[1]))[:n - lower + 1]
    # create optimum vocab size
    print('Vocab count : ' + str(len(top_words)))
    # global MAX_VOCAB_SIZE
    # global UNKNOWN
    max_vocab_size = len(top_words) + 2
    unknown = max_vocab_size - 1
    vocab_to_code = {}
    code_to_vocab = {}

    # an array of embeddings indexed by vocab code; the first and last rows are
    # reserved for the padding and unknown tokens, respectively.
    code_to_embed = np.zeros(shape=(max_vocab_size, EMBEDDING_DIMENSION),
                             dtype=np.float32)
    code_to_embed[PAD] = PAD_EMBEDDING
    code_to_embed[unknown] = UNKNOWN_EMBEDDING
    vocab_to_code['<UNK>'] = unknown
    code_to_vocab[unknown] = '<UNK>'
    vocab_to_code['<PAD>'] = PAD
    code_to_vocab[PAD] = '<PAD>'

    # lower vocab indexes are reserved for padding and unknown words
    i = lower
    for w, _ in top_words:
        vocab_to_code[w] = i
        code_to_vocab[i] = w
        # fall back to the unknown embedding when no pretrained vector is available
        try:
            if EMBEDDING_TYPE == 'glove':
                embedding = word_to_embeddings.word_vec(w)
            elif EMBEDDING_TYPE == 'fasttext':
                embedding = word_to_embeddings.get_word_vector(w)
            else:
                embedding = UNKNOWN_EMBEDDING
        except KeyError:
            embedding = UNKNOWN_EMBEDDING
        code_to_embed[i] = embedding
        i += 1
    write_binary(vocab_to_code, VOCAB_TO_CODE_FILE)
    write_binary(code_to_vocab, CODE_TO_VOCAB_FILE)
    write_binary(code_to_embed, CODE_TO_EMBED_FILE)
    return vocab_to_code, code_to_vocab
Example #8
File: util.py  Project: dugarsumit/absa
def fasttext_embeddings(shape):
    print('using fasttext..')
    fasttext = read_binary(CODE_TO_EMBED_FILE)
    fasttext = fasttext[0:shape[0], 0:shape[1]]
    return tf.convert_to_tensor(value=fasttext)
Example #9
File: util.py  Project: dugarsumit/absa
def glove_embeddings(shape):
    print('using glove...')
    glove = read_binary(CODE_TO_EMBED_FILE)
    glove = glove[0:shape[0], 0:shape[1]]
    return tf.convert_to_tensor(value=glove)
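Either tensor is typically consumed as an embedding lookup table. A minimal sketch (TensorFlow 1.x style; the placeholder and the use of MAX_VOCAB_SIZE and EMBEDDING_DIMENSION for the shape are illustrative assumptions, not part of the project code):

# Illustrative only: map coded word ids to their pretrained vectors.
word_ids = tf.placeholder(dtype=tf.int32, shape=[None, None])  # [batch, max_seq_len]
embedding_matrix = glove_embeddings(shape=(MAX_VOCAB_SIZE, EMBEDDING_DIMENSION))
word_vectors = tf.nn.embedding_lookup(params=embedding_matrix, ids=word_ids)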
Example #10
File: util.py  Project: dugarsumit/absa
def vocab_to_code(words):
    vocab_to_code_map = read_binary(VOCAB_TO_CODE_FILE)
    codes = []
    for word in words:
        codes.append(vocab_to_code_map.get(word))
    return codes
Example #11
File: util.py  Project: dugarsumit/absa
def code_to_vocab(codes):
    code_to_vocab_map = read_binary(CODE_TO_VOCAB_FILE)
    words = []
    for code in codes:
        words.append(code_to_vocab_map.get(code))
    return words
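A quick round trip through the two helpers above (purely illustrative; words missing from the vocabulary come back as None because the maps are queried with .get):

codes = vocab_to_code(['the', 'food', 'was', 'great'])
print(codes)
print(code_to_vocab(codes))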
Example #12
File: test.py  Project: dugarsumit/absa
def read_words():
    data = read_binary(WORD_FREQ_FILE)
    for i, (w, f) in enumerate(data.items()):
        print(str(i) + '-' + w + ' : ' + str(f))