Example #1
def get_word2vec():
    import data_manager as data
    import numpy as np
    import os
    from gensim.models import KeyedVectors
    from tqdm import tqdm
    import tables as tb

    print("loading word2vec...")
    model = data.get_pickle("data/word2vec.model.pkl")
    # model = KeyedVectors.load_word2vec_format(os.path.join(os.path.dirname(__file__), "data/word2vec.300d.txt"))
    # data.save_pickle("data/word2vec.model.pkl", model)
    print("word2vec loaded")

    texts = data.get_pickle("data/preprocessed.pkl")
    print("splitting sentences into words")
    texts = list(map(lambda t: t.split(" "), texts))
    print("done!")

    arr_file = tb.open_file("data/docvecs.hdf",
                            "w",
                            filters=tb.Filters(complib='zlib', complevel=0))  # complevel=0: compression is effectively disabled
    document_vecs = None  # the EArray is created lazily from the first document below
    for sentence in tqdm(texts):
        word_vecs = []
        for word in sentence:
            try:
                word_vecs.append(model.get_vector(word))
            except KeyError:
                # out-of-vocabulary word; skip it
                pass

        # if len(word_vecs) == 0:
        #     word_vecs.append([0 for i in range(300)])
        # document_vecs.append(np.mean(word_vecs, axis=0))

        # Tried keeping order and padding but too heavy
        # Keep at most 300 word vectors, then zero-pad up to a fixed 300 x 300 matrix
        if len(word_vecs) > 300:
            word_vecs = word_vecs[:300]
        for _ in range(300 - len(word_vecs)):
            word_vecs.append([0.0] * 300)

        word_vecs = np.array([word_vecs])  # shape (1, 300, 300): one document along the EArray's append axis
        if document_vecs is None:
            document_vecs = arr_file.create_earray(arr_file.root,
                                                   "docvecs",
                                                   obj=word_vecs)
        else:
            document_vecs.append(word_vecs)

    #data.save_pickle("data/wordvecs_np.pkl", document_vecs)

    return document_vecs
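
A minimal read-back sketch for the file written above, assuming PyTables is available and get_word2vec() has already run; the existing node is fetched from the file root rather than re-created:

import tables as tb

with tb.open_file("data/docvecs.hdf", "r") as arr_file:
    docvecs = arr_file.root.docvecs   # EArray with one padded 300 x 300 matrix per document
    print(docvecs.shape)
    first_doc = docvecs[0]            # numpy array for the first review
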
Example #2
def get_spelling_ratios():
    if os.path.isfile("data/spelling_ratios.pkl"):
        return data.get_pickle("data/spelling_ratios.pkl")
    import tables as tb
    arr_file = tb.open_file("data/docvecs.hdf",
                            "r",
                            filters=tb.Filters(complib='zlib', complevel=0))
    docvecs = arr_file.root.docvecs  # fetch the existing EArray; the file is opened read-only

    sent_lengths = get_sentence_lengths()
    sent_lengths = list(map(lambda sent: sum(sent), sent_lengths))
    print(sent_lengths[:2])

    ratios = []
    for i, embeddings in enumerate(tqdm(docvecs)):
        count = 0
        for embedding in embeddings:
            if not np.any(embedding):
                break  # first all-zero row: the start of the zero padding
            count += 1

        ratio = count / min(sent_lengths[i], 300)
        ratios.append(ratio)

    data.save_pickle("data/spelling_ratios.pkl", ratios)

    return ratios
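
The break above relies on the zero padding written by get_word2vec(): each document matrix is scanned row by row and counting stops at the first all-zero row. A tiny self-contained check of that test:

import numpy as np

doc = np.zeros((300, 300))
doc[0] = 0.1           # pretend the first two rows are real word vectors
doc[1] = -0.2
count = 0
for embedding in doc:
    if not np.any(embedding):
        break          # first all-zero row: the padding starts here
    count += 1
print(count)           # -> 2
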
Example #3
def extract_tags(texts):
    if os.path.isfile(TAGS_DIR):
        return data.get_pickle(TAGS_DIR)
    else:
        texts = preprocess(texts)
        texts = get_pos_tags(map(lambda text: text.split(), texts))
        tags = list(map(lambda tagged_text: [tagged[1] for tagged in tagged_text], texts))
        data.save_pickle(TAGS_DIR, tags)
        return tags
Example #4
def normalize_tag_bow():
    if os.path.isfile("data/tags_bow_norm.pkl"):
        return
    tag_counts = data.get_pickle(TAGS_BOW)
    result = []
    for counts in tag_counts:
        res = counts
        if any(counts):
            # all-zero rows are kept as-is; they cannot be normalized
            res = normalize(counts)
        result.append(res)
    data.save_pickle("data/tags_bow_norm.pkl", result)
    print(result[0])
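
The normalize() helper is defined elsewhere in the module and is not part of this listing. One hypothetical reading, assuming it turns each non-zero count vector into fractions of its total (which would match the pos_dist name the result gets in the training script below), is:

def normalize(counts):
    # Hypothetical stand-in for the module's normalize() helper (not shown in this listing):
    # rescale a vector of non-negative counts into fractions of its total.
    total = sum(counts)
    return [c / total for c in counts] if total else list(counts)
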
Example #5
def get_sentence_lengths():
    if os.path.isfile("data/sentence_lengths.pkl"):
        return data.get_pickle("data/sentence_lengths.pkl")
    else:
        texts = data.get_pickle("data/preprocessed.pkl")

        texts = list(map(lambda t: t.split(" "), texts))
        result = []
        for review in texts:
            review_len = len(review)
            sent_lengths = []
            c = 1
            for i, word in enumerate(review):
                # a sentence ends at ".", "!" or "?", or at the last token of the review
                if word in ".!?" or i == review_len - 1:
                    sent_lengths.append(c)
                    c = 1
                else:
                    c += 1
            result.append(sent_lengths)
        data.save_pickle("data/sentence_lengths.pkl", result)
        return result
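
A quick worked check of the counting above, assuming the preprocessing keeps ".", "!" and "?" as separate tokens: the counter starts at 1 and is bumped for every non-terminal token, so each recorded length is the number of tokens in the sentence, terminator included.

review = "this is great . would buy again !".split(" ")
review_len = len(review)
sent_lengths = []
c = 1
for i, word in enumerate(review):
    if word in ".!?" or i == review_len - 1:
        sent_lengths.append(c)
        c = 1
    else:
        c += 1
print(sent_lengths)   # -> [4, 4]: both sentences have four tokens, terminator included
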
Example #6
def get_tags_bow(sentences):
    if os.path.isfile(TAGS_BOW):
        return data.get_pickle(TAGS_BOW)
    else:
        from collections import Counter
        from nltk.data import load
        corpus = list(load('help/tagsets/upenn_tagset.pickle').keys())
        f = lambda x: Counter([y for y in x if y in corpus])
        df = pd.DataFrame({"tags": sentences})
        df["bow"] = (pd.DataFrame(df["tags"].apply(f).values.tolist()).reindex(
            columns=corpus).fillna(0).astype(int).values.tolist())
        result = df["bow"].tolist()
        data.save_pickle(TAGS_BOW, result)
        return result
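
A small sketch of the same bag-of-tags step on two pre-tagged sentences, assuming the NLTK "tagsets" resource has been downloaded (nltk.download('tagsets')); reindexing against the full Penn Treebank tag list gives every row the same fixed column order:

from collections import Counter
from nltk.data import load
import pandas as pd

corpus = list(load('help/tagsets/upenn_tagset.pickle').keys())
sentences = [["DT", "NN", "VBZ", "JJ"], ["PRP", "VBD", "DT", "NN", "NN"]]
f = lambda x: Counter([y for y in x if y in corpus])
bow = (pd.DataFrame([f(s) for s in sentences])
       .reindex(columns=corpus).fillna(0).astype(int).values.tolist())
print(len(bow), len(bow[0]))   # 2 rows, one column per tag in the tagset
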
Example #7
def preprocess(texts):
    if os.path.isfile(PREPROCESSED_DIR):
        return data.get_pickle(PREPROCESSED_DIR)
    else:
        result = []
        for text in texts:
            text = clean(text)
            text = get_sentences(text)
            text = get_words(text)
            # text = get_pos_tags(text)
            # text = lemmatize(text)
            text = flatten_sentences(text)
            text = flatten_paragraph(text)
            print(text)

            result.append(text)
        data.save_pickle(PREPROCESSED_DIR, result)
    return result
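
clean, get_sentences, get_words, flatten_sentences and flatten_paragraph are defined elsewhere in the module and are not part of this listing; a rough, purely hypothetical sketch of them with NLTK (the punkt tokenizer data is assumed to be installed), ending in one space-joined token string per text as the other examples expect:

import re
from nltk import sent_tokenize, word_tokenize

def clean(text):
    # hypothetical: lower-case and keep only letters, sentence punctuation and spaces
    return re.sub(r"[^a-z.!? ]", " ", text.lower())

def get_sentences(text):
    return sent_tokenize(text)

def get_words(sentences):
    return [word_tokenize(s) for s in sentences]

def flatten_sentences(sentences):
    # join the tokens of each sentence back into one string
    return [" ".join(tokens) for tokens in sentences]

def flatten_paragraph(sentences):
    # join all sentences into a single space-separated string
    return " ".join(sentences)
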
Example #8
def get_tfidf():
    documents = data.get_pickle("data/preprocessed.pkl")
    documents = list(map(lambda d: d.split(" "), documents))
    print(documents[:2])
    vectorizer = TfidfVectorizer(
        analyzer="word",
        token_pattern=None,
        stop_words="english",
        tokenizer=dummy,
        preprocessor=dummy,
        ngram_range=(1,3),
        max_features=12000,
        max_df=0.99
    )

    matrix = vectorizer.fit_transform(documents)

    data.save_pickle("data/tfidf.pkl", matrix)

    print(vectorizer.get_feature_names())  # get_feature_names_out() in scikit-learn >= 1.0
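
dummy is not shown in this listing; presumably it is an identity function passed as both tokenizer and preprocessor so that the already tokenized documents pass through unchanged. A minimal sketch of that assumption, plus reading the saved sparse matrix back:

def dummy(doc):
    # assumed identity pass-through for pre-tokenized input
    return doc

matrix = data.get_pickle("data/tfidf.pkl")
print(matrix.shape)          # (n_documents, n_features), at most 12000 features
print(matrix[0].toarray())   # dense tf-idf row of the first document
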
Example #9
def get_word_counts(texts):
    from statistics import mean
    if os.path.isfile("data/word_counts.pkl"):
        return data.get_pickle("data/word_counts.pkl")
    else:
        result = []
        i = 0
        for text in texts:
            text = clean(text)
            sent = get_sentences(text)
            word = get_words(sent)
            sentence_lengths = [len(sentence) for sentence in word]
            try:
                if sentence_lengths:
                    result.append(sum(sentence_lengths))
                else:
                    result.append(0)
                i += 1
            except Exception:
                # report which text broke the pipeline, then bail out
                print(text, i)
                return
        data.save_pickle("data/word_counts.pkl", result)
        return result
Example #10
    def validation_generator(self):  # method name assumed; the snippet starts mid-method and its def line is missing
        validation_size = len(self.validation_split)
        while True:
            # wrap around when fewer than n validation indices remain
            if self.current_valid_index >= validation_size - self.n:
                self.current_valid_index = 0
            indices = self.validation_split[self.current_valid_index:self.current_valid_index + self.n]
            self.current_valid_index += self.n
            yield self.get_items(indices)

    def reset(self):
        self.current_train_index = 0
        self.current_valid_index = 0


arr_file = tb.open_file("data/docvecs.hdf", "r", filters=tb.Filters(complib='zlib', complevel=0))
embeddings = arr_file.root.docvecs  # existing EArray written by get_word2vec(); the file is opened read-only
tfidf = data.get_pickle("data/tfidf.pkl").toarray()
product_counts = np.array(data.get_pickle("data/product_review_counts.pkl"))
reviewer_counts = np.array(data.get_pickle("data/reviewer_review_counts.pkl"))
ratings = np.array(data.get_pickle("data/ratings_disc.pkl"))
sent_counts = np.array(data.get_pickle("data/sent_counts_norm.pkl"))
word_counts = np.array(data.get_pickle("data/word_counts_norm.pkl"))
spell_ratios = np.array(data.get_pickle("data/spelling_ratios.pkl"))
pos_dist = np.array(data.get_pickle("data/tags_bow_norm.pkl"))
helpfulness = np.array(data.get_pickle("data/labels_disc.pkl"))

batch_size = 128
batcher = Batcher(product_counts, reviewer_counts, ratings, sent_counts, word_counts, spell_ratios, pos_dist, tfidf,
                  embeddings, helpfulness, batch_size)


def train_cnn():
    ...  # body truncated in this listing
Example #11
def normalize_word_counts():
    word_counts = data.get_pickle("data/word_counts.pkl")
    normed = normalize(word_counts)
    data.save_pickle("data/word_counts_norm.pkl", normed)
    print(normed)
Example #12
def normalize_sentence_lengths():
    sentence_lengths = data.get_pickle("data/sentence_lengths.pkl")
    normed = normalize(sentence_lengths)
    data.save_pickle("data/sentence_lengths_norm.pkl", normed)
    print(normed)
def test():
    import numpy as np
    matrix = data.get_pickle("data/tfidf.pkl").toarray()
    indices = [1,2,3]
    print([matrix[i] for i in indices])
def get_tags():
    return data.get_pickle("data/text_tags.pkl")