Example #1
def test_closer_to(start, close, far):
    start = docvector(word2vec, simple_tokenize(start))
    close = docvector(word2vec, simple_tokenize(close))
    far = docvector(word2vec, simple_tokenize(far))

    close_dist = distance.cosine(start, close)
    far_dist = distance.cosine(start, far)

    assert close_dist < far_dist
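The docvector helper this test relies on is not shown in the example. A minimal sketch, assuming word2vec maps tokens to vectors (e.g. a gensim KeyedVectors) and that docvector simply averages the vectors of in-vocabulary tokens; distance here is presumably scipy.spatial.distance:

import numpy as np

def docvector(word2vec, tokens):
    # Hypothetical helper: average the embedding vectors of the tokens
    # that are present in the word2vec vocabulary.
    vectors = [word2vec[token] for token in tokens if token in word2vec]
    return np.mean(vectors, axis=0)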
Example #2
    def preprocess(self, text):
        # Tokenize each sentence and keep only the first occurrence of each
        # token sequence, preserving order.
        seen = set()
        sents = []
        for sent in text:
            processed = tuple(simple_tokenize(sent))
            if processed not in seen:
                sents.append(processed)
                seen.add(processed)
        return sents
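For illustration, assuming simple_tokenize is gensim.utils.simple_tokenize: two surface variants that tokenize to the same tuple count as duplicates and are kept only once.

from gensim.utils import simple_tokenize  # assumed source of simple_tokenize

print(tuple(simple_tokenize("Hello, world!")))  # ('Hello', 'world')
print(tuple(simple_tokenize("Hello world")))    # ('Hello', 'world')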
Example #3
    def most_common_words(self):
        """
        Find the most common words in the procurement tender descriptions,
        then return the top 12.
        :return: list of str
        """
        # Check whether the result has already been computed (cached).
        if self.cache_most_common_words is None:
            word_counts = {}
            for p in data_holder.procurements:
                for token in simple_tokenize(p.tender_description.lower()):
                    if token in gensim.parsing.preprocessing.STOPWORDS:
                        continue

                    word_counts.setdefault(token, 0)
                    word_counts[token] += 1

            # Rank tokens by frequency, most common first.
            sorted_keys = sorted(word_counts.keys(),
                                 key=word_counts.get,
                                 reverse=True)
            self.cache_most_common_words = sorted_keys[:12]

        # Return the cached result.
        return self.cache_most_common_words
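The counting-and-ranking step above can also be written with collections.Counter. A compact sketch under the same assumptions (gensim's STOPWORDS list, simple_tokenize from gensim.utils, top 12 words), with the caching and data_holder access left out:

from collections import Counter

import gensim.parsing.preprocessing
from gensim.utils import simple_tokenize  # assumed source of simple_tokenize

def top_words(descriptions, n=12):
    # Count tokens across all descriptions, skipping stopwords,
    # and return the n most frequent ones.
    counts = Counter(
        token
        for description in descriptions
        for token in simple_tokenize(description.lower())
        if token not in gensim.parsing.preprocessing.STOPWORDS
    )
    return [word for word, _ in counts.most_common(n)]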
Example #4
def tokenizer(string: str):
    # Materialize the token stream produced by simple_tokenize into a list.
    return list(simple_tokenize(string))
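A quick usage sketch, assuming simple_tokenize is gensim.utils.simple_tokenize:

from gensim.utils import simple_tokenize  # assumed source of simple_tokenize

print(tokenizer("Tokenize this, please!"))
# ['Tokenize', 'this', 'please']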
Example #5
def review2idx(review):
    # word2idx maps a token to its vocabulary index (defined in Example #6 below).
    return [word2idx(word) for word in simple_tokenize(review)]
Example #6
import glob
import json
import os
from collections import Counter
from random import shuffle

from gensim.utils import simple_tokenize  # assumed source of simple_tokenize


def imdb_preprocess():
    imdb_dir = './data/aclImdb'
    subdirs = [
        'train/neg',
        'train/pos',
        'test/neg',
        'test/pos'
    ]

    # Load reviews into memory
    reviews = dict()
    for subdir in subdirs:
        reviews[subdir] = []
        working_dir = os.path.join(imdb_dir, subdir)
        for filepath in glob.glob(working_dir+"/*"):
            with open(filepath, 'r', encoding='utf-8') as f:
                reviews[subdir].append(f.read())

    # Create vocabulary
    vocab_counts = Counter()

    for review_set in [reviews['train/neg'], reviews['train/pos']]:
        for review in review_set:
            vocab_counts.update(simple_tokenize(review))

    # Keep words that occur more than twice and assign indices starting at 2;
    # 0 and 1 are reserved for padding and unknown words.
    word_counts = vocab_counts.most_common()
    word_counts = [pair for pair in word_counts if pair[1] > 2]
    word_index = {
        word: i + 2 for i, (word, _) in enumerate(word_counts)
    }
    word_index["<EMPTY>"] = 0
    word_index["<UNKNOWN>"] = 1

    with open('./data/word-index.json', 'w') as f:
        json.dump(word_index, f)

    # Encode reviews using word_index
    def word2idx(word):
        # Map out-of-vocabulary words to the <UNKNOWN> index (1).
        return word_index.get(word, 1)

    def review2idx(review):
        return [word2idx(word) for word in simple_tokenize(review)]

    transformed_reviews = dict()

    for subdir in subdirs:
        transformed_reviews[subdir] = []
        for review in reviews[subdir]:
            transformed_reviews[subdir].append(review2idx(review))

    # Create train, validation and test datasets
    train = [(r, 1) for r in transformed_reviews['train/pos']]
    train += [(r, 0) for r in transformed_reviews['train/neg']]
    shuffle(train)

    test = [(r, 1) for r in transformed_reviews['test/pos']]
    test += [(r, 0) for r in transformed_reviews['test/neg']]
    shuffle(test)

    with open('./data/imdb-reviews.json', 'w') as f:
        json.dump(
            {
                'train': train,
                'test': test
            },
            f
        )
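imdb_preprocess only writes its artifacts to disk. A minimal sketch of how downstream code might load them back, assuming the same ./data paths as above:

import json

with open('./data/word-index.json') as f:
    word_index = json.load(f)

with open('./data/imdb-reviews.json') as f:
    data = json.load(f)

# Each entry is a (token-index list, label) pair; split into features and labels.
train_reviews = [review for review, label in data['train']]
train_labels = [label for review, label in data['train']]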