from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer


def get_tf_idf_model(citations=None):
    """Build a TF-IDF model over unigrams and bigrams of the citation texts."""
    if citations is None:
        # TextPreprocessor (project-local) is assumed to expose a dict-like
        # interface (citation id -> record) once preprocess() has run.
        citations = TextPreprocessor()
        citations.preprocess()
    # Each document is a citation's title concatenated with its abstract.
    documents = [
        citation['title'] + ' \n' + citation['abstract']
        for citation in list(citations.values())
    ]
    # Count unigrams and bigrams, then reweight the raw counts with TF-IDF.
    bigram_vectorizer = CountVectorizer(ngram_range=(1, 2))
    bigrams = bigram_vectorizer.fit_transform(documents)
    tfidf = TfidfTransformer().fit_transform(bigrams)
    return citations, bigram_vectorizer, tfidf
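A minimal usage sketch (the call pattern below is illustrative, not part of the original code): rank the first document's n-grams by their TF-IDF weight.

import numpy as np

citations, vectorizer, tfidf = get_tf_idf_model()

feature_names = vectorizer.get_feature_names_out()  # scikit-learn >= 1.0
doc_weights = tfidf[0].toarray().ravel()            # weights for the first document
for idx in np.argsort(doc_weights)[::-1][:10]:      # ten highest-weighted n-grams
    print(feature_names[idx], doc_weights[idx])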
from collections import Counter

import numpy as np


def build_vocabulary(sequences, max_words=None):
    """Map the max_words most frequent tokens to consecutive integer ids."""
    words = []
    for token_sequence in sequences:
        words.extend(token_sequence)
    word_counts = dict(Counter(words).most_common(max_words))
    most_common_words = list(word_counts.keys())
    word_ids = list(range(len(most_common_words)))
    vocabulary = dict(zip(most_common_words, word_ids))
    return vocabulary


# Read the raw tickets and stack the preprocessing decorators.
sentences = np.genfromtxt('./tickets_QIT.txt', dtype=str, delimiter='\n')
prep = TextPreprocessor(sentences)
prep = QITEmailBodyCleaner(prep)
prep = Tokenizer(prep, language='italian')
tokens = prep.preprocess()

# Encode tokens as integers; out-of-vocabulary tokens get a dedicated id.
vocabulary = build_vocabulary(tokens)
unknown_token_id = max(vocabulary.values()) + 1
prep = IntegerEncoder(prep, vocabulary, unknown_token_id)
prep = WordContextPairsGenerator(prep, window_length=2)
word_context_pairs = prep.preprocess()

# Save the (target, context) pairs as two parallel files of integer ids.
target_words = [tw for (tw, cw) in word_context_pairs]
context_words = [cw for (tw, cw) in word_context_pairs]
np.savetxt('target_words.txt', target_words, fmt='%d')
np.savetxt('context_words.txt', context_words, fmt='%d')
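For reference, a standalone sketch of the skip-gram pairing that WordContextPairsGenerator (project-local) is assumed to perform: every token becomes a target and is paired with each neighbour at most window_length positions away.

def word_context_pairs_sketch(sequence, window_length=2):
    """Illustrative stand-in for WordContextPairsGenerator's pairing logic."""
    pairs = []
    for i, target in enumerate(sequence):
        lo = max(0, i - window_length)
        hi = min(len(sequence), i + window_length + 1)
        pairs.extend((target, sequence[j]) for j in range(lo, hi) if j != i)
    return pairs

# On an already integer-encoded sentence:
# [(0, 1), (0, 2), (1, 0), (1, 2), (1, 3), (2, 0), (2, 1), (2, 3), (3, 1), (3, 2)]
print(word_context_pairs_sketch([0, 1, 2, 3]))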
import ast

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# prep, language and max_length are assumed to be defined earlier in the script.
prep = Tokenizer(prep, language)

# Load vocabulary; literal_eval parses the stored dict literal without
# the arbitrary-code-execution risk of eval()
with open('vocabulary_wikipedia', 'r') as vocabulary_file:
    vocabulary = ast.literal_eval(vocabulary_file.read())

# Add integer encoding decorator
unknown_token_id = max(vocabulary.values()) + 1
prep = IntegerEncoder(prep, vocabulary, unknown_token_id)

# Add padding decorator; the padding id is distinct from the unknown id
padding_token_id = max(vocabulary.values()) + 2
prep = Padder(prep, padding_token_id, max_length)

# Get final tokens
final_tokens = prep.preprocess()

# Load labels
labels = np.genfromtxt('../upsampled/y_QIT.txt', delimiter='\n',
                       dtype=str).reshape((-1, 1))

# Convert labels into one-hot dummies (sparse_output=False in scikit-learn >= 1.2)
enc = OneHotEncoder(sparse=False)
one_hot_labels = enc.fit_transform(labels)

# Split dataset into training and test data, stratifying on the labels
x_train, x_test, y_train, y_test = train_test_split(
    final_tokens, one_hot_labels, test_size=0.3, stratify=labels)
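A standalone sketch of the padding step Padder (project-local) is assumed to perform: truncate each sequence to max_length and right-pad shorter ones with padding_token_id, so every example has the same length.

def pad_sequence_sketch(token_ids, padding_token_id, max_length):
    """Illustrative stand-in for Padder's per-sequence behaviour."""
    padded = list(token_ids[:max_length])
    padded += [padding_token_id] * (max_length - len(padded))
    return padded

# e.g. pad_sequence_sketch([5, 8, 13], padding_token_id=100, max_length=5)
# -> [5, 8, 13, 100, 100]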