import numpy as np
from nltk.corpus import reuters


def dump_reuters():
    """Dump each Reuters category to its own text file, keeping only
    paragraphs longer than 50 tokens, and write a per-category summary."""
    nb_total = 0
    synth_file = open('data_set\\synthese_reuters.txt', 'w')
    for cat in reuters.categories():
        # cat = 'housing'  # debug: pin a single category
        # dtype=object is needed: paragraphs are ragged nested lists.
        text_arr = np.unique(np.array(reuters.paras(categories=[cat]), dtype=object))
        file_object = open('data_set\\reuters_nltk\\' + cat + '.txt', 'w')
        nb_paraph = 0
        for p in range(text_arr.shape[0]):
            # Token count summed over the paragraph's sentences.
            len_para = sum(len(sent) for sent in text_arr[p])
            if len_para > 50:
                paragraph = ''
                for i in range(len(text_arr[p])):
                    paragraph += ' '.join(text_arr[p][i])
                file_object.write(paragraph)
                file_object.write('\n')
                nb_paraph += 1
        file_object.close()
        synth_file.write('category ' + cat + ' : ' + str(nb_paraph) + '\n')
        nb_total += nb_paraph
    synth_file.write('Total : ' + str(nb_total))
    synth_file.close()
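# A minimal usage sketch (an assumption, not part of the original): the NLTK
# Reuters corpus must be downloaded once and the output directory must exist
# before dump_reuters() is called.
import os

import nltk

nltk.download('reuters')
os.makedirs('data_set\\reuters_nltk', exist_ok=True)
dump_reuters()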
from gensim.models import Word2Vec
from nltk.corpus import brown, gutenberg, reuters


def train():
    """Train Word2Vec on all Brown + Gutenberg + Reuters paragraphs.

    `normalize_tokens` and `model_path` are defined elsewhere in the module.
    """
    paras = brown.paras() + gutenberg.paras() + reuters.paras()
    total = len(paras)
    texts = []
    for i, para in enumerate(paras):
        if i % 1000 == 0:
            print(i, total)  # progress
        # Flatten the paragraph (a list of tokenised sentences) into a string.
        content = ' '.join(' '.join(sent) for sent in para)
        texts.append(normalize_tokens(content))
    # NOTE: `size` is the gensim < 4.0 name; gensim >= 4.0 calls it `vector_size`.
    w2v = Word2Vec(texts, size=100, window=5, min_count=5, workers=4)
    w2v.save(model_path)
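# A minimal sketch of querying the saved model, assuming `model_path` is the
# file written by train() above and that 'market' survives normalize_tokens
# and the min_count cutoff (both assumptions).
from gensim.models import Word2Vec

w2v = Word2Vec.load(model_path)
print(w2v.wv.most_similar('market', topn=5))  # 5 nearest neighbours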
import pickle

from nltk.corpus import brown, gutenberg, reuters
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer


def train():
    """Build an LSA model: term-document counts reduced to 100 dimensions.

    `normalize_tokens`, `model_path` and `vocab_path` are defined elsewhere.
    """
    paras = brown.paras() + gutenberg.paras() + reuters.paras()
    total = len(paras)
    texts = []
    for i, para in enumerate(paras):
        if i % 1000 == 0:
            print(i, total)  # progress
        content = ' '.join(' '.join(sent) for sent in para)
        texts.append(' '.join(normalize_tokens(content)))
    transformer = CountVectorizer()
    tf = transformer.fit_transform(texts)  # documents x terms
    svd = TruncatedSVD(n_components=100)
    # Transpose to terms x documents so each row of `lsa` is a term vector.
    lsa = svd.fit_transform(tf.T)
    lsa.dump(model_path)  # ndarray.dump pickles the array to the given path
    pickle.dump(transformer.vocabulary_, open(vocab_path, 'wb'))
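# A sketch of reading a term vector back, assuming `model_path` and
# `vocab_path` are the files written by train() above; row i of the matrix is
# the 100-d LSA vector of the term with index i in the vocabulary.
import pickle

import numpy as np

lsa = np.load(model_path, allow_pickle=True)
vocab = pickle.load(open(vocab_path, 'rb'))
print(lsa[vocab['market']].shape)  # (100,), assuming 'market' was fitted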
from operator import itemgetter

from nltk.corpus import reuters


def get_most_populous_categs(self):
    """Return the names of the 10 Reuters categories with the most paragraphs."""
    cats = reuters.categories()
    categ_dict = {}
    total_multi = 0  # total paragraph count, with multi-label overlap
    for c in cats:
        lcat = len(reuters.paras(categories=[c]))
        total_multi += lcat
        categ_dict[c] = lcat
    most_populous_categs = sorted(categ_dict.items(), key=itemgetter(1),
                                  reverse=True)
    # Keep the top 10 categories.
    top_10_populous_categs = most_populous_categs[:10]
    top_10_populous_categs_names = [name for name, _ in top_10_populous_categs]
    return top_10_populous_categs_names
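# A quick usage sketch: the method never touches `self`, so it can be called
# unbound with a placeholder instance (an assumption about the original class).
print(get_most_populous_categs(None))  # e.g. ['earn', 'acq', ...]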
import csv
import pickle

import numpy as np
from nltk.corpus import brown, gutenberg, reuters
from sklearn.feature_extraction.text import CountVectorizer


def train():
    """Build a co-occurrence matrix: test-set words x corpus vocabulary.

    `normalize_tokens`, `stemmer`, `window_size`, `global_truth_path`,
    `model_path`, `vocab_path` and `test_vocab_path` are module globals.
    """
    paras = brown.paras() + gutenberg.paras() + reuters.paras()
    total = len(paras)
    texts = []
    for i, para in enumerate(paras):
        if i % 1000 == 0:
            print(i, total)  # progress
        content = ' '.join(' '.join(sent) for sent in para)
        texts.append(' '.join(normalize_tokens(content)))
    transformer = CountVectorizer()
    tf = transformer.fit_transform(texts)  # fitted only for its vocabulary_

    # Collect the stemmed words that appear in the similarity ground truth.
    test_vocab = set()
    reader = csv.reader(open(global_truth_path))
    for line in reader:
        w1, w2, score = line
        test_vocab.add(stemmer.stem(w1))
        test_vocab.add(stemmer.stem(w2))
    test_vocab = {k: v for v, k in enumerate(test_vocab)}

    model = np.zeros((len(test_vocab), len(transformer.vocabulary_)))
    for text in texts:
        text = text.split()
        for i in range(len(text)):
            if text[i] not in test_vocab:
                continue
            # BUG FIX: the original looped over the 2-tuple
            # (i - window_size, i + window_size + 1), so only the two window
            # endpoints were counted; a sliding window needs range(...), and
            # the centre word itself (j == i) should be skipped.
            for j in range(i - window_size, i + window_size + 1):
                if j < 0 or j >= len(text) or j == i:
                    continue
                if text[j] not in transformer.vocabulary_:
                    continue
                model[test_vocab[text[i]]][transformer.vocabulary_[text[j]]] += 1
    model.dump(model_path)
    pickle.dump(transformer.vocabulary_, open(vocab_path, 'wb'))
    pickle.dump(test_vocab, open(test_vocab_path, 'wb'))
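# A sketch of scoring word similarity with the matrix, assuming `stemmer`,
# `model_path` and `test_vocab_path` are the objects/paths used in train(),
# and that 'money' and 'cash' occur in the ground-truth file (illustrative
# word choice). Each row is a context-count vector, so cosine similarity
# between rows is a natural similarity measure.
import pickle

import numpy as np

model = np.load(model_path, allow_pickle=True)
test_vocab = pickle.load(open(test_vocab_path, 'rb'))


def cosine(u, v):
    denom = np.linalg.norm(u) * np.linalg.norm(v)
    return float(u @ v) / denom if denom else 0.0  # guard all-zero rows


print(cosine(model[test_vocab[stemmer.stem('money')]],
             model[test_vocab[stemmer.stem('cash')]]))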
import os

from nltk.corpus import reuters

# Fragment: `category` and `parseDoc` are defined elsewhere in the original
# script. Splits the documents of one Reuters category into per-document
# files under Training/ and Test/ folders.
docs = reuters.fileids(categories=[category])
count = 0
docAmount = len(docs)
trainingPath = 'C:/MLprojekt/SSK/reuters/reuters/' + category + 'Training'
testPath = 'C:/MLprojekt/SSK/reuters/reuters/' + category + 'Test'
if not os.path.exists(trainingPath):
    os.makedirs(trainingPath)
if not os.path.exists(testPath):
    os.makedirs(testPath)
for doc in docs:
    count += 1
    # Reuters fileids look like 'test/NNNNN' or 'training/NNNNN'.
    if doc.find('test') != -1:
        name = doc.replace('test/', '')
        document = reuters.paras(fileids=[doc])
        parseDoc(document[0], name, testPath)
    else:
        name = doc.replace('training/', '')
        document = reuters.paras(fileids=[doc])
        parseDoc(document[0], name, trainingPath)
    print(docAmount - count, ' documents left')


# Parses a file, removing all stopwords, and saves it under the same name in
# the new folder. (Commented out and truncated in the original.)
#
# def parse(file, filename, newPath):
#     lines = file.read()
#     words = lines.split()
#     table = str.maketrans("", "", string.punctuation)
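# A hypothetical sketch of the parseDoc helper used above (it is not shown in
# this excerpt): flatten the tokenised paragraph, strip punctuation and
# stopwords, and write the words under the document's name. Every detail here
# is an assumption inferred from the surrounding code.
import os
import string

from nltk.corpus import stopwords


def parseDoc(paragraph, name, path):
    table = str.maketrans('', '', string.punctuation)
    stops = set(stopwords.words('english'))
    # `paragraph` is a list of sentences, each a list of token strings.
    words = [w.translate(table).lower() for sent in paragraph for w in sent]
    words = [w for w in words if w and w not in stops]
    with open(os.path.join(path, name), 'w') as f:
        f.write(' '.join(words))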