import os
import pickle
import re
from ast import literal_eval
from random import randint

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from nltk import pos_tag, sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import (accuracy_score, f1_score, hamming_loss,
                             jaccard_score, label_ranking_loss)
from sklearn.model_selection import KFold
from skmultilearn.problem_transform import BinaryRelevance

# Project-local helpers used below (QuickDataFrame, Progresser, tokenise,
# get_wordnet_pos, TextCNN) must be importable from the project's own modules;
# their exact import paths are not shown in this file.


def create_sentence_files():
    stop_words = set(pd.read_csv('./Primary_data/PersianStopWordList.txt', header=None)[0])
    questions = QuickDataFrame.read_csv('./Primary_data/result_filtered.csv', sep=';')
    topics = QuickDataFrame.read_csv('./Primary_data/topic_vector_Q.csv')

    # one positive and one negative sentence file per topic
    files = dict()
    for tpc in topics.cols:
        files[tpc + '-p'] = open('./Primary_data/sent_topic/' + tpc + '.p', 'w', encoding='utf-8')
        files[tpc + '-n'] = open('./Primary_data/sent_topic/' + tpc + '.n', 'w', encoding='utf-8')

    prog = Progresser(len(questions['sentence']))
    # build the train data
    for i, qrow in enumerate(questions['sentence']):
        prog.count()
        snt = []
        for word in tokenise(qrow):
            if word not in stop_words:
                snt.append(word)
        snt = ' '.join(snt)

        for tpc in topics.cols:
            if topics[tpc][i] == '0':
                files[tpc + '-n'].write(snt + '\n')
            elif topics[tpc][i] == '1':
                files[tpc + '-p'].write(snt + '\n')
            else:
                print('Unexpected topic value at row', i, ':', topics[tpc][i])

    for fl in files.values():
        fl.close()
def find_frequent_words():
    data = pd.read_csv('./StackExchange_data/all_data.csv')
    words = dict()
    lemmatiser = WordNetLemmatizer()

    stop_words = set()
    for w in stopwords.words('english'):
        stop_words.add(w)

    # resume from a previous partial run:
    # with open('./StackExchange_data/words50000.csv', 'r') as infile:
    #     for line in infile:
    #         w, _, f = line.partition(',')
    #         words[w] = int(f)

    p = Progresser(data.shape[0])
    cleaner = re.compile(r'^\s*-*|-\s*$')
    for i, row in data.iterrows():
        # if i <= 50000:
        #     continue
        p.show_progress(i)

        tokens_pos = pos_tag(word_tokenize(row['body']))
        for word_pos in tokens_pos:
            if len(word_pos[0]) < 2:
                continue
            word = word_pos[0].lower()
            word = re.sub(cleaner, '', word)
            if word in stop_words:
                continue
            if len(word) > 2:
                word = lemmatiser.lemmatize(word=word, pos=get_wordnet_pos(word_pos[1]))
            if word not in stop_words:
                if word in words:
                    words[word] += 1
                else:
                    words[word] = 1

        # periodically dump intermediate counts
        if i % 5000 == 0:
            with open('./StackExchange_data/words' + str(i) + '.csv', 'w') as outfile:
                for word, freq in words.items():
                    outfile.write(word + ',' + str(freq) + '\n')

    sorted_words = sorted(words, key=lambda x: words[x], reverse=True)

    with open('./StackExchange_data/words_frequency.csv', 'w') as outfile:
        for word in sorted_words:
            try:
                outfile.write(str(word) + ',' + str(words[word]) + '\n')
            except Exception:
                pass  # skip words that cannot be written (e.g. encoding issues)

    with open('./StackExchange_data/1000words.csv', 'w') as outfile:
        for word in sorted_words[:1000]:
            outfile.write(str(word) + '\n')
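# get_wordnet_pos() above is a project-local helper; a common implementation (shown
# here only as a hedged sketch of the assumed behaviour, not the project's own code)
# maps Penn Treebank POS tags to the WordNet constants that WordNetLemmatizer expects:
#
#   from nltk.corpus import wordnet
#
#   def get_wordnet_pos(treebank_tag):
#       if treebank_tag.startswith('J'):
#           return wordnet.ADJ
#       if treebank_tag.startswith('V'):
#           return wordnet.VERB
#       if treebank_tag.startswith('R'):
#           return wordnet.ADV
#       return wordnet.NOUN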
def create_w2v_vectors():
    # with open('./word2vec/IRBlog/w2v_per_300.pkl', 'rb') as infile:
    with open('./word2vec/Mixed/w2v_per.pkl', 'rb') as infile:
        w2v = pickle.load(infile)
    w2v_length = 100  # 300

    stop_words = set(pd.read_csv('./Primary_data/PersianStopWordList.txt', header=None)[0])
    questions = pd.read_csv('./Primary_data/result_filtered.csv', delimiter=';')

    train = QuickDataFrame(['w' + str(i) for i in range(0, w2v_length)])

    prog = Progresser(questions.shape[0])
    # build the train data
    for i, qrow in questions.iterrows():
        prog.count()
        sum_array = np.zeros(w2v_length)
        number_of_words = 0
        for word in tokenise(qrow['sentence']):
            if word not in stop_words and word in w2v:
                number_of_words += 1
                sum_array += w2v[word]
        if i != len(train):
            print('Row/index mismatch at row', i)
        # guard against sentences with no known words (the original divided unconditionally)
        if number_of_words > 0:
            sum_array = sum_array / number_of_words
        train.append(list(sum_array))

    train.to_csv('./Primary_data/w2v-100_vector_Q.csv')
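# Note (assumption, not from the original source): the pickled `w2v` object is only
# used via `word in w2v` and `w2v[word]`, so any dict-like mapping from a token to a
# numpy vector of length `w2v_length` will do. One possible way to build such a pickle
# from a trained gensim model (`sentences` is a placeholder for the training corpus):
#
#   from gensim.models import Word2Vec
#   model = Word2Vec(sentences, vector_size=100)
#   w2v = {word: model.wv[word] for word in model.wv.index_to_key}
#   with open('./word2vec/Mixed/w2v_per.pkl', 'wb') as out:
#       pickle.dump(w2v, out)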
def lemmatise_all():
    id_mappings = pd.read_csv('./EurLex_data/eurlex_ID_mappings.csv', sep='\t')

    lemmatiser = WordNetLemmatizer()
    stop_words = set()
    for w in stopwords.words('english'):
        stop_words.add(w)
    cleaner = re.compile(r'^\s*-*|-\s*$')

    prog = Progresser(id_mappings.shape[0])
    for i, row in id_mappings.iterrows():
        prog.count()
        try:
            # if file already processed then continue
            if os.path.isfile('./EurLex_data/lem_txt/' + str(row['DocID']) + '-lem.txt'):
                continue
            try:
                with open('./EurLex_data/eurlex_txt/' + str(row['DocID']) + '.txt', 'r',
                          encoding="utf8") as infile:
                    raw_text = infile.read()
            except IOError:
                continue

            lemmatised_doc = ''
            # lemmatise each sentence
            for sent in sent_tokenize(raw_text):
                lemmatised_sent = ''
                tokens_pos = pos_tag(word_tokenize(sent))
                # lemmatise each word in sentence
                for word_pos in tokens_pos:
                    if len(word_pos[0]) < 2:
                        continue
                    word = word_pos[0].lower()
                    word = re.sub(cleaner, '', word)
                    if word in stop_words:
                        continue
                    if len(word) > 2:
                        word = lemmatiser.lemmatize(word=word, pos=get_wordnet_pos(word_pos[1]))
                    if word in stop_words:
                        continue
                    lemmatised_sent += word + ' '
                lemmatised_doc += lemmatised_sent + '\n'

            # write doc to file
            with open('./EurLex_data/lem_txt/' + str(row['DocID']) + '-lem.txt', 'w',
                      encoding="utf8") as outfile:
                outfile.write(lemmatised_doc)
        except Exception as e:
            print(e)
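# Environment note (assumption, not from the original source): the NLTK tokenisers,
# tagger, lemmatiser and stop-word list used above need their data packages to be
# downloaded once per machine, e.g.:
#
#   import nltk
#   nltk.download('punkt')
#   nltk.download('averaged_perceptron_tagger')
#   nltk.download('wordnet')
#   nltk.download('stopwords')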
def inter_cross_validation(x, y, algs, k=10):
    print('ytrain.shape:', y.shape)

    # find a k-fold split in which every label has both classes present in each training fold
    count = 0
    while True:
        count += 1
        kf = list(KFold(n_splits=k, shuffle=True, random_state=randint(0, 100000)).split(x))
        good_folds = True
        for train_index, test_index in kf:
            for i in range(len(y[0])):
                if len(np.unique(y[train_index, i])) < 2:
                    print(i)
                    good_folds = False
                    break
            if not good_folds:
                break
        if good_folds:
            break

    fold_num = 0
    f1scr = {alg: [] for alg in algs.keys()}
    prog = Progresser(k)

    for train_index, test_index in kf:
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]

        for alg_name, alg_cls in algs.items():
            topic_classifier = BinaryRelevance(classifier=alg_cls, require_dense=[True, True])
            try:
                topic_classifier.fit(x_train, y_train)
            except Exception as e:
                print('\nfit error!:', e, alg_name)
                continue

            try:
                predictions = topic_classifier.predict(x_test)
                f1scr[alg_name].append(f1_score(y_test, predictions, average='macro'))
                print('--', alg_name, f1scr[alg_name])
            except Exception as e:
                print('Eval error!:', e)

        fold_num += 1
        prog.count()

    # pick the algorithm with the best mean macro-F1 across folds
    best_alg = ''
    best_score = 0
    for alg_name, score in f1scr.items():
        mean_score = np.mean(score)
        if mean_score > best_score:
            best_alg = alg_name
            best_score = mean_score

    print(best_alg, best_score, '+-', np.std(f1scr[best_alg]))
    return best_alg
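# Usage sketch (illustrative only; not part of the original code): `x` is a feature
# matrix, `y` a binary indicator matrix of shape (n_samples, n_labels), and `algs`
# maps a display name to an unfitted scikit-learn classifier, e.g.:
#
#   from sklearn.naive_bayes import GaussianNB
#   from sklearn.svm import SVC
#
#   best = inter_cross_validation(x, y, {'linear-SVC': SVC(kernel='linear'),
#                                        'GaussianNB': GaussianNB()}, k=10)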
def find_frequent_words():
    id_mappings = QuickDataFrame.read_csv('./EurLex_data/eurlex_ID_mappings.csv', sep='\t')
    words = dict()

    prog = Progresser(len(id_mappings))
    for i in range(len(id_mappings)):
        prog.count()
        try:
            # read the file
            try:
                with open('./EurLex_data/lem_txt/' + str(id_mappings['DocID'][i]) + '-lem.txt', 'r',
                          encoding="utf8") as infile:
                    doc_text = infile.read()
            except IOError:
                continue

            # count the words
            for word in word_tokenize(doc_text):
                if word in words:
                    words[word] += 1
                else:
                    words[word] = 1
        except Exception as e:
            print(e)

    # remove bad words: keep 19xx/2xxx year tokens and tokens that do not contain digits
    cleaner = re.compile(r'^(19\d\d)$|^(2\d\d\d)$|^((?!\d)\w)*$')
    filtered_words = dict()
    for word, freq in words.items():
        if cleaner.match(word):
            filtered_words[word] = freq

    sorted_words = sorted(filtered_words, key=lambda x: filtered_words[x], reverse=True)

    with open('./EurLex_data/words_frequency.csv', 'w', encoding="utf8") as outfile:
        for word in sorted_words:
            try:
                outfile.write(str(word) + ',' + str(words[word]) + '\n')
            except Exception as e:
                print(word, e)

    with open('./EurLex_data/1000words.csv', 'w', encoding="utf8") as outfile:
        for word in sorted_words[:1000]:
            outfile.write(str(word) + '\n')
def process_html_files():
    id_mappings = pd.read_csv('./EurLex_data/eurlex_ID_mappings.csv', sep='\t')

    no_file_num = 0
    no_en_num = 0
    no_texte_num = 0

    prog = Progresser(id_mappings.shape[0])
    for i, row in id_mappings.iterrows():
        prog.count()
        try:
            # if file already processed then continue
            if os.path.isfile('./EurLex_data/eurlex_txt/' + str(row['DocID']) + '.txt'):
                continue

            # read the html
            with open('./EurLex_data/eurlex_html_EN_NOT/' + row['Filename'].replace(':', '_'), 'r',
                      encoding="utf8") as infile:
                html = infile.read()

            # extract raw text
            soup = BeautifulSoup(html, 'html.parser')
            elem = soup.findAll('div', {'class': 'texte'})
            if len(elem) == 0:
                no_texte_num += 1
                continue
            raw_text = elem[0].text.strip()
            if raw_text.startswith('/* There is no English') or raw_text == '':
                no_en_num += 1
                continue

            # write the text into a new file
            with open('./EurLex_data/eurlex_txt/' + str(row['DocID']) + '.txt', 'w',
                      encoding="utf8") as outfile:
                outfile.write(raw_text)
        except IOError:
            no_file_num += 1
        except Exception as e:
            print(e)

    print('NO texte:', no_texte_num)
    print('NO EN:', no_en_num)
    print('NO file:', no_file_num)
def build_w2v_vectors():
    with open('./word2vec/word2vec-En.pkl', 'rb') as infile:
        w2v = pickle.load(infile)
    w2v_length = 300

    stop_words = set()
    for w in stopwords.words('english'):
        stop_words.add(w)

    id_mappings = QuickDataFrame.read_csv('./EurLex_data/eurlex_ID_mappings.csv', sep='\t')

    # create DataFrame
    cols_list = ['doc_id'] + ['w' + str(i) for i in range(0, w2v_length)]
    train = QuickDataFrame(columns=cols_list)

    prog = Progresser(len(id_mappings))
    for i in range(len(id_mappings)):
        prog.count()
        # read the file
        try:
            with open('./EurLex_data/lem_txt/' + str(id_mappings['DocID'][i]) + '-lem.txt', 'r',
                      encoding="utf8") as infile:
                doc_text = infile.read()
        except IOError:
            continue

        try:
            sum_array = np.zeros(w2v_length)
            number_of_words = 0
            for word in word_tokenize(doc_text):
                if word not in stop_words and word in w2v:
                    number_of_words += 1
                    sum_array += w2v[word]
            if number_of_words > 0:
                sum_array = sum_array / number_of_words
            train.append([id_mappings['DocID'][i]] + list(sum_array))
        except Exception as e:
            print(e)

    train.to_csv('./EurLex_data/w2v_vector_Q.csv')
def build_tag_vectors():
    data = pd.read_csv('./StackExchange_data/all_data.csv')
    topics = pd.read_csv('./StackExchange_data/tags.csv')
    cols_list = list(topics['term']) + ['question_id']
    data = data.set_index('id')

    # question ids are assumed to be contiguous 0..n-1, so the id doubles as the row index
    train_arr = np.zeros((data.shape[0], len(cols_list)), dtype=np.int16)

    col_index = dict()
    for ind, col in enumerate(cols_list):
        col_index[col] = ind

    # build the train data
    print('Building train data...')
    p = Progresser(data.shape[0])
    for i, qrow in data.iterrows():
        p.show_progress(i)
        train_arr[i][col_index['question_id']] = i

        # set occurrence values (tags are stored as a Python list literal)
        row_tags = literal_eval(qrow['tag'])
        for tp in row_tags:
            try:
                train_arr[i][col_index[tp]] = 1
            except KeyError:
                # tag is not among the selected topic columns
                pass

    # write to file
    print('\nWriting to file...')
    with open('./StackExchange_data/data_tags.csv', 'w') as outfile:
        for ind, col in enumerate(cols_list):
            outfile.write(col)
            if ind == len(cols_list) - 1:
                outfile.write('\n')
            else:
                outfile.write(',')

        p = Progresser(data.shape[0])
        line_num = 0
        line_len = len(train_arr[0])
        for line in train_arr:
            p.show_progress(line_num)
            line_num += 1
            for i in range(0, line_len):
                outfile.write(str(line[i]))
                if i == line_len - 1:
                    outfile.write('\n')
                else:
                    outfile.write(',')
def build_all_vectors():
    id_mappings = QuickDataFrame.read_csv('./EurLex_data/eurlex_ID_mappings.csv', sep='\t')
    subject_data = QuickDataFrame.read_csv('./EurLex_data/eurlex_id2class/id2class_eurlex_subject_matter.qrels',
                                           header=False, columns=['sub', 'doc_id', 'col2'], sep=' ')
    words_vector = QuickDataFrame.read_csv('./EurLex_data/1000words.csv', header=False, columns=['term'])
    topics = QuickDataFrame.read_csv('./EurLex_data/tags.csv')

    # train = QuickDataFrame.read_csv('./EurLex_data/w2v_vector_Q.csv')
    # train.set_index(train['doc_id'], unique=True)

    # create DataFrame
    cols_list = ['doc_id'] + list(words_vector['term'])
    train = QuickDataFrame(columns=cols_list)

    # filling word columns
    prog = Progresser(len(id_mappings))
    for i in range(len(id_mappings)):
        prog.count()
        try:
            # read the file
            try:
                with open('./EurLex_data/lem_txt/' + str(id_mappings['DocID'][i]) + '-lem.txt', 'r',
                          encoding="utf8") as infile:
                    doc_text = infile.read()
            except IOError:
                continue

            # add a new row
            train.append(value=0)
            # complete the data in that row
            train['doc_id'][len(train) - 1] = id_mappings['DocID'][i]
            for word in word_tokenize(doc_text):
                if word in train.data:
                    train[word][len(train) - 1] = 1
        except Exception as e:
            print(e)

    # index by doc id
    train.set_index(train['doc_id'], unique=True)

    # rename word columns
    rename_dict = dict()
    index = 0
    for wrd in list(words_vector['term']):
        rename_dict[wrd] = 'wrd' + str(index)
        index += 1
    train.rename(columns=rename_dict)

    # add topic columns
    for col in list(topics['term']):
        train.add_column(name=col, value=0)

    # filling topic columns
    for i in range(len(subject_data)):
        try:
            sub = subject_data['sub'][i]
            doc_id = subject_data['doc_id'][i]
            train[sub, doc_id] = 1
        except Exception as e:
            print(e)

    # rename topic columns
    rename_dict = dict()
    index = 0
    for tpc in list(topics['term']):
        rename_dict[tpc] = 'tpc' + str(index)
        index += 1
    train.rename(columns=rename_dict)

    # write to file
    print('\nWriting to file...')
    # train.to_csv('./EurLex_data/eurlex_combined_vectors.csv')
    train.to_csv('./EurLex_data/eurlex_combined_vectors-w2v.csv')
def evaluate_model_cnn(x, y, learn_path, k=10):
    print(len(y), len(y[0]))

    # find a k-fold split in which every label has both classes present in each training fold
    count = 0
    while True:
        count += 1
        # print(count, 'Finding a proper KF...')
        kf = list(KFold(n_splits=k, shuffle=True, random_state=randint(0, 100000)).split(x))
        good_folds = True
        for train_index, test_index in kf:
            for i in range(len(y[0])):
                if len(np.unique(y[train_index, i])) < 2:  # or len(np.unique(y[test_index, i])) < 2:
                    # print(y[train_index, i], np.unique(y[train_index, i]))
                    print(i)
                    good_folds = False
                    break
            if not good_folds:
                break
        if good_folds:
            break
    print('Found a good KF in', count, 'tries!')

    with open(learn_path + 'topic_classifier-folds.pkl', 'wb') as out_file:
        pickle.dump(kf, out_file)

    fold_num = 0
    stats = QuickDataFrame(['Jaccard (normalised)', 'Accuracy (normalised)', 'Accuracy',
                            'F1_score (micro averaged)', 'F1_score (macro averaged by labels)',
                            'F1_score (averaged by samples)', 'Hamming loss', 'Label Ranking loss'])

    # one binary text CNN per label
    txt_cnns = [TextCNN() for _ in range(y.shape[1])]

    prog = Progresser(k)
    for train_index, test_index in kf:
        # print(train_index, test_index)
        print('___________________________________________________')
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # cls = SVC(kernel='linear')
        # cls = SVC(kernel='poly', probability=True, tol=1e-5)
        # cls = GaussianNB()
        # cls = RandomForestClassifier(max_features='auto', random_state=1)
        # topic_classifier = BinaryRelevance(classifier=cls, require_dense=[True, True])
        # try:
        #     topic_classifier.fit(x_train, y_train)
        for i in range(y.shape[1]):
            txt_cnns[i].train_cnn(train_index, i)
        # except Exception as e:
        #     print('\nfit error!:', e)
        #     continue

        # with open(learn_path + 'topic_classifier-SVC' + str(fold_num) + '.pkl', 'wb') as out_file:
        #     pickle.dump(topic_classifier, out_file)

        # try:
        predictions = np.zeros((len(x_test), y.shape[1]))
        for i in range(y.shape[1]):
            predictions[:, i] = np.array(txt_cnns[i].predict_text(test_index))

        s = [jaccard_score(y_test, predictions, average='micro'),
             accuracy_score(y_test, predictions, normalize=True),
             accuracy_score(y_test, predictions, normalize=False),
             f1_score(y_test, predictions, average='micro'),
             f1_score(y_test, predictions, average='macro'),
             f1_score(y_test, predictions, average='samples'),
             hamming_loss(y_test, predictions),
             label_ranking_loss(y_test, predictions)]
        stats.append(s)
        print(stats[stats.length - 1])
        # except Exception as e:
        #     print('Eval error!:', e)

        fold_num += 1
        prog.count()

    for col in stats.cols:
        print(col, np.mean(stats[col]))
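# Note (assumption, not from the original source): each TextCNN instance is expected to
# hold the full dataset internally, since train_cnn()/predict_text() only receive row
# indices plus a label index. A minimal stand-in with the same interface could look like
# this (purely illustrative, not the project's actual TextCNN):
#
#   class TextCNN:
#       def __init__(self, texts=None, labels=None):
#           self.texts, self.labels = texts, labels
#
#       def train_cnn(self, train_index, label_idx):
#           pass  # fit a single-label text classifier on self.texts[train_index]
#
#       def predict_text(self, test_index):
#           return [0] * len(test_index)  # placeholder predictions for that label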
def build_word_vectors():
    # data = pd.read_csv('./StackExchange_data/all_data.csv')
    data = pd.read_csv('./StackExchange_data/all_data-lemmatised.csv')
    words_vector = pd.read_csv('./StackExchange_data/1000words.csv', header=None, names=['term'])
    data = data.set_index(data['id'])

    # create DataFrame
    cols_list = list(words_vector['term']) + ['question_id']
    train = pd.DataFrame(dtype=object, columns=cols_list)

    # # lemmatise questions
    # lemmatiser = WordNetLemmatizer()
    # data['lem_body'] = ''
    # cleaner = re.compile(r'^\s*-*|-\s*$')
    # p = Progresser(data.shape[0])
    # for i, qrow in data.iterrows():
    #     p.show_progress(i)
    #
    #     question = ''
    #     tokens_pos = pos_tag(word_tokenize(qrow['body']))
    #     for word_pos in tokens_pos:
    #         word = word_pos[0].lower()
    #         word = re.sub(cleaner, '', word)
    #         if len(word) > 2:
    #             word = lemmatiser.lemmatize(word=word, pos=get_wordnet_pos(word_pos[1]))
    #         question += word + ' '
    #     data.loc[i, 'lem_body'] = question
    #     if i % 2000 == 0:
    #         data.to_csv('./StackExchange_data/all_data-lemmatised' + str(i) + '.csv', index=False)
    #
    # data.to_csv('./StackExchange_data/all_data-lemmatised.csv', index=False)
    # print('data lemmatised')

    # question ids are assumed to be contiguous 0..n-1, so the id doubles as the row index
    train_arr = np.zeros((data.shape[0], len(cols_list)), dtype=np.int16)

    col_index = dict()
    for ind, col in enumerate(cols_list):
        col_index[col] = ind

    # build the train data
    print('Building train data...')
    p = Progresser(data.shape[0])
    for i, qrow in data.iterrows():
        p.show_progress(i)
        train_arr[i][col_index['question_id']] = i

        # set occurrence values
        for word in qrow['lem_body'].split():
            if word in col_index:
                train_arr[i][col_index[word]] = 1

    # write to file
    print('\nWriting to file...')
    with open('./StackExchange_data/data_1000word.csv', 'w') as outfile:
        for ind, col in enumerate(cols_list):
            outfile.write(col)
            if ind == len(cols_list) - 1:
                outfile.write('\n')
            else:
                outfile.write(',')

        p = Progresser(data.shape[0])
        line_num = 0
        line_len = len(train_arr[0])
        for line in train_arr:
            p.show_progress(line_num)
            line_num += 1
            for i in range(0, line_len):
                outfile.write(str(line[i]))
                if i == line_len - 1:
                    outfile.write('\n')
                else:
                    outfile.write(',')