def further_preprocessing_phase(temp_data_frame):
    """Tokenize patent texts, prune the vocabulary, and rebuild the frame.

    Tokenizes the ``text`` column (missing texts become ``''``), builds a
    gensim ``Dictionary`` over the token lists, removes tokens whose
    corpus-wide frequency is <= 5 plus the 30 most frequent tokens, then
    delegates to ``shrink_vocabulary`` to rebuild each row against the
    reduced vocabulary.

    Parameters
    ----------
    temp_data_frame : pd.DataFrame
        Must contain ``text`` (raw strings or None) and ``patent_id``
        columns; ``text`` is mutated in place to token lists.

    Returns
    -------
    pd.DataFrame
        New frame with columns ``patent_id``, ``text``, ``classification``,
        indexed by ``patent_id``, with rows collected in ``ids_list`` by
        ``shrink_vocabulary`` dropped.
    """
    # Tokenize up front; replace None with '' so every cell is iterable.
    temp_data_frame['text'] = temp_data_frame['text'].apply(
        lambda text: th.tokenize_text(text) if text is not None else ''
    )
    # NOTE(review): empty texts may raise downstream in Dictionary/doc2bow.
    textlist = temp_data_frame['text'].tolist()

    patent_dictionary = Dictionary(textlist)
    corpus = [patent_dictionary.doc2bow(text) for text in textlist]
    print('original dictionary size: ', len(patent_dictionary))

    # Aggregate corpus-wide term frequencies. doc2bow already yields
    # (token_id, count) pairs, so no dict() round-trip is needed.
    vocab_tf = {}
    for bow in corpus:
        for token_id, count in bow:
            vocab_tf[token_id] = vocab_tf.get(token_id, 0) + int(count)

    # Tokens whose total frequency is <= 5 are dropped from the dictionary.
    remove_ids = [token_id for token_id, total in vocab_tf.items() if total <= 5]
    patent_dictionary.filter_tokens(bad_ids=remove_ids)
    # no_below=0 keeps everything; only the explicit removals above apply.
    patent_dictionary.filter_extremes(no_below=0)
    # Also discard the 30 most frequent tokens (likely boilerplate terms).
    patent_dictionary.filter_n_most_frequent(30)
    print('parsed dictionary size: ', len(patent_dictionary))

    vocabulary = list(patent_dictionary.token2id.keys())
    ids_list = []
    data_frame = pd.DataFrame(columns=['patent_id', 'text', 'classification'])
    # shrink_vocabulary mutates data_frame and ids_list as side effects;
    # apply() is used only to drive it per-row, its return value is unused.
    temp_data_frame.apply(
        lambda row: shrink_vocabulary(row, vocabulary, data_frame, ids_list),
        axis=1,
    )
    print(len(ids_list))
    data_frame.set_index(data_frame['patent_id'], inplace=True)
    data_frame.drop(ids_list, axis=0, inplace=True)
    return data_frame