def tfidf_selection(document_collection_category1, document_collection_category2=None, specific_word=None, list_size=10):
    """This method selects relevant features from one or two document collections based on the tf-idf value of the word.

    Args:
        document_collection_category1 (str, list or file directory): document collection of the first category
        document_collection_category2 (str, list or file directory): (default None) document collection of the second category
        specific_word (str): (default None) word whose relevance in the document collection(s) is to be returned
        list_size (int): (default 10) number of features to be returned

    Returns:
        pandas.core.series.Series: most relevant features and the sum of their tf-idf weights over all documents
    """
    cat1 = document_transformer(document_collection_category1)
    if document_collection_category2 is not None:
        cat2 = document_transformer(document_collection_category2)
        full_document = cat1 + cat2
    else:
        full_document = cat1
    tf_idf = tfidf(full_document)
    tf_idf_sum = tf_idf.sum()
    if specific_word is not None:
        return pd.Series({specific_word: tf_idf_sum.loc[specific_word]})
    tf_idf_sort = tf_idf_sum.sort_values(ascending=False)
    return tf_idf_sort[:list_size]
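# Usage sketch (not part of the original module): assuming document_transformer
# accepts plain lists of strings, the call below would return a pandas Series
# with the five terms whose summed tf-idf weight over both toy collections is
# highest.
#
#     sports = ["the team won the game", "the players scored two goals"]
#     politics = ["the parliament passed the law", "the vote on the law was close"]
#     top_terms = tfidf_selection(sports, politics, list_size=5)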
def latent_semantic_analysis(document_collection_category1, document_collection_category2, list_size=10, visualize=False):
    """This method selects relevant features from two document collections based on singular value decomposition.

    Args:
        document_collection_category1 (str, list or file directory): document collection of the first category
        document_collection_category2 (str, list or file directory): document collection of the second category
        list_size (int): (default 10) number of features to be returned
        visualize (bool): (default False) if True, the features are represented graphically

    Returns:
        pandas.core.frame.DataFrame: most relevant features for each category and their relevance values
    """
    cat1 = document_transformer(document_collection_category1)
    cat2 = document_transformer(document_collection_category2)
    documents = cat1 + cat2
    vectorizer = TfidfVectorizer(use_idf=True)
    documents_tfidf = vectorizer.fit_transform(documents)
    feat_names = vectorizer.get_feature_names()
    # singular value decomposition
    lsa = TruncatedSVD(100)
    documents_lsa = lsa.fit_transform(documents_tfidf)
    values_list = []
    for category in range(0, 2):
        cat = lsa.components_[category]
        indices = numpy.argsort(cat).tolist()
        indices.reverse()
        word = [feat_names[weight_index] for weight_index in indices[0:list_size]]
        value = [cat[weight_index] for weight_index in indices[0:list_size]]
        if visualize:
            word.reverse()
            value.reverse()
            positions = arange(list_size) + .5
            figure(category)
            barh(positions, value, align='center')
            yticks(positions, word)
            xlabel('Weight')
            title('Strongest terms for category %d' % (category))
            show()
        lsa_values = {}
        for i in range(list_size):
            lsa_values[word[i]] = value[i]
        values_list.append(lsa_values)
    values_list_df = pd.DataFrame(values_list)
    return values_list_df.T
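# Usage sketch (hedged, not from the original source): the result is a DataFrame
# whose two columns hold the strongest terms of the first two SVD components.
# Note that TruncatedSVD(100) requires more than 100 features, so the document
# collections need a reasonably large vocabulary.
#
#     lsa_terms = latent_semantic_analysis(sports_articles, politics_articles, list_size=10)
#     latent_semantic_analysis(sports_articles, politics_articles, visualize=True)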
def bag_of_words(document_collection, index_of_document=None, specific_word=None, array=False, binary_count=False):
    """This method converts a collection of text documents to a matrix of token counts.

    Args:
        document_collection (str, list of strings or file directory): document collection
        index_of_document (int): (default None) index of the document whose vector is to be returned
        specific_word (str): (default None) word whose vector is to be returned
        array (bool): (default False) True, if the bag-of-words representation is needed as an ndarray for another method
        binary_count (bool): (default False) True, if each term is counted at most once per document

    Returns:
        pandas.core.frame.DataFrame: vector representation of all documents
        pandas.core.series.Series: vector representation of the selected document or word
        numpy.ndarray: (if array = True) vector representation of all documents for use in other methods
    """
    vectorizer = CountVectorizer(binary=binary_count)
    full_document = document_transformer(document_collection)
    term_document_matrix = vectorizer.fit_transform(full_document).toarray()
    if array:
        return term_document_matrix
    vocabulary = vectorizer.get_feature_names()
    bag_of_words = pd.DataFrame(term_document_matrix, columns=vocabulary)
    if index_of_document is not None:
        return bag_of_words.iloc[index_of_document]
    elif specific_word is not None:
        return bag_of_words.loc[:, specific_word]
    else:
        return bag_of_words
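# Usage sketch (assumption: document_transformer accepts a list of strings).
# The first call returns the full term-document DataFrame, the second the count
# vector of a single document, the third the per-document counts of one word.
#
#     docs = ["the cat sat on the mat", "the dog chased the cat"]
#     bow_matrix = bag_of_words(docs)
#     first_doc = bag_of_words(docs, index_of_document=0)
#     cat_counts = bag_of_words(docs, specific_word="cat")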
def pos_tagging(document_collection, language="en"):
    """This method determines the part of speech of each word.

    Args:
        document_collection (str, list or file directory): document collection
        language (str): (default "en") "en" for English, "de" for German: the language for which the method is executed

    Returns:
        list: list of assigned part-of-speech tags for each word in the form (term, part-of-speech tag)
    """
    transformed_document = document_transformer(document_collection)
    documents_pos_tag = []
    if language == "en":
        if isinstance(transformed_document, str):
            document_pos_tag = pos_tag(word_tokenize(transformed_document))
            return document_pos_tag
        else:
            for document_part in transformed_document:
                documents_pos_tag.append(pos_tag(word_tokenize(document_part)))
            return documents_pos_tag
    elif language == "de":
        with open('nltk_german_classifier_data.pickle', 'rb') as f:
            tagger = pickle.load(f)
        if isinstance(transformed_document, str):
            document_pos_tag = tagger.tag(transformed_document.split())
            return document_pos_tag
        else:
            for document_part in transformed_document:
                documents_pos_tag.append(tagger.tag(document_part.split()))
            return documents_pos_tag
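# Usage sketch (hedged): the English branch only needs NLTK's pos_tag and
# word_tokenize; the German branch additionally expects the pickled tagger file
# 'nltk_german_classifier_data.pickle' in the working directory.
#
#     tags_en = pos_tagging(["the cat sat on the mat"])
#     tags_de = pos_tagging(["die Katze sitzt auf der Matte"], language="de")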
def stemming(document_collection, language="en"):
    """This method reduces each word to its word stem or root form.

    Args:
        document_collection (str, list or file directory): document collection
        language (str): (default "en") "en" for English, "de" for German: the language for which the method is executed

    Returns:
        str: string with stemmed words
        list: list of strings with stemmed words
    """
    if language == "en":
        stemmer = SnowballStemmer("english")
    elif language == "de":
        stemmer = SnowballStemmer("german")
    transformed_document = document_transformer(document_collection)
    stemmed_documents = []
    for document_part in transformed_document:
        document_tokens = word_tokenize(document_part)
        stemmed_document = ""
        for word in document_tokens:
            word = stemmer.stem(word)
            stemmed_document = stemmed_document + " " + word
        stemmed_documents.append(stemmed_document.strip())
    return stemmed_documents
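# Usage sketch (hedged): each document is returned as one string of stemmed
# tokens; the English Snowball stemmer maps "running" and "runs" to "run", for
# example.
#
#     stems = stemming(["running runs runner", "the cats were playing"])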
def inverse_document_frequency(document_collection, smooth=True, specific_word=None):
    """This method calculates idf weights for each term in the document collection.

    Args:
        document_collection (str, list or file directory): document collection
        smooth (bool): (default True) adds one to document frequencies and prevents zero divisions
        specific_word (str): (default None) word whose idf weight is to be returned

    Returns:
        pandas.core.frame.DataFrame: idf weights of each feature in the document collection
        pandas.core.series.Series: idf weight of the selected word
    """
    vectorizer = CountVectorizer()
    full_document = document_transformer(document_collection)
    termFrequency = vectorizer.fit_transform(full_document).toarray()
    transformer = TfidfTransformer(smooth_idf=smooth)
    # fit only; the transformed matrix itself is not needed here
    transformer.fit_transform(termFrequency)
    vocabulary = vectorizer.get_feature_names()
    idf_values = transformer.idf_
    idf_values_df = pd.DataFrame(idf_values, index=vocabulary, columns=["idf"])
    if specific_word is not None:
        return idf_values_df.loc[specific_word]
    else:
        return idf_values_df
def term_frequency(document_collection, index_of_document=None, specific_word=None, scaled=False):
    """This method calculates the term frequency in each document of the document collection.

    Args:
        document_collection (str, list of strings or file directory): document collection
        index_of_document (int): (default None) index of the document whose vector is to be returned
        specific_word (str): (default None) word whose vector is to be returned
        scaled (bool): (default False) True, if scaling relative to the frequency of words in the document is needed

    Returns:
        pandas.core.frame.DataFrame: tf representation of the document collection
        pandas.core.series.Series: tf representation of the selected document or word
    """
    vectorizer = CountVectorizer()
    full_document = document_transformer(document_collection)
    termFrequency = vectorizer.fit_transform(full_document).toarray()
    vocabulary = vectorizer.get_feature_names()
    if scaled:
        termFrequency = scale(termFrequency)
    term_frequency_df = pd.DataFrame(termFrequency, columns=vocabulary)
    if index_of_document is not None:
        return term_frequency_df.iloc[index_of_document]
    elif specific_word is not None:
        return term_frequency_df.loc[:, specific_word]
    else:
        return term_frequency_df
def tokenization(document_collection, index_of_document=None):
    """This method divides documents into individual words (strings) by splitting on blank spaces.

    Args:
        document_collection (str, list or file directory): document collection to be tokenized
        index_of_document (int): (default None) index of the document whose token list is to be returned

    Returns:
        list: list of documents divided into individual words
    """
    full_document = document_transformer(document_collection)
    tokenized_document = []
    for document_part in full_document:
        tokenized_document.append(word_tokenize(document_part))
    if index_of_document is not None:
        return tokenized_document[index_of_document]
    return tokenized_document
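# Usage sketch (hedged): NLTK's word_tokenize splits on whitespace and
# punctuation, so each document becomes a list of token strings.
#
#     tokens = tokenization(["The cat sat.", "The dog barked!"])
#     first_tokens = tokenization(["The cat sat.", "The dog barked!"], index_of_document=0)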
def words_counting(document_collection, specific_word=None, binary=False):
    """This method counts the number of words in the whole document collection.

    Args:
        document_collection (str, list of strings or file directory): document collection
        specific_word (str): (default None) word whose count is to be returned
        binary (bool): (default False) True, if each term is counted at most once per document

    Returns:
        pandas.core.series.Series: an assignment of terms to their number of occurrences in all documents
    """
    full_document = document_transformer(document_collection)
    tf = bag_of_words(full_document, binary_count=binary)
    tf_sum = tf.sum()
    tf_sum_sorted = tf_sum.sort_values(ascending=False)
    if specific_word is not None:
        # return only the count of the requested word, as documented above
        return pd.Series({specific_word: tf_sum_sorted.loc[specific_word]})
    return tf_sum_sorted
def term_frequency_selection(document_collection_category1, document_collection_category2=None, specific_word=None, list_size=10):
    """This method selects relevant features from one or two document collections based on the frequency of occurrence of the word in the document collection.

    Args:
        document_collection_category1 (str, list or file directory): document collection of the first category
        document_collection_category2 (str, list or file directory): (default None) document collection of the second category
        specific_word (str): (default None) word whose relevance in the document collection(s) is to be returned
        list_size (int): (default 10) number of features to be returned

    Returns:
        pandas.core.series.Series: most relevant features and their frequency in all documents
    """
    cat1 = document_transformer(document_collection_category1)
    if document_collection_category2 is not None:
        cat2 = document_transformer(document_collection_category2)
        full_document = cat1 + cat2
    else:
        full_document = cat1
    tf_sum = words_counting(full_document)
    if specific_word is not None:
        return pd.Series({specific_word: tf_sum.loc[specific_word]})
    tf_sort = tf_sum.sort_values(ascending=False)
    return tf_sort[:list_size]
def n_grams_tokenization(document_collection, n, index_of_document=None):
    """Divides documents into sequences of n words (strings) by splitting on blank spaces.

    Args:
        document_collection (str, list or file directory): document collection to be tokenized
        n (int): length of the sequence of words
        index_of_document (int): (default None) index of the document whose n-grams are to be returned

    Returns:
        list: list of documents divided into sequences of n words
    """
    all_n_grams = []
    full_document = document_transformer(document_collection)
    for document_part in full_document:
        ngrams_document = []
        n_grams = ngrams(document_part.split(), n)
        for grams in n_grams:
            ngrams_document.append(grams)
        all_n_grams.append(ngrams_document)
    if index_of_document is not None:
        return all_n_grams[index_of_document]
    return all_n_grams
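# Usage sketch (hedged): with n=2 every document becomes a list of word-pair
# tuples (bigrams), e.g. ("the", "cat"), ("cat", "sat"), ...
#
#     bigrams = n_grams_tokenization(["the cat sat on the mat"], 2)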
def stop_words(document_collection, language="en", user_defined_stop_word_list=None, punctuation=True):
    """This method removes stop words which do not contribute to any future operations.

    Args:
        document_collection (str, list or file directory): document collection from which stop words are to be removed
        language (str): (default "en") if "en": a pre-defined set of English stop words will be used; if "de": a pre-defined set of German stop words will be used
        user_defined_stop_word_list (list): (default None) a user-defined set of stop words will be used
        punctuation (bool): (default True) special characters will be removed

    Returns:
        str: string without stop words
        list: list of documents without stop words
    """
    stop_word_list = []
    if user_defined_stop_word_list is not None:
        stop_word_list = user_defined_stop_word_list
    elif language == "en":
        stop_word_list = set(stopwords.words('english'))
    elif language == "de":
        stop_word_list = set(stopwords.words('german'))
    full_document = document_transformer(document_collection)
    documents_without_stopwords = []
    for document_part in full_document:
        document_tokens = word_tokenize(document_part)
        document_without_stopwords = ""
        for word in document_tokens:
            if word not in stop_word_list:
                if punctuation:
                    if word.isalpha():
                        document_without_stopwords = document_without_stopwords + " " + word
                else:
                    document_without_stopwords = document_without_stopwords + " " + word
        documents_without_stopwords.append(document_without_stopwords.strip())
    return documents_without_stopwords
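# Usage sketch (hedged): with the default English stop word list, lowercase
# function words such as "the" and "on" are dropped (the comparison is
# case-sensitive), and with punctuation=True non-alphabetic tokens are removed
# as well. A user-defined list overrides the language choice.
#
#     cleaned = stop_words(["the cat sat on the mat ."])
#     cleaned_custom = stop_words(["the cat sat on the mat ."],
#                                 user_defined_stop_word_list=["cat", "mat"])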
def tfidf(document_collection, smooth=True, index_of_document=None):
    """This method calculates tf-idf weights for each term in the document collection.

    Args:
        document_collection (str, list or file directory): document collection
        smooth (bool): (default True) adds one to document frequencies and prevents zero divisions
        index_of_document (int): (default None) index of the document whose vector is to be returned

    Returns:
        pandas.core.frame.DataFrame: tf-idf representation of the document collection
        pandas.core.series.Series: tf-idf representation of the selected document
    """
    vectorizer = CountVectorizer()
    full_document = document_transformer(document_collection)
    termFrequency = vectorizer.fit_transform(full_document).toarray()
    transformer = TfidfTransformer(smooth_idf=smooth)
    tfidf_matrix = transformer.fit_transform(termFrequency).toarray()
    vocabulary = vectorizer.get_feature_names()
    tfidf_df = pd.DataFrame(tfidf_matrix, columns=vocabulary)
    if index_of_document is not None:
        return tfidf_df.iloc[index_of_document]
    else:
        return tfidf_df
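# Usage sketch (hedged): returns a DataFrame with one row per document and one
# column per term; passing index_of_document gives the tf-idf vector of a single
# document as a Series.
#
#     weights = tfidf(["the cat sat on the mat", "the dog chased the cat"])
#     first_doc_weights = tfidf(["the cat sat on the mat", "the dog chased the cat"],
#                               index_of_document=0)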
def information_gain(document_collection_category1, document_collection_category2, specific_word=None, list_size=10, visualize=False):
    """This method selects relevant features from two document collections based on the information gain algorithm.

    Args:
        document_collection_category1 (str, list or file directory): document collection of the first category
        document_collection_category2 (str, list or file directory): document collection of the second category
        specific_word (str): (default None) word whose relevance in the document collection(s) is to be returned
        list_size (int): (default 10) number of features to be returned
        visualize (bool): (default False) if True, the features are represented graphically

    Returns:
        pandas.core.series.Series: most relevant features and their information gain values
    """
    cat1 = document_transformer(document_collection_category1)
    cat2 = document_transformer(document_collection_category2)
    individual_words = bag_of_words(cat1 + cat2).columns.tolist()
    ig = {}
    cat1_words_counting = words_counting(cat1, binary=True)
    cat2_words_counting = words_counting(cat2, binary=True)
    for word in individual_words:
        cat_a = 0      # documents of the first category containing the word
        cat_b = 0      # documents of the second category containing the word
        not_cat_a = 0  # documents of the first category not containing the word
        not_cat_b = 0  # documents of the second category not containing the word
        if word in cat1_words_counting.index.values:
            cat_a = cat1_words_counting.loc[word]
            not_cat_a = len(cat1) - cat_a
        if word in cat2_words_counting.index.values:
            cat_b = cat2_words_counting.loc[word]
            not_cat_b = len(cat2) - cat_b
        if cat_a * cat_b * not_cat_a * not_cat_b != 0:
            all_words = cat_a + cat_b + not_cat_a + not_cat_b
            # entropy of the category distribution
            h_word = -((cat_a + not_cat_a) / all_words * log((cat_a + not_cat_a) / all_words) / log(2)
                       + (cat_b + not_cat_b) / all_words * log((cat_b + not_cat_b) / all_words) / log(2))
            # entropy of the categories among documents containing the word
            h_word_pos = -(cat_a / (cat_a + cat_b) * log(cat_a / (cat_a + cat_b)) / log(2)
                           + cat_b / (cat_a + cat_b) * log(cat_b / (cat_a + cat_b)) / log(2))
            # entropy of the categories among documents not containing the word
            h_word_neg = -(not_cat_a / (not_cat_a + not_cat_b) * log(not_cat_a / (not_cat_a + not_cat_b)) / log(2)
                           + not_cat_b / (not_cat_a + not_cat_b) * log(not_cat_b / (not_cat_a + not_cat_b)) / log(2))
            h_word_over = (cat_a + cat_b) / all_words * h_word_pos + (not_cat_a + not_cat_b) / all_words * h_word_neg
            ig_cat = h_word - h_word_over
            ig[word] = ig_cat
    result_ig = {}
    if list_size is not None:
        size = list_size
    else:
        size = len(ig)
    if specific_word is not None:
        result_ig[specific_word] = ig[specific_word]
    else:
        while len(result_ig) < size:
            max_key = max(ig, key=ig.get)
            max_value = ig[max_key]
            result_ig[max_key] = max_value
            del ig[max_key]
    result_ig_df = pd.Series(result_ig)
    if visualize:
        positions = arange(list_size) + .5
        figure()
        barh(positions, list(result_ig.values()), align='center')
        yticks(positions, list(result_ig.keys()))
        xlabel('Weight')
        title('Strongest terms for categories')
        show()
    return result_ig_df
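# Usage sketch (hedged): only words that occur in some but not all documents of
# both categories pass the zero-product guard above and receive an information
# gain score, so list_size should not exceed the number of such terms.
#
#     ig_terms = information_gain(["the goal was scored", "the match ended in a draw"],
#                                 ["the law was passed", "the vote ended in a tie"],
#                                 list_size=3)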
def chi_square(document_collection_category1, document_collection_category2, specific_word=None, list_size=10, visualize=False):
    """This method selects relevant features from two document collections based on the chi-square test.

    Args:
        document_collection_category1 (str, list or file directory): document collection of the first category
        document_collection_category2 (str, list or file directory): document collection of the second category
        specific_word (str): (default None) word whose relevance in the document collection(s) is to be returned
        list_size (int): (default 10) number of features to be returned
        visualize (bool): (default False) if True, the features are represented graphically

    Returns:
        pandas.core.series.Series: most relevant features and their chi-square test values
    """
    cat1 = document_transformer(document_collection_category1)
    cat2 = document_transformer(document_collection_category2)
    documents = []
    categories = []
    for doc in cat1:
        documents.append(doc)
        categories.append(0)
    for doc in cat2:
        documents.append(doc)
        categories.append(1)
    vectorizer = CountVectorizer()
    count_doc = vectorizer.fit_transform(documents)
    chi2score = chi2(count_doc, categories)[0]
    wscores = zip(vectorizer.get_feature_names(), chi2score)
    wchi2 = sorted(wscores, key=lambda x: x[1])
    chi = {}
    for term, score in wchi2:
        chi[term] = score
    if list_size is not None:
        size = list_size
    else:
        size = len(chi)
    result_chi_square = {}
    if specific_word is not None:
        result_chi_square[specific_word] = chi[specific_word]
    else:
        while len(result_chi_square) < size:
            max_key = max(chi, key=chi.get)
            max_value = chi[max_key]
            result_chi_square[max_key] = max_value
            del chi[max_key]
    result_df = pd.Series(result_chi_square)
    if visualize:
        positions = arange(list_size) + .5
        figure()
        barh(positions, list(result_chi_square.values()), align='center')
        yticks(positions, list(result_chi_square.keys()))
        xlabel('Weight')
        title('Strongest terms for categories')
        show()
    return result_df
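# Usage sketch (hedged): documents of the first collection are labelled 0 and
# of the second 1, and sklearn's chi2 scores each term against these labels.
#
#     chi_terms = chi_square(["the goal was scored", "the match ended in a draw"],
#                            ["the law was passed", "the vote ended in a tie"],
#                            list_size=5)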
def lemmatizer(document_collection, language="en"):
    """This method reduces each word to its word stem or dictionary form.

    Args:
        document_collection (str, list or file directory): document collection
        language (str): (default "en") "en" for English, "de" for German: the language for which the method is executed

    Returns:
        str: string with lemmatized words
        list: list of strings with lemmatized words
    """
    # a file, list or string is transformed into a list of strings
    transformed_document = document_transformer(document_collection)
    total_lemmatized_document = []  # new list of lemmatized documents
    if language == "en":  # English
        wnl = WordNetLemmatizer()
        for document in transformed_document:  # split each document ...
            document_tokens = document.split()  # ... into its words
            lemmatized_document_part = ""  # new lemmatized document
            for word in document_tokens:
                pos = pos_tag(word_tokenize(word))[0][1]
                if pos in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']:
                    lemmatized_document_part = lemmatized_document_part + " " + wnl.lemmatize(word, pos="v")
                elif pos in ['JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS']:
                    lemmatized_document_part = lemmatized_document_part + " " + wnl.lemmatize(word, pos="a")
                else:
                    lemmatized_document_part = lemmatized_document_part + " " + wnl.lemmatize(word)
            total_lemmatized_document.append(lemmatized_document_part.strip())
        return total_lemmatized_document
    elif language == "de":  # German
        lem = GermaLemma()
        for document in transformed_document:
            document_tokens = document.split()
            lemmatized_document_part = ""
            for word in document_tokens:
                pos = pos_tagging(word, language="de")[0][0][1]
                if pos in ['VAFIN', 'VAIMP', 'VAINF', 'VAPP', 'VMFIN', 'VMINF', 'VMPP',
                           'VVFIN', 'VVIMP', 'VVINF', 'VVIZU', 'VVPP']:
                    lemmatized_document_part = lemmatized_document_part + " " + lem.find_lemma(word, "V")
                elif pos in ['ADJA', 'ADJD', 'PDAT', 'PDS', 'PIAT', 'PIS', 'PPOSAT', 'PWAT']:
                    lemmatized_document_part = lemmatized_document_part + " " + lem.find_lemma(word, "ADJ")
                elif pos in ['ADV', 'PAV', 'PAVREL', 'PTKA', 'PWAV', 'PWAVREL']:
                    lemmatized_document_part = lemmatized_document_part + " " + lem.find_lemma(word, "ADV")
                elif pos in ['NA', 'NE', 'NN']:
                    lemmatized_document_part = lemmatized_document_part + " " + lem.find_lemma(word, "N")
                else:
                    lemmatized_document_part = lemmatized_document_part + " " + word
            total_lemmatized_document.append(lemmatized_document_part.strip())
        return total_lemmatized_document
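# Usage sketch (hedged): the English branch relies on NLTK's WordNetLemmatizer
# and pos_tag; the German branch requires the GermaLemma package and the pickled
# German tagger used by pos_tagging above.
#
#     lemmas_en = lemmatizer(["the cats were running"])
#     lemmas_de = lemmatizer(["die Katzen liefen schnell"], language="de")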