Example #1
def tfidf_selection(document_collection_category1, document_collection_category2=None, specific_word=None, list_size=10):
    """This method selects relevant features from one or two document collections based on the tf-idf value of the word.
        Args:
            document_collection_category1(str, list or file directory): document collection of the first category
            document_collection_category2 (str, list or file directory): (default None) document collection of the second category
            specific_word (str): (default None) word whose relevance in the document collection(s) is to be returned
            list_size (int): (default 10) number of features to be returned
        Returns:
            pandas.core.series.Series: most relevant features and sum of their Tf-idf weights in all documents            
    """
    cat1 = document_transformer(document_collection_category1)
    if document_collection_category2 is not None:
        cat2 = document_transformer(document_collection_category2)
        full_document = cat1 + cat2

    else:
        full_document = cat1

    tf_idf = tfidf(full_document)

    tf_idf_sum = tf_idf.sum()
    
    if specific_word is not None:
        return pd.Series({specific_word: tf_idf_sum.loc[specific_word]})

    tf_idf_sort = tf_idf_sum.sort_values(ascending=False)
    return tf_idf_sort[:list_size]
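A minimal usage sketch, assuming the module's imports (pandas as pd, scikit-learn) and that the helper document_transformer, which is not shown here, passes lists of strings through unchanged; the toy corpora are made up:

# Hypothetical toy corpora for illustration only.
sports = ["the team won the match", "the match ended in a draw"]
finance = ["the market closed higher", "stocks fell after the report"]
print(tfidf_selection(sports, finance, list_size=5))    # top 5 features by summed tf-idf weight
print(tfidf_selection(sports, specific_word="match"))   # summed tf-idf weight of one word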
Example #2
def latent_semantic_analysis(document_collection_category1, document_collection_category2, list_size=10, visualize=False):
    """This method selects relevant features from two document collections based on the singular-value decomposition.
        Args:
            document_collection_category1 (str, list or file directory): document collection of the first category
            document_collection_category2 (str, list or file directory): document collection of the second category
            list_size (int): (default 10) number of features to be returned
            visualize (bool): (default False) if True it represents the features graphically
        Returns:
            pandas.core.frame.DataFrame: most relevant features for each category and their relevance values             
    """
    cat1 = document_transformer(document_collection_category1)
    cat2 = document_transformer(document_collection_category2)

    documents = cat1 + cat2
    vectorizer = TfidfVectorizer(use_idf=True)

    documents_tfidf = vectorizer.fit_transform(documents)


    feat_names = vectorizer.get_feature_names()

    #singular value decomposition 
    lsa = TruncatedSVD(100)
    documents_lsa = lsa.fit_transform(documents_tfidf)

    values_list = []
    for category in range(0, 2):

        cat = lsa.components_[category]
    
        indices = numpy.argsort(cat).tolist()

        indices.reverse()
        word = [feat_names[weight_index] for weight_index in indices[0:list_size]]
        value = [cat[weight_index] for weight_index in indices[0:list_size]]
        
        if visualize:
            word.reverse()
            value.reverse()
            positions = arange(list_size) + .5 
            figure(category)
            barh(positions, value, align='center')
            yticks(positions, word)
            xlabel('Weight')
            title('Strongest terms for category %d' % (category))
            show()

        lsa_values = { }

        for i in range (list_size):
            lsa_values [word[i]] = value[i]
            
        values_list.append(lsa_values)

    values_list_df = pd.DataFrame(values_list)
    return values_list_df.T
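A usage sketch under the same assumptions; cat1_docs and cat2_docs are hypothetical document collections that must be large enough for the 100 SVD components requested above:

# Hypothetical document collections; very small corpora do not support 100 components.
strongest_terms = latent_semantic_analysis(cat1_docs, cat2_docs, list_size=10)
print(strongest_terms)   # DataFrame with the strongest terms of the first two components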
Example #3
def bag_of_words(document_collection,
                 index_of_document=None,
                 specific_word=None,
                 array=False,
                 binary_count=False):
    """This method converts a collection of text documents to a matrix of token counts
    Args:
        document_collection (str, list of strings or file directory): document collection 
        index_of_document (int): (default None) index of the document whose vector is to be returned
        specific_word (str): (default None) word whose vector is to be returned
        array (bool): (default False) True, if the bag-of-words representation is needed as an ndarray for other methods
        binary_count (bool): (default False) True, if all non-zero counts should be set to 1
    Returns:
        pandas.core.frame.DataFrame: vector representation for all documents
        pandas.core.series.Series: vector representation for the selected document or word
        numpy.ndarray: (if array=True) vector representation of all documents for use by other methods
    """
    vectorizer = CountVectorizer(binary=binary_count)
    full_document = document_transformer(document_collection)

    term_document_matrix = vectorizer.fit_transform(full_document).toarray()
    if array:
        return term_document_matrix
    vocabulary = vectorizer.get_feature_names()
    bag_of_words = pd.DataFrame(term_document_matrix, columns=vocabulary)
    if index_of_document is not None:
        return bag_of_words.iloc[index_of_document]
    elif specific_word is not None:
        return bag_of_words.loc[:, specific_word]
    else:
        return bag_of_words
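A usage sketch with made-up toy documents, again assuming document_transformer accepts a plain list of strings:

docs = ["the cat sat", "the dog barked at the cat"]
print(bag_of_words(docs))                        # full term-document matrix as a DataFrame
print(bag_of_words(docs, specific_word="cat"))   # counts of "cat" per document
print(bag_of_words(docs, index_of_document=1))   # count vector of the second document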
Example #4
def pos_tagging(document_collection, language="en"):
    """This method determines the part of speech of each word.
        Args:
            document_collection (str, list or file directory): document collection
            language (str): (default "en") "en" for English, "de" for German: the language for which the method is to be executed
        Returns:
            list: list of assigned part-of-speech tags for each word, in the form (term, part-of-speech-tag)
    """
    transformed_document = document_transformer(document_collection)
    documents_pos_tag = []

    if language == "en":
        if isinstance(transformed_document, str):
            document_pos_tag = pos_tag(word_tokenize(transformed_document))
            return document_pos_tag

        else:
            for document_part in transformed_document:
                documents_pos_tag.append(pos_tag(word_tokenize(document_part)))
            return documents_pos_tag

    elif language == "de":
        with open('nltk_german_classifier_data.pickle', 'rb') as f:
            tagger = pickle.load(f)

        if isinstance(transformed_document, str):
            document_pos_tag = tagger.tag(transformed_document.split())
            return document_pos_tag
        else:
            for document_part in transformed_document:
                documents_pos_tag.append(tagger.tag(document_part.split()))
            return documents_pos_tag
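A usage sketch for the English branch (requires the NLTK tokenizer and tagger data; the German branch additionally needs the pickled tagger loaded above):

print(pos_tagging(["The cat sat on the mat"]))
# e.g. [[('The', 'DT'), ('cat', 'NN'), ('sat', 'VBD'), ...]]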
Example #5
def stemming(document_collection, language="en"):
    """This method reduces each word to their word stem or root form.
        Args:
            document_collection (str, list or file directory): document collection
			language (str): (default „en“) „en“ for english; „de“ for german: for which language the method is to be executed 	
        Returns:
            str: string with stemmed words
            list: list of strings with stemmed words
    """
    if language == "en":
        stemmer = SnowballStemmer("english")
    elif language == "de":
        stemmer = SnowballStemmer("german")

    transformed_document = document_transformer(document_collection)
    stemmed_documents = []

    for document_part in transformed_document:
        document_tokens = word_tokenize(document_part)
        stemmed_document = ""
        for word in document_tokens:
            word = stemmer.stem(word)
            stemmed_document = stemmed_document + " " + word
        stemmed_documents.append(stemmed_document.strip())

    return stemmed_documents
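A usage sketch (requires NLTK's SnowballStemmer and tokenizer data; the shown outputs are only indicative):

print(stemming(["cats are running quickly"]))           # e.g. ['cat are run quick']
print(stemming(["Häuser und Bäume"], language="de"))    # German stemmer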
Example #6
def inverse_document_frequency(document_collection,
                               smooth=True,
                               specific_word=None):
    """This method calculates idf-weights for each term in the document collection
    Args:
        document_collection (str, list or file directory): document collection
        smooth (bool): (default True) add one to document frequencies to prevent zero divisions
        specific_word (str): (default None) word whose idf-weight is to be returned
    Returns:
        pandas.core.frame.DataFrame: idf-weights of each feature in the document collection
        pandas.core.series.Series: idf-weight of the selected word
    """
    vectorizer = CountVectorizer()
    full_document = document_transformer(document_collection)

    termFrequency = vectorizer.fit_transform(full_document).toarray()
    transformer = TfidfTransformer(smooth_idf=smooth)

    transformer.fit(termFrequency)
    vocabulary = vectorizer.get_feature_names()
    idf_values = transformer.idf_
    idf_values_df = pd.DataFrame(idf_values, index=vocabulary, columns=["idf"])

    if specific_word is not None:
        return idf_values_df.loc[specific_word]
    else:
        return idf_values_df
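A usage sketch with made-up documents:

docs = ["the cat sat", "the dog barked"]
print(inverse_document_frequency(docs))                       # idf-weight per vocabulary term
print(inverse_document_frequency(docs, specific_word="the"))  # idf-weight of a single word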
Example #7
def term_frequency(document_collection,
                   index_of_document=None,
                   specific_word=None,
                   scaled=False):
    """This method calculates term frequency in each document of the document collection 
    Args:
        document_collection (str, list of strings or file directory): document collection
        index_of_document (int): (default None) index of the document whose vector is to be returned
        specific_word (str): (default None) word whose vector is to be returned
        scaled (bool): (default False) True, if scaling relative to the frequency of words in the document is needed
    Returns:
        pandas.core.frame.DataFrame: tf representation of the document collection
        pandas.core.series.Series: tf representation of the selected document or word
    """
    vectorizer = CountVectorizer()
    full_document = document_transformer(document_collection)

    termFrequency = vectorizer.fit_transform(full_document).toarray()
    vocabulary = vectorizer.get_feature_names()
    if scaled:
        termFrequency = scale(termFrequency)

    term_frequency_df = pd.DataFrame(termFrequency, columns=vocabulary)
    if index_of_document is not None:
        return term_frequency_df.iloc[index_of_document]
    elif specific_word is not None:
        return term_frequency_df.loc[:, specific_word]
    else:
        return term_frequency_df
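A usage sketch with made-up documents:

docs = ["the cat sat on the mat", "the dog barked"]
print(term_frequency(docs))                        # raw counts per document
print(term_frequency(docs, specific_word="the"))   # counts of "the" in each document
print(term_frequency(docs, scaled=True))           # counts after scaling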
Example #8
def tokenization(document_collection, index_of_document=None):
    """This method divides a documents into individual words (strings) by splitting on the blank spaces.
        Args:
            document_collection (str, list or file directory): document collection to be tokenized 
        Returns: 
            list: list of divided documents into individual words
    """
    full_document = document_transformer(document_collection)
    tokenized_document = []
    for document_part in full_document:
        tokenized_document.append(word_tokenize(document_part))
    if index_of_document is not None:
        return tokenized_document[index_of_document]
    return tokenized_document
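A usage sketch (requires the NLTK tokenizer data):

print(tokenization(["the cat sat", "the dog barked"]))
# e.g. [['the', 'cat', 'sat'], ['the', 'dog', 'barked']]
print(tokenization(["the cat sat", "the dog barked"], index_of_document=0))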
Example #9
def words_counting(document_collection, specific_word=None, binary=False):
    """This method counts the number of words in the whole document collection
    Args:
        document_collection (str, list of strings or file directory): document collection
    	specific_word (str): (default None) word whose number is to be returned
    Returns:
        pandas.core.series.Series: an assignment of terms to number of terms in all documents	
    """
    full_document = document_transformer(document_collection)
    tf = bag_of_words(full_document, binary_count=binary)

    tf_sum = tf.sum()

    tf_sum_sorted = tf_sum.sort_values(ascending=False)
    if specific_word is not None:
        return pd.Series({specific_word: tf_sum_sorted.loc[specific_word]})
    return tf_sum_sorted
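A usage sketch with made-up documents:

print(words_counting(["the cat sat on the mat", "the dog"]))    # counts summed over all documents, descending
print(words_counting(["the cat sat", "the dog"], binary=True))  # each word counted at most once per document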
Example #10
def term_frequency_selection(document_collection_category1, document_collection_category2=None, specific_word=None, list_size=10):
    """This method selects relevant features from one or two document collections based on the frequency of occurrence of the word in the document collection.
        Args:
            document_collection_category1 (str, list or file directory): document collection of the first category
            document_collection_category2 (str, list or file directory): (default None) document collection of the second category
            specific_word (str): (default None) word whose relevance in the document collection(s) is to be returned
            list_size (int): (default 10) number of features to be returned
        Returns: 
            pandas.core.series.Series: most relevant features and their frequency in all documents           
    """
    cat1 = document_transformer(document_collection_category1)
    if document_collection_category2 is not None:
        cat2 = document_transformer(document_collection_category2)
        full_document = cat1 + cat2

    else:
        full_document = cat1

    tf_sum = words_counting(full_document)

    if specific_word is not None:
        return pd.Series({specific_word: tf_sum.loc[specific_word]})
    tf_sort = tf_sum.sort_values(ascending=False)
    return tf_sort[:list_size]
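A usage sketch with made-up toy corpora, under the same document_transformer assumption:

sports = ["the team won the match", "the match ended in a draw"]
finance = ["the market closed higher", "stocks fell"]
print(term_frequency_selection(sports, finance, list_size=5))   # most frequent terms overall
print(term_frequency_selection(sports, specific_word="match"))  # frequency of one word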
Example #11
def n_grams_tokenization(document_collection, n, index_of_document=None):
    """Divides a documents into sequence of n words (strings) by splitting on the blank spaces.
        Args:
            document_collection (str, list or file directory): document collection to be tokenized
 			n (int): length of the sequence of words
        Returns: 
            list: list of divided documents into sequences of words
    """
    all_n_grams = []
    full_document = document_transformer(document_collection)

    for document_part in full_document:
        ngrams_document = []
        n_grams = ngrams(document_part.split(), n)
        for grams in n_grams:
            ngrams_document.append(grams)
        all_n_grams.append(ngrams_document)
    if index_of_document is not None:
        return all_n_grams[index_of_document]
    return all_n_grams
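A usage sketch (assuming ngrams comes from NLTK, as in the module's imports):

print(n_grams_tokenization(["the cat sat on the mat"], 2))
# e.g. [[('the', 'cat'), ('cat', 'sat'), ('sat', 'on'), ('on', 'the'), ('the', 'mat')]]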
Example #12
def stop_words(document_collection,
               language="en",
               user_defined_stop_word_list=None,
               punctuation=True):
    """This method removes stop words which do not contribute to any future operations.
        Args:
            document_collection (str, list or file directory): document collection where stop words are to be removed 
            language (str): (default "en") if "en": a pre-defined set of English stop words will be used
                                           if "de": a pre-defined set of German stop words will be used
            user_defined_stop_word_list (list): (default None) a user-defined set of stop words will be used
            punctuation (bool): (default True) special characters will be removed
        Returns:
            list: list of documents without stop words
    """
    stop_word_list = []
    if user_defined_stop_word_list is not None:
        stop_word_list = user_defined_stop_word_list
    elif language == "en":
        stop_word_list = set(stopwords.words('english'))
    elif language == "de":
        stop_word_list = set(stopwords.words('german'))

    full_document = document_transformer(document_collection)
    documents_without_stopwords = []

    for document_part in full_document:
        document_tokens = word_tokenize(document_part)
        document_without_stopwords = ""
        for word in document_tokens:
            if word not in stop_word_list:
                if punctuation:
                    if word.isalpha():
                        document_without_stopwords = document_without_stopwords + " " + word
                else:
                    document_without_stopwords = document_without_stopwords + " " + word

        documents_without_stopwords.append(document_without_stopwords.strip())

    return documents_without_stopwords
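A usage sketch (requires the NLTK stopwords corpus and tokenizer data):

print(stop_words(["this is a simple example, isn't it?"]))   # English stop words and non-alphabetic tokens removed
print(stop_words(["das ist ein Haus"], language="de"))       # German stop word list
print(stop_words(["foo bar baz"], "en", ["bar"]))            # user-defined stop word list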
Example #13
def tfidf(document_collection, smooth=True, index_of_document=None):
    """This method calculates tf-idf weights for each term in the document collection  
    Args:
        document_collection (str, list or file directory): document collection
        smooth (bool): (default True) add one to document frequencies to prevent zero divisions
        index_of_document (int): (default None) index of the document whose vector is to be returned
    Returns:
        pandas.core.frame.DataFrame: tf-idf representation of the document collection
        pandas.core.series.Series: tf-idf representation of the selected document
    """
    vectorizer = CountVectorizer()
    full_document = document_transformer(document_collection)

    termFrequency = vectorizer.fit_transform(full_document).toarray()
    transformer = TfidfTransformer(smooth_idf=smooth)
    tfidf_matrix = transformer.fit_transform(termFrequency).toarray()
    vocabulary = vectorizer.get_feature_names()

    tfidf_df = pd.DataFrame(tfidf_matrix, columns=vocabulary)
    if index_of_document is not None:
        return tfidf_df.iloc[index_of_document]
    else:
        return tfidf_df
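A usage sketch with made-up documents:

docs = ["the cat sat", "the dog barked", "cats and dogs"]
print(tfidf(docs))                        # tf-idf matrix as a DataFrame
print(tfidf(docs, index_of_document=0))   # tf-idf vector of the first document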
Example #14
def information_gain(document_collection_category1, document_collection_category2, specific_word=None, list_size=10, visualize=False):
    """This method selects relevant features from two document collections based on the information gain algorithm.
        Args:
            document_collection_category1 (str, list or file directory): document collection of the first category
            document_collection_category2 (str, list or file directory): document collection of the second category
            specific_word (str): (default None) word whose relevance in the document collection(s) is to be returned
            list_size (int): (default 10) number of features to be returned
            visualize (bool): (default False) if True it represents the features graphically
        Returns:
            pandas.core.series.Series: most relevant features and their information gain values
    """
    cat1 = document_transformer(document_collection_category1)
    cat2 = document_transformer(document_collection_category2)

    individual_words = bag_of_words(cat1+cat2).columns.tolist()
    ig = {}
    cat1_words_counting = words_counting(cat1, binary=True)
    cat2_words_counting = words_counting(cat2, binary=True)

    for word in individual_words:
        cat_a = 0      # number of documents of the first category that contain the word
        cat_b = 0      # number of documents of the second category that contain the word
        not_cat_a = 0  # number of documents of the first category that do not contain the word
        not_cat_b = 0  # number of documents of the second category that do not contain the word
        if word in cat1_words_counting.index.values:
            cat_a = cat1_words_counting.loc[word] 
        not_cat_a = len(cat1) - cat_a
    
        if word in cat2_words_counting.index.values:
            cat_b = cat2_words_counting.loc[word] 
        
        not_cat_b = len(cat2) - cat_b

        if cat_a * cat_b * not_cat_a * not_cat_b != 0:
            all_words = cat_a + cat_b + not_cat_a + not_cat_b
            h_word = -((cat_a + not_cat_a) / all_words * log((cat_a + not_cat_a) / all_words) / log(2)
                       + (cat_b + not_cat_b) / all_words * log((cat_b + not_cat_b) / all_words) / log(2))
            h_word_pos = -(cat_a / (cat_a + cat_b) * log(cat_a / (cat_a + cat_b)) / log(2)
                           + cat_b / (cat_a + cat_b) * log(cat_b / (cat_a + cat_b)) / log(2))
            h_word_neg = -(not_cat_a / (not_cat_a + not_cat_b) * log(not_cat_a / (not_cat_a + not_cat_b)) / log(2)
                           + not_cat_b / (not_cat_a + not_cat_b) * log(not_cat_b / (not_cat_a + not_cat_b)) / log(2))

            h_word_over = (cat_a + cat_b) / all_words * h_word_pos + (not_cat_a + not_cat_b) / all_words * h_word_neg

            ig_cat = h_word - h_word_over
            ig[word] = ig_cat
            
    result_ig = {}
    if list_size is not None:
        size = list_size
    else:
        size = len(ig)
    if specific_word is not None:
        result_ig[specific_word] = ig[specific_word]
    else:
        while len(result_ig) < size:
            max_key = max(ig, key=ig.get)
            max_value = ig[max_key]
            result_ig[max_key] = max_value
            del ig[max_key]
    result_ig_df = pd.Series(result_ig)
    if visualize:
        positions = arange(list_size) + .5 
        figure()
        barh(positions, list(result_ig.values()), align='center')
        yticks(positions, list(result_ig.keys()))
        xlabel('Weight')
        title('Strongest terms for categories' )
        show()  
        
    return result_ig_df
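A usage sketch; cat1_docs and cat2_docs are hypothetical document collections. Only words that occur in both categories, but not in every document of a category, receive a score, so very small corpora may yield fewer than list_size entries:

# Hypothetical document collections for the two categories.
ig_scores = information_gain(cat1_docs, cat2_docs, list_size=10)
print(ig_scores)   # Series of the highest-scoring words and their information gain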
Example #15
def chi_square(document_collection_category1, document_collection_category2, specific_word=None, list_size=10, visualize=False):
    """This method selects relevant features from two document collections based on the chi square test.
        Args:
            document_collection_category1 (str, list or file directory): document collection of the first category
            document_collection_category2 (str, list or file directory): document collection of the second category
            specific_word (str): (default None) word whose relevance in the document collection(s) is to be returned
            list_size (int): (default 10) number of features to be returned
            visualize (bool): (default False) if True it represents the features graphically
        Returns:
            pandas.core.series.Series: most relevant features and their chi square test values             
    """
    cat1 = document_transformer(document_collection_category1)
    cat2 = document_transformer(document_collection_category2)

    documents = []
    categories = []

    for doc in cat1:
        documents.append(doc)
        categories.append(0)
    
    for doc in cat2:
        documents.append(doc)
        categories.append(1)
    
    vectorizer = CountVectorizer()
    count_doc = vectorizer.fit_transform(documents)

    chi2score = chi2(count_doc, categories)[0]

    wscores = zip(vectorizer.get_feature_names(), chi2score)
    wchi2 = sorted(wscores, key=lambda x: x[1])

    chi = {}
    for word, score in wchi2:
        chi[word] = score

    if list_size is not None:
        size = list_size
    else:
        size = len(chi)

    result_chi_square = {}
    if specific_word is not None:
        result_chi_square[specific_word] = chi[specific_word]
    else:
        while len(result_chi_square) < size:
            max_key = max(chi, key=chi.get)
            max_value = chi[max_key]
            result_chi_square[max_key] = max_value
            del chi[max_key]
    result_df = pd.Series(result_chi_square)
    if visualize:
        
        positions = arange(list_size) + .5 
        figure()
        barh(positions, list(result_chi_square.values()), align='center')
        yticks(positions, list(result_chi_square.keys()))
        xlabel('Weight')
        title('Strongest terms for categories' )
        show()
     
    return result_df
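A usage sketch with made-up toy corpora:

sports = ["the team won the match", "the player scored a goal"]
finance = ["stocks fell after the report", "the market closed higher"]
print(chi_square(sports, finance, list_size=5))             # terms that separate the two categories best
print(chi_square(sports, finance, specific_word="market"))  # chi square score of one word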
Example #16
def lemmatizer(document_collection, language="en"):
    """This method reduces each word to their word stem or dictionary form.
        Args:
            document_collection (str, list or file directory): document collection
			language (str): (default „en“) „en“ for english; „de“ for german: for which language the method is to be executed 	
        Returns:
            str: string with lemmatized words
            list: list of strings with lemmatized words
    """
    transformed_document = document_transformer(
        document_collection
    )  # a file/list/string is converted into a list of strings
    total_lemmatized_document = []  # new list of lemmatized documents

    if language == "en":  # for English
        wnl = WordNetLemmatizer()
        for document in transformed_document:  # split each document ...
            document_tokens = document.split()  # ... into words
            lemmatized_document_part = ""  # new lemmatized document
            for word in document_tokens:
                pos = pos_tag(word_tokenize(word))[0][1]
                if pos in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']:
                    lemmatized_document_part = lemmatized_document_part + " " + wnl.lemmatize(
                        word, pos="v")
                elif pos in ['JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS']:
                    lemmatized_document_part = lemmatized_document_part + " " + wnl.lemmatize(
                        word, pos="a")
                else:
                    lemmatized_document_part = lemmatized_document_part + " " + wnl.lemmatize(
                        word)
            total_lemmatized_document.append(lemmatized_document_part.strip())
        return total_lemmatized_document

    elif language == "de":  #für deutsch
        lem = GermaLemma()
        for document in transformed_document:
            document_tokens = document.split()
            lemmatized_document_part = ""
            for word in document_tokens:
                pos = pos_tagging(word, language="de")[0][0][1]

                if pos in [
                        'VAFIN', 'VAIMP', 'VAINF', 'VAPP', 'VMFIN', 'VMINF',
                        'VMPP', 'VVFIN', 'VVIMP', 'VVINF', 'VVIZU', 'VVPP'
                ]:
                    lemmatized_document_part = lemmatized_document_part + " " + lem.find_lemma(
                        word, "V")
                elif pos in [
                        'ADJA', 'ADJD', 'PDAT', 'PDS', 'PIAT', 'PIS', 'PPOSAT',
                        'PWAT'
                ]:
                    lemmatized_document_part = lemmatized_document_part + " " + lem.find_lemma(
                        word, "ADJ")
                elif pos in [
                        'ADV', 'PAV', 'PAVREL', 'PTKA', 'PWAV', 'PWAVREL'
                ]:
                    lemmatized_document_part = lemmatized_document_part + " " + lem.find_lemma(
                        word, "ADV")
                elif pos in ['NA', 'NE', 'NN']:
                    lemmatized_document_part = lemmatized_document_part + " " + lem.find_lemma(
                        word, "N")
                else:
                    lemmatized_document_part = lemmatized_document_part + " " + word

            total_lemmatized_document.append(lemmatized_document_part.strip())

        return total_lemmatized_document
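A usage sketch; the English branch needs the NLTK WordNet data, the German branch needs GermaLemma and the pickled German tagger:

print(lemmatizer(["the children are running to the houses"]))
# e.g. ['the child be run to the house'] (exact output depends on the installed WordNet data)
print(lemmatizer(["Die Kinder spielten in den Häusern"], language="de"))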