예제 #1
0
def _summary_stats(values):
    """
    Return the (median, mean, std) triple of a sequence of numbers
    :param values: Sequence (list or ndarray) of numbers, possibly empty
    :return: Tuple (median, mean, std); three NaN values if the sequence is empty
    """
    if len(values) == 0:
        # NumPy also yields NaN for empty input, but only after emitting
        # RuntimeWarnings; return the same values quietly.
        nan = float('nan')
        return nan, nan, nan
    return np.median(values), np.mean(values), np.std(values)


def get_term_docs_stats(term, documents, freq_dtm=None, feature_names=None):
    """
    Given a set of documents, return statistics on appearance of one term in them
    :param term: The term to look for
    :param documents: Indexable sequence of documents where to look for the term
    :param freq_dtm: Precomputed sparse document-term matrix (one row per document).
        If None, it is computed here with CountVectorizer and feature_names is ignored
    :param feature_names: Column names (terms) of freq_dtm, as an ndarray or any
        sequence. Required when freq_dtm is given
    :return: A dictionary with statistics based on appearance of the searched term
        in the documents. The median/mean/std entries are NaN when no document
        contains the term
    :raises ValueError: If freq_dtm is given without feature_names
    """
    if freq_dtm is None:
        vec = CountVectorizer()
        freq_dtm = vec.fit_transform(documents)
        # get_feature_names() was removed in scikit-learn 1.2; use its
        # replacement when available and fall back for older releases.
        if hasattr(vec, 'get_feature_names_out'):
            feature_names = vec.get_feature_names_out()
        else:
            feature_names = vec.get_feature_names()
    elif feature_names is None:
        raise ValueError(
            'feature_names is required when freq_dtm is provided')
    # Comparing a plain list with a string yields a single False instead of an
    # element-wise mask, so always work on an ndarray.
    feature_names = np.asarray(feature_names)

    num_docs = len(documents)
    # Get the frequency column for the term across the documents
    term_index = np.where(feature_names == term)[0]
    if term_index.size:
        term_freq = freq_dtm[:, term_index].toarray().flatten()
    else:
        # Term is not a column of the DTM: it appears in no document
        term_freq = np.zeros(num_docs, dtype=int)
    # Get the nonzero frequencies
    freq_nonzero_index = term_freq.nonzero()[0]
    freq_nonzero_docs = [documents[i] for i in freq_nonzero_index]
    nonzero_frequencies = term_freq[freq_nonzero_index]

    result = {}
    # Total appearances of the term in the documents
    result['num_appearances'] = term_freq.sum()
    # Number of documents where the term appears
    result['num_docs_with_term'] = len(nonzero_frequencies)
    # Guard against an empty document set instead of dividing by zero
    if num_docs:
        result['pct_docs_with_term'] = (
            result['num_docs_with_term'] / float(num_docs))
    else:
        result['pct_docs_with_term'] = 0.0
    # Statistics of frequency of the term in the documents
    (result['term_in_doc_median'],
     result['term_in_doc_mean'],
     result['term_in_doc_std']) = _summary_stats(nonzero_frequencies)

    # Position of the first appearance of the term in the documents
    position_first_occurrences_abs = []  # 1-based positions of the first occurrences
    position_first_occurrences_pct = []  # Positions relative to the length of each document
    for doc in freq_nonzero_docs:
        words = get_words(doc, lowercase=True)
        try:
            first_occurrence_abs = words.index(term) + 1.
        except ValueError:
            # get_words may split the text differently from the vectorizer
            # that built the DTM; skip documents where the term is not found
            # as a standalone word rather than failing the whole computation.
            continue
        position_first_occurrences_abs.append(first_occurrence_abs)
        position_first_occurrences_pct.append(first_occurrence_abs / len(words))

    (result['pos_first_appearance_abs_median'],
     result['pos_first_appearance_abs_mean'],
     result['pos_first_appearance_abs_std']) = _summary_stats(
         position_first_occurrences_abs)
    (result['pos_first_appearance_pct_median'],
     result['pos_first_appearance_pct_mean'],
     result['pos_first_appearance_pct_std']) = _summary_stats(
         position_first_occurrences_pct)

    return result
예제 #2
0
 def remove_sw_from_text(self, doc, as_string=True, lowercase=False, remove_punctuation=True):
     """
     Split a text into words and strip the stop words from them.

     :param doc: The text to clean of stop words
     :param as_string: Forwarded to ``remove_sw_from_list``; selects whether
         the output is a string or a list
     :param lowercase: Forwarded to ``get_words``; whether the words must be
         lowercased
     :param remove_punctuation: Forwarded to ``get_words``; whether
         punctuation must be dropped
     :return: Whatever ``remove_sw_from_list`` returns for the extracted words
     """
     # Tokenize first, then delegate the actual stop-word filtering.
     tokens = text_transformations.get_words(doc, lowercase, remove_punctuation)
     cleaned = self.remove_sw_from_list(tokens, as_string)
     return cleaned
 def test_get_words_list_with_punctuation(self):
     # With remove_punctuation=False the expected token list contains ":"
     # and each "!" as its own entry, with the original casing kept.
     tokens = get_words(self.input_str, remove_punctuation=False)
     expected = [
         u"Párrafo",
         u"en",
         u"español",
         u":",
         u"lowercase",
         u"and",
         u"UPPERCASE",
         u"!",
         u"!",
     ]
     self.assertEqual(expected, tokens)
 def test_get_lowercase_words_list(self):
     # With lowercase=True every expected word is lowercased and no
     # punctuation entries are expected.
     tokens = get_words(self.input_str, lowercase=True)
     expected = [
         u"párrafo",
         u"en",
         u"español",
         u"lowercase",
         u"and",
         u"uppercase",
     ]
     self.assertEqual(expected, tokens)
 def test_get_words_list(self):
     # Default call: the expected words keep their original casing and no
     # punctuation entries are expected.
     tokens = get_words(self.input_str)
     expected = [
         u"Párrafo",
         u"en",
         u"español",
         u"lowercase",
         u"and",
         u"UPPERCASE",
     ]
     self.assertEqual(expected, tokens)