Python TextCollection примеры, nltk.TextCollection Python примеры использования

Пример #1

0

Показать файл

Файл: abstract.py Проект: nihaofuyue0617/pythia

 def construct_term_doc_matrix(self, pca=False):
     '''
     Build the term-document matrix for the documents in self.document_dict.

     After the call, self.td_matrix[i][j] holds the tf-idf score of
     self.attributes[j] in the i-th document, where documents are taken in
     self.document_dict.values() order.

     Args:
         pca: when true, the matrix is passed through construct_orange_table
             and orange_pca, and self.attributes is replaced by plain
             column indices.

     Raises:
         ValueError: if a document word is missing from the corpus
             vocabulary (same exception the former list.index() raised).
     '''
     if not self.filter_terms:
         corpus = nltk.TextCollection([document.tokens for document in self.document_dict.values()])
     else:
         corpus = nltk.TextCollection(self._filter_terms())

     terms = list(set(corpus))
     # One dictionary lookup per word instead of a linear list.index() scan.
     column_of = dict((term, column) for column, term in enumerate(terms))
     data_rows = numpy.zeros([len(self.document_dict), len(terms)])

     for i, document in enumerate(self.document_dict.values()):
         text = nltk.Text(document.tokens)
         for item in document.word_frequencies:
             score = corpus.tf_idf(item.word, text)
             column = column_of.get(item.word)
             if column is None:
                 raise ValueError('%r is not in list' % (item.word,))
             data_rows[i][column] = score

     self.attributes = terms
     self.td_matrix = data_rows

     # If PCA is requested, project the points on their principal components
     # for dimensionality reduction.
     if pca:
         t = construct_orange_table(self.attributes, self.td_matrix)
         self.td_matrix = orange_pca(t)
         # Attribute names have no meaning after dimensionality reduction.
         self.attributes = list(range(self.td_matrix.shape[1]))

Пример #2

0

Показать файл

Файл: DocumentSimilarityLibrary.py Проект: ericwang1120/projectSample

def compute_tf_idf_document_matrix(articles_dict):
    """Return a nested dict of tf-idf scores: result[review_id][term].

    Args:
        articles_dict: mapping whose keys convert to the integer positions
            0..len-1 and whose values are dicts with 'content' and
            'review_id' entries. Each value additionally receives a
            'tokenized' entry (the lower-cased, whitespace-split content).

    Returns:
        dict mapping each article's 'review_id' to a dict of
        {term: tf-idf of the term in that article}.
    """
    # Slot k holds the tokens of the article stored under key k. A real list
    # is required here: on Python 3 a range object cannot be assigned into.
    all_articles = [None] * len(articles_dict)
    for key, article in articles_dict.items():
        tokens = article['content'].lower().split()
        all_articles[int(key)] = tokens
        article['tokenized'] = tokens

    # A TextCollection over all articles provides tf-idf.
    tc = nltk.TextCollection(all_articles)

    # Target: tf-idf value for every word of every document.
    td_matrix = {}
    for article in articles_dict.values():
        post = article['tokenized']
        doc_review_id = article['review_id']
        td_matrix[doc_review_id] = dict(
            (term, tc.tf_idf(term, post)) for term in nltk.FreqDist(post))

    return td_matrix

Пример #3

0

Показать файл

Файл: tf_idf_term_search.py Проект: hanhanwu/Hanhan-NaturalLanguageProcessing_Basic

def main():
    """Rank the activities of a Google+ JSON export by tf-idf of QUERY_TERMS.

    Loads the export, tokenizes every activity that has content, scores each
    one by the summed tf-idf of the query terms and prints title, url and
    score of the matching activities, best match first.
    """
    f_path = '[change to your googleplus_posts.json location]'
    with open(f_path) as handle:
        data = json.load(handle)

    QUERY_TERMS = ['mobile']  # You can change the search terms here

    # Keep every activity next to its own tokens. Activities without content
    # are skipped, so positions in the token list do not line up with
    # positions in `data`; pairing avoids reporting the wrong title/url.
    tokenized = []
    for activity in data:
        tokens = activity['object']['content'].lower().split()
        if tokens:  # also skips whitespace-only content (empty token list)
            tokenized.append((activity, tokens))

    # nltk TextCollection has tf-idf itself
    tc = nltk.TextCollection([tokens for _, tokens in tokenized])

    relevant_activities = []
    for activity, tokens in tokenized:
        score = sum(tc.tf_idf(term.lower(), tokens) for term in QUERY_TERMS)
        if score > 0:
            relevant_activities.append({
                'score': score,
                'title': activity['title'],
                'url': activity['url']
            })

    relevant_activities.sort(key=lambda a: a['score'], reverse=True)
    # %-formatting keeps the output of the former print statements while
    # working on both Python 2 and Python 3.
    for ra in relevant_activities:
        print('title:  %s' % ra['title'])
        print('url:  %s' % ra['url'])
        print('score:  %s' % ra['score'])

Пример #4

0

Показать файл

Файл: online.py Проект: nihaofuyue0617/pythia

    def construct_term_doc_matrix(self, index, document):
        '''
        Overrides the parent method for constructing a td_matrix, because the
        matrix is built with a sliding window approach.

        Builds a corpus from a window of self.document_dict values, then
        stores the corpus vocabulary in self.attributes and a one-dimensional
        tf-idf vector (one entry per vocabulary term) in self.td_matrix.

        Args:
            index: position used to select the window; while it is smaller
                than self.window every stored document is used.
            document: object exposing `tokens` and `word_frequencies`.
        '''
        if index < self.window:
            documents = self.document_dict.values()
        else:
            # The slice end is exclusive, so position `index` itself is not
            # part of the window.
            window = (index - self.window + 1, index)
            # Slicing values() requires it to return a list (Python 2 dict).
            documents = self.document_dict.values()[window[0]:window[1]]

        #Online clustering doesn't support term filtering yet
        # NOTE(review): the comprehension variable below is also named
        # `document`. On Python 2 list-comprehension variables leak into the
        # enclosing scope, so the `document` parameter would be rebound to the
        # last element of `documents` before it is used further down.
        # Confirm which document is meant to be vectorized.
        corpus = nltk.TextCollection(
            [document.tokens for document in documents])

        terms = list(set(corpus))
        term_vector = numpy.zeros(len(set(corpus)))

        text = nltk.Text(document.tokens)
        for item in document.word_frequencies:
            # terms.index() raises ValueError for a word that is not part of
            # the window vocabulary.
            term_vector[terms.index(item.word)] = corpus.tf_idf(
                item.word, text)

        self.attributes = terms
        self.td_matrix = term_vector

Пример #5

0

Показать файл

Файл: PNDBasicClustering.py Проект: Kino1994/data-science-urjc

def cluster_texts(texts, clustersNumber, distance):
    """Cluster *texts* with NLTK's GAAClusterer on term-frequency vectors.

    Args:
        texts: the texts to cluster.
        clustersNumber: number of clusters handed to GAAClusterer.
        distance: accepted for interface compatibility; not used here.

    Returns:
        The result of GAAClusterer.cluster(vectors, True).
    """
    # Load the list of texts into a TextCollection object.
    corpus = nltk.TextCollection(texts)
    print("Created a collection of", len(corpus), "terms.")

    # The unique terms are the dimensions of the text vectors.
    vocabulary = list(set(corpus))
    print("Unique terms found: ", len(vocabulary))

    # TF measures how often each unique term appears in a single document;
    # it does not look at the collection. Measures such as TF-IDF also take
    # the collection-wide frequency into account.
    vectors = []
    for text in texts:
        vectors.append(numpy.array(TF(text, vocabulary, corpus)))
    print("Vectors created.")
    print(vectors)

    # Group-average agglomerative clustering from NLTK. The same could be
    # done with scikit-learn's AgglomerativeClustering.
    return GAAClusterer(clustersNumber).cluster(vectors, True)

Пример #6

0

Показать файл

Файл: search.py Проект: ysenarath/opinion-framework

def render_wordcloud(form, **kwargs):
    """Render the word-cloud results page for a search form.

    Runs the search with the form values, tokenizes every result text
    (dropping tokens found in stopwords_en), and renders the template with
    the 1000 most common tokens and the corpus collocations.

    Args:
        form: object whose values() are passed to search.search as keywords.
        **kwargs: forwarded unchanged to render_template.

    Returns:
        The rendered view.
    """
    session = Session()
    # try/finally so the session is closed even when the search, the
    # tokenization or the rendering raises.
    try:
        results = search.search(session, **form.values())
        # Create the corpus from the results
        tknzr = TweetTokenizer()
        texts = []
        for r in results:
            tokens = []
            for sent in sent_tokenize(r.text.strip()):
                tokens.extend(
                    w for w in tknzr.tokenize(sent.strip())
                    if w.lower() not in stopwords_en)
            texts.append(tokens)
        corpus = nltk.TextCollection(texts)
        # Called for its side effect: the private _collocations attribute is
        # read right below.
        corpus.collocations(100)
        # noinspection PyProtectedMember
        results = {
            'vocabulary': [list(i) for i in corpus.vocab().most_common(1000)],
            'collocations': corpus._collocations,
        }
        return render_template('./templates/search/results_wordcloud.html',
                               form=form,
                               results=results,
                               **kwargs)
    finally:
        session.close()

Пример #7

0

Показать файл

Файл: word_frequency_job_new.py Проект: mitmedialab/Boston-Attention-Mapping

 def calculate_results(self):
     '''
     Store the first 100 entries of self.tf_icf(city) under the "freqdist"
     key of every record in self.cities and save each record via self.db.

     The collection-wide frequency table that used to be computed here was
     never read; it has been dropped because it indexed the first vocabulary
     entry, which fails for an empty corpus and for non-indexable items()
     views.
     '''
     for city in self.cities:
         top_terms = self.tf_icf(city)[0:100]
         record = self.cities[city]
         record["freqdist"] = top_terms
         self.db.save(record)

Пример #8

0

Показать файл

Файл: knn.py Проект: zzhaoiii/201834890ZhangZhao

def TF_IDF2(documents, dictionary):
    """Return one tf-idf vector per document over the terms of *dictionary*.

    Each document is first reduced to the tokens that occur in *dictionary*.
    The reduced documents are kept as token lists: joining them into strings
    would make TextCollection treat every character as a token and count
    substring matches instead of whole terms.

    Args:
        documents: iterable of token sequences.
        dictionary: iterable of hashable terms; its iteration order defines
            the vector layout.

    Returns:
        list of lists; vectors[d][k] is the tf-idf of the k-th dictionary
        term in document d. A document left without tokens gets an all-zero
        vector.
    """
    print('tf-idf')
    allowed = set(dictionary)  # O(1) membership test per token
    # Rebuild the text collection, keeping only dictionary tokens.
    Texts = []
    for i, document in enumerate(documents):
        Texts.append([token for token in document if token in allowed])
        print(i)
    # TextCollection provides the tf-idf computation.
    tc = nltk.TextCollection(Texts)
    vectors = []
    for i, document in enumerate(Texts):
        if document:
            vector = [tc.tf_idf(str(item), document) for item in dictionary]
        else:
            # Term frequency divides by the document length; an empty
            # document has no terms, so every weight is zero.
            vector = [0.0 for _ in dictionary]
        vectors.append(vector)
        print(i)

    return vectors

Пример #9

0

Показать файл

Файл: tfidf_vectorizer.py Проект: zeyefkey/accel-brain-code

 def __init__(self, token_list_list):
     '''
     Wrap the tokenized documents in an nltk.TextCollection.

     Args:
         token_list_list:    The list of list of tokens.
     '''
     collection = nltk.TextCollection(token_list_list)
     self.__collection = collection

Пример #10

0

Показать файл

Файл: PeopleNameDisambiguation_IgnacioArias_RaulSánchez.py Проект: Kino1994/data-science-urjc

def cluster_texts(texts, clustersNumber, distanceFunction, clusterMode):
    """
    Cluster several texts represented as term-frequency vectors.

    Args:
        texts: collection of texts to cluster.
        clustersNumber: number of clusters to be used.
        distanceFunction: passed as `affinity` to AgglomerativeClustering;
            not used by the other two modes.
        clusterMode: "AgglomerativeClustering", "KMeans" or
            "MiniBatchKMeans".

    Returns:
        The cluster labels produced by the selected clusterer, or None
        (after printing a message) when clusterMode is not recognised.
    """
    corpus = nltk.TextCollection(texts)

    # The unique terms are the dimensions of the vectors.
    vocabulary = list(set(corpus))

    # TF measures how often each unique term appears in a single document;
    # it does not look at the collection. TF-IDF-like measures also consider
    # the collection-wide frequency.
    vectors = []
    for text in texts:
        vectors.append(numpy.array(TF(text, vocabulary, corpus)))

    if clusterMode == "AgglomerativeClustering":
        model = AgglomerativeClustering(n_clusters=clustersNumber,
                                        linkage="average",
                                        affinity=distanceFunction)
        return model.fit_predict(vectors)

    # The two k-means variants share the fit-then-predict call.
    if clusterMode == "KMeans":
        model = KMeans(n_clusters=clustersNumber, random_state=0)
    elif clusterMode == "MiniBatchKMeans":
        model = MiniBatchKMeans(n_clusters=clustersNumber, random_state=0)
    else:
        print("Invalid cluster mode")
        return None

    return model.fit(vectors).predict(vectors)

Пример #11

0

Показать файл

 def get_most_frequent_terms(self, N=5):
     '''
     Returns the top N occurring terms in this cluster.

     When self.top_patterns is set (not None) it is returned unchanged.
     Otherwise the terms are counted over the tokens of every document in
     self.document_dict.

     Args:
         N: number of (term, count) pairs to return.
     '''
     # Identity test: `!= None` would invoke a custom or element-wise __ne__.
     if self.top_patterns is not None:
         return self.top_patterns

     corpus = nltk.TextCollection([document.tokens for document in self.document_dict.values()])
     frequencies = nltk.FreqDist(corpus)
     if hasattr(frequencies, 'most_common'):
         # Counter-based FreqDist: items() is neither sorted nor sliceable
         # on Python 3, most_common() gives the ranking directly.
         return frequencies.most_common(N)
     # NOTE(review): fallback for FreqDist versions without most_common();
     # presumably their items() is already ordered by frequency - confirm.
     return frequencies.items()[:N]

Пример #12

0

Показать файл

Файл: tf-idf_sample.py Проект: zaakya666/test

def tfidf(doc, docs):
    """Return the TF-IDF of every distinct token of *docs*, measured in *doc*.

    Args:
        doc: token list of the target document.
        docs: token lists of all documents.

    Returns:
        list of {"word": token, "tfidf": score} dicts, one per distinct token.
    """
    # Flatten first, then build the collection (same order as before).
    vocabulary = set(chain.from_iterable(docs))
    collection = nltk.TextCollection(docs)
    scores = []
    for token_type in vocabulary:
        scores.append({
            "word": token_type,
            "tfidf": collection.tf_idf(token_type, doc),
        })
    return scores

Пример #13

0

Показать файл

def tf_idf(docs):
    """Map every distinct token of *docs* to a tf-idf score.

    The term frequency is measured against the concatenation of all
    documents, the inverse document frequency against the individual
    documents in *docs*.
    """
    all_tokens = []
    for doc in docs:
        all_tokens.extend(doc)
    collection = nltk.TextCollection(docs)
    return {
        token_type: collection.tf_idf(token_type, all_tokens)
        for token_type in set(all_tokens)
    }

Пример #14

0

Показать файл

def get_tf(docid, term, index):
    """Return the term frequency of *term* in the document stored as *docid*.

    Returns the string "Not valid term, can not be term" when
    is_phrase_term(term) is true, and "Not Found" when *docid* is not in
    index._doc_contents.
    """
    # Phrases are rejected with a message instead of a number.
    if is_phrase_term(term):
        return "Not valid term, can not be term"
    if docid not in index._doc_contents:
        return "Not Found"

    tokens = nltk.word_tokenize(index._doc_contents[docid])
    doc = nltk.Text(tokens)
    return nltk.TextCollection([doc]).tf(term, doc)

Пример #15

0

Показать файл

    def _calculate_centroid(self):
        '''
        Sets self.centroid to the column-wise mean of the feature vectors
        (`fv`) of all documents in self.documents.
        '''
        corpus = nltk.TextCollection([doc.tokens for doc in self.documents.values()])
        # The vocabulary only determines the width of the stacking matrix.
        vocabulary_size = len(list(set(corpus)))
        row_count = len(self.documents.items())

        stacked = numpy.zeros([row_count, vocabulary_size])
        for row, doc in enumerate(self.documents.values()):
            stacked[row] = doc.fv

        self.centroid = numpy.mean(stacked, axis=0)

Пример #16

0

Показать файл

 def get_collocations(self, n=2, N=5):
     '''
     Finds the top N bigram collocations of the cluster corpus, ranked by
     the Jaccard index, ignoring English stopwords.

     Args:
         n: n-gram size; defaults to 2. The code visible here only builds a
            bigram finder and does not read `n`.
         N: number of best collocations requested from the finder.
     '''
     corpus = nltk.TextCollection([document.tokens for document in self.document_dict.values()])
     finder = nltk.BigramCollocationFinder.from_words(corpus)
     scorer = nltk.metrics.BigramAssocMeasures.jaccard
     #finder.apply_freq_filter(3)
     # Load the stopword list once; calling words('english') inside the
     # filter would repeat that work for every word tested.
     english_stopwords = set(nltk.corpus.stopwords.words('english'))
     finder.apply_word_filter(lambda w: w in english_stopwords)
     collocations = finder.nbest(scorer, N)

Пример #17

0

Показать файл

    def _attach_feature_vectors(self):
        '''
        Iterates over the summarizer documents and calculates a tf-idf
        weighted feature vector for each document. The feature vector is
        attached to the document as its `fv` attribute.

        Raises:
            ValueError: if a document word is missing from the corpus
                vocabulary (same exception the former list.index() raised).
        '''
        corpus = nltk.TextCollection([document.tokens for document in self.documents.values()])
        terms = list(set(corpus))
        # Build the vocabulary and its index once instead of rebuilding the
        # set per document and scanning the list per word.
        column_of = dict((term, column) for column, term in enumerate(terms))

        # items() works on both Python 2 and Python 3; `doc_id` avoids
        # shadowing the builtin `id`.
        for doc_id, document in self.documents.items():
            text = nltk.Text(document.tokens)
            fv = numpy.zeros([len(terms)])
            for item in document.word_frequencies:
                score = corpus.tf_idf(item.word, text)
                column = column_of.get(item.word)
                if column is None:
                    raise ValueError('%r is not in list' % (item.word,))
                fv[column] = score
            self.documents[doc_id].fv = fv

Пример #18

0

Показать файл

def convertToTexts():
    """Turn every file in the working directory whose name contains "c_"
    into an nltk.Text.

    Each file is lower-cased, characters that are neither word characters
    nor whitespace are replaced by spaces, and the text is tokenized and
    passed through remove_stopwords.

    Returns:
        A two-element list: [TextCollection over all texts, list of texts].
    """
    print("Converting clean files to text collection...")
    textList = []
    for filename in os.listdir(os.getcwd()):
        if "c_" not in filename:
            continue
        # The context manager closes the file even if reading fails.
        with open(filename, 'r', encoding='utf-8') as handle:
            text = handle.read().lower()
        # Raw string: '\w' and '\s' are not valid escapes in a plain literal.
        text = re.sub(r'[^\w\s]', ' ', text)
        tokens = remove_stopwords(nltk.word_tokenize(text))
        textList.append(nltk.Text(tokens))
    print("Finished converting clean files to Text collection")
    return [nltk.TextCollection(textList), textList]

Пример #19

0

Показать файл

def tfidf(word):
    """Return, for every document in *word*, up to 20 of its terms ranked by
    tf-idf (highest first); terms scoring 0 or less are left out.

    Args:
        word: list of token lists, one per document.
    """
    collection = nltk.TextCollection(word)
    doc = []
    for tokens in word:
        scored = []
        for term in set(tokens):
            weight = collection.tf_idf(term, tokens)
            if weight > 0:
                scored.append([term, weight])
        # Ascending stable sort followed by a reversal (not reverse=True),
        # so ties keep the order they had before.
        ranked = sorted(scored, key=lambda pair: pair[1])[::-1]
        doc.append([pair[0] for pair in ranked[:20]])

    return doc

Пример #20

0

Показать файл

def tf_idf(sentence, resources):
    """Score the morphemes of *sentence* by tf-idf against a corpus file.

    Args:
        sentence: text handed to MeCab for morphological analysis.
        resources: mapping whose "corpus" entry is the path of a text file
            with one space-separated document per line.

    Returns:
        list of (utf-8 encoded term, tf-idf score) tuples, one per morpheme
        occurrence, sorted by score in descending order.
    """
    # The context manager closes the corpus file even if reading fails.
    with open(resources["corpus"]) as corpus_file:
        data = corpus_file.read()
    print("Finished reading file....")

    # Morphological analysis of the given sentence
    mt = MeCab.Tagger(dic_path)
    # NOTE(review): looks like the customary empty parse() issued before
    # parseToNode() - confirm it is still required.
    mt.parse('')
    res = mt.parseToNode(sentence)

    elements = []
    while res:
        elements.append(res.surface)
        res = res.next

    print("Finished morphological analysis....")

    # Drop the first and the last node (presumably MeCab's begin/end of
    # sentence markers - confirm).
    elements = elements[1:-1]

    # The analysed sentence is the first document, followed by one document
    # per corpus line.
    docs = [elements]
    docs.extend(l.split(" ") for l in data.split("\n"))

    print("Finished spliting word....")

    collection = nltk.TextCollection(docs)
    result = [(term.encode("utf-8"), collection.tf_idf(term, elements))
              for term in elements]

    result.sort(reverse=True, key=lambda x: float(x[1]))
    return result

Пример #21

0

Показать файл

def TFIDF(document):
    """Return a list of tf-idf values computed over *document*.

    Args:
        document: iterable of strings; each one is split on single spaces.

    Returns:
        list with one tf-idf value per element of the TextCollection built
        from the sorted, de-duplicated words.
    """
    # Merge the words of all documents into a single set.
    vocabulary = set()
    for dokumen in document:
        vocabulary |= set(dokumen.split(' '))
    # Sort the collected words alphabetically.
    kum_kata = sorted(vocabulary)
    # NOTE(review): TextCollection receives a flat list of words here rather
    # than a list of token lists - confirm this is intended.
    collection = nltk.TextCollection(kum_kata)
    unique_terms = list(collection)
    word_tfidf = [collection.tf_idf(word, document) for word in unique_terms]
    return word_tfidf

Пример #22

0

Показать файл

Файл: tf_idf.py Проект: theblind/quora_question_pairs

def question_match_tf_idf(data_question1, data_question2):
    """Calculate the match rate between paired questions based on TF-IDF.

    The IDF weight of every token is computed over the union of both
    question columns, then match_rate_tf_idf is applied pair by pair.

    Returns:
        list with one match rate per (question1, question2) pair.
    """
    # IDF weights over the combined corpus of both columns.
    question_corpus = data_question1.tolist() + data_question2.tolist()
    text_collection = nltk.TextCollection(question_corpus)
    weights = {}
    for word in text_collection.tokens:
        weights[word] = text_collection.idf(word)

    # Match rate for each question pair.
    return [
        match_rate_tf_idf(question1, question2, weights)
        for question1, question2 in zip(data_question1, data_question2)
    ]

Пример #23

0

Показать файл

Файл: HashTable.py Проект: rafaelpiresm/studying-indexing

    def create_index(self, documentos):
        """Add every term of *documentos* to self.contents and return it.

        For each document the text is cleaned, tokenized and lower-cased;
        every distinct token (after self.remove_punctuation) is recorded
        with the document url, its tf-idf and its frequency. A term already
        present in self.contents gets the new entry appended to its existing
        Content; otherwise a new Content is created.

        Args:
            documentos: iterable of objects with `texto` and `url`.

        Returns:
            self.contents
        """
        listaTextos = []
        for d in documentos:
            listaTextos.append(
                nltk.wordpunct_tokenize(
                    nltk.clean_html(d.texto.encode('utf-8'))))
        # Build the collection once: it does not change while indexing, and
        # rebuilding it for every term repeated the same work.
        tc = nltk.TextCollection(listaTextos)

        for d in documentos:
            tokens = nltk.wordpunct_tokenize(nltk.clean_html(d.texto))
            tokens = [token.lower() for token in tokens]
            frequencency = nltk.FreqDist(tokens)
            for token, frequencia in frequencency.items():
                termo = self.remove_punctuation(token)
                if len(termo) == 0:
                    continue
                # NOTE(review): tf-idf is measured against the raw text of
                # the document, not its token list - confirm intended.
                tf_idf = tc.tf_idf(termo, d.texto)

                # Look up the entry itself. The former counter was bumped
                # before the comparison, so on a hit it pointed one position
                # past the matching element.
                existing = None
                for c in self.contents:
                    if c.termo == termo:
                        existing = c
                        break

                if existing is None:
                    content = Content()
                    content.termo = termo
                    content.urls.append(url=d.url,
                                        tf_idf=tf_idf,
                                        frequencia=frequencia)
                    self.contents.append(content)
                else:
                    try:
                        existing.urls.append(url=d.url,
                                             tf_idf=tf_idf,
                                             frequencia=frequencia)
                    except Exception:
                        # Best effort as before, but without swallowing
                        # KeyboardInterrupt/SystemExit.
                        print('Nao foi possivel adicionar um termo')
        return self.contents

Пример #24

0

Показать файл

    def load_possible_terms(self, np_text_list):
        """
        Retrieve possible words/terms from a numpy list of text.

        Every text is normalized, its unique words are appended to
        self.word_list (which is then de-duplicated with np.unique), and
        self.text_collection is rebuilt from the resulting word list.

        Args:
            np_text_list(np(list(string))): Numpy list containing the text
                from which terms are extracted
        """
        collected = np.array([])
        for raw_text in np_text_list:
            normalized = StringManipulator.normalize_text(raw_text)
            unique_words = StringManipulator.retrieve_unique_words(normalized)
            collected = np.append(collected, unique_words)

        merged = np.append(self.word_list, collected)
        self.word_list = np.unique(merged)

        # NOTE(review): TextCollection receives a flat array of words here
        # rather than a list of token lists - confirm this is intended.
        self.text_collection = nltk.TextCollection(self.word_list)

Пример #25

0

Показать файл

Файл: BasicNewsClustering_original.py Проект: marinamashina/natural-language-processing-nltk

def cluster_texts(texts, clustersNumber, distance):
    """Cluster *texts* with average-linkage agglomerative clustering.

    Args:
        texts: the texts to cluster.
        clustersNumber: number of clusters to build.
        distance: affinity handed to AgglomerativeClustering.

    Returns:
        The labels returned by AgglomerativeClustering.fit_predict.
    """
    # Load the list of texts into a TextCollection object.
    collection = nltk.TextCollection(texts)
    print("Created a collection of", len(collection), "terms.")

    # Get a list of unique terms.
    unique_terms = list(set(collection))
    print("Unique terms found: ", len(unique_terms))

    # Build one term-frequency vector per text.
    vectors = [numpy.array(TF(f, unique_terms, collection)) for f in texts]
    print("Vectors created.")

    # Use the `distance` argument: the name `distanceFunction` is not
    # defined in this function, so the call depended on a global of that
    # name existing.
    clusterer = AgglomerativeClustering(n_clusters=clustersNumber,
                                        linkage="average",
                                        affinity=distance)
    clusters = clusterer.fit_predict(vectors)

    return clusters

Пример #26

0

Показать файл

Файл: main.py Проект: NataliShort/-static_analysis_of_web_pages

 def getRelevantNews(self):
     '''
     Fetch the news via self.getNews(), score every item by the summed
     tf-idf of the query terms, print the titles of the matching items and
     return them.

     Returns:
         list of {'score': ..., 'title': ...} dicts with a positive score,
         sorted by score in descending order.
     '''
     # Define your query here
     QUERY_TERMS = ['стол', 'кубка', 'регион']
     query_terms = [t.lower() for t in QUERY_TERMS]  # lower-cased once
     # Fetch the list of news items
     self.news = self.getNews()
     # TextCollection provides the tf, idf and tf_idf abstractions,
     # so no custom versions are needed
     tc = nltk.TextCollection(self.news)
     relevant = []
     for item in self.news:
         # Start from zero: with an initial score of 1 the `score > 0`
         # filter below could never exclude an item.
         score = 0
         for term in query_terms:
             score += tc.tf_idf(term, item)
         if score > 0:
             relevant.append({'score': score, 'title': item})
     # Sort the results by relevance and print them
     relevants = sorted(relevant, key=lambda p: p['score'], reverse=True)
     for post in relevants:
         print('{0}'.format(post['title']))
     return relevants

Пример #27

0

Показать файл

Файл: practica.py Проект: sariogonfer/Master-Data-Science---RInfo-Practica-I

def cluster_texts(texts, cluster_number, distance, verbose=True, measure=TF):
    """Cluster *texts* with average-linkage agglomerative clustering.

    Args:
        texts: the texts to cluster.
        cluster_number: number of clusters to build.
        distance: accepted but not read; the affinity is fixed to 'cosine'.
            NOTE(review): confirm whether `distance` should be forwarded.
        verbose: when true, print the collection and vocabulary sizes.
        measure: callable(text, unique_terms, collection) producing the
            vector of a text.

    Returns:
        The labels returned by AgglomerativeClustering.fit_predict.
    """
    # Load the list of texts into a TextCollection object.
    corpus = nltk.TextCollection(texts)

    # The unique terms are the dimensions of the vectors.
    vocabulary = list(set(corpus))

    if verbose:
        print("Creando collecion de %d terminos" % len(corpus))
        print("Terminos unicos encontrados: ", len(vocabulary))

    # One vector per text, built with the selected measure.
    vectors = []
    for text in texts:
        vectors.append(numpy.array(measure(text, vocabulary, corpus)))

    model = AgglomerativeClustering(n_clusters=cluster_number,
                                    linkage="average", affinity='cosine')
    return model.fit_predict(vectors)

Пример #28

0

Показать файл

Файл: reuter.py Проект: praveeneln/socialcomputing

def getTDMatrix(textCorpus):
    """Compute a term-document matrix of tf-idf scores.

    Args:
        textCorpus: sequence of dicts with 'text' and 'author' entries.

    Returns:
        dict such that td_matrix[author][term] is the tf-idf score of the
        term in that entry's text. Entries sharing an 'author' value
        overwrite one another, since the author is the key.
    """
    all_articles = [article['text'].lower().split() for article in textCorpus]

    tc = nltk.TextCollection(all_articles)

    td_matrix = {}
    # enumerate/zip replace the index loop; print() and plain iteration over
    # the FreqDist work on both Python 2 and Python 3.
    for i, (entry, article) in enumerate(zip(textCorpus, all_articles), 1):
        print(i)  # progress counter, starting at 1
        doc_title = entry['author']
        # takes long..
        td_matrix[doc_title] = dict(
            (term, tc.tf_idf(term, article))
            for term in nltk.FreqDist(article))
    return td_matrix

Пример #29

0

Показать файл

Файл: NewsClustering.py Проект: Javier162380/language-processing-assigment

def cluster_texts(texts, clustersNumber, distance):
    """Cluster *texts* twice: once on TF-IDF vectors and once on IDF vectors.

    Args:
        texts: the texts to cluster.
        clustersNumber: number of clusters to build.
        distance: affinity handed to AgglomerativeClustering.

    Returns:
        Tuple (labels from the TF-IDF vectors, labels from the IDF vectors).
    """
    # Load the list of texts into a TextCollection object.
    corpus = nltk.TextCollection(texts)
    print("Created a collection of {0}, terms.".format(len(corpus)))

    # The unique terms are the dimensions of the vectors.
    vocabulary = list(set(corpus))
    print("Unique terms found: ", len(vocabulary))

    # Two separate passes, TF-IDF first and IDF second.
    vectors_tf_idf = []
    for text in texts:
        vectors_tf_idf.append(numpy.array(TF_IDF(text, vocabulary, corpus)))

    vectors_idf = []
    for text in texts:
        vectors_idf.append(numpy.array(IDF(text, vocabulary, corpus)))
    print("Vectors created.")

    # The same estimator instance is fitted on each vector set in turn.
    model = AgglomerativeClustering(n_clusters=clustersNumber,
                                    linkage="average",
                                    affinity=distance)
    labels_tfidf = model.fit_predict(vectors_tf_idf)
    labels_idf = model.fit_predict(vectors_idf)
    return (labels_tfidf, labels_idf)

Пример #30

0

Показать файл

def cluster_texts(texts, clustersNumber, distance):
    """Cluster *texts* on TF-IDF vectors with average-linkage agglomerative
    clustering.

    Args:
        texts: the texts to cluster.
        clustersNumber: number of clusters to build.
        distance: affinity handed to AgglomerativeClustering.

    Returns:
        The labels returned by AgglomerativeClustering.fit_predict.
    """
    # Load the list of texts into a TextCollection object.
    collection = nltk.TextCollection(texts)
    print("Created a collection of", len(collection), "terms.")

    # Get a list of unique terms
    unique_terms = list(set(collection))

    print("Unique terms found: ", len(unique_terms))

    # Build one TF-IDF vector per text.
    vectors = [
        numpy.array(TF_IDF(f, unique_terms, collection)) for f in texts
    ]
    print("Vectors created.")

    # Use the `distance` argument: the name `distanceFunction` is not
    # defined in this function, so the call depended on a global of that
    # name existing.
    clusterer = AgglomerativeClustering(n_clusters=clustersNumber,
                                        linkage="average",
                                        affinity=distance)
    clusters = clusterer.fit_predict(vectors)

    return clusters

Python TextCollection примеры использования