def load_document_as_bos(input_file, language="en", normalization="stemming", stoplist=[]): """Load a document as a bag of words/stems/lemmas. Args: input_file (str): path to input file. language (str): language of the input documents, used for stop_words in sklearn CountVectorizer, defaults to 'en'. normalization (str): word normalization method, defaults to 'stemming'. Other possible values are 'lemmatization' or 'None' for using word surface forms instead of stems/lemmas. stoplist (list): the stop words for filtering tokens, default to []. """ # initialize load file object doc = LoadFile() # read the input file doc.load_document(input=input_file, language=language, normalization=normalization) # initialize document vector vector = defaultdict(int) # loop through the sentences and add the stems to the vector for i, sentence in enumerate(doc.sentences): for j, stem in enumerate(sentence.stems): if stem in stoplist: continue vector[stem] += 1 return vector
def expand_word_graph(self, input_file, similarity, window=10, pos=None):
    """Expands the word graph using the given document.

    Args:
        input_file (str): path to the input file.
        similarity (float): similarity for weighting edges.
        window (int): the window within the sentence for connecting two
            words in the graph, defaults to 10.
        pos (set): the set of valid pos for words to be considered as nodes
            in the graph, defaults to ('NOUN', 'PROPN', 'ADJ').
    """

    # define default pos tags set
    if pos is None:
        pos = {'NOUN', 'PROPN', 'ADJ'}

    # initialize document loader
    doc = LoadFile()

    # read document
    doc.load_document(input=input_file,
                      language=self.language,
                      normalization=self.normalization)

    # flatten document and initialize nodes
    sequence = []

    for sentence in doc.sentences:
        for j, node in enumerate(sentence.stems):
            if node not in self.graph and sentence.pos[j] in pos:
                self.graph.add_node(node)
            sequence.append((node, sentence.pos[j]))

    # loop through sequence to build the edges in the graph
    for j, node_1 in enumerate(sequence):
        for k in range(j + 1, min(j + window, len(sequence))):
            node_2 = sequence[k]
            if node_1[1] in pos and node_2[1] in pos \
                    and node_1[0] != node_2[0]:
                if not self.graph.has_edge(node_1[0], node_2[0]):
                    self.graph.add_edge(node_1[0], node_2[0], weight=0)
                self.graph[node_1[0]][node_2[0]]['weight'] += similarity
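# Usage sketch (not part of the original module): expand_word_graph is a method,
# so it needs an object exposing self.graph, self.language and self.normalization;
# pke's ExpandRank is assumed here. Paths and similarity values are hypothetical,
# and in pke's usual workflow this expansion is typically performed for you during
# candidate weighting rather than called directly.
def _example_expand_word_graph():
    import pke
    extractor = pke.unsupervised.ExpandRank()
    extractor.load_document(input='data/doc1.txt', language='en')

    # add edges from two neighbouring documents, weighted by their similarity
    # to the main document
    for path, similarity in [('data/neighbour1.txt', 0.4),
                             ('data/neighbour2.txt', 0.2)]:
        extractor.expand_word_graph(input_file=path,
                                    similarity=similarity,
                                    window=10,
                                    pos={'NOUN', 'PROPN', 'ADJ'})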
def load_document_as_bos(input_file, format="corenlp", use_lemmas=False, stemmer="porter", stoplist=[]): """Load a document as a bag of stems. Args: input_file (str): path to input file. format (str): the input files format, defaults to corenlp. use_lemmas (bool): whether lemmas from stanford corenlp are used instead of stems (computed by nltk), defaults to False. stemmer (str): the stemmer in nltk to used (if used), defaults to porter. stoplist (list): the stop words for filtering tokens, default to []. """ # initialize load file object doc = LoadFile(input_file) # read the input file doc.read_document(format=format, use_lemmas=use_lemmas, stemmer=stemmer, sep='/') # initialize document vector vector = defaultdict(int) # loop through the sentences for i, sentence in enumerate(doc.sentences): # loop through the tokens for j, stem in enumerate(sentence.stems): # skip stem if it occurs in the stoplist if stem in stoplist: continue # count the occurrence of the stem vector[stem] += 1 return vector
def compute_document_frequency(documents,
                               output_file,
                               language='en',
                               stoplist=None,
                               normalization='stemming',
                               delimiter='\t',  # TODO: What is the use case for changing this ?
                               n=3):
    """Compute the n-gram document frequencies from a set of input documents.

    An extra row is added to the output file for specifying the number of
    documents from which the document frequencies were computed
    (--NB_DOC-- tab XXX). The output file is compressed using gzip.

    Args:
        documents (list): list of pke-readable documents.
        output_file (str): the output file.
        language (str): language of the input documents (used for computing
            the n-stem or n-lemma forms), defaults to 'en' (english).
        stoplist (list): the stop words for filtering n-grams, defaults to
            pke.lang.stopwords[language].
        normalization (str): word normalization method, defaults to
            'stemming'. Other possible value is 'none' for using word
            surface forms instead of stems/lemmas.
        delimiter (str): the delimiter between n-grams and document
            frequencies, defaults to tabulation (\t).
        n (int): the size of the n-grams, defaults to 3.
    """

    # document frequency container
    frequencies = defaultdict(int)

    # initialize number of documents
    nb_documents = 0

    # loop through the documents
    for document in documents:

        # initialize load file object
        doc = LoadFile()

        # read the input file
        doc.load_document(input=document,
                          language=language,
                          stoplist=stoplist,
                          normalization=normalization)

        # candidate selection
        doc.ngram_selection(n=n)

        # filter candidates containing punctuation marks
        doc.candidate_filtering()

        # loop through candidates
        for lexical_form in doc.candidates:
            frequencies[lexical_form] += 1

        nb_documents += 1

        if nb_documents % 1000 == 0:
            logging.info("{} docs, memory used: {} mb".format(
                nb_documents,
                sys.getsizeof(frequencies) / 1024 / 1024))

    # create directories from path if not exists
    if os.path.dirname(output_file):
        os.makedirs(os.path.dirname(output_file), exist_ok=True)

    # dump the df container
    with gzip.open(output_file, 'wt', encoding='utf-8') as f:

        # add the number of documents as special token
        first_line = '--NB_DOC--' + delimiter + str(nb_documents)
        f.write(first_line + '\n')

        for ngram in frequencies:
            line = ngram + delimiter + str(frequencies[ngram])
            f.write(line + '\n')
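# Usage sketch (not part of the original module): compute document frequencies
# from a small, hypothetical collection of raw-text documents (assumed to be
# pke-readable as plain strings) and write them to a gzipped, tab-separated file.
# The texts and output path are illustrative only.
def _example_compute_document_frequency():
    documents = ['the quick brown fox jumps over the lazy dog .',
                 'keyphrase extraction is the task of finding key phrases .']
    compute_document_frequency(documents,
                               output_file='models/df.tsv.gz',
                               language='en',
                               normalization='stemming',
                               n=3)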
def compute_lda_model(documents, output_file, n_topics=500, language="en",
                      stoplist=None, normalization="stemming"):
    """Compute a LDA model from a collection of documents. Latent Dirichlet
    Allocation is computed using the sklearn module.

    Args:
        documents (list): list of pke-readable documents.
        output_file (str): the output file.
        n_topics (int): number of topics for the LDA model, defaults to 500.
        language (str): language of the input documents, used for stop_words
            in sklearn CountVectorizer, defaults to 'en'.
        stoplist (list): the stop words for filtering words, defaults to
            pke.lang.stopwords[language].
        normalization (str): word normalization method, defaults to
            'stemming'. Other possible value is 'none' for using word
            surface forms instead of stems/lemmas.
    """

    # texts container
    texts = []

    # loop through the documents
    for document in documents:

        # initialize load file object
        doc = LoadFile()

        # read the input file
        doc.load_document(input=document,
                          language=language,
                          normalization=normalization)

        # container for current document
        text = []

        # loop through sentences
        for sentence in doc.sentences:
            # get the tokens (stems) from the sentence if they are not
            # punctuation marks
            text.extend([sentence.stems[i] for i in range(sentence.length)
                         if sentence.pos[i] != 'PUNCT'
                         and sentence.pos[i].isalpha()])

        # add the document to the texts container
        texts.append(' '.join(text))

    # vectorize dataset
    # get the stoplist from pke.lang because CountVectorizer only contains
    # english stopwords atm
    if stoplist is None:
        stoplist = stopwords.get(language)
    tf_vectorizer = CountVectorizer(stop_words=stoplist)
    tf = tf_vectorizer.fit_transform(texts)

    # extract vocabulary
    # TODO: deprecation warning: use get_feature_names_out
    vocabulary = tf_vectorizer.get_feature_names()

    # create LDA model and train
    lda_model = LatentDirichletAllocation(n_components=n_topics,
                                          random_state=0,
                                          learning_method='batch')
    lda_model.fit(tf)

    # save all data necessary for later prediction
    saved_model = (vocabulary,
                   lda_model.components_,
                   lda_model.exp_dirichlet_component_,
                   lda_model.doc_topic_prior_)

    logging.info('writing LDA model to {}'.format(output_file))

    # create directories from path if not exists
    if os.path.dirname(output_file):
        os.makedirs(os.path.dirname(output_file), exist_ok=True)

    # dump the LDA model
    with gzip.open(output_file, 'wb') as fp:
        pickle.dump(saved_model, fp)
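# Usage sketch (not part of the original module): train a small LDA model from a
# hypothetical list of raw-text documents. The texts, number of topics and output
# path are illustrative only; real collections should be much larger than this.
def _example_compute_lda_model():
    documents = ['the quick brown fox jumps over the lazy dog .',
                 'keyphrase extraction is the task of finding key phrases .']
    compute_lda_model(documents,
                      output_file='models/lda.pickle.gz',
                      n_topics=10,
                      language='en',
                      normalization='stemming')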
def compute_document_frequency(input_dir,
                               output_file,
                               format="corenlp",
                               extension="xml",
                               use_lemmas=False,
                               stemmer="porter",
                               stoplist=None,
                               delimiter='\t',
                               n=3):
    """Compute n-gram document frequencies from a set of input documents.

    An extra row is added to the output file for specifying the number of
    documents from which the frequencies were computed (--NB_DOC-- tab XX).

    Args:
        input_dir (str): the input directory.
        output_file (str): the output file.
        format (str): the input files format, defaults to corenlp.
        extension (str): file extension for input documents, defaults to xml.
        use_lemmas (bool): whether lemmas from stanford corenlp are used
            instead of stems (computed by nltk), defaults to False.
        stemmer (str): the stemmer in nltk to use (if used), defaults to
            porter.
        stoplist (list): the stop words for filtering n-grams, defaults to
            None.
        delimiter (str): the delimiter between n-grams and document
            frequencies, defaults to tabulation.
        n (int): the length for ngrams, defaults to 3.
    """

    # document frequency container
    frequencies = defaultdict(set)

    # initialize number of documents
    nb_documents = 0

    # loop through the documents
    for input_file in glob.glob(input_dir + '/*.' + extension):

        logging.info('reading file ' + input_file)

        # initialize load file object
        doc = LoadFile(input_file)

        # read the input file
        doc.read_document(format=format,
                          use_lemmas=use_lemmas,
                          stemmer=stemmer,
                          sep='/')

        # candidate selection
        doc.ngram_selection(n=n)

        # filter candidates containing punctuation marks
        doc.candidate_filtering(stoplist=stoplist)

        # loop through candidates
        for lexical_form in doc.candidates:
            frequencies[lexical_form].add(input_file)

        nb_documents += 1

    # dump the df container
    with gzip.open(output_file, 'wb') as f:

        # add the number of documents as special token
        first_line = '--NB_DOC--' + delimiter + str(nb_documents)
        f.write(first_line.encode('utf-8') + b'\n')

        for ngram in frequencies:
            line = ngram + delimiter + str(len(frequencies[ngram]))
            f.write(line.encode('utf-8') + b'\n')
def compute_lda_model(input_dir,
                      output_file,
                      n_topics=500,
                      format="corenlp",
                      extension="xml",
                      use_lemmas=False,
                      stemmer="porter",
                      language="english"):
    """Compute a LDA model from a collection of documents. Latent Dirichlet
    Allocation is computed using the sklearn module.

    Args:
        input_dir (str): the input directory.
        output_file (str): the output file.
        n_topics (int): number of topics for the LDA model, defaults to 500.
        format (str): the input files format, defaults to corenlp.
        extension (str): file extension for input documents, defaults to xml.
        use_lemmas (bool): whether lemmas from stanford corenlp are used
            instead of stems (computed by nltk), defaults to False.
        stemmer (str): the stemmer in nltk to use (if used), defaults to
            porter.
        language (str): the language of the documents, used for stop_words
            in sklearn CountVectorizer, defaults to 'english'.
    """

    # texts container
    texts = []

    # loop through the documents
    for input_file in glob.glob(input_dir + '/*.' + extension):

        logging.info('reading file ' + input_file)

        # initialize load file object
        doc = LoadFile(input_file)

        # read the input file
        doc.read_document(format=format,
                          use_lemmas=use_lemmas,
                          stemmer=stemmer,
                          sep='/')

        # container for current document
        text = []

        # loop through sentences
        for sentence in doc.sentences:
            # get the tokens (stems) from the sentence if they are not
            # punctuation marks
            text.extend([sentence.stems[i] for i in range(sentence.length)
                         if not re.search('[^A-Z$]', sentence.pos[i])])

        # add the document to the texts container
        texts.append(' '.join(text))

    # vectorize dataset
    # get the stoplist from nltk because CountVectorizer only contains english
    # stopwords atm
    tf_vectorizer = CountVectorizer(stop_words=stopwords.words(language))
    tf = tf_vectorizer.fit_transform(texts)

    # extract vocabulary
    vocabulary = tf_vectorizer.get_feature_names()

    # create LDA model and train
    lda_model = LatentDirichletAllocation(n_components=n_topics,
                                          random_state=0,
                                          learning_method='batch')
    lda_model.fit(tf)

    # save all data necessary for later prediction
    saved_model = (vocabulary,
                   lda_model.components_,
                   lda_model.exp_dirichlet_component_,
                   lda_model.doc_topic_prior_)

    # dump the LDA model
    logging.info('writing LDA model to ' + output_file)
    with gzip.open(output_file, 'wb') as fp:
        pickle.dump(saved_model, fp)
def compute_document_frequency(input_dir,
                               output_file,
                               extension='xml',
                               language='en',
                               normalization="stemming",
                               stoplist=None,
                               delimiter='\t',
                               n=3,
                               max_length=10**6,
                               encoding=None):
    """Compute the n-gram document frequencies from a set of input documents.

    An extra row is added to the output file for specifying the number of
    documents from which the document frequencies were computed
    (--NB_DOC-- tab XXX). The output file is compressed using gzip.

    Args:
        input_dir (str): the input directory.
        output_file (str): the output file.
        extension (str): file extension for input documents, defaults to xml.
        language (str): language of the input documents (used for computing
            the n-stem or n-lemma forms), defaults to 'en' (english).
        normalization (str): word normalization method, defaults to
            'stemming'. Other possible values are 'lemmatization' or 'None'
            for using word surface forms instead of stems/lemmas.
        stoplist (list): the stop words for filtering n-grams, defaults to
            None.
        delimiter (str): the delimiter between n-grams and document
            frequencies, defaults to tabulation (\t).
        n (int): the size of the n-grams, defaults to 3.
        max_length (int): maximum number of characters in a single document
            (passed to the document loader), defaults to 10**6.
        encoding (str): encoding of files in input_dir, defaults to None.
    """

    # document frequency container
    frequencies = defaultdict(int)

    # initialize number of documents
    nb_documents = 0

    # loop through the documents
    for input_file in glob.iglob(input_dir + os.sep + '*.' + extension):

        # logging.info('reading file {}'.format(input_file))

        # initialize load file object
        doc = LoadFile()

        # read the input file
        doc.load_document(input=input_file,
                          language=language,
                          normalization=normalization,
                          max_length=max_length,
                          encoding=encoding)

        # candidate selection
        doc.ngram_selection(n=n)

        # filter candidates containing punctuation marks
        doc.candidate_filtering(stoplist=stoplist)

        # loop through candidates
        for lexical_form in doc.candidates:
            frequencies[lexical_form] += 1

        nb_documents += 1

        if nb_documents % 1000 == 0:
            logging.info("{} docs, memory used: {} mb".format(
                nb_documents,
                sys.getsizeof(frequencies) / 1024 / 1024))

    # create directories from path if not exists
    if os.path.dirname(output_file):
        os.makedirs(os.path.dirname(output_file), exist_ok=True)

    # dump the df container
    with gzip.open(output_file, 'wt', encoding='utf-8') as f:

        # add the number of documents as special token
        first_line = '--NB_DOC--' + delimiter + str(nb_documents)
        f.write(first_line + '\n')

        for ngram in frequencies:
            line = ngram + delimiter + str(frequencies[ngram])
            f.write(line + '\n')
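# Usage sketch (not part of the original module): the directory-based variant of
# compute_document_frequency defined just above. The directory is assumed to
# contain plain-text files; directory, extension and output path are hypothetical.
def _example_compute_document_frequency_from_dir():
    compute_document_frequency(input_dir='data/collection',
                               output_file='models/df.tsv.gz',
                               extension='txt',
                               language='en',
                               normalization='stemming',
                               n=3)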
def compute_lda_model(input_dir,
                      output_file,
                      n_topics=500,
                      extension="xml",
                      language="en",
                      normalization="stemming",
                      max_length=10**6,
                      encoding=None):
    """Compute a LDA model from a collection of documents. Latent Dirichlet
    Allocation is computed using the sklearn module.

    Args:
        input_dir (str): the input directory.
        output_file (str): the output file.
        n_topics (int): number of topics for the LDA model, defaults to 500.
        extension (str): file extension for input documents, defaults to xml.
        language (str): language of the input documents, used for stop_words
            in sklearn CountVectorizer, defaults to 'en'.
        normalization (str): word normalization method, defaults to
            'stemming'. Other possible values are 'lemmatization' or 'None'
            for using word surface forms instead of stems/lemmas.
        max_length (int): maximum number of characters in a single document
            (passed to the document loader), defaults to 10**6.
        encoding (str): encoding of files in `input_dir`, defaults to None.
    """

    # texts container
    texts = []

    # loop through the documents
    for input_file in glob.iglob(input_dir + os.sep + '*.' + extension):

        logging.info('reading file {}'.format(input_file))

        # initialize load file object
        doc = LoadFile()

        # read the input file
        doc.load_document(input=input_file,
                          language=language,
                          normalization=normalization,
                          max_length=max_length,
                          encoding=encoding)

        # container for current document
        text = []

        # loop through sentences
        for sentence in doc.sentences:
            # get the tokens (stems) from the sentence if they are not
            # punctuation marks
            text.extend([sentence.stems[i] for i in range(sentence.length)
                         if sentence.pos[i] != 'PUNCT'
                         and sentence.pos[i].isalpha()])

        # add the document to the texts container
        texts.append(' '.join(text))

    # vectorize dataset
    # get the stoplist from nltk because CountVectorizer only contains english
    # stopwords atm
    tf_vectorizer = CountVectorizer(stop_words=get_stopwords(language))
    tf = tf_vectorizer.fit_transform(texts)

    # extract vocabulary
    vocabulary = tf_vectorizer.get_feature_names()

    # create LDA model and train
    lda_model = LatentDirichletAllocation(n_components=n_topics,
                                          random_state=0,
                                          learning_method='batch')
    lda_model.fit(tf)

    # save all data necessary for later prediction
    saved_model = (vocabulary,
                   lda_model.components_,
                   lda_model.exp_dirichlet_component_,
                   lda_model.doc_topic_prior_)

    logging.info('writing LDA model to {}'.format(output_file))

    # create directories from path if not exists
    if os.path.dirname(output_file):
        os.makedirs(os.path.dirname(output_file), exist_ok=True)

    # dump the LDA model
    with gzip.open(output_file, 'wb') as fp:
        pickle.dump(saved_model, fp)
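# Usage sketch (not part of the original module): the directory-based variant of
# compute_lda_model defined just above. The directory is assumed to contain
# plain-text files; directory, extension, topic count and output path are
# hypothetical.
def _example_compute_lda_model_from_dir():
    compute_lda_model(input_dir='data/collection',
                      output_file='models/lda.pickle.gz',
                      n_topics=10,
                      extension='txt',
                      language='en',
                      normalization='stemming')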