Example #1
File: utils.py  Project: jordanlmx/pke
def load_document_as_bos(input_file,
                         language="en",
                         normalization="stemming",
                         stoplist=[]):
    """Load a document as a bag of words/stems/lemmas.

    Args:
        input_file (str): path to input file.
        language (str): language of the input document, used to select the
            tokenizer and stemmer, defaults to 'en'.
        normalization (str): word normalization method, defaults to 'stemming'.
            Other possible values are 'lemmatization' or 'None' for using word
            surface forms instead of stems/lemmas.
        stoplist (list): the stop words for filtering tokens, default to [].
    """

    # initialize load file object
    doc = LoadFile()

    # read the input file
    doc.load_document(input=input_file,
                      language=language,
                      normalization=normalization)

    # initialize document vector
    vector = defaultdict(int)

    # loop through the sentences and add the stems to the vector
    for i, sentence in enumerate(doc.sentences):
        for j, stem in enumerate(sentence.stems):
            if stem in stoplist:
                continue
            vector[stem] += 1

    return vector
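A minimal usage sketch for the function above; the file path is a placeholder and the printed counts depend on the document:

# hedged usage sketch: 'doc1.xml' is a hypothetical pke-readable document
bos = load_document_as_bos('doc1.xml', language='en', normalization='stemming')

# inspect the ten most frequent stems
for stem, count in sorted(bos.items(), key=lambda x: -x[1])[:10]:
    print(stem, count)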
Example #2
    def expand_word_graph(self,
                          input_file,
                          similarity,
                          window=10,
                          pos=None):
        """Expands the word graph using the given document.

        Args:
            input_file (str): path to the input file.
            similarity (float): similarity score of the expansion document,
                added to the weight of each co-occurrence edge.
            window (int): the window within the sentence for connecting two
                words in the graph, defaults to 10.
            pos (set): the set of valid pos for words to be considered as nodes
                in the graph, defaults to ('NOUN', 'PROPN', 'ADJ').
        """

        # define default pos tags set
        if pos is None:
            pos = {'NOUN', 'PROPN', 'ADJ'}

        # initialize document loader
        doc = LoadFile()

        # read document
        doc.load_document(input=input_file,
                          language=self.language,
                          normalization=self.normalization)

        # flatten document and initialize nodes 
        sequence = []

        for sentence in doc.sentences:
            for j, node in enumerate(sentence.stems):
                if node not in self.graph and sentence.pos[j] in pos:
                    self.graph.add_node(node)
                sequence.append((node, sentence.pos[j]))

        # loop through sequence to build the edges in the graph
        for j, node_1 in enumerate(sequence):
            for k in range(j + 1, min(j + window, len(sequence))):
                node_2 = sequence[k]
                if node_1[1] in pos and node_2[1] in pos \
                        and node_1[0] != node_2[0]:
                    if not self.graph.has_edge(node_1[0], node_2[0]):
                        self.graph.add_edge(node_1[0], node_2[0], weight=0)
                    self.graph[node_1[0]][node_2[0]]['weight'] += similarity
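A hedged usage sketch, assuming the enclosing class is a graph-based extractor such as pke.unsupervised.ExpandRank; the file names and the similarity value are placeholders:

from pke.unsupervised import ExpandRank

# hedged sketch: paths and the similarity score are placeholders
extractor = ExpandRank()
extractor.load_document(input='paper.xml', language='en', normalization='stemming')

# grow the word graph with a related document, weighting its edges by 0.5
extractor.expand_word_graph(input_file='related.xml', similarity=0.5, window=10)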
Example #3
def load_document_as_bos(input_file,
                         format="corenlp",
                         use_lemmas=False,
                         stemmer="porter",
                         stoplist=[]):
    """Load a document as a bag of stems.

    Args:
        input_file (str): path to input file.
        format (str): the input files format, defaults to corenlp.
        use_lemmas (bool): whether lemmas from stanford corenlp are used
            instead of stems (computed by nltk), defaults to False.
        stemmer (str): the stemmer in nltk to be used (if needed), defaults
            to porter.
        stoplist (list): the stop words for filtering tokens, default to [].
    """

    # initialize load file object
    doc = LoadFile(input_file)

    # read the input file
    doc.read_document(format=format,
                      use_lemmas=use_lemmas,
                      stemmer=stemmer,
                      sep='/')

    # initialize document vector
    vector = defaultdict(int)

    # loop through the sentences
    for i, sentence in enumerate(doc.sentences):

        # loop through the tokens
        for j, stem in enumerate(sentence.stems):

            # skip stem if it occurs in the stoplist
            if stem in stoplist:
                continue

            # count the occurrence of the stem
            vector[stem] += 1

    return vector
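A hedged usage sketch for this older pke API; the path and the corenlp XML format are assumptions:

# hedged sketch: 'doc1.xml' is a hypothetical CoreNLP-annotated document
bos = load_document_as_bos('doc1.xml',
                           format='corenlp',
                           use_lemmas=False,
                           stemmer='porter')
print(len(bos), 'distinct stems')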
Example #4
File: utils.py  Project: atomicjets/pke
def compute_document_frequency(
        documents,
        output_file,
        language='en',
        stoplist=None,
        normalization='stemming',
        delimiter='\t',
        # TODO: What is the use case for changing this ?
        n=3):
    """Compute the n-gram document frequencies from a set of input documents.
    An extra row is added to the output file for specifying the number of
    documents from which the document frequencies were computed
    (--NB_DOC-- tab XXX). The output file is compressed using gzip.

    Args:
        documents (list): list of pke-readable documents.
        output_file (str): the output file.
        language (str): language of the input documents (used for computing the
            n-stem or n-lemma forms), defaults to 'en' (english).
        stoplist (list): the stop words for filtering n-grams, default to
            pke.lang.stopwords[language].
        normalization (str): word normalization method, defaults to
            'stemming'. Other possible value is 'none' for using word surface
            forms instead of stems/lemmas.
        delimiter (str): the delimiter between n-grams and document
            frequencies, defaults to tabulation (\t).
        n (int): the size of the n-grams, defaults to 3.
    """

    # document frequency container
    frequencies = defaultdict(int)

    # initialize number of documents
    nb_documents = 0

    # loop through the documents
    for document in documents:

        # initialize load file object
        doc = LoadFile()

        # read the input file
        doc.load_document(input=document,
                          language=language,
                          stoplist=stoplist,
                          normalization=normalization)

        # candidate selection
        doc.ngram_selection(n=n)

        # filter candidates containing punctuation marks
        doc.candidate_filtering()

        # loop through candidates
        for lexical_form in doc.candidates:
            frequencies[lexical_form] += 1

        nb_documents += 1

        if nb_documents % 1000 == 0:
            logging.info("{} docs, memory used: {} mb".format(
                nb_documents,
                sys.getsizeof(frequencies) / 1024 / 1024))

    # create directories from path if not exists
    if os.path.dirname(output_file):
        os.makedirs(os.path.dirname(output_file), exist_ok=True)

    # dump the df container
    with gzip.open(output_file, 'wt', encoding='utf-8') as f:

        # add the number of documents as special token
        first_line = '--NB_DOC--' + delimiter + str(nb_documents)
        f.write(first_line + '\n')

        for ngram in frequencies:
            line = ngram + delimiter + str(frequencies[ngram])
            f.write(line + '\n')
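A hedged usage sketch for the function above; the toy documents are placeholders, and reloading the counts via pke.load_document_frequency_file assumes that companion loader in pke.utils:

import pke

# hedged sketch: tiny toy corpus; in practice use a large document collection
docs = ['a first toy document about keyphrase extraction.',
        'a second toy document about document frequencies.']

compute_document_frequency(documents=docs,
                           output_file='df.tsv.gz',
                           language='en',
                           normalization='stemming',
                           n=3)

# reload the counts (assumed companion function in pke.utils)
df = pke.load_document_frequency_file(input_file='df.tsv.gz', delimiter='\t')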
Example #5
File: utils.py  Project: atomicjets/pke
def compute_lda_model(documents,
                      output_file,
                      n_topics=500,
                      language="en",
                      stoplist=None,
                      normalization="stemming"):
    """Compute a LDA model from a collection of documents. Latent Dirichlet
    Allocation is computed using sklearn module.

    Args:
        documents (list): list of pke-readable documents.
        output_file (str): the output file.
        n_topics (int): number of topics for the LDA model, defaults to 500.
        language (str): language of the input documents, used for stop_words
            in sklearn CountVectorizer, defaults to 'en'.
        stoplist (list): the stop words for filtering words, default to
            pke.lang.stopwords[language].
        normalization (str): word normalization method, defaults to
            'stemming'. Other possible value is 'none'
            for using word surface forms instead of stems/lemmas.
    """

    # texts container
    texts = []

    # loop through the documents
    for document in documents:

        # initialize load file object
        doc = LoadFile()

        # read the input file
        doc.load_document(input=document,
                          language=language,
                          normalization=normalization)

        # container for current document
        text = []

        # loop through sentences
        for sentence in doc.sentences:
            # get the tokens (stems) from the sentence if they are not
            # punctuation marks
            text.extend([
                sentence.stems[i] for i in range(sentence.length)
                if sentence.pos[i] != 'PUNCT' and sentence.pos[i].isalpha()
            ])

        # add the document to the texts container
        texts.append(' '.join(text))

    # vectorize dataset
    # get the stoplist from pke.lang because CountVectorizer only contains
    # english stopwords atm
    if stoplist is None:
        stoplist = stopwords.get(language)
    tf_vectorizer = CountVectorizer(stop_words=stoplist)
    tf = tf_vectorizer.fit_transform(texts)

    # extract vocabulary
    vocabulary = tf_vectorizer.get_feature_names()
    # TODO: deprecation warning: use get_feature_names_out

    # create LDA model and train
    lda_model = LatentDirichletAllocation(n_components=n_topics,
                                          random_state=0,
                                          learning_method='batch')
    lda_model.fit(tf)

    # save all data necessary for later prediction
    saved_model = (vocabulary, lda_model.components_,
                   lda_model.exp_dirichlet_component_,
                   lda_model.doc_topic_prior_)

    # Dump the df container
    logging.info('writing LDA model to {}'.format(output_file))

    # create directories from path if not exists
    if os.path.dirname(output_file):
        os.makedirs(os.path.dirname(output_file), exist_ok=True)

    # dump the LDA model
    with gzip.open(output_file, 'wb') as fp:
        pickle.dump(saved_model, fp)
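A hedged usage sketch; the toy documents and the output path are placeholders:

# hedged sketch: tiny toy corpus; in practice use a large document collection
docs = ['a first toy document about keyphrase extraction.',
        'a second toy document about latent topics.']

compute_lda_model(documents=docs,
                  output_file='lda.pickle.gz',
                  n_topics=10,
                  language='en',
                  normalization='stemming')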
Example #6
File: utils.py  Project: xiaoman0220/pke
def compute_document_frequency(input_dir,
                               output_file,
                               format="corenlp",
                               extension="xml",
                               use_lemmas=False,
                               stemmer="porter",
                               stoplist=None,
                               delimiter='\t',
                               n=3):
    """ Compute n-gram document frequencies from a set of input documents. An
        extra row is added to the output file for specifying the number of
        documents from which the frequencies were computed (--NB_DOC-- tab XX).

        Args:
            input_dir (str): the input directory.
            output_file (str): the output file.
            format (str): the input files format, defaults to corenlp.
            extension (str): file extension for input documents, defaults to
                xml.
            use_lemmas (bool): whether lemmas from stanford corenlp are used
                instead of stems (computed by nltk), defaults to False.
            stemmer (str): the stemmer in nltk to be used (if needed), defaults
                to porter.
            stoplist (list): the stop words for filtering n-grams, default to
                None.
            delimiter (str): the delimiter between n-grams and document
                frequencies, default to tabulation.
            n (int): the length for ngrams, defaults to 3.
    """

    # document frequency container
    frequencies = defaultdict(set)

    # initialize number of documents
    nb_documents = 0

    # loop through the documents
    for input_file in glob.glob(input_dir+'/*.'+extension):

        logging.info('reading file '+input_file)

        # initialize load file object
        doc = LoadFile(input_file)

        # read the input file
        doc.read_document(format=format,
                          use_lemmas=use_lemmas,
                          stemmer=stemmer,
                          sep='/')

        # candidate selection
        doc.ngram_selection(n=n)

        # filter candidates containing punctuation marks
        doc.candidate_filtering(stoplist=stoplist)

        # loop through candidates
        for lexical_form in doc.candidates:
            frequencies[lexical_form].add(input_file)

        nb_documents += 1

    # Dump the df container
    with gzip.open(output_file, 'wb') as f:

        # add the number of documents as special token
        first_line = '--NB_DOC--' + delimiter + str(nb_documents)
        f.write(first_line.encode('utf-8') + b'\n')

        for ngram in frequencies:
            line = ngram + delimiter + str(len(frequencies[ngram]))
            f.write(line.encode('utf-8') + b'\n')
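A hedged usage sketch for this directory-based, older API; the directory path is a placeholder:

# hedged sketch: directory of CoreNLP XML files (placeholder path)
compute_document_frequency(input_dir='train',
                           output_file='df.tsv.gz',
                           format='corenlp',
                           extension='xml',
                           use_lemmas=False,
                           stemmer='porter',
                           n=3)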
Example #7
File: utils.py  Project: xiaoman0220/pke
def compute_lda_model(input_dir,
                      output_file,
                      n_topics=500,
                      format="corenlp",
                      extension="xml",
                      use_lemmas=False,
                      stemmer="porter",
                      language="english"):
    """ Compute a LDA model from a collection of documents. Latent Dirichlet
        Allocation is computed using sklearn module.

        Args:
            input_dir (str): the input directory.
            output_file (str): the output file.
            n_topics (int): number of topics for the LDA model, defaults to 500.
            format (str): the input files format, defaults to corenlp.
            extension (str): file extension for input documents, defaults to
                xml.
            use_lemmas (bool): whether lemmas from stanford corenlp are used
                instead of stems (computed by nltk), defaults to False.
            stemmer (str): the stemmer in nltk to be used (if needed), defaults
                to porter.
            language (str): the language of the documents, used for stop_words
                in sklearn CountVectorizer, defaults to 'english'.
    """

    # texts container
    texts = []

    # loop through the documents
    for input_file in glob.glob(input_dir+'/*.'+extension):

        logging.info('reading file '+input_file)

        # initialize load file object
        doc = LoadFile(input_file)

        # read the input file
        doc.read_document(format=format,
                          use_lemmas=use_lemmas,
                          stemmer=stemmer,
                          sep='/')

        # container for current document
        text = []

        # loop through sentences
        for sentence in doc.sentences:

            # get the tokens (stems) from the sentence if they are not
            # punctuation marks 
            text.extend([sentence.stems[i] for i in range(sentence.length)
                         if not re.search('[^A-Z$]', sentence.pos[i])])

        # add the document to the texts container
        texts.append(' '.join(text))

    # vectorize dataset
    # get the stoplist from nltk because CountVectorizer only contains english
    # stopwords atm
    tf_vectorizer = CountVectorizer(stop_words=stopwords.words(language))
    tf = tf_vectorizer.fit_transform(texts)

    # extract vocabulary
    vocabulary = tf_vectorizer.get_feature_names()

    # create LDA model and train
    lda_model = LatentDirichletAllocation(n_components=n_topics,
                                          random_state=0,
                                          learning_method='batch')
    lda_model.fit(tf)

    # save all data necessary for later prediction
    saved_model = (vocabulary,
                   lda_model.components_,
                   lda_model.exp_dirichlet_component_,
                   lda_model.doc_topic_prior_)

    # Dump the df container
    logging.info('writing LDA model to '+output_file)
    with gzip.open(output_file, 'wb') as fp:
        pickle.dump(saved_model, fp)
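A hedged usage sketch; the directory path is a placeholder:

# hedged sketch: train an LDA model over a directory of CoreNLP XML files
compute_lda_model(input_dir='train',
                  output_file='lda.pickle.gz',
                  n_topics=500,
                  format='corenlp',
                  extension='xml',
                  language='english')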
Example #8
def compute_document_frequency(input_dir,
                               output_file,
                               extension='xml',
                               language='en',
                               normalization="stemming",
                               stoplist=None,
                               delimiter='\t',
                               n=3,
                               max_length=10**6,
                               encoding=None):
    """Compute the n-gram document frequencies from a set of input documents. An
    extra row is added to the output file for specifying the number of
    documents from which the document frequencies were computed
    (--NB_DOC-- tab XXX). The output file is compressed using gzip.

    Args:
        input_dir (str): the input directory.
        output_file (str): the output file.
        extension (str): file extension for input documents, defaults to xml.
        language (str): language of the input documents (used for computing the
            n-stem or n-lemma forms), defaults to 'en' (english).
        normalization (str): word normalization method, defaults to 'stemming'.
            Other possible values are 'lemmatization' or 'None' for using word
            surface forms instead of stems/lemmas.
        stoplist (list): the stop words for filtering n-grams, default to None.
        delimiter (str): the delimiter between n-grams and document frequencies,
            defaults to tabulation (\t).
        n (int): the size of the n-grams, defaults to 3.
        max_length (int): maximum length (in characters) of the documents
            passed to the loader, defaults to 10**6.
        encoding (str): encoding of files in input_dir, default to None.
    """

    # document frequency container
    frequencies = defaultdict(int)

    # initialize number of documents
    nb_documents = 0

    # loop through the documents
    for input_file in glob.iglob(input_dir + os.sep + '*.' + extension):

        #logging.info('reading file {}'.format(input_file))

        # initialize load file object
        doc = LoadFile()

        # read the input file
        doc.load_document(input=input_file,
                          language=language,
                          normalization=normalization,
                          max_length=max_length,
                          encoding=encoding)

        # candidate selection
        doc.ngram_selection(n=n)

        # filter candidates containing punctuation marks
        doc.candidate_filtering(stoplist=stoplist)

        # loop through candidates
        for lexical_form in doc.candidates:
            frequencies[lexical_form] += 1

        nb_documents += 1

        if nb_documents % 1000 == 0:
            logging.info("{} docs, memory used: {} mb".format(
                nb_documents,
                sys.getsizeof(frequencies) / 1024 / 1024))

    # create directories from path if not exists
    if os.path.dirname(output_file):
        os.makedirs(os.path.dirname(output_file), exist_ok=True)

    # dump the df container
    with gzip.open(output_file, 'wt', encoding='utf-8') as f:

        # add the number of documents as special token
        first_line = '--NB_DOC--' + delimiter + str(nb_documents)
        f.write(first_line + '\n')

        for ngram in frequencies:
            line = ngram + delimiter + str(frequencies[ngram])
            f.write(line + '\n')
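Because the writer above emits a '--NB_DOC--' header followed by one ngram, tab, count line per entry, the output can be read back with gzip alone; a minimal sketch (the path is a placeholder):

import gzip

df, nb_docs = {}, 0
with gzip.open('df.tsv.gz', 'rt', encoding='utf-8') as f:
    for line in f:
        ngram, count = line.rstrip('\n').split('\t')
        if ngram == '--NB_DOC--':
            nb_docs = int(count)
        else:
            df[ngram] = int(count)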
Example #9
def compute_lda_model(input_dir,
                      output_file,
                      n_topics=500,
                      extension="xml",
                      language="en",
                      normalization="stemming",
                      max_length=10**6,
                      encoding=None):
    """Compute a LDA model from a collection of documents. Latent Dirichlet
    Allocation is computed using sklearn module.

    Args:
        input_dir (str): the input directory.
        output_file (str): the output file.
        n_topics (int): number of topics for the LDA model, defaults to 500.
        extension (str): file extension for input documents, defaults to xml.
        language (str): language of the input documents, used for stop_words
            in sklearn CountVectorizer, defaults to 'en'.
        normalization (str): word normalization method, defaults to 'stemming'.
            Other possible values are 'lemmatization' or 'None' for using word
            surface forms instead of stems/lemmas.
        max_length (int): maximum length (in characters) of the documents
            passed to the loader, defaults to 10**6.
        encoding (str): encoding of files in `input_dir`, default to None.
    """

    # texts container
    texts = []

    # loop through the documents
    for input_file in glob.iglob(input_dir + os.sep + '*.' + extension):

        logging.info('reading file {}'.format(input_file))

        # initialize load file object
        doc = LoadFile()

        # read the input file
        doc.load_document(input=input_file,
                          language=language,
                          normalization=normalization,
                          max_length=max_length,
                          encoding=encoding)

        # container for current document
        text = []

        # loop through sentences
        for sentence in doc.sentences:
            # get the tokens (stems) from the sentence if they are not
            # punctuation marks
            text.extend([
                sentence.stems[i] for i in range(sentence.length)
                if sentence.pos[i] != 'PUNCT' and sentence.pos[i].isalpha()
            ])

        # add the document to the texts container
        texts.append(' '.join(text))

    # vectorize dataset
    # get the stoplist from nltk because CountVectorizer only contains english
    # stopwords atm
    tf_vectorizer = CountVectorizer(stop_words=get_stopwords(language))
    tf = tf_vectorizer.fit_transform(texts)

    # extract vocabulary
    vocabulary = tf_vectorizer.get_feature_names()

    # create LDA model and train
    lda_model = LatentDirichletAllocation(n_components=n_topics,
                                          random_state=0,
                                          learning_method='batch')
    lda_model.fit(tf)

    # save all data necessary for later prediction
    saved_model = (vocabulary, lda_model.components_,
                   lda_model.exp_dirichlet_component_,
                   lda_model.doc_topic_prior_)

    # Dump the df container
    logging.info('writing LDA model to {}'.format(output_file))

    # create directories from path if not exists
    if os.path.dirname(output_file):
        os.makedirs(os.path.dirname(output_file), exist_ok=True)

    # dump the LDA model
    with gzip.open(output_file, 'wb') as fp:
        pickle.dump(saved_model, fp)
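The pickled tuple written above can be restored with gzip and pickle; a minimal sketch (the path is a placeholder):

import gzip
import pickle

# unpickle the (vocabulary, components, exp_dirichlet_component, doc_topic_prior) tuple
with gzip.open('lda.pickle.gz', 'rb') as fp:
    vocabulary, components, exp_dirichlet, doc_topic_prior = pickle.load(fp)

print(len(vocabulary), 'terms,', components.shape[0], 'topics')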