Example #1
File: utils.py Project: xiaoman0220/pke
import glob
import gzip
import logging
from collections import defaultdict

# assumed import; the snippet omits its module header
from pke.base import LoadFile


def compute_document_frequency(input_dir,
                               output_file,
                               format="corenlp",
                               extension="xml",
                               use_lemmas=False,
                               stemmer="porter",
                               stoplist=None,
                               delimiter='\t',
                               n=3):
    """ Compute n-gram document frequencies from a set of input documents. An
        extra row is added to the output file for specifying the number of
        documents from which the frequencies were computed (--NB_DOC-- tab XX).

        Args:
            input_dir (str): the input directory.
            output_file (str): the output file.
            format (str): the format of the input files, defaults to corenlp.
            extension (str): file extension for input documents, defaults to
                xml.
            use_lemmas (bool): whether lemmas from stanford corenlp are used
                instead of stems (computed by nltk), defaults to False.
            stemmer (str): the stemmer in nltk to be used (if any), defaults
                to porter.
            stoplist (list): the stop words for filtering n-grams, defaults
                to None.
            delimiter (str): the delimiter between n-grams and document
                frequencies, defaults to tabulation.
            n (int): the length of the n-grams, defaults to 3.
    """

    # document frequency container
    frequencies = defaultdict(set)

    # initialize number of documents
    nb_documents = 0

    # loop through the documents
    for input_file in glob.glob(input_dir + '/*.' + extension):

        logging.info('reading file ' + input_file)

        # initialize load file object
        doc = LoadFile(input_file)

        # read the input file
        doc.read_document(format=format,
                          use_lemmas=use_lemmas,
                          stemmer=stemmer,
                          sep='/')

        # candidate selection
        doc.ngram_selection(n=n)

        # filter candidates containing punctuation marks
        doc.candidate_filtering(stoplist=stoplist)

        # loop through candidates
        for lexical_form in doc.candidates:
            frequencies[lexical_form].add(input_file)

        nb_documents += 1

    # Dump the df container
    with gzip.open(output_file, 'wb') as f:

        # add the number of documents as special token
        first_line = '--NB_DOC--' + delimiter + str(nb_documents)
        f.write(first_line.encode('utf-8') + b'\n')

        for ngram in frequencies:
            line = ngram + delimiter + str(len(frequencies[ngram]))
            f.write(line.encode('utf-8') + b'\n')
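
A minimal usage sketch for this older pke API; the directory, the output
filename, and the pre-parsed CoreNLP XML files are assumptions, not part of
the original snippet.

# hypothetical paths; one CoreNLP XML file per document is expected
compute_document_frequency(input_dir='/path/to/corenlp/xml',
                           output_file='df.tsv.gz',
                           format='corenlp',
                           extension='xml',
                           use_lemmas=False,
                           stemmer='porter',
                           n=3)

The resulting gzip file starts with the '--NB_DOC--' row and then holds one
'<ngram><delimiter><document frequency>' line per candidate.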
Example #2
File: utils.py Project: atomicjets/pke
import gzip
import logging
import os
import sys
from collections import defaultdict

# assumed import; the snippet omits its module header
from pke.base import LoadFile


def compute_document_frequency(
        documents,
        output_file,
        language='en',
        stoplist=None,
        normalization='stemming',
        delimiter='\t',
        # TODO: What is the use case for changing this?
        n=3):
    """Compute the n-gram document frequencies from a set of input documents.
    An extra row is added to the output file for specifying the number of
    documents from which the document frequencies were computed
    (--NB_DOC-- tab XXX). The output file is compressed using gzip.

    Args:
        documents (list): list of pke-readable documents.
        output_file (str): the output file.
        language (str): language of the input documents (used for computing the
            n-stem or n-lemma forms), defaults to 'en' (English).
        stoplist (list): the stop words for filtering n-grams, defaults to
            pke.lang.stopwords[language].
        normalization (str): word normalization method, defaults to
            'stemming'. Other possible value is 'none' for using word surface
            forms instead of stems/lemmas.
        delimiter (str): the delimiter between n-grams and document
            frequencies, defaults to tabulation (\t).
        n (int): the size of the n-grams, defaults to 3.
    """

    # document frequency container
    frequencies = defaultdict(int)

    # initialize number of documents
    nb_documents = 0

    # loop through the documents
    for document in documents:

        # initialize load file object
        doc = LoadFile()

        # read the input file
        doc.load_document(input=document,
                          language=language,
                          stoplist=stoplist,
                          normalization=normalization)

        # candidate selection
        doc.ngram_selection(n=n)

        # filter candidates containing punctuation marks
        doc.candidate_filtering()

        # loop through candidates
        for lexical_form in doc.candidates:
            frequencies[lexical_form] += 1

        nb_documents += 1

        if nb_documents % 1000 == 0:
            logging.info("{} docs, memory used: {} mb".format(
                nb_documents,
                sys.getsizeof(frequencies) / 1024 / 1024))

    # create directories from path if not exists
    if os.path.dirname(output_file):
        os.makedirs(os.path.dirname(output_file), exist_ok=True)

    # dump the df container
    with gzip.open(output_file, 'wt', encoding='utf-8') as f:

        # add the number of documents as special token
        first_line = '--NB_DOC--' + delimiter + str(nb_documents)
        f.write(first_line + '\n')

        for ngram in frequencies:
            line = ngram + delimiter + str(frequencies[ngram])
            f.write(line + '\n')
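
A usage sketch for this variant, assuming that raw text strings count as
pke-readable documents in this version; the two sentences are placeholder
content.

# placeholder documents; any pke-readable input should work here
documents = ['Keyphrase extraction identifies the phrases that best '
             'summarize a document.',
             'Document frequencies are used to weight candidate keyphrases.']

compute_document_frequency(documents,
                           output_file='df.tsv.gz',
                           language='en',
                           normalization='stemming',
                           n=3)

Unlike Example #1, which keeps a set of input file names per candidate, this
variant increments a plain int counter; since doc.candidates holds each
lexical form once, every document still contributes at most one count per
candidate, at a much smaller memory cost.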
Example #3
import glob
import gzip
import logging
import os
import sys
from collections import defaultdict

# assumed import; the snippet omits its module header
from pke.base import LoadFile


def compute_document_frequency(input_dir,
                               output_file,
                               extension='xml',
                               language='en',
                               normalization="stemming",
                               stoplist=None,
                               delimiter='\t',
                               n=3,
                               max_length=10**6,
                               encoding=None):
    """Compute the n-gram document frequencies from a set of input documents. An
    extra row is added to the output file for specifying the number of
    documents from which the document frequencies were computed
    (--NB_DOC-- tab XXX). The output file is compressed using gzip.

    Args:
        input_dir (str): the input directory.
        output_file (str): the output file.
        extension (str): file extension for input documents, defaults to xml.
        language (str): language of the input documents (used for computing the
            n-stem or n-lemma forms), defaults to 'en' (English).
        normalization (str): word normalization method, defaults to 'stemming'.
            Other possible values are 'lemmatization' or 'None' for using word
            surface forms instead of stems/lemmas.
        stoplist (list): the stop words for filtering n-grams, defaults to
            None.
        delimiter (str): the delimiter between n-grams and document frequencies,
            defaults to tabulation (\t).
        n (int): the size of the n-grams, defaults to 3.
        max_length (int): maximum number of characters in a single document
            (passed on to spacy), defaults to 10**6.
        encoding (str): encoding of files in input_dir, defaults to None.
    """

    # document frequency container
    frequencies = defaultdict(int)

    # initialize number of documents
    nb_documents = 0

    # loop through the documents
    for input_file in glob.iglob(input_dir + os.sep + '*.' + extension):

        # logging.info('reading file {}'.format(input_file))

        # initialize load file object
        doc = LoadFile()

        # read the input file
        doc.load_document(input=input_file,
                          language=language,
                          normalization=normalization,
                          max_length=max_length,
                          encoding=encoding)

        # candidate selection
        doc.ngram_selection(n=n)

        # filter candidates containing punctuation marks
        doc.candidate_filtering(stoplist=stoplist)

        # loop through candidates
        for lexical_form in doc.candidates:
            frequencies[lexical_form] += 1

        nb_documents += 1

        if nb_documents % 1000 == 0:
            logging.info("{} docs, memory used: {} mb".format(
                nb_documents,
                sys.getsizeof(frequencies) / 1024 / 1024))

    # create directories from path if not exists
    if os.path.dirname(output_file):
        os.makedirs(os.path.dirname(output_file), exist_ok=True)

    # dump the df container
    with gzip.open(output_file, 'wt', encoding='utf-8') as f:

        # add the number of documents as special token
        first_line = '--NB_DOC--' + delimiter + str(nb_documents)
        f.write(first_line + '\n')

        for ngram in frequencies:
            line = ngram + delimiter + str(frequencies[ngram])
            f.write(line + '\n')
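
A sketch of a typical call, followed by a plain-Python way of reading the
compressed counts back; all paths are hypothetical.

# hypothetical paths; one *.xml file per document in the input directory
compute_document_frequency(input_dir='/path/to/collection',
                           output_file='/path/to/df.tsv.gz',
                           extension='xml',
                           language='en',
                           normalization='stemming',
                           n=3)

# reading the counts back, assuming the default tab delimiter
import gzip

df = {}
with gzip.open('/path/to/df.tsv.gz', 'rt', encoding='utf-8') as f:
    for line in f:
        ngram, count = line.rstrip('\n').split('\t')
        df[ngram] = int(count)

nb_documents = df.pop('--NB_DOC--')  # the special first row

pke itself ships a loader for this format, load_document_frequency_file in
the same utils.py, which returns a similar n-gram-to-count dictionary.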