Example #1
def ComputeDF(pathToCollectionOfDocs, lang, normalization, pathToDFFile):
    """Compute Document Frequency (DF) counts from a collection of documents.

    N-grams up to 3-grams are extracted and converted to their n-stems forms.
    Those containing a token that occurs in a stoplist are filtered out.
    Output file is in compressed (gzip) tab-separated-values format (tsv.gz).
    """

    # path to the collection of documents
    print(
        f"DF will be computed on top of the following collection of docs: {pathToCollectionOfDocs}"
    )

    if os.path.exists(pathToDFFile):
        print(f"DF Model already exists here:  {pathToDFFile} ")
    else:
        print(
            f"DF Model doesn't exist. It will be created (and may take a while) and will be saved here: {pathToDFFile}"
        )
        stoplist = list(string.punctuation)
        stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
        stoplist += load_stop_words(lang)

        compute_document_frequency(pathToCollectionOfDocs,
                                   pathToDFFile,
                                   extension='txt',
                                   language=lang,
                                   normalization=normalization,
                                   stoplist=stoplist)
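
A minimal sketch of reading the file that ComputeDF writes, assuming it was saved to the placeholder path 'df_counts.tsv.gz'; pke.load_document_frequency_file (also used in Example #3 below) returns a plain dict mapping each normalized n-gram to its document frequency.

import pke

# placeholder path: wherever ComputeDF wrote its tsv.gz output
df_counts = pke.load_document_frequency_file(input_file='df_counts.tsv.gz')

# peek at a few (n-gram, document frequency) entries
for term, count in list(df_counts.items())[:5]:
    print(term, count)
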
Example #2
def compute_document_frequency(input_file):
    # stopwords = list(punctuation)
    pke.compute_document_frequency(
        input_dir=input_file,
        output_file='trial_doc_freq.tsv.gz',
        extension='txt',  # input file extension
        language='en',  # language of files
        normalization="stemming")  # use porter stemmer
Example #3
def main():

    # process the document frequency of the reference corpus
    """Compute Document Frequency (DF) counts from a collection of documents.

    N-grams up to 3-grams are extracted and converted to their n-stems forms.
    Those containing a token that occurs in a stoplist are filtered out.
    Output file is in compressed (gzip) tab-separated-values format (tsv.gz).
    """

    # stoplist for filtering n-grams
    stoplist = list(punctuation)

    # compute df counts and store as n-stem -> weight values
    compute_document_frequency(
        input_dir='/Users/gmt28/Documents/Workspace/Docker_Engine/varad/Yale_Projects/shoah-foundation-data-restored/shoah-foundation-data/data/inputs/fortunoff/transcripts/',
        output_file='/Users/gmt28/Documents/Workspace/data_analysis_lts/Processes/Extract_Keywords/output.tsv.gz',
        extension='txt',  # input file extension
        language='en',  # language of files
        normalization=None,  # no normalization (the porter stemmer is not used here)
        stoplist=stoplist,
        n=1)

    pdb.set_trace()
    """Keyphrase extraction using TfIdf and newly computed DF counts."""

    # initialize TfIdf model
    extractor = pke.unsupervised.TfIdf()

    # load the DF counts from file
    df_counts = pke.load_document_frequency_file(
        input_file=
        '/Users/gmt28/Documents/Workspace/data_analysis_lts/Processes/Extract_Keywords/output.tsv.gz'
    )

    # load the content of the document
    extractor.load_document(
        input=
        '/Users/gmt28/Documents/Workspace/data_analysis_lts/Processes/Extract_Keywords/text.txt',
        normalization=None,
        language='en')

    # keyphrase candidate selection
    extractor.candidate_selection(n=1, stoplist=list(string.punctuation))

    # candidate weighting with the provided DF counts
    extractor.candidate_weighting(df=df_counts)

    # N-best selection: keyphrases contains the 15 highest-scored candidates as
    # (keyphrase, score) tuples
    keyphrases = extractor.get_n_best(n=15)
    print(keyphrases)
    pdb.set_trace()
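
The DF counts loaded above can feed other pke models as well; a hedged sketch swapping KPMiner in for TfIdf (paths are placeholders, and passing df= to KPMiner's candidate_weighting is assumed here based on the {"df": ...} weighting parameters given to KPMiner in Examples #4 and #8 below).

import pke

# placeholder paths; reuse the DF counts computed in the step above
df_counts = pke.load_document_frequency_file(input_file='output.tsv.gz')

extractor = pke.unsupervised.KPMiner()
extractor.load_document(input='text.txt', language='en', normalization=None)
extractor.candidate_selection()
extractor.candidate_weighting(df=df_counts)  # df= is assumed, mirroring the factories below
print(extractor.get_n_best(n=15))
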
Example #4
def try_export_jsonl():
    n = 10
    # snlp_folder = "../data/processed/news/relevant/train/"
    snlp_folder = "../data/processed/news/relevant/train/"
    compute_document_frequency(
        snlp_folder,
        os.path.join("../data/interim/news_cargo_df.tsv.gz"),
        stoplist=list(STOP_WORDS))
    cargo_df = load_document_frequency_file(
        "../data/interim/news_cargo_df.tsv.gz")
    pke_factory = {
        "grammar": r"""
                NBAR:
                    {<NOUN|PROPN|NUM|ADJ>*<NOUN|PROPN>}

                NP:
                    {<NBAR>}
                    {<NBAR><ADP><NBAR>}
                """,
        "filtering_params": {
            "stoplist": list(STOP_WORDS)
        },
        "extractors": {
            "kpm": {
                "instance": PKEBasedTermsExtractor(KPMiner),
                "weighting_params": {
                    "df": cargo_df
                }
            },
        }
    }
    for name in pke_factory["extractors"]:
        log.info(f"Begin Extraction with PKE based extractor: {name}")
        extractor_instance = pke_factory["extractors"][name]["instance"]
        if "filtering_params" in pke_factory["extractors"][name]:
            filtering_params = {
                **pke_factory["filtering_params"],
                **pke_factory["extractors"][name]["filtering_params"]
            }
        else:
            filtering_params = pke_factory["filtering_params"]
        extractor_instance.extract(
            snlp_folder,
            n,
            grammar=pke_factory["grammar"],
            filtering_params=filtering_params,
            weighting_params=pke_factory["extractors"][name]
            ["weighting_params"],
            output_file=f"../results/extracted_terms/train/{name}.csv",
            auto_term_file=f"../data/annotations/automatic/terms/{name}.jsonl")
Example #5
def train_word_frequency():
    # stoplist for filtering n-grams
    stoplist = list(string.punctuation)
    # stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
    # stoplist += stopwords.words('english')

    # compute df counts and store as n-stem -> weight values
    pke.compute_document_frequency(
        input_dir='../scratch/lda_text',
        output_file='../scratch/tf_abs_2.tsv.gz',
        extension='txt',  # input file extension
        language='en',  # language of files
        normalization="stemming",  # use porter stemmer
        stoplist=stoplist)
Example #6
# -*- coding: utf-8 -*-

import logging
import sys
from string import punctuation

from pke import compute_document_frequency

# setting info in terminal
logging.basicConfig(level=logging.INFO)

# path to the collection of documents
input_dir = sys.argv[1]

# path to the df weights dictionary, saved as a gzipped csv file
output_file = sys.argv[2]

# stoplist are punctuation marks
stoplist = list(punctuation)
stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']

# compute document frequency (df) weights
compute_document_frequency(
    input_dir=input_dir,
    output_file=output_file,
    extension='xml',  # input file extension
    language='en',  # language of the input files
    normalization="stemming",  # use porter stemmer
    stoplist=stoplist,  # stoplist
    delimiter='\t',  # tab separated output
    n=5)  # compute n-grams up to 5-grams
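
A minimal sketch of inspecting the raw output file without pke (the path is a placeholder); per the docstrings on this page the file is a gzip-compressed TSV, so the standard library is enough to peek at the first few rows.

import csv
import gzip

# placeholder path: the output_file passed to compute_document_frequency
with gzip.open('df_counts.tsv.gz', 'rt', encoding='utf-8') as f:
    reader = csv.reader(f, delimiter='\t')
    for i, row in enumerate(reader):
        print(row)  # each row is one tab-separated record of the DF table
        if i >= 4:
            break
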
Example #7
import os
import logging
import sys

from pke import compute_document_frequency
from string import punctuation

# setting info in terminal
logging.basicConfig(level=logging.INFO)

# path to the collection of documents
input_dir = sys.argv[1]

# path to the df weights dictionary, saved as a gzipped csv file
output_file = sys.argv[2]

# stoplist are punctuation marks
stoplist = list(punctuation)
stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']

# compute document frequency (df) weights
compute_document_frequency(input_dir=input_dir,
                           output_file=output_file,
                           format="corenlp",  # input files format
                           use_lemmas=False,  # do not use Stanford lemmas
                           stemmer="porter",  # use porter stemmer
                           stoplist=stoplist,  # stoplist
                           delimiter='\t',  # tab separated output
                           extension='xml',  # input files extension
                           n=5)  # compute n-grams up to 5-grams
Example #8
def run_trial():
    n = 10
    snlp_folder = "../data/test/core_nlp_samples"
    compute_document_frequency(
        snlp_folder,
        os.path.join("../data/test/interim/test_cargo_df.tsv.gz"),
        stoplist=list(STOP_WORDS))
    cargo_df = load_document_frequency_file(
        "../data/test/interim/test_cargo_df.tsv.gz")
    pke_factory = {
        "grammar": r"""
                NBAR:
                    {<NOUN|PROPN|NUM|ADJ>*<NOUN|PROPN>}

                NP:
                    {<NBAR>}
                    {<NBAR><ADP><NBAR>}
                """,
        "filtering_params": {
            "stoplist": list(STOP_WORDS)
        },
        "extractors": {
            "tfidf": {
                "instance": PKEBasedTermsExtractor(TfIdf),
                "weighting_params": {
                    "df": cargo_df
                }
            },
            "yake": {
                "instance": PKEBasedTermsExtractor(YAKE),
                "filtering_params": {
                    "only_alphanum": True,
                    "strip_outer_stopwords": True
                },
                "weighting_params": {
                    "stoplist": list(STOP_WORDS)
                }
            },
            "kpm": {
                "instance": PKEBasedTermsExtractor(KPMiner),
                "weighting_params": {
                    "df": cargo_df
                }
            },
            "mprank": {
                "instance": PKEBasedTermsExtractor(MultipartiteRank),
                "weighting_params": {}
            },
            "positionrank": {
                "instance": PKEBasedTermsExtractor(PositionRank),
                "weighting_params": {}
            }
        }
    }
    for name in pke_factory["extractors"]:
        log.info(f"Begin Extraction with PKE based extractor: {name}")
        extractor_instance = pke_factory["extractors"][name]["instance"]
        if "filtering_params" in pke_factory["extractors"][name]:
            filtering_params = {
                **pke_factory["filtering_params"],
                **pke_factory["extractors"][name]["filtering_params"]
            }
        else:
            filtering_params = pke_factory["filtering_params"]
        extractor_instance.extract(
            snlp_folder,
            n,
            grammar=pke_factory["grammar"],
            filtering_params=filtering_params,
            weighting_params=pke_factory["extractors"][name]
            ["weighting_params"],
            output_file=f"../data/test/extracted_terms_sample/{name}.csv",
            auto_term_file=f"../data/test/automatic_annotations/{name}.jsonl")
Example #9
import logging
import sys

from pke import compute_document_frequency
from string import punctuation

# setting info in terminal
logging.basicConfig(level=logging.INFO)

# path to the collection of documents
input_dir = "/home/asjindal/Work/tf/keyword_extraction/resources/data/docs"

# path to the df weights dictionary, saved as a gzipped csv file
output_file = "/home/asjindal/Work/tf/keyword_extraction/resources/data/train_df_count.tsv.gz"

# stoplist are punctuation marks
stoplist = list(punctuation)
stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']

# compute document frequency (df) weights
compute_document_frequency(
    input_dir=input_dir,
    output_file=output_file,
    format="raw",  # input files format
    use_lemmas=False,  # do not use Stanford lemmas
    stemmer=None,  # no stemming
    stoplist=stoplist,  # stoplist
    delimiter='\t',  # tab separated output
    extension='txt',  # input files extension
    n=3)  # compute n-grams up to 3-grams
Example #10
stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']

# ======================================================================================================================
# Compute document frequency for SemEval
# ======================================================================================================================
"""Compute Document Frequency (DF) counts from a collection of documents.

N-grams up to 3-grams are extracted and converted to their n-stems forms.
Those containing a token that occurs in a stoplist are filtered out.
Output file is in compressed (gzip) tab-separated-values format (tsv.gz).
"""

compute_document_frequency(
    input_dir='../data/benchmark_data/semeval_2010/train_test_combined/',
    output_file='doc_freq/semeval_2010_doc_freq.tsv.gz',
    extension='xml',  # input file extension
    language='en',  # language of files
    normalization="stemming",  # use porter stemmer
    stoplist=stoplist)

# ======================================================================================================================
# Load the NUS benchmark data (test set for evaluating the final model)
# ======================================================================================================================

file = '..\\data\\benchmark_data\\NUS.json'  # TEST data to evaluate the final model

json_data = []
for line in open(file, 'r', encoding="utf8"):
    json_data.append(json.loads(line))

# convert json to dataframe
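
The snippet stops at the point where the JSON lines would be turned into a dataframe; a minimal sketch of that step, assuming pandas is the intended dataframe library:

import pandas as pd

# hypothetical continuation: one row per JSON line collected above
nus_df = pd.DataFrame(json_data)
print(nus_df.head())
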
Example #11
        ends = [
            int(u.text)
            for u in sentence.iterfind('tokens/token/CharacterOffsetEnd')
        ]
        doc = {
            'words': [u.text for u in sentence.iterfind('tokens/token/word')],
            'lemmas':
            [u.text for u in sentence.iterfind('tokens/token/lemma')],
            'POS': [u.text for u in sentence.iterfind('tokens/token/POS')],
            'char_offsets': [(starts[k], ends[k]) for k in range(len(starts))]
        }
        sentences.append([(doc['words'][i], doc['POS'][i])
                          for i in range(len(doc['words']))])
    return sentences


documents = []
for fn in glob(input_dir + '*.xml'):
    doc = read_corenlp_xml(fn)
    documents.append(doc)

# compute document frequency (df) weights
compute_document_frequency(
    documents,
    output_file=output_file,
    language='en',  # language of the input files
    normalization='stemming',  # use porter stemmer
    stoplist=stoplist,  # stoplist
    n=5  # compute n-grams up to 5-grams
)
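
For reference, a hand-built toy document in the shape read_corenlp_xml returns according to the code above: a list of sentences, each sentence a list of (word, POS) tuples; the documents list passed to compute_document_frequency holds one such object per XML file.

# toy example of the per-document structure built by read_corenlp_xml
toy_document = [
    [("Document", "NN"), ("frequencies", "NNS"), ("matter", "VBP"), (".", ".")],
    [("Rare", "JJ"), ("terms", "NNS"), ("get", "VBP"), ("higher", "JJR"), ("weights", "NNS"), (".", ".")],
]
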
Example #12
###############################################################################
# PRE-COMPUTING WEIGHTS/STATS
###############################################################################

# pre-compute DF weights if needed
need_df = any(model in ['KPMiner', 'Wingnus', 'TfIdf', 'Kea']
              for model in params['models'])
if need_df and not os.path.isfile(path_to_df_file):
    logging.info("computing DF weights from {}".format(params["path"]))
    pke.compute_document_frequency(input_dir=path_to_train,
                                   output_file=path_to_df_file,
                                   extension=params["extension"],
                                   language=params["language"],
                                   normalization=params["normalization"],
                                   stoplist=punctuations,
                                   delimiter='\t',
                                   n=5)

# pre-compute LDA distributions if needed
need_lda = any(model in ['TopicalPageRank'] for model in params['models'])
if need_lda and not os.path.isfile(path_to_lda_file):
    logging.info("computing LDA distributions from {}".format(params["path"]))
    pke.compute_lda_model(input_dir=path_to_train,
                          output_file=path_to_lda_file,
                          n_topics=params["n_topics"],
                          extension=params["extension"],
                          language=params["language"],
                          normalization=params["normalization"])
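
A hypothetical params dict (plus the path variables and stoplist) that this block expects to be defined beforehand; the keys are limited to the ones the snippet actually reads, and every concrete value below is a placeholder rather than the author's configuration.

import string

params = {
    "models": ["TfIdf", "KPMiner", "TopicalPageRank"],  # models to benchmark
    "path": "data/benchmark",             # only used in the log messages above
    "extension": "xml",                   # input file extension
    "language": "en",                     # language of the input files
    "normalization": "stemming",          # normalization passed to pke
    "n_topics": 500,                      # number of LDA topics
}

# placeholder paths and stoplist referenced by the block above
path_to_train = "data/benchmark/train"
path_to_df_file = "data/benchmark/df_counts.tsv.gz"
path_to_lda_file = "data/benchmark/lda_model.pickle.gz"
punctuations = list(string.punctuation)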