def runSingleDoc(self, doc):
    # Get TF-IDF keywords.
    # 1. create a TfIdf extractor.
    extractor = pke.unsupervised.TfIdf()

    # 2. load the content of the document in a given language.
    # If lan is not covered by the available spaCy models, fall back to 'en'.
    if self.__lan not in ['en', 'pt', 'fr', 'it', 'nl', 'de']:
        extractor.load_document(input=doc, language='en', normalization=self.__normalization)
    else:
        extractor.load_document(input=doc, language=self.__lan, normalization=self.__normalization)

    # 3. select {1-3}-grams not containing punctuation marks as candidates.
    stoplist = list(string.punctuation)
    stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
    stoplist += load_stop_words(self.__lan)
    extractor.candidate_selection(n=3, stoplist=stoplist)

    try:
        # 4. weight the candidates using `tf` x `idf`.
        df = pke.load_document_frequency_file(input_file=self.__pathToDFFile)
        extractor.candidate_weighting(df=df)

        # 5. get the numOfKeywords-highest scored candidates as keyphrases.
        keywords = extractor.get_n_best(n=self.__numOfKeywords)
    except:
        keywords = []

    return keywords
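# The method above expects a precomputed document frequency file at self.__pathToDFFile.
# A minimal sketch of how such a file could be built, using the same pke call pattern as
# the compute_document_frequency snippet further below; the corpus folder and output path
# are hypothetical placeholders.
import pke
from string import punctuation

pke.compute_document_frequency(input_dir='corpus/',             # hypothetical folder of .txt documents
                               output_file='corpus_df.tsv.gz',  # gzipped TSV read by load_document_frequency_file
                               extension='txt',
                               language='en',
                               normalization='stemming',
                               stoplist=list(punctuation),
                               n=3)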
def test(files, number_of_tags, trained_model, test_DF_zip):
    # create a Kea extractor and set the input language to English (used for
    # the stoplist in the candidate selection method)
    extractor = pke.supervised.Kea()

    # load the content of the document, here in CoreNLP XML format
    # the use_lemmas parameter allows to choose using CoreNLP lemmas or stems
    # computed using nltk
    extractor.load_document(files)

    # select the keyphrase candidates, for Kea the 1-3 grams that do not start or
    # end with a stopword.
    extractor.candidate_selection()

    # load the df counts
    df_counts = pke.load_document_frequency_file(input_file=test_DF_zip, delimiter='\t')

    # weight the candidates using the Kea model.
    extractor.candidate_weighting(model_file=trained_model, df=df_counts)

    key_list = []
    for (keyphrase, score) in extractor.get_n_best(n=number_of_tags):
        key_list.append(keyphrase)

    files = files.split('/')[-1].split('.')[0]
    return files, key_list
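# Hedged usage sketch for test(); the document path, trained model pickle and DF archive
# below are hypothetical placeholders.
doc_id, keywords = test(files='data/docs/C-1.xml',
                        number_of_tags=10,
                        trained_model='models/kea_model.pickle',
                        test_DF_zip='data/test_DF.tsv.gz')
print(doc_id, keywords)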
def runSingleDoc(self, doc):
    # Get keywords for a single doc. It only retrieves the keywords for further processing:
    # either they are printed, when we just want to extract keywords from a single doc,
    # or they are saved, when we are extracting keywords from multiple docs (that is,
    # when this method is called externally by runMultipleDocs).

    # 1. create a Kea extractor.
    extractor = pke.supervised.Kea()

    # 2. load the content of the document in a given language.
    extractor.load_document(input=doc, language=self.__lan, normalization=self.__normalization)

    # 3. select 1-3 grams that do not start or end with a stopword as
    # candidates. Candidates that contain punctuation marks as words
    # are discarded.
    stoplist = list(string.punctuation)
    stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
    stoplist += load_stop_words(self.__lan)
    extractor.candidate_selection(stoplist=stoplist)

    try:
        # 4. classify candidates as keyphrase or not keyphrase.
        # df = pke.load_document_frequency_file(input_file=self.__pathToDFFile + "/df.tsv.gz")
        df = pke.load_document_frequency_file(input_file=self.__pathToDFFile)
        # extractor.candidate_weighting(model_file=self.__pathToKeaModelsFolder + "/model.pickle", df=df)
        extractor.candidate_weighting(model_file=self.__pathToKEAFile, df=df)

        # 5. get the numOfKeywords-highest scored candidates as keyphrases.
        keywords = extractor.get_n_best(n=self.__numOfKeywords)
    except:
        keywords = []

    return keywords
def runSingleDoc(self, doc):
    # Get KP-Miner keywords.
    # 1. create a KPMiner extractor.
    extractor = pke.unsupervised.KPMiner()

    # 2. load the content of the document in a given language.
    extractor.load_document(input=doc, language=self.__lan, normalization=self.__normalization)

    # 3. select {1-5}-grams that do not contain punctuation marks or
    # stopwords as keyphrase candidates. Set the least allowable seen
    # frequency to 5 and the number of words after which candidates are
    # filtered out to 200.
    lasf = 5
    cutoff = 200
    extractor.candidate_selection(lasf=lasf, cutoff=cutoff)

    try:
        # 4. weight the candidates using the KP-Miner weighting function.
        df = pke.load_document_frequency_file(input_file=self.__pathToDFFile)
        alpha = 2.3
        sigma = 3.0
        extractor.candidate_weighting(df=df, alpha=alpha, sigma=sigma)

        # 5. get the numOfKeywords-highest scored candidates as keyphrases.
        keywords = extractor.get_n_best(n=self.__numOfKeywords)
    except:
        keywords = []

    return keywords
def TrainingKEAModel(self, pathToCollectionOfDocs, groundTruthFile, lang, normalization,
                     pathToDFFile, pathToKEAFile, pathToKeaModelsFolder):
    print(f"\nSTEP 2: Compute Document Frequency")
    ComputeDF(pathToCollectionOfDocs, lang, normalization, pathToDFFile)
    df = pke.load_document_frequency_file(input_file=pathToDFFile)

    print(f"\nSTEP 3: Train KEA Model on top of the following set of docs: {pathToCollectionOfDocs}")

    if os.path.exists(pathToKEAFile):
        print(f"KEA model file already exists here: {pathToKEAFile}")
    else:
        print(f"KEA model doesn't exist yet. Creating it at {pathToKEAFile}. It may take a while.")

        # If the Models folder does not exist, create it.
        if not os.path.exists(pathToKeaModelsFolder):
            os.makedirs(pathToKeaModelsFolder)

        pke.train_supervised_model(input_dir=pathToCollectionOfDocs,
                                   reference_file=groundTruthFile,
                                   model_file=pathToKEAFile,
                                   extension='txt',
                                   language=lang,
                                   normalization=normalization,
                                   df=df,
                                   model=pke.supervised.Kea())
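# Hedged usage sketch for the training helper above; the wrapper class name (KEATrainer)
# and all paths are hypothetical placeholders.
trainer = KEATrainer()
trainer.TrainingKEAModel(pathToCollectionOfDocs='data/train_docs/',
                         groundTruthFile='data/reference.txt',
                         lang='en',
                         normalization='stemming',
                         pathToDFFile='models/df.tsv.gz',
                         pathToKEAFile='models/kea_model.pickle',
                         pathToKeaModelsFolder='models/')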
def main():
    # process the document frequency of the reference corpus
    """Compute Document Frequency (DF) counts from a collection of documents.

    N-grams up to 3-grams are extracted and converted to their n-stems forms.
    Those containing a token that occurs in a stoplist are filtered out.
    Output file is in compressed (gzip) tab-separated-values format (tsv.gz).
    """

    # stoplist for filtering n-grams
    stoplist = list(punctuation)

    # compute df counts and store as n-stem -> weight values
    compute_document_frequency(
        input_dir='/Users/gmt28/Documents/Workspace/Docker_Engine/varad/Yale_Projects/shoah-foundation-data-restored/shoah-foundation-data/data/inputs/fortunoff/transcripts/',
        output_file='/Users/gmt28/Documents/Workspace/data_analysis_lts/Processes/Extract_Keywords/output.tsv.gz',
        extension='txt',      # input file extension
        language='en',        # language of the files
        normalization=None,   # no normalization (Porter stemmer is not applied)
        stoplist=stoplist,
        n=1)
    pdb.set_trace()

    """Keyphrase extraction using TfIdf and newly computed DF counts."""

    # initialize TfIdf model
    extractor = pke.unsupervised.TfIdf()

    # load the DF counts from file
    df_counts = pke.load_document_frequency_file(
        input_file='/Users/gmt28/Documents/Workspace/data_analysis_lts/Processes/Extract_Keywords/output.tsv.gz')

    # load the content of the document
    extractor.load_document(
        input='/Users/gmt28/Documents/Workspace/data_analysis_lts/Processes/Extract_Keywords/text.txt',
        normalization=None,
        language='en')

    # keyphrase candidate selection
    extractor.candidate_selection(n=1, stoplist=list(string.punctuation))

    # candidate weighting with the provided DF counts
    extractor.candidate_weighting(df=df_counts)

    # N-best selection; keyphrases contains the 15 highest scored candidates as
    # (keyphrase, score) tuples
    keyphrases = extractor.get_n_best(n=15)
    print(keyphrases)
    pdb.set_trace()
def __init__(self, max_ngram_size=3, df=None, **kwargs):
    super().__init__(**kwargs)
    self.name = kwargs.get('name', 'TfIdf')
    self.max_ngram_size = max_ngram_size
    self.df = df
    if isinstance(self.df, str):
        self.df = pke.load_document_frequency_file(input_file=self.df)
    self.pos = {'NOUN', 'PROPN', 'ADJ'}
    self.stoplist = list(string.punctuation)
    self.stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
    self.stoplist += stopwords.words('english')
    self.kw_extractor = pke.unsupervised.TfIdf()
def try_export_jsonl():
    n = 10
    # snlp_folder = "../data/processed/news/relevant/train/"
    snlp_folder = "../data/processed/news/relevant/train/"
    compute_document_frequency(
        snlp_folder,
        os.path.join("../data/interim/news_cargo_df.tsv.gz"),
        stoplist=list(STOP_WORDS))
    cargo_df = load_document_frequency_file("../data/interim/news_cargo_df.tsv.gz")
    pke_factory = {
        "grammar": r"""
            NBAR:
                {<NOUN|PROPN|NUM|ADJ>*<NOUN|PROPN>}
            NP:
                {<NBAR>}
                {<NBAR><ADP><NBAR>}
        """,
        "filtering_params": {
            "stoplist": list(STOP_WORDS)
        },
        "extractors": {
            "kpm": {
                "instance": PKEBasedTermsExtractor(KPMiner),
                "weighting_params": {
                    "df": cargo_df
                }
            },
        }
    }
    for name in pke_factory["extractors"]:
        log.info(f"Begin Extraction with PKE based extractor: {name}")
        extractor_instance = pke_factory["extractors"][name]["instance"]
        if "filtering_params" in pke_factory["extractors"][name]:
            filtering_params = {
                **pke_factory["filtering_params"],
                **pke_factory["extractors"][name]["filtering_params"]
            }
        else:
            filtering_params = pke_factory["filtering_params"]
        extractor_instance.extract(
            snlp_folder,
            n,
            grammar=pke_factory["grammar"],
            filtering_params=filtering_params,
            weighting_params=pke_factory["extractors"][name]["weighting_params"],
            output_file=f"../results/extracted_terms/train/{name}.csv",
            auto_term_file=f"../data/annotations/automatic/terms/{name}.jsonl")
def __init__(self, lasf=3, cutoff=200, alpha=2.3, sigma=3.0, df=None, **kwargs):
    super().__init__(**kwargs)
    self.name = kwargs.get('name', 'KPMiner')
    self.lasf = lasf
    self.cutoff = cutoff
    self.alpha = alpha
    self.sigma = sigma
    self.df = df
    if isinstance(self.df, str):
        self.df = pke.load_document_frequency_file(input_file=self.df)
    self.pos = {'NOUN', 'PROPN', 'ADJ'}
    self.stoplist = list(string.punctuation)
    self.stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
    self.stoplist += stopwords.words('english')
    self.kw_extractor = pke.unsupervised.KPMiner()
def extract_keyphrases(data):
    gold_keyphrases = []  # save the gold keyphrases of documents
    pred_keyphrases = []  # save the predicted keyphrases of documents

    for indx, abstract_document in enumerate(data['abstract']):
        # split the gold keywords to separate them from one another
        gold_keyphrases.append([
            [Stemmer('porter').stem(keyword) for keyword in keyphrase.split()]
            for keyphrase in data['keyword'][indx].split(';')
        ])

        # ==============================================================================
        # TF-IDF Extractor
        # ==============================================================================
        stoplist = list(string.punctuation)
        stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
        stoplist += stopwords.words('english')

        # 1. create a TfIdf extractor.
        extractor = pke.unsupervised.TfIdf()

        print(abstract_document)

        # 2. load the content of the document.
        extractor.load_document(input=abstract_document,
                                language='en',
                                normalization="stemming")

        # 3. select {1-3}-grams not containing punctuation marks as candidates.
        extractor.candidate_selection(n=3, stoplist=stoplist)

        # 4. weight the candidates using `tf` x `idf`.
        df = pke.load_document_frequency_file(input_file=input_file)
        extractor.candidate_weighting(df=df)

        # 5. get the 10-highest scored candidates as keyphrases.
        pred_kps = extractor.get_n_best(n=10)

        # keep only the predicted keyphrase (first position -> [0]) and discard the score
        pred_keyphrases.append([kp[0].split() for kp in pred_kps])

    print(pred_keyphrases)
    print(gold_keyphrases)

    return pred_keyphrases, gold_keyphrases
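# Hedged usage sketch for extract_keyphrases(); `data` is assumed to behave like a DataFrame
# with 'abstract' and 'keyword' columns, and the module-level `input_file` DF path used inside
# the function is assumed to be set beforehand. The sample record is illustrative only.
data = {'abstract': ["Keyphrase extraction identifies the most representative phrases in a document."],
        'keyword': ["keyphrase extraction;information retrieval"]}
pred_keyphrases, gold_keyphrases = extract_keyphrases(data)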
def pke_unsupervised(cur_text, top_k, kw_extractor, lang='en', document_frequency_file=None):
    arg_tokens = kw_extractor.split('-')
    extractor = getattr(pke.unsupervised, arg_tokens[-1])()
    extractor.load_document(input=cur_text, language=lang)
    extractor.candidate_selection()
    if document_frequency_file is not None:
        df_counts = pke.load_document_frequency_file(input_file=document_frequency_file)
        extractor.candidate_weighting(df=df_counts)
    else:
        # go back to the default values
        extractor.candidate_weighting()
    keyphrases = extractor.get_n_best(n=top_k)
    final_kw = [(score, term) for term, score in keyphrases]
    return final_kw
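# Example call for the helper above: the trailing token of kw_extractor ('TfIdf' here)
# selects the class from pke.unsupervised. Pass document_frequency_file='...tsv.gz' to use
# precomputed DF counts; with None the extractor falls back to its default weights.
text = "Keyphrase extraction is the task of identifying the most important phrases in a document."
print(pke_unsupervised(text, top_k=5, kw_extractor='pke-TfIdf', lang='en',
                       document_frequency_file=None))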
import os

import pke

base = os.path.dirname(__file__)

# create a Kea extractor and set the input language to English (used for
# the stoplist in the candidate selection method)
extractor = pke.supervised.Kea()

# load the content of the document, here as raw text
with open(base + os.sep + '2.txt') as f:
    doc = f.read()
extractor.load_document(doc)

# select the keyphrase candidates, for Kea the 1-3 grams that do not start or
# end with a stopword.
extractor.candidate_selection()

# load the df counts
df_counts = pke.load_document_frequency_file(input_file=base + os.sep + 'df.tsv.gz',
                                             delimiter='\t')

# weight the candidates using the Kea model.
extractor.candidate_weighting(model_file=base + os.sep + 'model.pickle',
                              df=df_counts)

# print the n-highest (10) scored candidates
for (keyphrase, score) in extractor.get_n_best(n=10):
    print(keyphrase, score)
import logging

import pke

# setting info in terminal
logging.basicConfig(level=logging.INFO)

# path to the collection of documents
input_dir = 'train/'

# path to the reference file
reference_file = "gold-annotation.txt"

# path to the df file
df_file = "df.tsv.gz"
logging.info('Loading df counts from {}'.format(df_file))
df_counts = pke.load_document_frequency_file(input_file=df_file, delimiter='\t')

# path to the model, saved as a pickle
output_mdl = "model.pickle"

pke.train_supervised_model(input_dir=input_dir,
                           reference_file=reference_file,
                           model_file=output_mdl,
                           df=df_counts,
                           format="corenlp",
                           use_lemmas=False,
                           stemmer="porter",
                           model=pke.supervised.Kea(),
                           language='english',
                           extension="xml")
pke.compute_lda_model(input_dir=path_to_train,
                      output_file=path_to_lda_file,
                      n_topics=params["n_topics"],
                      extension=params["extension"],
                      language=params["language"],
                      normalization=params["normalization"])

# pre-compute pairwise similarities if needed
need_pairwise = any(model in ['ExpandRank'] for model in params['models'])
if need_pairwise and not os.path.isfile(path_to_pairwise_file):
    logging.info("computing pairwise similarities in {}".format(params["path"]))
    logging.info("loading DF counts from {}".format(path_to_df_file))
    df_counts = pke.load_document_frequency_file(input_file=path_to_df_file)
    pke.compute_pairwise_similarity_matrix(
        input_dir=path_to_test,
        output_file=path_to_pairwise_file,
        collection_dir=path_to_train,
        df=df_counts,
        extension=params["extension"],
        language=params["language"],
        normalization=params["normalization"],
        stoplist=stoplist)

###############################################################################
###############################################################################
# TRAINING SUPERVISED MODEL
stoplist += stopwords.words('english')

# 1. create a TfIdf extractor.
extractor = pke.unsupervised.TfIdf()

print(abstract_document)

# 2. load the content of the document.
extractor.load_document(input=abstract_document,
                        language='en',
                        normalization="stemming")

# 3. select {1-3}-grams not containing punctuation marks as candidates.
extractor.candidate_selection(n=3, stoplist=stoplist)

# 4. weight the candidates using `tf` x `idf`.
df = pke.load_document_frequency_file(input_file=input_file)
extractor.candidate_weighting(df=df)

# 5. get the 10-highest scored candidates as keyphrases.
pred_kps = extractor.get_n_best(n=10)

# keep only the predicted keyphrase and discard the score
pred_keyphrases.append([kp[0].split() for kp in pred_kps])

print(pred_keyphrases)
print(gold_keyphrases)

# ==============================================================================
# Evaluation
# ==============================================================================
# traditional evaluation of the model's performance
def get_keywords(content):
    content = content.replace("-", "")

    # choose how many candidates to keep from the graph-based (NGraph) and
    # statistical (NStat) extractors, depending on the length of the text
    if len(content) <= 500:
        NGraph = 8
        NStat = 5
    if 500 < len(content) < 1000:
        NGraph = 13
        NStat = 10
    if len(content) >= 1000:
        NGraph = 18
        NStat = 15

    PositionRank = []
    MultipartiteRank = []
    TFIDF = []
    TextRank = []

    # PKE - TF-IDF
    extractorTFIDF = pke.unsupervised.TfIdf()
    extractorTFIDF.load_document(input=content, language="en", normalization=None)
    extractorTFIDF.candidate_selection(n=4, stoplist=stoplist)
    df = pke.load_document_frequency_file(
        input_file='C:/Users/admin/Anaconda3/Lib/site-packages/pke/models/df-semeval2010.tsv.gz')
    extractorTFIDF.candidate_weighting(df=df)
    keyphrasesTFIDF = extractorTFIDF.get_n_best(n=NStat)
    for key in keyphrasesTFIDF:
        TFIDF.append(key[0])

    # PKE - TextRank
    pos = {'NOUN', 'PROPN', 'ADJ'}
    extractorTextRank = pke.unsupervised.TextRank()
    extractorTextRank.load_document(input=content, language='en', normalization=None)
    extractorTextRank.candidate_weighting(window=2, pos=pos, top_percent=0.33)
    keyphrasesTextRank = extractorTextRank.get_n_best(n=NGraph)
    for key in keyphrasesTextRank:
        TextRank.append(key[0])

    # PKE - PositionRank
    pos = {'NOUN', 'PROPN', 'ADJ'}
    grammar = "NP: {<ADJ>*<NOUN|PROPN>+}"
    extractorPositionRank = pke.unsupervised.PositionRank()
    extractorPositionRank.load_document(input=content, language='en', normalization=None)
    extractorPositionRank.candidate_selection(grammar=grammar, maximum_word_number=4)
    extractorPositionRank.candidate_weighting(window=2, pos=pos)
    keyphrasesPositionRank = extractorPositionRank.get_n_best(n=NGraph)
    for key in keyphrasesPositionRank:
        PositionRank.append(key[0])

    # PKE - MultipartiteRank
    extractorMultipartiteRank = pke.unsupervised.MultipartiteRank()
    extractorMultipartiteRank.load_document(input=content)
    pos = {'NOUN', 'PROPN', 'ADJ'}
    extractorMultipartiteRank.candidate_selection(pos=pos, stoplist=stoplist)
    extractorMultipartiteRank.candidate_weighting(alpha=3, threshold=0.95, method='average')
    keyphrasesMultipartiteRank = extractorMultipartiteRank.get_n_best(n=NGraph)
    for key in keyphrasesMultipartiteRank:
        MultipartiteRank.append(key[0])

    # keep candidates that two extractors agree on, then reconcile overlapping
    # single-word and multi-word terms
    inter1 = set(PositionRank).intersection(set(MultipartiteRank))
    inter2 = set(TFIDF).intersection(set(TextRank))

    to_remove_fin = []
    to_add = []
    to_remove = []
    for elem1 in inter2:
        for elem2 in inter1:
            if (" " not in elem1) and (" " not in elem2) and \
                    (lemmatizer.lemmatize(elem1) in lemmatizer.lemmatize(elem2)):
                to_remove_fin.append(elem2)
                to_remove.append(elem1)
                to_add.append(elem1)
            if (" " not in elem1) and (" " not in elem2) and \
                    (lemmatizer.lemmatize(elem2) in lemmatizer.lemmatize(elem1)):
                to_remove_fin.append(elem2)
                to_remove.append(elem1)
                to_add.append(elem2)
            if (elem1 in elem2) and (' ' in elem1) and (elem1 != elem2):
                to_remove_fin.append(elem2)
            elif (elem1 in elem2) and (' ' not in elem1) and (elem1 != elem2):
                to_remove.append(elem1)

    to_remove = set(to_remove)
    for elem in to_remove:
        inter2.remove(elem)

    inter = set(inter1).union(set(inter2))
    inter = list(inter)
    new_inter = inter
    new_inter = new_inter + list(set(to_add))

    # drop candidates without a noun, or with more than four tokens
    for i in range(0, len(inter)):
        count = 0
        poses = []
        tokens = [word for word in nltk.word_tokenize(inter[i]) if word not in stoplist]
        new_inter[i] = ' '.join(tokens)
        tags = list(nltk.pos_tag(tokens))
        for tag in tags:
            poses.append(tag[1])
        for pos in poses:
            if 'NN' in pos:
                count += 1
        if count == 0:
            to_remove_fin.append(new_inter[i])
        if len(poses) > 4:
            to_remove_fin.append(new_inter[i])

    to_remove_fin = list(set(to_remove_fin))
    new_inter = list(set(new_inter).difference(to_remove_fin))

    return new_inter
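# Hedged usage sketch for get_keywords(); it assumes the module-level `stoplist` and
# `lemmatizer` used inside the function are already defined, and the sample text is
# illustrative only.
sample = ("Keyphrase extraction methods such as TF-IDF, TextRank, PositionRank and "
          "MultipartiteRank identify the most representative phrases in a document.")
print(get_keywords(sample))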
import pke
# import logging

## Training the model on the train set.
# train_input_dir = 'drive/My Drive/Recommendation systems/kea_trained/train_doc/'
reference_file = 'drive/My Drive/Recommendation systems/kea_trained/reference.txt'
output_mdl = "drive/My Drive/Recommendation systems/kea_trained/Models/kea_model.pickle"
# train_df_file = 'drive/My Drive/Recommendation systems/kea_trained/train_DF.tsv.gz'

# logging.info('Loading df counts from {}'.format(df_file))
df_counts = pke.load_document_frequency_file(input_file='train_DF.tsv.gz',
                                             delimiter='\t')

pke.train_supervised_model(input_dir='train_doc/',
                           reference_file='reference.txt',
                           model_file='model/kea_model.pickle',
                           extension='txt',
                           language='en',
                           normalization="stemming",
                           df=df_counts,
                           model=pke.supervised.Kea())
if args.verbose:
    logging.basicConfig(level=logging.INFO)

# get class from module
class_ = getattr(pke, args.approach, None)
if not class_:
    logging.error('No valid extraction model given [' + args.approach + ']')
    sys.exit(0)

logging.info('keyphrase extraction using ' + args.approach)

if args.df:
    logging.info('loading df weights from ' + args.df)
    df = pke.load_document_frequency_file(args.df, delimiter="\t")

extr = class_(input_file=args.input)
extr.read_document(format=args.format)
extr.candidate_selection()

if args.approach in ['TfIdf', 'TopicRank', 'SingleRank', 'KPMiner']:
    extr.candidate_weighting()
elif args.approach in ['WINGNUS', 'Kea']:
    extr.feature_extraction(df=df)
    extr.classify_candidates(model=args.model)

keyphrases = extr.get_n_best(n=args.nbest)
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import pke

# create a Kea extractor and set the input language to English (used for
# the stoplist in the candidate selection method)
extractor = pke.supervised.Kea()

# load the content of the document, here in CoreNLP XML format
# the use_lemmas parameter allows to choose using CoreNLP lemmas or stems
# computed using nltk
extractor.load_document('C-1.xml')

# select the keyphrase candidates, for Kea the 1-3 grams that do not start or
# end with a stopword.
extractor.candidate_selection()

# load the df counts
df_counts = pke.load_document_frequency_file(input_file="df.tsv.gz",
                                             delimiter='\t')

# weight the candidates using the Kea model.
extractor.candidate_weighting(model_file="model.pickle", df=df_counts)

# print the n-highest (10) scored candidates
for (keyphrase, score) in extractor.get_n_best(n=10):
    print(keyphrase, score)
import codecs
import logging
import sys

import pke

# setting info in terminal
logging.basicConfig(level=logging.INFO)

# path to the collection of documents
input_dir = sys.argv[1]

# path to the reference file
reference_file = sys.argv[2]

# path to the df file
df_file = sys.argv[3]
logging.info('loading df counts from ' + df_file)
df_counts = pke.load_document_frequency_file(df_file, delimiter='\t')

# path to the model, saved as a pickle
output_mdl = sys.argv[4]

pke.train_supervised_model(input_dir=input_dir,
                           reference_file=reference_file,
                           model_file=output_mdl,
                           df=df_counts,
                           format="corenlp",
                           use_lemmas=False,
                           stemmer="porter",
                           model=pke.Kea(),
                           language='english',
                           extension="xml")
labels_to_id = {}
rows_lst = []
article = pd.read_csv(
    "/Users/senresearchlab/PycharmProjects/cartograph-alg/data/georgraphy/article_text_gloss.csv")

for row in article.itertuples():
    text = row.text
    if not isinstance(text, float):
        extractor = pke.unsupervised.KPMiner()
        extractor.load_document(text, language='en', normalization=None)
        lasf = 4
        cutoff = 200
        extractor.candidate_selection(lasf=lasf, cutoff=cutoff)
        df = pke.load_document_frequency_file(input_file='./doc_frequency.tsv.gz')
        alpha = 2.3
        sigma = 3.0
        extractor.candidate_weighting(df=df, alpha=alpha, sigma=sigma)

        # 5. get the 10-highest scored candidates as keyphrases
        keyphrases = extractor.get_n_best(n=10)
        if len(keyphrases) != 0:
            for keyphrase, score in keyphrases:
                if keyphrase not in labels_to_id:
                    labels_to_id[keyphrase] = len(labels_to_id)
                id = labels_to_id.get(keyphrase, len(labels_to_id))
                rows_lst.append({
                    "article_id": row.article_id,
                    "label_id": id,
                    "score": score
                })
import pke
import sys
import os
from string import punctuation

# initialize TfIdf model
extractor = pke.unsupervised.TfIdf(input_file=sys.argv[1])

# load the DF counts from file
df_counts = pke.load_document_frequency_file(input_file=sys.argv[2])

# load the content of the document
extractor.read_document(format='raw')

# keyphrase candidate selection
extractor.candidate_selection(n=3)

# candidate weighting with the provided DF counts
extractor.candidate_weighting(df=df_counts)

# N-best selection, keyphrases contains the highest scored candidates as
# (keyphrase, score) tuples
keyphrases = extractor.get_n_best(n=10000, stemming=False)

base = os.path.basename(sys.argv[1])
filename = os.path.splitext(base)[0]
file = open(os.getcwd() + "/corpus/" + filename + "_saliency.txt", "wb")
try:
    for k in keyphrases:
        file.write(k[0].encode('utf-8') + ' ' + str(k[1]).encode('utf-8') +
    # CURRENT_VERSION
    extractor = KeywordExtractor(snlp)
    extractor.load_document(input=text, language='ru')
    extractor.candidate_selection()
    with open("./DF.txt", encoding='utf-8') as fp:
        df = json.load(fp)
    extractor.candidate_weighting(df=df)
elif MODE == 3:
    # TFIDF
    extractor = pke.unsupervised.TfIdf()
    extractor.load_document(input=text, language='ru', spacy_model=spacy_pipelines)
    stoplist = stopwords.words('russian')
    extractor.candidate_selection(n=3, stoplist=stoplist)
    df = pke.load_document_frequency_file(input_file='./df-weight.tsv.gz')
    extractor.candidate_weighting(df=df)
elif MODE == 4:
    # KEA
    extractor = pke.supervised.Kea()
    extractor.load_document(input=text, language='ru', spacy_model=spacy_pipelines)
    stoplist = stopwords.words('russian')
    df = pke.load_document_frequency_file(input_file='./df-weight.tsv.gz')
    extractor.candidate_selection(stoplist=stoplist)
    extractor.candidate_weighting(df=df)
elif MODE == 5:
    # MULTIPARTITE
    extractor = pke.unsupervised.MultipartiteRank()
# pke.utils.compute_document_frequency('./test2', 'df_2_test.tsv.gz', format='raw', extension='txt', use_lemmas=False, stemmer=None, stoplist=stoplist, delimiter='\t', n=3)

for i in range(25000):
    input_file = './test2/' + str(i) + '.txt'

    # 1. create a TfIdf extractor.
    extractor = pke.unsupervised.TfIdf(input_file=input_file)

    # 2. load the content of the document.
    extractor.read_document(format='raw', use_lemmas=False, stemmer=None, sep='/')

    # 3. select {1-3}-grams not containing punctuation marks as candidates.
    n = 3
    extractor.candidate_selection(n=n, stoplist=stoplist)

    # 4. weight the candidates using `tf` x `idf`.
    df = pke.load_document_frequency_file(input_file='df_2_test.tsv.gz')
    extractor.candidate_weighting(df=df)

    # 5. get the 50-highest scored candidates as keyphrases.
    keyphrases = extractor.get_n_best(n=50)

    with open('./results/tfidf_2/' + str(i) + '.txt', 'w+') as file:
        for key in keyphrases:
            file.write(key[0].encode('utf-8') + '\n')
logging.basicConfig(level=logging.INFO)

# path to the input set of documents
input_dir = sys.argv[1]

# path to the pairwise similarity scores
output_file = sys.argv[2]

# path to the collection of documents
collection_dir = sys.argv[3]

# path to the df counts, saved as a gzipped csv file
df_file = sys.argv[4]

# load the DF counts
df_counts = load_document_frequency_file(input_file=df_file)

# stoplist for terms in document vectors
stoplist = list(string.punctuation)
stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
stoplist += stopwords.words('english')

# compute the pairwise similarity measures and write output
compute_pairwise_similarity_matrix(input_dir=input_dir,
                                   output_file=output_file,
                                   collection_dir=collection_dir,
                                   df=df_counts,
                                   format="corenlp",
                                   extension="xml",
                                   use_lemmas=False,
                                   stemmer="porter",
def run_trial():
    n = 10
    snlp_folder = "../data/test/core_nlp_samples"
    compute_document_frequency(
        snlp_folder,
        os.path.join("../data/test/interim/test_cargo_df.tsv.gz"),
        stoplist=list(STOP_WORDS))
    cargo_df = load_document_frequency_file("../data/test/interim/test_cargo_df.tsv.gz")
    pke_factory = {
        "grammar": r"""
            NBAR:
                {<NOUN|PROPN|NUM|ADJ>*<NOUN|PROPN>}
            NP:
                {<NBAR>}
                {<NBAR><ADP><NBAR>}
        """,
        "filtering_params": {
            "stoplist": list(STOP_WORDS)
        },
        "extractors": {
            "tfidf": {
                "instance": PKEBasedTermsExtractor(TfIdf),
                "weighting_params": {
                    "df": cargo_df
                }
            },
            "yake": {
                "instance": PKEBasedTermsExtractor(YAKE),
                "filtering_params": {
                    "only_alphanum": True,
                    "strip_outer_stopwords": True
                },
                "weighting_params": {
                    "stoplist": list(STOP_WORDS)
                }
            },
            "kpm": {
                "instance": PKEBasedTermsExtractor(KPMiner),
                "weighting_params": {
                    "df": cargo_df
                }
            },
            "mprank": {
                "instance": PKEBasedTermsExtractor(MultipartiteRank),
                "weighting_params": {}
            },
            "positionrank": {
                "instance": PKEBasedTermsExtractor(PositionRank),
                "weighting_params": {}
            }
        }
    }
    for name in pke_factory["extractors"]:
        log.info(f"Begin Extraction with PKE based extractor: {name}")
        extractor_instance = pke_factory["extractors"][name]["instance"]
        if "filtering_params" in pke_factory["extractors"][name]:
            filtering_params = {
                **pke_factory["filtering_params"],
                **pke_factory["extractors"][name]["filtering_params"]
            }
        else:
            filtering_params = pke_factory["filtering_params"]
        extractor_instance.extract(
            snlp_folder,
            n,
            grammar=pke_factory["grammar"],
            filtering_params=filtering_params,
            weighting_params=pke_factory["extractors"][name]["weighting_params"],
            output_file=f"../data/test/extracted_terms_sample/{name}.csv",
            auto_term_file=f"../data/test/automatic_annotations/{name}.jsonl")